mcp-hangar 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_hangar/__init__.py +139 -0
- mcp_hangar/application/__init__.py +1 -0
- mcp_hangar/application/commands/__init__.py +67 -0
- mcp_hangar/application/commands/auth_commands.py +118 -0
- mcp_hangar/application/commands/auth_handlers.py +296 -0
- mcp_hangar/application/commands/commands.py +59 -0
- mcp_hangar/application/commands/handlers.py +189 -0
- mcp_hangar/application/discovery/__init__.py +21 -0
- mcp_hangar/application/discovery/discovery_metrics.py +283 -0
- mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
- mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
- mcp_hangar/application/discovery/security_validator.py +414 -0
- mcp_hangar/application/event_handlers/__init__.py +50 -0
- mcp_hangar/application/event_handlers/alert_handler.py +191 -0
- mcp_hangar/application/event_handlers/audit_handler.py +203 -0
- mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
- mcp_hangar/application/event_handlers/logging_handler.py +69 -0
- mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
- mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
- mcp_hangar/application/event_handlers/security_handler.py +604 -0
- mcp_hangar/application/mcp/tooling.py +158 -0
- mcp_hangar/application/ports/__init__.py +9 -0
- mcp_hangar/application/ports/observability.py +237 -0
- mcp_hangar/application/queries/__init__.py +52 -0
- mcp_hangar/application/queries/auth_handlers.py +237 -0
- mcp_hangar/application/queries/auth_queries.py +118 -0
- mcp_hangar/application/queries/handlers.py +227 -0
- mcp_hangar/application/read_models/__init__.py +11 -0
- mcp_hangar/application/read_models/provider_views.py +139 -0
- mcp_hangar/application/sagas/__init__.py +11 -0
- mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
- mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
- mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
- mcp_hangar/application/services/__init__.py +9 -0
- mcp_hangar/application/services/provider_service.py +208 -0
- mcp_hangar/application/services/traced_provider_service.py +211 -0
- mcp_hangar/bootstrap/runtime.py +328 -0
- mcp_hangar/context.py +178 -0
- mcp_hangar/domain/__init__.py +117 -0
- mcp_hangar/domain/contracts/__init__.py +57 -0
- mcp_hangar/domain/contracts/authentication.py +225 -0
- mcp_hangar/domain/contracts/authorization.py +229 -0
- mcp_hangar/domain/contracts/event_store.py +178 -0
- mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
- mcp_hangar/domain/contracts/persistence.py +383 -0
- mcp_hangar/domain/contracts/provider_runtime.py +146 -0
- mcp_hangar/domain/discovery/__init__.py +20 -0
- mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
- mcp_hangar/domain/discovery/discovered_provider.py +185 -0
- mcp_hangar/domain/discovery/discovery_service.py +412 -0
- mcp_hangar/domain/discovery/discovery_source.py +192 -0
- mcp_hangar/domain/events.py +433 -0
- mcp_hangar/domain/exceptions.py +525 -0
- mcp_hangar/domain/model/__init__.py +70 -0
- mcp_hangar/domain/model/aggregate.py +58 -0
- mcp_hangar/domain/model/circuit_breaker.py +152 -0
- mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
- mcp_hangar/domain/model/event_sourced_provider.py +423 -0
- mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
- mcp_hangar/domain/model/health_tracker.py +183 -0
- mcp_hangar/domain/model/load_balancer.py +185 -0
- mcp_hangar/domain/model/provider.py +810 -0
- mcp_hangar/domain/model/provider_group.py +656 -0
- mcp_hangar/domain/model/tool_catalog.py +105 -0
- mcp_hangar/domain/policies/__init__.py +19 -0
- mcp_hangar/domain/policies/provider_health.py +187 -0
- mcp_hangar/domain/repository.py +249 -0
- mcp_hangar/domain/security/__init__.py +85 -0
- mcp_hangar/domain/security/input_validator.py +710 -0
- mcp_hangar/domain/security/rate_limiter.py +387 -0
- mcp_hangar/domain/security/roles.py +237 -0
- mcp_hangar/domain/security/sanitizer.py +387 -0
- mcp_hangar/domain/security/secrets.py +501 -0
- mcp_hangar/domain/services/__init__.py +20 -0
- mcp_hangar/domain/services/audit_service.py +376 -0
- mcp_hangar/domain/services/image_builder.py +328 -0
- mcp_hangar/domain/services/provider_launcher.py +1046 -0
- mcp_hangar/domain/value_objects.py +1138 -0
- mcp_hangar/errors.py +818 -0
- mcp_hangar/fastmcp_server.py +1105 -0
- mcp_hangar/gc.py +134 -0
- mcp_hangar/infrastructure/__init__.py +79 -0
- mcp_hangar/infrastructure/async_executor.py +133 -0
- mcp_hangar/infrastructure/auth/__init__.py +37 -0
- mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
- mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
- mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
- mcp_hangar/infrastructure/auth/middleware.py +340 -0
- mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
- mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
- mcp_hangar/infrastructure/auth/projections.py +366 -0
- mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
- mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
- mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
- mcp_hangar/infrastructure/command_bus.py +112 -0
- mcp_hangar/infrastructure/discovery/__init__.py +110 -0
- mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
- mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
- mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
- mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
- mcp_hangar/infrastructure/event_bus.py +260 -0
- mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
- mcp_hangar/infrastructure/event_store.py +396 -0
- mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
- mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
- mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
- mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
- mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
- mcp_hangar/infrastructure/metrics_publisher.py +36 -0
- mcp_hangar/infrastructure/observability/__init__.py +10 -0
- mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
- mcp_hangar/infrastructure/persistence/__init__.py +33 -0
- mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
- mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
- mcp_hangar/infrastructure/persistence/database.py +333 -0
- mcp_hangar/infrastructure/persistence/database_common.py +330 -0
- mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
- mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
- mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
- mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
- mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
- mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
- mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
- mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
- mcp_hangar/infrastructure/query_bus.py +153 -0
- mcp_hangar/infrastructure/saga_manager.py +401 -0
- mcp_hangar/logging_config.py +209 -0
- mcp_hangar/metrics.py +1007 -0
- mcp_hangar/models.py +31 -0
- mcp_hangar/observability/__init__.py +54 -0
- mcp_hangar/observability/health.py +487 -0
- mcp_hangar/observability/metrics.py +319 -0
- mcp_hangar/observability/tracing.py +433 -0
- mcp_hangar/progress.py +542 -0
- mcp_hangar/retry.py +613 -0
- mcp_hangar/server/__init__.py +120 -0
- mcp_hangar/server/__main__.py +6 -0
- mcp_hangar/server/auth_bootstrap.py +340 -0
- mcp_hangar/server/auth_cli.py +335 -0
- mcp_hangar/server/auth_config.py +305 -0
- mcp_hangar/server/bootstrap.py +735 -0
- mcp_hangar/server/cli.py +161 -0
- mcp_hangar/server/config.py +224 -0
- mcp_hangar/server/context.py +215 -0
- mcp_hangar/server/http_auth_middleware.py +165 -0
- mcp_hangar/server/lifecycle.py +467 -0
- mcp_hangar/server/state.py +117 -0
- mcp_hangar/server/tools/__init__.py +16 -0
- mcp_hangar/server/tools/discovery.py +186 -0
- mcp_hangar/server/tools/groups.py +75 -0
- mcp_hangar/server/tools/health.py +301 -0
- mcp_hangar/server/tools/provider.py +939 -0
- mcp_hangar/server/tools/registry.py +320 -0
- mcp_hangar/server/validation.py +113 -0
- mcp_hangar/stdio_client.py +229 -0
- mcp_hangar-0.2.0.dist-info/METADATA +347 -0
- mcp_hangar-0.2.0.dist-info/RECORD +160 -0
- mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
- mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
- mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
mcp_hangar/metrics.py
ADDED
|
@@ -0,0 +1,1007 @@
|
|
|
1
|
+
"""Prometheus metrics for MCP Registry.
|
|
2
|
+
|
|
3
|
+
Production-grade metrics following Prometheus/OpenMetrics best practices:
|
|
4
|
+
- Consistent naming: mcp_registry_<subsystem>_<metric>_<unit>
|
|
5
|
+
- Proper label cardinality control
|
|
6
|
+
- Thread-safe implementations
|
|
7
|
+
- Standard histogram buckets for different use cases
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from collections import defaultdict
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from functools import wraps
|
|
13
|
+
import platform
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
from typing import Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
# =============================================================================
|
|
19
|
+
# Core Metric Types
|
|
20
|
+
# =============================================================================
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class MetricSample:
    """One observed metric value paired with the labels that identify it.

    The collectors in this module return these from their ``collect()``
    methods.
    """

    # Numeric reading carried by this sample.
    value: float
    # Label name -> label value. Each instance gets its own fresh dict
    # when no labels are supplied.
    labels: Dict[str, str] = field(default_factory=dict)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Counter:
|
|
32
|
+
"""
|
|
33
|
+
Prometheus counter - monotonically increasing value.
|
|
34
|
+
|
|
35
|
+
Use for: requests, errors, completions, bytes transferred.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(self, name: str, description: str, labels: List[str] = None):
|
|
39
|
+
self.name = name
|
|
40
|
+
self.description = description
|
|
41
|
+
self.label_names = labels or []
|
|
42
|
+
self._values: Dict[tuple, float] = defaultdict(float)
|
|
43
|
+
self._created: Dict[tuple, float] = {}
|
|
44
|
+
self._lock = threading.Lock()
|
|
45
|
+
|
|
46
|
+
def inc(self, value: float = 1.0, **labels) -> None:
|
|
47
|
+
"""Increment counter by value (must be >= 0)."""
|
|
48
|
+
if value < 0:
|
|
49
|
+
raise ValueError("Counter can only increase")
|
|
50
|
+
key = self._make_key(labels)
|
|
51
|
+
with self._lock:
|
|
52
|
+
if key not in self._created:
|
|
53
|
+
self._created[key] = time.time()
|
|
54
|
+
self._values[key] += value
|
|
55
|
+
|
|
56
|
+
def _make_key(self, labels: dict) -> tuple:
|
|
57
|
+
return tuple(labels.get(label_name, "") for label_name in self.label_names)
|
|
58
|
+
|
|
59
|
+
def labels(self, **label_values) -> "_LabeledCounter":
|
|
60
|
+
"""Return counter with preset labels for reuse."""
|
|
61
|
+
return _LabeledCounter(self, label_values)
|
|
62
|
+
|
|
63
|
+
def collect(self) -> List[MetricSample]:
|
|
64
|
+
"""Collect all samples."""
|
|
65
|
+
with self._lock:
|
|
66
|
+
return [MetricSample(value=v, labels=dict(zip(self.label_names, k))) for k, v in self._values.items()]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Gauge:
|
|
70
|
+
"""
|
|
71
|
+
Prometheus gauge - value that can go up and down.
|
|
72
|
+
|
|
73
|
+
Use for: in-progress operations, current state, temperature, queue size.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def __init__(self, name: str, description: str, labels: List[str] = None):
|
|
77
|
+
self.name = name
|
|
78
|
+
self.description = description
|
|
79
|
+
self.label_names = labels or []
|
|
80
|
+
self._values: Dict[tuple, float] = {}
|
|
81
|
+
self._lock = threading.Lock()
|
|
82
|
+
|
|
83
|
+
def set(self, value: float, **labels) -> None:
|
|
84
|
+
"""Set gauge to value."""
|
|
85
|
+
key = self._make_key(labels)
|
|
86
|
+
with self._lock:
|
|
87
|
+
self._values[key] = value
|
|
88
|
+
|
|
89
|
+
def inc(self, value: float = 1.0, **labels) -> None:
|
|
90
|
+
"""Increment gauge."""
|
|
91
|
+
key = self._make_key(labels)
|
|
92
|
+
with self._lock:
|
|
93
|
+
self._values[key] = self._values.get(key, 0) + value
|
|
94
|
+
|
|
95
|
+
def dec(self, value: float = 1.0, **labels) -> None:
|
|
96
|
+
"""Decrement gauge."""
|
|
97
|
+
key = self._make_key(labels)
|
|
98
|
+
with self._lock:
|
|
99
|
+
self._values[key] = self._values.get(key, 0) - value
|
|
100
|
+
|
|
101
|
+
def set_to_current_time(self, **labels) -> None:
|
|
102
|
+
"""Set gauge to current Unix timestamp."""
|
|
103
|
+
self.set(time.time(), **labels)
|
|
104
|
+
|
|
105
|
+
def _make_key(self, labels: dict) -> tuple:
|
|
106
|
+
return tuple(labels.get(label_name, "") for label_name in self.label_names)
|
|
107
|
+
|
|
108
|
+
def labels(self, **label_values) -> "_LabeledGauge":
|
|
109
|
+
"""Return gauge with preset labels."""
|
|
110
|
+
return _LabeledGauge(self, label_values)
|
|
111
|
+
|
|
112
|
+
def collect(self) -> List[MetricSample]:
|
|
113
|
+
"""Collect all samples."""
|
|
114
|
+
with self._lock:
|
|
115
|
+
return [MetricSample(value=v, labels=dict(zip(self.label_names, k))) for k, v in self._values.items()]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class Histogram:
    """
    Prometheus histogram - distribution of values in buckets.

    Use for: request latencies, response sizes.

    Internally each observation is counted once, in the smallest bucket
    whose upper bound is >= the value; ``collect()`` turns those per-bucket
    counts into the cumulative ``le`` series.
    """

    # Standard bucket presets
    DEFAULT_BUCKETS = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
    LATENCY_BUCKETS = (
        0.001,
        0.0025,
        0.005,
        0.01,
        0.025,
        0.05,
        0.1,
        0.25,
        0.5,
        1.0,
        2.5,
        5.0,
        10.0,
        30.0,
    )
    SIZE_BUCKETS = (100, 1000, 10000, 100000, 1000000, 10000000)

    def __init__(
        self,
        name: str,
        description: str,
        labels: Optional[List[str]] = None,
        buckets: Optional[tuple] = None,
    ):
        """Create a histogram.

        Args:
            name: Metric name.
            description: Help text.
            labels: Declared label names (default: none).
            buckets: Upper bounds; falls back to ``DEFAULT_BUCKETS`` when
                omitted or empty. A ``+Inf`` bound is always appended.
        """
        self.name = name
        self.description = description
        self.label_names = labels or []
        # De-duplicate the bounds and drop any caller-supplied infinity
        # before appending our own. A repeated bound would be visited twice
        # in collect(), double-counting its observations and emitting the
        # same `le` series twice.
        infinity = float("inf")
        finite_bounds = {bound for bound in (buckets or self.DEFAULT_BUCKETS) if bound != infinity}
        self.buckets = tuple(sorted(finite_bounds)) + (infinity,)
        self._lock = threading.Lock()
        # Non-cumulative observation count per bucket, per label-value tuple.
        self._buckets: Dict[tuple, Dict[float, int]] = defaultdict(lambda: {b: 0 for b in self.buckets})
        self._sums: Dict[tuple, float] = defaultdict(float)
        self._counts: Dict[tuple, int] = defaultdict(int)

    def observe(self, value: float, **labels) -> None:
        """Record an observation."""
        key = self._make_key(labels)
        with self._lock:
            self._sums[key] += value
            self._counts[key] += 1
            # Buckets are sorted, so the first bound that fits is the
            # smallest one; count the observation there only.
            for bucket in self.buckets:
                if value <= bucket:
                    self._buckets[key][bucket] += 1
                    break

    def _make_key(self, labels: dict) -> tuple:
        """Build the storage key; missing declared labels become ""."""
        return tuple(labels.get(label_name, "") for label_name in self.label_names)

    def labels(self, **label_values) -> "_LabeledHistogram":
        """Return histogram with preset labels."""
        return _LabeledHistogram(self, label_values)

    def time(self) -> "_Timer":
        """Context manager for timing code blocks (records with no labels)."""
        return _Timer(self, {})

    def collect(self) -> tuple:
        """Collect buckets, sum, and count samples.

        Returns:
            A ``(buckets, sums, counts)`` tuple of sample lists. Bucket
            samples are cumulative and carry an extra ``le`` label, with the
            infinite bound rendered as ``"+Inf"``.
        """
        buckets = []
        sums = []
        counts = []

        with self._lock:
            for key, bucket_values in self._buckets.items():
                base_labels = dict(zip(self.label_names, key))
                cumulative = 0
                for bucket in self.buckets:
                    cumulative += bucket_values.get(bucket, 0)
                    le = "+Inf" if bucket == float("inf") else str(bucket)
                    buckets.append(MetricSample(value=cumulative, labels={**base_labels, "le": le}))
                sums.append(MetricSample(value=self._sums[key], labels=base_labels))
                counts.append(MetricSample(value=self._counts[key], labels=base_labels))

        return buckets, sums, counts
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class Summary:
    """
    Prometheus summary - reduced implementation.

    Only a running sum and an observation count are kept per label
    combination; no quantiles are computed.
    Use for: streaming data where quantiles aren't critical.
    """

    def __init__(self, name: str, description: str, labels: List[str] = None):
        self.name = name
        self.description = description
        self.label_names = labels or []
        self._lock = threading.Lock()
        # Sum of observed values per label-value tuple.
        self._sums: Dict[tuple, float] = defaultdict(float)
        # Number of observations per label-value tuple.
        self._counts: Dict[tuple, int] = defaultdict(int)

    def _make_key(self, labels: dict) -> tuple:
        """Build the storage key; missing declared labels become ""."""
        return tuple(labels.get(declared, "") for declared in self.label_names)

    def observe(self, value: float, **labels) -> None:
        """Add ``value`` to the selected series and bump its count by one."""
        series = self._make_key(labels)
        with self._lock:
            self._counts[series] += 1
            self._sums[series] += value

    def collect(self) -> tuple:
        """Return ``(sums, counts)``: two parallel lists of samples."""
        sums = []
        counts = []
        with self._lock:
            for series, total in self._sums.items():
                series_labels = dict(zip(self.label_names, series))
                sums.append(MetricSample(value=total, labels=series_labels))
                counts.append(MetricSample(value=self._counts[series], labels=series_labels))
        return sums, counts
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class Info:
|
|
243
|
+
"""
|
|
244
|
+
Prometheus info metric - static key-value pairs.
|
|
245
|
+
|
|
246
|
+
Use for: version info, build metadata, configuration.
|
|
247
|
+
"""
|
|
248
|
+
|
|
249
|
+
def __init__(self, name: str, description: str):
|
|
250
|
+
self.name = name
|
|
251
|
+
self.description = description
|
|
252
|
+
self._labels: Dict[str, str] = {}
|
|
253
|
+
self._lock = threading.Lock()
|
|
254
|
+
|
|
255
|
+
def info(self, **labels) -> None:
|
|
256
|
+
"""Set info labels."""
|
|
257
|
+
with self._lock:
|
|
258
|
+
self._labels = {k: str(v) for k, v in labels.items()}
|
|
259
|
+
|
|
260
|
+
def collect(self) -> List[MetricSample]:
|
|
261
|
+
"""Collect info sample."""
|
|
262
|
+
with self._lock:
|
|
263
|
+
if self._labels:
|
|
264
|
+
return [MetricSample(value=1.0, labels=self._labels)]
|
|
265
|
+
return []
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
# =============================================================================
|
|
269
|
+
# Labeled Metric Helpers
|
|
270
|
+
# =============================================================================
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class _LabeledCounter:
    """View of a ``Counter`` with its label values fixed in advance."""

    def __init__(self, counter: Counter, labels: dict):
        # Label values forwarded on every inc() call.
        self._labels = labels
        # Counter that actually stores the totals.
        self._counter = counter

    def inc(self, value: float = 1.0) -> None:
        """Increment the wrapped counter using the preset labels."""
        preset = self._labels
        self._counter.inc(value, **preset)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class _LabeledGauge:
    """View of a ``Gauge`` with its label values fixed in advance."""

    def __init__(self, gauge: Gauge, labels: dict):
        # Label values forwarded on every call.
        self._labels = labels
        # Gauge that actually stores the values.
        self._gauge = gauge

    def set(self, value: float) -> None:
        """Set the wrapped gauge using the preset labels."""
        preset = self._labels
        self._gauge.set(value, **preset)

    def inc(self, value: float = 1.0) -> None:
        """Increment the wrapped gauge using the preset labels."""
        preset = self._labels
        self._gauge.inc(value, **preset)

    def dec(self, value: float = 1.0) -> None:
        """Decrement the wrapped gauge using the preset labels."""
        preset = self._labels
        self._gauge.dec(value, **preset)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class _LabeledHistogram:
    """View of a ``Histogram`` with its label values fixed in advance."""

    def __init__(self, histogram: Histogram, labels: dict):
        # Label values forwarded on every call.
        self._labels = labels
        # Histogram that actually stores the observations.
        self._histogram = histogram

    def observe(self, value: float) -> None:
        """Record ``value`` on the wrapped histogram using the preset labels."""
        preset = self._labels
        self._histogram.observe(value, **preset)

    def time(self) -> "_Timer":
        """Return a timing context manager bound to the preset labels."""
        timer = _Timer(self._histogram, self._labels)
        return timer
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
class _Timer:
    """Context manager that records the elapsed time of its block in a histogram."""

    def __init__(self, histogram: Histogram, labels: dict):
        self._histogram = histogram
        self._labels = labels
        # perf_counter() reading taken in __enter__; None until entered.
        self._start: Optional[float] = None

    def __enter__(self) -> "_Timer":
        """Capture the start reading and return this timer."""
        self._start = time.perf_counter()
        return self

    def __exit__(self, *args) -> None:
        """Observe the elapsed seconds; returns None, so exceptions propagate."""
        elapsed = time.perf_counter() - self._start
        self._histogram.observe(elapsed, **self._labels)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# =============================================================================
|
|
333
|
+
# Metrics Registry
|
|
334
|
+
# =============================================================================
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class CollectorRegistry:
    """Central registry for all metrics with Prometheus exposition format output."""

    def __init__(self):
        # Collector objects keyed by metric name.
        self._collectors: Dict[str, object] = {}
        self._lock = threading.Lock()

    def register(self, collector) -> None:
        """Register a metric collector.

        Raises:
            ValueError: if a collector with the same ``name`` is already registered.
        """
        with self._lock:
            if collector.name in self._collectors:
                raise ValueError(f"Metric {collector.name} already registered")
            self._collectors[collector.name] = collector

    def unregister(self, name: str) -> None:
        """Unregister a metric; unknown names are ignored."""
        with self._lock:
            self._collectors.pop(name, None)

    def get(self, name: str):
        """Get collector by name, or None if it is not registered."""
        # Take the lock like every other accessor of _collectors.
        with self._lock:
            return self._collectors.get(name)

    def collect(self) -> str:
        """Generate Prometheus exposition format output.

        Each metric's lines are followed by an empty line; an empty registry
        yields an empty string.
        """
        lines = []

        # Snapshot under the lock, format outside it so collectors' own
        # locks are never taken while the registry lock is held.
        with self._lock:
            collectors = list(self._collectors.items())

        for name, collector in collectors:
            lines.extend(self._format_metric(name, collector))
            lines.append("")

        return "\n".join(lines)

    @staticmethod
    def _escape_help(text) -> str:
        """Escape backslashes and line feeds so HELP text stays on one line."""
        return str(text).replace("\\", "\\\\").replace("\n", "\\n")

    def _format_metric(self, name: str, collector) -> List[str]:
        """Format a single metric in Prometheus format.

        Emits a HELP line, then a TYPE line and samples that depend on the
        collector class. Unknown collector classes get only the HELP line.
        """
        # NOTE(review): HELP/TYPE use `name`, while counter samples use
        # `name_total` and Info uses `name_info` for TYPE and samples.
        # Left as-is; confirm this is what consumers expect.
        lines = [f"# HELP {name} {self._escape_help(collector.description)}"]

        if isinstance(collector, Counter):
            lines.append(f"# TYPE {name} counter")
            for sample in collector.collect():
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_total{labels} {sample.value}")

        elif isinstance(collector, Gauge):
            lines.append(f"# TYPE {name} gauge")
            for sample in collector.collect():
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}{labels} {sample.value}")

        elif isinstance(collector, Histogram):
            lines.append(f"# TYPE {name} histogram")
            buckets, sums, counts = collector.collect()
            for sample in buckets:
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_bucket{labels} {int(sample.value)}")
            for sample in sums:
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_sum{labels} {sample.value}")
            for sample in counts:
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_count{labels} {int(sample.value)}")

        elif isinstance(collector, Summary):
            lines.append(f"# TYPE {name} summary")
            sums, counts = collector.collect()
            for sample in sums:
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_sum{labels} {sample.value}")
            for sample in counts:
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_count{labels} {int(sample.value)}")

        elif isinstance(collector, Info):
            lines.append(f"# TYPE {name}_info gauge")
            for sample in collector.collect():
                labels = self._format_labels(sample.labels)
                lines.append(f"{name}_info{labels} 1")

        return lines

    def _format_labels(self, labels: Dict[str, str]) -> str:
        """Format labels in Prometheus format.

        Returns "" for an empty mapping; otherwise ``{k="v",...}`` with
        keys sorted and backslash, double quote and line feed escaped in
        values. ``None`` values are rendered as an empty string.
        """
        if not labels:
            return ""
        escaped = []
        for k, v in sorted(labels.items()):
            if v is None:
                v = ""
            v = str(v).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
            escaped.append(f'{k}="{v}"')
        return "{" + ",".join(escaped) + "}"
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
# =============================================================================
|
|
436
|
+
# Global Registry
|
|
437
|
+
# =============================================================================
|
|
438
|
+
|
|
439
|
+
# Module-level CollectorRegistry instance, created when this module is imported.
REGISTRY = CollectorRegistry()
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
# =============================================================================
|
|
443
|
+
# MCP Registry Metrics - Following Best Practices
|
|
444
|
+
# =============================================================================
|
|
445
|
+
|
|
446
|
+
# -----------------------------------------------------------------------------
|
|
447
|
+
# Build/Version Info
|
|
448
|
+
# -----------------------------------------------------------------------------
|
|
449
|
+
|
|
450
|
+
# Info metric; its labels are supplied later through BUILD_INFO.info(...).
BUILD_INFO = Info(
    "mcp_registry_build",
    "Build and version information for MCP Registry",
)
|
|
454
|
+
|
|
455
|
+
# -----------------------------------------------------------------------------
|
|
456
|
+
# Process Metrics
|
|
457
|
+
# -----------------------------------------------------------------------------
|
|
458
|
+
|
|
459
|
+
# Unlabelled gauge intended to hold a Unix timestamp (see description).
PROCESS_START_TIME = Gauge(
    "mcp_registry_process_start_time_seconds",
    "Unix timestamp of process start time",
)
|
|
463
|
+
|
|
464
|
+
# -----------------------------------------------------------------------------
|
|
465
|
+
# Provider Lifecycle Metrics
|
|
466
|
+
# -----------------------------------------------------------------------------
|
|
467
|
+
|
|
468
|
+
# Per-provider metadata carried in labels.
PROVIDER_INFO = Gauge(
    "mcp_registry_provider_info",
    "Provider configuration info (always 1, labels contain metadata)",
    labels=["provider", "mode"],
)

# Numeric encoding of the provider state; the mapping is in the description.
PROVIDER_STATE_CURRENT = Gauge(
    "mcp_registry_provider_state",
    "Current provider state (0=cold, 1=initializing, 2=ready, 3=degraded, 4=dead)",
    labels=["provider"],
)

PROVIDER_UP = Gauge(
    "mcp_registry_provider_up",
    "Whether provider is up and ready (1=up, 0=down)",
    labels=["provider"],
)

PROVIDER_INITIALIZED = Gauge(
    "mcp_registry_provider_initialized",
    "Whether provider has been initialized at least once (1=yes, 0=no/cold)",
    labels=["provider"],
)

PROVIDER_LAST_STATE_CHANGE_SECONDS = Gauge(
    "mcp_registry_provider_last_state_change_timestamp_seconds",
    "Unix timestamp of last provider state change",
    labels=["provider"],
)

PROVIDER_STARTS_TOTAL = Counter(
    "mcp_registry_provider_starts",
    "Total number of provider start attempts",
    labels=["provider", "result"],  # result: success, failure
)

PROVIDER_STOPS_TOTAL = Counter(
    "mcp_registry_provider_stops",
    "Total number of provider stops",
    labels=["provider", "reason"],  # reason: idle, manual, error, gc
)

# Custom buckets spanning 0.1 s to 60 s.
PROVIDER_COLD_START_SECONDS = Histogram(
    "mcp_registry_provider_cold_start_seconds",
    "Time from cold start to ready state (critical UX metric)",
    labels=["provider", "mode"],
    buckets=(0.1, 0.25, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 30.0, 60.0),
)

PROVIDER_COLD_START_IN_PROGRESS = Gauge(
    "mcp_registry_provider_cold_start_in_progress",
    "Number of providers currently in cold start",
    labels=["provider"],
)
|
|
522
|
+
|
|
523
|
+
# -----------------------------------------------------------------------------
|
|
524
|
+
# Tool Invocation Metrics (RED method: Rate, Errors, Duration)
|
|
525
|
+
# -----------------------------------------------------------------------------
|
|
526
|
+
|
|
527
|
+
TOOL_CALLS_TOTAL = Counter(
    "mcp_registry_tool_calls",
    "Total number of tool calls",
    labels=["provider", "tool", "status"],  # status: success, error
)

# Uses the shared latency bucket preset defined on Histogram.
TOOL_CALL_DURATION_SECONDS = Histogram(
    "mcp_registry_tool_call_duration_seconds",
    "Duration of tool calls in seconds",
    labels=["provider", "tool"],
    buckets=Histogram.LATENCY_BUCKETS,
)

TOOL_CALL_ERRORS_TOTAL = Counter(
    "mcp_registry_tool_call_errors",
    "Total number of tool call errors by error type",
    labels=["provider", "tool", "error_type"],
)
|
|
545
|
+
|
|
546
|
+
# -----------------------------------------------------------------------------
|
|
547
|
+
# Health Check Metrics
|
|
548
|
+
# -----------------------------------------------------------------------------
|
|
549
|
+
|
|
550
|
+
HEALTH_CHECK_TOTAL = Counter(
    "mcp_registry_health_checks",
    "Total number of health check executions",
    labels=["provider", "result"],  # result: cold, healthy, unhealthy
)

# Custom buckets spanning 1 ms to 1 s.
HEALTH_CHECK_DURATION_SECONDS = Histogram(
    "mcp_registry_health_check_duration_seconds",
    "Duration of health checks in seconds",
    labels=["provider"],
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
)

HEALTH_CHECK_CONSECUTIVE_FAILURES = Gauge(
    "mcp_registry_health_check_consecutive_failures",
    "Number of consecutive health check failures",
    labels=["provider"],
)
|
|
568
|
+
|
|
569
|
+
# -----------------------------------------------------------------------------
|
|
570
|
+
# Connection Pool Metrics
|
|
571
|
+
# -----------------------------------------------------------------------------
|
|
572
|
+
|
|
573
|
+
# Connections currently open to each provider.
CONNECTIONS_ACTIVE = Gauge(
    name="mcp_registry_connections_active",
    description="Number of active connections to providers",
    labels=["provider"],
)

# Connections established over the process lifetime, split by "result".
CONNECTIONS_TOTAL = Counter(
    name="mcp_registry_connections",
    description="Total number of connections established",
    labels=["provider", "result"],
)

# How long provider connections last, in seconds; bucket boundaries run
# from 1 second to 1 hour.
CONNECTION_DURATION_SECONDS = Histogram(
    name="mcp_registry_connection_duration_seconds",
    description="Duration of provider connections in seconds",
    labels=["provider"],
    buckets=(1, 5, 10, 30, 60, 300, 600, 1800, 3600),
)
|
|
591
|
+
|
|
592
|
+
# -----------------------------------------------------------------------------
|
|
593
|
+
# Message Metrics
|
|
594
|
+
# -----------------------------------------------------------------------------
|
|
595
|
+
|
|
596
|
+
# Outgoing JSON-RPC messages, per provider and JSON-RPC method.
MESSAGES_SENT_TOTAL = Counter(
    name="mcp_registry_messages_sent",
    description="Total number of JSON-RPC messages sent",
    labels=["provider", "method"],
)

# Incoming JSON-RPC messages.
# The "type" label is one of: response, notification, error.
MESSAGES_RECEIVED_TOTAL = Counter(
    name="mcp_registry_messages_received",
    description="Total number of JSON-RPC messages received",
    labels=["provider", "type"],
)

# JSON-RPC message sizes in bytes, bucketed with Histogram.SIZE_BUCKETS.
# The "direction" label is either "sent" or "received".
MESSAGE_SIZE_BYTES = Histogram(
    name="mcp_registry_message_size_bytes",
    description="Size of JSON-RPC messages in bytes",
    labels=["provider", "direction"],
    buckets=Histogram.SIZE_BUCKETS,
)
|
|
614
|
+
|
|
615
|
+
# -----------------------------------------------------------------------------
|
|
616
|
+
# GC (Garbage Collection) Metrics
|
|
617
|
+
# -----------------------------------------------------------------------------
|
|
618
|
+
|
|
619
|
+
# Number of garbage collection cycles that have run (no labels).
GC_CYCLES_TOTAL = Counter(
    name="mcp_registry_gc_cycles",
    description="Total number of garbage collection cycles",
)

# Duration of each GC cycle in seconds; bucket boundaries run from 1 ms
# to 0.5 s.
GC_CYCLE_DURATION_SECONDS = Histogram(
    name="mcp_registry_gc_cycle_duration_seconds",
    description="Duration of garbage collection cycles in seconds",
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5),
)

# Providers removed by GC.
# The "reason" label is one of: idle, dead, error.
GC_PROVIDERS_COLLECTED_TOTAL = Counter(
    name="mcp_registry_gc_providers_collected",
    description="Total number of providers collected by GC",
    labels=["reason"],
)
|
|
635
|
+
|
|
636
|
+
# -----------------------------------------------------------------------------
|
|
637
|
+
# Error Metrics
|
|
638
|
+
# -----------------------------------------------------------------------------
|
|
639
|
+
|
|
640
|
+
# Generic error counter.
# The "component" label is one of: provider, tool, health, gc, server.
ERRORS_TOTAL = Counter(
    name="mcp_registry_errors",
    description="Total number of errors by type and component",
    labels=["component", "error_type"],
)
|
|
645
|
+
|
|
646
|
+
# -----------------------------------------------------------------------------
|
|
647
|
+
# Rate Limiter Metrics
|
|
648
|
+
# -----------------------------------------------------------------------------
|
|
649
|
+
|
|
650
|
+
# Requests that ran into a rate limit, counted per endpoint.
RATE_LIMIT_HITS_TOTAL = Counter(
    name="mcp_registry_rate_limit_hits",
    description="Total number of requests that hit rate limits",
    labels=["endpoint"],
)
|
|
655
|
+
|
|
656
|
+
# -----------------------------------------------------------------------------
|
|
657
|
+
# Discovery Metrics
|
|
658
|
+
# -----------------------------------------------------------------------------
|
|
659
|
+
|
|
660
|
+
# Configured discovery sources, keyed by source type and discovery mode.
DISCOVERY_SOURCES_TOTAL = Gauge(
    name="mcp_registry_discovery_sources",
    description="Number of configured discovery sources",
    labels=["source_type", "mode"],
)

# Health flag per discovery source type: 1 means healthy, 0 unhealthy.
DISCOVERY_SOURCES_HEALTHY = Gauge(
    name="mcp_registry_discovery_sources_healthy",
    description="Whether discovery source is healthy (1=healthy, 0=unhealthy)",
    labels=["source_type"],
)

# Provider counts per source type.
# The "status" label is one of: discovered, registered, quarantined.
DISCOVERY_PROVIDERS_TOTAL = Gauge(
    name="mcp_registry_discovery_providers",
    description="Number of discovered providers",
    labels=["source_type", "status"],
)

# Discovery cycles executed per source type.
DISCOVERY_CYCLES_TOTAL = Counter(
    name="mcp_registry_discovery_cycles",
    description="Total number of discovery cycles executed",
    labels=["source_type"],
)

# Discovery cycle duration in seconds; bucket boundaries run from 10 ms
# to 10 s.
DISCOVERY_CYCLE_DURATION_SECONDS = Histogram(
    name="mcp_registry_discovery_cycle_duration_seconds",
    description="Duration of discovery cycles in seconds",
    labels=["source_type"],
    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)

# Providers registered as a result of discovery.
DISCOVERY_REGISTRATIONS_TOTAL = Counter(
    name="mcp_registry_discovery_registrations",
    description="Total provider registrations from discovery",
    labels=["source_type"],
)

# Providers deregistered by discovery.
# The "reason" label is one of: ttl_expired, source_removed, manual.
DISCOVERY_DEREGISTRATIONS_TOTAL = Counter(
    name="mcp_registry_discovery_deregistrations",
    description="Total provider deregistrations from discovery",
    labels=["source_type", "reason"],
)

# Conflicts encountered during discovery.
# The "conflict_type" label is one of: static_wins, source_priority.
DISCOVERY_CONFLICTS_TOTAL = Counter(
    name="mcp_registry_discovery_conflicts",
    description="Total discovery conflicts",
    labels=["conflict_type"],
)

# Providers placed in quarantine.
# The "reason" label is one of: health_check_failed, validation_failed,
# rate_limited.
DISCOVERY_QUARANTINE_TOTAL = Counter(
    name="mcp_registry_discovery_quarantine",
    description="Total providers quarantined",
    labels=["reason"],
)

# Errors raised while running discovery, per source type and error type.
DISCOVERY_ERRORS_TOTAL = Counter(
    name="mcp_registry_discovery_errors",
    description="Total discovery errors",
    labels=["source_type", "error_type"],
)

# Unix timestamp (seconds) of the most recent discovery cycle per source type.
DISCOVERY_LAST_CYCLE_TIMESTAMP = Gauge(
    name="mcp_registry_discovery_last_cycle_timestamp_seconds",
    description="Unix timestamp of last discovery cycle",
    labels=["source_type"],
)
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
# =============================================================================
|
|
729
|
+
# Register All Metrics
|
|
730
|
+
# =============================================================================
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _register_all_metrics():
    """Register every predefined module-level metric with ``REGISTRY``.

    Metrics are handed to ``REGISTRY.register`` one at a time, in the order
    in which they are listed below.
    """
    predefined = (
        BUILD_INFO,
        PROCESS_START_TIME,
        PROVIDER_INFO,
        PROVIDER_STATE_CURRENT,
        PROVIDER_UP,
        PROVIDER_INITIALIZED,
        PROVIDER_LAST_STATE_CHANGE_SECONDS,
        PROVIDER_STARTS_TOTAL,
        PROVIDER_STOPS_TOTAL,
        PROVIDER_COLD_START_SECONDS,
        PROVIDER_COLD_START_IN_PROGRESS,
        # Tool invocation metrics
        TOOL_CALLS_TOTAL,
        TOOL_CALL_DURATION_SECONDS,
        TOOL_CALL_ERRORS_TOTAL,
        # Health check metrics
        HEALTH_CHECK_TOTAL,
        HEALTH_CHECK_DURATION_SECONDS,
        HEALTH_CHECK_CONSECUTIVE_FAILURES,
        # Connection pool metrics
        CONNECTIONS_ACTIVE,
        CONNECTIONS_TOTAL,
        CONNECTION_DURATION_SECONDS,
        # Message metrics
        MESSAGES_SENT_TOTAL,
        MESSAGES_RECEIVED_TOTAL,
        MESSAGE_SIZE_BYTES,
        # GC metrics
        GC_CYCLES_TOTAL,
        GC_CYCLE_DURATION_SECONDS,
        GC_PROVIDERS_COLLECTED_TOTAL,
        # Error and rate limiter metrics
        ERRORS_TOTAL,
        RATE_LIMIT_HITS_TOTAL,
        # Discovery metrics
        DISCOVERY_SOURCES_TOTAL,
        DISCOVERY_SOURCES_HEALTHY,
        DISCOVERY_PROVIDERS_TOTAL,
        DISCOVERY_CYCLES_TOTAL,
        DISCOVERY_CYCLE_DURATION_SECONDS,
        DISCOVERY_REGISTRATIONS_TOTAL,
        DISCOVERY_DEREGISTRATIONS_TOTAL,
        DISCOVERY_CONFLICTS_TOTAL,
        DISCOVERY_QUARANTINE_TOTAL,
        DISCOVERY_ERRORS_TOTAL,
        DISCOVERY_LAST_CYCLE_TIMESTAMP,
    )
    for item in predefined:
        REGISTRY.register(item)


# Runs at import time so the metrics are registered as soon as the module loads.
_register_all_metrics()
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
# =============================================================================
|
|
785
|
+
# Convenience Functions
|
|
786
|
+
# =============================================================================
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def get_metrics() -> str:
    """Collect all registered metrics in Prometheus exposition format.

    Returns:
        The result of ``REGISTRY.collect()``.
    """
    exposition = REGISTRY.collect()
    return exposition
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def init_metrics(version: str = "1.0.0"):
    """Populate the build-info and process-start metrics at server startup.

    Args:
        version: Version string reported through ``BUILD_INFO``.
    """
    build_details = {
        "version": version,
        "python_version": platform.python_version(),
        "platform": platform.system(),
    }
    BUILD_INFO.info(**build_details)

    # Record the current wall-clock time as the process start time.
    started_at = time.time()
    PROCESS_START_TIME.set(started_at)
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def observe_tool_call(provider: str, tool: str, duration: float, success: bool, error_type: str = None):
    """Record the outcome and latency of one tool call.

    Args:
        provider: Provider ID used as the ``provider`` label.
        tool: Tool name used as the ``tool`` label.
        duration: Call duration in seconds.
        success: Whether the call succeeded; selects the ``status`` label.
        error_type: Error classification; only recorded for failed calls
            and only when a non-empty value is supplied.
    """
    call_labels = {"provider": provider, "tool": tool}

    if success:
        outcome = "success"
    else:
        outcome = "error"

    TOOL_CALLS_TOTAL.inc(**call_labels, status=outcome)
    TOOL_CALL_DURATION_SECONDS.observe(duration, **call_labels)

    # The per-type error counter is skipped for successes and for failures
    # that carry no error type.
    if success or not error_type:
        return
    TOOL_CALL_ERRORS_TOTAL.inc(**call_labels, error_type=error_type)
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
def observe_health_check(
    provider: str,
    duration: float,
    healthy: bool,
    is_cold: bool = False,
    consecutive_failures: int = 0,
):
    """Record the result, latency and failure streak of one health check.

    Args:
        provider: Provider ID
        duration: Health check duration in seconds
        healthy: Whether the check passed; ignored when ``is_cold`` is true
        is_cold: Whether provider is in cold state (not started yet)
        consecutive_failures: Number of consecutive failures
    """
    # A cold provider takes precedence over the healthy/unhealthy outcome.
    outcome = "cold" if is_cold else ("healthy" if healthy else "unhealthy")

    HEALTH_CHECK_TOTAL.inc(provider=provider, result=outcome)
    HEALTH_CHECK_DURATION_SECONDS.observe(duration, provider=provider)
    HEALTH_CHECK_CONSECUTIVE_FAILURES.set(consecutive_failures, provider=provider)
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def update_provider_state(provider: str, state: str, mode: str = "subprocess"):
    """Refresh all state-related gauges for a provider.

    Args:
        provider: Provider ID used as the ``provider`` label.
        state: State name; one of "cold", "initializing", "ready",
            "degraded" or "dead". Any other value is reported as 0.
        mode: Provider mode, reported through ``PROVIDER_INFO``.
    """
    numeric_codes = {
        "cold": 0,
        "initializing": 1,
        "ready": 2,
        "degraded": 3,
        "dead": 4,
    }
    state_code = numeric_codes.get(state, 0)
    is_up = int(state == "ready")
    is_initialized = int(state != "cold")

    PROVIDER_STATE_CURRENT.set(state_code, provider=provider)
    PROVIDER_UP.set(is_up, provider=provider)
    PROVIDER_INITIALIZED.set(is_initialized, provider=provider)
    PROVIDER_INFO.set(1, provider=provider, mode=mode)
    # Timestamp of this update, taken from the wall clock.
    PROVIDER_LAST_STATE_CHANGE_SECONDS.set(time.time(), provider=provider)
|
|
849
|
+
|
|
850
|
+
|
|
851
|
+
def record_provider_start(provider: str, success: bool):
    """Count a provider start attempt and flag the provider on success.

    Args:
        provider: Provider ID used as the ``provider`` label.
        success: Whether the start attempt succeeded.
    """
    if success:
        PROVIDER_STARTS_TOTAL.inc(provider=provider, result="success")
        # A successful start also marks the provider as initialized.
        PROVIDER_INITIALIZED.set(1, provider=provider)
    else:
        PROVIDER_STARTS_TOTAL.inc(provider=provider, result="failure")
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def record_provider_stop(provider: str, reason: str):
    """Count a provider stop.

    Args:
        provider: Provider ID used as the ``provider`` label.
        reason: Why the provider stopped; used as the ``reason`` label.
    """
    stop_labels = {"provider": provider, "reason": reason}
    PROVIDER_STOPS_TOTAL.inc(**stop_labels)
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def record_cold_start(provider: str, duration: float, mode: str = "subprocess"):
    """Observe a cold start duration - the critical UX metric.

    The value is the time from a user request until the provider reaches
    the ready state, so large values translate directly into a worse user
    experience.

    Args:
        provider: Provider ID
        duration: Time in seconds from start to ready
        mode: Provider mode (subprocess, docker, etc.)
    """
    cold_start_labels = {"provider": provider, "mode": mode}
    PROVIDER_COLD_START_SECONDS.observe(duration, **cold_start_labels)
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def cold_start_begin(provider: str):
    """Flag that a cold start is in progress for ``provider``.

    Args:
        provider: Provider ID used as the ``provider`` label.
    """
    in_progress = 1
    PROVIDER_COLD_START_IN_PROGRESS.set(in_progress, provider=provider)
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
def cold_start_end(provider: str):
    """Clear the cold-start-in-progress flag for ``provider``.

    Args:
        provider: Provider ID used as the ``provider`` label.
    """
    finished = 0
    PROVIDER_COLD_START_IN_PROGRESS.set(finished, provider=provider)
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
def record_gc_cycle(duration: float, collected: Dict[str, int] = None):
    """Record one garbage collection cycle.

    Args:
        duration: Duration of the cycle in seconds.
        collected: Optional mapping of collection reason to the number of
            providers collected for that reason. ``None`` or an empty
            mapping records no collected providers.
    """
    GC_CYCLES_TOTAL.inc()
    GC_CYCLE_DURATION_SECONDS.observe(duration)

    if not collected:
        return

    for gc_reason, how_many in collected.items():
        # The counter is bumped once per collected provider.
        for _unused in range(how_many):
            GC_PROVIDERS_COLLECTED_TOTAL.inc(reason=gc_reason)
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
def record_error(component: str, error_type: str):
    """Count an error for a component.

    Args:
        component: Component the error belongs to; used as the
            ``component`` label.
        error_type: Error classification; used as the ``error_type`` label.
    """
    error_labels = {"component": component, "error_type": error_type}
    ERRORS_TOTAL.inc(**error_labels)
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
# =============================================================================
|
|
904
|
+
# Discovery Metrics Functions
|
|
905
|
+
# =============================================================================
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def update_discovery_source(source_type: str, mode: str, is_healthy: bool, providers_count: int):
    """Refresh the gauges that describe one discovery source.

    Args:
        source_type: Type of source (filesystem, docker, kubernetes, entrypoint)
        mode: Discovery mode (additive, authoritative)
        is_healthy: Whether the source is healthy
        providers_count: Number of providers discovered by this source
    """
    if is_healthy:
        health_flag = 1
    else:
        health_flag = 0

    DISCOVERY_SOURCES_TOTAL.set(1, source_type=source_type, mode=mode)
    DISCOVERY_SOURCES_HEALTHY.set(health_flag, source_type=source_type)
    # The provider count is stored under the "discovered" status.
    DISCOVERY_PROVIDERS_TOTAL.set(providers_count, source_type=source_type, status="discovered")
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
def record_discovery_cycle(
    source_type: str,
    duration: float,
    discovered: int = 0,
    registered: int = 0,
    quarantined: int = 0,
):
    """Record one discovery cycle and the provider counts it produced.

    Args:
        source_type: Type of source
        duration: Duration of the cycle in seconds
        discovered: Number of providers discovered
        registered: Number of providers registered
        quarantined: Number of providers quarantined
    """
    DISCOVERY_CYCLES_TOTAL.inc(source_type=source_type)
    DISCOVERY_CYCLE_DURATION_SECONDS.observe(duration, source_type=source_type)
    # Wall-clock time at which this cycle was recorded.
    DISCOVERY_LAST_CYCLE_TIMESTAMP.set(time.time(), source_type=source_type)

    # Overwrite the per-status provider gauges with this cycle's counts.
    counts_by_status = (
        ("discovered", discovered),
        ("registered", registered),
        ("quarantined", quarantined),
    )
    for status_name, provider_count in counts_by_status:
        DISCOVERY_PROVIDERS_TOTAL.set(provider_count, source_type=source_type, status=status_name)
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
def record_discovery_registration(source_type: str):
    """Count one provider registration that came from discovery.

    Args:
        source_type: Type of source
    """
    registration_labels = {"source_type": source_type}
    DISCOVERY_REGISTRATIONS_TOTAL.inc(**registration_labels)
|
|
951
|
+
|
|
952
|
+
|
|
953
|
+
def record_discovery_deregistration(source_type: str, reason: str):
    """Count one provider deregistration that came from discovery.

    Args:
        source_type: Type of source
        reason: Reason for deregistration (ttl_expired, source_removed, manual)
    """
    deregistration_labels = {"source_type": source_type, "reason": reason}
    DISCOVERY_DEREGISTRATIONS_TOTAL.inc(**deregistration_labels)
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def record_discovery_conflict(conflict_type: str):
    """Count one discovery conflict.

    Args:
        conflict_type: Type of conflict (static_wins, source_priority)
    """
    conflict_labels = {"conflict_type": conflict_type}
    DISCOVERY_CONFLICTS_TOTAL.inc(**conflict_labels)
|
|
970
|
+
|
|
971
|
+
|
|
972
|
+
def record_discovery_quarantine(reason: str):
    """Count one provider being quarantined.

    Args:
        reason: Reason for quarantine (health_check_failed, validation_failed, rate_limited)
    """
    quarantine_labels = {"reason": reason}
    DISCOVERY_QUARANTINE_TOTAL.inc(**quarantine_labels)
|
|
979
|
+
|
|
980
|
+
|
|
981
|
+
def record_discovery_error(source_type: str, error_type: str):
    """Count one error raised during discovery.

    Args:
        source_type: Type of source
        error_type: Type of error
    """
    error_labels = {"source_type": source_type, "error_type": error_type}
    DISCOVERY_ERRORS_TOTAL.inc(**error_labels)
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
# =============================================================================
|
|
992
|
+
# Timing Decorator
|
|
993
|
+
# =============================================================================
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
def timed(histogram: Histogram, **labels):
    """Decorator factory that records how long each call of a function takes.

    The elapsed wall-clock time of every call is reported through
    ``histogram.observe(elapsed, **labels)``, the same recording call the
    other helpers in this module use. The observation is made even when the
    wrapped function raises.

    Args:
        histogram: Histogram that receives the elapsed time in seconds.
        **labels: Label values passed along with every observation.

    Returns:
        A decorator for synchronous callables. The wrapper does not await
        its target, so for a coroutine function only the call that creates
        the coroutine object would be measured.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            started = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                # Observe in ``finally`` so that failing calls are timed too.
                histogram.observe(time.perf_counter() - started, **labels)

        return wrapper

    return decorator
|