mcp-hangar 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. mcp_hangar/__init__.py +139 -0
  2. mcp_hangar/application/__init__.py +1 -0
  3. mcp_hangar/application/commands/__init__.py +67 -0
  4. mcp_hangar/application/commands/auth_commands.py +118 -0
  5. mcp_hangar/application/commands/auth_handlers.py +296 -0
  6. mcp_hangar/application/commands/commands.py +59 -0
  7. mcp_hangar/application/commands/handlers.py +189 -0
  8. mcp_hangar/application/discovery/__init__.py +21 -0
  9. mcp_hangar/application/discovery/discovery_metrics.py +283 -0
  10. mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
  11. mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
  12. mcp_hangar/application/discovery/security_validator.py +414 -0
  13. mcp_hangar/application/event_handlers/__init__.py +50 -0
  14. mcp_hangar/application/event_handlers/alert_handler.py +191 -0
  15. mcp_hangar/application/event_handlers/audit_handler.py +203 -0
  16. mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
  17. mcp_hangar/application/event_handlers/logging_handler.py +69 -0
  18. mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
  19. mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
  20. mcp_hangar/application/event_handlers/security_handler.py +604 -0
  21. mcp_hangar/application/mcp/tooling.py +158 -0
  22. mcp_hangar/application/ports/__init__.py +9 -0
  23. mcp_hangar/application/ports/observability.py +237 -0
  24. mcp_hangar/application/queries/__init__.py +52 -0
  25. mcp_hangar/application/queries/auth_handlers.py +237 -0
  26. mcp_hangar/application/queries/auth_queries.py +118 -0
  27. mcp_hangar/application/queries/handlers.py +227 -0
  28. mcp_hangar/application/read_models/__init__.py +11 -0
  29. mcp_hangar/application/read_models/provider_views.py +139 -0
  30. mcp_hangar/application/sagas/__init__.py +11 -0
  31. mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
  32. mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
  33. mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
  34. mcp_hangar/application/services/__init__.py +9 -0
  35. mcp_hangar/application/services/provider_service.py +208 -0
  36. mcp_hangar/application/services/traced_provider_service.py +211 -0
  37. mcp_hangar/bootstrap/runtime.py +328 -0
  38. mcp_hangar/context.py +178 -0
  39. mcp_hangar/domain/__init__.py +117 -0
  40. mcp_hangar/domain/contracts/__init__.py +57 -0
  41. mcp_hangar/domain/contracts/authentication.py +225 -0
  42. mcp_hangar/domain/contracts/authorization.py +229 -0
  43. mcp_hangar/domain/contracts/event_store.py +178 -0
  44. mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
  45. mcp_hangar/domain/contracts/persistence.py +383 -0
  46. mcp_hangar/domain/contracts/provider_runtime.py +146 -0
  47. mcp_hangar/domain/discovery/__init__.py +20 -0
  48. mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
  49. mcp_hangar/domain/discovery/discovered_provider.py +185 -0
  50. mcp_hangar/domain/discovery/discovery_service.py +412 -0
  51. mcp_hangar/domain/discovery/discovery_source.py +192 -0
  52. mcp_hangar/domain/events.py +433 -0
  53. mcp_hangar/domain/exceptions.py +525 -0
  54. mcp_hangar/domain/model/__init__.py +70 -0
  55. mcp_hangar/domain/model/aggregate.py +58 -0
  56. mcp_hangar/domain/model/circuit_breaker.py +152 -0
  57. mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
  58. mcp_hangar/domain/model/event_sourced_provider.py +423 -0
  59. mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
  60. mcp_hangar/domain/model/health_tracker.py +183 -0
  61. mcp_hangar/domain/model/load_balancer.py +185 -0
  62. mcp_hangar/domain/model/provider.py +810 -0
  63. mcp_hangar/domain/model/provider_group.py +656 -0
  64. mcp_hangar/domain/model/tool_catalog.py +105 -0
  65. mcp_hangar/domain/policies/__init__.py +19 -0
  66. mcp_hangar/domain/policies/provider_health.py +187 -0
  67. mcp_hangar/domain/repository.py +249 -0
  68. mcp_hangar/domain/security/__init__.py +85 -0
  69. mcp_hangar/domain/security/input_validator.py +710 -0
  70. mcp_hangar/domain/security/rate_limiter.py +387 -0
  71. mcp_hangar/domain/security/roles.py +237 -0
  72. mcp_hangar/domain/security/sanitizer.py +387 -0
  73. mcp_hangar/domain/security/secrets.py +501 -0
  74. mcp_hangar/domain/services/__init__.py +20 -0
  75. mcp_hangar/domain/services/audit_service.py +376 -0
  76. mcp_hangar/domain/services/image_builder.py +328 -0
  77. mcp_hangar/domain/services/provider_launcher.py +1046 -0
  78. mcp_hangar/domain/value_objects.py +1138 -0
  79. mcp_hangar/errors.py +818 -0
  80. mcp_hangar/fastmcp_server.py +1105 -0
  81. mcp_hangar/gc.py +134 -0
  82. mcp_hangar/infrastructure/__init__.py +79 -0
  83. mcp_hangar/infrastructure/async_executor.py +133 -0
  84. mcp_hangar/infrastructure/auth/__init__.py +37 -0
  85. mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
  86. mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
  87. mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
  88. mcp_hangar/infrastructure/auth/middleware.py +340 -0
  89. mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
  90. mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
  91. mcp_hangar/infrastructure/auth/projections.py +366 -0
  92. mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
  93. mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
  94. mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
  95. mcp_hangar/infrastructure/command_bus.py +112 -0
  96. mcp_hangar/infrastructure/discovery/__init__.py +110 -0
  97. mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
  98. mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
  99. mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
  100. mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
  101. mcp_hangar/infrastructure/event_bus.py +260 -0
  102. mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
  103. mcp_hangar/infrastructure/event_store.py +396 -0
  104. mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
  105. mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
  106. mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
  107. mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
  108. mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
  109. mcp_hangar/infrastructure/metrics_publisher.py +36 -0
  110. mcp_hangar/infrastructure/observability/__init__.py +10 -0
  111. mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
  112. mcp_hangar/infrastructure/persistence/__init__.py +33 -0
  113. mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
  114. mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
  115. mcp_hangar/infrastructure/persistence/database.py +333 -0
  116. mcp_hangar/infrastructure/persistence/database_common.py +330 -0
  117. mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
  118. mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
  119. mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
  120. mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
  121. mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
  122. mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
  123. mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
  124. mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
  125. mcp_hangar/infrastructure/query_bus.py +153 -0
  126. mcp_hangar/infrastructure/saga_manager.py +401 -0
  127. mcp_hangar/logging_config.py +209 -0
  128. mcp_hangar/metrics.py +1007 -0
  129. mcp_hangar/models.py +31 -0
  130. mcp_hangar/observability/__init__.py +54 -0
  131. mcp_hangar/observability/health.py +487 -0
  132. mcp_hangar/observability/metrics.py +319 -0
  133. mcp_hangar/observability/tracing.py +433 -0
  134. mcp_hangar/progress.py +542 -0
  135. mcp_hangar/retry.py +613 -0
  136. mcp_hangar/server/__init__.py +120 -0
  137. mcp_hangar/server/__main__.py +6 -0
  138. mcp_hangar/server/auth_bootstrap.py +340 -0
  139. mcp_hangar/server/auth_cli.py +335 -0
  140. mcp_hangar/server/auth_config.py +305 -0
  141. mcp_hangar/server/bootstrap.py +735 -0
  142. mcp_hangar/server/cli.py +161 -0
  143. mcp_hangar/server/config.py +224 -0
  144. mcp_hangar/server/context.py +215 -0
  145. mcp_hangar/server/http_auth_middleware.py +165 -0
  146. mcp_hangar/server/lifecycle.py +467 -0
  147. mcp_hangar/server/state.py +117 -0
  148. mcp_hangar/server/tools/__init__.py +16 -0
  149. mcp_hangar/server/tools/discovery.py +186 -0
  150. mcp_hangar/server/tools/groups.py +75 -0
  151. mcp_hangar/server/tools/health.py +301 -0
  152. mcp_hangar/server/tools/provider.py +939 -0
  153. mcp_hangar/server/tools/registry.py +320 -0
  154. mcp_hangar/server/validation.py +113 -0
  155. mcp_hangar/stdio_client.py +229 -0
  156. mcp_hangar-0.2.0.dist-info/METADATA +347 -0
  157. mcp_hangar-0.2.0.dist-info/RECORD +160 -0
  158. mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
  159. mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
  160. mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
mcp_hangar/metrics.py ADDED
@@ -0,0 +1,1007 @@
1
+ """Prometheus metrics for MCP Registry.
2
+
3
+ Production-grade metrics following Prometheus/OpenMetrics best practices:
4
+ - Consistent naming: mcp_registry_<subsystem>_<metric>_<unit>
5
+ - Proper label cardinality control
6
+ - Thread-safe implementations
7
+ - Standard histogram buckets for different use cases
8
+ """
9
+
10
+ from collections import defaultdict
11
+ from dataclasses import dataclass, field
12
+ from functools import wraps
13
+ import platform
14
+ import threading
15
+ import time
16
+ from typing import Dict, List, Optional
17
+
18
+ # =============================================================================
19
+ # Core Metric Types
20
+ # =============================================================================
21
+
22
+
23
@dataclass
class MetricSample:
    """One exported data point: a numeric value together with its labels."""

    # Numeric reading carried by this sample.
    value: float
    # Label name -> label value. Every instance gets its own fresh dict
    # when no labels are supplied.
    labels: Dict[str, str] = field(default_factory=dict)
29
+
30
+
31
class Counter:
    """
    Prometheus counter - monotonically increasing value.

    Use for: requests, errors, completions, bytes transferred.

    Every distinct combination of label values is tracked as its own series,
    keyed by a tuple ordered like ``label_names``.
    """

    def __init__(self, name: str, description: str, labels: Optional[List[str]] = None):
        """
        Create a counter.

        Args:
            name: Metric name.
            description: Help text for the metric.
            labels: Names of the labels this counter accepts; ``None`` or an
                empty list means the counter has a single unlabeled series.
        """
        self.name = name
        self.description = description
        # Copy the list so that later mutation of the caller's object cannot
        # change how already-stored series keys are interpreted.
        self.label_names = list(labels) if labels else []
        self._values: Dict[tuple, float] = defaultdict(float)
        # Wall-clock time at which each series was first incremented.
        self._created: Dict[tuple, float] = {}
        self._lock = threading.Lock()

    def inc(self, value: float = 1.0, **labels) -> None:
        """
        Increment the series selected by ``labels`` by ``value``.

        Raises:
            ValueError: If ``value`` is negative.
        """
        if value < 0:
            raise ValueError("Counter can only increase")
        key = self._make_key(labels)
        with self._lock:
            if key not in self._created:
                self._created[key] = time.time()
            self._values[key] += value

    def _make_key(self, labels: dict) -> tuple:
        """Build the series key; missing labels become "" and unknown ones are ignored."""
        return tuple(labels.get(label_name, "") for label_name in self.label_names)

    def labels(self, **label_values) -> "_LabeledCounter":
        """Return counter with preset labels for reuse."""
        return _LabeledCounter(self, label_values)

    def collect(self) -> List["MetricSample"]:
        """Return one sample per series, taken as a snapshot under the lock."""
        with self._lock:
            return [
                MetricSample(value=total, labels=dict(zip(self.label_names, key)))
                for key, total in self._values.items()
            ]
67
+
68
+
69
class Gauge:
    """
    Prometheus gauge - value that can go up and down.

    Use for: in-progress operations, current state, temperature, queue size.

    Every distinct combination of label values is tracked as its own series,
    keyed by a tuple ordered like ``label_names``.
    """

    def __init__(self, name: str, description: str, labels: Optional[List[str]] = None):
        """
        Create a gauge.

        Args:
            name: Metric name.
            description: Help text for the metric.
            labels: Names of the labels this gauge accepts; ``None`` or an
                empty list means the gauge has a single unlabeled series.
        """
        self.name = name
        self.description = description
        # Copy the list so that later mutation of the caller's object cannot
        # change how already-stored series keys are interpreted.
        self.label_names = list(labels) if labels else []
        self._values: Dict[tuple, float] = {}
        self._lock = threading.Lock()

    def set(self, value: float, **labels) -> None:
        """Set the series selected by ``labels`` to ``value``."""
        key = self._make_key(labels)
        with self._lock:
            self._values[key] = value

    def inc(self, value: float = 1.0, **labels) -> None:
        """Add ``value`` to the selected series (a new series starts at 0)."""
        key = self._make_key(labels)
        with self._lock:
            self._values[key] = self._values.get(key, 0) + value

    def dec(self, value: float = 1.0, **labels) -> None:
        """Subtract ``value`` from the selected series (a new series starts at 0)."""
        key = self._make_key(labels)
        with self._lock:
            self._values[key] = self._values.get(key, 0) - value

    def set_to_current_time(self, **labels) -> None:
        """Set gauge to current Unix timestamp."""
        self.set(time.time(), **labels)

    def _make_key(self, labels: dict) -> tuple:
        """Build the series key; missing labels become "" and unknown ones are ignored."""
        return tuple(labels.get(label_name, "") for label_name in self.label_names)

    def labels(self, **label_values) -> "_LabeledGauge":
        """Return gauge with preset labels."""
        return _LabeledGauge(self, label_values)

    def collect(self) -> List["MetricSample"]:
        """Return one sample per series, taken as a snapshot under the lock."""
        with self._lock:
            return [
                MetricSample(value=current, labels=dict(zip(self.label_names, key)))
                for key, current in self._values.items()
            ]
116
+
117
+
118
class Histogram:
    """
    Prometheus histogram - distribution of values in buckets.

    Use for: request latencies, response sizes.

    Observations are stored per bucket (non-cumulative); ``collect`` turns
    them into the cumulative ``le`` buckets used in the exposition output.
    """

    # Standard bucket presets
    DEFAULT_BUCKETS = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
    LATENCY_BUCKETS = (
        0.001,
        0.0025,
        0.005,
        0.01,
        0.025,
        0.05,
        0.1,
        0.25,
        0.5,
        1.0,
        2.5,
        5.0,
        10.0,
        30.0,
    )
    SIZE_BUCKETS = (100, 1000, 10000, 100000, 1000000, 10000000)

    def __init__(
        self,
        name: str,
        description: str,
        labels: Optional[List[str]] = None,
        buckets: Optional[tuple] = None,
    ):
        """
        Create a histogram.

        Args:
            name: Metric name.
            description: Help text for the metric.
            labels: Names of the labels this histogram accepts.
            buckets: Upper bounds of the buckets; ``None`` or an empty
                sequence selects ``DEFAULT_BUCKETS``. A ``+Inf`` bucket is
                always appended.
        """
        self.name = name
        self.description = description
        # Copy so later mutation of the caller's list cannot change key layout.
        self.label_names = list(labels) if labels else []
        inf = float("inf")
        # De-duplicate bounds and drop any caller-supplied +Inf: a repeated
        # bound would be emitted twice and counted twice in the cumulative sum.
        finite_bounds = {bound for bound in (buckets or self.DEFAULT_BUCKETS) if bound != inf}
        self.buckets = tuple(sorted(finite_bounds)) + (inf,)
        self._lock = threading.Lock()
        self._buckets: Dict[tuple, Dict[float, int]] = defaultdict(lambda: dict.fromkeys(self.buckets, 0))
        self._sums: Dict[tuple, float] = defaultdict(float)
        self._counts: Dict[tuple, int] = defaultdict(int)

    def observe(self, value: float, **labels) -> None:
        """Record an observation in the series selected by ``labels``."""
        key = self._make_key(labels)
        with self._lock:
            self._sums[key] += value
            self._counts[key] += 1
            per_bucket = self._buckets[key]
            # Buckets are sorted, so the first bound that fits is the tightest.
            for upper_bound in self.buckets:
                if value <= upper_bound:
                    per_bucket[upper_bound] += 1
                    break
            else:
                # Only reachable when every comparison is false (NaN). Count
                # it in +Inf so the +Inf bucket still equals the total count.
                per_bucket[self.buckets[-1]] += 1

    def _make_key(self, labels: dict) -> tuple:
        """Build the series key; missing labels become "" and unknown ones are ignored."""
        return tuple(labels.get(label_name, "") for label_name in self.label_names)

    def labels(self, **label_values) -> "_LabeledHistogram":
        """Return histogram with preset labels."""
        return _LabeledHistogram(self, label_values)

    def time(self) -> "_Timer":
        """Context manager for timing code blocks (records into the unlabeled series)."""
        return _Timer(self, {})

    def collect(self) -> tuple:
        """
        Collect buckets, sum, and count samples.

        Returns:
            A ``(buckets, sums, counts)`` tuple of sample lists. Bucket
            samples are cumulative and carry an extra ``le`` label.
        """
        buckets = []
        sums = []
        counts = []

        with self._lock:
            for key, bucket_values in self._buckets.items():
                base_labels = dict(zip(self.label_names, key))
                cumulative = 0
                for bucket in self.buckets:
                    cumulative += bucket_values.get(bucket, 0)
                    le = "+Inf" if bucket == float("inf") else str(bucket)
                    buckets.append(MetricSample(value=cumulative, labels={**base_labels, "le": le}))
                sums.append(MetricSample(value=self._sums[key], labels=base_labels))
                counts.append(MetricSample(value=self._counts[key], labels=base_labels))

        return buckets, sums, counts
202
+
203
+
204
class Summary:
    """
    Prometheus summary - running sum and count of observations.

    This simplified implementation tracks only the total and the number of
    observations per label set; it does not compute quantiles, minimum or
    maximum. An average can be derived as sum / count.
    Use for: streaming data where quantiles aren't critical.
    """

    def __init__(self, name: str, description: str, labels: Optional[List[str]] = None):
        """
        Create a summary.

        Args:
            name: Metric name.
            description: Help text for the metric.
            labels: Names of the labels this summary accepts; ``None`` or an
                empty list means a single unlabeled series.
        """
        self.name = name
        self.description = description
        # Copy so later mutation of the caller's list cannot change key layout.
        self.label_names = list(labels) if labels else []
        self._lock = threading.Lock()
        self._sums: Dict[tuple, float] = defaultdict(float)
        self._counts: Dict[tuple, int] = defaultdict(int)

    def observe(self, value: float, **labels) -> None:
        """Record an observation in the series selected by ``labels``."""
        key = self._make_key(labels)
        with self._lock:
            self._sums[key] += value
            self._counts[key] += 1

    def _make_key(self, labels: dict) -> tuple:
        """Build the series key; missing labels become "" and unknown ones are ignored."""
        return tuple(labels.get(label_name, "") for label_name in self.label_names)

    def collect(self) -> tuple:
        """
        Collect sum and count samples.

        Returns:
            A ``(sums, counts)`` tuple of sample lists, one entry per series.
        """
        sums = []
        counts = []
        with self._lock:
            for key, total in self._sums.items():
                base_labels = dict(zip(self.label_names, key))
                sums.append(MetricSample(value=total, labels=base_labels))
                counts.append(MetricSample(value=self._counts[key], labels=base_labels))
        return sums, counts
240
+
241
+
242
class Info:
    """
    Prometheus info metric - a fixed set of descriptive key/value pairs.

    Use for: version info, build metadata, configuration.
    """

    def __init__(self, name: str, description: str):
        self.name, self.description = name, description
        self._labels: Dict[str, str] = {}
        self._lock = threading.Lock()

    def info(self, **labels) -> None:
        """Replace the stored labels; every value is converted with ``str``."""
        stringified = {}
        for key, raw in labels.items():
            stringified[key] = str(raw)
        with self._lock:
            self._labels = stringified

    def collect(self) -> List[MetricSample]:
        """Return a single sample of value 1.0, or nothing if no labels are set."""
        with self._lock:
            if not self._labels:
                return []
            return [MetricSample(value=1.0, labels=self._labels)]
266
+
267
+
268
+ # =============================================================================
269
+ # Labeled Metric Helpers
270
+ # =============================================================================
271
+
272
+
273
class _LabeledCounter:
    """View of a counter bound to a fixed set of label values."""

    def __init__(self, counter: Counter, labels: dict):
        self._counter, self._labels = counter, labels

    def inc(self, value: float = 1.0) -> None:
        """Increment the parent counter's series for the bound labels."""
        bound = self._labels
        self._counter.inc(value, **bound)
282
+
283
+
284
class _LabeledGauge:
    """View of a gauge bound to a fixed set of label values."""

    def __init__(self, gauge: Gauge, labels: dict):
        self._gauge, self._labels = gauge, labels

    def set(self, value: float) -> None:
        """Set the parent gauge's series for the bound labels."""
        bound = self._labels
        self._gauge.set(value, **bound)

    def inc(self, value: float = 1.0) -> None:
        """Raise the parent gauge's series for the bound labels."""
        bound = self._labels
        self._gauge.inc(value, **bound)

    def dec(self, value: float = 1.0) -> None:
        """Lower the parent gauge's series for the bound labels."""
        bound = self._labels
        self._gauge.dec(value, **bound)
299
+
300
+
301
class _LabeledHistogram:
    """View of a histogram bound to a fixed set of label values."""

    def __init__(self, histogram: Histogram, labels: dict):
        self._histogram, self._labels = histogram, labels

    def observe(self, value: float) -> None:
        """Record an observation in the parent histogram for the bound labels."""
        bound = self._labels
        self._histogram.observe(value, **bound)

    def time(self) -> "_Timer":
        """Return a timer that records into the parent histogram with the bound labels."""
        timer = _Timer(self._histogram, self._labels)
        return timer
313
+
314
+
315
class _Timer:
    """Context manager that observes the elapsed time of its block on a histogram."""

    def __init__(self, histogram: Histogram, labels: dict):
        self._start: Optional[float] = None
        self._histogram, self._labels = histogram, labels

    def __enter__(self) -> "_Timer":
        # perf_counter is used for the interval measurement on both ends.
        self._start = time.perf_counter()
        return self

    def __exit__(self, *args) -> None:
        # Runs whether or not the block raised; returns None, so an
        # exception from the block keeps propagating.
        elapsed = time.perf_counter() - self._start
        self._histogram.observe(elapsed, **self._labels)
330
+
331
+
332
+ # =============================================================================
333
+ # Metrics Registry
334
+ # =============================================================================
335
+
336
+
337
class CollectorRegistry:
    """Central registry for all metrics with Prometheus exposition format output."""

    def __init__(self):
        # Metric name -> collector object (Counter, Gauge, Histogram, Summary or Info).
        self._collectors: Dict[str, object] = {}
        self._lock = threading.Lock()

    def register(self, collector) -> None:
        """
        Register a metric collector under ``collector.name``.

        Raises:
            ValueError: If a collector with the same name is already registered.
        """
        with self._lock:
            if collector.name in self._collectors:
                raise ValueError(f"Metric {collector.name} already registered")
            self._collectors[collector.name] = collector

    def unregister(self, name: str) -> None:
        """Unregister a metric; unknown names are ignored."""
        with self._lock:
            self._collectors.pop(name, None)

    def get(self, name: str):
        """Get collector by name, or ``None`` if it is not registered."""
        # Take the lock like register/unregister do, so every access to the
        # collector map is guarded the same way.
        with self._lock:
            return self._collectors.get(name)

    def collect(self) -> str:
        """Generate Prometheus exposition format output for every registered metric."""
        # Snapshot under the lock, format outside it so slow formatting does
        # not block registration.
        with self._lock:
            collectors = list(self._collectors.items())

        lines: List[str] = []
        for name, collector in collectors:
            lines.extend(self._format_metric(name, collector))
            lines.append("")

        return "\n".join(lines)

    @staticmethod
    def _help_line(name: str, description: str) -> str:
        """Build a ``# HELP`` line, escaping backslashes and newlines in the text."""
        escaped = str(description).replace("\\", "\\\\").replace("\n", "\\n")
        return f"# HELP {name} {escaped}"

    def _format_samples(self, name: str, samples, as_int: bool = False) -> List[str]:
        """Render samples as ``name{labels} value`` lines, optionally truncating values to int."""
        rendered = []
        for sample in samples:
            value = int(sample.value) if as_int else sample.value
            rendered.append(f"{name}{self._format_labels(sample.labels)} {value}")
        return rendered

    def _format_metric(self, name: str, collector) -> List[str]:
        """
        Format a single metric in Prometheus format.

        Collectors of an unrecognised type produce only the HELP line.
        """
        if isinstance(collector, Info):
            # Samples are exposed as "<name>_info", so HELP and TYPE must use
            # that same name to be associated with them.
            exposed = f"{name}_info"
            lines = [self._help_line(exposed, collector.description), f"# TYPE {exposed} gauge"]
            for sample in collector.collect():
                lines.append(f"{exposed}{self._format_labels(sample.labels)} 1")
            return lines

        lines = [self._help_line(name, collector.description)]

        if isinstance(collector, Counter):
            lines.append(f"# TYPE {name} counter")
            lines.extend(self._format_samples(f"{name}_total", collector.collect()))

        elif isinstance(collector, Gauge):
            lines.append(f"# TYPE {name} gauge")
            lines.extend(self._format_samples(name, collector.collect()))

        elif isinstance(collector, Histogram):
            lines.append(f"# TYPE {name} histogram")
            buckets, sums, counts = collector.collect()
            lines.extend(self._format_samples(f"{name}_bucket", buckets, as_int=True))
            lines.extend(self._format_samples(f"{name}_sum", sums))
            lines.extend(self._format_samples(f"{name}_count", counts, as_int=True))

        elif isinstance(collector, Summary):
            lines.append(f"# TYPE {name} summary")
            sums, counts = collector.collect()
            lines.extend(self._format_samples(f"{name}_sum", sums))
            lines.extend(self._format_samples(f"{name}_count", counts, as_int=True))

        return lines

    def _format_labels(self, labels: Dict[str, str]) -> str:
        """
        Format labels in Prometheus format.

        Labels are sorted by name; ``None`` values become empty strings, and
        backslash, double quote and newline are escaped in values.
        """
        if not labels:
            return ""
        escaped = []
        for k, v in sorted(labels.items()):
            if v is None:
                v = ""
            # Backslash must be escaped first so the escapes added for quote
            # and newline are not themselves doubled.
            v = str(v).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
            escaped.append(f'{k}="{v}"')
        return "{" + ",".join(escaped) + "}"
433
+
434
+
435
+ # =============================================================================
436
+ # Global Registry
437
+ # =============================================================================
438
+
439
# Module-wide registry instance.
REGISTRY = CollectorRegistry()


# =============================================================================
# MCP Registry Metrics - Following Best Practices
# =============================================================================

# --- Build/Version Info ------------------------------------------------------

BUILD_INFO = Info(
    "mcp_registry_build",
    "Build and version information for MCP Registry",
)

# --- Process Metrics ---------------------------------------------------------

PROCESS_START_TIME = Gauge(
    "mcp_registry_process_start_time_seconds",
    "Unix timestamp of process start time",
)
463
+
464
+ # -----------------------------------------------------------------------------
465
+ # Provider Lifecycle Metrics
466
+ # -----------------------------------------------------------------------------
467
+
468
# Provider metadata series.
PROVIDER_INFO = Gauge(
    "mcp_registry_provider_info",
    "Provider configuration info (always 1, labels contain metadata)",
    labels=["provider", "mode"],
)

# Numeric state code of each provider (see description for the mapping).
PROVIDER_STATE_CURRENT = Gauge(
    "mcp_registry_provider_state",
    "Current provider state (0=cold, 1=initializing, 2=ready, 3=degraded, 4=dead)",
    labels=["provider"],
)

PROVIDER_UP = Gauge(
    "mcp_registry_provider_up",
    "Whether provider is up and ready (1=up, 0=down)",
    labels=["provider"],
)

PROVIDER_INITIALIZED = Gauge(
    "mcp_registry_provider_initialized",
    "Whether provider has been initialized at least once (1=yes, 0=no/cold)",
    labels=["provider"],
)

PROVIDER_LAST_STATE_CHANGE_SECONDS = Gauge(
    "mcp_registry_provider_last_state_change_timestamp_seconds",
    "Unix timestamp of last provider state change",
    labels=["provider"],
)

# "result" label values: success, failure
PROVIDER_STARTS_TOTAL = Counter(
    "mcp_registry_provider_starts",
    "Total number of provider start attempts",
    labels=["provider", "result"],
)

# "reason" label values: idle, manual, error, gc
PROVIDER_STOPS_TOTAL = Counter(
    "mcp_registry_provider_stops",
    "Total number of provider stops",
    labels=["provider", "reason"],
)

# Custom buckets from 0.1 s up to 60 s.
PROVIDER_COLD_START_SECONDS = Histogram(
    "mcp_registry_provider_cold_start_seconds",
    "Time from cold start to ready state (critical UX metric)",
    labels=["provider", "mode"],
    buckets=(0.1, 0.25, 0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0, 30.0, 60.0),
)

PROVIDER_COLD_START_IN_PROGRESS = Gauge(
    "mcp_registry_provider_cold_start_in_progress",
    "Number of providers currently in cold start",
    labels=["provider"],
)
522
+
523
+ # -----------------------------------------------------------------------------
524
+ # Tool Invocation Metrics (RED method: Rate, Errors, Duration)
525
+ # -----------------------------------------------------------------------------
526
+
527
# "status" label values: success, error
TOOL_CALLS_TOTAL = Counter(
    "mcp_registry_tool_calls",
    "Total number of tool calls",
    labels=["provider", "tool", "status"],
)

# Uses the shared latency bucket preset.
TOOL_CALL_DURATION_SECONDS = Histogram(
    "mcp_registry_tool_call_duration_seconds",
    "Duration of tool calls in seconds",
    labels=["provider", "tool"],
    buckets=Histogram.LATENCY_BUCKETS,
)

TOOL_CALL_ERRORS_TOTAL = Counter(
    "mcp_registry_tool_call_errors",
    "Total number of tool call errors by error type",
    labels=["provider", "tool", "error_type"],
)
545
+
546
+ # -----------------------------------------------------------------------------
547
+ # Health Check Metrics
548
+ # -----------------------------------------------------------------------------
549
+
550
# "result" label values: cold, healthy, unhealthy
HEALTH_CHECK_TOTAL = Counter(
    "mcp_registry_health_checks",
    "Total number of health check executions",
    labels=["provider", "result"],
)

# Custom buckets from 1 ms up to 1 s.
HEALTH_CHECK_DURATION_SECONDS = Histogram(
    "mcp_registry_health_check_duration_seconds",
    "Duration of health checks in seconds",
    labels=["provider"],
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
)

HEALTH_CHECK_CONSECUTIVE_FAILURES = Gauge(
    "mcp_registry_health_check_consecutive_failures",
    "Number of consecutive health check failures",
    labels=["provider"],
)
568
+
569
+ # -----------------------------------------------------------------------------
570
+ # Connection Pool Metrics
571
+ # -----------------------------------------------------------------------------
572
+
573
CONNECTIONS_ACTIVE = Gauge(
    "mcp_registry_connections_active",
    "Number of active connections to providers",
    labels=["provider"],
)

CONNECTIONS_TOTAL = Counter(
    "mcp_registry_connections",
    "Total number of connections established",
    labels=["provider", "result"],
)

# Custom buckets from 1 s up to 3600 s.
CONNECTION_DURATION_SECONDS = Histogram(
    "mcp_registry_connection_duration_seconds",
    "Duration of provider connections in seconds",
    labels=["provider"],
    buckets=(1, 5, 10, 30, 60, 300, 600, 1800, 3600),
)
591
+
592
+ # -----------------------------------------------------------------------------
593
+ # Message Metrics
594
+ # -----------------------------------------------------------------------------
595
+
596
MESSAGES_SENT_TOTAL = Counter(
    "mcp_registry_messages_sent",
    "Total number of JSON-RPC messages sent",
    labels=["provider", "method"],
)

# "type" label values: response, notification, error
MESSAGES_RECEIVED_TOTAL = Counter(
    "mcp_registry_messages_received",
    "Total number of JSON-RPC messages received",
    labels=["provider", "type"],
)

# "direction" label values: sent, received. Uses the shared size bucket preset.
MESSAGE_SIZE_BYTES = Histogram(
    "mcp_registry_message_size_bytes",
    "Size of JSON-RPC messages in bytes",
    labels=["provider", "direction"],
    buckets=Histogram.SIZE_BUCKETS,
)
614
+
615
+ # -----------------------------------------------------------------------------
616
+ # GC (Garbage Collection) Metrics
617
+ # -----------------------------------------------------------------------------
618
+
619
+ GC_CYCLES_TOTAL = Counter(
620
+ name="mcp_registry_gc_cycles",
621
+ description="Total number of garbage collection cycles",
622
+ )
623
+
624
+ GC_CYCLE_DURATION_SECONDS = Histogram(
625
+ name="mcp_registry_gc_cycle_duration_seconds",
626
+ description="Duration of garbage collection cycles in seconds",
627
+ buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5),
628
+ )
629
+
630
+ GC_PROVIDERS_COLLECTED_TOTAL = Counter(
631
+ name="mcp_registry_gc_providers_collected",
632
+ description="Total number of providers collected by GC",
633
+ labels=["reason"], # reason: idle, dead, error
634
+ )
635
+
636
+ # -----------------------------------------------------------------------------
637
+ # Error Metrics
638
+ # -----------------------------------------------------------------------------
639
+
640
# Count of errors, partitioned by originating component and error type.
ERRORS_TOTAL = Counter(
    name="mcp_registry_errors",
    description="Total number of errors by type and component",
    # Values of the "component" label: provider, tool, health, gc, server.
    labels=["component", "error_type"],
)
645
+
646
+ # -----------------------------------------------------------------------------
647
+ # Rate Limiter Metrics
648
+ # -----------------------------------------------------------------------------
649
+
650
# Count of requests that hit a rate limit, partitioned by endpoint.
RATE_LIMIT_HITS_TOTAL = Counter(
    name="mcp_registry_rate_limit_hits",
    description="Total number of requests that hit rate limits",
    labels=["endpoint"],
)
655
+
656
+ # -----------------------------------------------------------------------------
657
+ # Discovery Metrics
658
+ # -----------------------------------------------------------------------------
659
+
660
# Gauge of configured discovery sources, keyed by source type and mode.
DISCOVERY_SOURCES_TOTAL = Gauge(
    name="mcp_registry_discovery_sources",
    description="Number of configured discovery sources",
    labels=["source_type", "mode"],
)

# Per-source-type health flag: 1 means healthy, 0 means unhealthy.
DISCOVERY_SOURCES_HEALTHY = Gauge(
    name="mcp_registry_discovery_sources_healthy",
    description="Whether discovery source is healthy (1=healthy, 0=unhealthy)",
    labels=["source_type"],
)

# Gauge of discovered providers, keyed by source type and provider status.
DISCOVERY_PROVIDERS_TOTAL = Gauge(
    name="mcp_registry_discovery_providers",
    description="Number of discovered providers",
    # Values of the "status" label: discovered, registered, quarantined.
    labels=["source_type", "status"],
)

# Count of discovery cycles executed, per source type.
DISCOVERY_CYCLES_TOTAL = Counter(
    name="mcp_registry_discovery_cycles",
    description="Total number of discovery cycles executed",
    labels=["source_type"],
)

# Distribution of discovery cycle durations; buckets span 10 ms to 10 s.
DISCOVERY_CYCLE_DURATION_SECONDS = Histogram(
    name="mcp_registry_discovery_cycle_duration_seconds",
    description="Duration of discovery cycles in seconds",
    labels=["source_type"],
    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)

# Count of provider registrations that originated from discovery.
DISCOVERY_REGISTRATIONS_TOTAL = Counter(
    name="mcp_registry_discovery_registrations",
    description="Total provider registrations from discovery",
    labels=["source_type"],
)

# Count of provider deregistrations that originated from discovery.
DISCOVERY_DEREGISTRATIONS_TOTAL = Counter(
    name="mcp_registry_discovery_deregistrations",
    description="Total provider deregistrations from discovery",
    # Values of the "reason" label: ttl_expired, source_removed, manual.
    labels=["source_type", "reason"],
)

# Count of discovery conflicts, partitioned by conflict type.
DISCOVERY_CONFLICTS_TOTAL = Counter(
    name="mcp_registry_discovery_conflicts",
    description="Total discovery conflicts",
    # Values of the "conflict_type" label: static_wins, source_priority.
    labels=["conflict_type"],
)

# Count of providers placed in quarantine, partitioned by reason.
DISCOVERY_QUARANTINE_TOTAL = Counter(
    name="mcp_registry_discovery_quarantine",
    description="Total providers quarantined",
    # Values of the "reason" label: health_check_failed, validation_failed,
    # rate_limited.
    labels=["reason"],
)

# Count of discovery errors, partitioned by source type and error type.
DISCOVERY_ERRORS_TOTAL = Counter(
    name="mcp_registry_discovery_errors",
    description="Total discovery errors",
    labels=["source_type", "error_type"],
)

# Unix timestamp (seconds) of the most recent discovery cycle per source type.
DISCOVERY_LAST_CYCLE_TIMESTAMP = Gauge(
    name="mcp_registry_discovery_last_cycle_timestamp_seconds",
    description="Unix timestamp of last discovery cycle",
    labels=["source_type"],
)
726
+
727
+
728
+ # =============================================================================
729
+ # Register All Metrics
730
+ # =============================================================================
731
+
732
+
733
def _register_all_metrics():
    """Add every predefined metric object to the module-level ``REGISTRY``.

    Metrics are grouped by domain for readability only; they are registered
    one by one, in the order in which they are listed below.
    """
    metric_groups = (
        # Build / process
        (BUILD_INFO, PROCESS_START_TIME),
        # Provider lifecycle
        (
            PROVIDER_INFO,
            PROVIDER_STATE_CURRENT,
            PROVIDER_UP,
            PROVIDER_INITIALIZED,
            PROVIDER_LAST_STATE_CHANGE_SECONDS,
            PROVIDER_STARTS_TOTAL,
            PROVIDER_STOPS_TOTAL,
            PROVIDER_COLD_START_SECONDS,
            PROVIDER_COLD_START_IN_PROGRESS,
        ),
        # Tool calls
        (TOOL_CALLS_TOTAL, TOOL_CALL_DURATION_SECONDS, TOOL_CALL_ERRORS_TOTAL),
        # Health checks
        (
            HEALTH_CHECK_TOTAL,
            HEALTH_CHECK_DURATION_SECONDS,
            HEALTH_CHECK_CONSECUTIVE_FAILURES,
        ),
        # Connections
        (CONNECTIONS_ACTIVE, CONNECTIONS_TOTAL, CONNECTION_DURATION_SECONDS),
        # Messages
        (MESSAGES_SENT_TOTAL, MESSAGES_RECEIVED_TOTAL, MESSAGE_SIZE_BYTES),
        # Garbage collection
        (GC_CYCLES_TOTAL, GC_CYCLE_DURATION_SECONDS, GC_PROVIDERS_COLLECTED_TOTAL),
        # Errors and rate limiting
        (ERRORS_TOTAL, RATE_LIMIT_HITS_TOTAL),
        # Discovery metrics
        (
            DISCOVERY_SOURCES_TOTAL,
            DISCOVERY_SOURCES_HEALTHY,
            DISCOVERY_PROVIDERS_TOTAL,
            DISCOVERY_CYCLES_TOTAL,
            DISCOVERY_CYCLE_DURATION_SECONDS,
            DISCOVERY_REGISTRATIONS_TOTAL,
            DISCOVERY_DEREGISTRATIONS_TOTAL,
            DISCOVERY_CONFLICTS_TOTAL,
            DISCOVERY_QUARANTINE_TOTAL,
            DISCOVERY_ERRORS_TOTAL,
            DISCOVERY_LAST_CYCLE_TIMESTAMP,
        ),
    )
    for group in metric_groups:
        for predefined in group:
            REGISTRY.register(predefined)


# Registration happens once, as a side effect of importing this module.
_register_all_metrics()
782
+
783
+
784
+ # =============================================================================
785
+ # Convenience Functions
786
+ # =============================================================================
787
+
788
+
789
def get_metrics() -> str:
    """Render all registered metrics in Prometheus exposition format.

    Returns:
        The result of ``REGISTRY.collect()``.
    """
    rendered = REGISTRY.collect()
    return rendered
792
+
793
+
794
def init_metrics(version: str = "1.0.0"):
    """Publish build information and the process start time at server startup.

    Args:
        version: Version string reported through the build-info metric.
    """
    build_details = {
        "version": version,
        "python_version": platform.python_version(),
        "platform": platform.system(),
    }
    BUILD_INFO.info(**build_details)

    # Stamp the start time as wall-clock seconds since the epoch.
    started_at = time.time()
    PROCESS_START_TIME.set(started_at)
802
+
803
+
804
def observe_tool_call(provider: str, tool: str, duration: float, success: bool, error_type: str = None):
    """Record the outcome and latency of a single tool invocation.

    Args:
        provider: Provider ID the tool belongs to.
        tool: Name of the invoked tool.
        duration: Call duration, observed on the tool-call duration histogram.
        success: Whether the call succeeded; selects the ``status`` label.
        error_type: Error classification. The error counter is incremented
            only when the call failed and this value is truthy.
    """
    call_labels = {"provider": provider, "tool": tool}
    if success:
        outcome = "success"
    else:
        outcome = "error"

    TOOL_CALLS_TOTAL.inc(**call_labels, status=outcome)
    TOOL_CALL_DURATION_SECONDS.observe(duration, **call_labels)

    # Nothing more to record for successes or unclassified failures.
    if success or not error_type:
        return
    TOOL_CALL_ERRORS_TOTAL.inc(**call_labels, error_type=error_type)
811
+
812
+
813
def observe_health_check(
    provider: str,
    duration: float,
    healthy: bool,
    is_cold: bool = False,
    consecutive_failures: int = 0,
):
    """Publish the outcome of one health-check probe.

    The probe is labelled ``"cold"`` whenever ``is_cold`` is set (``healthy``
    is ignored in that case); otherwise it is labelled ``"healthy"`` or
    ``"unhealthy"`` according to ``healthy``.

    Args:
        provider: Provider ID
        duration: Health check duration in seconds
        healthy: Whether the check passed (only meaningful if not cold)
        is_cold: Whether provider is in cold state (not started yet)
        consecutive_failures: Number of consecutive failures
    """
    outcome = "cold" if is_cold else ("healthy" if healthy else "unhealthy")

    HEALTH_CHECK_TOTAL.inc(provider=provider, result=outcome)
    HEALTH_CHECK_DURATION_SECONDS.observe(duration, provider=provider)
    HEALTH_CHECK_CONSECUTIVE_FAILURES.set(consecutive_failures, provider=provider)
839
+
840
+
841
def update_provider_state(provider: str, state: str, mode: str = "subprocess"):
    """Refresh all gauges that describe a provider's current state.

    Args:
        provider: Provider ID.
        state: State name. Names missing from the code table below are
            reported with the numeric code 0.
        mode: Provider mode, used as a label on the provider-info gauge.
    """
    state_codes = {"cold": 0, "initializing": 1, "ready": 2, "degraded": 3, "dead": 4}
    numeric_state = state_codes.get(state, 0)
    up_flag = int(state == "ready")  # only "ready" counts as up
    initialized_flag = 0 if state == "cold" else 1  # anything but "cold"

    PROVIDER_STATE_CURRENT.set(numeric_state, provider=provider)
    PROVIDER_UP.set(up_flag, provider=provider)
    PROVIDER_INITIALIZED.set(initialized_flag, provider=provider)
    PROVIDER_INFO.set(1, provider=provider, mode=mode)
    PROVIDER_LAST_STATE_CHANGE_SECONDS.set(time.time(), provider=provider)
849
+
850
+
851
def record_provider_start(provider: str, success: bool):
    """Count a provider start attempt and flag the provider as initialized on success.

    Args:
        provider: Provider ID.
        success: Whether the start attempt succeeded.
    """
    if success:
        outcome = "success"
    else:
        outcome = "failure"
    PROVIDER_STARTS_TOTAL.inc(provider=provider, result=outcome)

    # A failed start leaves the initialized gauge untouched.
    if not success:
        return
    PROVIDER_INITIALIZED.set(1, provider=provider)
857
+
858
+
859
def record_provider_stop(provider: str, reason: str):
    """Count one provider stop.

    Args:
        provider: Provider ID.
        reason: Why the provider stopped; used as the ``reason`` label.
    """
    stop_labels = {"provider": provider, "reason": reason}
    PROVIDER_STOPS_TOTAL.inc(**stop_labels)
862
+
863
+
864
def record_cold_start(provider: str, duration: float, mode: str = "subprocess"):
    """Observe a cold-start duration - the critical UX metric.

    The value is meant to cover the time from the user's request until the
    provider reaches its ready state, so large values translate directly
    into a worse user experience.

    Args:
        provider: Provider ID
        duration: Time in seconds from start to ready
        mode: Provider mode (subprocess, docker, etc.)
    """
    cold_start_labels = {"provider": provider, "mode": mode}
    PROVIDER_COLD_START_SECONDS.observe(duration, **cold_start_labels)
876
+
877
+
878
def cold_start_begin(provider: str):
    """Flag that a cold start is now in progress for ``provider``.

    Sets the in-progress gauge for the provider to 1.
    """
    in_progress = 1
    PROVIDER_COLD_START_IN_PROGRESS.set(in_progress, provider=provider)
881
+
882
+
883
def cold_start_end(provider: str):
    """Flag that the cold start for ``provider`` has finished.

    Resets the in-progress gauge for the provider to 0.
    """
    in_progress = 0
    PROVIDER_COLD_START_IN_PROGRESS.set(in_progress, provider=provider)
886
+
887
+
888
def record_gc_cycle(duration: float, collected: Dict[str, int] = None):
    """Record one garbage-collection cycle.

    Args:
        duration: Cycle duration, observed on the GC duration histogram.
        collected: Optional mapping of collection reason to the number of
            providers collected for that reason. ``None`` or an empty mapping
            records the cycle only.
    """
    GC_CYCLES_TOTAL.inc()
    GC_CYCLE_DURATION_SECONDS.observe(duration)

    if not collected:
        return

    # The counter is bumped once per collected provider.
    # NOTE(review): if Counter.inc accepts an amount, a single call per
    # reason would do - confirm against the Counter class.
    per_provider_reasons = (
        reason for reason, count in collected.items() for _ in range(count)
    )
    for reason in per_provider_reasons:
        GC_PROVIDERS_COLLECTED_TOTAL.inc(reason=reason)
896
+
897
+
898
def record_error(component: str, error_type: str):
    """Count one error.

    Args:
        component: Component in which the error occurred.
        error_type: Classification of the error.
    """
    error_labels = {"component": component, "error_type": error_type}
    ERRORS_TOTAL.inc(**error_labels)
901
+
902
+
903
+ # =============================================================================
904
+ # Discovery Metrics Functions
905
+ # =============================================================================
906
+
907
+
908
def update_discovery_source(source_type: str, mode: str, is_healthy: bool, providers_count: int):
    """Refresh the gauges that describe one discovery source.

    Args:
        source_type: Type of source (filesystem, docker, kubernetes, entrypoint)
        mode: Discovery mode (additive, authoritative)
        is_healthy: Whether the source is healthy
        providers_count: Number of providers discovered by this source
    """
    DISCOVERY_SOURCES_TOTAL.set(1, source_type=source_type, mode=mode)

    if is_healthy:
        health_flag = 1
    else:
        health_flag = 0
    DISCOVERY_SOURCES_HEALTHY.set(health_flag, source_type=source_type)

    # Only the "discovered" status series is updated here.
    DISCOVERY_PROVIDERS_TOTAL.set(providers_count, source_type=source_type, status="discovered")
920
+
921
+
922
def record_discovery_cycle(
    source_type: str,
    duration: float,
    discovered: int = 0,
    registered: int = 0,
    quarantined: int = 0,
):
    """Record that a discovery cycle ran for one source type.

    Increments the cycle counter, observes the cycle duration, stamps the
    last-cycle timestamp with the current time, and overwrites the provider
    gauges for the three provider statuses.

    Args:
        source_type: Type of source
        duration: Duration of the cycle in seconds
        discovered: Number of providers discovered
        registered: Number of providers registered
        quarantined: Number of providers quarantined
    """
    DISCOVERY_CYCLES_TOTAL.inc(source_type=source_type)
    DISCOVERY_CYCLE_DURATION_SECONDS.observe(duration, source_type=source_type)
    DISCOVERY_LAST_CYCLE_TIMESTAMP.set(time.time(), source_type=source_type)

    # Update provider counts, one gauge series per status.
    counts_by_status = (
        ("discovered", discovered),
        ("registered", registered),
        ("quarantined", quarantined),
    )
    for status, count in counts_by_status:
        DISCOVERY_PROVIDERS_TOTAL.set(count, source_type=source_type, status=status)
946
+
947
+
948
def record_discovery_registration(source_type: str):
    """Count one provider registration that came from discovery.

    Args:
        source_type: Type of source that produced the registration.
    """
    registration_labels = {"source_type": source_type}
    DISCOVERY_REGISTRATIONS_TOTAL.inc(**registration_labels)
951
+
952
+
953
def record_discovery_deregistration(source_type: str, reason: str):
    """Count one provider deregistration that came from discovery.

    Args:
        source_type: Type of source
        reason: Reason for deregistration (ttl_expired, source_removed, manual)
    """
    deregistration_labels = {"source_type": source_type, "reason": reason}
    DISCOVERY_DEREGISTRATIONS_TOTAL.inc(**deregistration_labels)
961
+
962
+
963
def record_discovery_conflict(conflict_type: str):
    """Count one discovery conflict.

    Args:
        conflict_type: Type of conflict (static_wins, source_priority)
    """
    conflict_labels = {"conflict_type": conflict_type}
    DISCOVERY_CONFLICTS_TOTAL.inc(**conflict_labels)
970
+
971
+
972
def record_discovery_quarantine(reason: str):
    """Count one provider being placed in quarantine.

    Args:
        reason: Reason for quarantine (health_check_failed, validation_failed, rate_limited)
    """
    quarantine_labels = {"reason": reason}
    DISCOVERY_QUARANTINE_TOTAL.inc(**quarantine_labels)
979
+
980
+
981
def record_discovery_error(source_type: str, error_type: str):
    """Count one discovery error.

    Args:
        source_type: Type of source
        error_type: Type of error
    """
    error_labels = {"source_type": source_type, "error_type": error_type}
    DISCOVERY_ERRORS_TOTAL.inc(**error_labels)
989
+
990
+
991
+ # =============================================================================
992
+ # Timing Decorator
993
+ # =============================================================================
994
+
995
+
996
def timed(histogram: "Histogram", **labels):
    """Build a decorator that records a function's run time on ``histogram``.

    The elapsed time of each call is reported through
    ``histogram.observe(seconds, **labels)``, the same call shape used by the
    other helpers in this module. The observation is made even when the
    wrapped function raises, and the exception then propagates unchanged.

    Args:
        histogram: Histogram that receives one observation per call.
        **labels: Label values passed along with every observation.

    Returns:
        A decorator; the wrapped function keeps its metadata via
        ``functools.wraps`` and returns whatever the original returns.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic, so wall-clock adjustments cannot
            # yield negative or skewed durations.
            started = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                histogram.observe(time.perf_counter() - started, **labels)

        return wrapper

    return decorator