digitalkin 0.3.2.dev7__py3-none-any.whl → 0.3.2.dev10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- digitalkin/__version__.py +1 -1
- digitalkin/grpc_servers/module_servicer.py +0 -11
- digitalkin/grpc_servers/utils/grpc_client_wrapper.py +2 -2
- digitalkin/grpc_servers/utils/utility_schema_extender.py +2 -1
- digitalkin/models/grpc_servers/models.py +91 -6
- digitalkin/models/module/module_context.py +136 -23
- digitalkin/models/module/setup_types.py +177 -260
- digitalkin/models/module/tool_cache.py +27 -187
- digitalkin/models/module/tool_reference.py +42 -45
- digitalkin/models/services/registry.py +0 -7
- digitalkin/modules/_base_module.py +85 -58
- digitalkin/services/registry/__init__.py +1 -1
- digitalkin/services/registry/default_registry.py +1 -1
- digitalkin/services/registry/grpc_registry.py +1 -1
- digitalkin/services/registry/registry_models.py +1 -29
- digitalkin/services/registry/registry_strategy.py +1 -1
- digitalkin/utils/schema_splitter.py +207 -0
- {digitalkin-0.3.2.dev7.dist-info → digitalkin-0.3.2.dev10.dist-info}/METADATA +1 -1
- {digitalkin-0.3.2.dev7.dist-info → digitalkin-0.3.2.dev10.dist-info}/RECORD +29 -22
- {digitalkin-0.3.2.dev7.dist-info → digitalkin-0.3.2.dev10.dist-info}/top_level.txt +1 -0
- modules/archetype_with_tools_module.py +244 -0
- monitoring/digitalkin_observability/__init__.py +46 -0
- monitoring/digitalkin_observability/http_server.py +150 -0
- monitoring/digitalkin_observability/interceptors.py +176 -0
- monitoring/digitalkin_observability/metrics.py +201 -0
- monitoring/digitalkin_observability/prometheus.py +137 -0
- monitoring/tests/test_metrics.py +172 -0
- digitalkin/models/module/module_helpers.py +0 -189
- {digitalkin-0.3.2.dev7.dist-info → digitalkin-0.3.2.dev10.dist-info}/WHEEL +0 -0
- {digitalkin-0.3.2.dev7.dist-info → digitalkin-0.3.2.dev10.dist-info}/licenses/LICENSE +0 -0

monitoring/digitalkin_observability/__init__.py (new file, +46 lines)

@@ -0,0 +1,46 @@
```python
"""Standalone observability module for DigitalKin.

This module can be copied into your project and used independently.
It has no dependencies on the digitalkin package.

Usage:
    from digitalkin_observability import (
        MetricsCollector,
        MetricsServer,
        MetricsServerInterceptor,
        PrometheusExporter,
        get_metrics,
        start_metrics_server,
        stop_metrics_server,
    )

    # Start metrics HTTP server
    start_metrics_server(port=8081)

    # Track metrics
    metrics = get_metrics()
    metrics.inc_jobs_started("my_module")
    metrics.inc_jobs_completed("my_module", duration=1.5)

    # Export to Prometheus format
    print(PrometheusExporter.export())
"""

from digitalkin_observability.http_server import (
    MetricsServer,
    start_metrics_server,
    stop_metrics_server,
)
from digitalkin_observability.interceptors import MetricsServerInterceptor
from digitalkin_observability.metrics import MetricsCollector, get_metrics
from digitalkin_observability.prometheus import PrometheusExporter

__all__ = [
    "MetricsCollector",
    "MetricsServer",
    "MetricsServerInterceptor",
    "PrometheusExporter",
    "get_metrics",
    "start_metrics_server",
    "stop_metrics_server",
]
```
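
The docstring above doubles as a quickstart. For orientation, here it is assembled into a single runnable script; port 8081 and the "demo_module" label are arbitrary example values, and the sketch assumes `digitalkin_observability` is importable from your environment:

```python
# Minimal end-to-end sketch based on the module docstring above.
from digitalkin_observability import (
    PrometheusExporter,
    get_metrics,
    start_metrics_server,
    stop_metrics_server,
)

start_metrics_server(port=8081)            # background thread serving /metrics
metrics = get_metrics()                    # process-wide singleton collector
metrics.inc_jobs_started("demo_module")
metrics.inc_jobs_completed("demo_module", duration=1.5)
print(PrometheusExporter.export())         # Prometheus text exposition format
stop_metrics_server()
```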
monitoring/digitalkin_observability/http_server.py (new file, +150 lines)

@@ -0,0 +1,150 @@
```python
"""Simple HTTP server for exposing Prometheus metrics.

This module provides an HTTP server that exposes metrics at /metrics endpoint.
No external dependencies required beyond Python standard library.
"""

from __future__ import annotations

import logging
from http.server import BaseHTTPRequestHandler, HTTPServer
from threading import Thread
from typing import TYPE_CHECKING, ClassVar

if TYPE_CHECKING:
    from typing import Self

logger = logging.getLogger(__name__)


class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP request handler for metrics endpoint."""

    def do_GET(self) -> None:
        """Handle GET requests."""
        if self.path == "/metrics":
            self._serve_metrics()
        elif self.path == "/health":
            self._serve_health()
        else:
            self.send_error(404, "Not Found")

    def _serve_metrics(self) -> None:
        """Serve Prometheus metrics."""
        from digitalkin_observability.prometheus import PrometheusExporter

        content = PrometheusExporter.export()
        self.send_response(200)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content.encode("utf-8"))

    def _serve_health(self) -> None:
        """Serve health check."""
        content = '{"status": "ok"}'
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content.encode("utf-8"))

    def log_message(self, format: str, *args: object) -> None:
        """Suppress default logging."""


class MetricsServer:
    """HTTP server for exposing metrics to Prometheus.

    Usage:
        server = MetricsServer(port=8081)
        server.start()
        # ... run your application ...
        server.stop()

    Or as context manager:
        with MetricsServer(port=8081):
            # ... run your application ...

    Or as async context manager:
        async with MetricsServer(port=8081):
            # ... run your application ...
    """

    instance: ClassVar["MetricsServer | None"] = None

    def __init__(self, host: str = "0.0.0.0", port: int = 8081) -> None:
        """Initialize the metrics server.

        Args:
            host: Host to bind to (default: 0.0.0.0 for all interfaces).
            port: Port to listen on (default: 8081).
        """
        self.host = host
        self.port = port
        self._server: HTTPServer | None = None
        self._thread: Thread | None = None

    def start(self) -> None:
        """Start the metrics server in a background thread."""
        if self._server is not None:
            logger.warning("Metrics server already running")
            return

        self._server = HTTPServer((self.host, self.port), MetricsHandler)
        self._thread = Thread(target=self._server.serve_forever, daemon=True)
        self._thread.start()
        logger.info(
            "Metrics server started on http://%s:%s/metrics",
            self.host,
            self.port,
        )

    def stop(self) -> None:
        """Stop the metrics server."""
        if self._server is not None:
            self._server.shutdown()
            self._server = None
            self._thread = None
            logger.info("Metrics server stopped")

    async def __aenter__(self) -> "Self":
        """Async context manager entry."""
        self.start()
        return self

    async def __aexit__(self, *args: object) -> None:
        """Async context manager exit."""
        self.stop()

    def __enter__(self) -> "Self":
        """Context manager entry."""
        self.start()
        return self

    def __exit__(self, *args: object) -> None:
        """Context manager exit."""
        self.stop()


def start_metrics_server(host: str = "0.0.0.0", port: int = 8081) -> MetricsServer:
    """Start a metrics server singleton.

    Args:
        host: Host to bind to.
        port: Port to listen on.

    Returns:
        The MetricsServer instance.
    """
    if MetricsServer.instance is None:
        MetricsServer.instance = MetricsServer(host, port)
        MetricsServer.instance.start()
    return MetricsServer.instance


def stop_metrics_server() -> None:
    """Stop the metrics server singleton."""
    if MetricsServer.instance is not None:
        MetricsServer.instance.stop()
        MetricsServer.instance = None
```
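
Both endpoints can be exercised with nothing beyond the standard library. A smoke-test sketch, assuming the package is importable and port 8081 is free locally:

```python
# Smoke test for MetricsServer using only the standard library.
import urllib.request

from digitalkin_observability import MetricsServer

with MetricsServer(host="127.0.0.1", port=8081):
    with urllib.request.urlopen("http://127.0.0.1:8081/health") as resp:
        print(resp.status, resp.read().decode("utf-8"))  # 200 {"status": "ok"}
    with urllib.request.urlopen("http://127.0.0.1:8081/metrics") as resp:
        print(resp.read().decode("utf-8"))               # Prometheus text format
# Leaving the with-block calls stop(), shutting down the background thread.
```

Note the design choice: `serve_forever` runs in a daemon thread, so a crashed application can still exit the interpreter, while orderly shutdown goes through `stop()` on context exit.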
monitoring/digitalkin_observability/interceptors.py (new file, +176 lines)

@@ -0,0 +1,176 @@
```python
"""gRPC interceptors for automatic metrics collection.

This module provides gRPC server interceptors that automatically track
request duration and errors. Requires grpcio package.
"""

from __future__ import annotations

import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable
    from typing import Any

    import grpc

from digitalkin_observability.metrics import get_metrics


class MetricsServerInterceptor:
    """Intercepts all gRPC calls to collect metrics.

    This interceptor automatically tracks:
    - Request duration (histogram)
    - Error counts

    Usage:
        import grpc
        from digitalkin_observability import MetricsServerInterceptor

        interceptors = [MetricsServerInterceptor()]
        server = grpc.aio.server(interceptors=interceptors)
    """

    async def intercept_service(
        self,
        continuation: Callable[["grpc.HandlerCallDetails"], Awaitable["grpc.RpcMethodHandler"]],
        handler_call_details: "grpc.HandlerCallDetails",
    ) -> "grpc.RpcMethodHandler":
        """Intercept a gRPC service call to collect metrics.

        Args:
            continuation: The next interceptor or the actual handler.
            handler_call_details: Details about the call being intercepted.

        Returns:
            The RPC method handler.
        """
        start = time.perf_counter()
        metrics = get_metrics()

        try:
            handler = await continuation(handler_call_details)
            return _MetricsWrappedHandler(handler, start, handler_call_details.method)
        except Exception:
            metrics.inc_errors()
            metrics.observe_grpc_duration(time.perf_counter() - start)
            raise


class _MetricsWrappedHandler:
    """Wrapper that measures actual handler execution time."""

    def __init__(
        self,
        handler: "grpc.RpcMethodHandler",
        start_time: float,
        method: str,
    ) -> None:
        self._handler = handler
        self._start_time = start_time
        self._method = method

        # Copy attributes from original handler
        self.request_streaming = handler.request_streaming
        self.response_streaming = handler.response_streaming
        self.request_deserializer = handler.request_deserializer
        self.response_serializer = handler.response_serializer

        # Wrap the appropriate method based on streaming type
        if handler.unary_unary:
            self.unary_unary = self._wrap_unary_unary(handler.unary_unary)
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = None
        elif handler.unary_stream:
            self.unary_unary = None
            self.unary_stream = self._wrap_unary_stream(handler.unary_stream)
            self.stream_unary = None
            self.stream_stream = None
        elif handler.stream_unary:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = self._wrap_stream_unary(handler.stream_unary)
            self.stream_stream = None
        elif handler.stream_stream:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = self._wrap_stream_stream(handler.stream_stream)
        else:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = None

    def _wrap_unary_unary(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]]:
        """Wrap a unary-unary handler."""
        async def wrapped(request: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                return await handler(request, context)
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_unary_stream(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], "Any"],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], "Any"]:
        """Wrap a unary-stream handler."""
        async def wrapped(request: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                async for response in handler(request, context):
                    yield response
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_stream_unary(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]]:
        """Wrap a stream-unary handler."""
        async def wrapped(request_iterator: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                return await handler(request_iterator, context)
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_stream_stream(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], "Any"],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], "Any"]:
        """Wrap a stream-stream handler."""
        async def wrapped(request_iterator: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                async for response in handler(request_iterator, context):
                    yield response
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped
```
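
The docstring's usage snippet, expanded into a minimal runnable server. The servicer registration line is hypothetical (it depends on your generated stubs), and the ports are arbitrary example values:

```python
# Wiring sketch: metrics interceptor plus the HTTP exporter from above.
import asyncio

import grpc

from digitalkin_observability import MetricsServerInterceptor, start_metrics_server


async def serve() -> None:
    start_metrics_server(port=8081)  # expose what the interceptor records
    server = grpc.aio.server(interceptors=[MetricsServerInterceptor()])
    # add_YourServicer_to_server(YourServicer(), server)  # hypothetical stub
    server.add_insecure_port("[::]:50051")
    await server.start()
    await server.wait_for_termination()


asyncio.run(serve())
```

One timing caveat visible in the code: `_start_time` is captured at interception, so for streaming RPCs the observed duration covers the full life of the stream, not individual messages.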
monitoring/digitalkin_observability/metrics.py (new file, +201 lines)

@@ -0,0 +1,201 @@
```python
"""Core metrics collection for DigitalKin.

This module provides a thread-safe singleton MetricsCollector that tracks
various metrics about job execution, gRPC requests, and system performance.

No external dependencies required.
"""

from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass, field
from threading import Lock
from typing import TYPE_CHECKING, ClassVar

if TYPE_CHECKING:
    from typing import Any


@dataclass
class Histogram:
    """Simple histogram with configurable buckets."""

    buckets: tuple[float, ...] = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
    counts: dict[float, int] = field(default_factory=lambda: defaultdict(int))
    total_sum: float = 0.0
    count: int = 0

    def observe(self, value: float) -> None:
        """Record an observation in the histogram."""
        self.total_sum += value
        self.count += 1
        for bucket in self.buckets:
            if value <= bucket:
                self.counts[bucket] += 1

    def reset(self) -> None:
        """Reset histogram state."""
        self.counts = defaultdict(int)
        self.total_sum = 0.0
        self.count = 0


class MetricsCollector:
    """Thread-safe singleton metrics collector.

    Collects various metrics about job execution, gRPC requests,
    and system performance. Designed to be stateless per-request
    while maintaining aggregate counters.

    Usage:
        metrics = MetricsCollector()  # or get_metrics()
        metrics.inc_jobs_started("my_module")
        metrics.inc_jobs_completed("my_module", duration=1.5)
        print(metrics.snapshot())
    """

    _instance: ClassVar[MetricsCollector | None] = None
    _lock: ClassVar[Lock] = Lock()

    def __new__(cls) -> "MetricsCollector":
        """Create or return the singleton instance."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._init_metrics()
                    cls._instance = instance
        return cls._instance

    def _init_metrics(self) -> None:
        """Initialize all metric storage."""
        # Counters
        self.jobs_started_total: int = 0
        self.jobs_completed_total: int = 0
        self.jobs_failed_total: int = 0
        self.jobs_cancelled_total: int = 0
        self.messages_sent_total: int = 0
        self.heartbeats_sent_total: int = 0
        self.errors_total: int = 0

        # Gauges
        self.active_jobs: int = 0
        self.active_connections: int = 0
        self.queue_depth: dict[str, int] = {}

        # Histograms
        self.job_duration_seconds = Histogram()
        self.message_latency_seconds = Histogram()
        self.grpc_request_duration_seconds = Histogram()

        # Labels for breakdown
        self._by_module: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
        self._by_protocol: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

        # Instance lock for thread safety
        self._instance_lock = Lock()

    def inc_jobs_started(self, module_name: str) -> None:
        """Increment jobs started counter."""
        with self._instance_lock:
            self.jobs_started_total += 1
            self.active_jobs += 1
            self._by_module[module_name]["started"] += 1

    def inc_jobs_completed(self, module_name: str, duration: float) -> None:
        """Increment jobs completed counter and record duration."""
        with self._instance_lock:
            self.jobs_completed_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["completed"] += 1
            self.job_duration_seconds.observe(duration)

    def inc_jobs_failed(self, module_name: str) -> None:
        """Increment jobs failed counter."""
        with self._instance_lock:
            self.jobs_failed_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["failed"] += 1

    def inc_jobs_cancelled(self, module_name: str) -> None:
        """Increment jobs cancelled counter."""
        with self._instance_lock:
            self.jobs_cancelled_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["cancelled"] += 1

    def inc_messages_sent(self, protocol: str | None = None) -> None:
        """Increment messages sent counter."""
        with self._instance_lock:
            self.messages_sent_total += 1
            if protocol:
                self._by_protocol[protocol]["messages"] += 1

    def inc_heartbeats_sent(self) -> None:
        """Increment heartbeats sent counter."""
        with self._instance_lock:
            self.heartbeats_sent_total += 1

    def inc_errors(self) -> None:
        """Increment errors counter."""
        with self._instance_lock:
            self.errors_total += 1

    def set_queue_depth(self, job_id: str, depth: int) -> None:
        """Set the queue depth for a job."""
        with self._instance_lock:
            self.queue_depth[job_id] = depth

    def clear_queue_depth(self, job_id: str) -> None:
        """Clear queue depth tracking for a job."""
        with self._instance_lock:
            self.queue_depth.pop(job_id, None)

    def observe_grpc_duration(self, duration: float) -> None:
        """Record a gRPC request duration."""
        with self._instance_lock:
            self.grpc_request_duration_seconds.observe(duration)

    def observe_message_latency(self, latency: float) -> None:
        """Record a message latency."""
        with self._instance_lock:
            self.message_latency_seconds.observe(latency)

    def snapshot(self) -> dict[str, Any]:
        """Return current metrics as dict for export."""
        with self._instance_lock:
            return {
                "jobs_started_total": self.jobs_started_total,
                "jobs_completed_total": self.jobs_completed_total,
                "jobs_failed_total": self.jobs_failed_total,
                "jobs_cancelled_total": self.jobs_cancelled_total,
                "active_jobs": self.active_jobs,
                "messages_sent_total": self.messages_sent_total,
                "heartbeats_sent_total": self.heartbeats_sent_total,
                "errors_total": self.errors_total,
                "active_connections": self.active_connections,
                "total_queue_depth": sum(self.queue_depth.values()),
                "job_duration_seconds": {
                    "count": self.job_duration_seconds.count,
                    "sum": self.job_duration_seconds.total_sum,
                    "buckets": dict(self.job_duration_seconds.counts),
                },
                "grpc_request_duration_seconds": {
                    "count": self.grpc_request_duration_seconds.count,
                    "sum": self.grpc_request_duration_seconds.total_sum,
                    "buckets": dict(self.grpc_request_duration_seconds.counts),
                },
                "by_module": {k: dict(v) for k, v in self._by_module.items()},
                "by_protocol": {k: dict(v) for k, v in self._by_protocol.items()},
            }

    def reset(self) -> None:
        """Reset all metrics. Useful for testing."""
        with self._instance_lock:
            self._init_metrics()


def get_metrics() -> MetricsCollector:
    """Get the global MetricsCollector instance."""
    return MetricsCollector()
```
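
One detail worth calling out: `Histogram.observe()` increments every bucket whose upper bound is at or above the value, so bucket counts are cumulative in the Prometheus style. A quick illustration with arbitrary values:

```python
# Illustrates the cumulative (Prometheus-style) bucket counts of Histogram.
from digitalkin_observability.metrics import Histogram

h = Histogram()
h.observe(0.03)  # counted in the 0.05 bucket and every larger one
h.observe(2.0)   # counted in the 2.5, 5.0, and 10.0 buckets

print(h.count, h.total_sum)  # 2 2.03
print(h.counts[0.05])        # 1 -> only 0.03 is <= 0.05
print(h.counts[2.5])         # 2 -> both observations are <= 2.5
```

Note also that `MetricsCollector()` always returns the same object (double-checked locking in `__new__`), so `get_metrics()` is a readable alias rather than a factory.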