digitalkin 0.3.1.dev2__py3-none-any.whl → 0.3.2.dev14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- base_server/server_async_insecure.py +6 -5
- base_server/server_async_secure.py +6 -5
- base_server/server_sync_insecure.py +5 -4
- base_server/server_sync_secure.py +5 -4
- digitalkin/__version__.py +1 -1
- digitalkin/core/job_manager/base_job_manager.py +1 -1
- digitalkin/core/job_manager/single_job_manager.py +28 -9
- digitalkin/core/job_manager/taskiq_broker.py +7 -6
- digitalkin/core/job_manager/taskiq_job_manager.py +1 -1
- digitalkin/core/task_manager/surrealdb_repository.py +7 -7
- digitalkin/core/task_manager/task_session.py +60 -98
- digitalkin/grpc_servers/module_server.py +109 -168
- digitalkin/grpc_servers/module_servicer.py +38 -16
- digitalkin/grpc_servers/utils/grpc_client_wrapper.py +24 -8
- digitalkin/grpc_servers/utils/utility_schema_extender.py +100 -0
- digitalkin/models/__init__.py +1 -1
- digitalkin/models/core/job_manager_models.py +0 -8
- digitalkin/models/core/task_monitor.py +4 -0
- digitalkin/models/grpc_servers/models.py +91 -6
- digitalkin/models/module/__init__.py +18 -13
- digitalkin/models/module/base_types.py +61 -0
- digitalkin/models/module/module_context.py +173 -13
- digitalkin/models/module/module_types.py +28 -392
- digitalkin/models/module/setup_types.py +490 -0
- digitalkin/models/module/tool_cache.py +68 -0
- digitalkin/models/module/tool_reference.py +117 -0
- digitalkin/models/module/utility.py +167 -0
- digitalkin/models/services/registry.py +35 -0
- digitalkin/modules/__init__.py +5 -1
- digitalkin/modules/_base_module.py +154 -61
- digitalkin/modules/archetype_module.py +6 -1
- digitalkin/modules/tool_module.py +6 -1
- digitalkin/modules/triggers/__init__.py +8 -0
- digitalkin/modules/triggers/healthcheck_ping_trigger.py +45 -0
- digitalkin/modules/triggers/healthcheck_services_trigger.py +63 -0
- digitalkin/modules/triggers/healthcheck_status_trigger.py +52 -0
- digitalkin/services/__init__.py +4 -0
- digitalkin/services/communication/__init__.py +7 -0
- digitalkin/services/communication/communication_strategy.py +76 -0
- digitalkin/services/communication/default_communication.py +101 -0
- digitalkin/services/communication/grpc_communication.py +234 -0
- digitalkin/services/cost/grpc_cost.py +1 -1
- digitalkin/services/filesystem/grpc_filesystem.py +1 -1
- digitalkin/services/registry/__init__.py +22 -1
- digitalkin/services/registry/default_registry.py +135 -4
- digitalkin/services/registry/exceptions.py +47 -0
- digitalkin/services/registry/grpc_registry.py +306 -0
- digitalkin/services/registry/registry_models.py +15 -0
- digitalkin/services/registry/registry_strategy.py +88 -4
- digitalkin/services/services_config.py +25 -3
- digitalkin/services/services_models.py +5 -1
- digitalkin/services/setup/default_setup.py +1 -1
- digitalkin/services/setup/grpc_setup.py +1 -1
- digitalkin/services/storage/grpc_storage.py +1 -1
- digitalkin/services/user_profile/__init__.py +11 -0
- digitalkin/services/user_profile/grpc_user_profile.py +2 -2
- digitalkin/services/user_profile/user_profile_strategy.py +0 -15
- digitalkin/utils/schema_splitter.py +207 -0
- {digitalkin-0.3.1.dev2.dist-info → digitalkin-0.3.2.dev14.dist-info}/METADATA +5 -5
- digitalkin-0.3.2.dev14.dist-info/RECORD +143 -0
- {digitalkin-0.3.1.dev2.dist-info → digitalkin-0.3.2.dev14.dist-info}/top_level.txt +1 -0
- modules/archetype_with_tools_module.py +244 -0
- modules/cpu_intensive_module.py +1 -1
- modules/dynamic_setup_module.py +5 -29
- modules/minimal_llm_module.py +1 -1
- modules/text_transform_module.py +1 -1
- monitoring/digitalkin_observability/__init__.py +46 -0
- monitoring/digitalkin_observability/http_server.py +150 -0
- monitoring/digitalkin_observability/interceptors.py +176 -0
- monitoring/digitalkin_observability/metrics.py +201 -0
- monitoring/digitalkin_observability/prometheus.py +137 -0
- monitoring/tests/test_metrics.py +172 -0
- services/filesystem_module.py +7 -5
- services/storage_module.py +4 -2
- digitalkin/grpc_servers/registry_server.py +0 -65
- digitalkin/grpc_servers/registry_servicer.py +0 -456
- digitalkin-0.3.1.dev2.dist-info/RECORD +0 -119
- {digitalkin-0.3.1.dev2.dist-info → digitalkin-0.3.2.dev14.dist-info}/WHEEL +0 -0
- {digitalkin-0.3.1.dev2.dist-info → digitalkin-0.3.2.dev14.dist-info}/licenses/LICENSE +0 -0

monitoring/digitalkin_observability/interceptors.py
@@ -0,0 +1,176 @@

```python
"""gRPC interceptors for automatic metrics collection.

This module provides gRPC server interceptors that automatically track
request duration and errors. Requires grpcio package.
"""

from __future__ import annotations

import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Awaitable, Callable
    from typing import Any

    import grpc

from digitalkin_observability.metrics import get_metrics


class MetricsServerInterceptor:
    """Intercepts all gRPC calls to collect metrics.

    This interceptor automatically tracks:
    - Request duration (histogram)
    - Error counts

    Usage:
        import grpc
        from digitalkin_observability import MetricsServerInterceptor

        interceptors = [MetricsServerInterceptor()]
        server = grpc.aio.server(interceptors=interceptors)
    """

    async def intercept_service(
        self,
        continuation: Callable[["grpc.HandlerCallDetails"], Awaitable["grpc.RpcMethodHandler"]],
        handler_call_details: "grpc.HandlerCallDetails",
    ) -> "grpc.RpcMethodHandler":
        """Intercept a gRPC service call to collect metrics.

        Args:
            continuation: The next interceptor or the actual handler.
            handler_call_details: Details about the call being intercepted.

        Returns:
            The RPC method handler.
        """
        start = time.perf_counter()
        metrics = get_metrics()

        try:
            handler = await continuation(handler_call_details)
            return _MetricsWrappedHandler(handler, start, handler_call_details.method)
        except Exception:
            metrics.inc_errors()
            metrics.observe_grpc_duration(time.perf_counter() - start)
            raise


class _MetricsWrappedHandler:
    """Wrapper that measures actual handler execution time."""

    def __init__(
        self,
        handler: "grpc.RpcMethodHandler",
        start_time: float,
        method: str,
    ) -> None:
        self._handler = handler
        self._start_time = start_time
        self._method = method

        # Copy attributes from original handler
        self.request_streaming = handler.request_streaming
        self.response_streaming = handler.response_streaming
        self.request_deserializer = handler.request_deserializer
        self.response_serializer = handler.response_serializer

        # Wrap the appropriate method based on streaming type
        if handler.unary_unary:
            self.unary_unary = self._wrap_unary_unary(handler.unary_unary)
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = None
        elif handler.unary_stream:
            self.unary_unary = None
            self.unary_stream = self._wrap_unary_stream(handler.unary_stream)
            self.stream_unary = None
            self.stream_stream = None
        elif handler.stream_unary:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = self._wrap_stream_unary(handler.stream_unary)
            self.stream_stream = None
        elif handler.stream_stream:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = self._wrap_stream_stream(handler.stream_stream)
        else:
            self.unary_unary = None
            self.unary_stream = None
            self.stream_unary = None
            self.stream_stream = None

    def _wrap_unary_unary(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]]:
        """Wrap a unary-unary handler."""
        async def wrapped(request: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                return await handler(request, context)
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_unary_stream(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], "Any"],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], "Any"]:
        """Wrap a unary-stream handler."""
        async def wrapped(request: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                async for response in handler(request, context):
                    yield response
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_stream_unary(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], Awaitable["Any"]]:
        """Wrap a stream-unary handler."""
        async def wrapped(request_iterator: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                return await handler(request_iterator, context)
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped

    def _wrap_stream_stream(
        self,
        handler: Callable[["Any", "grpc.aio.ServicerContext"], "Any"],
    ) -> Callable[["Any", "grpc.aio.ServicerContext"], "Any"]:
        """Wrap a stream-stream handler."""
        async def wrapped(request_iterator: "Any", context: "grpc.aio.ServicerContext") -> "Any":
            metrics = get_metrics()
            try:
                async for response in handler(request_iterator, context):
                    yield response
            except Exception:
                metrics.inc_errors()
                raise
            finally:
                metrics.observe_grpc_duration(time.perf_counter() - self._start_time)

        return wrapped
```

monitoring/digitalkin_observability/metrics.py
@@ -0,0 +1,201 @@

```python
"""Core metrics collection for DigitalKin.

This module provides a thread-safe singleton MetricsCollector that tracks
various metrics about job execution, gRPC requests, and system performance.

No external dependencies required.
"""

from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass, field
from threading import Lock
from typing import TYPE_CHECKING, ClassVar

if TYPE_CHECKING:
    from typing import Any


@dataclass
class Histogram:
    """Simple histogram with configurable buckets."""

    buckets: tuple[float, ...] = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
    counts: dict[float, int] = field(default_factory=lambda: defaultdict(int))
    total_sum: float = 0.0
    count: int = 0

    def observe(self, value: float) -> None:
        """Record an observation in the histogram."""
        self.total_sum += value
        self.count += 1
        for bucket in self.buckets:
            if value <= bucket:
                # Count only the smallest matching bucket; the Prometheus
                # exporter accumulates these into cumulative le-bucket counts.
                self.counts[bucket] += 1
                break

    def reset(self) -> None:
        """Reset histogram state."""
        self.counts = defaultdict(int)
        self.total_sum = 0.0
        self.count = 0


class MetricsCollector:
    """Thread-safe singleton metrics collector.

    Collects various metrics about job execution, gRPC requests,
    and system performance. Designed to be stateless per-request
    while maintaining aggregate counters.

    Usage:
        metrics = MetricsCollector()  # or get_metrics()
        metrics.inc_jobs_started("my_module")
        metrics.inc_jobs_completed("my_module", duration=1.5)
        print(metrics.snapshot())
    """

    _instance: ClassVar[MetricsCollector | None] = None
    _lock: ClassVar[Lock] = Lock()

    def __new__(cls) -> "MetricsCollector":
        """Create or return the singleton instance."""
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._init_metrics()
                    cls._instance = instance
        return cls._instance

    def _init_metrics(self) -> None:
        """Initialize all metric storage."""
        # Counters
        self.jobs_started_total: int = 0
        self.jobs_completed_total: int = 0
        self.jobs_failed_total: int = 0
        self.jobs_cancelled_total: int = 0
        self.messages_sent_total: int = 0
        self.heartbeats_sent_total: int = 0
        self.errors_total: int = 0

        # Gauges
        self.active_jobs: int = 0
        self.active_connections: int = 0
        self.queue_depth: dict[str, int] = {}

        # Histograms
        self.job_duration_seconds = Histogram()
        self.message_latency_seconds = Histogram()
        self.grpc_request_duration_seconds = Histogram()

        # Labels for breakdown
        self._by_module: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
        self._by_protocol: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))

        # Instance lock for thread safety
        self._instance_lock = Lock()

    def inc_jobs_started(self, module_name: str) -> None:
        """Increment jobs started counter."""
        with self._instance_lock:
            self.jobs_started_total += 1
            self.active_jobs += 1
            self._by_module[module_name]["started"] += 1

    def inc_jobs_completed(self, module_name: str, duration: float) -> None:
        """Increment jobs completed counter and record duration."""
        with self._instance_lock:
            self.jobs_completed_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["completed"] += 1
            self.job_duration_seconds.observe(duration)

    def inc_jobs_failed(self, module_name: str) -> None:
        """Increment jobs failed counter."""
        with self._instance_lock:
            self.jobs_failed_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["failed"] += 1

    def inc_jobs_cancelled(self, module_name: str) -> None:
        """Increment jobs cancelled counter."""
        with self._instance_lock:
            self.jobs_cancelled_total += 1
            self.active_jobs = max(0, self.active_jobs - 1)
            self._by_module[module_name]["cancelled"] += 1

    def inc_messages_sent(self, protocol: str | None = None) -> None:
        """Increment messages sent counter."""
        with self._instance_lock:
            self.messages_sent_total += 1
            if protocol:
                self._by_protocol[protocol]["messages"] += 1

    def inc_heartbeats_sent(self) -> None:
        """Increment heartbeats sent counter."""
        with self._instance_lock:
            self.heartbeats_sent_total += 1

    def inc_errors(self) -> None:
        """Increment errors counter."""
        with self._instance_lock:
            self.errors_total += 1

    def set_queue_depth(self, job_id: str, depth: int) -> None:
        """Set the queue depth for a job."""
        with self._instance_lock:
            self.queue_depth[job_id] = depth

    def clear_queue_depth(self, job_id: str) -> None:
        """Clear queue depth tracking for a job."""
        with self._instance_lock:
            self.queue_depth.pop(job_id, None)

    def observe_grpc_duration(self, duration: float) -> None:
        """Record a gRPC request duration."""
        with self._instance_lock:
            self.grpc_request_duration_seconds.observe(duration)

    def observe_message_latency(self, latency: float) -> None:
        """Record a message latency."""
        with self._instance_lock:
            self.message_latency_seconds.observe(latency)

    def snapshot(self) -> dict[str, Any]:
        """Return current metrics as dict for export."""
        with self._instance_lock:
            return {
                "jobs_started_total": self.jobs_started_total,
                "jobs_completed_total": self.jobs_completed_total,
                "jobs_failed_total": self.jobs_failed_total,
                "jobs_cancelled_total": self.jobs_cancelled_total,
                "active_jobs": self.active_jobs,
                "messages_sent_total": self.messages_sent_total,
                "heartbeats_sent_total": self.heartbeats_sent_total,
                "errors_total": self.errors_total,
                "active_connections": self.active_connections,
                "total_queue_depth": sum(self.queue_depth.values()),
                "job_duration_seconds": {
                    "count": self.job_duration_seconds.count,
                    "sum": self.job_duration_seconds.total_sum,
                    "buckets": dict(self.job_duration_seconds.counts),
                },
                "grpc_request_duration_seconds": {
                    "count": self.grpc_request_duration_seconds.count,
                    "sum": self.grpc_request_duration_seconds.total_sum,
                    "buckets": dict(self.grpc_request_duration_seconds.counts),
                },
                "by_module": {k: dict(v) for k, v in self._by_module.items()},
                "by_protocol": {k: dict(v) for k, v in self._by_protocol.items()},
            }

    def reset(self) -> None:
        """Reset all metrics. Useful for testing."""
        with self._instance_lock:
            self._init_metrics()


def get_metrics() -> MetricsCollector:
    """Get the global MetricsCollector instance."""
    return MetricsCollector()
```
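
Because `MetricsCollector.__new__` uses double-checked locking, every call site shares a single process-wide instance, so producers and exporters never need to pass a collector around. A short sketch exercising the API above (the module name `"demo"` is illustrative):

```python
from digitalkin_observability.metrics import get_metrics

metrics = get_metrics()
metrics.reset()  # start from a clean slate, as the test suite does

# Record one job lifecycle under an illustrative module name.
metrics.inc_jobs_started("demo")
metrics.inc_jobs_completed("demo", duration=0.42)

snap = metrics.snapshot()
assert snap["active_jobs"] == 0
assert snap["by_module"]["demo"] == {"started": 1, "completed": 1}

# Histograms export count, sum, and per-bucket counts.
print(snap["job_duration_seconds"])  # {'count': 1, 'sum': 0.42, 'buckets': {0.5: 1}}
```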

monitoring/digitalkin_observability/prometheus.py
@@ -0,0 +1,137 @@

```python
"""Prometheus metrics exporter for DigitalKin.

This module exports metrics in Prometheus text exposition format.
No external dependencies required.
"""

from __future__ import annotations

from digitalkin_observability.metrics import get_metrics


class PrometheusExporter:
    """Exports metrics in Prometheus text format.

    Usage:
        output = PrometheusExporter.export()
        # Returns Prometheus-compatible text format
    """

    @staticmethod
    def export() -> str:
        """Generate Prometheus-compatible metrics output."""
        snapshot = get_metrics().snapshot()
        lines: list[str] = []

        # Counters
        lines.extend([
            "# HELP digitalkin_jobs_started_total Total jobs started",
            "# TYPE digitalkin_jobs_started_total counter",
            f"digitalkin_jobs_started_total {snapshot['jobs_started_total']}",
            "",
            "# HELP digitalkin_jobs_completed_total Total jobs completed successfully",
            "# TYPE digitalkin_jobs_completed_total counter",
            f"digitalkin_jobs_completed_total {snapshot['jobs_completed_total']}",
            "",
            "# HELP digitalkin_jobs_failed_total Total jobs failed",
            "# TYPE digitalkin_jobs_failed_total counter",
            f"digitalkin_jobs_failed_total {snapshot['jobs_failed_total']}",
            "",
            "# HELP digitalkin_jobs_cancelled_total Total jobs cancelled",
            "# TYPE digitalkin_jobs_cancelled_total counter",
            f"digitalkin_jobs_cancelled_total {snapshot['jobs_cancelled_total']}",
            "",
            "# HELP digitalkin_messages_sent_total Total messages sent",
            "# TYPE digitalkin_messages_sent_total counter",
            f"digitalkin_messages_sent_total {snapshot['messages_sent_total']}",
            "",
            "# HELP digitalkin_heartbeats_sent_total Total heartbeats sent",
            "# TYPE digitalkin_heartbeats_sent_total counter",
            f"digitalkin_heartbeats_sent_total {snapshot['heartbeats_sent_total']}",
            "",
            "# HELP digitalkin_errors_total Total errors",
            "# TYPE digitalkin_errors_total counter",
            f"digitalkin_errors_total {snapshot['errors_total']}",
            "",
        ])

        # Gauges
        lines.extend([
            "# HELP digitalkin_active_jobs Current number of active jobs",
            "# TYPE digitalkin_active_jobs gauge",
            f"digitalkin_active_jobs {snapshot['active_jobs']}",
            "",
            "# HELP digitalkin_active_connections Current number of active connections",
            "# TYPE digitalkin_active_connections gauge",
            f"digitalkin_active_connections {snapshot['active_connections']}",
            "",
            "# HELP digitalkin_total_queue_depth Total items in all job queues",
            "# TYPE digitalkin_total_queue_depth gauge",
            f"digitalkin_total_queue_depth {snapshot['total_queue_depth']}",
            "",
        ])

        # Job duration histogram
        lines.extend(PrometheusExporter._format_histogram(
            "digitalkin_job_duration_seconds",
            "Job execution duration in seconds",
            snapshot["job_duration_seconds"],
        ))

        # gRPC request duration histogram
        lines.extend(PrometheusExporter._format_histogram(
            "digitalkin_grpc_request_duration_seconds",
            "gRPC request duration in seconds",
            snapshot["grpc_request_duration_seconds"],
        ))

        # Per-module breakdown
        if snapshot["by_module"]:
            lines.extend([
                "",
                "# HELP digitalkin_jobs_by_module Jobs breakdown by module and status",
                "# TYPE digitalkin_jobs_by_module counter",
            ])
            for module_name, counts in snapshot["by_module"].items():
                for status, value in counts.items():
                    lines.append(
                        f'digitalkin_jobs_by_module{{module="{module_name}",status="{status}"}} {value}'
                    )

        # Per-protocol breakdown
        if snapshot["by_protocol"]:
            lines.extend([
                "",
                "# HELP digitalkin_messages_by_protocol Messages breakdown by protocol",
                "# TYPE digitalkin_messages_by_protocol counter",
            ])
            for protocol, counts in snapshot["by_protocol"].items():
                for metric, value in counts.items():
                    lines.append(
                        f'digitalkin_messages_by_protocol{{protocol="{protocol}",metric="{metric}"}} {value}'
                    )

        return "\n".join(lines)

    @staticmethod
    def _format_histogram(name: str, help_text: str, data: dict) -> list[str]:
        """Format a histogram for Prometheus output."""
        lines = [
            "",
            f"# HELP {name} {help_text}",
            f"# TYPE {name} histogram",
        ]

        # Sort buckets and output cumulative counts
        cumulative = 0
        for bucket in sorted(data.get("buckets", {}).keys()):
            cumulative += data["buckets"][bucket]
            lines.append(f'{name}_bucket{{le="{bucket}"}} {cumulative}')

        lines.extend([
            f'{name}_bucket{{le="+Inf"}} {data.get("count", 0)}',
            f'{name}_sum {data.get("sum", 0)}',
            f'{name}_count {data.get("count", 0)}',
        ])

        return lines
```
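
The exporter returns a plain string, so it can be served by any HTTP stack. The package ships its own `digitalkin_observability/http_server.py` (150 new lines in this release), whose API is not shown in this diff; a stdlib-only sketch of the same idea looks like this (the port and handler name are arbitrary choices for the sketch):

```python
from http.server import BaseHTTPRequestHandler, HTTPServer

from digitalkin_observability import PrometheusExporter


class MetricsHandler(BaseHTTPRequestHandler):
    """Serve PrometheusExporter.export() on /metrics."""

    def do_GET(self) -> None:
        if self.path != "/metrics":
            self.send_error(404)
            return
        body = PrometheusExporter.export().encode("utf-8")
        self.send_response(200)
        # Content type of the Prometheus text exposition format, version 0.0.4.
        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 9100), MetricsHandler).serve_forever()
```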

monitoring/tests/test_metrics.py
@@ -0,0 +1,172 @@

```python
"""Tests for metrics collection.

Run with: python -m pytest tests/test_metrics.py
"""

import sys
from pathlib import Path

import pytest

# Add the parent directory to the path so we can import digitalkin_observability
sys.path.insert(0, str(Path(__file__).parent.parent))

from digitalkin_observability import MetricsCollector, PrometheusExporter, get_metrics


class TestMetricsCollector:
    """Tests for MetricsCollector singleton."""

    def setup_method(self) -> None:
        """Reset metrics before each test."""
        get_metrics().reset()

    def test_singleton_returns_same_instance(self) -> None:
        """Test that get_metrics returns the same instance."""
        m1 = get_metrics()
        m2 = get_metrics()
        assert m1 is m2

    def test_inc_jobs_started(self) -> None:
        """Test incrementing jobs started counter."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")

        assert metrics.jobs_started_total == 1
        assert metrics.active_jobs == 1

    def test_inc_jobs_completed(self) -> None:
        """Test incrementing jobs completed counter."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")
        metrics.inc_jobs_completed("TestModule", 1.5)

        assert metrics.jobs_completed_total == 1
        assert metrics.active_jobs == 0
        assert metrics.job_duration_seconds.count == 1
        assert metrics.job_duration_seconds.total_sum == 1.5

    def test_inc_jobs_failed(self) -> None:
        """Test incrementing jobs failed counter."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")
        metrics.inc_jobs_failed("TestModule")

        assert metrics.jobs_failed_total == 1
        assert metrics.active_jobs == 0

    def test_inc_jobs_cancelled(self) -> None:
        """Test incrementing jobs cancelled counter."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")
        metrics.inc_jobs_cancelled("TestModule")

        assert metrics.jobs_cancelled_total == 1
        assert metrics.active_jobs == 0

    def test_inc_messages_sent(self) -> None:
        """Test incrementing messages sent counter."""
        metrics = get_metrics()
        metrics.inc_messages_sent("message")
        metrics.inc_messages_sent("file")
        metrics.inc_messages_sent()

        assert metrics.messages_sent_total == 3

    def test_queue_depth_tracking(self) -> None:
        """Test queue depth tracking."""
        metrics = get_metrics()
        metrics.set_queue_depth("job1", 5)
        metrics.set_queue_depth("job2", 3)

        assert metrics.queue_depth["job1"] == 5
        assert metrics.queue_depth["job2"] == 3

        metrics.clear_queue_depth("job1")
        assert "job1" not in metrics.queue_depth

    def test_snapshot(self) -> None:
        """Test snapshot returns all metrics."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")
        metrics.inc_jobs_completed("TestModule", 0.5)
        metrics.inc_messages_sent("message")

        snapshot = metrics.snapshot()

        assert snapshot["jobs_started_total"] == 1
        assert snapshot["jobs_completed_total"] == 1
        assert snapshot["messages_sent_total"] == 1
        assert "job_duration_seconds" in snapshot
        assert "by_module" in snapshot
        assert "TestModule" in snapshot["by_module"]

    def test_histogram_observe(self) -> None:
        """Test histogram observations."""
        metrics = get_metrics()
        metrics.observe_grpc_duration(0.05)
        metrics.observe_grpc_duration(0.15)

        assert metrics.grpc_request_duration_seconds.count == 2
        assert metrics.grpc_request_duration_seconds.total_sum == pytest.approx(0.2)

    def test_reset_clears_all_metrics(self) -> None:
        """Test reset clears all metrics."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")
        metrics.inc_errors()

        metrics.reset()

        assert metrics.jobs_started_total == 0
        assert metrics.errors_total == 0
        assert metrics.active_jobs == 0


class TestPrometheusExporter:
    """Tests for Prometheus exporter."""

    def setup_method(self) -> None:
        """Reset metrics before each test."""
        get_metrics().reset()

    def test_export_returns_string(self) -> None:
        """Test that export returns a string."""
        output = PrometheusExporter.export()
        assert isinstance(output, str)

    def test_export_contains_job_counters(self) -> None:
        """Test export contains job counters."""
        metrics = get_metrics()
        metrics.inc_jobs_started("TestModule")

        output = PrometheusExporter.export()

        assert "digitalkin_jobs_started_total 1" in output
        assert "digitalkin_active_jobs 1" in output

    def test_export_contains_histogram(self) -> None:
        """Test export contains histogram data."""
        metrics = get_metrics()
        metrics.observe_grpc_duration(0.05)

        output = PrometheusExporter.export()

        assert "digitalkin_grpc_request_duration_seconds" in output
        assert "# TYPE digitalkin_grpc_request_duration_seconds histogram" in output

    def test_export_contains_module_breakdown(self) -> None:
        """Test export contains per-module breakdown."""
        metrics = get_metrics()
        metrics.inc_jobs_started("MyModule")

        output = PrometheusExporter.export()

        assert 'digitalkin_jobs_by_module{module="MyModule",status="started"} 1' in output

    def test_export_contains_help_and_type(self) -> None:
        """Test export contains HELP and TYPE comments."""
        output = PrometheusExporter.export()

        assert "# HELP digitalkin_jobs_started_total" in output
        assert "# TYPE digitalkin_jobs_started_total counter" in output
```