kailash 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +3 -3
- kailash/api/custom_nodes_secure.py +3 -3
- kailash/api/gateway.py +1 -1
- kailash/api/studio.py +2 -3
- kailash/api/workflow_api.py +3 -4
- kailash/core/resilience/bulkhead.py +460 -0
- kailash/core/resilience/circuit_breaker.py +92 -10
- kailash/edge/discovery.py +86 -0
- kailash/mcp_server/__init__.py +309 -33
- kailash/mcp_server/advanced_features.py +1022 -0
- kailash/mcp_server/ai_registry_server.py +27 -2
- kailash/mcp_server/auth.py +789 -0
- kailash/mcp_server/client.py +645 -378
- kailash/mcp_server/discovery.py +1593 -0
- kailash/mcp_server/errors.py +673 -0
- kailash/mcp_server/oauth.py +1727 -0
- kailash/mcp_server/protocol.py +1126 -0
- kailash/mcp_server/registry_integration.py +587 -0
- kailash/mcp_server/server.py +1213 -98
- kailash/mcp_server/transports.py +1169 -0
- kailash/mcp_server/utils/__init__.py +6 -1
- kailash/mcp_server/utils/cache.py +250 -7
- kailash/middleware/auth/auth_manager.py +3 -3
- kailash/middleware/communication/api_gateway.py +2 -9
- kailash/middleware/communication/realtime.py +1 -1
- kailash/middleware/mcp/enhanced_server.py +1 -1
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/admin/audit_log.py +6 -6
- kailash/nodes/admin/permission_check.py +8 -8
- kailash/nodes/admin/role_management.py +32 -28
- kailash/nodes/admin/schema.sql +6 -1
- kailash/nodes/admin/schema_manager.py +13 -13
- kailash/nodes/admin/security_event.py +16 -20
- kailash/nodes/admin/tenant_isolation.py +3 -3
- kailash/nodes/admin/transaction_utils.py +3 -3
- kailash/nodes/admin/user_management.py +21 -22
- kailash/nodes/ai/a2a.py +11 -11
- kailash/nodes/ai/ai_providers.py +9 -12
- kailash/nodes/ai/embedding_generator.py +13 -14
- kailash/nodes/ai/intelligent_agent_orchestrator.py +19 -19
- kailash/nodes/ai/iterative_llm_agent.py +2 -2
- kailash/nodes/ai/llm_agent.py +210 -33
- kailash/nodes/ai/self_organizing.py +2 -2
- kailash/nodes/alerts/discord.py +4 -4
- kailash/nodes/api/graphql.py +6 -6
- kailash/nodes/api/http.py +12 -17
- kailash/nodes/api/rate_limiting.py +4 -4
- kailash/nodes/api/rest.py +15 -15
- kailash/nodes/auth/mfa.py +3 -4
- kailash/nodes/auth/risk_assessment.py +2 -2
- kailash/nodes/auth/session_management.py +5 -5
- kailash/nodes/auth/sso.py +143 -0
- kailash/nodes/base.py +6 -2
- kailash/nodes/base_async.py +16 -2
- kailash/nodes/base_with_acl.py +2 -2
- kailash/nodes/cache/__init__.py +9 -0
- kailash/nodes/cache/cache.py +1172 -0
- kailash/nodes/cache/cache_invalidation.py +870 -0
- kailash/nodes/cache/redis_pool_manager.py +595 -0
- kailash/nodes/code/async_python.py +2 -1
- kailash/nodes/code/python.py +196 -35
- kailash/nodes/compliance/data_retention.py +6 -6
- kailash/nodes/compliance/gdpr.py +5 -5
- kailash/nodes/data/__init__.py +10 -0
- kailash/nodes/data/optimistic_locking.py +906 -0
- kailash/nodes/data/readers.py +8 -8
- kailash/nodes/data/redis.py +349 -0
- kailash/nodes/data/sql.py +314 -3
- kailash/nodes/data/streaming.py +21 -0
- kailash/nodes/enterprise/__init__.py +8 -0
- kailash/nodes/enterprise/audit_logger.py +285 -0
- kailash/nodes/enterprise/batch_processor.py +22 -3
- kailash/nodes/enterprise/data_lineage.py +1 -1
- kailash/nodes/enterprise/mcp_executor.py +205 -0
- kailash/nodes/enterprise/service_discovery.py +150 -0
- kailash/nodes/enterprise/tenant_assignment.py +108 -0
- kailash/nodes/logic/async_operations.py +2 -2
- kailash/nodes/logic/convergence.py +1 -1
- kailash/nodes/logic/operations.py +1 -1
- kailash/nodes/monitoring/__init__.py +11 -1
- kailash/nodes/monitoring/health_check.py +456 -0
- kailash/nodes/monitoring/log_processor.py +817 -0
- kailash/nodes/monitoring/metrics_collector.py +627 -0
- kailash/nodes/monitoring/performance_benchmark.py +137 -11
- kailash/nodes/rag/advanced.py +7 -7
- kailash/nodes/rag/agentic.py +49 -2
- kailash/nodes/rag/conversational.py +3 -3
- kailash/nodes/rag/evaluation.py +3 -3
- kailash/nodes/rag/federated.py +3 -3
- kailash/nodes/rag/graph.py +3 -3
- kailash/nodes/rag/multimodal.py +3 -3
- kailash/nodes/rag/optimized.py +5 -5
- kailash/nodes/rag/privacy.py +3 -3
- kailash/nodes/rag/query_processing.py +6 -6
- kailash/nodes/rag/realtime.py +1 -1
- kailash/nodes/rag/registry.py +2 -6
- kailash/nodes/rag/router.py +1 -1
- kailash/nodes/rag/similarity.py +7 -7
- kailash/nodes/rag/strategies.py +4 -4
- kailash/nodes/security/abac_evaluator.py +6 -6
- kailash/nodes/security/behavior_analysis.py +5 -6
- kailash/nodes/security/credential_manager.py +1 -1
- kailash/nodes/security/rotating_credentials.py +11 -11
- kailash/nodes/security/threat_detection.py +8 -8
- kailash/nodes/testing/credential_testing.py +2 -2
- kailash/nodes/transform/processors.py +5 -5
- kailash/runtime/local.py +162 -14
- kailash/runtime/parameter_injection.py +425 -0
- kailash/runtime/parameter_injector.py +657 -0
- kailash/runtime/testing.py +2 -2
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +99 -18
- kailash/workflow/builder_improvements.py +207 -0
- kailash/workflow/input_handling.py +170 -0
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/METADATA +22 -9
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/RECORD +120 -94
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/WHEEL +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,627 @@
"""Metrics collector node for system and application monitoring.

This module provides comprehensive metrics collection capabilities including
system metrics (CPU, memory, disk), application metrics, and custom metrics
with support for various output formats.
"""

import json
import logging
import os
import time
from datetime import UTC, datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Union

import psutil

from kailash.nodes.base import NodeParameter, register_node
from kailash.nodes.base_async import AsyncNode
from kailash.sdk_exceptions import NodeExecutionError

logger = logging.getLogger(__name__)


class MetricFormat(Enum):
    """Supported metric output formats."""

    JSON = "json"
    PROMETHEUS = "prometheus"
    OPENTELEMETRY = "opentelemetry"
    STATSD = "statsd"


class MetricType(Enum):
    """Types of metrics that can be collected."""

    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
    SUMMARY = "summary"


@register_node()
class MetricsCollectorNode(AsyncNode):
    """Node for collecting system and application metrics.

    This node provides comprehensive metrics collection including:
    - System metrics: CPU, memory, disk, network usage
    - Process metrics: Resource usage for specific processes
    - Application metrics: Custom metrics from applications
    - Metric aggregation and buffering
    - Multiple output formats (JSON, Prometheus, OpenTelemetry)
    - Configurable collection intervals and filtering

    Design Purpose:
    - Provide unified metrics collection for monitoring
    - Support various monitoring backends
    - Enable performance tracking and alerting
    - Facilitate observability and debugging

    Examples:
        >>> # Collect system metrics
        >>> collector = MetricsCollectorNode()
        >>> result = await collector.execute(
        ...     metric_types=["system.cpu", "system.memory"],
        ...     format="prometheus"
        ... )

        >>> # Collect custom application metrics
        >>> result = await collector.execute(
        ...     custom_metrics=[
        ...         {"name": "requests_total", "type": "counter", "value": 1000},
        ...         {"name": "response_time", "type": "histogram", "value": 0.125}
        ...     ],
        ...     format="json"
        ... )
    """

    def __init__(self, **kwargs):
        """Initialize the metrics collector node."""
        super().__init__(**kwargs)
        self.metric_buffer = []
        self.last_collection_time = None
        self.logger.info(f"Initialized MetricsCollectorNode: {self.id}")

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define the parameters this node accepts."""
        return {
            "metric_types": NodeParameter(
                name="metric_types",
                type=list,
                required=False,
                default=["system.cpu", "system.memory"],
                description="List of metric types to collect",
            ),
            "custom_metrics": NodeParameter(
                name="custom_metrics",
                type=list,
                required=False,
                default=[],
                description="Custom metrics to include",
            ),
            "format": NodeParameter(
                name="format",
                type=str,
                required=False,
                default="json",
                description="Output format (json, prometheus, opentelemetry, statsd)",
            ),
            "labels": NodeParameter(
                name="labels",
                type=dict,
                required=False,
                default={},
                description="Labels to add to all metrics",
            ),
            "include_process": NodeParameter(
                name="include_process",
                type=bool,
                required=False,
                default=True,
                description="Include current process metrics",
            ),
            "process_ids": NodeParameter(
                name="process_ids",
                type=list,
                required=False,
                default=[],
                description="Additional process IDs to monitor",
            ),
            "aggregate": NodeParameter(
                name="aggregate",
                type=bool,
                required=False,
                default=False,
                description="Aggregate metrics over time",
            ),
            "interval": NodeParameter(
                name="interval",
                type=float,
                required=False,
                default=60.0,
                description="Collection interval in seconds (for aggregation)",
            ),
        }

    def get_output_schema(self) -> Dict[str, NodeParameter]:
        """Define the output schema for this node."""
        return {
            "metrics": NodeParameter(
                name="metrics",
                type=Any,  # Can be list or string depending on format
                description="Collected metrics in specified format",
            ),
            "metric_count": NodeParameter(
                name="metric_count",
                type=int,
                description="Number of metrics collected",
            ),
            "collection_time": NodeParameter(
                name="collection_time",
                type=float,
                description="Time taken to collect metrics",
            ),
            "timestamp": NodeParameter(
                name="timestamp",
                type=str,
                description="ISO timestamp of collection",
            ),
            "format": NodeParameter(
                name="format",
                type=str,
                description="Format of the metrics output",
            ),
        }

    async def async_run(self, **kwargs) -> Dict[str, Any]:
        """Collect metrics based on configuration."""
        metric_types = kwargs.get("metric_types", ["system.cpu", "system.memory"])
        custom_metrics = kwargs.get("custom_metrics", [])
        output_format = MetricFormat(kwargs.get("format", "json"))
        labels = kwargs.get("labels", {})
        include_process = kwargs.get("include_process", True)
        process_ids = kwargs.get("process_ids", [])
        aggregate = kwargs.get("aggregate", False)
        interval = kwargs.get("interval", 60.0)

        start_time = time.time()
        collected_metrics = []

        try:
            # Collect system metrics
            if any(mt.startswith("system.") for mt in metric_types):
                system_metrics = await self._collect_system_metrics(metric_types)
                collected_metrics.extend(system_metrics)

            # Collect process metrics
            if include_process or process_ids:
                process_metrics = await self._collect_process_metrics(
                    include_current=include_process, process_ids=process_ids
                )
                collected_metrics.extend(process_metrics)

            # Add custom metrics
            if custom_metrics:
                validated_custom = self._validate_custom_metrics(custom_metrics)
                collected_metrics.extend(validated_custom)

            # Add labels to all metrics
            if labels:
                for metric in collected_metrics:
                    metric["labels"] = {**labels, **metric.get("labels", {})}

            # Handle aggregation if requested
            if aggregate:
                collected_metrics = self._aggregate_metrics(collected_metrics, interval)

            # Format output
            formatted_output = self._format_metrics(collected_metrics, output_format)

            collection_time = time.time() - start_time

            return {
                "metrics": formatted_output,
                "metric_count": len(collected_metrics),
                "collection_time": collection_time,
                "timestamp": datetime.now(UTC).isoformat(),
                "format": output_format.value,
            }

        except Exception as e:
            self.logger.error(f"Metrics collection failed: {str(e)}")
            raise NodeExecutionError(f"Failed to collect metrics: {str(e)}")

    async def _collect_system_metrics(
        self, metric_types: List[str]
    ) -> List[Dict[str, Any]]:
        """Collect system-level metrics."""
        metrics = []
        timestamp = time.time()

        # CPU metrics
        if "system.cpu" in metric_types or "system.cpu.percent" in metric_types:
            cpu_percent = psutil.cpu_percent(interval=0.1, percpu=True)
            metrics.append(
                {
                    "name": "system_cpu_usage_percent",
                    "type": MetricType.GAUGE.value,
                    "value": sum(cpu_percent) / len(cpu_percent),
                    "timestamp": timestamp,
                    "labels": {"total_cores": str(len(cpu_percent))},
                }
            )

            # Per-core metrics
            for i, percent in enumerate(cpu_percent):
                metrics.append(
                    {
                        "name": "system_cpu_core_usage_percent",
                        "type": MetricType.GAUGE.value,
                        "value": percent,
                        "timestamp": timestamp,
                        "labels": {"core": str(i)},
                    }
                )

        # Memory metrics
        if "system.memory" in metric_types:
            memory = psutil.virtual_memory()
            metrics.extend(
                [
                    {
                        "name": "system_memory_total_bytes",
                        "type": MetricType.GAUGE.value,
                        "value": memory.total,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_memory_used_bytes",
                        "type": MetricType.GAUGE.value,
                        "value": memory.used,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_memory_available_bytes",
                        "type": MetricType.GAUGE.value,
                        "value": memory.available,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_memory_usage_percent",
                        "type": MetricType.GAUGE.value,
                        "value": memory.percent,
                        "timestamp": timestamp,
                    },
                ]
            )

        # Disk metrics
        if "system.disk" in metric_types:
            for partition in psutil.disk_partitions():
                try:
                    usage = psutil.disk_usage(partition.mountpoint)
                    metrics.extend(
                        [
                            {
                                "name": "system_disk_total_bytes",
                                "type": MetricType.GAUGE.value,
                                "value": usage.total,
                                "timestamp": timestamp,
                                "labels": {
                                    "device": partition.device,
                                    "mountpoint": partition.mountpoint,
                                },
                            },
                            {
                                "name": "system_disk_used_bytes",
                                "type": MetricType.GAUGE.value,
                                "value": usage.used,
                                "timestamp": timestamp,
                                "labels": {
                                    "device": partition.device,
                                    "mountpoint": partition.mountpoint,
                                },
                            },
                            {
                                "name": "system_disk_usage_percent",
                                "type": MetricType.GAUGE.value,
                                "value": usage.percent,
                                "timestamp": timestamp,
                                "labels": {
                                    "device": partition.device,
                                    "mountpoint": partition.mountpoint,
                                },
                            },
                        ]
                    )
                except PermissionError:
                    continue

        # Network metrics
        if "system.network" in metric_types:
            net_io = psutil.net_io_counters()
            metrics.extend(
                [
                    {
                        "name": "system_network_bytes_sent",
                        "type": MetricType.COUNTER.value,
                        "value": net_io.bytes_sent,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_network_bytes_recv",
                        "type": MetricType.COUNTER.value,
                        "value": net_io.bytes_recv,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_network_packets_sent",
                        "type": MetricType.COUNTER.value,
                        "value": net_io.packets_sent,
                        "timestamp": timestamp,
                    },
                    {
                        "name": "system_network_packets_recv",
                        "type": MetricType.COUNTER.value,
                        "value": net_io.packets_recv,
                        "timestamp": timestamp,
                    },
                ]
            )

        return metrics

    async def _collect_process_metrics(
        self, include_current: bool = True, process_ids: List[int] = None
    ) -> List[Dict[str, Any]]:
        """Collect process-level metrics."""
        metrics = []
        timestamp = time.time()

        pids_to_monitor = []
        if include_current:
            pids_to_monitor.append(os.getpid())
        if process_ids:
            pids_to_monitor.extend(process_ids)

        for pid in pids_to_monitor:
            try:
                process = psutil.Process(pid)

                # Process CPU usage
                cpu_percent = process.cpu_percent(interval=0.1)
                metrics.append(
                    {
                        "name": "process_cpu_usage_percent",
                        "type": MetricType.GAUGE.value,
                        "value": cpu_percent,
                        "timestamp": timestamp,
                        "labels": {
                            "pid": str(pid),
                            "name": process.name(),
                        },
                    }
                )

                # Process memory usage
                memory_info = process.memory_info()
                metrics.extend(
                    [
                        {
                            "name": "process_memory_rss_bytes",
                            "type": MetricType.GAUGE.value,
                            "value": memory_info.rss,
                            "timestamp": timestamp,
                            "labels": {
                                "pid": str(pid),
                                "name": process.name(),
                            },
                        },
                        {
                            "name": "process_memory_vms_bytes",
                            "type": MetricType.GAUGE.value,
                            "value": memory_info.vms,
                            "timestamp": timestamp,
                            "labels": {
                                "pid": str(pid),
                                "name": process.name(),
                            },
                        },
                    ]
                )

                # Process thread count
                metrics.append(
                    {
                        "name": "process_num_threads",
                        "type": MetricType.GAUGE.value,
                        "value": process.num_threads(),
                        "timestamp": timestamp,
                        "labels": {
                            "pid": str(pid),
                            "name": process.name(),
                        },
                    }
                )

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                self.logger.warning(f"Could not collect metrics for PID {pid}")
                continue

        return metrics

    def _validate_custom_metrics(
        self, custom_metrics: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Validate and normalize custom metrics."""
        validated = []
        timestamp = time.time()

        for metric in custom_metrics:
            # Validate required fields
            if "name" not in metric or "value" not in metric:
                self.logger.warning(f"Skipping invalid metric: {metric}")
                continue

            # Set defaults
            validated_metric = {
                "name": metric["name"],
                "type": metric.get("type", MetricType.GAUGE.value),
                "value": float(metric["value"]),
                "timestamp": metric.get("timestamp", timestamp),
                "labels": metric.get("labels", {}),
            }

            # Validate metric type
            try:
                MetricType(validated_metric["type"])
            except ValueError:
                validated_metric["type"] = MetricType.GAUGE.value

            validated.append(validated_metric)

        return validated

    def _aggregate_metrics(
        self, metrics: List[Dict[str, Any]], interval: float
    ) -> List[Dict[str, Any]]:
        """Aggregate metrics over time."""
        # Store metrics in buffer
        self.metric_buffer.extend(metrics)

        # Remove old metrics outside the interval window
        cutoff_time = time.time() - interval
        self.metric_buffer = [
            m for m in self.metric_buffer if m.get("timestamp", 0) > cutoff_time
        ]

        # Group metrics by name and labels
        aggregated = {}
        for metric in self.metric_buffer:
            key = (metric["name"], tuple(sorted(metric.get("labels", {}).items())))

            if key not in aggregated:
                aggregated[key] = {
                    "name": metric["name"],
                    "type": metric["type"],
                    "labels": metric.get("labels", {}),
                    "values": [],
                }

            aggregated[key]["values"].append(metric["value"])

        # Calculate aggregated values
        result = []
        for key, agg_metric in aggregated.items():
            values = agg_metric["values"]

            if agg_metric["type"] == MetricType.COUNTER.value:
                # For counters, use the latest value
                value = values[-1] if values else 0
            elif agg_metric["type"] == MetricType.GAUGE.value:
                # For gauges, use the average
                value = sum(values) / len(values) if values else 0
            else:
                # For histograms/summaries, return all values
                value = values

            result.append(
                {
                    "name": agg_metric["name"],
                    "type": agg_metric["type"],
                    "value": value,
                    "timestamp": time.time(),
                    "labels": agg_metric["labels"],
                    "sample_count": len(values),
                }
            )

        return result

    def _format_metrics(
        self, metrics: List[Dict[str, Any]], format: MetricFormat
    ) -> Union[List[Dict[str, Any]], str]:
        """Format metrics according to specified format."""
        if format == MetricFormat.JSON:
            return metrics

        elif format == MetricFormat.PROMETHEUS:
            lines = []
            for metric in metrics:
                # Build label string
                label_parts = []
                for k, v in metric.get("labels", {}).items():
                    label_parts.append(f'{k}="{v}"')
                label_str = "{" + ",".join(label_parts) + "}" if label_parts else ""

                # Format metric line
                if metric["type"] == MetricType.COUNTER.value:
                    lines.append(f"# TYPE {metric['name']} counter")
                elif metric["type"] == MetricType.GAUGE.value:
                    lines.append(f"# TYPE {metric['name']} gauge")

                lines.append(f"{metric['name']}{label_str} {metric['value']}")

            return "\n".join(lines)

        elif format == MetricFormat.OPENTELEMETRY:
            # OpenTelemetry JSON format
            otel_metrics = []
            for metric in metrics:
                otel_metric = {
                    "name": metric["name"],
                    "description": f"{metric['name']} metric",
                    "unit": "1",
                    "data": {
                        "data_points": [
                            {
                                "attributes": metric.get("labels", {}),
                                "time_unix_nano": int(metric["timestamp"] * 1e9),
                                "value": metric["value"],
                            }
                        ]
                    },
                }

                if metric["type"] == MetricType.COUNTER.value:
                    otel_metric["data"]["type"] = "Sum"
                    otel_metric["data"]["is_monotonic"] = True
                else:
                    otel_metric["data"]["type"] = "Gauge"

                otel_metrics.append(otel_metric)

            return json.dumps(
                {"resource_metrics": [{"scope_metrics": [{"metrics": otel_metrics}]}]}
            )

        elif format == MetricFormat.STATSD:
            lines = []
            for metric in metrics:
                # StatsD format: metric_name:value|type
                if metric["type"] == MetricType.COUNTER.value:
                    type_char = "c"
                elif metric["type"] == MetricType.GAUGE.value:
                    type_char = "g"
                else:
                    type_char = "ms"  # timing

                # Add tags if present
                tags = []
                for k, v in metric.get("labels", {}).items():
                    tags.append(f"{k}:{v}")
                tag_str = f"|#{','.join(tags)}" if tags else ""

                lines.append(f"{metric['name']}:{metric['value']}|{type_char}{tag_str}")

            return "\n".join(lines)

        else:
            raise ValueError(f"Unsupported format: {format}")

    def run(self, **kwargs) -> Dict[str, Any]:
        """Synchronous wrapper for compatibility."""
        import asyncio

        return asyncio.run(self.async_run(**kwargs))