agent_os_kernel 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_control_plane/__init__.py +662 -0
- agent_control_plane/a2a_adapter.py +543 -0
- agent_control_plane/adapter.py +417 -0
- agent_control_plane/agent_hibernation.py +394 -0
- agent_control_plane/agent_kernel.py +470 -0
- agent_control_plane/compliance.py +720 -0
- agent_control_plane/constraint_graphs.py +478 -0
- agent_control_plane/control_plane.py +854 -0
- agent_control_plane/example_executors.py +195 -0
- agent_control_plane/execution_engine.py +231 -0
- agent_control_plane/flight_recorder.py +846 -0
- agent_control_plane/governance_layer.py +435 -0
- agent_control_plane/hf_utils.py +563 -0
- agent_control_plane/interfaces/__init__.py +55 -0
- agent_control_plane/interfaces/kernel_interface.py +361 -0
- agent_control_plane/interfaces/plugin_interface.py +497 -0
- agent_control_plane/interfaces/protocol_interfaces.py +387 -0
- agent_control_plane/kernel_space.py +1009 -0
- agent_control_plane/langchain_adapter.py +424 -0
- agent_control_plane/lifecycle.py +3113 -0
- agent_control_plane/mcp_adapter.py +653 -0
- agent_control_plane/ml_safety.py +563 -0
- agent_control_plane/multimodal.py +727 -0
- agent_control_plane/mute_agent.py +422 -0
- agent_control_plane/observability.py +787 -0
- agent_control_plane/orchestrator.py +482 -0
- agent_control_plane/plugin_registry.py +750 -0
- agent_control_plane/policy_engine.py +954 -0
- agent_control_plane/process_isolation.py +777 -0
- agent_control_plane/shadow_mode.py +310 -0
- agent_control_plane/signals.py +493 -0
- agent_control_plane/supervisor_agents.py +430 -0
- agent_control_plane/time_travel_debugger.py +557 -0
- agent_control_plane/tool_registry.py +452 -0
- agent_control_plane/vfs.py +697 -0
- agent_kernel/__init__.py +69 -0
- agent_kernel/analyzer.py +435 -0
- agent_kernel/auditor.py +36 -0
- agent_kernel/completeness_auditor.py +237 -0
- agent_kernel/detector.py +203 -0
- agent_kernel/kernel.py +744 -0
- agent_kernel/memory_manager.py +85 -0
- agent_kernel/models.py +374 -0
- agent_kernel/nudge_mechanism.py +263 -0
- agent_kernel/outcome_analyzer.py +338 -0
- agent_kernel/patcher.py +582 -0
- agent_kernel/semantic_analyzer.py +316 -0
- agent_kernel/semantic_purge.py +349 -0
- agent_kernel/simulator.py +449 -0
- agent_kernel/teacher.py +85 -0
- agent_kernel/triage.py +152 -0
- agent_os/__init__.py +409 -0
- agent_os/_adversarial_impl.py +200 -0
- agent_os/_circuit_breaker_impl.py +232 -0
- agent_os/_mcp_metrics.py +193 -0
- agent_os/adversarial.py +20 -0
- agent_os/agents_compat.py +490 -0
- agent_os/audit_logger.py +135 -0
- agent_os/base_agent.py +651 -0
- agent_os/circuit_breaker.py +34 -0
- agent_os/cli/__init__.py +659 -0
- agent_os/cli/cmd_audit.py +128 -0
- agent_os/cli/cmd_init.py +152 -0
- agent_os/cli/cmd_policy.py +41 -0
- agent_os/cli/cmd_policy_gen.py +180 -0
- agent_os/cli/cmd_validate.py +258 -0
- agent_os/cli/mcp_scan.py +265 -0
- agent_os/cli/output.py +192 -0
- agent_os/cli/policy_checker.py +330 -0
- agent_os/compat.py +74 -0
- agent_os/constraint_graph.py +234 -0
- agent_os/content_governance.py +140 -0
- agent_os/context_budget.py +305 -0
- agent_os/credential_redactor.py +224 -0
- agent_os/diff_policy.py +89 -0
- agent_os/egress_policy.py +159 -0
- agent_os/escalation.py +276 -0
- agent_os/event_bus.py +124 -0
- agent_os/exceptions.py +180 -0
- agent_os/execution_context_policy.py +141 -0
- agent_os/github_enterprise.py +96 -0
- agent_os/health.py +20 -0
- agent_os/integrations/__init__.py +279 -0
- agent_os/integrations/a2a_adapter.py +279 -0
- agent_os/integrations/agent_lightning/__init__.py +30 -0
- agent_os/integrations/anthropic_adapter.py +420 -0
- agent_os/integrations/autogen_adapter.py +620 -0
- agent_os/integrations/base.py +1137 -0
- agent_os/integrations/compat.py +229 -0
- agent_os/integrations/config.py +98 -0
- agent_os/integrations/conversation_guardian.py +957 -0
- agent_os/integrations/crewai_adapter.py +467 -0
- agent_os/integrations/drift_detector.py +425 -0
- agent_os/integrations/dry_run.py +124 -0
- agent_os/integrations/escalation.py +582 -0
- agent_os/integrations/gemini_adapter.py +364 -0
- agent_os/integrations/google_adk_adapter.py +633 -0
- agent_os/integrations/guardrails_adapter.py +394 -0
- agent_os/integrations/health.py +197 -0
- agent_os/integrations/langchain_adapter.py +654 -0
- agent_os/integrations/llamafirewall.py +343 -0
- agent_os/integrations/llamaindex_adapter.py +188 -0
- agent_os/integrations/logging.py +191 -0
- agent_os/integrations/maf_adapter.py +631 -0
- agent_os/integrations/mistral_adapter.py +365 -0
- agent_os/integrations/openai_adapter.py +816 -0
- agent_os/integrations/openai_agents_sdk.py +406 -0
- agent_os/integrations/policy_compose.py +171 -0
- agent_os/integrations/profiling.py +144 -0
- agent_os/integrations/pydantic_ai_adapter.py +420 -0
- agent_os/integrations/rate_limiter.py +130 -0
- agent_os/integrations/rbac.py +143 -0
- agent_os/integrations/registry.py +113 -0
- agent_os/integrations/scope_guard.py +303 -0
- agent_os/integrations/semantic_kernel_adapter.py +769 -0
- agent_os/integrations/smolagents_adapter.py +629 -0
- agent_os/integrations/templates.py +178 -0
- agent_os/integrations/token_budget.py +134 -0
- agent_os/integrations/tool_aliases.py +190 -0
- agent_os/integrations/webhooks.py +177 -0
- agent_os/lite.py +208 -0
- agent_os/mcp_gateway.py +385 -0
- agent_os/mcp_message_signer.py +273 -0
- agent_os/mcp_protocols.py +161 -0
- agent_os/mcp_response_scanner.py +232 -0
- agent_os/mcp_security.py +924 -0
- agent_os/mcp_session_auth.py +231 -0
- agent_os/mcp_sliding_rate_limiter.py +184 -0
- agent_os/memory_guard.py +409 -0
- agent_os/metrics.py +134 -0
- agent_os/mute.py +428 -0
- agent_os/mute_agent.py +209 -0
- agent_os/policies/__init__.py +77 -0
- agent_os/policies/async_evaluator.py +275 -0
- agent_os/policies/backends.py +670 -0
- agent_os/policies/bridge.py +169 -0
- agent_os/policies/budget.py +85 -0
- agent_os/policies/cli.py +294 -0
- agent_os/policies/conflict_resolution.py +270 -0
- agent_os/policies/data_classification.py +252 -0
- agent_os/policies/evaluator.py +239 -0
- agent_os/policies/policy_schema.json +228 -0
- agent_os/policies/rate_limiting.py +145 -0
- agent_os/policies/schema.py +115 -0
- agent_os/policies/shared.py +331 -0
- agent_os/prompt_injection.py +694 -0
- agent_os/providers.py +182 -0
- agent_os/py.typed +0 -0
- agent_os/retry.py +81 -0
- agent_os/reversibility.py +251 -0
- agent_os/sandbox.py +432 -0
- agent_os/sandbox_provider.py +140 -0
- agent_os/secure_codegen.py +525 -0
- agent_os/security_skills.py +538 -0
- agent_os/semantic_policy.py +422 -0
- agent_os/server/__init__.py +15 -0
- agent_os/server/__main__.py +25 -0
- agent_os/server/app.py +277 -0
- agent_os/server/models.py +104 -0
- agent_os/shift_left_metrics.py +130 -0
- agent_os/stateless.py +742 -0
- agent_os/supervisor.py +148 -0
- agent_os/task_outcome.py +148 -0
- agent_os/transparency.py +181 -0
- agent_os/trust_root.py +128 -0
- agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
- agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
- agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
- agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
- agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
- agent_os_observability/__init__.py +27 -0
- agent_os_observability/dashboards.py +898 -0
- agent_os_observability/metrics.py +398 -0
- agent_os_observability/server.py +223 -0
- agent_os_observability/tracer.py +232 -0
- agent_primitives/__init__.py +24 -0
- agent_primitives/failures.py +84 -0
- agent_primitives/py.typed +0 -0
- amb_core/__init__.py +177 -0
- amb_core/adapters/__init__.py +57 -0
- amb_core/adapters/aws_sqs_broker.py +376 -0
- amb_core/adapters/azure_servicebus_broker.py +340 -0
- amb_core/adapters/kafka_broker.py +260 -0
- amb_core/adapters/nats_broker.py +285 -0
- amb_core/adapters/rabbitmq_broker.py +235 -0
- amb_core/adapters/redis_broker.py +262 -0
- amb_core/broker.py +145 -0
- amb_core/bus.py +481 -0
- amb_core/cloudevents.py +509 -0
- amb_core/dlq.py +345 -0
- amb_core/hf_utils.py +536 -0
- amb_core/memory_broker.py +410 -0
- amb_core/models.py +141 -0
- amb_core/persistence.py +529 -0
- amb_core/schema.py +294 -0
- amb_core/tracing.py +358 -0
- atr/__init__.py +640 -0
- atr/access.py +348 -0
- atr/composition.py +645 -0
- atr/decorator.py +357 -0
- atr/executor.py +384 -0
- atr/health.py +557 -0
- atr/hf_utils.py +449 -0
- atr/injection.py +422 -0
- atr/metrics.py +440 -0
- atr/policies.py +403 -0
- atr/py.typed +2 -0
- atr/registry.py +452 -0
- atr/schema.py +480 -0
- atr/tools/safe/__init__.py +75 -0
- atr/tools/safe/calculator.py +467 -0
- atr/tools/safe/datetime_tool.py +443 -0
- atr/tools/safe/file_reader.py +402 -0
- atr/tools/safe/http_client.py +316 -0
- atr/tools/safe/json_parser.py +374 -0
- atr/tools/safe/text_tool.py +537 -0
- atr/tools/safe/toolkit.py +175 -0
- caas/__init__.py +162 -0
- caas/api/__init__.py +7 -0
- caas/api/server.py +1328 -0
- caas/caching.py +834 -0
- caas/cli.py +210 -0
- caas/conversation.py +223 -0
- caas/decay.py +72 -0
- caas/detection/__init__.py +9 -0
- caas/detection/detector.py +238 -0
- caas/enrichment.py +130 -0
- caas/gateway/__init__.py +27 -0
- caas/gateway/trust_gateway.py +474 -0
- caas/hf_utils.py +479 -0
- caas/ingestion/__init__.py +23 -0
- caas/ingestion/processors.py +253 -0
- caas/ingestion/structure_parser.py +188 -0
- caas/models.py +356 -0
- caas/pragmatic_truth.py +444 -0
- caas/routing/__init__.py +10 -0
- caas/routing/heuristic_router.py +58 -0
- caas/storage/__init__.py +9 -0
- caas/storage/store.py +389 -0
- caas/triad.py +213 -0
- caas/tuning/__init__.py +9 -0
- caas/tuning/tuner.py +329 -0
- caas/vfs/__init__.py +14 -0
- caas/vfs/filesystem.py +452 -0
- cmvk/__init__.py +218 -0
- cmvk/audit.py +402 -0
- cmvk/benchmarks.py +478 -0
- cmvk/constitutional.py +904 -0
- cmvk/hf_utils.py +301 -0
- cmvk/metrics.py +473 -0
- cmvk/profiles.py +300 -0
- cmvk/py.typed +0 -0
- cmvk/types.py +12 -0
- cmvk/verification.py +956 -0
- emk/__init__.py +89 -0
- emk/causal.py +352 -0
- emk/hf_utils.py +421 -0
- emk/indexer.py +83 -0
- emk/py.typed +0 -0
- emk/schema.py +204 -0
- emk/sleep_cycle.py +347 -0
- emk/store.py +281 -0
- iatp/__init__.py +166 -0
- iatp/attestation.py +461 -0
- iatp/cli.py +317 -0
- iatp/hf_utils.py +472 -0
- iatp/ipc_pipes.py +580 -0
- iatp/main.py +412 -0
- iatp/models/__init__.py +447 -0
- iatp/policy_engine.py +337 -0
- iatp/py.typed +2 -0
- iatp/recovery.py +321 -0
- iatp/security/__init__.py +270 -0
- iatp/sidecar/__init__.py +519 -0
- iatp/telemetry/__init__.py +164 -0
- iatp/tests/__init__.py +1 -0
- iatp/tests/test_attestation.py +370 -0
- iatp/tests/test_cli.py +131 -0
- iatp/tests/test_ed25519_attestation.py +211 -0
- iatp/tests/test_models.py +130 -0
- iatp/tests/test_policy_engine.py +347 -0
- iatp/tests/test_recovery.py +281 -0
- iatp/tests/test_security.py +222 -0
- iatp/tests/test_sidecar.py +167 -0
- iatp/tests/test_telemetry.py +175 -0
- mcp_kernel_server/__init__.py +28 -0
- mcp_kernel_server/cli.py +274 -0
- mcp_kernel_server/resources.py +217 -0
- mcp_kernel_server/server.py +564 -0
- mcp_kernel_server/tools.py +1174 -0
- mute_agent/__init__.py +68 -0
- mute_agent/core/__init__.py +1 -0
- mute_agent/core/execution_agent.py +166 -0
- mute_agent/core/handshake_protocol.py +201 -0
- mute_agent/core/reasoning_agent.py +238 -0
- mute_agent/knowledge_graph/__init__.py +1 -0
- mute_agent/knowledge_graph/graph_elements.py +65 -0
- mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
- mute_agent/knowledge_graph/subgraph.py +224 -0
- mute_agent/listener/__init__.py +43 -0
- mute_agent/listener/adapters/__init__.py +31 -0
- mute_agent/listener/adapters/base_adapter.py +189 -0
- mute_agent/listener/adapters/caas_adapter.py +344 -0
- mute_agent/listener/adapters/control_plane_adapter.py +436 -0
- mute_agent/listener/adapters/iatp_adapter.py +332 -0
- mute_agent/listener/adapters/scak_adapter.py +251 -0
- mute_agent/listener/listener.py +610 -0
- mute_agent/listener/state_observer.py +436 -0
- mute_agent/listener/threshold_config.py +313 -0
- mute_agent/super_system/__init__.py +1 -0
- mute_agent/super_system/router.py +204 -0
- mute_agent/visualization/__init__.py +10 -0
- mute_agent/visualization/graph_debugger.py +502 -0
- nexus/README.md +60 -0
- nexus/__init__.py +51 -0
- nexus/arbiter.py +359 -0
- nexus/client.py +466 -0
- nexus/dmz.py +444 -0
- nexus/escrow.py +430 -0
- nexus/exceptions.py +286 -0
- nexus/pyproject.toml +36 -0
- nexus/registry.py +393 -0
- nexus/reputation.py +425 -0
- nexus/schemas/__init__.py +51 -0
- nexus/schemas/compliance.py +276 -0
- nexus/schemas/escrow.py +251 -0
- nexus/schemas/manifest.py +225 -0
- nexus/schemas/receipt.py +208 -0
- nexus/tests/__init__.py +0 -0
- nexus/tests/conftest.py +146 -0
- nexus/tests/test_arbiter.py +192 -0
- nexus/tests/test_dmz.py +194 -0
- nexus/tests/test_escrow.py +276 -0
- nexus/tests/test_exceptions.py +225 -0
- nexus/tests/test_registry.py +232 -0
- nexus/tests/test_reputation.py +328 -0
- nexus/tests/test_schemas.py +295 -0
|
@@ -0,0 +1,787 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
Observability and Metrics - Real-time Monitoring and Prometheus Integration
|
|
5
|
+
|
|
6
|
+
This module provides production-grade observability features including real-time
|
|
7
|
+
metrics, Prometheus integration, trace visualization, and monitoring dashboards.
|
|
8
|
+
|
|
9
|
+
Research Foundations:
|
|
10
|
+
- Prometheus monitoring best practices
|
|
11
|
+
- OpenTelemetry for distributed tracing
|
|
12
|
+
- "Observability Engineering" (O'Reilly, 2022) - metrics, logs, traces
|
|
13
|
+
- SRE principles from Google SRE Book
|
|
14
|
+
|
|
15
|
+
See docs/RESEARCH_FOUNDATION.md for complete references.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Any, Dict, List, Optional, Callable
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from enum import Enum
|
|
21
|
+
from datetime import datetime, timedelta
|
|
22
|
+
from collections import defaultdict, deque
|
|
23
|
+
import time
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class MetricType(Enum):
    """Kinds of metric this module can record.

    Each member's value is the lowercase type name as a plain string.
    """

    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
    SUMMARY = "summary"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AlertSeverity(Enum):
    """How serious an alert is.

    Each member's value is the lowercase severity name as a plain string.
    """

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class Metric:
    """
    One recorded sample of a named metric.

    Attributes:
        name: Metric name
        metric_type: Which MetricType the sample belongs to
        value: Numeric value of the sample
        labels: Label names mapped to label values (empty dict by default)
        timestamp: Set from datetime.now() at construction unless supplied
        help_text: Human-readable description (empty string by default)
    """

    # Required fields, in positional order.
    name: str
    metric_type: MetricType
    value: float

    # Optional fields; the dict and the timestamp are created per instance.
    labels: Dict[str, str] = field(default_factory=dict)
    timestamp: datetime = field(default_factory=datetime.now)
    help_text: str = ""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class Alert:
    """
    A single alert notification and its lifecycle state.

    Attributes:
        alert_id: Unique identifier
        name: Alert name
        severity: Severity level
        message: Alert message
        labels: Context labels (empty dict by default)
        firing: True while the alert is firing (the default)
        started_at: Set from datetime.now() at construction unless supplied
        resolved_at: When the alert was resolved; None while unresolved
    """

    # Required fields, in positional order.
    alert_id: str
    name: str
    severity: AlertSeverity
    message: str

    # Optional fields; the dict and the start time are created per instance.
    labels: Dict[str, str] = field(default_factory=dict)
    firing: bool = True
    started_at: datetime = field(default_factory=datetime.now)
    resolved_at: Optional[datetime] = None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class Trace:
    """
    A trace covering one request/operation, made up of spans.

    Attributes:
        trace_id: Unique trace identifier
        spans: Spans attached to this trace (empty list by default)
        started_at: Set from datetime.now() at construction unless supplied
        duration_ms: Total duration in milliseconds; None until it is set
        metadata: Free-form extra data about the trace (empty dict by default)
    """

    trace_id: str

    # Every default below is built per instance, never shared between traces.
    spans: List['Span'] = field(default_factory=list)
    started_at: datetime = field(default_factory=datetime.now)
    duration_ms: Optional[float] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class Span:
    """
    One timed operation inside a trace.

    Attributes:
        span_id: Unique span identifier
        parent_span_id: Identifier of the enclosing span, or None
        operation_name: Name of the operation being timed
        started_at: Set from datetime.now() at construction unless supplied
        duration_ms: Span duration in milliseconds; None until it is set
        tags: Key/value tags attached to the span (empty dict by default)
        logs: Log entries attached to the span (empty list by default)
    """

    # Required fields, in positional order.
    span_id: str
    parent_span_id: Optional[str]
    operation_name: str

    # Optional fields; containers and the start time are created per instance.
    started_at: datetime = field(default_factory=datetime.now)
    duration_ms: Optional[float] = None
    tags: Dict[str, Any] = field(default_factory=dict)
    logs: List[Dict[str, Any]] = field(default_factory=list)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class PrometheusExporter:
    """
    Prometheus metrics exporter.

    Keeps the latest sample for every (metric name, label set) pair in memory
    and renders all of them in the Prometheus text exposition format.

    Features:
    - Counter, gauge and (simplified, single-value) histogram metrics
    - Multi-dimensional labels
    - Automatic metric registration on first use
    - Text format export for Prometheus scraping, with label values and help
      text escaped as the format requires

    Usage:
        exporter = PrometheusExporter()

        # Record metrics
        exporter.increment_counter(
            "agent_requests_total",
            labels={"agent_id": "agent1", "status": "success"}
        )

        exporter.set_gauge(
            "agent_active_sessions",
            value=5,
            labels={"agent_id": "agent1"}
        )

        # Export for Prometheus
        metrics_text = exporter.export()
    """

    def __init__(self):
        # metric name -> label key -> latest sample for that label set
        self._metrics: Dict[str, Dict[str, "Metric"]] = defaultdict(dict)
        # metric name -> {"type": MetricType, "help": str}; set on first use
        self._metric_metadata: Dict[str, Dict[str, Any]] = {}

    def increment_counter(
        self,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None,
        help_text: str = ""
    ):
        """
        Increment a counter metric.

        The first call for a label set creates the sample with ``value``;
        later calls add ``value`` to it and refresh its timestamp.

        Args:
            name: Metric name
            value: Amount to increment
            labels: Metric labels
            help_text: Help text for metric
        """
        # Copy so that later mutation of the caller's dict cannot make the
        # stored sample disagree with the key it is filed under.
        labels = dict(labels) if labels else {}
        label_key = self._make_label_key(labels)
        self._register(
            name, MetricType.COUNTER, help_text or f"Counter metric {name}"
        )

        existing = self._metrics[name].get(label_key)
        if existing is not None:
            existing.value += value
            existing.timestamp = datetime.now()
        else:
            self._metrics[name][label_key] = Metric(
                name=name,
                metric_type=MetricType.COUNTER,
                value=value,
                labels=labels,
                help_text=help_text
            )

    def set_gauge(
        self,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
        help_text: str = ""
    ):
        """
        Set a gauge metric, replacing any previous sample for the label set.

        Args:
            name: Metric name
            value: Value to set
            labels: Metric labels
            help_text: Help text
        """
        self._replace_sample(
            name,
            MetricType.GAUGE,
            value,
            labels,
            help_text,
            f"Gauge metric {name}",
        )

    def observe_histogram(
        self,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
        help_text: str = ""
    ):
        """
        Observe a histogram metric.

        Simplified: no buckets are kept. Only the most recent observation is
        stored for each label set, exactly like a gauge, but the metric is
        registered with the histogram type.

        Args:
            name: Metric name
            value: Observed value
            labels: Metric labels
            help_text: Help text
        """
        self._replace_sample(
            name,
            MetricType.HISTOGRAM,
            value,
            labels,
            help_text,
            f"Histogram metric {name}",
        )

    def export(self) -> str:
        """
        Export metrics in Prometheus text format.

        For every registered metric this emits a ``# HELP`` line, a ``# TYPE``
        line, one sample line per label set and then a blank line. Backslash,
        double quote and newline in label values, and backslash and newline in
        help text, are escaped so the output stays parseable.

        Returns:
            Prometheus-formatted metrics text (empty string if nothing has
            been recorded)
        """
        lines = []

        for metric_name, metadata in self._metric_metadata.items():
            lines.append(
                f"# HELP {metric_name} {self._escape_help(metadata['help'])}"
            )
            lines.append(f"# TYPE {metric_name} {metadata['type'].value}")

            # .get() keeps export read-only even on the defaultdict.
            for metric in self._metrics.get(metric_name, {}).values():
                if metric.labels:
                    label_str = ",".join(
                        f'{k}="{self._escape_label_value(v)}"'
                        for k, v in metric.labels.items()
                    )
                    lines.append(f"{metric_name}{{{label_str}}} {metric.value}")
                else:
                    lines.append(f"{metric_name} {metric.value}")

            lines.append("")  # Blank line between metrics

        return "\n".join(lines)

    def get_metrics(self) -> Dict[str, List["Metric"]]:
        """Get all stored samples, grouped by metric name."""
        return {
            name: list(metrics.values())
            for name, metrics in self._metrics.items()
        }

    def _register(self, name: str, metric_type: "MetricType", help_text: str) -> None:
        """Record type and help text for ``name`` the first time it is seen."""
        if name not in self._metric_metadata:
            self._metric_metadata[name] = {
                "type": metric_type,
                "help": help_text
            }

    def _replace_sample(
        self,
        name: str,
        metric_type: "MetricType",
        value: float,
        labels: Optional[Dict[str, str]],
        help_text: str,
        default_help: str,
    ) -> None:
        """Register ``name`` if new and overwrite the sample for ``labels``."""
        # Copy for the same reason as in increment_counter.
        labels = dict(labels) if labels else {}
        label_key = self._make_label_key(labels)
        self._register(name, metric_type, help_text or default_help)
        self._metrics[name][label_key] = Metric(
            name=name,
            metric_type=metric_type,
            value=value,
            labels=labels,
            help_text=help_text
        )

    @staticmethod
    def _escape_label_value(value: Any) -> str:
        """Escape backslash, double quote and newline in a label value."""
        # Backslash must go first so the escapes added below are not doubled.
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    @staticmethod
    def _escape_help(text: Any) -> str:
        """Escape backslash and newline in HELP text."""
        return str(text).replace("\\", "\\\\").replace("\n", "\\n")

    def _make_label_key(self, labels: Dict[str, str]) -> str:
        """Create a key from labels: "default" if empty, else sorted k=v pairs."""
        # NOTE(review): values containing "," or "=" can make two different
        # label sets produce the same key; left as-is to keep keys unchanged.
        if not labels:
            return "default"
        return ",".join(f"{k}={v}" for k, v in sorted(labels.items()))
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class AlertManager:
    """
    Alert management system.

    Holds named rules, each a predicate over a metrics dict, and tracks which
    of them are currently firing.

    Features:
    - Rule-based alerting
    - Threshold monitoring through arbitrary condition callables
    - At most one active alert per rule name
    - History of every alert that has started

    Usage:
        alert_mgr = AlertManager()

        # Define alert rule
        alert_mgr.add_rule(
            name="high_error_rate",
            condition=lambda metrics: metrics.get("error_rate", 0) > 0.05,
            severity=AlertSeverity.ERROR,
            message="Error rate exceeds 5%"
        )

        # Check alerts
        alerts = alert_mgr.evaluate(current_metrics)
    """

    def __init__(self):
        # rule name -> {"condition", "severity", "message", "labels"}
        self._rules: Dict[str, Dict[str, Any]] = {}
        # rule name -> alert that is firing right now for that rule
        self._active_alerts: Dict[str, "Alert"] = {}
        # every alert ever started, in creation order
        self._alert_history: List["Alert"] = []

    def add_rule(
        self,
        name: str,
        condition: Callable[[Dict[str, Any]], bool],
        severity: "AlertSeverity",
        message: str,
        labels: Optional[Dict[str, str]] = None
    ):
        """
        Add an alerting rule, replacing any existing rule with the same name.

        Args:
            name: Rule name
            condition: Function that evaluates alert condition
            severity: Alert severity
            message: Alert message
            labels: Additional labels
        """
        self._rules[name] = {
            "condition": condition,
            "severity": severity,
            "message": message,
            # Copied so later changes to the caller's dict don't alter the rule.
            "labels": dict(labels) if labels else {}
        }

    def evaluate(
        self,
        metrics: Dict[str, Any]
    ) -> List["Alert"]:
        """
        Evaluate alert rules against current metrics.

        A rule whose condition is true starts a new alert unless one is
        already active for it. A rule whose condition is false resolves its
        active alert, if any. A rule whose condition raises is logged and
        skipped, leaving its alert state as it was.

        Args:
            metrics: Current metrics to evaluate

        Returns:
            List of firing alerts
        """
        # Imported here so the module's top-level import block is untouched.
        import logging

        logger = logging.getLogger(__name__)

        for rule_name, rule in self._rules.items():
            # Only the user-supplied condition is guarded; one broken rule
            # must not stop the remaining rules from being evaluated.
            try:
                should_fire = bool(rule["condition"](metrics))
            except Exception:
                logger.exception(
                    "Alert rule %r failed to evaluate; its state is unchanged",
                    rule_name,
                )
                continue

            if should_fire:
                if rule_name not in self._active_alerts:
                    # New alert
                    alert = Alert(
                        alert_id=f"{rule_name}-{int(time.time())}",
                        name=rule_name,
                        severity=rule["severity"],
                        message=rule["message"],
                        # Each alert gets its own copy of the rule's labels.
                        labels=dict(rule["labels"])
                    )
                    self._active_alerts[rule_name] = alert
                    self._alert_history.append(alert)
            elif rule_name in self._active_alerts:
                # Alert should resolve
                alert = self._active_alerts.pop(rule_name)
                alert.firing = False
                alert.resolved_at = datetime.now()

        return list(self._active_alerts.values())

    def get_active_alerts(self) -> List["Alert"]:
        """Get currently firing alerts"""
        return list(self._active_alerts.values())

    def get_alert_history(
        self,
        hours: int = 24
    ) -> List["Alert"]:
        """Get alerts that started within the last ``hours`` hours."""
        cutoff = datetime.now() - timedelta(hours=hours)
        return [
            alert for alert in self._alert_history
            if alert.started_at > cutoff
        ]
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class TraceCollector:
|
|
432
|
+
"""
|
|
433
|
+
Distributed tracing collector.
|
|
434
|
+
|
|
435
|
+
Features:
|
|
436
|
+
- Trace and span collection
|
|
437
|
+
- Parent-child span relationships
|
|
438
|
+
- Trace visualization data
|
|
439
|
+
- Performance analysis
|
|
440
|
+
|
|
441
|
+
Usage:
|
|
442
|
+
collector = TraceCollector()
|
|
443
|
+
|
|
444
|
+
# Start trace
|
|
445
|
+
trace_id = collector.start_trace("agent_request")
|
|
446
|
+
|
|
447
|
+
# Add spans
|
|
448
|
+
span_id = collector.start_span(
|
|
449
|
+
trace_id=trace_id,
|
|
450
|
+
operation_name="policy_check"
|
|
451
|
+
)
|
|
452
|
+
|
|
453
|
+
# End span
|
|
454
|
+
collector.end_span(trace_id, span_id)
|
|
455
|
+
|
|
456
|
+
# Get trace
|
|
457
|
+
trace = collector.get_trace(trace_id)
|
|
458
|
+
"""
|
|
459
|
+
|
|
460
|
+
def __init__(self):
    """Set up empty storage for traces and for spans that are still open."""
    # Open spans grouped per trace: trace_id -> span_id -> Span. Being a
    # defaultdict, a trace's inner dict is created on first indexed access.
    self._active_spans: Dict[str, Dict[str, Span]] = defaultdict(dict)
    # Every trace started so far, keyed by trace_id.
    self._traces: Dict[str, Trace] = {}
|
|
463
|
+
|
|
464
|
+
def start_trace(
    self,
    operation_name: str,
    metadata: Optional[Dict[str, Any]] = None
) -> str:
    """
    Begin a new trace and open its first span.

    Args:
        operation_name: Name given to the span opened for the trace
        metadata: Extra data stored on the trace (empty dict if omitted)

    Returns:
        The generated trace_id (a random UUID4 string)
    """
    import uuid

    new_trace_id = str(uuid.uuid4())
    self._traces[new_trace_id] = Trace(
        trace_id=new_trace_id,
        metadata=metadata or {}
    )

    # Open the root span (no parent). Its id is not returned to the caller.
    self.start_span(trace_id=new_trace_id, operation_name=operation_name)

    return new_trace_id
|
|
496
|
+
|
|
497
|
+
def start_span(
    self,
    trace_id: str,
    operation_name: str,
    parent_span_id: Optional[str] = None,
    tags: Optional[Dict[str, Any]] = None
) -> str:
    """
    Open a new span and file it under the given trace's active spans.

    Args:
        trace_id: Trace the span belongs to
        operation_name: Name of the operation being timed
        parent_span_id: Enclosing span's id when nested, else None
        tags: Initial span tags (empty dict if omitted)

    Returns:
        The generated span_id (a random UUID4 string)
    """
    import uuid

    new_span_id = str(uuid.uuid4())
    opened = Span(
        span_id=new_span_id,
        parent_span_id=parent_span_id,
        operation_name=operation_name,
        tags=tags or {}
    )

    # The span stays here until it is ended.
    self._active_spans[trace_id][new_span_id] = opened
    return new_span_id
|
|
529
|
+
|
|
530
|
+
def end_span(
    self,
    trace_id: str,
    span_id: str,
    tags: Optional[Dict[str, Any]] = None
):
    """
    Close an active span and attach it to its trace.

    Does nothing when the trace has no active spans recorded or the span is
    not among them.

    Args:
        trace_id: Trace the span belongs to
        span_id: Span to close
        tags: Extra tags merged into the span's existing tags
    """
    # .get() never creates an entry, so unknown ids leave no trace behind.
    open_spans = self._active_spans.get(trace_id)
    if open_spans is None or span_id not in open_spans:
        return

    finished = open_spans[span_id]
    elapsed = datetime.now() - finished.started_at
    finished.duration_ms = elapsed.total_seconds() * 1000

    if tags:
        finished.tags.update(tags)

    # Hand the span over to its trace, if that trace is known.
    owner = self._traces.get(trace_id)
    if owner is not None:
        owner.spans.append(finished)

    # The span is no longer active.
    del open_spans[span_id]
|
|
563
|
+
|
|
564
|
+
def end_trace(self, trace_id: str):
    """
    End a trace.

    Records the trace's total duration, closes every span of the trace that
    is still active, and removes the trace's entry from the active-span
    table. Does nothing for an unknown trace_id.

    Args:
        trace_id: Trace to end
    """
    trace = self._traces.get(trace_id)
    if trace is None:
        return

    elapsed = datetime.now() - trace.started_at
    trace.duration_ms = elapsed.total_seconds() * 1000

    # Snapshot the ids first: end_span removes entries while we iterate.
    for span_id in list(self._active_spans.get(trace_id, ())):
        self.end_span(trace_id, span_id)

    # Remove the now-empty per-trace dict; otherwise one empty dict would be
    # left behind in _active_spans for every trace that has ended.
    self._active_spans.pop(trace_id, None)
|
|
577
|
+
|
|
578
|
+
def get_trace(self, trace_id: str) -> Optional[Trace]:
    """Return the stored trace for ``trace_id``, or None if there is none."""
    try:
        return self._traces[trace_id]
    except KeyError:
        return None
|
|
581
|
+
|
|
582
|
+
def list_traces(
    self,
    limit: int = 100
) -> List[Trace]:
    """
    List the most recently started traces.

    Args:
        limit: Maximum number of traces to return. A negative value
            yields an empty list.

    Returns:
        Traces ordered by ``started_at``, newest first
    """
    # A negative limit would otherwise be interpreted as a slice offset
    # from the end (dropping the oldest traces) instead of as a cap.
    # None is left alone so that it keeps its slice meaning of "no cap".
    if limit is not None and limit < 0:
        return []

    newest_first = sorted(
        self._traces.values(),
        key=lambda t: t.started_at,
        reverse=True
    )
    return newest_first[:limit]
|
|
593
|
+
|
|
594
|
+
def get_trace_visualization(
    self,
    trace_id: str
) -> Dict[str, Any]:
    """
    Summarize one trace as a plain dictionary for visualization.

    Args:
        trace_id: ID of the trace to render

    Returns:
        Dictionary holding the trace's ID, duration, ISO-formatted start
        time, span count, hierarchical span tree, and metadata; an empty
        dictionary when no trace is found for the ID
    """
    found = self.get_trace(trace_id)
    if not found:
        return {}

    # Nest the recorded spans under their parents.
    tree = self._build_span_tree(found.spans)

    return dict(
        trace_id=trace_id,
        duration_ms=found.duration_ms,
        started_at=found.started_at.isoformat(),
        span_count=len(found.spans),
        span_tree=tree,
        metadata=found.metadata,
    )
|
|
622
|
+
|
|
623
|
+
def _build_span_tree(
    self,
    spans: List[Span]
) -> List[Dict[str, Any]]:
    """
    Build a hierarchical span tree.

    Each span is nested under the span named by its ``parent_span_id``.
    A span becomes a top-level node when its ``parent_span_id`` is None
    or matches no span in ``spans``, so a span whose parent is missing
    from the list still appears in the output instead of being dropped.

    Args:
        spans: Spans to arrange into a tree

    Returns:
        List of root nodes; each node is a dictionary with ``span_id``,
        ``operation_name``, ``duration_ms``, ``tags`` and ``children``
    """
    known_ids = {span.span_id for span in spans}

    # Split spans into roots and children grouped by parent ID.
    by_parent = defaultdict(list)
    roots = []
    for span in spans:
        parent_id = span.parent_span_id
        if parent_id is None or parent_id not in known_ids:
            roots.append(span)
        else:
            by_parent[parent_id].append(span)

    def build_node(span: Span) -> Dict[str, Any]:
        children = by_parent.get(span.span_id, [])
        return {
            "span_id": span.span_id,
            "operation_name": span.operation_name,
            "duration_ms": span.duration_ms,
            "tags": span.tags,
            "children": [build_node(child) for child in children]
        }

    return [build_node(span) for span in roots]
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
class ObservabilityDashboard:
    """
    Aggregated view over a metrics exporter, an alert manager, and a
    trace collector.

    Provides:
    - A combined snapshot of metrics, active alerts, and recent traces
    - An overall health status derived from the active alerts

    Usage:
        dashboard = ObservabilityDashboard(
            prometheus=prometheus_exporter,
            alerts=alert_manager,
            traces=trace_collector
        )

        # Get dashboard data
        data = dashboard.get_dashboard_data()
    """

    def __init__(
        self,
        prometheus: PrometheusExporter,
        alerts: AlertManager,
        traces: TraceCollector
    ):
        """Store the three components the dashboard reads from."""
        self.prometheus = prometheus
        self.alerts = alerts
        self.traces = traces

    def get_dashboard_data(self) -> Dict[str, Any]:
        """
        Collect a snapshot of metrics, active alerts, and recent traces.

        Returns:
            Dictionary with ``timestamp``, ``metrics``, ``alerts`` and
            ``traces`` sections; at most 10 traces are requested
        """
        metrics_by_name = self.prometheus.get_metrics()
        firing = self.alerts.get_active_alerts()
        latest = self.traces.list_traces(limit=10)

        generated_at = datetime.now().isoformat()

        # One list of sample dictionaries per metric name.
        metrics_section: Dict[str, Any] = {}
        for name, samples in metrics_by_name.items():
            metrics_section[name] = [
                {
                    "value": sample.value,
                    "labels": sample.labels,
                    "timestamp": sample.timestamp.isoformat()
                }
                for sample in samples
            ]

        alerts_section = {"active_count": len(firing)}
        alerts_section["alerts"] = [
            {
                "name": item.name,
                "severity": item.severity.value,
                "message": item.message,
                "started_at": item.started_at.isoformat()
            }
            for item in firing
        ]

        traces_section = {"recent_count": len(latest)}
        traces_section["traces"] = [
            {
                "trace_id": entry.trace_id,
                "duration_ms": entry.duration_ms,
                "span_count": len(entry.spans),
                "started_at": entry.started_at.isoformat()
            }
            for entry in latest
        ]

        return {
            "timestamp": generated_at,
            "metrics": metrics_section,
            "alerts": alerts_section,
            "traces": traces_section
        }

    def get_health_status(self) -> Dict[str, Any]:
        """
        Derive an overall health status from the active alerts.

        The status is "critical" if any active alert is CRITICAL,
        otherwise "degraded" if any is ERROR, otherwise "warning" if any
        alert is active at all, otherwise "healthy".

        Returns:
            Dictionary with the status, alert counts, and check time
        """
        firing = self.alerts.get_active_alerts()

        critical_count = len(
            [a for a in firing if a.severity == AlertSeverity.CRITICAL]
        )
        error_count = len(
            [a for a in firing if a.severity == AlertSeverity.ERROR]
        )

        # Start from the mildest state; later checks override earlier
        # ones, so the most severe condition wins.
        status = "healthy"
        if len(firing) > 0:
            status = "warning"
        if error_count > 0:
            status = "degraded"
        if critical_count > 0:
            status = "critical"

        return {
            "status": status,
            "active_alerts": len(firing),
            "critical_alerts": critical_count,
            "error_alerts": error_count,
            "checked_at": datetime.now().isoformat()
        }
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def create_observability_suite() -> Dict[str, Any]:
    """
    Build one of each observability component and wire them together.

    Returns:
        Dictionary with the keys ``prometheus``, ``alerts``, ``traces``
        and ``dashboard``; the dashboard is constructed over the other
        three components
    """
    suite: Dict[str, Any] = {
        "prometheus": PrometheusExporter(),
        "alerts": AlertManager(),
        "traces": TraceCollector(),
    }
    # The dashboard shares the same component instances returned above.
    suite["dashboard"] = ObservabilityDashboard(
        suite["prometheus"],
        suite["alerts"],
        suite["traces"],
    )
    return suite
|