agent_os_kernel 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_control_plane/__init__.py +662 -0
- agent_control_plane/a2a_adapter.py +543 -0
- agent_control_plane/adapter.py +417 -0
- agent_control_plane/agent_hibernation.py +394 -0
- agent_control_plane/agent_kernel.py +470 -0
- agent_control_plane/compliance.py +720 -0
- agent_control_plane/constraint_graphs.py +478 -0
- agent_control_plane/control_plane.py +854 -0
- agent_control_plane/example_executors.py +195 -0
- agent_control_plane/execution_engine.py +231 -0
- agent_control_plane/flight_recorder.py +846 -0
- agent_control_plane/governance_layer.py +435 -0
- agent_control_plane/hf_utils.py +563 -0
- agent_control_plane/interfaces/__init__.py +55 -0
- agent_control_plane/interfaces/kernel_interface.py +361 -0
- agent_control_plane/interfaces/plugin_interface.py +497 -0
- agent_control_plane/interfaces/protocol_interfaces.py +387 -0
- agent_control_plane/kernel_space.py +1009 -0
- agent_control_plane/langchain_adapter.py +424 -0
- agent_control_plane/lifecycle.py +3113 -0
- agent_control_plane/mcp_adapter.py +653 -0
- agent_control_plane/ml_safety.py +563 -0
- agent_control_plane/multimodal.py +727 -0
- agent_control_plane/mute_agent.py +422 -0
- agent_control_plane/observability.py +787 -0
- agent_control_plane/orchestrator.py +482 -0
- agent_control_plane/plugin_registry.py +750 -0
- agent_control_plane/policy_engine.py +954 -0
- agent_control_plane/process_isolation.py +777 -0
- agent_control_plane/shadow_mode.py +310 -0
- agent_control_plane/signals.py +493 -0
- agent_control_plane/supervisor_agents.py +430 -0
- agent_control_plane/time_travel_debugger.py +557 -0
- agent_control_plane/tool_registry.py +452 -0
- agent_control_plane/vfs.py +697 -0
- agent_kernel/__init__.py +69 -0
- agent_kernel/analyzer.py +435 -0
- agent_kernel/auditor.py +36 -0
- agent_kernel/completeness_auditor.py +237 -0
- agent_kernel/detector.py +203 -0
- agent_kernel/kernel.py +744 -0
- agent_kernel/memory_manager.py +85 -0
- agent_kernel/models.py +374 -0
- agent_kernel/nudge_mechanism.py +263 -0
- agent_kernel/outcome_analyzer.py +338 -0
- agent_kernel/patcher.py +582 -0
- agent_kernel/semantic_analyzer.py +316 -0
- agent_kernel/semantic_purge.py +349 -0
- agent_kernel/simulator.py +449 -0
- agent_kernel/teacher.py +85 -0
- agent_kernel/triage.py +152 -0
- agent_os/__init__.py +409 -0
- agent_os/_adversarial_impl.py +200 -0
- agent_os/_circuit_breaker_impl.py +232 -0
- agent_os/_mcp_metrics.py +193 -0
- agent_os/adversarial.py +20 -0
- agent_os/agents_compat.py +490 -0
- agent_os/audit_logger.py +135 -0
- agent_os/base_agent.py +651 -0
- agent_os/circuit_breaker.py +34 -0
- agent_os/cli/__init__.py +659 -0
- agent_os/cli/cmd_audit.py +128 -0
- agent_os/cli/cmd_init.py +152 -0
- agent_os/cli/cmd_policy.py +41 -0
- agent_os/cli/cmd_policy_gen.py +180 -0
- agent_os/cli/cmd_validate.py +258 -0
- agent_os/cli/mcp_scan.py +265 -0
- agent_os/cli/output.py +192 -0
- agent_os/cli/policy_checker.py +330 -0
- agent_os/compat.py +74 -0
- agent_os/constraint_graph.py +234 -0
- agent_os/content_governance.py +140 -0
- agent_os/context_budget.py +305 -0
- agent_os/credential_redactor.py +224 -0
- agent_os/diff_policy.py +89 -0
- agent_os/egress_policy.py +159 -0
- agent_os/escalation.py +276 -0
- agent_os/event_bus.py +124 -0
- agent_os/exceptions.py +180 -0
- agent_os/execution_context_policy.py +141 -0
- agent_os/github_enterprise.py +96 -0
- agent_os/health.py +20 -0
- agent_os/integrations/__init__.py +279 -0
- agent_os/integrations/a2a_adapter.py +279 -0
- agent_os/integrations/agent_lightning/__init__.py +30 -0
- agent_os/integrations/anthropic_adapter.py +420 -0
- agent_os/integrations/autogen_adapter.py +620 -0
- agent_os/integrations/base.py +1137 -0
- agent_os/integrations/compat.py +229 -0
- agent_os/integrations/config.py +98 -0
- agent_os/integrations/conversation_guardian.py +957 -0
- agent_os/integrations/crewai_adapter.py +467 -0
- agent_os/integrations/drift_detector.py +425 -0
- agent_os/integrations/dry_run.py +124 -0
- agent_os/integrations/escalation.py +582 -0
- agent_os/integrations/gemini_adapter.py +364 -0
- agent_os/integrations/google_adk_adapter.py +633 -0
- agent_os/integrations/guardrails_adapter.py +394 -0
- agent_os/integrations/health.py +197 -0
- agent_os/integrations/langchain_adapter.py +654 -0
- agent_os/integrations/llamafirewall.py +343 -0
- agent_os/integrations/llamaindex_adapter.py +188 -0
- agent_os/integrations/logging.py +191 -0
- agent_os/integrations/maf_adapter.py +631 -0
- agent_os/integrations/mistral_adapter.py +365 -0
- agent_os/integrations/openai_adapter.py +816 -0
- agent_os/integrations/openai_agents_sdk.py +406 -0
- agent_os/integrations/policy_compose.py +171 -0
- agent_os/integrations/profiling.py +144 -0
- agent_os/integrations/pydantic_ai_adapter.py +420 -0
- agent_os/integrations/rate_limiter.py +130 -0
- agent_os/integrations/rbac.py +143 -0
- agent_os/integrations/registry.py +113 -0
- agent_os/integrations/scope_guard.py +303 -0
- agent_os/integrations/semantic_kernel_adapter.py +769 -0
- agent_os/integrations/smolagents_adapter.py +629 -0
- agent_os/integrations/templates.py +178 -0
- agent_os/integrations/token_budget.py +134 -0
- agent_os/integrations/tool_aliases.py +190 -0
- agent_os/integrations/webhooks.py +177 -0
- agent_os/lite.py +208 -0
- agent_os/mcp_gateway.py +385 -0
- agent_os/mcp_message_signer.py +273 -0
- agent_os/mcp_protocols.py +161 -0
- agent_os/mcp_response_scanner.py +232 -0
- agent_os/mcp_security.py +924 -0
- agent_os/mcp_session_auth.py +231 -0
- agent_os/mcp_sliding_rate_limiter.py +184 -0
- agent_os/memory_guard.py +409 -0
- agent_os/metrics.py +134 -0
- agent_os/mute.py +428 -0
- agent_os/mute_agent.py +209 -0
- agent_os/policies/__init__.py +77 -0
- agent_os/policies/async_evaluator.py +275 -0
- agent_os/policies/backends.py +670 -0
- agent_os/policies/bridge.py +169 -0
- agent_os/policies/budget.py +85 -0
- agent_os/policies/cli.py +294 -0
- agent_os/policies/conflict_resolution.py +270 -0
- agent_os/policies/data_classification.py +252 -0
- agent_os/policies/evaluator.py +239 -0
- agent_os/policies/policy_schema.json +228 -0
- agent_os/policies/rate_limiting.py +145 -0
- agent_os/policies/schema.py +115 -0
- agent_os/policies/shared.py +331 -0
- agent_os/prompt_injection.py +694 -0
- agent_os/providers.py +182 -0
- agent_os/py.typed +0 -0
- agent_os/retry.py +81 -0
- agent_os/reversibility.py +251 -0
- agent_os/sandbox.py +432 -0
- agent_os/sandbox_provider.py +140 -0
- agent_os/secure_codegen.py +525 -0
- agent_os/security_skills.py +538 -0
- agent_os/semantic_policy.py +422 -0
- agent_os/server/__init__.py +15 -0
- agent_os/server/__main__.py +25 -0
- agent_os/server/app.py +277 -0
- agent_os/server/models.py +104 -0
- agent_os/shift_left_metrics.py +130 -0
- agent_os/stateless.py +742 -0
- agent_os/supervisor.py +148 -0
- agent_os/task_outcome.py +148 -0
- agent_os/transparency.py +181 -0
- agent_os/trust_root.py +128 -0
- agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
- agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
- agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
- agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
- agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
- agent_os_observability/__init__.py +27 -0
- agent_os_observability/dashboards.py +898 -0
- agent_os_observability/metrics.py +398 -0
- agent_os_observability/server.py +223 -0
- agent_os_observability/tracer.py +232 -0
- agent_primitives/__init__.py +24 -0
- agent_primitives/failures.py +84 -0
- agent_primitives/py.typed +0 -0
- amb_core/__init__.py +177 -0
- amb_core/adapters/__init__.py +57 -0
- amb_core/adapters/aws_sqs_broker.py +376 -0
- amb_core/adapters/azure_servicebus_broker.py +340 -0
- amb_core/adapters/kafka_broker.py +260 -0
- amb_core/adapters/nats_broker.py +285 -0
- amb_core/adapters/rabbitmq_broker.py +235 -0
- amb_core/adapters/redis_broker.py +262 -0
- amb_core/broker.py +145 -0
- amb_core/bus.py +481 -0
- amb_core/cloudevents.py +509 -0
- amb_core/dlq.py +345 -0
- amb_core/hf_utils.py +536 -0
- amb_core/memory_broker.py +410 -0
- amb_core/models.py +141 -0
- amb_core/persistence.py +529 -0
- amb_core/schema.py +294 -0
- amb_core/tracing.py +358 -0
- atr/__init__.py +640 -0
- atr/access.py +348 -0
- atr/composition.py +645 -0
- atr/decorator.py +357 -0
- atr/executor.py +384 -0
- atr/health.py +557 -0
- atr/hf_utils.py +449 -0
- atr/injection.py +422 -0
- atr/metrics.py +440 -0
- atr/policies.py +403 -0
- atr/py.typed +2 -0
- atr/registry.py +452 -0
- atr/schema.py +480 -0
- atr/tools/safe/__init__.py +75 -0
- atr/tools/safe/calculator.py +467 -0
- atr/tools/safe/datetime_tool.py +443 -0
- atr/tools/safe/file_reader.py +402 -0
- atr/tools/safe/http_client.py +316 -0
- atr/tools/safe/json_parser.py +374 -0
- atr/tools/safe/text_tool.py +537 -0
- atr/tools/safe/toolkit.py +175 -0
- caas/__init__.py +162 -0
- caas/api/__init__.py +7 -0
- caas/api/server.py +1328 -0
- caas/caching.py +834 -0
- caas/cli.py +210 -0
- caas/conversation.py +223 -0
- caas/decay.py +72 -0
- caas/detection/__init__.py +9 -0
- caas/detection/detector.py +238 -0
- caas/enrichment.py +130 -0
- caas/gateway/__init__.py +27 -0
- caas/gateway/trust_gateway.py +474 -0
- caas/hf_utils.py +479 -0
- caas/ingestion/__init__.py +23 -0
- caas/ingestion/processors.py +253 -0
- caas/ingestion/structure_parser.py +188 -0
- caas/models.py +356 -0
- caas/pragmatic_truth.py +444 -0
- caas/routing/__init__.py +10 -0
- caas/routing/heuristic_router.py +58 -0
- caas/storage/__init__.py +9 -0
- caas/storage/store.py +389 -0
- caas/triad.py +213 -0
- caas/tuning/__init__.py +9 -0
- caas/tuning/tuner.py +329 -0
- caas/vfs/__init__.py +14 -0
- caas/vfs/filesystem.py +452 -0
- cmvk/__init__.py +218 -0
- cmvk/audit.py +402 -0
- cmvk/benchmarks.py +478 -0
- cmvk/constitutional.py +904 -0
- cmvk/hf_utils.py +301 -0
- cmvk/metrics.py +473 -0
- cmvk/profiles.py +300 -0
- cmvk/py.typed +0 -0
- cmvk/types.py +12 -0
- cmvk/verification.py +956 -0
- emk/__init__.py +89 -0
- emk/causal.py +352 -0
- emk/hf_utils.py +421 -0
- emk/indexer.py +83 -0
- emk/py.typed +0 -0
- emk/schema.py +204 -0
- emk/sleep_cycle.py +347 -0
- emk/store.py +281 -0
- iatp/__init__.py +166 -0
- iatp/attestation.py +461 -0
- iatp/cli.py +317 -0
- iatp/hf_utils.py +472 -0
- iatp/ipc_pipes.py +580 -0
- iatp/main.py +412 -0
- iatp/models/__init__.py +447 -0
- iatp/policy_engine.py +337 -0
- iatp/py.typed +2 -0
- iatp/recovery.py +321 -0
- iatp/security/__init__.py +270 -0
- iatp/sidecar/__init__.py +519 -0
- iatp/telemetry/__init__.py +164 -0
- iatp/tests/__init__.py +1 -0
- iatp/tests/test_attestation.py +370 -0
- iatp/tests/test_cli.py +131 -0
- iatp/tests/test_ed25519_attestation.py +211 -0
- iatp/tests/test_models.py +130 -0
- iatp/tests/test_policy_engine.py +347 -0
- iatp/tests/test_recovery.py +281 -0
- iatp/tests/test_security.py +222 -0
- iatp/tests/test_sidecar.py +167 -0
- iatp/tests/test_telemetry.py +175 -0
- mcp_kernel_server/__init__.py +28 -0
- mcp_kernel_server/cli.py +274 -0
- mcp_kernel_server/resources.py +217 -0
- mcp_kernel_server/server.py +564 -0
- mcp_kernel_server/tools.py +1174 -0
- mute_agent/__init__.py +68 -0
- mute_agent/core/__init__.py +1 -0
- mute_agent/core/execution_agent.py +166 -0
- mute_agent/core/handshake_protocol.py +201 -0
- mute_agent/core/reasoning_agent.py +238 -0
- mute_agent/knowledge_graph/__init__.py +1 -0
- mute_agent/knowledge_graph/graph_elements.py +65 -0
- mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
- mute_agent/knowledge_graph/subgraph.py +224 -0
- mute_agent/listener/__init__.py +43 -0
- mute_agent/listener/adapters/__init__.py +31 -0
- mute_agent/listener/adapters/base_adapter.py +189 -0
- mute_agent/listener/adapters/caas_adapter.py +344 -0
- mute_agent/listener/adapters/control_plane_adapter.py +436 -0
- mute_agent/listener/adapters/iatp_adapter.py +332 -0
- mute_agent/listener/adapters/scak_adapter.py +251 -0
- mute_agent/listener/listener.py +610 -0
- mute_agent/listener/state_observer.py +436 -0
- mute_agent/listener/threshold_config.py +313 -0
- mute_agent/super_system/__init__.py +1 -0
- mute_agent/super_system/router.py +204 -0
- mute_agent/visualization/__init__.py +10 -0
- mute_agent/visualization/graph_debugger.py +502 -0
- nexus/README.md +60 -0
- nexus/__init__.py +51 -0
- nexus/arbiter.py +359 -0
- nexus/client.py +466 -0
- nexus/dmz.py +444 -0
- nexus/escrow.py +430 -0
- nexus/exceptions.py +286 -0
- nexus/pyproject.toml +36 -0
- nexus/registry.py +393 -0
- nexus/reputation.py +425 -0
- nexus/schemas/__init__.py +51 -0
- nexus/schemas/compliance.py +276 -0
- nexus/schemas/escrow.py +251 -0
- nexus/schemas/manifest.py +225 -0
- nexus/schemas/receipt.py +208 -0
- nexus/tests/__init__.py +0 -0
- nexus/tests/conftest.py +146 -0
- nexus/tests/test_arbiter.py +192 -0
- nexus/tests/test_dmz.py +194 -0
- nexus/tests/test_escrow.py +276 -0
- nexus/tests/test_exceptions.py +225 -0
- nexus/tests/test_registry.py +232 -0
- nexus/tests/test_reputation.py +328 -0
- nexus/tests/test_schemas.py +295 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
Prometheus Metrics for Agent OS Kernel.
|
|
5
|
+
|
|
6
|
+
Key metrics for CISOs:
|
|
7
|
+
- Safety violation rate (target: 0%)
|
|
8
|
+
- Policy enforcement latency (<5ms target)
|
|
9
|
+
- Agent uptime
|
|
10
|
+
- MTTR after SIGKILL
|
|
11
|
+
|
|
12
|
+
Key metrics for ML Ops:
|
|
13
|
+
- CMVK consensus rate
|
|
14
|
+
- Model disagreement tracking
|
|
15
|
+
- Verification latency
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from prometheus_client import Counter, Histogram, Gauge, Info, Summary, generate_latest, CONTENT_TYPE_LATEST
|
|
19
|
+
from typing import Optional
|
|
20
|
+
import time
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class KernelMetrics:
    """
    Prometheus metrics for Agent OS kernel operations.

    Every collector is created in ``__init__`` and registered with one
    registry (the process-wide default unless ``registry`` is given).
    ``export()`` serializes that same registry.

    Usage:
        metrics = KernelMetrics()

        # Record policy check
        with metrics.policy_check_latency():
            result = policy_engine.check(action)

        # Record violation (``policy`` is required)
        if not result.allowed:
            metrics.record_violation(agent_id, action, policy="default")

        # Expose metrics
        @app.get("/metrics")
        def metrics_endpoint():
            return Response(metrics.export(), media_type=metrics.content_type())

        # Isolated instance (tests, or several instances in one process)
        from prometheus_client import CollectorRegistry
        isolated = KernelMetrics(registry=CollectorRegistry())
    """

    def __init__(self, namespace: str = "agent_os", registry=None):
        """
        Create and register all kernel collectors.

        Args:
            namespace: Prefix applied to every metric name.
            registry: ``prometheus_client`` registry to register the
                collectors with. ``None`` (default) selects the global
                default registry, which is what the collectors used before
                this parameter existed.

        Raises:
            ValueError: Raised by ``prometheus_client`` when the chosen
                registry already holds metrics with the same names, e.g. a
                second ``KernelMetrics`` with the same namespace and
                registry.
        """
        # Local import: only needed here, keeps the file's import block as is.
        from prometheus_client import REGISTRY

        if registry is None:
            registry = REGISTRY

        self.namespace = namespace
        self.registry = registry

        # =====================================================================
        # Safety Metrics (Most Important for CISOs)
        # =====================================================================

        self.violations_total = Counter(
            f"{namespace}_violations_total",
            "Total policy violations detected",
            ["agent_id", "action", "policy", "severity"],
            registry=registry,
        )

        self.violations_blocked = Counter(
            f"{namespace}_violations_blocked_total",
            "Violations blocked by kernel (SIGKILL issued)",
            ["agent_id", "action"],
            registry=registry,
        )

        self.violation_rate = Gauge(
            f"{namespace}_violation_rate",
            "Current violation rate (violations per 1000 requests)",
            ["window"],
            registry=registry,
        )

        # =====================================================================
        # Performance Metrics
        # =====================================================================

        self.policy_check_duration = Histogram(
            f"{namespace}_policy_check_duration_seconds",
            "Time to check policies",
            ["policy"],
            buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1],
            registry=registry,
        )

        self.execution_duration = Histogram(
            f"{namespace}_execution_duration_seconds",
            "Time to execute governed action",
            ["action", "status"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
            registry=registry,
        )

        self.kernel_latency = Histogram(
            f"{namespace}_kernel_latency_seconds",
            "Total kernel overhead (policy + dispatch)",
            buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1],
            registry=registry,
        )

        # =====================================================================
        # Throughput Metrics
        # =====================================================================

        self.requests_total = Counter(
            f"{namespace}_requests_total",
            "Total requests processed",
            ["action", "status"],
            registry=registry,
        )

        self.active_agents = Gauge(
            f"{namespace}_active_agents",
            "Number of active agents",
            registry=registry,
        )

        # =====================================================================
        # Signal Metrics
        # =====================================================================

        self.signals_sent = Counter(
            f"{namespace}_signals_total",
            "Signals sent to agents",
            ["signal", "reason"],
            registry=registry,
        )

        self.sigkill_count = Counter(
            f"{namespace}_sigkill_total",
            "SIGKILL signals issued",
            ["agent_id", "reason"],
            registry=registry,
        )

        # =====================================================================
        # Recovery Metrics
        # =====================================================================

        self.mttr_seconds = Histogram(
            f"{namespace}_mttr_seconds",
            "Mean Time To Recovery after SIGKILL",
            buckets=[1, 5, 10, 30, 60, 120, 300],
            registry=registry,
        )

        self.recovery_success = Counter(
            f"{namespace}_recovery_total",
            "Recovery attempts",
            ["status"],
            registry=registry,
        )

        # =====================================================================
        # Uptime Metrics
        # =====================================================================

        self.kernel_uptime = Gauge(
            f"{namespace}_kernel_uptime_seconds",
            "Kernel uptime in seconds",
            registry=registry,
        )

        self.agent_crashes = Counter(
            f"{namespace}_agent_crashes_total",
            "Agent crashes (user space)",
            ["agent_id", "reason"],
            registry=registry,
        )

        self.kernel_crashes = Counter(
            f"{namespace}_kernel_crashes_total",
            "Kernel crashes (should be 0)",
            registry=registry,
        )

        # =====================================================================
        # CMVK Metrics (ML Ops)
        # =====================================================================

        self.cmvk_verifications_total = Counter(
            f"{namespace}_cmvk_verifications_total",
            "Total CMVK verifications performed",
            ["result"],  # verified, flagged, rejected
            registry=registry,
        )

        self.cmvk_consensus_ratio = Gauge(
            f"{namespace}_cmvk_consensus_ratio",
            "Current model consensus ratio (0.0-1.0)",
            registry=registry,
        )

        self.cmvk_model_disagreements = Counter(
            f"{namespace}_cmvk_model_disagreements_total",
            "Model disagreements detected",
            ["model_pair"],  # e.g., "gpt4_claude", "claude_gemini"
            registry=registry,
        )

        self.cmvk_drift_score = Histogram(
            f"{namespace}_cmvk_drift_score",
            "Distribution of drift scores",
            buckets=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50, 1.0],
            registry=registry,
        )

        self.cmvk_verification_duration = Histogram(
            f"{namespace}_cmvk_verification_duration_seconds",
            "Time to complete CMVK verification",
            ["model_count"],
            buckets=[0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0],
            registry=registry,
        )

        self.cmvk_model_latency = Histogram(
            f"{namespace}_cmvk_model_latency_seconds",
            "Per-model response latency",
            ["model"],
            buckets=[0.5, 1.0, 2.0, 3.0, 5.0, 10.0],
            registry=registry,
        )

        self.cmvk_claims_by_confidence = Counter(
            f"{namespace}_cmvk_claims_by_confidence",
            "Claims grouped by confidence level",
            ["confidence_bucket"],  # high (>=0.9), medium (>=0.7), low (<0.7)
            registry=registry,
        )

        # =====================================================================
        # Agent-Level Metrics
        # =====================================================================

        self.agent_llm_calls = Counter(
            f"{namespace}_agent_llm_calls_total",
            "Total LLM API calls by agent",
            ["agent_id", "model"],
            registry=registry,
        )

        self.agent_errors = Counter(
            f"{namespace}_agent_errors_total",
            "Agent errors by type",
            ["agent_id", "error_type"],
            registry=registry,
        )

        self.agent_execution_duration = Histogram(
            f"{namespace}_agent_execution_duration_seconds",
            "Agent task execution time",
            ["agent_id"],
            buckets=[0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
            registry=registry,
        )

        # =====================================================================
        # Info Metrics
        # =====================================================================

        self.kernel_info = Info(
            f"{namespace}_kernel",
            "Kernel version and configuration",
            registry=registry,
        )
        # NOTE(review): both values are hard-coded literals, not read from
        # package metadata or configuration - confirm they are still accurate.
        self.kernel_info.info({
            "version": "1.1.0",
            "policy_mode": "strict"
        })

        # Internal tracking: plain Python counters used only to derive the
        # all-time violation-rate gauge.
        self._start_time = time.time()
        self._request_count = 0
        self._violation_count = 0

    # =========================================================================
    # Recording Methods
    # =========================================================================

    def record_request(self, action: str, status: str) -> None:
        """Count one processed request and refresh the violation-rate gauge."""
        self.requests_total.labels(action=action, status=status).inc()
        self._request_count += 1
        self._update_violation_rate()

    def record_violation(
        self, agent_id: str, action: str, policy: str, severity: str = "high"
    ) -> None:
        """Count one policy violation and refresh the violation-rate gauge."""
        self.violations_total.labels(
            agent_id=agent_id,
            action=action,
            policy=policy,
            severity=severity
        ).inc()
        self._violation_count += 1
        self._update_violation_rate()

    def record_blocked(self, agent_id: str, action: str) -> None:
        """Record a blocked violation (SIGKILL issued).

        Increments the blocked-violation counter, the per-agent SIGKILL
        counter and the generic signal counter, all with reason
        ``"policy_violation"``.
        """
        self.violations_blocked.labels(agent_id=agent_id, action=action).inc()
        self.sigkill_count.labels(agent_id=agent_id, reason="policy_violation").inc()
        self.signals_sent.labels(signal="SIGKILL", reason="policy_violation").inc()

    def record_signal(self, signal: str, reason: str) -> None:
        """Record a signal sent."""
        self.signals_sent.labels(signal=signal, reason=reason).inc()

    def record_recovery(self, duration_seconds: float, success: bool) -> None:
        """Record a recovery attempt: its duration and whether it succeeded."""
        self.mttr_seconds.observe(duration_seconds)
        self.recovery_success.labels(status="success" if success else "failed").inc()

    def record_crash(self, agent_id: str, reason: str, is_kernel: bool = False) -> None:
        """Record a crash.

        With ``is_kernel=True`` only the unlabelled kernel-crash counter is
        incremented and ``agent_id``/``reason`` are ignored; otherwise the
        per-agent crash counter is incremented.
        """
        if is_kernel:
            self.kernel_crashes.inc()
        else:
            self.agent_crashes.labels(agent_id=agent_id, reason=reason).inc()

    # =========================================================================
    # CMVK Recording Methods
    # =========================================================================

    def record_cmvk_verification(
        self,
        result: str,
        confidence: float,
        drift_score: float,
        duration_seconds: float,
        model_count: int = 3
    ) -> None:
        """Record a CMVK verification.

        Args:
            result: Label value for the verification counter.
            confidence: Used to pick the confidence bucket
                (``>= 0.9`` high, ``>= 0.7`` medium, otherwise low).
            drift_score: Observed into the drift histogram; the consensus
                gauge is set to ``1.0 - drift_score`` clamped to [0.0, 1.0].
            duration_seconds: Observed into the verification-duration
                histogram.
            model_count: Stringified and used as the ``model_count`` label.
        """
        self.cmvk_verifications_total.labels(result=result).inc()
        self.cmvk_drift_score.observe(drift_score)
        # Clamp so the gauge stays inside its documented 0.0-1.0 range even
        # when a caller reports a drift score outside [0, 1].
        self.cmvk_consensus_ratio.set(min(1.0, max(0.0, 1.0 - drift_score)))
        self.cmvk_verification_duration.labels(model_count=str(model_count)).observe(duration_seconds)

        # Bucket by confidence
        if confidence >= 0.9:
            bucket = "high"
        elif confidence >= 0.7:
            bucket = "medium"
        else:
            bucket = "low"
        self.cmvk_claims_by_confidence.labels(confidence_bucket=bucket).inc()

    def record_cmvk_model_response(self, model: str, latency_seconds: float) -> None:
        """Record individual model response in CMVK."""
        self.cmvk_model_latency.labels(model=model).observe(latency_seconds)

    def record_cmvk_disagreement(self, model_a: str, model_b: str) -> None:
        """Record a disagreement between two models.

        The two names are ordered before being joined, so ``(a, b)`` and
        ``(b, a)`` increment the same ``model_pair`` series.
        """
        pair = f"{model_a}_{model_b}" if model_a < model_b else f"{model_b}_{model_a}"
        self.cmvk_model_disagreements.labels(model_pair=pair).inc()

    # =========================================================================
    # Agent Recording Methods
    # =========================================================================

    def record_agent_llm_call(self, agent_id: str, model: str) -> None:
        """Record an LLM API call by an agent."""
        self.agent_llm_calls.labels(agent_id=agent_id, model=model).inc()

    def record_agent_error(self, agent_id: str, error_type: str) -> None:
        """Record an agent error."""
        self.agent_errors.labels(agent_id=agent_id, error_type=error_type).inc()

    def record_agent_execution(self, agent_id: str, duration_seconds: float) -> None:
        """Record agent task execution time."""
        self.agent_execution_duration.labels(agent_id=agent_id).observe(duration_seconds)

    def _update_violation_rate(self) -> None:
        """Set the ``all_time`` violation-rate gauge (violations per 1000 requests).

        Left untouched until at least one request has been recorded, which
        avoids a division by zero.
        """
        if self._request_count > 0:
            rate = (self._violation_count / self._request_count) * 1000
            self.violation_rate.labels(window="all_time").set(rate)

    def update_uptime(self) -> None:
        """Set the uptime gauge to the seconds elapsed since construction."""
        self.kernel_uptime.set(time.time() - self._start_time)

    # =========================================================================
    # Context Managers
    # =========================================================================

    def policy_check_latency(self, policy: str = "default"):
        """Return a timer (usable in ``with``) for the policy-check histogram."""
        return self.policy_check_duration.labels(policy=policy).time()

    def execution_latency(self, action: str, status: str = "success"):
        """Return a timer (usable in ``with``) for the execution histogram.

        The ``status`` label is fixed when the timer is created, before the
        timed code runs.
        """
        return self.execution_duration.labels(action=action, status=status).time()

    # =========================================================================
    # Export
    # =========================================================================

    def export(self) -> bytes:
        """Refresh the uptime gauge and serialize this instance's registry."""
        self.update_uptime()
        return generate_latest(self.registry)

    def content_type(self) -> str:
        """Get content type for metrics response."""
        return CONTENT_TYPE_LATEST
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def metrics_endpoint(metrics: KernelMetrics):
    """
    Build a framework-agnostic metrics handler bound to ``metrics``.

    The returned zero-argument callable produces a ``(body, content_type)``
    tuple on each call: the exported metrics payload and the content type
    to send with it.

    Usage with FastAPI:
        from fastapi import FastAPI, Response

        app = FastAPI()
        metrics = KernelMetrics()
        render = metrics_endpoint(metrics)

        @app.get("/metrics")
        def get_metrics():
            body, content_type = render()
            return Response(content=body, media_type=content_type)
    """
    def handler():
        # Export first, then ask for the content type.
        payload = metrics.export()
        mime = metrics.content_type()
        return payload, mime

    return handler
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Copyright (c) Microsoft Corporation.
|
|
2
|
+
# Licensed under the MIT License.
|
|
3
|
+
"""
|
|
4
|
+
HTTP Metrics Server for Agent OS Kernel.
|
|
5
|
+
|
|
6
|
+
Standalone server exposing /metrics endpoint for Prometheus scraping.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
# Start server
|
|
10
|
+
python -m agent_os_observability.server
|
|
11
|
+
|
|
12
|
+
# Or programmatically
|
|
13
|
+
from agent_os_observability import MetricsServer
|
|
14
|
+
server = MetricsServer(port=9090)
|
|
15
|
+
server.start()
|
|
16
|
+
|
|
17
|
+
# Scrape with Prometheus
|
|
18
|
+
# scrape_configs:
|
|
19
|
+
# - job_name: 'agent-os'
|
|
20
|
+
# static_configs:
|
|
21
|
+
# - targets: ['localhost:9090']
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import asyncio
|
|
25
|
+
import threading
|
|
26
|
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
27
|
+
from typing import Optional
|
|
28
|
+
from .metrics import KernelMetrics
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler for the /metrics, /health and /ready endpoints."""

    # Class-level metrics instance (set by server); shared by every request
    # handled through this class.
    metrics: Optional["KernelMetrics"] = None

    def do_GET(self):
        """Handle GET requests, routing on the URL path only.

        The query string is dropped before matching, so a request such as
        ``/metrics?collect=all`` reaches the metrics endpoint instead of
        falling through to 404.
        """
        route = self.path.partition("?")[0]
        if route == "/metrics":
            self._serve_metrics()
        elif route == "/health":
            self._serve_health()
        elif route == "/ready":
            self._serve_ready()
        else:
            self.send_error(404, "Not Found")

    def _send_body(self, content: bytes, content_type: str) -> None:
        """Write a complete 200 response carrying ``content``."""
        self.send_response(200)
        self.send_header("Content-Type", content_type)
        # Header values are text; convert the length explicitly.
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content)

    def _serve_metrics(self):
        """Serve Prometheus metrics, or 500 if no metrics object is set."""
        if self.metrics is None:
            self.send_error(500, "Metrics not initialized")
            return

        content = self.metrics.export()
        self._send_body(content, self.metrics.content_type())

    def _serve_health(self):
        """Serve health check (always reports healthy)."""
        self._send_body(b'{"status": "healthy"}', "application/json")

    def _serve_ready(self):
        """Serve readiness check (always reports ready)."""
        self._send_body(b'{"ready": true}', "application/json")

    def log_message(self, format, *args):
        """Suppress default logging (too noisy for /metrics)."""
        pass
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class MetricsServer:
    """
    Standalone HTTP server for Agent OS metrics.

    Endpoints:
        GET /metrics - Prometheus metrics
        GET /health - Health check ({"status": "healthy"})
        GET /ready - Readiness check ({"ready": true})

    Note:
        The default host "0.0.0.0" listens on every network interface. Pass
        host="127.0.0.1" to restrict the endpoints to the local machine.

    Example:
        # Start with default metrics
        server = MetricsServer(port=9090)
        server.start()

        # Share metrics with kernel
        from agent_os import StatelessKernel
        kernel = StatelessKernel(metrics=server.metrics)

        # Stop server
        server.stop()
    """

    def __init__(
        self,
        port: int = 9090,
        host: str = "0.0.0.0",
        metrics: Optional[KernelMetrics] = None
    ):
        """
        Store the bind address and the metrics object; nothing is bound yet.

        Args:
            port: TCP port to listen on.
            host: Address to bind to.
            metrics: Metrics object to expose. A new KernelMetrics is created
                only when this is None, so a caller-supplied object is never
                replaced just because it happens to be falsy.
        """
        self.port = port
        self.host = host
        self.metrics = metrics if metrics is not None else KernelMetrics()
        self._server: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def start(self, blocking: bool = False):
        """
        Start the metrics server.

        Args:
            blocking: If True, block the current thread. Default False (background).

        Raises:
            RuntimeError: If this instance has already been started and not
                yet stopped.
            OSError: If the address cannot be bound (raised by HTTPServer).
        """
        if self._server is not None:
            raise RuntimeError("Metrics server is already running")

        # Kept for code that serves MetricsHandler directly and expects the
        # class attribute to be populated once a server has been started.
        MetricsHandler.metrics = self.metrics

        # Bind the metrics to a per-server subclass so several MetricsServer
        # instances in one process do not overwrite each other's metrics.
        handler_cls = type(
            "BoundMetricsHandler", (MetricsHandler,), {"metrics": self.metrics}
        )
        self._server = HTTPServer((self.host, self.port), handler_cls)

        if blocking:
            print(f"Agent OS Metrics Server running on http://{self.host}:{self.port}")
            print(" /metrics - Prometheus metrics")
            print(" /health - Health check")
            print(" /ready - Readiness check")
            self._server.serve_forever()
        else:
            # Daemon thread: it must not keep the interpreter alive on exit.
            self._thread = threading.Thread(
                target=self._server.serve_forever, daemon=True
            )
            self._thread.start()
            print(f"Agent OS Metrics Server started on http://{self.host}:{self.port}/metrics")

    def stop(self):
        """Stop the metrics server and release its listening socket.

        Safe to call when the server was never started or is already stopped.
        """
        server, self._server = self._server, None
        thread, self._thread = self._thread, None

        if server is not None:
            # shutdown() only ends the serve_forever() loop; server_close()
            # is what closes the socket so the port can be bound again.
            server.shutdown()
            server.server_close()
        if thread is not None:
            thread.join(timeout=5)

    def __enter__(self):
        """Start the server in the background and return this instance."""
        self.start()
        return self

    def __exit__(self, *args):
        """Stop the server when leaving the ``with`` block."""
        self.stop()
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# =============================================================================
|
|
159
|
+
# FastAPI Integration
|
|
160
|
+
# =============================================================================
|
|
161
|
+
|
|
162
|
+
def create_fastapi_router(metrics: Optional[KernelMetrics] = None):
    """
    Create FastAPI router for metrics.

    The router exposes GET /metrics, GET /health and GET /ready.

    Args:
        metrics: Metrics object to export. A new KernelMetrics is created
            only when this is None.

    Returns:
        A fastapi.APIRouter tagged "observability".

    Raises:
        ImportError: If FastAPI is not installed. The original import error
            is attached as the cause.

    Usage:
        from fastapi import FastAPI
        from agent_os_observability import create_fastapi_router, KernelMetrics

        app = FastAPI()
        metrics = KernelMetrics()
        app.include_router(create_fastapi_router(metrics))
    """
    # FastAPI is an optional dependency, so it is imported on demand.
    try:
        from fastapi import APIRouter, Response
    except ImportError as exc:
        raise ImportError(
            "FastAPI not installed. Install with: pip install fastapi"
        ) from exc

    router = APIRouter(tags=["observability"])
    _metrics = metrics if metrics is not None else KernelMetrics()

    @router.get("/metrics")
    def get_metrics():
        return Response(
            content=_metrics.export(),
            media_type=_metrics.content_type()
        )

    @router.get("/health")
    def health():
        return {"status": "healthy"}

    @router.get("/ready")
    def ready():
        return {"ready": True}

    return router
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
# =============================================================================
|
|
201
|
+
# CLI Entry Point
|
|
202
|
+
# =============================================================================
|
|
203
|
+
|
|
204
|
+
def main(argv: Optional[list] = None):
    """Run metrics server from command line.

    Args:
        argv: Command-line arguments to parse. None (the default) makes
            argparse read sys.argv[1:], which is the previous behavior.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Agent OS Metrics Server")
    parser.add_argument("--port", type=int, default=9090, help="Port to listen on")
    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    args = parser.parse_args(argv)

    server = MetricsServer(port=args.port, host=args.host)

    try:
        server.start(blocking=True)
    except KeyboardInterrupt:
        print("\nShutting down...")
    finally:
        # Runs on Ctrl-C and on any other exit, so the server is always stopped.
        server.stop()


if __name__ == "__main__":
    main()
|