agent_os_kernel 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (337) hide show
  1. agent_control_plane/__init__.py +662 -0
  2. agent_control_plane/a2a_adapter.py +543 -0
  3. agent_control_plane/adapter.py +417 -0
  4. agent_control_plane/agent_hibernation.py +394 -0
  5. agent_control_plane/agent_kernel.py +470 -0
  6. agent_control_plane/compliance.py +720 -0
  7. agent_control_plane/constraint_graphs.py +478 -0
  8. agent_control_plane/control_plane.py +854 -0
  9. agent_control_plane/example_executors.py +195 -0
  10. agent_control_plane/execution_engine.py +231 -0
  11. agent_control_plane/flight_recorder.py +846 -0
  12. agent_control_plane/governance_layer.py +435 -0
  13. agent_control_plane/hf_utils.py +563 -0
  14. agent_control_plane/interfaces/__init__.py +55 -0
  15. agent_control_plane/interfaces/kernel_interface.py +361 -0
  16. agent_control_plane/interfaces/plugin_interface.py +497 -0
  17. agent_control_plane/interfaces/protocol_interfaces.py +387 -0
  18. agent_control_plane/kernel_space.py +1009 -0
  19. agent_control_plane/langchain_adapter.py +424 -0
  20. agent_control_plane/lifecycle.py +3113 -0
  21. agent_control_plane/mcp_adapter.py +653 -0
  22. agent_control_plane/ml_safety.py +563 -0
  23. agent_control_plane/multimodal.py +727 -0
  24. agent_control_plane/mute_agent.py +422 -0
  25. agent_control_plane/observability.py +787 -0
  26. agent_control_plane/orchestrator.py +482 -0
  27. agent_control_plane/plugin_registry.py +750 -0
  28. agent_control_plane/policy_engine.py +954 -0
  29. agent_control_plane/process_isolation.py +777 -0
  30. agent_control_plane/shadow_mode.py +310 -0
  31. agent_control_plane/signals.py +493 -0
  32. agent_control_plane/supervisor_agents.py +430 -0
  33. agent_control_plane/time_travel_debugger.py +557 -0
  34. agent_control_plane/tool_registry.py +452 -0
  35. agent_control_plane/vfs.py +697 -0
  36. agent_kernel/__init__.py +69 -0
  37. agent_kernel/analyzer.py +435 -0
  38. agent_kernel/auditor.py +36 -0
  39. agent_kernel/completeness_auditor.py +237 -0
  40. agent_kernel/detector.py +203 -0
  41. agent_kernel/kernel.py +744 -0
  42. agent_kernel/memory_manager.py +85 -0
  43. agent_kernel/models.py +374 -0
  44. agent_kernel/nudge_mechanism.py +263 -0
  45. agent_kernel/outcome_analyzer.py +338 -0
  46. agent_kernel/patcher.py +582 -0
  47. agent_kernel/semantic_analyzer.py +316 -0
  48. agent_kernel/semantic_purge.py +349 -0
  49. agent_kernel/simulator.py +449 -0
  50. agent_kernel/teacher.py +85 -0
  51. agent_kernel/triage.py +152 -0
  52. agent_os/__init__.py +409 -0
  53. agent_os/_adversarial_impl.py +200 -0
  54. agent_os/_circuit_breaker_impl.py +232 -0
  55. agent_os/_mcp_metrics.py +193 -0
  56. agent_os/adversarial.py +20 -0
  57. agent_os/agents_compat.py +490 -0
  58. agent_os/audit_logger.py +135 -0
  59. agent_os/base_agent.py +651 -0
  60. agent_os/circuit_breaker.py +34 -0
  61. agent_os/cli/__init__.py +659 -0
  62. agent_os/cli/cmd_audit.py +128 -0
  63. agent_os/cli/cmd_init.py +152 -0
  64. agent_os/cli/cmd_policy.py +41 -0
  65. agent_os/cli/cmd_policy_gen.py +180 -0
  66. agent_os/cli/cmd_validate.py +258 -0
  67. agent_os/cli/mcp_scan.py +265 -0
  68. agent_os/cli/output.py +192 -0
  69. agent_os/cli/policy_checker.py +330 -0
  70. agent_os/compat.py +74 -0
  71. agent_os/constraint_graph.py +234 -0
  72. agent_os/content_governance.py +140 -0
  73. agent_os/context_budget.py +305 -0
  74. agent_os/credential_redactor.py +224 -0
  75. agent_os/diff_policy.py +89 -0
  76. agent_os/egress_policy.py +159 -0
  77. agent_os/escalation.py +276 -0
  78. agent_os/event_bus.py +124 -0
  79. agent_os/exceptions.py +180 -0
  80. agent_os/execution_context_policy.py +141 -0
  81. agent_os/github_enterprise.py +96 -0
  82. agent_os/health.py +20 -0
  83. agent_os/integrations/__init__.py +279 -0
  84. agent_os/integrations/a2a_adapter.py +279 -0
  85. agent_os/integrations/agent_lightning/__init__.py +30 -0
  86. agent_os/integrations/anthropic_adapter.py +420 -0
  87. agent_os/integrations/autogen_adapter.py +620 -0
  88. agent_os/integrations/base.py +1137 -0
  89. agent_os/integrations/compat.py +229 -0
  90. agent_os/integrations/config.py +98 -0
  91. agent_os/integrations/conversation_guardian.py +957 -0
  92. agent_os/integrations/crewai_adapter.py +467 -0
  93. agent_os/integrations/drift_detector.py +425 -0
  94. agent_os/integrations/dry_run.py +124 -0
  95. agent_os/integrations/escalation.py +582 -0
  96. agent_os/integrations/gemini_adapter.py +364 -0
  97. agent_os/integrations/google_adk_adapter.py +633 -0
  98. agent_os/integrations/guardrails_adapter.py +394 -0
  99. agent_os/integrations/health.py +197 -0
  100. agent_os/integrations/langchain_adapter.py +654 -0
  101. agent_os/integrations/llamafirewall.py +343 -0
  102. agent_os/integrations/llamaindex_adapter.py +188 -0
  103. agent_os/integrations/logging.py +191 -0
  104. agent_os/integrations/maf_adapter.py +631 -0
  105. agent_os/integrations/mistral_adapter.py +365 -0
  106. agent_os/integrations/openai_adapter.py +816 -0
  107. agent_os/integrations/openai_agents_sdk.py +406 -0
  108. agent_os/integrations/policy_compose.py +171 -0
  109. agent_os/integrations/profiling.py +144 -0
  110. agent_os/integrations/pydantic_ai_adapter.py +420 -0
  111. agent_os/integrations/rate_limiter.py +130 -0
  112. agent_os/integrations/rbac.py +143 -0
  113. agent_os/integrations/registry.py +113 -0
  114. agent_os/integrations/scope_guard.py +303 -0
  115. agent_os/integrations/semantic_kernel_adapter.py +769 -0
  116. agent_os/integrations/smolagents_adapter.py +629 -0
  117. agent_os/integrations/templates.py +178 -0
  118. agent_os/integrations/token_budget.py +134 -0
  119. agent_os/integrations/tool_aliases.py +190 -0
  120. agent_os/integrations/webhooks.py +177 -0
  121. agent_os/lite.py +208 -0
  122. agent_os/mcp_gateway.py +385 -0
  123. agent_os/mcp_message_signer.py +273 -0
  124. agent_os/mcp_protocols.py +161 -0
  125. agent_os/mcp_response_scanner.py +232 -0
  126. agent_os/mcp_security.py +924 -0
  127. agent_os/mcp_session_auth.py +231 -0
  128. agent_os/mcp_sliding_rate_limiter.py +184 -0
  129. agent_os/memory_guard.py +409 -0
  130. agent_os/metrics.py +134 -0
  131. agent_os/mute.py +428 -0
  132. agent_os/mute_agent.py +209 -0
  133. agent_os/policies/__init__.py +77 -0
  134. agent_os/policies/async_evaluator.py +275 -0
  135. agent_os/policies/backends.py +670 -0
  136. agent_os/policies/bridge.py +169 -0
  137. agent_os/policies/budget.py +85 -0
  138. agent_os/policies/cli.py +294 -0
  139. agent_os/policies/conflict_resolution.py +270 -0
  140. agent_os/policies/data_classification.py +252 -0
  141. agent_os/policies/evaluator.py +239 -0
  142. agent_os/policies/policy_schema.json +228 -0
  143. agent_os/policies/rate_limiting.py +145 -0
  144. agent_os/policies/schema.py +115 -0
  145. agent_os/policies/shared.py +331 -0
  146. agent_os/prompt_injection.py +694 -0
  147. agent_os/providers.py +182 -0
  148. agent_os/py.typed +0 -0
  149. agent_os/retry.py +81 -0
  150. agent_os/reversibility.py +251 -0
  151. agent_os/sandbox.py +432 -0
  152. agent_os/sandbox_provider.py +140 -0
  153. agent_os/secure_codegen.py +525 -0
  154. agent_os/security_skills.py +538 -0
  155. agent_os/semantic_policy.py +422 -0
  156. agent_os/server/__init__.py +15 -0
  157. agent_os/server/__main__.py +25 -0
  158. agent_os/server/app.py +277 -0
  159. agent_os/server/models.py +104 -0
  160. agent_os/shift_left_metrics.py +130 -0
  161. agent_os/stateless.py +742 -0
  162. agent_os/supervisor.py +148 -0
  163. agent_os/task_outcome.py +148 -0
  164. agent_os/transparency.py +181 -0
  165. agent_os/trust_root.py +128 -0
  166. agent_os_kernel-3.1.0.dist-info/METADATA +1269 -0
  167. agent_os_kernel-3.1.0.dist-info/RECORD +337 -0
  168. agent_os_kernel-3.1.0.dist-info/WHEEL +4 -0
  169. agent_os_kernel-3.1.0.dist-info/entry_points.txt +2 -0
  170. agent_os_kernel-3.1.0.dist-info/licenses/LICENSE +21 -0
  171. agent_os_observability/__init__.py +27 -0
  172. agent_os_observability/dashboards.py +898 -0
  173. agent_os_observability/metrics.py +398 -0
  174. agent_os_observability/server.py +223 -0
  175. agent_os_observability/tracer.py +232 -0
  176. agent_primitives/__init__.py +24 -0
  177. agent_primitives/failures.py +84 -0
  178. agent_primitives/py.typed +0 -0
  179. amb_core/__init__.py +177 -0
  180. amb_core/adapters/__init__.py +57 -0
  181. amb_core/adapters/aws_sqs_broker.py +376 -0
  182. amb_core/adapters/azure_servicebus_broker.py +340 -0
  183. amb_core/adapters/kafka_broker.py +260 -0
  184. amb_core/adapters/nats_broker.py +285 -0
  185. amb_core/adapters/rabbitmq_broker.py +235 -0
  186. amb_core/adapters/redis_broker.py +262 -0
  187. amb_core/broker.py +145 -0
  188. amb_core/bus.py +481 -0
  189. amb_core/cloudevents.py +509 -0
  190. amb_core/dlq.py +345 -0
  191. amb_core/hf_utils.py +536 -0
  192. amb_core/memory_broker.py +410 -0
  193. amb_core/models.py +141 -0
  194. amb_core/persistence.py +529 -0
  195. amb_core/schema.py +294 -0
  196. amb_core/tracing.py +358 -0
  197. atr/__init__.py +640 -0
  198. atr/access.py +348 -0
  199. atr/composition.py +645 -0
  200. atr/decorator.py +357 -0
  201. atr/executor.py +384 -0
  202. atr/health.py +557 -0
  203. atr/hf_utils.py +449 -0
  204. atr/injection.py +422 -0
  205. atr/metrics.py +440 -0
  206. atr/policies.py +403 -0
  207. atr/py.typed +2 -0
  208. atr/registry.py +452 -0
  209. atr/schema.py +480 -0
  210. atr/tools/safe/__init__.py +75 -0
  211. atr/tools/safe/calculator.py +467 -0
  212. atr/tools/safe/datetime_tool.py +443 -0
  213. atr/tools/safe/file_reader.py +402 -0
  214. atr/tools/safe/http_client.py +316 -0
  215. atr/tools/safe/json_parser.py +374 -0
  216. atr/tools/safe/text_tool.py +537 -0
  217. atr/tools/safe/toolkit.py +175 -0
  218. caas/__init__.py +162 -0
  219. caas/api/__init__.py +7 -0
  220. caas/api/server.py +1328 -0
  221. caas/caching.py +834 -0
  222. caas/cli.py +210 -0
  223. caas/conversation.py +223 -0
  224. caas/decay.py +72 -0
  225. caas/detection/__init__.py +9 -0
  226. caas/detection/detector.py +238 -0
  227. caas/enrichment.py +130 -0
  228. caas/gateway/__init__.py +27 -0
  229. caas/gateway/trust_gateway.py +474 -0
  230. caas/hf_utils.py +479 -0
  231. caas/ingestion/__init__.py +23 -0
  232. caas/ingestion/processors.py +253 -0
  233. caas/ingestion/structure_parser.py +188 -0
  234. caas/models.py +356 -0
  235. caas/pragmatic_truth.py +444 -0
  236. caas/routing/__init__.py +10 -0
  237. caas/routing/heuristic_router.py +58 -0
  238. caas/storage/__init__.py +9 -0
  239. caas/storage/store.py +389 -0
  240. caas/triad.py +213 -0
  241. caas/tuning/__init__.py +9 -0
  242. caas/tuning/tuner.py +329 -0
  243. caas/vfs/__init__.py +14 -0
  244. caas/vfs/filesystem.py +452 -0
  245. cmvk/__init__.py +218 -0
  246. cmvk/audit.py +402 -0
  247. cmvk/benchmarks.py +478 -0
  248. cmvk/constitutional.py +904 -0
  249. cmvk/hf_utils.py +301 -0
  250. cmvk/metrics.py +473 -0
  251. cmvk/profiles.py +300 -0
  252. cmvk/py.typed +0 -0
  253. cmvk/types.py +12 -0
  254. cmvk/verification.py +956 -0
  255. emk/__init__.py +89 -0
  256. emk/causal.py +352 -0
  257. emk/hf_utils.py +421 -0
  258. emk/indexer.py +83 -0
  259. emk/py.typed +0 -0
  260. emk/schema.py +204 -0
  261. emk/sleep_cycle.py +347 -0
  262. emk/store.py +281 -0
  263. iatp/__init__.py +166 -0
  264. iatp/attestation.py +461 -0
  265. iatp/cli.py +317 -0
  266. iatp/hf_utils.py +472 -0
  267. iatp/ipc_pipes.py +580 -0
  268. iatp/main.py +412 -0
  269. iatp/models/__init__.py +447 -0
  270. iatp/policy_engine.py +337 -0
  271. iatp/py.typed +2 -0
  272. iatp/recovery.py +321 -0
  273. iatp/security/__init__.py +270 -0
  274. iatp/sidecar/__init__.py +519 -0
  275. iatp/telemetry/__init__.py +164 -0
  276. iatp/tests/__init__.py +1 -0
  277. iatp/tests/test_attestation.py +370 -0
  278. iatp/tests/test_cli.py +131 -0
  279. iatp/tests/test_ed25519_attestation.py +211 -0
  280. iatp/tests/test_models.py +130 -0
  281. iatp/tests/test_policy_engine.py +347 -0
  282. iatp/tests/test_recovery.py +281 -0
  283. iatp/tests/test_security.py +222 -0
  284. iatp/tests/test_sidecar.py +167 -0
  285. iatp/tests/test_telemetry.py +175 -0
  286. mcp_kernel_server/__init__.py +28 -0
  287. mcp_kernel_server/cli.py +274 -0
  288. mcp_kernel_server/resources.py +217 -0
  289. mcp_kernel_server/server.py +564 -0
  290. mcp_kernel_server/tools.py +1174 -0
  291. mute_agent/__init__.py +68 -0
  292. mute_agent/core/__init__.py +1 -0
  293. mute_agent/core/execution_agent.py +166 -0
  294. mute_agent/core/handshake_protocol.py +201 -0
  295. mute_agent/core/reasoning_agent.py +238 -0
  296. mute_agent/knowledge_graph/__init__.py +1 -0
  297. mute_agent/knowledge_graph/graph_elements.py +65 -0
  298. mute_agent/knowledge_graph/multidimensional_graph.py +170 -0
  299. mute_agent/knowledge_graph/subgraph.py +224 -0
  300. mute_agent/listener/__init__.py +43 -0
  301. mute_agent/listener/adapters/__init__.py +31 -0
  302. mute_agent/listener/adapters/base_adapter.py +189 -0
  303. mute_agent/listener/adapters/caas_adapter.py +344 -0
  304. mute_agent/listener/adapters/control_plane_adapter.py +436 -0
  305. mute_agent/listener/adapters/iatp_adapter.py +332 -0
  306. mute_agent/listener/adapters/scak_adapter.py +251 -0
  307. mute_agent/listener/listener.py +610 -0
  308. mute_agent/listener/state_observer.py +436 -0
  309. mute_agent/listener/threshold_config.py +313 -0
  310. mute_agent/super_system/__init__.py +1 -0
  311. mute_agent/super_system/router.py +204 -0
  312. mute_agent/visualization/__init__.py +10 -0
  313. mute_agent/visualization/graph_debugger.py +502 -0
  314. nexus/README.md +60 -0
  315. nexus/__init__.py +51 -0
  316. nexus/arbiter.py +359 -0
  317. nexus/client.py +466 -0
  318. nexus/dmz.py +444 -0
  319. nexus/escrow.py +430 -0
  320. nexus/exceptions.py +286 -0
  321. nexus/pyproject.toml +36 -0
  322. nexus/registry.py +393 -0
  323. nexus/reputation.py +425 -0
  324. nexus/schemas/__init__.py +51 -0
  325. nexus/schemas/compliance.py +276 -0
  326. nexus/schemas/escrow.py +251 -0
  327. nexus/schemas/manifest.py +225 -0
  328. nexus/schemas/receipt.py +208 -0
  329. nexus/tests/__init__.py +0 -0
  330. nexus/tests/conftest.py +146 -0
  331. nexus/tests/test_arbiter.py +192 -0
  332. nexus/tests/test_dmz.py +194 -0
  333. nexus/tests/test_escrow.py +276 -0
  334. nexus/tests/test_exceptions.py +225 -0
  335. nexus/tests/test_registry.py +232 -0
  336. nexus/tests/test_reputation.py +328 -0
  337. nexus/tests/test_schemas.py +295 -0
@@ -0,0 +1,398 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ """
4
+ Prometheus Metrics for Agent OS Kernel.
5
+
6
+ Key metrics for CISOs:
7
+ - Safety violation rate (target: 0%)
8
+ - Policy enforcement latency (<5ms target)
9
+ - Agent uptime
10
+ - MTTR after SIGKILL
11
+
12
+ Key metrics for ML Ops:
13
+ - CMVK consensus rate
14
+ - Model disagreement tracking
15
+ - Verification latency
16
+ """
17
+
18
+ from prometheus_client import Counter, Histogram, Gauge, Info, Summary, generate_latest, CONTENT_TYPE_LATEST
19
+ from typing import Optional
20
+ import time
21
+
22
+
23
class KernelMetrics:
    """
    Prometheus metrics for Agent OS kernel operations.

    All collectors are created in ``__init__`` and are named
    ``<namespace>_<metric>``. By default they are registered in the
    prometheus_client default registry; because that registry rejects
    duplicate metric names, creating a second instance with the same
    namespace in one process requires passing a dedicated ``registry``.

    Usage:
        metrics = KernelMetrics()

        # Record policy check
        with metrics.policy_check_latency():
            result = policy_engine.check(action)

        # Record violation
        if not result.allowed:
            metrics.record_violation(agent_id, action, policy)

        # Expose metrics
        @app.get("/metrics")
        def metrics_endpoint():
            return Response(metrics.export(), media_type=metrics.content_type())
    """

    def __init__(self, namespace: str = "agent_os", registry: Optional[object] = None):
        """
        Create and register every collector.

        Args:
            namespace: Prefix applied to every metric name.
            registry: Optional ``prometheus_client.CollectorRegistry`` to
                register the collectors in and to export from. ``None``
                (the default) keeps the library's default registry.
        """
        self.namespace = namespace
        self._registry = registry
        # Only forward ``registry`` when one was supplied so the library
        # default stays in effect otherwise.
        reg = {} if registry is None else {"registry": registry}

        # =====================================================================
        # Safety Metrics (Most Important for CISOs)
        # =====================================================================

        self.violations_total = Counter(
            f"{namespace}_violations_total",
            "Total policy violations detected",
            ["agent_id", "action", "policy", "severity"],
            **reg,
        )

        self.violations_blocked = Counter(
            f"{namespace}_violations_blocked_total",
            "Violations blocked by kernel (SIGKILL issued)",
            ["agent_id", "action"],
            **reg,
        )

        self.violation_rate = Gauge(
            f"{namespace}_violation_rate",
            "Current violation rate (violations per 1000 requests)",
            ["window"],
            **reg,
        )

        # =====================================================================
        # Performance Metrics
        # =====================================================================

        self.policy_check_duration = Histogram(
            f"{namespace}_policy_check_duration_seconds",
            "Time to check policies",
            ["policy"],
            buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1],
            **reg,
        )

        self.execution_duration = Histogram(
            f"{namespace}_execution_duration_seconds",
            "Time to execute governed action",
            ["action", "status"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
            **reg,
        )

        self.kernel_latency = Histogram(
            f"{namespace}_kernel_latency_seconds",
            "Total kernel overhead (policy + dispatch)",
            buckets=[0.001, 0.005, 0.01, 0.025, 0.05, 0.1],
            **reg,
        )

        # =====================================================================
        # Throughput Metrics
        # =====================================================================

        self.requests_total = Counter(
            f"{namespace}_requests_total",
            "Total requests processed",
            ["action", "status"],
            **reg,
        )

        self.active_agents = Gauge(
            f"{namespace}_active_agents",
            "Number of active agents",
            **reg,
        )

        # =====================================================================
        # Signal Metrics
        # =====================================================================

        self.signals_sent = Counter(
            f"{namespace}_signals_total",
            "Signals sent to agents",
            ["signal", "reason"],
            **reg,
        )

        self.sigkill_count = Counter(
            f"{namespace}_sigkill_total",
            "SIGKILL signals issued",
            ["agent_id", "reason"],
            **reg,
        )

        # =====================================================================
        # Recovery Metrics
        # =====================================================================

        self.mttr_seconds = Histogram(
            f"{namespace}_mttr_seconds",
            "Mean Time To Recovery after SIGKILL",
            buckets=[1, 5, 10, 30, 60, 120, 300],
            **reg,
        )

        self.recovery_success = Counter(
            f"{namespace}_recovery_total",
            "Recovery attempts",
            ["status"],
            **reg,
        )

        # =====================================================================
        # Uptime Metrics
        # =====================================================================

        self.kernel_uptime = Gauge(
            f"{namespace}_kernel_uptime_seconds",
            "Kernel uptime in seconds",
            **reg,
        )

        self.agent_crashes = Counter(
            f"{namespace}_agent_crashes_total",
            "Agent crashes (user space)",
            ["agent_id", "reason"],
            **reg,
        )

        self.kernel_crashes = Counter(
            f"{namespace}_kernel_crashes_total",
            "Kernel crashes (should be 0)",
            **reg,
        )

        # =====================================================================
        # CMVK Metrics (ML Ops)
        # =====================================================================

        self.cmvk_verifications_total = Counter(
            f"{namespace}_cmvk_verifications_total",
            "Total CMVK verifications performed",
            ["result"],  # verified, flagged, rejected
            **reg,
        )

        self.cmvk_consensus_ratio = Gauge(
            f"{namespace}_cmvk_consensus_ratio",
            "Current model consensus ratio (0.0-1.0)",
            **reg,
        )

        self.cmvk_model_disagreements = Counter(
            f"{namespace}_cmvk_model_disagreements_total",
            "Model disagreements detected",
            ["model_pair"],  # e.g., "gpt4_claude", "claude_gemini"
            **reg,
        )

        self.cmvk_drift_score = Histogram(
            f"{namespace}_cmvk_drift_score",
            "Distribution of drift scores",
            buckets=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.40, 0.50, 1.0],
            **reg,
        )

        self.cmvk_verification_duration = Histogram(
            f"{namespace}_cmvk_verification_duration_seconds",
            "Time to complete CMVK verification",
            ["model_count"],
            buckets=[0.5, 1.0, 2.0, 3.0, 5.0, 10.0, 15.0],
            **reg,
        )

        self.cmvk_model_latency = Histogram(
            f"{namespace}_cmvk_model_latency_seconds",
            "Per-model response latency",
            ["model"],
            buckets=[0.5, 1.0, 2.0, 3.0, 5.0, 10.0],
            **reg,
        )

        self.cmvk_claims_by_confidence = Counter(
            f"{namespace}_cmvk_claims_by_confidence",
            "Claims grouped by confidence level",
            ["confidence_bucket"],  # high (>=0.9), medium (>=0.7), low (<0.7)
            **reg,
        )

        # =====================================================================
        # Agent-Level Metrics
        # =====================================================================

        self.agent_llm_calls = Counter(
            f"{namespace}_agent_llm_calls_total",
            "Total LLM API calls by agent",
            ["agent_id", "model"],
            **reg,
        )

        self.agent_errors = Counter(
            f"{namespace}_agent_errors_total",
            "Agent errors by type",
            ["agent_id", "error_type"],
            **reg,
        )

        self.agent_execution_duration = Histogram(
            f"{namespace}_agent_execution_duration_seconds",
            "Agent task execution time",
            ["agent_id"],
            buckets=[0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
            **reg,
        )

        # =====================================================================
        # Info Metrics
        # =====================================================================

        self.kernel_info = Info(
            f"{namespace}_kernel",
            "Kernel version and configuration",
            **reg,
        )
        self.kernel_info.info({
            "version": "1.1.0",
            "policy_mode": "strict"
        })

        # Internal tracking used to derive the violation-rate gauge.
        self._start_time = time.time()
        self._request_count = 0
        self._violation_count = 0

    # =========================================================================
    # Recording Methods
    # =========================================================================

    def record_request(self, action: str, status: str) -> None:
        """Count one processed request and refresh the violation-rate gauge."""
        self.requests_total.labels(action=action, status=status).inc()
        self._request_count += 1
        self._update_violation_rate()

    def record_violation(self, agent_id: str, action: str, policy: str, severity: str = "high") -> None:
        """Count one policy violation and refresh the violation-rate gauge."""
        self.violations_total.labels(
            agent_id=agent_id,
            action=action,
            policy=policy,
            severity=severity
        ).inc()
        self._violation_count += 1
        self._update_violation_rate()

    def record_blocked(self, agent_id: str, action: str) -> None:
        """Record a blocked violation (SIGKILL issued).

        Increments the blocked-violation counter, the per-agent SIGKILL
        counter and the generic signal counter, all with reason
        ``"policy_violation"``.
        """
        self.violations_blocked.labels(agent_id=agent_id, action=action).inc()
        self.sigkill_count.labels(agent_id=agent_id, reason="policy_violation").inc()
        self.signals_sent.labels(signal="SIGKILL", reason="policy_violation").inc()

    def record_signal(self, signal: str, reason: str) -> None:
        """Record a signal sent."""
        self.signals_sent.labels(signal=signal, reason=reason).inc()

    def record_recovery(self, duration_seconds: float, success: bool) -> None:
        """Record a recovery attempt: its duration and whether it succeeded."""
        self.mttr_seconds.observe(duration_seconds)
        self.recovery_success.labels(status="success" if success else "failed").inc()

    def record_crash(self, agent_id: str, reason: str, is_kernel: bool = False) -> None:
        """Record a crash.

        When ``is_kernel`` is true only the unlabelled kernel-crash counter
        is incremented and ``agent_id``/``reason`` are not used.
        """
        if is_kernel:
            self.kernel_crashes.inc()
        else:
            self.agent_crashes.labels(agent_id=agent_id, reason=reason).inc()

    # =========================================================================
    # CMVK Recording Methods
    # =========================================================================

    def record_cmvk_verification(
        self,
        result: str,
        confidence: float,
        drift_score: float,
        duration_seconds: float,
        model_count: int = 3
    ) -> None:
        """Record a CMVK verification.

        The consensus-ratio gauge is set to ``1.0 - drift_score`` and the
        claim is counted in the "high" (>= 0.9), "medium" (>= 0.7) or "low"
        confidence bucket.
        """
        self.cmvk_verifications_total.labels(result=result).inc()
        self.cmvk_drift_score.observe(drift_score)
        self.cmvk_consensus_ratio.set(1.0 - drift_score)
        self.cmvk_verification_duration.labels(model_count=str(model_count)).observe(duration_seconds)

        # Bucket by confidence
        if confidence >= 0.9:
            bucket = "high"
        elif confidence >= 0.7:
            bucket = "medium"
        else:
            bucket = "low"
        self.cmvk_claims_by_confidence.labels(confidence_bucket=bucket).inc()

    def record_cmvk_model_response(self, model: str, latency_seconds: float) -> None:
        """Record individual model response latency in CMVK."""
        self.cmvk_model_latency.labels(model=model).observe(latency_seconds)

    def record_cmvk_disagreement(self, model_a: str, model_b: str) -> None:
        """Record a disagreement between two models.

        The pair label is order-independent: the two names are joined in
        ascending string order so (a, b) and (b, a) share one series.
        """
        pair = f"{model_a}_{model_b}" if model_a < model_b else f"{model_b}_{model_a}"
        self.cmvk_model_disagreements.labels(model_pair=pair).inc()

    # =========================================================================
    # Agent Recording Methods
    # =========================================================================

    def record_agent_llm_call(self, agent_id: str, model: str) -> None:
        """Record an LLM API call by an agent."""
        self.agent_llm_calls.labels(agent_id=agent_id, model=model).inc()

    def record_agent_error(self, agent_id: str, error_type: str) -> None:
        """Record an agent error."""
        self.agent_errors.labels(agent_id=agent_id, error_type=error_type).inc()

    def record_agent_execution(self, agent_id: str, duration_seconds: float) -> None:
        """Record agent task execution time."""
        self.agent_execution_duration.labels(agent_id=agent_id).observe(duration_seconds)

    def _update_violation_rate(self) -> None:
        """Set the "all_time" violation rate (violations per 1000 requests).

        Nothing is written until at least one request has been recorded, to
        avoid dividing by zero.
        """
        if self._request_count > 0:
            rate = (self._violation_count / self._request_count) * 1000
            self.violation_rate.labels(window="all_time").set(rate)

    def update_uptime(self) -> None:
        """Set the uptime gauge to the seconds elapsed since construction."""
        self.kernel_uptime.set(time.time() - self._start_time)

    # =========================================================================
    # Context Managers
    # =========================================================================

    def policy_check_latency(self, policy: str = "default"):
        """Context manager to measure policy check latency."""
        return self.policy_check_duration.labels(policy=policy).time()

    def execution_latency(self, action: str, status: str = "success"):
        """Context manager to measure execution latency."""
        return self.execution_duration.labels(action=action, status=status).time()

    # =========================================================================
    # Export
    # =========================================================================

    def export(self) -> bytes:
        """Refresh the uptime gauge and return the metrics in Prometheus format.

        Exports from the registry given at construction time, or from the
        library default registry when none was given.
        """
        self.update_uptime()
        if self._registry is None:
            return generate_latest()
        return generate_latest(self._registry)

    def content_type(self) -> str:
        """Get content type for metrics response."""
        return CONTENT_TYPE_LATEST
377
+
378
+
379
def metrics_endpoint(metrics: "KernelMetrics"):
    """
    Build a zero-argument handler bound to ``metrics``.

    Each call of the returned handler yields a ``(body, content_type)``
    tuple taken from ``metrics.export()`` and ``metrics.content_type()``.

    Usage with FastAPI:
        from fastapi import FastAPI, Response

        app = FastAPI()
        metrics = KernelMetrics()

        @app.get("/metrics")
        def get_metrics():
            return Response(
                content=metrics.export(),
                media_type=metrics.content_type()
            )
    """
    def handler():
        # Export first, then look up the content type (same order as before).
        payload = metrics.export()
        media_type = metrics.content_type()
        return payload, media_type

    return handler
@@ -0,0 +1,223 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT License.
3
+ """
4
+ HTTP Metrics Server for Agent OS Kernel.
5
+
6
+ Standalone server exposing /metrics endpoint for Prometheus scraping.
7
+
8
+ Usage:
9
+ # Start server
10
+ python -m agent_os_observability.server
11
+
12
+ # Or programmatically
13
+ from agent_os_observability import MetricsServer
14
+ server = MetricsServer(port=9090)
15
+ server.start()
16
+
17
+ # Scrape with Prometheus
18
+ # scrape_configs:
19
+ # - job_name: 'agent-os'
20
+ # static_configs:
21
+ # - targets: ['localhost:9090']
22
+ """
23
+
24
+ import asyncio
25
+ import threading
26
+ from http.server import HTTPServer, BaseHTTPRequestHandler
27
+ from typing import Optional
28
+ from .metrics import KernelMetrics
29
+
30
+
31
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler serving /metrics, /health and /ready.

    Routing ignores any query string or fragment, so a scrape URL such as
    ``/metrics?format=text`` is served like ``/metrics``. Every other path
    gets a 404.
    """

    # Class-level metrics instance (set by server)
    metrics: Optional["KernelMetrics"] = None

    def do_GET(self):
        """Handle GET requests."""
        # Route on the path component only; scrapers may append parameters.
        route = self.path.split("?", 1)[0].split("#", 1)[0]
        if route == "/metrics":
            self._serve_metrics()
        elif route == "/health":
            self._serve_health()
        elif route == "/ready":
            self._serve_ready()
        else:
            self.send_error(404, "Not Found")

    def _send_body(self, content: bytes, content_type: str) -> None:
        """Write a complete 200 response carrying ``content``."""
        self.send_response(200)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(content)))
        self.end_headers()
        self.wfile.write(content)

    def _serve_metrics(self):
        """Serve Prometheus metrics, or a 500 if no metrics object is set."""
        if self.metrics is None:
            self.send_error(500, "Metrics not initialized")
            return

        self._send_body(self.metrics.export(), self.metrics.content_type())

    def _serve_health(self):
        """Serve health check."""
        self._send_body(b'{"status": "healthy"}', "application/json")

    def _serve_ready(self):
        """Serve readiness check."""
        self._send_body(b'{"ready": true}', "application/json")

    def log_message(self, format, *args):
        """Suppress default logging (too noisy for /metrics)."""
        pass
82
+
83
+
84
class MetricsServer:
    """
    Standalone HTTP server for Agent OS metrics.

    Endpoints:
        GET /metrics - Prometheus metrics
        GET /health - Health check ({"status": "healthy"})
        GET /ready - Readiness check ({"ready": true})

    Example:
        # Start with default metrics
        server = MetricsServer(port=9090)
        server.start()

        # Share metrics with kernel
        from agent_os import StatelessKernel
        kernel = StatelessKernel(metrics=server.metrics)

        # Stop server
        server.stop()

    The instance is also a context manager: entering starts the server in
    the background and leaving stops it.
    """

    def __init__(
        self,
        port: int = 9090,
        host: str = "0.0.0.0",
        metrics: Optional["KernelMetrics"] = None
    ):
        """
        Args:
            port: TCP port to listen on.
            host: Interface to bind to.
            metrics: Metrics object to expose; a new ``KernelMetrics`` is
                created when omitted.
        """
        self.port = port
        self.host = host
        self.metrics = metrics or KernelMetrics()
        self._server: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def start(self, blocking: bool = False):
        """
        Start the metrics server.

        Args:
            blocking: If True, block the current thread. Default False (background).
        """
        # Set metrics on handler class
        MetricsHandler.metrics = self.metrics

        self._server = HTTPServer((self.host, self.port), MetricsHandler)
        # Record the port actually bound so that port=0 (OS-assigned)
        # is reflected in ``self.port`` and in the messages below.
        self.port = self._server.server_port

        if blocking:
            print(f"Agent OS Metrics Server running on http://{self.host}:{self.port}")
            print(f" /metrics - Prometheus metrics")
            print(f" /health - Health check")
            print(f" /ready - Readiness check")
            self._server.serve_forever()
        else:
            self._thread = threading.Thread(target=self._server.serve_forever, daemon=True)
            self._thread.start()
            print(f"Agent OS Metrics Server started on http://{self.host}:{self.port}/metrics")

    def stop(self):
        """Stop the metrics server and release its listening socket.

        Safe to call when the server was never started or was already
        stopped; in that case it does nothing.
        """
        server, self._server = self._server, None
        worker, self._thread = self._thread, None
        if server:
            server.shutdown()
            # shutdown() only ends the serve loop; the socket must be
            # closed explicitly or the port stays bound.
            server.server_close()
        if worker:
            worker.join(timeout=5)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()
156
+
157
+
158
+ # =============================================================================
159
+ # FastAPI Integration
160
+ # =============================================================================
161
+
162
def create_fastapi_router(metrics: Optional["KernelMetrics"] = None):
    """
    Create FastAPI router for metrics.

    The router exposes ``GET /metrics``, ``GET /health`` and ``GET /ready``.

    Args:
        metrics: Metrics object to expose; a new ``KernelMetrics`` is
            created when omitted.

    Returns:
        A ``fastapi.APIRouter`` tagged "observability".

    Raises:
        ImportError: If FastAPI is not installed (the original import
            failure is attached as the cause).

    Usage:
        from fastapi import FastAPI
        from agent_os_observability import create_fastapi_router, KernelMetrics

        app = FastAPI()
        metrics = KernelMetrics()
        app.include_router(create_fastapi_router(metrics))
    """
    # FastAPI is imported lazily so it is only required when this is called.
    try:
        from fastapi import APIRouter, Response
    except ImportError as exc:
        raise ImportError("FastAPI not installed. Install with: pip install fastapi") from exc

    router = APIRouter(tags=["observability"])
    _metrics = metrics or KernelMetrics()

    @router.get("/metrics")
    def get_metrics():
        return Response(
            content=_metrics.export(),
            media_type=_metrics.content_type()
        )

    @router.get("/health")
    def health():
        return {"status": "healthy"}

    @router.get("/ready")
    def ready():
        return {"ready": True}

    return router
198
+
199
+
200
+ # =============================================================================
201
+ # CLI Entry Point
202
+ # =============================================================================
203
+
204
def main():
    """Command-line entry point: parse --port/--host and serve until Ctrl+C."""
    import argparse

    cli = argparse.ArgumentParser(description="Agent OS Metrics Server")
    cli.add_argument("--port", type=int, default=9090, help="Port to listen on")
    cli.add_argument("--host", default="0.0.0.0", help="Host to bind to")
    options = cli.parse_args()

    metrics_server = MetricsServer(host=options.host, port=options.port)

    # Serve in the foreground; Ctrl+C triggers an orderly stop.
    try:
        metrics_server.start(blocking=True)
    except KeyboardInterrupt:
        print("\nShutting down...")
        metrics_server.stop()


if __name__ == "__main__":
    main()