agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,279 @@
1
+ """Errors / failure rate check."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Optional
6
+
7
+ from agentops.agent.config import ErrorsCheckConfig
8
+ from agentops.agent.findings import Category, Finding, Severity
9
+ from agentops.agent.sources.azure_monitor import AzureMonitorPayload
10
+ from agentops.agent.sources.foundry_control import FoundryControlPayload
11
+
12
+
13
def run_errors_check(
    monitor: Optional[AzureMonitorPayload],
    foundry: Optional[FoundryControlPayload],
    config: ErrorsCheckConfig,
) -> List[Finding]:
    """Grade production and Foundry failure rates against ``config.rate_threshold``.

    Emits, in this order:

    * ``errors.production_rate`` when ``monitor.error_rate`` is strictly
      above the threshold (CRITICAL once it reaches twice the threshold,
      WARNING otherwise);
    * ``errors.foundry_runs`` (always WARNING) when
      ``foundry.failure_rate`` is strictly above the same threshold;
    * whatever the runtime-telemetry, rate-limit and token-telemetry
      helper checks return for ``monitor``.

    A missing payload, or a payload whose rate is ``None``, skips the
    corresponding rate finding.
    """
    results: List[Finding] = []

    # --- Production failure rate -------------------------------------
    prod_rate = monitor.error_rate if monitor else None
    if prod_rate is not None and prod_rate > config.rate_threshold:
        limit = config.rate_threshold
        if prod_rate >= limit * 2:
            level = Severity.CRITICAL
        else:
            level = Severity.WARNING
        prod_summary = (
            f"App Insights reports {monitor.error_count} failed "
            f"requests over {monitor.request_count} total "
            f"({prod_rate * 100:.2f}%), above the "
            f"{limit * 100:.2f}% threshold."
        )
        prod_evidence = {
            "error_count": monitor.error_count,
            "request_count": monitor.request_count,
            "error_rate": prod_rate,
            "threshold": limit,
        }
        results.append(
            Finding(
                id="errors.production_rate",
                severity=level,
                category=Category.RELIABILITY,
                title="Production error rate above threshold",
                summary=prod_summary,
                recommendation=(
                    "Open the App Insights resource, group failures by "
                    "operation, and inspect the most common exception "
                    "type."
                ),
                source="azure_monitor",
                evidence=prod_evidence,
            )
        )

    # --- Foundry run failure rate ------------------------------------
    run_rate = foundry.failure_rate if foundry else None
    if run_rate is not None and run_rate > config.rate_threshold:
        run_summary = (
            "Foundry control plane reports "
            f"{foundry.failed_runs}/{foundry.total_runs} failed "
            f"runs ({run_rate * 100:.2f}%)."
        )
        run_evidence = {
            "failed_runs": foundry.failed_runs,
            "total_runs": foundry.total_runs,
            "failure_rate": run_rate,
        }
        results.append(
            Finding(
                id="errors.foundry_runs",
                severity=Severity.WARNING,
                category=Category.RELIABILITY,
                title="Foundry agent run failure rate elevated",
                summary=run_summary,
                recommendation=(
                    "Review recent Foundry runs, paying attention to "
                    "tool-call errors and rate limits."
                ),
                source="foundry_control",
                evidence=run_evidence,
            )
        )

    # --- Telemetry-derived helper checks (order is part of the output)
    results += _check_no_runtime_telemetry(monitor)
    results += _check_rate_limit_pressure(monitor, config)
    results += _check_no_token_telemetry(monitor)
    return results
91
+
92
+
93
def _check_rate_limit_pressure(
    monitor: Optional[AzureMonitorPayload],
    config: ErrorsCheckConfig,
) -> List[Finding]:
    """AI.154 — report HTTP 429 pressure recorded by Azure Monitor.

    Returns an empty list when there is no payload, when no 429s were
    counted, or when the count is below the hit threshold. The hit
    threshold is ``rate_threshold`` applied to ``request_count`` with a
    floor of 10 hits; when ``request_count`` is falsy the floor alone is
    used. The finding is CRITICAL once the count reaches twice the hit
    threshold and WARNING otherwise.
    """
    if monitor is None:
        return []

    throttled = monitor.rate_limit_429_count
    if not throttled:
        return []

    requests_seen = monitor.request_count
    if requests_seen:
        # Same proportional threshold as the error-rate check, never
        # lower than the hard floor.
        hit_floor = max(10, int(requests_seen * config.rate_threshold))
    else:
        hit_floor = 10

    if throttled < hit_floor:
        return []

    if throttled >= hit_floor * 2:
        level = Severity.CRITICAL
    else:
        level = Severity.WARNING

    finding = Finding(
        id="errors.rate_limit_pressure",
        severity=level,
        category=Category.RELIABILITY,
        title="Azure OpenAI rate-limit responses (HTTP 429) above threshold",
        summary=(
            f"App Insights reports {throttled} HTTP "
            "429 responses from Azure OpenAI / AI Services over the "
            "lookback window. The workload is hitting its TPM / RPM "
            "ceiling and clients are being throttled."
        ),
        recommendation=(
            "Raise the deployment's TPM / RPM quota, switch high-volume "
            "flows to a Provisioned-Throughput Unit (PTU) deployment, "
            "or add an APIM gateway with retry + backoff so clients "
            "do not see the 429s directly."
        ),
        source="azure_monitor",
        evidence={
            "rate_limit_429_count": throttled,
            "request_count": requests_seen,
            "threshold_hits": hit_floor,
        },
    )
    return [finding]
144
+
145
+
146
def _check_no_token_telemetry(
    monitor: Optional[AzureMonitorPayload],
) -> List[Finding]:
    """AI.132 — warn when requests are recorded but token counts are zero.

    Stays silent when there is no payload, when the payload diagnostics
    carry ``token_status == "error"``, when ``request_count`` is not
    positive (that case belongs to ``errors.no_runtime_telemetry``), or
    when either the input or the output token count is positive.
    Missing token counts are treated as zero.
    """
    if monitor is None:
        return []

    diagnostics = monitor.diagnostics or {}
    if diagnostics.get("token_status") == "error":
        return []

    if monitor.request_count <= 0:
        # Absence of telemetry is covered by errors.no_runtime_telemetry.
        return []

    tokens_in = monitor.input_token_count or 0
    tokens_out = monitor.output_token_count or 0
    if any(count > 0 for count in (tokens_in, tokens_out)):
        return []

    summary_text = (
        f"App Insights recorded {monitor.request_count} agent "
        "requests but reports zero input / output tokens. The "
        "OpenTelemetry GenAI conventions "
        "(`gen_ai.usage.input_tokens` / "
        "`gen_ai.usage.output_tokens`) are not being emitted, so "
        "token-cost monitoring and the Tokens card on the "
        "cockpit stay grey."
    )
    advice = (
        "Wire the OpenAI instrumentor on the agent runtime "
        "(`opentelemetry-instrumentation-openai-v2` or the "
        "Azure SDK's built-in tracing). The instrumentor sets "
        "the token-usage attributes from the model response "
        "automatically."
    )
    finding = Finding(
        id="opex.no_token_telemetry",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Runtime emits requests but no token-usage telemetry",
        summary=summary_text,
        recommendation=advice,
        source="azure_monitor",
        evidence={
            "request_count": monitor.request_count,
            "input_token_count": tokens_in,
            "output_token_count": tokens_out,
        },
    )
    return [finding]
197
+
198
+
199
def _check_no_runtime_telemetry(
    monitor: Optional[AzureMonitorPayload],
) -> List[Finding]:
    """Warn when the Azure Monitor source is unconfigured or silent.

    The decision is driven by ``monitor.diagnostics["status"]``:

    * ``"ok"`` with a non-positive ``request_count`` -> finding in mode
      ``configured_but_empty``;
    * ``"skipped"`` -> finding in mode ``not_configured``, quoting the
      diagnostics ``reason`` when one is present;
    * anything else (including ``"disabled"``, the explicit opt-out, and
      ``"ok"`` with traffic) -> no finding.

    Both modes share the id ``errors.no_runtime_telemetry`` and the same
    recommendation; only the summary and evidence differ.
    """
    if monitor is None:
        return []

    diagnostics = monitor.diagnostics or {}
    state = diagnostics.get("status")

    if state == "ok":
        if not monitor.request_count <= 0:
            # Source is healthy and traffic is flowing: nothing to report.
            return []
        summary_text = (
            "Application Insights / Log Analytics is reachable but "
            "reports 0 requests over the lookback window. The "
            "agent runtime is not emitting telemetry, so the "
            "cockpit, latency, errors, and runtime-safety "
            "checks have nothing to grade."
        )
        details = {
            "request_count": monitor.request_count,
            "monitor_status": state,
            "mode": "configured_but_empty",
        }
    elif state == "skipped":
        why = diagnostics.get("reason")
        summary_text = (
            "The `azure_monitor` source is not configured "
            f"({why or 'unknown reason'}). Without "
            "App Insights wired up, Doctor has no production "
            "observability, so latency, errors, runtime safety, and "
            "telemetry-based reliability checks all stay grey."
        )
        details = {
            "monitor_status": state,
            "reason": why,
            "mode": "not_configured",
        }
    else:
        # "disabled" (explicit opt-out) and every other status stay quiet.
        return []

    advice = (
        "Configure `sources.azure_monitor.app_insights_resource_id` "
        "or set `APPLICATIONINSIGHTS_CONNECTION_STRING` with an "
        "`ApplicationId`, install the `[agent]` extra, and connect "
        "Azure Monitor OpenTelemetry on the agent runtime "
        "(call `configure_azure_monitor()` on startup). "
        "See `docs/tutorial-end-to-end.md` -> "
        "'Wire observability'."
    )
    return [
        Finding(
            id="errors.no_runtime_telemetry",
            severity=Severity.WARNING,
            category=Category.RELIABILITY,
            title="Production telemetry is not wired to the agent",
            summary=summary_text,
            recommendation=advice,
            source="azure_monitor",
            evidence=details,
        )
    ]
@@ -0,0 +1,75 @@
1
+ """Foundry control-plane configuration audit (Operational Excellence category).
2
+
3
+ Mirrors the ``errors.no_runtime_telemetry`` pattern for the Foundry
4
+ control plane. The Doctor warns when Foundry was expected but the
5
+ control-plane source is unconfigured or unreachable.
6
+
7
+ If the user explicitly opted out (``foundry_control.enabled: false``)
8
+ we stay silent - that is the documented way to say "we are not on
9
+ Foundry".
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import List, Optional
15
+
16
+ from agentops.agent.findings import Category, Finding, Severity
17
+ from agentops.agent.sources.foundry_control import FoundryControlPayload
18
+
19
+ SOURCE_NAME = "foundry_control"
20
+
21
+
22
def run_foundry_config_check(
    foundry: Optional[FoundryControlPayload],
) -> List[Finding]:
    """Audit the Foundry control plane configuration.

    Returns no findings when there is no payload, when the source
    reports ``status == "disabled"`` (explicit opt-out) or when it
    reports ``status == "ok"``. Any other status yields exactly one
    "not configured" finding built from the payload diagnostics.
    """
    if foundry is None:
        return []

    diagnostics = foundry.diagnostics or {}
    if diagnostics.get("status") in ("disabled", "ok"):
        return []

    return [_no_foundry_control_finding(diagnostics)]
41
+
42
+
43
def _no_foundry_control_finding(diag: dict) -> Finding:
    """Build the ``opex.no_foundry_control_configured`` warning.

    ``diag`` is the source diagnostics mapping. A missing or empty
    ``status`` is reported as ``"unknown"``; a missing or empty
    ``reason`` falls back to a generic explanation. Both values are
    quoted in the summary and repeated in the evidence.
    """
    status = diag.get("status")
    if not status:
        status = "unknown"

    reason = diag.get("reason")
    if not reason:
        reason = "the source is enabled but did not return a healthy status"

    summary_text = (
        "The `foundry_control` source is enabled but reports "
        f"`status: {status}` ({reason}). Without it, Doctor "
        "cannot see Foundry-side agents, evaluation rules, or "
        "run failures, so safety-config and Foundry-run checks "
        "stay grey."
    )
    advice = (
        "Set `sources.foundry_control.project_endpoint` (or the "
        "`AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` env var) in "
        "`.agentops/agent.yaml`, install the `[foundry]` extra, "
        "and grant the running identity at least `Reader` on the "
        "Foundry project. If this project does not use Foundry, "
        "set `sources.foundry_control.enabled: false` to opt out "
        "explicitly."
    )
    details = {
        "monitor_status": status,
        "reason": reason,
        "mode": "not_configured",
    }
    return Finding(
        id="opex.no_foundry_control_configured",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Foundry control plane is not configured",
        summary=summary_text,
        recommendation=advice,
        source=SOURCE_NAME,
        evidence=details,
    )
@@ -0,0 +1,84 @@
1
+ """Latency check based on Azure Monitor and AgentOps results history."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import List, Optional
6
+
7
+ from agentops.agent.config import LatencyCheckConfig
8
+ from agentops.agent.findings import Category, Finding, Severity
9
+ from agentops.agent.sources.azure_monitor import AzureMonitorPayload
10
+ from agentops.agent.sources.results_history import ResultsHistory
11
+
12
+
13
def run_latency_check(
    history: ResultsHistory,
    monitor: Optional[AzureMonitorPayload],
    config: LatencyCheckConfig,
) -> List[Finding]:
    """Compare production p95 and evaluation average latency to one threshold.

    ``config.p95_threshold_seconds`` is used for both comparisons:

    * ``latency.p95_production`` when ``monitor.p95_duration_seconds``
      is present and strictly above the threshold;
    * ``latency.eval_avg`` when the last entry of ``history.runs`` has
      an ``avg_latency_seconds`` metric strictly above the threshold.

    Each finding is CRITICAL once the value reaches twice the threshold
    and WARNING otherwise.
    """
    threshold = config.p95_threshold_seconds
    results: List[Finding] = []

    def _grade(observed):
        # Twice the threshold (inclusive) escalates to CRITICAL.
        if observed >= threshold * 2:
            return Severity.CRITICAL
        return Severity.WARNING

    # --- Production p95 from Azure Monitor ---------------------------
    if monitor and monitor.p95_duration_seconds is not None:
        p95 = monitor.p95_duration_seconds
        if p95 > threshold:
            results.append(
                Finding(
                    id="latency.p95_production",
                    severity=_grade(p95),
                    category=Category.PERFORMANCE,
                    title="Production p95 latency exceeds threshold",
                    summary=(
                        "Application Insights reports p95 latency of "
                        f"{p95:.2f}s, above the configured threshold of "
                        f"{threshold:.2f}s."
                    ),
                    recommendation=(
                        "Review recent deployments for tool-call loops or "
                        "long-running RAG retrievals, and consider scaling "
                        "out the agent runtime."
                    ),
                    source="azure_monitor",
                    evidence={
                        "p95_seconds": p95,
                        "threshold_seconds": threshold,
                        "request_count": monitor.request_count,
                    },
                )
            )

    # --- Average latency of the last run in the history --------------
    if not history.runs:
        return results

    latest = history.runs[-1]
    avg_latency = latest.metrics.get("avg_latency_seconds")
    if avg_latency is None:
        return results

    if avg_latency > threshold:
        results.append(
            Finding(
                id="latency.eval_avg",
                severity=_grade(avg_latency),
                category=Category.PERFORMANCE,
                title="Evaluation average latency above threshold",
                summary=(
                    f"Run `{latest.run_id}` averaged "
                    f"{avg_latency:.2f}s per item, above the "
                    f"{threshold:.2f}s threshold."
                ),
                recommendation=(
                    "Profile the slowest dataset rows and inspect tool "
                    "calls; re-run evals after addressing the regression."
                ),
                source="results_history",
                evidence={
                    "run_id": latest.run_id,
                    "avg_latency_seconds": avg_latency,
                    "threshold_seconds": threshold,
                },
            )
        )
    return results
@@ -0,0 +1,157 @@
1
+ """Operational excellence check.
2
+
3
+ Pipeline-hygiene findings that are time-based or stability-based rather
4
+ than file-based (which live in :mod:`agentops.agent.checks.mlops`).
5
+
6
+ Findings emitted:
7
+
8
+ * ``opex.stale_evaluation`` - Doctor warns when no fresh eval run has
9
+ landed in the configured window.
10
+ * ``opex.flaky_metric`` - a metric's coefficient of variation across
11
+ recent runs is high enough to suggest a flaky judge / non-deterministic
12
+ prompt rather than real change.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from datetime import datetime, timezone
18
+ from statistics import mean, pstdev
19
+ from typing import List
20
+
21
+ from agentops.agent.config import OpexCheckConfig
22
+ from agentops.agent.findings import Category, Finding, Severity
23
+ from agentops.agent.sources.results_history import ResultsHistory
24
+
25
+ SOURCE_NAME = "results_history"
26
+
27
+
28
def run_opex_check(
    history: ResultsHistory, config: OpexCheckConfig
) -> List[Finding]:
    """Run the operational-excellence checks over the results history.

    Returns an empty list when ``config.enabled`` is falsy. Otherwise
    returns the stale-evaluation findings followed by the flaky-metric
    findings.
    """
    if config.enabled:
        stale = _check_stale_evaluation(history, config)
        flaky = _check_flaky_metric(history, config)
        return [*stale, *flaky]
    return []
39
+
40
+
41
def _check_stale_evaluation(
    history: ResultsHistory, config: OpexCheckConfig
) -> List[Finding]:
    """Emit ``opex.stale_evaluation`` when the last run is too old.

    The age of the last entry in ``history.runs`` is measured against
    the current UTC time and compared with ``config.stale_after_days``.
    No finding is produced when the history is empty, when the last run
    has no timestamp, or when its age is within the window. The finding
    is CRITICAL once the age reaches twice the window and WARNING
    otherwise.

    A timezone-naive timestamp is interpreted as UTC. Subtracting a
    naive datetime from the aware "now" would otherwise raise
    ``TypeError`` and abort the whole check.
    """
    if not history.runs:
        return []
    latest = history.runs[-1]
    if latest.timestamp is None:
        return []

    stamp = latest.timestamp
    if stamp.utcoffset() is None:
        # NOTE(review): naive timestamps are assumed to be UTC — confirm
        # against how the results history parses them.
        stamp = stamp.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    age_days = (now - stamp).total_seconds() / 86400.0
    if age_days <= config.stale_after_days:
        return []

    severity = (
        Severity.CRITICAL
        if age_days >= config.stale_after_days * 2
        else Severity.WARNING
    )
    return [
        Finding(
            id="opex.stale_evaluation",
            severity=severity,
            category=Category.OPERATIONAL_EXCELLENCE,
            title="No fresh evaluation run in the configured window",
            summary=(
                f"The most recent eval run (`{latest.run_id}`) is "
                f"{age_days:.1f} day(s) old, above the configured "
                f"threshold of {config.stale_after_days} day(s). The "
                "agent's measured quality is drifting away from its "
                "last validated baseline."
            ),
            recommendation=(
                "Run `agentops eval run` (locally or via CI) to "
                "produce a fresh local `results.json` or Foundry cloud "
                "evaluation, then re-run `agentops doctor`."
            ),
            source=SOURCE_NAME,
            evidence={
                "latest_run_id": latest.run_id,
                # Report the timestamp exactly as stored in the history.
                "latest_timestamp": latest.timestamp.isoformat(),
                "age_days": round(age_days, 2),
                "threshold_days": config.stale_after_days,
            },
        )
    ]
86
+
87
+
88
def _check_flaky_metric(
    history: ResultsHistory, config: OpexCheckConfig
) -> List[Finding]:
    """Flag metrics whose coefficient of variation is suspiciously high.

    The coefficient of variation (population stddev / mean) is computed
    per metric over the last ``config.min_runs_for_flaky`` runs. A
    metric is reported as ``opex.flaky_metric.<name>`` when its CV is at
    or above ``config.flaky_cv_threshold``.

    A metric is skipped when:

    * the history holds fewer than ``min_runs_for_flaky`` runs;
    * it has fewer than ``min_runs_for_flaky`` usable samples in the
      window;
    * its mean is at or below 0.05, because near-zero means make the CV
      explode without carrying signal.

    Samples that are not finite numbers (``None``, NaN, infinities,
    non-numeric values) are ignored. Without that filter a ``None``
    makes ``mean`` raise, and a NaN slips past both the near-zero guard
    and the threshold guard (every comparison with NaN is false) and
    produces a bogus finding.
    """
    from math import isfinite

    runs = history.runs
    if len(runs) < config.min_runs_for_flaky:
        return []

    # Collect each metric's usable samples across the recent window.
    series: dict[str, List[float]] = {}
    for run in runs[-config.min_runs_for_flaky :]:
        for name, value in run.metrics.items():
            try:
                usable = isfinite(value)
            except (TypeError, ValueError, OverflowError):
                # Not a real number at all (None, str, ...).
                usable = False
            if usable:
                series.setdefault(name, []).append(value)

    findings: List[Finding] = []
    for metric, values in series.items():
        if len(values) < config.min_runs_for_flaky:
            continue
        avg = mean(values)
        if avg <= 0.05:
            # Near-zero metrics make CV explode without signal.
            continue
        cv = pstdev(values) / avg
        if cv < config.flaky_cv_threshold:
            continue
        findings.append(
            Finding(
                id=f"opex.flaky_metric.{metric}",
                severity=Severity.WARNING,
                category=Category.OPERATIONAL_EXCELLENCE,
                title=f"`{metric}` is unstable across recent runs",
                summary=(
                    f"`{metric}` shows a coefficient of variation of "
                    f"{cv * 100:.1f}% across the last {len(values)} "
                    "runs (threshold: "
                    f"{config.flaky_cv_threshold * 100:.0f}%). That "
                    "kind of oscillation usually points at a "
                    "non-deterministic judge model or a prompt that's "
                    "overly sensitive to phrasing - not at real "
                    "agent change."
                ),
                recommendation=(
                    "Pin the judge model's `temperature` / `seed` "
                    "(or switch to a deterministic evaluator), and "
                    "review the metric's prompt for ambiguity. If "
                    "the metric is intrinsically noisy, raise "
                    "`min_runs` on the regression check so signals "
                    "average out."
                ),
                source=SOURCE_NAME,
                evidence={
                    "metric": metric,
                    "cv": round(cv, 4),
                    "mean": round(avg, 4),
                    "samples": len(values),
                },
            )
        )
    return findings