agentops_accelerator-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,880 @@
1
+ """Canonical catalog of checks the AgentOps doctor can emit.
2
+
3
+ The catalog is the single source of truth used by
4
+ ``agentops doctor explain`` to describe what the analyzer verifies. Each
5
+ entry is a static, hand-curated record - the doctor itself does not
6
+ use this module at analysis time, so a missing or stale entry only
7
+ affects discoverability, not behavior. A drift test
8
+ (``tests/unit/test_doctor_catalog.py``) keeps the catalog in step with
9
+ the rule registries (`RULE_REGISTRY`, LLM-assist `_ALL_RULES`, and the
10
+ deterministic id constants emitted by `run_*` functions).
11
+
12
+ Categories mirror the Microsoft Well-Architected Framework for AI
13
+ pillars, see :class:`agentops.agent.findings.Category`.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from typing import Dict, Iterable, List, Tuple
20
+
21
+ from agentops.agent.findings import Category, Severity
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Data source labels
26
+ # ---------------------------------------------------------------------------
27
+
28
+ # Human-readable labels for the "requires" field. Keys must match the
29
+ # names used by the analyzer sources / checks.
30
+ SOURCE_LABELS: Dict[str, str] = {
31
+ "workspace": "workspace files",
32
+ "results_history": "eval history (local results + Foundry cloud fallback)",
33
+ "azure_monitor": "Azure Monitor / App Insights",
34
+ "foundry_control": "Foundry control plane",
35
+ "azure_resources": "Azure resources (ARM)",
36
+ "spec_workspace": "spec docs (.specify / AGENTS.md)",
37
+ "judge_model": "judge model deployment",
38
+ }
39
+
40
+ SOURCE_DESCRIPTIONS: Dict[str, str] = {
41
+ "workspace": (
42
+ "Local project files: `.agentops/` configs, bundles, datasets, "
43
+ "GitHub Actions / Azure DevOps workflows, AI Landing Zone deployment "
44
+ "signals, `.gitignore`, `CHANGELOG.md`, and other repo files used for "
45
+ "CI / release hygiene checks."
46
+ ),
47
+ "results_history": (
48
+ "Past AgentOps evaluation outputs. Doctor reads local "
49
+ "`.agentops/results/*/results.json` first, then falls back to Foundry "
50
+ "cloud evaluation runs when local history is missing or too short. "
51
+ "Used for metric regressions, stale evaluations, flaky metrics, eval "
52
+ "latency, and content-safety hits from previous runs."
53
+ ),
54
+ "azure_monitor": (
55
+ "Application Insights or Log Analytics telemetry from the deployed "
56
+ "agent. Used for production latency, error rate, rate-limit pressure, "
57
+ "token telemetry, and runtime content-filter signals. Requires "
58
+ "`app_insights_resource_id` or `log_analytics_workspace_id`."
59
+ ),
60
+ "foundry_control": (
61
+ "Foundry project metadata from the control plane: agents, recent run "
62
+ "failures, and continuous-evaluation rules. Uses "
63
+ "`AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` or `sources.foundry_control.project_endpoint`."
64
+ ),
65
+ "azure_resources": (
66
+ "Azure ARM resource posture for the Cognitive Services / Azure OpenAI "
67
+ "account: local auth, managed identity, deployments, and diagnostic "
68
+ "settings. This source is enabled by default and fail-open. Doctor "
69
+ "uses explicit `.agentops/agent.yaml` fields when present; otherwise "
70
+ "it reads AZD's `.azure/<env>/.env` when present, then the Foundry "
71
+ "project endpoint from the environment to match the backing Azure AI "
72
+ "account."
73
+ ),
74
+ "spec_workspace": (
75
+ "Spec-driven-development documents such as `.specify/spec.md`, "
76
+ "`.specify/plan.md`, `.specify/tasks.md`, `AGENTS.md`, and Copilot "
77
+ "instructions. Used to check whether the implementation, bundles, "
78
+ "datasets, and tasks still match the intended agent behavior."
79
+ ),
80
+ "judge_model": (
81
+ "A Foundry/OpenAI model deployment used only by opt-in LLM-judged "
82
+ "checks. It reviews semantic signals like prompt guardrails, dataset "
83
+ "PII risk, bundle coverage, and spec-vs-implementation gaps."
84
+ ),
85
+ }
86
+
87
+ # Recognized check flags. Keep this list short and stable.
88
+ #
89
+ # * ``dynamic_id`` - the id has a dynamic suffix (e.g. metric name)
90
+ # * ``llm_judged`` - the check calls a judge LLM (opt-in, costs tokens)
91
+ # * ``opt_in`` - the check is off by default and must be enabled
92
+ FLAG_LABELS: Dict[str, str] = {
93
+ "dynamic_id": "id varies per metric/signal",
94
+ "llm_judged": "uses a judge model (opt-in)",
95
+ "opt_in": "opt-in (disabled by default)",
96
+ }
97
+
98
+ # Public reference pages shown by `agentops doctor explain`. Exact rule
99
+ # links win; category links keep the list useful for newer rules that
100
+ # do not yet have a narrower page.
101
+ CATEGORY_REFERENCE_URLS: Dict[Category, str] = {
102
+ Category.QUALITY: (
103
+ "https://learn.microsoft.com/azure/ai-foundry/concepts/"
104
+ "evaluation-approach-gen-ai"
105
+ ),
106
+ Category.PERFORMANCE: (
107
+ "https://learn.microsoft.com/azure/well-architected/ai/"
108
+ "performance-efficiency"
109
+ ),
110
+ Category.RELIABILITY: (
111
+ "https://learn.microsoft.com/azure/well-architected/ai/reliability"
112
+ ),
113
+ Category.OPERATIONAL_EXCELLENCE: (
114
+ "https://learn.microsoft.com/azure/well-architected/ai/operations"
115
+ ),
116
+ Category.SECURITY: (
117
+ "https://learn.microsoft.com/azure/well-architected/ai/security"
118
+ ),
119
+ Category.RESPONSIBLE_AI: (
120
+ "https://learn.microsoft.com/azure/well-architected/ai/responsible-ai"
121
+ ),
122
+ }
123
+
124
+ CHECK_REFERENCE_URLS: Dict[str, str] = {
125
+ "regression.<metric>": CATEGORY_REFERENCE_URLS[Category.QUALITY],
126
+ "latency.p95_production": CATEGORY_REFERENCE_URLS[Category.PERFORMANCE],
127
+ "latency.eval_avg": CATEGORY_REFERENCE_URLS[Category.PERFORMANCE],
128
+ "errors.production_rate": CATEGORY_REFERENCE_URLS[Category.RELIABILITY],
129
+ "errors.foundry_runs": CATEGORY_REFERENCE_URLS[Category.RELIABILITY],
130
+ "errors.no_runtime_telemetry": CATEGORY_REFERENCE_URLS[Category.RELIABILITY],
131
+ "safety.<metric>": (
132
+ "https://learn.microsoft.com/azure/ai-foundry/concepts/"
133
+ "evaluation-metrics-built-in"
134
+ ),
135
+ "safety.runtime.<signal>": (
136
+ "https://learn.microsoft.com/azure/ai-foundry/concepts/content-filtering"
137
+ ),
138
+ "safety.config.continuous_eval_missing": (
139
+ "https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation"
140
+ ),
141
+ "safety.config.continuous_eval_disabled": (
142
+ "https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation"
143
+ ),
144
+ }
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # CheckSpec
149
+ # ---------------------------------------------------------------------------
150
+
151
+
152
+ @dataclass(frozen=True)
153
+ class CheckSpec:
154
+ """A documented doctor check.
155
+
156
+ ``id`` is the canonical finding id. When the id has a dynamic
157
+ suffix the spec uses the placeholder form (e.g.
158
+ ``regression.<metric>``) and the ``dynamic_id`` flag is set so the
159
+ UI can render the wildcard explicitly.
160
+ """
161
+
162
+ id: str
163
+ category: Category
164
+ title: str
165
+ summary: str
166
+ severities: Tuple[Severity, ...]
167
+ requires: Tuple[str, ...] = field(default_factory=tuple)
168
+ flags: Tuple[str, ...] = field(default_factory=tuple)
169
+
170
+ @property
171
+ def is_dynamic(self) -> bool:
172
+ return "dynamic_id" in self.flags
173
+
174
+ @property
175
+ def is_llm_judged(self) -> bool:
176
+ return "llm_judged" in self.flags
177
+
178
+
179
+ # ---------------------------------------------------------------------------
180
+ # Catalog
181
+ # ---------------------------------------------------------------------------
182
+
183
+ # Order within each category is informational (most-actionable first);
184
+ # the overall WAF pillar order is fixed by `CATEGORY_ORDER` below.
185
+ CHECKS: Tuple[CheckSpec, ...] = (
186
+ # ------------------------------------------------------------------
187
+ # Quality
188
+ # ------------------------------------------------------------------
189
+ CheckSpec(
190
+ id="regression.<metric>",
191
+ category=Category.QUALITY,
192
+ title="Metric regression vs rolling baseline",
193
+ summary=(
194
+ "For each metric in the regression watchlist, compare the "
195
+ "latest run to a rolling baseline of previous runs and flag "
196
+ "drops that exceed the configured tolerance."
197
+ ),
198
+ severities=(Severity.WARNING, Severity.CRITICAL),
199
+ requires=("results_history",),
200
+ flags=("dynamic_id",),
201
+ ),
202
+ # ------------------------------------------------------------------
203
+ # Performance
204
+ # ------------------------------------------------------------------
205
+ CheckSpec(
206
+ id="latency.p95_production",
207
+ category=Category.PERFORMANCE,
208
+ title="Production p95 latency over threshold",
209
+ summary=(
210
+ "App Insights reports a p95 request latency above the "
211
+ "configured ceiling - usually tool-call loops or slow "
212
+ "retrievals leaking into prod."
213
+ ),
214
+ severities=(Severity.WARNING, Severity.CRITICAL),
215
+ requires=("azure_monitor",),
216
+ ),
217
+ CheckSpec(
218
+ id="latency.eval_avg",
219
+ category=Category.PERFORMANCE,
220
+ title="Evaluation average latency over threshold",
221
+ summary=(
222
+ "The latest eval run's average per-item latency exceeds "
223
+ "the configured threshold."
224
+ ),
225
+ severities=(Severity.WARNING, Severity.CRITICAL),
226
+ requires=("results_history",),
227
+ ),
228
+ # ------------------------------------------------------------------
229
+ # Reliability
230
+ # ------------------------------------------------------------------
231
+ CheckSpec(
232
+ id="errors.production_rate",
233
+ category=Category.RELIABILITY,
234
+ title="Production error rate over threshold",
235
+ summary=(
236
+ "App Insights error rate is above the configured ceiling."
237
+ ),
238
+ severities=(Severity.WARNING, Severity.CRITICAL),
239
+ requires=("azure_monitor",),
240
+ ),
241
+ CheckSpec(
242
+ id="errors.foundry_runs",
243
+ category=Category.RELIABILITY,
244
+ title="Foundry run failure rate over threshold",
245
+ summary=(
246
+ "The Foundry control plane reports a run-failure rate "
247
+ "above the configured threshold."
248
+ ),
249
+ severities=(Severity.WARNING, Severity.CRITICAL),
250
+ requires=("foundry_control",),
251
+ ),
252
+ CheckSpec(
253
+ id="errors.rate_limit_pressure",
254
+ category=Category.RELIABILITY,
255
+ title="Rate-limit pressure on the model deployment",
256
+ summary=(
257
+ "App Insights shows a non-trivial volume of 429 / "
258
+ "throttling responses against the model endpoint."
259
+ ),
260
+ severities=(Severity.WARNING, Severity.CRITICAL),
261
+ requires=("azure_monitor",),
262
+ ),
263
+ CheckSpec(
264
+ id="errors.no_runtime_telemetry",
265
+ category=Category.RELIABILITY,
266
+ title="Runtime telemetry is not configured",
267
+ summary=(
268
+ "The `azure_monitor` source is enabled but no telemetry "
269
+ "was returned - latency / error / token signals will all "
270
+ "be silent until App Insights is wired up."
271
+ ),
272
+ severities=(Severity.WARNING,),
273
+ requires=("azure_monitor",),
274
+ ),
275
+ # ------------------------------------------------------------------
276
+ # Operational Excellence
277
+ # ------------------------------------------------------------------
278
+ CheckSpec(
279
+ id="opex.stale_evaluation",
280
+ category=Category.OPERATIONAL_EXCELLENCE,
281
+ title="No fresh evaluation run in the configured window",
282
+ summary=(
283
+ "The most recent eval is older than the configured "
284
+ "freshness window - measured quality is drifting from the "
285
+ "last validated baseline."
286
+ ),
287
+ severities=(Severity.WARNING, Severity.CRITICAL),
288
+ requires=("results_history",),
289
+ ),
290
+ CheckSpec(
291
+ id="opex.flaky_metric.<metric>",
292
+ category=Category.OPERATIONAL_EXCELLENCE,
293
+ title="Metric is unstable across recent runs",
294
+ summary=(
295
+ "A metric's coefficient of variation is high enough to "
296
+ "suggest a flaky judge or a prompt that's overly sensitive "
297
+ "to phrasing rather than real agent change."
298
+ ),
299
+ severities=(Severity.WARNING,),
300
+ requires=("results_history",),
301
+ flags=("dynamic_id",),
302
+ ),
303
+ CheckSpec(
304
+ id="opex.no_token_telemetry",
305
+ category=Category.OPERATIONAL_EXCELLENCE,
306
+ title="Token usage telemetry is missing",
307
+ summary=(
308
+ "App Insights returned request volume but no token usage "
309
+ "metrics - cost trends and prompt drift can't be tracked."
310
+ ),
311
+ severities=(Severity.WARNING,),
312
+ requires=("azure_monitor",),
313
+ ),
314
+ CheckSpec(
315
+ id="opex.unpinned_agent",
316
+ category=Category.OPERATIONAL_EXCELLENCE,
317
+ title="Agent target is not pinned to a version",
318
+ summary=(
319
+ "`agent:` in `agentops.yaml` lacks an explicit version, so "
320
+ "CI silently tracks whatever the Foundry default resolves "
321
+ "to."
322
+ ),
323
+ severities=(Severity.WARNING,),
324
+ requires=("workspace",),
325
+ ),
326
+ CheckSpec(
327
+ id="opex.no_thresholds",
328
+ category=Category.OPERATIONAL_EXCELLENCE,
329
+ title="`agentops.yaml` has no explicit thresholds block",
330
+ summary=(
331
+ "Without thresholds the eval gate relies entirely on "
332
+ "auto-defaults - fine for exploration, loose for a merge "
333
+ "gate."
334
+ ),
335
+ severities=(Severity.WARNING,),
336
+ requires=("workspace",),
337
+ ),
338
+ CheckSpec(
339
+ id="opex.no_pr_gate",
340
+ category=Category.OPERATIONAL_EXCELLENCE,
341
+ title="Repository has no AgentOps PR gate",
342
+ summary=(
343
+ "`.github/workflows/` exists but has no `agentops-pr.yml` "
344
+ "- PRs can merge without running an eval."
345
+ ),
346
+ severities=(Severity.WARNING,),
347
+ requires=("workspace",),
348
+ ),
349
+ CheckSpec(
350
+ id="opex.no_deploy_workflow",
351
+ category=Category.OPERATIONAL_EXCELLENCE,
352
+ title="Repository has a PR gate but no deploy workflow",
353
+ summary=(
354
+ "There is no `agentops-deploy-*.yml`, so evals are never "
355
+ "re-run against promoted environments (dev / qa / prod)."
356
+ ),
357
+ severities=(Severity.WARNING,),
358
+ requires=("workspace",),
359
+ ),
360
+ CheckSpec(
361
+ id="opex.ailz_readiness",
362
+ category=Category.OPERATIONAL_EXCELLENCE,
363
+ title="AI Landing Zone deployment readiness detected",
364
+ summary=(
365
+ "Doctor found canonical AI Landing Zone signals and reports "
366
+ "the local readiness dimensions for landing-zone preflight, "
367
+ "azd/Bicep workflow deployment, AgentOps eval config, and "
368
+ "private-network runner planning."
369
+ ),
370
+ severities=(Severity.INFO,),
371
+ requires=("workspace",),
372
+ ),
373
+ CheckSpec(
374
+ id="opex.ailz_gaps",
375
+ category=Category.OPERATIONAL_EXCELLENCE,
376
+ title="AI Landing Zone deployment readiness has gaps",
377
+ summary=(
378
+ "The workspace looks like an AI Landing Zone project, but "
379
+ "one or more readiness dimensions are missing before CI/CD "
380
+ "can confidently provision and validate the workload."
381
+ ),
382
+ severities=(Severity.WARNING,),
383
+ requires=("workspace",),
384
+ ),
385
+ CheckSpec(
386
+ id="opex.release.no_eval_evidence",
387
+ category=Category.OPERATIONAL_EXCELLENCE,
388
+ title="No evaluation evidence is available for release",
389
+ summary=(
390
+ "No completed AgentOps eval run was found, so the project has no "
391
+ "quality evidence to attach to a production promotion."
392
+ ),
393
+ severities=(Severity.WARNING,),
394
+ requires=("workspace", "results_history"),
395
+ ),
396
+ CheckSpec(
397
+ id="opex.release.latest_eval_failed",
398
+ category=Category.OPERATIONAL_EXCELLENCE,
399
+ title="Latest evaluation run failed",
400
+ summary=(
401
+ "The most recent eval result did not pass, making the release "
402
+ "evidence blocked until the failing rows or thresholds are fixed."
403
+ ),
404
+ severities=(Severity.CRITICAL,),
405
+ requires=("results_history",),
406
+ ),
407
+ CheckSpec(
408
+ id="opex.release.no_baseline",
409
+ category=Category.OPERATIONAL_EXCELLENCE,
410
+ title="No baseline result is available for regression gating",
411
+ summary=(
412
+ "The current eval can pass thresholds, but AgentOps has no "
413
+ "known-good baseline or prior run to show whether the candidate "
414
+ "regressed from production behavior."
415
+ ),
416
+ severities=(Severity.WARNING,),
417
+ requires=("workspace", "results_history"),
418
+ ),
419
+ CheckSpec(
420
+ id="opex.release.no_trace_regression_dataset",
421
+ category=Category.OPERATIONAL_EXCELLENCE,
422
+ title="Production traces are not feeding a regression dataset yet",
423
+ summary=(
424
+ "No trace-regression manifest exists, so production incidents and "
425
+ "high-value conversations are not yet part of the eval flywheel."
426
+ ),
427
+ severities=(Severity.INFO,),
428
+ requires=("workspace", "results_history"),
429
+ ),
430
+ CheckSpec(
431
+ id="opex.release.no_continuous_eval",
432
+ category=Category.OPERATIONAL_EXCELLENCE,
433
+ title="No enabled Foundry continuous evaluation rule is attached",
434
+ summary=(
435
+ "The Foundry control plane is reachable, but no enabled online "
436
+ "evaluation rule was detected for ongoing production scoring."
437
+ ),
438
+ severities=(Severity.WARNING,),
439
+ requires=("foundry_control",),
440
+ ),
441
+ CheckSpec(
442
+ id="opex.results_not_gitignored",
443
+ category=Category.OPERATIONAL_EXCELLENCE,
444
+ title="Eval results are not gitignored",
445
+ summary=(
446
+ "`.agentops/results/` is checked into the repo - large "
447
+ "binary diffs and stale runs will pollute history."
448
+ ),
449
+ severities=(Severity.WARNING,),
450
+ requires=("workspace",),
451
+ ),
452
+ CheckSpec(
453
+ id="opex.unversioned_dataset",
454
+ category=Category.OPERATIONAL_EXCELLENCE,
455
+ title="Dataset YAML files are missing a `version` field",
456
+ summary=(
457
+ "Datasets without a `version` make run reproducibility "
458
+ "ambiguous when the dataset is edited."
459
+ ),
460
+ severities=(Severity.WARNING,),
461
+ requires=("workspace",),
462
+ ),
463
+ CheckSpec(
464
+ id="opex.unversioned_bundle",
465
+ category=Category.OPERATIONAL_EXCELLENCE,
466
+ title="Bundle YAML files are missing a `version` field",
467
+ summary=(
468
+ "Bundles without a `version` make evaluator stack changes "
469
+ "invisible across runs."
470
+ ),
471
+ severities=(Severity.WARNING,),
472
+ requires=("workspace",),
473
+ ),
474
+ CheckSpec(
475
+ id="opex.results_dir_bloat",
476
+ category=Category.OPERATIONAL_EXCELLENCE,
477
+ title="Eval results directory is bloated",
478
+ summary=(
479
+ "`.agentops/results/` has accumulated a large number of "
480
+ "historical runs - prune or archive to keep checkouts fast."
481
+ ),
482
+ severities=(Severity.WARNING,),
483
+ requires=("workspace",),
484
+ ),
485
+ CheckSpec(
486
+ id="opex.workflow_concurrency_lock",
487
+ category=Category.OPERATIONAL_EXCELLENCE,
488
+ title="AgentOps workflows are missing a `concurrency:` block",
489
+ summary=(
490
+ "Without concurrency locks, parallel CI runs can race on "
491
+ "the same eval target and produce conflicting telemetry."
492
+ ),
493
+ severities=(Severity.WARNING,),
494
+ requires=("workspace",),
495
+ ),
496
+ CheckSpec(
497
+ id="opex.workflow_action_sha_pinning",
498
+ category=Category.OPERATIONAL_EXCELLENCE,
499
+ title="AgentOps workflows pin actions by tag, not by SHA",
500
+ summary=(
501
+ "Tag-pinned actions can change underneath you. Pin to a "
502
+ "commit SHA for supply-chain hardening."
503
+ ),
504
+ severities=(Severity.WARNING,),
505
+ requires=("workspace",),
506
+ ),
507
+ CheckSpec(
508
+ id="opex.max_tokens_undefined",
509
+ category=Category.OPERATIONAL_EXCELLENCE,
510
+ title="`max_tokens` is not set on model / evaluator configuration",
511
+ summary=(
512
+ "Unbounded `max_tokens` invites long, expensive responses "
513
+ "and unpredictable latency."
514
+ ),
515
+ severities=(Severity.WARNING,),
516
+ requires=("workspace",),
517
+ ),
518
+ CheckSpec(
519
+ id="opex.no_foundry_control_configured",
520
+ category=Category.OPERATIONAL_EXCELLENCE,
521
+ title="Foundry control plane is not configured",
522
+ summary=(
523
+ "The `foundry_control` source is enabled but not reachable "
524
+ "- Foundry-side agents, eval rules, and run failures will "
525
+ "stay invisible."
526
+ ),
527
+ severities=(Severity.WARNING,),
528
+ requires=("foundry_control",),
529
+ ),
530
+ CheckSpec(
531
+ id="opex.spec_conformance.spec_missing",
532
+ category=Category.OPERATIONAL_EXCELLENCE,
533
+ title="Spec setup detected, but no usable specification was found",
534
+ summary=(
535
+ "Doctor found signs that this repo uses spec-driven "
536
+ "development (for example `.specify/`, `AGENTS.md`, or a "
537
+ "`copilot-instructions.md` shell), but could not load a "
538
+ "real spec body. Without that reference, it cannot check "
539
+ "whether bundles, datasets, tasks, and "
540
+ "implementation still match the intended agent behavior."
541
+ ),
542
+ severities=(Severity.WARNING,),
543
+ requires=("spec_workspace",),
544
+ ),
545
+ CheckSpec(
546
+ id="opex.spec_conformance.tasks_stale",
547
+ category=Category.OPERATIONAL_EXCELLENCE,
548
+ title="Spec tasks have been left open past the freshness window",
549
+ summary=(
550
+ "Doctor found unchecked task-list items in the spec "
551
+ "(for example `tasks.md` in a spec-kit workspace) and the "
552
+ "spec has not been updated within the configured freshness "
553
+ "window. This usually means the implementation plan is no "
554
+ "longer trustworthy: either the work is done but the tasks "
555
+ "were not checked off, the tasks are no longer relevant, or "
556
+ "the agent behavior changed without the spec being refreshed."
557
+ ),
558
+ severities=(Severity.INFO, Severity.WARNING),
559
+ requires=("spec_workspace",),
560
+ ),
561
+ CheckSpec(
562
+ id="opex.spec_conformance.tasks_orphaned",
563
+ category=Category.OPERATIONAL_EXCELLENCE,
564
+ title="`tasks.md` references items not present in the spec",
565
+ summary=(
566
+ "Task entries don't map back to anything in the spec - "
567
+ "the plan and the spec are drifting apart."
568
+ ),
569
+ severities=(Severity.WARNING,),
570
+ requires=("spec_workspace",),
571
+ ),
572
+ CheckSpec(
573
+ id="opex.spec_conformance.evaluator_drift",
574
+ category=Category.OPERATIONAL_EXCELLENCE,
575
+ title="Spec lists evaluators that the bundle does not implement",
576
+ summary=(
577
+ "The spec mentions evaluators that are absent from the "
578
+ "AgentOps bundle - real evals don't cover what the spec "
579
+ "promises."
580
+ ),
581
+ severities=(Severity.WARNING,),
582
+ requires=("workspace", "spec_workspace"),
583
+ ),
584
+ CheckSpec(
585
+ id="opex.spec_conformance.dataset_drift",
586
+ category=Category.OPERATIONAL_EXCELLENCE,
587
+ title="Spec mentions datasets that don't exist in the workspace",
588
+ summary=(
589
+ "Dataset names referenced in the spec are missing from "
590
+ "`.agentops/datasets/`."
591
+ ),
592
+ severities=(Severity.WARNING,),
593
+ requires=("workspace", "spec_workspace"),
594
+ ),
595
+ CheckSpec(
596
+ id="opex.spec_conformance.agent_drift",
597
+ category=Category.OPERATIONAL_EXCELLENCE,
598
+ title="Spec describes an agent target inconsistent with `agentops.yaml`",
599
+ summary=(
600
+ "The agent name / version in the spec doesn't match the "
601
+ "one pinned in the AgentOps config."
602
+ ),
603
+ severities=(Severity.WARNING,),
604
+ requires=("workspace", "spec_workspace"),
605
+ ),
606
+ CheckSpec(
607
+ id="opex.spec_conformance.llm.implementation_gap",
608
+ category=Category.OPERATIONAL_EXCELLENCE,
609
+ title="LLM detects spec capabilities missing from the implementation",
610
+ summary=(
611
+ "A judge model compared the spec to the AgentOps workspace "
612
+ "and flagged capabilities the spec promises but the "
613
+ "implementation does not cover."
614
+ ),
615
+ severities=(Severity.WARNING,),
616
+ requires=("workspace", "spec_workspace", "judge_model"),
617
+ flags=("llm_judged", "opt_in"),
618
+ ),
619
+ CheckSpec(
620
+ id="opex.spec_conformance.llm.input_too_large",
621
+ category=Category.OPERATIONAL_EXCELLENCE,
622
+ title="Spec is too large to evaluate with the judge model",
623
+ summary=(
624
+ "The merged spec exceeded the judge model's input budget "
625
+ "and was skipped or truncated - raise `max_input_chars` "
626
+ "or split the spec."
627
+ ),
628
+ severities=(Severity.INFO,),
629
+ requires=("spec_workspace", "judge_model"),
630
+ flags=("llm_judged", "opt_in"),
631
+ ),
632
+ CheckSpec(
633
+ id="opex.llm.bundle_coverage",
634
+ category=Category.OPERATIONAL_EXCELLENCE,
635
+ title="LLM-judged evaluator-bundle coverage gap",
636
+ summary=(
637
+ "A judge model reviewed the bundle and flagged risk "
638
+ "dimensions (e.g. safety, groundedness) that no evaluator "
639
+ "currently covers."
640
+ ),
641
+ severities=(Severity.WARNING,),
642
+ requires=("workspace", "judge_model"),
643
+ flags=("llm_judged", "opt_in"),
644
+ ),
645
+ # ------------------------------------------------------------------
646
+ # Security
647
+ # ------------------------------------------------------------------
648
+ CheckSpec(
649
+ id="waf.security.local_auth_disabled",
650
+ category=Category.SECURITY,
651
+ title="Local (API key) authentication is enabled",
652
+ summary=(
653
+ "The Cognitive Services / Azure OpenAI account still "
654
+ "accepts key-based auth - WAF-AI recommends Entra ID "
655
+ "(managed identity) only."
656
+ ),
657
+ severities=(Severity.CRITICAL,),
658
+ requires=("azure_resources",),
659
+ ),
660
+ CheckSpec(
661
+ id="waf.security.managed_identity",
662
+ category=Category.SECURITY,
663
+ title="Account has no managed identity assigned",
664
+ summary=(
665
+ "Without a managed identity the agent runtime has to fall "
666
+ "back to keys or service principals with broader scopes."
667
+ ),
668
+ severities=(Severity.CRITICAL,),
669
+ requires=("azure_resources",),
670
+ ),
671
+ CheckSpec(
672
+ id="waf.security.diagnostic_settings",
673
+ category=Category.SECURITY,
674
+ title="Diagnostic settings are missing or incomplete",
675
+ summary=(
676
+ "The AI account is not streaming logs / metrics to a Log "
677
+ "Analytics workspace - investigations and audits will be "
678
+ "blind."
679
+ ),
680
+ severities=(Severity.WARNING, Severity.CRITICAL),
681
+ requires=("azure_resources",),
682
+ ),
683
+ # ------------------------------------------------------------------
684
+ # Responsible AI
685
+ # ------------------------------------------------------------------
686
+ CheckSpec(
687
+ id="safety.<metric>",
688
+ category=Category.RESPONSIBLE_AI,
689
+ title="Content-safety metric tripped in the latest eval",
690
+ summary=(
691
+ "One of the content-safety metrics (violence, self_harm, "
692
+ "sexual, hate_unfairness, protected_material) hit the "
693
+ "configured severity floor on the latest eval run."
694
+ ),
695
+ severities=(Severity.WARNING, Severity.CRITICAL),
696
+ requires=("results_history",),
697
+ flags=("dynamic_id",),
698
+ ),
699
+ CheckSpec(
700
+ id="safety.runtime.<signal>",
701
+ category=Category.RESPONSIBLE_AI,
702
+ title="Production content-filter or jailbreak signal observed",
703
+ summary=(
704
+ "App Insights / Log Analytics recorded one or more content "
705
+ "filter or jailbreak triggers within the lookback window."
706
+ ),
707
+ severities=(Severity.WARNING, Severity.CRITICAL),
708
+ requires=("azure_monitor",),
709
+ flags=("dynamic_id",),
710
+ ),
711
+ CheckSpec(
712
+ id="safety.config.continuous_eval_missing",
713
+ category=Category.RESPONSIBLE_AI,
714
+ title="Foundry continuous evaluation is not configured",
715
+ summary=(
716
+ "The Foundry project has no continuous-evaluation rule "
717
+ "wired up - safety regressions in production won't be "
718
+ "caught between manual runs."
719
+ ),
720
+ severities=(Severity.WARNING,),
721
+ requires=("foundry_control",),
722
+ ),
723
+ CheckSpec(
724
+ id="safety.config.continuous_eval_disabled",
725
+ category=Category.RESPONSIBLE_AI,
726
+ title="Foundry continuous evaluation is configured but disabled",
727
+ summary=(
728
+ "A continuous-evaluation rule exists in Foundry but is "
729
+ "currently turned off."
730
+ ),
731
+ severities=(Severity.WARNING,),
732
+ requires=("foundry_control",),
733
+ ),
734
+ CheckSpec(
735
+ id="responsible_ai.llm.prompt_transparency",
736
+ category=Category.RESPONSIBLE_AI,
737
+ title="System prompt lacks AI-disclosure / transparency",
738
+ summary=(
739
+ "A judge model reviewed the agent's system prompt and "
740
+ "flagged missing user-facing AI disclosure or transparency "
741
+ "language."
742
+ ),
743
+ severities=(Severity.WARNING,),
744
+ requires=("foundry_control", "judge_model"),
745
+ flags=("llm_judged", "opt_in"),
746
+ ),
747
+ CheckSpec(
748
+ id="responsible_ai.llm.prompt_safety_guardrails",
749
+ category=Category.RESPONSIBLE_AI,
750
+ title="System prompt is missing safety guardrails",
751
+ summary=(
752
+ "A judge model flagged the system prompt as lacking "
753
+ "explicit guardrails around harmful / disallowed content."
754
+ ),
755
+ severities=(Severity.WARNING,),
756
+ requires=("foundry_control", "judge_model"),
757
+ flags=("llm_judged", "opt_in"),
758
+ ),
759
+ CheckSpec(
760
+ id="responsible_ai.llm.prompt_jailbreak_surface",
761
+ category=Category.RESPONSIBLE_AI,
762
+ title="System prompt has an unusually large jailbreak surface",
763
+ summary=(
764
+ "A judge model evaluated the prompt's resistance to "
765
+ "common jailbreak vectors and surfaced a high-risk "
766
+ "pattern."
767
+ ),
768
+ severities=(Severity.WARNING,),
769
+ requires=("foundry_control", "judge_model"),
770
+ flags=("llm_judged", "opt_in"),
771
+ ),
772
+ CheckSpec(
773
+ id="responsible_ai.llm.dataset_pii_risk",
774
+ category=Category.RESPONSIBLE_AI,
775
+ title="Dataset contains likely PII",
776
+ summary=(
777
+ "A judge model scanned the eval dataset for personally "
778
+ "identifiable information and flagged samples that should "
779
+ "be redacted or synthesized."
780
+ ),
781
+ severities=(Severity.WARNING,),
782
+ requires=("foundry_control", "judge_model"),
783
+ flags=("llm_judged", "opt_in"),
784
+ ),
785
+ CheckSpec(
786
+ id="responsible_ai.llm.dataset_bias_signals",
787
+ category=Category.RESPONSIBLE_AI,
788
+ title="Dataset shows demographic / topical bias signals",
789
+ summary=(
790
+ "A judge model identified imbalanced coverage of "
791
+ "demographic or topical groups in the eval dataset."
792
+ ),
793
+ severities=(Severity.WARNING,),
794
+ requires=("foundry_control", "judge_model"),
795
+ flags=("llm_judged", "opt_in"),
796
+ ),
797
+ )
798
+
799
+
800
# Display order of the pillars. Keep this aligned with `findings.Category`.
# `by_category` seeds its result dict from this tuple, so the order here is
# the order in which grouped categories come back.
CATEGORY_ORDER: Tuple[Category, ...] = (
    Category.QUALITY,
    Category.PERFORMANCE,
    Category.RELIABILITY,
    Category.OPERATIONAL_EXCELLENCE,
    Category.SECURITY,
    Category.RESPONSIBLE_AI,
)
809
+
810
+
811
# One-line, human-readable blurb per category; used by the list view header.
CATEGORY_DESCRIPTIONS: Dict[Category, str] = {
    Category.QUALITY: "eval-driven signals (regression watchlist)",
    Category.PERFORMANCE: (
        "latency and throughput signals from eval and production"
    ),
    Category.RELIABILITY: "error, failure, and rate-limit signals",
    Category.OPERATIONAL_EXCELLENCE: (
        "workspace hygiene, CI gates, spec / config drift, Foundry audit"
    ),
    Category.SECURITY: (
        "identity, auth and diagnostics posture (WAF-AI security pillar)"
    ),
    Category.RESPONSIBLE_AI: (
        "content safety, prompt and dataset RAI heuristics"
    ),
}
832
+
833
+
834
+ # ---------------------------------------------------------------------------
835
+ # Helpers
836
+ # ---------------------------------------------------------------------------
837
+
838
+
839
def all_checks() -> Tuple[CheckSpec, ...]:
    """Expose the complete check catalog.

    Returns:
        The module-level ``CHECKS`` tuple itself (no copy is made).
    """
    catalog = CHECKS
    return catalog
842
+
843
+
844
def by_category(
    checks: Iterable[CheckSpec] = CHECKS,
) -> Dict[Category, List[CheckSpec]]:
    """Bucket ``checks`` under the category each one belongs to.

    Every category listed in :data:`CATEGORY_ORDER` gets a key up front, in
    that order, so a category with no matching check still maps to an empty
    list. A category that is not in :data:`CATEGORY_ORDER` is added after
    those, in the order it is first encountered.

    Args:
        checks: Specs to group; defaults to the whole ``CHECKS`` catalog.

    Returns:
        A dict from category to the list of specs in that category, with
        specs kept in their input order.
    """
    buckets: Dict[Category, List[CheckSpec]] = {}
    for category in CATEGORY_ORDER:
        buckets[category] = []

    for check in checks:
        key = check.category
        if key not in buckets:
            # Category missing from CATEGORY_ORDER: start a bucket for it
            # rather than dropping the check.
            buckets[key] = []
        buckets[key].append(check)

    return buckets
852
+
853
+
854
def filter_checks(
    *,
    category: Category | None = None,
    source: str | None = None,
) -> List[CheckSpec]:
    """Select catalog entries by category and/or required source.

    Both filters are optional and keyword-only; a filter left as ``None``
    is not applied. When both are given a spec must satisfy both.

    Args:
        category: Keep only specs whose ``category`` equals this value.
        source: Keep only specs whose ``requires`` contains this value.

    Returns:
        A new list of the matching specs, in ``CHECKS`` order.
    """
    return [
        spec
        for spec in CHECKS
        if (category is None or spec.category == category)
        and (source is None or source in spec.requires)
    ]
868
+
869
+
870
def reference_url_for(spec: CheckSpec) -> str | None:
    """Pick the best public documentation URL for ``spec``.

    A rule-specific entry in ``CHECK_REFERENCE_URLS`` (keyed by ``spec.id``)
    wins when it is present and non-empty. Otherwise the lookup falls back
    to the entry for the spec's category in ``CATEGORY_REFERENCE_URLS``
    (the public WAF-AI pillar page). Resolving the link here lets the CLI
    show a clickable "learn more" line without hardcoding doc links in the
    presentation layer.

    Args:
        spec: The check whose reference link is wanted.

    Returns:
        The chosen URL, or ``None`` when neither lookup yields one.
    """
    rule_url = CHECK_REFERENCE_URLS.get(spec.id)
    if rule_url:
        return rule_url
    # No usable rule-level link: fall back to the category-level one.
    return CATEGORY_REFERENCE_URLS.get(spec.category)