agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,247 @@
1
+ """Safety check: scans local evaluations, production telemetry, and
2
+ Foundry control-plane configuration for Responsible-AI gaps.
3
+
4
+ The check emits findings under :class:`Category.RESPONSIBLE_AI` from
5
+ three independent layers:
6
+
7
+ * **Eval** - content-safety metric hits in the latest eval run.
8
+ Backwards-compatible id ``safety.<metric>``; evidence carries
9
+ ``layer=eval``.
10
+ * **Runtime** - content-filter triggers observed in App Insights /
11
+ Log Analytics within the lookback window. Id ``safety.runtime.<signal>`` (``safety.runtime.content_filter`` when the event names no signal).
12
+ * **Config** - Foundry continuous-evaluation rules missing or disabled.
13
+ Id ``safety.config.continuous_eval_missing`` /
14
+ ``safety.config.continuous_eval_disabled``.
15
+
16
+ Each layer fails open: if its source did not produce a payload, that
17
+ layer simply emits nothing.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from typing import Any, Dict, List, Optional
23
+
24
+ from agentops.agent.config import SafetyCheckConfig
25
+ from agentops.agent.findings import Category, Finding, Severity
26
+ from agentops.agent.sources.azure_monitor import AzureMonitorPayload
27
+ from agentops.agent.sources.foundry_control import FoundryControlPayload
28
+ from agentops.agent.sources.results_history import ResultsHistory
29
+
30
# Rank of each textual severity label; a larger rank is more severe.
_SEVERITY_RANK = {"low": 0, "medium": 1, "high": 2}

# Substrings that mark a metric key as a content-safety metric.
_SAFETY_METRICS = (
    "violence",
    "self_harm",
    "sexual",
    "hate_unfairness",
    "protected_material",
)


def _severity_value(raw: Any) -> int:
    """Convert a raw metric value into an integer severity level.

    Args:
        raw: The value stored under a metric key. Strings are stripped,
            lower-cased and looked up in ``_SEVERITY_RANK``; ints and
            floats are truncated with ``int()``.

    Returns:
        The severity level, or ``-1`` when the value is an unknown label,
        a non-finite float, or any other type.
    """
    if isinstance(raw, str):
        return _SEVERITY_RANK.get(raw.strip().lower(), -1)
    if isinstance(raw, (int, float)):
        try:
            return int(raw)
        except (ValueError, OverflowError):
            # int() rejects NaN (ValueError) and +/-inf (OverflowError).
            # Treat such scores as "no rating" instead of crashing.
            return -1
    return -1
47
+
48
+
49
def run_safety_check(
    history: ResultsHistory,
    config: SafetyCheckConfig,
    monitor: Optional[AzureMonitorPayload] = None,
    foundry: Optional[FoundryControlPayload] = None,
) -> List[Finding]:
    """Collect the findings of the eval, runtime and config safety layers.

    The eval layer always runs. The runtime layer runs only when
    ``monitor`` is given and the config layer only when ``foundry`` is
    given. Findings are returned in that layer order.
    """
    merged: List[Finding] = list(_find_eval_safety(history, config))
    if monitor is not None:
        merged += _find_runtime_safety(monitor, config)
    if foundry is not None:
        merged += _find_config_safety(foundry)
    return merged
63
+
64
+
65
def _find_eval_safety(
    history: ResultsHistory, config: SafetyCheckConfig
) -> List[Finding]:
    """Report content-safety metric hits found in the most recent run.

    Only ``history.runs[-1]`` is inspected. For every metric key that
    contains one of the ``_SAFETY_METRICS`` substrings, the highest level
    at or above ``config.severity_floor`` is kept and turned into one
    finding. A level of 2 or more is critical; anything lower is a warning.
    """
    if not history.runs:
        return []

    # An unrecognised floor label falls back to rank 1 ("medium").
    floor = _SEVERITY_RANK.get(config.severity_floor.strip().lower(), 1)
    latest = history.runs[-1]

    # Worst observation per metric name.
    worst: Dict[str, Dict[str, Any]] = {}
    for item in latest.item_evaluations:
        if not isinstance(item, dict):
            continue
        scores = item.get("metrics") or item.get("scores") or {}
        if not isinstance(scores, dict):
            continue
        for key, value in scores.items():
            metric = str(key).lower()
            is_safety = any(marker in metric for marker in _SAFETY_METRICS)
            if not is_safety:
                continue
            level = _severity_value(value)
            if level < floor:
                continue
            previous = worst.get(metric)
            if previous is not None and level <= previous.get("level", -1):
                continue
            worst[metric] = {
                "level": level,
                "value": value,
                "row": item.get("input")
                or item.get("question")
                or item.get("id"),
            }

    return [
        Finding(
            id=f"safety.{metric}",
            severity=(
                Severity.CRITICAL if info["level"] >= 2 else Severity.WARNING
            ),
            category=Category.RESPONSIBLE_AI,
            title=f"Content-safety hit on `{metric}`",
            summary=(
                f"Run `{latest.run_id}` produced a `{metric}` rating "
                f"of `{info['value']}` on at least one row."
            ),
            recommendation=(
                "Inspect the offending dataset row and the model "
                "response, tighten the system prompt or add a safety "
                "filter, and re-evaluate."
            ),
            source="results_history",
            evidence={
                "layer": "eval",
                "metric": metric,
                "value": info["value"],
                "row": info.get("row"),
                "run_id": latest.run_id,
            },
        )
        for metric, info in worst.items()
    ]
129
+
130
+
131
def _find_runtime_safety(
    monitor: AzureMonitorPayload, config: SafetyCheckConfig
) -> List[Finding]:
    """Report content-filter events listed in ``monitor.safety_violations``.

    Each dict entry contributes one finding when its ``hits`` count
    reaches ``config.min_runtime_hits``. The finding is critical once the
    count reaches ``config.runtime_critical_hits``, otherwise a warning.
    Entries that are not dicts, or whose ``hits`` value cannot be read as
    an integer, are skipped so a malformed record cannot abort the check.
    """
    findings: List[Finding] = []
    for violation in monitor.safety_violations:
        if not isinstance(violation, dict):
            continue
        try:
            hits = int(violation.get("hits", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            # Fail open: an unparsable count is treated as "no data".
            continue
        if hits < config.min_runtime_hits:
            continue
        # Entries without a signal name are reported as "content_filter".
        signal = str(violation.get("signal") or "content_filter")
        severity = (
            Severity.CRITICAL
            if hits >= config.runtime_critical_hits
            else Severity.WARNING
        )
        findings.append(
            Finding(
                id=f"safety.runtime.{signal}",
                severity=severity,
                category=Category.RESPONSIBLE_AI,
                title=f"Content-filter triggers detected in production (`{signal}`)",
                summary=(
                    f"App Insights observed {hits} `{signal}` event(s) "
                    "over the lookback window. Each one is a response "
                    "the model refused to complete or a request blocked "
                    "by Azure AI Content Safety."
                ),
                recommendation=(
                    "Inspect the underlying traces in Application "
                    "Insights, identify whether the spike originates "
                    "from a single client, a regression in the system "
                    "prompt, or actual adversarial input, and adjust "
                    "guardrails accordingly."
                ),
                source="azure_monitor",
                evidence={
                    "layer": "runtime",
                    "signal": signal,
                    "hits": hits,
                },
            )
        )
    return findings
175
+
176
+
177
def _find_config_safety(foundry: FoundryControlPayload) -> List[Finding]:
    """Report missing or disabled continuous-evaluation rules.

    Returns nothing when the payload lists no agents, or when its
    diagnostics contain neither ``evaluation_rules_count`` nor
    ``evaluation_rules_warning``. With no rules at all, one "missing"
    warning is returned. Otherwise one "disabled" warning is returned if
    any rule has ``enabled`` set to ``False``.
    """
    if not foundry.agents:
        return []

    rules = foundry.evaluation_rules
    diag = foundry.diagnostics or {}

    # Only report when the rule surface was actually probed; this avoids
    # false positives when the SDK lacks the surface.
    probed = (
        "evaluation_rules_count" in diag
        or "evaluation_rules_warning" in diag
    )
    if not probed:
        return []

    if not rules:
        return [
            Finding(
                id="safety.config.continuous_eval_missing",
                severity=Severity.WARNING,
                category=Category.RESPONSIBLE_AI,
                title="No continuous evaluation rules configured",
                summary=(
                    f"Foundry project lists {len(foundry.agents)} agent(s) "
                    "but no continuous-evaluation rules. Production "
                    "responses are not being scored on quality / safety "
                    "after deployment."
                ),
                recommendation=(
                    "Attach continuous evaluation rules to your agents "
                    "in Foundry (Operate -> Evaluations) so deployed "
                    "responses are scored against safety and quality "
                    "metrics in production."
                ),
                source="foundry_control",
                evidence={
                    "layer": "config",
                    "agents": [a.agent_id for a in foundry.agents],
                },
            )
        ]

    switched_off = [rule for rule in rules if rule.enabled is False]
    if not switched_off:
        return []
    return [
        Finding(
            id="safety.config.continuous_eval_disabled",
            severity=Severity.WARNING,
            category=Category.RESPONSIBLE_AI,
            title="One or more continuous evaluation rules are disabled",
            summary=(
                f"{len(switched_off)} of {len(rules)} continuous "
                "evaluation rule(s) are disabled. Production safety "
                "scoring is partially or fully turned off."
            ),
            recommendation=(
                "Re-enable the disabled rules in Foundry "
                "(Operate -> Evaluations) or remove them if they "
                "are intentionally retired."
            ),
            source="foundry_control",
            evidence={
                "layer": "config",
                "disabled_rules": [rule.rule_id for rule in switched_off],
            },
        )
    ]
@@ -0,0 +1,375 @@
1
+ """Spec-conformance check (Operational Excellence pillar).
2
+
3
+ Compares the project's spec-driven-development artifacts
4
+ (``.specify/spec.md`` + ``plan.md`` + ``tasks.md``, ``AGENTS.md``,
5
+ ``.github/copilot-instructions.md``) against the AgentOps workspace
6
+ (``run.yaml``, ``.agentops/bundles/``, ``.agentops/datasets/``)
7
+ and flags drift between the two.
8
+
9
+ All findings live under :class:`Category.OPERATIONAL_EXCELLENCE` with
10
+ the ``opex.spec_conformance.*`` id prefix. Deterministic rules emit
11
+ ``info``/``warning`` only — never ``critical`` — because spec
12
+ conformance is a soft signal.
13
+
14
+ The companion opt-in LLM rule
15
+ (``opex.spec_conformance.llm.implementation_gap``) lives in
16
+ :mod:`agentops.agent.llm_assist._spec_rules`.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from datetime import datetime, timezone
22
+ from pathlib import Path
23
+ from typing import Iterable, List, Optional
24
+
25
+ import yaml
26
+
27
+ from agentops.agent.config import SpecConformanceCheckConfig
28
+ from agentops.agent.findings import Category, Finding, Severity
29
+ from agentops.agent.sources.spec_detectors import (
30
+ DETECTORS,
31
+ Detector,
32
+ SpecDocument,
33
+ )
34
+
35
# Value passed as ``source=`` on every finding built in this module.
SOURCE_NAME = "spec_workspace"
36
+
37
def run_spec_conformance_check(
    workspace: Path,
    config: SpecConformanceCheckConfig,
) -> List[Finding]:
    """Evaluate the deterministic spec-conformance rules for ``workspace``.

    Returns an empty list when the check is disabled. When no detector
    yields a document, a single ``spec_missing`` warning is produced if
    any detector reported hint paths. Otherwise the task, evaluator,
    dataset and agent rules run per document, and the results are
    de-duplicated on ``(id, evidence)``. Ids listed in ``config.skip``
    are removed from whatever is returned.
    """
    if not config.enabled:
        return []

    documents: List[SpecDocument] = []
    hint_only: List[Path] = []
    for det in _select_detectors(config.detectors):
        doc = det.detect(workspace)
        if doc is None:
            hint_only.extend(det.hint_paths(workspace))
        else:
            documents.append(doc)

    if not documents:
        no_spec: List[Finding] = []
        if hint_only:
            no_spec.append(
                Finding(
                    id="opex.spec_conformance.spec_missing",
                    severity=Severity.WARNING,
                    category=Category.OPERATIONAL_EXCELLENCE,
                    title=(
                        "Spec setup detected, but no usable specification was found"
                    ),
                    summary=(
                        "Doctor found signs that this repo uses "
                        "spec-driven development (for example "
                        "`.specify/`, `AGENTS.md`, or a "
                        "`copilot-instructions.md` shell), but could "
                        "not load a real spec body. Without that "
                        "reference, Doctor cannot check whether "
                        "bundles, datasets, tasks, and "
                        "implementation still match the intended agent "
                        "behavior."
                    ),
                    recommendation=(
                        "Add a readable spec such as `.specify/spec.md` "
                        "(spec-kit) or `AGENTS.md` that describes the "
                        "agent's intended behavior, capabilities, "
                        "evaluators, and datasets, then re-run "
                        "`agentops doctor`."
                    ),
                    source=SOURCE_NAME,
                    evidence={"hint_paths": [str(p) for p in hint_only]},
                )
            )
        return _filter_skipped(no_spec, config.skip)

    collected: List[Finding] = []
    for doc in documents:
        collected += _check_tasks(doc, config.stale_after_days)
        collected += _check_evaluator_drift(workspace, doc)
        collected += _check_dataset_drift(workspace, doc)
        collected += _check_agent_drift(workspace, doc)

    # Several detectors can surface the same finding; keep the first one.
    unique: List[Finding] = []
    seen: set[tuple[str, str]] = set()
    for finding in collected:
        marker = (finding.id, _evidence_key(finding))
        if marker not in seen:
            seen.add(marker)
            unique.append(finding)

    return _filter_skipped(unique, config.skip)
107
+
108
+
109
def _select_detectors(names: Iterable[str]) -> List[Detector]:
    """Return the registered detectors whose name appears in ``names``.

    Names are compared stripped and case-insensitively; blank entries are
    ignored. With no usable names, every detector in ``DETECTORS`` is
    returned.
    """
    wanted = set()
    for raw in names:
        if raw and raw.strip():
            wanted.add(raw.strip().lower())
    if wanted:
        return [det for det in DETECTORS if det.name.lower() in wanted]
    return list(DETECTORS)
114
+
115
+
116
def _filter_skipped(findings: List[Finding], skip: Iterable[str]) -> List[Finding]:
    """Drop findings whose id is listed in ``skip``.

    Skip entries are stripped and blank ones are ignored. When nothing
    usable remains in ``skip``, the input list itself is returned.
    """
    excluded = set()
    for entry in skip:
        if entry and entry.strip():
            excluded.add(entry.strip())
    if excluded:
        return [item for item in findings if item.id not in excluded]
    return findings
121
+
122
+
123
def _evidence_key(f: Finding) -> str:
    """Build a stable de-duplication key from a finding's evidence.

    The key is ``name=repr(value)`` for each evidence entry, sorted by
    name and joined with ``|``. Non-dict evidence yields an empty string.
    """
    evidence = f.evidence
    if not isinstance(evidence, dict):
        return ""
    return "|".join(f"{name}={evidence[name]!r}" for name in sorted(evidence))
132
+
133
+
134
def _check_tasks(doc: SpecDocument, stale_after_days: int) -> List[Finding]:
    """Check a spec's task list for stale open items and orphaned paths.

    Args:
        doc: Spec document; ``tasks``, ``last_modified``, ``root`` and
            ``format`` are read.
        stale_after_days: Freshness window. Open tasks are reported only
            when the spec is older than this many days.

    Returns:
        Up to two warnings: ``tasks_stale`` when unchecked tasks exist
        and the spec age exceeds the window, and ``tasks_orphaned`` when
        a checked task mentions a path found neither under ``doc.root``
        nor under its parent directory.
    """
    findings: List[Finding] = []
    if not doc.tasks:
        return findings

    now = datetime.now(timezone.utc)
    last_modified = doc.last_modified
    if last_modified is not None and last_modified.utcoffset() is None:
        # Subtracting a naive datetime from an aware one raises TypeError.
        # NOTE(review): naive timestamps are assumed to be UTC - confirm
        # against the detectors that populate ``last_modified``.
        last_modified = last_modified.replace(tzinfo=timezone.utc)
    age_days = (
        (now - last_modified).total_seconds() / 86400.0
        if last_modified is not None
        else None
    )

    unchecked = [t for t in doc.tasks if not t.checked]
    if unchecked and age_days is not None and age_days > stale_after_days:
        findings.append(
            Finding(
                id="opex.spec_conformance.tasks_stale",
                severity=Severity.WARNING,
                category=Category.OPERATIONAL_EXCELLENCE,
                title="Spec tasks have been left open past the freshness window",
                summary=(
                    f"Doctor found {len(unchecked)} unchecked task(s) "
                    "in the spec (for example `tasks.md` in a spec-kit "
                    "workspace), and the spec has not been updated for "
                    f"{age_days:.1f} day(s). The configured freshness "
                    f"window is {stale_after_days} day(s). This usually "
                    "means the implementation plan is no longer "
                    "trustworthy: either the work is done but the tasks "
                    "were not checked off, the tasks are no longer "
                    "relevant, or the agent behavior changed without the "
                    "spec being refreshed."
                ),
                recommendation=(
                    "Review the open tasks. Check off completed work, "
                    "remove tasks that no longer apply, or update the "
                    "spec so the task list reflects the current agent "
                    "behavior and evaluation plan."
                ),
                source=SOURCE_NAME,
                evidence={
                    "format": doc.format,
                    "open_tasks": len(unchecked),
                    "age_days": round(age_days, 2),
                    "threshold_days": stale_after_days,
                },
            )
        )

    orphans: List[str] = []
    seen_orphans = set()
    for task in doc.tasks:
        if not task.checked:
            continue
        for rel in task.mentioned_paths:
            if (doc.root / rel).exists():
                continue
            # Try resolving from the workspace root instead of the spec
            # root (e.g. spec-kit lives under .specify/ but paths are
            # workspace-relative).
            if (doc.root.parent / rel).exists():
                continue
            # Report each missing path once so repeats do not crowd out
            # other entries in the capped evidence list.
            if rel not in seen_orphans:
                seen_orphans.add(rel)
                orphans.append(rel)

    if orphans:
        findings.append(
            Finding(
                id="opex.spec_conformance.tasks_orphaned",
                severity=Severity.WARNING,
                category=Category.OPERATIONAL_EXCELLENCE,
                title="Completed tasks reference paths that don't exist",
                summary=(
                    "One or more checked task items in the spec point "
                    "at files that aren't in the workspace. Either "
                    "the implementation was removed or the spec is "
                    "out of date."
                ),
                recommendation=(
                    "Update the spec to reflect the current code "
                    "layout, or restore the missing files."
                ),
                source=SOURCE_NAME,
                evidence={
                    "format": doc.format,
                    "missing_paths": orphans[:10],
                },
            )
        )

    return findings
222
+
223
+
224
def _check_evaluator_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Warn when the spec names evaluators that no bundle declares.

    Nothing is reported when the spec mentions no evaluators, when no
    bundle declares any evaluator, or when every mentioned evaluator is
    declared. At most ten missing names are placed in the evidence.
    """
    mentioned = doc.references.get("evaluators") or []
    if not mentioned:
        return []

    declared = _collect_evaluator_names(workspace)
    if not declared:
        return []

    missing = [name for name in mentioned if name not in declared]
    if not missing:
        return []

    drift = Finding(
        id="opex.spec_conformance.evaluator_drift",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Spec names evaluators that no bundle declares",
        summary=(
            "The spec mentions evaluator classes that are absent "
            "from every `.agentops/bundles/*.yaml`. The "
            "implementation isn't measuring what the spec "
            "promises."
        ),
        recommendation=(
            "Either add the missing evaluator(s) to a bundle or "
            "update the spec to reflect what the project actually "
            "evaluates."
        ),
        source=SOURCE_NAME,
        evidence={"missing_evaluators": missing[:10]},
    )
    return [drift]
255
+
256
+
257
def _check_dataset_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Warn when the spec mentions dataset files missing from the workspace.

    Mentioned datasets are matched by file name against the YAML files
    in ``.agentops/datasets/`` and the JSONL files in ``.agentops/data/``.
    Nothing is reported when the spec mentions no datasets or when the
    workspace holds no dataset files at all.
    """
    mentioned = doc.references.get("datasets") or []
    if not mentioned:
        return []

    agentops_dir = workspace / ".agentops"
    available = {p.name for p in (agentops_dir / "datasets").glob("*.y*ml")}
    available.update(p.name for p in (agentops_dir / "data").glob("*.jsonl"))
    if not available:
        return []

    missing = [ref for ref in mentioned if Path(ref).name not in available]
    if not missing:
        return []

    drift = Finding(
        id="opex.spec_conformance.dataset_drift",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Spec references datasets that aren't in the workspace",
        summary=(
            "Dataset filenames mentioned in the spec do not "
            "exist under `.agentops/datasets/` or "
            "`.agentops/data/`."
        ),
        recommendation=(
            "Add the missing dataset file(s) under "
            "`.agentops/datasets/` (and the matching JSONL under "
            "`.agentops/data/`), or update the spec."
        ),
        source=SOURCE_NAME,
        evidence={"missing_datasets": missing[:10]},
    )
    return [drift]
288
+
289
+
290
def _check_agent_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Warn when ``run.yaml`` targets an agent the spec does not mention.

    ``.agentops/run.yaml`` is preferred, with ``run.yaml`` in the
    workspace root as the fallback. Nothing is reported when the spec
    mentions no agent ids, when no run file exists or it cannot be read
    or parsed, when it does not contain a ``target.endpoint.agent_id``
    value, or when that id is among the ids mentioned by the spec.
    """
    mentioned = doc.references.get("agent_ids") or []
    if not mentioned:
        return []
    run_yaml = workspace / ".agentops" / "run.yaml"
    if not run_yaml.exists():
        run_yaml = workspace / "run.yaml"
    if not run_yaml.exists():
        return []
    try:
        raw = yaml.safe_load(run_yaml.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # UnicodeDecodeError is not an OSError; a non-UTF-8 file is
        # treated like any other unreadable run file.
        return []
    if not isinstance(raw, dict):
        return []
    # A scalar or list where a mapping is expected (for example
    # ``target: my-agent``) means no agent id is declared; calling
    # ``.get`` on it would raise AttributeError.
    target = raw.get("target") or {}
    if not isinstance(target, dict):
        return []
    endpoint = target.get("endpoint") or {}
    if not isinstance(endpoint, dict):
        return []
    declared_agent = str(endpoint.get("agent_id") or "")
    if not declared_agent:
        return []
    if declared_agent in mentioned:
        return []
    return [
        Finding(
            id="opex.spec_conformance.agent_drift",
            severity=Severity.WARNING,
            category=Category.OPERATIONAL_EXCELLENCE,
            title="Spec's agent identifier doesn't match `run.yaml`",
            summary=(
                f"`run.yaml` targets agent `{declared_agent}` but the "
                "spec mentions a different agent identifier. The "
                "evaluation is running against a different agent "
                "than the spec describes."
            ),
            recommendation=(
                "Pin `run.yaml`'s `target.endpoint.agent_id` to the "
                "agent named in the spec, or update the spec to "
                "match."
            ),
            source=SOURCE_NAME,
            evidence={
                "spec_agent_ids": mentioned[:5],
                "run_yaml_agent_id": declared_agent,
            },
        )
    ]
336
+
337
def _collect_evaluator_names(workspace: Path) -> set[str]:
    """Return the evaluator names declared by the workspace's bundle files.

    Every ``*.y*ml`` file under ``.agentops/bundles`` is parsed. Entries
    of its ``evaluators`` list contribute their ``class`` (or, failing
    that, ``name``) value when they are mappings, or themselves when they
    are plain strings. Files that cannot be read, decoded or parsed are
    skipped, as are files whose top level is not a mapping.
    """
    out: set[str] = set()
    bundles_dir = workspace / ".agentops" / "bundles"
    if not bundles_dir.is_dir():
        return out
    for p in bundles_dir.glob("*.y*ml"):
        try:
            raw = yaml.safe_load(p.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError, yaml.YAMLError):
            # UnicodeDecodeError is not an OSError; without it one
            # non-UTF-8 bundle would abort the scan of all the others.
            continue
        if not isinstance(raw, dict):
            continue
        for ev in raw.get("evaluators") or []:
            if isinstance(ev, dict):
                name = ev.get("class") or ev.get("name")
                if isinstance(name, str):
                    out.add(name.strip())
            elif isinstance(ev, str):
                out.add(ev.strip())
    return out
358
+
359
+
360
def detect_documents(
    workspace: Path,
    config: Optional[SpecConformanceCheckConfig] = None,
) -> List[SpecDocument]:
    """Public helper: list the spec documents found in ``workspace``.

    Runs each detector selected by ``config.detectors`` (a default
    ``SpecConformanceCheckConfig`` is used when ``config`` is omitted)
    and keeps every non-``None`` result. Used by the LLM rule to share
    detection with the deterministic check without re-implementing the
    registry walk.
    """
    cfg = config or SpecConformanceCheckConfig()
    found: List[SpecDocument] = []
    for detector in _select_detectors(cfg.detectors):
        candidate = detector.detect(workspace)
        if candidate is None:
            continue
        found.append(candidate)
    return found