agentops-accelerator 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentops/__init__.py +10 -0
- agentops/__main__.py +6 -0
- agentops/agent/__init__.py +12 -0
- agentops/agent/_legacy_ids.py +92 -0
- agentops/agent/analyzer.py +207 -0
- agentops/agent/checks/__init__.py +1 -0
- agentops/agent/checks/catalog.py +880 -0
- agentops/agent/checks/errors.py +279 -0
- agentops/agent/checks/foundry_config.py +75 -0
- agentops/agent/checks/latency.py +84 -0
- agentops/agent/checks/opex.py +157 -0
- agentops/agent/checks/opex_workspace.py +874 -0
- agentops/agent/checks/posture.py +36 -0
- agentops/agent/checks/posture_rules/__init__.py +53 -0
- agentops/agent/checks/posture_rules/content_filter.py +59 -0
- agentops/agent/checks/posture_rules/diagnostics.py +74 -0
- agentops/agent/checks/posture_rules/local_auth.py +55 -0
- agentops/agent/checks/posture_rules/managed_identity.py +59 -0
- agentops/agent/checks/posture_rules/network.py +68 -0
- agentops/agent/checks/regression.py +78 -0
- agentops/agent/checks/release_readiness.py +182 -0
- agentops/agent/checks/safety.py +247 -0
- agentops/agent/checks/spec_conformance.py +375 -0
- agentops/agent/cockpit.py +5159 -0
- agentops/agent/config.py +240 -0
- agentops/agent/findings.py +113 -0
- agentops/agent/history.py +142 -0
- agentops/agent/knowledge/__init__.py +182 -0
- agentops/agent/knowledge/waf-checklist.csv +39 -0
- agentops/agent/llm_assist/__init__.py +16 -0
- agentops/agent/llm_assist/_base.py +124 -0
- agentops/agent/llm_assist/_bundle_rule.py +154 -0
- agentops/agent/llm_assist/_client.py +347 -0
- agentops/agent/llm_assist/_dataset_rules.py +191 -0
- agentops/agent/llm_assist/_engine.py +106 -0
- agentops/agent/llm_assist/_prompt_rules.py +291 -0
- agentops/agent/llm_assist/_spec_rules.py +235 -0
- agentops/agent/production_telemetry.py +430 -0
- agentops/agent/report.py +207 -0
- agentops/agent/server/__init__.py +1 -0
- agentops/agent/server/app.py +84 -0
- agentops/agent/server/auth.py +94 -0
- agentops/agent/server/chat.py +44 -0
- agentops/agent/server/protocol.py +72 -0
- agentops/agent/sources/__init__.py +1 -0
- agentops/agent/sources/azure_monitor.py +523 -0
- agentops/agent/sources/azure_resources.py +602 -0
- agentops/agent/sources/foundry_control.py +174 -0
- agentops/agent/sources/results_history.py +494 -0
- agentops/agent/sources/spec_detectors/__init__.py +42 -0
- agentops/agent/sources/spec_detectors/_base.py +58 -0
- agentops/agent/sources/spec_detectors/agents_md.py +75 -0
- agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
- agentops/agent/time_range.py +117 -0
- agentops/cli/__init__.py +1 -0
- agentops/cli/app.py +4823 -0
- agentops/core/__init__.py +1 -0
- agentops/core/agentops_config.py +592 -0
- agentops/core/config_loader.py +22 -0
- agentops/core/evaluators.py +480 -0
- agentops/core/release_evidence.py +56 -0
- agentops/core/results.py +117 -0
- agentops/mcp/__init__.py +10 -0
- agentops/mcp/server.py +232 -0
- agentops/pipeline/__init__.py +8 -0
- agentops/pipeline/cloud_results.py +189 -0
- agentops/pipeline/cloud_runner.py +901 -0
- agentops/pipeline/comparison.py +108 -0
- agentops/pipeline/diagnostics.py +51 -0
- agentops/pipeline/invocations.py +535 -0
- agentops/pipeline/official_eval.py +414 -0
- agentops/pipeline/orchestrator.py +775 -0
- agentops/pipeline/prompt_deploy.py +377 -0
- agentops/pipeline/publisher.py +121 -0
- agentops/pipeline/reporter.py +202 -0
- agentops/pipeline/runtime.py +409 -0
- agentops/pipeline/thresholds.py +84 -0
- agentops/services/__init__.py +1 -0
- agentops/services/cicd.py +720 -0
- agentops/services/eval_analysis.py +848 -0
- agentops/services/evidence_pack.py +757 -0
- agentops/services/initializer.py +86 -0
- agentops/services/preflight.py +470 -0
- agentops/services/setup_wizard.py +709 -0
- agentops/services/skills.py +643 -0
- agentops/services/trace_promotion.py +300 -0
- agentops/services/workflow_analysis.py +1129 -0
- agentops/templates/.gitignore +15 -0
- agentops/templates/__init__.py +1 -0
- agentops/templates/agent-server/Dockerfile +23 -0
- agentops/templates/agent-server/README.md +61 -0
- agentops/templates/agent-server/main.bicep +94 -0
- agentops/templates/agent.yaml +87 -0
- agentops/templates/agentops.yaml +58 -0
- agentops/templates/foundry.svg +71 -0
- agentops/templates/icon.png +0 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
- agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
- agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
- agentops/templates/project.gitignore +36 -0
- agentops/templates/sample-traces.jsonl +3 -0
- agentops/templates/skills/agentops-agent/SKILL.md +137 -0
- agentops/templates/skills/agentops-config/SKILL.md +113 -0
- agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
- agentops/templates/skills/agentops-eval/SKILL.md +189 -0
- agentops/templates/skills/agentops-report/SKILL.md +71 -0
- agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
- agentops/templates/smoke.jsonl +3 -0
- agentops/templates/waf-checklist.README.md +84 -0
- agentops/templates/waf-checklist.csv +22 -0
- agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
- agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
- agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
- agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
- agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
- agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
- agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
- agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/workflows/agentops-pr.yml +148 -0
- agentops/templates/workflows/agentops-watchdog.yml +122 -0
- agentops/utils/__init__.py +1 -0
- agentops/utils/azd_env.py +435 -0
- agentops/utils/azure_endpoints.py +62 -0
- agentops/utils/colors.py +47 -0
- agentops/utils/dotenv_loader.py +105 -0
- agentops/utils/foundry_discovery.py +229 -0
- agentops/utils/logging.py +59 -0
- agentops/utils/telemetry.py +554 -0
- agentops/utils/yaml.py +36 -0
- agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
- agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
- agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
- agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
- agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
- agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""Safety check: scans local evaluations, production telemetry, and
|
|
2
|
+
Foundry control-plane configuration for Responsible-AI gaps.
|
|
3
|
+
|
|
4
|
+
The check emits findings under :class:`Category.RESPONSIBLE_AI` from
|
|
5
|
+
three independent layers:
|
|
6
|
+
|
|
7
|
+
* **Eval** - content-safety metric hits in the latest eval run.
|
|
8
|
+
Backwards-compatible id ``safety.<metric>``; evidence carries
|
|
9
|
+
``layer=eval``.
|
|
10
|
+
* **Runtime** - content-filter triggers observed in App Insights /
|
|
11
|
+
Log Analytics within the lookback window. Id ``safety.runtime.content_filter``.
|
|
12
|
+
* **Config** - Foundry continuous-evaluation rules missing or disabled.
|
|
13
|
+
Id ``safety.config.continuous_eval_missing`` /
|
|
14
|
+
``safety.config.continuous_eval_disabled``.
|
|
15
|
+
|
|
16
|
+
Each layer fails open: if its source did not produce a payload, that
|
|
17
|
+
layer simply emits nothing.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from typing import Any, Dict, List, Optional
|
|
23
|
+
|
|
24
|
+
from agentops.agent.config import SafetyCheckConfig
|
|
25
|
+
from agentops.agent.findings import Category, Finding, Severity
|
|
26
|
+
from agentops.agent.sources.azure_monitor import AzureMonitorPayload
|
|
27
|
+
from agentops.agent.sources.foundry_control import FoundryControlPayload
|
|
28
|
+
from agentops.agent.sources.results_history import ResultsHistory
|
|
29
|
+
|
|
30
|
+
# Ordinal rank of each textual severity label; a larger rank is more severe.
_SEVERITY_RANK = dict(low=0, medium=1, high=2)

# Name fragments identifying content-safety metrics.
_SAFETY_METRICS = tuple(
    "violence self_harm sexual hate_unfairness protected_material".split()
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _severity_value(raw: Any) -> int:
    """Convert a raw metric rating into an integer severity level.

    Args:
        raw: The rating as found in an evaluation row.  Strings are
            trimmed, lower-cased and looked up in ``_SEVERITY_RANK``;
            ints and floats are truncated with ``int``.

    Returns:
        The integer level, or ``-1`` when the rating cannot be
        interpreted (unknown label, unsupported type, or a non-finite
        float).
    """
    if isinstance(raw, str):
        return _SEVERITY_RANK.get(raw.strip().lower(), -1)
    if isinstance(raw, (int, float)):
        try:
            return int(raw)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(inf) raises OverflowError;
            # an unusable score is reported as "unknown" instead of
            # aborting the whole safety check.
            return -1
    return -1
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def run_safety_check(
    history: ResultsHistory,
    config: SafetyCheckConfig,
    monitor: Optional[AzureMonitorPayload] = None,
    foundry: Optional[FoundryControlPayload] = None,
) -> List[Finding]:
    """Collect the findings of the eval, runtime and config safety layers.

    The eval layer always runs.  The runtime layer runs only when
    ``monitor`` is supplied and the config layer only when ``foundry`` is
    supplied.  Results are concatenated in that order.
    """
    merged: List[Finding] = list(_find_eval_safety(history, config))
    if monitor is not None:
        merged += _find_runtime_safety(monitor, config)
    if foundry is not None:
        merged += _find_config_safety(foundry)
    return merged
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _find_eval_safety(
    history: ResultsHistory, config: SafetyCheckConfig
) -> List[Finding]:
    """Report content-safety metric hits found in the most recent eval run.

    Only ``history.runs[-1]`` is inspected.  A metric counts as a safety
    metric when its lower-cased name contains one of ``_SAFETY_METRICS``.
    Ratings below ``config.severity_floor`` are ignored (an unrecognised
    floor label falls back to rank ``1``).  For every metric only the
    highest-rated hit is kept, and one finding is emitted per metric:
    ``CRITICAL`` for level 2 or above, ``WARNING`` otherwise.
    """
    runs = history.runs
    if not runs:
        return []

    newest = runs[-1]
    minimum = _SEVERITY_RANK.get(config.severity_floor.strip().lower(), 1)

    # metric name -> worst hit seen so far for that metric
    worst: Dict[str, Dict[str, Any]] = {}
    for row in newest.item_evaluations:
        if not isinstance(row, dict):
            continue
        scores = row.get("metrics") or row.get("scores") or {}
        if not isinstance(scores, dict):
            continue
        for raw_name, rating in scores.items():
            name = str(raw_name).lower()
            if not any(fragment in name for fragment in _SAFETY_METRICS):
                continue
            level = _severity_value(rating)
            if level < minimum:
                continue
            previous = worst.get(name)
            if previous is not None and level <= previous.get("level", -1):
                continue
            row_ref = row.get("input") or row.get("question") or row.get("id")
            worst[name] = {"level": level, "value": rating, "row": row_ref}

    results: List[Finding] = []
    for name, hit in worst.items():
        if hit["level"] >= 2:
            grade = Severity.CRITICAL
        else:
            grade = Severity.WARNING
        results.append(
            Finding(
                id=f"safety.{name}",
                severity=grade,
                category=Category.RESPONSIBLE_AI,
                title=f"Content-safety hit on `{name}`",
                summary=(
                    f"Run `{newest.run_id}` produced a `{name}` rating "
                    f"of `{hit['value']}` on at least one row."
                ),
                recommendation=(
                    "Inspect the offending dataset row and the model "
                    "response, tighten the system prompt or add a safety "
                    "filter, and re-evaluate."
                ),
                source="results_history",
                evidence={
                    "layer": "eval",
                    "metric": name,
                    "value": hit["value"],
                    "row": hit.get("row"),
                    "run_id": newest.run_id,
                },
            )
        )
    return results
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _find_runtime_safety(
    monitor: AzureMonitorPayload, config: SafetyCheckConfig
) -> List[Finding]:
    """Turn runtime safety violations from the monitor payload into findings.

    Each dict in ``monitor.safety_violations`` is read for a ``hits``
    count and a ``signal`` name (default ``"content_filter"``).  Entries
    with fewer than ``config.min_runtime_hits`` hits are ignored; entries
    reaching ``config.runtime_critical_hits`` are ``CRITICAL``, the rest
    ``WARNING``.

    Entries that are not dicts, or whose ``hits`` value cannot be
    converted to an integer, are skipped so that one malformed telemetry
    row does not abort the whole layer.
    """
    findings: List[Finding] = []
    for violation in monitor.safety_violations:
        if not isinstance(violation, dict):
            continue
        try:
            hits = int(violation.get("hits", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric or non-finite count: nothing reliable to report.
            continue
        if hits < config.min_runtime_hits:
            continue
        signal = str(violation.get("signal") or "content_filter")
        severity = (
            Severity.CRITICAL
            if hits >= config.runtime_critical_hits
            else Severity.WARNING
        )
        findings.append(
            Finding(
                id=f"safety.runtime.{signal}",
                severity=severity,
                category=Category.RESPONSIBLE_AI,
                title=f"Content-filter triggers detected in production (`{signal}`)",
                summary=(
                    f"App Insights observed {hits} `{signal}` event(s) "
                    "over the lookback window. Each one is a response "
                    "the model refused to complete or a request blocked "
                    "by Azure AI Content Safety."
                ),
                recommendation=(
                    "Inspect the underlying traces in Application "
                    "Insights, identify whether the spike originates "
                    "from a single client, a regression in the system "
                    "prompt, or actual adversarial input, and adjust "
                    "guardrails accordingly."
                ),
                source="azure_monitor",
                evidence={
                    "layer": "runtime",
                    "signal": signal,
                    "hits": hits,
                },
            )
        )
    return findings
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _find_config_safety(foundry: FoundryControlPayload) -> List[Finding]:
    """Flag missing or disabled continuous-evaluation rules.

    Nothing is reported when the payload lists no agents, or when the
    diagnostics contain neither ``evaluation_rules_count`` nor
    ``evaluation_rules_warning``.  Otherwise a single finding is returned
    at most: "missing" when there are no rules at all, "disabled" when
    at least one rule has ``enabled is False``.
    """
    agents = foundry.agents
    if not agents:
        return []

    rules = foundry.evaluation_rules
    diagnostics = foundry.diagnostics or {}

    # Without either marker we cannot tell whether rules were looked up at
    # all, so staying silent avoids false positives.
    probed = (
        "evaluation_rules_count" in diagnostics
        or "evaluation_rules_warning" in diagnostics
    )
    if not probed:
        return []

    if not rules:
        return [
            Finding(
                id="safety.config.continuous_eval_missing",
                severity=Severity.WARNING,
                category=Category.RESPONSIBLE_AI,
                title="No continuous evaluation rules configured",
                summary=(
                    f"Foundry project lists {len(foundry.agents)} agent(s) "
                    "but no continuous-evaluation rules. Production "
                    "responses are not being scored on quality / safety "
                    "after deployment."
                ),
                recommendation=(
                    "Attach continuous evaluation rules to your agents "
                    "in Foundry (Operate -> Evaluations) so deployed "
                    "responses are scored against safety and quality "
                    "metrics in production."
                ),
                source="foundry_control",
                evidence={
                    "layer": "config",
                    "agents": [agent.agent_id for agent in foundry.agents],
                },
            )
        ]

    switched_off = []
    for rule in rules:
        if rule.enabled is False:
            switched_off.append(rule)
    if not switched_off:
        return []

    return [
        Finding(
            id="safety.config.continuous_eval_disabled",
            severity=Severity.WARNING,
            category=Category.RESPONSIBLE_AI,
            title="One or more continuous evaluation rules are disabled",
            summary=(
                f"{len(switched_off)} of {len(rules)} continuous "
                "evaluation rule(s) are disabled. Production safety "
                "scoring is partially or fully turned off."
            ),
            recommendation=(
                "Re-enable the disabled rules in Foundry "
                "(Operate -> Evaluations) or remove them if they "
                "are intentionally retired."
            ),
            source="foundry_control",
            evidence={
                "layer": "config",
                "disabled_rules": [rule.rule_id for rule in switched_off],
            },
        )
    ]
|
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""Spec-conformance check (Operational Excellence pillar).
|
|
2
|
+
|
|
3
|
+
Compares the project's spec-driven-development artifacts
|
|
4
|
+
(``.specify/spec.md`` + ``plan.md`` + ``tasks.md``, ``AGENTS.md``,
|
|
5
|
+
``.github/copilot-instructions.md``) against the AgentOps workspace
|
|
6
|
+
(``run.yaml``, ``.agentops/bundles/``, ``.agentops/datasets/``)
|
|
7
|
+
and flags drift between the two.
|
|
8
|
+
|
|
9
|
+
All findings live under :class:`Category.OPERATIONAL_EXCELLENCE` with
|
|
10
|
+
the ``opex.spec_conformance.*`` id prefix. Deterministic rules emit
|
|
11
|
+
``info``/``warning`` only — never ``critical`` — because spec
|
|
12
|
+
conformance is a soft signal.
|
|
13
|
+
|
|
14
|
+
The companion opt-in LLM rule
|
|
15
|
+
(``opex.spec_conformance.llm.implementation_gap``) lives in
|
|
16
|
+
:mod:`agentops.agent.llm_assist._spec_rules`.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from datetime import datetime, timezone
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Iterable, List, Optional
|
|
24
|
+
|
|
25
|
+
import yaml
|
|
26
|
+
|
|
27
|
+
from agentops.agent.config import SpecConformanceCheckConfig
|
|
28
|
+
from agentops.agent.findings import Category, Finding, Severity
|
|
29
|
+
from agentops.agent.sources.spec_detectors import (
|
|
30
|
+
DETECTORS,
|
|
31
|
+
Detector,
|
|
32
|
+
SpecDocument,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Passed as ``source=`` on every finding this module creates.
SOURCE_NAME = "spec_workspace"
|
|
36
|
+
|
|
37
|
+
def run_spec_conformance_check(
    workspace: Path,
    config: SpecConformanceCheckConfig,
) -> List[Finding]:
    """Run the deterministic spec-conformance rules over ``workspace``.

    Returns an empty list when the check is disabled.  Every selected
    detector is asked for a spec document; detectors that find none
    contribute their hint paths instead.  With no document at all, a
    single ``spec_missing`` finding is produced if any hint path exists.
    Otherwise the task, evaluator, dataset and agent rules run for every
    document and duplicates (same id and same evidence) are dropped.
    Ids listed in ``config.skip`` are removed from the result.
    """
    if not config.enabled:
        return []

    found_docs: List[SpecDocument] = []
    hints: List[Path] = []
    for detector in _select_detectors(config.detectors):
        document = detector.detect(workspace)
        if document is None:
            hints.extend(detector.hint_paths(workspace))
        else:
            found_docs.append(document)

    if not found_docs:
        fallback: List[Finding] = []
        if hints:
            fallback.append(
                Finding(
                    id="opex.spec_conformance.spec_missing",
                    severity=Severity.WARNING,
                    category=Category.OPERATIONAL_EXCELLENCE,
                    title=(
                        "Spec setup detected, but no usable specification was found"
                    ),
                    summary=(
                        "Doctor found signs that this repo uses "
                        "spec-driven development (for example "
                        "`.specify/`, `AGENTS.md`, or a "
                        "`copilot-instructions.md` shell), but could "
                        "not load a real spec body. Without that "
                        "reference, Doctor cannot check whether "
                        "bundles, datasets, tasks, and "
                        "implementation still match the intended agent "
                        "behavior."
                    ),
                    recommendation=(
                        "Add a readable spec such as `.specify/spec.md` "
                        "(spec-kit) or `AGENTS.md` that describes the "
                        "agent's intended behavior, capabilities, "
                        "evaluators, and datasets, then re-run "
                        "`agentops doctor`."
                    ),
                    source=SOURCE_NAME,
                    evidence={"hint_paths": [str(path) for path in hints]},
                )
            )
        return _filter_skipped(fallback, config.skip)

    collected: List[Finding] = []
    for document in found_docs:
        collected += _check_tasks(document, config.stale_after_days)
        collected += _check_evaluator_drift(workspace, document)
        collected += _check_dataset_drift(workspace, document)
        collected += _check_agent_drift(workspace, document)

    # Several detectors may describe the same drift; keep the first copy.
    unique: List[Finding] = []
    seen_keys = set()
    for finding in collected:
        marker = (finding.id, _evidence_key(finding))
        if marker not in seen_keys:
            seen_keys.add(marker)
            unique.append(finding)

    return _filter_skipped(unique, config.skip)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _select_detectors(names: Iterable[str]) -> List[Detector]:
    """Return the registered detectors whose name is listed in ``names``.

    Names are compared case-insensitively after trimming; blank entries
    are ignored.  When no usable name remains, every detector in
    ``DETECTORS`` is returned.
    """
    wanted = set()
    for raw in names:
        if raw and raw.strip():
            wanted.add(raw.strip().lower())
    if wanted:
        return [det for det in DETECTORS if det.name.lower() in wanted]
    return list(DETECTORS)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _filter_skipped(findings: List[Finding], skip: Iterable[str]) -> List[Finding]:
    """Drop findings whose id appears in ``skip``.

    Skip entries are trimmed and blank ones ignored.  When nothing usable
    is left to skip, the input list itself is returned unchanged.
    """
    excluded = set()
    for entry in skip:
        if entry and entry.strip():
            excluded.add(entry.strip())
    if excluded:
        return [item for item in findings if item.id not in excluded]
    return findings
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _evidence_key(f: Finding) -> str:
    """Build a deterministic string from a finding's evidence dict.

    Keys are visited in sorted order and rendered as ``key=repr(value)``,
    joined with ``|``.  Evidence that is not a dict yields ``""``.
    """
    evidence = f.evidence
    if not isinstance(evidence, dict):
        return ""
    return "|".join(f"{name}={evidence[name]!r}" for name in sorted(evidence))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _check_tasks(doc: SpecDocument, stale_after_days: int) -> List[Finding]:
    """Check a spec document's task list for stale and orphaned tasks.

    Args:
        doc: The spec document; ``tasks``, ``last_modified``, ``root`` and
            ``format`` are read.
        stale_after_days: Freshness window in days.

    Returns:
        Up to two findings:

        * ``tasks_stale`` - at least one task is unchecked and the spec's
          age exceeds ``stale_after_days``.  Skipped when
          ``last_modified`` is ``None``.
        * ``tasks_orphaned`` - a checked task mentions a path that exists
          neither under ``doc.root`` nor under ``doc.root.parent``.  Each
          missing path is listed once; the evidence holds at most ten.
    """
    findings: List[Finding] = []
    if not doc.tasks:
        return findings

    age_days: Optional[float] = None
    last_modified = doc.last_modified
    if last_modified is not None:
        if last_modified.utcoffset() is None:
            # Subtracting a naive datetime from an aware one raises
            # TypeError.  astimezone() interprets a naive value as local
            # time, which is how naive datetimes are conventionally read.
            last_modified = last_modified.astimezone()
        now = datetime.now(timezone.utc)
        age_days = (now - last_modified).total_seconds() / 86400.0

    unchecked = [t for t in doc.tasks if not t.checked]
    if unchecked and age_days is not None and age_days > stale_after_days:
        findings.append(
            Finding(
                id="opex.spec_conformance.tasks_stale",
                severity=Severity.WARNING,
                category=Category.OPERATIONAL_EXCELLENCE,
                title="Spec tasks have been left open past the freshness window",
                summary=(
                    f"Doctor found {len(unchecked)} unchecked task(s) "
                    "in the spec (for example `tasks.md` in a spec-kit "
                    "workspace), and the spec has not been updated for "
                    f"{age_days:.1f} day(s). The configured freshness "
                    f"window is {stale_after_days} day(s). This usually "
                    "means the implementation plan is no longer "
                    "trustworthy: either the work is done but the tasks "
                    "were not checked off, the tasks are no longer "
                    "relevant, or the agent behavior changed without the "
                    "spec being refreshed."
                ),
                recommendation=(
                    "Review the open tasks. Check off completed work, "
                    "remove tasks that no longer apply, or update the "
                    "spec so the task list reflects the current agent "
                    "behavior and evaluation plan."
                ),
                source=SOURCE_NAME,
                evidence={
                    "format": doc.format,
                    "open_tasks": len(unchecked),
                    "age_days": round(age_days, 2),
                    "threshold_days": stale_after_days,
                },
            )
        )

    orphans: List[str] = []
    examined = set()  # paths already checked, so repeats are neither re-tested nor re-listed
    for task in doc.tasks:
        if not task.checked:
            continue
        for rel in task.mentioned_paths:
            if rel in examined:
                continue
            examined.add(rel)
            if (doc.root / rel).exists():
                continue
            # Try resolving from the parent of the spec root as well
            # (e.g. spec-kit lives under .specify/ but paths are
            # workspace-relative).
            if not (doc.root.parent / rel).exists():
                orphans.append(rel)

    if orphans:
        findings.append(
            Finding(
                id="opex.spec_conformance.tasks_orphaned",
                severity=Severity.WARNING,
                category=Category.OPERATIONAL_EXCELLENCE,
                title="Completed tasks reference paths that don't exist",
                summary=(
                    "One or more checked task items in the spec point "
                    "at files that aren't in the workspace. Either "
                    "the implementation was removed or the spec is "
                    "out of date."
                ),
                recommendation=(
                    "Update the spec to reflect the current code "
                    "layout, or restore the missing files."
                ),
                source=SOURCE_NAME,
                evidence={
                    "format": doc.format,
                    "missing_paths": orphans[:10],
                },
            )
        )

    return findings
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _check_evaluator_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Flag evaluators named in the spec that no bundle declares.

    Returns nothing when the spec names no evaluators or when the
    workspace bundles declare none.  Otherwise the spec names absent from
    the bundles are reported in one finding (at most ten in the evidence).
    """
    spec_names = doc.references.get("evaluators") or []
    if not spec_names:
        return []

    bundle_names = _collect_evaluator_names(workspace)
    if not bundle_names:
        return []

    absent = []
    for candidate in spec_names:
        if candidate not in bundle_names:
            absent.append(candidate)
    if not absent:
        return []

    finding = Finding(
        id="opex.spec_conformance.evaluator_drift",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Spec names evaluators that no bundle declares",
        summary=(
            "The spec mentions evaluator classes that are absent "
            "from every `.agentops/bundles/*.yaml`. The "
            "implementation isn't measuring what the spec "
            "promises."
        ),
        recommendation=(
            "Either add the missing evaluator(s) to a bundle or "
            "update the spec to reflect what the project actually "
            "evaluates."
        ),
        source=SOURCE_NAME,
        evidence={"missing_evaluators": absent[:10]},
    )
    return [finding]
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _check_dataset_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Flag dataset files named in the spec that the workspace lacks.

    Known files are the ``*.y*ml`` entries of ``.agentops/datasets`` and
    the ``*.jsonl`` entries of ``.agentops/data``, compared by file name
    only.  Returns nothing when the spec names no datasets or when no
    dataset file exists in the workspace.
    """
    spec_datasets = doc.references.get("datasets") or []
    if not spec_datasets:
        return []

    agentops_dir = workspace / ".agentops"
    known = set()
    for folder, pattern in (("datasets", "*.y*ml"), ("data", "*.jsonl")):
        for entry in (agentops_dir / folder).glob(pattern):
            known.add(entry.name)
    if not known:
        return []

    absent = [ref for ref in spec_datasets if Path(ref).name not in known]
    if not absent:
        return []

    finding = Finding(
        id="opex.spec_conformance.dataset_drift",
        severity=Severity.WARNING,
        category=Category.OPERATIONAL_EXCELLENCE,
        title="Spec references datasets that aren't in the workspace",
        summary=(
            "Dataset filenames mentioned in the spec do not "
            "exist under `.agentops/datasets/` or "
            "`.agentops/data/`."
        ),
        recommendation=(
            "Add the missing dataset file(s) under "
            "`.agentops/datasets/` (and the matching JSONL under "
            "`.agentops/data/`), or update the spec."
        ),
        source=SOURCE_NAME,
        evidence={"missing_datasets": absent[:10]},
    )
    return [finding]
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _check_agent_drift(workspace: Path, doc: SpecDocument) -> List[Finding]:
    """Flag a ``run.yaml`` agent id that the spec does not mention.

    ``.agentops/run.yaml`` is preferred, with ``run.yaml`` in the
    workspace root as fallback.  The agent id is read from
    ``target.endpoint.agent_id``.

    Returns an empty list when the spec mentions no agent ids, when no
    run file exists, when the file cannot be read, decoded or parsed,
    when any level of ``target.endpoint.agent_id`` is missing or is not a
    mapping, or when the declared id is among the ids in the spec.
    """
    mentioned = doc.references.get("agent_ids") or []
    if not mentioned:
        return []

    run_yaml = workspace / ".agentops" / "run.yaml"
    if not run_yaml.exists():
        run_yaml = workspace / "run.yaml"
        if not run_yaml.exists():
            return []

    try:
        raw = yaml.safe_load(run_yaml.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it needs
        # to be listed explicitly for a non-UTF-8 file to be tolerated.
        return []
    if not isinstance(raw, dict):
        return []

    # safe_load can return any YAML type at each level; a scalar or list
    # where a mapping is expected means "no agent id", not a crash.
    target = raw.get("target")
    endpoint = target.get("endpoint") if isinstance(target, dict) else None
    agent_id = endpoint.get("agent_id") if isinstance(endpoint, dict) else None
    declared_agent = str(agent_id or "")
    if not declared_agent:
        return []
    if declared_agent in mentioned:
        return []

    return [
        Finding(
            id="opex.spec_conformance.agent_drift",
            severity=Severity.WARNING,
            category=Category.OPERATIONAL_EXCELLENCE,
            title="Spec's agent identifier doesn't match `run.yaml`",
            summary=(
                f"`run.yaml` targets agent `{declared_agent}` but the "
                "spec mentions a different agent identifier. The "
                "evaluation is running against a different agent "
                "than the spec describes."
            ),
            recommendation=(
                "Pin `run.yaml`'s `target.endpoint.agent_id` to the "
                "agent named in the spec, or update the spec to "
                "match."
            ),
            source=SOURCE_NAME,
            evidence={
                "spec_agent_ids": mentioned[:5],
                "run_yaml_agent_id": declared_agent,
            },
        )
    ]
|
|
336
|
+
|
|
337
|
+
def _collect_evaluator_names(workspace: Path) -> set[str]:
    """Return the evaluator names declared by the workspace's bundle files.

    Every ``*.y*ml`` file in ``.agentops/bundles`` is parsed.  In each
    file's ``evaluators`` entry, a dict item contributes its ``class``
    (or, failing that, ``name``) value and a string item contributes
    itself; names are stripped.  When ``evaluators`` is a mapping, its
    keys are used as the items.

    Files that cannot be read, decoded or parsed, files whose top level
    is not a mapping, and ``evaluators`` values that are neither a
    sequence nor a mapping are skipped.  A missing bundles directory
    yields an empty set.
    """
    out: set[str] = set()
    bundles_dir = workspace / ".agentops" / "bundles"
    if not bundles_dir.is_dir():
        return out

    for bundle_path in bundles_dir.glob("*.y*ml"):
        try:
            raw = yaml.safe_load(bundle_path.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError, yaml.YAMLError):
            # UnicodeDecodeError is a ValueError, not an OSError.
            continue
        if not isinstance(raw, dict):
            continue

        entries = raw.get("evaluators") or []
        if isinstance(entries, dict):
            entries = list(entries)
        elif not isinstance(entries, (list, tuple)):
            # A scalar here would either fail to iterate (int) or be
            # split into single characters (str); neither is a name list.
            continue

        for ev in entries:
            if isinstance(ev, dict):
                name = ev.get("class") or ev.get("name")
                if isinstance(name, str):
                    out.add(name.strip())
            elif isinstance(ev, str):
                out.add(ev.strip())
    return out
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def detect_documents(
    workspace: Path,
    config: Optional[SpecConformanceCheckConfig] = None,
) -> List[SpecDocument]:
    """Return every spec document the selected detectors find in ``workspace``.

    A default ``SpecConformanceCheckConfig`` is used when ``config`` is
    omitted.  Detectors that find nothing are left out of the result.
    """
    effective = config if config else SpecConformanceCheckConfig()
    candidates = (
        detector.detect(workspace)
        for detector in _select_detectors(effective.detectors)
    )
    return [document for document in candidates if document is not None]
|