agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,39 @@
1
+ pillar,area,item_id,title,detection_source,detection_signal,doctor_check_id,status,reference_url
2
+ Security,Identity,waf.security.local_auth_disabled,Disable local (key-based) auth on the AI account,azure_resources,account.disable_local_auth == true,waf.security.local_auth_disabled,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
3
+ Security,Identity,waf.security.managed_identity,Use managed identity for the AI account,azure_resources,"account.identity.type in {SystemAssigned, UserAssigned}",waf.security.managed_identity,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
4
+ Security,Telemetry,waf.security.diagnostic_settings,Diagnostic settings forward logs to a workspace,azure_resources,account has at least one diagnostic setting with a workspace_id,waf.security.diagnostic_settings,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
5
+ ResponsibleAI,ContentSafety,waf.rai.safety_metric_hit,Content-safety evaluator flagged a row in the latest eval,results_history,"row metric >= severity_floor on Violence/SelfHarm/Sexual/HateUnfairness",safety,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-metrics-built-in
6
+ ResponsibleAI,ContentSafety,waf.rai.runtime_content_filter,Content-filter triggers detected in production,azure_monitor,KQL hits on gen_ai.response.finish_reasons contains content_filter,safety.runtime.content_filter,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/content-filtering
7
+ ResponsibleAI,ContinuousEval,waf.rai.continuous_eval_missing,Continuous evaluation rules attached to agents,foundry_control,foundry.evaluation_rules is empty while agents exist,safety.config.continuous_eval_missing,implemented,https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation
8
+ ResponsibleAI,ContinuousEval,waf.rai.continuous_eval_disabled,Continuous evaluation rules enabled,foundry_control,any evaluation_rule.enabled == false,safety.config.continuous_eval_disabled,implemented,https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation
9
+ OperationalExcellence,CI-CD,waf.opex.unpinned_agent,Agent target is pinned to a version,workspace_files,"agentops.yaml agent: lacks :version (or :latest)",opex.unpinned_agent,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
10
+ OperationalExcellence,CI-CD,waf.opex.thresholds_defined,agentops.yaml declares explicit thresholds,workspace_files,agentops.yaml has no thresholds: block,opex.no_thresholds,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
11
+ OperationalExcellence,CI-CD,waf.opex.pr_gate,Repository has an AgentOps PR gate,workspace_files,.github/workflows/agentops-pr.yml exists,opex.no_pr_gate,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
12
+ OperationalExcellence,CI-CD,waf.opex.deploy_gate,Repository has AgentOps deploy workflows,workspace_files,at least one .github/workflows/agentops-deploy-*.yml exists,opex.no_deploy_workflow,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
13
+ OperationalExcellence,Governance,waf.opex.results_gitignored,Eval results are not committed to git,workspace_files,.agentops/results/ entry in any reachable .gitignore,opex.results_not_gitignored,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
14
+ OperationalExcellence,Governance,waf.opex.dataset_versioned,Dataset YAML files declare a version,workspace_files,.agentops/datasets/*.yaml has a top-level version: field,opex.unversioned_dataset,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
15
+ OperationalExcellence,Telemetry,waf.opex.stale_evaluation,Evaluations are run regularly,results_history,latest run timestamp older than stale_after_days,opex.stale_evaluation,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
16
+ Quality,Regression,waf.quality.metric_drop,No regression in evaluation metrics,results_history,latest_metric - baseline_metric > threshold_drop,regression,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-approach-gen-ai
17
+ Performance,Latency,waf.perf.latency_threshold,p95 and eval-average latency under threshold,azure_monitor + results_history,"p95_seconds > p95_threshold_seconds or avg_latency > threshold",latency,implemented,https://learn.microsoft.com/azure/well-architected/ai/performance-efficiency
18
+ Reliability,Telemetry,waf.reliability.production_error_rate,Production error rate under threshold,azure_monitor,error_rate > rate_threshold,errors.production_rate,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
19
+ Reliability,Telemetry,waf.reliability.foundry_run_failures,Foundry agent runs not failing,foundry_control,foundry.failure_rate > rate_threshold,errors.foundry_runs,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
20
+ Reliability,Telemetry,waf.reliability.no_runtime_telemetry,Production telemetry is wired to the agent,azure_monitor,monitor.status == ok and request_count == 0 over lookback,errors.no_runtime_telemetry,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
21
+ OperationalExcellence,Governance,waf.opex.bundle_versioned,Bundle YAML files declare a version,workspace_files,.agentops/bundles/*.yaml has a top-level version: field,opex.unversioned_bundle,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
22
+ OperationalExcellence,Retention,waf.opex.results_dir_bloat,Eval results directory has an archival policy,workspace_files,.agentops/results/ holds <= 50 run folders,opex.results_dir_bloat,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
23
+ OperationalExcellence,Stability,waf.opex.flaky_metric,Eval metrics are stable across runs,results_history,coefficient of variation across last N runs < flaky_cv_threshold,opex.flaky_metric,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
24
+ OperationalExcellence,Cost,waf.opex.workflow_concurrency,AgentOps workflows declare a concurrency block,workspace_files,agentops-pr.yml / agentops-deploy-*.yml has a top-level concurrency: block,opex.workflow_concurrency_lock,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
25
+ OperationalExcellence,Reproducibility,waf.opex.workflow_action_pinning,AgentOps workflows pin actions by commit SHA,workspace_files,every uses: pins to a 40-char SHA,opex.workflow_action_sha_pinning,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
26
+ OperationalExcellence,Foundry,waf.opex.foundry_control_configured,Foundry control plane source is wired,foundry_control,foundry_control.diagnostics.status == ok,opex.no_foundry_control_configured,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
27
+ ResponsibleAI,Transparency,waf.rai.prompt_transparency,Agent system prompt declares AI nature and source citation,foundry_control,LLM judges agent instructions for AI-disclosure / source-citation / role-scope,responsible_ai.llm.prompt_transparency,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
28
+ ResponsibleAI,SafetyGuardrails,waf.rai.prompt_safety_guardrails,Agent system prompt has explicit refusal patterns,foundry_control,LLM judges instructions for refusal guidance across the four harm categories,responsible_ai.llm.prompt_safety_guardrails,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
29
+ ResponsibleAI,SafetyGuardrails,waf.rai.prompt_jailbreak_surface,Agent system prompt resists jailbreak / injection trapdoors,foundry_control,LLM scans instructions for override-phrasing / embedded secrets / unbounded role-play,responsible_ai.llm.prompt_jailbreak_surface,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
30
+ ResponsibleAI,Privacy,waf.rai.dataset_pii_risk,Evaluation dataset is free of PII,workspace_files,LLM scans .agentops/data/*.jsonl sample for personal information,responsible_ai.llm.dataset_pii_risk,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
31
+ ResponsibleAI,Fairness,waf.rai.dataset_bias_signals,Evaluation dataset covers diverse cohorts,workspace_files,LLM judges dataset sample for demographic / role / domain / tone skew,responsible_ai.llm.dataset_bias_signals,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
32
+ OperationalExcellence,EvaluatorCoverage,waf.opex.llm_bundle_coverage,Bundle covers the evaluators the agent needs,workspace_files,LLM compares bundle YAML against agent description and recommends missing built-ins,opex.llm.bundle_coverage,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
33
+ OperationalExcellence,Documentation,waf.opex.spec_present,Spec-driven scaffolding has spec content,workspace_files,detector hint paths present but no spec document found,opex.spec_conformance.spec_missing,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
34
+ OperationalExcellence,Documentation,waf.opex.tasks_fresh,Spec tasks are kept fresh,workspace_files,tasks.md unchecked items older than stale_after_days,opex.spec_conformance.tasks_stale,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
35
+ OperationalExcellence,Documentation,waf.opex.tasks_grounded,Completed spec tasks reference paths that exist,workspace_files,checked task references a path missing from the workspace,opex.spec_conformance.tasks_orphaned,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
36
+ OperationalExcellence,Documentation,waf.opex.evaluator_alignment,Spec evaluators match bundle evaluators,workspace_files,evaluator mentioned in spec absent from every bundle YAML,opex.spec_conformance.evaluator_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
37
+ OperationalExcellence,Documentation,waf.opex.dataset_alignment,Spec datasets exist in workspace,workspace_files,dataset mentioned in spec absent from .agentops/datasets or .agentops/data,opex.spec_conformance.dataset_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
38
+ OperationalExcellence,Documentation,waf.opex.agent_alignment,Spec agent id matches run.yaml,workspace_files,spec mentions an agent_id that doesn't match run.yaml target.endpoint.agent_id,opex.spec_conformance.agent_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
39
+ OperationalExcellence,Documentation,waf.opex.llm_spec_implementation_gap,LLM judge cross-checks spec vs implementation,workspace_files,LLM judge compares spec capabilities to workspace fingerprint,opex.spec_conformance.llm.implementation_gap,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
@@ -0,0 +1,16 @@
1
+ """LLM-judged Doctor checks.
2
+
3
+ This package adds an opt-in layer of Doctor checks that invoke a judge
4
+ model (via Foundry's OpenAI client) to evaluate semantic signals -
5
+ prompt quality, dataset PII risk, bias, bundle coverage. See
6
+ ``docs/doctor-explained.md`` for the full rationale.
7
+
8
+ Entry point is :func:`run_llm_assist_check`. Everything else here is
9
+ implementation detail; do not import from sub-modules directly.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from agentops.agent.llm_assist._engine import run_llm_assist_check
15
+
16
+ __all__ = ["run_llm_assist_check"]
@@ -0,0 +1,124 @@
1
+ """Base helpers for individual LLM-judged rules.
2
+
3
+ Every rule shares the same shape: a focused system prompt, a Pydantic
4
+ schema for the verdict, and a small builder that converts a verdict
5
+ into a :class:`Finding`. This module factors out the duplicate code.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import hashlib
11
+ from dataclasses import dataclass
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field
15
+
16
+ from agentops.agent.findings import Category, Finding, Severity
17
+ from agentops.agent.llm_assist._client import JudgementMeta
18
+
19
+
20
class BaseVerdict(BaseModel):
    """Minimum schema every judge response must satisfy.

    Rule-specific verdict models subclass this one to add their own
    fields on top of the shared ones below.
    """

    # Keys the schema does not declare are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    # Free-form label; the description asks the judge for Low/Medium/High.
    risk: str = Field(description="Low, Medium, or High")

    # Validated to the closed range [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)

    # The judge's explanation for the verdict.
    reasoning: str

    # Optional: when the judge omits the key a fresh empty list is used.
    suggestions: List[str] = Field(
        description=(
            "Two to four concrete, actionable fixes the user can apply, "
            "tailored to what the judge actually observed."
        ),
        default_factory=list,
    )
34
+
35
+
36
def hash_text(*chunks: str) -> str:
    """Return a short, stable fingerprint of *chunks*.

    Every chunk is UTF-8 encoded (characters that cannot be encoded are
    replaced) and followed by a NUL byte, so the position of chunk
    boundaries influences the result. The fingerprint is the first 16
    hex characters of the SHA-256 digest of that byte stream.
    """
    payload = b"".join(
        piece.encode("utf-8", errors="replace") + b"\0" for piece in chunks
    )
    return hashlib.sha256(payload).hexdigest()[:16]
42
+
43
+
44
def normalised_risk(verdict: BaseVerdict) -> str:
    """Return verdict.risk lower-cased and bounded to {low, medium, high}.

    A handful of synonyms are folded onto the three canonical levels;
    a missing, empty, or unrecognised label is treated as ``"low"``.
    """
    aliases = {
        # Canonical levels map to themselves.
        "low": "low",
        "medium": "medium",
        "high": "high",
        # Synonyms accepted from the judge.
        "none": "low",
        "ok": "low",
        "clean": "low",
        "warning": "medium",
        "moderate": "medium",
        "critical": "high",
        "severe": "high",
    }
    label = (getattr(verdict, "risk", "") or "").strip().lower()
    return aliases.get(label, "low")
56
+
57
+
58
def severity_for(risk: str) -> Severity:
    """Map a normalised risk label to the severity of the finding.

    ``"medium"`` and ``"high"`` both yield ``Severity.WARNING``; any
    other label yields ``Severity.INFO``.
    """
    # LLM findings cap at WARNING by design.
    if risk not in {"medium", "high"}:
        return Severity.INFO
    return Severity.WARNING
61
+
62
+
63
@dataclass
class FindingBuilderArgs:
    """Inputs consumed by :func:`build_llm_finding`.

    All fields are required and are passed in declaration order when
    constructed positionally.
    """

    # Identity and presentation of the resulting finding.
    rule_id: str
    title: str
    category: Category

    # Formatted with ``risk`` and ``reasoning`` keyword arguments.
    summary_template: str

    # Canonical guidance; judge suggestions, when present, are appended.
    recommendation: str

    # The judge's parsed response and the metadata of the call.
    verdict: BaseVerdict
    meta: JudgementMeta

    # Rule-specific entries merged into the finding's evidence mapping.
    extra_evidence: Dict[str, Any]
73
+
74
+
75
def build_llm_finding(args: FindingBuilderArgs) -> Optional[Finding]:
    """Convert a judge verdict into a :class:`Finding`.

    Returns ``None`` when the normalised risk is ``"low"``. Otherwise the
    finding carries the rule's identity, a summary rendered from
    ``args.summary_template``, the canonical recommendation (extended
    with the judge's own suggestions when it supplied any) and an
    evidence mapping describing the verdict and the judge call.
    """
    verdict = args.verdict
    meta = args.meta

    risk = normalised_risk(verdict)
    if risk == "low":
        return None
    severity = severity_for(risk)

    # Stringify and trim whatever the judge returned, dropping blanks.
    trimmed = (
        str(item).strip() for item in getattr(verdict, "suggestions", []) or []
    )
    suggestions: List[str] = [text for text in trimmed if text]

    # When the judge produced concrete suggestions, append them to the
    # canonical guidance so case-specific fixes sit right next to it.
    # Only the first six are rendered as bullets.
    recommendation = args.recommendation
    if suggestions:
        bullets = "\n".join("- " + text for text in suggestions[:6])
        recommendation = (
            f"{args.recommendation}\n\n"
            "**Concrete fixes the judge model suggested for this "
            f"specific case:**\n{bullets}"
        )

    evidence: Dict[str, Any] = {
        "confidence": round(verdict.confidence, 3),
        "reasoning": verdict.reasoning,
        "model_deployment": meta.model_deployment,
        "cache_hit": meta.cache_hit,
        "risk": risk,
    }
    if suggestions:
        # The evidence keeps every suggestion, not just the bulleted ones.
        evidence["suggestions"] = suggestions
    # Rule-specific entries are merged after the core keys, so they can
    # override them; token usage is written last.
    evidence.update(args.extra_evidence)
    if meta.input_tokens or meta.output_tokens:
        evidence["tokens"] = {
            "input": meta.input_tokens,
            "output": meta.output_tokens,
        }

    summary = args.summary_template.format(
        risk=risk, reasoning=verdict.reasoning
    )
    return Finding(
        id=args.rule_id,
        severity=severity,
        category=args.category,
        title=f"[LLM-judged] {args.title}",
        summary=summary,
        recommendation=recommendation,
        source="llm_judge",
        evidence=evidence,
    )
@@ -0,0 +1,154 @@
1
+ """LLM-judged Operational Excellence check: evaluator-bundle coverage.
2
+
3
+ Reads the project's evaluator bundle YAML and a short agent description
4
+ excerpt, then asks the judge model whether the bundle covers the
5
+ evaluators a project of that shape typically needs (e.g. a RAG agent
6
+ without ``GroundednessEvaluator``).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+ from typing import List, Optional
13
+
14
+ import yaml
15
+
16
+ from agentops.agent.findings import Category, Finding
17
+ from agentops.agent.llm_assist._base import (
18
+ BaseVerdict,
19
+ FindingBuilderArgs,
20
+ build_llm_finding,
21
+ hash_text,
22
+ )
23
+ from agentops.agent.llm_assist._client import LLMJudge
24
+ from agentops.agent.sources.foundry_control import FoundryAgentSummary
25
+
26
+
27
+ _COVERAGE_SYSTEM = """You audit a project's evaluator bundle for the
28
+ Microsoft Well-Architected Framework for AI Operational Excellence
29
+ pillar. You receive:
30
+
31
+ 1. The bundle YAML (evaluators list + thresholds).
32
+ 2. The agent's name + a short instructions excerpt that hints at its
33
+ use case (RAG, conversational, tool-using, etc.).
34
+
35
+ Decide which Foundry / azure-ai-evaluation built-in evaluators are
36
+ notably missing for that use case. Examples:
37
+
38
+ * RAG agent without GroundednessEvaluator or RetrievalEvaluator.
39
+ * Tool-using agent without ToolCallAccuracyEvaluator.
40
+ * Customer-support chat agent without CoherenceEvaluator.
41
+ * Any agent serving end-users without content-safety evaluators
42
+ (Violence, SelfHarm, Sexual, HateUnfairness).
43
+
44
+ Respond as compact JSON. Do NOT recommend custom evaluators; stick to
45
+ Foundry / azure-ai-evaluation built-ins.
46
+
47
+ {"risk": "low|medium|high", "confidence": <0.0-1.0>,
48
+ "reasoning": "<one short paragraph>",
49
+ "suggestions": ["<fix 1>", "<fix 2>", "<fix 3>"],
50
+ "missing_evaluators": ["GroundednessEvaluator", ...]}
51
+ """
52
+
53
+
54
class CoverageVerdict(BaseVerdict):
    """Judge response schema for the bundle-coverage rule."""

    # Evaluator names the judge reports as missing; empty when the judge
    # omits the key.
    missing_evaluators: List[str] = []
56
+
57
+
58
def _load_bundle(workspace: Path) -> Optional[str]:
    """Return the text of the workspace's first evaluator bundle, if any.

    Looks for ``*.yaml`` files in ``<workspace>/.agentops/bundles`` and
    reads the first one in sorted path order as UTF-8.

    Returns ``None`` when the directory does not exist, contains no
    ``*.yaml`` file, or the selected file cannot be read or decoded.
    """
    bundles = workspace / ".agentops" / "bundles"
    if not bundles.is_dir():
        return None
    # Only the first bundle in sorted order is considered.
    bundle_path = next(iter(sorted(bundles.glob("*.yaml"))), None)
    if bundle_path is None:
        return None
    try:
        return bundle_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Best-effort: a file that is unreadable *or* not valid UTF-8 is
        # skipped. UnicodeDecodeError is a ValueError, not an OSError, so
        # it has to be listed explicitly or it would abort the check.
        return None
71
+
72
+
73
def _agent_excerpt(agents: List[FoundryAgentSummary]) -> Optional[str]:
    """Build the short agent description sent to the judge.

    The first agent with non-blank instructions supplies the excerpt:
    its name (falling back to the agent id), its model (or ``unknown``)
    and its instructions, trimmed and cut to 800 characters followed by
    ``...`` when longer. If no agent has usable instructions, the first
    agent is described with an "(instructions unavailable)" marker.

    Returns ``None`` when *agents* is empty.
    """
    for agent in agents:
        # Trim before testing for emptiness so whitespace-only
        # instructions are skipped instead of yielding an empty excerpt.
        excerpt = (agent.instructions or "").strip()
        if not excerpt:
            continue
        if len(excerpt) > 800:
            excerpt = excerpt[:800] + "..."
        return (
            f"Agent name: {agent.name or agent.agent_id}\n"
            f"Model: {agent.model or 'unknown'}\n\n"
            f"Instructions excerpt:\n{excerpt}"
        )
    if agents:
        # No agent exposed instructions; still give the judge the basics.
        a = agents[0]
        return (
            f"Agent name: {a.name or a.agent_id}\nModel: "
            f"{a.model or 'unknown'}\n(instructions unavailable)"
        )
    return None
92
+
93
+
94
def check_bundle_coverage(
    judge: LLMJudge,
    workspace: Path,
    agents: List[FoundryAgentSummary],
    min_confidence: float,
) -> List[Finding]:
    """Ask the judge whether the evaluator bundle fits the agent.

    The check is skipped (empty list) when there is no bundle text, no
    agent context, the bundle is not parseable YAML, the judge call
    returns ``None``, or the verdict's confidence is below
    *min_confidence*. Otherwise the verdict is handed to
    :func:`build_llm_finding`, and the resulting finding - if one is
    produced - is returned as a single-element list.
    """
    bundle_text = _load_bundle(workspace)
    if bundle_text is None:
        return []

    agent_excerpt = _agent_excerpt(agents)
    if agent_excerpt is None:
        return []

    # Sanity check: skip when the YAML is unparseable. The parsed value
    # itself is not used; the judge receives the raw text.
    try:
        yaml.safe_load(bundle_text)
    except yaml.YAMLError:
        return []

    inputs_hash = hash_text("bundle_coverage", bundle_text, agent_excerpt)
    user_prompt = (
        f"Bundle YAML:\n```yaml\n{bundle_text}\n```\n\n"
        f"Agent context:\n{agent_excerpt}"
    )
    outcome = judge.call(
        system=_COVERAGE_SYSTEM,
        user=user_prompt,
        schema=CoverageVerdict,
        inputs_hash=inputs_hash,
    )
    if outcome is None:
        return []

    verdict, meta = outcome
    if verdict.confidence < min_confidence:
        return []

    summary_template = (
        "The judge model identified built-in evaluators that "
        "fit this agent's use case but are not in the bundle "
        "(risk={risk}): {reasoning}"
    )
    recommendation = (
        "Review the suggested evaluators in this finding's "
        "evidence and add them to `.agentops/bundles/*.yaml` "
        "if they fit. Use the canonical names from "
        "`docs/foundry-evaluation-sdk-built-in-evaluators.md`."
    )
    builder_args = FindingBuilderArgs(
        rule_id="opex.llm.bundle_coverage",
        title="Evaluator bundle may be missing built-ins for this agent",
        category=Category.OPERATIONAL_EXCELLENCE,
        summary_template=summary_template,
        recommendation=recommendation,
        verdict=verdict,
        meta=meta,
        extra_evidence={
            "missing_evaluators": getattr(verdict, "missing_evaluators", []),
        },
    )

    finding = build_llm_finding(builder_args)
    if finding is None:
        return []
    return [finding]