agentops-accelerator 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentops/__init__.py +10 -0
- agentops/__main__.py +6 -0
- agentops/agent/__init__.py +12 -0
- agentops/agent/_legacy_ids.py +92 -0
- agentops/agent/analyzer.py +207 -0
- agentops/agent/checks/__init__.py +1 -0
- agentops/agent/checks/catalog.py +880 -0
- agentops/agent/checks/errors.py +279 -0
- agentops/agent/checks/foundry_config.py +75 -0
- agentops/agent/checks/latency.py +84 -0
- agentops/agent/checks/opex.py +157 -0
- agentops/agent/checks/opex_workspace.py +874 -0
- agentops/agent/checks/posture.py +36 -0
- agentops/agent/checks/posture_rules/__init__.py +53 -0
- agentops/agent/checks/posture_rules/content_filter.py +59 -0
- agentops/agent/checks/posture_rules/diagnostics.py +74 -0
- agentops/agent/checks/posture_rules/local_auth.py +55 -0
- agentops/agent/checks/posture_rules/managed_identity.py +59 -0
- agentops/agent/checks/posture_rules/network.py +68 -0
- agentops/agent/checks/regression.py +78 -0
- agentops/agent/checks/release_readiness.py +182 -0
- agentops/agent/checks/safety.py +247 -0
- agentops/agent/checks/spec_conformance.py +375 -0
- agentops/agent/cockpit.py +5159 -0
- agentops/agent/config.py +240 -0
- agentops/agent/findings.py +113 -0
- agentops/agent/history.py +142 -0
- agentops/agent/knowledge/__init__.py +182 -0
- agentops/agent/knowledge/waf-checklist.csv +39 -0
- agentops/agent/llm_assist/__init__.py +16 -0
- agentops/agent/llm_assist/_base.py +124 -0
- agentops/agent/llm_assist/_bundle_rule.py +154 -0
- agentops/agent/llm_assist/_client.py +347 -0
- agentops/agent/llm_assist/_dataset_rules.py +191 -0
- agentops/agent/llm_assist/_engine.py +106 -0
- agentops/agent/llm_assist/_prompt_rules.py +291 -0
- agentops/agent/llm_assist/_spec_rules.py +235 -0
- agentops/agent/production_telemetry.py +430 -0
- agentops/agent/report.py +207 -0
- agentops/agent/server/__init__.py +1 -0
- agentops/agent/server/app.py +84 -0
- agentops/agent/server/auth.py +94 -0
- agentops/agent/server/chat.py +44 -0
- agentops/agent/server/protocol.py +72 -0
- agentops/agent/sources/__init__.py +1 -0
- agentops/agent/sources/azure_monitor.py +523 -0
- agentops/agent/sources/azure_resources.py +602 -0
- agentops/agent/sources/foundry_control.py +174 -0
- agentops/agent/sources/results_history.py +494 -0
- agentops/agent/sources/spec_detectors/__init__.py +42 -0
- agentops/agent/sources/spec_detectors/_base.py +58 -0
- agentops/agent/sources/spec_detectors/agents_md.py +75 -0
- agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
- agentops/agent/time_range.py +117 -0
- agentops/cli/__init__.py +1 -0
- agentops/cli/app.py +4823 -0
- agentops/core/__init__.py +1 -0
- agentops/core/agentops_config.py +592 -0
- agentops/core/config_loader.py +22 -0
- agentops/core/evaluators.py +480 -0
- agentops/core/release_evidence.py +56 -0
- agentops/core/results.py +117 -0
- agentops/mcp/__init__.py +10 -0
- agentops/mcp/server.py +232 -0
- agentops/pipeline/__init__.py +8 -0
- agentops/pipeline/cloud_results.py +189 -0
- agentops/pipeline/cloud_runner.py +901 -0
- agentops/pipeline/comparison.py +108 -0
- agentops/pipeline/diagnostics.py +51 -0
- agentops/pipeline/invocations.py +535 -0
- agentops/pipeline/official_eval.py +414 -0
- agentops/pipeline/orchestrator.py +775 -0
- agentops/pipeline/prompt_deploy.py +377 -0
- agentops/pipeline/publisher.py +121 -0
- agentops/pipeline/reporter.py +202 -0
- agentops/pipeline/runtime.py +409 -0
- agentops/pipeline/thresholds.py +84 -0
- agentops/services/__init__.py +1 -0
- agentops/services/cicd.py +720 -0
- agentops/services/eval_analysis.py +848 -0
- agentops/services/evidence_pack.py +757 -0
- agentops/services/initializer.py +86 -0
- agentops/services/preflight.py +470 -0
- agentops/services/setup_wizard.py +709 -0
- agentops/services/skills.py +643 -0
- agentops/services/trace_promotion.py +300 -0
- agentops/services/workflow_analysis.py +1129 -0
- agentops/templates/.gitignore +15 -0
- agentops/templates/__init__.py +1 -0
- agentops/templates/agent-server/Dockerfile +23 -0
- agentops/templates/agent-server/README.md +61 -0
- agentops/templates/agent-server/main.bicep +94 -0
- agentops/templates/agent.yaml +87 -0
- agentops/templates/agentops.yaml +58 -0
- agentops/templates/foundry.svg +71 -0
- agentops/templates/icon.png +0 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
- agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
- agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
- agentops/templates/project.gitignore +36 -0
- agentops/templates/sample-traces.jsonl +3 -0
- agentops/templates/skills/agentops-agent/SKILL.md +137 -0
- agentops/templates/skills/agentops-config/SKILL.md +113 -0
- agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
- agentops/templates/skills/agentops-eval/SKILL.md +189 -0
- agentops/templates/skills/agentops-report/SKILL.md +71 -0
- agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
- agentops/templates/smoke.jsonl +3 -0
- agentops/templates/waf-checklist.README.md +84 -0
- agentops/templates/waf-checklist.csv +22 -0
- agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
- agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
- agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
- agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
- agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
- agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
- agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
- agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/workflows/agentops-pr.yml +148 -0
- agentops/templates/workflows/agentops-watchdog.yml +122 -0
- agentops/utils/__init__.py +1 -0
- agentops/utils/azd_env.py +435 -0
- agentops/utils/azure_endpoints.py +62 -0
- agentops/utils/colors.py +47 -0
- agentops/utils/dotenv_loader.py +105 -0
- agentops/utils/foundry_discovery.py +229 -0
- agentops/utils/logging.py +59 -0
- agentops/utils/telemetry.py +554 -0
- agentops/utils/yaml.py +36 -0
- agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
- agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
- agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
- agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
- agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
- agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
pillar,area,item_id,title,detection_source,detection_signal,doctor_check_id,status,reference_url
|
|
2
|
+
Security,Identity,waf.security.local_auth_disabled,Disable local (key-based) auth on the AI account,azure_resources,account.disable_local_auth == true,waf.security.local_auth_disabled,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
|
|
3
|
+
Security,Identity,waf.security.managed_identity,Use managed identity for the AI account,azure_resources,"account.identity.type in {SystemAssigned, UserAssigned}",waf.security.managed_identity,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
|
|
4
|
+
Security,Telemetry,waf.security.diagnostic_settings,Diagnostic settings forward logs to a workspace,azure_resources,account has at least one diagnostic setting with a workspace_id,waf.security.diagnostic_settings,implemented,https://learn.microsoft.com/azure/well-architected/ai/security
|
|
5
|
+
ResponsibleAI,ContentSafety,waf.rai.safety_metric_hit,Content-safety evaluator flagged a row in the latest eval,results_history,"row metric >= severity_floor on Violence/SelfHarm/Sexual/HateUnfairness",safety,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-metrics-built-in
|
|
6
|
+
ResponsibleAI,ContentSafety,waf.rai.runtime_content_filter,Content-filter triggers detected in production,azure_monitor,KQL hits on gen_ai.response.finish_reasons contains content_filter,safety.runtime.content_filter,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/content-filtering
|
|
7
|
+
ResponsibleAI,ContinuousEval,waf.rai.continuous_eval_missing,Continuous evaluation rules attached to agents,foundry_control,foundry.evaluation_rules is empty while agents exist,safety.config.continuous_eval_missing,implemented,https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation
|
|
8
|
+
ResponsibleAI,ContinuousEval,waf.rai.continuous_eval_disabled,Continuous evaluation rules enabled,foundry_control,any evaluation_rule.enabled == false,safety.config.continuous_eval_disabled,implemented,https://learn.microsoft.com/azure/ai-foundry/how-to/online-evaluation
|
|
9
|
+
OperationalExcellence,CI-CD,waf.opex.unpinned_agent,Agent target is pinned to a version,workspace_files,"agentops.yaml agent: lacks :version (or uses :latest)",opex.unpinned_agent,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
10
|
+
OperationalExcellence,CI-CD,waf.opex.thresholds_defined,agentops.yaml declares explicit thresholds,workspace_files,agentops.yaml has no thresholds: block,opex.no_thresholds,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
11
|
+
OperationalExcellence,CI-CD,waf.opex.pr_gate,Repository has an AgentOps PR gate,workspace_files,.github/workflows/agentops-pr.yml exists,opex.no_pr_gate,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
12
|
+
OperationalExcellence,CI-CD,waf.opex.deploy_gate,Repository has AgentOps deploy workflows,workspace_files,at least one .github/workflows/agentops-deploy-*.yml exists,opex.no_deploy_workflow,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
13
|
+
OperationalExcellence,Governance,waf.opex.results_gitignored,Eval results are not committed to git,workspace_files,.agentops/results/ entry in any reachable .gitignore,opex.results_not_gitignored,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
14
|
+
OperationalExcellence,Governance,waf.opex.dataset_versioned,Dataset YAML files declare a version,workspace_files,.agentops/datasets/*.yaml has a top-level version: field,opex.unversioned_dataset,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
15
|
+
OperationalExcellence,Telemetry,waf.opex.stale_evaluation,Evaluations are run regularly,results_history,latest run timestamp older than stale_after_days,opex.stale_evaluation,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
16
|
+
Quality,Regression,waf.quality.metric_drop,No regression in evaluation metrics,results_history,latest_metric - baseline_metric > threshold_drop,regression,implemented,https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-approach-gen-ai
|
|
17
|
+
Performance,Latency,waf.perf.latency_threshold,p95 and eval-average latency under threshold,azure_monitor + results_history,"p95_seconds > p95_threshold_seconds or avg_latency > threshold",latency,implemented,https://learn.microsoft.com/azure/well-architected/ai/performance-efficiency
|
|
18
|
+
Reliability,Telemetry,waf.reliability.production_error_rate,Production error rate under threshold,azure_monitor,error_rate > rate_threshold,errors.production_rate,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
|
|
19
|
+
Reliability,Telemetry,waf.reliability.foundry_run_failures,Foundry agent runs not failing,foundry_control,foundry.failure_rate > rate_threshold,errors.foundry_runs,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
|
|
20
|
+
Reliability,Telemetry,waf.reliability.no_runtime_telemetry,Production telemetry is wired to the agent,azure_monitor,monitor.status == ok and request_count == 0 over lookback,errors.no_runtime_telemetry,implemented,https://learn.microsoft.com/azure/well-architected/ai/reliability
|
|
21
|
+
OperationalExcellence,Governance,waf.opex.bundle_versioned,Bundle YAML files declare a version,workspace_files,.agentops/bundles/*.yaml has a top-level version: field,opex.unversioned_bundle,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
22
|
+
OperationalExcellence,Retention,waf.opex.results_dir_bloat,Eval results directory has an archival policy,workspace_files,.agentops/results/ holds <= 50 run folders,opex.results_dir_bloat,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
23
|
+
OperationalExcellence,Stability,waf.opex.flaky_metric,Eval metrics are stable across runs,results_history,coefficient of variation across last N runs < flaky_cv_threshold,opex.flaky_metric,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
24
|
+
OperationalExcellence,Cost,waf.opex.workflow_concurrency,AgentOps workflows declare a concurrency block,workspace_files,agentops-pr.yml / agentops-deploy-*.yml has a top-level concurrency: block,opex.workflow_concurrency_lock,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
25
|
+
OperationalExcellence,Reproducibility,waf.opex.workflow_action_pinning,AgentOps workflows pin actions by commit SHA,workspace_files,every uses: pins to a 40-char SHA,opex.workflow_action_sha_pinning,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
26
|
+
OperationalExcellence,Foundry,waf.opex.foundry_control_configured,Foundry control plane source is wired,foundry_control,foundry_control.diagnostics.status == ok,opex.no_foundry_control_configured,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
27
|
+
ResponsibleAI,Transparency,waf.rai.prompt_transparency,Agent system prompt declares AI nature and source citation,foundry_control,LLM judges agent instructions for AI-disclosure / source-citation / role-scope,responsible_ai.llm.prompt_transparency,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
|
|
28
|
+
ResponsibleAI,SafetyGuardrails,waf.rai.prompt_safety_guardrails,Agent system prompt has explicit refusal patterns,foundry_control,LLM judges instructions for refusal guidance across the four harm categories,responsible_ai.llm.prompt_safety_guardrails,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
|
|
29
|
+
ResponsibleAI,SafetyGuardrails,waf.rai.prompt_jailbreak_surface,Agent system prompt resists jailbreak / injection trapdoors,foundry_control,LLM scans instructions for override-phrasing / embedded secrets / unbounded role-play,responsible_ai.llm.prompt_jailbreak_surface,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
|
|
30
|
+
ResponsibleAI,Privacy,waf.rai.dataset_pii_risk,Evaluation dataset is free of PII,workspace_files,LLM scans .agentops/data/*.jsonl sample for personal information,responsible_ai.llm.dataset_pii_risk,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
|
|
31
|
+
ResponsibleAI,Fairness,waf.rai.dataset_bias_signals,Evaluation dataset covers diverse cohorts,workspace_files,LLM judges dataset sample for demographic / role / domain / tone skew,responsible_ai.llm.dataset_bias_signals,implemented,https://learn.microsoft.com/azure/well-architected/ai/responsible-ai
|
|
32
|
+
OperationalExcellence,EvaluatorCoverage,waf.opex.llm_bundle_coverage,Bundle covers the evaluators the agent needs,workspace_files,LLM compares bundle YAML against agent description and recommends missing built-ins,opex.llm.bundle_coverage,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
33
|
+
OperationalExcellence,Documentation,waf.opex.spec_present,Spec-driven scaffolding has spec content,workspace_files,detector hint paths present but no spec document found,opex.spec_conformance.spec_missing,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
34
|
+
OperationalExcellence,Documentation,waf.opex.tasks_fresh,Spec tasks are kept fresh,workspace_files,tasks.md unchecked items older than stale_after_days,opex.spec_conformance.tasks_stale,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
35
|
+
OperationalExcellence,Documentation,waf.opex.tasks_grounded,Completed spec tasks reference paths that exist,workspace_files,checked task references a path missing from the workspace,opex.spec_conformance.tasks_orphaned,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
36
|
+
OperationalExcellence,Documentation,waf.opex.evaluator_alignment,Spec evaluators match bundle evaluators,workspace_files,evaluator mentioned in spec absent from every bundle YAML,opex.spec_conformance.evaluator_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
37
|
+
OperationalExcellence,Documentation,waf.opex.dataset_alignment,Spec datasets exist in workspace,workspace_files,dataset mentioned in spec absent from .agentops/datasets or .agentops/data,opex.spec_conformance.dataset_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
38
|
+
OperationalExcellence,Documentation,waf.opex.agent_alignment,Spec agent id matches run.yaml,workspace_files,spec mentions an agent_id that doesn't match run.yaml target.endpoint.agent_id,opex.spec_conformance.agent_drift,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
39
|
+
OperationalExcellence,Documentation,waf.opex.llm_spec_implementation_gap,LLM judge cross-checks spec vs implementation,workspace_files,LLM judge compares spec capabilities to workspace fingerprint,opex.spec_conformance.llm.implementation_gap,implemented,https://learn.microsoft.com/azure/well-architected/ai/operations
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""LLM-judged Doctor checks.
|
|
2
|
+
|
|
3
|
+
This package adds an opt-in layer of Doctor checks that invoke a judge
|
|
4
|
+
model (via Foundry's OpenAI client) to evaluate semantic signals -
|
|
5
|
+
prompt quality, dataset PII risk, bias, bundle coverage. See
|
|
6
|
+
``docs/doctor-explained.md`` for the full rationale.
|
|
7
|
+
|
|
8
|
+
Entry point is :func:`run_llm_assist_check`. Everything else here is
|
|
9
|
+
implementation detail; do not import from sub-modules directly.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from agentops.agent.llm_assist._engine import run_llm_assist_check
|
|
15
|
+
|
|
16
|
+
# Only the engine entry point is re-exported; the module docstring asks
# callers not to import from the private sub-modules directly.
__all__ = ["run_llm_assist_check"]
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Base helpers for individual LLM-judged rules.
|
|
2
|
+
|
|
3
|
+
Every rule shares the same shape: a focused system prompt, a Pydantic
|
|
4
|
+
schema for the verdict, and a small builder that converts a verdict
|
|
5
|
+
into a :class:`Finding`. This module factors out the duplicate code.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import hashlib
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
15
|
+
|
|
16
|
+
from agentops.agent.findings import Category, Finding, Severity
|
|
17
|
+
from agentops.agent.llm_assist._client import JudgementMeta
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class BaseVerdict(BaseModel):
    """Minimum schema every judge response must satisfy."""

    # extra="allow": keys beyond the ones declared here are kept on the model
    # instead of being rejected, so rule-specific subclasses and judge
    # responses may carry additional fields.
    model_config = ConfigDict(extra="allow")
    # Free-form risk label as returned by the judge; normalised_risk() folds
    # it into one of low / medium / high.
    risk: str = Field(description="Low, Medium, or High")
    # Judge's self-reported confidence, validated to lie in [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Judge's explanation; surfaced in the finding summary and evidence.
    reasoning: str
    # Optional case-specific fixes; defaults to an empty list when omitted.
    suggestions: List[str] = Field(
        default_factory=list,
        description=(
            "Two to four concrete, actionable fixes the user can apply, "
            "tailored to what the judge actually observed."
        ),
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def hash_text(*chunks: str) -> str:
    """Return a short, stable fingerprint of the given text chunks.

    Every chunk is UTF-8 encoded (characters that cannot be encoded are
    replaced) and terminated with a NUL byte before hashing, so the chunk
    boundaries contribute to the digest: ``("ab", "c")`` and ``("a", "bc")``
    hash differently. The result is the first 16 hex characters of the
    SHA-256 digest.
    """
    payload = b"".join(
        piece.encode("utf-8", errors="replace") + b"\0" for piece in chunks
    )
    return hashlib.sha256(payload).hexdigest()[:16]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def normalised_risk(verdict: BaseVerdict) -> str:
    """Return verdict.risk lower-cased and bounded to {low, medium, high}.

    The raw label is stripped and lower-cased, then looked up in a small
    alias table. A missing, empty, or unrecognised label yields ``"low"``.
    """
    aliases = {
        "low": "low",
        "medium": "medium",
        "high": "high",
        # Synonyms a judge model may emit instead of the canonical labels.
        "none": "low",
        "ok": "low",
        "clean": "low",
        "warning": "medium",
        "moderate": "medium",
        "critical": "high",
        "severe": "high",
    }
    label = (getattr(verdict, "risk", "") or "").strip().lower()
    return aliases.get(label, "low")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def severity_for(risk: str) -> Severity:
    """Map a normalised risk label to a finding severity.

    LLM findings cap at WARNING by design: ``"medium"`` and ``"high"`` both
    map to ``Severity.WARNING``; every other label maps to ``Severity.INFO``.
    """
    if risk in {"medium", "high"}:
        return Severity.WARNING
    return Severity.INFO
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass
class FindingBuilderArgs:
    """Inputs consumed by :func:`build_llm_finding` to assemble one finding."""

    # Used verbatim as the finding's id.
    rule_id: str
    # Human-readable title; build_llm_finding prefixes it with "[LLM-judged] ".
    title: str
    # Finding category, passed through unchanged.
    category: Category
    # str.format template; rendered with the keyword fields `risk` and
    # `reasoning`.
    summary_template: str
    # Canonical guidance; the judge's own suggestions (if any) are appended
    # to it as a bullet list.
    recommendation: str
    # Parsed judge response.
    verdict: BaseVerdict
    # Call metadata; build_llm_finding reads model_deployment, cache_hit,
    # input_tokens and output_tokens from it.
    meta: JudgementMeta
    # Rule-specific evidence merged into the evidence dict after the common
    # keys, so a clashing key here overrides the common value.
    extra_evidence: Dict[str, Any]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_llm_finding(args: FindingBuilderArgs) -> Optional[Finding]:
    """Turn a judge verdict into a :class:`Finding`, or ``None`` for low risk.

    The verdict's risk label is normalised first; a ``"low"`` result produces
    no finding. Otherwise the finding carries:

    * a recommendation made of ``args.recommendation`` plus, when the judge
      returned non-blank suggestions, a bullet list of at most six of them;
    * an evidence dict with the rounded confidence, reasoning, model
      deployment, cache-hit flag and risk, then all non-blank suggestions,
      then ``args.extra_evidence`` merged on top, then token counts when
      either count is truthy.
    """
    verdict = args.verdict
    meta = args.meta

    risk = normalised_risk(verdict)
    if risk == "low":
        return None
    severity = severity_for(risk)

    # Keep only suggestions that still have content after trimming, so the
    # user sees actionable, case-specific fixes next to the canonical advice.
    candidate_fixes = getattr(verdict, "suggestions", []) or []
    suggestions: List[str] = [
        cleaned
        for cleaned in (str(item).strip() for item in candidate_fixes)
        if cleaned
    ]

    if suggestions:
        bullets = "\n".join(f"- {s}" for s in suggestions[:6])
        recommendation = (
            f"{args.recommendation}\n\n"
            f"**Concrete fixes the judge model suggested for this "
            f"specific case:**\n{bullets}"
        )
    else:
        recommendation = args.recommendation

    evidence: Dict[str, Any] = {
        "confidence": round(verdict.confidence, 3),
        "reasoning": verdict.reasoning,
        "model_deployment": meta.model_deployment,
        "cache_hit": meta.cache_hit,
        "risk": risk,
    }
    if suggestions:
        evidence["suggestions"] = suggestions
    # Rule-specific evidence is merged after the common keys on purpose:
    # callers may override them.
    evidence.update(args.extra_evidence)
    if meta.input_tokens or meta.output_tokens:
        evidence["tokens"] = {
            "input": meta.input_tokens,
            "output": meta.output_tokens,
        }

    summary = args.summary_template.format(
        risk=risk, reasoning=verdict.reasoning
    )
    return Finding(
        id=args.rule_id,
        severity=severity,
        category=args.category,
        title=f"[LLM-judged] {args.title}",
        summary=summary,
        recommendation=recommendation,
        source="llm_judge",
        evidence=evidence,
    )
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""LLM-judged Operational Excellence check: evaluator-bundle coverage.
|
|
2
|
+
|
|
3
|
+
Reads the project's evaluator bundle YAML and a short agent description
|
|
4
|
+
excerpt, then asks the judge model whether the bundle covers the
|
|
5
|
+
evaluators a project of that shape typically needs (e.g. a RAG agent
|
|
6
|
+
without ``GroundednessEvaluator``).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional
|
|
13
|
+
|
|
14
|
+
import yaml
|
|
15
|
+
|
|
16
|
+
from agentops.agent.findings import Category, Finding
|
|
17
|
+
from agentops.agent.llm_assist._base import (
|
|
18
|
+
BaseVerdict,
|
|
19
|
+
FindingBuilderArgs,
|
|
20
|
+
build_llm_finding,
|
|
21
|
+
hash_text,
|
|
22
|
+
)
|
|
23
|
+
from agentops.agent.llm_assist._client import LLMJudge
|
|
24
|
+
from agentops.agent.sources.foundry_control import FoundryAgentSummary
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
_COVERAGE_SYSTEM = """You audit a project's evaluator bundle for the
|
|
28
|
+
Microsoft Well-Architected Framework for AI Operational Excellence
|
|
29
|
+
pillar. You receive:
|
|
30
|
+
|
|
31
|
+
1. The bundle YAML (evaluators list + thresholds).
|
|
32
|
+
2. The agent's name + a short instructions excerpt that hints at its
|
|
33
|
+
use case (RAG, conversational, tool-using, etc.).
|
|
34
|
+
|
|
35
|
+
Decide which Foundry / azure-ai-evaluation built-in evaluators are
|
|
36
|
+
notably missing for that use case. Examples:
|
|
37
|
+
|
|
38
|
+
* RAG agent without GroundednessEvaluator or RetrievalEvaluator.
|
|
39
|
+
* Tool-using agent without ToolCallAccuracyEvaluator.
|
|
40
|
+
* Customer-support chat agent without CoherenceEvaluator.
|
|
41
|
+
* Any agent serving end-users without content-safety evaluators
|
|
42
|
+
(Violence, SelfHarm, Sexual, HateUnfairness).
|
|
43
|
+
|
|
44
|
+
Respond as compact JSON. Do NOT recommend custom evaluators; stick to
|
|
45
|
+
Foundry / azure-ai-evaluation built-ins.
|
|
46
|
+
|
|
47
|
+
{"risk": "low|medium|high", "confidence": <0.0-1.0>,
|
|
48
|
+
"reasoning": "<one short paragraph>",
|
|
49
|
+
"suggestions": ["<fix 1>", "<fix 2>", "<fix 3>"],
|
|
50
|
+
"missing_evaluators": ["GroundednessEvaluator", ...]}
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class CoverageVerdict(BaseVerdict):
    """Judge verdict for the bundle-coverage rule."""

    # Names of built-in evaluators the judge considers missing from the
    # bundle; stays empty when the judge omits the key.
    missing_evaluators: List[str] = []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _load_bundle(workspace: Path) -> Optional[str]:
|
|
59
|
+
bundles = workspace / ".agentops" / "bundles"
|
|
60
|
+
if not bundles.is_dir():
|
|
61
|
+
return None
|
|
62
|
+
yamls = sorted(bundles.glob("*.yaml"))
|
|
63
|
+
if not yamls:
|
|
64
|
+
return None
|
|
65
|
+
bundle_path = yamls[0]
|
|
66
|
+
try:
|
|
67
|
+
text = bundle_path.read_text(encoding="utf-8")
|
|
68
|
+
except OSError:
|
|
69
|
+
return None
|
|
70
|
+
return text
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _agent_excerpt(agents: List[FoundryAgentSummary]) -> Optional[str]:
|
|
74
|
+
for agent in agents:
|
|
75
|
+
if not agent.instructions:
|
|
76
|
+
continue
|
|
77
|
+
excerpt = agent.instructions.strip()
|
|
78
|
+
if len(excerpt) > 800:
|
|
79
|
+
excerpt = excerpt[:800] + "..."
|
|
80
|
+
return (
|
|
81
|
+
f"Agent name: {agent.name or agent.agent_id}\n"
|
|
82
|
+
f"Model: {agent.model or 'unknown'}\n\n"
|
|
83
|
+
f"Instructions excerpt:\n{excerpt}"
|
|
84
|
+
)
|
|
85
|
+
if agents:
|
|
86
|
+
a = agents[0]
|
|
87
|
+
return (
|
|
88
|
+
f"Agent name: {a.name or a.agent_id}\nModel: "
|
|
89
|
+
f"{a.model or 'unknown'}\n(instructions unavailable)"
|
|
90
|
+
)
|
|
91
|
+
return None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def check_bundle_coverage(
    judge: LLMJudge,
    workspace: Path,
    agents: List[FoundryAgentSummary],
    min_confidence: float,
) -> List[Finding]:
    """Ask the judge whether the evaluator bundle fits the agent's use case.

    Returns an empty list when there is nothing to judge (no readable bundle,
    no agent context, unparseable bundle YAML), when the judge call yields no
    result, when the verdict's confidence is below ``min_confidence``, or
    when the verdict does not produce a finding. Otherwise returns a
    single-element list holding the ``opex.llm.bundle_coverage`` finding.
    """
    bundle_yaml = _load_bundle(workspace)
    if bundle_yaml is None:
        return []

    context = _agent_excerpt(agents)
    if context is None:
        return []

    # Parse purely as a sanity check: a bundle that is not valid YAML is
    # skipped; the parsed value itself is not used.
    try:
        yaml.safe_load(bundle_yaml)
    except yaml.YAMLError:
        return []

    prompt = (
        "Bundle YAML:\n```yaml\n"
        f"{bundle_yaml}\n```\n\n"
        f"Agent context:\n{context}"
    )
    outcome = judge.call(
        system=_COVERAGE_SYSTEM,
        user=prompt,
        schema=CoverageVerdict,
        inputs_hash=hash_text("bundle_coverage", bundle_yaml, context),
    )
    if outcome is None:
        return []

    verdict, meta = outcome
    if verdict.confidence < min_confidence:
        return []

    builder_args = FindingBuilderArgs(
        rule_id="opex.llm.bundle_coverage",
        title="Evaluator bundle may be missing built-ins for this agent",
        category=Category.OPERATIONAL_EXCELLENCE,
        summary_template=(
            "The judge model identified built-in evaluators that "
            "fit this agent's use case but are not in the bundle "
            "(risk={risk}): {reasoning}"
        ),
        recommendation=(
            "Review the suggested evaluators in this finding's "
            "evidence and add them to `.agentops/bundles/*.yaml` "
            "if they fit. Use the canonical names from "
            "`docs/foundry-evaluation-sdk-built-in-evaluators.md`."
        ),
        verdict=verdict,
        meta=meta,
        extra_evidence={
            "missing_evaluators": getattr(verdict, "missing_evaluators", []),
        },
    )
    finding = build_llm_finding(builder_args)
    if finding is None:
        return []
    return [finding]
|