PyPI - agentops-accelerator - Versions diffs - 0.3.0__py3-none-any.whl - Mend

agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

agentops/__init__.py +10 -0
agentops/__main__.py +6 -0
agentops/agent/__init__.py +12 -0
agentops/agent/_legacy_ids.py +92 -0
agentops/agent/analyzer.py +207 -0
agentops/agent/checks/__init__.py +1 -0
agentops/agent/checks/catalog.py +880 -0
agentops/agent/checks/errors.py +279 -0
agentops/agent/checks/foundry_config.py +75 -0
agentops/agent/checks/latency.py +84 -0
agentops/agent/checks/opex.py +157 -0
agentops/agent/checks/opex_workspace.py +874 -0
agentops/agent/checks/posture.py +36 -0
agentops/agent/checks/posture_rules/__init__.py +53 -0
agentops/agent/checks/posture_rules/content_filter.py +59 -0
agentops/agent/checks/posture_rules/diagnostics.py +74 -0
agentops/agent/checks/posture_rules/local_auth.py +55 -0
agentops/agent/checks/posture_rules/managed_identity.py +59 -0
agentops/agent/checks/posture_rules/network.py +68 -0
agentops/agent/checks/regression.py +78 -0
agentops/agent/checks/release_readiness.py +182 -0
agentops/agent/checks/safety.py +247 -0
agentops/agent/checks/spec_conformance.py +375 -0
agentops/agent/cockpit.py +5159 -0
agentops/agent/config.py +240 -0
agentops/agent/findings.py +113 -0
agentops/agent/history.py +142 -0
agentops/agent/knowledge/__init__.py +182 -0
agentops/agent/knowledge/waf-checklist.csv +39 -0
agentops/agent/llm_assist/__init__.py +16 -0
agentops/agent/llm_assist/_base.py +124 -0
agentops/agent/llm_assist/_bundle_rule.py +154 -0
agentops/agent/llm_assist/_client.py +347 -0
agentops/agent/llm_assist/_dataset_rules.py +191 -0
agentops/agent/llm_assist/_engine.py +106 -0
agentops/agent/llm_assist/_prompt_rules.py +291 -0
agentops/agent/llm_assist/_spec_rules.py +235 -0
agentops/agent/production_telemetry.py +430 -0
agentops/agent/report.py +207 -0
agentops/agent/server/__init__.py +1 -0
agentops/agent/server/app.py +84 -0
agentops/agent/server/auth.py +94 -0
agentops/agent/server/chat.py +44 -0
agentops/agent/server/protocol.py +72 -0
agentops/agent/sources/__init__.py +1 -0
agentops/agent/sources/azure_monitor.py +523 -0
agentops/agent/sources/azure_resources.py +602 -0
agentops/agent/sources/foundry_control.py +174 -0
agentops/agent/sources/results_history.py +494 -0
agentops/agent/sources/spec_detectors/__init__.py +42 -0
agentops/agent/sources/spec_detectors/_base.py +58 -0
agentops/agent/sources/spec_detectors/agents_md.py +75 -0
agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
agentops/agent/time_range.py +117 -0
agentops/cli/__init__.py +1 -0
agentops/cli/app.py +4823 -0
agentops/core/__init__.py +1 -0
agentops/core/agentops_config.py +592 -0
agentops/core/config_loader.py +22 -0
agentops/core/evaluators.py +480 -0
agentops/core/release_evidence.py +56 -0
agentops/core/results.py +117 -0
agentops/mcp/__init__.py +10 -0
agentops/mcp/server.py +232 -0
agentops/pipeline/__init__.py +8 -0
agentops/pipeline/cloud_results.py +189 -0
agentops/pipeline/cloud_runner.py +901 -0
agentops/pipeline/comparison.py +108 -0
agentops/pipeline/diagnostics.py +51 -0
agentops/pipeline/invocations.py +535 -0
agentops/pipeline/official_eval.py +414 -0
agentops/pipeline/orchestrator.py +775 -0
agentops/pipeline/prompt_deploy.py +377 -0
agentops/pipeline/publisher.py +121 -0
agentops/pipeline/reporter.py +202 -0
agentops/pipeline/runtime.py +409 -0
agentops/pipeline/thresholds.py +84 -0
agentops/services/__init__.py +1 -0
agentops/services/cicd.py +720 -0
agentops/services/eval_analysis.py +848 -0
agentops/services/evidence_pack.py +757 -0
agentops/services/initializer.py +86 -0
agentops/services/preflight.py +470 -0
agentops/services/setup_wizard.py +709 -0
agentops/services/skills.py +643 -0
agentops/services/trace_promotion.py +300 -0
agentops/services/workflow_analysis.py +1129 -0
agentops/templates/.gitignore +15 -0
agentops/templates/__init__.py +1 -0
agentops/templates/agent-server/Dockerfile +23 -0
agentops/templates/agent-server/README.md +61 -0
agentops/templates/agent-server/main.bicep +94 -0
agentops/templates/agent.yaml +87 -0
agentops/templates/agentops.yaml +58 -0
agentops/templates/foundry.svg +71 -0
agentops/templates/icon.png +0 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
agentops/templates/project.gitignore +36 -0
agentops/templates/sample-traces.jsonl +3 -0
agentops/templates/skills/agentops-agent/SKILL.md +137 -0
agentops/templates/skills/agentops-config/SKILL.md +113 -0
agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
agentops/templates/skills/agentops-eval/SKILL.md +189 -0
agentops/templates/skills/agentops-report/SKILL.md +71 -0
agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
agentops/templates/smoke.jsonl +3 -0
agentops/templates/waf-checklist.README.md +84 -0
agentops/templates/waf-checklist.csv +22 -0
agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
agentops/templates/workflows/agentops-pr.yml +148 -0
agentops/templates/workflows/agentops-watchdog.yml +122 -0
agentops/utils/__init__.py +1 -0
agentops/utils/azd_env.py +435 -0
agentops/utils/azure_endpoints.py +62 -0
agentops/utils/colors.py +47 -0
agentops/utils/dotenv_loader.py +105 -0
agentops/utils/foundry_discovery.py +229 -0
agentops/utils/logging.py +59 -0
agentops/utils/telemetry.py +554 -0
agentops/utils/yaml.py +36 -0
agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0

agentops/agent/checks/errors.py ADDED Viewed

@@ -0,0 +1,279 @@
+"""Errors / failure rate check."""
+from __future__ import annotations
+from typing import List, Optional
+from agentops.agent.config import ErrorsCheckConfig
+from agentops.agent.findings import Category, Finding, Severity
+from agentops.agent.sources.azure_monitor import AzureMonitorPayload
+from agentops.agent.sources.foundry_control import FoundryControlPayload
+def run_errors_check(
+    monitor: Optional[AzureMonitorPayload],
+    foundry: Optional[FoundryControlPayload],
+    config: ErrorsCheckConfig,
+) -> List[Finding]:
+    findings: List[Finding] = []
+    if (
+        monitor
+        and monitor.error_rate is not None
+        and monitor.error_rate > config.rate_threshold
+    ):
+        severity = (
+            Severity.CRITICAL
+            if monitor.error_rate >= config.rate_threshold * 2
+            else Severity.WARNING
+        )
+        findings.append(
+            Finding(
+                id="errors.production_rate",
+                severity=severity,
+                category=Category.RELIABILITY,
+                title="Production error rate above threshold",
+                summary=(
+                    f"App Insights reports {monitor.error_count} failed "
+                    f"requests over {monitor.request_count} total "
+                    f"({monitor.error_rate * 100:.2f}%), above the "
+                    f"{config.rate_threshold * 100:.2f}% threshold."
+                ),
+                recommendation=(
+                    "Open the App Insights resource, group failures by "
+                    "operation, and inspect the most common exception "
+                    "type."
+                ),
+                source="azure_monitor",
+                evidence={
+                    "error_count": monitor.error_count,
+                    "request_count": monitor.request_count,
+                    "error_rate": monitor.error_rate,
+                    "threshold": config.rate_threshold,
+                },
+            )
+        )
+    if (
+        foundry
+        and foundry.failure_rate is not None
+        and foundry.failure_rate > config.rate_threshold
+    ):
+        findings.append(
+            Finding(
+                id="errors.foundry_runs",
+                severity=Severity.WARNING,
+                category=Category.RELIABILITY,
+                title="Foundry agent run failure rate elevated",
+                summary=(
+                    f"Foundry control plane reports "
+                    f"{foundry.failed_runs}/{foundry.total_runs} failed "
+                    f"runs ({foundry.failure_rate * 100:.2f}%)."
+                ),
+                recommendation=(
+                    "Review recent Foundry runs, paying attention to "
+                    "tool-call errors and rate limits."
+                ),
+                source="foundry_control",
+                evidence={
+                    "failed_runs": foundry.failed_runs,
+                    "total_runs": foundry.total_runs,
+                    "failure_rate": foundry.failure_rate,
+                },
+            )
+        )
+    findings.extend(_check_no_runtime_telemetry(monitor))
+    findings.extend(_check_rate_limit_pressure(monitor, config))
+    findings.extend(_check_no_token_telemetry(monitor))
+    return findings
+def _check_rate_limit_pressure(
+    monitor: Optional[AzureMonitorPayload],
+    config: ErrorsCheckConfig,
+) -> List[Finding]:
+    """AI.154 — surface HTTP 429 spikes from Azure OpenAI / AI Services.
+    Rate-limit responses indicate the workload is exhausting its TPM /
+    RPM quota or PTU capacity. Even when the overall error rate is
+    healthy, 429s tell the team to raise quotas or add a backoff /
+    gateway layer **before** users see degraded behaviour.
+    """
+    if monitor is None or not monitor.rate_limit_429_count:
+        return []
+    # Treat the same rate threshold as the error-rate check: if 429s
+    # exceed ``rate_threshold`` of total requests, escalate. With no
+    # request_count info, fall back to a hard floor of 10 hits.
+    total = monitor.request_count
+    threshold_hits = max(10, int(total * config.rate_threshold)) if total else 10
+    if monitor.rate_limit_429_count < threshold_hits:
+        return []
+    severity = (
+        Severity.CRITICAL
+        if monitor.rate_limit_429_count >= threshold_hits * 2
+        else Severity.WARNING
+    )
+    return [
+        Finding(
+            id="errors.rate_limit_pressure",
+            severity=severity,
+            category=Category.RELIABILITY,
+            title="Azure OpenAI rate-limit responses (HTTP 429) above threshold",
+            summary=(
+                f"App Insights reports {monitor.rate_limit_429_count} HTTP "
+                f"429 responses from Azure OpenAI / AI Services over the "
+                "lookback window. The workload is hitting its TPM / RPM "
+                "ceiling and clients are being throttled."
+            ),
+            recommendation=(
+                "Raise the deployment's TPM / RPM quota, switch high-volume "
+                "flows to a Provisioned-Throughput Unit (PTU) deployment, "
+                "or add an APIM gateway with retry + backoff so clients "
+                "do not see the 429s directly."
+            ),
+            source="azure_monitor",
+            evidence={
+                "rate_limit_429_count": monitor.rate_limit_429_count,
+                "request_count": monitor.request_count,
+                "threshold_hits": threshold_hits,
+            },
+        )
+    ]
+def _check_no_token_telemetry(
+    monitor: Optional[AzureMonitorPayload],
+) -> List[Finding]:
+    """AI.132 — warn when the runtime emits requests but no token telemetry.
+    The OpenTelemetry GenAI semantic conventions
+    (``gen_ai.usage.input_tokens`` / ``gen_ai.usage.output_tokens``)
+    are the canonical signal for token-cost monitoring. When the agent
+    runtime emits dependency spans but no token attributes, the team
+    flies blind on cost and on AI.132's "Monitor token usage" guidance.
+    """
+    if monitor is None:
+        return []
+    if (monitor.diagnostics or {}).get("token_status") == "error":
+        return []
+    if monitor.request_count <= 0:
+        return []  # absence of telemetry is covered by errors.no_runtime_telemetry
+    in_t = monitor.input_token_count or 0
+    out_t = monitor.output_token_count or 0
+    if in_t > 0 or out_t > 0:
+        return []
+    return [
+        Finding(
+            id="opex.no_token_telemetry",
+            severity=Severity.WARNING,
+            category=Category.OPERATIONAL_EXCELLENCE,
+            title="Runtime emits requests but no token-usage telemetry",
+            summary=(
+                f"App Insights recorded {monitor.request_count} agent "
+                "requests but reports zero input / output tokens. The "
+                "OpenTelemetry GenAI conventions "
+                "(`gen_ai.usage.input_tokens` / "
+                "`gen_ai.usage.output_tokens`) are not being emitted, so "
+                "token-cost monitoring and the Tokens card on the "
+                "cockpit stay grey."
+            ),
+            recommendation=(
+                "Wire the OpenAI instrumentor on the agent runtime "
+                "(`opentelemetry-instrumentation-openai-v2` or the "
+                "Azure SDK's built-in tracing). The instrumentor sets "
+                "the token-usage attributes from the model response "
+                "automatically."
+            ),
+            source="azure_monitor",
+            evidence={
+                "request_count": monitor.request_count,
+                "input_token_count": in_t,
+                "output_token_count": out_t,
+            },
+        )
+    ]
+def _check_no_runtime_telemetry(
+    monitor: Optional[AzureMonitorPayload],
+) -> List[Finding]:
+    """Warn when Azure Monitor is not wired, or wired but silent.
+    Two failure modes count, both blockers for production
+    observability:
+    * **Not configured.** The ``azure_monitor`` source is enabled but
+      has no ``app_insights_resource_id`` / ``log_analytics_workspace_id``,
+      so it reports ``status: skipped``. Doctor has no production
+      observability at all.
+    * **Configured but empty.** The source reports ``status: ok`` but
+      ``request_count == 0`` over the lookback, so the App Insights
+      workspace exists but the agent runtime is not emitting
+      telemetry to it.
+    The two cases share one finding because the user-facing
+    remediation is identical: wire the OpenTelemetry exporter on the
+    agent runtime side, and configure the resource id on the
+    ``azure_monitor`` source in ``agent.yaml``. If the source is
+    explicitly ``enabled: false`` we treat that as an opt-out and
+    stay quiet.
+    """
+    if monitor is None:
+        return []
+    diag = monitor.diagnostics or {}
+    status = diag.get("status")
+    if status == "disabled":
+        return []
+    if status == "ok" and monitor.request_count <= 0:
+        summary = (
+            "Application Insights / Log Analytics is reachable but "
+            "reports 0 requests over the lookback window. The "
+            "agent runtime is not emitting telemetry, so the "
+            "cockpit, latency, errors, and runtime-safety "
+            "checks have nothing to grade."
+        )
+        evidence = {
+            "request_count": monitor.request_count,
+            "monitor_status": status,
+            "mode": "configured_but_empty",
+        }
+    elif status == "skipped":
+        summary = (
+            "The `azure_monitor` source is not configured "
+            f"({diag.get('reason') or 'unknown reason'}). Without "
+            "App Insights wired up, Doctor has no production "
+            "observability, so latency, errors, runtime safety, and "
+            "telemetry-based reliability checks all stay grey."
+        )
+        evidence = {
+            "monitor_status": status,
+            "reason": diag.get("reason"),
+            "mode": "not_configured",
+        }
+    else:
+        return []
+    return [
+        Finding(
+            id="errors.no_runtime_telemetry",
+            severity=Severity.WARNING,
+            category=Category.RELIABILITY,
+            title="Production telemetry is not wired to the agent",
+            summary=summary,
+            recommendation=(
+                "Configure `sources.azure_monitor.app_insights_resource_id` "
+                "or set `APPLICATIONINSIGHTS_CONNECTION_STRING` with an "
+                "`ApplicationId`, install the `[agent]` extra, and connect "
+                "Azure Monitor OpenTelemetry on the agent runtime "
+                "(call `configure_azure_monitor()` on startup). "
+                "See `docs/tutorial-end-to-end.md` -> "
+                "'Wire observability'."
+            ),
+            source="azure_monitor",
+            evidence=evidence,
+        )
+    ]

agentops/agent/checks/foundry_config.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Foundry control-plane configuration audit (Operational Excellence category).
+Mirrors the ``errors.no_runtime_telemetry`` pattern for the Foundry
+control plane. The Doctor warns when Foundry was expected but the
+control-plane source is unconfigured or unreachable.
+If the user explicitly opted out (``foundry_control.enabled: false``)
+we stay silent - that is the documented way to say "we are not on
+Foundry".
+"""
+from __future__ import annotations
+from typing import List, Optional
+from agentops.agent.findings import Category, Finding, Severity
+from agentops.agent.sources.foundry_control import FoundryControlPayload
+SOURCE_NAME = "foundry_control"  # passed as ``source=`` on the finding built in this module
+def run_foundry_config_check(
+    foundry: Optional[FoundryControlPayload],
+) -> List[Finding]:
+    """Audit the Foundry control plane configuration."""
+    if foundry is None:
+        return []
+    diag = foundry.diagnostics or {}
+    status = diag.get("status")
+    if status == "disabled":
+        return []
+    findings: List[Finding] = []
+    if status != "ok":
+        findings.append(_no_foundry_control_finding(diag))
+        return findings
+    return findings
+def _no_foundry_control_finding(diag: dict) -> Finding:
+    status = diag.get("status") or "unknown"
+    reason = diag.get("reason") or (
+        "the source is enabled but did not return a healthy status"
+    )
+    return Finding(
+        id="opex.no_foundry_control_configured",
+        severity=Severity.WARNING,
+        category=Category.OPERATIONAL_EXCELLENCE,
+        title="Foundry control plane is not configured",
+        summary=(
+            "The `foundry_control` source is enabled but reports "
+            f"`status: {status}` ({reason}). Without it, Doctor "
+            "cannot see Foundry-side agents, evaluation rules, or "
+            "run failures, so safety-config and Foundry-run checks "
+            "stay grey."
+        ),
+        recommendation=(
+            "Set `sources.foundry_control.project_endpoint` (or the "
+            "`AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` env var) in "
+            "`.agentops/agent.yaml`, install the `[foundry]` extra, "
+            "and grant the running identity at least `Reader` on the "
+            "Foundry project. If this project does not use Foundry, "
+            "set `sources.foundry_control.enabled: false` to opt out "
+            "explicitly."
+        ),
+        source=SOURCE_NAME,
+        evidence={
+            "monitor_status": status,
+            "reason": reason,
+            "mode": "not_configured",
+        },
+    )

agentops/agent/checks/latency.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Latency check based on Azure Monitor and AgentOps results history."""
+from __future__ import annotations
+from typing import List, Optional
+from agentops.agent.config import LatencyCheckConfig
+from agentops.agent.findings import Category, Finding, Severity
+from agentops.agent.sources.azure_monitor import AzureMonitorPayload
+from agentops.agent.sources.results_history import ResultsHistory
+def run_latency_check(
+    history: ResultsHistory,
+    monitor: Optional[AzureMonitorPayload],
+    config: LatencyCheckConfig,
+) -> List[Finding]:
+    findings: List[Finding] = []
+    threshold = config.p95_threshold_seconds
+    if monitor and monitor.p95_duration_seconds is not None:
+        p95 = monitor.p95_duration_seconds
+        if p95 > threshold:
+            severity = (
+                Severity.CRITICAL if p95 >= threshold * 2 else Severity.WARNING
+            )
+            findings.append(
+                Finding(
+                    id="latency.p95_production",
+                    severity=severity,
+                    category=Category.PERFORMANCE,
+                    title="Production p95 latency exceeds threshold",
+                    summary=(
+                        f"Application Insights reports p95 latency of "
+                        f"{p95:.2f}s, above the configured threshold of "
+                        f"{threshold:.2f}s."
+                    ),
+                    recommendation=(
+                        "Review recent deployments for tool-call loops or "
+                        "long-running RAG retrievals, and consider scaling "
+                        "out the agent runtime."
+                    ),
+                    source="azure_monitor",
+                    evidence={
+                        "p95_seconds": p95,
+                        "threshold_seconds": threshold,
+                        "request_count": monitor.request_count,
+                    },
+                )
+            )
+    if history.runs:
+        latest = history.runs[-1]
+        avg_latency = latest.metrics.get("avg_latency_seconds")
+        if avg_latency is not None and avg_latency > threshold:
+            severity = (
+                Severity.CRITICAL
+                if avg_latency >= threshold * 2
+                else Severity.WARNING
+            )
+            findings.append(
+                Finding(
+                    id="latency.eval_avg",
+                    severity=severity,
+                    category=Category.PERFORMANCE,
+                    title="Evaluation average latency above threshold",
+                    summary=(
+                        f"Run `{latest.run_id}` averaged "
+                        f"{avg_latency:.2f}s per item, above the "
+                        f"{threshold:.2f}s threshold."
+                    ),
+                    recommendation=(
+                        "Profile the slowest dataset rows and inspect tool "
+                        "calls; re-run evals after addressing the regression."
+                    ),
+                    source="results_history",
+                    evidence={
+                        "run_id": latest.run_id,
+                        "avg_latency_seconds": avg_latency,
+                        "threshold_seconds": threshold,
+                    },
+                )
+            )
+    return findings

agentops/agent/checks/opex.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""Operational excellence check.
+Pipeline-hygiene findings that are time-based or stability-based rather
+than file-based (which live in :mod:`agentops.agent.checks.mlops`).
+Findings emitted:
+* ``opex.stale_evaluation`` - Doctor warns when no fresh eval run has
+  landed in the configured window.
+* ``opex.flaky_metric`` - a metric's coefficient of variation across
+  recent runs is high enough to suggest a flaky judge / non-deterministic
+  prompt rather than real change.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from statistics import mean, pstdev
+from typing import List
+from agentops.agent.config import OpexCheckConfig
+from agentops.agent.findings import Category, Finding, Severity
+from agentops.agent.sources.results_history import ResultsHistory
+SOURCE_NAME = "results_history"  # passed as ``source=`` on every finding built in this module
+def run_opex_check(
+    history: ResultsHistory, config: OpexCheckConfig
+) -> List[Finding]:
+    """Detect operational-excellence regressions (stale runs + flaky metrics)."""
+    if not config.enabled:
+        return []
+    findings: List[Finding] = []
+    findings.extend(_check_stale_evaluation(history, config))
+    findings.extend(_check_flaky_metric(history, config))
+    return findings
+def _check_stale_evaluation(
+    history: ResultsHistory, config: OpexCheckConfig
+) -> List[Finding]:
+    if not history.runs:
+        return []
+    latest = history.runs[-1]
+    if latest.timestamp is None:
+        return []
+    now = datetime.now(timezone.utc)
+    age_days = (now - latest.timestamp).total_seconds() / 86400.0
+    if age_days <= config.stale_after_days:
+        return []
+    severity = (
+        Severity.CRITICAL
+        if age_days >= config.stale_after_days * 2
+        else Severity.WARNING
+    )
+    return [
+        Finding(
+            id="opex.stale_evaluation",
+            severity=severity,
+            category=Category.OPERATIONAL_EXCELLENCE,
+            title="No fresh evaluation run in the configured window",
+            summary=(
+                f"The most recent eval run (`{latest.run_id}`) is "
+                f"{age_days:.1f} day(s) old, above the configured "
+                f"threshold of {config.stale_after_days} day(s). The "
+                "agent's measured quality is drifting away from its "
+                "last validated baseline."
+            ),
+            recommendation=(
+                "Run `agentops eval run` (locally or via CI) to "
+                "produce a fresh local `results.json` or Foundry cloud "
+                "evaluation, then re-run `agentops doctor`."
+            ),
+            source=SOURCE_NAME,
+            evidence={
+                "latest_run_id": latest.run_id,
+                "latest_timestamp": latest.timestamp.isoformat(),
+                "age_days": round(age_days, 2),
+                "threshold_days": config.stale_after_days,
+            },
+        )
+    ]
+def _check_flaky_metric(
+    history: ResultsHistory, config: OpexCheckConfig
+) -> List[Finding]:
+    """Flag metrics whose coefficient of variation is suspiciously high.
+    A high CV (stddev / mean) across many runs without a corresponding
+    agent change is the fingerprint of a non-deterministic judge or a
+    prompt that's overly sensitive to phrasing. Real regressions show
+    up as monotonic drops (caught by the ``regression`` check); flaky
+    metrics oscillate.
+    We only consider metrics with at least ``min_runs_for_flaky`` data
+    points and a mean that's safely above zero to avoid amplifying noise
+    on near-zero scores.
+    """
+    runs = history.runs
+    if len(runs) < config.min_runs_for_flaky:
+        return []
+    # Collect each metric's series across the recent window.
+    series: dict[str, List[float]] = {}
+    for run in runs[-config.min_runs_for_flaky :]:
+        for name, value in run.metrics.items():
+            series.setdefault(name, []).append(value)
+    findings: List[Finding] = []
+    for metric, values in series.items():
+        if len(values) < config.min_runs_for_flaky:
+            continue
+        avg = mean(values)
+        if avg <= 0.05:
+            # Near-zero metrics make CV explode without signal.
+            continue
+        cv = pstdev(values) / avg
+        if cv < config.flaky_cv_threshold:
+            continue
+        findings.append(
+            Finding(
+                id=f"opex.flaky_metric.{metric}",
+                severity=Severity.WARNING,
+                category=Category.OPERATIONAL_EXCELLENCE,
+                title=f"`{metric}` is unstable across recent runs",
+                summary=(
+                    f"`{metric}` shows a coefficient of variation of "
+                    f"{cv * 100:.1f}% across the last {len(values)} "
+                    "runs (threshold: "
+                    f"{config.flaky_cv_threshold * 100:.0f}%). That "
+                    "kind of oscillation usually points at a "
+                    "non-deterministic judge model or a prompt that's "
+                    "overly sensitive to phrasing - not at real "
+                    "agent change."
+                ),
+                recommendation=(
+                    "Pin the judge model's `temperature` / `seed` "
+                    "(or switch to a deterministic evaluator), and "
+                    "review the metric's prompt for ambiguity. If "
+                    "the metric is intrinsically noisy, raise "
+                    "`min_runs` on the regression check so signals "
+                    "average out."
+                ),
+                source=SOURCE_NAME,
+                evidence={
+                    "metric": metric,
+                    "cv": round(cv, 4),
+                    "mean": round(avg, 4),
+                    "samples": len(values),
+                },
+            )
+        )
+    return findings