agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,113 @@
1
+ ---
2
+ name: agentops-config
3
+ description: Generate or update agentops.yaml (flat 1.0 schema) for AgentOps release-readiness gates. Trigger on "configure agentops", "agentops.yaml", "set up evaluation", "what should I evaluate". Infer the agent target and dataset from the codebase; ask only when nothing can be found.
4
+ ---
5
+
6
+ # AgentOps Config
7
+
8
+ Generate `agentops.yaml` at the project root. This file references the agent
9
+ candidate and dataset used to answer "can we ship it?" The flat schema has only
10
+ a handful of fields - most projects need just `version`, `agent`, and
11
+ `dataset`.
12
+
13
+ This skill configures AgentOps release gates. It does **not** create or deploy
14
+ Foundry agents. If the user needs to create/deploy a Prompt Agent or Hosted
15
+ Agent first, hand off to Foundry Toolkit / the `microsoft-foundry` skill / azd,
16
+ then return here once there is a `name:version` or URL.
17
+
18
+ ## Step 0 - Prerequisites
19
+
20
+ 1. `pip install "agentops-accelerator[foundry] @ git+https://github.com/Azure/agentops.git@main"` if `agentops` is missing.
21
+ 2. Run `agentops eval analyze` first. If it reports missing or ambiguous
22
+ target/dataset/scenario signals, use this skill to adapt the config.
23
+ 3. If `agentops.yaml` does not exist, run `agentops init` first. The init
24
+ wizard already collects the agent reference and dataset path, so
25
+ `agentops-config` is most useful when the user wants to **tweak** an
26
+ existing config (add thresholds, switch to a different agent target,
27
+ add HTTP auth headers, etc.) rather than create one from scratch.
28
+
29
+ ## Step 1 - Detect the agent target
30
+
31
+ Search the codebase for the strongest signal and pick one:
32
+
33
+ | Signal | `agent:` value |
34
+ |---|---|
35
+ | Foundry Prompt Agent ID `name:N` | `"<name>:<N>"` |
36
+ | Foundry Hosted Agent URL `https://...services.ai.azure.com/...agents/...` | the full URL |
37
+ | Any other HTTP endpoint your agent serves (FastAPI, Express, ACA, AKS) | the full URL |
38
+ | Direct model use (`openai.chat.completions.create(model=...)`) with no orchestration | `"model:<deployment-name>"` |
39
+
40
+ Look in: `README.md`, `main.py`/`server.py`/`app.ts`, `.agentops/.env`,
41
+ `.env`/`.env.local`, `.azure/<env>/.env`, `infra/`, IaC outputs. If nothing is
42
+ found, ask the user once.
43
+
44
+ ## Step 2 - Detect the dataset
45
+
46
+ If a JSONL with rows that include `input` already exists in the repo, use
47
+ its path. Otherwise leave the default `.agentops/data/smoke.jsonl` and
48
+ hand off to the `agentops-dataset` skill before the first run.
49
+
50
+ ## Step 3 - Write agentops.yaml
51
+
52
+ Minimal example:
53
+
54
+ ```yaml
55
+ version: 1
56
+ agent: "my-rag:3"
57
+ dataset: .agentops/data/smoke.jsonl
58
+ ```
59
+
60
+ HTTP/JSON example:
61
+
62
+ ```yaml
63
+ version: 1
64
+ agent: "https://my-aca-app.eastus2.azurecontainerapps.io/chat"
65
+ dataset: .agentops/data/smoke.jsonl
66
+ request_field: message # default is "message"
67
+ response_field: text # dot-path; default is "text"
68
+ auth_header_env: MY_API_TOKEN
69
+ ```
70
+
71
+ Optional extras (only add when the user asks for them):
72
+
73
+ ```yaml
74
+ thresholds:
75
+ coherence: ">=3"
76
+ groundedness: ">=3"
77
+ avg_latency_seconds: "<=30"
78
+
79
+ # Publish results to the Foundry Evaluations panel.
80
+ # - execution: local + publish: true → Classic Foundry (uploads metrics)
81
+ # - execution: cloud → New Foundry (server-side run;
82
+ # publish is implicit, cloud always publishes)
83
+ execution: local
84
+ publish: true
85
+ # project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
86
+
87
+ # Cloud dataset submission policy. The local JSONL remains the source of truth;
88
+ # cloud runs sync it to Foundry Data/Datasets by default.
89
+ dataset_sync:
90
+ mode: auto # auto | foundry | inline
91
+ # name: agentops-smoke
92
+ # version: content-hash
93
+
94
+ evaluators: # rare - AgentOps auto-selects from agent + dataset
95
+ - name: similarity
96
+ threshold: ">=4"
97
+ ```
98
+
99
+ ## Step 4 - Validate
100
+
101
+ Run `agentops eval run` once. If the config is malformed AgentOps prints a
102
+ clear error pointing at the offending key. Adjust and re-run.
103
+
104
+ ## Guardrails
105
+
106
+ - Do **not** add legacy keys (`bundle`, `target`, `output`, `backend`, or the
107
+ pre-1.0 form of `execution`). The 1.0 schema rejects them. The flat `execution: local | cloud` setting shown in Step 3 is valid.
108
+ - Do **not** fabricate agent IDs, endpoint URLs, or model deployment
109
+ names. Ask the user when uncertain.
110
+ - Keep the file small. Auto-selection covers most metrics.
111
+ - Keep local JSONL canonical. For cloud runs, prefer `dataset_sync.mode: auto`
112
+ so AgentOps keeps Foundry Data/Datasets in sync; use `inline` only for quick
113
+ experiments or environments without dataset upload permission.
@@ -0,0 +1,84 @@
1
+ ---
2
+ name: agentops-dataset
3
+ description: Create or extend a small JSONL dataset for AgentOps release-readiness gates. Trigger on "create dataset", "generate test data", "JSONL", "more eval rows". Infer the agent's domain from the codebase and produce realistic rows; never fabricate data when the domain is unclear.
4
+ ---
5
+
6
+ # AgentOps Dataset
7
+
8
+ Generate a small, realistic JSONL dataset for the agent under evaluation.
9
+ Default location: `.agentops/data/smoke.jsonl` (referenced from
10
+ `agentops.yaml`). These rows are repo-side release-gate inputs: keep them
11
+ reviewable and deterministic, not a full replacement for Foundry dataset
12
+ management.
13
+
14
+ ## Step 0 - Prerequisites
15
+
16
+ 1. `pip install "agentops-accelerator @ git+https://github.com/Azure/agentops.git@main"` if `agentops` is missing.
17
+ 2. Run `agentops eval analyze` first. If it reports missing dataset columns or
18
+ recommends `agentops-dataset`, use this skill before the first eval run.
19
+ 3. If `agentops.yaml` does not exist, run `agentops init` first (the init
20
+ wizard will prompt for the agent reference, project endpoint, and
21
+ dataset path, then create a starter `.agentops/data/smoke.jsonl`).
22
+
23
+ ## Step 1 - Pick the columns
24
+
25
+ Read `agentops.yaml` (and the agent code) to figure out the agent type,
26
+ then choose the row schema:
27
+
28
+ | Agent type | Required columns | Optional columns |
29
+ |---|---|---|
30
+ | Direct model / Q&A | `input`, `expected` | - |
31
+ | RAG | `input`, `expected`, `context` | - |
32
+ | Conversational | `input`, `expected` | - |
33
+ | Tool-using agent | `input`, `expected`, `tool_calls` | `tool_definitions` |
34
+
35
+ `input` is always the user prompt. `expected` is the gold answer.
36
+ `context` is the retrieved passage(s). `tool_calls` is a list of
37
+ `{name, arguments}` describing the expected tool invocations.
38
+
39
+ ## Step 2 - Ground the rows in the codebase
40
+
41
+ - Read the README, system prompt, tool definitions, and any sample
42
+ fixtures.
43
+ - Generate **5–10 rows** that exercise the agent's actual capabilities.
44
+ - If the domain is unclear, generate a tiny generic draft and clearly
45
+ flag it as a placeholder.
46
+
47
+ ## Step 3 - Write the JSONL
48
+
49
+ One JSON object per line, no trailing commas, UTF-8:
50
+
51
+ ```json
52
+ {"input": "What is the refund policy?", "expected": "Refunds within 30 days...", "context": "Refund policy: ..."}
53
+ ```
54
+
55
+ Save to the path referenced by `dataset:` in `agentops.yaml` (default
56
+ `.agentops/data/smoke.jsonl`).
57
+
58
+ This file is the AgentOps source of truth. In Foundry cloud evaluation,
59
+ AgentOps syncs it to a stable Foundry dataset version by default and reuses the
60
+ same Foundry dataset version while the JSONL content is unchanged. If the user
61
+ forces `dataset_sync.mode: inline`, Foundry may show generated `eval-data-*`
62
+ backing assets in the project Data/Datasets page.
63
+
64
+ ## Step 4 - Sanity-check
65
+
66
+ Run a quick eval and confirm rows are picked up:
67
+
68
+ ```bash
69
+ agentops eval run
70
+ ```
71
+
72
+ Open `.agentops/results/latest/report.md` and confirm the row count
73
+ matches.
74
+
75
+ ## Guardrails
76
+
77
+ - Do not invent customer data, real names, or sensitive content.
78
+ - Keep rows short - datasets are meant to be quick gates, not full QA
79
+ suites.
80
+ - If the user already has a domain dataset, prefer pointing
81
+ `agentops.yaml` at that file rather than generating new rows.
82
+ - If the user asks why Foundry shows `eval-data-*`, explain that those are
83
+ cloud-eval backing assets from inline compatibility mode; normal cloud runs
84
+ should use the stable `agentops-*` Foundry dataset.
@@ -0,0 +1,189 @@
1
+ ---
2
+ name: agentops-eval
3
+ description: Run AgentOps release-readiness evaluations against Foundry prompt agents, Foundry hosted endpoints, HTTP/JSON agents, or raw model deployments. Trigger on phrases like "run eval", "evaluate my agent", "benchmark", "agentops eval", "compare runs", "can we ship". Uses the flat agentops.yaml schema.
4
+ ---
5
+
6
+ # AgentOps Eval
7
+
8
+ End-to-end release-gate workflow: install -> init -> configure -> run -> read
9
+ report -> decide whether the candidate is ready to ship.
10
+
11
+ AgentOps evaluates an existing candidate. It does **not** create or deploy
12
+ Foundry agents. If the user still needs a Prompt Agent or Hosted Agent, hand off
13
+ to Foundry Toolkit / the `microsoft-foundry` skill / azd first, then come back
14
+ with a `name:version` or URL.
15
+
16
+ ## Step 0 - Setup
17
+
18
+ 1. Install if missing: `pip install "agentops-accelerator[foundry] @ git+https://github.com/Azure/agentops.git@main"`.
19
+ 2. If `agentops.yaml` does not exist at the project root, run `agentops init`.
20
+ The init wizard prompts (azd-style) for the Foundry project endpoint,
21
+ agent reference, and dataset path, and persists each answer to
22
+ `.agentops/.env` + `agentops.yaml` as it goes. Existing azd workspaces, or
23
+ runs with `--azd-env`, use `.azure/<env>/.env` instead. Pass `--no-prompt`
24
+ plus the explicit flags
25
+ (`--project-endpoint`, `--agent`, `--dataset`, …) for non-interactive
26
+ runs. Run `agentops init show` later to inspect the resolved config.
27
+
28
+ ## Step 1 - Analyze evaluation setup
29
+
30
+ Run the deterministic local triage first:
31
+
32
+ ```bash
33
+ agentops eval analyze
34
+ ```
35
+
36
+ Use its output to decide whether the repo is ready for `agentops eval run` or
37
+ needs skill-assisted setup. If it recommends `agentops-config`, fix the target
38
+ and protocol. If it recommends `agentops-dataset`, create/map realistic JSONL
39
+ rows. If it recommends `agentops-eval`, inspect the app scenario and evaluator
40
+ expectations before running.
41
+
42
+ ## Step 2 - Identify the agent target
43
+
44
+ Read the codebase (README, entry point, env vars) and pick the right value
45
+ for the `agent:` field of `agentops.yaml`:
46
+
47
+ | Pattern in code / env | `agent:` value |
48
+ |---|---|
49
+ | Foundry Prompt Agent ID like `name:1` | `"<name>:<version>"` |
50
+ | Foundry Hosted Agent endpoint URL ending in `/agents/...` | `"https://<resource>.services.ai.azure.com/api/projects/<p>/agents/..."` |
51
+ | Plain HTTP/JSON endpoint (FastAPI, Express, ACA, AKS) | `"https://<host>/<path>"` |
52
+ | Raw Foundry/Azure OpenAI model deployment | `"model:<deployment-name>"` |
53
+
54
+ If nothing is found, ask the user once for the agent identifier.
55
+
56
+ ## Step 3 - Make sure the dataset exists
57
+
58
+ `agentops.yaml` points to a JSONL file (default
59
+ `.agentops/data/smoke.jsonl`). Each row needs at least `input` and a label
60
+ that maps to the metric you care about (`expected`, `context`,
61
+ `tool_calls`...). If the dataset is empty or unrelated, run the
62
+ `agentops-dataset` skill before running the eval.
63
+
64
+ ## Step 4 - Run the evaluation
65
+
66
+ ```bash
67
+ agentops eval run
68
+ ```
69
+
70
+ Optional flags:
71
+
72
+ - `--config <path>` - point at a different `agentops.yaml`.
73
+ - `--output <dir>` - choose where to write `results.json` and `report.md`
74
+ (defaults to `.agentops/results/<timestamp>/`).
75
+
76
+ Exit codes:
77
+
78
+ - `0` - succeeded and all thresholds passed
79
+ - `2` - succeeded but at least one threshold failed (gate-friendly)
80
+ - `1` - runtime/configuration error
81
+
82
+ ## Step 4b - Pick the right execution path
83
+
84
+ | Target | Foundry server-side eval through AgentOps | AgentOps local runner | Default guidance |
85
+ |---|---|---|---|
86
+ | Foundry Prompt Agent (`name:version`) | `execution: cloud` | yes | Use cloud when the user wants the official Foundry-hosted eval record; use local for fast feedback or fallback. |
87
+ | Foundry Hosted Agent URL | no | yes | Use local runner; optionally set `publish: true` to upload local metrics to Classic Foundry. |
88
+ | Generic HTTP/JSON endpoint | no | yes | Use local runner. |
89
+ | Raw model deployment | no | yes | Use local runner. |
90
+
91
+ For prompt-agent CI gates, prefer AgentOps cloud eval because Foundry executes
92
+ the managed eval while AgentOps enforces thresholds and writes normalized
93
+ `results.json` / `report.md` artifacts. The official AI Agent Evaluation GitHub
94
+ Action or Azure DevOps extension is still useful for standalone platform-native
95
+ validation, but do not substitute it for the AgentOps PR gate when the user needs
96
+ threshold enforcement, baselines, Doctor readiness, release evidence, or local
97
+ fallback.
98
+
99
+ ## Step 5 - Inspect results and release evidence
100
+
101
+ ```bash
102
+ agentops report generate # regenerate report.md from latest results.json
103
+ agentops report generate --in <results.json>
104
+ ```
105
+
106
+ Open `.agentops/results/latest/report.md`. To compare two runs, hand both
107
+ `results.json` files to the user or run the next eval with
108
+ `--baseline <previous-results.json>` so AgentOps adds a **Comparison vs
109
+ Baseline** section to the report.
110
+
111
+ For production promotion, generate the Doctor evidence pack:
112
+
113
+ ```bash
114
+ agentops doctor --evidence-pack
115
+ ```
116
+
117
+ Open `.agentops/release/latest/evidence.md`. It summarizes eval, baseline,
118
+ Doctor, workflow, Foundry, monitoring, AI Landing Zone, and trace-regression
119
+ readiness without creating a second exit-code contract.
120
+
121
+ ## Step 5b - (Optional) Promote reviewed traces to regression rows
122
+
123
+ If the user has exported Foundry/App Insights traces, preview candidate
124
+ regression rows first:
125
+
126
+ ```bash
127
+ agentops eval promote-traces --source <traces.jsonl>
128
+ ```
129
+
130
+ Only write files after review:
131
+
132
+ ```bash
133
+ agentops eval promote-traces --source <traces.jsonl> --apply
134
+ ```
135
+
136
+ Default `self-similarity` labels are for drift detection, not human-verified
137
+ ground truth. Use `--label-mode pending` when reviewers must fill expected
138
+ answers before the dataset gates releases.
139
+
140
+ ## Step 6 - (Optional) Foundry execution / visibility
141
+
142
+ Two modes are supported. Both write a deep-link into
143
+ `.agentops/results/latest/cloud_evaluation.json` and require
144
+ `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` (or the inline `project_endpoint`).
145
+
146
+ **Classic Foundry Evaluations panel** (works for any target kind):
147
+ AgentOps runs locally first, then uploads the metrics it computed.
148
+
149
+ ```yaml
150
+ execution: local
151
+ publish: true
152
+ # project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
153
+ ```
154
+
155
+ **New Foundry Evaluations panel** (preview): Foundry runs the agent +
156
+ evaluators server-side via the OpenAI Evals API. Only works for
157
+ `name:version` Foundry agents. `publish` is implicit - a cloud run is
158
+ always recorded by Foundry. The local JSONL remains the dataset source of
159
+ truth; AgentOps syncs it to Foundry Data/Datasets by default and uses that
160
+ dataset version in the Evals run.
161
+
162
+ ```yaml
163
+ execution: cloud
164
+ # project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
165
+ dataset_sync:
166
+ mode: auto # sync local JSONL to Foundry Data/Datasets
167
+ ```
168
+
169
+ With `execution: local` and no `publish: true`, AgentOps runs locally
170
+ and only writes local artifacts.
171
+
172
+ After a cloud run, inspect `.agentops/results/latest/cloud_evaluation.json`.
173
+ Its `dataset` block explains whether the run used inline rows or a Foundry
174
+ dataset reference.
175
+
176
+ ## Tips
177
+
178
+ - Evaluators are auto-selected from the agent type and dataset columns.
179
+ Override only when needed via the `evaluators:` block - most users do
180
+ not need it.
181
+ - Set thresholds in `thresholds:` to gate CI:
182
+ ```yaml
183
+ thresholds:
184
+ coherence: ">=3"
185
+ avg_latency_seconds: "<=10"
186
+ ```
187
+ - For HTTP/JSON agents that need auth, set
188
+ `auth_header_env: MY_TOKEN_VAR` and AgentOps adds
189
+ `Authorization: Bearer $MY_TOKEN_VAR`.
@@ -0,0 +1,71 @@
1
+ ---
2
+ name: agentops-report
3
+ description: Read, regenerate, and explain AgentOps release-gate reports. Trigger on "show report", "explain scores", "regenerate report", "what do these metrics mean", "where is the proof". Operates on results.json and report.md produced by `agentops eval run`.
4
+ ---
5
+
6
+ # AgentOps Report
7
+
8
+ Help the user understand a finished AgentOps run and the evidence it provides
9
+ for the release decision. Reports explain the repo-side gate; they do not
10
+ replace Foundry Evaluations, Traces, or Monitor drilldown.
11
+
12
+ ## Step 0 - Locate the run
13
+
14
+ Latest run: `.agentops/results/latest/`. Each run produces:
15
+
16
+ - `results.json` - machine-readable metrics, per-row scores, thresholds.
17
+ - `report.md` - human-readable summary suitable for PR comments.
18
+ - `cloud_evaluation.json` (when Foundry visibility is enabled) - deep-link
19
+ to the Foundry Evaluations panel. `mode: classic` means an `execution: local`
20
+ run with `publish: true` uploaded its metrics to Classic Foundry; `mode: cloud` means
21
+ an `execution: cloud` run executed server-side via the OpenAI Evals API.
22
+
23
+ ## Step 1 - Regenerate report.md if needed
24
+
25
+ ```bash
26
+ agentops report generate # uses .agentops/results/latest/results.json
27
+ agentops report generate --in <results.json> --out <report.md>
28
+ ```
29
+
30
+ `report generate` always reads the flat 1.0 results schema and emits
31
+ Markdown. There is no HTML format.
32
+
33
+ ## Step 2 - Explain the metrics
34
+
35
+ Common metrics and their meaning:
36
+
37
+ | Metric | Range | Higher is better? | Notes |
38
+ |---|---|---|---|
39
+ | `similarity` | 1-5 | yes | LLM-judged similarity to `expected`. |
40
+ | `coherence` | 1-5 | yes | Answer is internally consistent. |
41
+ | `fluency` | 1-5 | yes | Natural language quality. |
42
+ | `groundedness` | 1-5 | yes | Answer is supported by `context` (RAG). |
43
+ | `relevance` | 1-5 | yes | Answer is on-topic for `input`. |
44
+ | `f1_score` | 0-1 | yes | Token overlap with `expected`. |
45
+ | `tool_call_accuracy` | 0-1 | yes | Predicted tool calls match `tool_calls`. |
46
+ | `intent_resolution` | 0-1 | yes | User intent was resolved. |
47
+ | `task_completion` | 0-1 | yes | Multi-step task finished. |
48
+ | `avg_latency_seconds` | seconds | no | Average wall-clock latency per row. |
49
+
50
+ Pass/fail rows are derived from `thresholds:` in `agentops.yaml`. The
51
+ exit code of the original run reflects the gate:
52
+
53
+ - `0` → all thresholds passed
54
+ - `2` → one or more thresholds failed
55
+ - `1` → runtime error
56
+
57
+ ## Step 3 - Help the user act on results
58
+
59
+ - For low scores on a specific metric, point at the lowest-scoring rows
60
+ in `results.json` (`row_metrics[]` and `item_evaluations[]`) and
61
+ suggest concrete prompt or retrieval changes.
62
+ - For latency regressions, look at `run_metrics.avg_latency_seconds` and
63
+ per-row latency.
64
+ - To compare a new run against a previous one, re-run with
65
+ `agentops eval run --baseline <previous-results.json>` and explain the
66
+ generated **Comparison vs Baseline** section.
67
+
68
+ ## Guardrails
69
+
70
+ - Never invent metric values. If a metric is absent, say so.
71
+ - Do not edit `results.json` by hand - re-run the eval.