agentops-accelerator 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentops/__init__.py +10 -0
- agentops/__main__.py +6 -0
- agentops/agent/__init__.py +12 -0
- agentops/agent/_legacy_ids.py +92 -0
- agentops/agent/analyzer.py +207 -0
- agentops/agent/checks/__init__.py +1 -0
- agentops/agent/checks/catalog.py +880 -0
- agentops/agent/checks/errors.py +279 -0
- agentops/agent/checks/foundry_config.py +75 -0
- agentops/agent/checks/latency.py +84 -0
- agentops/agent/checks/opex.py +157 -0
- agentops/agent/checks/opex_workspace.py +874 -0
- agentops/agent/checks/posture.py +36 -0
- agentops/agent/checks/posture_rules/__init__.py +53 -0
- agentops/agent/checks/posture_rules/content_filter.py +59 -0
- agentops/agent/checks/posture_rules/diagnostics.py +74 -0
- agentops/agent/checks/posture_rules/local_auth.py +55 -0
- agentops/agent/checks/posture_rules/managed_identity.py +59 -0
- agentops/agent/checks/posture_rules/network.py +68 -0
- agentops/agent/checks/regression.py +78 -0
- agentops/agent/checks/release_readiness.py +182 -0
- agentops/agent/checks/safety.py +247 -0
- agentops/agent/checks/spec_conformance.py +375 -0
- agentops/agent/cockpit.py +5159 -0
- agentops/agent/config.py +240 -0
- agentops/agent/findings.py +113 -0
- agentops/agent/history.py +142 -0
- agentops/agent/knowledge/__init__.py +182 -0
- agentops/agent/knowledge/waf-checklist.csv +39 -0
- agentops/agent/llm_assist/__init__.py +16 -0
- agentops/agent/llm_assist/_base.py +124 -0
- agentops/agent/llm_assist/_bundle_rule.py +154 -0
- agentops/agent/llm_assist/_client.py +347 -0
- agentops/agent/llm_assist/_dataset_rules.py +191 -0
- agentops/agent/llm_assist/_engine.py +106 -0
- agentops/agent/llm_assist/_prompt_rules.py +291 -0
- agentops/agent/llm_assist/_spec_rules.py +235 -0
- agentops/agent/production_telemetry.py +430 -0
- agentops/agent/report.py +207 -0
- agentops/agent/server/__init__.py +1 -0
- agentops/agent/server/app.py +84 -0
- agentops/agent/server/auth.py +94 -0
- agentops/agent/server/chat.py +44 -0
- agentops/agent/server/protocol.py +72 -0
- agentops/agent/sources/__init__.py +1 -0
- agentops/agent/sources/azure_monitor.py +523 -0
- agentops/agent/sources/azure_resources.py +602 -0
- agentops/agent/sources/foundry_control.py +174 -0
- agentops/agent/sources/results_history.py +494 -0
- agentops/agent/sources/spec_detectors/__init__.py +42 -0
- agentops/agent/sources/spec_detectors/_base.py +58 -0
- agentops/agent/sources/spec_detectors/agents_md.py +75 -0
- agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
- agentops/agent/time_range.py +117 -0
- agentops/cli/__init__.py +1 -0
- agentops/cli/app.py +4823 -0
- agentops/core/__init__.py +1 -0
- agentops/core/agentops_config.py +592 -0
- agentops/core/config_loader.py +22 -0
- agentops/core/evaluators.py +480 -0
- agentops/core/release_evidence.py +56 -0
- agentops/core/results.py +117 -0
- agentops/mcp/__init__.py +10 -0
- agentops/mcp/server.py +232 -0
- agentops/pipeline/__init__.py +8 -0
- agentops/pipeline/cloud_results.py +189 -0
- agentops/pipeline/cloud_runner.py +901 -0
- agentops/pipeline/comparison.py +108 -0
- agentops/pipeline/diagnostics.py +51 -0
- agentops/pipeline/invocations.py +535 -0
- agentops/pipeline/official_eval.py +414 -0
- agentops/pipeline/orchestrator.py +775 -0
- agentops/pipeline/prompt_deploy.py +377 -0
- agentops/pipeline/publisher.py +121 -0
- agentops/pipeline/reporter.py +202 -0
- agentops/pipeline/runtime.py +409 -0
- agentops/pipeline/thresholds.py +84 -0
- agentops/services/__init__.py +1 -0
- agentops/services/cicd.py +720 -0
- agentops/services/eval_analysis.py +848 -0
- agentops/services/evidence_pack.py +757 -0
- agentops/services/initializer.py +86 -0
- agentops/services/preflight.py +470 -0
- agentops/services/setup_wizard.py +709 -0
- agentops/services/skills.py +643 -0
- agentops/services/trace_promotion.py +300 -0
- agentops/services/workflow_analysis.py +1129 -0
- agentops/templates/.gitignore +15 -0
- agentops/templates/__init__.py +1 -0
- agentops/templates/agent-server/Dockerfile +23 -0
- agentops/templates/agent-server/README.md +61 -0
- agentops/templates/agent-server/main.bicep +94 -0
- agentops/templates/agent.yaml +87 -0
- agentops/templates/agentops.yaml +58 -0
- agentops/templates/foundry.svg +71 -0
- agentops/templates/icon.png +0 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
- agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
- agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
- agentops/templates/project.gitignore +36 -0
- agentops/templates/sample-traces.jsonl +3 -0
- agentops/templates/skills/agentops-agent/SKILL.md +137 -0
- agentops/templates/skills/agentops-config/SKILL.md +113 -0
- agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
- agentops/templates/skills/agentops-eval/SKILL.md +189 -0
- agentops/templates/skills/agentops-report/SKILL.md +71 -0
- agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
- agentops/templates/smoke.jsonl +3 -0
- agentops/templates/waf-checklist.README.md +84 -0
- agentops/templates/waf-checklist.csv +22 -0
- agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
- agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
- agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
- agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
- agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
- agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
- agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
- agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/workflows/agentops-pr.yml +148 -0
- agentops/templates/workflows/agentops-watchdog.yml +122 -0
- agentops/utils/__init__.py +1 -0
- agentops/utils/azd_env.py +435 -0
- agentops/utils/azure_endpoints.py +62 -0
- agentops/utils/colors.py +47 -0
- agentops/utils/dotenv_loader.py +105 -0
- agentops/utils/foundry_discovery.py +229 -0
- agentops/utils/logging.py +59 -0
- agentops/utils/telemetry.py +554 -0
- agentops/utils/yaml.py +36 -0
- agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
- agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
- agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
- agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
- agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
- agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentops-config
|
|
3
|
+
description: Generate or update agentops.yaml (flat 1.0 schema) for AgentOps release-readiness gates. Trigger on "configure agentops", "agentops.yaml", "set up evaluation", "what should I evaluate". Infer the agent target and dataset from the codebase; ask only when nothing can be found.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentOps Config
|
|
7
|
+
|
|
8
|
+
Generate `agentops.yaml` at the project root. This file references the agent
|
|
9
|
+
candidate and dataset used to answer "can we ship it?" The flat schema has only
|
|
10
|
+
a handful of fields - most projects need just `version`, `agent`, and
|
|
11
|
+
`dataset`.
|
|
12
|
+
|
|
13
|
+
This skill configures AgentOps release gates. It does **not** create or deploy
|
|
14
|
+
Foundry agents. If the user needs to create/deploy a Prompt Agent or Hosted
|
|
15
|
+
Agent first, hand off to Foundry Toolkit / the `microsoft-foundry` skill / azd,
|
|
16
|
+
then return here once there is a `name:version` or URL.
|
|
17
|
+
|
|
18
|
+
## Step 0 - Prerequisites
|
|
19
|
+
|
|
20
|
+
1. `pip install "agentops-accelerator[foundry] @ git+https://github.com/Azure/agentops.git@main"` if `agentops` is missing.
|
|
21
|
+
2. Run `agentops eval analyze` first. If it reports missing or ambiguous
|
|
22
|
+
target/dataset/scenario signals, use this skill to adapt the config.
|
|
23
|
+
3. If `agentops.yaml` does not exist, run `agentops init` first. The init
|
|
24
|
+
wizard already collects the agent reference and dataset path, so
|
|
25
|
+
`agentops-config` is most useful when the user wants to **tweak** an
|
|
26
|
+
existing config (add thresholds, switch to a different agent target,
|
|
27
|
+
add HTTP auth headers, etc.) rather than create one from scratch.
|
|
28
|
+
|
|
29
|
+
## Step 1 - Detect the agent target
|
|
30
|
+
|
|
31
|
+
Search the codebase for the strongest signal and pick one:
|
|
32
|
+
|
|
33
|
+
| Signal | `agent:` value |
|
|
34
|
+
|---|---|
|
|
35
|
+
| Foundry Prompt Agent ID `name:N` | `"<name>:<N>"` |
|
|
36
|
+
| Foundry Hosted Agent URL `https://...services.ai.azure.com/...agents/...` | the full URL |
|
|
37
|
+
| Any other HTTP endpoint your agent serves (FastAPI, Express, ACA, AKS) | the full URL |
|
|
38
|
+
| Direct model use (`openai.chat.completions.create(model=...)`) with no orchestration | `"model:<deployment-name>"` |
|
|
39
|
+
|
|
40
|
+
Look in: `README.md`, `main.py`/`server.py`/`app.ts`, `.agentops/.env`,
|
|
41
|
+
`.env`/`.env.local`, `.azure/<env>/.env`, `infra/`, IaC outputs. If nothing is
|
|
42
|
+
found, ask the user once.
|
|
43
|
+
|
|
44
|
+
## Step 2 - Detect the dataset
|
|
45
|
+
|
|
46
|
+
If a JSONL file with rows that include `input` already exists in the repo, use
|
|
47
|
+
its path. Otherwise leave the default `.agentops/data/smoke.jsonl` and
|
|
48
|
+
hand off to the `agentops-dataset` skill before the first run.
|
|
49
|
+
|
|
50
|
+
## Step 3 - Write agentops.yaml
|
|
51
|
+
|
|
52
|
+
Minimal example:
|
|
53
|
+
|
|
54
|
+
```yaml
|
|
55
|
+
version: 1
|
|
56
|
+
agent: "my-rag:3"
|
|
57
|
+
dataset: .agentops/data/smoke.jsonl
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
HTTP/JSON example:
|
|
61
|
+
|
|
62
|
+
```yaml
|
|
63
|
+
version: 1
|
|
64
|
+
agent: "https://my-aca-app.eastus2.azurecontainerapps.io/chat"
|
|
65
|
+
dataset: .agentops/data/smoke.jsonl
|
|
66
|
+
request_field: message # default is "message"
|
|
67
|
+
response_field: text # dot-path; default is "text"
|
|
68
|
+
auth_header_env: MY_API_TOKEN
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Optional extras (only add when the user asks for them):
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
thresholds:
|
|
75
|
+
coherence: ">=3"
|
|
76
|
+
groundedness: ">=3"
|
|
77
|
+
avg_latency_seconds: "<=30"
|
|
78
|
+
|
|
79
|
+
# Publish results to the Foundry Evaluations panel.
|
|
80
|
+
# - execution: local + publish: true → Classic Foundry (uploads metrics)
|
|
81
|
+
# - execution: cloud → New Foundry (server-side run;
|
|
82
|
+
# publish is implicit, cloud always publishes)
|
|
83
|
+
execution: local
|
|
84
|
+
publish: true
|
|
85
|
+
# project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
|
|
86
|
+
|
|
87
|
+
# Cloud dataset submission policy. The local JSONL remains the source of truth;
|
|
88
|
+
# cloud runs sync it to Foundry Data/Datasets by default.
|
|
89
|
+
dataset_sync:
|
|
90
|
+
mode: auto # auto | foundry | inline
|
|
91
|
+
# name: agentops-smoke
|
|
92
|
+
# version: content-hash
|
|
93
|
+
|
|
94
|
+
evaluators: # rare - AgentOps auto-selects from agent + dataset
|
|
95
|
+
- name: similarity
|
|
96
|
+
threshold: ">=4"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Step 4 - Validate
|
|
100
|
+
|
|
101
|
+
Run `agentops eval run` once. If the config is malformed AgentOps prints a
|
|
102
|
+
clear error pointing at the offending key. Adjust and re-run.
|
|
103
|
+
|
|
104
|
+
## Guardrails
|
|
105
|
+
|
|
106
|
+
- Do **not** add legacy keys (`bundle`, `target`, `execution`, `output`,
|
|
107
|
+
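`backend`). The 1.0 schema rejects them. The one exception is the top-level `execution: local` / `execution: cloud` value shown in Step 3, which is a valid 1.0 field.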
|
|
108
|
+
- Do **not** fabricate agent IDs, endpoint URLs, or model deployment
|
|
109
|
+
names. Ask the user when uncertain.
|
|
110
|
+
- Keep the file small. Auto-selection covers most metrics.
|
|
111
|
+
- Keep local JSONL canonical. For cloud runs, prefer `dataset_sync.mode: auto`
|
|
112
|
+
so AgentOps keeps Foundry Data/Datasets in sync; use `inline` only for quick
|
|
113
|
+
experiments or environments without dataset upload permission.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentops-dataset
|
|
3
|
+
description: Create or extend a small JSONL dataset for AgentOps release-readiness gates. Trigger on "create dataset", "generate test data", "JSONL", "more eval rows". Infer the agent's domain from the codebase and produce realistic rows; never fabricate data when the domain is unclear.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentOps Dataset
|
|
7
|
+
|
|
8
|
+
Generate a small, realistic JSONL dataset for the agent under evaluation.
|
|
9
|
+
Default location: `.agentops/data/smoke.jsonl` (referenced from
|
|
10
|
+
`agentops.yaml`). These rows are repo-side release-gate inputs: keep them
|
|
11
|
+
reviewable and deterministic, not a full replacement for Foundry dataset
|
|
12
|
+
management.
|
|
13
|
+
|
|
14
|
+
## Step 0 - Prerequisites
|
|
15
|
+
|
|
16
|
+
1. `pip install "agentops-accelerator @ git+https://github.com/Azure/agentops.git@main"` if `agentops` is missing.
|
|
17
|
+
2. Run `agentops eval analyze` first. If it reports missing dataset columns or
|
|
18
|
+
recommends `agentops-dataset`, use this skill before the first eval run.
|
|
19
|
+
3. If `agentops.yaml` does not exist, run `agentops init` first (the init
|
|
20
|
+
wizard will prompt for the agent reference, project endpoint, and
|
|
21
|
+
dataset path, then create a starter `.agentops/data/smoke.jsonl`).
|
|
22
|
+
|
|
23
|
+
## Step 1 - Pick the columns
|
|
24
|
+
|
|
25
|
+
Read `agentops.yaml` (and the agent code) to figure out the agent type,
|
|
26
|
+
then choose the row schema:
|
|
27
|
+
|
|
28
|
+
| Agent type | Required columns | Optional columns |
|
|
29
|
+
|---|---|---|
|
|
30
|
+
| Direct model / Q&A | `input`, `expected` | - |
|
|
31
|
+
| RAG | `input`, `expected`, `context` | - |
|
|
32
|
+
| Conversational | `input`, `expected` | - |
|
|
33
|
+
| Tool-using agent | `input`, `expected`, `tool_calls` | `tool_definitions` |
|
|
34
|
+
|
|
35
|
+
`input` is always the user prompt. `expected` is the gold answer.
|
|
36
|
+
`context` is the retrieved passage(s). `tool_calls` is a list of
|
|
37
|
+
`{name, arguments}` describing the expected tool invocations.
|
|
38
|
+
|
|
39
|
+
## Step 2 - Ground the rows in the codebase
|
|
40
|
+
|
|
41
|
+
- Read the README, system prompt, tool definitions, and any sample
|
|
42
|
+
fixtures.
|
|
43
|
+
- Generate **5–10 rows** that exercise the agent's actual capabilities.
|
|
44
|
+
- If the domain is unclear, generate a tiny generic draft and clearly
|
|
45
|
+
flag it as a placeholder for the user to replace; do not invent domain-specific facts.
|
|
46
|
+
|
|
47
|
+
## Step 3 - Write the JSONL
|
|
48
|
+
|
|
49
|
+
One JSON object per line, no trailing commas, UTF-8:
|
|
50
|
+
|
|
51
|
+
```json
|
|
52
|
+
{"input": "What is the refund policy?", "expected": "Refunds within 30 days...", "context": "Refund policy: ..."}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Save to the path referenced by `dataset:` in `agentops.yaml` (default
|
|
56
|
+
`.agentops/data/smoke.jsonl`).
|
|
57
|
+
|
|
58
|
+
This file is the AgentOps source of truth. In Foundry cloud evaluation,
|
|
59
|
+
AgentOps syncs it to a stable Foundry dataset version by default and reuses the
|
|
60
|
+
same Foundry dataset version while the JSONL content is unchanged. If the user
|
|
61
|
+
forces `dataset_sync.mode: inline`, Foundry may show generated `eval-data-*`
|
|
62
|
+
backing assets in the project Data/Datasets page.
|
|
63
|
+
|
|
64
|
+
## Step 4 - Sanity-check
|
|
65
|
+
|
|
66
|
+
Run a quick eval and confirm rows are picked up:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
agentops eval run
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Open `.agentops/results/latest/report.md` and confirm the row count
|
|
73
|
+
matches.
|
|
74
|
+
|
|
75
|
+
## Guardrails
|
|
76
|
+
|
|
77
|
+
- Do not invent customer data, real names, or sensitive content.
|
|
78
|
+
- Keep rows short - datasets are meant to be quick gates, not full QA
|
|
79
|
+
suites.
|
|
80
|
+
- If the user already has a domain dataset, prefer pointing
|
|
81
|
+
`agentops.yaml` at that file rather than generating new rows.
|
|
82
|
+
- If the user asks why Foundry shows `eval-data-*`, explain that those are
|
|
83
|
+
cloud-eval backing assets from inline compatibility mode; normal cloud runs
|
|
84
|
+
should use the stable `agentops-*` Foundry dataset.
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentops-eval
|
|
3
|
+
description: Run AgentOps release-readiness evaluations against Foundry prompt agents, Foundry hosted endpoints, HTTP/JSON agents, or raw model deployments. Trigger on phrases like "run eval", "evaluate my agent", "benchmark", "agentops eval", "compare runs", "can we ship". Uses the flat agentops.yaml schema.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentOps Eval
|
|
7
|
+
|
|
8
|
+
End-to-end release-gate workflow: install -> init -> configure -> run -> read
|
|
9
|
+
report -> decide whether the candidate is ready to ship.
|
|
10
|
+
|
|
11
|
+
AgentOps evaluates an existing candidate. It does **not** create or deploy
|
|
12
|
+
Foundry agents. If the user still needs a Prompt Agent or Hosted Agent, hand off
|
|
13
|
+
to Foundry Toolkit / the `microsoft-foundry` skill / azd first, then come back
|
|
14
|
+
with a `name:version` or URL.
|
|
15
|
+
|
|
16
|
+
## Step 0 - Setup
|
|
17
|
+
|
|
18
|
+
1. Install if missing: `pip install "agentops-accelerator[foundry] @ git+https://github.com/Azure/agentops.git@main"`.
|
|
19
|
+
2. If `agentops.yaml` does not exist at the project root, run `agentops init`.
|
|
20
|
+
The init wizard prompts (azd-style) for the Foundry project endpoint,
|
|
21
|
+
agent reference, and dataset path, and persists each answer to
|
|
22
|
+
`.agentops/.env` + `agentops.yaml` as it goes. Existing azd workspaces, or
|
|
23
|
+
runs with `--azd-env`, use `.azure/<env>/.env` instead. Pass `--no-prompt`
|
|
24
|
+
plus the explicit flags
|
|
25
|
+
(`--project-endpoint`, `--agent`, `--dataset`, …) for non-interactive
|
|
26
|
+
runs. Run `agentops init show` later to inspect the resolved config.
|
|
27
|
+
|
|
28
|
+
## Step 1 - Analyze evaluation setup
|
|
29
|
+
|
|
30
|
+
Run the deterministic local triage first:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
agentops eval analyze
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Use its output to decide whether the repo is ready for `agentops eval run` or
|
|
37
|
+
needs skill-assisted setup. If it recommends `agentops-config`, fix the target
|
|
38
|
+
and protocol. If it recommends `agentops-dataset`, create/map realistic JSONL
|
|
39
|
+
rows. If it recommends `agentops-eval`, inspect the app scenario and evaluator
|
|
40
|
+
expectations before running.
|
|
41
|
+
|
|
42
|
+
## Step 2 - Identify the agent target
|
|
43
|
+
|
|
44
|
+
Read the codebase (README, entry point, env vars) and pick the right value
|
|
45
|
+
for the `agent:` field of `agentops.yaml`:
|
|
46
|
+
|
|
47
|
+
| Pattern in code / env | `agent:` value |
|
|
48
|
+
|---|---|
|
|
49
|
+
| Foundry Prompt Agent ID like `name:1` | `"<name>:<version>"` |
|
|
50
|
+
| Foundry Hosted Agent endpoint URL ending in `/agents/...` | `"https://<resource>.services.ai.azure.com/api/projects/<p>/agents/..."` |
|
|
51
|
+
| Plain HTTP/JSON endpoint (FastAPI, Express, ACA, AKS) | `"https://<host>/<path>"` |
|
|
52
|
+
| Raw Foundry/Azure OpenAI model deployment | `"model:<deployment-name>"` |
|
|
53
|
+
|
|
54
|
+
If nothing is found, ask the user once for the agent identifier.
|
|
55
|
+
|
|
56
|
+
## Step 3 - Make sure the dataset exists
|
|
57
|
+
|
|
58
|
+
`agentops.yaml` points to a JSONL file (default
|
|
59
|
+
`.agentops/data/smoke.jsonl`). Each row needs at least `input` and a label
|
|
60
|
+
that maps to the metric you care about (`expected`, `context`,
|
|
61
|
+
`tool_calls`...). If the dataset is empty or unrelated, run the
|
|
62
|
+
`agentops-dataset` skill before running the eval.
|
|
63
|
+
|
|
64
|
+
## Step 4 - Run the evaluation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
agentops eval run
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Optional flags:
|
|
71
|
+
|
|
72
|
+
- `--config <path>` - point at a different `agentops.yaml`.
|
|
73
|
+
- `--output <dir>` - choose where to write `results.json` and `report.md`
|
|
74
|
+
(defaults to `.agentops/results/<timestamp>/`).
|
|
75
|
+
|
|
76
|
+
Exit codes:
|
|
77
|
+
|
|
78
|
+
- `0` - succeeded and all thresholds passed
|
|
79
|
+
- `2` - succeeded but at least one threshold failed (gate-friendly)
|
|
80
|
+
- `1` - runtime/configuration error
|
|
81
|
+
|
|
82
|
+
## Step 4b - Pick the right execution path
|
|
83
|
+
|
|
84
|
+
| Target | Foundry server-side eval through AgentOps | AgentOps local runner | Default guidance |
|
|
85
|
+
|---|---|---|---|
|
|
86
|
+
| Foundry Prompt Agent (`name:version`) | `execution: cloud` | yes | Use cloud when the user wants the official Foundry-hosted eval record; use local for fast feedback or fallback. |
|
|
87
|
+
| Foundry Hosted Agent URL | no | yes | Use local runner; optionally set `publish: true` to upload local metrics to Classic Foundry. |
|
|
88
|
+
| Generic HTTP/JSON endpoint | no | yes | Use local runner. |
|
|
89
|
+
| Raw model deployment | no | yes | Use local runner. |
|
|
90
|
+
|
|
91
|
+
For prompt-agent CI gates, prefer AgentOps cloud eval because Foundry executes
|
|
92
|
+
the managed eval while AgentOps enforces thresholds and writes normalized
|
|
93
|
+
`results.json` / `report.md` artifacts. The official AI Agent Evaluation GitHub
|
|
94
|
+
Action or Azure DevOps extension is still useful for standalone platform-native
|
|
95
|
+
validation, but do not substitute it for the AgentOps PR gate when the user needs
|
|
96
|
+
threshold enforcement, baselines, Doctor readiness, release evidence, or local
|
|
97
|
+
fallback.
|
|
98
|
+
|
|
99
|
+
## Step 5 - Inspect results and release evidence
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
agentops report generate # regenerate report.md from latest results.json
|
|
103
|
+
agentops report generate --in <results.json>
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Open `.agentops/results/latest/report.md`. To compare two runs, hand both
|
|
107
|
+
`results.json` files to the user or run the next eval with
|
|
108
|
+
`--baseline <previous-results.json>` so AgentOps adds a **Comparison vs
|
|
109
|
+
Baseline** section to the report.
|
|
110
|
+
|
|
111
|
+
For production promotion, generate the Doctor evidence pack:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
agentops doctor --evidence-pack
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Open `.agentops/release/latest/evidence.md`. It summarizes eval, baseline,
|
|
118
|
+
Doctor, workflow, Foundry, monitoring, AI Landing Zone, and trace-regression
|
|
119
|
+
readiness without creating a second exit-code contract.
|
|
120
|
+
|
|
121
|
+
## Step 5b - (Optional) Promote reviewed traces to regression rows
|
|
122
|
+
|
|
123
|
+
If the user has exported Foundry/App Insights traces, preview candidate
|
|
124
|
+
regression rows first:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
agentops eval promote-traces --source <traces.jsonl>
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Only write files after review:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
agentops eval promote-traces --source <traces.jsonl> --apply
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Default `self-similarity` labels are for drift detection, not human-verified
|
|
137
|
+
ground truth. Use `--label-mode pending` when reviewers must fill expected
|
|
138
|
+
answers before the dataset gates releases.
|
|
139
|
+
|
|
140
|
+
## Step 6 - (Optional) Foundry execution / visibility
|
|
141
|
+
|
|
142
|
+
Two modes are supported. Both write a deep-link into
|
|
143
|
+
`.agentops/results/latest/cloud_evaluation.json` and require
|
|
144
|
+
`AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` (or the inline `project_endpoint`).
|
|
145
|
+
|
|
146
|
+
**Classic Foundry Evaluations panel** (works for any target kind):
|
|
147
|
+
AgentOps runs locally first, then uploads the metrics it computed.
|
|
148
|
+
|
|
149
|
+
```yaml
|
|
150
|
+
execution: local
|
|
151
|
+
publish: true
|
|
152
|
+
# project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
**New Foundry Evaluations panel** (preview): Foundry runs the agent +
|
|
156
|
+
evaluators server-side via the OpenAI Evals API. Only works for
|
|
157
|
+
`name:version` Foundry agents. `publish` is implicit - a cloud run is
|
|
158
|
+
always recorded by Foundry. The local JSONL remains the dataset source of
|
|
159
|
+
truth; AgentOps syncs it to Foundry Data/Datasets by default and uses that
|
|
160
|
+
dataset version in the Evals run.
|
|
161
|
+
|
|
162
|
+
```yaml
|
|
163
|
+
execution: cloud
|
|
164
|
+
# project_endpoint: "https://<resource>.services.ai.azure.com/api/projects/<p>"
|
|
165
|
+
dataset_sync:
|
|
166
|
+
mode: auto # sync local JSONL to Foundry Data/Datasets
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
With `execution: local` and no `publish: true`, AgentOps runs locally
|
|
170
|
+
and only writes local artifacts.
|
|
171
|
+
|
|
172
|
+
After a cloud run, inspect `.agentops/results/latest/cloud_evaluation.json`.
|
|
173
|
+
Its `dataset` block explains whether the run used inline rows or a Foundry
|
|
174
|
+
dataset reference.
|
|
175
|
+
|
|
176
|
+
## Tips
|
|
177
|
+
|
|
178
|
+
- Evaluators are auto-selected from the agent type and dataset columns.
|
|
179
|
+
Override only when needed via the `evaluators:` block - most users do
|
|
180
|
+
not need it.
|
|
181
|
+
- Set thresholds in `thresholds:` to gate CI:
|
|
182
|
+
```yaml
|
|
183
|
+
thresholds:
|
|
184
|
+
coherence: ">=3"
|
|
185
|
+
avg_latency_seconds: "<=10"
|
|
186
|
+
```
|
|
187
|
+
- For HTTP/JSON agents that need auth, set
|
|
188
|
+
`auth_header_env: MY_TOKEN_VAR` and AgentOps adds
|
|
189
|
+
`Authorization: Bearer $MY_TOKEN_VAR`.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: agentops-report
|
|
3
|
+
description: Read, regenerate, and explain AgentOps release-gate reports. Trigger on "show report", "explain scores", "regenerate report", "what do these metrics mean", "where is the proof". Operates on results.json and report.md produced by `agentops eval run`.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# AgentOps Report
|
|
7
|
+
|
|
8
|
+
Help the user understand a finished AgentOps run and the evidence it provides
|
|
9
|
+
for the release decision. Reports explain the repo-side gate; they do not
|
|
10
|
+
replace Foundry Evaluations, Traces, or Monitor drilldown.
|
|
11
|
+
|
|
12
|
+
## Step 0 - Locate the run
|
|
13
|
+
|
|
14
|
+
Latest run: `.agentops/results/latest/`. Each run produces:
|
|
15
|
+
|
|
16
|
+
- `results.json` - machine-readable metrics, per-row scores, thresholds.
|
|
17
|
+
- `report.md` - human-readable summary suitable for PR comments.
|
|
18
|
+
- `cloud_evaluation.json` (when Foundry visibility is enabled) - deep-link
|
|
19
|
+
to the Foundry Evaluations panel. `mode: classic` when `execution: local`
|
|
20
|
+
and `publish: true` upload metrics to Classic Foundry; `mode: cloud` when
|
|
21
|
+
`execution: cloud` runs server-side via the OpenAI Evals API.
|
|
22
|
+
|
|
23
|
+
## Step 1 - Regenerate report.md if needed
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
agentops report generate # uses .agentops/results/latest/results.json
|
|
27
|
+
agentops report generate --in <results.json> --out <report.md>
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
`report generate` always reads the flat 1.0 results schema and emits
|
|
31
|
+
Markdown. There is no HTML format.
|
|
32
|
+
|
|
33
|
+
## Step 2 - Explain the metrics
|
|
34
|
+
|
|
35
|
+
Common metrics and their meaning:
|
|
36
|
+
|
|
37
|
+
| Metric | Range | Higher is better? | Notes |
|
|
38
|
+
|---|---|---|---|
|
|
39
|
+
| `similarity` | 1-5 | yes | LLM-judged similarity to `expected`. |
|
|
40
|
+
| `coherence` | 1-5 | yes | Answer is internally consistent. |
|
|
41
|
+
| `fluency` | 1-5 | yes | Natural language quality. |
|
|
42
|
+
| `groundedness` | 1-5 | yes | Answer is supported by `context` (RAG). |
|
|
43
|
+
| `relevance` | 1-5 | yes | Answer is on-topic for `input`. |
|
|
44
|
+
| `f1_score` | 0-1 | yes | Token overlap with `expected`. |
|
|
45
|
+
| `tool_call_accuracy` | 0-1 | yes | Predicted tool calls match `tool_calls`. |
|
|
46
|
+
| `intent_resolution` | 0-1 | yes | User intent was resolved. |
|
|
47
|
+
| `task_completion` | 0-1 | yes | Multi-step task finished. |
|
|
48
|
+
| `avg_latency_seconds` | seconds | no | Wall-clock latency per row. |
|
|
49
|
+
|
|
50
|
+
Pass/fail rows are derived from `thresholds:` in `agentops.yaml`. The
|
|
51
|
+
exit code of the original run reflects the gate:
|
|
52
|
+
|
|
53
|
+
- `0` → all thresholds passed
|
|
54
|
+
- `2` → one or more thresholds failed
|
|
55
|
+
- `1` → runtime error
|
|
56
|
+
|
|
57
|
+
## Step 3 - Help the user act on results
|
|
58
|
+
|
|
59
|
+
- For low scores on a specific metric, point at the lowest-scoring rows
|
|
60
|
+
in `results.json` (`row_metrics[]` and `item_evaluations[]`) and
|
|
61
|
+
suggest concrete prompt or retrieval changes.
|
|
62
|
+
- For latency regressions, look at `run_metrics.avg_latency_seconds` and
|
|
63
|
+
per-row latency.
|
|
64
|
+
- To compare a new run against a previous one, re-run with
|
|
65
|
+
`agentops eval run --baseline <previous-results.json>` and explain the
|
|
66
|
+
generated **Comparison vs Baseline** section.
|
|
67
|
+
|
|
68
|
+
## Guardrails
|
|
69
|
+
|
|
70
|
+
- Never invent metric values. If a metric is absent, say so.
|
|
71
|
+
- Do not edit `results.json` by hand - re-run the eval.
|