agentops-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. agentops_toolkit-0.1.0/PKG-INFO +704 -0
  2. agentops_toolkit-0.1.0/README.md +657 -0
  3. agentops_toolkit-0.1.0/pyproject.toml +107 -0
  4. agentops_toolkit-0.1.0/src/agentops_toolkit/__init__.py +3 -0
  5. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/__init__.py +30 -0
  6. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/agent_service.py +79 -0
  7. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/autogen.py +81 -0
  8. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/generic.py +121 -0
  9. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/registry.py +158 -0
  10. agentops_toolkit-0.1.0/src/agentops_toolkit/adapters/semantic_kernel.py +92 -0
  11. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/agent_quality.yaml +22 -0
  12. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/agent_safety.yaml +20 -0
  13. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/custom.yaml +9 -0
  14. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/multi_agent_quality.yaml +25 -0
  15. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_agentic_retrieval.yaml +23 -0
  16. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_complete.yaml +36 -0
  17. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_cross_iq.yaml +22 -0
  18. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_fabric_iq.yaml +22 -0
  19. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_foundry_iq.yaml +26 -0
  20. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_permission_aware.yaml +21 -0
  21. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_quality.yaml +22 -0
  22. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_safety.yaml +20 -0
  23. agentops_toolkit-0.1.0/src/agentops_toolkit/bundles/rag_work_iq.yaml +22 -0
  24. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/__init__.py +1 -0
  25. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/app.py +67 -0
  26. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/bundle_cmd.py +214 -0
  27. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/config_cmd.py +200 -0
  28. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/dataset_cmd.py +317 -0
  29. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/eval_cmd.py +173 -0
  30. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/init_cmd.py +224 -0
  31. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/model_cmd.py +83 -0
  32. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/monitor_cmd.py +91 -0
  33. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/report_cmd.py +258 -0
  34. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/run_cmd.py +151 -0
  35. agentops_toolkit-0.1.0/src/agentops_toolkit/cli/trace_cmd.py +82 -0
  36. agentops_toolkit-0.1.0/src/agentops_toolkit/connectors/__init__.py +4 -0
  37. agentops_toolkit-0.1.0/src/agentops_toolkit/core/__init__.py +5 -0
  38. agentops_toolkit-0.1.0/src/agentops_toolkit/core/aggregator.py +4 -0
  39. agentops_toolkit-0.1.0/src/agentops_toolkit/core/bundle_registry.py +127 -0
  40. agentops_toolkit-0.1.0/src/agentops_toolkit/core/client.py +4 -0
  41. agentops_toolkit-0.1.0/src/agentops_toolkit/core/config_loader.py +157 -0
  42. agentops_toolkit-0.1.0/src/agentops_toolkit/core/errors.py +116 -0
  43. agentops_toolkit-0.1.0/src/agentops_toolkit/core/foundry_client.py +50 -0
  44. agentops_toolkit-0.1.0/src/agentops_toolkit/core/foundry_sdk_client.py +139 -0
  45. agentops_toolkit-0.1.0/src/agentops_toolkit/core/hooks.py +4 -0
  46. agentops_toolkit-0.1.0/src/agentops_toolkit/core/logging.py +78 -0
  47. agentops_toolkit-0.1.0/src/agentops_toolkit/core/persistence.py +4 -0
  48. agentops_toolkit-0.1.0/src/agentops_toolkit/core/pipeline.py +291 -0
  49. agentops_toolkit-0.1.0/src/agentops_toolkit/core/rate_limiter.py +4 -0
  50. agentops_toolkit-0.1.0/src/agentops_toolkit/core/registry.py +4 -0
  51. agentops_toolkit-0.1.0/src/agentops_toolkit/core/runner.py +4 -0
  52. agentops_toolkit-0.1.0/src/agentops_toolkit/evaluators/__init__.py +5 -0
  53. agentops_toolkit-0.1.0/src/agentops_toolkit/evaluators/base.py +166 -0
  54. agentops_toolkit-0.1.0/src/agentops_toolkit/evaluators/citation.py +131 -0
  55. agentops_toolkit-0.1.0/src/agentops_toolkit/evaluators/rag_iq.py +179 -0
  56. agentops_toolkit-0.1.0/src/agentops_toolkit/mcp/__init__.py +4 -0
  57. agentops_toolkit-0.1.0/src/agentops_toolkit/mcp/client.py +93 -0
  58. agentops_toolkit-0.1.0/src/agentops_toolkit/models/__init__.py +68 -0
  59. agentops_toolkit-0.1.0/src/agentops_toolkit/models/bundle.py +47 -0
  60. agentops_toolkit-0.1.0/src/agentops_toolkit/models/config.py +229 -0
  61. agentops_toolkit-0.1.0/src/agentops_toolkit/models/dataset.py +60 -0
  62. agentops_toolkit-0.1.0/src/agentops_toolkit/models/observability.py +4 -0
  63. agentops_toolkit-0.1.0/src/agentops_toolkit/models/rag.py +4 -0
  64. agentops_toolkit-0.1.0/src/agentops_toolkit/models/run.py +146 -0
  65. agentops_toolkit-0.1.0/src/agentops_toolkit/obs/__init__.py +9 -0
  66. agentops_toolkit-0.1.0/src/agentops_toolkit/obs/decorators.py +69 -0
  67. agentops_toolkit-0.1.0/src/agentops_toolkit/obs/monitor.py +84 -0
  68. agentops_toolkit-0.1.0/src/agentops_toolkit/obs/tracing.py +148 -0
@@ -0,0 +1,704 @@
1
+ Metadata-Version: 2.3
2
+ Name: agentops-toolkit
3
+ Version: 0.1.0
4
+ Summary: CLI toolkit for evaluating, tracing, and monitoring AI agents on Azure AI Foundry
5
+ Keywords: ai,agent,evaluation,azure,foundry,observability
6
+ Author: DB Lee
7
+ Author-email: DB Lee <donlee@microsoft.com>
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Software Development :: Testing
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Dist: azure-ai-evaluation>=1.0.0b1,<2.0.0
18
+ Requires-Dist: azure-ai-projects>=1.0.0,<3.0.0
19
+ Requires-Dist: azure-identity>=1.17.0,<2.0.0
20
+ Requires-Dist: typer[all]>=0.12.0,<1.0.0
21
+ Requires-Dist: rich>=13.0.0
22
+ Requires-Dist: ruamel-yaml>=0.18.0
23
+ Requires-Dist: pydantic>=2.5.0,<3.0.0
24
+ Requires-Dist: aiofiles>=24.0.0
25
+ Requires-Dist: httpx>=0.27.0,<1.0.0
26
+ Requires-Dist: pytest>=8.0.0 ; extra == 'dev'
27
+ Requires-Dist: pytest-asyncio>=0.24.0 ; extra == 'dev'
28
+ Requires-Dist: pytest-cov>=5.0.0 ; extra == 'dev'
29
+ Requires-Dist: ruff>=0.8.0 ; extra == 'dev'
30
+ Requires-Dist: mypy>=1.11.0 ; extra == 'dev'
31
+ Requires-Dist: pre-commit>=4.0.0 ; extra == 'dev'
32
+ Requires-Dist: azure-search-documents>=11.6.0b6,<12.0.0 ; extra == 'iq'
33
+ Requires-Dist: azure-monitor-opentelemetry>=1.6.0,<2.0.0 ; extra == 'observability'
34
+ Requires-Dist: opentelemetry-sdk>=1.25.0,<2.0.0 ; extra == 'observability'
35
+ Requires-Dist: azure-core-tracing-opentelemetry>=1.0.0b11,<2.0.0 ; extra == 'observability'
36
+ Requires-Dist: opentelemetry-instrumentation-openai-v2>=2.0.0,<3.0.0 ; extra == 'observability'
37
+ Requires-Dist: opentelemetry-exporter-otlp>=1.25.0,<2.0.0 ; extra == 'observability'
38
+ Requires-Python: >=3.12
39
+ Project-URL: Homepage, https://github.com/mcaps-microsoft/agentops-toolkit
40
+ Project-URL: Documentation, https://github.com/mcaps-microsoft/agentops-toolkit#readme
41
+ Project-URL: Repository, https://github.com/mcaps-microsoft/agentops-toolkit
42
+ Project-URL: Issues, https://github.com/mcaps-microsoft/agentops-toolkit/issues
43
+ Provides-Extra: dev
44
+ Provides-Extra: iq
45
+ Provides-Extra: observability
46
+ Description-Content-Type: text/markdown
47
+
48
+ # AgentOps Toolkit
49
+
50
+ > **Evaluate, trace, and monitor AI agents — from terminal to production.**
51
+
52
+ AgentOps is an open-source CLI toolkit that makes it easy to evaluate AI agent applications using [Azure AI Foundry](https://ai.azure.com) evaluators. It provides prescriptive, best-practice bundles of evaluators for common agent patterns — RAG, tool-using agents, and multi-agent systems — so you can go from prototype to production-grade evaluation in minutes, not days.
53
+
54
+ ```
55
+ pip install agentops-toolkit
56
+ ```
57
+
58
+ ---
59
+
60
+ ## Why AgentOps?
61
+
62
+ | Problem | AgentOps Solution |
63
+ |---|---|
64
+ | Evaluation setup requires significant glue code | One command: `agentops init` scaffolds everything |
65
+ | No standard approach — every team builds differently | Prescriptive evaluator bundles per use case |
66
+ | No CLI workflow for Foundry evaluation | Full CLI with `agentops eval`, `agentops run`, `agentops report` |
67
+ | Model migration breaks things silently | `agentops eval migrate` compares models side-by-side |
68
+ | Copilot users want natural language | `@agentops evaluate my RAG agent` in VS Code Chat |
69
+
70
+ ---
71
+
72
+ ## Feature Overview
73
+
74
+ ### Phase 1 — Evaluation (GA)
75
+
76
+ | Feature | Command | Description |
77
+ |---|---|---|
78
+ | **Project scaffolding** | `agentops init` | One-command setup with config, bundles, sample dataset, and directory structure |
79
+ | **Evaluator bundles** | `agentops bundle list\|show\|create` | 13 pre-built bundles for RAG, agents, multi-agent, and IQ knowledge layers + custom bundle creation |
80
+ | **Dataset management** | `agentops dataset list\|validate\|import\|describe` | JSONL/CSV/JSON datasets with schema validation, column coverage checks, and statistics |
81
+ | **Agent execution** | `agentops run start` | Run agent against dataset with async concurrency, streaming persistence, and crash recovery |
82
+ | **Evaluation engine** | `agentops eval run\|entry` | Score agent outputs using 15 Azure AI Foundry evaluators (quality + safety + agent-specific) |
83
+ | **Reporting** | `agentops report show\|export` | Rich terminal reports, markdown for PRs, HTML for stakeholders, JSON for automation |
84
+ | **Run comparison** | `agentops run compare` | Side-by-side evaluator deltas with regression detection |
85
+ | **Custom evaluators** | `CustomEvaluator` subclass | Add domain-specific metrics (finance accuracy, medical safety) via Python + YAML registration |
86
+ | **Python API** | `AgentOpsClient` | Programmatic access: `client.run()`, `client.evaluate()`, `client.compare()` |
87
+ | **CI/CD gating** | `fail_on_threshold: true` | Exit code 1 when evaluator scores drop below thresholds — CI gate ready |
88
+ | **Run management** | `agentops run list\|show` | List past runs with scores; show detailed run metadata and per-entry status |
89
+ | **Config validation** | `agentops config validate\|show` | 15 validation rules with actionable error messages, resolved config display, and env var resolution |
90
+
91
+ ### Phase 2 — Observability & Integration
92
+
93
+ | Feature | Command | Description |
94
+ |---|---|---|
95
+ | **OpenTelemetry tracing** | `agentops trace init\|instrument\|run\|list` | One-command tracing → Application Insights; auto-instruments OpenAI SDK, OpenAI Agents SDK, Semantic Kernel, LangChain |
96
+ | **Local tracing** | `agentops trace init --local` | Console, Aspire Dashboard, or AI Toolkit for VS Code — no cloud required |
97
+ | **Monitoring setup** | `agentops monitor setup\|status` | Wire Agent Monitoring Dashboard, view monitoring health |
98
+ | **Monitoring dashboards** | `agentops monitor dashboard` | Pre-built Azure Monitor templates: agent-overview, eval-quality, safety-monitor |
99
+ | **Alerting** | `agentops monitor alert create\|list` | Azure Monitor alert rules for evaluation metric regressions |
100
+ | **Continuous evaluation** | `agentops eval --continuous` | Sample production traffic, evaluate with configurable sampling rate |
101
+ | **AI Red Teaming** | `agentops eval --red-team` | Adversarial scanning via Foundry AI Red Teaming Agent |
102
+ | **Model migration** | `agentops eval migrate` | Side-by-side model comparison with statistical significance and confidence intervals |
103
+ | **CI/CD pipeline gen** | `agentops eval cicd` | Generate GitHub Actions / Azure DevOps evaluation pipelines |
104
+ | **IQ dataset generation** | `agentops dataset generate --from-iq` | Generate golden datasets from Foundry IQ knowledge bases or Work IQ M365 data |
105
+ | **IQ-specific evaluators** | `CitationAccuracy`, `PermissionCompliance`, `SourceCoverage`, `TemporalRelevance`, `AttributionAccuracy` | Evaluators for agentic retrieval, citation fidelity, ACL enforcement, data freshness, and M365 source attribution |
106
+ | **Model lifecycle** | `agentops model list\|recommend\|benchmark\|quota\|deploy\|retire` | Browse Foundry catalog, compare benchmarks, check quota, deploy/retire via MCP |
107
+ | **MCP eval backend** | `--via mcp` | Alternative evaluation path via Foundry MCP Server for interactive workflows |
108
+ | **MCP monitoring** | `agentops monitor metrics` | Pull model monitoring metrics directly from Foundry MCP Server |
109
+
110
+ ### Phase 3 — Copilot & Framework Adapters
111
+
112
+ | Feature | Command | Description |
113
+ |---|---|---|
114
+ | **Copilot CLI** | `gh copilot suggest` | Natural-language → `agentops` command generation |
115
+ | **Copilot Extension** | `@agentops` in VS Code Chat | Interactive evaluation, inline reports, guided diagnostics |
116
+ | **Semantic Kernel adapter** | Auto-detected | Plugin discovery, kernel I/O capture, planner step tracing |
117
+ | **AutoGen adapter** | Auto-detected | Multi-agent message stream capture, conversation turn evaluation |
118
+ | **Agent Service adapter** | Configured | Pull agent definitions from Foundry, replay threads |
119
+ | **Generic adapter** | `@agentops.trace` decorator | Instrument any framework with one decorator or HTTP endpoint |
120
+ | **Framework auto-detection** | `agentops init` | Scans `pyproject.toml` / `requirements.txt` to pick the right adapter |
121
+
122
+ ---
123
+
124
+ ## Quick Start
125
+
126
+ ### 1. Initialize your project
127
+
128
+ ```
129
+ $ agentops init --use-case rag --framework semantic-kernel
130
+
131
+ ✓ AgentOps initialized for project 'my-rag-agent'
132
+ Use case: rag
133
+ Framework: semantic-kernel (auto-detected)
134
+ Bundle: rag_quality (4 evaluators)
135
+
136
+ Next steps:
137
+ 1. Add your test data to agentops/datasets/golden_set.jsonl
138
+ 2. Run: agentops run start
139
+ 3. View results: agentops report show latest
140
+ ```
141
+
142
+ This creates:
143
+
144
+ ```
145
+ my-rag-agent/
146
+ ├── agentops.yaml # Configuration — your single source of truth
147
+ ├── agentops/
148
+ │ ├── bundles/
149
+ │ │ └── rag_quality.yaml # Evaluator bundle (groundedness + relevance + coherence + fluency)
150
+ │ ├── datasets/
151
+ │ │ └── golden_set.jsonl # Sample test dataset (add your real data here)
152
+ │ ├── runs/ # Captured evaluation runs
153
+ │ └── reports/ # Generated reports
154
+ └── src/
155
+ └── agent.py # Your agent code
156
+ ```
157
+
158
+ ### 2. Add your test data
159
+
160
+ ```jsonl
161
+ {"query": "What is our refund policy?", "context": "Our refund policy allows returns within 30 days...", "ground_truth": "You can return items within 30 days for a full refund."}
162
+ {"query": "How do I reset my password?", "context": "To reset your password, go to Settings > Security...", "ground_truth": "Go to Settings > Security and click Reset Password."}
163
+ ```
164
+
165
+ ### 3. Run evaluation
166
+
167
+ ```
168
+ $ agentops run start
169
+
170
+ Running 'default' on 'golden_set' with bundle 'rag_quality'...
171
+
172
+ Agent: src/agent.py
173
+ Dataset: 50 entries
174
+ Bundle: rag_quality (4 evaluators)
175
+
176
+ ████████████████████████████ 50/50 entries [100%] ⏱ 3m 42s
177
+
178
+ ✓ 48 success ✗ 2 errors ⏭ 0 skipped
179
+
180
+ ✓ Run 'default' completed (2026-02-26_a1b2c3d4)
181
+
182
+ Evaluator Mean Median Pass Rate
183
+ ─────────────────────────────────────────
184
+ groundedness 4.2 4.0 92%
185
+ relevance 4.5 5.0 96%
186
+ coherence 4.1 4.0 88%
187
+ fluency 4.7 5.0 98%
188
+
189
+ Aggregate score: 4.38 / 5.00
190
+ Overall pass rate: 88%
191
+
192
+ Full report: agentops report show 2026-02-26_a1b2c3d4
193
+ ```
194
+
195
+ ### 4. View the report
196
+
197
+ ```
198
+ $ agentops report show latest
199
+
200
+ ╭─────────────────────────────────────────────────────────╮
201
+ │ AgentOps Evaluation Report │
202
+ │ Run: 2026-02-26_a1b2c3d4 │
203
+ │ Date: 2026-02-26 10:35:42 │
204
+ │ Dataset: golden_set (50 entries) │
205
+ │ Bundle: rag_quality │
206
+ ╰─────────────────────────────────────────────────────────╯
207
+
208
+ Evaluator Mean Med Min Max StdDev Pass Rate
209
+ ──────────────────────────────────────────────────────────────
210
+ groundedness 4.20 4.0 2.0 5.0 0.80 92% ✓
211
+ relevance 4.50 5.0 3.0 5.0 0.60 96% ✓
212
+ coherence 4.10 4.0 2.0 5.0 0.90 88% ✓
213
+ fluency 4.70 5.0 3.0 5.0 0.50 98% ✓
214
+
215
+ Aggregate: 4.38 / 5.00 Pass rate: 88%
216
+
217
+ ⚠ 3 entries below threshold:
218
+ Entry #12: groundedness=2.0 (threshold: 3.0)
219
+ Entry #34: coherence=2.0 (threshold: 3.0)
220
+ Entry #45: coherence=2.0 (threshold: 3.0)
221
+
222
+ Export: agentops report export latest --format html
223
+ ```
224
+
225
+ ---
226
+
227
+ ## Features
228
+
229
+ ### Evaluator Bundles — prescriptive, per use case
230
+
231
+ ```
232
+ $ agentops bundle list
233
+
234
+ Bundle Use Case Evaluators Description
235
+ ────────────────────────────────────────────────────────────────────
236
+ rag_quality rag 4 Core quality for RAG
237
+ rag_safety rag 5 Safety evaluators for RAG
238
+ rag_complete rag 10 Comprehensive RAG evaluation
239
+ agent_quality agent 4 Quality for tool-using agents
240
+ agent_safety agent 5 Safety for agents
241
+ multi_agent_quality multi-agent 5 Quality for orchestrated agents
242
+ custom any 0 Empty template for customization
243
+ rag_foundry_iq rag 6 RAG + Foundry IQ knowledge bases
244
+ rag_agentic_retrieval rag 5 RAG + agentic retrieval pipeline
245
+ rag_permission_aware rag 4 RAG + ACL enforcement testing
246
+ rag_fabric_iq rag 5 RAG + Fabric IQ ontologies
247
+ rag_work_iq rag 5 RAG + M365 collaboration data
248
+ rag_cross_iq rag 5 RAG + multi-IQ source evaluation
249
+ ```
250
+
251
+ ```
252
+ $ agentops bundle show rag_quality
253
+
254
+ Bundle: rag_quality
255
+ Use Case: rag
256
+ Built-in: ✓
257
+
258
+ Evaluator Category Inputs Score Range Threshold
259
+ ──────────────────────────────────────────────────────────────────────────────
260
+ groundedness Quality query, response, context 1-5 ≥ 3.0
261
+ relevance Quality query, response, context 1-5 ≥ 3.0
262
+ coherence Quality query, response 1-5 ≥ 3.0
263
+ fluency Quality response 1-5 ≥ 3.0
264
+ ```
265
+
266
+ ### Create custom bundles
267
+
268
+ ```
269
+ $ agentops bundle create my_safety --evaluators groundedness,violence,hate_unfairness,jailbreak \
270
+ --threshold groundedness=4.0 --threshold violence=4.5
271
+
272
+ ✓ Bundle 'my_safety' created with 4 evaluators
273
+ Saved to: agentops/bundles/my_safety.yaml
274
+ ```
275
+
276
+ ### Dataset management
277
+
278
+ ```
279
+ $ agentops dataset list
280
+
281
+ Dataset Format Entries Has Context Has Ground Truth Path
282
+ ──────────────────────────────────────────────────────────────────────
283
+ golden_set jsonl 50 ✓ ✓ agentops/datasets/golden_set.jsonl
284
+ edge_cases jsonl 12 ✓ ✗ agentops/datasets/edge_cases.jsonl
285
+ ```
286
+
287
+ ```
288
+ $ agentops dataset validate golden_set --bundle rag_quality
289
+
290
+ ✓ Dataset 'golden_set' is valid
291
+ 50 entries parsed
292
+ Required columns: query ✓, response ✓, context ✓
293
+ Optional columns: ground_truth ✓
294
+ Warnings: 0
295
+ ```
296
+
297
+ ### Compare runs — catch regressions
298
+
299
+ ```
300
+ $ agentops run compare 2026-02-26_a1b2c3d4 2026-02-25_e5f6g7h8
301
+
302
+ Comparing: 2026-02-26_a1b2c3d4 vs 2026-02-25_e5f6g7h8
303
+
304
+ Evaluator Run A (Feb 26) Run B (Feb 25) Delta Trend
305
+ ──────────────────────────────────────────────────────────────────
306
+ groundedness 4.20 3.90 +0.30 ▲ improved
307
+ relevance 4.50 4.55 -0.05 ≈ stable
308
+ coherence 4.10 3.80 +0.30 ▲ improved
309
+ fluency 4.70 4.60 +0.10 ▲ improved
310
+
311
+ Aggregate 4.38 4.12 +0.26 ▲ improved
312
+
313
+ Regressions: 3 entries scored lower on ≥1 evaluator
314
+ Entry #12: groundedness 3→2 (▼ regression)
315
+ Entry #34: relevance 4→3 (▼ regression)
316
+ Entry #45: coherence 5→3 (▼ regression)
317
+ ```
318
+
319
+ ### Model migration — evaluate before you switch
320
+
321
+ ```
322
+ $ agentops eval migrate --from gpt-4o --to gpt-4.1 --dataset golden_set.jsonl
323
+
324
+ Running side-by-side evaluation...
325
+
326
+ Model A (gpt-4o): ████████████████████ 50/50 ⏱ 2m 14s
327
+ Model B (gpt-4.1): ████████████████████ 50/50 ⏱ 1m 52s
328
+
329
+ ╭───────────────────────────────────────────────────────────╮
330
+ │ Model Migration Report: gpt-4o → gpt-4.1 │
331
+ ╰───────────────────────────────────────────────────────────╯
332
+
333
+ Evaluator gpt-4o gpt-4.1 Delta p-value Verdict
334
+ ──────────────────────────────────────────────────────────────────
335
+ groundedness 4.20 4.35 +0.15 0.021 ▲ significant
336
+ relevance 4.50 4.48 -0.02 0.814 ≈ no difference
337
+ coherence 4.10 4.22 +0.12 0.045 ▲ significant
338
+ fluency 4.70 4.75 +0.05 0.312 ≈ no difference
339
+
340
+ Latency P95 3.4s 2.1s -38% ▲ faster
341
+
342
+ Recommendation: ✓ Safe to migrate — no regressions detected
343
+ ```
344
+
345
+ ### Configuration — one YAML file
346
+
347
+ ```yaml
348
+ # agentops.yaml — minimal config
349
+ schema_version: "1.0"
350
+
351
+ project:
352
+ name: my-rag-agent
353
+
354
+ foundry:
355
+ project_connection: ${FOUNDRY_CONNECTION}
356
+
357
+ agent:
358
+ framework: semantic-kernel
359
+ use_case: rag
360
+ entry_point: src/agent.py
361
+ ```
362
+
363
+ ```
364
+ $ agentops config validate
365
+
366
+ ✔ Schema version: 1.0
367
+ ✔ Project: my-rag-agent
368
+ ✔ Foundry connection: ✔ (resolved from FOUNDRY_CONNECTION)
369
+ ✔ Agent framework: semantic-kernel (entry_point exists)
370
+ ✔ Datasets: 2 defined (golden_set ✔, edge_cases ✔)
371
+ ✔ Bundles: 1 custom (my_safety ✔)
372
+ ✔ All 15 validation rules passed
373
+ ```
374
+
375
+ ### CI/CD integration — gate on quality
376
+
377
+ ```yaml
378
+ # .github/workflows/agent-eval.yml
379
+ name: Agent Evaluation
380
+ on: [pull_request]
381
+
382
+ jobs:
383
+ evaluate:
384
+ runs-on: ubuntu-latest
385
+ steps:
386
+ - uses: actions/checkout@v4
387
+ - run: pip install agentops-toolkit
388
+ - run: agentops run start --no-interactive --format json
389
+ env:
390
+ FOUNDRY_CONNECTION: ${{ secrets.FOUNDRY_CONNECTION }}
391
+ - run: agentops report export latest --format markdown >> $GITHUB_STEP_SUMMARY
392
+ ```
393
+
394
+ ### Export reports — any format
395
+
396
+ ```
397
+ $ agentops report export latest --format markdown --output report.md
398
+ ✓ Report exported to report.md
399
+
400
+ $ agentops report export latest --format html --output report.html
401
+ ✓ Report exported to report.html
402
+
403
+ $ agentops report export latest --format json | jq '.summary.pass_rate'
404
+ 0.88
405
+ ```
406
+
407
+ ---
408
+
409
+ ## Phase 2: Observability & Monitoring
410
+
411
+ ### Tracing — one-command OpenTelemetry setup
412
+
413
+ ```
414
+ $ agentops trace init --project-endpoint $FOUNDRY_PROJECT_ENDPOINT
415
+
416
+ ✓ Tracing configured for project 'my-rag-agent'
417
+ Exporter: Application Insights (Foundry-linked)
418
+ Service: my-rag-agent
419
+
420
+ Next: agentops trace run -- python src/agent.py
421
+ ```
422
+
423
+ ```
424
+ $ agentops trace init --local --otlp http://localhost:4317
425
+
426
+ ✓ Local tracing configured
427
+ Exporter: OTLP → http://localhost:4317 (Aspire Dashboard)
428
+ Service: my-rag-agent
429
+
430
+ Next: agentops trace run -- python src/agent.py
431
+ ```
432
+
433
+ ### Monitoring dashboards — pre-built templates
434
+
435
+ ```
436
+ $ agentops monitor dashboard --template agent-overview
437
+
438
+ ✓ Dashboard 'Agent Overview' configured
439
+ Token usage, latency P50/P95/P99, error rates, throughput
440
+ View: https://portal.azure.com/...#dashboard/agent-overview
441
+
442
+ $ agentops monitor alert create \
443
+ --name "groundedness-regression" \
444
+ --metric "eval.groundedness.avg" \
445
+ --operator lt --threshold 3.5 --severity 2
446
+
447
+ ✓ Alert 'groundedness-regression' created
448
+ Fires when eval.groundedness.avg < 3.5 over 15m window
449
+ ```
450
+
451
+ ### Continuous evaluation — sample production traffic
452
+
453
+ ```
454
+ $ agentops eval --continuous \
455
+ --agent-id my-agent-001 \
456
+ --evaluators relevance,groundedness,coherence \
457
+ --sampling-percent 10
458
+
459
+ ✓ Continuous evaluation enabled for agent 'my-agent-001'
460
+ Sampling: 10% of interactions
461
+ Evaluators: relevance, groundedness, coherence
462
+ Results: Application Insights → Foundry Portal
463
+ ```
464
+
465
+ ### AI Red Teaming — adversarial testing
466
+
467
+ ```
468
+ $ agentops eval --red-team \
469
+ --target-endpoint https://my-agent.azurewebsites.net/api/chat \
470
+ --risk-categories violence,hate,self_harm \
471
+ --iterations 100
472
+
473
+ Running AI Red Teaming scan...
474
+ ████████████████████████████ 100/100 attacks ⏱ 8m 12s
475
+
476
+ ╭───────────────────────────────────────╮
477
+ │ Red Team Scan Results │
478
+ ╰───────────────────────────────────────╯
479
+
480
+ Category Attacks Defused Breached Rate
481
+ ──────────────────────────────────────────────────
482
+ violence 34 33 1 97%
483
+ hate 33 33 0 100%
484
+ self_harm 33 32 1 97%
485
+
486
+ Overall defense rate: 98%
487
+ ⚠ 2 breaches found — review in agentops/reports/red-team-latest.json
488
+ ```
489
+
490
+ ---
491
+
492
+ ## Phase 2: IQ Knowledge Layer Integration
493
+
494
+ ### Generate golden datasets from Foundry IQ
495
+
496
+ ```
497
+ $ agentops dataset generate --from-iq foundry-iq \
498
+ --queries queries.txt \
499
+ --output golden_dataset.jsonl \
500
+ --include-citations
501
+
502
+ Querying knowledge base 'my-kb'...
503
+ ████████████████████████████ 25/25 queries ⏱ 1m 05s
504
+
505
+ ✓ Dataset generated: golden_dataset.jsonl
506
+ 25 entries with citations from 3 sources
507
+ Sources: sharepoint://policies, blob://manuals, web://docs
508
+ ```
509
+
510
+ ### Evaluate with IQ-specific bundles
511
+
512
+ ```
513
+ $ agentops eval --run latest --bundle rag_foundry_iq
514
+
515
+ Evaluating with bundle 'rag_foundry_iq' (6 evaluators)...
516
+
517
+ Evaluator Mean Pass Rate
518
+ ──────────────────────────────────────
519
+ groundedness 4.30 94% ✓
520
+ relevance 4.50 96% ✓
521
+ coherence 4.20 90% ✓
522
+ fluency 4.60 98% ✓
523
+ citation_accuracy 0.92 92% ✓
524
+ source_coverage 0.85 85% ⚠
525
+
526
+ ⚠ source_coverage below threshold (0.85 < 0.90)
527
+ 5 entries missed expected sources — review details with:
528
+ agentops report show latest --evaluator source_coverage --verbose
529
+ ```
530
+
531
+ ---
532
+
533
+ ## Phase 2: Foundry MCP Server Integration
534
+
535
+ ### Model lifecycle — discover, compare, deploy
536
+
537
+ ```
538
+ $ agentops model recommend --current gpt-4o
539
+
540
+ ╭───────────────────────────────────────────────────╮
541
+ │ Model Recommendations for gpt-4o │
542
+ ╰───────────────────────────────────────────────────╯
543
+
544
+ Model Provider Reason Est. Impact
545
+ ──────────────────────────────────────────────────────────────────────
546
+ gpt-4.1 OpenAI 15% faster, equivalent quality ▲ latency
547
+ gpt-4.1-m OpenAI 40% cheaper, slight quality ▼ -0.1 score
548
+ o4-mini OpenAI Reasoning model, new capability + reasoning
549
+
550
+ Next: agentops model benchmark --from gpt-4o --to gpt-4.1
551
+ ```
552
+
553
+ ```
554
+ $ agentops model quota --model gpt-4.1
555
+
556
+ Resource Used Limit Utilization
557
+ ──────────────────────────────────────────────────────
558
+ TPM 120,000 150,000 80%
559
+ RPM 600 1,000 60%
560
+
561
+ ✓ Sufficient quota for evaluation + production deployment
562
+ ```
563
+
564
+ ---
565
+
566
+ ## Phase 3: Copilot Integration
567
+
568
+ ### Natural language via GitHub Copilot CLI
569
+
570
+ ```
571
+ $ gh copilot suggest "run groundedness eval on my golden dataset"
572
+ → agentops run start --bundle rag_quality --dataset golden_set
573
+
574
+ $ gh copilot suggest "compare my last two evaluation runs"
575
+ → agentops run compare latest latest~1
576
+
577
+ $ gh copilot suggest "set up monitoring for my RAG agent"
578
+ → agentops monitor setup --service-name my-rag-agent
579
+ ```
580
+
581
+ ### VS Code Chat — `@agentops`
582
+
583
+ ```
584
+ You: @agentops evaluate my last run and explain the regressions
585
+
586
+ AgentOps: I ran evaluation on run `2026-02-26_a1b2c3d4` with bundle `rag_quality`.
587
+
588
+ **Results:**
589
+ | Evaluator | Score | Δ vs Previous | Status |
590
+ |--------------|-------|---------------|---------|
591
+ | groundedness | 4.2 | +0.3 | ✓ |
592
+ | relevance | 4.5 | -0.05 | ≈ |
593
+ | coherence | 4.1 | +0.3 | ✓ |
594
+ | fluency | 4.7 | +0.1 | ✓ |
595
+
596
+ **3 regressions detected:**
597
+ - Entry #12: groundedness dropped 3→2. The agent hallucinated a return policy
598
+ detail not in the provided context. Consider adding explicit grounding
599
+ instructions for policy-related queries.
600
+ - Entry #34: relevance dropped 4→3. The response addressed a related but
601
+ different question. Check the query routing logic.
602
+ - Entry #45: coherence dropped 5→3. The response has contradictory statements
603
+ in paragraphs 2 and 3.
604
+ ```
605
+
606
+ ---
607
+
608
+ ## Supported Frameworks
609
+
610
+ | Framework | Integration | Status |
611
+ |---|---|---|
612
+ | **Semantic Kernel** | Auto-discover plugins, capture kernel I/O, planner step tracing | Phase 3 |
613
+ | **AutoGen** | Hook message streams, multi-turn conversation capture | Phase 3 |
614
+ | **Azure AI Agent Service** | Pull agent defs from Foundry, replay threads | Phase 3 |
615
+ | **Custom / any framework** | Generic adapter via `@agentops.trace` decorator or HTTP endpoint | Phase 3 |
616
+
617
+ ---
618
+
619
+ ## Architecture
620
+
621
+ ```
622
+ ┌──────────────────────────────────────────────┐
623
+ │ GitHub Copilot CLI / VS Code Chat │ ← Natural language
624
+ │ "evaluate my RAG agent on golden dataset" │
625
+ └─────────────────┬────────────────────────────┘
626
+
627
+
628
+ ┌──────────────────────────────────────────────┐
629
+ │ AgentOps Toolkit (CLI) │
630
+ │ │
631
+ │ Bundles → Runs → Evaluation → Reports │
632
+ │ Tracing → Monitoring → Continuous Eval │
633
+ │ Model Lifecycle → Migration → Retirement │
634
+ │ │
635
+ │ Agent Framework Adapters: │
636
+ │ [Semantic Kernel] [AutoGen] [Agent Service] │
637
+ └─────────────────┬────────────────────────────┘
638
+
639
+ ┌───────────┼───────────────┐
640
+ ▼ ▼ ▼
641
+ ┌───────────┐ ┌─────────┐ ┌─────────────┐
642
+ │ Foundry │ │ Foundry │ │ IQ Knowledge│
643
+ │ Evaluation│ │ MCP │ │ Layer │
644
+ │ SDK │ │ Server │ │ (Foundry IQ │
645
+ │ │ │ │ │ Fabric IQ │
646
+ │ Evaluators│ │ Models │ │ Work IQ) │
647
+ │ Dashboard │ │ Eval │ │ │
648
+ │ Tracing │ │ Monitor │ │ Retrieval │
649
+ │ Continuous│ │ Agents │ │ Citations │
650
+ └───────────┘ └─────────┘ └─────────────┘
651
+ ```
652
+
653
+ ---
654
+
655
+ ## Requirements
656
+
657
+ - **Python 3.12+**
658
+ - **Azure AI Foundry project** with evaluation APIs enabled
659
+ - **Azure credentials** — `az login` or `DefaultAzureCredential`
660
+
661
+ ---
662
+
663
+ ## Installation
664
+
665
+ ```bash
666
+ # Core toolkit
667
+ pip install agentops-toolkit
668
+
669
+ # With observability support (Phase 2)
670
+ pip install "agentops-toolkit[observability]"
670
+
671
+ # With IQ knowledge layer support (Phase 2)
672
+ pip install "agentops-toolkit[iq]"
673
+
674
+ # Everything
675
+ pip install "agentops-toolkit[observability,iq]"
677
+ ```
678
+
679
+ ---
680
+
681
+ ## Documentation
682
+
683
+ | Document | Description |
684
+ |---|---|
685
+ | [Build Plan](design/BUILD_PLAN.md) | Project plan, phases, sprints, team |
686
+ | [Requirements](design/REQUIREMENTS.md) | 96+ functional & non-functional requirements |
687
+ | [Specifications Index](design/SPECIFICATIONS.md) | 9 technical specifications (SPEC-001–009) |
688
+ | [Review Report](design/REVIEW_REPORT.md) | Design consistency & feasibility review |
689
+
690
+ ---
691
+
692
+ ## Contributing
693
+
694
+ AgentOps is open source. See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
695
+
696
+ ---
697
+
698
+ ## License
699
+
700
+ [MIT](LICENSE)
701
+
702
+ ---
703
+
704
+ *Built with Azure AI Foundry. Designed for developers who ship agents to production.*