agentops-accelerator 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. agentops/__init__.py +10 -0
  2. agentops/__main__.py +6 -0
  3. agentops/agent/__init__.py +12 -0
  4. agentops/agent/_legacy_ids.py +92 -0
  5. agentops/agent/analyzer.py +207 -0
  6. agentops/agent/checks/__init__.py +1 -0
  7. agentops/agent/checks/catalog.py +880 -0
  8. agentops/agent/checks/errors.py +279 -0
  9. agentops/agent/checks/foundry_config.py +75 -0
  10. agentops/agent/checks/latency.py +84 -0
  11. agentops/agent/checks/opex.py +157 -0
  12. agentops/agent/checks/opex_workspace.py +874 -0
  13. agentops/agent/checks/posture.py +36 -0
  14. agentops/agent/checks/posture_rules/__init__.py +53 -0
  15. agentops/agent/checks/posture_rules/content_filter.py +59 -0
  16. agentops/agent/checks/posture_rules/diagnostics.py +74 -0
  17. agentops/agent/checks/posture_rules/local_auth.py +55 -0
  18. agentops/agent/checks/posture_rules/managed_identity.py +59 -0
  19. agentops/agent/checks/posture_rules/network.py +68 -0
  20. agentops/agent/checks/regression.py +78 -0
  21. agentops/agent/checks/release_readiness.py +182 -0
  22. agentops/agent/checks/safety.py +247 -0
  23. agentops/agent/checks/spec_conformance.py +375 -0
  24. agentops/agent/cockpit.py +5159 -0
  25. agentops/agent/config.py +240 -0
  26. agentops/agent/findings.py +113 -0
  27. agentops/agent/history.py +142 -0
  28. agentops/agent/knowledge/__init__.py +182 -0
  29. agentops/agent/knowledge/waf-checklist.csv +39 -0
  30. agentops/agent/llm_assist/__init__.py +16 -0
  31. agentops/agent/llm_assist/_base.py +124 -0
  32. agentops/agent/llm_assist/_bundle_rule.py +154 -0
  33. agentops/agent/llm_assist/_client.py +347 -0
  34. agentops/agent/llm_assist/_dataset_rules.py +191 -0
  35. agentops/agent/llm_assist/_engine.py +106 -0
  36. agentops/agent/llm_assist/_prompt_rules.py +291 -0
  37. agentops/agent/llm_assist/_spec_rules.py +235 -0
  38. agentops/agent/production_telemetry.py +430 -0
  39. agentops/agent/report.py +207 -0
  40. agentops/agent/server/__init__.py +1 -0
  41. agentops/agent/server/app.py +84 -0
  42. agentops/agent/server/auth.py +94 -0
  43. agentops/agent/server/chat.py +44 -0
  44. agentops/agent/server/protocol.py +72 -0
  45. agentops/agent/sources/__init__.py +1 -0
  46. agentops/agent/sources/azure_monitor.py +523 -0
  47. agentops/agent/sources/azure_resources.py +602 -0
  48. agentops/agent/sources/foundry_control.py +174 -0
  49. agentops/agent/sources/results_history.py +494 -0
  50. agentops/agent/sources/spec_detectors/__init__.py +42 -0
  51. agentops/agent/sources/spec_detectors/_base.py +58 -0
  52. agentops/agent/sources/spec_detectors/agents_md.py +75 -0
  53. agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
  54. agentops/agent/time_range.py +117 -0
  55. agentops/cli/__init__.py +1 -0
  56. agentops/cli/app.py +4823 -0
  57. agentops/core/__init__.py +1 -0
  58. agentops/core/agentops_config.py +592 -0
  59. agentops/core/config_loader.py +22 -0
  60. agentops/core/evaluators.py +480 -0
  61. agentops/core/release_evidence.py +56 -0
  62. agentops/core/results.py +117 -0
  63. agentops/mcp/__init__.py +10 -0
  64. agentops/mcp/server.py +232 -0
  65. agentops/pipeline/__init__.py +8 -0
  66. agentops/pipeline/cloud_results.py +189 -0
  67. agentops/pipeline/cloud_runner.py +901 -0
  68. agentops/pipeline/comparison.py +108 -0
  69. agentops/pipeline/diagnostics.py +51 -0
  70. agentops/pipeline/invocations.py +535 -0
  71. agentops/pipeline/official_eval.py +414 -0
  72. agentops/pipeline/orchestrator.py +775 -0
  73. agentops/pipeline/prompt_deploy.py +377 -0
  74. agentops/pipeline/publisher.py +121 -0
  75. agentops/pipeline/reporter.py +202 -0
  76. agentops/pipeline/runtime.py +409 -0
  77. agentops/pipeline/thresholds.py +84 -0
  78. agentops/services/__init__.py +1 -0
  79. agentops/services/cicd.py +720 -0
  80. agentops/services/eval_analysis.py +848 -0
  81. agentops/services/evidence_pack.py +757 -0
  82. agentops/services/initializer.py +86 -0
  83. agentops/services/preflight.py +470 -0
  84. agentops/services/setup_wizard.py +709 -0
  85. agentops/services/skills.py +643 -0
  86. agentops/services/trace_promotion.py +300 -0
  87. agentops/services/workflow_analysis.py +1129 -0
  88. agentops/templates/.gitignore +15 -0
  89. agentops/templates/__init__.py +1 -0
  90. agentops/templates/agent-server/Dockerfile +23 -0
  91. agentops/templates/agent-server/README.md +61 -0
  92. agentops/templates/agent-server/main.bicep +94 -0
  93. agentops/templates/agent.yaml +87 -0
  94. agentops/templates/agentops.yaml +58 -0
  95. agentops/templates/foundry.svg +71 -0
  96. agentops/templates/icon.png +0 -0
  97. agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
  98. agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
  99. agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
  100. agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
  101. agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
  102. agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
  103. agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
  104. agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
  105. agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
  106. agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
  107. agentops/templates/project.gitignore +36 -0
  108. agentops/templates/sample-traces.jsonl +3 -0
  109. agentops/templates/skills/agentops-agent/SKILL.md +137 -0
  110. agentops/templates/skills/agentops-config/SKILL.md +113 -0
  111. agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
  112. agentops/templates/skills/agentops-eval/SKILL.md +189 -0
  113. agentops/templates/skills/agentops-report/SKILL.md +71 -0
  114. agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
  115. agentops/templates/smoke.jsonl +3 -0
  116. agentops/templates/waf-checklist.README.md +84 -0
  117. agentops/templates/waf-checklist.csv +22 -0
  118. agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
  119. agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
  120. agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
  121. agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
  122. agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
  123. agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
  124. agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
  125. agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
  126. agentops/templates/workflows/agentops-pr.yml +148 -0
  127. agentops/templates/workflows/agentops-watchdog.yml +122 -0
  128. agentops/utils/__init__.py +1 -0
  129. agentops/utils/azd_env.py +435 -0
  130. agentops/utils/azure_endpoints.py +62 -0
  131. agentops/utils/colors.py +47 -0
  132. agentops/utils/dotenv_loader.py +105 -0
  133. agentops/utils/foundry_discovery.py +229 -0
  134. agentops/utils/logging.py +59 -0
  135. agentops/utils/telemetry.py +554 -0
  136. agentops/utils/yaml.py +36 -0
  137. agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
  138. agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
  139. agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
  140. agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
  141. agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
  142. agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,901 @@
1
+ """Cloud-side publisher: submit a run to the New Foundry Evaluations panel.
2
+
3
+ Unlike :mod:`agentops.pipeline.publisher` (which uploads metrics that
4
+ AgentOps already computed locally to the *Classic* Foundry Evaluations
5
+ panel via OneDP), this module asks **Foundry to execute the agent and the
6
+ evaluators server-side** through the OpenAI Evals API.
7
+
8
+ The flow:
9
+
10
+ 1. Build an :class:`azure.ai.projects.AIProjectClient` from the configured
11
+ project endpoint using ``DefaultAzureCredential``.
12
+ 2. Get the OpenAI client via ``project_client.get_openai_client()``. We do
13
+ **not** pass ``api_version`` - the SDK picks the correct one (passing
14
+ one explicitly has historically caused 404s in this codebase).
15
+ 3. Build the dataset source: inline the JSONL rows as ``file_content``, or reference a synced Foundry dataset by ``file_id``, depending on ``dataset_sync``.
16
+ 4. Create the eval definition with ``client.evals.create(...)``, mapping
17
+ each AgentOps evaluator preset onto an ``azure_ai_evaluator`` testing
18
+ criterion.
19
+ 5. Create the run with ``client.evals.runs.create(...)``, pointing at the
20
+ inline rows and using ``azure_ai_target_completions`` with an
21
+ ``agent_reference`` so Foundry invokes the agent itself.
22
+ 6. Poll until the run terminates, then return identifiers + the portal URL.
23
+
24
+ This module never re-runs the agent locally and never invokes evaluators
25
+ locally; that work happens inside Foundry. The local ``results.json``
26
+ (produced before this hop) remains the canonical record from AgentOps's
27
+ point of view.
28
+
29
+ Limitations (documented in the YAML schema docstring as well):
30
+
31
+ * Only ``foundry_prompt`` agents (``name:version``) are supported. HTTP
32
+ endpoints, local adapters, and direct model deployments are rejected.
33
+ * Only builtin evaluators that map cleanly onto ``azure_ai_evaluator``
34
+ testing criteria are supported. Custom evaluators are skipped with a
35
+ warning.
36
+ * Latency reported by the New Foundry view is Foundry-to-Foundry, not the
37
+ client-perceived latency captured locally.
38
+ """
39
+
40
+ from __future__ import annotations
41
+
42
+ import hashlib
43
+ import json
44
+ import logging
45
+ import os
46
+ import re
47
+ import time
48
+ import uuid
49
+ from dataclasses import dataclass, field
50
+ from pathlib import Path
51
+ from typing import Any, Callable, Dict, List, Optional
52
+
53
+ from agentops.core.agentops_config import DatasetSyncConfig
54
+ from agentops.core.results import RunResult
55
+
56
+ logger = logging.getLogger("agentops.pipeline.cloud_runner")
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Public types
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
@dataclass(frozen=True)
class CloudRunResult:
    """Immutable summary of one cloud (New Foundry) evaluation publish.

    Attributes:
        eval_id: Identifier of the eval definition created for the run.
        run_id: Identifier of the eval run.
        status: Final status reported for the run.
        report_url: Portal URL for the run, or ``None`` when unavailable.
        evaluation_name: Display name used for the evaluation.
        output_items: Raw per-row output items downloaded from the Foundry
            Evals API. Each item is a dict with at least
            ``datasource_item`` (the original input row), ``sample`` (the
            agent response), and ``results`` (per-criterion scores). May be
            empty if the SDK returns no items or the download failed (in
            which case the orchestrator falls back to a thin RunResult that
            just records the portal URL).
        dataset: Dataset lineage and submission mode recorded for
            reports/Cockpit.
    """

    eval_id: str
    run_id: str
    status: str
    report_url: Optional[str]
    evaluation_name: str
    # Mutable defaults go through ``default_factory`` so instances never
    # share the same list/dict object.
    output_items: List[Dict[str, Any]] = field(default_factory=list)
    dataset: Dict[str, Any] = field(default_factory=dict)
82
+
83
+
84
# AgentOps evaluator class name -> evaluator name recognised by the
# ``azure_ai_evaluator`` testing criterion of the OpenAI Evals API. A preset
# whose ``class_name`` is missing from this table is skipped (with a warning)
# when testing criteria are built.
_AZURE_AI_EVALUATOR_NAMES: Dict[str, str] = {
    "CoherenceEvaluator": "builtin.coherence",
    "FluencyEvaluator": "builtin.fluency",
    "SimilarityEvaluator": "builtin.similarity",
    "F1ScoreEvaluator": "builtin.f1_score",
    "RelevanceEvaluator": "builtin.relevance",
    "GroundednessEvaluator": "builtin.groundedness",
    "RetrievalEvaluator": "builtin.retrieval",
    "ResponseCompletenessEvaluator": "builtin.response_completeness",
    "ToolCallAccuracyEvaluator": "builtin.tool_call_accuracy",
    "IntentResolutionEvaluator": "builtin.intent_resolution",
    "TaskAdherenceEvaluator": "builtin.task_adherence",
}

# Evaluator classes whose criterion must carry a model ``deployment_name`` in
# its ``initialization_parameters``.
_CLOUD_EVALUATORS_REQUIRING_DEPLOYMENT = {
    "CoherenceEvaluator",
    "FluencyEvaluator",
    "GroundednessEvaluator",
    "IntentResolutionEvaluator",
    "RelevanceEvaluator",
    "ResponseCompletenessEvaluator",
    "RetrievalEvaluator",
    "SimilarityEvaluator",
    "TaskAdherenceEvaluator",
    "ToolCallAccuracyEvaluator",
}

# AgentOps ``input_mapping`` placeholder -> template expression used in the
# criterion ``data_mapping``.
_CLOUD_PLACEHOLDERS = {
    "$prompt": "{{item.input}}",
    "$prediction": "{{sample.output_text}}",
    "$expected": "{{item.expected}}",
    "$context": "{{item.context}}",
    "$tool_calls": "{{item.tool_calls}}",
    "$tool_definitions": "{{item.tool_definitions}}",
}


# Polling defaults: 300 attempts at a 2s interval is roughly 10 minutes.
_DEFAULT_POLL_INTERVAL_SECONDS = 2.0
_DEFAULT_MAX_POLL_ATTEMPTS = 300
_DEFAULT_HEARTBEAT_SECONDS = 10.0
# Both spellings of "cancelled" are treated as terminal.
_TERMINAL_STATUSES = {"completed", "failed", "canceled", "cancelled"}
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # Entry point
133
+ # ---------------------------------------------------------------------------
134
+
135
+
136
def run_on_foundry_cloud(
    result: RunResult,
    *,
    dataset_path: Path,
    project_endpoint: str,
    evaluation_name: Optional[str] = None,
    dataset_sync: Optional[DatasetSyncConfig] = None,
    poll_interval_seconds: float = _DEFAULT_POLL_INTERVAL_SECONDS,
    max_poll_attempts: int = _DEFAULT_MAX_POLL_ATTEMPTS,
    progress: Optional[Callable[[str], None]] = None,
) -> CloudRunResult:
    """Submit ``result``'s target to Foundry for server-side evaluation.

    Parameters
    ----------
    result:
        Local run result. Used to derive the agent reference and the list
        of evaluator presets that should map onto ``azure_ai_evaluator``
        testing criteria.
    dataset_path:
        Path to the JSONL dataset to submit. Must already exist.
    project_endpoint:
        Foundry project endpoint URL (e.g.
        ``https://contoso.services.ai.azure.com/api/projects/p``).
    evaluation_name:
        Optional display name. Defaults to ``agentops-cloud-<short-uuid>``.
    dataset_sync:
        Optional submission policy; defaults to ``DatasetSyncConfig()``.
        With mode ``inline`` the rows are sent as inline ``file_content``.
        Any other mode first syncs the file as a Foundry dataset and
        references it by ``file_id``. If that sync fails, mode ``foundry``
        re-raises the error, while every other mode falls back to inline
        rows and records the failure reason in the dataset lineage.
    poll_interval_seconds, max_poll_attempts:
        Control polling cadence and bound. The default budget is
        ~10 minutes.
    progress:
        Optional callback invoked with one-line status updates. The
        orchestrator wires this to the same channel that prints per-row
        progress so the user sees what is happening during the long
        cloud round-trip.

    Raises
    ------
    ImportError
        ``azure-ai-projects`` / ``azure-identity`` are not installed.
    ValueError
        Target is not a Foundry agent, the agent reference is incomplete,
        the dataset is missing, or no evaluator maps onto an
        ``azure_ai_evaluator`` testing criterion.
    RuntimeError
        Polling timed out or the run terminated with a non-completed
        status.
    """
    progress = progress or (lambda _msg: None)

    if result.target.kind != "foundry_prompt":
        raise ValueError(
            "publish: foundry_cloud only supports Foundry agents declared "
            "as 'name:version' (foundry_prompt targets). Got "
            f"target.kind={result.target.kind!r}."
        )
    if not dataset_path.exists():
        raise ValueError(f"dataset file not found: {dataset_path}")

    agent_name = result.target.name
    agent_version = result.target.version
    if not agent_name or not agent_version:
        raise ValueError(
            "Cloud publish requires a fully qualified 'name:version' agent "
            f"reference; got name={agent_name!r} version={agent_version!r}"
        )

    try:
        from azure.ai.projects import AIProjectClient  # noqa: WPS433
        from azure.identity import DefaultAzureCredential  # noqa: WPS433
    except ImportError as exc:  # pragma: no cover - exercised only at runtime
        raise ImportError(
            "publish: foundry_cloud requires 'azure-ai-projects' and "
            "'azure-identity'. Install with:\n"
            " pip install azure-ai-projects azure-identity"
        ) from exc

    credential = DefaultAzureCredential(
        exclude_developer_cli_credential=True, process_timeout=30
    )
    # Pre-declared so the ``finally`` clause can release whatever was built,
    # even when construction fails part-way.
    project_client: Any = None
    openai_client: Any = None
    try:
        project_client = AIProjectClient(
            endpoint=project_endpoint,
            credential=credential,
        )

        # NB: do not pass api_version - the SDK chooses the right one. Passing
        # an explicit version has historically caused 404s in this codebase.
        openai_client = project_client.get_openai_client()

        eval_name = evaluation_name or f"agentops-cloud-{uuid.uuid4().hex[:8]}"
        testing_criteria = _build_testing_criteria(result)
        if not testing_criteria:
            raise ValueError(
                "no AgentOps evaluators map onto azure_ai_evaluator testing "
                "criteria; nothing to evaluate server-side."
            )

        progress(f"cloud: preparing run '{eval_name}'")
        progress(
            "cloud: remote Foundry evaluations are asynchronous; small smoke "
            "runs commonly take 30-90s depending on queueing, agent latency, "
            "and evaluator model latency."
        )

        item_schema = _build_item_schema(dataset_path)
        source, dataset_lineage = _build_dataset_source(
            dataset_path,
            dataset_sync or DatasetSyncConfig(),
            project_client=project_client,
            progress=progress,
        )

        progress(
            f"cloud: creating eval ({len(testing_criteria)} criteria, "
            f"item_schema fields: {sorted(item_schema['properties'].keys())})"
        )
        eval_obj = openai_client.evals.create(
            name=eval_name,
            data_source_config={
                "type": "custom",
                "item_schema": item_schema,
                "include_sample_schema": True,
            },
            testing_criteria=testing_criteria,  # type: ignore[arg-type]
        )
        eval_id = eval_obj.id

        progress(
            f"cloud: starting run for agent {agent_name}:{agent_version}"
        )
        try:
            run_obj = openai_client.evals.runs.create(
                eval_id=eval_id,
                name=f"{eval_name}-run",
                data_source={  # type: ignore[arg-type]
                    "type": "azure_ai_target_completions",
                    "source": source,
                    "input_messages": {
                        "type": "template",
                        "template": [
                            {
                                "type": "message",
                                "role": "user",
                                "content": {
                                    "type": "input_text",
                                    "text": "{{item.input}}",
                                },
                            }
                        ],
                    },
                    "target": {
                        "type": "azure_ai_agent",
                        "name": agent_name,
                        "version": agent_version,
                    },
                },
            )
        except Exception as exc:  # noqa: BLE001
            raise _friendly_run_create_error(
                exc, agent_name=agent_name, agent_version=agent_version
            ) from exc
        run_id = run_obj.id

        progress(
            f"cloud: polling run {run_id} (interval "
            f"{poll_interval_seconds:g}s, max {max_poll_attempts} attempts)"
        )
        final_run = _poll_until_terminal(
            openai_client,
            eval_id=eval_id,
            run_id=run_id,
            interval_seconds=poll_interval_seconds,
            max_attempts=max_poll_attempts,
            progress=progress,
        )

        status = getattr(final_run, "status", "unknown")
        report_url = _extract_report_url(final_run)

        if status != "completed":
            raise RuntimeError(
                f"cloud evaluation run {run_id} terminated with status "
                f"{status!r}; see {report_url or 'the Foundry portal'}."
            )

        progress(f"cloud: done. status={status}")

        # Download per-row results from Foundry so the local results.json can
        # be populated without re-invoking the agent client-side.
        output_items = _list_output_items(
            openai_client,
            eval_id=eval_id,
            run_id=run_id,
            progress=progress,
        )

        return CloudRunResult(
            eval_id=eval_id,
            run_id=run_id,
            status=status,
            report_url=report_url,
            evaluation_name=eval_name,
            output_items=output_items,
            dataset=dataset_lineage,
        )
    finally:
        # Best-effort release of the clients and the credential. Cleanup
        # failures are only logged so they never mask the run's outcome.
        for resource in (openai_client, project_client, credential):
            close = getattr(resource, "close", None)
            if not callable(close):
                continue
            try:
                close()
            except Exception:  # noqa: BLE001
                logger.debug(
                    "cloud: failed to close %s",
                    type(resource).__name__,
                    exc_info=True,
                )
340
+
341
+
342
+ # ---------------------------------------------------------------------------
343
+ # Helpers
344
+ # ---------------------------------------------------------------------------
345
+
346
+
347
def _build_testing_criteria(result: RunResult) -> List[Dict[str, Any]]:
    """Translate the run's evaluator presets into Azure AI testing criteria.

    ``result.evaluators`` is the preferred source because it records the
    evaluator set selected for the run even when every local invocation
    failed and no aggregate metrics were produced. When it yields no known
    preset, the keys of ``result.aggregate_metrics`` are used instead, for
    compatibility with older result payloads.

    Raises:
        ValueError: An evaluator that needs a model deployment was selected
            but neither deployment environment variable is set.
    """
    # Imported lazily so modules that do not need the evaluator catalog do
    # not pull it in.
    from agentops.core.evaluators import CATALOG

    deployment = _evaluator_deployment_name()

    # CATALOG is keyed by preset name (== class name) while aggregate metrics
    # are keyed by score_key, so the fallback needs a reverse index.
    score_key_index = {entry.score_key: entry for entry in CATALOG.values()}
    selected = [CATALOG[name] for name in result.evaluators if name in CATALOG]
    if not selected:
        selected = []
        for metric_name in result.aggregate_metrics.keys():
            candidate = score_key_index.get(metric_name)
            if candidate is not None:
                selected.append(candidate)

    criteria: List[Dict[str, Any]] = []
    emitted: set = set()
    for preset in selected:
        # Latency is computed locally; Foundry has its own server-side view.
        if "runtime" in preset.categories:
            continue

        class_name = preset.class_name
        azure_name = _AZURE_AI_EVALUATOR_NAMES.get(class_name)
        if not azure_name:
            logger.warning(
                "no azure_ai_evaluator mapping for %s; skipping in cloud run",
                class_name,
            )
            continue

        # Emit each Azure evaluator at most once.
        if azure_name in emitted:
            continue
        emitted.add(azure_name)

        entry: Dict[str, Any] = {
            "type": "azure_ai_evaluator",
            "name": preset.score_key,
            "evaluator_name": azure_name,
            "data_mapping": _build_cloud_data_mapping(preset),
        }
        if class_name in _CLOUD_EVALUATORS_REQUIRING_DEPLOYMENT:
            if not deployment:
                raise ValueError(
                    "publish: foundry_cloud requires AZURE_OPENAI_DEPLOYMENT "
                    "or AZURE_AI_MODEL_DEPLOYMENT_NAME for Azure AI "
                    f"evaluator {class_name}."
                )
            entry["initialization_parameters"] = {
                "deployment_name": deployment,
            }
        criteria.append(entry)
    return criteria
407
+
408
+
409
def _evaluator_deployment_name() -> Optional[str]:
    """Return the evaluator model deployment name from the environment.

    ``AZURE_OPENAI_DEPLOYMENT`` wins when it is set to a non-empty value;
    otherwise whatever ``AZURE_AI_MODEL_DEPLOYMENT_NAME`` holds is returned
    (possibly ``None``).
    """
    preferred = os.getenv("AZURE_OPENAI_DEPLOYMENT")
    if preferred:
        return preferred
    return os.getenv("AZURE_AI_MODEL_DEPLOYMENT_NAME")
413
+
414
+
415
def _build_cloud_data_mapping(preset: Any) -> Dict[str, str]:
    """Convert a preset's ``input_mapping`` into a criterion ``data_mapping``.

    Each placeholder is looked up in ``_CLOUD_PLACEHOLDERS``; inputs whose
    placeholder has no cloud equivalent are left out. ``$prediction`` maps to
    ``{{sample.output_items}}`` instead when the preset sets
    ``needs_conversation``.
    """
    resolved: Dict[str, str] = {}
    for field_name, token in preset.input_mapping.items():
        if token == "$prediction" and getattr(preset, "needs_conversation", False):
            template = "{{sample.output_items}}"
        else:
            template = _CLOUD_PLACEHOLDERS.get(token)
        if template:
            resolved[field_name] = template
    return resolved
425
+
426
+
427
def _build_file_content_source(
    dataset_path: Path,
    *,
    progress: Callable[[str], None],
) -> Dict[str, Any]:
    """Inline JSONL rows for Foundry target-completions runs.

    New Foundry currently validates file-id sources by extension after the
    upload is materialized server-side. Inline ``file_content`` avoids a
    service-side filename loss where valid ``.jsonl`` uploads can be read back
    as extensionless files.

    Args:
        dataset_path: JSONL file to read; blank lines are ignored.
        progress: Callback that receives one-line status updates.

    Returns:
        A ``file_content`` source whose ``content`` holds one
        ``{"item": row}`` entry per dataset row.

    Raises:
        json.JSONDecodeError: A row is not valid JSON. The message names the
            1-based dataset row. (Subclass of ``ValueError``.)
        ValueError: A row is not a JSON object, or the dataset has no rows.
    """
    progress(f"cloud: preparing {dataset_path.name}")
    content: List[Dict[str, Any]] = []
    with dataset_path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            text = line.strip()
            if not text:
                continue
            try:
                row = json.loads(text)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which dataset row is broken: the decoder's
                # own position is relative to the single line it parsed.
                raise json.JSONDecodeError(
                    f"dataset row {line_number} is not valid JSON for "
                    f"publish: foundry_cloud: {exc.msg}",
                    exc.doc,
                    exc.pos,
                ) from exc
            if not isinstance(row, dict):
                raise ValueError(
                    f"dataset row {line_number} must be a JSON object for "
                    "publish: foundry_cloud"
                )
            content.append({"item": row})
    if not content:
        raise ValueError("dataset must contain at least one row for publish: foundry_cloud")
    progress(f"cloud: prepared {len(content)} row(s)")
    return {
        "type": "file_content",
        "content": content,
    }
460
+
461
+
462
def _build_dataset_source(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
    *,
    project_client: Any,
    progress: Callable[[str], None],
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    """Pick the run's data source and describe its lineage.

    Mode ``inline`` always sends inline rows. Every other mode attempts a
    Foundry dataset sync first; when that fails, mode ``foundry`` re-raises
    and the remaining modes fall back to inline rows, recording why.

    Returns:
        ``(source, lineage)``: the data source payload for the run and the
        lineage dict describing how the dataset was submitted.
    """
    requested_mode = dataset_sync.mode
    if requested_mode == "inline":
        inline_source = _build_file_content_source(dataset_path, progress=progress)
        return inline_source, _build_inline_dataset_lineage(dataset_path, dataset_sync)

    try:
        source, lineage = _build_foundry_dataset_source(
            dataset_path,
            dataset_sync,
            project_client=project_client,
            progress=progress,
        )
    except Exception as exc:  # noqa: BLE001
        # An explicit ``foundry`` request must not silently degrade.
        if requested_mode == "foundry":
            raise
        reason = _summarize_dataset_sync_error(exc)
        logger.debug(
            "Foundry dataset sync failed; falling back to inline file_content",
            exc_info=True,
        )
        progress(
            "cloud: dataset sync unavailable; using inline rows for this run. "
            f"Reason: {reason}"
        )
        source = _build_file_content_source(dataset_path, progress=progress)
        lineage = _build_inline_dataset_lineage(dataset_path, dataset_sync)
        lineage["status"] = "auto_fallback_inline"
        lineage["sync_error"] = reason
        return source, lineage
    else:
        return source, lineage
498
+
499
+
500
def _build_foundry_dataset_source(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
    *,
    project_client: Any,
    progress: Callable[[str], None],
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    """Sync the dataset to Foundry and reference it by id.

    The dataset name comes from ``dataset_sync.name`` or is derived from the
    file name; the version comes from ``dataset_sync.version``, with
    ``content-hash`` resolved from the file's SHA-256.

    Returns:
        ``(source, lineage)``: a ``file_id`` source pointing at the Foundry
        dataset, and a lineage dict recording the local file, its hash and
        the Foundry name/version/id/URI.

    Raises:
        RuntimeError: The Foundry dataset object carried no ``id``.
    """
    sha256 = _sha256_file(dataset_path)
    name = dataset_sync.name or _derived_foundry_dataset_name(dataset_path)
    version = _resolved_foundry_dataset_version(dataset_sync.version, sha256)
    progress(f"cloud: syncing dataset to Foundry {name}@{version}")

    dataset = _get_or_upload_foundry_dataset(
        project_client,
        name=name,
        version=version,
        dataset_path=dataset_path,
        progress=progress,
    )
    dataset_id = _dataset_attr(dataset, "id")
    if not dataset_id:
        raise RuntimeError(
            f"Foundry dataset {name}@{version} did not return an id."
        )
    progress(f"cloud: using Foundry dataset {name}@{version}")

    # The REST payload names the field ``dataUri``. SDK model objects appear
    # to expose it as the ``data_uri`` attribute instead, so try both; the
    # REST spelling goes first so existing behaviour is unchanged.
    # NOTE(review): confirm the attribute name against the installed SDK.
    foundry_uri = _dataset_attr(dataset, "dataUri") or _dataset_attr(
        dataset, "data_uri"
    )
    return (
        {
            "type": "file_id",
            "id": dataset_id,
        },
        {
            "mode": "foundry",
            "requested_mode": dataset_sync.mode,
            "source_type": "file_id",
            "local_path": str(dataset_path),
            "sha256": sha256,
            "status": "synced",
            "foundry_name": name,
            "foundry_version": version,
            "foundry_id": dataset_id,
            "foundry_uri": foundry_uri,
        },
    )
543
+
544
+
545
def _get_or_upload_foundry_dataset(
    project_client: Any,
    *,
    name: str,
    version: str,
    dataset_path: Path,
    progress: Callable[[str], None],
) -> Any:
    """Return the Foundry dataset ``name@version``, uploading it if absent.

    The dataset is looked up first. Only a not-found style failure leads to
    an upload; any other lookup error propagates. If the upload reports a
    conflict, the dataset is fetched again and reused.
    """
    try:
        existing = project_client.datasets.get(name=name, version=version)
        progress(f"cloud: found existing Foundry dataset {name}@{version}")
        return existing
    except Exception as lookup_error:  # noqa: BLE001
        is_missing = _looks_not_found(lookup_error)
        if not is_missing:
            raise

    try:
        uploaded = project_client.datasets.upload_file(
            name=name,
            version=version,
            file_path=str(dataset_path),
        )
        return uploaded
    except Exception as upload_error:  # noqa: BLE001
        if not _looks_conflict(upload_error):
            raise
        # Treat a conflict as "the dataset already exists" and reuse it.
        progress(
            f"cloud: Foundry dataset {name}@{version} already exists; "
            "reusing it"
        )
        return project_client.datasets.get(name=name, version=version)
575
+
576
+
577
def _build_inline_dataset_lineage(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
) -> Dict[str, Any]:
    """Build the lineage record for a run that submits inline rows.

    The record carries the requested mode, the local path and its SHA-256,
    plus the configured dataset name/version when they are set.
    """
    record: Dict[str, Any] = {
        "mode": "inline",
        "requested_mode": dataset_sync.mode,
        "source_type": "file_content",
        "local_path": str(dataset_path),
        "sha256": _sha256_file(dataset_path),
        "status": "compatibility_inline",
        "foundry_behavior": (
            "Foundry may materialize inline rows as eval-data-* backing "
            "dataset assets in the project Data page."
        ),
    }
    configured_name = dataset_sync.name
    if configured_name:
        record["configured_name"] = configured_name
    configured_version = dataset_sync.version
    if configured_version:
        record["configured_version"] = configured_version
    return record
599
+
600
+
601
def _summarize_dataset_sync_error(exc: Exception) -> str:
    """Condense a dataset-sync failure into a one-line, user-facing reason.

    Two known authentication failures get actionable guidance. Anything else
    becomes ``"<ExceptionClass>: <first non-blank line>"``, with the line
    capped at 180 characters. When the message has no non-blank line, only
    the exception class name is returned.
    """
    text = str(exc)
    lower = text.lower()
    if "defaultazurecredential failed to retrieve a token" in lower:
        return (
            "Azure authentication was unavailable for Foundry dataset sync. "
            "Run `az login` or set `dataset_sync.mode: inline` to skip dataset "
            "asset sync during quick demos."
        )
    if "azureclicredential: failed to invoke the azure cli" in lower:
        return (
            "Azure CLI authentication was unavailable for Foundry dataset sync. "
            "Run `az login` or set `dataset_sync.mode: inline` for this run."
        )

    kind = exc.__class__.__name__
    # Skip leading blank lines so a message that starts with a newline still
    # yields a useful summary instead of an empty one.
    first_line = next(
        (stripped for stripped in (raw.strip() for raw in text.splitlines()) if stripped),
        "",
    )
    if not first_line:
        return kind
    if len(first_line) > 180:
        first_line = first_line[:177] + "..."
    return f"{kind}: {first_line}"
619
+
620
+
621
def _derived_foundry_dataset_name(dataset_path: Path) -> str:
    """Derive a Foundry dataset name from the dataset's file name.

    The lower-cased file stem has every run of characters outside
    ``a-z0-9_-`` collapsed to ``-``, and leading/trailing ``-``/``_``
    removed. If nothing is left, ``dataset`` is used as the slug.
    """
    lowered_stem = dataset_path.stem.lower()
    collapsed = re.sub(r"[^a-z0-9_-]+", "-", lowered_stem)
    slug = collapsed.strip("-_")
    if not slug:
        slug = "dataset"
    return f"agentops-{slug}"
625
+
626
+
627
def _resolved_foundry_dataset_version(configured: str, sha256: str) -> str:
    """Resolve the dataset version to use in Foundry.

    The sentinel ``content-hash`` becomes ``sha256-`` followed by the first
    16 characters of ``sha256``; any other value is returned unchanged.
    """
    return f"sha256-{sha256[:16]}" if configured == "content-hash" else configured
631
+
632
+
633
def _dataset_attr(dataset: Any, name: str) -> Optional[str]:
    """Read ``name`` from a dataset object and return it as a string.

    Attribute access is tried first. If that yields ``None`` and the object
    is a mapping (a ``dict`` or any other ``collections.abc.Mapping``), a key
    lookup is tried instead. Falsy values are reported as ``None``.
    """
    # Local import: the file's top-level import block is left untouched.
    from collections.abc import Mapping

    value = getattr(dataset, name, None)
    if value is None and isinstance(dataset, Mapping):
        value = dataset.get(name)
    return str(value) if value else None
638
+
639
+
640
def _looks_not_found(exc: Exception) -> bool:
    """Heuristically decide whether ``exc`` reports a missing resource.

    True when ``exc.status_code`` is 404, or when the message contains
    "not found" (case-insensitive) or a standalone ``404``. The number must
    not sit inside a longer run of digits, so request ids, ports or
    timestamps that merely contain "404" do not count.
    """
    if getattr(exc, "status_code", None) == 404:
        return True
    text = str(exc).lower()
    if "not found" in text:
        return True
    return re.search(r"(?<!\d)404(?!\d)", text) is not None
646
+
647
+
648
def _looks_conflict(exc: Exception) -> bool:
    """Heuristically decide whether ``exc`` reports an already-existing resource.

    True when ``exc.status_code`` is 409, or when the message contains
    "already exists" or "conflict" (case-insensitive) or a standalone
    ``409``. The number must not sit inside a longer run of digits, so
    request ids, ports or timestamps that merely contain "409" do not count.
    """
    if getattr(exc, "status_code", None) == 409:
        return True
    text = str(exc).lower()
    if "already exists" in text or "conflict" in text:
        return True
    return re.search(r"(?<!\d)409(?!\d)", text) is not None
654
+
655
+
656
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB chunks rather than loaded whole.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while piece := stream.read(chunk_size):
            hasher.update(piece)
    return hasher.hexdigest()
662
+
663
+
664
def _build_item_schema(dataset_path: Path) -> Dict[str, Any]:
    """Derive the eval ``item_schema`` from the dataset's first row.

    Foundry's Evals API requires an ``item_schema`` describing each row.
    Only the first non-empty line of the JSONL file is parsed. When it is a
    JSON object, each of its top-level keys is advertised as a required
    string property. This is permissive enough for typical AgentOps
    datasets (input, expected, context, tool_calls, tool_definitions).
    """
    with dataset_path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        first_row_text = next((text for text in stripped_lines if text), None)

    field_specs: Dict[str, Dict[str, str]] = {}
    if first_row_text is not None:
        first_row = json.loads(first_row_text)
        if isinstance(first_row, dict):
            field_specs = {str(key): {"type": "string"} for key in first_row}

    if not field_specs:
        # Fall back to a single 'input' field so eval creation does not
        # blow up on an empty dataset.
        field_specs["input"] = {"type": "string"}

    schema: Dict[str, Any] = {
        "type": "object",
        "properties": field_specs,
        "required": list(field_specs),
    }
    return schema
693
+
694
+
695
def _poll_until_terminal(
    openai_client: Any,
    *,
    eval_id: str,
    run_id: str,
    interval_seconds: float,
    max_attempts: int,
    progress: Callable[[str], None],
) -> Any:
    """Poll ``runs.retrieve`` until the run reaches a terminal status.

    Args:
        openai_client: Client exposing ``evals.runs.retrieve``.
        eval_id: Identifier of the evaluation that owns the run.
        run_id: Identifier of the run to poll.
        interval_seconds: Pause between two consecutive polls.
        max_attempts: Maximum number of ``retrieve`` calls to make.
        progress: Callback that receives human-readable status lines. It
            is called when the status changes and, while the status is
            unchanged, once at least ``_DEFAULT_HEARTBEAT_SECONDS`` have
            passed since the previous message.

    Returns:
        The run object from the first poll whose ``status`` is in
        ``_TERMINAL_STATUSES``.

    Raises:
        RuntimeError: If no terminal status was seen within
            ``max_attempts`` polls.
    """
    last_status: Optional[str] = None
    started = time.monotonic()
    last_progress_at = started
    for attempt in range(1, max_attempts + 1):
        run = openai_client.evals.runs.retrieve(eval_id=eval_id, run_id=run_id)
        status = getattr(run, "status", "unknown")
        now = time.monotonic()
        elapsed = now - started
        status_changed = status != last_status
        heartbeat_due = now - last_progress_at >= _DEFAULT_HEARTBEAT_SECONDS
        if status_changed or heartbeat_due:
            label = "run status ->" if status_changed else "still"
            progress(
                f"cloud: {label} {status} "
                f"(elapsed {_format_elapsed(elapsed)}, attempt {attempt}/{max_attempts})"
            )
            last_progress_at = now
            last_status = status
        if status in _TERMINAL_STATUSES:
            return run
        # No poll follows the final attempt, so sleeping there would only
        # delay the timeout error by one interval.
        if attempt < max_attempts:
            time.sleep(interval_seconds)
    raise RuntimeError(
        f"cloud evaluation run {run_id} did not finish within "
        f"{max_attempts} polls of {interval_seconds:g}s "
        f"(last status: {last_status!r})."
    )
731
+
732
+
733
+ def _format_elapsed(seconds: float) -> str:
734
+ total = max(0, int(seconds))
735
+ minutes, remaining = divmod(total, 60)
736
+ if minutes:
737
+ return f"{minutes}m{remaining:02d}s"
738
+ return f"{remaining}s"
739
+
740
+
741
def _friendly_run_create_error(
    exc: Exception,
    *,
    agent_name: str,
    agent_version: str,
) -> Exception:
    """Convert a noisy Foundry/OpenAI ``evals.runs.create`` failure into a
    short, actionable ``RuntimeError``.

    The Evals API returns the underlying validation message inside a
    nested JSON envelope (``error.message`` →
    ``Evaluation failed validation: {"Code": "ResourceNotFound", ...}``).
    Rendering the raw exception dumps the whole envelope on stderr, which
    is unreadable. We pick out the inner detail and rephrase it in the
    common-case forms users actually hit.

    Args:
        exc: The exception raised while creating the run.
        agent_name: Agent name, used only to build the message text.
        agent_version: Agent version, used only to build the message text.

    Returns:
        A new ``RuntimeError`` (returned, not raised) describing a
        not-found, permission or rate-limit failure, or a generic
        "could not start" message that embeds the extracted text.
    """
    raw = _extract_error_message(exc) or str(exc)
    lowered = raw.lower()
    # The extracted message often omits the HTTP status, so also consult
    # the numeric status when the exception exposes one.
    status_code = getattr(exc, "status_code", None)

    if "was not found" in lowered or "resourcenotfound" in lowered:
        return RuntimeError(
            f"Agent '{agent_name}:{agent_version}' was not found in your "
            "Foundry project.\n"
            "  - Verify the name and version in target.endpoint.agent_id "
            "(format: name:version).\n"
            "  - Confirm AZURE_AI_FOUNDRY_PROJECT_ENDPOINT points to the "
            "project that owns the agent.\n"
            "  - Make sure the agent is deployed; list agents in the "
            "Foundry portal under Agents."
        )

    if (
        status_code == 403
        or "permission" in lowered
        or "forbidden" in lowered
        or "403" in raw
    ):
        return RuntimeError(
            "Foundry denied the evaluation request (permission).\n"
            f"  - Confirm you have access to the project that owns "
            f"agent '{agent_name}:{agent_version}'.\n"
            "  - Try `az login` with the correct tenant or check the "
            "managed identity assigned to this environment."
        )

    if (
        status_code == 429
        or "quota" in lowered
        or "ratelimit" in lowered
        # Messages commonly spell it with a space, or use the HTTP reason
        # phrase instead of the numeric code.
        or "rate limit" in lowered
        or "too many requests" in lowered
        or "429" in raw
    ):
        return RuntimeError(
            "Foundry rate-limited the evaluation request. Retry in a "
            "few minutes, or reduce dataset size."
        )

    return RuntimeError(f"Cloud evaluation could not start: {raw}")
788
+
789
+
790
+ def _extract_error_message(exc: Exception) -> Optional[str]:
791
+ """Best-effort extraction of the human-readable message buried inside
792
+ an OpenAI / Azure SDK error.
793
+ """
794
+ body = getattr(exc, "body", None)
795
+ if isinstance(body, dict):
796
+ err = body.get("error") if isinstance(body.get("error"), dict) else body
797
+ if isinstance(err, dict):
798
+ msg = err.get("message")
799
+ if isinstance(msg, str) and msg:
800
+ inner = _strip_validation_envelope(msg)
801
+ return inner or msg
802
+ msg = getattr(exc, "message", None)
803
+ if isinstance(msg, str) and msg:
804
+ return _strip_validation_envelope(msg) or msg
805
+ return None
806
+
807
+
808
+ def _strip_validation_envelope(text: str) -> Optional[str]:
809
+ """Pull the ``Message: ...`` line out of the validation envelope that
810
+ Foundry returns inside ``error.message``. Returns ``None`` if no such
811
+ line is present so callers can fall back to the original text.
812
+ """
813
+ for line in text.splitlines():
814
+ s = line.strip()
815
+ if s.lower().startswith("message:"):
816
+ return s.split(":", 1)[1].strip()
817
+ return None
818
+
819
+
820
+ def _extract_report_url(run: Any) -> Optional[str]:
821
+ """Best-effort extraction of the portal URL from a run object."""
822
+ for attr in ("report_url", "reportUrl"):
823
+ value = getattr(run, attr, None)
824
+ if isinstance(value, str) and value:
825
+ return value
826
+ metadata = getattr(run, "metadata", None)
827
+ if isinstance(metadata, dict):
828
+ for key in ("report_url", "reportUrl"):
829
+ value = metadata.get(key)
830
+ if isinstance(value, str) and value:
831
+ return value
832
+ return None
833
+
834
+
835
+ def _list_output_items(
836
+ openai_client: Any,
837
+ *,
838
+ eval_id: str,
839
+ run_id: str,
840
+ progress: Callable[[str], None],
841
+ ) -> List[Dict[str, Any]]:
842
+ """Download per-row output items from a completed Foundry eval run.
843
+
844
+ Returns a list of dicts (one per dataset row) containing the original
845
+ ``datasource_item`` (input row), the ``sample`` returned by the agent,
846
+ and the per-criterion ``results``. Returns ``[]`` on any failure so
847
+ the orchestrator can still emit a ``results.json`` that records the
848
+ Foundry portal URL (no fallback to local invocation).
849
+ """
850
+ try:
851
+ # The OpenAI Evals API exposes a paginated list endpoint at
852
+ # ``client.evals.runs.output_items.list``. We accept either a
853
+ # paginator object with ``.data`` / iteration, or a plain list.
854
+ output_items_api = openai_client.evals.runs.output_items
855
+ page = output_items_api.list(eval_id=eval_id, run_id=run_id)
856
+ except Exception as exc: # noqa: BLE001
857
+ logger.debug("could not list output_items: %s", exc)
858
+ progress(
859
+ f"cloud: WARNING - could not download per-row results "
860
+ f"({exc.__class__.__name__}); local results.json will record the "
861
+ f"portal URL only."
862
+ )
863
+ return []
864
+
865
+ items: List[Dict[str, Any]] = []
866
+ try:
867
+ iterable = getattr(page, "data", None) or page
868
+ for raw in iterable:
869
+ item = _coerce_output_item_to_dict(raw)
870
+ if item is not None:
871
+ items.append(item)
872
+ except Exception as exc: # noqa: BLE001
873
+ logger.debug("could not iterate output_items: %s", exc)
874
+ progress(
875
+ f"cloud: WARNING - failed to iterate output_items "
876
+ f"({exc.__class__.__name__}); local results.json will be thin."
877
+ )
878
+ return []
879
+
880
+ progress(f"cloud: downloaded {len(items)} output item(s)")
881
+ return items
882
+
883
+
884
+ def _coerce_output_item_to_dict(raw: Any) -> Optional[Dict[str, Any]]:
885
+ """Convert an SDK output item (Pydantic model or dict) into a plain dict."""
886
+ if isinstance(raw, dict):
887
+ return raw
888
+ for method in ("model_dump", "to_dict", "dict"):
889
+ fn = getattr(raw, method, None)
890
+ if callable(fn):
891
+ try:
892
+ value = fn()
893
+ if isinstance(value, dict):
894
+ return value
895
+ except Exception: # noqa: BLE001
896
+ continue
897
+ # Fallback: pull known attributes off the object.
898
+ keys = ("id", "status", "datasource_item", "sample", "results")
899
+ if any(hasattr(raw, k) for k in keys):
900
+ return {k: getattr(raw, k, None) for k in keys}
901
+ return None