agentops-accelerator 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentops/__init__.py +10 -0
- agentops/__main__.py +6 -0
- agentops/agent/__init__.py +12 -0
- agentops/agent/_legacy_ids.py +92 -0
- agentops/agent/analyzer.py +207 -0
- agentops/agent/checks/__init__.py +1 -0
- agentops/agent/checks/catalog.py +880 -0
- agentops/agent/checks/errors.py +279 -0
- agentops/agent/checks/foundry_config.py +75 -0
- agentops/agent/checks/latency.py +84 -0
- agentops/agent/checks/opex.py +157 -0
- agentops/agent/checks/opex_workspace.py +874 -0
- agentops/agent/checks/posture.py +36 -0
- agentops/agent/checks/posture_rules/__init__.py +53 -0
- agentops/agent/checks/posture_rules/content_filter.py +59 -0
- agentops/agent/checks/posture_rules/diagnostics.py +74 -0
- agentops/agent/checks/posture_rules/local_auth.py +55 -0
- agentops/agent/checks/posture_rules/managed_identity.py +59 -0
- agentops/agent/checks/posture_rules/network.py +68 -0
- agentops/agent/checks/regression.py +78 -0
- agentops/agent/checks/release_readiness.py +182 -0
- agentops/agent/checks/safety.py +247 -0
- agentops/agent/checks/spec_conformance.py +375 -0
- agentops/agent/cockpit.py +5159 -0
- agentops/agent/config.py +240 -0
- agentops/agent/findings.py +113 -0
- agentops/agent/history.py +142 -0
- agentops/agent/knowledge/__init__.py +182 -0
- agentops/agent/knowledge/waf-checklist.csv +39 -0
- agentops/agent/llm_assist/__init__.py +16 -0
- agentops/agent/llm_assist/_base.py +124 -0
- agentops/agent/llm_assist/_bundle_rule.py +154 -0
- agentops/agent/llm_assist/_client.py +347 -0
- agentops/agent/llm_assist/_dataset_rules.py +191 -0
- agentops/agent/llm_assist/_engine.py +106 -0
- agentops/agent/llm_assist/_prompt_rules.py +291 -0
- agentops/agent/llm_assist/_spec_rules.py +235 -0
- agentops/agent/production_telemetry.py +430 -0
- agentops/agent/report.py +207 -0
- agentops/agent/server/__init__.py +1 -0
- agentops/agent/server/app.py +84 -0
- agentops/agent/server/auth.py +94 -0
- agentops/agent/server/chat.py +44 -0
- agentops/agent/server/protocol.py +72 -0
- agentops/agent/sources/__init__.py +1 -0
- agentops/agent/sources/azure_monitor.py +523 -0
- agentops/agent/sources/azure_resources.py +602 -0
- agentops/agent/sources/foundry_control.py +174 -0
- agentops/agent/sources/results_history.py +494 -0
- agentops/agent/sources/spec_detectors/__init__.py +42 -0
- agentops/agent/sources/spec_detectors/_base.py +58 -0
- agentops/agent/sources/spec_detectors/agents_md.py +75 -0
- agentops/agent/sources/spec_detectors/spec_kit.py +172 -0
- agentops/agent/time_range.py +117 -0
- agentops/cli/__init__.py +1 -0
- agentops/cli/app.py +4823 -0
- agentops/core/__init__.py +1 -0
- agentops/core/agentops_config.py +592 -0
- agentops/core/config_loader.py +22 -0
- agentops/core/evaluators.py +480 -0
- agentops/core/release_evidence.py +56 -0
- agentops/core/results.py +117 -0
- agentops/mcp/__init__.py +10 -0
- agentops/mcp/server.py +232 -0
- agentops/pipeline/__init__.py +8 -0
- agentops/pipeline/cloud_results.py +189 -0
- agentops/pipeline/cloud_runner.py +901 -0
- agentops/pipeline/comparison.py +108 -0
- agentops/pipeline/diagnostics.py +51 -0
- agentops/pipeline/invocations.py +535 -0
- agentops/pipeline/official_eval.py +414 -0
- agentops/pipeline/orchestrator.py +775 -0
- agentops/pipeline/prompt_deploy.py +377 -0
- agentops/pipeline/publisher.py +121 -0
- agentops/pipeline/reporter.py +202 -0
- agentops/pipeline/runtime.py +409 -0
- agentops/pipeline/thresholds.py +84 -0
- agentops/services/__init__.py +1 -0
- agentops/services/cicd.py +720 -0
- agentops/services/eval_analysis.py +848 -0
- agentops/services/evidence_pack.py +757 -0
- agentops/services/initializer.py +86 -0
- agentops/services/preflight.py +470 -0
- agentops/services/setup_wizard.py +709 -0
- agentops/services/skills.py +643 -0
- agentops/services/trace_promotion.py +300 -0
- agentops/services/workflow_analysis.py +1129 -0
- agentops/templates/.gitignore +15 -0
- agentops/templates/__init__.py +1 -0
- agentops/templates/agent-server/Dockerfile +23 -0
- agentops/templates/agent-server/README.md +61 -0
- agentops/templates/agent-server/main.bicep +94 -0
- agentops/templates/agent.yaml +87 -0
- agentops/templates/agentops.yaml +58 -0
- agentops/templates/foundry.svg +71 -0
- agentops/templates/icon.png +0 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-dev.yml +73 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod-azd.yml +141 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prod.yml +94 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-prompt-agent.yml +167 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa-azd.yml +118 -0
- agentops/templates/pipelines/azuredevops/agentops-deploy-qa.yml +68 -0
- agentops/templates/pipelines/azuredevops/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/pipelines/azuredevops/agentops-pr.yml +155 -0
- agentops/templates/pipelines/azuredevops/agentops-watchdog.yml +106 -0
- agentops/templates/project.gitignore +36 -0
- agentops/templates/sample-traces.jsonl +3 -0
- agentops/templates/skills/agentops-agent/SKILL.md +137 -0
- agentops/templates/skills/agentops-config/SKILL.md +113 -0
- agentops/templates/skills/agentops-dataset/SKILL.md +84 -0
- agentops/templates/skills/agentops-eval/SKILL.md +189 -0
- agentops/templates/skills/agentops-report/SKILL.md +71 -0
- agentops/templates/skills/agentops-workflow/SKILL.md +471 -0
- agentops/templates/smoke.jsonl +3 -0
- agentops/templates/waf-checklist.README.md +84 -0
- agentops/templates/waf-checklist.csv +22 -0
- agentops/templates/workflows/agentops-deploy-dev-azd.yml +166 -0
- agentops/templates/workflows/agentops-deploy-dev.yml +187 -0
- agentops/templates/workflows/agentops-deploy-prod-azd.yml +183 -0
- agentops/templates/workflows/agentops-deploy-prod.yml +171 -0
- agentops/templates/workflows/agentops-deploy-prompt-agent.yml +197 -0
- agentops/templates/workflows/agentops-deploy-qa-azd.yml +156 -0
- agentops/templates/workflows/agentops-deploy-qa.yml +145 -0
- agentops/templates/workflows/agentops-pr-prompt-agent.yml +210 -0
- agentops/templates/workflows/agentops-pr.yml +148 -0
- agentops/templates/workflows/agentops-watchdog.yml +122 -0
- agentops/utils/__init__.py +1 -0
- agentops/utils/azd_env.py +435 -0
- agentops/utils/azure_endpoints.py +62 -0
- agentops/utils/colors.py +47 -0
- agentops/utils/dotenv_loader.py +105 -0
- agentops/utils/foundry_discovery.py +229 -0
- agentops/utils/logging.py +59 -0
- agentops/utils/telemetry.py +554 -0
- agentops/utils/yaml.py +36 -0
- agentops_accelerator-0.3.0.dist-info/METADATA +278 -0
- agentops_accelerator-0.3.0.dist-info/RECORD +142 -0
- agentops_accelerator-0.3.0.dist-info/WHEEL +5 -0
- agentops_accelerator-0.3.0.dist-info/entry_points.txt +2 -0
- agentops_accelerator-0.3.0.dist-info/licenses/LICENSE +21 -0
- agentops_accelerator-0.3.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,901 @@
|
|
|
1
|
+
"""Cloud-side publisher: submit a run to the New Foundry Evaluations panel.
|
|
2
|
+
|
|
3
|
+
Unlike :mod:`agentops.pipeline.publisher` (which uploads metrics that
|
|
4
|
+
AgentOps already computed locally to the *Classic* Foundry Evaluations
|
|
5
|
+
panel via OneDP), this module asks **Foundry to execute the agent and the
|
|
6
|
+
evaluators server-side** through the OpenAI Evals API.
|
|
7
|
+
|
|
8
|
+
The flow:
|
|
9
|
+
|
|
10
|
+
1. Build an :class:`azure.ai.projects.AIProjectClient` from the configured
|
|
11
|
+
project endpoint using ``DefaultAzureCredential``.
|
|
12
|
+
2. Get the OpenAI client via ``project_client.get_openai_client()``. We do
|
|
13
|
+
**not** pass ``api_version`` - the SDK picks the correct one (passing
|
|
14
|
+
one explicitly has historically caused 404s in this codebase).
|
|
15
|
+
3. Resolve the dataset source: a synced Foundry dataset (``file_id``) or,
   in ``inline`` mode or as the ``auto`` fallback, inline ``file_content`` rows.
|
|
16
|
+
4. Create the eval definition with ``client.evals.create(...)``, mapping
|
|
17
|
+
each AgentOps evaluator preset onto an ``azure_ai_evaluator`` testing
|
|
18
|
+
criterion.
|
|
19
|
+
5. Create the run with ``client.evals.runs.create(...)``, pointing at the
|
|
20
|
+
dataset source from step 3 and using ``azure_ai_target_completions`` with an
|
|
21
|
+
``agent_reference`` so Foundry invokes the agent itself.
|
|
22
|
+
6. Poll until the run terminates, then return identifiers + the portal URL.
|
|
23
|
+
|
|
24
|
+
This module never re-runs the agent locally and never invokes evaluators
|
|
25
|
+
locally; that work happens inside Foundry. The local ``results.json``
|
|
26
|
+
(produced before this hop) remains the canonical record from AgentOps's
|
|
27
|
+
point of view.
|
|
28
|
+
|
|
29
|
+
Limitations (documented in the YAML schema docstring as well):
|
|
30
|
+
|
|
31
|
+
* Only ``foundry_prompt`` agents (``name:version``) are supported. HTTP
|
|
32
|
+
endpoints, local adapters, and direct model deployments are rejected.
|
|
33
|
+
* Only builtin evaluators that map cleanly onto ``azure_ai_evaluator``
|
|
34
|
+
testing criteria are supported. Custom evaluators are skipped with a
|
|
35
|
+
warning.
|
|
36
|
+
* Latency reported by the New Foundry view is Foundry-to-Foundry, not the
|
|
37
|
+
client-perceived latency captured locally.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import hashlib
|
|
43
|
+
import json
|
|
44
|
+
import logging
|
|
45
|
+
import os
|
|
46
|
+
import re
|
|
47
|
+
import time
|
|
48
|
+
import uuid
|
|
49
|
+
from dataclasses import dataclass, field
|
|
50
|
+
from pathlib import Path
|
|
51
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
52
|
+
|
|
53
|
+
from agentops.core.agentops_config import DatasetSyncConfig
|
|
54
|
+
from agentops.core.results import RunResult
|
|
55
|
+
|
|
56
|
+
logger = logging.getLogger("agentops.pipeline.cloud_runner")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Public types
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass(frozen=True)
class CloudRunResult:
    """Immutable summary of one cloud (New Foundry) publish.

    The first five fields are required and positional: the eval id, the run
    id, the run status, the report URL (may be ``None``) and the evaluation
    display name. The two trailing fields default to empty containers.
    """

    eval_id: str
    run_id: str
    status: str
    report_url: Optional[str]
    evaluation_name: str

    # Raw per-row output items downloaded from the Foundry Evals API. Each
    # item is a dict with at least ``datasource_item`` (the original input
    # row), ``sample`` (the agent response), and ``results`` (per-criterion
    # scores). May be empty if the SDK returns no items or the download
    # failed (in which case orchestrator falls back to a thin RunResult that
    # just records the portal URL).
    output_items: List[Dict[str, Any]] = field(default_factory=list)

    # Dataset lineage and submission mode recorded for reports/Cockpit.
    dataset: Dict[str, Any] = field(default_factory=dict)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Map AgentOps evaluator class names to the OpenAI Evals API evaluator
# names that ``azure_ai_evaluator`` recognises. Any preset whose
# ``class_name`` is not in this map is skipped (with a warning) when
# building testing criteria.
_AZURE_AI_EVALUATOR_NAMES: Dict[str, str] = {
    "CoherenceEvaluator": "builtin.coherence",
    "FluencyEvaluator": "builtin.fluency",
    "SimilarityEvaluator": "builtin.similarity",
    "F1ScoreEvaluator": "builtin.f1_score",
    "RelevanceEvaluator": "builtin.relevance",
    "GroundednessEvaluator": "builtin.groundedness",
    "RetrievalEvaluator": "builtin.retrieval",
    "ResponseCompletenessEvaluator": "builtin.response_completeness",
    "ToolCallAccuracyEvaluator": "builtin.tool_call_accuracy",
    "IntentResolutionEvaluator": "builtin.intent_resolution",
    "TaskAdherenceEvaluator": "builtin.task_adherence",
}

# Evaluator class names whose testing criterion must carry a
# ``deployment_name`` initialization parameter. Building criteria for any of
# these without a configured deployment raises ``ValueError``. Every mapped
# evaluator except ``F1ScoreEvaluator`` is listed here.
_CLOUD_EVALUATORS_REQUIRING_DEPLOYMENT: set[str] = {
    "CoherenceEvaluator",
    "FluencyEvaluator",
    "SimilarityEvaluator",
    "RelevanceEvaluator",
    "GroundednessEvaluator",
    "RetrievalEvaluator",
    "ResponseCompletenessEvaluator",
    "ToolCallAccuracyEvaluator",
    "IntentResolutionEvaluator",
    "TaskAdherenceEvaluator",
}

# Translate AgentOps ``input_mapping`` placeholders into the template
# expressions used in a criterion's ``data_mapping``. Placeholders that are
# not listed here are left out of the mapping.
_CLOUD_PLACEHOLDERS: Dict[str, str] = {
    "$prompt": "{{item.input}}",
    "$prediction": "{{sample.output_text}}",
    "$expected": "{{item.expected}}",
    "$context": "{{item.context}}",
    "$tool_calls": "{{item.tool_calls}}",
    "$tool_definitions": "{{item.tool_definitions}}",
}


# Polling defaults for ``run_on_foundry_cloud``.
_DEFAULT_POLL_INTERVAL_SECONDS: float = 2.0
_DEFAULT_MAX_POLL_ATTEMPTS: int = 300  # 10 minutes at 2s intervals
# NOTE(review): not referenced in the visible part of this module; presumably
# the cadence of "still waiting" progress messages while polling - confirm.
_DEFAULT_HEARTBEAT_SECONDS: float = 10.0
# Run statuses treated as final; both spellings of "cancelled" are accepted.
# NOTE(review): presumably consumed by the polling helper - confirm.
_TERMINAL_STATUSES: set[str] = {"completed", "failed", "canceled", "cancelled"}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# Entry point
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def run_on_foundry_cloud(
    result: RunResult,
    *,
    dataset_path: Path,
    project_endpoint: str,
    evaluation_name: Optional[str] = None,
    dataset_sync: Optional[DatasetSyncConfig] = None,
    poll_interval_seconds: float = _DEFAULT_POLL_INTERVAL_SECONDS,
    max_poll_attempts: int = _DEFAULT_MAX_POLL_ATTEMPTS,
    progress: Optional[Callable[[str], None]] = None,
) -> CloudRunResult:
    """Submit ``result``'s target to Foundry for server-side evaluation.

    Parameters
    ----------
    result:
        Local run result. Used to derive the agent reference and the list
        of evaluator presets that should map onto ``azure_ai_evaluator``
        testing criteria.
    dataset_path:
        Path to the JSONL dataset to submit. Must already exist.
    project_endpoint:
        Foundry project endpoint URL (e.g.
        ``https://contoso.services.ai.azure.com/api/projects/p``).
    evaluation_name:
        Optional display name. Defaults to ``agentops-cloud-<short-uuid>``.
    dataset_sync:
        Optional submission policy; a default ``DatasetSyncConfig()`` is
        used when omitted. ``inline`` submits the rows as inline
        ``file_content``. ``foundry`` syncs the file as a Foundry dataset
        (``file_id`` source) and propagates any sync failure. Any other
        mode (``auto``) tries the Foundry sync first and falls back to
        inline rows when it fails. The outcome is recorded in the returned
        ``CloudRunResult.dataset`` lineage.
    poll_interval_seconds, max_poll_attempts:
        Control polling cadence and bound. The default budget is
        ~10 minutes.
    progress:
        Optional callback invoked with one-line status updates. The
        orchestrator wires this to the same channel that prints per-row
        progress so the user sees what is happening during the long
        cloud round-trip.

    Returns
    -------
    CloudRunResult
        Eval/run identifiers, final status, portal URL (if any), the
        downloaded per-row output items and the dataset lineage.

    Raises
    ------
    ImportError
        ``azure-ai-projects`` / ``azure-identity`` are not installed.
    ValueError
        Target is not a Foundry agent, the agent name or version is
        missing, the dataset is missing, or no evaluator maps onto an
        ``azure_ai_evaluator`` testing criterion.
    RuntimeError
        Polling timed out or the run terminated with a non-completed
        status.
    """
    # Normalise to a no-op so the rest of the function can call it freely.
    progress = progress or (lambda _msg: None)

    # Cheap local validation first, before importing the Azure SDKs.
    if result.target.kind != "foundry_prompt":
        raise ValueError(
            "publish: foundry_cloud only supports Foundry agents declared "
            "as 'name:version' (foundry_prompt targets). Got "
            f"target.kind={result.target.kind!r}."
        )
    if not dataset_path.exists():
        raise ValueError(f"dataset file not found: {dataset_path}")

    agent_name = result.target.name
    agent_version = result.target.version
    if not agent_name or not agent_version:
        raise ValueError(
            "Cloud publish requires a fully qualified 'name:version' agent "
            f"reference; got name={agent_name!r} version={agent_version!r}"
        )

    # Imported lazily so the module can be imported without the Azure SDKs.
    try:
        from azure.ai.projects import AIProjectClient  # noqa: WPS433
        from azure.identity import DefaultAzureCredential  # noqa: WPS433
    except ImportError as exc:  # pragma: no cover - exercised only at runtime
        raise ImportError(
            "publish: foundry_cloud requires 'azure-ai-projects' and "
            "'azure-identity'. Install with:\n"
            "  pip install azure-ai-projects azure-identity"
        ) from exc

    # NOTE(review): neither the credential nor the project client is closed
    # before returning - confirm whether explicit cleanup is wanted here.
    credential = DefaultAzureCredential(exclude_developer_cli_credential=True, process_timeout=30)
    project_client = AIProjectClient(
        endpoint=project_endpoint,
        credential=credential,
    )

    # NB: do not pass api_version - the SDK chooses the right one. Passing
    # an explicit version has historically caused 404s in this codebase.
    openai_client = project_client.get_openai_client()

    eval_name = evaluation_name or f"agentops-cloud-{uuid.uuid4().hex[:8]}"
    testing_criteria = _build_testing_criteria(result)
    if not testing_criteria:
        raise ValueError(
            "no AgentOps evaluators map onto azure_ai_evaluator testing "
            "criteria; nothing to evaluate server-side."
        )

    progress(f"cloud: preparing run '{eval_name}'")
    progress(
        "cloud: remote Foundry evaluations are asynchronous; small smoke "
        "runs commonly take 30-90s depending on queueing, agent latency, "
        "and evaluator model latency."
    )

    # Resolve the item schema and the dataset source (Foundry dataset or
    # inline rows) together with the lineage record for the result.
    item_schema = _build_item_schema(dataset_path)
    source, dataset_lineage = _build_dataset_source(
        dataset_path,
        dataset_sync or DatasetSyncConfig(),
        project_client=project_client,
        progress=progress,
    )

    progress(
        f"cloud: creating eval ({len(testing_criteria)} criteria, "
        f"item_schema fields: {sorted(item_schema['properties'].keys())})"
    )
    eval_obj = openai_client.evals.create(
        name=eval_name,
        data_source_config={
            "type": "custom",
            "item_schema": item_schema,
            "include_sample_schema": True,
        },
        testing_criteria=testing_criteria,  # type: ignore[arg-type]
    )
    eval_id = eval_obj.id

    progress(
        f"cloud: starting run for agent {agent_name}:{agent_version}"
    )
    try:
        run_obj = openai_client.evals.runs.create(
            eval_id=eval_id,
            name=f"{eval_name}-run",
            data_source={  # type: ignore[arg-type]
                "type": "azure_ai_target_completions",
                "source": source,
                # Each dataset row's ``input`` field becomes the single
                # user message sent to the agent.
                "input_messages": {
                    "type": "template",
                    "template": [
                        {
                            "type": "message",
                            "role": "user",
                            "content": {
                                "type": "input_text",
                                "text": "{{item.input}}",
                            },
                        }
                    ],
                },
                "target": {
                    "type": "azure_ai_agent",
                    "name": agent_name,
                    "version": agent_version,
                },
            },
        )
    except Exception as exc:  # noqa: BLE001
        # Any run-creation failure is re-raised as the error built by
        # ``_friendly_run_create_error``, chained to the original.
        raise _friendly_run_create_error(
            exc, agent_name=agent_name, agent_version=agent_version
        ) from exc
    run_id = run_obj.id

    progress(
        f"cloud: polling run {run_id} (interval "
        f"{poll_interval_seconds:g}s, max {max_poll_attempts} attempts)"
    )
    final_run = _poll_until_terminal(
        openai_client,
        eval_id=eval_id,
        run_id=run_id,
        interval_seconds=poll_interval_seconds,
        max_attempts=max_poll_attempts,
        progress=progress,
    )

    status = getattr(final_run, "status", "unknown")
    report_url = _extract_report_url(final_run)

    # Anything other than "completed" is surfaced as a failure.
    if status != "completed":
        raise RuntimeError(
            f"cloud evaluation run {run_id} terminated with status "
            f"{status!r}; see {report_url or 'the Foundry portal'}."
        )

    progress(f"cloud: done. status={status}")

    # Download per-row results from Foundry so the local results.json can
    # be populated without re-invoking the agent client-side.
    output_items = _list_output_items(
        openai_client,
        eval_id=eval_id,
        run_id=run_id,
        progress=progress,
    )

    return CloudRunResult(
        eval_id=eval_id,
        run_id=run_id,
        status=status,
        report_url=report_url,
        evaluation_name=eval_name,
        output_items=output_items,
        dataset=dataset_lineage,
    )
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
# ---------------------------------------------------------------------------
|
|
343
|
+
# Helpers
|
|
344
|
+
# ---------------------------------------------------------------------------
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _build_testing_criteria(result: RunResult) -> List[Dict[str, Any]]:
    """Translate the run's evaluator presets into Azure AI testing criteria.

    The presets come from ``result.evaluators`` when at least one of its
    names is in the evaluator catalog. Otherwise they are recovered from
    the keys of ``result.aggregate_metrics`` via each preset's
    ``score_key``, which keeps older result payloads working.

    Presets in the ``runtime`` category, presets without an Azure evaluator
    mapping (logged as a warning) and repeated Azure evaluator names are
    left out.

    Raises:
        ValueError: An evaluator that needs a model deployment was selected
            but no deployment name is configured in the environment.
    """
    # Deferred import so modules that do not need the evaluator catalog do
    # not pay for loading it.
    from agentops.core.evaluators import CATALOG

    deployment = _evaluator_deployment_name()

    # CATALOG is keyed by preset name; aggregate metrics are keyed by score
    # key, hence the reverse index for the fallback path.
    score_key_index = {entry.score_key: entry for entry in CATALOG.values()}
    selected = [CATALOG[name] for name in result.evaluators if name in CATALOG]
    if not selected:
        selected = [
            score_key_index[key]
            for key in result.aggregate_metrics.keys()
            if score_key_index.get(key) is not None
        ]

    criteria: List[Dict[str, Any]] = []
    emitted: set = set()
    for preset in selected:
        # Latency is measured locally; Foundry keeps its own server-side view.
        if "runtime" in preset.categories:
            continue

        azure_name = _AZURE_AI_EVALUATOR_NAMES.get(preset.class_name)
        if not azure_name:
            logger.warning(
                "no azure_ai_evaluator mapping for %s; skipping in cloud run",
                preset.class_name,
            )
            continue

        if azure_name in emitted:
            continue
        emitted.add(azure_name)

        criterion: Dict[str, Any] = {
            "type": "azure_ai_evaluator",
            "name": preset.score_key,
            "evaluator_name": azure_name,
            "data_mapping": _build_cloud_data_mapping(preset),
        }

        needs_deployment = (
            preset.class_name in _CLOUD_EVALUATORS_REQUIRING_DEPLOYMENT
        )
        if needs_deployment and not deployment:
            raise ValueError(
                "publish: foundry_cloud requires AZURE_OPENAI_DEPLOYMENT "
                "or AZURE_AI_MODEL_DEPLOYMENT_NAME for Azure AI "
                f"evaluator {preset.class_name}."
            )
        if needs_deployment:
            criterion["initialization_parameters"] = {
                "deployment_name": deployment,
            }

        criteria.append(criterion)

    return criteria
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _evaluator_deployment_name() -> Optional[str]:
|
|
410
|
+
return os.getenv("AZURE_OPENAI_DEPLOYMENT") or os.getenv(
|
|
411
|
+
"AZURE_AI_MODEL_DEPLOYMENT_NAME"
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _build_cloud_data_mapping(preset: Any) -> Dict[str, str]:
    """Build the ``data_mapping`` of a testing criterion for ``preset``.

    Each entry of ``preset.input_mapping`` is translated from its AgentOps
    placeholder to the matching cloud template expression. A ``$prediction``
    placeholder on a preset with a truthy ``needs_conversation`` attribute
    maps to the full output items instead of the output text. Placeholders
    with no known translation are omitted.
    """
    data_mapping: Dict[str, str] = {}
    for field_name, token in preset.input_mapping.items():
        if token == "$prediction" and getattr(preset, "needs_conversation", False):
            template = "{{sample.output_items}}"
        else:
            template = _CLOUD_PLACEHOLDERS.get(token)
        if template:
            data_mapping[field_name] = template
    return data_mapping
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _build_file_content_source(
|
|
428
|
+
dataset_path: Path,
|
|
429
|
+
*,
|
|
430
|
+
progress: Callable[[str], None],
|
|
431
|
+
) -> Dict[str, Any]:
|
|
432
|
+
"""Inline JSONL rows for Foundry target-completions runs.
|
|
433
|
+
|
|
434
|
+
New Foundry currently validates file-id sources by extension after the
|
|
435
|
+
upload is materialized server-side. Inline ``file_content`` avoids a
|
|
436
|
+
service-side filename loss where valid ``.jsonl`` uploads can be read back
|
|
437
|
+
as extensionless files.
|
|
438
|
+
"""
|
|
439
|
+
progress(f"cloud: preparing {dataset_path.name}")
|
|
440
|
+
content: List[Dict[str, Any]] = []
|
|
441
|
+
with dataset_path.open("r", encoding="utf-8") as handle:
|
|
442
|
+
for line_number, line in enumerate(handle, start=1):
|
|
443
|
+
text = line.strip()
|
|
444
|
+
if not text:
|
|
445
|
+
continue
|
|
446
|
+
row = json.loads(text)
|
|
447
|
+
if not isinstance(row, dict):
|
|
448
|
+
raise ValueError(
|
|
449
|
+
f"dataset row {line_number} must be a JSON object for "
|
|
450
|
+
"publish: foundry_cloud"
|
|
451
|
+
)
|
|
452
|
+
content.append({"item": row})
|
|
453
|
+
if not content:
|
|
454
|
+
raise ValueError("dataset must contain at least one row for publish: foundry_cloud")
|
|
455
|
+
progress(f"cloud: prepared {len(content)} row(s)")
|
|
456
|
+
return {
|
|
457
|
+
"type": "file_content",
|
|
458
|
+
"content": content,
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _build_dataset_source(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
    *,
    project_client: Any,
    progress: Callable[[str], None],
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    """Pick the run's dataset source and describe its lineage.

    * ``inline`` mode: rows are inlined as ``file_content``.
    * ``foundry`` mode: the file is synced as a Foundry dataset and any sync
      error propagates to the caller.
    * any other mode: the Foundry sync is attempted first; on failure the
      rows are inlined and the lineage records ``auto_fallback_inline``
      together with a short reason.

    Returns:
        ``(source, lineage)`` - the data source dict for the run and the
        lineage dict for reporting.
    """
    if dataset_sync.mode == "inline":
        inline_source = _build_file_content_source(dataset_path, progress=progress)
        return inline_source, _build_inline_dataset_lineage(dataset_path, dataset_sync)

    try:
        source, lineage = _build_foundry_dataset_source(
            dataset_path,
            dataset_sync,
            project_client=project_client,
            progress=progress,
        )
    except Exception as exc:  # noqa: BLE001
        # An explicit "foundry" request must not silently degrade.
        if dataset_sync.mode == "foundry":
            raise
        reason = _summarize_dataset_sync_error(exc)
        logger.debug(
            "Foundry dataset sync failed; falling back to inline file_content",
            exc_info=True,
        )
        progress(
            "cloud: dataset sync unavailable; using inline rows for this run. "
            f"Reason: {reason}"
        )
        fallback_source = _build_file_content_source(dataset_path, progress=progress)
        fallback_lineage = _build_inline_dataset_lineage(dataset_path, dataset_sync)
        fallback_lineage["status"] = "auto_fallback_inline"
        fallback_lineage["sync_error"] = reason
        return fallback_source, fallback_lineage
    else:
        return source, lineage
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _build_foundry_dataset_source(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
    *,
    project_client: Any,
    progress: Callable[[str], None],
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    """Sync the dataset file to Foundry and reference it by id.

    The dataset name comes from ``dataset_sync.name`` or is derived from the
    file path; the version is resolved from ``dataset_sync.version`` and the
    file's SHA-256 digest.

    Returns:
        ``(source, lineage)`` - a ``file_id`` source dict and the lineage
        dict describing the synced Foundry dataset.

    Raises:
        RuntimeError: The Foundry dataset object carries no id.
    """
    digest = _sha256_file(dataset_path)
    dataset_name = dataset_sync.name or _derived_foundry_dataset_name(dataset_path)
    dataset_version = _resolved_foundry_dataset_version(dataset_sync.version, digest)
    progress(f"cloud: syncing dataset to Foundry {dataset_name}@{dataset_version}")

    remote = _get_or_upload_foundry_dataset(
        project_client,
        name=dataset_name,
        version=dataset_version,
        dataset_path=dataset_path,
        progress=progress,
    )

    remote_id = _dataset_attr(remote, "id")
    if not remote_id:
        raise RuntimeError(
            f"Foundry dataset {dataset_name}@{dataset_version} did not return an id."
        )
    progress(f"cloud: using Foundry dataset {dataset_name}@{dataset_version}")

    source: Dict[str, Any] = {
        "type": "file_id",
        "id": remote_id,
    }
    lineage: Dict[str, Any] = {
        "mode": "foundry",
        "requested_mode": dataset_sync.mode,
        "source_type": "file_id",
        "local_path": str(dataset_path),
        "sha256": digest,
        "status": "synced",
        "foundry_name": dataset_name,
        "foundry_version": dataset_version,
        "foundry_id": remote_id,
        "foundry_uri": _dataset_attr(remote, "dataUri"),
    }
    return source, lineage
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
def _get_or_upload_foundry_dataset(
    project_client: Any,
    *,
    name: str,
    version: str,
    dataset_path: Path,
    progress: Callable[[str], None],
) -> Any:
    """Return the Foundry dataset ``name@version``, uploading it if absent.

    An existing dataset is reused. If the lookup fails with something that
    does not look like "not found", that error propagates. If the upload
    then fails with what looks like a conflict, the dataset is fetched
    again and reused; any other upload error propagates.
    """
    try:
        existing = project_client.datasets.get(name=name, version=version)
        progress(f"cloud: found existing Foundry dataset {name}@{version}")
        return existing
    except Exception as lookup_error:  # noqa: BLE001
        if not _looks_not_found(lookup_error):
            raise

    # Not found: upload the local file under the requested name/version.
    try:
        return project_client.datasets.upload_file(
            name=name,
            version=version,
            file_path=str(dataset_path),
        )
    except Exception as upload_error:  # noqa: BLE001
        if not _looks_conflict(upload_error):
            raise
        # The dataset appeared between the lookup and the upload.
        progress(
            f"cloud: Foundry dataset {name}@{version} already exists; "
            "reusing it"
        )
        return project_client.datasets.get(name=name, version=version)
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
def _build_inline_dataset_lineage(
    dataset_path: Path,
    dataset_sync: DatasetSyncConfig,
) -> Dict[str, Any]:
    """Build the lineage record used when rows are sent inline.

    The record always carries the local path and its SHA-256 digest.
    ``configured_name`` / ``configured_version`` are added only when the
    corresponding ``dataset_sync`` field is truthy.
    """
    behavior_note = (
        "Foundry may materialize inline rows as eval-data-* backing "
        "dataset assets in the project Data page."
    )
    record: Dict[str, Any] = {
        "mode": "inline",
        "requested_mode": dataset_sync.mode,
        "source_type": "file_content",
        "local_path": str(dataset_path),
        "sha256": _sha256_file(dataset_path),
        "status": "compatibility_inline",
        "foundry_behavior": behavior_note,
    }

    # Optional entries are appended in a fixed order after the core keys.
    optional_entries = (
        ("configured_name", dataset_sync.name),
        ("configured_version", dataset_sync.version),
    )
    for key, configured in optional_entries:
        if configured:
            record[key] = configured
    return record
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _summarize_dataset_sync_error(exc: Exception) -> str:
|
|
602
|
+
text = str(exc)
|
|
603
|
+
lower = text.lower()
|
|
604
|
+
if "defaultazurecredential failed to retrieve a token" in lower:
|
|
605
|
+
return (
|
|
606
|
+
"Azure authentication was unavailable for Foundry dataset sync. "
|
|
607
|
+
"Run `az login` or set `dataset_sync.mode: inline` to skip dataset "
|
|
608
|
+
"asset sync during quick demos."
|
|
609
|
+
)
|
|
610
|
+
if "azureclicredential: failed to invoke the azure cli" in lower:
|
|
611
|
+
return (
|
|
612
|
+
"Azure CLI authentication was unavailable for Foundry dataset sync. "
|
|
613
|
+
"Run `az login` or set `dataset_sync.mode: inline` for this run."
|
|
614
|
+
)
|
|
615
|
+
first_line = text.splitlines()[0].strip() if text else exc.__class__.__name__
|
|
616
|
+
if len(first_line) > 180:
|
|
617
|
+
first_line = first_line[:177] + "..."
|
|
618
|
+
return f"{exc.__class__.__name__}: {first_line}"
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def _derived_foundry_dataset_name(dataset_path: Path) -> str:
|
|
622
|
+
stem = dataset_path.stem.lower()
|
|
623
|
+
slug = re.sub(r"[^a-z0-9_-]+", "-", stem).strip("-_")
|
|
624
|
+
return f"agentops-{slug or 'dataset'}"
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def _resolved_foundry_dataset_version(configured: str, sha256: str) -> str:
|
|
628
|
+
if configured == "content-hash":
|
|
629
|
+
return f"sha256-{sha256[:16]}"
|
|
630
|
+
return configured
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def _dataset_attr(dataset: Any, name: str) -> Optional[str]:
|
|
634
|
+
value = getattr(dataset, name, None)
|
|
635
|
+
if value is None and isinstance(dataset, dict):
|
|
636
|
+
value = dataset.get(name)
|
|
637
|
+
return str(value) if value else None
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _looks_not_found(exc: Exception) -> bool:
|
|
641
|
+
status_code = getattr(exc, "status_code", None)
|
|
642
|
+
if status_code == 404:
|
|
643
|
+
return True
|
|
644
|
+
text = str(exc).lower()
|
|
645
|
+
return "not found" in text or "resource not found" in text or "404" in text
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _looks_conflict(exc: Exception) -> bool:
|
|
649
|
+
status_code = getattr(exc, "status_code", None)
|
|
650
|
+
if status_code == 409:
|
|
651
|
+
return True
|
|
652
|
+
text = str(exc).lower()
|
|
653
|
+
return "already exists" in text or "conflict" in text or "409" in text
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def _sha256_file(path: Path) -> str:
|
|
657
|
+
digest = hashlib.sha256()
|
|
658
|
+
with path.open("rb") as handle:
|
|
659
|
+
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
|
660
|
+
digest.update(chunk)
|
|
661
|
+
return digest.hexdigest()
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def _build_item_schema(dataset_path: Path) -> Dict[str, Any]:
|
|
665
|
+
"""Inspect the first dataset row to derive a JSON schema.
|
|
666
|
+
|
|
667
|
+
Foundry's Evals API requires an ``item_schema`` declaring the shape of
|
|
668
|
+
each row. We read the first non-empty line of the JSONL file and
|
|
669
|
+
advertise every top-level key as a string property; this is permissive
|
|
670
|
+
enough for typical AgentOps datasets (input, expected, context,
|
|
671
|
+
tool_calls, tool_definitions).
|
|
672
|
+
"""
|
|
673
|
+
properties: Dict[str, Dict[str, str]] = {}
|
|
674
|
+
with dataset_path.open("r", encoding="utf-8") as handle:
|
|
675
|
+
for line in handle:
|
|
676
|
+
line = line.strip()
|
|
677
|
+
if not line:
|
|
678
|
+
continue
|
|
679
|
+
row = json.loads(line)
|
|
680
|
+
if isinstance(row, dict):
|
|
681
|
+
for key in row.keys():
|
|
682
|
+
properties[str(key)] = {"type": "string"}
|
|
683
|
+
break
|
|
684
|
+
if not properties:
|
|
685
|
+
# Fall back to a single 'input' field so eval creation does not
|
|
686
|
+
# blow up on an empty dataset.
|
|
687
|
+
properties["input"] = {"type": "string"}
|
|
688
|
+
return {
|
|
689
|
+
"type": "object",
|
|
690
|
+
"properties": properties,
|
|
691
|
+
"required": list(properties.keys()),
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _poll_until_terminal(
    openai_client: Any,
    *,
    eval_id: str,
    run_id: str,
    interval_seconds: float,
    max_attempts: int,
    progress: Callable[[str], None],
) -> Any:
    """Poll ``runs.retrieve`` until the run reaches a terminal status.

    A progress line is emitted whenever the status changes, and otherwise
    once ``_DEFAULT_HEARTBEAT_SECONDS`` have passed since the last line.

    Args:
        openai_client: Client exposing ``evals.runs.retrieve``.
        eval_id: Identifier of the evaluation that owns the run.
        run_id: Identifier of the run to poll.
        interval_seconds: Pause between two consecutive polls.
        max_attempts: Maximum number of polls before giving up.
        progress: Callback that receives human-readable status lines.

    Returns:
        The run object whose status is in ``_TERMINAL_STATUSES``.

    Raises:
        RuntimeError: If no terminal status is seen within ``max_attempts``
            polls.
    """
    last_status: Optional[str] = None
    started = time.monotonic()
    last_progress_at = started
    for attempt in range(1, max_attempts + 1):
        run = openai_client.evals.runs.retrieve(eval_id=eval_id, run_id=run_id)
        status = getattr(run, "status", "unknown")
        now = time.monotonic()
        elapsed = now - started
        status_changed = status != last_status
        heartbeat_due = now - last_progress_at >= _DEFAULT_HEARTBEAT_SECONDS
        if status_changed or heartbeat_due:
            label = "run status ->" if status_changed else "still"
            progress(
                f"cloud: {label} {status} "
                f"(elapsed {_format_elapsed(elapsed)}, attempt {attempt}/{max_attempts})"
            )
            last_progress_at = now
        last_status = status
        if status in _TERMINAL_STATUSES:
            return run
        # No poll follows the final attempt, so sleeping there would only
        # delay the timeout error by one interval.
        if attempt < max_attempts:
            time.sleep(interval_seconds)
    raise RuntimeError(
        f"cloud evaluation run {run_id} did not finish within "
        f"{max_attempts} polls of {interval_seconds:g}s "
        f"(last status: {last_status!r})."
    )
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def _format_elapsed(seconds: float) -> str:
|
|
734
|
+
total = max(0, int(seconds))
|
|
735
|
+
minutes, remaining = divmod(total, 60)
|
|
736
|
+
if minutes:
|
|
737
|
+
return f"{minutes}m{remaining:02d}s"
|
|
738
|
+
return f"{remaining}s"
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _friendly_run_create_error(
    exc: Exception,
    *,
    agent_name: str,
    agent_version: str,
) -> Exception:
    """Turn an ``evals.runs.create`` failure into a short ``RuntimeError``.

    The message is taken from ``_extract_error_message`` (falling back to
    ``str(exc)``) and matched, in order, against three common cases:
    agent not found, permission denied, and rate limiting. Anything else
    is wrapped in a generic "could not start" message carrying the
    extracted text.

    The error is returned, not raised; raising it is left to the caller.
    """
    raw = _extract_error_message(exc) or str(exc)
    lowered = raw.lower()
    agent_ref = f"{agent_name}:{agent_version}"

    # Case 1: the referenced agent could not be resolved.
    if any(marker in lowered for marker in ("was not found", "resourcenotfound")):
        not_found_lines = (
            f"Agent '{agent_ref}' was not found in your Foundry project.",
            " - Verify the name and version in target.endpoint.agent_id"
            " (format: name:version).",
            " - Confirm AZURE_AI_FOUNDRY_PROJECT_ENDPOINT points to the"
            " project that owns the agent.",
            " - Make sure the agent is deployed; list agents in the"
            " Foundry portal under Agents.",
        )
        return RuntimeError("\n".join(not_found_lines))

    # Case 2: the caller is not allowed to run the evaluation.
    if "permission" in lowered or "forbidden" in lowered or "403" in raw:
        denied_lines = (
            "Foundry denied the evaluation request (permission).",
            " - Confirm you have access to the project that owns"
            f" agent '{agent_ref}'.",
            " - Try `az login` with the correct tenant or check the"
            " managed identity assigned to this environment.",
        )
        return RuntimeError("\n".join(denied_lines))

    # Case 3: throttling or quota exhaustion.
    if "quota" in lowered or "ratelimit" in lowered or "429" in raw:
        return RuntimeError(
            "Foundry rate-limited the evaluation request. Retry in a "
            "few minutes, or reduce dataset size."
        )

    return RuntimeError(f"Cloud evaluation could not start: {raw}")
|
|
788
|
+
|
|
789
|
+
|
|
790
|
+
def _extract_error_message(exc: Exception) -> Optional[str]:
    """Best-effort lookup of the readable message inside an SDK error.

    Two places are consulted in order: the ``message`` entry of
    ``exc.body`` (inside its ``error`` mapping when there is one), then
    ``exc.message``. The first non-empty string wins and is passed through
    ``_strip_validation_envelope``; when that yields nothing the string is
    returned unchanged. Returns ``None`` when neither place holds a
    non-empty string.
    """

    def _candidates():
        # Yield lazily so ``exc.message`` is only read when the body
        # did not provide a usable message.
        body = getattr(exc, "body", None)
        if isinstance(body, dict):
            nested = body.get("error")
            envelope = nested if isinstance(nested, dict) else body
            yield envelope.get("message")
        yield getattr(exc, "message", None)

    for candidate in _candidates():
        if isinstance(candidate, str) and candidate:
            return _strip_validation_envelope(candidate) or candidate
    return None
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
def _strip_validation_envelope(text: str) -> Optional[str]:
|
|
809
|
+
"""Pull the ``Message: ...`` line out of the validation envelope that
|
|
810
|
+
Foundry returns inside ``error.message``. Returns ``None`` if no such
|
|
811
|
+
line is present so callers can fall back to the original text.
|
|
812
|
+
"""
|
|
813
|
+
for line in text.splitlines():
|
|
814
|
+
s = line.strip()
|
|
815
|
+
if s.lower().startswith("message:"):
|
|
816
|
+
return s.split(":", 1)[1].strip()
|
|
817
|
+
return None
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
def _extract_report_url(run: Any) -> Optional[str]:
|
|
821
|
+
"""Best-effort extraction of the portal URL from a run object."""
|
|
822
|
+
for attr in ("report_url", "reportUrl"):
|
|
823
|
+
value = getattr(run, attr, None)
|
|
824
|
+
if isinstance(value, str) and value:
|
|
825
|
+
return value
|
|
826
|
+
metadata = getattr(run, "metadata", None)
|
|
827
|
+
if isinstance(metadata, dict):
|
|
828
|
+
for key in ("report_url", "reportUrl"):
|
|
829
|
+
value = metadata.get(key)
|
|
830
|
+
if isinstance(value, str) and value:
|
|
831
|
+
return value
|
|
832
|
+
return None
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
def _list_output_items(
    openai_client: Any,
    *,
    eval_id: str,
    run_id: str,
    progress: Callable[[str], None],
) -> List[Dict[str, Any]]:
    """Download per-row output items from a completed Foundry eval run.

    Returns a list of dicts (one per dataset row) containing the original
    ``datasource_item`` (input row), the ``sample`` returned by the agent,
    and the per-criterion ``results``. Returns ``[]`` on any failure so
    the orchestrator can still emit a ``results.json`` that records the
    Foundry portal URL (no fallback to local invocation).

    All pages of the listing are consumed: after the items of one page are
    collected, the next page is fetched for as long as the page object
    reports ``has_next_page()``.

    Args:
        openai_client: Client exposing ``evals.runs.output_items.list``.
        eval_id: Identifier of the evaluation that owns the run.
        run_id: Identifier of the completed run.
        progress: Callback that receives human-readable status lines.

    Returns:
        The collected items as plain dicts, or ``[]`` if listing or
        iterating failed.
    """
    try:
        # The OpenAI Evals API exposes a paginated list endpoint at
        # ``client.evals.runs.output_items.list``. We accept either a
        # paginator object with ``.data`` / iteration, or a plain list.
        output_items_api = openai_client.evals.runs.output_items
        page = output_items_api.list(eval_id=eval_id, run_id=run_id)
    except Exception as exc:  # noqa: BLE001
        logger.debug("could not list output_items: %s", exc)
        progress(
            "cloud: WARNING - could not download per-row results "
            f"({exc.__class__.__name__}); local results.json will record the "
            "portal URL only."
        )
        return []

    items: List[Dict[str, Any]] = []
    try:
        while True:
            iterable = getattr(page, "data", None) or page
            for raw in iterable:
                item = _coerce_output_item_to_dict(raw)
                if item is not None:
                    items.append(item)
            # ``.data`` holds a single page only, so follow the paginator
            # explicitly. ``is True`` keeps plain lists and loose test
            # doubles (whose methods may return truthy non-bools) on the
            # single-page path.
            has_next_page = getattr(page, "has_next_page", None)
            if not callable(has_next_page) or has_next_page() is not True:
                break
            page = page.get_next_page()
    except Exception as exc:  # noqa: BLE001
        logger.debug("could not iterate output_items: %s", exc)
        progress(
            "cloud: WARNING - failed to iterate output_items "
            f"({exc.__class__.__name__}); local results.json will be thin."
        )
        return []

    progress(f"cloud: downloaded {len(items)} output item(s)")
    return items
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
def _coerce_output_item_to_dict(raw: Any) -> Optional[Dict[str, Any]]:
|
|
885
|
+
"""Convert an SDK output item (Pydantic model or dict) into a plain dict."""
|
|
886
|
+
if isinstance(raw, dict):
|
|
887
|
+
return raw
|
|
888
|
+
for method in ("model_dump", "to_dict", "dict"):
|
|
889
|
+
fn = getattr(raw, method, None)
|
|
890
|
+
if callable(fn):
|
|
891
|
+
try:
|
|
892
|
+
value = fn()
|
|
893
|
+
if isinstance(value, dict):
|
|
894
|
+
return value
|
|
895
|
+
except Exception: # noqa: BLE001
|
|
896
|
+
continue
|
|
897
|
+
# Fallback: pull known attributes off the object.
|
|
898
|
+
keys = ("id", "status", "datasource_item", "sample", "results")
|
|
899
|
+
if any(hasattr(raw, k) for k in keys):
|
|
900
|
+
return {k: getattr(raw, k, None) for k in keys}
|
|
901
|
+
return None
|