agentops-toolkit 0.2.3__tar.gz → 0.2.4__tar.gz

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (68)
  1. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/PKG-INFO +1 -1
  2. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/pyproject.toml +1 -1
  3. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/__init__.py +1 -1
  4. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/eval_cmd.py +1 -1
  5. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/report_cmd.py +1 -3
  6. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/pipeline.py +160 -8
  7. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/evaluators/base.py +10 -2
  8. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/config.py +3 -2
  9. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/README.md +0 -0
  10. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/__init__.py +0 -0
  11. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/agent_service.py +0 -0
  12. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/autogen.py +0 -0
  13. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/generic.py +0 -0
  14. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/registry.py +0 -0
  15. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/adapters/semantic_kernel.py +0 -0
  16. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/agent_quality.yaml +0 -0
  17. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/agent_safety.yaml +0 -0
  18. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/custom.yaml +0 -0
  19. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/multi_agent_quality.yaml +0 -0
  20. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_agentic_retrieval.yaml +0 -0
  21. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_complete.yaml +0 -0
  22. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_cross_iq.yaml +0 -0
  23. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_fabric_iq.yaml +0 -0
  24. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_foundry_iq.yaml +0 -0
  25. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_permission_aware.yaml +0 -0
  26. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_quality.yaml +0 -0
  27. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_safety.yaml +0 -0
  28. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/bundles/rag_work_iq.yaml +0 -0
  29. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/__init__.py +0 -0
  30. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/app.py +0 -0
  31. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/bundle_cmd.py +0 -0
  32. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/config_cmd.py +0 -0
  33. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/dataset_cmd.py +0 -0
  34. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/init_cmd.py +0 -0
  35. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/model_cmd.py +0 -0
  36. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/monitor_cmd.py +0 -0
  37. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/run_cmd.py +0 -0
  38. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/cli/trace_cmd.py +0 -0
  39. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/connectors/__init__.py +0 -0
  40. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/__init__.py +0 -0
  41. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/aggregator.py +0 -0
  42. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/bundle_registry.py +0 -0
  43. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/client.py +0 -0
  44. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/config_loader.py +0 -0
  45. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/errors.py +0 -0
  46. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/foundry_client.py +0 -0
  47. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/foundry_sdk_client.py +0 -0
  48. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/hooks.py +0 -0
  49. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/logging.py +0 -0
  50. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/persistence.py +0 -0
  51. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/rate_limiter.py +0 -0
  52. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/registry.py +0 -0
  53. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/core/runner.py +0 -0
  54. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/evaluators/__init__.py +0 -0
  55. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/evaluators/citation.py +0 -0
  56. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/evaluators/rag_iq.py +0 -0
  57. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/mcp/__init__.py +0 -0
  58. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/mcp/client.py +0 -0
  59. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/__init__.py +0 -0
  60. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/bundle.py +0 -0
  61. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/dataset.py +0 -0
  62. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/observability.py +0 -0
  63. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/rag.py +0 -0
  64. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/models/run.py +0 -0
  65. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/obs/__init__.py +0 -0
  66. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/obs/decorators.py +0 -0
  67. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/obs/monitor.py +0 -0
  68. {agentops_toolkit-0.2.3 → agentops_toolkit-0.2.4}/src/agentops_toolkit/obs/tracing.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: agentops-toolkit
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: CLI toolkit for evaluating, tracing, and monitoring AI agents on Azure AI Foundry
5
5
  Keywords: ai,agent,evaluation,azure,foundry,observability
6
6
  Author: DB Lee
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "agentops-toolkit"
3
- version = "0.2.3"
3
+ version = "0.2.4"
4
4
  description = "CLI toolkit for evaluating, tracing, and monitoring AI agents on Azure AI Foundry"
5
5
  readme = "README.md"
6
6
  license = { text = "MIT" }
@@ -1,3 +1,3 @@
1
1
  """AgentOps Toolkit — Evaluate, trace, and monitor AI agents."""
2
2
 
3
- __version__ = "0.2.3"
3
+ __version__ = "0.2.4"
@@ -72,7 +72,7 @@ def eval_run(
72
72
  # Resolve bundle
73
73
  bundle_name = bundle or (cfg.bundles.default if cfg else "rag_quality")
74
74
  project_connection = cfg.foundry.project_connection if cfg else ""
75
- model_deployment = getattr(cfg.foundry, "model_deployment", "gpt-4o") if cfg else "gpt-4o"
75
+ model_deployment = getattr(cfg.foundry, "model_deployment", "") if cfg else ""
76
76
  output_dir = cfg.runs.output_dir if cfg else "agentops/runs"
77
77
 
78
78
  # Run evaluation
@@ -150,9 +150,7 @@ def report_show(
150
150
  if evaluator:
151
151
  results = [er for er in results if er.evaluator_name == evaluator]
152
152
  scores_str = ", ".join(
153
- f"{er.evaluator_name}={er.score}"
154
- for er in results
155
- if er.score is not None
153
+ f"{er.evaluator_name}={er.score}" for er in results if er.score is not None
156
154
  )
157
155
  console.print(f" [{entry.status.value}] {entry.dataset_entry_id}: {scores_str}")
158
156
 
@@ -186,18 +186,139 @@ def compute_summary(entries: list[RunEntry], total_duration_ms: float) -> RunSum
186
186
  )
187
187
 
188
188
 
189
+ def _derive_ai_services_endpoint(project_endpoint: str) -> str:
190
+ """Derive the AI Services base endpoint from a project endpoint.
191
+
192
+ Project endpoint format:
193
+ https://<account>.services.ai.azure.com/api/projects/<project>
194
+ AI Services base endpoint:
195
+ https://<account>.services.ai.azure.com
196
+ """
197
+ from urllib.parse import urlparse
198
+
199
+ parsed = urlparse(project_endpoint)
200
+ return f"{parsed.scheme}://{parsed.netloc}"
201
+
202
+
203
+ def _is_reasoning_model(model_name: str) -> bool:
204
+ """Check if a model is an o-series reasoning model.
205
+
206
+ Reasoning models (o1, o3, o4-mini, etc.) require API version
207
+ 2024-12-01-preview or later, which is incompatible with the
208
+ azure-ai-evaluation SDK's default of 2024-02-15-preview.
209
+ """
210
+ import re
211
+
212
+ name = model_name.lower().strip()
213
+ # Match o-series: o1, o1-mini, o3, o4-mini, etc.
214
+ return bool(re.match(r"^o\d", name))
215
+
216
+
217
+ # The azure-ai-evaluation SDK uses "2024-02-15-preview" by default.
218
+ # Reasoning models require "2024-12-01-preview" or later.
219
+ _EVAL_SDK_API_VERSION = "2024-02-15-preview"
220
+ _REASONING_MODEL_API_VERSION = "2024-12-01-preview"
221
+
222
+
223
+ def _discover_chat_deployment(
224
+ project_endpoint: str,
225
+ ) -> tuple[str, str] | None:
226
+ """Auto-discover a chat-capable model deployment via AI Foundry SDK.
227
+
228
+ Returns a ``(deployment_name, api_version)`` tuple for the best candidate,
229
+ or *None* when no suitable deployment is found.
230
+
231
+ Selection criteria:
232
+ 1. Must have ``chat_completion`` capability.
233
+ 2. Prefers models compatible with the evaluation SDK's default API
234
+ version (``2024-02-15-preview``). Reasoning / o-series models that
235
+ require a newer version are ranked lower but still usable.
236
+ 3. Among compatible models, smaller / cheaper variants are preferred
237
+ (``mini`` → ``gpt-4.1`` → others).
238
+ """
239
+ try:
240
+ from azure.ai.projects import AIProjectClient
241
+ from azure.ai.projects.models import ModelDeployment
242
+ from azure.identity import DefaultAzureCredential
243
+
244
+ client = AIProjectClient(
245
+ credential=DefaultAzureCredential(),
246
+ endpoint=project_endpoint,
247
+ )
248
+
249
+ # Partition into compatible and reasoning-only buckets
250
+ compatible: list[tuple[str, str]] = [] # (name, model_name)
251
+ reasoning_only: list[tuple[str, str]] = []
252
+
253
+ for deployment in client.deployments.list():
254
+ if not isinstance(deployment, ModelDeployment):
255
+ continue
256
+ caps = deployment.capabilities or {}
257
+ if caps.get("chat_completion") != "true":
258
+ continue
259
+ model_name = deployment.model_name or deployment.name
260
+ if _is_reasoning_model(model_name):
261
+ reasoning_only.append((deployment.name, model_name))
262
+ else:
263
+ compatible.append((deployment.name, model_name))
264
+
265
+ def _pick_preferred(
266
+ candidates: list[tuple[str, str]],
267
+ ) -> str | None:
268
+ """Return the best deployment name from *candidates*."""
269
+ preferred = ["mini", "4.1-mini", "4o-mini", "gpt-4.1"]
270
+ for pref in preferred:
271
+ for dep_name, mdl_name in candidates:
272
+ if pref in dep_name.lower() or pref in mdl_name.lower():
273
+ return dep_name
274
+ return candidates[0][0] if candidates else None
275
+
276
+ # First try compatible models (work with eval SDK default API version)
277
+ if compatible:
278
+ chosen = _pick_preferred(compatible)
279
+ if chosen:
280
+ logger.info(
281
+ "Auto-discovered deployment '%s' (api_version=%s)",
282
+ chosen,
283
+ _EVAL_SDK_API_VERSION,
284
+ )
285
+ return (chosen, _EVAL_SDK_API_VERSION)
286
+
287
+ # Fall back to reasoning models with a newer API version
288
+ if reasoning_only:
289
+ chosen = _pick_preferred(reasoning_only)
290
+ if chosen:
291
+ logger.warning(
292
+ "Only reasoning model deployments available. Using '%s' with api_version=%s",
293
+ chosen,
294
+ _REASONING_MODEL_API_VERSION,
295
+ )
296
+ return (chosen, _REASONING_MODEL_API_VERSION)
297
+
298
+ return None
299
+
300
+ except Exception as e:
301
+ logger.warning("Failed to auto-discover deployments: %s", e)
302
+ return None
303
+
304
+
189
305
  async def run_evaluation(
190
306
  dataset_path: str | Path,
191
307
  bundle_name: str,
192
308
  output_dir: str | Path = "agentops/runs",
193
309
  run_name: str = "default",
194
310
  project_connection: str = "",
195
- model_deployment: str = "gpt-4o",
311
+ model_deployment: str = "",
196
312
  ) -> Run:
197
313
  """Execute the full evaluation pipeline.
198
314
 
199
315
  For Sprint 1, this evaluates a pre-populated dataset (agent responses
200
316
  already in the JSONL) against the specified bundle's evaluators.
317
+
318
+ Args:
319
+ project_connection: AI Foundry project endpoint URL.
320
+ model_deployment: Model deployment name for LLM-judge evaluators.
321
+ If empty, auto-discovers a suitable chat model from the project.
201
322
  """
202
323
  output_dir = Path(output_dir)
203
324
 
@@ -213,16 +334,47 @@ async def run_evaluation(
213
334
  # Build model_config for LLM-judge evaluators
214
335
  model_config: dict[str, Any] | None = None
215
336
  if project_connection:
216
- try:
217
- from azure.identity import DefaultAzureCredential
337
+ # Derive the AI Services base endpoint from the project endpoint
338
+ azure_endpoint = _derive_ai_services_endpoint(project_connection)
339
+
340
+ # Resolve deployment + API version
341
+ deployment: str | None = None
342
+ api_version: str = _EVAL_SDK_API_VERSION
343
+
344
+ if model_deployment:
345
+ # Explicit deployment — pick correct API version
346
+ deployment = model_deployment
347
+ if _is_reasoning_model(model_deployment):
348
+ api_version = _REASONING_MODEL_API_VERSION
349
+ logger.info(
350
+ "Reasoning model detected; using api_version=%s",
351
+ api_version,
352
+ )
353
+ else:
354
+ # Auto-discover
355
+ result = _discover_chat_deployment(project_connection)
356
+ if result:
357
+ deployment, api_version = result
218
358
 
359
+ if deployment:
219
360
  model_config = {
220
- "azure_endpoint": project_connection,
221
- "azure_deployment": model_deployment,
222
- "credential": DefaultAzureCredential(),
361
+ "azure_endpoint": azure_endpoint,
362
+ "azure_deployment": deployment,
363
+ "api_version": api_version,
223
364
  }
224
- except ImportError:
225
- logger.warning("azure-identity not installed; LLM-judge evaluators may fail")
365
+ logger.info(
366
+ "Using model deployment '%s' at %s (api_version=%s)",
367
+ deployment,
368
+ azure_endpoint,
369
+ api_version,
370
+ )
371
+ else:
372
+ logger.warning(
373
+ "No chat-capable model deployment found. "
374
+ "LLM-judge evaluators will fail. "
375
+ "Deploy a chat model in your AI Foundry project or set "
376
+ "model_deployment in agentops.yaml."
377
+ )
226
378
 
227
379
  # Build evaluators
228
380
  evaluators = [
@@ -39,8 +39,16 @@ class BaseEvaluator(ABC):
39
39
 
40
40
  # Evaluators that use an LLM judge and require model_config
41
41
  _LLM_JUDGE_EVALUATORS: set[str] = {
42
- "groundedness", "relevance", "coherence", "fluency", "similarity",
43
- "hate_unfairness", "sexual", "violence", "self_harm", "protected_material",
42
+ "groundedness",
43
+ "relevance",
44
+ "coherence",
45
+ "fluency",
46
+ "similarity",
47
+ "hate_unfairness",
48
+ "sexual",
49
+ "violence",
50
+ "self_harm",
51
+ "protected_material",
44
52
  }
45
53
 
46
54
 
@@ -74,8 +74,9 @@ class FoundryConnection(BaseModel):
74
74
  description="Foundry project endpoint URL or ${ENV_VAR}",
75
75
  )
76
76
  model_deployment: str = Field(
77
- default="gpt-4o",
78
- description="Model deployment name for LLM-judge evaluators",
77
+ default="",
78
+ description="Model deployment name for LLM-judge evaluators. "
79
+ "If empty, auto-discovers a suitable chat model from the project.",
79
80
  )
80
81
  credential: CredentialType = CredentialType.DEFAULT
81
82
  rate_limit: RateLimitConfig = Field(default_factory=RateLimitConfig)