azure-ai-evaluation 1.11.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.

Note: this version of azure-ai-evaluation has been flagged as a potentially problematic release.
Files changed (35)
  1. azure/ai/evaluation/_aoai/aoai_grader.py +63 -19
  2. azure/ai/evaluation/_aoai/label_grader.py +8 -3
  3. azure/ai/evaluation/_aoai/python_grader.py +8 -3
  4. azure/ai/evaluation/_aoai/score_model_grader.py +8 -3
  5. azure/ai/evaluation/_aoai/string_check_grader.py +9 -4
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +9 -4
  7. azure/ai/evaluation/_eval_mapping.py +2 -0
  8. azure/ai/evaluation/_evaluate/_evaluate.py +106 -4
  9. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +252 -48
  10. azure/ai/evaluation/_evaluate/_utils.py +7 -3
  11. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  12. azure/ai/evaluation/_evaluators/_common/_base_eval.py +77 -3
  13. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  14. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +6 -0
  15. azure/ai/evaluation/_evaluators/_path_efficiency/__init__.py +7 -0
  16. azure/ai/evaluation/_evaluators/_path_efficiency/_path_efficiency.py +342 -0
  17. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +7 -1
  18. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  19. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  20. azure/ai/evaluation/_evaluators/_task_success/__init__.py +7 -0
  21. azure/ai/evaluation/_evaluators/_task_success/_task_success.py +168 -0
  22. azure/ai/evaluation/_evaluators/_task_success/task_success.prompty +220 -0
  23. azure/ai/evaluation/_exceptions.py +1 -0
  24. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +7 -2
  25. azure/ai/evaluation/_version.py +1 -1
  26. azure/ai/evaluation/red_team/_mlflow_integration.py +454 -35
  27. azure/ai/evaluation/red_team/_red_team.py +9 -0
  28. azure/ai/evaluation/red_team/_red_team_result.py +230 -1
  29. azure/ai/evaluation/red_team/_result_processor.py +416 -23
  30. azure/ai/evaluation/red_team/_utils/formatting_utils.py +1 -1
  31. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/METADATA +13 -3
  32. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/RECORD +35 -30
  33. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/WHEEL +0 -0
  34. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/licenses/NOTICE.txt +0 -0
  35. {azure_ai_evaluation-1.11.1.dist-info → azure_ai_evaluation-1.12.0.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict
 import json
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -244,6 +244,91 @@ class AttackDetails(TypedDict):
     attack_success_threshold: Optional[int]
 
 
+@experimental
+class RedTeamOutputResultProperties(TypedDict, total=False):
+    """Additional metadata captured for each evaluation result."""
+
+    attack_success: Optional[bool]
+    attack_success_threshold: Optional[int]
+    attack_technique: str
+    attack_complexity: str
+    risk_category: str
+    risk_assessment: Optional[Dict[str, Any]]
+    reason: Optional[str]
+    severity_label: Optional[str]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class EvaluationRunOutputItemMessage(TypedDict, total=False):
+    """Representation of a single message within an evaluation sample."""
+
+    role: str
+    content: Any
+    name: Optional[str]
+    tool_calls: Optional[List[Dict[str, Any]]]
+
+
+@experimental
+class RedTeamRunOutputItemResult(TypedDict, total=False):
+    """Flattened evaluation result for a single risk category."""
+
+    # Should extend EvaluationRunOutputItemResult
+
+    object: str
+    type: str
+    name: str
+    passed: Optional[bool]
+    score: Optional[float]
+    metric: Optional[str]
+    threshold: Optional[float]
+    reason: Optional[str]
+    sample: "RedTeamRunOutputItemSample"
+    properties: RedTeamOutputResultProperties
+
+
+@experimental
+class RedTeamDatasourceItem(TypedDict, total=False):
+    """Metadata about the datasource item that produced this conversation."""
+
+    id: Optional[str]
+    input_data: Dict[str, Any]
+
+
+@experimental
+class RedTeamRunOutputItemSample(TypedDict, total=False):
+    """Sample payload containing the red team conversation."""
+
+    # Should extend EvaluationRunOutputItemSample
+
+    object: str
+    input: List[EvaluationRunOutputItemMessage]
+    output: List[EvaluationRunOutputItemMessage]
+    finish_reason: Optional[str]
+    model: Optional[str]
+    error: Optional[Dict[str, Any]]
+    usage: Optional[Dict[str, Any]]
+    seed: Optional[int]
+    temperature: Optional[float]
+    top_p: Optional[float]
+    max_completion_tokens: Optional[float]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class RedTeamOutputItem(TypedDict, total=False):
+    """Structured representation of a conversation and its evaluation artifacts."""
+
+    object: str
+    id: str
+    created_time: int
+    status: str
+    datasource_item_id: Optional[str]
+    datasource_item: Optional[RedTeamDatasourceItem]
+    sample: RedTeamRunOutputItemSample
+    results: List[RedTeamRunOutputItemResult]
+
+
 @experimental
 class ScanResult(TypedDict):
     """TypedDict representation of a Red Team Agent evaluation result with the updated structure.
@@ -254,6 +339,12 @@ class ScanResult(TypedDict):
     :type parameters: RedTeamingParameters
     :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
     :type attack_details: List[AttackDetails]
+    :param output_items: List of structured output items from the evaluation
+    :type output_items: List[RedTeamOutputItem]
+    :param AOAI_Compatible_Row_Results: List of evaluation results for each risk category
+    :type AOAI_Compatible_Row_Results: List[RedTeamRunOutputItemResult]
+    :param AOAI_Compatible_Summary: The evaluation run metadata in eval.run format
+    :type AOAI_Compatible_Summary: RedTeamRun
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
@@ -261,9 +352,147 @@ class ScanResult(TypedDict):
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
+    output_items: List[RedTeamOutputItem]
+    AOAI_Compatible_Row_Results: List[RedTeamRunOutputItemResult]
+    AOAI_Compatible_Summary: "RedTeamRun"
     studio_url: Optional[str]
 
 
+@experimental
+class ResultCount(TypedDict):
+    """Count of evaluation results by status.
+
+    :param total: Total number of evaluation results
+    :type total: int
+    :param passed: Number of passed evaluation results
+    :type passed: int
+    :param failed: Number of failed evaluation results
+    :type failed: int
+    :param errored: Number of errored evaluation results
+    :type errored: int
+    """
+
+    total: int
+    passed: int
+    failed: int
+    errored: int
+
+
+@experimental
+class PerTestingCriteriaResult(TypedDict):
+    """Result count for a specific testing criteria.
+
+    :param testing_criteria: The name of the testing criteria (e.g., risk category)
+    :type testing_criteria: str
+    :param passed: Number of passed results for this criteria
+    :type passed: int
+    :param failed: Number of failed results for this criteria
+    :type failed: int
+    """
+
+    testing_criteria: str
+    passed: int
+    failed: int
+
+
+@experimental
+class DataSourceItemGenerationParams(TypedDict, total=False):
+    """Parameters for data source item generation.
+
+    :param type: Type of data source generation (e.g., "red_team")
+    :type type: str
+    :param attack_strategies: List of attack strategies used
+    :type attack_strategies: List[str]
+    :param num_turns: Number of turns in the conversation
+    :type num_turns: int
+    """
+
+    type: str
+    attack_strategies: List[str]
+    num_turns: int
+
+
+@experimental
+class DataSource(TypedDict, total=False):
+    """Data source information for the red team evaluation.
+
+    :param type: Type of data source (e.g., "azure_ai_red_team")
+    :type type: str
+    :param target: Target configuration for the data source
+    :type target: Dict[str, Any]
+    :param item_generation_params: Parameters used for generating data items
+    :type item_generation_params: DataSourceItemGenerationParams
+    """
+
+    type: str
+    target: Dict[str, Any]
+    item_generation_params: DataSourceItemGenerationParams
+
+
+@experimental
+class OutputItemsList(TypedDict):
+    """Wrapper for list of output items.
+
+    :param object: Object type identifier (always "list")
+    :type object: str
+    :param data: List of red team output items
+    :type data: List[RedTeamOutputItem]
+    """
+
+    object: str
+    data: List[RedTeamOutputItem]
+
+
+@experimental
+class RedTeamRun(TypedDict, total=False):
+    """TypedDict representation of a Red Team evaluation run in eval.run format.
+
+    :param object: Object type identifier (always "eval.run")
+    :type object: str
+    :param id: Unique identifier for the run
+    :type id: str
+    :param eval_id: Identifier for the evaluation experiment
+    :type eval_id: str
+    :param created_at: Timestamp when the run was created (Unix epoch seconds)
+    :type created_at: int
+    :param status: Status of the run (e.g., "completed", "failed", "in_progress")
+    :type status: str
+    :param name: Display name for the run
+    :type name: str
+    :param report_url: URL to view the run report in Azure AI Studio
+    :type report_url: Optional[str]
+    :param data_source: Information about the data source used for the evaluation
+    :type data_source: DataSource
+    :param metadata: Additional metadata for the run
+    :type metadata: Dict[str, Any]
+    :param result_count: Aggregated counts of evaluation results
+    :type result_count: ResultCount
+    :param per_model_usage: Usage statistics per model (if applicable)
+    :type per_model_usage: List[Any]
+    :param per_testing_criteria_results: Results aggregated by testing criteria
+    :type per_testing_criteria_results: List[PerTestingCriteriaResult]
+    :param output_items: Wrapped list of output items from the evaluation
+    :type output_items: OutputItemsList
+    :param conversations: Optional list of attack details/conversations
+    :type conversations: List[AttackDetails]
+    """
+
+    object: str
+    id: str
+    eval_id: str
+    created_at: int
+    status: str
+    name: str
+    report_url: Optional[str]
+    data_source: DataSource
+    metadata: Dict[str, Any]
+    result_count: ResultCount
+    per_model_usage: List[Any]
+    per_testing_criteria_results: List[PerTestingCriteriaResult]
+    output_items: OutputItemsList
+    conversations: List[AttackDetails]
+
+
 @experimental
 class RedTeamResult:
     def __init__(self, scan_result: Optional[ScanResult] = None, attack_details: Optional[List[AttackDetails]] = None):
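
RedTeamRun's result_count and per_testing_criteria_results are straightforward aggregations over the flattened rows, where passed is tri-state: True, False, or None for errored results. A sketch of that bookkeeping, using a hypothetical helper summarize_rows rather than the package's internal result processor:

    # Sketch only: aggregates flattened row results into the ResultCount and
    # PerTestingCriteriaResult shapes defined above; not package API.
    from collections import defaultdict
    from typing import Any, Dict, List

    def summarize_rows(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
        result_count = {"total": len(rows), "passed": 0, "failed": 0, "errored": 0}
        per_criteria: Dict[str, Dict[str, int]] = defaultdict(
            lambda: {"passed": 0, "failed": 0}
        )
        for row in rows:
            name = row.get("name", "unknown")  # e.g., the risk category
            if row.get("passed") is True:
                result_count["passed"] += 1
                per_criteria[name]["passed"] += 1
            elif row.get("passed") is False:
                result_count["failed"] += 1
                per_criteria[name]["failed"] += 1
            else:  # passed is None: the result errored
                result_count["errored"] += 1
        return {
            "result_count": result_count,
            "per_testing_criteria_results": [
                {"testing_criteria": name, **counts}
                for name, counts in per_criteria.items()
            ],
        }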