azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
  3. azure/ai/evaluation/_aoai/label_grader.py +6 -10
  4. azure/ai/evaluation/_aoai/python_grader.py +7 -10
  5. azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
  6. azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +241 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -2
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
  33. azure/ai/evaluation/_evaluate/_utils.py +10 -3
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
  38. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  39. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
  40. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
  41. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  42. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  43. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
  44. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  45. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  46. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  47. azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
  48. azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
  49. azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
  50. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
  52. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  53. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  55. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  56. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  57. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  58. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  59. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  60. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  61. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  62. azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
  63. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  64. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  65. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  66. azure/ai/evaluation/_exceptions.py +6 -1
  67. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  68. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  69. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  70. azure/ai/evaluation/_model_configurations.py +26 -0
  71. azure/ai/evaluation/_version.py +1 -1
  72. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  73. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  74. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  75. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  76. azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
  77. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  78. azure/ai/evaluation/red_team/_red_team.py +494 -37
  79. azure/ai/evaluation/red_team/_red_team_result.py +48 -28
  80. azure/ai/evaluation/red_team/_result_processor.py +558 -29
  81. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  82. azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
  83. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  84. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  85. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  86. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  87. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  88. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  90. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  91. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  92. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  94. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  95. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
  96. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
  97. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  98. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  99. {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_red_team_result.py
@@ -6,6 +6,12 @@ import json
 from azure.ai.evaluation._common._experimental import experimental
 
 
+# NOTE: The RedTeam* TypedDict classes below are being gradually deprecated in favor of
+# the standard EvalRun* models from azure.ai.evaluation._common.onedp.models._models.
+# New code should use EvalRunOutputItem instead of RedTeamOutputItem, etc.
+# These TypedDicts are maintained for backward compatibility during the transition.
+
+
 @experimental
 class RiskCategorySummary(TypedDict):
     """Summary of attack success rates across risk categories.
@@ -216,7 +222,7 @@ class RiskAssessment(TypedDict):
 
 
 @experimental
-class AttackDetails(TypedDict):
+class AttackDetails(TypedDict, total=False):
     """TypedDict representation of a simulated conversation in a red team evaluation.
 
     :param attack_success: Whether the attack was successful
@@ -233,6 +239,8 @@ class AttackDetails(TypedDict):
     :type risk_assessment: Optional[RiskAssessment]
     :param attack_success_threshold: The threshold value used to determine attack success
     :type attack_success_threshold: Optional[int]
+    :param risk_sub_type: Optional risk sub-category/sub-type for the attack
+    :type risk_sub_type: Optional[str]
     """
 
     attack_success: Optional[bool]
@@ -242,6 +250,7 @@ class AttackDetails(TypedDict):
     conversation: List[Dict[str, str]]
     risk_assessment: Optional[RiskAssessment]
     attack_success_threshold: Optional[int]
+    risk_sub_type: Optional[str]
 
 
 @experimental
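
With `AttackDetails` now declared `total=False`, every key becomes optional for type-checkers, and `risk_sub_type` joins the declared fields. A minimal sketch of what the change permits, with the class body reduced to fields visible in the hunks above:

```python
from typing import Dict, List, Optional, TypedDict


class AttackDetails(TypedDict, total=False):
    # Reduced to the fields shown in the hunks above, for illustration.
    attack_success: Optional[bool]
    conversation: List[Dict[str, str]]
    attack_success_threshold: Optional[int]
    risk_sub_type: Optional[str]  # new in 1.13.0


# total=False means a partial dict now type-checks; previously every
# declared key had to be present.
details: AttackDetails = {"conversation": [{"role": "user", "content": "..."}]}
details["risk_sub_type"] = "example-sub-type"  # hypothetical value
```
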
@@ -271,7 +280,11 @@ class EvaluationRunOutputItemMessage(TypedDict, total=False):
 
 @experimental
 class RedTeamRunOutputItemResult(TypedDict, total=False):
-    """Flattened evaluation result for a single risk category."""
+    """Flattened evaluation result for a single risk category.
+
+    :param label: String label "pass" or "fail" that aligns with the passed field
+    :type label: Optional[str]
+    """
 
     # Should extend EvaluationRunOutputItemResult
 
@@ -279,11 +292,11 @@ class RedTeamRunOutputItemResult(TypedDict, total=False):
     type: str
     name: str
     passed: Optional[bool]
+    label: Optional[str]
     score: Optional[float]
     metric: Optional[str]
     threshold: Optional[float]
     reason: Optional[str]
-    sample: "RedTeamRunOutputItemSample"
     properties: RedTeamOutputResultProperties
 
 
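
The new `label` field mirrors `passed` as a "pass"/"fail" string, and `sample` is dropped from this TypedDict. A hedged consumer-side sketch for normalizing the two fields, not part of the SDK:

```python
from typing import Optional


def normalize_label(passed: Optional[bool], label: Optional[str]) -> Optional[str]:
    # Prefer the explicit label; otherwise derive it from `passed`,
    # matching the "pass"/"fail" alignment described in the docstring above.
    if label is not None:
        return label
    if passed is None:
        return None
    return "pass" if passed else "fail"


assert normalize_label(True, None) == "pass"
assert normalize_label(None, "fail") == "fail"
```
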
@@ -317,7 +330,12 @@ class RedTeamRunOutputItemSample(TypedDict, total=False):
 
 @experimental
 class RedTeamOutputItem(TypedDict, total=False):
-    """Structured representation of a conversation and its evaluation artifacts."""
+    """Structured representation of a conversation and its evaluation artifacts.
+
+    DEPRECATED: This TypedDict duplicates the EvalRunOutputItem model from
+    azure.ai.evaluation._common.onedp.models._models. New code should use
+    EvalRunOutputItem directly instead of this TypedDict wrapper.
+    """
 
     object: str
     id: str
@@ -339,12 +357,10 @@ class ScanResult(TypedDict):
     :type parameters: RedTeamingParameters
     :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
     :type attack_details: List[AttackDetails]
-    :param output_items: List of structured output items from the evaluation
-    :type output_items: List[RedTeamOutputItem]
     :param AOAI_Compatible_Row_Results: List of evaluation results for each risk category
-    :type AOAI_Compatible_Row_Results: List[RedTeamRunOutputItemResult]
+    :type AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
     :param AOAI_Compatible_Summary: The evaluation run metadata in eval.run format
-    :type AOAI_Compatible_Summary: RedTeamRun
+    :type AOAI_Compatible_Summary: Optional[RedTeamRun]
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
@@ -352,9 +368,8 @@ class ScanResult(TypedDict):
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
-    output_items: List[RedTeamOutputItem]
-    AOAI_Compatible_Row_Results: List[RedTeamRunOutputItemResult]
-    AOAI_Compatible_Summary: "RedTeamRun"
+    AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    AOAI_Compatible_Summary: Optional["RedTeamRun"]
     studio_url: Optional[str]
 
 
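
Since `AOAI_Compatible_Row_Results` and `AOAI_Compatible_Summary` are now `Optional` (and `output_items` is removed from `ScanResult`), readers of serialized scan results should guard against `None`. A defensive-access sketch, consumer-side:

```python
from typing import Any, Dict


def count_row_results(scan_result: Dict[str, Any]) -> int:
    # A missing or None AOAI_Compatible_Row_Results is now a legal state
    # for ScanResult, so fall back to an empty list.
    rows = scan_result.get("AOAI_Compatible_Row_Results") or []
    return len(rows)
```
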
@@ -379,11 +394,13 @@ class ResultCount(TypedDict):
 
 
 @experimental
-class PerTestingCriteriaResult(TypedDict):
+class PerTestingCriteriaResult(TypedDict, total=False):
     """Result count for a specific testing criteria.
 
     :param testing_criteria: The name of the testing criteria (e.g., risk category)
     :type testing_criteria: str
+    :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
+    :type attack_strategy: Optional[str]
     :param passed: Number of passed results for this criteria
     :type passed: int
     :param failed: Number of failed results for this criteria
@@ -391,6 +408,7 @@ class PerTestingCriteriaResult(TypedDict):
     """
 
     testing_criteria: str
+    attack_strategy: Optional[str]
    passed: int
    failed: int
 
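
With `attack_strategy` optional on `PerTestingCriteriaResult`, a single result list can now carry both per-category and per-strategy aggregates. A hypothetical consumer-side grouping helper (not part of the SDK):

```python
from typing import Any, Dict, List, Tuple


def pass_rates(results: List[Dict[str, Any]]) -> Dict[Tuple[str, str], float]:
    # Key by (criteria, strategy); rows without attack_strategy are the
    # plain per-criteria totals and fall under "overall".
    rates: Dict[Tuple[str, str], float] = {}
    for row in results:
        key = (row["testing_criteria"], row.get("attack_strategy") or "overall")
        total = row["passed"] + row["failed"]
        rates[key] = row["passed"] / total if total else 0.0
    return rates
```
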
@@ -465,8 +483,8 @@ class RedTeamRun(TypedDict, total=False):
     :type data_source: DataSource
     :param metadata: Additional metadata for the run
     :type metadata: Dict[str, Any]
-    :param result_count: Aggregated counts of evaluation results
-    :type result_count: ResultCount
+    :param result_counts: Aggregated counts of evaluation results
+    :type result_counts: ResultCount
     :param per_model_usage: Usage statistics per model (if applicable)
     :type per_model_usage: List[Any]
     :param per_testing_criteria_results: Results aggregated by testing criteria
@@ -486,7 +504,7 @@ class RedTeamRun(TypedDict, total=False):
     report_url: Optional[str]
     data_source: DataSource
     metadata: Dict[str, Any]
-    result_count: ResultCount
+    result_counts: ResultCount
     per_model_usage: List[Any]
     per_testing_criteria_results: List[PerTestingCriteriaResult]
     output_items: OutputItemsList
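
The `result_count` → `result_counts` rename is a breaking key change for anyone reading serialized `RedTeamRun` payloads. A tolerant reader that accepts both spellings (consumer-side sketch, not SDK behavior):

```python
from typing import Any, Dict, Optional


def get_result_counts(run: Dict[str, Any]) -> Optional[Dict[str, int]]:
    # 1.13.0 writes "result_counts"; fall back to the pre-1.13.0 key so
    # payloads produced by older versions keep parsing.
    return run.get("result_counts", run.get("result_count"))
```
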
@@ -513,21 +531,23 @@ class RedTeamResult:
         return self.scan_result.get("scorecard", None) if self.scan_result else None
 
     def to_eval_qr_json_lines(self) -> str:
-        """
-        Converts conversations in messages format to query-response format suitable for evaluation.
+        """Converts conversations in messages format to query-response format suitable for evaluation.
 
         The output format follows the JSONL pattern with each line containing:
-        {
-            "query": "user message content",
-            "response": "assistant message content",
-            "risk_category": "risk category",
-            "attack_strategy": "strategy name",
-            "attack_complexity": "complexity level",
-            "attack_success": "true|false", (if available from evaluation)
-            "category": "risk category", (if available from evaluation)
-            "severity_level": "low|medium|high", (if available from evaluation)
-            "threshold": "threshold value" (if available from evaluation)
-        }
+
+        .. code-block:: javascript
+
+            {
+                "query": "user message content",
+                "response": "assistant message content",
+                "risk_category": "risk category",
+                "attack_strategy": "strategy name",
+                "attack_complexity": "complexity level",
+                "attack_success": "true|false", // (if available from evaluation)
+                "category": "risk category", // (if available from evaluation)
+                "severity_level": "low|medium|high", // (if available from evaluation)
+                "threshold": "threshold value" // (if available from evaluation)
+            }
 
         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
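
Each line of the query-response output is an independent JSON object, so it parses with per-line `json.loads`. A consumption sketch; note that the signature above is annotated `-> str` while the docstring says `List[str]`, so this guards for both shapes:

```python
import json
from typing import Any, Dict, List, Union


def parse_qr_json_lines(payload: Union[str, List[str]]) -> List[Dict[str, Any]]:
    # Accept either a single newline-delimited string or a list of lines,
    # since the annotation and the docstring above disagree on the shape.
    lines = payload.splitlines() if isinstance(payload, str) else payload
    return [json.loads(line) for line in lines if line.strip()]
```
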