azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (102)
  1. azure/ai/evaluation/__init__.py +2 -0
  2. azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
  3. azure/ai/evaluation/_aoai/label_grader.py +14 -13
  4. azure/ai/evaluation/_aoai/python_grader.py +15 -13
  5. azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
  6. azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
  7. azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
  8. azure/ai/evaluation/_common/__init__.py +2 -1
  9. azure/ai/evaluation/_common/constants.py +109 -0
  10. azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
  11. azure/ai/evaluation/_common/onedp/__init__.py +2 -2
  12. azure/ai/evaluation/_common/onedp/_client.py +44 -14
  13. azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
  14. azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
  15. azure/ai/evaluation/_common/onedp/_validation.py +18 -2
  16. azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
  17. azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
  18. azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
  19. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
  20. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
  21. azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
  22. azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
  23. azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
  24. azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
  25. azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
  26. azure/ai/evaluation/_common/rai_service.py +299 -2
  27. azure/ai/evaluation/_common/utils.py +173 -39
  28. azure/ai/evaluation/_constants.py +100 -0
  29. azure/ai/evaluation/_eval_mapping.py +10 -0
  30. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
  31. azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
  32. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
  33. azure/ai/evaluation/_evaluate/_utils.py +17 -6
  34. azure/ai/evaluation/_evaluator_definition.py +76 -0
  35. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
  36. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
  37. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
  38. azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
  39. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
  41. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
  42. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
  43. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
  44. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
  45. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
  46. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
  48. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
  49. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
  50. azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
  51. azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
  52. azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
  53. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
  54. azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
  55. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
  56. azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
  57. azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
  58. azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
  59. azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
  60. azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
  61. azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
  62. azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
  64. azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
  65. azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
  66. azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
  67. azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
  68. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
  69. azure/ai/evaluation/_exceptions.py +6 -0
  70. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
  71. azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
  72. azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
  73. azure/ai/evaluation/_model_configurations.py +26 -0
  74. azure/ai/evaluation/_version.py +1 -1
  75. azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
  76. azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
  77. azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
  78. azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
  79. azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
  80. azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
  81. azure/ai/evaluation/red_team/_red_team.py +503 -37
  82. azure/ai/evaluation/red_team/_red_team_result.py +264 -15
  83. azure/ai/evaluation/red_team/_result_processor.py +953 -31
  84. azure/ai/evaluation/red_team/_utils/constants.py +1 -0
  85. azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
  86. azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
  87. azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
  88. azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
  89. azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
  90. azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
  91. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  92. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
  93. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
  94. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
  95. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
  96. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
  97. azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
  98. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
  99. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
  100. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
  101. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
  102. {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict
 import json
 from azure.ai.evaluation._common._experimental import experimental


+# NOTE: The RedTeam* TypedDict classes below are being gradually deprecated in favor of
+# the standard EvalRun* models from azure.ai.evaluation._common.onedp.models._models.
+# New code should use EvalRunOutputItem instead of RedTeamOutputItem, etc.
+# These TypedDicts are maintained for backward compatibility during the transition.
+
+
 @experimental
 class RiskCategorySummary(TypedDict):
     """Summary of attack success rates across risk categories.
@@ -216,7 +222,7 @@ class RiskAssessment(TypedDict):


 @experimental
-class AttackDetails(TypedDict):
+class AttackDetails(TypedDict, total=False):
     """TypedDict representation of a simulated conversation in a red team evaluation.

     :param attack_success: Whether the attack was successful
@@ -233,6 +239,8 @@ class AttackDetails(TypedDict):
     :type risk_assessment: Optional[RiskAssessment]
     :param attack_success_threshold: The threshold value used to determine attack success
     :type attack_success_threshold: Optional[int]
+    :param risk_sub_type: Optional risk sub-category/sub-type for the attack
+    :type risk_sub_type: Optional[str]
     """

     attack_success: Optional[bool]
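
Switching `AttackDetails` to `total=False` makes every key optional to the type checker, which is what lets a row carry the new `risk_sub_type` field only when a sub-category applies. A hedged sketch of what now type-checks (the dict values are invented for illustration, and the import uses the private module this diff touches):

    from azure.ai.evaluation.red_team._red_team_result import AttackDetails

    # With total=False, a partially populated AttackDetails is valid:
    # only the keys we actually have need to be present.
    attack: AttackDetails = {
        "attack_success": True,
        "conversation": [{"role": "user", "content": "example prompt"}],
        "risk_sub_type": "example-sub-type",  # hypothetical value
    }
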
@@ -242,6 +250,101 @@ class AttackDetails(TypedDict):
     conversation: List[Dict[str, str]]
     risk_assessment: Optional[RiskAssessment]
     attack_success_threshold: Optional[int]
+    risk_sub_type: Optional[str]
+
+
+@experimental
+class RedTeamOutputResultProperties(TypedDict, total=False):
+    """Additional metadata captured for each evaluation result."""
+
+    attack_success: Optional[bool]
+    attack_success_threshold: Optional[int]
+    attack_technique: str
+    attack_complexity: str
+    risk_category: str
+    risk_assessment: Optional[Dict[str, Any]]
+    reason: Optional[str]
+    severity_label: Optional[str]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class EvaluationRunOutputItemMessage(TypedDict, total=False):
+    """Representation of a single message within an evaluation sample."""
+
+    role: str
+    content: Any
+    name: Optional[str]
+    tool_calls: Optional[List[Dict[str, Any]]]
+
+
+@experimental
+class RedTeamRunOutputItemResult(TypedDict, total=False):
+    """Flattened evaluation result for a single risk category.
+
+    :param label: String label "pass" or "fail" that aligns with the passed field
+    :type label: Optional[str]
+    """
+
+    # Should extend EvaluationRunOutputItemResult
+
+    object: str
+    type: str
+    name: str
+    passed: Optional[bool]
+    label: Optional[str]
+    score: Optional[float]
+    metric: Optional[str]
+    threshold: Optional[float]
+    reason: Optional[str]
+    properties: RedTeamOutputResultProperties
+
+
+@experimental
+class RedTeamDatasourceItem(TypedDict, total=False):
+    """Metadata about the datasource item that produced this conversation."""
+
+    id: Optional[str]
+    input_data: Dict[str, Any]
+
+
+@experimental
+class RedTeamRunOutputItemSample(TypedDict, total=False):
+    """Sample payload containing the red team conversation."""
+
+    # Should extend EvaluationRunOutputItemSample
+
+    object: str
+    input: List[EvaluationRunOutputItemMessage]
+    output: List[EvaluationRunOutputItemMessage]
+    finish_reason: Optional[str]
+    model: Optional[str]
+    error: Optional[Dict[str, Any]]
+    usage: Optional[Dict[str, Any]]
+    seed: Optional[int]
+    temperature: Optional[float]
+    top_p: Optional[float]
+    max_completion_tokens: Optional[float]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class RedTeamOutputItem(TypedDict, total=False):
+    """Structured representation of a conversation and its evaluation artifacts.
+
+    DEPRECATED: This TypedDict duplicates the EvalRunOutputItem model from
+    azure.ai.evaluation._common.onedp.models._models. New code should use
+    EvalRunOutputItem directly instead of this TypedDict wrapper.
+    """
+
+    object: str
+    id: str
+    created_time: int
+    status: str
+    datasource_item_id: Optional[str]
+    datasource_item: Optional[RedTeamDatasourceItem]
+    sample: RedTeamRunOutputItemSample
+    results: List[RedTeamRunOutputItemResult]


 @experimental
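
All of these new classes are TypedDicts, so at runtime they are ordinary dicts and any `total=False` field may be omitted; the classes only document the AOAI-compatible row shape. A hand-built row matching `RedTeamRunOutputItemResult` might look like the following sketch (every literal value here is invented, not taken from the package):

    from azure.ai.evaluation.red_team._red_team_result import (
        RedTeamRunOutputItemResult,
    )

    # A partial result row; total=False means omitted keys are allowed.
    row: RedTeamRunOutputItemResult = {
        "object": "eval.run.output_item.result",  # assumed object tag
        "type": "azure_ai_red_team",              # assumed type tag
        "name": "violence",
        "passed": False,
        "label": "fail",
        "score": 1.0,
        "properties": {
            "attack_technique": "base64",
            "attack_complexity": "easy",
            "risk_category": "violence",
            "attack_success": True,
        },
    }
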
@@ -254,6 +357,10 @@ class ScanResult(TypedDict):
     :type parameters: RedTeamingParameters
     :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
     :type attack_details: List[AttackDetails]
+    :param AOAI_Compatible_Row_Results: List of evaluation results for each risk category
+    :type AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    :param AOAI_Compatible_Summary: The evaluation run metadata in eval.run format
+    :type AOAI_Compatible_Summary: Optional[RedTeamRun]
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
@@ -261,9 +368,149 @@ class ScanResult(TypedDict):
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
+    AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    AOAI_Compatible_Summary: Optional["RedTeamRun"]
     studio_url: Optional[str]


+@experimental
+class ResultCount(TypedDict):
+    """Count of evaluation results by status.
+
+    :param total: Total number of evaluation results
+    :type total: int
+    :param passed: Number of passed evaluation results
+    :type passed: int
+    :param failed: Number of failed evaluation results
+    :type failed: int
+    :param errored: Number of errored evaluation results
+    :type errored: int
+    """
+
+    total: int
+    passed: int
+    failed: int
+    errored: int
+
+
+@experimental
+class PerTestingCriteriaResult(TypedDict, total=False):
+    """Result count for a specific testing criteria.
+
+    :param testing_criteria: The name of the testing criteria (e.g., risk category)
+    :type testing_criteria: str
+    :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
+    :type attack_strategy: Optional[str]
+    :param passed: Number of passed results for this criteria
+    :type passed: int
+    :param failed: Number of failed results for this criteria
+    :type failed: int
+    """
+
+    testing_criteria: str
+    attack_strategy: Optional[str]
+    passed: int
+    failed: int
+
+
+@experimental
+class DataSourceItemGenerationParams(TypedDict, total=False):
+    """Parameters for data source item generation.
+
+    :param type: Type of data source generation (e.g., "red_team")
+    :type type: str
+    :param attack_strategies: List of attack strategies used
+    :type attack_strategies: List[str]
+    :param num_turns: Number of turns in the conversation
+    :type num_turns: int
+    """
+
+    type: str
+    attack_strategies: List[str]
+    num_turns: int
+
+
+@experimental
+class DataSource(TypedDict, total=False):
+    """Data source information for the red team evaluation.
+
+    :param type: Type of data source (e.g., "azure_ai_red_team")
+    :type type: str
+    :param target: Target configuration for the data source
+    :type target: Dict[str, Any]
+    :param item_generation_params: Parameters used for generating data items
+    :type item_generation_params: DataSourceItemGenerationParams
+    """
+
+    type: str
+    target: Dict[str, Any]
+    item_generation_params: DataSourceItemGenerationParams
+
+
+@experimental
+class OutputItemsList(TypedDict):
+    """Wrapper for list of output items.
+
+    :param object: Object type identifier (always "list")
+    :type object: str
+    :param data: List of red team output items
+    :type data: List[RedTeamOutputItem]
+    """
+
+    object: str
+    data: List[RedTeamOutputItem]
+
+
+@experimental
+class RedTeamRun(TypedDict, total=False):
+    """TypedDict representation of a Red Team evaluation run in eval.run format.
+
+    :param object: Object type identifier (always "eval.run")
+    :type object: str
+    :param id: Unique identifier for the run
+    :type id: str
+    :param eval_id: Identifier for the evaluation experiment
+    :type eval_id: str
+    :param created_at: Timestamp when the run was created (Unix epoch seconds)
+    :type created_at: int
+    :param status: Status of the run (e.g., "completed", "failed", "in_progress")
+    :type status: str
+    :param name: Display name for the run
+    :type name: str
+    :param report_url: URL to view the run report in Azure AI Studio
+    :type report_url: Optional[str]
+    :param data_source: Information about the data source used for the evaluation
+    :type data_source: DataSource
+    :param metadata: Additional metadata for the run
+    :type metadata: Dict[str, Any]
+    :param result_counts: Aggregated counts of evaluation results
+    :type result_counts: ResultCount
+    :param per_model_usage: Usage statistics per model (if applicable)
+    :type per_model_usage: List[Any]
+    :param per_testing_criteria_results: Results aggregated by testing criteria
+    :type per_testing_criteria_results: List[PerTestingCriteriaResult]
+    :param output_items: Wrapped list of output items from the evaluation
+    :type output_items: OutputItemsList
+    :param conversations: Optional list of attack details/conversations
+    :type conversations: List[AttackDetails]
+    """
+
+    object: str
+    id: str
+    eval_id: str
+    created_at: int
+    status: str
+    name: str
+    report_url: Optional[str]
+    data_source: DataSource
+    metadata: Dict[str, Any]
+    result_counts: ResultCount
+    per_model_usage: List[Any]
+    per_testing_criteria_results: List[PerTestingCriteriaResult]
+    output_items: OutputItemsList
+    conversations: List[AttackDetails]
+
+
 @experimental
 class RedTeamResult:
     def __init__(self, scan_result: Optional[ScanResult] = None, attack_details: Optional[List[AttackDetails]] = None):
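
`RedTeamRun` mirrors the eval.run wire format field for field, so a summary can be assembled as a plain dict; note that `ResultCount` and `OutputItemsList` are declared without `total=False`, so their keys are all required. A minimal sketch (identifiers and counts are made up):

    import time

    from azure.ai.evaluation.red_team._red_team_result import RedTeamRun

    run: RedTeamRun = {
        "object": "eval.run",
        "id": "run-123",        # hypothetical identifier
        "eval_id": "eval-456",  # hypothetical identifier
        "created_at": int(time.time()),
        "status": "completed",
        "name": "weekly red team scan",
        # ResultCount is total=True: all four counters must be present.
        "result_counts": {"total": 10, "passed": 7, "failed": 3, "errored": 0},
        "per_testing_criteria_results": [
            {"testing_criteria": "violence", "passed": 4, "failed": 1},
        ],
        "output_items": {"object": "list", "data": []},
    }
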
@@ -284,21 +531,23 @@ class RedTeamResult:
         return self.scan_result.get("scorecard", None) if self.scan_result else None

     def to_eval_qr_json_lines(self) -> str:
-        """
-        Converts conversations in messages format to query-response format suitable for evaluation.
+        """Converts conversations in messages format to query-response format suitable for evaluation.

         The output format follows the JSONL pattern with each line containing:
-        {
-            "query": "user message content",
-            "response": "assistant message content",
-            "risk_category": "risk category",
-            "attack_strategy": "strategy name",
-            "attack_complexity": "complexity level",
-            "attack_success": "true|false", (if available from evaluation)
-            "category": "risk category", (if available from evaluation)
-            "severity_level": "low|medium|high", (if available from evaluation)
-            "threshold": "threshold value" (if available from evaluation)
-        }
+
+        .. code-block:: javascript
+
+            {
+                "query": "user message content",
+                "response": "assistant message content",
+                "risk_category": "risk category",
+                "attack_strategy": "strategy name",
+                "attack_complexity": "complexity level",
+                "attack_success": "true|false", // (if available from evaluation)
+                "category": "risk category", // (if available from evaluation)
+                "severity_level": "low|medium|high", // (if available from evaluation)
+                "threshold": "threshold value" // (if available from evaluation)
+            }

         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
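
The reworked docstring pins down the JSONL shape; note the annotation still says `-> str` while `:returns:` describes a list of strings. A consumption sketch that assumes the `str` form (newline-delimited JSON) and a populated `RedTeamResult` named `result`:

    import json

    # Each non-empty line is an independent JSON object in the
    # query/response shape documented above.
    for line in result.to_eval_qr_json_lines().splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        print(record["query"], "->", record.get("attack_success"))
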