azure-ai-evaluation 1.12.0__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +6 -9
- azure/ai/evaluation/_aoai/label_grader.py +6 -10
- azure/ai/evaluation/_aoai/python_grader.py +7 -10
- azure/ai/evaluation/_aoai/score_model_grader.py +5 -7
- azure/ai/evaluation/_aoai/string_check_grader.py +4 -9
- azure/ai/evaluation/_aoai/text_similarity_grader.py +7 -21
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +241 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -2
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1019 -5
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +86 -11
- azure/ai/evaluation/_evaluate/_utils.py +10 -3
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +16 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +107 -45
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +30 -29
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +14 -6
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/{_path_efficiency → _task_completion}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/{_task_success/_task_success.py → _task_completion/_task_completion.py} +39 -30
- azure/ai/evaluation/_evaluators/{_task_success/task_success.prompty → _task_completion/task_completion.prompty} +2 -2
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/{_path_efficiency/_path_efficiency.py → _task_navigation_efficiency/_task_navigation_efficiency.py} +115 -73
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/{_task_success → _tool_success}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -1
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +41 -352
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +494 -37
- azure/ai/evaluation/red_team/_red_team_result.py +48 -28
- azure/ai/evaluation/red_team/_result_processor.py +558 -29
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +125 -24
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +38 -8
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +99 -86
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.12.0.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_red_team_result.py (+48 −28):

```diff
@@ -6,6 +6,12 @@ import json
 from azure.ai.evaluation._common._experimental import experimental
 
 
+# NOTE: The RedTeam* TypedDict classes below are being gradually deprecated in favor of
+# the standard EvalRun* models from azure.ai.evaluation._common.onedp.models._models.
+# New code should use EvalRunOutputItem instead of RedTeamOutputItem, etc.
+# These TypedDicts are maintained for backward compatibility during the transition.
+
+
 @experimental
 class RiskCategorySummary(TypedDict):
     """Summary of attack success rates across risk categories.
```
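The NOTE points migration at the generated `EvalRunOutputItem` model; a minimal sketch of the import it suggests. The path is taken from the comment itself and lives under a private (`_common.onedp`) namespace, so treat it as illustrative rather than a stable public API:

```python
# Import path as stated in the NOTE above; _models is a private module, so the
# exact location may change in future releases.
from azure.ai.evaluation._common.onedp.models._models import EvalRunOutputItem
```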
```diff
@@ -216,7 +222,7 @@ class RiskAssessment(TypedDict):
 
 
 @experimental
-class AttackDetails(TypedDict):
+class AttackDetails(TypedDict, total=False):
     """TypedDict representation of a simulated conversation in a red team evaluation.
 
     :param attack_success: Whether the attack was successful
@@ -233,6 +239,8 @@ class AttackDetails(TypedDict):
     :type risk_assessment: Optional[RiskAssessment]
     :param attack_success_threshold: The threshold value used to determine attack success
     :type attack_success_threshold: Optional[int]
+    :param risk_sub_type: Optional risk sub-category/sub-type for the attack
+    :type risk_sub_type: Optional[str]
     """
 
     attack_success: Optional[bool]
@@ -242,6 +250,7 @@ class AttackDetails(TypedDict):
     conversation: List[Dict[str, str]]
     risk_assessment: Optional[RiskAssessment]
     attack_success_threshold: Optional[int]
+    risk_sub_type: Optional[str]
 
 
 @experimental
```
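With `AttackDetails` now declared `total=False`, every key becomes optional at the type level and the new `risk_sub_type` key can be populated when a sub-category applies. A minimal sketch of an entry using only the fields visible in this diff; all values are illustrative:

```python
from typing import Any, Dict

# Illustrative AttackDetails-shaped dict; field names follow the docstring above.
attack_details: Dict[str, Any] = {
    "attack_success": False,
    "conversation": [
        {"role": "user", "content": "<attack prompt>"},
        {"role": "assistant", "content": "<model response>"},
    ],
    "risk_assessment": None,
    "attack_success_threshold": None,
    "risk_sub_type": None,  # new in 1.13.0: optional risk sub-category/sub-type
}
print(attack_details.get("risk_sub_type"))
```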
```diff
@@ -271,7 +280,11 @@ class EvaluationRunOutputItemMessage(TypedDict, total=False):
 
 @experimental
 class RedTeamRunOutputItemResult(TypedDict, total=False):
-    """Flattened evaluation result for a single risk category.
+    """Flattened evaluation result for a single risk category.
+
+    :param label: String label "pass" or "fail" that aligns with the passed field
+    :type label: Optional[str]
+    """
 
     # Should extend EvaluationRunOutputItemResult
 
@@ -279,11 +292,11 @@ class RedTeamRunOutputItemResult(TypedDict, total=False):
     type: str
     name: str
     passed: Optional[bool]
+    label: Optional[str]
     score: Optional[float]
     metric: Optional[str]
     threshold: Optional[float]
     reason: Optional[str]
-    sample: "RedTeamRunOutputItemSample"
     properties: RedTeamOutputResultProperties
 
 
```
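The new `label` field is documented as a string mirror of `passed`; a tiny sketch of that relationship as described in the docstring (the helper name is illustrative, not part of the package):

```python
from typing import Optional

def result_label(passed: Optional[bool]) -> Optional[str]:
    # Per the docstring above: "pass"/"fail" aligned with the boolean passed field;
    # returning None for an unknown outcome is an assumption.
    if passed is None:
        return None
    return "pass" if passed else "fail"

assert result_label(True) == "pass"
assert result_label(False) == "fail"
```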
```diff
@@ -317,7 +330,12 @@ class RedTeamRunOutputItemSample(TypedDict, total=False):
 
 @experimental
 class RedTeamOutputItem(TypedDict, total=False):
-    """Structured representation of a conversation and its evaluation artifacts.
+    """Structured representation of a conversation and its evaluation artifacts.
+
+    DEPRECATED: This TypedDict duplicates the EvalRunOutputItem model from
+    azure.ai.evaluation._common.onedp.models._models. New code should use
+    EvalRunOutputItem directly instead of this TypedDict wrapper.
+    """
 
     object: str
     id: str
@@ -339,12 +357,10 @@ class ScanResult(TypedDict):
     :type parameters: RedTeamingParameters
     :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
     :type attack_details: List[AttackDetails]
-    :param output_items: List of structured output items from the evaluation
-    :type output_items: List[RedTeamOutputItem]
     :param AOAI_Compatible_Row_Results: List of evaluation results for each risk category
-    :type AOAI_Compatible_Row_Results: List[RedTeamRunOutputItemResult]
+    :type AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
     :param AOAI_Compatible_Summary: The evaluation run metadata in eval.run format
-    :type AOAI_Compatible_Summary: RedTeamRun
+    :type AOAI_Compatible_Summary: Optional[RedTeamRun]
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
@@ -352,9 +368,8 @@ class ScanResult(TypedDict):
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
-
-
-    AOAI_Compatible_Summary: "RedTeamRun"
+    AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    AOAI_Compatible_Summary: Optional["RedTeamRun"]
     studio_url: Optional[str]
 
 
```
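Since `AOAI_Compatible_Row_Results` and `AOAI_Compatible_Summary` are now typed as `Optional`, consumers of `ScanResult` should tolerate missing values. A minimal sketch using only fields visible in this diff; the dict and its values are illustrative:

```python
# Illustrative ScanResult-shaped dict; in real use it would come from a red team scan.
scan_result = {
    "scorecard": {},
    "parameters": {},
    "attack_details": [],
    "AOAI_Compatible_Row_Results": [
        {"name": "violence", "passed": True, "label": "pass", "score": 1.0},
    ],
    "AOAI_Compatible_Summary": None,  # may legitimately be None as of 1.13.0
    "studio_url": None,
}

# Guard both Optional fields before iterating or indexing into them.
for row in scan_result.get("AOAI_Compatible_Row_Results") or []:
    print(row.get("name"), row.get("passed"), row.get("label"))

summary = scan_result.get("AOAI_Compatible_Summary")
if summary is not None:
    print("result counts:", summary.get("result_counts"))
```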
```diff
@@ -379,11 +394,13 @@ class ResultCount(TypedDict):
 
 
 @experimental
-class PerTestingCriteriaResult(TypedDict):
+class PerTestingCriteriaResult(TypedDict, total=False):
     """Result count for a specific testing criteria.
 
     :param testing_criteria: The name of the testing criteria (e.g., risk category)
     :type testing_criteria: str
+    :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
+    :type attack_strategy: Optional[str]
     :param passed: Number of passed results for this criteria
     :type passed: int
     :param failed: Number of failed results for this criteria
@@ -391,6 +408,7 @@ class PerTestingCriteriaResult(TypedDict):
     """
 
     testing_criteria: str
+    attack_strategy: Optional[str]
     passed: int
     failed: int
 
```
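Because `PerTestingCriteriaResult` is now `total=False` with an optional `attack_strategy`, summaries can mix per-category rows with per-strategy rows. A small sketch of iterating such entries; the data is illustrative:

```python
# Illustrative per-criteria entries; attack_strategy is new in 1.13.0 and only
# present on attack-strategy summaries (the TypedDict is now total=False).
per_testing_criteria_results = [
    {"testing_criteria": "violence", "passed": 12, "failed": 3},
    {"testing_criteria": "violence", "attack_strategy": "base64", "passed": 4, "failed": 1},
]

for entry in per_testing_criteria_results:
    strategy = entry.get("attack_strategy", "<all strategies>")
    total = entry["passed"] + entry["failed"]
    print(f"{entry['testing_criteria']} [{strategy}]: {entry['passed']}/{total} passed")
```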
```diff
@@ -465,8 +483,8 @@ class RedTeamRun(TypedDict, total=False):
     :type data_source: DataSource
     :param metadata: Additional metadata for the run
     :type metadata: Dict[str, Any]
-    :param
-    :type
+    :param result_counts: Aggregated counts of evaluation results
+    :type result_counts: ResultCount
     :param per_model_usage: Usage statistics per model (if applicable)
     :type per_model_usage: List[Any]
     :param per_testing_criteria_results: Results aggregated by testing criteria
@@ -486,7 +504,7 @@ class RedTeamRun(TypedDict, total=False):
     report_url: Optional[str]
     data_source: DataSource
     metadata: Dict[str, Any]
-
+    result_counts: ResultCount
     per_model_usage: List[Any]
     per_testing_criteria_results: List[PerTestingCriteriaResult]
     output_items: OutputItemsList
```
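The run summary now documents `result_counts` alongside the per-criteria results. A hedged sketch of reading those aggregate fields; the exact `ResultCount` keys are not visible in this diff, so it is treated as a plain mapping and the sample values are illustrative:

```python
# Illustrative, heavily truncated RedTeamRun-shaped dict.
run = {
    "metadata": {},
    "result_counts": {"passed": 16, "failed": 4},  # key names assumed for illustration
    "per_model_usage": [],
    "per_testing_criteria_results": [
        {"testing_criteria": "violence", "passed": 12, "failed": 3},
    ],
}

print("result counts:", run["result_counts"])
for crit in run["per_testing_criteria_results"]:
    print(crit["testing_criteria"], crit["passed"], crit["failed"])
```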
```diff
@@ -513,21 +531,23 @@ class RedTeamResult:
         return self.scan_result.get("scorecard", None) if self.scan_result else None
 
     def to_eval_qr_json_lines(self) -> str:
-        """
-        Converts conversations in messages format to query-response format suitable for evaluation.
+        """Converts conversations in messages format to query-response format suitable for evaluation.
 
         The output format follows the JSONL pattern with each line containing:
-
-
-
-
-
-
-
-
-
-
-
+
+        .. code-block:: javascript
+
+            {
+                "query": "user message content",
+                "response": "assistant message content",
+                "risk_category": "risk category",
+                "attack_strategy": "strategy name",
+                "attack_complexity": "complexity level",
+                "attack_success": "true|false", // (if available from evaluation)
+                "category": "risk category", // (if available from evaluation)
+                "severity_level": "low|medium|high", // (if available from evaluation)
+                "threshold": "threshold value" // (if available from evaluation)
+            }
 
         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
```
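Each emitted line follows the flattened query-response schema shown in the docstring. A small consumption sketch, assuming the method returns one newline-delimited string as its `-> str` signature indicates (the `:rtype:` above still says `List[str]`); the sample payload below is illustrative and stands in for a real `RedTeamResult.to_eval_qr_json_lines()` call:

```python
import json

# Illustrative JSONL payload in the documented shape.
jsonl = "\n".join([
    json.dumps({
        "query": "user message content",
        "response": "assistant message content",
        "risk_category": "violence",
        "attack_strategy": "base64",
        "attack_complexity": "easy",
        "attack_success": "false",
    }),
])

for line in jsonl.splitlines():
    if not line.strip():
        continue
    record = json.loads(line)
    # Fields suffixed "(if available from evaluation)" in the docstring may be absent.
    print(record["risk_category"], record["attack_strategy"], record.get("attack_success"))
```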