azure-ai-evaluation 1.11.2__py3-none-any.whl → 1.13.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- azure/ai/evaluation/__init__.py +2 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +69 -28
- azure/ai/evaluation/_aoai/label_grader.py +14 -13
- azure/ai/evaluation/_aoai/python_grader.py +15 -13
- azure/ai/evaluation/_aoai/score_model_grader.py +13 -10
- azure/ai/evaluation/_aoai/string_check_grader.py +13 -13
- azure/ai/evaluation/_aoai/text_similarity_grader.py +16 -25
- azure/ai/evaluation/_common/__init__.py +2 -1
- azure/ai/evaluation/_common/constants.py +109 -0
- azure/ai/evaluation/_common/evaluation_onedp_client.py +5 -5
- azure/ai/evaluation/_common/onedp/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/_utils/model_base.py +1 -1
- azure/ai/evaluation/_common/onedp/_validation.py +18 -2
- azure/ai/evaluation/_common/onedp/aio/__init__.py +2 -2
- azure/ai/evaluation/_common/onedp/aio/_client.py +44 -14
- azure/ai/evaluation/_common/onedp/aio/_configuration.py +9 -7
- azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +3942 -1631
- azure/ai/evaluation/_common/onedp/models/__init__.py +196 -6
- azure/ai/evaluation/_common/onedp/models/_enums.py +217 -0
- azure/ai/evaluation/_common/onedp/models/_models.py +3876 -603
- azure/ai/evaluation/_common/onedp/operations/__init__.py +12 -0
- azure/ai/evaluation/_common/onedp/operations/_operations.py +5422 -2577
- azure/ai/evaluation/_common/rai_service.py +299 -2
- azure/ai/evaluation/_common/utils.py +173 -39
- azure/ai/evaluation/_constants.py +100 -0
- azure/ai/evaluation/_eval_mapping.py +10 -0
- azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +10 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +1125 -9
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +330 -51
- azure/ai/evaluation/_evaluate/_utils.py +17 -6
- azure/ai/evaluation/_evaluator_definition.py +76 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -0
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +0 -17
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +7 -1
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +80 -4
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +181 -3
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +7 -1
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +28 -13
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +19 -14
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +23 -4
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -7
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +36 -19
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +43 -20
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +7 -1
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +7 -1
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +32 -6
- azure/ai/evaluation/_evaluators/_task_completion/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_completion/_task_completion.py +177 -0
- azure/ai/evaluation/_evaluators/_task_completion/task_completion.prompty +220 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py +384 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +23 -127
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +263 -0
- azure/ai/evaluation/_evaluators/_tool_input_accuracy/tool_input_accuracy.prompty +76 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/_tool_output_utilization.py +225 -0
- azure/ai/evaluation/_evaluators/_tool_output_utilization/tool_output_utilization.prompty +221 -0
- azure/ai/evaluation/_evaluators/_tool_selection/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +266 -0
- azure/ai/evaluation/_evaluators/_tool_selection/tool_selection.prompty +104 -0
- azure/ai/evaluation/_evaluators/_tool_success/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py +301 -0
- azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty +321 -0
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +0 -19
- azure/ai/evaluation/_exceptions.py +6 -0
- azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +14 -1
- azure/ai/evaluation/_legacy/prompty/_prompty.py +2 -1
- azure/ai/evaluation/_legacy/prompty/_utils.py +54 -6
- azure/ai/evaluation/_model_configurations.py +26 -0
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_attack_objective_generator.py +3 -1
- azure/ai/evaluation/red_team/_attack_strategy.py +1 -0
- azure/ai/evaluation/red_team/_callback_chat_target.py +45 -14
- azure/ai/evaluation/red_team/_evaluation_processor.py +129 -12
- azure/ai/evaluation/red_team/_mlflow_integration.py +144 -36
- azure/ai/evaluation/red_team/_orchestrator_manager.py +309 -51
- azure/ai/evaluation/red_team/_red_team.py +503 -37
- azure/ai/evaluation/red_team/_red_team_result.py +264 -15
- azure/ai/evaluation/red_team/_result_processor.py +953 -31
- azure/ai/evaluation/red_team/_utils/constants.py +1 -0
- azure/ai/evaluation/red_team/_utils/formatting_utils.py +126 -25
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +10 -7
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +3 -25
- azure/ai/evaluation/simulator/_adversarial_simulator.py +1 -1
- azure/ai/evaluation/simulator/_conversation/__init__.py +1 -1
- azure/ai/evaluation/simulator/_conversation/_conversation.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +25 -2
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +1 -1
- azure/ai/evaluation/simulator/_model_tools/models.py +1 -1
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/METADATA +44 -10
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/RECORD +102 -84
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/licenses/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.11.2.dist-info → azure_ai_evaluation-1.13.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/red_team/_red_team_result.py (+264 -15):

```diff
@@ -1,11 +1,17 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from typing import Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict
 import json
 from azure.ai.evaluation._common._experimental import experimental
 
 
+# NOTE: The RedTeam* TypedDict classes below are being gradually deprecated in favor of
+# the standard EvalRun* models from azure.ai.evaluation._common.onedp.models._models.
+# New code should use EvalRunOutputItem instead of RedTeamOutputItem, etc.
+# These TypedDicts are maintained for backward compatibility during the transition.
+
+
 @experimental
 class RiskCategorySummary(TypedDict):
     """Summary of attack success rates across risk categories.
```
```diff
@@ -216,7 +222,7 @@ class RiskAssessment(TypedDict):
 
 
 @experimental
-class AttackDetails(TypedDict):
+class AttackDetails(TypedDict, total=False):
     """TypedDict representation of a simulated conversation in a red team evaluation.
 
     :param attack_success: Whether the attack was successful
```
```diff
@@ -233,6 +239,8 @@ class AttackDetails(TypedDict):
     :type risk_assessment: Optional[RiskAssessment]
     :param attack_success_threshold: The threshold value used to determine attack success
     :type attack_success_threshold: Optional[int]
+    :param risk_sub_type: Optional risk sub-category/sub-type for the attack
+    :type risk_sub_type: Optional[str]
     """
 
     attack_success: Optional[bool]
```
```diff
@@ -242,6 +250,101 @@ class AttackDetails(TypedDict):
     conversation: List[Dict[str, str]]
     risk_assessment: Optional[RiskAssessment]
     attack_success_threshold: Optional[int]
+    risk_sub_type: Optional[str]
+
+
+@experimental
+class RedTeamOutputResultProperties(TypedDict, total=False):
+    """Additional metadata captured for each evaluation result."""
+
+    attack_success: Optional[bool]
+    attack_success_threshold: Optional[int]
+    attack_technique: str
+    attack_complexity: str
+    risk_category: str
+    risk_assessment: Optional[Dict[str, Any]]
+    reason: Optional[str]
+    severity_label: Optional[str]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class EvaluationRunOutputItemMessage(TypedDict, total=False):
+    """Representation of a single message within an evaluation sample."""
+
+    role: str
+    content: Any
+    name: Optional[str]
+    tool_calls: Optional[List[Dict[str, Any]]]
+
+
+@experimental
+class RedTeamRunOutputItemResult(TypedDict, total=False):
+    """Flattened evaluation result for a single risk category.
+
+    :param label: String label "pass" or "fail" that aligns with the passed field
+    :type label: Optional[str]
+    """
+
+    # Should extend EvaluationRunOutputItemResult
+
+    object: str
+    type: str
+    name: str
+    passed: Optional[bool]
+    label: Optional[str]
+    score: Optional[float]
+    metric: Optional[str]
+    threshold: Optional[float]
+    reason: Optional[str]
+    properties: RedTeamOutputResultProperties
+
+
+@experimental
+class RedTeamDatasourceItem(TypedDict, total=False):
+    """Metadata about the datasource item that produced this conversation."""
+
+    id: Optional[str]
+    input_data: Dict[str, Any]
+
+
+@experimental
+class RedTeamRunOutputItemSample(TypedDict, total=False):
+    """Sample payload containing the red team conversation."""
+
+    # Should extend EvaluationRunOutputItemSample
+
+    object: str
+    input: List[EvaluationRunOutputItemMessage]
+    output: List[EvaluationRunOutputItemMessage]
+    finish_reason: Optional[str]
+    model: Optional[str]
+    error: Optional[Dict[str, Any]]
+    usage: Optional[Dict[str, Any]]
+    seed: Optional[int]
+    temperature: Optional[float]
+    top_p: Optional[float]
+    max_completion_tokens: Optional[float]
+    metadata: Optional[Dict[str, Any]]
+
+
+@experimental
+class RedTeamOutputItem(TypedDict, total=False):
+    """Structured representation of a conversation and its evaluation artifacts.
+
+    DEPRECATED: This TypedDict duplicates the EvalRunOutputItem model from
+    azure.ai.evaluation._common.onedp.models._models. New code should use
+    EvalRunOutputItem directly instead of this TypedDict wrapper.
+    """
+
+    object: str
+    id: str
+    created_time: int
+    status: str
+    datasource_item_id: Optional[str]
+    datasource_item: Optional[RedTeamDatasourceItem]
+    sample: RedTeamRunOutputItemSample
+    results: List[RedTeamRunOutputItemResult]
 
 
 @experimental
```
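The classes added above are TypedDicts, so at runtime they are plain dicts. The sketch below shows how they compose, using only field names declared in this diff; the import path is inferred from the file list, and all values are illustrative assumptions rather than anything taken from the SDK.

```python
# Illustrative sketch only: field names come from the TypedDicts added above;
# values are made up, and the import path is inferred from the file list.
from azure.ai.evaluation.red_team._red_team_result import (
    RedTeamOutputItem,
    RedTeamOutputResultProperties,
    RedTeamRunOutputItemResult,
    RedTeamRunOutputItemSample,
)

# Per-result metadata bag for one risk category.
properties: RedTeamOutputResultProperties = {
    "attack_technique": "base64",
    "attack_complexity": "easy",
    "risk_category": "violence",
    "attack_success": False,
}

# Flattened pass/fail result; "label" mirrors the boolean "passed" field.
result: RedTeamRunOutputItemResult = {
    "name": "violence",
    "passed": True,
    "label": "pass",
    "properties": properties,
}

# Conversation payload: input prompt messages and model output messages.
sample: RedTeamRunOutputItemSample = {
    "input": [{"role": "user", "content": "attack prompt"}],
    "output": [{"role": "assistant", "content": "model response"}],
}

# Top-level output item tying the sample to its evaluation results
# (deprecated in favor of EvalRunOutputItem, per the module NOTE above).
item: RedTeamOutputItem = {
    "id": "item_1",
    "created_time": 1_730_000_000,
    "status": "completed",
    "sample": sample,
    "results": [result],
}
```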
```diff
@@ -254,6 +357,10 @@ class ScanResult(TypedDict):
     :type parameters: RedTeamingParameters
     :param attack_details: List of AttackDetails objects representing the conversations in the evaluation
     :type attack_details: List[AttackDetails]
+    :param AOAI_Compatible_Row_Results: List of evaluation results for each risk category
+    :type AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    :param AOAI_Compatible_Summary: The evaluation run metadata in eval.run format
+    :type AOAI_Compatible_Summary: Optional[RedTeamRun]
     :param studio_url: Optional URL for the studio
     :type studio_url: Optional[str]
     """
```
```diff
@@ -261,9 +368,149 @@ class ScanResult(TypedDict):
     scorecard: RedTeamingScorecard
     parameters: RedTeamingParameters
     attack_details: List[AttackDetails]
+    AOAI_Compatible_Row_Results: Optional[List[RedTeamRunOutputItemResult]]
+    AOAI_Compatible_Summary: Optional["RedTeamRun"]
     studio_url: Optional[str]
 
 
+@experimental
+class ResultCount(TypedDict):
+    """Count of evaluation results by status.
+
+    :param total: Total number of evaluation results
+    :type total: int
+    :param passed: Number of passed evaluation results
+    :type passed: int
+    :param failed: Number of failed evaluation results
+    :type failed: int
+    :param errored: Number of errored evaluation results
+    :type errored: int
+    """
+
+    total: int
+    passed: int
+    failed: int
+    errored: int
+
+
+@experimental
+class PerTestingCriteriaResult(TypedDict, total=False):
+    """Result count for a specific testing criteria.
+
+    :param testing_criteria: The name of the testing criteria (e.g., risk category)
+    :type testing_criteria: str
+    :param attack_strategy: The attack strategy used (optional, for attack strategy summaries)
+    :type attack_strategy: Optional[str]
+    :param passed: Number of passed results for this criteria
+    :type passed: int
+    :param failed: Number of failed results for this criteria
+    :type failed: int
+    """
+
+    testing_criteria: str
+    attack_strategy: Optional[str]
+    passed: int
+    failed: int
+
+
+@experimental
+class DataSourceItemGenerationParams(TypedDict, total=False):
+    """Parameters for data source item generation.
+
+    :param type: Type of data source generation (e.g., "red_team")
+    :type type: str
+    :param attack_strategies: List of attack strategies used
+    :type attack_strategies: List[str]
+    :param num_turns: Number of turns in the conversation
+    :type num_turns: int
+    """
+
+    type: str
+    attack_strategies: List[str]
+    num_turns: int
+
+
+@experimental
+class DataSource(TypedDict, total=False):
+    """Data source information for the red team evaluation.
+
+    :param type: Type of data source (e.g., "azure_ai_red_team")
+    :type type: str
+    :param target: Target configuration for the data source
+    :type target: Dict[str, Any]
+    :param item_generation_params: Parameters used for generating data items
+    :type item_generation_params: DataSourceItemGenerationParams
+    """
+
+    type: str
+    target: Dict[str, Any]
+    item_generation_params: DataSourceItemGenerationParams
+
+
+@experimental
+class OutputItemsList(TypedDict):
+    """Wrapper for list of output items.
+
+    :param object: Object type identifier (always "list")
+    :type object: str
+    :param data: List of red team output items
+    :type data: List[RedTeamOutputItem]
+    """
+
+    object: str
+    data: List[RedTeamOutputItem]
+
+
+@experimental
+class RedTeamRun(TypedDict, total=False):
+    """TypedDict representation of a Red Team evaluation run in eval.run format.
+
+    :param object: Object type identifier (always "eval.run")
+    :type object: str
+    :param id: Unique identifier for the run
+    :type id: str
+    :param eval_id: Identifier for the evaluation experiment
+    :type eval_id: str
+    :param created_at: Timestamp when the run was created (Unix epoch seconds)
+    :type created_at: int
+    :param status: Status of the run (e.g., "completed", "failed", "in_progress")
+    :type status: str
+    :param name: Display name for the run
+    :type name: str
+    :param report_url: URL to view the run report in Azure AI Studio
+    :type report_url: Optional[str]
+    :param data_source: Information about the data source used for the evaluation
+    :type data_source: DataSource
+    :param metadata: Additional metadata for the run
+    :type metadata: Dict[str, Any]
+    :param result_counts: Aggregated counts of evaluation results
+    :type result_counts: ResultCount
+    :param per_model_usage: Usage statistics per model (if applicable)
+    :type per_model_usage: List[Any]
+    :param per_testing_criteria_results: Results aggregated by testing criteria
+    :type per_testing_criteria_results: List[PerTestingCriteriaResult]
+    :param output_items: Wrapped list of output items from the evaluation
+    :type output_items: OutputItemsList
+    :param conversations: Optional list of attack details/conversations
+    :type conversations: List[AttackDetails]
+    """
+
+    object: str
+    id: str
+    eval_id: str
+    created_at: int
+    status: str
+    name: str
+    report_url: Optional[str]
+    data_source: DataSource
+    metadata: Dict[str, Any]
+    result_counts: ResultCount
+    per_model_usage: List[Any]
+    per_testing_criteria_results: List[PerTestingCriteriaResult]
+    output_items: OutputItemsList
+    conversations: List[AttackDetails]
+
+
 @experimental
 class RedTeamResult:
     def __init__(self, scan_result: Optional[ScanResult] = None, attack_details: Optional[List[AttackDetails]] = None):
```
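With these additions, ScanResult also carries an eval.run-style summary. The sketch below shows the new summary shapes, again using only fields declared above; identifiers, names, and counts are made up for illustration, while the "eval.run" and "list" object strings come from the docstrings.

```python
# Illustrative sketch: shapes follow the TypedDicts above; ids, names, and counts are made up.
from azure.ai.evaluation.red_team._red_team_result import (
    OutputItemsList,
    PerTestingCriteriaResult,
    RedTeamRun,
    ResultCount,
)

# Aggregated pass/fail/error counts for the run.
result_counts: ResultCount = {"total": 10, "passed": 7, "failed": 2, "errored": 1}

# Per-criteria breakdown (one entry per risk category / attack strategy pair).
criteria: PerTestingCriteriaResult = {
    "testing_criteria": "violence",
    "attack_strategy": "base64",
    "passed": 7,
    "failed": 2,
}

# Wrapper for the per-conversation output items (left empty here for brevity).
output_items: OutputItemsList = {"object": "list", "data": []}

# eval.run-shaped summary that lands in ScanResult["AOAI_Compatible_Summary"];
# row-level results land in ScanResult["AOAI_Compatible_Row_Results"].
run: RedTeamRun = {
    "object": "eval.run",
    "id": "run_123",
    "eval_id": "eval_456",
    "created_at": 1_730_000_000,
    "status": "completed",
    "name": "red-team-scan",
    "result_counts": result_counts,
    "per_testing_criteria_results": [criteria],
    "output_items": output_items,
}
```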
```diff
@@ -284,21 +531,23 @@ class RedTeamResult:
         return self.scan_result.get("scorecard", None) if self.scan_result else None
 
     def to_eval_qr_json_lines(self) -> str:
-        """
-        Converts conversations in messages format to query-response format suitable for evaluation.
+        """Converts conversations in messages format to query-response format suitable for evaluation.
 
         The output format follows the JSONL pattern with each line containing:
-
-
-
-
-
-
-
-
-
-
-
+
+        .. code-block:: javascript
+
+            {
+                "query": "user message content",
+                "response": "assistant message content",
+                "risk_category": "risk category",
+                "attack_strategy": "strategy name",
+                "attack_complexity": "complexity level",
+                "attack_success": "true|false", // (if available from evaluation)
+                "category": "risk category", // (if available from evaluation)
+                "severity_level": "low|medium|high", // (if available from evaluation)
+                "threshold": "threshold value" // (if available from evaluation)
+            }
 
         :returns: A list of strings containing query-response pairs in JSONL format.
         :rtype: List[str]
```