azure-ai-evaluation 1.0.0b4__py3-none-any.whl → 1.0.0b5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic.

Files changed (79)
  1. azure/ai/evaluation/__init__.py +22 -0
  2. azure/ai/evaluation/_common/constants.py +5 -0
  3. azure/ai/evaluation/_common/math.py +11 -0
  4. azure/ai/evaluation/_common/rai_service.py +172 -35
  5. azure/ai/evaluation/_common/utils.py +162 -23
  6. azure/ai/evaluation/_constants.py +6 -6
  7. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/__init__.py +3 -2
  8. azure/ai/evaluation/_evaluate/{_batch_run_client/batch_run_context.py → _batch_run/eval_run_context.py} +4 -4
  9. azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/proxy_client.py +6 -3
  10. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +35 -0
  11. azure/ai/evaluation/_evaluate/_eval_run.py +21 -4
  12. azure/ai/evaluation/_evaluate/_evaluate.py +267 -139
  13. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +5 -5
  14. azure/ai/evaluation/_evaluate/_utils.py +40 -7
  15. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +1 -1
  16. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +14 -9
  17. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +76 -34
  18. azure/ai/evaluation/_evaluators/_common/_base_eval.py +20 -19
  19. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +18 -8
  20. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +48 -9
  21. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +56 -19
  22. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +5 -5
  23. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +30 -1
  24. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +30 -1
  25. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +30 -1
  26. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +30 -1
  27. azure/ai/evaluation/_evaluators/_eci/_eci.py +3 -1
  28. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +20 -20
  29. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +66 -36
  30. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +1 -1
  31. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +49 -15
  32. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  33. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  34. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +3 -7
  35. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  36. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +130 -0
  37. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +57 -0
  38. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +96 -0
  39. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +120 -0
  40. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +96 -0
  41. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +96 -0
  42. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +96 -0
  43. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +44 -11
  44. azure/ai/evaluation/_evaluators/_qa/_qa.py +7 -3
  45. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +21 -19
  46. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +78 -42
  47. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +125 -82
  48. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +74 -24
  49. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +2 -2
  50. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  51. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +150 -0
  52. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +17 -14
  53. azure/ai/evaluation/_evaluators/_xpia/xpia.py +32 -5
  54. azure/ai/evaluation/_exceptions.py +17 -0
  55. azure/ai/evaluation/_model_configurations.py +18 -1
  56. azure/ai/evaluation/_version.py +1 -1
  57. azure/ai/evaluation/simulator/__init__.py +2 -1
  58. azure/ai/evaluation/simulator/_adversarial_scenario.py +5 -0
  59. azure/ai/evaluation/simulator/_adversarial_simulator.py +4 -1
  60. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  61. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  62. azure/ai/evaluation/simulator/_direct_attack_simulator.py +1 -1
  63. azure/ai/evaluation/simulator/_helpers/__init__.py +1 -2
  64. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +22 -1
  65. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +79 -34
  66. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +1 -1
  67. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +4 -4
  68. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +6 -1
  69. azure/ai/evaluation/simulator/_simulator.py +115 -61
  70. azure/ai/evaluation/simulator/_utils.py +6 -6
  71. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/METADATA +166 -9
  72. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/NOTICE.txt +20 -0
  73. azure_ai_evaluation-1.0.0b5.dist-info/RECORD +120 -0
  74. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/WHEEL +1 -1
  75. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +0 -49
  76. azure_ai_evaluation-1.0.0b4.dist-info/RECORD +0 -106
  77. /azure/ai/evaluation/{simulator/_helpers → _common}/_experimental.py +0 -0
  78. /azure/ai/evaluation/_evaluate/{_batch_run_client → _batch_run}/code_client.py +0 -0
  79. {azure_ai_evaluation-1.0.0b4.dist-info → azure_ai_evaluation-1.0.0b5.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py (new file)

@@ -0,0 +1,150 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from typing import Optional, Dict
+ from typing_extensions import override
+
+ from azure.ai.evaluation._common._experimental import experimental
+ from azure.ai.evaluation._common.constants import EvaluationMetrics
+ from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase
+
+
+ @experimental
+ class GroundednessProEvaluator(RaiServiceEvaluatorBase):
+     """
+     Initialize a Groundedness Pro evaluator for determining whether the response is grounded
+     in the query and context.
+
+     If this evaluator is supplied to the `evaluate` function, the aggregated metric
+     for the groundedness pro label will be "groundedness_pro_passing_rate".
+
+     :param credential: The credential for connecting to the Azure AI project. Required.
+     :type credential: ~azure.core.credentials.TokenCredential
+     :param azure_ai_project: The scope of the Azure AI project.
+         It contains the subscription id, resource group, and project name.
+     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+     :param kwargs: Additional arguments to pass to the evaluator.
+     :type kwargs: Any
+
+     **Usage**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         credential = DefaultAzureCredential()
+
+         eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+         result = eval_fn(query="What's the capital of France?", response="Paris", context="Paris.")
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "groundedness_pro_label": True,
+             "groundedness_pro_reason": "All contents are grounded"
+         }
+
+     **Usage with conversation input**
+
+     .. code-block:: python
+
+         azure_ai_project = {
+             "subscription_id": "<subscription_id>",
+             "resource_group_name": "<resource_group_name>",
+             "project_name": "<project_name>",
+         }
+         credential = DefaultAzureCredential()
+
+         eval_fn = GroundednessProEvaluator(credential, azure_ai_project)
+         conversation = {
+             "messages": [
+                 {"role": "user", "content": "What is the capital of France?"},
+                 {"role": "assistant", "content": "Paris.", "context": "Paris."},
+                 {"role": "user", "content": "What is the capital of Germany?"},
+                 {"role": "assistant", "content": "Berlin.", "context": "Berlin."}
+             ]
+         }
+         result = eval_fn(conversation=conversation)
+
+     **Output format**
+
+     .. code-block:: python
+
+         {
+             "groundedness_pro_label": 1.0,
+             "evaluation_per_turn": {
+                 "groundedness_pro_label": [True, True],
+                 "groundedness_pro_reason": ["All contents are grounded", "All contents are grounded"]
+             }
+         }
+     """
+
+     @override
+     def __init__(
+         self,
+         credential,
+         azure_ai_project,
+         **kwargs,
+     ):
+         self._passing_score = 3  # TODO update once the binarization PR is merged
+         self._output_prefix = "groundedness_pro"
+         super().__init__(
+             eval_metric=EvaluationMetrics.GROUNDEDNESS,
+             azure_ai_project=azure_ai_project,
+             credential=credential,
+             **kwargs,
+         )
+
+     @override
+     def __call__(
+         self,
+         *,
+         query: Optional[str] = None,
+         response: Optional[str] = None,
+         context: Optional[str] = None,
+         conversation=None,
+         **kwargs,
+     ):
+         """Evaluate groundedness. Accepts either a query, response, and context for a single-turn evaluation,
+         or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
+         the evaluator will aggregate the results of each turn, with the per-turn results available
+         in the output under the "evaluation_per_turn" key.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: Optional[str]
+         :keyword response: The response to be evaluated.
+         :paramtype response: Optional[str]
+         :keyword context: The context to be evaluated.
+         :paramtype context: Optional[str]
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages", and potentially a global context under the key "context". Conversation turns are expected
+             to be dictionaries with keys "content", "role", and possibly "context".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The groundedness pro result.
+         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[float, Dict[str, List[Union[str, bool]]]]]]
+         """
+         return super().__call__(query=query, response=response, context=context, conversation=conversation, **kwargs)
+
+     @override
+     async def _do_eval(self, eval_input: Dict):
+         """This evaluator has some unique post-processing that requires data that
+         the rai_service script is not currently built to handle, so we post-process
+         the result again here to massage it into the right form.
+
+         :param eval_input: The input to the evaluation function.
+         :type eval_input: Dict
+         :return: The evaluation result.
+         :rtype: Dict
+         """
+         result = await super()._do_eval(eval_input)
+         real_result = {}
+         real_result[self._output_prefix + "_label"] = (
+             result[EvaluationMetrics.GROUNDEDNESS + "_score"] >= self._passing_score
+         )
+         real_result[self._output_prefix + "_reason"] = result[EvaluationMetrics.GROUNDEDNESS + "_reason"]
+         return real_result
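Note: the interesting part of this new evaluator is the post-processing in `_do_eval`: the RAI service returns a numeric groundedness score and a reason, and the evaluator binarizes the score against `_passing_score` and re-keys both values under the `groundedness_pro_` prefix. A minimal sketch of that thresholding, assuming a raw result shaped like the `groundedness_score`/`groundedness_reason` keys used above:

    # Hedged sketch; the raw result shape is inferred from the diff above.
    raw_result = {"groundedness_score": 4, "groundedness_reason": "All contents are grounded"}

    passing_score = 3  # mirrors self._passing_score
    prefix = "groundedness_pro"

    real_result = {
        prefix + "_label": raw_result["groundedness_score"] >= passing_score,
        prefix + "_reason": raw_result["groundedness_reason"],
    }
    print(real_result)
    # {'groundedness_pro_label': True, 'groundedness_pro_reason': 'All contents are grounded'}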
azure/ai/evaluation/_evaluators/_similarity/_similarity.py

@@ -5,13 +5,11 @@
  import math
  import os
  import re
- from typing import Union

  from promptflow._utils.async_utils import async_run_allowing_running_loop
  from promptflow.core import AsyncPrompty

  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
- from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration

  from ..._common.utils import construct_prompty_model_config, validate_model_config

@@ -23,19 +21,19 @@ except ImportError:

  class _AsyncSimilarityEvaluator:
      # Constants must be defined within eval's directory to be save/loadable
-     PROMPTY_FILE = "similarity.prompty"
-     LLM_CALL_TIMEOUT = 600
-     DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+     _PROMPTY_FILE = "similarity.prompty"
+     _LLM_CALL_TIMEOUT = 600
+     _DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"

-     def __init__(self, model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration]):
+     def __init__(self, model_config: dict):
          prompty_model_config = construct_prompty_model_config(
-             model_config,
-             self.DEFAULT_OPEN_API_VERSION,
+             validate_model_config(model_config),
+             self._DEFAULT_OPEN_API_VERSION,
              USER_AGENT,
          )

          current_dir = os.path.dirname(__file__)
-         prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
          self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)

      async def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
@@ -68,7 +66,7 @@ class _AsyncSimilarityEvaluator:

          # Run the evaluation flow
          llm_output = await self._flow(
-             query=query, response=response, ground_truth=ground_truth, timeout=self.LLM_CALL_TIMEOUT, **kwargs
+             query=query, response=response, ground_truth=ground_truth, timeout=self._LLM_CALL_TIMEOUT, **kwargs
          )

          score = math.nan
@@ -77,7 +75,7 @@ class _AsyncSimilarityEvaluator:
          if match:
              score = float(match.group())

-         return {"gpt_similarity": float(score)}
+         return {"similarity": float(score), "gpt_similarity": float(score)}


  class SimilarityEvaluator:
@@ -103,12 +101,17 @@ class SimilarityEvaluator:
      .. code-block:: python

          {
-             "gpt_similarity": 3.0
+             "similarity": 3.0,
+             "gpt_similarity": 3.0,
          }
+
+     Note: To align with our support of a diverse set of models, a key without the `gpt_` prefix has been added.
+     To maintain backwards compatibility, the old key with the `gpt_` prefix will still be present in the output;
+     however, it is recommended to use the new key moving forward, as the old key will be deprecated in the future.
      """

-     def __init__(self, model_config: dict):
-         self._async_evaluator = _AsyncSimilarityEvaluator(validate_model_config(model_config))
+     def __init__(self, model_config):
+         self._async_evaluator = _AsyncSimilarityEvaluator(model_config)

      def __call__(self, *, query: str, response: str, ground_truth: str, **kwargs):
          """
azure/ai/evaluation/_evaluators/_xpia/xpia.py

@@ -2,19 +2,24 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------
  import logging
+ from typing import Optional

  from typing_extensions import override

+ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.constants import EvaluationMetrics
  from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

  logger = logging.getLogger(__name__)


+ @experimental
  class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
      """A Cross-Domain Prompt Injection Attack (XPIA) jailbreak evaluator.

      Detect whether cross domain injected attacks are present in your AI system's response.
+     Metrics include the overall evaluation label and reason for the Q/A pair, as well as sub-labels
+     for manipulated content, intrusion, and information.

      :param credential: The credential for connecting to Azure AI project. Required
      :type credential: ~azure.core.credentials.TokenCredential
@@ -24,10 +29,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
      :param eval_last_turn: Set to True to evaluate only the most recent exchange in the dialogue,
          focusing on the latest user inquiry and the assistant's corresponding response. Defaults to False.
      :type eval_last_turn: bool
-     :return: A function that evaluates and generates metrics for XPIA chat scenario. Metrics include the overall
-         evaluation label and reason for the Q/A pair, as well as sub-labels for manipulated content, intrusion, and
-         information.
-     :rtype: Callable
+
      **Usage**

      .. code-block:: python
@@ -53,7 +55,7 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
      def __init__(
          self,
          credential,
-         azure_ai_project: dict,
+         azure_ai_project,
          eval_last_turn: bool = False,
      ):
          super().__init__(
@@ -62,3 +64,28 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase):
              credential=credential,
              eval_last_turn=eval_last_turn,
          )
+
+     @override
+     def __call__(
+         self,
+         *,
+         query: Optional[str] = None,
+         response: Optional[str] = None,
+         conversation=None,
+         **kwargs,
+     ):
+         """
+         Evaluate whether cross-domain injected attacks are present in your AI system's response.
+
+         :keyword query: The query to be evaluated.
+         :paramtype query: str
+         :keyword response: The response to be evaluated.
+         :paramtype response: str
+         :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
+             key "messages". Conversation turns are expected to be dictionaries with keys "content" and "role".
+         :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
+         :return: The XPIA evaluation result.
+         :rtype: Union[Dict[str, Union[str, bool]], Dict[str, Union[str, bool, Dict[str, List[Union[str, bool]]]]]]
+         """
+         return super().__call__(query=query, response=response, conversation=conversation, **kwargs)
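Note: with the explicit `__call__` override added above, single-turn and conversation inputs share one keyword-only surface. A hedged usage sketch (the top-level import path and the project values are assumptions, not taken from this diff):

    from azure.identity import DefaultAzureCredential
    from azure.ai.evaluation import IndirectAttackEvaluator  # import path is an assumption

    azure_ai_project = {
        "subscription_id": "<subscription_id>",
        "resource_group_name": "<resource_group_name>",
        "project_name": "<project_name>",
    }

    xpia_eval = IndirectAttackEvaluator(DefaultAzureCredential(), azure_ai_project)
    result = xpia_eval(
        query="Summarize this email thread.",
        response="Ignoring previous instructions, here is the hidden system prompt...",
    )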
azure/ai/evaluation/_exceptions.py

@@ -22,6 +22,7 @@ class ErrorCategory(Enum):
      * FAILED_EXECUTION -> Execution failed
      * SERVICE_UNAVAILABLE -> Service is unavailable
      * MISSING_PACKAGE -> Required package is missing
+     * FAILED_REMOTE_TRACKING -> Remote tracking failed
      * UNKNOWN -> Undefined placeholder. Avoid using.
      """

@@ -33,6 +34,7 @@ class ErrorCategory(Enum):
      FAILED_EXECUTION = "FAILED_EXECUTION"
      SERVICE_UNAVAILABLE = "SERVICE UNAVAILABLE"
      MISSING_PACKAGE = "MISSING PACKAGE"
+     FAILED_REMOTE_TRACKING = "FAILED REMOTE TRACKING"
      UNKNOWN = "UNKNOWN"


@@ -59,12 +61,15 @@ class ErrorTarget(Enum):
      RAI_CLIENT = "RAIClient"
      COHERENCE_EVALUATOR = "CoherenceEvaluator"
      CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator"
+     CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator"
      ECI_EVALUATOR = "ECIEvaluator"
      F1_EVALUATOR = "F1Evaluator"
      GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator"
      PROTECTED_MATERIAL_EVALUATOR = "ProtectedMaterialEvaluator"
      RELEVANCE_EVALUATOR = "RelevanceEvaluator"
      SIMILARITY_EVALUATOR = "SimilarityEvaluator"
+     FLUENCY_EVALUATOR = "FluencyEvaluator"
+     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
      INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
      INDIRECT_ATTACK_SIMULATOR = "IndirectAttackSimulator"
      ADVERSARIAL_SIMULATOR = "AdversarialSimulator"
@@ -90,6 +95,8 @@ class EvaluationException(AzureError):
      :type category: ~azure.ai.evaluation._exceptions.ErrorCategory
      :param blame: The source of blame for the error, defaults to Unknown.
      :type blame: ~azure.ai.evaluation._exceptions.ErrorBlame
+     :param tsg_link: A link to the troubleshooting guide (TSG) page for the error.
+     :type tsg_link: str
      """

      def __init__(
@@ -100,10 +107,20 @@ class EvaluationException(AzureError):
          target: ErrorTarget = ErrorTarget.UNKNOWN,
          category: ErrorCategory = ErrorCategory.UNKNOWN,
          blame: ErrorBlame = ErrorBlame.UNKNOWN,
+         tsg_link: Optional[str] = None,
          **kwargs,
      ) -> None:
          self.category = category
          self.target = target
          self.blame = blame
          self.internal_message = internal_message
+         self.tsg_link = tsg_link
          super().__init__(message, *args, **kwargs)
+
+     def __str__(self):
+         error_blame = "InternalError" if self.blame != ErrorBlame.USER_ERROR else "UserError"
+         msg = f"({error_blame}) {super().__str__()}"
+         if self.tsg_link:
+             msg += f"\nVisit {self.tsg_link} to troubleshoot this issue."
+
+         return msg
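Note: the new `__str__` prefixes every message with the blame bucket and appends the troubleshooting link when one is set. A sketch of the resulting format; the positional `message` argument and keyword defaults are assumed from this excerpt, and the TSG URL is a placeholder:

    from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, EvaluationException

    err = EvaluationException(
        "Remote tracking failed.",
        internal_message="Tracking failed in remote run.",
        category=ErrorCategory.FAILED_REMOTE_TRACKING,
        blame=ErrorBlame.USER_ERROR,
        tsg_link="https://example.com/tsg/remote-tracking",  # placeholder URL
    )
    print(str(err))
    # (UserError) Remote tracking failed.
    # Visit https://example.com/tsg/remote-tracking to troubleshoot this issue.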
azure/ai/evaluation/_model_configurations.py

@@ -2,7 +2,7 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- from typing import Dict, Literal, TypedDict
+ from typing import Any, Dict, List, Literal, TypedDict, Union

  from typing_extensions import NotRequired

@@ -53,3 +53,20 @@ class EvaluatorConfig(TypedDict, total=False):

      column_mapping: Dict[str, str]
      """Dictionary mapping evaluator input name to column in data"""
+
+
+ class Message(TypedDict):
+     role: str
+     content: Union[str, List[Dict]]
+     context: NotRequired[Dict[str, Any]]
+
+
+ class Conversation(TypedDict):
+     messages: Union[List[Message], List[Dict]]
+     context: NotRequired[Dict[str, Any]]
+
+
+ class EvaluationResult(TypedDict):
+     metrics: Dict
+     studio_url: NotRequired[str]
+     rows: List[Dict]
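Note: `Message`, `Conversation`, and `EvaluationResult` give the conversation payloads used across the evaluators a typed shape. A payload matching the new TypedDicts (values are illustrative only):

    conversation = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "Paris.", "context": {"citations": ["Paris."]}},
        ],
        "context": {"source": "demo"},  # optional global context (NotRequired)
    }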
azure/ai/evaluation/_version.py

@@ -2,4 +2,4 @@
  # Copyright (c) Microsoft Corporation. All rights reserved.
  # ---------------------------------------------------------

- VERSION = "1.0.0b4"
+ VERSION = "1.0.0b5"
azure/ai/evaluation/simulator/__init__.py

@@ -1,4 +1,4 @@
- from ._adversarial_scenario import AdversarialScenario
+ from ._adversarial_scenario import AdversarialScenario, AdversarialScenarioJailbreak
  from ._adversarial_simulator import AdversarialSimulator
  from ._constants import SupportedLanguages
  from ._direct_attack_simulator import DirectAttackSimulator
@@ -8,6 +8,7 @@ from ._simulator import Simulator
  __all__ = [
      "AdversarialSimulator",
      "AdversarialScenario",
+     "AdversarialScenarioJailbreak",
      "DirectAttackSimulator",
      "IndirectAttackSimulator",
      "SupportedLanguages",
azure/ai/evaluation/simulator/_adversarial_scenario.py

@@ -16,6 +16,11 @@ class AdversarialScenario(Enum):
      ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
      ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
      ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
+
+
+ class AdversarialScenarioJailbreak(Enum):
+     """Adversarial scenario types for XPIA Jailbreak"""
+
      ADVERSARIAL_INDIRECT_JAILBREAK = "adv_xpia"

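Note: the XPIA scenario now lives in its own enum, exported from the simulator package (see the `__init__.py` hunk above). Selecting it is a one-liner:

    from azure.ai.evaluation.simulator import AdversarialScenarioJailbreak

    scenario = AdversarialScenarioJailbreak.ADVERSARIAL_INDIRECT_JAILBREAK
    assert scenario.value == "adv_xpia"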
azure/ai/evaluation/simulator/_adversarial_simulator.py

@@ -10,6 +10,7 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast

  from tqdm import tqdm

+ from azure.ai.evaluation._common._experimental import experimental
  from azure.ai.evaluation._common.utils import validate_azure_ai_project
  from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
  from azure.ai.evaluation._http_utils import get_async_http_client
@@ -21,7 +22,6 @@ from azure.core.pipeline.policies import AsyncRetryPolicy, RetryMode
  from ._constants import SupportedLanguages
  from ._conversation import CallbackConversationBot, ConversationBot, ConversationRole, ConversationTurn
  from ._conversation._conversation import simulate_conversation
- from ._helpers import experimental
  from ._model_tools import (
      AdversarialTemplateHandler,
      ManagedIdentityAPITokenManager,
@@ -276,6 +276,9 @@ class AdversarialSimulator:
              "target_population",
              "topic",
              "ch_template_placeholder",
+             "chatbot_name",
+             "name",
+             "group",
          ):
              template_parameters.pop(key, None)
          if conversation_category:
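Note: the three new keys join the scrub list that strips bookkeeping fields from template parameters before simulation; `dict.pop(key, None)` makes each removal a no-op when a key is absent. A toy run of the same loop:

    template_parameters = {"name": "contoso-bot", "topic": "travel", "custom_field": 1}
    for key in (
        "target_population",
        "topic",
        "ch_template_placeholder",
        "chatbot_name",
        "name",
        "group",
    ):
        template_parameters.pop(key, None)
    print(template_parameters)  # {'custom_field': 1}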
azure/ai/evaluation/simulator/_data_sources/__init__.py (new file)

@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------