azure-ai-evaluation 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +1 -0
- azure/ai/evaluation/_aoai/aoai_grader.py +1 -1
- azure/ai/evaluation/_aoai/label_grader.py +2 -2
- azure/ai/evaluation/_aoai/string_check_grader.py +2 -2
- azure/ai/evaluation/_aoai/text_similarity_grader.py +2 -2
- azure/ai/evaluation/_common/__init__.py +3 -1
- azure/ai/evaluation/_common/evaluation_onedp_client.py +50 -5
- azure/ai/evaluation/_common/onedp/operations/_operations.py +4 -2
- azure/ai/evaluation/_common/rai_service.py +7 -6
- azure/ai/evaluation/_converters/_ai_services.py +162 -118
- azure/ai/evaluation/_converters/_models.py +76 -6
- azure/ai/evaluation/_eval_mapping.py +2 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +15 -17
- azure/ai/evaluation/_evaluate/_evaluate_aoai.py +24 -5
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
- azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
- azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +31 -29
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +10 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +10 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +10 -0
- azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +13 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +14 -4
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +10 -0
- azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +11 -0
- azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +80 -10
- azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
- azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +26 -7
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/red_team/_agent/__init__.py +3 -0
- azure/ai/evaluation/red_team/_agent/_agent_functions.py +264 -0
- azure/ai/evaluation/red_team/_agent/_agent_tools.py +503 -0
- azure/ai/evaluation/red_team/_agent/_agent_utils.py +69 -0
- azure/ai/evaluation/red_team/_agent/_semantic_kernel_plugin.py +237 -0
- azure/ai/evaluation/red_team/_attack_strategy.py +2 -0
- azure/ai/evaluation/red_team/_red_team.py +572 -207
- azure/ai/evaluation/red_team/_utils/_rai_service_eval_chat_target.py +121 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_target.py +570 -0
- azure/ai/evaluation/red_team/_utils/_rai_service_true_false_scorer.py +108 -0
- azure/ai/evaluation/red_team/_utils/constants.py +5 -1
- azure/ai/evaluation/red_team/_utils/metric_mapping.py +2 -2
- azure/ai/evaluation/red_team/_utils/strategy_utils.py +2 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +9 -2
- azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +3 -3
- azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +3 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +15 -7
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +6 -5
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/METADATA +35 -3
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/RECORD +69 -61
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/NOTICE.txt +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/WHEEL +0 -0
- {azure_ai_evaluation-1.6.0.dist-info → azure_ai_evaluation-1.8.0.dist-info}/top_level.txt +0 -0
azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py CHANGED
@@ -53,6 +53,16 @@ class GroundednessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize with threshold and call a GroundednessEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_evaluator]
+            :end-before: [END groundedness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
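The admonition added above only points at a sample file. As a rough, non-authoritative sketch of the pattern it refers to, the snippet below initializes a GroundednessEvaluator with an Azure OpenAI model configuration and a threshold and scores one query/response/context triple; the endpoint, deployment, and key are placeholders, and the actual content of evaluation_samples_evaluate_fdp.py may differ.

# Hedged sketch only; endpoint/deployment/key are placeholders, not values from this package.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-aoai-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

groundedness = GroundednessEvaluator(model_config=model_config, threshold=3)
result = groundedness(
    query="Which tent is the most waterproof?",
    response="The Alpine Explorer Tent is the most waterproof.",
    context="The Alpine Explorer Tent has a rainfly with a 3000mm waterproof rating.",
)
print(result)  # expected keys include "groundedness" and "groundedness_reason"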
azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py CHANGED
@@ -33,6 +33,16 @@ class IntentResolutionEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call an IntentResolutionEvaluator with a query and response.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START intent_resolution_evaluator]
+            :end-before: [END intent_resolution_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IntentResolutionEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     _PROMPTY_FILE = "intent_resolution.prompty"
azure/ai/evaluation/_evaluators/_meteor/_meteor.py CHANGED
@@ -45,6 +45,16 @@ class MeteorScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a MeteorScoreEvaluator with alpha of 0.8.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START meteor_score_evaluator]
+            :end-before: [END meteor_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call MeteorScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py CHANGED
@@ -37,6 +37,17 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call a ProtectedMaterialEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START protected_material_evaluator]
+            :end-before: [END protected_material_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ProtectedMaterialEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     id = "azureml://registries/azureml/models/Protected-Material-Evaluator/versions/3"
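For the service-backed evaluators such as ProtectedMaterialEvaluator, the new admonitions describe passing the Azure AI project endpoint URL directly. Below is a minimal sketch, assuming the 1.8.0 constructor accepts that URL form for azure_ai_project as the new caption indicates; the URL and credential are placeholders and the real sample may differ.

# Hedged sketch; the project URL is a placeholder in the documented format.
from azure.ai.evaluation import ProtectedMaterialEvaluator
from azure.identity import DefaultAzureCredential

azure_ai_project = "https://<resource_name>.services.ai.azure.com/api/projects/<project_name>"

protected_material = ProtectedMaterialEvaluator(
    credential=DefaultAzureCredential(),
    azure_ai_project=azure_ai_project,
)
result = protected_material(
    query="Reproduce the full lyrics of a popular copyrighted song.",
    response="I can't share the full lyrics, but here is a brief, original summary instead.",
)
print(result)  # expected keys include "protected_material_label" and "protected_material_reason"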
azure/ai/evaluation/_evaluators/_qa/_qa.py CHANGED
@@ -48,6 +48,16 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a QAEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START qa_evaluator]
+            :end-before: [END qa_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call QAEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_relevance/_relevance.py CHANGED
@@ -39,6 +39,16 @@ class RelevanceEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RelevanceEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START relevance_evaluator]
+            :end-before: [END relevance_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RelevanceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py CHANGED
@@ -37,13 +37,26 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]
+
     .. admonition:: Example:
+
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
             :start-after: [START completeness_evaluator]
             :end-before: [END completeness_evaluator]
             :language: python
             :dedent: 8
             :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START completeness_evaluator]
+            :end-before: [END completeness_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     # Constants must be defined within eval's directory to be save/loadable
azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py CHANGED
@@ -45,6 +45,16 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a RetrievalEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START retrieval_evaluator]
+            :end-before: [END retrieval_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with Threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_rouge/_rouge.py CHANGED
@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 from enum import Enum
 
-from typing import Dict
+from typing import Dict, Union
 from typing_extensions import overload, override
 
 from azure.ai.evaluation._vendor.rouge_score import rouge_scorer
@@ -12,7 +12,7 @@ from azure.ai.evaluation._constants import EVALUATION_PASS_FAIL_MAPPING
 import math
 
 
-class RougeType(Enum):
+class RougeType(str, Enum):
     """
     Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types.
     """
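Mixing str into RougeType means each member now behaves as its underlying string value, which simplifies comparing and serializing the configured rouge type. A standalone illustration of the difference (the member shown is illustrative, not necessarily the library's exact member set):

# Standalone illustration of the Enum -> (str, Enum) change; not library code.
from enum import Enum

class PlainType(Enum):
    ROUGE_L = "rougeL"

class StrType(str, Enum):  # the 1.8.0 pattern: class RougeType(str, Enum)
    ROUGE_L = "rougeL"

print(PlainType.ROUGE_L == "rougeL")  # False: plain Enum members are not strings
print(StrType.ROUGE_L == "rougeL")    # True: str-mixin members compare as their string value
print(StrType.ROUGE_L.lower())        # "rougel": str methods work directly on the member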
@@ -70,6 +70,16 @@ class RougeScoreEvaluator(EvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START rouge_score_evaluator]
+            :end-before: [END rouge_score_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -169,8 +179,8 @@ class RougeScoreEvaluator(EvaluatorBase):
         """
         ground_truth = eval_input["ground_truth"]
         response = eval_input["response"]
-        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type
-        metrics = scorer.score(ground_truth, response)[self._rouge_type
+        scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type])
+        metrics = scorer.score(ground_truth, response)[self._rouge_type]
         binary_results = {
             "rouge_precision_result": False,
             "rouge_recall_result": False,
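The corrected lines pass the configured rouge type straight to the vendored rouge_scorer, which works cleanly now that RougeType is a string enum. For context, here is a hedged sketch of how the public evaluator around this code is typically called; the output key names shown are indicative only.

# Hedged usage sketch; no model configuration is needed for this metric-based evaluator.
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)
result = rouge(
    ground_truth="The capital of France is Paris.",
    response="Paris is the capital of France.",
)
print(result)  # e.g. rouge_precision / rouge_recall / rouge_f1_score plus pass/fail fields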
azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py CHANGED
@@ -41,6 +41,16 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START groundedness_pro_evaluator]
+            :end-before: [END groundedness_pro_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example with threshold:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_similarity/_similarity.py CHANGED
@@ -40,6 +40,16 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
             :dedent: 8
             :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START similarity_evaluator]
+            :end-before: [END similarity_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. admonition:: Example:
 
         .. literalinclude:: ../samples/evaluation_samples_threshold.py
azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py CHANGED
@@ -42,6 +42,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START task_adherence_evaluator]
+            :end-before: [END task_adherence_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     _PROMPTY_FILE = "task_adherence.prompty"
azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py CHANGED
@@ -45,6 +45,16 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
             :dedent: 8
             :caption: Initialize and call a ToolCallAccuracyEvaluator.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START tool_call_accuracy_evaluator]
+            :end-before: [END tool_call_accuracy_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -214,12 +224,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         score = math.nan
         if llm_output:
             score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
-
-
-
-
-
-
+            if score >= 0 and score <= 1:
+                return {
+                    self._result_key: bool(float(score)),
+                    f"{self._result_key}_reason": reason,
+                    "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+                }
+        raise EvaluationException(
+            message="Tool call accuracy evaluator: Invalid score returned from LLM.",
+            blame=ErrorBlame.SYSTEM_ERROR,
+            category=ErrorCategory.INVALID_VALUE,
+            target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+        )
 
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
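Note what the new guard does: only a parsed score inside [0, 1] is accepted, and it is folded into a boolean per-tool verdict via bool(float(score)), so "1" maps to True and "0" to False; an out-of-range or missing score now raises an EvaluationException instead of silently producing a partial result. A tiny standalone illustration of that conversion:

# Standalone illustration of the score-to-verdict conversion used above; not library code.
for raw in ("0", "1"):
    score = float(raw)
    if 0 <= score <= 1:
        print(raw, "->", bool(score))  # "0" -> False, "1" -> True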
@@ -231,13 +247,55 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         """
         # Convert inputs into list of evaluable inputs.
         eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
+        if len(eval_input_list) == 0:
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "No tool calls were made.",
+                    "per_tool_call_details": []
+                    }
+
         per_turn_results = []
         # Evaluate all inputs.
         for eval_input in eval_input_list:
-
+            if self._is_applicable_tool(eval_input):
+                per_turn_results.append(await self._do_eval(eval_input))
+            else:
+                per_turn_results.append(self._not_applicable_result(eval_input))
 
         return self._aggregate_results(per_turn_results=per_turn_results)
 
+    def _is_applicable_tool(self, eval_input):
+        """Determine if a given tool should be evaluated, since we only evaluate tools that
+        have sufficient context available.
+
+        :type eval_input: Dict
+        :return: True if the tool call should be evaluated
+        :rtype: bool
+        """
+        tool_definition = eval_input.get("tool_definition")
+        if tool_definition is None or len(tool_definition) != 1:
+            return False
+        tool_type = tool_definition[0].get("type")
+        if tool_type is None or tool_type != "function":
+            return False
+        return True
+
+    def _not_applicable_result(self, eval_input):
+        """Return a result indicating that the tool call is not applicable for evaluation.
+
+        :param eval_input: The input to the evaluator.
+        :type eval_input: Dict
+        :return: A dictionary containing the result of the evaluation.
+        :rtype: Dict[str, Union[str, float]]
+        """
+        return {
+            f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
+            f"{self._result_key}_reason": "Tool call not supported for evaluation",
+            "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
+        }
+
     def _aggregate_results(self, per_turn_results):
         """Aggregate the evaluation results of each conversation turn into a single result.
 
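Per the new helper, a tool call is evaluated only when exactly one matching tool definition is available and that definition is a "function" tool; everything else is reported through _not_applicable_result. A standalone illustration of the rule, with hypothetical tool names:

# Standalone illustration mirroring the _is_applicable_tool rule above; tool names are hypothetical.
def is_applicable(tool_definition):
    # Exactly one definition must be present, and it must describe a "function" tool.
    if tool_definition is None or len(tool_definition) != 1:
        return False
    return tool_definition[0].get("type") == "function"

function_tool = [{"type": "function", "name": "fetch_weather", "parameters": {"type": "object"}}]
builtin_tool = [{"type": "file_search"}]

print(is_applicable(function_tool))  # True  -> evaluated via _do_eval
print(is_applicable(builtin_tool))   # False -> reported via _not_applicable_result
print(is_applicable(None))           # False -> no matching definition was found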
@@ -260,11 +318,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         # Go over each turn, and rotate the results into a
         # metric: List[values] format for the evals_per_turn dictionary.
 
-
+        num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
+                             if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
+        if num_evaluated == 0:
+            # None of the invoked tools were applicable, return not applicable result
+            # (If a tool fails evaluation, we'll throw an exception)
+            return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
+                    f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
+                    f"{self._AGGREGATE_RESULT_KEY}_reason":
+                        "Tool call accuracy evaluation is not yet supported for the invoked tools.",
+                    "per_tool_call_details": []
+                    }
+        # ignore not_applicable results, where the _result_key will be "not applicable"
+        score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
         aggregated[self._AGGREGATE_RESULT_KEY] = score
-        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] =
+        aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
         aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
-
         aggregated["per_tool_call_details"] = per_turn_results
         return aggregated
 
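With this aggregation, the reported score is the fraction of applicable tool calls judged accurate; not-applicable entries are excluded from the denominator, and if nothing was applicable the aggregate itself becomes "not applicable". A worked example with hypothetical per-tool results (the result key and threshold here are illustrative):

# Worked example of the aggregation arithmetic above; key name and threshold are illustrative.
NOT_APPLICABLE = "not applicable"
per_tool = [
    {"accurate": True},
    {"accurate": False},
    {"accurate": NOT_APPLICABLE},  # excluded from the denominator
]

num_evaluated = len([r for r in per_tool if r["accurate"] != NOT_APPLICABLE])
score = sum(r["accurate"] is True for r in per_tool) / num_evaluated
threshold = 0.8  # illustrative threshold
print(score, "pass" if score >= threshold else "fail")  # 0.5 fail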
azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py CHANGED
@@ -41,6 +41,16 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :dedent: 8
             :caption: Initialize and call a UngroundedAttributesEvaluator with a query, response and context.
 
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START ungrounded_attributes_evaluator]
+            :end-before: [END ungrounded_attributes_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     .. note::
 
         If this evaluator is supplied to the `evaluate` function, the metric
azure/ai/evaluation/_evaluators/_xpia/xpia.py CHANGED
@@ -54,6 +54,17 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
             :language: python
             :dedent: 8
             :caption: Initialize and call an IndirectAttackEvaluator.
+
+    .. admonition:: Example using Azure AI Project URL:
+
+        .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
+            :start-after: [START indirect_attack_evaluator]
+            :end-before: [END indirect_attack_evaluator]
+            :language: python
+            :dedent: 8
+            :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
+                https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
+
     """
 
     id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py CHANGED
@@ -161,6 +161,8 @@ class _SafetyEvaluation:
         adversarial_scenario: Optional[Union[AdversarialScenario, AdversarialScenarioJailbreak, _UnstableAdversarialScenario]] = None,
         source_text: Optional[str] = None,
         direct_attack: bool = False,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
     ) -> Dict[str, str]:
         """
         Generates synthetic conversations based on provided parameters.
@@ -245,6 +247,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 text=source_text,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         # if DirectAttack, run DirectAttackSimulator
@@ -258,6 +262,8 @@ class _SafetyEvaluation:
                 max_conversation_turns=max_conversation_turns,
                 max_simulation_results=max_simulation_results,
                 target=callback,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks,
             )
             jailbreak_outputs = simulator_outputs["jailbreak"]
             simulator_outputs = simulator_outputs["regular"]
@@ -275,6 +281,7 @@ class _SafetyEvaluation:
                 num_queries=max_simulation_results,
                 target=callback,
                 text=source_text if source_text else "",
+                concurrent_async_tasks=concurrent_async_tasks,
             )
 
         ## Run AdversarialSimulator
@@ -290,6 +297,8 @@ class _SafetyEvaluation:
                 conversation_turns=conversation_turns,
                 target=callback,
                 text=source_text,
+                randomization_seed=randomization_seed,
+                concurrent_async_task=concurrent_async_tasks
             )
 
         ## If no outputs are generated, raise an exception
@@ -677,7 +686,9 @@ class _SafetyEvaluation:
         violence_defects = []
         sexual_defects = []
         self_harm_defects = []
-
+        jailbreak_rows = sorted(evaluation_result_dict[jailbreak_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        regular_rows = sorted(evaluation_result_dict[regular_cols[0]]['rows'], key=lambda x: (x.get('inputs.category') is None, x.get('inputs.category', '')))
+        for i, row in enumerate(zip(jailbreak_rows, regular_rows)):
             hate_unfairness_defects.append(1 if (row[0]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.hate_unfairness_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             violence_defects.append(1 if (row[0]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.violence_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
             sexual_defects.append(1 if (row[0]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT and not row[1]['outputs.content_safety.sexual_score'] > CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT) else 0)
@@ -716,8 +727,10 @@ class _SafetyEvaluation:
         data_path: Optional[Union[str, os.PathLike]] = None,
         jailbreak_data_path: Optional[Union[str, os.PathLike]] = None,
         output_path: Optional[Union[str, os.PathLike]] = None,
-        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None
-
+        data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]] = None,
+        randomization_seed: Optional[int] = None,
+        concurrent_async_tasks: Optional[int] = 5,
+    ) -> Union[Dict[str, EvaluationResult], Dict[str, str], Dict[str, Union[str,os.PathLike]]]:
         '''
         Evaluates the target function based on the provided parameters.
 
@@ -744,12 +757,17 @@ class _SafetyEvaluation:
         :param data_path: The path to the data file generated by the Simulator. If None, the Simulator will be run.
         :type data_path: Optional[Union[str, os.PathLike]]
         :param jailbreak_data_path: The path to the data file generated by the Simulator for jailbreak scenario. If None, the DirectAttackSimulator will be run.
-        :type jailbreak_data_path: Optional[Union[str, os.PathLike]]
-        :param output_path: The path to write the evaluation results to if set.
+        :type jailbreak_data_path: Optional[Union[str, os.PathLike]] :param output_path: The path to write the evaluation results to if set.
         :type output_path: Optional[Union[str, os.PathLike]]
+        :param data_paths: A dictionary of data paths to evaluate. If None, the Simulator will be run.
+        :type data_paths: Optional[Union[Dict[str, str], Dict[str, Union[str,os.PathLike]]]]
+        :param randomization_seed: The seed used to randomize prompt selection. If unset, the system's default seed is used.
+        :type randomization_seed: Optional[int]
+        :param concurrent_async_tasks: The number of concurrent async tasks to run. If None, the system's default is used.
+        :type concurrent_async_tasks: Optional[int]
         '''
-        ## Log inputs
-        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}")
+        ## Log inputs
+        self.logger.info(f"User inputs: evaluators{evaluators}, evaluation_name={evaluation_name}, num_turns={num_turns}, num_rows={num_rows}, scenario={scenario},conversation_turns={conversation_turns}, tasks={tasks}, source_text={source_text}, data_path={data_path}, jailbreak_data_path={jailbreak_data_path}, output_path={output_path}, randomization_seed={randomization_seed}, concurrent_async_tasks={concurrent_async_tasks}")
 
         ## Validate arguments
         self._validate_inputs(
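The two new knobs documented here are threaded down to the simulators (which take the singular concurrent_async_task). As a hedged sketch of what they control, the snippet below runs the public AdversarialSimulator with a fixed randomization_seed for reproducible prompt selection and a bound on concurrent tasks; the project identifiers and the target callback are placeholders, not values from this package.

# Hedged sketch; project identifiers are placeholders and the callback is a stand-in target.
import asyncio
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

async def target_callback(messages, stream=False, session_state=None, context=None):
    # Stand-in application under test: always return a canned refusal.
    messages["messages"].append({"role": "assistant", "content": "I can't help with that."})
    return {"messages": messages["messages"], "stream": stream,
            "session_state": session_state, "context": context}

async def main():
    simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
    outputs = await simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,
        target=target_callback,
        max_simulation_results=4,
        randomization_seed=42,     # new in this release: fixes prompt selection across runs
        concurrent_async_task=2,   # note the singular name on the simulator call
    )
    print(len(outputs))

asyncio.run(main())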
@@ -779,6 +797,7 @@ class _SafetyEvaluation:
                 tasks=tasks,
                 source_text=source_text,
                 direct_attack=_SafetyEvaluator.DIRECT_ATTACK in evaluators,
+                randomization_seed=randomization_seed,
             )
         elif data_path:
             data_paths = {Path(data_path).stem: data_path}
azure/ai/evaluation/_version.py CHANGED