azure-ai-evaluation: azure_ai_evaluation-1.5.0-py3-none-any.whl → azure_ai_evaluation-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of azure-ai-evaluation might be problematic. More details are linked from the original diff page.

Files changed (144):
  1. azure/ai/evaluation/__init__.py +10 -0
  2. azure/ai/evaluation/_aoai/__init__.py +10 -0
  3. azure/ai/evaluation/_aoai/aoai_grader.py +89 -0
  4. azure/ai/evaluation/_aoai/label_grader.py +66 -0
  5. azure/ai/evaluation/_aoai/string_check_grader.py +65 -0
  6. azure/ai/evaluation/_aoai/text_similarity_grader.py +88 -0
  7. azure/ai/evaluation/_azure/_clients.py +4 -4
  8. azure/ai/evaluation/_azure/_envs.py +208 -0
  9. azure/ai/evaluation/_azure/_token_manager.py +12 -7
  10. azure/ai/evaluation/_common/__init__.py +7 -0
  11. azure/ai/evaluation/_common/evaluation_onedp_client.py +163 -0
  12. azure/ai/evaluation/_common/onedp/__init__.py +32 -0
  13. azure/ai/evaluation/_common/onedp/_client.py +139 -0
  14. azure/ai/evaluation/_common/onedp/_configuration.py +73 -0
  15. azure/ai/evaluation/_common/onedp/_model_base.py +1232 -0
  16. azure/ai/evaluation/_common/onedp/_patch.py +21 -0
  17. azure/ai/evaluation/_common/onedp/_serialization.py +2032 -0
  18. azure/ai/evaluation/_common/onedp/_types.py +21 -0
  19. azure/ai/evaluation/_common/onedp/_validation.py +50 -0
  20. azure/ai/evaluation/_common/onedp/_vendor.py +50 -0
  21. azure/ai/evaluation/_common/onedp/_version.py +9 -0
  22. azure/ai/evaluation/_common/onedp/aio/__init__.py +29 -0
  23. azure/ai/evaluation/_common/onedp/aio/_client.py +143 -0
  24. azure/ai/evaluation/_common/onedp/aio/_configuration.py +75 -0
  25. azure/ai/evaluation/_common/onedp/aio/_patch.py +21 -0
  26. azure/ai/evaluation/_common/onedp/aio/_vendor.py +40 -0
  27. azure/ai/evaluation/_common/onedp/aio/operations/__init__.py +39 -0
  28. azure/ai/evaluation/_common/onedp/aio/operations/_operations.py +4494 -0
  29. azure/ai/evaluation/_common/onedp/aio/operations/_patch.py +21 -0
  30. azure/ai/evaluation/_common/onedp/models/__init__.py +142 -0
  31. azure/ai/evaluation/_common/onedp/models/_enums.py +162 -0
  32. azure/ai/evaluation/_common/onedp/models/_models.py +2228 -0
  33. azure/ai/evaluation/_common/onedp/models/_patch.py +21 -0
  34. azure/ai/evaluation/_common/onedp/operations/__init__.py +39 -0
  35. azure/ai/evaluation/_common/onedp/operations/_operations.py +5655 -0
  36. azure/ai/evaluation/_common/onedp/operations/_patch.py +21 -0
  37. azure/ai/evaluation/_common/onedp/py.typed +1 -0
  38. azure/ai/evaluation/_common/onedp/servicepatterns/__init__.py +1 -0
  39. azure/ai/evaluation/_common/onedp/servicepatterns/aio/__init__.py +1 -0
  40. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/__init__.py +25 -0
  41. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_operations.py +34 -0
  42. azure/ai/evaluation/_common/onedp/servicepatterns/aio/operations/_patch.py +20 -0
  43. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/__init__.py +1 -0
  44. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/__init__.py +1 -0
  45. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/__init__.py +22 -0
  46. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_operations.py +29 -0
  47. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/aio/operations/_patch.py +20 -0
  48. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/__init__.py +22 -0
  49. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_operations.py +29 -0
  50. azure/ai/evaluation/_common/onedp/servicepatterns/buildingblocks/operations/_patch.py +20 -0
  51. azure/ai/evaluation/_common/onedp/servicepatterns/operations/__init__.py +25 -0
  52. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_operations.py +34 -0
  53. azure/ai/evaluation/_common/onedp/servicepatterns/operations/_patch.py +20 -0
  54. azure/ai/evaluation/_common/rai_service.py +165 -34
  55. azure/ai/evaluation/_common/raiclient/_version.py +1 -1
  56. azure/ai/evaluation/_common/utils.py +79 -1
  57. azure/ai/evaluation/_constants.py +16 -0
  58. azure/ai/evaluation/_converters/_ai_services.py +162 -118
  59. azure/ai/evaluation/_converters/_models.py +76 -6
  60. azure/ai/evaluation/_eval_mapping.py +73 -0
  61. azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py +30 -16
  62. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +8 -0
  63. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +5 -0
  64. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +17 -1
  65. azure/ai/evaluation/_evaluate/_eval_run.py +1 -1
  66. azure/ai/evaluation/_evaluate/_evaluate.py +325 -76
  67. azure/ai/evaluation/_evaluate/_evaluate_aoai.py +553 -0
  68. azure/ai/evaluation/_evaluate/_utils.py +117 -4
  69. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +11 -1
  70. azure/ai/evaluation/_evaluators/_code_vulnerability/_code_vulnerability.py +9 -1
  71. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +12 -2
  72. azure/ai/evaluation/_evaluators/_common/_base_eval.py +12 -3
  73. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +12 -3
  74. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +2 -2
  75. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +12 -2
  76. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +14 -4
  77. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +9 -8
  78. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +10 -0
  79. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +10 -0
  80. azure/ai/evaluation/_evaluators/_document_retrieval/__init__.py +11 -0
  81. azure/ai/evaluation/_evaluators/_document_retrieval/_document_retrieval.py +469 -0
  82. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +10 -0
  83. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +11 -1
  84. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +10 -0
  85. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +11 -1
  86. azure/ai/evaluation/_evaluators/_intent_resolution/_intent_resolution.py +16 -2
  87. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +10 -0
  88. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +11 -0
  89. azure/ai/evaluation/_evaluators/_qa/_qa.py +10 -0
  90. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +11 -1
  91. azure/ai/evaluation/_evaluators/_response_completeness/_response_completeness.py +20 -2
  92. azure/ai/evaluation/_evaluators/_response_completeness/response_completeness.prompty +31 -46
  93. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +10 -0
  94. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +10 -0
  95. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +10 -0
  96. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +11 -1
  97. azure/ai/evaluation/_evaluators/_task_adherence/_task_adherence.py +16 -2
  98. azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +86 -12
  99. azure/ai/evaluation/_evaluators/_ungrounded_attributes/_ungrounded_attributes.py +10 -0
  100. azure/ai/evaluation/_evaluators/_xpia/xpia.py +11 -0
  101. azure/ai/evaluation/_exceptions.py +2 -0
  102. azure/ai/evaluation/_legacy/_adapters/__init__.py +0 -14
  103. azure/ai/evaluation/_legacy/_adapters/_check.py +17 -0
  104. azure/ai/evaluation/_legacy/_adapters/_flows.py +1 -1
  105. azure/ai/evaluation/_legacy/_batch_engine/_engine.py +51 -32
  106. azure/ai/evaluation/_legacy/_batch_engine/_openai_injector.py +114 -8
  107. azure/ai/evaluation/_legacy/_batch_engine/_result.py +6 -0
  108. azure/ai/evaluation/_legacy/_batch_engine/_run.py +6 -0
  109. azure/ai/evaluation/_legacy/_batch_engine/_run_submitter.py +69 -29
  110. azure/ai/evaluation/_legacy/_batch_engine/_trace.py +54 -62
  111. azure/ai/evaluation/_legacy/_batch_engine/_utils.py +19 -1
  112. azure/ai/evaluation/_legacy/_common/__init__.py +3 -0
  113. azure/ai/evaluation/_legacy/_common/_async_token_provider.py +124 -0
  114. azure/ai/evaluation/_legacy/_common/_thread_pool_executor_with_context.py +15 -0
  115. azure/ai/evaluation/_legacy/prompty/_connection.py +11 -74
  116. azure/ai/evaluation/_legacy/prompty/_exceptions.py +80 -0
  117. azure/ai/evaluation/_legacy/prompty/_prompty.py +119 -9
  118. azure/ai/evaluation/_legacy/prompty/_utils.py +72 -2
  119. azure/ai/evaluation/_safety_evaluation/_safety_evaluation.py +114 -22
  120. azure/ai/evaluation/_version.py +1 -1
  121. azure/ai/evaluation/red_team/_attack_strategy.py +1 -1
  122. azure/ai/evaluation/red_team/_red_team.py +976 -546
  123. azure/ai/evaluation/red_team/_utils/metric_mapping.py +23 -0
  124. azure/ai/evaluation/red_team/_utils/strategy_utils.py +1 -1
  125. azure/ai/evaluation/simulator/_adversarial_simulator.py +63 -39
  126. azure/ai/evaluation/simulator/_constants.py +1 -0
  127. azure/ai/evaluation/simulator/_conversation/__init__.py +13 -6
  128. azure/ai/evaluation/simulator/_conversation/_conversation.py +2 -1
  129. azure/ai/evaluation/simulator/_conversation/constants.py +1 -1
  130. azure/ai/evaluation/simulator/_direct_attack_simulator.py +38 -25
  131. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +1 -0
  132. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +43 -28
  133. azure/ai/evaluation/simulator/_model_tools/__init__.py +2 -1
  134. azure/ai/evaluation/simulator/_model_tools/_generated_rai_client.py +26 -18
  135. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +5 -10
  136. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +65 -41
  137. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +15 -10
  138. azure/ai/evaluation/simulator/_model_tools/models.py +20 -17
  139. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/METADATA +49 -3
  140. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/RECORD +144 -86
  141. /azure/ai/evaluation/_legacy/{_batch_engine → _common}/_logging.py +0 -0
  142. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/NOTICE.txt +0 -0
  143. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/WHEEL +0 -0
  144. {azure_ai_evaluation-1.5.0.dist-info → azure_ai_evaluation-1.7.0.dist-info}/top_level.txt +0 -0
@@ -37,13 +37,26 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
37
37
  :param model_config: Configuration for the Azure OpenAI model.
38
38
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
39
39
  ~azure.ai.evaluation.OpenAIModelConfiguration]
40
+
40
41
  .. admonition:: Example:
42
+
41
43
  .. literalinclude:: ../samples/evaluation_samples_evaluate.py
42
44
  :start-after: [START completeness_evaluator]
43
45
  :end-before: [END completeness_evaluator]
44
46
  :language: python
45
47
  :dedent: 8
46
48
  :caption: Initialize and call a CompletenessEvaluator with a response and groundtruth.
49
+
50
+ .. admonition:: Example using Azure AI Project URL:
51
+
52
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
53
+ :start-after: [START completeness_evaluator]
54
+ :end-before: [END completeness_evaluator]
55
+ :language: python
56
+ :dedent: 8
57
+ :caption: Initialize and call CompletenessEvaluator using Azure AI Project URL in the following format
58
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
59
+
47
60
  """
48
61
 
49
62
  # Constants must be defined within eval's directory to be save/loadable
@@ -60,11 +73,16 @@ class ResponseCompletenessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
60
73
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
61
74
 
62
75
  @override
63
- def __init__(self, model_config, *, threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD):
76
+ def __init__(self, model_config, *,
77
+ threshold: Optional[float] = _DEFAULT_COMPLETENESS_THRESHOLD,
78
+ **kwargs):
64
79
  current_dir = os.path.dirname(__file__)
65
80
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
66
81
  self.threshold = threshold
67
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
82
+ super().__init__(model_config=model_config,
83
+ prompty_file=prompty_path,
84
+ result_key=self._RESULT_KEY,
85
+ **kwargs)
68
86
 
69
87
  @overload
70
88
  def __call__(
@@ -22,65 +22,51 @@ inputs:
22
22
  ---
23
23
  system:
24
24
  # Instruction
25
- ## Context
26
- ### You are an expert in evaluating the quality of an answer from an intelligent system based on provided definitions and data. Your goal will involve answering the questions below using the information provided.
27
- - **Definition**: You are given a definition of the response quality that is being evaluated to help guide your Score.
28
- - **Data**: Your input data include a response and its ground truth.
29
- - **Questions**: To complete your evaluation you will be asked to evaluate the Data in different ways.
30
-
25
+ ## Goal
26
+ ### You are an expert in evaluating the quality of a Response from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
27
+ - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
28
+ - **Data**: Your input data include Response and Ground Truth.
29
+ - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
31
30
 
31
+ user:
32
32
  # Definition
33
+ **Completeness** refers to how accurately and thoroughly a response represents the information provided in the ground truth. It considers both the inclusion of all relevant statements and the correctness of those statements. Each statement in the ground truth should be evaluated individually to determine if it is accurately reflected in the response without missing any key information. The scale ranges from 1 to 5, with higher numbers indicating greater completeness.
33
34
 
34
- **Level 1: Fully incomplete**
35
-
36
- **Definition:**
37
- A response is considered fully incomplete if it does not contain any the necessary and relevant information with respect to the ground truth. In other words, it completely misses all the information - especially claims and statements - established in the ground truth.
35
+ # Ratings
36
+ ## [Completeness: 1] (Fully Incomplete)
37
+ **Definition:** A response that does not contain any of the necessary and relevant information with respect to the ground truth. It completely misses all the information, especially claims and statements, established in the ground truth.
38
38
 
39
39
  **Examples:**
40
- 1. **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
41
- **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
42
-
43
-
44
- **Level 2: Barely complete**
40
+ **Response:** "Flu shot cannot cure cancer. Stay healthy requires sleeping exactly 8 hours a day. A few hours of exercise per week will have little benefits for physical and mental health. Physical and mental health benefits are separate topics. Scientists have not studied any of them."
41
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Staying healthy requires proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
45
42
 
46
- **Definition:**
47
- A response is considered barely complete if it only contains a small percentage of all the necessary and relevant information with respect to the ground truth. In other words, it misses almost all the information - especially claims and statements - established in the ground truth.
43
+ ## [Completeness: 2] (Barely Complete)
44
+ **Definition:** A response that contains only a small percentage of all the necessary and relevant information with respect to the ground truth. It misses almost all the information, especially claims and statements, established in the ground truth.
48
45
 
49
46
  **Examples:**
50
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes not difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
51
- **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
47
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires 2 meals a day. Exercise per week makes no difference to physical and mental health. This is because physical and mental health benefits have low correlation through scientific studies. Scientists are making this observation in studies."
48
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
52
49
 
53
-
54
- **Level 3: Moderately complete**
55
-
56
- **Definition:**
57
- A response is considered moderately complete if it contains half of the necessary and relevant information with respect to the ground truth. In other words, it miss half of the information - especially claims and statements - established in the ground truth.
50
+ ## [Completeness: 3] (Moderately Complete)
51
+ **Definition:** A response that contains half of the necessary and relevant information with respect to the ground truth. It misses half of the information, especially claims and statements, established in the ground truth.
58
52
 
59
53
  **Examples:**
60
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollar of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
61
- **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
62
-
63
-
64
- **Level 4: Mostly complete**
65
-
66
- **Definition:**
67
- A response is considered mostly complete if it contains most of the necessary and relevant information with respect to the ground truth. In other words, it misses some minor information - especially claims and statements - established in the ground truth.
54
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires a few dollars of investments a day. Even a few dollars of investments per week will not make an impact on physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Fiction writers are starting to discover them through their works."
55
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
68
56
 
57
+ ## [Completeness: 4] (Mostly Complete)
58
+ **Definition:** A response that contains most of the necessary and relevant information with respect to the ground truth. It misses some minor information, especially claims and statements, established in the ground truth.
69
59
 
70
60
  **Examples:**
71
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
72
- **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
73
-
61
+ **Response:** "Flu shot can prevent flu-related illnesses. Staying healthy requires keto diet and rigorous athletic training. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
62
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
74
63
 
75
- **Level 5: Fully complete**
76
-
77
- **Definition:**
78
- A response is considered complete if it perfectly contains all the necessary and relevant information with respect to the ground truth. In other words, it does not miss any information from statements and claims in the ground truth.
64
+ ## [Completeness: 5] (Fully Complete)
65
+ **Definition:** A response that perfectly contains all the necessary and relevant information with respect to the ground truth. It does not miss any information from statements and claims in the ground truth.
79
66
 
80
67
  **Examples:**
81
- 1. **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
82
- **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
83
-
68
+ **Response:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
69
+ **Ground Truth:** "Flu shot can prevent flu-related illnesses. Stay healthy by proper hydration and moderate exercise. Even a few hours of exercise per week can have long-term benefits for physical and mental health. This is because physical and mental health benefits have intricate relationships through behavioral changes. Scientists are starting to discover them through rigorous studies."
84
70
 
85
71
 
86
72
  # Data
@@ -89,11 +75,10 @@ Ground Truth: {{ground_truth}}
89
75
 
90
76
 
91
77
  # Tasks
92
- ## Please provide your assessment Score for the previous answer. Your output should include the following information:
93
- - **ThoughtChain**: To improve the reasoning process, Think Step by Step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and Start your ThoughtChain with "Let's think step by step:".
78
+ ## Please provide your assessment Score for the previous RESPONSE in relation to the GROUND TRUTH based on the Definitions above. Your output should include the following information:
79
+ - **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
94
80
  - **Explanation**: a very short explanation of why you think the input data should get that Score.
95
- - **Score**: based on your previous analysis, provide your Score. The answer you give MUST be a integer score ("1", "2", ...) based on the categories of the definitions.
96
-
81
+ - **Score**: based on your previous analysis, provide your Score. The Score you give MUST be an integer score (i.e., "1", "2"...) based on the levels of the definitions.
97
82
 
98
83
  ## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your score</S2>.
99
84
  # Output
@@ -45,6 +45,16 @@ class RetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
45
45
  :dedent: 8
46
46
  :caption: Initialize and call a RetrievalEvaluator.
47
47
 
48
+ .. admonition:: Example using Azure AI Project URL:
49
+
50
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
51
+ :start-after: [START retrieval_evaluator]
52
+ :end-before: [END retrieval_evaluator]
53
+ :language: python
54
+ :dedent: 8
55
+ :caption: Initialize and call RetrievalEvaluator using Azure AI Project URL in the following format
56
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
57
+
48
58
  .. admonition:: Example with Threshold:
49
59
 
50
60
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -70,6 +70,16 @@ class RougeScoreEvaluator(EvaluatorBase):
70
70
  :dedent: 8
71
71
  :caption: Initialize and call a RougeScoreEvaluator with a four-gram rouge type.
72
72
 
73
+ .. admonition:: Example using Azure AI Project URL:
74
+
75
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
76
+ :start-after: [START rouge_score_evaluator]
77
+ :end-before: [END rouge_score_evaluator]
78
+ :language: python
79
+ :dedent: 8
80
+ :caption: Initialize and call RougeScoreEvaluator using Azure AI Project URL in the following format
81
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
82
+
73
83
  .. admonition:: Example with threshold:
74
84
 
75
85
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -41,6 +41,16 @@ class GroundednessProEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
41
41
  :dedent: 8
42
42
  :caption: Initialize and call a GroundednessProEvaluator with a query, response, and context.
43
43
 
44
+ .. admonition:: Example using Azure AI Project URL:
45
+
46
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
47
+ :start-after: [START groundedness_pro_evaluator]
48
+ :end-before: [END groundedness_pro_evaluator]
49
+ :language: python
50
+ :dedent: 8
51
+ :caption: Initialize and call GroundednessProEvaluator using Azure AI Project URL in the following format
52
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
53
+
44
54
  .. admonition:: Example with threshold:
45
55
 
46
56
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -28,7 +28,7 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
28
28
  :param model_config: Configuration for the Azure OpenAI model.
29
29
  :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
30
30
  ~azure.ai.evaluation.OpenAIModelConfiguration]
31
- :param threshold: The threshold for the similarity evaluator. Default is 5.
31
+ :param threshold: The threshold for the similarity evaluator. Default is 3.
32
32
  :type threshold: int
33
33
 
34
34
  .. admonition:: Example:
@@ -40,6 +40,16 @@ class SimilarityEvaluator(PromptyEvaluatorBase):
40
40
  :dedent: 8
41
41
  :caption: Initialize and call a SimilarityEvaluator with a four-gram rouge type.
42
42
 
43
+ .. admonition:: Example using Azure AI Project URL:
44
+
45
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
46
+ :start-after: [START similarity_evaluator]
47
+ :end-before: [END similarity_evaluator]
48
+ :language: python
49
+ :dedent: 8
50
+ :caption: Initialize and call SimilarityEvaluator using Azure AI Project URL in the following format
51
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
52
+
43
53
  .. admonition:: Example:
44
54
 
45
55
  .. literalinclude:: ../samples/evaluation_samples_threshold.py
@@ -42,6 +42,17 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
42
42
  :language: python
43
43
  :dedent: 8
44
44
  :caption: Initialize and call an TaskAdherenceEvaluator with a query and response.
45
+
46
+ .. admonition:: Example using Azure AI Project URL:
47
+
48
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
49
+ :start-after: [START task_adherence_evaluator]
50
+ :end-before: [END task_adherence_evaluator]
51
+ :language: python
52
+ :dedent: 8
53
+ :caption: Initialize and call TaskAdherenceEvaluator using Azure AI Project URL in the following format
54
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
55
+
45
56
  """
46
57
 
47
58
  _PROMPTY_FILE = "task_adherence.prompty"
@@ -54,11 +65,14 @@ class TaskAdherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
54
65
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
55
66
 
56
67
  @override
57
- def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE):
68
+ def __init__(self, model_config, *, threshold=_DEFAULT_TASK_ADHERENCE_SCORE,
69
+ **kwargs):
58
70
  current_dir = os.path.dirname(__file__)
59
71
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
60
72
  self.threshold = threshold
61
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
73
+ super().__init__(model_config=model_config, prompty_file=prompty_path,
74
+ result_key=self._RESULT_KEY,
75
+ **kwargs)
62
76
 
63
77
  @overload
64
78
  def __call__(
@@ -45,6 +45,16 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
45
45
  :dedent: 8
46
46
  :caption: Initialize and call a ToolCallAccuracyEvaluator.
47
47
 
48
+ .. admonition:: Example using Azure AI Project URL:
49
+
50
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
51
+ :start-after: [START tool_call_accuracy_evaluator]
52
+ :end-before: [END tool_call_accuracy_evaluator]
53
+ :language: python
54
+ :dedent: 8
55
+ :caption: Initialize and call ToolCallAccuracyEvaluator using Azure AI Project URL in the following format
56
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
57
+
48
58
  .. note::
49
59
 
50
60
  To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
@@ -64,11 +74,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
64
74
  """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
65
75
 
66
76
  @override
67
- def __init__(self, model_config, *, threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE):
77
+ def __init__(self, model_config, *,
78
+ threshold=_DEFAULT_TOOL_CALL_ACCURACY_SCORE,
79
+ **kwargs):
68
80
  current_dir = os.path.dirname(__file__)
69
81
  prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
70
82
  self.threshold = threshold
71
- super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
83
+ super().__init__(model_config=model_config, prompty_file=prompty_path,
84
+ result_key=self._RESULT_KEY,
85
+ **kwargs)
72
86
 
73
87
  @overload
74
88
  def __call__(
@@ -210,12 +224,18 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
210
224
  score = math.nan
211
225
  if llm_output:
212
226
  score, reason = parse_quality_evaluator_reason_score(llm_output, valid_score_range="[0-1]")
213
- return {
214
- self._result_key: bool(float(score)),
215
- f"{self._result_key}_reason": reason,
216
- "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
217
- }
218
- return {self._result_key: float(score)}
227
+ if score >= 0 and score <= 1:
228
+ return {
229
+ self._result_key: bool(float(score)),
230
+ f"{self._result_key}_reason": reason,
231
+ "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
232
+ }
233
+ raise EvaluationException(
234
+ message="Tool call accuracy evaluator: Invalid score returned from LLM.",
235
+ blame=ErrorBlame.SYSTEM_ERROR,
236
+ category=ErrorCategory.INVALID_VALUE,
237
+ target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
238
+ )
219
239
 
220
240
  async def _real_call(self, **kwargs):
221
241
  """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -227,13 +247,55 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
227
247
  """
228
248
  # Convert inputs into list of evaluable inputs.
229
249
  eval_input_list = self._convert_kwargs_to_eval_input(**kwargs)
250
+ if len(eval_input_list) == 0:
251
+ return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
252
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
253
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
254
+ f"{self._AGGREGATE_RESULT_KEY}_reason":
255
+ "No tool calls were made.",
256
+ "per_tool_call_details": []
257
+ }
258
+
230
259
  per_turn_results = []
231
260
  # Evaluate all inputs.
232
261
  for eval_input in eval_input_list:
233
- per_turn_results.append(await self._do_eval(eval_input))
262
+ if self._is_applicable_tool(eval_input):
263
+ per_turn_results.append(await self._do_eval(eval_input))
264
+ else:
265
+ per_turn_results.append(self._not_applicable_result(eval_input))
234
266
 
235
267
  return self._aggregate_results(per_turn_results=per_turn_results)
236
268
 
269
+ def _is_applicable_tool(self, eval_input):
270
+ """Determine if a given tool should be evaluated, since we only evaluate tools that
271
+ have sufficient context available.
272
+
273
+ :type eval_input: Dict
274
+ :return: True if the tool call should be evaluated
275
+ :rtype: bool
276
+ """
277
+ tool_definition = eval_input.get("tool_definition")
278
+ if tool_definition is None or len(tool_definition) != 1:
279
+ return False
280
+ tool_type = tool_definition[0].get("type")
281
+ if tool_type is None or tool_type != "function":
282
+ return False
283
+ return True
284
+
285
+ def _not_applicable_result(self, eval_input):
286
+ """Return a result indicating that the tool call is not applicable for evaluation.
287
+
288
+ :param eval_input: The input to the evaluator.
289
+ :type eval_input: Dict
290
+ :return: A dictionary containing the result of the evaluation.
291
+ :rtype: Dict[str, Union[str, float]]
292
+ """
293
+ return {
294
+ f"{self._result_key}": self._NOT_APPLICABLE_RESULT,
295
+ f"{self._result_key}_reason": "Tool call not supported for evaluation",
296
+ "tool_call_id" : eval_input.get("tool_call").get("tool_call_id"),
297
+ }
298
+
237
299
  def _aggregate_results(self, per_turn_results):
238
300
  """Aggregate the evaluation results of each conversation turn into a single result.
239
301
 
@@ -256,11 +318,23 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
256
318
  # Go over each turn, and rotate the results into a
257
319
  # metric: List[values] format for the evals_per_turn dictionary.
258
320
 
259
- score = sum([1 if per_turn_result.get(self._result_key) else 0 for per_turn_result in per_turn_results])/len(per_turn_results)
321
+ num_evaluated = len([per_turn_result for per_turn_result in per_turn_results
322
+ if per_turn_result.get(self._result_key) != self._NOT_APPLICABLE_RESULT])
323
+ if num_evaluated == 0:
324
+ # None of the invoked tools were applicable, return not applicable result
325
+ # (If a tool fails evaluation, we'll throw an exception)
326
+ return {self._AGGREGATE_RESULT_KEY: self._NOT_APPLICABLE_RESULT,
327
+ f"{self._AGGREGATE_RESULT_KEY}_result": self._NOT_APPLICABLE_RESULT,
328
+ f"{self._AGGREGATE_RESULT_KEY}_threshold": self.threshold,
329
+ f"{self._AGGREGATE_RESULT_KEY}_reason":
330
+ "Tool call accuracy evaluation is not yet supported for the invoked tools.",
331
+ "per_tool_call_details": []
332
+ }
333
+ # ignore not_applicable results, where the _result_key will be "not applicable"
334
+ score = sum([per_turn_result.get(self._result_key) == True for per_turn_result in per_turn_results])/num_evaluated
260
335
  aggregated[self._AGGREGATE_RESULT_KEY] = score
261
- aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = 'pass' if score >= self.threshold else 'fail'
336
+ aggregated[f'{self._AGGREGATE_RESULT_KEY}_result'] = self._PASS_RESULT if score >= self.threshold else self._FAIL_RESULT
262
337
  aggregated[f'{self._AGGREGATE_RESULT_KEY}_threshold'] = self.threshold
263
-
264
338
  aggregated["per_tool_call_details"] = per_turn_results
265
339
  return aggregated
266
340
 
@@ -41,6 +41,16 @@ class UngroundedAttributesEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
41
41
  :dedent: 8
42
42
  :caption: Initialize and call an UngroundedAttributesEvaluator with a query, response and context.
43
43
 
44
+ .. admonition:: Example using Azure AI Project URL:
45
+
46
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
47
+ :start-after: [START ungrounded_attributes_evaluator]
48
+ :end-before: [END ungrounded_attributes_evaluator]
49
+ :language: python
50
+ :dedent: 8
51
+ :caption: Initialize and call UngroundedAttributesEvaluator using Azure AI Project URL in the following format
52
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
53
+
44
54
  .. note::
45
55
 
46
56
  If this evaluator is supplied to the `evaluate` function, the metric
@@ -54,6 +54,17 @@ class IndirectAttackEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
54
54
  :language: python
55
55
  :dedent: 8
56
56
  :caption: Initialize and call an IndirectAttackEvaluator.
57
+
58
+ .. admonition:: Example using Azure AI Project URL:
59
+
60
+ .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
61
+ :start-after: [START indirect_attack_evaluator]
62
+ :end-before: [END indirect_attack_evaluator]
63
+ :language: python
64
+ :dedent: 8
65
+ :caption: Initialize and call IndirectAttackEvaluator using Azure AI Project URL in the following format
66
+ https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
67
+
57
68
  """
58
69
 
59
70
  id = "azureml://registries/azureml/models/Indirect-Attack-Evaluator/versions/3"
@@ -38,6 +38,7 @@ class ErrorCategory(Enum):
38
38
  FAILED_REMOTE_TRACKING = "FAILED REMOTE TRACKING"
39
39
  PROJECT_ACCESS_ERROR = "PROJECT ACCESS ERROR"
40
40
  UNKNOWN = "UNKNOWN"
41
+ UPLOAD_ERROR = "UPLOAD ERROR"
41
42
 
42
43
 
43
44
  class ErrorBlame(Enum):
@@ -85,6 +86,7 @@ class ErrorTarget(Enum):
85
86
  CONVERSATION = "Conversation"
86
87
  TOOL_CALL_ACCURACY_EVALUATOR = "ToolCallAccuracyEvaluator"
87
88
  RED_TEAM = "RedTeam"
89
+ AOAI_GRADER = "AoaiGrader"
88
90
 
89
91
 
90
92
  class EvaluationException(AzureError):
@@ -5,17 +5,3 @@
5
5
  # NOTE: This contains adapters that make the Promptflow dependency optional. In the first phase,
6
6
  # Promptflow will still be installed as part of the azure-ai-evaluation dependencies. This
7
7
  # will be removed in the future once the code migration is complete.
8
-
9
- from typing import Final
10
-
11
-
12
- _has_legacy = False
13
- try:
14
- from promptflow.client import PFClient
15
-
16
- _has_legacy = True
17
- except ImportError:
18
- pass
19
-
20
- HAS_LEGACY_SDK: Final[bool] = _has_legacy
21
- MISSING_LEGACY_SDK: Final[bool] = not _has_legacy
@@ -0,0 +1,17 @@
1
+ # ---------------------------------------------------------
2
+ # Copyright (c) Microsoft Corporation. All rights reserved.
3
+ # ---------------------------------------------------------
4
+
5
+ from typing import Final
6
+
7
+
8
+ _has_legacy = False
9
+ try:
10
+ from promptflow._constants import FlowType
11
+
12
+ _has_legacy = True
13
+ except ImportError:
14
+ pass
15
+
16
+ HAS_LEGACY_SDK: Final[bool] = _has_legacy
17
+ MISSING_LEGACY_SDK: Final[bool] = not _has_legacy
@@ -6,7 +6,7 @@ from typing_extensions import TypeAlias
6
6
 
7
7
 
8
8
  try:
9
- from promptflow._sdk.entities._flows import AsyncPrompty as _AsyncPrompty
9
+ from promptflow.core._flow import AsyncPrompty as _AsyncPrompty
10
10
  from promptflow._sdk.entities._flows import FlexFlow as _FlexFlow
11
11
  from promptflow._sdk.entities._flows.dag import Flow as _Flow
12
12
  except ImportError: