azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
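The file-level listing above can be reproduced locally by downloading both wheels and comparing their archives. A minimal sketch, not part of the package; the exact wheel filenames and the `pip download` invocation are assumptions about how the artifacts are fetched:

```python
# Sketch: compare the file manifests of two locally downloaded wheels.
# Assumes the wheels were fetched first, e.g.:
#   pip download azure-ai-evaluation==1.0.0   --no-deps -d wheels/
#   pip download azure-ai-evaluation==1.0.0b1 --no-deps -d wheels/
import zipfile


def wheel_files(path: str) -> set:
    """Return the set of member paths inside a wheel archive."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())


old = wheel_files("wheels/azure_ai_evaluation-1.0.0-py3-none-any.whl")
new = wheel_files("wheels/azure_ai_evaluation-1.0.0b1-py3-none-any.whl")

print("only in 1.0.0:  ", sorted(old - new))
print("only in 1.0.0b1:", sorted(new - old))
```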
azure/ai/evaluation/_evaluators/_coherence/_coherence.py

@@ -1,63 +1,113 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
 import os
-
+import re
+from typing import Union

-
+import numpy as np

-from
-from azure.ai.evaluation.
+from promptflow._utils.async_utils import async_run_allowing_running_loop
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from promptflow.core import AsyncPrompty

+from ..._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
+from ..._common.utils import (
+    check_and_add_api_version_for_aoai_model_config,
+    check_and_add_user_agent_for_aoai_model_config,
+)

-
-
-
+try:
+    from ..._user_agent import USER_AGENT
+except ImportError:
+    USER_AGENT = None
+
+
+class _AsyncCoherenceEvaluator:
+    # Constants must be defined within eval's directory to be save/loadable
+    PROMPTY_FILE = "coherence.prompty"
+    LLM_CALL_TIMEOUT = 600
+    DEFAULT_OPEN_API_VERSION = "2024-02-15-preview"
+
+    def __init__(self, model_config: dict):
+        check_and_add_api_version_for_aoai_model_config(model_config, self.DEFAULT_OPEN_API_VERSION)
+
+        prompty_model_config = {"configuration": model_config, "parameters": {"extra_headers": {}}}
+
+        # Handle "RuntimeError: Event loop is closed" from httpx AsyncClient
+        # https://github.com/encode/httpx/discussions/2959
+        prompty_model_config["parameters"]["extra_headers"].update({"Connection": "close"})
+
+        check_and_add_user_agent_for_aoai_model_config(
+            model_config,
+            prompty_model_config,
+            USER_AGENT,
+        )
+
+        current_dir = os.path.dirname(__file__)
+        prompty_path = os.path.join(current_dir, self.PROMPTY_FILE)
+        self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config)
+
+    async def __call__(self, *, query: str, response: str, **kwargs):
+        # Validate input parameters
+        query = str(query or "")
+        response = str(response or "")
+
+        if not (query.strip() and response.strip()):
+            msg = "Both 'query' and 'response' must be non-empty strings."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                error_category=ErrorCategory.INVALID_VALUE,
+                error_blame=ErrorBlame.USER_ERROR,
+                error_target=ErrorTarget.COHERENCE_EVALUATOR,
+            )
+
+        # Run the evaluation flow
+        llm_output = await self._flow(query=query, response=response, timeout=self.LLM_CALL_TIMEOUT, **kwargs)
+
+        score = np.nan
+        if llm_output:
+            match = re.search(r"\d", llm_output)
+            if match:
+                score = float(match.group())
+
+        return {"gpt_coherence": float(score)}

-
-
-
+
+class CoherenceEvaluator:
+    """
+    Initialize a coherence evaluator configured for a specific Azure OpenAI model.

     :param model_config: Configuration for the Azure OpenAI model.
     :type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
         ~azure.ai.evaluation.OpenAIModelConfiguration]

-
+    **Usage**

-
-        :start-after: [START coherence_evaluator]
-        :end-before: [END coherence_evaluator]
-        :language: python
-        :dedent: 8
-        :caption: Initialize and call a CoherenceEvaluator with a query and response.
+    .. code-block:: python

-
+        eval_fn = CoherenceEvaluator(model_config)
+        result = eval_fn(
+            query="What is the capital of Japan?",
+            response="The capital of Japan is Tokyo.")

-
-
-
-    """
+    **Output format**
+
+    .. code-block:: python

-
-
+        {
+            "gpt_coherence": 1.0
+        }
+    """

-
-
+    def __init__(self, model_config: dict):
+        self._async_evaluator = _AsyncCoherenceEvaluator(model_config)

-
-
-
-        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
-        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)
-
-    @overload
-    def __call__(
-        self,
-        *,
-        query: str,
-        response: str,
-    ) -> Dict[str, Union[str, float]]:
-        """Evaluate coherence for given input of query, response
+    def __call__(self, *, query: str, response: str, **kwargs):
+        """
+        Evaluate coherence.

         :keyword query: The query to be evaluated.
         :paramtype query: str
@@ -66,42 +116,7 @@ class CoherenceEvaluator(PromptyEvaluatorBase[Union[str, float]]):
         :return: The coherence score.
         :rtype: Dict[str, float]
         """
+        return async_run_allowing_running_loop(self._async_evaluator, query=query, response=response, **kwargs)

-
-
-        self,
-        *,
-        conversation: Conversation,
-    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
-        """Evaluate coherence for a conversation
-
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The coherence score.
-        :rtype: Dict[str, Union[float, Dict[str, List[float]]]]
-        """
-
-    @override
-    def __call__( # pylint: disable=docstring-missing-param
-        self,
-        *args,
-        **kwargs,
-    ):
-        """Evaluate coherence. Accepts either a query and response for a single evaluation,
-        or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
-        turns, the evaluator will aggregate the results of each turn.
-
-        :keyword query: The query to be evaluated.
-        :paramtype query: str
-        :keyword response: The response to be evaluated.
-        :paramtype response: Optional[str]
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages". Conversation turns are expected
-            to be dictionaries with keys "content" and "role".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The relevance score.
-        :rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
-        """
-        return super().__call__(*args, **kwargs)
+    def _to_async(self):
+        return self._async_evaluator
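Putting the 1.0.0b1 `CoherenceEvaluator` from the diff above to use follows the usage block in its own docstring. This is a sketch only: the import goes through the private module from the file listing (whether the class is re-exported at the package root is not visible in this diff), and the `model_config` keys are assumptions based on the Azure OpenAI settings that appear in the coherence.prompty diff below.

```python
# Sketch of calling the 1.0.0b1 CoherenceEvaluator shown above.
# The model_config keys (azure_endpoint / api_key / azure_deployment) are
# assumptions; substitute whatever your Azure OpenAI configuration requires.
from azure.ai.evaluation._evaluators._coherence._coherence import CoherenceEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

eval_fn = CoherenceEvaluator(model_config)
result = eval_fn(
    query="What is the capital of Japan?",
    response="The capital of Japan is Tokyo.",
)
print(result)  # e.g. {"gpt_coherence": 5.0}
```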
azure/ai/evaluation/_evaluators/_coherence/coherence.prompty

@@ -3,9 +3,14 @@ name: Coherence
 description: Evaluates coherence score for QA scenario
 model:
   api: chat
+  configuration:
+    type: azure_openai
+    azure_deployment: ${env:AZURE_DEPLOYMENT}
+    api_key: ${env:AZURE_OPENAI_API_KEY}
+    azure_endpoint: ${env:AZURE_OPENAI_ENDPOINT}
   parameters:
     temperature: 0.0
-    max_tokens:
+    max_tokens: 1
     top_p: 1.0
     presence_penalty: 0
     frequency_penalty: 0
@@ -20,80 +25,38 @@ inputs:

 ---
 system:
-
-## Goal
-### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided.
-- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score.
-- **Data**: Your input data include a QUERY and a RESPONSE.
-- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.
+You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.

 user:
- [old lines 31-62 are truncated in the source diff view and not reproduced here]
-**Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water.
-
-## [Coherence: 4] (Coherent Response)
-**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow.
-
-**Examples:**
-**Query:** What is the water cycle and how does it work?
-**Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally.
-
-**Query:** Describe the role of mitochondria in cellular function.
-**Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival.
-
-## [Coherence: 5] (Highly Coherent Response)
-**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision.
-
-**Examples:**
-**Query:** Analyze the economic impacts of climate change on coastal cities.
-**Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth.
-
-**Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy.
-**Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs.
-
-
-# Data
-QUERY: {{query}}
-RESPONSE: {{response}}
-
-
-# Tasks
-## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information:
-- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:".
-- **Explanation**: a very short explanation of why you think the input Data should get that Score.
-- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions.
-
-
-## Please provide your answers between the tags: <S0>your chain of thoughts</S0>, <S1>your explanation</S1>, <S2>your Score</S2>.
-# Output
+Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale:
+One star: the answer completely lacks coherence
+Two stars: the answer mostly lacks coherence
+Three stars: the answer is partially coherent
+Four stars: the answer is mostly coherent
+Five stars: the answer has perfect coherency
+
+This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5.
+
+question: What is your favorite indoor activity and why do you enjoy it?
+answer: I like pizza. The sun is shining.
+stars: 1
+
+question: Can you describe your favorite movie without giving away any spoilers?
+answer: It is a science fiction movie. There are dinosaurs. The actors eat cake. People must stop the villain.
+stars: 2
+
+question: What are some benefits of regular exercise?
+answer: Regular exercise improves your mood. A good workout also helps you sleep better. Trees are green.
+stars: 3
+
+question: How do you cope with stress in your daily life?
+answer: I usually go for a walk to clear my head. Listening to music helps me relax as well. Stress is a part of life, but we can manage it through some activities.
+stars: 4
+
+question: What can you tell me about climate change and its effects on the environment?
+answer: Climate change has far-reaching effects on the environment. Rising temperatures result in the melting of polar ice caps, contributing to sea-level rise. Additionally, more frequent and severe weather events, such as hurricanes and heatwaves, can cause disruption to ecosystems and human societies alike.
+stars: 5
+
+question: {{query}}
+answer: {{response}}
+stars:
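With `max_tokens: 1` and a system prompt that asks for a bare integer, the completion of `stars:` is expected to be a single digit, which `_AsyncCoherenceEvaluator` extracts with a regex (see the `_coherence.py` diff above). A small self-contained restatement of that parsing step:

```python
# Mirrors the digit-extraction logic added in _coherence.py: take the first
# digit of the model output as the score, or NaN when nothing usable comes back.
import re

import numpy as np


def parse_coherence_score(llm_output: str) -> float:
    score = np.nan
    if llm_output:
        match = re.search(r"\d", llm_output)
        if match:
            score = float(match.group())
    return score


assert parse_coherence_score("4") == 4.0
assert np.isnan(parse_coherence_score(""))
```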
azure/ai/evaluation/_evaluators/_content_safety/__init__.py

@@ -3,6 +3,8 @@
 # ---------------------------------------------------------

 from ._content_safety import ContentSafetyEvaluator
+from ._content_safety_base import ContentSafetyEvaluatorBase
+from ._content_safety_chat import ContentSafetyChatEvaluator
 from ._hate_unfairness import HateUnfairnessEvaluator
 from ._self_harm import SelfHarmEvaluator
 from ._sexual import SexualEvaluator
@@ -14,4 +16,6 @@ __all__ = [
     "SelfHarmEvaluator",
     "HateUnfairnessEvaluator",
     "ContentSafetyEvaluator",
+    "ContentSafetyChatEvaluator",
+    "ContentSafetyEvaluatorBase",
 ]
azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py

@@ -2,135 +2,99 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 from concurrent.futures import as_completed
-from typing import Callable, Dict, List, Union

 from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
-from typing_extensions import overload, override

-from azure.ai.evaluation.
-from azure.ai.evaluation._model_configurations import Conversation
-from azure.ai.evaluation._common._experimental import experimental
+from azure.ai.evaluation._model_configurations import AzureAIProject

-
-from .
-from .
-from .
+try:
+    from ._hate_unfairness import HateUnfairnessEvaluator
+    from ._self_harm import SelfHarmEvaluator
+    from ._sexual import SexualEvaluator
+    from ._violence import ViolenceEvaluator
+except ImportError:
+    from _hate_unfairness import HateUnfairnessEvaluator
+    from _self_harm import SelfHarmEvaluator
+    from _sexual import SexualEvaluator
+    from _violence import ViolenceEvaluator


-
-class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
+class ContentSafetyEvaluator:
     """
     Initialize a content safety evaluator configured to evaluate content safetry metrics for QA scenario.

-    :param credential: The credential for connecting to Azure AI project. Required
-    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
-    :param
-
+    :param parallel: If True, use parallel execution for evaluators. Else, use sequential execution.
+        Default is True.
+    :param credential: The credential for connecting to Azure AI project.
+    :type credential: ~azure.core.credentials.TokenCredential
     :return: A function that evaluates content-safety metrics for "question-answering" scenario.
-
-
-
-
-
-
-
-    :
-    :
+    :rtype: Callable
+
+    **Usage**
+
+    .. code-block:: python
+
+        azure_ai_project = {
+            "subscription_id": "<subscription_id>",
+            "resource_group_name": "<resource_group_name>",
+            "project_name": "<project_name>",
+        }
+        eval_fn = ContentSafetyEvaluator(azure_ai_project)
+        result = eval_fn(
+            query="What is the capital of France?",
+            response="Paris.",
+        )
+
+    **Output format**
+
+    .. code-block:: python
+
+        {
+            "violence": "Medium",
+            "violence_score": 5.0,
+            "violence_reason": "Some reason",
+            "sexual": "Medium",
+            "sexual_score": 5.0,
+            "sexual_reason": "Some reason",
+            "self_harm": "Medium",
+            "self_harm_score": 5.0,
+            "self_harm_reason": "Some reason",
+            "hate_unfairness": "Medium",
+            "hate_unfairness_score": 5.0,
+            "hate_unfairness_reason": "Some reason"
+        }
     """

-
-
-
-
-
-
-
-        self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [
-            ViolenceEvaluator(credential, azure_ai_project),
-            SexualEvaluator(credential, azure_ai_project),
-            SelfHarmEvaluator(credential, azure_ai_project),
-            HateUnfairnessEvaluator(credential, azure_ai_project),
+    def __init__(self, azure_ai_project: dict, parallel: bool = True, credential=None):
+        self._parallel = parallel
+        self._evaluators = [
+            ViolenceEvaluator(azure_ai_project, credential),
+            SexualEvaluator(azure_ai_project, credential),
+            SelfHarmEvaluator(azure_ai_project, credential),
+            HateUnfairnessEvaluator(azure_ai_project, credential),
         ]

-
-
-
-        *,
-        query: str,
-        response: str,
-    ) -> Dict[str, Union[str, float]]:
-        """Evaluate a collection of content safety metrics for the given query/response pair
+    def __call__(self, *, query: str, response: str, **kwargs):
+        """
+        Evaluates content-safety metrics for "question-answering" scenario.

         :keyword query: The query to be evaluated.
         :paramtype query: str
         :keyword response: The response to be evaluated.
         :paramtype response: str
-        :
-        :
-
-
-    @overload
-    def __call__(
-        self,
-        *,
-        conversation: Conversation,
-    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
-        """Evaluate a collection of content safety metrics for a conversation
-
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The content safety scores.
-        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
-        """
-
-    @override
-    def __call__( # pylint: disable=docstring-missing-param
-        self,
-        *args,
-        **kwargs,
-    ):
-        """Evaluate a collection of content safety metrics for the given query/response pair or conversation.
-        This inputs must supply either a query AND response, or a conversation, but not both.
-
-        :keyword query: The query to evaluate.
-        :paramtype query: Optional[str]
-        :keyword response: The response to evaluate.
-        :paramtype response: Optional[str]
-        :keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
-            key "messages", and potentially a global context under the key "context". Conversation turns are expected
-            to be dictionaries with keys "content", "role", and possibly "context".
-        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
-        :return: The evaluation result.
-        :rtype: Union[Dict[str, Union[str, float]], Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]]
-        """
-        return super().__call__(*args, **kwargs)
-
-    @override
-    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:
-        """Perform the evaluation using the Azure AI RAI service.
-        The exact evaluation performed is determined by the evaluation metric supplied
-        by the child class initializer.
-
-        :param eval_input: The input to the evaluation function.
-        :type eval_input: Dict
-        :return: The evaluation result.
-        :rtype: Dict
+        :keyword parallel: Whether to evaluate in parallel.
+        :paramtype parallel: bool
+        :return: The scores for content-safety.
+        :rtype: dict
         """
-
-        response = eval_input.get("response", None)
-        conversation = eval_input.get("conversation", None)
-        results: Dict[str, Union[str, float]] = {}
-        # TODO fix this to not explode on empty optional inputs (PF SKD error)
+        results = {}
         if self._parallel:
             with ThreadPoolExecutor() as executor:
-                # pylint: disable=no-value-for-parameter
                 futures = {
-                    executor.submit(query=query, response=response,
+                    executor.submit(evaluator, query=query, response=response, **kwargs): evaluator
                    for evaluator in self._evaluators
                 }
@@ -138,7 +102,7 @@ class ContentSafetyEvaluator(EvaluatorBase[Union[str, float]]):
                 results.update(future.result())
         else:
             for evaluator in self._evaluators:
-                result = evaluator(query=query, response=response,
+                result = evaluator(query=query, response=response, **kwargs)
                 results.update(result)

         return results
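Combining the constructor and `__call__` shown in the `_content_safety.py` diff gives roughly the following usage, matching the class's own docstring. This is a sketch: the project values are placeholders, and a real Azure AI project scope (plus a credential where required) is needed for the underlying RAI service calls to succeed.

```python
# Sketch of using the 1.0.0b1 ContentSafetyEvaluator shown above.
# Placeholder project values are illustrative only.
from azure.ai.evaluation._evaluators._content_safety import ContentSafetyEvaluator

azure_ai_project = {
    "subscription_id": "<subscription_id>",
    "resource_group_name": "<resource_group_name>",
    "project_name": "<project_name>",
}

# parallel=True (the default) fans the four sub-evaluators out over a thread
# pool; parallel=False runs them sequentially, as the __call__ body above shows.
eval_fn = ContentSafetyEvaluator(azure_ai_project, parallel=True)
result = eval_fn(
    query="What is the capital of France?",
    response="Paris.",
)
print(result["violence"], result["violence_score"], result["violence_reason"])
```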
azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py}

@@ -1,54 +1,65 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
+
 from abc import ABC
-
-from azure.ai.evaluation._common.
-from azure.ai.evaluation._common.
-from azure.ai.evaluation.
-from azure.
+
+from azure.ai.evaluation._common.constants import EvaluationMetrics
+from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._model_configurations import AzureAIProject


-class
+class ContentSafetyEvaluatorBase(ABC):
     """
     Initialize a evaluator for a specified Evaluation Metric. Base class that is not
     meant to be instantiated by users.

+
     :param metric: The metric to be evaluated.
     :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics
-    :param credential: The credential for connecting to Azure AI project. Required
-    :type credential: ~azure.core.credentials.TokenCredential
     :param azure_ai_project: The scope of the Azure AI project.
         It contains subscription id, resource group, and project name.
     :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
+    :param credential: The credential for connecting to Azure AI project.
+    :type credential: ~azure.core.credentials.TokenCredential
     """

-    def __init__(
-        self,
-        metric: Union[EvaluationMetrics, _InternalEvaluationMetrics],
-        credential: TokenCredential,
-        azure_ai_project,
-    ):
+    def __init__(self, metric: EvaluationMetrics, azure_ai_project: dict, credential=None):
         self._metric = metric
         self._azure_ai_project = azure_ai_project
         self._credential = credential

-    async def __call__(self, *,
+    async def __call__(self, *, query: str, response: str, **kwargs):
         """
         Evaluates content according to this evaluator's metric.
-
-
-        :paramtype
+
+        :keyword query: The query to be evaluated.
+        :paramtype query: str
+        :keyword response: The response to be evaluated.
+        :paramtype response: str
         :return: The evaluation score computation based on the Content Safety metric (self.metric).
-        :rtype:
+        :rtype: Any
         """
-        #
-
-
+        # Validate inputs
+        # Raises value error if failed, so execution alone signifies success.
+        if not (query and query.strip() and query != "None") or not (
+            response and response.strip() and response != "None"
+        ):
+            msg = "Both 'query' and 'response' must be non-empty strings."
+            raise EvaluationException(
+                message=msg,
+                internal_message=msg,
+                error_category=ErrorCategory.MISSING_FIELD,
+                error_blame=ErrorBlame.USER_ERROR,
+                error_target=ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR,
+            )
+
         # Run score computation based on supplied metric.
-        result = await
-            messages=messages,
+        result = await evaluate_with_rai_service(
             metric_name=self._metric,
+            query=query,
+            response=response,
             project_scope=self._azure_ai_project,
             credential=self._credential,
         )
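The base class above stores a metric, a project scope, and a credential, and its async `__call__` validates the inputs before delegating to `evaluate_with_rai_service`. The concrete evaluators (`ViolenceEvaluator` and friends) are not shown in this diff, so the subclass below is only an illustration of how a metric plugs into the base; the `EvaluationMetrics.VIOLENCE` member is an assumption about the constants module.

```python
# Hypothetical subclass illustrating how a concrete evaluator could build on
# ContentSafetyEvaluatorBase in 1.0.0b1. The shipped ViolenceEvaluator is not
# shown in this diff, and EvaluationMetrics.VIOLENCE is assumed to exist in
# azure.ai.evaluation._common.constants.
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._content_safety._content_safety_base import (
    ContentSafetyEvaluatorBase,
)


class ExampleViolenceEvaluator(ContentSafetyEvaluatorBase):
    def __init__(self, azure_ai_project: dict, credential=None):
        # The base stores these values and forwards them to
        # evaluate_with_rai_service inside its async __call__.
        super().__init__(
            metric=EvaluationMetrics.VIOLENCE,
            azure_ai_project=azure_ai_project,
            credential=credential,
        )


# The inherited __call__ is async, e.g.:
#   result = await ExampleViolenceEvaluator(project)(query="...", response="...")
```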