azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- azure/ai/evaluation/__init__.py +82 -0
- azure/ai/evaluation/_common/__init__.py +16 -0
- azure/ai/evaluation/_common/_experimental.py +172 -0
- azure/ai/evaluation/_common/constants.py +72 -0
- azure/ai/evaluation/_common/math.py +89 -0
- azure/ai/evaluation/_common/rai_service.py +632 -0
- azure/ai/evaluation/_common/utils.py +445 -0
- azure/ai/evaluation/_constants.py +72 -0
- azure/ai/evaluation/_evaluate/__init__.py +3 -0
- azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
- azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
- azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
- azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
- azure/ai/evaluation/_evaluate/_utils.py +298 -0
- azure/ai/evaluation/_evaluators/__init__.py +3 -0
- azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
- azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
- azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
- azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
- azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
- azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
- azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
- azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
- azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
- azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
- azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
- azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
- azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
- azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
- azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
- azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
- azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
- azure/ai/evaluation/_exceptions.py +128 -0
- azure/ai/evaluation/_http_utils.py +466 -0
- azure/ai/evaluation/_model_configurations.py +123 -0
- azure/ai/evaluation/_user_agent.py +6 -0
- azure/ai/evaluation/_vendor/__init__.py +3 -0
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
- azure/ai/evaluation/_version.py +5 -0
- azure/ai/evaluation/py.typed +0 -0
- azure/ai/evaluation/simulator/__init__.py +16 -0
- azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
- azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
- azure/ai/evaluation/simulator/_constants.py +27 -0
- azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
- azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
- azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
- azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
- azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
- azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
- azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
- azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
- azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
- azure/ai/evaluation/simulator/_simulator.py +716 -0
- azure/ai/evaluation/simulator/_tracing.py +89 -0
- azure/ai/evaluation/simulator/_utils.py +132 -0
- azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
- azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
- {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
- azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
- azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
- azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
- azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_batch_run/code_client.py
@@ -0,0 +1,188 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import inspect
import json
import logging
import os
from concurrent.futures import Future
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Union, cast

import pandas as pd
from promptflow.contracts.types import AttrDict
from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException

from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT

LOGGER = logging.getLogger(__name__)


class CodeRun:
    def __init__(
        self,
        *,
        run: Future,
        input_data,
        evaluator_name: Optional[str] = None,
        aggregator: Callable[["CodeRun"], Future],
        **kwargs,  # pylint: disable=unused-argument
    ) -> None:
        self.run = run
        self.evaluator_name = evaluator_name if evaluator_name is not None else ""
        self.input_data = input_data
        self.aggregated_metrics = aggregator(self)

    def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
        batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
        result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
        if exclude_inputs:
            result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
        return result_df

    def get_aggregated_metrics(self) -> Dict[str, Any]:
        try:
            batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
            aggregated_metrics: Optional[Any] = (
                cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
                if self.aggregated_metrics is not None
                else None
            )
        except Exception as ex:  # pylint: disable=broad-exception-caught
            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
            aggregated_metrics = None

        if not isinstance(aggregated_metrics, dict):
            LOGGER.warning(
                "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
                self.evaluator_name,
            )

        aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}

        return aggregated_metrics


class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
        self,
    ) -> None:
        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

    def _calculate_metric(
        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
    ) -> pd.DataFrame:
        row_metric_futures = []
        row_metric_results = []
        input_df = _apply_column_mapping(input_df, column_mapping)
        # Ignoring args and kwargs from the signature since they are usually catching extra arguments
        parameters = {
            param.name
            for param in inspect.signature(evaluator).parameters.values()
            if param.name not in ["args", "kwargs"]
        }
        for value in input_df.to_dict("records"):
            # Filter out only the parameters that are present in the input data
            # if no parameters then pass data as is
            filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
            row_metric_futures.append(self._thread_pool.submit(evaluator, **filtered_values))

        for row_number, row_metric_future in enumerate(row_metric_futures):
            try:
                result = row_metric_future.result()
                if not isinstance(result, dict):
                    result = {"output": result}
                row_metric_results.append(result)
            except Exception as ex:  # pylint: disable=broad-except
                msg_1 = f"Error calculating value for row {row_number} for metric {evaluator_name}, "
                msg_2 = f"failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}"
                LOGGER.info(msg_1 + msg_2)
                # If a row fails to calculate, add an empty dict to maintain the row index
                # This is to ensure the output dataframe has the same number of rows as the input dataframe
                # pd concat will fill NaN for missing values
                row_metric_results.append({})

        return pd.concat(
            [input_df.add_prefix("inputs."), pd.DataFrame(row_metric_results)],
            axis=1,
            verify_integrity=True,
        )

    @staticmethod
    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
        try:
            if _has_aggregator(evaluator):
                evaluator_output = run.get_result_df(exclude_inputs=True)
                if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                    aggregate_input = evaluator_output["output"].tolist()
                else:
                    aggregate_input = [AttrDict(item) for item in evaluator_output.to_dict("records")]

                aggr_func = getattr(evaluator, "__aggregate__")
                aggregated_output = aggr_func(aggregate_input)
                return aggregated_output
        except Exception as ex:  # pylint: disable=broad-exception-caught
            LOGGER.warning(
                "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
            )
        return None

    def run(
        self,  # pylint: disable=unused-argument
        flow: Callable,
        data: Union[os.PathLike, Path, pd.DataFrame],
        evaluator_name: Optional[str] = None,
        column_mapping: Optional[Dict[str, str]] = None,
        **kwargs,
    ) -> CodeRun:
        input_df = data
        if not isinstance(input_df, pd.DataFrame):
            try:
                json_data = load_jsonl(data)
            except json.JSONDecodeError as exc:
                raise EvaluationException(
                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                    internal_message="Failed to parse data as JSON",
                    target=ErrorTarget.CODE_CLIENT,
                    category=ErrorCategory.INVALID_VALUE,
                    blame=ErrorBlame.USER_ERROR,
                ) from exc

            input_df = pd.DataFrame(json_data)
        eval_future = self._thread_pool.submit(
            self._calculate_metric,
            evaluator=flow,
            input_df=input_df,
            column_mapping=column_mapping,
            evaluator_name=evaluator_name,
        )

        return CodeRun(
            run=eval_future,
            input_data=data,
            evaluator_name=evaluator_name,
            aggregator=lambda code_run: self._thread_pool.submit(
                self._calculate_aggregations, evaluator=flow, run=code_run
            ),
        )

    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
        result_df = run.get_result_df(exclude_inputs=not all_results)
        return result_df

    def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
        try:
            aggregated_metrics = run.get_aggregated_metrics()
            print("Aggregated metrics")
            print(aggregated_metrics)
        except Exception as ex:  # pylint: disable=broad-exception-caught
            LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
            return {}
        return aggregated_metrics

    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
        # Not implemented
        return None
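For context, here is a minimal sketch of how the CodeClient above can be driven directly with a plain-function evaluator. CodeClient is a private helper that the package's public evaluate() API normally wraps, so the direct import and use below are illustrative assumptions rather than a documented entry point; the exact_match evaluator and sample data are made up for the example.

# Hypothetical sketch: driving CodeClient directly with a plain-function evaluator.
import pandas as pd

from azure.ai.evaluation._evaluate._batch_run.code_client import CodeClient


def exact_match(*, response: str, ground_truth: str) -> dict:
    """Toy evaluator: 1.0 when the response matches the ground truth exactly."""
    return {"exact_match": 1.0 if response == ground_truth else 0.0}


data = pd.DataFrame(
    [
        {"response": "Paris", "ground_truth": "Paris"},
        {"response": "Lyon", "ground_truth": "Paris"},
    ]
)

client = CodeClient()
run = client.run(flow=exact_match, data=data, evaluator_name="exact_match")
print(client.get_details(run, all_results=True))  # per-row results; input columns get an "inputs." prefix
print(client.get_metrics(run))                    # {} here, since exact_match defines no __aggregate__

Because the evaluator's signature is inspected, only the columns matching its parameter names are passed to each call; extra columns in the data are simply ignored.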
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
@@ -0,0 +1,89 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import types
from typing import Optional, Type, Union

from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
from promptflow._utils.user_agent_utils import ClientUserAgentUtil
from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api

from azure.ai.evaluation._constants import (
    OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
    OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
    PF_BATCH_TIMEOUT_SEC,
    PF_BATCH_TIMEOUT_SEC_DEFAULT,
    PF_DISABLE_TRACING,
)

from ..._user_agent import USER_AGENT
from .._utils import set_event_loop_policy
from .code_client import CodeClient
from .proxy_client import ProxyClient


class EvalRunContext:
    """Context manager for eval batch run.

    :param client: The client to run in the context.
    :type client: Union[
        ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
        ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
    ]
    """

    def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
        self.client = client
        self._is_batch_timeout_set_by_system = False
        self._is_otel_timeout_set_by_system = False
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Preserve current working directory, as PF may change it without restoring it afterward
        self._original_cwd = os.getcwd()

        if isinstance(self.client, CodeClient):
            ClientUserAgentUtil.append_user_agent(USER_AGENT)
            inject_openai_api()

        if isinstance(self.client, ProxyClient):
            os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
            os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
            os.environ[PF_DISABLE_TRACING] = "true"

            if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
                os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
                self._is_batch_timeout_set_by_system = True

            # For dealing with the timeout issue of OpenTelemetry exporter when multiple evaluators are running
            if os.environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) is None:
                os.environ[OTEL_EXPORTER_OTLP_TRACES_TIMEOUT] = str(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT)
                self._is_otel_timeout_set_by_system = True

            # For addressing the issue of asyncio event loop closed on Windows
            set_event_loop_policy()

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)

        if isinstance(self.client, CodeClient):
            recover_openai_api()

        if isinstance(self.client, ProxyClient):
            os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
            os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
            os.environ.pop(PF_DISABLE_TRACING, None)

            if self._is_batch_timeout_set_by_system:
                os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
                self._is_batch_timeout_set_by_system = False

            if self._is_otel_timeout_set_by_system:
                os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
                self._is_otel_timeout_set_by_system = False
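A hedged sketch of how EvalRunContext is intended to be used: while the context is active, the promptflow environment (PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS, PF_DISABLE_TRACING, batch and OTEL timeouts) is configured for a ProxyClient run and then restored on exit. The length_evaluator function and the "eval_input.jsonl" path are placeholders invented for illustration, not part of the package.

# Illustrative sketch, assuming the module paths from the file listing above.
from promptflow.client import PFClient

from azure.ai.evaluation._evaluate._batch_run.eval_run_context import EvalRunContext
from azure.ai.evaluation._evaluate._batch_run.proxy_client import ProxyClient


def length_evaluator(response: str) -> dict:
    """Toy evaluator used only to illustrate the call shape."""
    return {"length": len(response)}


client = ProxyClient(PFClient())
with EvalRunContext(client):
    proxy_run = client.run(
        flow=length_evaluator,                            # the type hint also allows a flow path
        data="eval_input.jsonl",                          # placeholder JSON-lines input file
        column_mapping={"response": "${data.response}"},
    )
    details = client.get_details(proxy_run)               # per-line results as a DataFrame
    summary = client.get_run_summary(proxy_run)            # status, duration, completed/failed lines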
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -0,0 +1,99 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

# pylint: disable=protected-access

import inspect
import logging
import math
import os
from collections import OrderedDict
from concurrent.futures import Future
from typing import Any, Callable, Dict, Optional, Union

import pandas as pd
from promptflow.client import PFClient
from promptflow.entities import Run
from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

LOGGER = logging.getLogger(__name__)


class ProxyRun:
    def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
        self.run = run


class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
    def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
        self, pf_client: PFClient
    ) -> None:
        self._pf_client = pf_client
        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

    def run(
        self,
        flow: Union[str, os.PathLike, Callable],
        data: Union[str, os.PathLike],
        column_mapping: Optional[Dict[str, str]] = None,
        **kwargs
    ) -> ProxyRun:
        flow_to_run = flow
        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
            flow_to_run = flow._to_async()  # pylint: disable=protected-access

        batch_use_async = self._should_batch_use_async(flow_to_run)
        eval_future = self._thread_pool.submit(
            self._pf_client.run,
            flow_to_run,
            data=data,
            column_mapping=column_mapping,
            batch_use_async=batch_use_async,
            **kwargs
        )
        return ProxyRun(run=eval_future)

    def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
        run: Run = proxy_run.run.result()
        result_df = self._pf_client.get_details(run, all_results=all_results)
        result_df.replace("(Failed)", math.nan, inplace=True)
        return result_df

    def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
        run: Run = proxy_run.run.result()
        return self._pf_client.get_metrics(run)

    def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
        run = proxy_run.run.result()

        # pylint: disable=protected-access
        completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
        failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")

        # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
        if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
            status = "Completed with Errors"
        else:
            status = run.status

        # Return the ordered dictionary with the updated status
        return OrderedDict(
            [
                ("status", status),
                ("duration", str(run._end_time - run._created_on)),
                ("completed_lines", completed_lines),
                ("failed_lines", failed_lines),
                ("log_path", str(run._output_path)),
            ]
        )

    @staticmethod
    def _should_batch_use_async(flow):
        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
            if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                return True
            if inspect.iscoroutinefunction(flow):
                return True
            return False
        return False
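The async dispatch in ProxyClient is worth calling out. The sketch below exercises the _should_batch_use_async staticmethod shown above to show how the AI_EVALS_BATCH_USE_ASYNC environment variable and coroutine detection decide whether batch_use_async is passed to pf.run; the two toy evaluators are assumptions made for illustration and no promptflow run is started.

# Sketch: checking the async dispatch rule without submitting any run.
import asyncio
import os

from azure.ai.evaluation._evaluate._batch_run.proxy_client import ProxyClient


def sync_eval(response: str) -> dict:
    return {"length": len(response)}


async def async_eval(response: str) -> dict:
    await asyncio.sleep(0)
    return {"length": len(response)}


os.environ.pop("AI_EVALS_BATCH_USE_ASYNC", None)          # unset: the default is "true"
print(ProxyClient._should_batch_use_async(sync_eval))     # False: not a coroutine function
print(ProxyClient._should_batch_use_async(async_eval))    # True: coroutine function

os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"
print(ProxyClient._should_batch_use_async(async_eval))    # False: opted out via env var
os.environ.pop("AI_EVALS_BATCH_USE_ASYNC", None)           # tidy up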
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -0,0 +1,46 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
import types
from typing import Optional, Type

from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
from azure.ai.evaluation._constants import PF_DISABLE_TRACING


class TargetRunContext:
    """Context manager for target batch run.

    :param upload_snapshot: Whether to upload target snapshot.
    :type upload_snapshot: bool
    """

    def __init__(self, upload_snapshot: bool) -> None:
        self._upload_snapshot = upload_snapshot
        self._original_cwd = os.getcwd()

    def __enter__(self) -> None:
        # Preserve current working directory, as PF may change it without restoring it afterward
        self._original_cwd = os.getcwd()

        # Address "[WinError 32] The process cannot access the file" error,
        # caused by conflicts when the venv and target function are in the same directory.
        # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
        if not self._upload_snapshot:
            os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"

        os.environ[PF_DISABLE_TRACING] = "true"

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_value: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        os.chdir(self._original_cwd)

        if not self._upload_snapshot:
            os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)

        os.environ.pop(PF_DISABLE_TRACING, None)
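Finally, a brief sketch of the TargetRunContext contract, assuming the module path from the file listing above: the environment variables it sets are visible only inside the with block, and both the variables and the working directory are restored on exit. The asserts assume neither variable was set beforehand; a real target batch run would be submitted where the comment indicates.

# Sketch: observing the environment changes made by TargetRunContext.
import os

from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
from azure.ai.evaluation._constants import PF_DISABLE_TRACING
from azure.ai.evaluation._evaluate._batch_run.target_run_context import TargetRunContext

with TargetRunContext(upload_snapshot=False):
    assert os.environ[PF_FLOW_ENTRY_IN_TMP] == "true"   # flex entry only, no snapshot upload
    assert os.environ[PF_DISABLE_TRACING] == "true"     # tracing disabled for the target run
    # ... a target batch run would be submitted here ...

assert PF_FLOW_ENTRY_IN_TMP not in os.environ           # popped on exit
assert PF_DISABLE_TRACING not in os.environ             # popped on exit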