azure-ai-evaluation 1.0.0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of azure-ai-evaluation might be problematic.
- azure/ai/evaluation/__init__.py +4 -26
- azure/ai/evaluation/_common/constants.py +2 -9
- azure/ai/evaluation/_common/rai_service.py +122 -302
- azure/ai/evaluation/_common/utils.py +35 -393
- azure/ai/evaluation/_constants.py +6 -28
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/__init__.py +2 -3
- azure/ai/evaluation/_evaluate/{_batch_run/eval_run_context.py → _batch_run_client/batch_run_context.py} +8 -25
- azure/ai/evaluation/_evaluate/{_batch_run → _batch_run_client}/code_client.py +30 -68
- azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
- azure/ai/evaluation/_evaluate/_eval_run.py +40 -117
- azure/ai/evaluation/_evaluate/_evaluate.py +255 -416
- azure/ai/evaluation/_evaluate/_telemetry/__init__.py +19 -24
- azure/ai/evaluation/_evaluate/_utils.py +47 -108
- azure/ai/evaluation/_evaluators/_bleu/_bleu.py +19 -18
- azure/ai/evaluation/_evaluators/{_retrieval → _chat}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
- azure/ai/evaluation/_evaluators/{_service_groundedness → _chat/retrieval}/__init__.py +2 -2
- azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
- azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
- azure/ai/evaluation/_evaluators/_coherence/_coherence.py +93 -78
- azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +39 -76
- azure/ai/evaluation/_evaluators/_content_safety/__init__.py +4 -0
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +68 -104
- azure/ai/evaluation/_evaluators/{_multimodal/_content_safety_multimodal_base.py → _content_safety/_content_safety_base.py} +35 -24
- azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
- azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +54 -105
- azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +52 -99
- azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +52 -101
- azure/ai/evaluation/_evaluators/_content_safety/_violence.py +51 -101
- azure/ai/evaluation/_evaluators/_eci/_eci.py +55 -45
- azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +20 -36
- azure/ai/evaluation/_evaluators/_fluency/_fluency.py +94 -76
- azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +41 -66
- azure/ai/evaluation/_evaluators/_gleu/_gleu.py +17 -15
- azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +92 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
- azure/ai/evaluation/_evaluators/_meteor/_meteor.py +27 -21
- azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +80 -89
- azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
- azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
- azure/ai/evaluation/_evaluators/_qa/_qa.py +43 -25
- azure/ai/evaluation/_evaluators/_relevance/_relevance.py +101 -84
- azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +47 -78
- azure/ai/evaluation/_evaluators/_rouge/_rouge.py +27 -27
- azure/ai/evaluation/_evaluators/_similarity/_similarity.py +45 -55
- azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +5 -0
- azure/ai/evaluation/_evaluators/_xpia/xpia.py +106 -91
- azure/ai/evaluation/_exceptions.py +7 -28
- azure/ai/evaluation/_http_utils.py +134 -205
- azure/ai/evaluation/_model_configurations.py +8 -104
- azure/ai/evaluation/_version.py +1 -1
- azure/ai/evaluation/simulator/__init__.py +2 -3
- azure/ai/evaluation/simulator/_adversarial_scenario.py +1 -20
- azure/ai/evaluation/simulator/_adversarial_simulator.py +95 -116
- azure/ai/evaluation/simulator/_constants.py +1 -11
- azure/ai/evaluation/simulator/_conversation/__init__.py +13 -14
- azure/ai/evaluation/simulator/_conversation/_conversation.py +20 -20
- azure/ai/evaluation/simulator/_direct_attack_simulator.py +68 -34
- azure/ai/evaluation/simulator/_helpers/__init__.py +1 -1
- azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +28 -31
- azure/ai/evaluation/simulator/_indirect_attack_simulator.py +95 -108
- azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +22 -70
- azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +14 -30
- azure/ai/evaluation/simulator/_model_tools/_rai_client.py +14 -25
- azure/ai/evaluation/simulator/_model_tools/_template_handler.py +24 -68
- azure/ai/evaluation/simulator/_model_tools/models.py +21 -19
- azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +10 -6
- azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +5 -6
- azure/ai/evaluation/simulator/_tracing.py +28 -25
- azure/ai/evaluation/simulator/_utils.py +13 -34
- azure/ai/evaluation/simulator/simulator.py +579 -0
- azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
- azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
- azure/ai/evaluation/_common/_experimental.py +0 -172
- azure/ai/evaluation/_common/math.py +0 -89
- azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +0 -99
- azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +0 -46
- azure/ai/evaluation/_evaluators/_common/__init__.py +0 -13
- azure/ai/evaluation/_evaluators/_common/_base_eval.py +0 -344
- azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +0 -88
- azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +0 -133
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +0 -113
- azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +0 -99
- azure/ai/evaluation/_evaluators/_multimodal/__init__.py +0 -20
- azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +0 -132
- azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +0 -124
- azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +0 -100
- azure/ai/evaluation/_evaluators/_multimodal/_violence.py +0 -100
- azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +0 -112
- azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +0 -93
- azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +0 -148
- azure/ai/evaluation/_vendor/__init__.py +0 -3
- azure/ai/evaluation/_vendor/rouge_score/__init__.py +0 -14
- azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +0 -328
- azure/ai/evaluation/_vendor/rouge_score/scoring.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenize.py +0 -63
- azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +0 -53
- azure/ai/evaluation/simulator/_data_sources/__init__.py +0 -3
- azure/ai/evaluation/simulator/_data_sources/grounding.json +0 -1150
- azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
- azure/ai/evaluation/simulator/_simulator.py +0 -716
- azure_ai_evaluation-1.0.0.dist-info/METADATA +0 -595
- azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +0 -70
- azure_ai_evaluation-1.0.0.dist-info/RECORD +0 -119
- {azure_ai_evaluation-1.0.0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/top_level.txt +0 -0
@@ -4,17 +4,13 @@
 import inspect
 import json
 import logging
-import os
-from concurrent.futures import Future
-from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Union, cast

 import pandas as pd
-from promptflow.contracts.types import AttrDict
-from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor

+from promptflow.contracts.types import AttrDict
 from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
-from
+from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

 from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT

@@ -22,43 +18,35 @@ LOGGER = logging.getLogger(__name__)


 class CodeRun:
-    def __init__(
-        self,
-        *,
-        run: Future,
-        input_data,
-        evaluator_name: Optional[str] = None,
-        aggregator: Callable[["CodeRun"], Future],
-        **kwargs,  # pylint: disable=unused-argument
-    ) -> None:
+    def __init__(self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs):
         self.run = run
         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
         self.input_data = input_data
-        self.aggregated_metrics =
+        self.aggregated_metrics = aggregated_metrics

-    def get_result_df(self, exclude_inputs
+    def get_result_df(self, exclude_inputs=False):
         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-        result_df =
+        result_df = self.run.result(timeout=batch_run_timeout)
         if exclude_inputs:
             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
         return result_df

-    def get_aggregated_metrics(self)
+    def get_aggregated_metrics(self):
         try:
             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
-            aggregated_metrics
-
+            aggregated_metrics = (
+                self.aggregated_metrics.result(timeout=batch_run_timeout)
                 if self.aggregated_metrics is not None
                 else None
             )
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug("Error calculating metrics for evaluator
+            LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}")
             aggregated_metrics = None

         if not isinstance(aggregated_metrics, dict):
             LOGGER.warning(
-                "Aggregated metrics for evaluator
-
+                f"Aggregated metrics for evaluator {self.evaluator_name}"
+                f" is not a dictionary will not be logged as metrics"
             )

         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
@@ -66,15 +54,11 @@ class CodeRun:
         return aggregated_metrics


-class CodeClient:
-    def __init__(
-        self,
-    ) -> None:
+class CodeClient:
+    def __init__(self):
         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")

-    def _calculate_metric(
-        self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
-    ) -> pd.DataFrame:
+    def _calculate_metric(self, evaluator, input_df, column_mapping, evaluator_name):
         row_metric_futures = []
         row_metric_results = []
         input_df = _apply_column_mapping(input_df, column_mapping)
@@ -111,10 +95,10 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
             verify_integrity=True,
         )

-
-    def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
+    def _calculate_aggregations(self, evaluator, run):
         try:
             if _has_aggregator(evaluator):
+                aggregate_input = None
                 evaluator_output = run.get_result_df(exclude_inputs=True)
                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
                     aggregate_input = evaluator_output["output"].tolist()
@@ -126,25 +110,18 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
                 return aggregated_output
         except Exception as ex:  # pylint: disable=broad-exception-caught
             LOGGER.warning(
-                "Error calculating aggregations for evaluator
+                f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}"
             )
         return None

-    def run(
-        self,  # pylint: disable=unused-argument
-        flow: Callable,
-        data: Union[os.PathLike, Path, pd.DataFrame],
-        evaluator_name: Optional[str] = None,
-        column_mapping: Optional[Dict[str, str]] = None,
-        **kwargs,
-    ) -> CodeRun:
+    def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs):
         input_df = data
         if not isinstance(input_df, pd.DataFrame):
             try:
                 json_data = load_jsonl(data)
             except json.JSONDecodeError as exc:
                 raise EvaluationException(
-                    message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+                    message = f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
                     internal_message="Failed to parse data as JSON",
                     target=ErrorTarget.CODE_CLIENT,
                     category=ErrorCategory.INVALID_VALUE,
@@ -152,37 +129,22 @@ class CodeClient: # pylint: disable=client-accepts-api-version-keyword
                 ) from exc

             input_df = pd.DataFrame(json_data)
-        eval_future = self._thread_pool.submit(
-
-
-
-
-            evaluator_name=evaluator_name,
-        )
+        eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name)
+        run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
+        aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
+        run.aggregated_metrics = aggregation_future
+        return run

-
-            run=eval_future,
-            input_data=data,
-            evaluator_name=evaluator_name,
-            aggregator=lambda code_run: self._thread_pool.submit(
-                self._calculate_aggregations, evaluator=flow, run=code_run
-            ),
-        )
-
-    def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
+    def get_details(self, run, all_results=False):
         result_df = run.get_result_df(exclude_inputs=not all_results)
         return result_df

-    def get_metrics(self, run
+    def get_metrics(self, run):
         try:
             aggregated_metrics = run.get_aggregated_metrics()
             print("Aggregated metrics")
             print(aggregated_metrics)
         except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug("Error calculating metrics for evaluator
-            return
+            LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}")
+            return None
         return aggregated_metrics
-
-    def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
-        # Not implemented
-        return None
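A rough sketch of how the 1.0.0b1 `CodeClient` shown above is driven: `run` takes a plain callable evaluator plus a DataFrame or a JSONL path, and `get_details`/`get_metrics` block on the thread-pool futures. The evaluator and data below are illustrative, and the import path simply mirrors the `_batch_run_client` layout from the file list; it is a sketch, not the package's documented public API.

```python
import pandas as pd

from azure.ai.evaluation._evaluate._batch_run_client.code_client import CodeClient


def exact_match(response, ground_truth):
    # Hypothetical row-level evaluator; any callable returning a dict of scores works.
    return {"exact_match": float(response == ground_truth)}


data = pd.DataFrame([
    {"response": "4", "ground_truth": "4"},
    {"response": "5", "ground_truth": "4"},
])

client = CodeClient()
run = client.run(flow=exact_match, data=data, evaluator_name="exact_match")
print(client.get_details(run))   # per-row results as a DataFrame
print(client.get_metrics(run))   # aggregated metrics ({} when the evaluator defines no aggregator)
```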
@@ -0,0 +1,61 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+import inspect
+import logging
+import os
+
+import numpy as np
+
+from promptflow.client import PFClient
+from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ProxyRun:
+    def __init__(self, run, **kwargs):
+        self.run = run
+
+
+class ProxyClient:
+    def __init__(self, pf_client: PFClient):
+        self._pf_client = pf_client
+        self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+    def run(self, flow, data, column_mapping=None, **kwargs):
+        flow_to_run = flow
+        if hasattr(flow, "_to_async"):
+            flow_to_run = flow._to_async()
+
+        batch_use_async = self._should_batch_use_async(flow_to_run)
+        eval_future = self._thread_pool.submit(
+            self._pf_client.run,
+            flow_to_run,
+            data=data,
+            column_mapping=column_mapping,
+            batch_use_async=batch_use_async,
+            **kwargs
+        )
+        return ProxyRun(run=eval_future)
+
+    def get_details(self, proxy_run, all_results=False):
+        run = proxy_run.run.result()
+        result_df = self._pf_client.get_details(run, all_results=all_results)
+        result_df.replace("(Failed)", np.nan, inplace=True)
+        return result_df
+
+    def get_metrics(self, proxy_run):
+        run = proxy_run.run.result()
+        return self._pf_client.get_metrics(run)
+
+    @staticmethod
+    def _should_batch_use_async(flow):
+        if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+            if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+                return True
+            elif inspect.iscoroutinefunction(flow):
+                return True
+            else:
+                return False
+        return False
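The new `ProxyClient` above (the 61 added lines of `_batch_run_client/proxy_client.py` in the file list) defers batch execution to promptflow's `PFClient` on a background thread and wraps the pending run in a `ProxyRun`. A minimal driver might look like the sketch below; the evaluator, the `data.jsonl` file, and the column mapping values are placeholders, not part of the package.

```python
from promptflow.client import PFClient

from azure.ai.evaluation._evaluate._batch_run_client.proxy_client import ProxyClient


def relevance_evaluator(*, query: str, response: str):
    # Placeholder evaluator; a real one would score the response against the query.
    return {"length_ratio": len(response) / max(len(query), 1)}


client = ProxyClient(PFClient())
proxy_run = client.run(
    flow=relevance_evaluator,
    data="data.jsonl",  # JSON-lines file with "query" and "response" fields (illustrative)
    column_mapping={"query": "${data.query}", "response": "${data.response}"},
)

details = client.get_details(proxy_run)  # per-line outputs; "(Failed)" cells become NaN
metrics = client.get_metrics(proxy_run)  # metrics computed by the underlying promptflow run
```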
@@ -8,20 +8,17 @@ import logging
 import os
 import posixpath
 import time
-import types
 import uuid
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Set
 from urllib.parse import urlparse

-from
-from
+from azure.core.pipeline.policies import RetryPolicy
+from azure.core.rest import HttpResponse

-from
+from promptflow._sdk.entities import Run
 from azure.ai.evaluation._http_utils import get_http_client
 from azure.ai.evaluation._version import VERSION
-from azure.
-from azure.core.rest import HttpResponse
-from azure.core.exceptions import HttpResponseError
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

 LOGGER = logging.getLogger(__name__)

@@ -29,20 +26,18 @@ LOGGER = logging.getLogger(__name__)
 # Handle optional import. The azure libraries are only present if
 # promptflow-azure is installed.
 try:
-    from azure.ai.ml import MLClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration  # pylint: disable=ungrouped-imports
     from azure.ai.ml.entities._datastore.datastore import Datastore
     from azure.storage.blob import BlobServiceClient
 except (ModuleNotFoundError, ImportError):
-
-
-
-
-
-
-
-
-    )
+    # If the above mentioned modules cannot be imported, we are running
+    # in local mode and MLClient in the constructor will be None, so
+    # we will not arrive to Azure-dependent code.
+
+    # We are logging the import failure only if debug logging level is set because:
+    # - If the project configuration was not provided this import is not needed.
+    # - If the project configuration was provided, the error will be raised by PFClient.
+    LOGGER.debug("promptflow.azure is not installed.")


 @dataclasses.dataclass
@@ -104,6 +99,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
     _SCOPE = "https://management.azure.com/.default"

     EVALUATION_ARTIFACT = "instance_results.jsonl"
+    EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl"

     def __init__(
         self,
@@ -124,8 +120,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         self._run_name = run_name
         self._promptflow_run = promptflow_run
         self._status = RunStatus.NOT_STARTED
-        self._url_base
-        self.
+        self._url_base = None
+        self.info = None

     @property
     def status(self) -> RunStatus:
@@ -137,20 +133,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         return self._status

-    @property
-    def info(self) -> RunInfo:
-        if self._info is None:
-            msg = "Run info is missing"
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVAL_RUN,
-                category=ErrorCategory.UNKNOWN,
-                blame=ErrorBlame.UNKNOWN,
-            )
-
-        return self._info
-
     def _get_scope(self) -> str:
         """
         Return the scope information for the workspace.
@@ -178,14 +160,12 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             )
             self._url_base = None
             self._status = RunStatus.BROKEN
-            self.
+            self.info = RunInfo.generate(self._run_name)
         else:
             self._url_base = urlparse(self._tracking_uri).netloc
             if self._promptflow_run is not None:
-                self.
-                    self._promptflow_run.name,
-                    self._promptflow_run._experiment_name,  # pylint: disable=protected-access
-                    self._promptflow_run.name,
+                self.info = RunInfo(
+                    self._promptflow_run.name, self._promptflow_run._experiment_name, self._promptflow_run.name
                 )
             else:
                 url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create"
@@ -199,17 +179,15 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                 body["run_name"] = self._run_name
                 response = self.request_with_retry(url=url, method="POST", json_dict=body)
                 if response.status_code != 200:
-                    self.
+                    self.info = RunInfo.generate(self._run_name)
                     LOGGER.warning(
-                        "The run failed to start:
-                        "The results will be saved locally, but will not be logged to Azure."
-                        response.status_code,
-                        response.text(),
+                        f"The run failed to start: {response.status_code}: {response.text()}."
+                        "The results will be saved locally, but will not be logged to Azure."
                     )
                     self._status = RunStatus.BROKEN
                 else:
                     parsed_response = response.json()
-                    self.
+                    self.info = RunInfo(
                         run_id=parsed_response["run"]["info"]["run_id"],
                         experiment_id=parsed_response["run"]["info"]["experiment_id"],
                         run_name=parsed_response["run"]["info"]["run_name"],
@@ -238,7 +216,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                 internal_message="Incorrect terminal status. Valid statuses are 'FINISHED', 'FAILED' and 'KILLED'",
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN
             )
         url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/update"
         body = {
@@ -252,7 +230,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             LOGGER.warning("Unable to terminate the run.")
         self._status = RunStatus.TERMINATED

-    def __enter__(self)
+    def __enter__(self):
         """The Context Manager enter call.

         :return: The instance of the class.
@@ -261,21 +239,8 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         self._start_run()
         return self

-    def __exit__(
-
-        exc_type: Optional[Type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_tb: Optional[types.TracebackType],
-    ) -> None:
-        """The context manager exit call.
-
-        :param exc_type: The exception type
-        :type exc_type: Optional[Type[BaseException]]
-        :param exc_value: The exception value
-        :type exc_value: Optional[BaseException]
-        :param exc_tb: The exception traceback
-        :type exc_tb: Optional[types.TracebackType]
-        """
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        """The context manager exit call."""
         self._end_run("FINISHED")

     def get_run_history_uri(self) -> str:
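Because `EvalRun` is a `contextlib.AbstractContextManager` whose `__enter__` calls `_start_run()` and whose simplified `__exit__` marks the run FINISHED, a call site looks roughly like the sketch below. The constructor argument names are inferred from the attributes this diff references (`_run_name`, `_tracking_uri`, `_subscription_id`, `_resource_group_name`, `_ml_client`) and may not match the exact signature; the workspace coordinates are placeholders.

```python
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

from azure.ai.evaluation._evaluate._eval_run import EvalRun

# Placeholder workspace coordinates; real values come from the Azure AI project.
subscription_id = "<subscription-id>"
resource_group = "<resource-group>"
workspace_name = "<workspace-name>"

ml_client = MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace_name)
tracking_uri = ml_client.workspaces.get(workspace_name).mlflow_tracking_uri

with EvalRun(
    run_name="my-evaluation",          # hypothetical run name
    tracking_uri=tracking_uri,
    subscription_id=subscription_id,
    group_name=resource_group,         # inferred parameter name
    workspace_name=workspace_name,
    ml_client=ml_client,               # None keeps the run local-only (see the import fallback above)
) as ev_run:
    ev_run.log_metric("f1_score", 0.82)
    ev_run.log_artifact("./evaluation_output")  # folder containing instance_results.jsonl
# on exit, the run is reported as FINISHED
```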
@@ -315,7 +280,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         # is an optional dependency.
         from promptflow.azure._utils._token_cache import ArmTokenCache  # pylint: disable=import-error,no-name-in-module

-        return ArmTokenCache().get_token(self._ml_client._credential)
+        return ArmTokenCache().get_token(self._ml_client._credential)

     def request_with_retry(
         self, url: str, method: str, json_dict: Dict[str, Any], headers: Optional[Dict[str, str]] = None
@@ -361,10 +326,9 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         :type response: HttpResponse
         """
         LOGGER.warning(
-            "Unable to
-
-            response.
-            response.text(),
+            f"Unable to {failed_op}, "
+            f"the request failed with status code {response.status_code}, "
+            f"{response.text()=}."
         )

     def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool:
|
|
@@ -378,8 +342,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
|
|
|
378
342
|
:type bad_states: Set[RunStatus]
|
|
379
343
|
:param should_raise: Should we raise an error if the bad state has been encountered
|
|
380
344
|
:type should_raise: bool
|
|
381
|
-
:raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True
|
|
382
|
-
and invalid state was encountered.
|
|
345
|
+
:raises: ~azure.ai.evaluations._exceptions.EvaluationException if should_raise is True and invalid state was encountered.
|
|
383
346
|
:return: Whether or not run is in the correct state.
|
|
384
347
|
:rtype: bool
|
|
385
348
|
"""
|
|
@@ -391,7 +354,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
                 internal_message=msg,
                 target=ErrorTarget.EVAL_RUN,
                 category=ErrorCategory.FAILED_EXECUTION,
-                blame=ErrorBlame.UNKNOWN
+                blame=ErrorBlame.UNKNOWN
             )
             LOGGER.warning(msg)
             return False
@@ -413,7 +376,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         """
         if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False):
             return
-        # Check if artifact
+        # Check if artifact dirrectory is empty or does not exist.
         if not os.path.isdir(artifact_folder):
             LOGGER.warning("The path to the artifact is either not a directory or does not exist.")
             return
@@ -425,7 +388,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             return
         # First we will list the files and the appropriate remote paths for them.
         root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name)
-        remote_paths
+        remote_paths = {"paths": []}
         local_paths = []
         # Go over the artifact folder and upload all artifacts.
         for root, _, filenames in os.walk(artifact_folder):
@@ -444,32 +407,15 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         datastore = self._ml_client.datastores.get_default(include_secrets=True)
         account_url = f"{datastore.account_name}.blob.{datastore.endpoint}"
         svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore))
-
-
-
-
-                    blob_client.upload_blob(fp, overwrite=True)
-        except HttpResponseError as ex:
-            if ex.status_code == 403:
-                msg = (
-                    "Failed to upload evaluation run to the cloud due to insufficient permission to access the storage."
-                    " Please ensure that the necessary access rights are granted."
-                )
-                raise EvaluationException(
-                    message=msg,
-                    target=ErrorTarget.EVAL_RUN,
-                    category=ErrorCategory.FAILED_REMOTE_TRACKING,
-                    blame=ErrorBlame.USER_ERROR,
-                    tsg_link="https://aka.ms/azsdk/python/evaluation/remotetracking/troubleshoot",
-                ) from ex
-
-            raise ex
+        for local, remote in zip(local_paths, remote_paths["paths"]):
+            blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"])
+            with open(local, "rb") as fp:
+                blob_client.upload_blob(fp, overwrite=True)

         # To show artifact in UI we will need to register it. If it is a promptflow run,
         # we are rewriting already registered artifact and need to skip this step.
         if self._is_promptflow_run:
             return
-
         url = (
             f"https://{self._url_base}/artifact/v2.0/subscriptions/{self._subscription_id}"
             f"/resourceGroups/{self._resource_group_name}/providers/"
@@ -492,29 +438,6 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
         if response.status_code != 200:
             self._log_warning("register artifact", response)

-        # register artifacts for images if exists in image folder
-        try:
-            for remote_path in remote_paths["paths"]:
-                remote_file_path = remote_path["path"]
-                if "images" in os.path.normpath(remote_file_path).split(os.sep):
-                    response = self.request_with_retry(
-                        url=url,
-                        method="POST",
-                        json_dict={
-                            "origin": "ExperimentRun",
-                            "container": f"dcid.{self.info.run_id}",
-                            "path": posixpath.join("images", os.path.basename(remote_file_path)),
-                            "dataPath": {
-                                "dataStoreName": datastore.name,
-                                "relativePath": remote_file_path,
-                            },
-                        },
-                    )
-                    if response.status_code != 200:
-                        self._log_warning("register image artifact", response)
-        except Exception as ex:  # pylint: disable=broad-exception-caught
-            LOGGER.debug("Exception occurred while registering image artifact. ex: %s", ex)
-
     def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
@@ -523,7 +446,7 @@ class EvalRun(contextlib.AbstractContextManager): # pylint: disable=too-many-in
             return credential.account_key
         if hasattr(credential, "sas_token"):
            return credential.sas_token
-        return self._ml_client.datastores._credential
+        return self._ml_client.datastores._credential

     def log_metric(self, key: str, value: float) -> None:
         """