azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (122)
  1. azure/ai/evaluation/__init__.py +82 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/_experimental.py +172 -0
  4. azure/ai/evaluation/_common/constants.py +72 -0
  5. azure/ai/evaluation/_common/math.py +89 -0
  6. azure/ai/evaluation/_common/rai_service.py +632 -0
  7. azure/ai/evaluation/_common/utils.py +445 -0
  8. azure/ai/evaluation/_constants.py +72 -0
  9. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  10. azure/ai/evaluation/_evaluate/_batch_run/__init__.py +9 -0
  11. azure/ai/evaluation/_evaluate/_batch_run/code_client.py +188 -0
  12. azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py +89 -0
  13. azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py +99 -0
  14. azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py +46 -0
  15. azure/ai/evaluation/_evaluate/_eval_run.py +571 -0
  16. azure/ai/evaluation/_evaluate/_evaluate.py +850 -0
  17. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +179 -0
  18. azure/ai/evaluation/_evaluate/_utils.py +298 -0
  19. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  20. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  21. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +72 -0
  22. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  23. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +107 -0
  24. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +99 -0
  25. azure/ai/evaluation/_evaluators/_common/__init__.py +13 -0
  26. azure/ai/evaluation/_evaluators/_common/_base_eval.py +344 -0
  27. azure/ai/evaluation/_evaluators/_common/_base_prompty_eval.py +88 -0
  28. azure/ai/evaluation/_evaluators/_common/_base_rai_svc_eval.py +133 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +17 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +144 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +129 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +123 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +125 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +126 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +89 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +157 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +104 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +86 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +69 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +144 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty +113 -0
  47. azure/ai/evaluation/_evaluators/_groundedness/groundedness_without_query.prompty +99 -0
  48. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  49. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +90 -0
  50. azure/ai/evaluation/_evaluators/_multimodal/__init__.py +20 -0
  51. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py +132 -0
  52. azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py +55 -0
  53. azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py +100 -0
  54. azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py +124 -0
  55. azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py +100 -0
  56. azure/ai/evaluation/_evaluators/_multimodal/_sexual.py +100 -0
  57. azure/ai/evaluation/_evaluators/_multimodal/_violence.py +100 -0
  58. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  59. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +113 -0
  60. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_qa/_qa.py +93 -0
  62. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  63. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +114 -0
  64. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +100 -0
  65. azure/ai/evaluation/_evaluators/_retrieval/__init__.py +9 -0
  66. azure/ai/evaluation/_evaluators/_retrieval/_retrieval.py +112 -0
  67. azure/ai/evaluation/_evaluators/_retrieval/retrieval.prompty +93 -0
  68. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  69. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  70. azure/ai/evaluation/_evaluators/_service_groundedness/__init__.py +9 -0
  71. azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +148 -0
  72. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  73. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +140 -0
  74. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +66 -0
  75. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  76. azure/ai/evaluation/_evaluators/_xpia/xpia.py +125 -0
  77. azure/ai/evaluation/_exceptions.py +128 -0
  78. azure/ai/evaluation/_http_utils.py +466 -0
  79. azure/ai/evaluation/_model_configurations.py +123 -0
  80. azure/ai/evaluation/_user_agent.py +6 -0
  81. azure/ai/evaluation/_vendor/__init__.py +3 -0
  82. azure/ai/evaluation/_vendor/rouge_score/__init__.py +14 -0
  83. azure/ai/evaluation/_vendor/rouge_score/rouge_scorer.py +328 -0
  84. azure/ai/evaluation/_vendor/rouge_score/scoring.py +63 -0
  85. azure/ai/evaluation/_vendor/rouge_score/tokenize.py +63 -0
  86. azure/ai/evaluation/_vendor/rouge_score/tokenizers.py +53 -0
  87. azure/ai/evaluation/_version.py +5 -0
  88. azure/ai/evaluation/py.typed +0 -0
  89. azure/ai/evaluation/simulator/__init__.py +16 -0
  90. azure/ai/evaluation/simulator/_adversarial_scenario.py +46 -0
  91. azure/ai/evaluation/simulator/_adversarial_simulator.py +471 -0
  92. azure/ai/evaluation/simulator/_constants.py +27 -0
  93. azure/ai/evaluation/simulator/_conversation/__init__.py +316 -0
  94. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  95. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  96. azure/ai/evaluation/simulator/_data_sources/__init__.py +3 -0
  97. azure/ai/evaluation/simulator/_data_sources/grounding.json +1150 -0
  98. azure/ai/evaluation/simulator/_direct_attack_simulator.py +218 -0
  99. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  100. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  101. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +96 -0
  102. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +220 -0
  103. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  104. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +195 -0
  105. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +244 -0
  106. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +168 -0
  107. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +201 -0
  108. azure/ai/evaluation/simulator/_model_tools/models.py +614 -0
  109. azure/ai/evaluation/simulator/_prompty/__init__.py +0 -0
  110. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +65 -0
  111. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +37 -0
  112. azure/ai/evaluation/simulator/_simulator.py +716 -0
  113. azure/ai/evaluation/simulator/_tracing.py +89 -0
  114. azure/ai/evaluation/simulator/_utils.py +132 -0
  115. azure_ai_evaluation-1.0.0.dist-info/METADATA +595 -0
  116. azure_ai_evaluation-1.0.0.dist-info/NOTICE.txt +70 -0
  117. azure_ai_evaluation-1.0.0.dist-info/RECORD +119 -0
  118. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0.dist-info}/WHEEL +1 -1
  119. azure_ai_evaluation-1.0.0.dist-info/top_level.txt +1 -0
  120. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  121. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  122. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_evaluate/_batch_run/code_client.py
@@ -0,0 +1,188 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import json
+ import logging
+ import os
+ from concurrent.futures import Future
+ from pathlib import Path
+ from typing import Any, Callable, Dict, Optional, Union, cast
+
+ import pandas as pd
+ from promptflow.contracts.types import AttrDict
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+ from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+
+ from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class CodeRun:
+     def __init__(
+         self,
+         *,
+         run: Future,
+         input_data,
+         evaluator_name: Optional[str] = None,
+         aggregator: Callable[["CodeRun"], Future],
+         **kwargs,  # pylint: disable=unused-argument
+     ) -> None:
+         self.run = run
+         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
+         self.input_data = input_data
+         self.aggregated_metrics = aggregator(self)
+
+     def get_result_df(self, exclude_inputs: bool = False) -> pd.DataFrame:
+         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+         result_df = cast(pd.DataFrame, self.run.result(timeout=batch_run_timeout))
+         if exclude_inputs:
+             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
+         return result_df
+
+     def get_aggregated_metrics(self) -> Dict[str, Any]:
+         try:
+             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+             aggregated_metrics: Optional[Any] = (
+                 cast(Dict, self.aggregated_metrics.result(timeout=batch_run_timeout))
+                 if self.aggregated_metrics is not None
+                 else None
+             )
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", self.evaluator_name, ex)
+             aggregated_metrics = None
+
+         if not isinstance(aggregated_metrics, dict):
+             LOGGER.warning(
+                 "Aggregated metrics for evaluator %s is not a dictionary will not be logged as metrics",
+                 self.evaluator_name,
+             )
+
+         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
+
+         return aggregated_metrics
+
+
+ class CodeClient:  # pylint: disable=client-accepts-api-version-keyword
+     def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+         self,
+     ) -> None:
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def _calculate_metric(
+         self, evaluator: Callable, input_df: pd.DataFrame, column_mapping: Optional[Dict[str, str]], evaluator_name: str
+     ) -> pd.DataFrame:
+         row_metric_futures = []
+         row_metric_results = []
+         input_df = _apply_column_mapping(input_df, column_mapping)
+         # Ignoring args and kwargs from the signature since they are usually catching extra arguments
+         parameters = {
+             param.name
+             for param in inspect.signature(evaluator).parameters.values()
+             if param.name not in ["args", "kwargs"]
+         }
+         for value in input_df.to_dict("records"):
+             # Filter out only the parameters that are present in the input data
+             # if no parameters then pass data as is
+             filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
+             row_metric_futures.append(self._thread_pool.submit(evaluator, **filtered_values))
+
+         for row_number, row_metric_future in enumerate(row_metric_futures):
+             try:
+                 result = row_metric_future.result()
+                 if not isinstance(result, dict):
+                     result = {"output": result}
+                 row_metric_results.append(result)
+             except Exception as ex:  # pylint: disable=broad-except
+                 msg_1 = f"Error calculating value for row {row_number} for metric {evaluator_name}, "
+                 msg_2 = f"failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}"
+                 LOGGER.info(msg_1 + msg_2)
+                 # If a row fails to calculate, add an empty dict to maintain the row index
+                 # This is to ensure the output dataframe has the same number of rows as the input dataframe
+                 # pd concat will fill NaN for missing values
+                 row_metric_results.append({})
+
+         return pd.concat(
+             [input_df.add_prefix("inputs."), pd.DataFrame(row_metric_results)],
+             axis=1,
+             verify_integrity=True,
+         )
+
+     @staticmethod
+     def _calculate_aggregations(evaluator: Callable, run: CodeRun) -> Any:
+         try:
+             if _has_aggregator(evaluator):
+                 evaluator_output = run.get_result_df(exclude_inputs=True)
+                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
+                     aggregate_input = evaluator_output["output"].tolist()
+                 else:
+                     aggregate_input = [AttrDict(item) for item in evaluator_output.to_dict("records")]
+
+                 aggr_func = getattr(evaluator, "__aggregate__")
+                 aggregated_output = aggr_func(aggregate_input)
+                 return aggregated_output
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.warning(
+                 "Error calculating aggregations for evaluator %s, failed with error %s", run.evaluator_name, ex
+             )
+         return None
+
+     def run(
+         self,  # pylint: disable=unused-argument
+         flow: Callable,
+         data: Union[os.PathLike, Path, pd.DataFrame],
+         evaluator_name: Optional[str] = None,
+         column_mapping: Optional[Dict[str, str]] = None,
+         **kwargs,
+     ) -> CodeRun:
+         input_df = data
+         if not isinstance(input_df, pd.DataFrame):
+             try:
+                 json_data = load_jsonl(data)
+             except json.JSONDecodeError as exc:
+                 raise EvaluationException(
+                     message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+                     internal_message="Failed to parse data as JSON",
+                     target=ErrorTarget.CODE_CLIENT,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 ) from exc
+
+             input_df = pd.DataFrame(json_data)
+         eval_future = self._thread_pool.submit(
+             self._calculate_metric,
+             evaluator=flow,
+             input_df=input_df,
+             column_mapping=column_mapping,
+             evaluator_name=evaluator_name,
+         )
+
+         return CodeRun(
+             run=eval_future,
+             input_data=data,
+             evaluator_name=evaluator_name,
+             aggregator=lambda code_run: self._thread_pool.submit(
+                 self._calculate_aggregations, evaluator=flow, run=code_run
+             ),
+         )
+
+     def get_details(self, run: CodeRun, all_results: bool = False) -> pd.DataFrame:
+         result_df = run.get_result_df(exclude_inputs=not all_results)
+         return result_df
+
+     def get_metrics(self, run: CodeRun) -> Dict[str, Any]:
+         try:
+             aggregated_metrics = run.get_aggregated_metrics()
+             print("Aggregated metrics")
+             print(aggregated_metrics)
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug("Error calculating metrics for evaluator %s, failed with error %s", run.evaluator_name, ex)
+             return {}
+         return aggregated_metrics
+
+     def get_run_summary(self, run: CodeRun) -> Any:  # pylint: disable=unused-argument
+         # Not implemented
+         return None
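
For orientation, here is a minimal usage sketch (not part of the packaged code) of the CodeClient shown above: it runs a plain-Python evaluator over an in-memory DataFrame and reads back the per-row results. The exact_match evaluator and the sample rows are hypothetical.

    # Illustrative sketch only; the evaluator and data below are hypothetical.
    import pandas as pd

    from azure.ai.evaluation._evaluate._batch_run.code_client import CodeClient


    def exact_match(response: str, ground_truth: str) -> dict:
        # Row-level metric: 1.0 when the response equals the reference exactly.
        return {"exact_match": float(response.strip() == ground_truth.strip())}


    data = pd.DataFrame(
        [
            {"response": "Paris", "ground_truth": "Paris"},
            {"response": "Lyon", "ground_truth": "Paris"},
        ]
    )

    client = CodeClient()
    run = client.run(flow=exact_match, data=data, evaluator_name="exact_match")
    print(client.get_details(run))   # per-row scores; pass all_results=True to keep the "inputs." columns
    print(client.get_metrics(run))   # {} here, since exact_match defines no __aggregate__ hook

Each row is dispatched to the evaluator on the client's thread pool, and only the parameters present in the evaluator's signature are passed, which is why the two DataFrame columns map directly onto the function arguments.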
azure/ai/evaluation/_evaluate/_batch_run/eval_run_context.py
@@ -0,0 +1,89 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import types
+ from typing import Optional, Type, Union
+
+ from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
+ from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+ from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
+ from azure.ai.evaluation._constants import (
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
+     PF_BATCH_TIMEOUT_SEC,
+     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+     PF_DISABLE_TRACING,
+ )
+
+ from ..._user_agent import USER_AGENT
+ from .._utils import set_event_loop_policy
+ from .code_client import CodeClient
+ from .proxy_client import ProxyClient
+
+
+ class EvalRunContext:
+     """Context manager for eval batch run.
+
+     :param client: The client to run in the context.
+     :type client: Union[
+         ~azure.ai.evaluation._evaluate._batch_run.code_client.CodeClient,
+         ~azure.ai.evaluation._evaluate._batch_run.proxy_client.ProxyClient
+     ]
+     """
+
+     def __init__(self, client: Union[CodeClient, ProxyClient]) -> None:
+         self.client = client
+         self._is_batch_timeout_set_by_system = False
+         self._is_otel_timeout_set_by_system = False
+         self._original_cwd = os.getcwd()
+
+     def __enter__(self) -> None:
+         # Preserve current working directory, as PF may change it without restoring it afterward
+         self._original_cwd = os.getcwd()
+
+         if isinstance(self.client, CodeClient):
+             ClientUserAgentUtil.append_user_agent(USER_AGENT)
+             inject_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
+             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+             os.environ[PF_DISABLE_TRACING] = "true"
+
+             if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
+                 os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
+                 self._is_batch_timeout_set_by_system = True
+
+             # For dealing with the timeout issue of OpenTelemetry exporter when multiple evaluators are running
+             if os.environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) is None:
+                 os.environ[OTEL_EXPORTER_OTLP_TRACES_TIMEOUT] = str(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT)
+                 self._is_otel_timeout_set_by_system = True
+
+             # For addressing the issue of asyncio event loop closed on Windows
+             set_event_loop_policy()
+
+     def __exit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_value: Optional[BaseException],
+         exc_tb: Optional[types.TracebackType],
+     ) -> None:
+         os.chdir(self._original_cwd)
+
+         if isinstance(self.client, CodeClient):
+             recover_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+             os.environ.pop(PF_DISABLE_TRACING, None)
+
+             if self._is_batch_timeout_set_by_system:
+                 os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
+                 self._is_batch_timeout_set_by_system = False
+
+             if self._is_otel_timeout_set_by_system:
+                 os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
+                 self._is_otel_timeout_set_by_system = False
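
EvalRunContext is meant to wrap a client's batch run so the environment adjustments above are applied on entry and reverted on exit. A minimal sketch follows, assuming a hypothetical answer_length evaluator and sample rows:

    # Illustrative sketch; the evaluator and rows are hypothetical.
    import pandas as pd

    from azure.ai.evaluation._evaluate._batch_run.code_client import CodeClient
    from azure.ai.evaluation._evaluate._batch_run.eval_run_context import EvalRunContext


    def answer_length(response: str) -> dict:
        # Trivial row-level metric used only for illustration.
        return {"length": len(response)}


    data = pd.DataFrame([{"response": "Paris"}, {"response": "The capital is Paris."}])

    client = CodeClient()
    with EvalRunContext(client):
        # For a CodeClient, the user agent is appended and the OpenAI API is
        # injected on __enter__ and recovered on __exit__.
        run = client.run(flow=answer_length, data=data, evaluator_name="answer_length")
        details = client.get_details(run)
    print(details)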
azure/ai/evaluation/_evaluate/_batch_run/proxy_client.py
@@ -0,0 +1,99 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+ # pylint: disable=protected-access
+
+ import inspect
+ import logging
+ import math
+ import os
+ from collections import OrderedDict
+ from concurrent.futures import Future
+ from typing import Any, Callable, Dict, Optional, Union
+
+ import pandas as pd
+ from promptflow.client import PFClient
+ from promptflow.entities import Run
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ProxyRun:
+     def __init__(self, run: Future, **kwargs) -> None:  # pylint: disable=unused-argument
+         self.run = run
+
+
+ class ProxyClient:  # pylint: disable=client-accepts-api-version-keyword
+     def __init__(  # pylint: disable=missing-client-constructor-parameter-credential,missing-client-constructor-parameter-kwargs
+         self, pf_client: PFClient
+     ) -> None:
+         self._pf_client = pf_client
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def run(
+         self,
+         flow: Union[str, os.PathLike, Callable],
+         data: Union[str, os.PathLike],
+         column_mapping: Optional[Dict[str, str]] = None,
+         **kwargs
+     ) -> ProxyRun:
+         flow_to_run = flow
+         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
+             flow_to_run = flow._to_async()  # pylint: disable=protected-access
+
+         batch_use_async = self._should_batch_use_async(flow_to_run)
+         eval_future = self._thread_pool.submit(
+             self._pf_client.run,
+             flow_to_run,
+             data=data,
+             column_mapping=column_mapping,
+             batch_use_async=batch_use_async,
+             **kwargs
+         )
+         return ProxyRun(run=eval_future)
+
+     def get_details(self, proxy_run: ProxyRun, all_results: bool = False) -> pd.DataFrame:
+         run: Run = proxy_run.run.result()
+         result_df = self._pf_client.get_details(run, all_results=all_results)
+         result_df.replace("(Failed)", math.nan, inplace=True)
+         return result_df
+
+     def get_metrics(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+         run: Run = proxy_run.run.result()
+         return self._pf_client.get_metrics(run)
+
+     def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
+         run = proxy_run.run.result()
+
+         # pylint: disable=protected-access
+         completed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.completed", "NA")
+         failed_lines = run._properties.get("system_metrics", {}).get("__pf__.lines.failed", "NA")
+
+         # Update status to "Completed with Errors" if the original status is "Completed" and there are failed lines
+         if run.status == "Completed" and failed_lines != "NA" and int(failed_lines) > 0:
+             status = "Completed with Errors"
+         else:
+             status = run.status
+
+         # Return the ordered dictionary with the updated status
+         return OrderedDict(
+             [
+                 ("status", status),
+                 ("duration", str(run._end_time - run._created_on)),
+                 ("completed_lines", completed_lines),
+                 ("failed_lines", failed_lines),
+                 ("log_path", str(run._output_path)),
+             ]
+         )
+
+     @staticmethod
+     def _should_batch_use_async(flow):
+         if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+                 return True
+             if inspect.iscoroutinefunction(flow):
+                 return True
+             return False
+         return False
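
ProxyClient delegates batch execution to promptflow's PFClient and reads results and line-level counts back from the resulting Run. A hedged sketch of typical use follows; the match_metric evaluator, the eval_input.jsonl file, and the column mapping value are hypothetical (the mapping assumes promptflow's ${data.<column>} reference style).

    # Illustrative sketch; the data file and evaluator are hypothetical.
    from promptflow.client import PFClient

    from azure.ai.evaluation._evaluate._batch_run.proxy_client import ProxyClient


    def match_metric(query: str, response: str) -> dict:
        # Stand-in evaluator; a real one would call a model or service.
        return {"match": float(query.lower() in response.lower())}


    client = ProxyClient(PFClient())
    run = client.run(
        flow=match_metric,
        data="eval_input.jsonl",  # one JSON object per line, e.g. {"query": ..., "answer": ...}
        column_mapping={"response": "${data.answer}"},  # assumed promptflow-style column reference
    )
    print(client.get_run_summary(run))  # OrderedDict: status, duration, completed/failed lines, log path
    print(client.get_details(run).head())

Because match_metric is a plain synchronous function, _should_batch_use_async returns False here and the run is submitted to PFClient without the async batch path.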
azure/ai/evaluation/_evaluate/_batch_run/target_run_context.py
@@ -0,0 +1,46 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+ import types
+ from typing import Optional, Type
+
+ from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP
+ from azure.ai.evaluation._constants import PF_DISABLE_TRACING
+
+
+ class TargetRunContext:
+     """Context manager for target batch run.
+
+     :param upload_snapshot: Whether to upload target snapshot.
+     :type upload_snapshot: bool
+     """
+
+     def __init__(self, upload_snapshot: bool) -> None:
+         self._upload_snapshot = upload_snapshot
+         self._original_cwd = os.getcwd()
+
+     def __enter__(self) -> None:
+         # Preserve current working directory, as PF may change it without restoring it afterward
+         self._original_cwd = os.getcwd()
+
+         # Address "[WinError 32] The process cannot access the file" error,
+         # caused by conflicts when the venv and target function are in the same directory.
+         # Setting PF_FLOW_ENTRY_IN_TMP to true uploads only the flex entry file (flow.flex.yaml).
+         if not self._upload_snapshot:
+             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
+
+         os.environ[PF_DISABLE_TRACING] = "true"
+
+     def __exit__(
+         self,
+         exc_type: Optional[Type[BaseException]],
+         exc_value: Optional[BaseException],
+         exc_tb: Optional[types.TracebackType],
+     ) -> None:
+         os.chdir(self._original_cwd)
+
+         if not self._upload_snapshot:
+             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+
+         os.environ.pop(PF_DISABLE_TRACING, None)
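
TargetRunContext only adjusts environment variables around a target batch run, so a caller is expected to start the run inside the with-block. A small sketch under that assumption; the echo_target function and queries.jsonl file are hypothetical.

    # Illustrative sketch; the target function and data file are hypothetical.
    from promptflow.client import PFClient

    from azure.ai.evaluation._evaluate._batch_run.target_run_context import TargetRunContext


    def echo_target(query: str) -> dict:
        # Stand-in target; a real target would call the application under evaluation.
        return {"response": f"You asked: {query}"}


    pf_client = PFClient()
    with TargetRunContext(upload_snapshot=False):
        # PF_FLOW_ENTRY_IN_TMP and PF_DISABLE_TRACING are set on entry and removed on exit.
        target_run = pf_client.run(flow=echo_target, data="queries.jsonl")
    print(target_run.status)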