azure-ai-evaluation 0.0.0b0__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100)
  1. azure/ai/evaluation/__init__.py +60 -0
  2. azure/ai/evaluation/_common/__init__.py +16 -0
  3. azure/ai/evaluation/_common/constants.py +65 -0
  4. azure/ai/evaluation/_common/rai_service.py +452 -0
  5. azure/ai/evaluation/_common/utils.py +87 -0
  6. azure/ai/evaluation/_constants.py +50 -0
  7. azure/ai/evaluation/_evaluate/__init__.py +3 -0
  8. azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py +8 -0
  9. azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py +72 -0
  10. azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py +150 -0
  11. azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py +61 -0
  12. azure/ai/evaluation/_evaluate/_eval_run.py +494 -0
  13. azure/ai/evaluation/_evaluate/_evaluate.py +689 -0
  14. azure/ai/evaluation/_evaluate/_telemetry/__init__.py +174 -0
  15. azure/ai/evaluation/_evaluate/_utils.py +237 -0
  16. azure/ai/evaluation/_evaluators/__init__.py +3 -0
  17. azure/ai/evaluation/_evaluators/_bleu/__init__.py +9 -0
  18. azure/ai/evaluation/_evaluators/_bleu/_bleu.py +73 -0
  19. azure/ai/evaluation/_evaluators/_chat/__init__.py +9 -0
  20. azure/ai/evaluation/_evaluators/_chat/_chat.py +350 -0
  21. azure/ai/evaluation/_evaluators/_chat/retrieval/__init__.py +9 -0
  22. azure/ai/evaluation/_evaluators/_chat/retrieval/_retrieval.py +163 -0
  23. azure/ai/evaluation/_evaluators/_chat/retrieval/retrieval.prompty +48 -0
  24. azure/ai/evaluation/_evaluators/_coherence/__init__.py +7 -0
  25. azure/ai/evaluation/_evaluators/_coherence/_coherence.py +122 -0
  26. azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +62 -0
  27. azure/ai/evaluation/_evaluators/_content_safety/__init__.py +21 -0
  28. azure/ai/evaluation/_evaluators/_content_safety/_content_safety.py +108 -0
  29. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_base.py +66 -0
  30. azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +296 -0
  31. azure/ai/evaluation/_evaluators/_content_safety/_hate_unfairness.py +78 -0
  32. azure/ai/evaluation/_evaluators/_content_safety/_self_harm.py +76 -0
  33. azure/ai/evaluation/_evaluators/_content_safety/_sexual.py +76 -0
  34. azure/ai/evaluation/_evaluators/_content_safety/_violence.py +76 -0
  35. azure/ai/evaluation/_evaluators/_eci/__init__.py +0 -0
  36. azure/ai/evaluation/_evaluators/_eci/_eci.py +99 -0
  37. azure/ai/evaluation/_evaluators/_f1_score/__init__.py +9 -0
  38. azure/ai/evaluation/_evaluators/_f1_score/_f1_score.py +141 -0
  39. azure/ai/evaluation/_evaluators/_fluency/__init__.py +9 -0
  40. azure/ai/evaluation/_evaluators/_fluency/_fluency.py +122 -0
  41. azure/ai/evaluation/_evaluators/_fluency/fluency.prompty +61 -0
  42. azure/ai/evaluation/_evaluators/_gleu/__init__.py +9 -0
  43. azure/ai/evaluation/_evaluators/_gleu/_gleu.py +71 -0
  44. azure/ai/evaluation/_evaluators/_groundedness/__init__.py +9 -0
  45. azure/ai/evaluation/_evaluators/_groundedness/_groundedness.py +123 -0
  46. azure/ai/evaluation/_evaluators/_groundedness/groundedness.prompty +54 -0
  47. azure/ai/evaluation/_evaluators/_meteor/__init__.py +9 -0
  48. azure/ai/evaluation/_evaluators/_meteor/_meteor.py +96 -0
  49. azure/ai/evaluation/_evaluators/_protected_material/__init__.py +5 -0
  50. azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +104 -0
  51. azure/ai/evaluation/_evaluators/_protected_materials/__init__.py +5 -0
  52. azure/ai/evaluation/_evaluators/_protected_materials/_protected_materials.py +104 -0
  53. azure/ai/evaluation/_evaluators/_qa/__init__.py +9 -0
  54. azure/ai/evaluation/_evaluators/_qa/_qa.py +111 -0
  55. azure/ai/evaluation/_evaluators/_relevance/__init__.py +9 -0
  56. azure/ai/evaluation/_evaluators/_relevance/_relevance.py +131 -0
  57. azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +69 -0
  58. azure/ai/evaluation/_evaluators/_rouge/__init__.py +10 -0
  59. azure/ai/evaluation/_evaluators/_rouge/_rouge.py +98 -0
  60. azure/ai/evaluation/_evaluators/_similarity/__init__.py +9 -0
  61. azure/ai/evaluation/_evaluators/_similarity/_similarity.py +130 -0
  62. azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +71 -0
  63. azure/ai/evaluation/_evaluators/_xpia/__init__.py +5 -0
  64. azure/ai/evaluation/_evaluators/_xpia/xpia.py +140 -0
  65. azure/ai/evaluation/_exceptions.py +107 -0
  66. azure/ai/evaluation/_http_utils.py +395 -0
  67. azure/ai/evaluation/_model_configurations.py +27 -0
  68. azure/ai/evaluation/_user_agent.py +6 -0
  69. azure/ai/evaluation/_version.py +5 -0
  70. azure/ai/evaluation/py.typed +0 -0
  71. azure/ai/evaluation/simulator/__init__.py +15 -0
  72. azure/ai/evaluation/simulator/_adversarial_scenario.py +27 -0
  73. azure/ai/evaluation/simulator/_adversarial_simulator.py +450 -0
  74. azure/ai/evaluation/simulator/_constants.py +17 -0
  75. azure/ai/evaluation/simulator/_conversation/__init__.py +315 -0
  76. azure/ai/evaluation/simulator/_conversation/_conversation.py +178 -0
  77. azure/ai/evaluation/simulator/_conversation/constants.py +30 -0
  78. azure/ai/evaluation/simulator/_direct_attack_simulator.py +252 -0
  79. azure/ai/evaluation/simulator/_helpers/__init__.py +4 -0
  80. azure/ai/evaluation/simulator/_helpers/_language_suffix_mapping.py +17 -0
  81. azure/ai/evaluation/simulator/_helpers/_simulator_data_classes.py +93 -0
  82. azure/ai/evaluation/simulator/_indirect_attack_simulator.py +207 -0
  83. azure/ai/evaluation/simulator/_model_tools/__init__.py +23 -0
  84. azure/ai/evaluation/simulator/_model_tools/_identity_manager.py +147 -0
  85. azure/ai/evaluation/simulator/_model_tools/_proxy_completion_model.py +228 -0
  86. azure/ai/evaluation/simulator/_model_tools/_rai_client.py +157 -0
  87. azure/ai/evaluation/simulator/_model_tools/_template_handler.py +157 -0
  88. azure/ai/evaluation/simulator/_model_tools/models.py +616 -0
  89. azure/ai/evaluation/simulator/_prompty/task_query_response.prompty +69 -0
  90. azure/ai/evaluation/simulator/_prompty/task_simulate.prompty +36 -0
  91. azure/ai/evaluation/simulator/_tracing.py +92 -0
  92. azure/ai/evaluation/simulator/_utils.py +111 -0
  93. azure/ai/evaluation/simulator/simulator.py +579 -0
  94. azure_ai_evaluation-1.0.0b1.dist-info/METADATA +377 -0
  95. azure_ai_evaluation-1.0.0b1.dist-info/RECORD +97 -0
  96. {azure_ai_evaluation-0.0.0b0.dist-info → azure_ai_evaluation-1.0.0b1.dist-info}/WHEEL +1 -1
  97. azure_ai_evaluation-1.0.0b1.dist-info/top_level.txt +1 -0
  98. azure_ai_evaluation-0.0.0b0.dist-info/METADATA +0 -7
  99. azure_ai_evaluation-0.0.0b0.dist-info/RECORD +0 -4
  100. azure_ai_evaluation-0.0.0b0.dist-info/top_level.txt +0 -1
azure/ai/evaluation/_constants.py
@@ -0,0 +1,50 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+
+
+ class EvaluationMetrics:
+     """Metrics for model evaluation."""
+
+     GPT_GROUNDEDNESS = "gpt_groundedness"
+     GPT_RELEVANCE = "gpt_relevance"
+     GPT_COHERENCE = "gpt_coherence"
+     GPT_FLUENCY = "gpt_fluency"
+     GPT_SIMILARITY = "gpt_similarity"
+     F1_SCORE = "f1_score"
+     GPT_RETRIEVAL_SCORE = "gpt_retrieval_score"
+     HATE_FAIRNESS = "hate_fairness"
+     HATE_UNFAIRNESS = "hate_unfairness"
+     VIOLENCE = "violence"
+     SELF_HARM = "self_harm"
+     SEXUAL = "sexual"
+     PROTECTED_MATERIAL = "protected_material"
+     XPIA = "xpia"
+
+
+ class _InternalEvaluationMetrics:
+     """Evaluation metrics that are not publicly supported.
+     These metrics are experimental and subject to potential change or migration to the main
+     enum over time.
+     """
+
+     ECI = "eci"
+
+
+ class Prefixes:
+     """Column prefixes for inputs and outputs."""
+
+     INPUTS = "inputs."
+     OUTPUTS = "outputs."
+     TSG_OUTPUTS = "__outputs."
+
+
+ DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"
+
+ CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
+
+ PF_BATCH_TIMEOUT_SEC_DEFAULT = 3600
+ PF_BATCH_TIMEOUT_SEC = "PF_BATCH_TIMEOUT_SEC"
+
+ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT = "OTEL_EXPORTER_OTLP_TRACES_TIMEOUT"
+ OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT = 60
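
Note: PF_BATCH_TIMEOUT_SEC and OTEL_EXPORTER_OTLP_TRACES_TIMEOUT above are environment-variable names, with the matching *_DEFAULT constants as fallbacks. A minimal sketch of how such a pair is typically consumed (read_timeout is a hypothetical helper, not the package's own get_int_env_var):

import os

# Hypothetical helper: the environment variable wins when set to an integer,
# otherwise the default from _constants.py applies.
def read_timeout(env_name: str, default: int) -> int:
    raw = os.environ.get(env_name)
    return int(raw) if raw is not None and raw.isdigit() else default

print(read_timeout("PF_BATCH_TIMEOUT_SEC", 3600))  # 3600 unless the env var is set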
azure/ai/evaluation/_evaluate/__init__.py
@@ -0,0 +1,3 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
azure/ai/evaluation/_evaluate/_batch_run_client/__init__.py
@@ -0,0 +1,8 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ from .batch_run_context import BatchRunContext
+ from .code_client import CodeClient
+ from .proxy_client import ProxyClient
+
+ __all__ = ["CodeClient", "ProxyClient", "BatchRunContext"]
azure/ai/evaluation/_evaluate/_batch_run_client/batch_run_context.py
@@ -0,0 +1,72 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import os
+
+ from promptflow._sdk._constants import PF_FLOW_ENTRY_IN_TMP, PF_FLOW_META_LOAD_IN_SUBPROCESS
+ from promptflow._utils.user_agent_utils import ClientUserAgentUtil
+ from azure.ai.evaluation._constants import (
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT,
+     OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT,
+     PF_BATCH_TIMEOUT_SEC,
+     PF_BATCH_TIMEOUT_SEC_DEFAULT,
+ )
+ from promptflow.tracing._integrations._openai_injector import inject_openai_api, recover_openai_api
+
+ from ..._user_agent import USER_AGENT
+ from .._utils import set_event_loop_policy
+ from .code_client import CodeClient
+ from .proxy_client import ProxyClient
+
+
+ class BatchRunContext:
+     """Context manager for batch run clients.
+
+     :param client: The client to run in the context.
+     :type client: Union[
+         ~azure.ai.evaluation._evaluate._batch_run_client.code_client.CodeClient,
+         ~azure.ai.evaluation._evaluate._batch_run_client.proxy_client.ProxyClient
+     ]
+     """
+
+     def __init__(self, client) -> None:
+         self.client = client
+         self._is_batch_timeout_set_by_system = False
+         self._is_otel_timeout_set_by_system = False
+
+     def __enter__(self):
+         if isinstance(self.client, CodeClient):
+             ClientUserAgentUtil.append_user_agent(USER_AGENT)
+             inject_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ[PF_FLOW_ENTRY_IN_TMP] = "true"
+             os.environ[PF_FLOW_META_LOAD_IN_SUBPROCESS] = "false"
+
+         if os.environ.get(PF_BATCH_TIMEOUT_SEC) is None:
+             os.environ[PF_BATCH_TIMEOUT_SEC] = str(PF_BATCH_TIMEOUT_SEC_DEFAULT)
+             self._is_batch_timeout_set_by_system = True
+
+         # For dealing with the timeout issue of OpenTelemetry exporter when multiple evaluators are running
+         if os.environ.get(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT) is None:
+             os.environ[OTEL_EXPORTER_OTLP_TRACES_TIMEOUT] = str(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT_DEFAULT)
+             self._is_otel_timeout_set_by_system = True
+
+         # For addressing the issue of asyncio event loop closed on Windows
+         set_event_loop_policy()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         if isinstance(self.client, CodeClient):
+             recover_openai_api()
+
+         if isinstance(self.client, ProxyClient):
+             os.environ.pop(PF_FLOW_ENTRY_IN_TMP, None)
+             os.environ.pop(PF_FLOW_META_LOAD_IN_SUBPROCESS, None)
+
+         if self._is_batch_timeout_set_by_system:
+             os.environ.pop(PF_BATCH_TIMEOUT_SEC, None)
+             self._is_batch_timeout_set_by_system = False
+
+         if self._is_otel_timeout_set_by_system:
+             os.environ.pop(OTEL_EXPORTER_OTLP_TRACES_TIMEOUT, None)
+             self._is_otel_timeout_set_by_system = False
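
The added BatchRunContext temporarily sets promptflow and OpenTelemetry environment variables around a batch run and restores them on exit. A hedged usage sketch, not from the package itself: the answer_length evaluator and "data.jsonl" path are placeholders, and a real installation of promptflow is assumed.

from promptflow.client import PFClient
from azure.ai.evaluation._evaluate._batch_run_client import BatchRunContext, ProxyClient

def answer_length(*, answer: str):
    # Placeholder evaluator: returns one metric per input row.
    return {"answer_length": len(answer)}

client = ProxyClient(PFClient())
with BatchRunContext(client):
    # "data.jsonl" stands in for a JSON-lines dataset containing an "answer" column.
    run = client.run(flow=answer_length, data="data.jsonl")
    print(client.get_details(run))
# On exit, any variables set by __enter__ (flow entry, batch/OTLP timeouts) are removed again.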
azure/ai/evaluation/_evaluate/_batch_run_client/code_client.py
@@ -0,0 +1,150 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import json
+ import logging
+
+ import pandas as pd
+
+ from promptflow.contracts.types import AttrDict
+ from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _has_aggregator, get_int_env_var, load_jsonl
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+ from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+
+ from ..._constants import PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class CodeRun:
+     def __init__(self, run, input_data, evaluator_name=None, aggregated_metrics=None, **kwargs):
+         self.run = run
+         self.evaluator_name = evaluator_name if evaluator_name is not None else ""
+         self.input_data = input_data
+         self.aggregated_metrics = aggregated_metrics
+
+     def get_result_df(self, exclude_inputs=False):
+         batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+         result_df = self.run.result(timeout=batch_run_timeout)
+         if exclude_inputs:
+             result_df = result_df.drop(columns=[col for col in result_df.columns if col.startswith("inputs.")])
+         return result_df
+
+     def get_aggregated_metrics(self):
+         try:
+             batch_run_timeout = get_int_env_var(PF_BATCH_TIMEOUT_SEC, PF_BATCH_TIMEOUT_SEC_DEFAULT)
+             aggregated_metrics = (
+                 self.aggregated_metrics.result(timeout=batch_run_timeout)
+                 if self.aggregated_metrics is not None
+                 else None
+             )
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}")
+             aggregated_metrics = None
+
+         if not isinstance(aggregated_metrics, dict):
+             LOGGER.warning(
+                 f"Aggregated metrics for evaluator {self.evaluator_name}"
+                 f" is not a dictionary will not be logged as metrics"
+             )
+
+         aggregated_metrics = aggregated_metrics if isinstance(aggregated_metrics, dict) else {}
+
+         return aggregated_metrics
+
+
+ class CodeClient:
+     def __init__(self):
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def _calculate_metric(self, evaluator, input_df, column_mapping, evaluator_name):
+         row_metric_futures = []
+         row_metric_results = []
+         input_df = _apply_column_mapping(input_df, column_mapping)
+         # Ignoring args and kwargs from the signature since they are usually catching extra arguments
+         parameters = {
+             param.name
+             for param in inspect.signature(evaluator).parameters.values()
+             if param.name not in ["args", "kwargs"]
+         }
+         for value in input_df.to_dict("records"):
+             # Filter out only the parameters that are present in the input data
+             # if no parameters then pass data as is
+             filtered_values = {k: v for k, v in value.items() if k in parameters} if len(parameters) > 0 else value
+             row_metric_futures.append(self._thread_pool.submit(evaluator, **filtered_values))
+
+         for row_number, row_metric_future in enumerate(row_metric_futures):
+             try:
+                 result = row_metric_future.result()
+                 if not isinstance(result, dict):
+                     result = {"output": result}
+                 row_metric_results.append(result)
+             except Exception as ex:  # pylint: disable=broad-except
+                 msg_1 = f"Error calculating value for row {row_number} for metric {evaluator_name}, "
+                 msg_2 = f"failed with error {str(ex)} : Stack trace : {str(ex.__traceback__)}"
+                 LOGGER.info(msg_1 + msg_2)
+                 # If a row fails to calculate, add an empty dict to maintain the row index
+                 # This is to ensure the output dataframe has the same number of rows as the input dataframe
+                 # pd concat will fill NaN for missing values
+                 row_metric_results.append({})
+
+         return pd.concat(
+             [input_df.add_prefix("inputs."), pd.DataFrame(row_metric_results)],
+             axis=1,
+             verify_integrity=True,
+         )
+
+     def _calculate_aggregations(self, evaluator, run):
+         try:
+             if _has_aggregator(evaluator):
+                 aggregate_input = None
+                 evaluator_output = run.get_result_df(exclude_inputs=True)
+                 if len(evaluator_output.columns) == 1 and evaluator_output.columns[0] == "output":
+                     aggregate_input = evaluator_output["output"].tolist()
+                 else:
+                     aggregate_input = [AttrDict(item) for item in evaluator_output.to_dict("records")]
+
+                 aggr_func = getattr(evaluator, "__aggregate__")
+                 aggregated_output = aggr_func(aggregate_input)
+                 return aggregated_output
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.warning(
+                 f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}"
+             )
+         return None
+
+     def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs):
+         input_df = data
+         if not isinstance(input_df, pd.DataFrame):
+             try:
+                 json_data = load_jsonl(data)
+             except json.JSONDecodeError as exc:
+                 raise EvaluationException(
+                     message=f"Failed to parse data as JSON: {data}. Provide valid json lines data.",
+                     internal_message="Failed to parse data as JSON",
+                     target=ErrorTarget.CODE_CLIENT,
+                     category=ErrorCategory.INVALID_VALUE,
+                     blame=ErrorBlame.USER_ERROR,
+                 ) from exc
+
+             input_df = pd.DataFrame(json_data)
+         eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name)
+         run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None)
+         aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run)
+         run.aggregated_metrics = aggregation_future
+         return run
+
+     def get_details(self, run, all_results=False):
+         result_df = run.get_result_df(exclude_inputs=not all_results)
+         return result_df
+
+     def get_metrics(self, run):
+         try:
+             aggregated_metrics = run.get_aggregated_metrics()
+             print("Aggregated metrics")
+             print(aggregated_metrics)
+         except Exception as ex:  # pylint: disable=broad-exception-caught
+             LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}")
+             return None
+         return aggregated_metrics
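
CodeClient runs a plain Python evaluator in-process: each input row is turned into keyword arguments filtered to the evaluator's signature, and an optional __aggregate__ attribute yields run-level metrics. A hedged sketch, not part of the package: ExactMatchEvaluator and the sample rows are illustrative, and the default column_mapping of None is assumed to be tolerated by _apply_column_mapping.

import pandas as pd
from azure.ai.evaluation._evaluate._batch_run_client import CodeClient

class ExactMatchEvaluator:
    def __call__(self, *, response: str, ground_truth: str):
        # Per-row result: one metric per input record.
        return {"exact_match": float(response.strip() == ground_truth.strip())}

    def __aggregate__(self, line_results):
        # line_results are the per-row outputs above, wrapped as AttrDict records.
        rate = sum(r.exact_match for r in line_results) / len(line_results)
        return {"exact_match_rate": rate}

data = pd.DataFrame(
    [
        {"response": "Paris", "ground_truth": "Paris"},
        {"response": "Lyon", "ground_truth": "Paris"},
    ]
)
client = CodeClient()
run = client.run(flow=ExactMatchEvaluator(), data=data, evaluator_name="exact_match")
print(client.get_details(run))  # per-row "exact_match" results (inputs excluded by default)
print(client.get_metrics(run))  # expected: {"exact_match_rate": 0.5}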
azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py
@@ -0,0 +1,61 @@
+ # ---------------------------------------------------------
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # ---------------------------------------------------------
+ import inspect
+ import logging
+ import os
+
+ import numpy as np
+
+ from promptflow.client import PFClient
+ from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor
+
+ LOGGER = logging.getLogger(__name__)
+
+
+ class ProxyRun:
+     def __init__(self, run, **kwargs):
+         self.run = run
+
+
+ class ProxyClient:
+     def __init__(self, pf_client: PFClient):
+         self._pf_client = pf_client
+         self._thread_pool = ThreadPoolExecutor(thread_name_prefix="evaluators_thread")
+
+     def run(self, flow, data, column_mapping=None, **kwargs):
+         flow_to_run = flow
+         if hasattr(flow, "_to_async"):
+             flow_to_run = flow._to_async()
+
+         batch_use_async = self._should_batch_use_async(flow_to_run)
+         eval_future = self._thread_pool.submit(
+             self._pf_client.run,
+             flow_to_run,
+             data=data,
+             column_mapping=column_mapping,
+             batch_use_async=batch_use_async,
+             **kwargs
+         )
+         return ProxyRun(run=eval_future)
+
+     def get_details(self, proxy_run, all_results=False):
+         run = proxy_run.run.result()
+         result_df = self._pf_client.get_details(run, all_results=all_results)
+         result_df.replace("(Failed)", np.nan, inplace=True)
+         return result_df
+
+     def get_metrics(self, proxy_run):
+         run = proxy_run.run.result()
+         return self._pf_client.get_metrics(run)
+
+     @staticmethod
+     def _should_batch_use_async(flow):
+         if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
+                 return True
+             elif inspect.iscoroutinefunction(flow):
+                 return True
+             else:
+                 return False
+         return False
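
ProxyClient._should_batch_use_async above only opts into async batch execution when the flow (or its __call__) is a coroutine function and PF_EVALS_BATCH_USE_ASYNC is not set to "false". A small illustration of that check, not part of the package; the two evaluator functions are placeholders.

import asyncio
import os

from azure.ai.evaluation._evaluate._batch_run_client import ProxyClient

def sync_eval(*, query: str):
    return {"length": len(query)}

async def async_eval(*, query: str):
    await asyncio.sleep(0)
    return {"length": len(query)}

print(ProxyClient._should_batch_use_async(sync_eval))   # False: plain function
print(ProxyClient._should_batch_use_async(async_eval))  # True: coroutine function

# Setting the env var to "false" disables async dispatch even for coroutine functions.
os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false"
print(ProxyClient._should_batch_use_async(async_eval))  # False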