arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix has been flagged as potentially problematic.
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +11 -5
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +39 -36
- phoenix/config.py +21 -0
- phoenix/datetime_utils.py +4 -0
- phoenix/db/insertion/evaluation.py +4 -4
- phoenix/db/insertion/helpers.py +4 -12
- phoenix/db/insertion/span.py +3 -3
- phoenix/db/models.py +1 -1
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +153 -0
- phoenix/{datasets → experiments}/evaluators/code_evaluators.py +7 -7
- phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +9 -9
- phoenix/{datasets → experiments}/evaluators/utils.py +38 -141
- phoenix/{datasets/experiments.py → experiments/functions.py} +248 -182
- phoenix/experiments/types.py +722 -0
- phoenix/experiments/utils.py +9 -0
- phoenix/server/api/context.py +2 -0
- phoenix/server/api/dataloaders/__init__.py +2 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/routers/v1/__init__.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +10 -10
- phoenix/server/api/routers/v1/datasets.py +6 -6
- phoenix/server/api/routers/v1/evaluations.py +4 -11
- phoenix/server/api/routers/v1/experiment_evaluations.py +22 -23
- phoenix/server/api/routers/v1/experiment_runs.py +4 -16
- phoenix/server/api/routers/v1/experiments.py +5 -5
- phoenix/server/api/routers/v1/spans.py +6 -4
- phoenix/server/api/types/Experiment.py +7 -0
- phoenix/server/app.py +2 -0
- phoenix/server/static/index.js +648 -570
- phoenix/session/client.py +256 -85
- phoenix/trace/fixtures.py +6 -6
- phoenix/utilities/json.py +8 -8
- phoenix/version.py +1 -1
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators/__init__.py +0 -18
- phoenix/datasets/types.py +0 -178
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → experiments}/tracing.py +0 -0
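Taken together, the file list shows the experiments API moving out of phoenix.datasets into a new phoenix.experiments package (evaluators, types, tracing, utils, and the renamed functions.py), with run_experiment and evaluate_experiment reworked around a new RanExperiment result type. Below is a minimal usage sketch based only on the signatures visible in the diff that follows; how dataset is obtained, how the task's arguments are bound, and the exact evaluator call signatures are assumptions here, not something this diff confirms.

from phoenix.experiments.functions import evaluate_experiment, run_experiment

# `dataset` is assumed to be a phoenix.experiments.types.Dataset fetched elsewhere
# (for example via the Phoenix client); that step is not part of this diff.

def answer_task(example):
    # Task callable, run once per dataset example (times `repetitions`).
    return {"answer": example.input}

def contains_answer(output, expected):
    # Illustrative evaluator callable; evaluators may also be passed as a
    # name-to-evaluator mapping per the Evaluators alias used in the signatures.
    return float(str(expected) in str(output))

ran = run_experiment(
    dataset,
    answer_task,
    evaluators=[contains_answer],
    experiment_name="rc6-smoke-test",
    dry_run=3,  # sample 3 examples and skip every POST to the Phoenix server
)
ran = evaluate_experiment(ran, evaluators=[contains_answer], print_summary=True)

The largest single change is the rewrite of phoenix/datasets/experiments.py into phoenix/experiments/functions.py; its diff follows.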
@@ -3,13 +3,13 @@ import json
 from binascii import hexlify
 from contextlib import ExitStack
 from copy import deepcopy
+from dataclasses import replace
 from datetime import datetime, timezone
 from itertools import product
 from typing import (
     Any,
     Awaitable,
     Dict,
-    Iterable,
     Mapping,
     Optional,
     Sequence,
@@ -22,6 +22,7 @@ from urllib.parse import urljoin

 import httpx
 import opentelemetry.sdk.trace as trace_sdk
+import pandas as pd
 from openinference.semconv.resource import ResourceAttributes
 from openinference.semconv.trace import (
     OpenInferenceMimeTypeValues,
@@ -33,76 +34,51 @@ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.resources import Resource
 from opentelemetry.sdk.trace import Span
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
-from opentelemetry.trace import Status, StatusCode
+from opentelemetry.trace import Status, StatusCode, Tracer
 from typing_extensions import TypeAlias

-from phoenix.config import …
[… 5 removed lines not shown …]
-from phoenix.datasets.evaluators.utils import (
+from phoenix.config import get_base_url, get_env_client_headers
+from phoenix.evals.executors import get_executor_on_sync_context
+from phoenix.evals.models.rate_limiters import RateLimiter
+from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+from phoenix.experiments.evaluators import create_evaluator
+from phoenix.experiments.evaluators.base import (
     Evaluator,
-    EvaluatorName,
     ExperimentEvaluator,
-    create_evaluator,
 )
-from phoenix. …
-from phoenix. …
+from phoenix.experiments.tracing import capture_spans
+from phoenix.experiments.types import (
+    DRY_RUN,
     Dataset,
+    EvaluationParameters,
     EvaluationResult,
+    EvaluationSummary,
+    EvaluatorName,
     Example,
     Experiment,
     ExperimentEvaluationRun,
+    ExperimentParameters,
     ExperimentResult,
     ExperimentRun,
-    ExperimentRunId,
     ExperimentTask,
+    RanExperiment,
+    TaskSummary,
     TestCase,
+    _asdict,
+    _replace,
 )
-from phoenix. …
-from phoenix.evals.models.rate_limiters import RateLimiter
-from phoenix.evals.utils import get_tqdm_progress_bar_formatter
-from phoenix.session.session import active_session
+from phoenix.experiments.utils import get_dataset_experiments_url, get_experiment_url
 from phoenix.trace.attributes import flatten
 from phoenix.utilities.json import jsonify


-def _get_base_url() -> str:
-    host = get_env_host()
-    if host == "0.0.0.0":
-        host = "127.0.0.1"
-    base_url = get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
-    return base_url if base_url.endswith("/") else base_url + "/"
-
-
-def _get_web_base_url() -> str:
-    """Return the web UI base URL.
-
-    Returns:
-        str: the web UI base URL
-    """
-    if session := active_session():
-        return session.url
-    return _get_base_url()
-
-
-def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
-    return f"{_get_web_base_url()}datasets/{dataset_id}/compare?experimentId={experiment_id}"
-
-
-def _get_dataset_experiments_url(*, dataset_id: str) -> str:
-    return f"{_get_web_base_url()}datasets/{dataset_id}/experiments"
-
-
 def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
     headers = get_env_client_headers()
     return httpx.Client(
-        base_url= …
+        base_url=get_base_url(),
         headers=headers,
     ), httpx.AsyncClient(
-        base_url= …
+        base_url=get_base_url(),
         headers=headers,
     )

@@ -114,16 +90,23 @@ Evaluators: TypeAlias = Union[
 ]


+RateLimitErrors: TypeAlias = Union[Type[BaseException], Sequence[Type[BaseException]]]
+
+
 def run_experiment(
     dataset: Dataset,
     task: ExperimentTask,
+    evaluators: Optional[Evaluators] = None,
     *,
     experiment_name: Optional[str] = None,
     experiment_description: Optional[str] = None,
     experiment_metadata: Optional[Mapping[str, Any]] = None,
[… 3 removed lines not shown …]
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+) -> RanExperiment:
+    if not dataset.examples:
+        raise ValueError(f"Dataset has no examples: {dataset.id=}, {dataset.version_id=}")
     # Add this to the params once supported in the UI
     repetitions = 1
     assert repetitions > 0, "Must run the experiment at least once."
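Compared with rc5, run_experiment now takes the evaluators up front and grows keyword-only rate_limit_errors, dry_run, and print_summary options. As the later hunks show, each exception type listed in rate_limit_errors is wrapped in a phoenix.evals RateLimiter whose limit/alimit decorators throttle the sync and async task runners. A hedged sketch of the option, reusing the names from the sketch above; the exception class is a stand-in for whatever your model client actually raises:

class ProviderRateLimitError(Exception):
    # Placeholder: substitute the rate-limit exception of your LLM client.
    pass

ran = run_experiment(
    dataset,
    answer_task,
    evaluators=[contains_answer],
    rate_limit_errors=ProviderRateLimitError,  # a single type or a sequence of types
    print_summary=True,
)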
@@ -131,44 +114,60 @@ def run_experiment(

     sync_client, async_client = _phoenix_clients()

[… 14 removed lines not shown …]
+    payload = {
+        "version_id": dataset.version_id,
+        "name": experiment_name,
+        "description": experiment_description,
+        "metadata": experiment_metadata,
+        "repetitions": repetitions,
+    }
+    if not dry_run:
+        experiment_response = sync_client.post(
+            f"/v1/datasets/{dataset.id}/experiments",
+            json=payload,
+        )
+        experiment_response.raise_for_status()
+        exp_json = experiment_response.json()["data"]
+        project_name = exp_json["project_name"]
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=exp_json["id"],
+            project_name=project_name,
+        )
+    else:
+        experiment = Experiment(
+            dataset_id=dataset.id,
+            dataset_version_id=dataset.version_id,
+            repetitions=repetitions,
+            id=DRY_RUN,
+            project_name="",
+        )

-    resource = …
-    tracer_provider = trace_sdk.TracerProvider(resource=resource)
-    tracer_provider.add_span_processor(
-        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
-    )
-    tracer = tracer_provider.get_tracer(__name__)
+    tracer, resource = _get_tracer(experiment.project_name)
     root_span_name = f"Task: {_get_task_name(task)}"
     root_span_kind = CHAIN

-    dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
-    experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
     print("🧪 Experiment started.")
[… 6 removed lines not shown …]
+    if dry_run:
+        examples = {
+            (ex := dataset[i]).id: ex
+            for i in pd.Series(range(len(dataset)))
+            .sample(min(len(dataset), int(dry_run)), random_state=42)
+            .sort_values()
+        }
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
+        dataset = replace(dataset, examples=examples)
     else:
[… 4 removed lines not shown …]
+        dataset_experiments_url = get_dataset_experiments_url(dataset_id=dataset.id)
+        experiment_compare_url = get_experiment_url(
+            dataset_id=dataset.id,
+            experiment_id=experiment.id,
+        )
+        print(f"📺 View dataset experiments: {dataset_experiments_url}")
+        print(f"🔗 View this experiment: {experiment_compare_url}")

     def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
         example, repetition_number = test_case.example, test_case.repetition_number
@@ -193,6 +192,7 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+            output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
             if result := ExperimentResult(result=output) if output is not None else None:
@@ -208,21 +208,21 @@ def run_experiment(
         assert isinstance(
             output, (dict, list, str, int, float, bool, type(None))
         ), "Output must be JSON serializable"
- …
+        exp_run = ExperimentRun(
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
-            experiment_id= …
+            experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
             output=result,
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
         )
- …
-            f"/v1/experiments/{ …
- …
- …
-        return …
+        if not dry_run:
+            resp = sync_client.post(f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run))
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run

     async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
         example, repetition_number = test_case.example, test_case.repetition_number
@@ -247,6 +247,7 @@ def run_experiment(
                 span.record_exception(exc)
                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                 error = exc
+            output = jsonify(output)
             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
             if result := ExperimentResult(result=output) if output is not None else None:
@@ -262,21 +263,31 @@ def run_experiment(
         assert isinstance(
             output, (dict, list, str, int, float, bool, type(None))
         ), "Output must be JSON serializable"
- …
+        exp_run = ExperimentRun(
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
-            experiment_id= …
+            experiment_id=experiment.id,
             dataset_example_id=example.id,
             repetition_number=repetition_number,
             output=result,
             error=repr(error) if error else None,
             trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
         )
[… 5 removed lines not shown …]
+        if not dry_run:
+            resp = await async_client.post(
+                f"/v1/experiments/{experiment.id}/runs", json=jsonify(exp_run)
+            )
+            resp.raise_for_status()
+            exp_run = replace(exp_run, id=resp.json()["data"]["id"])
+        return exp_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not hasattr(rate_limit_errors, "__iter__"):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]

     rate_limited_sync_run_experiment = functools.reduce(
         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
@@ -295,84 +306,97 @@ def run_experiment(
     )

     test_cases = [
-        TestCase(example=ex, repetition_number=rep)
-        for ex, rep in product(dataset.examples, range(1, repetitions + 1))
+        TestCase(example=deepcopy(ex), repetition_number=rep)
+        for ex, rep in product(dataset.examples.values(), range(1, repetitions + 1))
     ]
-
-    experiment = Experiment(
-        id=experiment_id,
-        dataset_id=dataset.id,
-        dataset_version_id=dataset.version_id,
-        project_name=project_name,
-    )
- …
+    task_runs, _execution_details = executor.run(test_cases)
     print("✅ Task runs completed.")
- …
+    params = ExperimentParameters(n_examples=len(dataset.examples), n_repetitions=repetitions)
+    task_summary = TaskSummary.from_task_runs(params, task_runs)
+    ran_experiment: RanExperiment = object.__new__(RanExperiment)
+    ran_experiment.__init__(  # type: ignore[misc]
+        params=params,
+        dataset=dataset,
+        runs={r.id: r for r in task_runs},
+        task_summary=task_summary,
+        **_asdict(experiment),
+    )
     if evaluators_by_name:
- …
- …
+        return evaluate_experiment(
+            ran_experiment,
             evaluators=evaluators_by_name,
- …
- …
+            dry_run=dry_run,
+            print_summary=print_summary,
+            rate_limit_errors=rate_limit_errors,
         )
- …
- …
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment


 def evaluate_experiment(
     experiment: Experiment,
-    evaluators: …
[… 3 removed lines not shown …]
-    ],
-) -> …
+    evaluators: Evaluators,
+    *,
+    dry_run: Union[bool, int] = False,
+    print_summary: bool = True,
+    rate_limit_errors: Optional[RateLimitErrors] = None,
+) -> RanExperiment:
+    if not dry_run and _is_dry_run(experiment):
+        dry_run = True
+    evaluators_by_name = _evaluators_by_name(evaluators)
+    if not evaluators_by_name:
+        raise ValueError("Must specify at least one Evaluator")
     sync_client, async_client = _phoenix_clients()
     dataset_id = experiment.dataset_id
     dataset_version_id = experiment.dataset_version_id
[… 4 removed lines not shown …]
+    if isinstance(experiment, RanExperiment):
+        ran_experiment: RanExperiment = experiment
+    else:
+        dataset = Dataset.from_dict(
             sync_client.get(
                 f"/v1/datasets/{dataset_id}/examples",
-                params={" …
-            )
-            .json()
-            .get("data", {})
-            .get("examples", [])
+                params={"version_id": str(dataset_version_id)},
+            ).json()["data"]
         )
[… 26 removed lines not shown …]
+        if not dataset.examples:
+            raise ValueError(f"Dataset has no examples: {dataset_id=}, {dataset_version_id=}")
+        experiment_runs = tuple(
+            ExperimentRun.from_dict(exp_run)
+            for exp_run in sync_client.get(f"/v1/experiments/{experiment.id}/runs").json()["data"]
+        )
+        if not experiment_runs:
+            raise ValueError("Experiment has not been run")
+        params = ExperimentParameters(n_examples=len(dataset.examples))
+        task_summary = TaskSummary.from_task_runs(params, experiment_runs)
+        ran_experiment = object.__new__(RanExperiment)
+        ran_experiment.__init__(  # type: ignore[misc]
+            dataset=dataset,
+            params=params,
+            runs=experiment_runs,
+            task_summary=task_summary,
+            **_asdict(experiment),
+        )
+    print("🧠 Evaluation started.")
+    examples = ran_experiment.dataset.examples
+    if dry_run:
+        if not _is_dry_run(ran_experiment):
+            dataset = ran_experiment.dataset
+            examples = {
+                (ex := dataset[i]).id: ex
+                for i in pd.Series(range(len(dataset)))
+                .sample(min(len(dataset), int(dry_run)), random_state=42)
+                .sort_values()
+            }
+            dataset = replace(ran_experiment.dataset, examples=examples)
+            ran_experiment = _replace(ran_experiment, id=DRY_RUN, dataset=dataset)
+        id_selection = "\n".join(examples)
+        print(f"🌵️ This is a dry-run for these example IDs:\n{id_selection}")
     # not all dataset examples have associated experiment runs, so we need to pair them up
     example_run_pairs = []
- …
-    for exp_run in …
-        example = …
+    examples = ran_experiment.dataset.examples
+    for exp_run in ran_experiment.runs.values():
+        example = examples.get(exp_run.dataset_example_id)
         if example:
             example_run_pairs.append((deepcopy(example), exp_run))
     evaluation_input = [
@@ -380,13 +404,7 @@ def _evaluate_experiment(
         for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
     ]

- …
-    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
-    tracer_provider = trace_sdk.TracerProvider(resource=resource)
-    tracer_provider.add_span_processor(
-        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
-    )
-    tracer = tracer_provider.get_tracer(__name__)
+    tracer, resource = _get_tracer(None if dry_run else "evaluators")
     root_span_kind = EVALUATOR

     def sync_evaluate_run(
@@ -404,7 +422,7 @@ def _evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = evaluator.evaluate(
-                    output= …
+                    output=experiment_run.task_output,
                     expected=example.output,
                     input=example.input,
                     metadata=example.metadata,
@@ -418,8 +436,8 @@ def _evaluate_experiment(
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)

- …
-            experiment_run_id= …
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
             name=evaluator.name,
@@ -428,9 +446,11 @@ def _evaluate_experiment(
             result=result,
             trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
         )
[… 3 removed lines not shown …]
+        if not dry_run:
+            resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run

     async def async_evaluate_run(
         obj: Tuple[Example, ExperimentRun, Evaluator],
@@ -447,7 +467,7 @@ def _evaluate_experiment(
             stack.enter_context(capture_spans(resource))
             try:
                 result = await evaluator.async_evaluate(
-                    output= …
+                    output=experiment_run.task_output,
                     expected=example.output,
                     input=example.input,
                     metadata=example.metadata,
@@ -461,8 +481,8 @@ def _evaluate_experiment(
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)

- …
-            experiment_run_id= …
+        eval_run = ExperimentEvaluationRun(
+            experiment_run_id=experiment_run.id,
             start_time=_decode_unix_nano(cast(int, span.start_time)),
             end_time=_decode_unix_nano(cast(int, span.end_time)),
             name=evaluator.name,
@@ -471,22 +491,47 @@ def _evaluate_experiment(
             result=result,
             trace_id=_str_trace_id(span.get_span_context().trace_id), # type: ignore[no-untyped-call]
         )
- …
-            "/v1/experiment_evaluations", json=jsonify( …
- …
- …
-        return …
+        if not dry_run:
+            resp = await async_client.post("/v1/experiment_evaluations", json=jsonify(eval_run))
+            resp.raise_for_status()
+            eval_run = replace(eval_run, id=resp.json()["data"]["id"])
+        return eval_run
+
+    _errors: Tuple[Type[BaseException], ...]
+    if not hasattr(rate_limit_errors, "__iter__"):
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
+    else:
+        rate_limit_errors = cast(Sequence[Type[BaseException]], rate_limit_errors)
+        _errors = tuple(filter(None, rate_limit_errors))
+    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in _errors]
+
+    rate_limited_sync_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_evaluate_run
+    )
+    rate_limited_async_evaluate_run = functools.reduce(
+        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_evaluate_run
+    )

     executor = get_executor_on_sync_context(
- …
- …
+        rate_limited_sync_evaluate_run,
+        rate_limited_async_evaluate_run,
         max_retries=0,
         exit_on_error=False,
         fallback_return_value=None,
         tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
     )
- …
- …
+    eval_runs, _execution_details = executor.run(evaluation_input)
+    eval_summary = EvaluationSummary.from_eval_runs(
+        EvaluationParameters(
+            eval_names=frozenset(evaluators_by_name),
+            exp_params=ran_experiment.params,
+        ),
+        *eval_runs,
+    )
+    ran_experiment = ran_experiment.add(eval_summary, *eval_runs)
+    if print_summary:
+        print(ran_experiment)
+    return ran_experiment


 def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
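The hunks above complete the new evaluate_experiment: it accepts either the in-memory RanExperiment returned by run_experiment or a plain Experiment, in which case the dataset examples and prior runs are re-fetched from the server, and it forces dry_run on when the experiment itself was a dry run. A hedged sketch of attaching an extra evaluation after the fact, continuing the earlier example; the evaluator name and signature are illustrative only:

def non_empty_output(output, expected):
    # Illustrative evaluator; returns a score in [0, 1].
    return 1.0 if output else 0.0

ran = evaluate_experiment(
    ran,  # RanExperiment from run_experiment, or a plain Experiment
    evaluators=[non_empty_output],
    print_summary=True,  # prints the updated summary, now including eval results
)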
@@ -519,6 +564,18 @@ def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
     return evaluators_by_name


+def _get_tracer(project_name: Optional[str] = None) -> Tuple[Tracer, Resource]:
+    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+    tracer_provider = trace_sdk.TracerProvider(resource=resource)
+    span_processor = (
+        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{get_base_url()}", "v1/traces")))
+        if project_name
+        else _NoOpProcessor()
+    )
+    tracer_provider.add_span_processor(span_processor)
+    return tracer_provider.get_tracer(__name__), resource
+
+
 def _str_trace_id(id_: int) -> str:
     return hexlify(id_.to_bytes(16, "big")).decode()

@@ -539,6 +596,15 @@ def _get_task_name(task: ExperimentTask) -> str:
     return str(task)


+def _is_dry_run(obj: Any) -> bool:
+    return hasattr(obj, "id") and isinstance(obj.id, str) and obj.id.startswith(DRY_RUN)
+
+
+class _NoOpProcessor(trace_sdk.SpanProcessor):
+    def force_flush(self, *_: Any) -> bool:
+        return True
+
+
 INPUT_VALUE = SpanAttributes.INPUT_VALUE
 OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
 INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE