arize-phoenix 4.4.4rc4__py3-none-any.whl → 4.4.4rc6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of arize-phoenix might be problematic.
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/METADATA +12 -6
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/RECORD +47 -42
- phoenix/config.py +21 -0
- phoenix/datetime_utils.py +4 -0
- phoenix/db/insertion/dataset.py +19 -16
- phoenix/db/insertion/evaluation.py +4 -4
- phoenix/db/insertion/helpers.py +4 -12
- phoenix/db/insertion/span.py +3 -3
- phoenix/db/migrations/versions/10460e46d750_datasets.py +2 -2
- phoenix/db/models.py +8 -3
- phoenix/experiments/__init__.py +6 -0
- phoenix/experiments/evaluators/__init__.py +29 -0
- phoenix/experiments/evaluators/base.py +153 -0
- phoenix/{datasets → experiments}/evaluators/code_evaluators.py +25 -53
- phoenix/{datasets → experiments}/evaluators/llm_evaluators.py +62 -31
- phoenix/experiments/evaluators/utils.py +189 -0
- phoenix/experiments/functions.py +616 -0
- phoenix/{datasets → experiments}/tracing.py +19 -0
- phoenix/experiments/types.py +722 -0
- phoenix/experiments/utils.py +9 -0
- phoenix/server/api/context.py +4 -0
- phoenix/server/api/dataloaders/__init__.py +4 -0
- phoenix/server/api/dataloaders/average_experiment_run_latency.py +54 -0
- phoenix/server/api/dataloaders/experiment_run_counts.py +42 -0
- phoenix/server/api/helpers/dataset_helpers.py +8 -7
- phoenix/server/api/input_types/ClearProjectInput.py +15 -0
- phoenix/server/api/mutations/project_mutations.py +9 -4
- phoenix/server/api/routers/v1/__init__.py +1 -1
- phoenix/server/api/routers/v1/dataset_examples.py +10 -10
- phoenix/server/api/routers/v1/datasets.py +152 -48
- phoenix/server/api/routers/v1/evaluations.py +4 -11
- phoenix/server/api/routers/v1/experiment_evaluations.py +23 -23
- phoenix/server/api/routers/v1/experiment_runs.py +5 -17
- phoenix/server/api/routers/v1/experiments.py +5 -5
- phoenix/server/api/routers/v1/spans.py +6 -4
- phoenix/server/api/types/Experiment.py +12 -0
- phoenix/server/api/types/ExperimentRun.py +1 -1
- phoenix/server/api/types/ExperimentRunAnnotation.py +1 -1
- phoenix/server/app.py +4 -0
- phoenix/server/static/index.js +712 -588
- phoenix/session/client.py +321 -28
- phoenix/trace/fixtures.py +6 -6
- phoenix/utilities/json.py +8 -8
- phoenix/version.py +1 -1
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators/__init__.py +0 -18
- phoenix/datasets/evaluators/_utils.py +0 -13
- phoenix/datasets/experiments.py +0 -485
- phoenix/datasets/types.py +0 -212
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc4.dist-info → arize_phoenix-4.4.4rc6.dist-info}/licenses/LICENSE +0 -0
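The change most likely to affect downstream code is the move of the experiments tooling out of phoenix.datasets: the phoenix/datasets modules are deleted (shown below) and their contents reappear under phoenix/experiments, with a new functions.py, an expanded evaluators package, and a much larger types.py. A minimal migration sketch, assuming the moved modules keep exporting the same public names (the re-exports from phoenix/experiments/__init__.py are an assumption based on the file list above, not confirmed by this diff):

# Before (4.4.4rc4):
# from phoenix.datasets.experiments import run_experiment, evaluate_experiment
# from phoenix.datasets.types import Example, EvaluationResult

# After (4.4.4rc6), assuming like-for-like re-exports:
from phoenix.experiments import run_experiment, evaluate_experiment
from phoenix.experiments.types import Example, EvaluationResult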
phoenix/datasets/experiments.py
DELETED
@@ -1,485 +0,0 @@
import functools
import json
from binascii import hexlify
from contextlib import ExitStack
from copy import deepcopy
from datetime import datetime, timezone
from itertools import product
from typing import (
    Any,
    Awaitable,
    Callable,
    Coroutine,
    Iterable,
    Mapping,
    Optional,
    Tuple,
    Type,
    Union,
    cast,
)
from urllib.parse import urljoin

import httpx
import opentelemetry.sdk.trace as trace_sdk
from openinference.semconv.resource import ResourceAttributes
from openinference.semconv.trace import (
    OpenInferenceMimeTypeValues,
    OpenInferenceSpanKindValues,
    SpanAttributes,
)
from opentelemetry.context import Context
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import Span
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.trace import Status, StatusCode
from typing_extensions import TypeAlias

from phoenix.config import (
    get_env_client_headers,
    get_env_collector_endpoint,
    get_env_host,
    get_env_port,
)
from phoenix.datasets.tracing import capture_spans
from phoenix.datasets.types import (
    CanAsyncEvaluate,
    CanEvaluate,
    Dataset,
    EvaluationResult,
    Example,
    Experiment,
    ExperimentEvaluationRun,
    ExperimentEvaluator,
    ExperimentResult,
    ExperimentRun,
    ExperimentRunId,
    JSONSerializable,
    TestCase,
)
from phoenix.evals.executors import get_executor_on_sync_context
from phoenix.evals.models.rate_limiters import RateLimiter
from phoenix.evals.utils import get_tqdm_progress_bar_formatter
from phoenix.session.session import active_session
from phoenix.trace.attributes import flatten
from phoenix.utilities.json import jsonify

ExperimentTask: TypeAlias = Union[
    Callable[[Example], JSONSerializable],
    Callable[[Example], Coroutine[None, None, JSONSerializable]],
]


def _get_base_url() -> str:
    host = get_env_host()
    if host == "0.0.0.0":
        host = "127.0.0.1"
    base_url = get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
    return base_url if base_url.endswith("/") else base_url + "/"


def _get_web_base_url() -> str:
    """Return the web UI base URL.

    Returns:
        str: the web UI base URL
    """
    if session := active_session():
        return session.url
    return _get_base_url()


def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
    return f"{_get_web_base_url()}datasets/{dataset_id}/compare?experimentId={experiment_id}"


def _get_dataset_experiments_url(*, dataset_id: str) -> str:
    return f"{_get_web_base_url()}datasets/{dataset_id}/experiments"


def _phoenix_client() -> httpx.Client:
    headers = get_env_client_headers()
    client = httpx.Client(base_url=_get_base_url(), headers=headers)
    return client


def run_experiment(
    dataset: Dataset,
    task: ExperimentTask,
    *,
    experiment_name: Optional[str] = None,
    experiment_description: Optional[str] = None,
    experiment_metadata: Optional[Mapping[str, Any]] = None,
    evaluators: Optional[Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]]] = None,
    rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
) -> Experiment:
    # Add this to the params once supported in the UI
    repetitions = 1
    assert repetitions > 0, "Must run the experiment at least once."

    client = _phoenix_client()

    experiment_response = client.post(
        f"/v1/datasets/{dataset.id}/experiments",
        json={
            "version-id": dataset.version_id,
            "name": experiment_name,
            "description": experiment_description,
            "metadata": experiment_metadata,
            "repetitions": repetitions,
        },
    )
    experiment_response.raise_for_status()
    exp_json = experiment_response.json()
    experiment_id = exp_json["id"]
    project_name = exp_json["project_name"]

    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_name = f"Task: {task.__qualname__}"
    root_span_kind = CHAIN.value

    dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
    experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
    print("🧪 Experiment started.")
    print(f"📺 View dataset experiments: {dataset_experiments_url}")
    print(f"🔗 View this experiment: {experiment_compare_url}")

    errors: Tuple[Optional[Type[BaseException]], ...]
    if not hasattr(rate_limit_errors, "__iter__"):
        errors = (rate_limit_errors,)
    else:
        rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
        errors = rate_limit_errors

    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors]

    def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    raise RuntimeError("Task is async but running in sync context")
                else:
                    output = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        return experiment_run

    async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    output = await _output
                else:
                    output = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        return experiment_run

    rate_limited_sync_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
    )
    rate_limited_async_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
    )

    executor = get_executor_on_sync_context(
        rate_limited_sync_run_experiment,
        rate_limited_async_run_experiment,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
    )

    test_cases = [
        TestCase(example=ex, repetition_number=rep)
        for ex, rep in product(dataset.examples, range(1, repetitions + 1))
    ]
    experiment_payloads, _execution_details = executor.run(test_cases)
    for payload in experiment_payloads:
        if payload is not None:
            resp = client.post(f"/v1/experiments/{experiment_id}/runs", json=jsonify(payload))
            resp.raise_for_status()

    experiment = Experiment(
        id=experiment_id,
        dataset_id=dataset.id,
        dataset_version_id=dataset.version_id,
        project_name=project_name,
    )

    print("✅ Task runs completed.")
    print("🧠 Evaluation started.")

    if evaluators is not None:
        _evaluate_experiment(experiment, evaluators, dataset.examples, client)

    return experiment


def evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
) -> None:
    client = _phoenix_client()
    dataset_id = experiment.dataset_id
    dataset_version_id = experiment.dataset_version_id

    dataset_examples = [
        Example.from_dict(ex)
        for ex in (
            client.get(
                f"/v1/datasets/{dataset_id}/examples",
                params={"version-id": str(dataset_version_id)},
            )
            .json()
            .get("data", {})
            .get("examples", [])
        )
    ]
    _evaluate_experiment(experiment, evaluators, dataset_examples, client)


ExperimentEvaluatorName: TypeAlias = str


def _evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
    dataset_examples: Iterable[Example],
    client: httpx.Client,
) -> None:
    if isinstance(evaluators, (CanEvaluate, CanAsyncEvaluate)):
        evaluators = [evaluators]

    experiment_id = experiment.id

    experiment_runs = [
        ExperimentRun.from_dict(exp_run)
        for exp_run in client.get(f"/v1/experiments/{experiment_id}/runs").json()
    ]

    # not all dataset examples have associated experiment runs, so we need to pair them up
    example_run_pairs = []
    examples_by_id = {example.id: example for example in dataset_examples}
    for exp_run in experiment_runs:
        example = examples_by_id.get(exp_run.dataset_example_id)
        if example:
            example_run_pairs.append((deepcopy(example), exp_run))
    evaluation_inputs = [
        (example, run, evaluator.name, evaluator)
        for (example, run), evaluator in product(example_run_pairs, evaluators)
    ]

    project_name = "evaluators"
    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_kind = "EVALUATOR"

    def sync_evaluate_run(
        obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
    ) -> ExperimentEvaluationRun:
        example, experiment_run, name, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                if not isinstance(evaluator, CanEvaluate):
                    raise RuntimeError("Task is async but running in sync context")
                _output = evaluator.evaluate(example, experiment_run)
                if isinstance(_output, Awaitable):
                    raise RuntimeError("Task is async but running in sync context")
                result = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.annotator_kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        return evaluator_payload

    async def async_evaluate_run(
        obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
    ) -> ExperimentEvaluationRun:
        example, experiment_run, name, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                if isinstance(evaluator, CanAsyncEvaluate):
                    result = await evaluator.async_evaluate(example, experiment_run)
                else:
                    _output = evaluator.evaluate(example, experiment_run)
                    if isinstance(_output, Awaitable):
                        result = await _output
                    else:
                        result = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.annotator_kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        return evaluator_payload

    executor = get_executor_on_sync_context(
        sync_evaluate_run,
        async_evaluate_run,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
    )
    evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
    for payload in evaluation_payloads:
        if payload is not None:
            resp = client.post("/v1/experiment_evaluations", json=jsonify(payload))
            resp.raise_for_status()


def _str_trace_id(id_: int) -> str:
    return hexlify(id_.to_bytes(16, "big")).decode()


def _decode_unix_nano(time_unix_nano: int) -> datetime:
    return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)


INPUT_VALUE = SpanAttributes.INPUT_VALUE
OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND

CHAIN = OpenInferenceSpanKindValues.CHAIN
JSON = OpenInferenceMimeTypeValues.JSON
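For orientation, here is a minimal usage sketch of the run_experiment entry point deleted above, inferred solely from its signature; the task and experiment names are illustrative, and the Dataset would normally be fetched from a running Phoenix server:

from phoenix.datasets.types import Dataset, Example, JSONSerializable

def echo_task(example: Example) -> JSONSerializable:
    # Tasks take a single positional Example and must return a
    # JSON-serializable value (run_experiment asserts this after the run).
    return {"echo": dict(example.input)}

# dataset: Dataset = ...  # typically retrieved via the Phoenix client
# experiment = run_experiment(
#     dataset,
#     echo_task,
#     experiment_name="echo-baseline",  # hypothetical name
# )
# evaluate_experiment(experiment, evaluators=[...])  # optional follow-up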
phoenix/datasets/types.py
DELETED
@@ -1,212 +0,0 @@
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime
from types import MappingProxyType
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    Mapping,
    Optional,
    Protocol,
    Sequence,
    Union,
    runtime_checkable,
)

from typing_extensions import TypeAlias

JSONSerializable: TypeAlias = Optional[Union[Dict[str, Any], List[Any], str, int, float, bool]]

ExperimentId: TypeAlias = str
DatasetId: TypeAlias = str
DatasetVersionId: TypeAlias = str
ExampleId: TypeAlias = str
RepetitionNumber: TypeAlias = int
ExperimentRunId: TypeAlias = str
TraceId: TypeAlias = str


@dataclass(frozen=True)
class Example:
    id: ExampleId
    updated_at: datetime
    input: Mapping[str, JSONSerializable]
    output: Mapping[str, JSONSerializable]
    metadata: Mapping[str, JSONSerializable] = field(default_factory=lambda: MappingProxyType({}))

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> Example:
        return cls(
            input=obj["input"],
            output=obj["output"],
            metadata=obj.get("metadata") or {},
            id=obj["id"],
            updated_at=obj["updated_at"],
        )


@dataclass(frozen=True)
class Dataset:
    id: DatasetId
    version_id: DatasetVersionId
    examples: Sequence[Example]


@dataclass(frozen=True)
class TestCase:
    example: Example
    repetition_number: RepetitionNumber


@dataclass(frozen=True)
class Experiment:
    id: ExperimentId
    dataset_id: DatasetId
    dataset_version_id: DatasetVersionId
    project_name: Optional[str] = None


@dataclass(frozen=True)
class ExperimentResult:
    result: JSONSerializable

    @classmethod
    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[ExperimentResult]:
        if not obj:
            return None
        return cls(result=obj["result"])


@dataclass(frozen=True)
class ExperimentRun:
    start_time: datetime
    end_time: datetime
    experiment_id: ExperimentId
    dataset_example_id: ExampleId
    repetition_number: RepetitionNumber
    output: Optional[ExperimentResult] = None
    error: Optional[str] = None
    id: Optional[ExperimentRunId] = None
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentRun:
        return cls(
            start_time=obj["start_time"],
            end_time=obj["end_time"],
            experiment_id=obj["experiment_id"],
            dataset_example_id=obj["dataset_example_id"],
            repetition_number=obj.get("repetition_number") or 1,
            output=ExperimentResult.from_dict(obj["output"]),
            error=obj.get("error"),
            id=obj.get("id"),
            trace_id=obj.get("trace_id"),
        )

    def __post_init__(self) -> None:
        if bool(self.output) == bool(self.error):
            ValueError("Must specify either result or error")


@dataclass(frozen=True)
class EvaluationResult:
    score: Optional[float] = None
    label: Optional[str] = None
    explanation: Optional[str] = None
    metadata: Mapping[str, JSONSerializable] = field(default_factory=lambda: MappingProxyType({}))

    @classmethod
    def from_dict(cls, obj: Optional[Mapping[str, Any]]) -> Optional[EvaluationResult]:
        if not obj:
            return None
        return cls(
            score=obj.get("score"),
            label=obj.get("label"),
            explanation=obj.get("explanation"),
            metadata=obj.get("metadata") or {},
        )

    def __post_init__(self) -> None:
        if self.score is None and not self.label and not self.explanation:
            ValueError("Must specify one of score, label, or explanation")


@dataclass(frozen=True)
class ExperimentEvaluationRun:
    experiment_run_id: ExperimentRunId
    start_time: datetime
    end_time: datetime
    name: str
    annotator_kind: str
    error: Optional[str] = None
    result: Optional[EvaluationResult] = None
    id: Optional[str] = None
    trace_id: Optional[TraceId] = None

    @classmethod
    def from_dict(cls, obj: Mapping[str, Any]) -> ExperimentEvaluationRun:
        return cls(
            experiment_run_id=obj["experiment_run_id"],
            start_time=obj["start_time"],
            end_time=obj["end_time"],
            name=obj["name"],
            annotator_kind=obj["annotator_kind"],
            error=obj.get("error"),
            result=EvaluationResult.from_dict(obj.get("result")),
            id=obj.get("id"),
            trace_id=obj.get("trace_id"),
        )

    def __post_init__(self) -> None:
        if bool(self.result) == bool(self.error):
            ValueError("Must specify either result or error")


class _HasName(Protocol):
    name: str


class _HasKind(Protocol):
    @property
    def annotator_kind(self) -> str: ...


@runtime_checkable
class CanEvaluate(_HasName, _HasKind, Protocol):
    def evaluate(
        self,
        example: Example,
        experiment_run: ExperimentRun,
    ) -> EvaluationResult: ...


@runtime_checkable
class CanAsyncEvaluate(_HasName, _HasKind, Protocol):
    async def async_evaluate(
        self,
        example: Example,
        experiment_run: ExperimentRun,
    ) -> EvaluationResult: ...


ExperimentEvaluator: TypeAlias = Union[CanEvaluate, CanAsyncEvaluate]


# Someday we'll do type checking in unit tests.
if TYPE_CHECKING:

    class _EvaluatorDummy:
        annotator_kind: str
        name: str

        def evaluate(self, _: Example, __: ExperimentRun) -> EvaluationResult:
            raise NotImplementedError

        async def async_evaluate(self, _: Example, __: ExperimentRun) -> EvaluationResult:
            raise NotImplementedError

    _: ExperimentEvaluator
    _ = _EvaluatorDummy()
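Two notes on the deleted module above. First, its __post_init__ validators construct ValueError without raising it, so the either-result-or-error invariants were never actually enforced; presumably the replacement phoenix/experiments/types.py addresses this. Second, because CanEvaluate and CanAsyncEvaluate are runtime-checkable structural protocols, any object with a name, an annotator_kind, and a matching evaluate method qualifies as an evaluator. A minimal sketch against the deleted types (the ExactMatch class and its scoring rule are illustrative, not part of the package):

from dataclasses import dataclass

@dataclass
class ExactMatch:
    # name and annotator_kind satisfy the _HasName/_HasKind protocols.
    name: str = "exact_match"
    annotator_kind: str = "CODE"

    def evaluate(self, example: Example, experiment_run: ExperimentRun) -> EvaluationResult:
        # Score 1.0 when the task output equals the example's expected output.
        actual = experiment_run.output.result if experiment_run.output else None
        matched = actual == dict(example.output)
        return EvaluationResult(score=float(matched), label="match" if matched else "mismatch")

assert isinstance(ExactMatch(), CanEvaluate)  # holds via @runtime_checkable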