arize-phoenix 4.4.2__py3-none-any.whl → 4.4.4rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/METADATA +12 -11
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/RECORD +110 -57
- phoenix/__init__.py +0 -27
- phoenix/config.py +21 -7
- phoenix/core/model.py +25 -25
- phoenix/core/model_schema.py +66 -64
- phoenix/core/model_schema_adapter.py +27 -25
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators.py +275 -0
- phoenix/datasets/experiments.py +469 -0
- phoenix/datasets/tracing.py +66 -0
- phoenix/datasets/types.py +212 -0
- phoenix/db/bulk_inserter.py +54 -14
- phoenix/db/insertion/dataset.py +234 -0
- phoenix/db/insertion/evaluation.py +6 -6
- phoenix/db/insertion/helpers.py +13 -2
- phoenix/db/migrations/types.py +29 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
- phoenix/db/models.py +230 -3
- phoenix/inferences/fixtures.py +23 -23
- phoenix/inferences/inferences.py +7 -7
- phoenix/inferences/validation.py +1 -1
- phoenix/metrics/binning.py +2 -2
- phoenix/server/api/context.py +16 -0
- phoenix/server/api/dataloaders/__init__.py +16 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -3
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
- phoenix/server/api/helpers/dataset_helpers.py +178 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/mutations/__init__.py +13 -0
- phoenix/server/api/mutations/auth.py +11 -0
- phoenix/server/api/mutations/dataset_mutations.py +520 -0
- phoenix/server/api/mutations/experiment_mutations.py +65 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
- phoenix/server/api/mutations/project_mutations.py +42 -0
- phoenix/server/api/queries.py +503 -0
- phoenix/server/api/routers/v1/__init__.py +77 -2
- phoenix/server/api/routers/v1/dataset_examples.py +178 -0
- phoenix/server/api/routers/v1/datasets.py +861 -0
- phoenix/server/api/routers/v1/evaluations.py +4 -2
- phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
- phoenix/server/api/routers/v1/experiment_runs.py +108 -0
- phoenix/server/api/routers/v1/experiments.py +174 -0
- phoenix/server/api/routers/v1/spans.py +3 -1
- phoenix/server/api/routers/v1/traces.py +1 -4
- phoenix/server/api/schema.py +2 -303
- phoenix/server/api/types/AnnotatorKind.py +10 -0
- phoenix/server/api/types/Cluster.py +19 -19
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/Dataset.py +282 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +30 -29
- phoenix/server/api/types/EmbeddingDimension.py +40 -34
- phoenix/server/api/types/Event.py +16 -16
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +135 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +19 -0
- phoenix/server/api/types/ExperimentRun.py +91 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/Model.py +43 -42
- phoenix/server/api/types/Project.py +26 -12
- phoenix/server/api/types/Segments.py +1 -1
- phoenix/server/api/types/Span.py +78 -2
- phoenix/server/api/types/TimeSeries.py +6 -6
- phoenix/server/api/types/Trace.py +15 -4
- phoenix/server/api/types/UMAPPoints.py +1 -1
- phoenix/server/api/types/node.py +5 -111
- phoenix/server/api/types/pagination.py +10 -52
- phoenix/server/app.py +99 -49
- phoenix/server/main.py +49 -27
- phoenix/server/openapi/docs.py +3 -0
- phoenix/server/static/index.js +2246 -1368
- phoenix/server/templates/index.html +1 -0
- phoenix/services.py +15 -15
- phoenix/session/client.py +316 -21
- phoenix/session/session.py +47 -37
- phoenix/trace/exporter.py +14 -9
- phoenix/trace/fixtures.py +133 -7
- phoenix/trace/span_evaluations.py +3 -3
- phoenix/trace/trace_dataset.py +6 -6
- phoenix/utilities/json.py +61 -0
- phoenix/utilities/re.py +50 -0
- phoenix/version.py +1 -1
- phoenix/server/api/types/DatasetRole.py +0 -23
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/WHEEL +0 -0
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.2.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import json
|
|
3
|
+
from binascii import hexlify
|
|
4
|
+
from contextlib import ExitStack
|
|
5
|
+
from copy import deepcopy
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from itertools import product
|
|
8
|
+
from typing import (
|
|
9
|
+
Any,
|
|
10
|
+
Awaitable,
|
|
11
|
+
Callable,
|
|
12
|
+
Coroutine,
|
|
13
|
+
Iterable,
|
|
14
|
+
Mapping,
|
|
15
|
+
Optional,
|
|
16
|
+
Tuple,
|
|
17
|
+
Type,
|
|
18
|
+
Union,
|
|
19
|
+
cast,
|
|
20
|
+
)
|
|
21
|
+
from urllib.parse import urljoin
|
|
22
|
+
|
|
23
|
+
import httpx
|
|
24
|
+
import opentelemetry.sdk.trace as trace_sdk
|
|
25
|
+
from openinference.semconv.resource import ResourceAttributes
|
|
26
|
+
from openinference.semconv.trace import (
|
|
27
|
+
OpenInferenceMimeTypeValues,
|
|
28
|
+
OpenInferenceSpanKindValues,
|
|
29
|
+
SpanAttributes,
|
|
30
|
+
)
|
|
31
|
+
from opentelemetry.context import Context
|
|
32
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
33
|
+
from opentelemetry.sdk.resources import Resource
|
|
34
|
+
from opentelemetry.sdk.trace import Span
|
|
35
|
+
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
|
|
36
|
+
from opentelemetry.trace import Status, StatusCode
|
|
37
|
+
from typing_extensions import TypeAlias
|
|
38
|
+
|
|
39
|
+
from phoenix.config import (
|
|
40
|
+
get_env_client_headers,
|
|
41
|
+
get_env_collector_endpoint,
|
|
42
|
+
get_env_host,
|
|
43
|
+
get_env_port,
|
|
44
|
+
)
|
|
45
|
+
from phoenix.datasets.tracing import capture_spans
|
|
46
|
+
from phoenix.datasets.types import (
|
|
47
|
+
CanAsyncEvaluate,
|
|
48
|
+
CanEvaluate,
|
|
49
|
+
Dataset,
|
|
50
|
+
EvaluationResult,
|
|
51
|
+
Example,
|
|
52
|
+
Experiment,
|
|
53
|
+
ExperimentEvaluationRun,
|
|
54
|
+
ExperimentEvaluator,
|
|
55
|
+
ExperimentResult,
|
|
56
|
+
ExperimentRun,
|
|
57
|
+
ExperimentRunId,
|
|
58
|
+
JSONSerializable,
|
|
59
|
+
TestCase,
|
|
60
|
+
)
|
|
61
|
+
from phoenix.evals.executors import get_executor_on_sync_context
|
|
62
|
+
from phoenix.evals.models.rate_limiters import RateLimiter
|
|
63
|
+
from phoenix.evals.utils import get_tqdm_progress_bar_formatter
|
|
64
|
+
from phoenix.trace.attributes import flatten
|
|
65
|
+
from phoenix.utilities.json import jsonify
|
|
66
|
+
|
|
67
|
+
# An experiment task takes a single dataset example and produces a
# JSON-serializable output, either directly or through a coroutine.
ExperimentTask: TypeAlias = Union[
    Callable[[Example], JSONSerializable],
    Callable[[Example], Coroutine[None, None, JSONSerializable]],
]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _get_base_url() -> str:
    """
    Return the base URL of the Phoenix server, always ending with a slash.

    The collector endpoint from the environment wins when it is set. Otherwise
    the URL is assembled from the configured host and port, with the wildcard
    address ``0.0.0.0`` swapped for the loopback address ``127.0.0.1``.
    """
    configured_host = get_env_host()
    host = "127.0.0.1" if configured_host == "0.0.0.0" else configured_host
    endpoint = get_env_collector_endpoint()
    if not endpoint:
        endpoint = f"http://{host}:{get_env_port()}"
    if endpoint.endswith("/"):
        return endpoint
    return endpoint + "/"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
    """Return the URL of the comparison page for one experiment of a dataset."""
    base_url = _get_base_url()
    return f"{base_url}datasets/{dataset_id}/compare?experimentId={experiment_id}"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _get_dataset_experiments_url(*, dataset_id: str) -> str:
    """Return the URL of the page listing every experiment of a dataset."""
    base_url = _get_base_url()
    return f"{base_url}datasets/{dataset_id}/experiments"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _phoenix_client() -> httpx.Client:
    """
    Build an HTTP client rooted at the Phoenix base URL.

    The headers configured through the environment are attached to the client.
    """
    # Keyword order keeps the original evaluation order: headers first, URL second.
    return httpx.Client(headers=get_env_client_headers(), base_url=_get_base_url())
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def run_experiment(
    dataset: Dataset,
    task: ExperimentTask,
    *,
    experiment_name: Optional[str] = None,
    experiment_description: Optional[str] = None,
    experiment_metadata: Optional[Mapping[str, Any]] = None,
    evaluators: Optional[Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]]] = None,
    rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
) -> Experiment:
    """
    Run ``task`` on every example of ``dataset`` and record the runs as an experiment.

    The experiment is first created on the Phoenix server. Each example is then
    passed positionally to ``task`` inside a root span that records the input,
    the output and any exception the task raises. The resulting runs are posted
    to the server and, when ``evaluators`` is given, evaluated before returning.

    Args:
        dataset: Dataset whose examples are fed to the task.
        task: Sync or async callable invoked with a single example.
        experiment_name: Optional name sent when creating the experiment.
        experiment_description: Optional description sent when creating the experiment.
        experiment_metadata: Optional metadata sent when creating the experiment.
        evaluators: A single evaluator or an iterable of evaluators to apply to
            the runs once the task has finished.
        rate_limit_errors: An exception type, or an iterable of exception types;
            one ``RateLimiter`` is created per entry and wrapped around the runner.

    Returns:
        An ``Experiment`` carrying the ids of the experiment, the dataset and the
        dataset version, plus the project name reported by the server.

    Raises:
        httpx.HTTPStatusError: If creating the experiment or posting a run fails.
    """
    # Add this to the params once supported in the UI
    repetitions = 1
    assert repetitions > 0, "Must run the experiment at least once."

    # The client is used as a context manager so it is closed on every exit path.
    with _phoenix_client() as client:
        experiment_response = client.post(
            f"/v1/datasets/{dataset.id}/experiments",
            json={
                "version-id": dataset.version_id,
                "name": experiment_name,
                "description": experiment_description,
                "metadata": experiment_metadata,
                "repetitions": repetitions,
            },
        )
        experiment_response.raise_for_status()
        exp_json = experiment_response.json()
        experiment_id = exp_json["id"]
        project_name = exp_json["project_name"]

        resource = Resource(
            {ResourceAttributes.PROJECT_NAME: project_name} if project_name else {}
        )
        tracer_provider = trace_sdk.TracerProvider(resource=resource)
        tracer_provider.add_span_processor(
            SimpleSpanProcessor(OTLPSpanExporter(urljoin(_get_base_url(), "v1/traces")))
        )
        tracer = tracer_provider.get_tracer(__name__)

        # functools.partial objects and callable instances have no __qualname__,
        # so fall back to the wrapped function or to the callable's type.
        named_task = task.func if isinstance(task, functools.partial) else task
        task_name = getattr(named_task, "__qualname__", None) or type(named_task).__qualname__
        root_span_name = f"Task: {task_name}"
        root_span_kind = CHAIN.value

        dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
        experiment_compare_url = _get_experiment_url(
            dataset_id=dataset.id, experiment_id=experiment_id
        )
        print(f"🧪 Experiment started: {experiment_compare_url}")

        # A single error type (or None) becomes a one-element tuple; anything
        # iterable is used as given.
        errors: Tuple[Optional[Type[BaseException]], ...]
        if not hasattr(rate_limit_errors, "__iter__"):
            errors = (rate_limit_errors,)
        else:
            rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
            errors = rate_limit_errors

        rate_limiters = [
            RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors
        ]

        def _finalize_span(
            span: Span,
            example: Example,
            output: JSONSerializable,
            status: Status,
        ) -> Optional[ExperimentResult]:
            # Record input, output, span kind and status; must run before the span ends.
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            result = ExperimentResult(result=output) if output is not None else None
            if result:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)
            return result

        def _build_run(
            span: Span,
            test_case: TestCase,
            output: JSONSerializable,
            result: Optional[ExperimentResult],
            error: Optional[BaseException],
        ) -> ExperimentRun:
            # Turn a finished span and the task outcome into the payload for the server.
            assert isinstance(
                output, (dict, list, str, int, float, bool, type(None))
            ), "Output must be JSON serializable"
            return ExperimentRun(
                start_time=_decode_unix_nano(cast(int, span.start_time)),
                end_time=_decode_unix_nano(cast(int, span.end_time)),
                experiment_id=experiment_id,
                dataset_example_id=test_case.example.id,
                repetition_number=test_case.repetition_number,
                output=result,
                error=repr(error) if error is not None else None,
                trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
            )

        def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
            example = test_case.example
            output = None
            error: Optional[BaseException] = None
            status = Status(StatusCode.OK)
            with ExitStack() as stack:
                span: Span = stack.enter_context(
                    tracer.start_as_current_span(root_span_name, context=Context())
                )
                stack.enter_context(capture_spans(resource))
                try:
                    # Do not use keyword arguments, which can fail at runtime
                    # even when function obeys protocol, because keyword arguments
                    # are implementation details.
                    _output = task(example)
                    if isinstance(_output, Awaitable):
                        raise RuntimeError("Task is async but running in sync context")
                    else:
                        output = _output
                except BaseException as exc:
                    span.record_exception(exc)
                    status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                    error = exc
                result = _finalize_span(span, example, output, status)
            # The span has ended here, so its end time is available.
            return _build_run(span, test_case, output, result, error)

        async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
            example = test_case.example
            output = None
            error: Optional[BaseException] = None
            status = Status(StatusCode.OK)
            with ExitStack() as stack:
                span: Span = stack.enter_context(
                    tracer.start_as_current_span(root_span_name, context=Context())
                )
                stack.enter_context(capture_spans(resource))
                try:
                    # Do not use keyword arguments, which can fail at runtime
                    # even when function obeys protocol, because keyword arguments
                    # are implementation details.
                    _output = task(example)
                    if isinstance(_output, Awaitable):
                        output = await _output
                    else:
                        output = _output
                except BaseException as exc:
                    span.record_exception(exc)
                    status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                    error = exc
                result = _finalize_span(span, example, output, status)
            # The span has ended here, so its end time is available.
            return _build_run(span, test_case, output, result, error)

        rate_limited_sync_run_experiment = functools.reduce(
            lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
        )
        rate_limited_async_run_experiment = functools.reduce(
            lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
        )

        executor = get_executor_on_sync_context(
            rate_limited_sync_run_experiment,
            rate_limited_async_run_experiment,
            max_retries=0,
            exit_on_error=False,
            fallback_return_value=None,
            tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
        )

        test_cases = [
            TestCase(example=ex, repetition_number=rep)
            for ex, rep in product(dataset.examples, range(1, repetitions + 1))
        ]
        experiment_payloads, _execution_details = executor.run(test_cases)
        for payload in experiment_payloads:
            # The executor yields the fallback value (None) for runs that failed.
            if payload is not None:
                resp = client.post(f"/v1/experiments/{experiment_id}/runs", json=jsonify(payload))
                resp.raise_for_status()

        experiment = Experiment(
            id=experiment_id,
            dataset_id=dataset.id,
            dataset_version_id=dataset.version_id,
            project_name=project_name,
        )

        print(f"✅ Task runs completed. View all experiments: {dataset_experiments_url}")

        if evaluators is not None:
            _evaluate_experiment(experiment, evaluators, dataset.examples, client)

        return experiment
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
) -> None:
    """
    Evaluate the runs of an existing experiment.

    The examples of the dataset version the experiment was run on are fetched
    from the Phoenix server and handed, together with the evaluators, to
    ``_evaluate_experiment``.

    Args:
        experiment: Experiment whose runs are evaluated.
        evaluators: A single evaluator or an iterable of evaluators.

    Raises:
        httpx.HTTPStatusError: If the server answers the examples request with
            an error status.
    """
    # The client is used as a context manager so it is closed on every exit path.
    with _phoenix_client() as client:
        response = client.get(
            f"/v1/datasets/{experiment.dataset_id}/examples",
            params={"version-id": str(experiment.dataset_version_id)},
        )
        # Without this check an error response could be read as "no examples"
        # and the evaluation would silently do nothing.
        response.raise_for_status()
        examples_json = response.json().get("data", {}).get("examples", [])
        dataset_examples = [Example.from_dict(ex) for ex in examples_json]
        _evaluate_experiment(experiment, evaluators, dataset_examples, client)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Name of an evaluator; used to label evaluation spans.
ExperimentEvaluatorName: TypeAlias = str
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
    dataset_examples: Iterable[Example],
    client: httpx.Client,
) -> None:
    """
    Apply every evaluator to every run of ``experiment`` and post the results.

    The experiment's runs are fetched from the server and paired with their
    dataset examples by example id; runs without a matching example are skipped.
    Each (example, run, evaluator) combination is evaluated inside its own root
    span, and each resulting evaluation is posted to the server.

    Args:
        experiment: Experiment whose runs are evaluated.
        evaluators: A single evaluator or an iterable of evaluators.
        dataset_examples: Examples the runs are matched against.
        client: HTTP client used to talk to the Phoenix server; the caller
            keeps ownership and is responsible for closing it.

    Raises:
        httpx.HTTPStatusError: If fetching the runs or posting an evaluation fails.
    """
    if isinstance(evaluators, (CanEvaluate, CanAsyncEvaluate)):
        evaluators = [evaluators]

    experiment_id = experiment.id

    runs_response = client.get(f"/v1/experiments/{experiment_id}/runs")
    # Surface HTTP errors here instead of failing later on an unexpected body.
    runs_response.raise_for_status()
    experiment_runs = [ExperimentRun.from_dict(exp_run) for exp_run in runs_response.json()]

    # not all dataset examples have associated experiment runs, so we need to pair them up
    example_run_pairs = []
    examples_by_id = {example.id: example for example in dataset_examples}
    for exp_run in experiment_runs:
        example = examples_by_id.get(exp_run.dataset_example_id)
        if example:
            # Each pairing gets its own copy of the example.
            example_run_pairs.append((deepcopy(example), exp_run))
    evaluation_inputs = [
        (example, run, evaluator.name, evaluator)
        for (example, run), evaluator in product(example_run_pairs, evaluators)
    ]

    resource = Resource({ResourceAttributes.PROJECT_NAME: "evaluators"})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(_get_base_url(), "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_kind = "EVALUATOR"

    def _finalize_span(
        span: Span,
        result: Optional[EvaluationResult],
        status: Status,
    ) -> None:
        # Record result, span kind and status; must run before the span ends.
        # A failed evaluator leaves ``result`` as None, in which case there is
        # nothing to flatten. NOTE(review): how ``flatten`` treats None is not
        # visible from here; skipping the call avoids depending on it.
        if result is not None:
            span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
        span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
        span.set_status(status)

    def _build_payload(
        span: Span,
        experiment_run: ExperimentRun,
        evaluator: ExperimentEvaluator,
        result: Optional[EvaluationResult],
        error: Optional[BaseException],
    ) -> ExperimentEvaluationRun:
        # Turn a finished span and the evaluator outcome into the payload for the server.
        return ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.annotator_kind,
            error=repr(error) if error is not None else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )

    def sync_evaluate_run(
        obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
    ) -> ExperimentEvaluationRun:
        example, experiment_run, name, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                if not isinstance(evaluator, CanEvaluate):
                    raise RuntimeError("Task is async but running in sync context")
                _output = evaluator.evaluate(example, experiment_run)
                if isinstance(_output, Awaitable):
                    raise RuntimeError("Task is async but running in sync context")
                result = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            _finalize_span(span, result, status)
        # The span has ended here, so its end time is available.
        return _build_payload(span, experiment_run, evaluator, result, error)

    async def async_evaluate_run(
        obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
    ) -> ExperimentEvaluationRun:
        example, experiment_run, name, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                if isinstance(evaluator, CanAsyncEvaluate):
                    result = await evaluator.async_evaluate(example, experiment_run)
                else:
                    _output = evaluator.evaluate(example, experiment_run)
                    if isinstance(_output, Awaitable):
                        result = await _output
                    else:
                        result = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            _finalize_span(span, result, status)
        # The span has ended here, so its end time is available.
        return _build_payload(span, experiment_run, evaluator, result, error)

    executor = get_executor_on_sync_context(
        sync_evaluate_run,
        async_evaluate_run,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
    )
    evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
    for payload in evaluation_payloads:
        # The executor yields the fallback value (None) for evaluations that failed.
        if payload is not None:
            resp = client.post("/v1/experiment_evaluations", json=jsonify(payload))
            resp.raise_for_status()
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _str_trace_id(id_: int) -> str:
|
|
455
|
+
return hexlify(id_.to_bytes(16, "big")).decode()
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def _decode_unix_nano(time_unix_nano: int) -> datetime:
|
|
459
|
+
return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
# Short aliases for the span attribute keys set on task and evaluation spans.
INPUT_VALUE = SpanAttributes.INPUT_VALUE
OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND

# Enum members whose ``.value`` is written to spans: the kind of a task's root
# span and the MIME type used for JSON-encoded inputs and outputs.
CHAIN = OpenInferenceSpanKindValues.CHAIN
JSON = OpenInferenceMimeTypeValues.JSON
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from contextvars import ContextVar
|
|
5
|
+
from threading import Lock
|
|
6
|
+
from typing import Any, Callable, Iterator, Optional
|
|
7
|
+
|
|
8
|
+
from opentelemetry.sdk.resources import Resource
|
|
9
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
10
|
+
from opentelemetry.trace import INVALID_TRACE_ID
|
|
11
|
+
from wrapt import apply_patch, resolve_path, wrap_function_wrapper
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SpanModifier:
    """Merges a fixed resource into the resource of spans it is asked to modify."""

    __slots__ = ("_resource",)

    def __init__(self, resource: Resource) -> None:
        self._resource = resource

    def modify_resource(self, span: ReadableSpan) -> None:
        """
        Merge this modifier's resource into ``span``'s resource.

        Spans without a context, or whose span id equals the invalid-id
        constant, are left untouched.
        """
        context = span._context
        # NOTE(review): the span id is compared against INVALID_TRACE_ID;
        # presumably INVALID_SPAN_ID was intended -- confirm.
        if context is not None and context.span_id != INVALID_TRACE_ID:
            span._resource = span._resource.merge(self._resource)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Context-local slot for the SpanModifier installed by ``capture_spans``.
# It has no default value, so readers pass an explicit fallback to ``get``.
_ACTIVE_MODIFIER: ContextVar[Optional[SpanModifier]] = ContextVar("active_modifier")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def override_span(init: Callable[..., None], span: ReadableSpan, args: Any, kwargs: Any) -> None:
    """
    Wrapper installed around the span initializer.

    Runs the wrapped ``init`` first, then lets the modifier active in the
    current context, if there is one, adjust the new span's resource.
    """
    init(*args, **kwargs)
    active_modifier = _ACTIVE_MODIFIER.get(None)
    if not isinstance(active_modifier, SpanModifier):
        return
    active_modifier.modify_resource(span)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Guards the counter below while the patch is installed or removed.
_SPAN_INIT_MONKEY_PATCH_LOCK = Lock()
# Number of ``_monkey_patch_span_init`` contexts that are currently open.
_SPAN_INIT_MONKEY_PATCH_COUNT = 0
# Location of the span initializer that gets wrapped.
_SPAN_INIT_MODULE = ReadableSpan.__init__.__module__
_SPAN_INIT_NAME = ReadableSpan.__init__.__qualname__
# Resolved once at import time so the original can be put back by ``apply_patch``.
_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL = resolve_path(
    _SPAN_INIT_MODULE, _SPAN_INIT_NAME
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@contextmanager
def _monkey_patch_span_init() -> Iterator[None]:
    """
    Keep the span initializer wrapped by ``override_span`` while the context is open.

    The patch is reference counted: the first context to open installs the
    wrapper and the last one to close restores the original initializer.
    """
    global _SPAN_INIT_MONKEY_PATCH_COUNT
    with _SPAN_INIT_MONKEY_PATCH_LOCK:
        _SPAN_INIT_MONKEY_PATCH_COUNT += 1
        if _SPAN_INIT_MONKEY_PATCH_COUNT == 1:
            wrap_function_wrapper(
                module=_SPAN_INIT_MODULE, name=_SPAN_INIT_NAME, wrapper=override_span
            )
    try:
        yield
    finally:
        # Runs even when the managed body raises; otherwise the count would
        # never return to zero and the patch would stay installed.
        with _SPAN_INIT_MONKEY_PATCH_LOCK:
            _SPAN_INIT_MONKEY_PATCH_COUNT -= 1
            if _SPAN_INIT_MONKEY_PATCH_COUNT == 0:
                apply_patch(_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@contextmanager
def capture_spans(resource: Resource) -> Iterator[SpanModifier]:
    """
    Merge ``resource`` into spans initialized while the context is open.

    A ``SpanModifier`` for ``resource`` is made the active modifier of the
    current context for the duration of the ``with`` block, during which the
    span initializer is patched to consult it.

    Args:
        resource: Resource merged into the resource of newly initialized spans.

    Yields:
        The modifier that is active inside the block.
    """
    modifier = SpanModifier(resource)
    with _monkey_patch_span_init():
        token = _ACTIVE_MODIFIER.set(modifier)
        try:
            yield modifier
        finally:
            # Restore the previous modifier even when the managed body raises.
            _ACTIVE_MODIFIER.reset(token)
|