arize-phoenix 4.4.3__py3-none-any.whl → 4.4.4rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc0.dist-info}/METADATA +4 -4
  2. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc0.dist-info}/RECORD +108 -55
  3. phoenix/__init__.py +0 -27
  4. phoenix/config.py +21 -7
  5. phoenix/core/model.py +25 -25
  6. phoenix/core/model_schema.py +64 -62
  7. phoenix/core/model_schema_adapter.py +27 -25
  8. phoenix/datasets/__init__.py +0 -0
  9. phoenix/datasets/evaluators.py +275 -0
  10. phoenix/datasets/experiments.py +469 -0
  11. phoenix/datasets/tracing.py +66 -0
  12. phoenix/datasets/types.py +212 -0
  13. phoenix/db/bulk_inserter.py +54 -14
  14. phoenix/db/insertion/dataset.py +234 -0
  15. phoenix/db/insertion/evaluation.py +6 -6
  16. phoenix/db/insertion/helpers.py +13 -2
  17. phoenix/db/migrations/types.py +29 -0
  18. phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
  19. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
  20. phoenix/db/models.py +230 -3
  21. phoenix/inferences/fixtures.py +23 -23
  22. phoenix/inferences/inferences.py +7 -7
  23. phoenix/inferences/validation.py +1 -1
  24. phoenix/server/api/context.py +16 -0
  25. phoenix/server/api/dataloaders/__init__.py +16 -0
  26. phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
  27. phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
  28. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
  29. phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
  30. phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
  31. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  32. phoenix/server/api/dataloaders/span_descendants.py +2 -3
  33. phoenix/server/api/dataloaders/span_projects.py +33 -0
  34. phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
  35. phoenix/server/api/helpers/dataset_helpers.py +178 -0
  36. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  37. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  38. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  39. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  40. phoenix/server/api/input_types/DatasetSort.py +17 -0
  41. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  42. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  43. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  44. phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
  45. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  46. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  47. phoenix/server/api/mutations/__init__.py +13 -0
  48. phoenix/server/api/mutations/auth.py +11 -0
  49. phoenix/server/api/mutations/dataset_mutations.py +520 -0
  50. phoenix/server/api/mutations/experiment_mutations.py +65 -0
  51. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
  52. phoenix/server/api/mutations/project_mutations.py +42 -0
  53. phoenix/server/api/queries.py +503 -0
  54. phoenix/server/api/routers/v1/__init__.py +77 -2
  55. phoenix/server/api/routers/v1/dataset_examples.py +178 -0
  56. phoenix/server/api/routers/v1/datasets.py +861 -0
  57. phoenix/server/api/routers/v1/evaluations.py +4 -2
  58. phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
  59. phoenix/server/api/routers/v1/experiment_runs.py +108 -0
  60. phoenix/server/api/routers/v1/experiments.py +174 -0
  61. phoenix/server/api/routers/v1/spans.py +3 -1
  62. phoenix/server/api/routers/v1/traces.py +1 -4
  63. phoenix/server/api/schema.py +2 -303
  64. phoenix/server/api/types/AnnotatorKind.py +10 -0
  65. phoenix/server/api/types/Cluster.py +19 -19
  66. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  67. phoenix/server/api/types/Dataset.py +282 -63
  68. phoenix/server/api/types/DatasetExample.py +85 -0
  69. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  70. phoenix/server/api/types/DatasetVersion.py +14 -0
  71. phoenix/server/api/types/Dimension.py +30 -29
  72. phoenix/server/api/types/EmbeddingDimension.py +40 -34
  73. phoenix/server/api/types/Event.py +16 -16
  74. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  75. phoenix/server/api/types/Experiment.py +135 -0
  76. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  77. phoenix/server/api/types/ExperimentComparison.py +19 -0
  78. phoenix/server/api/types/ExperimentRun.py +91 -0
  79. phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
  80. phoenix/server/api/types/Inferences.py +80 -0
  81. phoenix/server/api/types/InferencesRole.py +23 -0
  82. phoenix/server/api/types/Model.py +43 -42
  83. phoenix/server/api/types/Project.py +26 -12
  84. phoenix/server/api/types/Span.py +78 -2
  85. phoenix/server/api/types/TimeSeries.py +6 -6
  86. phoenix/server/api/types/Trace.py +15 -4
  87. phoenix/server/api/types/UMAPPoints.py +1 -1
  88. phoenix/server/api/types/node.py +5 -111
  89. phoenix/server/api/types/pagination.py +10 -52
  90. phoenix/server/app.py +99 -49
  91. phoenix/server/main.py +49 -27
  92. phoenix/server/openapi/docs.py +3 -0
  93. phoenix/server/static/index.js +2246 -1368
  94. phoenix/server/templates/index.html +1 -0
  95. phoenix/services.py +15 -15
  96. phoenix/session/client.py +316 -21
  97. phoenix/session/session.py +47 -37
  98. phoenix/trace/exporter.py +14 -9
  99. phoenix/trace/fixtures.py +133 -7
  100. phoenix/trace/span_evaluations.py +3 -3
  101. phoenix/trace/trace_dataset.py +6 -6
  102. phoenix/utilities/json.py +61 -0
  103. phoenix/utilities/re.py +50 -0
  104. phoenix/version.py +1 -1
  105. phoenix/server/api/types/DatasetRole.py +0 -23
  106. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc0.dist-info}/WHEEL +0 -0
  107. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/IP_NOTICE +0 -0
  108. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc0.dist-info}/licenses/LICENSE +0 -0
  109. /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/datasets/experiments.py (new file)
@@ -0,0 +1,469 @@
+ import functools
+ import json
+ from binascii import hexlify
+ from contextlib import ExitStack
+ from copy import deepcopy
+ from datetime import datetime, timezone
+ from itertools import product
+ from typing import (
+     Any,
+     Awaitable,
+     Callable,
+     Coroutine,
+     Iterable,
+     Mapping,
+     Optional,
+     Tuple,
+     Type,
+     Union,
+     cast,
+ )
+ from urllib.parse import urljoin
+
+ import httpx
+ import opentelemetry.sdk.trace as trace_sdk
+ from openinference.semconv.resource import ResourceAttributes
+ from openinference.semconv.trace import (
+     OpenInferenceMimeTypeValues,
+     OpenInferenceSpanKindValues,
+     SpanAttributes,
+ )
+ from opentelemetry.context import Context
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import Span
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+ from opentelemetry.trace import Status, StatusCode
+ from typing_extensions import TypeAlias
+
+ from phoenix.config import (
+     get_env_client_headers,
+     get_env_collector_endpoint,
+     get_env_host,
+     get_env_port,
+ )
+ from phoenix.datasets.tracing import capture_spans
+ from phoenix.datasets.types import (
+     CanAsyncEvaluate,
+     CanEvaluate,
+     Dataset,
+     EvaluationResult,
+     Example,
+     Experiment,
+     ExperimentEvaluationRun,
+     ExperimentEvaluator,
+     ExperimentResult,
+     ExperimentRun,
+     ExperimentRunId,
+     JSONSerializable,
+     TestCase,
+ )
+ from phoenix.evals.executors import get_executor_on_sync_context
+ from phoenix.evals.models.rate_limiters import RateLimiter
+ from phoenix.evals.utils import get_tqdm_progress_bar_formatter
+ from phoenix.trace.attributes import flatten
+ from phoenix.utilities.json import jsonify
+
+ ExperimentTask: TypeAlias = Union[
+     Callable[[Example], JSONSerializable],
+     Callable[[Example], Coroutine[None, None, JSONSerializable]],
+ ]
+
+
+ def _get_base_url() -> str:
+     host = get_env_host()
+     if host == "0.0.0.0":
+         host = "127.0.0.1"
+     base_url = get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
+     return base_url if base_url.endswith("/") else base_url + "/"
+
+
+ def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
+     return f"{_get_base_url()}datasets/{dataset_id}/compare?experimentId={experiment_id}"
+
+
+ def _get_dataset_experiments_url(*, dataset_id: str) -> str:
+     return f"{_get_base_url()}datasets/{dataset_id}/experiments"
+
+
+ def _phoenix_client() -> httpx.Client:
+     headers = get_env_client_headers()
+     client = httpx.Client(base_url=_get_base_url(), headers=headers)
+     return client
+
+
+ def run_experiment(
+     dataset: Dataset,
+     task: ExperimentTask,
+     *,
+     experiment_name: Optional[str] = None,
+     experiment_description: Optional[str] = None,
+     experiment_metadata: Optional[Mapping[str, Any]] = None,
+     evaluators: Optional[Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]]] = None,
+     rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
+ ) -> Experiment:
+     # Add this to the params once supported in the UI
+     repetitions = 1
+     assert repetitions > 0, "Must run the experiment at least once."
+
+     client = _phoenix_client()
+
+     experiment_response = client.post(
+         f"/v1/datasets/{dataset.id}/experiments",
+         json={
+             "version-id": dataset.version_id,
+             "name": experiment_name,
+             "description": experiment_description,
+             "metadata": experiment_metadata,
+             "repetitions": repetitions,
+         },
+     )
+     experiment_response.raise_for_status()
+     exp_json = experiment_response.json()
+     experiment_id = exp_json["id"]
+     project_name = exp_json["project_name"]
+
+     resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+     tracer_provider = trace_sdk.TracerProvider(resource=resource)
+     tracer_provider.add_span_processor(
+         SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
+     )
+     tracer = tracer_provider.get_tracer(__name__)
+     root_span_name = f"Task: {task.__qualname__}"
+     root_span_kind = CHAIN.value
+
+     dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
+     experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
+     print(f"🧪 Experiment started: {experiment_compare_url}")
+
+     errors: Tuple[Optional[Type[BaseException]], ...]
+     if not hasattr(rate_limit_errors, "__iter__"):
+         errors = (rate_limit_errors,)
+     else:
+         rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
+         errors = rate_limit_errors
+
+     rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors]
+
+     def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 _output = task(example)
+                 if isinstance(_output, Awaitable):
+                     raise RuntimeError("Task is async but running in sync context")
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if result := ExperimentResult(result=output) if output is not None else None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         experiment_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment_id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             output=result,
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         return experiment_run
+
+     async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
+         example, repetition_number = test_case.example, test_case.repetition_number
+         output = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 _output = task(example)
+                 if isinstance(_output, Awaitable):
+                     output = await _output
+                 else:
+                     output = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+             span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
+             span.set_attribute(INPUT_MIME_TYPE, JSON.value)
+             if result := ExperimentResult(result=output) if output is not None else None:
+                 if isinstance(output, str):
+                     span.set_attribute(OUTPUT_VALUE, output)
+                 else:
+                     span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
+                     span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
+                 span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         assert isinstance(
+             output, (dict, list, str, int, float, bool, type(None))
+         ), "Output must be JSON serializable"
+         experiment_run = ExperimentRun(
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             experiment_id=experiment_id,
+             dataset_example_id=example.id,
+             repetition_number=repetition_number,
+             output=result,
+             error=repr(error) if error else None,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         return experiment_run
+
+     rate_limited_sync_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
+     )
+     rate_limited_async_run_experiment = functools.reduce(
+         lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
+     )
+
+     executor = get_executor_on_sync_context(
+         rate_limited_sync_run_experiment,
+         rate_limited_async_run_experiment,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+         tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
+     )
+
+     test_cases = [
+         TestCase(example=ex, repetition_number=rep)
+         for ex, rep in product(dataset.examples, range(1, repetitions + 1))
+     ]
+     experiment_payloads, _execution_details = executor.run(test_cases)
+     for payload in experiment_payloads:
+         if payload is not None:
+             resp = client.post(f"/v1/experiments/{experiment_id}/runs", json=jsonify(payload))
+             resp.raise_for_status()
+
+     experiment = Experiment(
+         id=experiment_id,
+         dataset_id=dataset.id,
+         dataset_version_id=dataset.version_id,
+         project_name=project_name,
+     )
+
+     print(f"✅ Task runs completed. View all experiments: {dataset_experiments_url}")
+
+     if evaluators is not None:
+         _evaluate_experiment(experiment, evaluators, dataset.examples, client)
+
+     return experiment
+
+
+ def evaluate_experiment(
+     experiment: Experiment,
+     evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
+ ) -> None:
+     client = _phoenix_client()
+     dataset_id = experiment.dataset_id
+     dataset_version_id = experiment.dataset_version_id
+
+     dataset_examples = [
+         Example.from_dict(ex)
+         for ex in (
+             client.get(
+                 f"/v1/datasets/{dataset_id}/examples",
+                 params={"version-id": str(dataset_version_id)},
+             )
+             .json()
+             .get("data", {})
+             .get("examples", [])
+         )
+     ]
+     _evaluate_experiment(experiment, evaluators, dataset_examples, client)
+
+
+ ExperimentEvaluatorName: TypeAlias = str
+
+
+ def _evaluate_experiment(
+     experiment: Experiment,
+     evaluators: Union[ExperimentEvaluator, Iterable[ExperimentEvaluator]],
+     dataset_examples: Iterable[Example],
+     client: httpx.Client,
+ ) -> None:
+     if isinstance(evaluators, (CanEvaluate, CanAsyncEvaluate)):
+         evaluators = [evaluators]
+
+     experiment_id = experiment.id
+
+     experiment_runs = [
+         ExperimentRun.from_dict(exp_run)
+         for exp_run in client.get(f"/v1/experiments/{experiment_id}/runs").json()
+     ]
+
+     # not all dataset examples have associated experiment runs, so we need to pair them up
+     example_run_pairs = []
+     examples_by_id = {example.id: example for example in dataset_examples}
+     for exp_run in experiment_runs:
+         example = examples_by_id.get(exp_run.dataset_example_id)
+         if example:
+             example_run_pairs.append((deepcopy(example), exp_run))
+     evaluation_inputs = [
+         (example, run, evaluator.name, evaluator)
+         for (example, run), evaluator in product(example_run_pairs, evaluators)
+     ]
+
+     project_name = "evaluators"
+     resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
+     tracer_provider = trace_sdk.TracerProvider(resource=resource)
+     tracer_provider.add_span_processor(
+         SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
+     )
+     tracer = tracer_provider.get_tracer(__name__)
+     root_span_kind = "EVALUATOR"
+
+     def sync_evaluate_run(
+         obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, name, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 if not isinstance(evaluator, CanEvaluate):
+                     raise RuntimeError("Task is async but running in sync context")
+                 _output = evaluator.evaluate(example, experiment_run)
+                 if isinstance(_output, Awaitable):
+                     raise RuntimeError("Task is async but running in sync context")
+                 result = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+             span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         evaluator_payload = ExperimentEvaluationRun(
+             experiment_run_id=cast(ExperimentRunId, experiment_run.id),
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.annotator_kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         return evaluator_payload
+
+     async def async_evaluate_run(
+         obj: Tuple[Example, ExperimentRun, ExperimentEvaluatorName, ExperimentEvaluator],
+     ) -> ExperimentEvaluationRun:
+         example, experiment_run, name, evaluator = obj
+         result: Optional[EvaluationResult] = None
+         error: Optional[BaseException] = None
+         status = Status(StatusCode.OK)
+         root_span_name = f"Evaluation: {name}"
+         with ExitStack() as stack:
+             span: Span = stack.enter_context(
+                 tracer.start_as_current_span(root_span_name, context=Context())
+             )
+             stack.enter_context(capture_spans(resource))
+             try:
+                 # Do not use keyword arguments, which can fail at runtime
+                 # even when function obeys protocol, because keyword arguments
+                 # are implementation details.
+                 if isinstance(evaluator, CanAsyncEvaluate):
+                     result = await evaluator.async_evaluate(example, experiment_run)
+                 else:
+                     _output = evaluator.evaluate(example, experiment_run)
+                     if isinstance(_output, Awaitable):
+                         result = await _output
+                     else:
+                         result = _output
+             except BaseException as exc:
+                 span.record_exception(exc)
+                 status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
+                 error = exc
+             span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
+             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
+             span.set_status(status)
+
+         evaluator_payload = ExperimentEvaluationRun(
+             experiment_run_id=cast(ExperimentRunId, experiment_run.id),
+             start_time=_decode_unix_nano(cast(int, span.start_time)),
+             end_time=_decode_unix_nano(cast(int, span.end_time)),
+             name=evaluator.name,
+             annotator_kind=evaluator.annotator_kind,
+             error=repr(error) if error else None,
+             result=result,
+             trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
+         )
+         return evaluator_payload
+
+     executor = get_executor_on_sync_context(
+         sync_evaluate_run,
+         async_evaluate_run,
+         max_retries=0,
+         exit_on_error=False,
+         fallback_return_value=None,
+     )
+     evaluation_payloads, _execution_details = executor.run(evaluation_inputs)
+     for payload in evaluation_payloads:
+         if payload is not None:
+             resp = client.post("/v1/experiment_evaluations", json=jsonify(payload))
+             resp.raise_for_status()
+
+
+ def _str_trace_id(id_: int) -> str:
+     return hexlify(id_.to_bytes(16, "big")).decode()
+
+
+ def _decode_unix_nano(time_unix_nano: int) -> datetime:
+     return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
+
+
+ INPUT_VALUE = SpanAttributes.INPUT_VALUE
+ OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
+ INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
+ OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
+ OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND
+
+ CHAIN = OpenInferenceSpanKindValues.CHAIN
+ JSON = OpenInferenceMimeTypeValues.JSON
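
For orientation, here is a minimal usage sketch of the new run_experiment entry point added above. It relies only on the signatures visible in this hunk; the dataset placeholder and the toy echo_task function are hypothetical, and obtaining a Dataset from a running Phoenix server is handled elsewhere (presumably via the updated phoenix/session/client.py, which is not reproduced in this diff):

from phoenix.datasets.experiments import run_experiment
from phoenix.datasets.types import Dataset, Example

# Hypothetical task: any callable (sync or async) that takes an Example and
# returns a JSON-serializable value satisfies ExperimentTask.
def echo_task(example: Example):
    return {"echo": example.input}

# Placeholder: assumed to be a Dataset fetched from a running Phoenix server,
# carrying id, version_id, and examples.
dataset: Dataset = ...

experiment = run_experiment(
    dataset,
    echo_task,
    experiment_name="echo-baseline",
    experiment_description="sanity check of the new experiments API",
)
# Evaluators can also be attached after the fact with
# evaluate_experiment(experiment, my_evaluator).
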
phoenix/datasets/tracing.py (new file)
@@ -0,0 +1,66 @@
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from contextvars import ContextVar
+ from threading import Lock
+ from typing import Any, Callable, Iterator, Optional
+
+ from opentelemetry.sdk.resources import Resource
+ from opentelemetry.sdk.trace import ReadableSpan
+ from opentelemetry.trace import INVALID_TRACE_ID
+ from wrapt import apply_patch, resolve_path, wrap_function_wrapper
+
+
+ class SpanModifier:
+     __slots__ = ("_resource",)
+
+     def __init__(self, resource: Resource) -> None:
+         self._resource = resource
+
+     def modify_resource(self, span: ReadableSpan) -> None:
+         if (ctx := span._context) is None or ctx.span_id == INVALID_TRACE_ID:
+             return
+         span._resource = span._resource.merge(self._resource)
+
+
+ _ACTIVE_MODIFIER: ContextVar[Optional[SpanModifier]] = ContextVar("active_modifier")
+
+
+ def override_span(init: Callable[..., None], span: ReadableSpan, args: Any, kwargs: Any) -> None:
+     init(*args, **kwargs)
+     if isinstance(span_modifier := _ACTIVE_MODIFIER.get(None), SpanModifier):
+         span_modifier.modify_resource(span)
+
+
+ _SPAN_INIT_MONKEY_PATCH_LOCK = Lock()
+ _SPAN_INIT_MONKEY_PATCH_COUNT = 0
+ _SPAN_INIT_MODULE = ReadableSpan.__init__.__module__
+ _SPAN_INIT_NAME = ReadableSpan.__init__.__qualname__
+ _SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL = resolve_path(
+     _SPAN_INIT_MODULE, _SPAN_INIT_NAME
+ )
+
+
+ @contextmanager
+ def _monkey_patch_span_init() -> Iterator[None]:
+     global _SPAN_INIT_MONKEY_PATCH_COUNT
+     with _SPAN_INIT_MONKEY_PATCH_LOCK:
+         _SPAN_INIT_MONKEY_PATCH_COUNT += 1
+         if _SPAN_INIT_MONKEY_PATCH_COUNT == 1:
+             wrap_function_wrapper(
+                 module=_SPAN_INIT_MODULE, name=_SPAN_INIT_NAME, wrapper=override_span
+             )
+     yield
+     with _SPAN_INIT_MONKEY_PATCH_LOCK:
+         _SPAN_INIT_MONKEY_PATCH_COUNT -= 1
+         if _SPAN_INIT_MONKEY_PATCH_COUNT == 0:
+             apply_patch(_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL)
+
+
+ @contextmanager
+ def capture_spans(resource: Resource) -> Iterator[SpanModifier]:
+     modifier = SpanModifier(resource)
+     with _monkey_patch_span_init():
+         token = _ACTIVE_MODIFIER.set(modifier)
+         yield modifier
+         _ACTIVE_MODIFIER.reset(token)
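
The capture_spans helper above is what experiments.py uses to stamp task and evaluator spans with a project-scoped Resource: while the context is active, ReadableSpan.__init__ is temporarily wrapped (reference-counted, so nested uses are safe) and every span constructed in the current execution context gets the supplied resource merged into its own. A minimal sketch of the same pattern, with an illustrative project name:

import opentelemetry.sdk.trace as trace_sdk
from openinference.semconv.resource import ResourceAttributes
from opentelemetry.sdk.resources import Resource

from phoenix.datasets.tracing import capture_spans

# The Resource we want stamped onto spans (the project name is illustrative).
project_resource = Resource({ResourceAttributes.PROJECT_NAME: "my-experiment-project"})

# A tracer provider that knows nothing about the project...
tracer = trace_sdk.TracerProvider().get_tracer(__name__)

with capture_spans(project_resource):
    # ...still yields spans carrying PROJECT_NAME, because the patched
    # ReadableSpan.__init__ merges project_resource into each span's resource.
    with tracer.start_as_current_span("task"):
        pass  # run the task under test here
# On exit the patch is removed and new spans are no longer modified.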