arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic. Click here for more details.

Files changed (118)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/METADATA +5 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/RECORD +56 -117
  3. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +27 -0
  5. phoenix/config.py +7 -21
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +62 -64
  8. phoenix/core/model_schema_adapter.py +25 -27
  9. phoenix/db/bulk_inserter.py +14 -54
  10. phoenix/db/insertion/evaluation.py +6 -6
  11. phoenix/db/insertion/helpers.py +2 -13
  12. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +28 -2
  13. phoenix/db/models.py +4 -236
  14. phoenix/inferences/fixtures.py +23 -23
  15. phoenix/inferences/inferences.py +7 -7
  16. phoenix/inferences/validation.py +1 -1
  17. phoenix/server/api/context.py +0 -18
  18. phoenix/server/api/dataloaders/__init__.py +0 -18
  19. phoenix/server/api/dataloaders/span_descendants.py +3 -2
  20. phoenix/server/api/routers/v1/__init__.py +2 -77
  21. phoenix/server/api/routers/v1/evaluations.py +2 -4
  22. phoenix/server/api/routers/v1/spans.py +1 -3
  23. phoenix/server/api/routers/v1/traces.py +4 -1
  24. phoenix/server/api/schema.py +303 -2
  25. phoenix/server/api/types/Cluster.py +19 -19
  26. phoenix/server/api/types/Dataset.py +63 -282
  27. phoenix/server/api/types/DatasetRole.py +23 -0
  28. phoenix/server/api/types/Dimension.py +29 -30
  29. phoenix/server/api/types/EmbeddingDimension.py +34 -40
  30. phoenix/server/api/types/Event.py +16 -16
  31. phoenix/server/api/{mutations/export_events_mutations.py → types/ExportEventsMutation.py} +14 -17
  32. phoenix/server/api/types/Model.py +42 -43
  33. phoenix/server/api/types/Project.py +12 -26
  34. phoenix/server/api/types/Span.py +2 -79
  35. phoenix/server/api/types/TimeSeries.py +6 -6
  36. phoenix/server/api/types/Trace.py +4 -15
  37. phoenix/server/api/types/UMAPPoints.py +1 -1
  38. phoenix/server/api/types/node.py +111 -5
  39. phoenix/server/api/types/pagination.py +52 -10
  40. phoenix/server/app.py +49 -101
  41. phoenix/server/main.py +27 -49
  42. phoenix/server/openapi/docs.py +0 -3
  43. phoenix/server/static/index.js +2595 -3523
  44. phoenix/server/templates/index.html +0 -1
  45. phoenix/services.py +15 -15
  46. phoenix/session/client.py +21 -438
  47. phoenix/session/session.py +37 -47
  48. phoenix/trace/exporter.py +9 -14
  49. phoenix/trace/fixtures.py +7 -133
  50. phoenix/trace/schemas.py +2 -1
  51. phoenix/trace/span_evaluations.py +3 -3
  52. phoenix/trace/trace_dataset.py +6 -6
  53. phoenix/version.py +1 -1
  54. phoenix/datasets/__init__.py +0 -0
  55. phoenix/datasets/evaluators/__init__.py +0 -18
  56. phoenix/datasets/evaluators/code_evaluators.py +0 -99
  57. phoenix/datasets/evaluators/llm_evaluators.py +0 -244
  58. phoenix/datasets/evaluators/utils.py +0 -292
  59. phoenix/datasets/experiments.py +0 -550
  60. phoenix/datasets/tracing.py +0 -85
  61. phoenix/datasets/types.py +0 -178
  62. phoenix/db/insertion/dataset.py +0 -237
  63. phoenix/db/migrations/types.py +0 -29
  64. phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -291
  65. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -100
  66. phoenix/server/api/dataloaders/dataset_example_spans.py +0 -43
  67. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -85
  68. phoenix/server/api/dataloaders/experiment_error_rates.py +0 -43
  69. phoenix/server/api/dataloaders/experiment_run_counts.py +0 -42
  70. phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -49
  71. phoenix/server/api/dataloaders/project_by_name.py +0 -31
  72. phoenix/server/api/dataloaders/span_projects.py +0 -33
  73. phoenix/server/api/dataloaders/trace_row_ids.py +0 -39
  74. phoenix/server/api/helpers/dataset_helpers.py +0 -179
  75. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -16
  76. phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -14
  77. phoenix/server/api/input_types/ClearProjectInput.py +0 -15
  78. phoenix/server/api/input_types/CreateDatasetInput.py +0 -12
  79. phoenix/server/api/input_types/DatasetExampleInput.py +0 -14
  80. phoenix/server/api/input_types/DatasetSort.py +0 -17
  81. phoenix/server/api/input_types/DatasetVersionSort.py +0 -16
  82. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -13
  83. phoenix/server/api/input_types/DeleteDatasetInput.py +0 -7
  84. phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -9
  85. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -35
  86. phoenix/server/api/input_types/PatchDatasetInput.py +0 -14
  87. phoenix/server/api/mutations/__init__.py +0 -13
  88. phoenix/server/api/mutations/auth.py +0 -11
  89. phoenix/server/api/mutations/dataset_mutations.py +0 -520
  90. phoenix/server/api/mutations/experiment_mutations.py +0 -65
  91. phoenix/server/api/mutations/project_mutations.py +0 -47
  92. phoenix/server/api/openapi/__init__.py +0 -0
  93. phoenix/server/api/openapi/main.py +0 -6
  94. phoenix/server/api/openapi/schema.py +0 -16
  95. phoenix/server/api/queries.py +0 -503
  96. phoenix/server/api/routers/v1/dataset_examples.py +0 -178
  97. phoenix/server/api/routers/v1/datasets.py +0 -965
  98. phoenix/server/api/routers/v1/experiment_evaluations.py +0 -66
  99. phoenix/server/api/routers/v1/experiment_runs.py +0 -108
  100. phoenix/server/api/routers/v1/experiments.py +0 -174
  101. phoenix/server/api/types/AnnotatorKind.py +0 -10
  102. phoenix/server/api/types/CreateDatasetPayload.py +0 -8
  103. phoenix/server/api/types/DatasetExample.py +0 -85
  104. phoenix/server/api/types/DatasetExampleRevision.py +0 -34
  105. phoenix/server/api/types/DatasetVersion.py +0 -14
  106. phoenix/server/api/types/ExampleRevisionInterface.py +0 -14
  107. phoenix/server/api/types/Experiment.py +0 -140
  108. phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -13
  109. phoenix/server/api/types/ExperimentComparison.py +0 -19
  110. phoenix/server/api/types/ExperimentRun.py +0 -91
  111. phoenix/server/api/types/ExperimentRunAnnotation.py +0 -57
  112. phoenix/server/api/types/Inferences.py +0 -80
  113. phoenix/server/api/types/InferencesRole.py +0 -23
  114. phoenix/utilities/json.py +0 -61
  115. phoenix/utilities/re.py +0 -50
  116. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/IP_NOTICE +0 -0
  117. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/LICENSE +0 -0
  118. /phoenix/server/api/{helpers/__init__.py → helpers.py} +0 -0
@@ -1,550 +0,0 @@
1
- import functools
2
- import json
3
- from binascii import hexlify
4
- from contextlib import ExitStack
5
- from copy import deepcopy
6
- from datetime import datetime, timezone
7
- from itertools import product
8
- from typing import (
9
- Any,
10
- Awaitable,
11
- Dict,
12
- Iterable,
13
- Mapping,
14
- Optional,
15
- Sequence,
16
- Tuple,
17
- Type,
18
- Union,
19
- cast,
20
- )
21
- from urllib.parse import urljoin
22
-
23
- import httpx
24
- import opentelemetry.sdk.trace as trace_sdk
25
- from openinference.semconv.resource import ResourceAttributes
26
- from openinference.semconv.trace import (
27
- OpenInferenceMimeTypeValues,
28
- OpenInferenceSpanKindValues,
29
- SpanAttributes,
30
- )
31
- from opentelemetry.context import Context
32
- from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
33
- from opentelemetry.sdk.resources import Resource
34
- from opentelemetry.sdk.trace import Span
35
- from opentelemetry.sdk.trace.export import SimpleSpanProcessor
36
- from opentelemetry.trace import Status, StatusCode
37
- from typing_extensions import TypeAlias
38
-
39
- from phoenix.config import (
40
- get_env_client_headers,
41
- get_env_collector_endpoint,
42
- get_env_host,
43
- get_env_port,
44
- )
45
- from phoenix.datasets.evaluators.utils import (
46
- Evaluator,
47
- EvaluatorName,
48
- ExperimentEvaluator,
49
- create_evaluator,
50
- )
51
- from phoenix.datasets.tracing import capture_spans
52
- from phoenix.datasets.types import (
53
- Dataset,
54
- EvaluationResult,
55
- Example,
56
- Experiment,
57
- ExperimentEvaluationRun,
58
- ExperimentResult,
59
- ExperimentRun,
60
- ExperimentRunId,
61
- ExperimentTask,
62
- TestCase,
63
- )
64
- from phoenix.evals.executors import get_executor_on_sync_context
65
- from phoenix.evals.models.rate_limiters import RateLimiter
66
- from phoenix.evals.utils import get_tqdm_progress_bar_formatter
67
- from phoenix.session.session import active_session
68
- from phoenix.trace.attributes import flatten
69
- from phoenix.utilities.json import jsonify
70
-
71
-
72
def _get_base_url() -> str:
    """Return the Phoenix collector base URL, always ending with a slash."""
    host = get_env_host()
    # A server bound to 0.0.0.0 is reached locally via the loopback address.
    if host == "0.0.0.0":
        host = "127.0.0.1"
    url = get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
    if not url.endswith("/"):
        url += "/"
    return url
78
-
79
-
80
def _get_web_base_url() -> str:
    """Return the web UI base URL.

    Prefers the URL of an active in-process session; otherwise falls back
    to the collector base URL.

    Returns:
        str: the web UI base URL
    """
    session = active_session()
    if session:
        return session.url
    return _get_base_url()
89
-
90
-
91
def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
    """Build the compare-view URL for one experiment of a dataset."""
    base = _get_web_base_url()
    return f"{base}datasets/{dataset_id}/compare?experimentId={experiment_id}"
93
-
94
-
95
def _get_dataset_experiments_url(*, dataset_id: str) -> str:
    """Build the experiments-listing URL for a dataset."""
    base = _get_web_base_url()
    return f"{base}datasets/{dataset_id}/experiments"
97
-
98
-
99
def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
    """Create a (sync, async) pair of httpx clients targeting the collector."""
    base_url = _get_base_url()
    headers = get_env_client_headers()
    sync_client = httpx.Client(base_url=base_url, headers=headers)
    async_client = httpx.AsyncClient(base_url=base_url, headers=headers)
    return sync_client, async_client
108
-
109
-
110
# Anything accepted by an `evaluators` argument: a single evaluator, a
# sequence of evaluators, or a mapping of evaluator name -> evaluator.
Evaluators: TypeAlias = Union[
    ExperimentEvaluator,
    Sequence[ExperimentEvaluator],
    Mapping[EvaluatorName, ExperimentEvaluator],
]
115
-
116
-
117
def run_experiment(
    dataset: Dataset,
    task: ExperimentTask,
    *,
    experiment_name: Optional[str] = None,
    experiment_description: Optional[str] = None,
    experiment_metadata: Optional[Mapping[str, Any]] = None,
    evaluators: Optional[Evaluators] = None,
    rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
) -> Experiment:
    """Run *task* on every example of *dataset* and record a new experiment.

    Creates the experiment via the Phoenix REST API, executes the task once
    per example (sync or async, optionally rate limited), traces each run as
    an OpenInference CHAIN span, posts each run to
    ``/v1/experiments/{id}/runs``, and — when *evaluators* are given —
    evaluates the completed runs afterwards.

    Args:
        dataset: dataset whose examples are fed to *task*.
        task: callable taking an ``Example``; may return an awaitable when
            run in an async context.
        experiment_name: optional display name for the experiment.
        experiment_description: optional description.
        experiment_metadata: optional metadata stored with the experiment.
        evaluators: evaluator, sequence of evaluators, or mapping of
            name -> evaluator applied to the finished task runs.
        rate_limit_errors: exception type(s) that should trigger client-side
            rate limiting (each type gets its own ``RateLimiter``).

    Returns:
        Experiment: handle for the newly created experiment.
    """
    # Add this to the params once supported in the UI
    repetitions = 1
    assert repetitions > 0, "Must run the experiment at least once."
    evaluators_by_name = _evaluators_by_name(evaluators)

    sync_client, async_client = _phoenix_clients()

    # Create the experiment record on the server first; its id and project
    # name drive everything below.
    experiment_response = sync_client.post(
        f"/v1/datasets/{dataset.id}/experiments",
        json={
            "version-id": dataset.version_id,
            "name": experiment_name,
            "description": experiment_description,
            "metadata": experiment_metadata,
            "repetitions": repetitions,
        },
    )
    experiment_response.raise_for_status()
    exp_json = experiment_response.json()
    experiment_id = exp_json["id"]
    project_name = exp_json["project_name"]

    # Task spans are exported back to the server under the experiment's project.
    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_name = f"Task: {_get_task_name(task)}"
    root_span_kind = CHAIN

    dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
    experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
    print("🧪 Experiment started.")
    print(f"📺 View dataset experiments: {dataset_experiments_url}")
    print(f"🔗 View this experiment: {experiment_compare_url}")

    # Normalize rate_limit_errors to a tuple; a single value (or None) is
    # wrapped so each entry gets its own RateLimiter below.
    errors: Tuple[Optional[Type[BaseException]], ...]
    if not hasattr(rate_limit_errors, "__iter__"):
        errors = (rate_limit_errors,)
    else:
        rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
        errors = rate_limit_errors

    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors]

    def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
        # Run the task synchronously for one (example, repetition) pair,
        # tracing it and posting the resulting run to the server.
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    raise RuntimeError("Task is async but running in sync context")
                else:
                    output = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            # Record output attributes only when the task produced a result.
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = sync_client.post(
            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
        )
        resp.raise_for_status()
        return experiment_run

    async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
        # Async twin of sync_run_experiment; awaits the task output if needed.
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    output = await _output
                else:
                    output = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            # Record output attributes only when the task produced a result.
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = await async_client.post(
            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
        )
        resp.raise_for_status()
        return experiment_run

    # Stack every rate limiter around both runners.
    rate_limited_sync_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
    )
    rate_limited_async_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
    )

    executor = get_executor_on_sync_context(
        rate_limited_sync_run_experiment,
        rate_limited_async_run_experiment,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
    )

    # One test case per (example, repetition) combination.
    test_cases = [
        TestCase(example=ex, repetition_number=rep)
        for ex, rep in product(dataset.examples, range(1, repetitions + 1))
    ]
    _, _execution_details = executor.run(test_cases)
    experiment = Experiment(
        id=experiment_id,
        dataset_id=dataset.id,
        dataset_version_id=dataset.version_id,
        project_name=project_name,
    )

    print("✅ Task runs completed.")

    if evaluators_by_name:
        _evaluate_experiment(
            experiment,
            evaluators=evaluators_by_name,
            dataset_examples=dataset.examples,
            clients=(sync_client, async_client),
        )

    return experiment
320
-
321
-
322
def evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[
        ExperimentEvaluator,
        Sequence[ExperimentEvaluator],
        Mapping[EvaluatorName, ExperimentEvaluator],
    ],
) -> None:
    """Fetch the experiment's dataset examples and run *evaluators* on its runs."""
    sync_client, async_client = _phoenix_clients()
    response = sync_client.get(
        f"/v1/datasets/{experiment.dataset_id}/examples",
        params={"version-id": str(experiment.dataset_version_id)},
    )
    payload = response.json().get("data", {})
    dataset_examples = [Example.from_dict(raw) for raw in payload.get("examples", [])]
    _evaluate_experiment(
        experiment,
        evaluators=evaluators,
        dataset_examples=dataset_examples,
        clients=(sync_client, async_client),
    )
352
-
353
-
354
def _evaluate_experiment(
    experiment: Experiment,
    *,
    evaluators: Evaluators,
    dataset_examples: Iterable[Example],
    clients: Tuple[httpx.Client, httpx.AsyncClient],
) -> None:
    """Run every evaluator against every (example, run) pair of *experiment*.

    Fetches the experiment's runs from the server, pairs them with the given
    dataset examples, traces each evaluation as an EVALUATOR span under the
    "evaluators" project, and posts each result to
    ``/v1/experiment_evaluations``.

    Raises:
        ValueError: if *evaluators* normalizes to an empty mapping.
    """
    evaluators_by_name = _evaluators_by_name(evaluators)
    if not evaluators_by_name:
        raise ValueError("Must specify at least one Evaluator")
    experiment_id = experiment.id
    sync_client, async_client = clients
    experiment_runs = [
        ExperimentRun.from_dict(exp_run)
        for exp_run in sync_client.get(f"/v1/experiments/{experiment_id}/runs").json()
    ]

    # not all dataset examples have associated experiment runs, so we need to pair them up
    example_run_pairs = []
    examples_by_id = {example.id: example for example in dataset_examples}
    for exp_run in experiment_runs:
        example = examples_by_id.get(exp_run.dataset_example_id)
        if example:
            example_run_pairs.append((deepcopy(example), exp_run))
    # Cross product: every evaluator sees every paired (example, run).
    evaluation_input = [
        (example, run, evaluator)
        for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
    ]

    # Evaluator spans are grouped under a dedicated "evaluators" project.
    project_name = "evaluators"
    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_kind = EVALUATOR

    def sync_evaluate_run(
        obj: Tuple[Example, ExperimentRun, Evaluator],
    ) -> ExperimentEvaluationRun:
        # Evaluate one (example, run, evaluator) triple synchronously,
        # tracing it and posting the evaluation to the server.
        example, experiment_run, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {evaluator.name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                result = evaluator.evaluate(
                    output=None if experiment_run.output is None else experiment_run.output.result,
                    expected=example.output,
                    input=example.input,
                    metadata=example.metadata,
                )
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            if result:
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(evaluator_payload))
        resp.raise_for_status()
        return evaluator_payload

    async def async_evaluate_run(
        obj: Tuple[Example, ExperimentRun, Evaluator],
    ) -> ExperimentEvaluationRun:
        # Async twin of sync_evaluate_run.
        example, experiment_run, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {evaluator.name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                result = await evaluator.async_evaluate(
                    output=None if experiment_run.output is None else experiment_run.output.result,
                    expected=example.output,
                    input=example.input,
                    metadata=example.metadata,
                )
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            if result:
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = await async_client.post(
            "/v1/experiment_evaluations", json=jsonify(evaluator_payload)
        )
        resp.raise_for_status()
        return evaluator_payload

    executor = get_executor_on_sync_context(
        sync_evaluate_run,
        async_evaluate_run,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
    )
    print("🧠 Evaluation started.")
    _, _execution_details = executor.run(evaluation_input)
490
-
491
-
492
def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
    """Normalize *obj* (a single evaluator, a sequence, a mapping, or None)
    into a name -> Evaluator mapping.

    Non-``Evaluator`` values are wrapped via ``create_evaluator``; for a
    mapping input, the mapping key seeds the evaluator's name.

    Raises:
        ValueError: if two evaluators end up with the same name.
    """
    evaluators_by_name: Dict[EvaluatorName, Evaluator] = {}

    def _register(evaluator: Evaluator) -> None:
        # Shared registration step: reject duplicate names so one evaluator's
        # results can't silently overwrite another's.
        name = evaluator.name
        if name in evaluators_by_name:
            raise ValueError(f"Two evaluators have the same name: {name}")
        evaluators_by_name[name] = evaluator

    if obj is None:
        return evaluators_by_name
    if isinstance(obj, Mapping):
        for name, value in obj.items():
            _register(
                create_evaluator(name=name)(value) if not isinstance(value, Evaluator) else value
            )
    elif isinstance(obj, Sequence):
        for value in obj:
            _register(create_evaluator()(value) if not isinstance(value, Evaluator) else value)
    else:
        # A single bare evaluator (or evaluator-like callable).
        _register(create_evaluator()(obj) if not isinstance(obj, Evaluator) else obj)
    return evaluators_by_name
520
-
521
-
522
- def _str_trace_id(id_: int) -> str:
523
- return hexlify(id_.to_bytes(16, "big")).decode()
524
-
525
-
526
- def _decode_unix_nano(time_unix_nano: int) -> datetime:
527
- return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
528
-
529
-
530
def _get_task_name(task: ExperimentTask) -> str:
    """
    Makes a best-effort attempt to get the name of the task.
    """
    # functools.partial hides the wrapped function; report that instead.
    if isinstance(task, functools.partial):
        return task.func.__qualname__
    if hasattr(task, "__qualname__"):
        return task.__qualname__
    # Fall back to the object's string representation.
    return str(task)
540
-
541
-
542
# Convenience aliases for the OpenInference span attribute keys used above.
INPUT_VALUE = SpanAttributes.INPUT_VALUE
OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND

# Span kinds and MIME type used when recording task and evaluator spans.
CHAIN = OpenInferenceSpanKindValues.CHAIN.value
EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
JSON = OpenInferenceMimeTypeValues.JSON
@@ -1,85 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from contextlib import contextmanager
4
- from contextvars import ContextVar
5
- from threading import Lock
6
- from typing import Any, Callable, Iterator, Optional
7
-
8
- from opentelemetry.sdk.resources import Resource
9
- from opentelemetry.sdk.trace import ReadableSpan
10
- from opentelemetry.trace import INVALID_TRACE_ID
11
- from wrapt import apply_patch, resolve_path, wrap_function_wrapper
12
-
13
-
14
class SpanModifier:
    """
    A class that modifies spans with the specified resource attributes.
    """

    __slots__ = ("_resource",)

    def __init__(self, resource: Resource) -> None:
        # Resource attributes to merge into every captured span.
        self._resource = resource

    def modify_resource(self, span: ReadableSpan) -> None:
        """
        Takes a span and merges in the resource attributes specified in the constructor.

        Args:
            span: ReadableSpan: the span to modify
        """
        # Fix: the original compared ctx.span_id against INVALID_TRACE_ID.
        # Both invalid constants are 0, so behavior is preserved, but the
        # trace id is the field that INVALID_TRACE_ID is meant to check.
        if (ctx := span._context) is None or ctx.trace_id == INVALID_TRACE_ID:
            return
        span._resource = span._resource.merge(self._resource)
34
-
35
-
36
# Context-local handle to the SpanModifier currently in effect (if any);
# read by override_span each time a ReadableSpan is initialized.
_ACTIVE_MODIFIER: ContextVar[Optional[SpanModifier]] = ContextVar("active_modifier")
37
-
38
-
39
def override_span(init: Callable[..., None], span: ReadableSpan, args: Any, kwargs: Any) -> None:
    """wrapt-style wrapper around ``ReadableSpan.__init__``: after the
    original initializer runs, apply the modifier active in the current
    context, if any."""
    init(*args, **kwargs)
    modifier = _ACTIVE_MODIFIER.get(None)
    if isinstance(modifier, SpanModifier):
        modifier.modify_resource(span)
43
-
44
-
45
# Reference-counted monkey-patch bookkeeping: the lock and counter let
# nested/concurrent capture_spans contexts share a single patch of
# ReadableSpan.__init__; resolve_path records where the original attribute
# lives so it can be restored when the last context exits.
_SPAN_INIT_MONKEY_PATCH_LOCK = Lock()
_SPAN_INIT_MONKEY_PATCH_COUNT = 0
_SPAN_INIT_MODULE = ReadableSpan.__init__.__module__
_SPAN_INIT_NAME = ReadableSpan.__init__.__qualname__
_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL = resolve_path(
    _SPAN_INIT_MODULE, _SPAN_INIT_NAME
)
52
-
53
-
54
@contextmanager
def _monkey_patch_span_init() -> Iterator[None]:
    """Reference-counted monkey patch of ``ReadableSpan.__init__``.

    The first entering context installs the ``override_span`` wrapper; the
    last exiting context restores the original initializer.

    Fix: the original did not use try/finally, so an exception raised inside
    the context body left the counter incremented and the patch installed
    forever. The finally block guarantees the patch is unwound.
    """
    global _SPAN_INIT_MONKEY_PATCH_COUNT
    with _SPAN_INIT_MONKEY_PATCH_LOCK:
        _SPAN_INIT_MONKEY_PATCH_COUNT += 1
        if _SPAN_INIT_MONKEY_PATCH_COUNT == 1:
            wrap_function_wrapper(
                module=_SPAN_INIT_MODULE, name=_SPAN_INIT_NAME, wrapper=override_span
            )
    try:
        yield
    finally:
        with _SPAN_INIT_MONKEY_PATCH_LOCK:
            _SPAN_INIT_MONKEY_PATCH_COUNT -= 1
            if _SPAN_INIT_MONKEY_PATCH_COUNT == 0:
                apply_patch(_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL)
68
-
69
-
70
@contextmanager
def capture_spans(resource: Resource) -> Iterator[SpanModifier]:
    """
    A context manager that captures spans and modifies them with the specified resources.

    Args:
        resource: Resource: The resource to merge into the spans created within the context.

    Returns:
        modifier: Iterator[SpanModifier]: The span modifier that is active within the context.
    """
    modifier = SpanModifier(resource)
    with _monkey_patch_span_init():
        token = _ACTIVE_MODIFIER.set(modifier)
        # Fix: the original reset the context variable outside a finally
        # block, so an exception in the body left the modifier active and
        # mutating unrelated spans. Reset unconditionally.
        try:
            yield modifier
        finally:
            _ACTIVE_MODIFIER.reset(token)