arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arize-phoenix might be problematic. Click here for more details.
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/METADATA +5 -5
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/RECORD +56 -117
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/WHEEL +1 -1
- phoenix/__init__.py +27 -0
- phoenix/config.py +7 -21
- phoenix/core/model.py +25 -25
- phoenix/core/model_schema.py +62 -64
- phoenix/core/model_schema_adapter.py +25 -27
- phoenix/db/bulk_inserter.py +14 -54
- phoenix/db/insertion/evaluation.py +6 -6
- phoenix/db/insertion/helpers.py +2 -13
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +28 -2
- phoenix/db/models.py +4 -236
- phoenix/inferences/fixtures.py +23 -23
- phoenix/inferences/inferences.py +7 -7
- phoenix/inferences/validation.py +1 -1
- phoenix/server/api/context.py +0 -18
- phoenix/server/api/dataloaders/__init__.py +0 -18
- phoenix/server/api/dataloaders/span_descendants.py +3 -2
- phoenix/server/api/routers/v1/__init__.py +2 -77
- phoenix/server/api/routers/v1/evaluations.py +2 -4
- phoenix/server/api/routers/v1/spans.py +1 -3
- phoenix/server/api/routers/v1/traces.py +4 -1
- phoenix/server/api/schema.py +303 -2
- phoenix/server/api/types/Cluster.py +19 -19
- phoenix/server/api/types/Dataset.py +63 -282
- phoenix/server/api/types/DatasetRole.py +23 -0
- phoenix/server/api/types/Dimension.py +29 -30
- phoenix/server/api/types/EmbeddingDimension.py +34 -40
- phoenix/server/api/types/Event.py +16 -16
- phoenix/server/api/{mutations/export_events_mutations.py → types/ExportEventsMutation.py} +14 -17
- phoenix/server/api/types/Model.py +42 -43
- phoenix/server/api/types/Project.py +12 -26
- phoenix/server/api/types/Span.py +2 -79
- phoenix/server/api/types/TimeSeries.py +6 -6
- phoenix/server/api/types/Trace.py +4 -15
- phoenix/server/api/types/UMAPPoints.py +1 -1
- phoenix/server/api/types/node.py +111 -5
- phoenix/server/api/types/pagination.py +52 -10
- phoenix/server/app.py +49 -101
- phoenix/server/main.py +27 -49
- phoenix/server/openapi/docs.py +0 -3
- phoenix/server/static/index.js +2595 -3523
- phoenix/server/templates/index.html +0 -1
- phoenix/services.py +15 -15
- phoenix/session/client.py +21 -438
- phoenix/session/session.py +37 -47
- phoenix/trace/exporter.py +9 -14
- phoenix/trace/fixtures.py +7 -133
- phoenix/trace/schemas.py +2 -1
- phoenix/trace/span_evaluations.py +3 -3
- phoenix/trace/trace_dataset.py +6 -6
- phoenix/version.py +1 -1
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators/__init__.py +0 -18
- phoenix/datasets/evaluators/code_evaluators.py +0 -99
- phoenix/datasets/evaluators/llm_evaluators.py +0 -244
- phoenix/datasets/evaluators/utils.py +0 -292
- phoenix/datasets/experiments.py +0 -550
- phoenix/datasets/tracing.py +0 -85
- phoenix/datasets/types.py +0 -178
- phoenix/db/insertion/dataset.py +0 -237
- phoenix/db/migrations/types.py +0 -29
- phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -291
- phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -100
- phoenix/server/api/dataloaders/dataset_example_spans.py +0 -43
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -85
- phoenix/server/api/dataloaders/experiment_error_rates.py +0 -43
- phoenix/server/api/dataloaders/experiment_run_counts.py +0 -42
- phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -49
- phoenix/server/api/dataloaders/project_by_name.py +0 -31
- phoenix/server/api/dataloaders/span_projects.py +0 -33
- phoenix/server/api/dataloaders/trace_row_ids.py +0 -39
- phoenix/server/api/helpers/dataset_helpers.py +0 -179
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -16
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -14
- phoenix/server/api/input_types/ClearProjectInput.py +0 -15
- phoenix/server/api/input_types/CreateDatasetInput.py +0 -12
- phoenix/server/api/input_types/DatasetExampleInput.py +0 -14
- phoenix/server/api/input_types/DatasetSort.py +0 -17
- phoenix/server/api/input_types/DatasetVersionSort.py +0 -16
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -13
- phoenix/server/api/input_types/DeleteDatasetInput.py +0 -7
- phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -9
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -35
- phoenix/server/api/input_types/PatchDatasetInput.py +0 -14
- phoenix/server/api/mutations/__init__.py +0 -13
- phoenix/server/api/mutations/auth.py +0 -11
- phoenix/server/api/mutations/dataset_mutations.py +0 -520
- phoenix/server/api/mutations/experiment_mutations.py +0 -65
- phoenix/server/api/mutations/project_mutations.py +0 -47
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +0 -6
- phoenix/server/api/openapi/schema.py +0 -16
- phoenix/server/api/queries.py +0 -503
- phoenix/server/api/routers/v1/dataset_examples.py +0 -178
- phoenix/server/api/routers/v1/datasets.py +0 -965
- phoenix/server/api/routers/v1/experiment_evaluations.py +0 -66
- phoenix/server/api/routers/v1/experiment_runs.py +0 -108
- phoenix/server/api/routers/v1/experiments.py +0 -174
- phoenix/server/api/types/AnnotatorKind.py +0 -10
- phoenix/server/api/types/CreateDatasetPayload.py +0 -8
- phoenix/server/api/types/DatasetExample.py +0 -85
- phoenix/server/api/types/DatasetExampleRevision.py +0 -34
- phoenix/server/api/types/DatasetVersion.py +0 -14
- phoenix/server/api/types/ExampleRevisionInterface.py +0 -14
- phoenix/server/api/types/Experiment.py +0 -140
- phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -13
- phoenix/server/api/types/ExperimentComparison.py +0 -19
- phoenix/server/api/types/ExperimentRun.py +0 -91
- phoenix/server/api/types/ExperimentRunAnnotation.py +0 -57
- phoenix/server/api/types/Inferences.py +0 -80
- phoenix/server/api/types/InferencesRole.py +0 -23
- phoenix/utilities/json.py +0 -61
- phoenix/utilities/re.py +0 -50
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/server/api/{helpers/__init__.py → helpers.py} +0 -0
phoenix/datasets/experiments.py
DELETED
|
@@ -1,550 +0,0 @@
|
|
|
1
|
-
import functools
|
|
2
|
-
import json
|
|
3
|
-
from binascii import hexlify
|
|
4
|
-
from contextlib import ExitStack
|
|
5
|
-
from copy import deepcopy
|
|
6
|
-
from datetime import datetime, timezone
|
|
7
|
-
from itertools import product
|
|
8
|
-
from typing import (
|
|
9
|
-
Any,
|
|
10
|
-
Awaitable,
|
|
11
|
-
Dict,
|
|
12
|
-
Iterable,
|
|
13
|
-
Mapping,
|
|
14
|
-
Optional,
|
|
15
|
-
Sequence,
|
|
16
|
-
Tuple,
|
|
17
|
-
Type,
|
|
18
|
-
Union,
|
|
19
|
-
cast,
|
|
20
|
-
)
|
|
21
|
-
from urllib.parse import urljoin
|
|
22
|
-
|
|
23
|
-
import httpx
|
|
24
|
-
import opentelemetry.sdk.trace as trace_sdk
|
|
25
|
-
from openinference.semconv.resource import ResourceAttributes
|
|
26
|
-
from openinference.semconv.trace import (
|
|
27
|
-
OpenInferenceMimeTypeValues,
|
|
28
|
-
OpenInferenceSpanKindValues,
|
|
29
|
-
SpanAttributes,
|
|
30
|
-
)
|
|
31
|
-
from opentelemetry.context import Context
|
|
32
|
-
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
33
|
-
from opentelemetry.sdk.resources import Resource
|
|
34
|
-
from opentelemetry.sdk.trace import Span
|
|
35
|
-
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
|
|
36
|
-
from opentelemetry.trace import Status, StatusCode
|
|
37
|
-
from typing_extensions import TypeAlias
|
|
38
|
-
|
|
39
|
-
from phoenix.config import (
|
|
40
|
-
get_env_client_headers,
|
|
41
|
-
get_env_collector_endpoint,
|
|
42
|
-
get_env_host,
|
|
43
|
-
get_env_port,
|
|
44
|
-
)
|
|
45
|
-
from phoenix.datasets.evaluators.utils import (
|
|
46
|
-
Evaluator,
|
|
47
|
-
EvaluatorName,
|
|
48
|
-
ExperimentEvaluator,
|
|
49
|
-
create_evaluator,
|
|
50
|
-
)
|
|
51
|
-
from phoenix.datasets.tracing import capture_spans
|
|
52
|
-
from phoenix.datasets.types import (
|
|
53
|
-
Dataset,
|
|
54
|
-
EvaluationResult,
|
|
55
|
-
Example,
|
|
56
|
-
Experiment,
|
|
57
|
-
ExperimentEvaluationRun,
|
|
58
|
-
ExperimentResult,
|
|
59
|
-
ExperimentRun,
|
|
60
|
-
ExperimentRunId,
|
|
61
|
-
ExperimentTask,
|
|
62
|
-
TestCase,
|
|
63
|
-
)
|
|
64
|
-
from phoenix.evals.executors import get_executor_on_sync_context
|
|
65
|
-
from phoenix.evals.models.rate_limiters import RateLimiter
|
|
66
|
-
from phoenix.evals.utils import get_tqdm_progress_bar_formatter
|
|
67
|
-
from phoenix.session.session import active_session
|
|
68
|
-
from phoenix.trace.attributes import flatten
|
|
69
|
-
from phoenix.utilities.json import jsonify
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _get_base_url() -> str:
    """Resolve the Phoenix server base URL, always terminated by a slash.

    The collector endpoint from the environment is used when it is set.
    Otherwise the URL is assembled from the configured host and port, with
    the host ``0.0.0.0`` replaced by ``127.0.0.1``.

    Returns:
        str: the base URL, guaranteed to end with ``/``
    """
    configured_host = get_env_host()
    reachable_host = "127.0.0.1" if configured_host == "0.0.0.0" else configured_host
    endpoint = get_env_collector_endpoint()
    if not endpoint:
        # The port is only looked up when no collector endpoint is configured.
        endpoint = f"http://{reachable_host}:{get_env_port()}"
    if endpoint.endswith("/"):
        return endpoint
    return endpoint + "/"
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def _get_web_base_url() -> str:
    """Pick the base URL under which the web UI is reachable.

    Returns:
        str: the ``url`` of the active session when ``active_session()``
            returns a truthy value, otherwise the result of ``_get_base_url()``
    """
    current_session = active_session()
    return current_session.url if current_session else _get_base_url()
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def _get_experiment_url(*, dataset_id: str, experiment_id: str) -> str:
    """Build the web UI link to the compare view of one experiment.

    Args:
        dataset_id: str: id of the dataset the experiment belongs to
        experiment_id: str: id of the experiment to open

    Returns:
        str: the compare-view URL rooted at the web UI base URL
    """
    web_root = _get_web_base_url()
    return f"{web_root}datasets/{dataset_id}/compare?experimentId={experiment_id}"
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
def _get_dataset_experiments_url(*, dataset_id: str) -> str:
    """Build the web UI link to the experiments listing of a dataset.

    Args:
        dataset_id: str: id of the dataset whose experiments should be listed

    Returns:
        str: the experiments URL rooted at the web UI base URL
    """
    web_root = _get_web_base_url()
    return f"{web_root}datasets/{dataset_id}/experiments"
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def _phoenix_clients() -> Tuple[httpx.Client, httpx.AsyncClient]:
    """Create one sync and one async HTTP client for the Phoenix server.

    Both clients use ``_get_base_url()`` as their base URL and share the
    headers returned by ``get_env_client_headers()``.

    Returns:
        Tuple[httpx.Client, httpx.AsyncClient]: the sync client followed by
            the async client
    """
    shared_headers = get_env_client_headers()
    blocking_client = httpx.Client(base_url=_get_base_url(), headers=shared_headers)
    awaitable_client = httpx.AsyncClient(base_url=_get_base_url(), headers=shared_headers)
    return blocking_client, awaitable_client
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
# Accepted shapes for an ``evaluators`` argument: a single evaluator, a sequence
# of evaluators, or a mapping from evaluator name to evaluator.
Evaluators: TypeAlias = Union[
    ExperimentEvaluator,
    Sequence[ExperimentEvaluator],
    Mapping[EvaluatorName, ExperimentEvaluator],
]
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def run_experiment(
    dataset: Dataset,
    task: ExperimentTask,
    *,
    experiment_name: Optional[str] = None,
    experiment_description: Optional[str] = None,
    experiment_metadata: Optional[Mapping[str, Any]] = None,
    evaluators: Optional[Evaluators] = None,
    rate_limit_errors: Optional[Union[Type[BaseException], Tuple[Type[BaseException], ...]]] = None,
) -> Experiment:
    """Run ``task`` on every example of ``dataset`` and record it as an experiment.

    The experiment is registered by POSTing to
    ``/v1/datasets/{dataset.id}/experiments``. Each example is then passed to
    ``task`` inside a traced span, and the resulting run is POSTed to
    ``/v1/experiments/{experiment_id}/runs``. When evaluators are supplied,
    they are applied afterwards through ``_evaluate_experiment``.

    NOTE(review): the nesting below was reconstructed from a flattened listing;
    confirm block boundaries against the original file.

    Args:
        dataset: Dataset: provides ``id``, ``version_id`` and ``examples``
        task: ExperimentTask: callable invoked positionally with one example;
            may return an awaitable, which is only supported on the async path
        experiment_name: Optional[str]: sent as ``name`` when creating the experiment
        experiment_description: Optional[str]: sent as ``description``
        experiment_metadata: Optional[Mapping[str, Any]]: sent as ``metadata``
        evaluators: Optional[Evaluators]: normalized with ``_evaluators_by_name``;
            evaluation is skipped when the result is empty
        rate_limit_errors: a single exception type or a tuple of them; one
            ``RateLimiter`` is built per entry

    Returns:
        Experiment: built from the server-assigned id, the dataset id and
            version id, and the project name returned by the server

    Raises:
        httpx.HTTPStatusError: from ``raise_for_status()`` when the
            experiment-creation request returns an error status
    """
    # Add this to the params once supported in the UI
    repetitions = 1
    assert repetitions > 0, "Must run the experiment at least once."
    evaluators_by_name = _evaluators_by_name(evaluators)

    sync_client, async_client = _phoenix_clients()

    # Register the experiment with the server; the response supplies its id and project name.
    experiment_response = sync_client.post(
        f"/v1/datasets/{dataset.id}/experiments",
        json={
            "version-id": dataset.version_id,
            "name": experiment_name,
            "description": experiment_description,
            "metadata": experiment_metadata,
            "repetitions": repetitions,
        },
    )
    experiment_response.raise_for_status()
    exp_json = experiment_response.json()
    experiment_id = exp_json["id"]
    project_name = exp_json["project_name"]

    # Dedicated tracer provider: spans are tagged with the experiment's project name
    # (when the server returned one) and exported to the server's v1/traces endpoint.
    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_name = f"Task: {_get_task_name(task)}"
    root_span_kind = CHAIN

    dataset_experiments_url = _get_dataset_experiments_url(dataset_id=dataset.id)
    experiment_compare_url = _get_experiment_url(dataset_id=dataset.id, experiment_id=experiment_id)
    print("🧪 Experiment started.")
    print(f"📺 View dataset experiments: {dataset_experiments_url}")
    print(f"🔗 View this experiment: {experiment_compare_url}")

    # Normalize ``rate_limit_errors`` to a tuple. Anything without ``__iter__``
    # (including the default ``None``) is wrapped into a one-element tuple, so
    # the default yields a single RateLimiter constructed with ``rate_limit_error=None``.
    errors: Tuple[Optional[Type[BaseException]], ...]
    if not hasattr(rate_limit_errors, "__iter__"):
        errors = (rate_limit_errors,)
    else:
        rate_limit_errors = cast(Tuple[Type[BaseException], ...], rate_limit_errors)
        errors = rate_limit_errors

    rate_limiters = [RateLimiter(rate_limit_error=rate_limit_error) for rate_limit_error in errors]

    def sync_run_experiment(test_case: TestCase) -> ExperimentRun:
        # Runs the task for one test case synchronously, traces it, and posts the run.
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            # A fresh Context() is passed so the span is not parented to any ambient span.
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    raise RuntimeError("Task is async but running in sync context")
                else:
                    output = _output
            except BaseException as exc:
                # Task failures are recorded on the span and in the run's ``error``
                # field instead of propagating.
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                # NOTE(review): placement at the ``if result`` level (not inside the
                # ``else``) is reconstructed — confirm against the original file.
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(SpanAttributes.OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        # NOTE(review): presumably this runs after the ``with`` block has closed the
        # span, since ``span.end_time`` is read below — confirm.
        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = sync_client.post(
            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
        )
        resp.raise_for_status()
        return experiment_run

    async def async_run_experiment(test_case: TestCase) -> ExperimentRun:
        # Async counterpart of ``sync_run_experiment``: awaits the task result when it
        # is awaitable and posts the run through the async client.
        example, repetition_number = test_case.example, test_case.repetition_number
        output = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                # Do not use keyword arguments, which can fail at runtime
                # even when function obeys protocol, because keyword arguments
                # are implementation details.
                _output = task(example)
                if isinstance(_output, Awaitable):
                    output = await _output
                else:
                    output = _output
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            span.set_attribute(INPUT_VALUE, json.dumps(example.input, ensure_ascii=False))
            span.set_attribute(INPUT_MIME_TYPE, JSON.value)
            if result := ExperimentResult(result=output) if output is not None else None:
                if isinstance(output, str):
                    span.set_attribute(OUTPUT_VALUE, output)
                else:
                    span.set_attribute(OUTPUT_VALUE, json.dumps(output, ensure_ascii=False))
                    span.set_attribute(OUTPUT_MIME_TYPE, JSON.value)
                # NOTE(review): nesting reconstructed, as in the sync variant — confirm.
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        assert isinstance(
            output, (dict, list, str, int, float, bool, type(None))
        ), "Output must be JSON serializable"
        experiment_run = ExperimentRun(
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            experiment_id=experiment_id,
            dataset_example_id=example.id,
            repetition_number=repetition_number,
            output=result,
            error=repr(error) if error else None,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = await async_client.post(
            f"/v1/experiments/{experiment_id}/runs", json=jsonify(experiment_run)
        )
        resp.raise_for_status()
        return experiment_run

    # Wrap both runners in every rate limiter, one layer per configured error type.
    rate_limited_sync_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.limit(fn), rate_limiters, sync_run_experiment
    )
    rate_limited_async_run_experiment = functools.reduce(
        lambda fn, limiter: limiter.alimit(fn), rate_limiters, async_run_experiment
    )

    # NOTE(review): presumably the executor picks the sync or async runner based on
    # the calling context and keeps going after individual failures — confirm
    # against ``get_executor_on_sync_context``.
    executor = get_executor_on_sync_context(
        rate_limited_sync_run_experiment,
        rate_limited_async_run_experiment,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
    )

    # One test case per (example, repetition) pair; repetition numbers start at 1.
    test_cases = [
        TestCase(example=ex, repetition_number=rep)
        for ex, rep in product(dataset.examples, range(1, repetitions + 1))
    ]
    _, _execution_details = executor.run(test_cases)
    experiment = Experiment(
        id=experiment_id,
        dataset_id=dataset.id,
        dataset_version_id=dataset.version_id,
        project_name=project_name,
    )

    print("✅ Task runs completed.")

    # Evaluation reuses the clients created above and the examples already in memory.
    if evaluators_by_name:
        _evaluate_experiment(
            experiment,
            evaluators=evaluators_by_name,
            dataset_examples=dataset.examples,
            clients=(sync_client, async_client),
        )

    return experiment
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
def evaluate_experiment(
    experiment: Experiment,
    evaluators: Union[
        ExperimentEvaluator,
        Sequence[ExperimentEvaluator],
        Mapping[EvaluatorName, ExperimentEvaluator],
    ],
) -> None:
    """Evaluate the runs of an existing experiment with the given evaluators.

    The dataset examples for the experiment's dataset version are fetched from
    ``/v1/datasets/{dataset_id}/examples`` and handed, together with freshly
    created HTTP clients, to ``_evaluate_experiment``.

    Args:
        experiment: Experiment: provides ``dataset_id`` and ``dataset_version_id``
        evaluators: a single evaluator, a sequence of evaluators, or a mapping
            from evaluator name to evaluator

    Raises:
        httpx.HTTPStatusError: when the examples request returns an error status
    """
    sync_client, async_client = _phoenix_clients()
    dataset_id = experiment.dataset_id
    dataset_version_id = experiment.dataset_version_id

    examples_response = sync_client.get(
        f"/v1/datasets/{dataset_id}/examples",
        params={"version-id": str(dataset_version_id)},
    )
    # Fail loudly on an error response. Without this check the lenient ``.get``
    # chain below would turn a failed request into an empty example list and the
    # evaluation would silently do nothing.
    examples_response.raise_for_status()
    dataset_examples = [
        Example.from_dict(ex)
        for ex in examples_response.json().get("data", {}).get("examples", [])
    ]
    _evaluate_experiment(
        experiment,
        evaluators=evaluators,
        dataset_examples=dataset_examples,
        clients=(sync_client, async_client),
    )
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
def _evaluate_experiment(
    experiment: Experiment,
    *,
    evaluators: Evaluators,
    dataset_examples: Iterable[Example],
    clients: Tuple[httpx.Client, httpx.AsyncClient],
) -> None:
    """Apply every evaluator to every run of ``experiment`` and post the results.

    The experiment's runs are fetched from ``/v1/experiments/{id}/runs`` and
    matched to ``dataset_examples`` by example id. Each (example, run,
    evaluator) combination is evaluated inside a traced span, and the outcome
    is POSTed to ``/v1/experiment_evaluations``.

    NOTE(review): the nesting below was reconstructed from a flattened listing;
    confirm block boundaries against the original file.

    Args:
        experiment: Experiment: provides the experiment ``id``
        evaluators: Evaluators: normalized with ``_evaluators_by_name``
        dataset_examples: Iterable[Example]: examples to pair with the runs
        clients: Tuple[httpx.Client, httpx.AsyncClient]: sync and async clients
            used for all requests

    Raises:
        ValueError: when no evaluator is left after normalization
        httpx.HTTPStatusError: when the runs request returns an error status
    """
    evaluators_by_name = _evaluators_by_name(evaluators)
    if not evaluators_by_name:
        raise ValueError("Must specify at least one Evaluator")
    experiment_id = experiment.id
    sync_client, async_client = clients
    runs_response = sync_client.get(f"/v1/experiments/{experiment_id}/runs")
    # Check the status before parsing so an error payload is never fed into
    # ``ExperimentRun.from_dict``; this matches how the POST calls are handled.
    runs_response.raise_for_status()
    experiment_runs = [ExperimentRun.from_dict(exp_run) for exp_run in runs_response.json()]

    # not all dataset examples have associated experiment runs, so we need to pair them up
    example_run_pairs = []
    examples_by_id = {example.id: example for example in dataset_examples}
    for exp_run in experiment_runs:
        example = examples_by_id.get(exp_run.dataset_example_id)
        if example:
            # Each pair gets its own deep copy of the example.
            example_run_pairs.append((deepcopy(example), exp_run))
    evaluation_input = [
        (example, run, evaluator)
        for (example, run), evaluator in product(example_run_pairs, evaluators_by_name.values())
    ]

    # Evaluator spans are exported under a fixed project name.
    project_name = "evaluators"
    resource = Resource({ResourceAttributes.PROJECT_NAME: project_name} if project_name else {})
    tracer_provider = trace_sdk.TracerProvider(resource=resource)
    tracer_provider.add_span_processor(
        SimpleSpanProcessor(OTLPSpanExporter(urljoin(f"{_get_base_url()}", "v1/traces")))
    )
    tracer = tracer_provider.get_tracer(__name__)
    root_span_kind = EVALUATOR

    def sync_evaluate_run(
        obj: Tuple[Example, ExperimentRun, Evaluator],
    ) -> ExperimentEvaluationRun:
        # Evaluates one (example, run, evaluator) triple synchronously and posts the result.
        example, experiment_run, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {evaluator.name}"
        with ExitStack() as stack:
            # A fresh Context() is passed so the span is not parented to any ambient span.
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                result = evaluator.evaluate(
                    output=None if experiment_run.output is None else experiment_run.output.result,
                    expected=example.output,
                    input=example.input,
                    metadata=example.metadata,
                )
            except BaseException as exc:
                # Evaluator failures are recorded on the span and in the payload's
                # ``error`` field instead of propagating.
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            if result:
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = sync_client.post("/v1/experiment_evaluations", json=jsonify(evaluator_payload))
        resp.raise_for_status()
        return evaluator_payload

    async def async_evaluate_run(
        obj: Tuple[Example, ExperimentRun, Evaluator],
    ) -> ExperimentEvaluationRun:
        # Async counterpart of ``sync_evaluate_run``; uses ``async_evaluate`` and the async client.
        example, experiment_run, evaluator = obj
        result: Optional[EvaluationResult] = None
        error: Optional[BaseException] = None
        status = Status(StatusCode.OK)
        root_span_name = f"Evaluation: {evaluator.name}"
        with ExitStack() as stack:
            span: Span = stack.enter_context(
                tracer.start_as_current_span(root_span_name, context=Context())
            )
            stack.enter_context(capture_spans(resource))
            try:
                result = await evaluator.async_evaluate(
                    output=None if experiment_run.output is None else experiment_run.output.result,
                    expected=example.output,
                    input=example.input,
                    metadata=example.metadata,
                )
            except BaseException as exc:
                span.record_exception(exc)
                status = Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                error = exc
            if result:
                span.set_attributes(dict(flatten(jsonify(result), recurse_on_sequence=True)))
            span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
            span.set_status(status)

        evaluator_payload = ExperimentEvaluationRun(
            experiment_run_id=cast(ExperimentRunId, experiment_run.id),
            start_time=_decode_unix_nano(cast(int, span.start_time)),
            end_time=_decode_unix_nano(cast(int, span.end_time)),
            name=evaluator.name,
            annotator_kind=evaluator.kind,
            error=repr(error) if error else None,
            result=result,
            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore[no-untyped-call]
        )
        resp = await async_client.post(
            "/v1/experiment_evaluations", json=jsonify(evaluator_payload)
        )
        resp.raise_for_status()
        return evaluator_payload

    # NOTE(review): presumably the executor picks the sync or async runner based on
    # the calling context and keeps going after individual failures — confirm
    # against ``get_executor_on_sync_context``.
    executor = get_executor_on_sync_context(
        sync_evaluate_run,
        async_evaluate_run,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        tqdm_bar_format=get_tqdm_progress_bar_formatter("running experiment evaluations"),
    )
    print("🧠 Evaluation started.")
    _, _execution_details = executor.run(evaluation_input)
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
def _evaluators_by_name(obj: Optional[Evaluators]) -> Mapping[EvaluatorName, Evaluator]:
    """Normalize the accepted evaluator inputs into a name-keyed mapping.

    Values that are not already ``Evaluator`` instances are wrapped with
    ``create_evaluator``; for mapping input the mapping key is passed along as
    ``name``. The resulting mapping is always keyed by each evaluator's own
    ``name`` attribute.

    Args:
        obj: Optional[Evaluators]: ``None``, a single evaluator, a sequence of
            evaluators, or a mapping from name to evaluator

    Returns:
        Mapping[EvaluatorName, Evaluator]: evaluators keyed by name; empty when
            ``obj`` is ``None``

    Raises:
        ValueError: when two evaluators end up with the same name
    """
    registry: Dict[EvaluatorName, Evaluator] = {}
    if obj is None:
        return registry

    # Reduce all three input shapes to (factory kwargs, candidate) pairs so the
    # wrapping and duplicate check are written once.
    pending: Sequence[Tuple[Dict[str, Any], Any]]
    if isinstance(obj, Mapping):
        pending = [({"name": key}, candidate) for key, candidate in obj.items()]
    elif isinstance(obj, Sequence):
        pending = [({}, candidate) for candidate in obj]
    else:
        pending = [({}, obj)]

    for factory_kwargs, candidate in pending:
        if isinstance(candidate, Evaluator):
            resolved = candidate
        else:
            resolved = create_evaluator(**factory_kwargs)(candidate)
        resolved_name = resolved.name
        if resolved_name in registry:
            raise ValueError(f"Two evaluators have the same name: {resolved_name}")
        registry[resolved_name] = resolved
    return registry
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
def _str_trace_id(id_: int) -> str:
|
|
523
|
-
return hexlify(id_.to_bytes(16, "big")).decode()
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
def _decode_unix_nano(time_unix_nano: int) -> datetime:
|
|
527
|
-
return datetime.fromtimestamp(time_unix_nano / 1e9, tz=timezone.utc)
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
def _get_task_name(task: ExperimentTask) -> str:
|
|
531
|
-
"""
|
|
532
|
-
Makes a best-effort attempt to get the name of the task.
|
|
533
|
-
"""
|
|
534
|
-
|
|
535
|
-
if isinstance(task, functools.partial):
|
|
536
|
-
return task.func.__qualname__
|
|
537
|
-
if hasattr(task, "__qualname__"):
|
|
538
|
-
return task.__qualname__
|
|
539
|
-
return str(task)
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
# Short aliases for the OpenInference span attribute keys set on spans in this module.
INPUT_VALUE = SpanAttributes.INPUT_VALUE
OUTPUT_VALUE = SpanAttributes.OUTPUT_VALUE
INPUT_MIME_TYPE = SpanAttributes.INPUT_MIME_TYPE
OUTPUT_MIME_TYPE = SpanAttributes.OUTPUT_MIME_TYPE
OPENINFERENCE_SPAN_KIND = SpanAttributes.OPENINFERENCE_SPAN_KIND

# Span kind values: CHAIN is used for task spans, EVALUATOR for evaluator spans.
CHAIN = OpenInferenceSpanKindValues.CHAIN.value
EVALUATOR = OpenInferenceSpanKindValues.EVALUATOR.value
# Enum member (not its value); call sites read ``JSON.value`` when setting MIME types.
JSON = OpenInferenceMimeTypeValues.JSON
|
phoenix/datasets/tracing.py
DELETED
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from contextlib import contextmanager
|
|
4
|
-
from contextvars import ContextVar
|
|
5
|
-
from threading import Lock
|
|
6
|
-
from typing import Any, Callable, Iterator, Optional
|
|
7
|
-
|
|
8
|
-
from opentelemetry.sdk.resources import Resource
|
|
9
|
-
from opentelemetry.sdk.trace import ReadableSpan
|
|
10
|
-
from opentelemetry.trace import INVALID_TRACE_ID
|
|
11
|
-
from wrapt import apply_patch, resolve_path, wrap_function_wrapper
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class SpanModifier:
    """
    Merges a fixed resource into the resource of spans handed to it.
    """

    __slots__ = ("_resource",)

    def __init__(self, resource: Resource) -> None:
        self._resource = resource

    def modify_resource(self, span: ReadableSpan) -> None:
        """
        Merge the resource given to the constructor into the span's resource.

        Spans without a context, or whose span id equals ``INVALID_TRACE_ID``,
        are left untouched.

        Args:
            span: ReadableSpan: the span to modify
        """
        context = span._context
        # NOTE(review): the span id is compared against INVALID_TRACE_ID rather
        # than a span-id constant — confirm this is intended.
        if context is not None and context.span_id != INVALID_TRACE_ID:
            span._resource = span._resource.merge(self._resource)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
# Context variable holding the SpanModifier currently in effect. It is created
# without a default, so readers must pass a fallback to ``get``.
_ACTIVE_MODIFIER: ContextVar[Optional[SpanModifier]] = ContextVar("active_modifier")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def override_span(init: Callable[..., None], span: ReadableSpan, args: Any, kwargs: Any) -> None:
    """
    Wrapper around the span initializer that applies the active modifier, if any.

    The wrapped ``init`` runs first; afterwards, when ``_ACTIVE_MODIFIER``
    currently holds a ``SpanModifier``, that modifier is applied to ``span``.

    Args:
        init: Callable[..., None]: the wrapped initializer
        span: ReadableSpan: the instance being initialized
        args: Any: positional arguments forwarded to ``init``
        kwargs: Any: keyword arguments forwarded to ``init``
    """
    init(*args, **kwargs)
    active = _ACTIVE_MODIFIER.get(None)
    if not isinstance(active, SpanModifier):
        return
    active.modify_resource(span)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
# Guards the counter below while the patch is installed or removed.
_SPAN_INIT_MONKEY_PATCH_LOCK = Lock()
# Number of currently active ``_monkey_patch_span_init`` contexts.
_SPAN_INIT_MONKEY_PATCH_COUNT = 0
# Module path and qualified name that locate ``ReadableSpan.__init__`` for patching.
_SPAN_INIT_MODULE = ReadableSpan.__init__.__module__
_SPAN_INIT_NAME = ReadableSpan.__init__.__qualname__
# Parent object, attribute name and original callable, kept so the patch can be undone.
_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL = resolve_path(
    _SPAN_INIT_MODULE, _SPAN_INIT_NAME
)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@contextmanager
def _monkey_patch_span_init() -> Iterator[None]:
    """
    Context manager that wraps ``ReadableSpan.__init__`` with ``override_span``.

    Active contexts are counted: the wrapper is installed when the count goes
    from zero to one and the original initializer is restored when it drops
    back to zero. The restore step runs in a ``finally`` block, so an
    exception raised inside the managed body can no longer leave the
    initializer patched or the counter out of balance.

    Returns:
        Iterator[None]: yields once while the patch is in place
    """
    global _SPAN_INIT_MONKEY_PATCH_COUNT
    with _SPAN_INIT_MONKEY_PATCH_LOCK:
        _SPAN_INIT_MONKEY_PATCH_COUNT += 1
        if _SPAN_INIT_MONKEY_PATCH_COUNT == 1:
            wrap_function_wrapper(
                module=_SPAN_INIT_MODULE, name=_SPAN_INIT_NAME, wrapper=override_span
            )
    try:
        yield
    finally:
        # Always undo our share of the patch, even when the body raised.
        with _SPAN_INIT_MONKEY_PATCH_LOCK:
            _SPAN_INIT_MONKEY_PATCH_COUNT -= 1
            if _SPAN_INIT_MONKEY_PATCH_COUNT == 0:
                apply_patch(_SPAN_INIT_PARENT, _SPAN_INIT_ATTR, _SPAN_INIT_ORIGINAL)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
@contextmanager
def capture_spans(resource: Resource) -> Iterator[SpanModifier]:
    """
    A context manager that captures spans and modifies them with the specified resources.

    The modifier is stored in ``_ACTIVE_MODIFIER`` for the duration of the
    context. The previous value is restored in a ``finally`` block, so the
    modifier does not leak into the surrounding context when the managed body
    raises.

    Args:
        resource: Resource: The resource to merge into the spans created within the context.

    Returns:
        modifier: Iterator[SpanModifier]: The span modifier that is active within the context.
    """
    modifier = SpanModifier(resource)
    with _monkey_patch_span_init():
        token = _ACTIVE_MODIFIER.set(modifier)
        try:
            yield modifier
        finally:
            # Restore whatever modifier (or absence of one) was active before.
            _ACTIVE_MODIFIER.reset(token)
|