arize-phoenix 4.4.3__py3-none-any.whl → 4.4.4rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of arize-phoenix might be problematic.
- {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/METADATA +4 -4
- {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/RECORD +111 -55
- {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/WHEEL +1 -1
- phoenix/__init__.py +0 -27
- phoenix/config.py +21 -7
- phoenix/core/model.py +25 -25
- phoenix/core/model_schema.py +64 -62
- phoenix/core/model_schema_adapter.py +27 -25
- phoenix/datasets/__init__.py +0 -0
- phoenix/datasets/evaluators.py +275 -0
- phoenix/datasets/experiments.py +469 -0
- phoenix/datasets/tracing.py +66 -0
- phoenix/datasets/types.py +212 -0
- phoenix/db/bulk_inserter.py +54 -14
- phoenix/db/insertion/dataset.py +234 -0
- phoenix/db/insertion/evaluation.py +6 -6
- phoenix/db/insertion/helpers.py +13 -2
- phoenix/db/migrations/types.py +29 -0
- phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
- phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
- phoenix/db/models.py +230 -3
- phoenix/inferences/fixtures.py +23 -23
- phoenix/inferences/inferences.py +7 -7
- phoenix/inferences/validation.py +1 -1
- phoenix/server/api/context.py +16 -0
- phoenix/server/api/dataloaders/__init__.py +16 -0
- phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
- phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
- phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
- phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
- phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
- phoenix/server/api/dataloaders/project_by_name.py +31 -0
- phoenix/server/api/dataloaders/span_descendants.py +2 -3
- phoenix/server/api/dataloaders/span_projects.py +33 -0
- phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
- phoenix/server/api/helpers/dataset_helpers.py +178 -0
- phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
- phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
- phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
- phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
- phoenix/server/api/input_types/DatasetSort.py +17 -0
- phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
- phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
- phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
- phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
- phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
- phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
- phoenix/server/api/mutations/__init__.py +13 -0
- phoenix/server/api/mutations/auth.py +11 -0
- phoenix/server/api/mutations/dataset_mutations.py +520 -0
- phoenix/server/api/mutations/experiment_mutations.py +65 -0
- phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
- phoenix/server/api/mutations/project_mutations.py +42 -0
- phoenix/server/api/openapi/__init__.py +0 -0
- phoenix/server/api/openapi/main.py +6 -0
- phoenix/server/api/openapi/schema.py +15 -0
- phoenix/server/api/queries.py +503 -0
- phoenix/server/api/routers/v1/__init__.py +77 -2
- phoenix/server/api/routers/v1/dataset_examples.py +178 -0
- phoenix/server/api/routers/v1/datasets.py +861 -0
- phoenix/server/api/routers/v1/evaluations.py +4 -2
- phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
- phoenix/server/api/routers/v1/experiment_runs.py +108 -0
- phoenix/server/api/routers/v1/experiments.py +174 -0
- phoenix/server/api/routers/v1/spans.py +3 -1
- phoenix/server/api/routers/v1/traces.py +1 -4
- phoenix/server/api/schema.py +2 -303
- phoenix/server/api/types/AnnotatorKind.py +10 -0
- phoenix/server/api/types/Cluster.py +19 -19
- phoenix/server/api/types/CreateDatasetPayload.py +8 -0
- phoenix/server/api/types/Dataset.py +282 -63
- phoenix/server/api/types/DatasetExample.py +85 -0
- phoenix/server/api/types/DatasetExampleRevision.py +34 -0
- phoenix/server/api/types/DatasetVersion.py +14 -0
- phoenix/server/api/types/Dimension.py +30 -29
- phoenix/server/api/types/EmbeddingDimension.py +40 -34
- phoenix/server/api/types/Event.py +16 -16
- phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
- phoenix/server/api/types/Experiment.py +135 -0
- phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
- phoenix/server/api/types/ExperimentComparison.py +19 -0
- phoenix/server/api/types/ExperimentRun.py +91 -0
- phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
- phoenix/server/api/types/Inferences.py +80 -0
- phoenix/server/api/types/InferencesRole.py +23 -0
- phoenix/server/api/types/Model.py +43 -42
- phoenix/server/api/types/Project.py +26 -12
- phoenix/server/api/types/Span.py +78 -2
- phoenix/server/api/types/TimeSeries.py +6 -6
- phoenix/server/api/types/Trace.py +15 -4
- phoenix/server/api/types/UMAPPoints.py +1 -1
- phoenix/server/api/types/node.py +5 -111
- phoenix/server/api/types/pagination.py +10 -52
- phoenix/server/app.py +99 -49
- phoenix/server/main.py +49 -27
- phoenix/server/openapi/docs.py +3 -0
- phoenix/server/static/index.js +2246 -1368
- phoenix/server/templates/index.html +1 -0
- phoenix/services.py +15 -15
- phoenix/session/client.py +316 -21
- phoenix/session/session.py +47 -37
- phoenix/trace/exporter.py +14 -9
- phoenix/trace/fixtures.py +133 -7
- phoenix/trace/span_evaluations.py +3 -3
- phoenix/trace/trace_dataset.py +6 -6
- phoenix/utilities/json.py +61 -0
- phoenix/utilities/re.py +50 -0
- phoenix/version.py +1 -1
- phoenix/server/api/types/DatasetRole.py +0 -23
- {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/licenses/LICENSE +0 -0
- /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/server/templates/index.html
CHANGED

@@ -31,6 +31,7 @@
       // injected into the client before React runs
       value: Object.freeze({
         basename: "{{basename}}",
+        platformVersion: "{{platform_version}}",
         hasInferences: Boolean("{{has_inferences}}" == "True"),
         hasCorpus: Boolean("{{has_corpus}}" == "True"),
         UMAP: {
phoenix/services.py
CHANGED
@@ -99,10 +99,10 @@ class AppService(Service):

     working_dir = SERVER_DIR

-    # Internal references to the name / directory of the
-
-
-
+    # Internal references to the name / directory of the inferences(s)
+    __primary_inferences_name: str
+    __reference_inferences_name: Optional[str]
+    __corpus_inferences_name: Optional[str]
     __trace_dataset_name: Optional[str]

     def __init__(

@@ -112,10 +112,10 @@ class AppService(Service):
         host: str,
         port: int,
         root_path: str,
-
+        primary_inferences_name: str,
         umap_params: str,
-
-
+        reference_inferences_name: Optional[str],
+        corpus_inferences_name: Optional[str],
         trace_dataset_name: Optional[str],
     ):
         self.database_url = database_url

@@ -123,10 +123,10 @@ class AppService(Service):
         self.host = host
         self.port = port
         self.root_path = root_path  # TODO(mikeldking): Add support for root_path
-        self.
+        self.__primary_inferences_name = primary_inferences_name
         self.__umap_params = umap_params
-        self.
-        self.
+        self.__reference_inferences_name = reference_inferences_name
+        self.__corpus_inferences_name = corpus_inferences_name
         self.__trace_dataset_name = trace_dataset_name
         super().__init__()

@@ -147,12 +147,12 @@ class AppService(Service):
             self.__umap_params,
             "datasets",
             "--primary",
-            str(self.
+            str(self.__primary_inferences_name),
         ]
-        if self.
-            command.extend(["--reference", str(self.
-        if self.
-            command.extend(["--corpus", str(self.
+        if self.__reference_inferences_name is not None:
+            command.extend(["--reference", str(self.__reference_inferences_name)])
+        if self.__corpus_inferences_name is not None:
+            command.extend(["--corpus", str(self.__corpus_inferences_name)])
         if self.__trace_dataset_name is not None:
             command.extend(["--trace", str(self.__trace_dataset_name)])
         logger.info(f"command: {' '.join(command)}")
phoenix/session/client.py
CHANGED
@@ -1,27 +1,46 @@
+import csv
 import gzip
 import logging
 import weakref
+from collections import Counter
 from datetime import datetime
-from io import BytesIO
-from
-from
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import (
+    Any,
+    BinaryIO,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+from urllib.parse import quote, urljoin

+import httpx
 import pandas as pd
 import pyarrow as pa
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceRequest
 from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
 from opentelemetry.proto.resource.v1.resource_pb2 import Resource
 from opentelemetry.proto.trace.v1.trace_pb2 import ResourceSpans, ScopeSpans
-from pyarrow import ArrowInvalid
-from
+from pyarrow import ArrowInvalid, Table
+from typing_extensions import TypeAlias, assert_never

 from phoenix.config import (
+    get_env_client_headers,
     get_env_collector_endpoint,
     get_env_host,
     get_env_port,
     get_env_project_name,
 )
+from phoenix.datasets.types import Dataset, Example
 from phoenix.datetime_utils import normalize_datetime
+from phoenix.db.insertion.dataset import DatasetKeys
 from phoenix.session.data_extractor import DEFAULT_SPAN_LIMIT, TraceDataExtractor
 from phoenix.trace import Evaluations, TraceDataset
 from phoenix.trace.dsl import SpanQuery

@@ -36,14 +55,20 @@ class Client(TraceDataExtractor):
         *,
         endpoint: Optional[str] = None,
         warn_if_server_not_running: bool = True,
+        headers: Optional[Mapping[str, str]] = None,
         **kwargs: Any,  # for backward-compatibility
     ):
         """
         Client for connecting to a Phoenix server.

         Args:
-            endpoint (str, optional): Phoenix server endpoint, e.g.
-
+            endpoint (str, optional): Phoenix server endpoint, e.g.
+                http://localhost:6006. If not provided, the endpoint will be
+                inferred from the environment variables.
+
+            headers (Mapping[str, str], optional): Headers to include in each
+                network request. If not provided, the headers will be inferred from
+                the environment variables (if present).
         """
         if kwargs.pop("use_active_session_if_available", None) is not None:
             print(

@@ -52,14 +77,14 @@ class Client(TraceDataExtractor):
             )
         if kwargs:
             raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
+        headers = headers or get_env_client_headers()
         host = get_env_host()
         if host == "0.0.0.0":
             host = "127.0.0.1"
         base_url = endpoint or get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
         self._base_url = base_url if base_url.endswith("/") else base_url + "/"
-
-        self.
-        weakref.finalize(self, self._session.close)
+        self._client = httpx.Client(headers=headers)
+        weakref.finalize(self, self._client.close)
         if warn_if_server_not_running:
             self._warn_if_phoenix_is_not_running()

@@ -98,7 +123,7 @@ class Client(TraceDataExtractor):
             "stop_time is deprecated. Use end_time instead.",
         )
         end_time = end_time or stop_time
-        response = self.
+        response = self._client.post(
             url=urljoin(self._base_url, "v1/spans"),
             params={"project-name": project_name},
             json={

@@ -145,8 +170,8 @@ class Client(TraceDataExtractor):
             empty list if no evaluations are found.
         """
         project_name = project_name or get_env_project_name()
-        response = self.
-            urljoin(self._base_url, "v1/evaluations"),
+        response = self._client.get(
+            url=urljoin(self._base_url, "v1/evaluations"),
             params={"project-name": project_name},
         )
         if response.status_code == 404:

@@ -167,7 +192,7 @@ class Client(TraceDataExtractor):

     def _warn_if_phoenix_is_not_running(self) -> None:
         try:
-            self.
+            self._client.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
         except Exception:
             logger.warning(
                 f"Arize Phoenix is not running on {self._base_url}. Launch Phoenix "

@@ -197,9 +222,9 @@ class Client(TraceDataExtractor):
         headers = {"content-type": "application/x-pandas-arrow"}
         with pa.ipc.new_stream(sink, table.schema) as writer:
             writer.write_table(table)
-        self.
-            urljoin(self._base_url, "v1/evaluations"),
-
+        self._client.post(
+            url=urljoin(self._base_url, "v1/evaluations"),
+            content=cast(bytes, sink.getvalue().to_pybytes()),
             headers=headers,
         ).raise_for_status()

@@ -239,16 +264,286 @@ class Client(TraceDataExtractor):
         ]
         for otlp_span in otlp_spans:
             serialized = otlp_span.SerializeToString()
-
-            self.
-                urljoin(self._base_url, "v1/traces"),
-
+            content = gzip.compress(serialized)
+            self._client.post(
+                url=urljoin(self._base_url, "v1/traces"),
+                content=content,
                 headers={
                     "content-type": "application/x-protobuf",
                     "content-encoding": "gzip",
                 },
             ).raise_for_status()

+    def _get_dataset_id_by_name(self, name: str) -> str:
+        """
+        Gets a dataset by name.
+
+        Args:
+            name (str): The name of the dataset.
+            version_id (Optional[str]): The version ID of the dataset. Default None.
+
+        Returns:
+            Dataset: The dataset object.
+        """
+        response = self._client.get(
+            urljoin(self._base_url, "/v1/datasets"),
+            params={"name": name},
+        )
+        response.raise_for_status()
+        if not (records := response.json()["data"]):
+            raise ValueError(f"Failed to query dataset by name: {name}")
+        if len(records) > 1 or not records[0]:
+            raise ValueError(f"Failed to find a single dataset with the given name: {name}")
+        dataset = records[0]
+        return str(dataset["id"])
+
+    def get_dataset(
+        self,
+        *,
+        id: Optional[str] = None,
+        name: Optional[str] = None,
+        version_id: Optional[str] = None,
+    ) -> Dataset:
+        """
+        Gets the dataset for a specific version, or gets the latest version of
+        the dataset if no version is specified.
+
+        Args:
+
+            id (Optional[str]): An ID for the dataset.
+
+            name (Optional[str]): the name for the dataset. If provided, the ID
+                is ignored and the dataset is retrieved by name.
+
+            version_id (Optional[str]): An ID for the version of the dataset, or
+                None.
+
+        Returns:
+            A dataset object.
+        """
+        if name:
+            id = self._get_dataset_id_by_name(name)
+
+        if not id:
+            raise ValueError("Dataset id or name must be provided.")
+
+        response = self._client.get(
+            urljoin(self._base_url, f"/v1/datasets/{quote(id)}/examples"),
+            params={"version-id": version_id} if version_id else None,
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        examples = [
+            Example(
+                id=example["id"],
+                input=example["input"],
+                output=example["output"],
+                metadata=example["metadata"],
+                updated_at=datetime.fromisoformat(example["updated_at"]),
+            )
+            for example in data["examples"]
+        ]
+        resolved_dataset_id = data["dataset_id"]
+        resolved_version_id = data["version_id"]
+        return Dataset(
+            id=resolved_dataset_id,
+            version_id=resolved_version_id,
+            examples=examples,
+        )
+
+    def get_dataset_versions(
+        self,
+        dataset_id: str,
+        /,
+        *,
+        limit: Optional[int] = 100,
+    ) -> pd.DataFrame:
+        """
+        Get dataset versions as pandas DataFrame.
+
+        Args:
+            dataset_id (str): dataset ID
+            limit (Optional[int]): maximum number of versions to return,
+                starting from the most recent version
+
+        Returns:
+            pandas DataFrame
+        """
+        url = urljoin(self._base_url, f"v1/datasets/{dataset_id}/versions")
+        response = httpx.get(url=url, params={"limit": limit})
+        response.raise_for_status()
+        if not (records := response.json()["data"]):
+            return pd.DataFrame()
+        df = pd.DataFrame.from_records(records, index="version_id")
+        df["created_at"] = pd.to_datetime(df.created_at)
+        return df
+
+    def download_dataset_examples(
+        self,
+        dataset_id: str,
+        /,
+        *,
+        dataset_version_id: Optional[str] = None,
+    ) -> pd.DataFrame:
+        """
+        Download dataset examples as pandas DataFrame.
+
+        Args:
+            dataset_id (str): dataset ID
+            dataset_version_id (Optional[str]): dataset version ID, if omitted,
+                the latest version is returned.
+
+        Returns:
+            pandas DataFrame
+        """
+        url = f"v1/datasets/{dataset_id}/csv"
+        response = httpx.get(
+            url=urljoin(self._base_url, url),
+            params={"version": dataset_version_id} if dataset_version_id else {},
+        )
+        response.raise_for_status()
+        return pd.read_csv(
+            StringIO(response.content.decode()),
+            index_col="example_id",
+        )
+
+    def upload_dataset(
+        self,
+        table: Union[str, Path, pd.DataFrame],
+        /,
+        *,
+        name: str,
+        input_keys: Iterable[str],
+        output_keys: Iterable[str],
+        metadata_keys: Iterable[str] = (),
+        description: Optional[str] = None,
+        action: Literal["create", "append"] = "create",
+    ) -> Dataset:
+        """
+        Upload examples as dataset to the Phoenix server.
+
+        Args:
+            table (str | Path | pd.DataFrame): Location of a CSV text file, or
+                pandas DataFrame.
+            name: (str): Name of the dataset. Required if action=append.
+            input_keys (Iterable[str]): List of column names used as input keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            output_keys (Iterable[str]): List of column names used as output keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            metadata_keys (Iterable[str]): List of column names used as metadata keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            description: (Optional[str]): Description of the dataset.
+            action: (Literal["create", "append"): Create new dataset or append to an
+                existing dataset. If action=append, dataset name is required.
+
+        Returns:
+            A Dataset object with the uploaded examples.
+        """
+        if action not in ("create", "append"):
+            raise ValueError(f"Invalid action: {action}")
+        if not name:
+            raise ValueError("Dataset name must not be blank")
+        keys = DatasetKeys(
+            frozenset(input_keys),
+            frozenset(output_keys),
+            frozenset(metadata_keys),
+        )
+        if isinstance(table, pd.DataFrame):
+            file = _prepare_pyarrow(table, keys)
+        elif isinstance(table, (str, Path)):
+            file = _prepare_csv(Path(table), keys)
+        else:
+            assert_never(table)
+        response = self._client.post(
+            url=urljoin(self._base_url, "v1/datasets/upload"),
+            files={"file": file},
+            data={
+                "action": action,
+                "name": name,
+                "description": description,
+                "input_keys[]": sorted(keys.input),
+                "output_keys[]": sorted(keys.output),
+                "metadata_keys[]": sorted(keys.metadata),
+            },
+            params={"sync": True},
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        dataset_id = data["dataset_id"]
+        response = self._client.get(
+            url=urljoin(self._base_url, f"v1/datasets/{dataset_id}/examples")
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        version_id = data["version_id"]
+        examples = data["examples"]
+        return Dataset(
+            id=dataset_id,
+            version_id=version_id,
+            examples=[
+                Example(
+                    id=example["id"],
+                    input=example["input"],
+                    output=example["output"],
+                    metadata=example["metadata"],
+                    updated_at=datetime.fromisoformat(example["updated_at"]),
+                )
+                for example in examples
+            ],
+        )
+
+
+FileName: TypeAlias = str
+FilePointer: TypeAlias = BinaryIO
+FileType: TypeAlias = str
+FileHeaders: TypeAlias = Dict[str, str]
+
+
+def _prepare_csv(
+    path: Path,
+    keys: DatasetKeys,
+) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
+    path = path.resolve()
+    if not path.is_file():
+        raise FileNotFoundError(f"File does not exist: {path}")
+    with open(path, "r") as f:
+        rows = csv.reader(f)
+        try:
+            column_headers = next(rows)
+            _ = next(rows)
+        except StopIteration:
+            raise ValueError("csv file has no data")
+    (header, freq), *_ = Counter(column_headers).most_common(1)
+    if freq > 1:
+        raise ValueError(f"Duplicated column header in CSV file: {header}")
+    keys.check_differences(frozenset(column_headers))
+    file = BytesIO()
+    with open(path, "rb") as f:
+        file.write(gzip.compress(f.read()))
+    return path.name, file, "text/csv", {"Content-Encoding": "gzip"}
+
+
+def _prepare_pyarrow(
+    df: pd.DataFrame,
+    keys: DatasetKeys,
+) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
+    if df.empty:
+        raise ValueError("dataframe has no data")
+    (header, freq), *_ = Counter(df.columns).most_common(1)
+    if freq > 1:
+        raise ValueError(f"Duplicated column header in file: {header}")
+    keys.check_differences(frozenset(df.columns))
+    table = Table.from_pandas(df.loc[:, list(keys)])
+    sink = pa.BufferOutputStream()
+    options = pa.ipc.IpcWriteOptions(compression="lz4")
+    with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
+        writer.write_table(table)
+    file = BytesIO(sink.getvalue().to_pybytes())
+    return "pandas", file, "application/x-pandas-pyarrow", {}
+

 def _to_iso_format(value: Optional[datetime]) -> Optional[str]:
     return value.isoformat() if value else None
phoenix/session/session.py
CHANGED
@@ -37,10 +37,16 @@ from phoenix.config import (
     get_exported_files,
     get_working_dir,
 )
-from phoenix.core.model_schema_adapter import
+from phoenix.core.model_schema_adapter import create_model_from_inferences
 from phoenix.inferences.inferences import EMPTY_INFERENCES, Inferences
 from phoenix.pointcloud.umap_parameters import get_umap_parameters
-from phoenix.server.app import
+from phoenix.server.app import (
+    SessionFactory,
+    _db,
+    create_app,
+    create_engine_and_run_migrations,
+    instrument_engine_if_enabled,
+)
 from phoenix.server.thread_server import ThreadServer
 from phoenix.services import AppService
 from phoenix.session.client import Client

@@ -108,9 +114,9 @@ class Session(TraceDataExtractor, ABC):
     def __init__(
         self,
         database_url: str,
-
-
-
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,

@@ -118,9 +124,9 @@ class Session(TraceDataExtractor, ABC):
         notebook_env: Optional[NotebookEnvironment] = None,
     ):
         self._database_url = database_url
-        self.
-        self.
-        self.
+        self.primary_inferences = primary_inferences
+        self.reference_inferences = reference_inferences
+        self.corpus_inferences = corpus_inferences
         self.trace_dataset = trace_dataset
         self.umap_parameters = get_umap_parameters(default_umap_parameters)
         self.host = host or get_env_host()

@@ -264,9 +270,9 @@ class ProcessSession(Session):
     def __init__(
         self,
         database_url: str,
-
-
-
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,

@@ -276,20 +282,20 @@ class ProcessSession(Session):
     ) -> None:
         super().__init__(
             database_url=database_url,
-
-
-
+            primary_inferences=primary_inferences,
+            reference_inferences=reference_inferences,
+            corpus_inferences=corpus_inferences,
             trace_dataset=trace_dataset,
             default_umap_parameters=default_umap_parameters,
             host=host,
             port=port,
             notebook_env=notebook_env,
         )
-
-        if isinstance(
-
-        if isinstance(
-
+        primary_inferences.to_disc()
+        if isinstance(reference_inferences, Inferences):
+            reference_inferences.to_disc()
+        if isinstance(corpus_inferences, Inferences):
+            corpus_inferences.to_disc()
         if isinstance(trace_dataset, TraceDataset):
             trace_dataset.to_disc()
         umap_params_str = (

@@ -304,13 +310,13 @@ class ProcessSession(Session):
             host=self.host,
             port=self.port,
             root_path=self.root_path,
-
+            primary_inferences_name=self.primary_inferences.name,
             umap_params=umap_params_str,
-
-            self.
+            reference_inferences_name=(
+                self.reference_inferences.name if self.reference_inferences is not None else None
             ),
-
-            self.
+            corpus_inferences_name=(
+                self.corpus_inferences.name if self.corpus_inferences is not None else None
             ),
             trace_dataset_name=(
                 self.trace_dataset.name if self.trace_dataset is not None else None

@@ -330,9 +336,9 @@ class ThreadSession(Session):
     def __init__(
         self,
         database_url: str,
-
-
-
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,

@@ -342,29 +348,32 @@ class ThreadSession(Session):
     ):
         super().__init__(
             database_url=database_url,
-
-
-
+            primary_inferences=primary_inferences,
+            reference_inferences=reference_inferences,
+            corpus_inferences=corpus_inferences,
             trace_dataset=trace_dataset,
             default_umap_parameters=default_umap_parameters,
             host=host,
             port=port,
             notebook_env=notebook_env,
         )
-        self.model =
-
-
+        self.model = create_model_from_inferences(
+            primary_inferences,
+            reference_inferences,
         )
         self.corpus = (
-
-
+            create_model_from_inferences(
+                corpus_inferences,
             )
-        if
+            if corpus_inferences is not None
             else None
         )
         # Initialize an app service that keeps the server running
+        engine = create_engine_and_run_migrations(database_url)
+        instrumentation_cleanups = instrument_engine_if_enabled(engine)
+        factory = SessionFactory(session_factory=_db(engine), dialect=engine.dialect.name)
         self.app = create_app(
-
+            db=factory,
             export_path=self.export_path,
             model=self.model,
             corpus=self.corpus,

@@ -375,6 +384,7 @@ class ThreadSession(Session):
                 if (trace_dataset and (initial_evaluations := trace_dataset.evaluations))
                 else None
             ),
+            clean_up_callbacks=instrumentation_cleanups,
         )
         self.server = ThreadServer(
             app=self.app,