arize-phoenix 4.4.3__py3-none-any.whl → 4.4.4rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of arize-phoenix might be problematic.

Files changed (112)
  1. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/METADATA +4 -4
  2. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/RECORD +111 -55
  3. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +0 -27
  5. phoenix/config.py +21 -7
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +64 -62
  8. phoenix/core/model_schema_adapter.py +27 -25
  9. phoenix/datasets/__init__.py +0 -0
  10. phoenix/datasets/evaluators.py +275 -0
  11. phoenix/datasets/experiments.py +469 -0
  12. phoenix/datasets/tracing.py +66 -0
  13. phoenix/datasets/types.py +212 -0
  14. phoenix/db/bulk_inserter.py +54 -14
  15. phoenix/db/insertion/dataset.py +234 -0
  16. phoenix/db/insertion/evaluation.py +6 -6
  17. phoenix/db/insertion/helpers.py +13 -2
  18. phoenix/db/migrations/types.py +29 -0
  19. phoenix/db/migrations/versions/10460e46d750_datasets.py +291 -0
  20. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +2 -28
  21. phoenix/db/models.py +230 -3
  22. phoenix/inferences/fixtures.py +23 -23
  23. phoenix/inferences/inferences.py +7 -7
  24. phoenix/inferences/validation.py +1 -1
  25. phoenix/server/api/context.py +16 -0
  26. phoenix/server/api/dataloaders/__init__.py +16 -0
  27. phoenix/server/api/dataloaders/dataset_example_revisions.py +100 -0
  28. phoenix/server/api/dataloaders/dataset_example_spans.py +43 -0
  29. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +85 -0
  30. phoenix/server/api/dataloaders/experiment_error_rates.py +43 -0
  31. phoenix/server/api/dataloaders/experiment_sequence_number.py +49 -0
  32. phoenix/server/api/dataloaders/project_by_name.py +31 -0
  33. phoenix/server/api/dataloaders/span_descendants.py +2 -3
  34. phoenix/server/api/dataloaders/span_projects.py +33 -0
  35. phoenix/server/api/dataloaders/trace_row_ids.py +39 -0
  36. phoenix/server/api/helpers/dataset_helpers.py +178 -0
  37. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +16 -0
  38. phoenix/server/api/input_types/AddSpansToDatasetInput.py +14 -0
  39. phoenix/server/api/input_types/CreateDatasetInput.py +12 -0
  40. phoenix/server/api/input_types/DatasetExampleInput.py +14 -0
  41. phoenix/server/api/input_types/DatasetSort.py +17 -0
  42. phoenix/server/api/input_types/DatasetVersionSort.py +16 -0
  43. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +13 -0
  44. phoenix/server/api/input_types/DeleteDatasetInput.py +7 -0
  45. phoenix/server/api/input_types/DeleteExperimentsInput.py +9 -0
  46. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +35 -0
  47. phoenix/server/api/input_types/PatchDatasetInput.py +14 -0
  48. phoenix/server/api/mutations/__init__.py +13 -0
  49. phoenix/server/api/mutations/auth.py +11 -0
  50. phoenix/server/api/mutations/dataset_mutations.py +520 -0
  51. phoenix/server/api/mutations/experiment_mutations.py +65 -0
  52. phoenix/server/api/{types/ExportEventsMutation.py → mutations/export_events_mutations.py} +17 -14
  53. phoenix/server/api/mutations/project_mutations.py +42 -0
  54. phoenix/server/api/openapi/__init__.py +0 -0
  55. phoenix/server/api/openapi/main.py +6 -0
  56. phoenix/server/api/openapi/schema.py +15 -0
  57. phoenix/server/api/queries.py +503 -0
  58. phoenix/server/api/routers/v1/__init__.py +77 -2
  59. phoenix/server/api/routers/v1/dataset_examples.py +178 -0
  60. phoenix/server/api/routers/v1/datasets.py +861 -0
  61. phoenix/server/api/routers/v1/evaluations.py +4 -2
  62. phoenix/server/api/routers/v1/experiment_evaluations.py +65 -0
  63. phoenix/server/api/routers/v1/experiment_runs.py +108 -0
  64. phoenix/server/api/routers/v1/experiments.py +174 -0
  65. phoenix/server/api/routers/v1/spans.py +3 -1
  66. phoenix/server/api/routers/v1/traces.py +1 -4
  67. phoenix/server/api/schema.py +2 -303
  68. phoenix/server/api/types/AnnotatorKind.py +10 -0
  69. phoenix/server/api/types/Cluster.py +19 -19
  70. phoenix/server/api/types/CreateDatasetPayload.py +8 -0
  71. phoenix/server/api/types/Dataset.py +282 -63
  72. phoenix/server/api/types/DatasetExample.py +85 -0
  73. phoenix/server/api/types/DatasetExampleRevision.py +34 -0
  74. phoenix/server/api/types/DatasetVersion.py +14 -0
  75. phoenix/server/api/types/Dimension.py +30 -29
  76. phoenix/server/api/types/EmbeddingDimension.py +40 -34
  77. phoenix/server/api/types/Event.py +16 -16
  78. phoenix/server/api/types/ExampleRevisionInterface.py +14 -0
  79. phoenix/server/api/types/Experiment.py +135 -0
  80. phoenix/server/api/types/ExperimentAnnotationSummary.py +13 -0
  81. phoenix/server/api/types/ExperimentComparison.py +19 -0
  82. phoenix/server/api/types/ExperimentRun.py +91 -0
  83. phoenix/server/api/types/ExperimentRunAnnotation.py +57 -0
  84. phoenix/server/api/types/Inferences.py +80 -0
  85. phoenix/server/api/types/InferencesRole.py +23 -0
  86. phoenix/server/api/types/Model.py +43 -42
  87. phoenix/server/api/types/Project.py +26 -12
  88. phoenix/server/api/types/Span.py +78 -2
  89. phoenix/server/api/types/TimeSeries.py +6 -6
  90. phoenix/server/api/types/Trace.py +15 -4
  91. phoenix/server/api/types/UMAPPoints.py +1 -1
  92. phoenix/server/api/types/node.py +5 -111
  93. phoenix/server/api/types/pagination.py +10 -52
  94. phoenix/server/app.py +99 -49
  95. phoenix/server/main.py +49 -27
  96. phoenix/server/openapi/docs.py +3 -0
  97. phoenix/server/static/index.js +2246 -1368
  98. phoenix/server/templates/index.html +1 -0
  99. phoenix/services.py +15 -15
  100. phoenix/session/client.py +316 -21
  101. phoenix/session/session.py +47 -37
  102. phoenix/trace/exporter.py +14 -9
  103. phoenix/trace/fixtures.py +133 -7
  104. phoenix/trace/span_evaluations.py +3 -3
  105. phoenix/trace/trace_dataset.py +6 -6
  106. phoenix/utilities/json.py +61 -0
  107. phoenix/utilities/re.py +50 -0
  108. phoenix/version.py +1 -1
  109. phoenix/server/api/types/DatasetRole.py +0 -23
  110. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/licenses/IP_NOTICE +0 -0
  111. {arize_phoenix-4.4.3.dist-info → arize_phoenix-4.4.4rc1.dist-info}/licenses/LICENSE +0 -0
  112. /phoenix/server/api/{helpers.py → helpers/__init__.py} +0 -0
phoenix/server/templates/index.html CHANGED
@@ -31,6 +31,7 @@
       // injected into the client before React runs
       value: Object.freeze({
         basename: "{{basename}}",
+        platformVersion: "{{platform_version}}",
         hasInferences: Boolean("{{has_inferences}}" == "True"),
         hasCorpus: Boolean("{{has_corpus}}" == "True"),
         UMAP: {
phoenix/services.py CHANGED
@@ -99,10 +99,10 @@ class AppService(Service):

     working_dir = SERVER_DIR

-    # Internal references to the name / directory of the dataset(s)
-    __primary_dataset_name: str
-    __reference_dataset_name: Optional[str]
-    __corpus_dataset_name: Optional[str]
+    # Internal references to the name / directory of the inferences(s)
+    __primary_inferences_name: str
+    __reference_inferences_name: Optional[str]
+    __corpus_inferences_name: Optional[str]
     __trace_dataset_name: Optional[str]

     def __init__(
@@ -112,10 +112,10 @@ class AppService(Service):
         host: str,
         port: int,
         root_path: str,
-        primary_dataset_name: str,
+        primary_inferences_name: str,
         umap_params: str,
-        reference_dataset_name: Optional[str],
-        corpus_dataset_name: Optional[str],
+        reference_inferences_name: Optional[str],
+        corpus_inferences_name: Optional[str],
         trace_dataset_name: Optional[str],
     ):
         self.database_url = database_url
@@ -123,10 +123,10 @@ class AppService(Service):
         self.host = host
         self.port = port
         self.root_path = root_path  # TODO(mikeldking): Add support for root_path
-        self.__primary_dataset_name = primary_dataset_name
+        self.__primary_inferences_name = primary_inferences_name
         self.__umap_params = umap_params
-        self.__reference_dataset_name = reference_dataset_name
-        self.__corpus_dataset_name = corpus_dataset_name
+        self.__reference_inferences_name = reference_inferences_name
+        self.__corpus_inferences_name = corpus_inferences_name
         self.__trace_dataset_name = trace_dataset_name
         super().__init__()

@@ -147,12 +147,12 @@ class AppService(Service):
             self.__umap_params,
             "datasets",
             "--primary",
-            str(self.__primary_dataset_name),
+            str(self.__primary_inferences_name),
         ]
-        if self.__reference_dataset_name is not None:
-            command.extend(["--reference", str(self.__reference_dataset_name)])
-        if self.__corpus_dataset_name is not None:
-            command.extend(["--corpus", str(self.__corpus_dataset_name)])
+        if self.__reference_inferences_name is not None:
+            command.extend(["--reference", str(self.__reference_inferences_name)])
+        if self.__corpus_inferences_name is not None:
+            command.extend(["--corpus", str(self.__corpus_inferences_name)])
         if self.__trace_dataset_name is not None:
            command.extend(["--trace", str(self.__trace_dataset_name)])
         logger.info(f"command: {' '.join(command)}")
phoenix/session/client.py CHANGED
@@ -1,27 +1,46 @@
+import csv
 import gzip
 import logging
 import weakref
+from collections import Counter
 from datetime import datetime
-from io import BytesIO
-from typing import Any, List, Optional, Union, cast
-from urllib.parse import urljoin
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import (
+    Any,
+    BinaryIO,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+from urllib.parse import quote, urljoin

+import httpx
 import pandas as pd
 import pyarrow as pa
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceRequest
 from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
 from opentelemetry.proto.resource.v1.resource_pb2 import Resource
 from opentelemetry.proto.trace.v1.trace_pb2 import ResourceSpans, ScopeSpans
-from pyarrow import ArrowInvalid
-from requests import Session
+from pyarrow import ArrowInvalid, Table
+from typing_extensions import TypeAlias, assert_never

 from phoenix.config import (
+    get_env_client_headers,
     get_env_collector_endpoint,
     get_env_host,
     get_env_port,
     get_env_project_name,
 )
+from phoenix.datasets.types import Dataset, Example
 from phoenix.datetime_utils import normalize_datetime
+from phoenix.db.insertion.dataset import DatasetKeys
 from phoenix.session.data_extractor import DEFAULT_SPAN_LIMIT, TraceDataExtractor
 from phoenix.trace import Evaluations, TraceDataset
 from phoenix.trace.dsl import SpanQuery
@@ -36,14 +55,20 @@ class Client(TraceDataExtractor):
         *,
         endpoint: Optional[str] = None,
         warn_if_server_not_running: bool = True,
+        headers: Optional[Mapping[str, str]] = None,
         **kwargs: Any,  # for backward-compatibility
     ):
         """
         Client for connecting to a Phoenix server.

         Args:
-            endpoint (str, optional): Phoenix server endpoint, e.g. http://localhost:6006. If not
-                provided, the endpoint will be inferred from the environment variables.
+            endpoint (str, optional): Phoenix server endpoint, e.g.
+                http://localhost:6006. If not provided, the endpoint will be
+                inferred from the environment variables.
+
+            headers (Mapping[str, str], optional): Headers to include in each
+                network request. If not provided, the headers will be inferred from
+                the environment variables (if present).
         """
         if kwargs.pop("use_active_session_if_available", None) is not None:
             print(
@@ -52,14 +77,14 @@ class Client(TraceDataExtractor):
             )
         if kwargs:
             raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
+        headers = headers or get_env_client_headers()
         host = get_env_host()
         if host == "0.0.0.0":
             host = "127.0.0.1"
         base_url = endpoint or get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
         self._base_url = base_url if base_url.endswith("/") else base_url + "/"
-
-        self._session = Session()
-        weakref.finalize(self, self._session.close)
+        self._client = httpx.Client(headers=headers)
+        weakref.finalize(self, self._client.close)
         if warn_if_server_not_running:
             self._warn_if_phoenix_is_not_running()

@@ -98,7 +123,7 @@ class Client(TraceDataExtractor):
                 "stop_time is deprecated. Use end_time instead.",
             )
         end_time = end_time or stop_time
-        response = self._session.post(
+        response = self._client.post(
             url=urljoin(self._base_url, "v1/spans"),
             params={"project-name": project_name},
             json={
@@ -145,8 +170,8 @@ class Client(TraceDataExtractor):
            empty list if no evaluations are found.
         """
         project_name = project_name or get_env_project_name()
-        response = self._session.get(
-            urljoin(self._base_url, "v1/evaluations"),
+        response = self._client.get(
+            url=urljoin(self._base_url, "v1/evaluations"),
             params={"project-name": project_name},
         )
         if response.status_code == 404:
@@ -167,7 +192,7 @@ class Client(TraceDataExtractor):

     def _warn_if_phoenix_is_not_running(self) -> None:
         try:
-            self._session.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
+            self._client.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
         except Exception:
             logger.warning(
                 f"Arize Phoenix is not running on {self._base_url}. Launch Phoenix "
@@ -197,9 +222,9 @@ class Client(TraceDataExtractor):
         headers = {"content-type": "application/x-pandas-arrow"}
         with pa.ipc.new_stream(sink, table.schema) as writer:
             writer.write_table(table)
-        self._session.post(
-            urljoin(self._base_url, "v1/evaluations"),
-            data=cast(bytes, sink.getvalue().to_pybytes()),
+        self._client.post(
+            url=urljoin(self._base_url, "v1/evaluations"),
+            content=cast(bytes, sink.getvalue().to_pybytes()),
             headers=headers,
         ).raise_for_status()

@@ -239,16 +264,286 @@ class Client(TraceDataExtractor):
         ]
         for otlp_span in otlp_spans:
             serialized = otlp_span.SerializeToString()
-            data = gzip.compress(serialized)
-            self._session.post(
-                urljoin(self._base_url, "v1/traces"),
-                data=data,
+            content = gzip.compress(serialized)
+            self._client.post(
+                url=urljoin(self._base_url, "v1/traces"),
+                content=content,
                 headers={
                     "content-type": "application/x-protobuf",
                     "content-encoding": "gzip",
                 },
             ).raise_for_status()

+    def _get_dataset_id_by_name(self, name: str) -> str:
+        """
+        Gets a dataset by name.
+
+        Args:
+            name (str): The name of the dataset.
+            version_id (Optional[str]): The version ID of the dataset. Default None.
+
+        Returns:
+            Dataset: The dataset object.
+        """
+        response = self._client.get(
+            urljoin(self._base_url, "/v1/datasets"),
+            params={"name": name},
+        )
+        response.raise_for_status()
+        if not (records := response.json()["data"]):
+            raise ValueError(f"Failed to query dataset by name: {name}")
+        if len(records) > 1 or not records[0]:
+            raise ValueError(f"Failed to find a single dataset with the given name: {name}")
+        dataset = records[0]
+        return str(dataset["id"])
+
+    def get_dataset(
+        self,
+        *,
+        id: Optional[str] = None,
+        name: Optional[str] = None,
+        version_id: Optional[str] = None,
+    ) -> Dataset:
+        """
+        Gets the dataset for a specific version, or gets the latest version of
+        the dataset if no version is specified.
+
+        Args:
+
+            id (Optional[str]): An ID for the dataset.
+
+            name (Optional[str]): the name for the dataset. If provided, the ID
+                is ignored and the dataset is retrieved by name.
+
+            version_id (Optional[str]): An ID for the version of the dataset, or
+                None.
+
+        Returns:
+            A dataset object.
+        """
+        if name:
+            id = self._get_dataset_id_by_name(name)
+
+        if not id:
+            raise ValueError("Dataset id or name must be provided.")
+
+        response = self._client.get(
+            urljoin(self._base_url, f"/v1/datasets/{quote(id)}/examples"),
+            params={"version-id": version_id} if version_id else None,
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        examples = [
+            Example(
+                id=example["id"],
+                input=example["input"],
+                output=example["output"],
+                metadata=example["metadata"],
+                updated_at=datetime.fromisoformat(example["updated_at"]),
+            )
+            for example in data["examples"]
+        ]
+        resolved_dataset_id = data["dataset_id"]
+        resolved_version_id = data["version_id"]
+        return Dataset(
+            id=resolved_dataset_id,
+            version_id=resolved_version_id,
+            examples=examples,
+        )
+
+    def get_dataset_versions(
+        self,
+        dataset_id: str,
+        /,
+        *,
+        limit: Optional[int] = 100,
+    ) -> pd.DataFrame:
+        """
+        Get dataset versions as pandas DataFrame.
+
+        Args:
+            dataset_id (str): dataset ID
+            limit (Optional[int]): maximum number of versions to return,
+                starting from the most recent version
+
+        Returns:
+            pandas DataFrame
+        """
+        url = urljoin(self._base_url, f"v1/datasets/{dataset_id}/versions")
+        response = httpx.get(url=url, params={"limit": limit})
+        response.raise_for_status()
+        if not (records := response.json()["data"]):
+            return pd.DataFrame()
+        df = pd.DataFrame.from_records(records, index="version_id")
+        df["created_at"] = pd.to_datetime(df.created_at)
+        return df
+
+    def download_dataset_examples(
+        self,
+        dataset_id: str,
+        /,
+        *,
+        dataset_version_id: Optional[str] = None,
+    ) -> pd.DataFrame:
+        """
+        Download dataset examples as pandas DataFrame.
+
+        Args:
+            dataset_id (str): dataset ID
+            dataset_version_id (Optional[str]): dataset version ID, if omitted,
+                the latest version is returned.
+
+        Returns:
+            pandas DataFrame
+        """
+        url = f"v1/datasets/{dataset_id}/csv"
+        response = httpx.get(
+            url=urljoin(self._base_url, url),
+            params={"version": dataset_version_id} if dataset_version_id else {},
+        )
+        response.raise_for_status()
+        return pd.read_csv(
+            StringIO(response.content.decode()),
+            index_col="example_id",
+        )
+
+    def upload_dataset(
+        self,
+        table: Union[str, Path, pd.DataFrame],
+        /,
+        *,
+        name: str,
+        input_keys: Iterable[str],
+        output_keys: Iterable[str],
+        metadata_keys: Iterable[str] = (),
+        description: Optional[str] = None,
+        action: Literal["create", "append"] = "create",
+    ) -> Dataset:
+        """
+        Upload examples as dataset to the Phoenix server.
+
+        Args:
+            table (str | Path | pd.DataFrame): Location of a CSV text file, or
+                pandas DataFrame.
+            name: (str): Name of the dataset. Required if action=append.
+            input_keys (Iterable[str]): List of column names used as input keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            output_keys (Iterable[str]): List of column names used as output keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            metadata_keys (Iterable[str]): List of column names used as metadata keys.
+                input_keys, output_keys, metadata_keys must be disjoint, and must
+                exist in CSV column headers.
+            description: (Optional[str]): Description of the dataset.
+            action: (Literal["create", "append"): Create new dataset or append to an
+                existing dataset. If action=append, dataset name is required.
+
+        Returns:
+            A Dataset object with the uploaded examples.
+        """
+        if action not in ("create", "append"):
+            raise ValueError(f"Invalid action: {action}")
+        if not name:
+            raise ValueError("Dataset name must not be blank")
+        keys = DatasetKeys(
+            frozenset(input_keys),
+            frozenset(output_keys),
+            frozenset(metadata_keys),
+        )
+        if isinstance(table, pd.DataFrame):
+            file = _prepare_pyarrow(table, keys)
+        elif isinstance(table, (str, Path)):
+            file = _prepare_csv(Path(table), keys)
+        else:
+            assert_never(table)
+        response = self._client.post(
+            url=urljoin(self._base_url, "v1/datasets/upload"),
+            files={"file": file},
+            data={
+                "action": action,
+                "name": name,
+                "description": description,
+                "input_keys[]": sorted(keys.input),
+                "output_keys[]": sorted(keys.output),
+                "metadata_keys[]": sorted(keys.metadata),
+            },
+            params={"sync": True},
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        dataset_id = data["dataset_id"]
+        response = self._client.get(
+            url=urljoin(self._base_url, f"v1/datasets/{dataset_id}/examples")
+        )
+        response.raise_for_status()
+        data = response.json()["data"]
+        version_id = data["version_id"]
+        examples = data["examples"]
+        return Dataset(
+            id=dataset_id,
+            version_id=version_id,
+            examples=[
+                Example(
+                    id=example["id"],
+                    input=example["input"],
+                    output=example["output"],
+                    metadata=example["metadata"],
+                    updated_at=datetime.fromisoformat(example["updated_at"]),
+                )
+                for example in examples
+            ],
+        )
+
+
+FileName: TypeAlias = str
+FilePointer: TypeAlias = BinaryIO
+FileType: TypeAlias = str
+FileHeaders: TypeAlias = Dict[str, str]
+
+
+def _prepare_csv(
+    path: Path,
+    keys: DatasetKeys,
+) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
+    path = path.resolve()
+    if not path.is_file():
+        raise FileNotFoundError(f"File does not exist: {path}")
+    with open(path, "r") as f:
+        rows = csv.reader(f)
+        try:
+            column_headers = next(rows)
+            _ = next(rows)
+        except StopIteration:
+            raise ValueError("csv file has no data")
+    (header, freq), *_ = Counter(column_headers).most_common(1)
+    if freq > 1:
+        raise ValueError(f"Duplicated column header in CSV file: {header}")
+    keys.check_differences(frozenset(column_headers))
+    file = BytesIO()
+    with open(path, "rb") as f:
+        file.write(gzip.compress(f.read()))
+    return path.name, file, "text/csv", {"Content-Encoding": "gzip"}
+
+
+def _prepare_pyarrow(
+    df: pd.DataFrame,
+    keys: DatasetKeys,
+) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
+    if df.empty:
+        raise ValueError("dataframe has no data")
+    (header, freq), *_ = Counter(df.columns).most_common(1)
+    if freq > 1:
+        raise ValueError(f"Duplicated column header in file: {header}")
+    keys.check_differences(frozenset(df.columns))
+    table = Table.from_pandas(df.loc[:, list(keys)])
+    sink = pa.BufferOutputStream()
+    options = pa.ipc.IpcWriteOptions(compression="lz4")
+    with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
+        writer.write_table(table)
+    file = BytesIO(sink.getvalue().to_pybytes())
+    return "pandas", file, "application/x-pandas-pyarrow", {}
+

 def _to_iso_format(value: Optional[datetime]) -> Optional[str]:
     return value.isoformat() if value else None
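
Taken together, the new methods give Client a dataset round-trip workflow. The following is a usage sketch based only on the signatures added above; it assumes a running Phoenix server exposing the new v1/datasets routes, and the dataset name, DataFrame columns, and authorization header are illustrative:

import pandas as pd

from phoenix.session.client import Client

# headers is optional; when omitted it falls back to get_env_client_headers().
client = Client(
    endpoint="http://localhost:6006",
    headers={"authorization": "Bearer ..."},  # illustrative
)

# Upload a DataFrame as a new dataset (column names are illustrative).
df = pd.DataFrame(
    {
        "question": ["What is Phoenix?"],
        "answer": ["An open-source observability library."],
        "source": ["docs"],
    }
)
dataset = client.upload_dataset(
    df,
    name="qa-examples",
    input_keys=["question"],
    output_keys=["answer"],
    metadata_keys=["source"],
)

# Read it back: latest version by name, version history, and raw examples.
latest = client.get_dataset(name="qa-examples")
versions = client.get_dataset_versions(dataset.id)
examples = client.download_dataset_examples(dataset.id)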
phoenix/session/session.py CHANGED
@@ -37,10 +37,16 @@ from phoenix.config import (
     get_exported_files,
     get_working_dir,
 )
-from phoenix.core.model_schema_adapter import create_model_from_datasets
+from phoenix.core.model_schema_adapter import create_model_from_inferences
 from phoenix.inferences.inferences import EMPTY_INFERENCES, Inferences
 from phoenix.pointcloud.umap_parameters import get_umap_parameters
-from phoenix.server.app import create_app
+from phoenix.server.app import (
+    SessionFactory,
+    _db,
+    create_app,
+    create_engine_and_run_migrations,
+    instrument_engine_if_enabled,
+)
 from phoenix.server.thread_server import ThreadServer
 from phoenix.services import AppService
 from phoenix.session.client import Client
@@ -108,9 +114,9 @@ class Session(TraceDataExtractor, ABC):
     def __init__(
         self,
         database_url: str,
-        primary_dataset: Inferences,
-        reference_dataset: Optional[Inferences] = None,
-        corpus_dataset: Optional[Inferences] = None,
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,
@@ -118,9 +124,9 @@ class Session(TraceDataExtractor, ABC):
         notebook_env: Optional[NotebookEnvironment] = None,
     ):
         self._database_url = database_url
-        self.primary_dataset = primary_dataset
-        self.reference_dataset = reference_dataset
-        self.corpus_dataset = corpus_dataset
+        self.primary_inferences = primary_inferences
+        self.reference_inferences = reference_inferences
+        self.corpus_inferences = corpus_inferences
         self.trace_dataset = trace_dataset
         self.umap_parameters = get_umap_parameters(default_umap_parameters)
         self.host = host or get_env_host()
@@ -264,9 +270,9 @@ class ProcessSession(Session):
     def __init__(
         self,
         database_url: str,
-        primary_dataset: Inferences,
-        reference_dataset: Optional[Inferences] = None,
-        corpus_dataset: Optional[Inferences] = None,
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,
@@ -276,20 +282,20 @@ class ProcessSession(Session):
     ) -> None:
         super().__init__(
             database_url=database_url,
-            primary_dataset=primary_dataset,
-            reference_dataset=reference_dataset,
-            corpus_dataset=corpus_dataset,
+            primary_inferences=primary_inferences,
+            reference_inferences=reference_inferences,
+            corpus_inferences=corpus_inferences,
             trace_dataset=trace_dataset,
             default_umap_parameters=default_umap_parameters,
             host=host,
             port=port,
             notebook_env=notebook_env,
         )
-        primary_dataset.to_disc()
-        if isinstance(reference_dataset, Inferences):
-            reference_dataset.to_disc()
-        if isinstance(corpus_dataset, Inferences):
-            corpus_dataset.to_disc()
+        primary_inferences.to_disc()
+        if isinstance(reference_inferences, Inferences):
+            reference_inferences.to_disc()
+        if isinstance(corpus_inferences, Inferences):
+            corpus_inferences.to_disc()
         if isinstance(trace_dataset, TraceDataset):
             trace_dataset.to_disc()
         umap_params_str = (
@@ -304,13 +310,13 @@ class ProcessSession(Session):
            host=self.host,
            port=self.port,
            root_path=self.root_path,
-            primary_dataset_name=self.primary_dataset.name,
+            primary_inferences_name=self.primary_inferences.name,
            umap_params=umap_params_str,
-            reference_dataset_name=(
-                self.reference_dataset.name if self.reference_dataset is not None else None
+            reference_inferences_name=(
+                self.reference_inferences.name if self.reference_inferences is not None else None
            ),
-            corpus_dataset_name=(
-                self.corpus_dataset.name if self.corpus_dataset is not None else None
+            corpus_inferences_name=(
+                self.corpus_inferences.name if self.corpus_inferences is not None else None
            ),
            trace_dataset_name=(
                self.trace_dataset.name if self.trace_dataset is not None else None
@@ -330,9 +336,9 @@ class ThreadSession(Session):
     def __init__(
         self,
         database_url: str,
-        primary_dataset: Inferences,
-        reference_dataset: Optional[Inferences] = None,
-        corpus_dataset: Optional[Inferences] = None,
+        primary_inferences: Inferences,
+        reference_inferences: Optional[Inferences] = None,
+        corpus_inferences: Optional[Inferences] = None,
         trace_dataset: Optional[TraceDataset] = None,
         default_umap_parameters: Optional[Mapping[str, Any]] = None,
         host: Optional[str] = None,
@@ -342,29 +348,32 @@ class ThreadSession(Session):
     ):
         super().__init__(
             database_url=database_url,
-            primary_dataset=primary_dataset,
-            reference_dataset=reference_dataset,
-            corpus_dataset=corpus_dataset,
+            primary_inferences=primary_inferences,
+            reference_inferences=reference_inferences,
+            corpus_inferences=corpus_inferences,
             trace_dataset=trace_dataset,
             default_umap_parameters=default_umap_parameters,
            host=host,
            port=port,
            notebook_env=notebook_env,
        )
-        self.model = create_model_from_datasets(
-            primary_dataset,
-            reference_dataset,
+        self.model = create_model_from_inferences(
+            primary_inferences,
+            reference_inferences,
         )
         self.corpus = (
-            create_model_from_datasets(
-                corpus_dataset,
+            create_model_from_inferences(
+                corpus_inferences,
             )
-            if corpus_dataset is not None
+            if corpus_inferences is not None
             else None
         )
         # Initialize an app service that keeps the server running
+        engine = create_engine_and_run_migrations(database_url)
+        instrumentation_cleanups = instrument_engine_if_enabled(engine)
+        factory = SessionFactory(session_factory=_db(engine), dialect=engine.dialect.name)
         self.app = create_app(
-            database_url=database_url,
+            db=factory,
             export_path=self.export_path,
             model=self.model,
             corpus=self.corpus,
@@ -375,6 +384,7 @@
                if (trace_dataset and (initial_evaluations := trace_dataset.evaluations))
                else None
            ),
+            clean_up_callbacks=instrumentation_cleanups,
        )
        self.server = ThreadServer(
            app=self.app,
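
The net effect of the ThreadSession changes is that create_app no longer receives a raw database URL: the session builds the engine, runs migrations, optionally instruments the engine, and hands create_app a session factory plus cleanup callbacks. A condensed sketch of just that wiring, using only the names imported in the hunk above (a hypothetical helper; the remaining create_app arguments from the diff are omitted for brevity):

from phoenix.server.app import (
    SessionFactory,
    _db,
    create_engine_and_run_migrations,
    instrument_engine_if_enabled,
)

def build_db_layer(database_url: str):
    # Mirrors the added lines: engine + migrations first, optional
    # instrumentation, then a SessionFactory for create_app(db=...).
    engine = create_engine_and_run_migrations(database_url)
    cleanups = instrument_engine_if_enabled(engine)  # passed as clean_up_callbacks
    factory = SessionFactory(session_factory=_db(engine), dialect=engine.dialect.name)
    return factory, cleanups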