arize-phoenix 4.4.4rc5__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of arize-phoenix might be problematic.

Files changed (118)
  1. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/METADATA +5 -5
  2. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/RECORD +56 -117
  3. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +27 -0
  5. phoenix/config.py +7 -21
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +62 -64
  8. phoenix/core/model_schema_adapter.py +25 -27
  9. phoenix/db/bulk_inserter.py +14 -54
  10. phoenix/db/insertion/evaluation.py +6 -6
  11. phoenix/db/insertion/helpers.py +2 -13
  12. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +28 -2
  13. phoenix/db/models.py +4 -236
  14. phoenix/inferences/fixtures.py +23 -23
  15. phoenix/inferences/inferences.py +7 -7
  16. phoenix/inferences/validation.py +1 -1
  17. phoenix/server/api/context.py +0 -18
  18. phoenix/server/api/dataloaders/__init__.py +0 -18
  19. phoenix/server/api/dataloaders/span_descendants.py +3 -2
  20. phoenix/server/api/routers/v1/__init__.py +2 -77
  21. phoenix/server/api/routers/v1/evaluations.py +2 -4
  22. phoenix/server/api/routers/v1/spans.py +1 -3
  23. phoenix/server/api/routers/v1/traces.py +4 -1
  24. phoenix/server/api/schema.py +303 -2
  25. phoenix/server/api/types/Cluster.py +19 -19
  26. phoenix/server/api/types/Dataset.py +63 -282
  27. phoenix/server/api/types/DatasetRole.py +23 -0
  28. phoenix/server/api/types/Dimension.py +29 -30
  29. phoenix/server/api/types/EmbeddingDimension.py +34 -40
  30. phoenix/server/api/types/Event.py +16 -16
  31. phoenix/server/api/{mutations/export_events_mutations.py → types/ExportEventsMutation.py} +14 -17
  32. phoenix/server/api/types/Model.py +42 -43
  33. phoenix/server/api/types/Project.py +12 -26
  34. phoenix/server/api/types/Span.py +2 -79
  35. phoenix/server/api/types/TimeSeries.py +6 -6
  36. phoenix/server/api/types/Trace.py +4 -15
  37. phoenix/server/api/types/UMAPPoints.py +1 -1
  38. phoenix/server/api/types/node.py +111 -5
  39. phoenix/server/api/types/pagination.py +52 -10
  40. phoenix/server/app.py +49 -101
  41. phoenix/server/main.py +27 -49
  42. phoenix/server/openapi/docs.py +0 -3
  43. phoenix/server/static/index.js +2595 -3523
  44. phoenix/server/templates/index.html +0 -1
  45. phoenix/services.py +15 -15
  46. phoenix/session/client.py +21 -438
  47. phoenix/session/session.py +37 -47
  48. phoenix/trace/exporter.py +9 -14
  49. phoenix/trace/fixtures.py +7 -133
  50. phoenix/trace/schemas.py +2 -1
  51. phoenix/trace/span_evaluations.py +3 -3
  52. phoenix/trace/trace_dataset.py +6 -6
  53. phoenix/version.py +1 -1
  54. phoenix/datasets/__init__.py +0 -0
  55. phoenix/datasets/evaluators/__init__.py +0 -18
  56. phoenix/datasets/evaluators/code_evaluators.py +0 -99
  57. phoenix/datasets/evaluators/llm_evaluators.py +0 -244
  58. phoenix/datasets/evaluators/utils.py +0 -292
  59. phoenix/datasets/experiments.py +0 -550
  60. phoenix/datasets/tracing.py +0 -85
  61. phoenix/datasets/types.py +0 -178
  62. phoenix/db/insertion/dataset.py +0 -237
  63. phoenix/db/migrations/types.py +0 -29
  64. phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -291
  65. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -100
  66. phoenix/server/api/dataloaders/dataset_example_spans.py +0 -43
  67. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -85
  68. phoenix/server/api/dataloaders/experiment_error_rates.py +0 -43
  69. phoenix/server/api/dataloaders/experiment_run_counts.py +0 -42
  70. phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -49
  71. phoenix/server/api/dataloaders/project_by_name.py +0 -31
  72. phoenix/server/api/dataloaders/span_projects.py +0 -33
  73. phoenix/server/api/dataloaders/trace_row_ids.py +0 -39
  74. phoenix/server/api/helpers/dataset_helpers.py +0 -179
  75. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -16
  76. phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -14
  77. phoenix/server/api/input_types/ClearProjectInput.py +0 -15
  78. phoenix/server/api/input_types/CreateDatasetInput.py +0 -12
  79. phoenix/server/api/input_types/DatasetExampleInput.py +0 -14
  80. phoenix/server/api/input_types/DatasetSort.py +0 -17
  81. phoenix/server/api/input_types/DatasetVersionSort.py +0 -16
  82. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -13
  83. phoenix/server/api/input_types/DeleteDatasetInput.py +0 -7
  84. phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -9
  85. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -35
  86. phoenix/server/api/input_types/PatchDatasetInput.py +0 -14
  87. phoenix/server/api/mutations/__init__.py +0 -13
  88. phoenix/server/api/mutations/auth.py +0 -11
  89. phoenix/server/api/mutations/dataset_mutations.py +0 -520
  90. phoenix/server/api/mutations/experiment_mutations.py +0 -65
  91. phoenix/server/api/mutations/project_mutations.py +0 -47
  92. phoenix/server/api/openapi/__init__.py +0 -0
  93. phoenix/server/api/openapi/main.py +0 -6
  94. phoenix/server/api/openapi/schema.py +0 -16
  95. phoenix/server/api/queries.py +0 -503
  96. phoenix/server/api/routers/v1/dataset_examples.py +0 -178
  97. phoenix/server/api/routers/v1/datasets.py +0 -965
  98. phoenix/server/api/routers/v1/experiment_evaluations.py +0 -66
  99. phoenix/server/api/routers/v1/experiment_runs.py +0 -108
  100. phoenix/server/api/routers/v1/experiments.py +0 -174
  101. phoenix/server/api/types/AnnotatorKind.py +0 -10
  102. phoenix/server/api/types/CreateDatasetPayload.py +0 -8
  103. phoenix/server/api/types/DatasetExample.py +0 -85
  104. phoenix/server/api/types/DatasetExampleRevision.py +0 -34
  105. phoenix/server/api/types/DatasetVersion.py +0 -14
  106. phoenix/server/api/types/ExampleRevisionInterface.py +0 -14
  107. phoenix/server/api/types/Experiment.py +0 -140
  108. phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -13
  109. phoenix/server/api/types/ExperimentComparison.py +0 -19
  110. phoenix/server/api/types/ExperimentRun.py +0 -91
  111. phoenix/server/api/types/ExperimentRunAnnotation.py +0 -57
  112. phoenix/server/api/types/Inferences.py +0 -80
  113. phoenix/server/api/types/InferencesRole.py +0 -23
  114. phoenix/utilities/json.py +0 -61
  115. phoenix/utilities/re.py +0 -50
  116. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/IP_NOTICE +0 -0
  117. {arize_phoenix-4.4.4rc5.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/LICENSE +0 -0
  118. /phoenix/server/api/{helpers/__init__.py → helpers.py} +0 -0
phoenix/server/templates/index.html CHANGED
@@ -31,7 +31,6 @@
       // injected into the client before React runs
       value: Object.freeze({
         basename: "{{basename}}",
-        platformVersion: "{{platform_version}}",
         hasInferences: Boolean("{{has_inferences}}" == "True"),
         hasCorpus: Boolean("{{has_corpus}}" == "True"),
         UMAP: {
phoenix/services.py CHANGED
@@ -99,10 +99,10 @@ class AppService(Service):
 
     working_dir = SERVER_DIR
 
-    # Internal references to the name / directory of the inferences(s)
-    __primary_inferences_name: str
-    __reference_inferences_name: Optional[str]
-    __corpus_inferences_name: Optional[str]
+    # Internal references to the name / directory of the dataset(s)
+    __primary_dataset_name: str
+    __reference_dataset_name: Optional[str]
+    __corpus_dataset_name: Optional[str]
     __trace_dataset_name: Optional[str]
 
     def __init__(
@@ -112,10 +112,10 @@ class AppService(Service):
         host: str,
         port: int,
         root_path: str,
-        primary_inferences_name: str,
+        primary_dataset_name: str,
         umap_params: str,
-        reference_inferences_name: Optional[str],
-        corpus_inferences_name: Optional[str],
+        reference_dataset_name: Optional[str],
+        corpus_dataset_name: Optional[str],
         trace_dataset_name: Optional[str],
     ):
         self.database_url = database_url
@@ -123,10 +123,10 @@ class AppService(Service):
         self.host = host
         self.port = port
         self.root_path = root_path  # TODO(mikeldking): Add support for root_path
-        self.__primary_inferences_name = primary_inferences_name
+        self.__primary_dataset_name = primary_dataset_name
         self.__umap_params = umap_params
-        self.__reference_inferences_name = reference_inferences_name
-        self.__corpus_inferences_name = corpus_inferences_name
+        self.__reference_dataset_name = reference_dataset_name
+        self.__corpus_dataset_name = corpus_dataset_name
         self.__trace_dataset_name = trace_dataset_name
         super().__init__()
 
@@ -147,12 +147,12 @@ class AppService(Service):
             self.__umap_params,
             "datasets",
             "--primary",
-            str(self.__primary_inferences_name),
+            str(self.__primary_dataset_name),
         ]
-        if self.__reference_inferences_name is not None:
-            command.extend(["--reference", str(self.__reference_inferences_name)])
-        if self.__corpus_inferences_name is not None:
-            command.extend(["--corpus", str(self.__corpus_inferences_name)])
+        if self.__reference_dataset_name is not None:
+            command.extend(["--reference", str(self.__reference_dataset_name)])
+        if self.__corpus_dataset_name is not None:
+            command.extend(["--corpus", str(self.__corpus_dataset_name)])
        if self.__trace_dataset_name is not None:
            command.extend(["--trace", str(self.__trace_dataset_name)])
        logger.info(f"command: {' '.join(command)}")
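The hunks above rename AppService's internal attributes (and the values fed to the launch command) from "inferences" back to "dataset" naming. As a rough sketch only, the argument assembly shown in the last hunk behaves like the standalone function below; the function name and its standalone form are illustrative, not part of phoenix.

    from typing import List, Optional

    def build_datasets_args(
        primary_dataset_name: str,
        reference_dataset_name: Optional[str] = None,
        corpus_dataset_name: Optional[str] = None,
        trace_dataset_name: Optional[str] = None,
    ) -> List[str]:
        # Mirrors the CLI-argument assembly in the hunk above.
        command = ["datasets", "--primary", str(primary_dataset_name)]
        if reference_dataset_name is not None:
            command.extend(["--reference", str(reference_dataset_name)])
        if corpus_dataset_name is not None:
            command.extend(["--corpus", str(corpus_dataset_name)])
        if trace_dataset_name is not None:
            command.extend(["--trace", str(trace_dataset_name)])
        return command

    # build_datasets_args("train", reference_dataset_name="prod")
    # -> ["datasets", "--primary", "train", "--reference", "prod"]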
phoenix/session/client.py CHANGED
@@ -1,48 +1,27 @@
-import csv
 import gzip
 import logging
 import weakref
-from collections import Counter
 from datetime import datetime
-from io import BytesIO, StringIO
-from pathlib import Path
-from typing import (
-    Any,
-    BinaryIO,
-    Dict,
-    Iterable,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
-from urllib.parse import quote, urljoin
+from io import BytesIO
+from typing import Any, List, Optional, Union, cast
+from urllib.parse import urljoin
 
-import httpx
 import pandas as pd
 import pyarrow as pa
-from httpx import HTTPStatusError
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceRequest
 from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
 from opentelemetry.proto.resource.v1.resource_pb2 import Resource
 from opentelemetry.proto.trace.v1.trace_pb2 import ResourceSpans, ScopeSpans
-from pyarrow import ArrowInvalid, Table
-from typing_extensions import TypeAlias, assert_never
+from pyarrow import ArrowInvalid
+from requests import Session
 
 from phoenix.config import (
-    get_env_client_headers,
     get_env_collector_endpoint,
     get_env_host,
     get_env_port,
     get_env_project_name,
 )
-from phoenix.datasets.types import Dataset, Example
 from phoenix.datetime_utils import normalize_datetime
-from phoenix.db.insertion.dataset import DatasetKeys
 from phoenix.session.data_extractor import DEFAULT_SPAN_LIMIT, TraceDataExtractor
 from phoenix.trace import Evaluations, TraceDataset
 from phoenix.trace.dsl import SpanQuery
@@ -50,8 +29,6 @@ from phoenix.trace.otel import encode_span_to_otlp
 
 logger = logging.getLogger(__name__)
 
-DatasetAction: TypeAlias = Literal["create", "append"]
-
 
 class Client(TraceDataExtractor):
     def __init__(
@@ -59,20 +36,14 @@ class Client(TraceDataExtractor):
         *,
         endpoint: Optional[str] = None,
         warn_if_server_not_running: bool = True,
-        headers: Optional[Mapping[str, str]] = None,
         **kwargs: Any,  # for backward-compatibility
     ):
         """
         Client for connecting to a Phoenix server.
 
         Args:
-            endpoint (str, optional): Phoenix server endpoint, e.g.
-                http://localhost:6006. If not provided, the endpoint will be
-                inferred from the environment variables.
-
-            headers (Mapping[str, str], optional): Headers to include in each
-                network request. If not provided, the headers will be inferred from
-                the environment variables (if present).
+            endpoint (str, optional): Phoenix server endpoint, e.g. http://localhost:6006. If not
+                provided, the endpoint will be inferred from the environment variables.
         """
         if kwargs.pop("use_active_session_if_available", None) is not None:
             print(
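Per the updated docstring, the endpoint can be passed explicitly or inferred from environment variables. A minimal usage sketch; the endpoint value is illustrative.

    # Minimal usage sketch based on the docstring above; omitting `endpoint`
    # falls back to environment-variable inference.
    from phoenix.session.client import Client

    client = Client(endpoint="http://localhost:6006")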
@@ -81,34 +52,17 @@
             )
         if kwargs:
             raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
-        headers = headers or get_env_client_headers()
         host = get_env_host()
         if host == "0.0.0.0":
             host = "127.0.0.1"
         base_url = endpoint or get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
         self._base_url = base_url if base_url.endswith("/") else base_url + "/"
-        self._client = httpx.Client(headers=headers)
-        weakref.finalize(self, self._client.close)
+
+        self._session = Session()
+        weakref.finalize(self, self._session.close)
         if warn_if_server_not_running:
             self._warn_if_phoenix_is_not_running()
 
-    @property
-    def web_url(self) -> str:
-        """
-        Return the web URL of the Phoenix UI. This is different from the base
-        URL in the cases where there is a proxy like colab
-
-
-        Returns:
-            str: A fully qualified URL to the Phoenix UI.
-        """
-        # Avoid circular import
-        from phoenix.session.session import active_session
-
-        if session := active_session():
-            return session.url
-        return self._base_url
-
     def query_spans(
         self,
         *queries: SpanQuery,
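The change above swaps the underlying HTTP client from httpx.Client to requests.Session and drops the web_url property. In isolation, the cleanup pattern it keeps (close the session when the owning object is garbage-collected) looks like the sketch below; the class name is illustrative.

    # Standalone sketch of the weakref.finalize cleanup pattern used above.
    import weakref

    from requests import Session

    class _SessionOwner:
        def __init__(self) -> None:
            self._session = Session()
            # Close the underlying Session when this object is collected.
            weakref.finalize(self, self._session.close)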
@@ -144,7 +98,7 @@
             "stop_time is deprecated. Use end_time instead.",
         )
         end_time = end_time or stop_time
-        response = self._client.post(
+        response = self._session.post(
             url=urljoin(self._base_url, "v1/spans"),
             params={"project-name": project_name},
             json={
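This POST backs Client.query_spans. A hedged usage sketch, assuming the project_name keyword forwarded above as the "project-name" query parameter is also a parameter of query_spans; the project name is illustrative.

    from phoenix.session.client import Client
    from phoenix.trace.dsl import SpanQuery

    client = Client()
    spans = client.query_spans(SpanQuery(), project_name="default")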
@@ -191,8 +145,8 @@
             empty list if no evaluations are found.
         """
         project_name = project_name or get_env_project_name()
-        response = self._client.get(
-            url=urljoin(self._base_url, "v1/evaluations"),
+        response = self._session.get(
+            urljoin(self._base_url, "v1/evaluations"),
             params={"project-name": project_name},
         )
         if response.status_code == 404:
@@ -213,7 +167,7 @@
 
     def _warn_if_phoenix_is_not_running(self) -> None:
         try:
-            self._client.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
+            self._session.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
         except Exception:
             logger.warning(
                 f"Arize Phoenix is not running on {self._base_url}. Launch Phoenix "
@@ -243,9 +197,9 @@
         headers = {"content-type": "application/x-pandas-arrow"}
         with pa.ipc.new_stream(sink, table.schema) as writer:
             writer.write_table(table)
-        self._client.post(
-            url=urljoin(self._base_url, "v1/evaluations"),
-            content=cast(bytes, sink.getvalue().to_pybytes()),
+        self._session.post(
+            urljoin(self._base_url, "v1/evaluations"),
+            data=cast(bytes, sink.getvalue().to_pybytes()),
             headers=headers,
         ).raise_for_status()
 
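For context, the evaluations payload above is a pandas DataFrame serialized to an Arrow IPC stream and posted as application/x-pandas-arrow. A minimal sketch of that serialization; the DataFrame contents are illustrative.

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"score": [1.0, 0.5]})
    table = pa.Table.from_pandas(df)
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    payload = sink.getvalue().to_pybytes()  # bytes sent with content-type application/x-pandas-arrow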
@@ -285,387 +239,16 @@
         ]
         for otlp_span in otlp_spans:
             serialized = otlp_span.SerializeToString()
-            content = gzip.compress(serialized)
-            self._client.post(
-                url=urljoin(self._base_url, "v1/traces"),
-                content=content,
+            data = gzip.compress(serialized)
+            self._session.post(
+                urljoin(self._base_url, "v1/traces"),
+                data=data,
                 headers={
                     "content-type": "application/x-protobuf",
                     "content-encoding": "gzip",
                 },
             ).raise_for_status()
 
-    def _get_dataset_id_by_name(self, name: str) -> str:
-        """
-        Gets a dataset by name.
-
-        Args:
-            name (str): The name of the dataset.
-            version_id (Optional[str]): The version ID of the dataset. Default None.
-
-        Returns:
-            Dataset: The dataset object.
-        """
-        response = self._client.get(
-            urljoin(self._base_url, "/v1/datasets"),
-            params={"name": name},
-        )
-        response.raise_for_status()
-        if not (records := response.json()["data"]):
-            raise ValueError(f"Failed to query dataset by name: {name}")
-        if len(records) > 1 or not records[0]:
-            raise ValueError(f"Failed to find a single dataset with the given name: {name}")
-        dataset = records[0]
-        return str(dataset["id"])
-
-    def get_dataset(
-        self,
-        *,
-        id: Optional[str] = None,
-        name: Optional[str] = None,
-        version_id: Optional[str] = None,
-    ) -> Dataset:
-        """
-        Gets the dataset for a specific version, or gets the latest version of
-        the dataset if no version is specified.
-
-        Args:
-
-            id (Optional[str]): An ID for the dataset.
-
-            name (Optional[str]): the name for the dataset. If provided, the ID
-                is ignored and the dataset is retrieved by name.
-
-            version_id (Optional[str]): An ID for the version of the dataset, or
-                None.
-
-        Returns:
-            A dataset object.
-        """
-        if name:
-            id = self._get_dataset_id_by_name(name)
-
-        if not id:
-            raise ValueError("Dataset id or name must be provided.")
-
-        response = self._client.get(
-            urljoin(self._base_url, f"/v1/datasets/{quote(id)}/examples"),
-            params={"version-id": version_id} if version_id else None,
-        )
-        response.raise_for_status()
-        data = response.json()["data"]
-        examples = [
-            Example(
-                id=example["id"],
-                input=example["input"],
-                output=example["output"],
-                metadata=example["metadata"],
-                updated_at=datetime.fromisoformat(example["updated_at"]),
-            )
-            for example in data["examples"]
-        ]
-        resolved_dataset_id = data["dataset_id"]
-        resolved_version_id = data["version_id"]
-        return Dataset(
-            id=resolved_dataset_id,
-            version_id=resolved_version_id,
-            examples=examples,
-        )
-
-    def get_dataset_versions(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        limit: Optional[int] = 100,
-    ) -> pd.DataFrame:
-        """
-        Get dataset versions as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            limit (Optional[int]): maximum number of versions to return,
-                starting from the most recent version
-
-        Returns:
-            pandas DataFrame
-        """
-        url = urljoin(self._base_url, f"v1/datasets/{dataset_id}/versions")
-        response = httpx.get(url=url, params={"limit": limit})
-        response.raise_for_status()
-        if not (records := response.json()["data"]):
-            return pd.DataFrame()
-        df = pd.DataFrame.from_records(records, index="version_id")
-        df["created_at"] = pd.to_datetime(df.created_at)
-        return df
-
-    def download_dataset_examples(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        dataset_version_id: Optional[str] = None,
-    ) -> pd.DataFrame:
-        """
-        Download dataset examples as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            dataset_version_id (Optional[str]): dataset version ID, if omitted,
-                the latest version is returned.
-
-        Returns:
-            pandas DataFrame
-        """
-        url = f"v1/datasets/{dataset_id}/csv"
-        response = httpx.get(
-            url=urljoin(self._base_url, url),
-            params={"version": dataset_version_id} if dataset_version_id else {},
-        )
-        response.raise_for_status()
-        return pd.read_csv(
-            StringIO(response.content.decode()),
-            index_col="example_id",
-        )
-
-    def create_examples(
-        self,
-        *,
-        dataset_name: str,
-        inputs: Iterable[Mapping[str, Any]],
-        outputs: Iterable[Mapping[str, Any]] = (),
-        metadata: Iterable[Mapping[str, Any]] = (),
-        dataset_description: Optional[str] = None,
-    ) -> Dataset:
-        """
-        Upload examples as dataset to the Phoenix server.
-
-        Args:
-            dataset_name: (str): Name of the dataset
-            inputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            outputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            metadata (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            dataset_description: (Optional[str]): Description of the dataset.
-
-        Returns:
-            A Dataset object with the uploaded examples.
-        """
-        # convert to list to avoid issues with pandas Series
-        inputs, outputs, metadata = list(inputs), list(outputs), list(metadata)
-        if not inputs or not _is_all_dict(inputs):
-            raise ValueError(
-                "`inputs` should be a non-empty sequence containing only dictionary objects"
-            )
-        for name, seq in {"outputs": outputs, "metadata": metadata}.items():
-            if seq and not (len(seq) == len(inputs) and _is_all_dict(seq)):
-                raise ValueError(
-                    f"`{name}` should be a sequence of the same length as `inputs` "
-                    "containing only dictionary objects"
-                )
-        action: DatasetAction = "create"
-        print("📤 Uploading dataset...")
-        response = self._client.post(
-            url=urljoin(self._base_url, "v1/datasets/upload"),
-            headers={"Content-Encoding": "gzip"},
-            json={
-                "action": action,
-                "name": dataset_name,
-                "description": dataset_description,
-                "inputs": inputs,
-                "outputs": outputs,
-                "metadata": metadata,
-            },
-            params={"sync": True},
-        )
-        try:
-            response.raise_for_status()
-        except HTTPStatusError as e:
-            if msg := response.text:
-                raise DatasetUploadError(msg) from e
-            raise
-        data = response.json()["data"]
-        dataset_id = data["dataset_id"]
-        response = self._client.get(
-            url=urljoin(self._base_url, f"v1/datasets/{dataset_id}/examples")
-        )
-        response.raise_for_status()
-        data = response.json()["data"]
-        version_id = data["version_id"]
-        examples = data["examples"]
-        print(f"💾 Examples uploaded: {self.web_url}datasets/{dataset_id}/examples")
-        print(f"🗄️ Dataset version ID: {version_id}")
-
-        return Dataset(
-            id=dataset_id,
-            version_id=version_id,
-            examples=[
-                Example(
-                    id=example["id"],
-                    input=example["input"],
-                    output=example["output"],
-                    metadata=example["metadata"],
-                    updated_at=datetime.fromisoformat(example["updated_at"]),
-                )
-                for example in examples
-            ],
-        )
-
-    def upload_dataset(
-        self,
-        table: Union[str, Path, pd.DataFrame],
-        /,
-        *,
-        name: str,
-        input_keys: Iterable[str],
-        output_keys: Iterable[str] = (),
-        metadata_keys: Iterable[str] = (),
-        description: Optional[str] = None,
-        action: Literal["create", "append"] = "create",
-    ) -> Dataset:
-        """
-        Upload examples as dataset to the Phoenix server.
-
-        Args:
-            table (str | Path | pd.DataFrame): Location of a CSV text file, or
-                pandas DataFrame.
-            name: (str): Name of the dataset. Required if action=append.
-            input_keys (Iterable[str]): List of column names used as input keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            output_keys (Iterable[str]): List of column names used as output keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            metadata_keys (Iterable[str]): List of column names used as metadata keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            description: (Optional[str]): Description of the dataset.
-            action: (Literal["create", "append"): Create new dataset or append to an
-                existing dataset. If action=append, dataset name is required.
-
-        Returns:
-            A Dataset object with the uploaded examples.
-        """
-        if action not in ("create", "append"):
-            raise ValueError(f"Invalid action: {action}")
-        if not name:
-            raise ValueError("Dataset name must not be blank")
-        keys = DatasetKeys(
-            frozenset(input_keys),
-            frozenset(output_keys),
-            frozenset(metadata_keys),
-        )
-        if isinstance(table, pd.DataFrame):
-            file = _prepare_pyarrow(table, keys)
-        elif isinstance(table, (str, Path)):
-            file = _prepare_csv(Path(table), keys)
-        else:
-            assert_never(table)
-        print("📤 Uploading dataset...")
-        response = self._client.post(
-            url=urljoin(self._base_url, "v1/datasets/upload"),
-            files={"file": file},
-            data={
-                "action": action,
-                "name": name,
-                "description": description,
-                "input_keys[]": sorted(keys.input),
-                "output_keys[]": sorted(keys.output),
-                "metadata_keys[]": sorted(keys.metadata),
-            },
-            params={"sync": True},
-        )
-        try:
-            response.raise_for_status()
-        except HTTPStatusError as e:
-            if msg := response.text:
-                raise DatasetUploadError(msg) from e
-            raise
-        data = response.json()["data"]
-        dataset_id = data["dataset_id"]
-        response = self._client.get(
-            url=urljoin(self._base_url, f"v1/datasets/{dataset_id}/examples")
-        )
-        response.raise_for_status()
-        data = response.json()["data"]
-        version_id = data["version_id"]
-        examples = data["examples"]
-        print(f"💾 Examples uploaded: {self.web_url}datasets/{dataset_id}/examples")
-        print(f"🗄️ Dataset version ID: {version_id}")
-
-        return Dataset(
-            id=dataset_id,
-            version_id=version_id,
-            examples=[
-                Example(
-                    id=example["id"],
-                    input=example["input"],
-                    output=example["output"],
-                    metadata=example["metadata"],
-                    updated_at=datetime.fromisoformat(example["updated_at"]),
-                )
-                for example in examples
-            ],
-        )
-
-
-FileName: TypeAlias = str
-FilePointer: TypeAlias = BinaryIO
-FileType: TypeAlias = str
-FileHeaders: TypeAlias = Dict[str, str]
-
-
-def _prepare_csv(
-    path: Path,
-    keys: DatasetKeys,
-) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
-    path = path.resolve()
-    if not path.is_file():
-        raise FileNotFoundError(f"File does not exist: {path}")
-    with open(path, "r") as f:
-        rows = csv.reader(f)
-        try:
-            column_headers = next(rows)
-            _ = next(rows)
-        except StopIteration:
-            raise ValueError("csv file has no data")
-    (header, freq), *_ = Counter(column_headers).most_common(1)
-    if freq > 1:
-        raise ValueError(f"Duplicated column header in CSV file: {header}")
-    keys.check_differences(frozenset(column_headers))
-    file = BytesIO()
-    with open(path, "rb") as f:
-        file.write(gzip.compress(f.read()))
-    return path.name, file, "text/csv", {"Content-Encoding": "gzip"}
-
-
-def _prepare_pyarrow(
-    df: pd.DataFrame,
-    keys: DatasetKeys,
-) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
-    if df.empty:
-        raise ValueError("dataframe has no data")
-    (header, freq), *_ = Counter(df.columns).most_common(1)
-    if freq > 1:
-        raise ValueError(f"Duplicated column header in file: {header}")
-    keys.check_differences(frozenset(df.columns))
-    table = Table.from_pandas(df.loc[:, list(keys)])
-    sink = pa.BufferOutputStream()
-    options = pa.ipc.IpcWriteOptions(compression="lz4")
-    with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
-        writer.write_table(table)
-    file = BytesIO(sink.getvalue().to_pybytes())
-    return "pandas", file, "application/x-pandas-pyarrow", {}
-
 
 def _to_iso_format(value: Optional[datetime]) -> Optional[str]:
     return value.isoformat() if value else None
-
-
-def _is_all_dict(seq: Sequence[Any]) -> bool:
-    return all(map(lambda obj: isinstance(obj, dict), seq))
-
-
-class DatasetUploadError(Exception): ...
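While this hunk removes the dataset-upload client surface, it keeps the trace-export path: each span is serialized to protobuf, gzip-compressed, and posted to v1/traces. A standalone sketch of that request shape; the function name and arguments are illustrative.

    import gzip
    from urllib.parse import urljoin

    from requests import Session

    def post_otlp_span(session: Session, base_url: str, serialized_span: bytes) -> None:
        # Mirrors the request built in the hunk above for a single serialized span.
        session.post(
            urljoin(base_url, "v1/traces"),
            data=gzip.compress(serialized_span),
            headers={
                "content-type": "application/x-protobuf",
                "content-encoding": "gzip",
            },
        ).raise_for_status()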