arize-phoenix 4.4.4rc6-py3-none-any.whl → 4.5.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of arize-phoenix has been flagged for review.

Files changed (123)
  1. {arize_phoenix-4.4.4rc6.dist-info → arize_phoenix-4.5.0.dist-info}/METADATA +8 -14
  2. {arize_phoenix-4.4.4rc6.dist-info → arize_phoenix-4.5.0.dist-info}/RECORD +58 -122
  3. {arize_phoenix-4.4.4rc6.dist-info → arize_phoenix-4.5.0.dist-info}/WHEEL +1 -1
  4. phoenix/__init__.py +27 -0
  5. phoenix/config.py +7 -42
  6. phoenix/core/model.py +25 -25
  7. phoenix/core/model_schema.py +62 -64
  8. phoenix/core/model_schema_adapter.py +25 -27
  9. phoenix/datetime_utils.py +0 -4
  10. phoenix/db/bulk_inserter.py +14 -54
  11. phoenix/db/insertion/evaluation.py +10 -10
  12. phoenix/db/insertion/helpers.py +14 -17
  13. phoenix/db/insertion/span.py +3 -3
  14. phoenix/db/migrations/versions/cf03bd6bae1d_init.py +28 -2
  15. phoenix/db/models.py +4 -236
  16. phoenix/inferences/fixtures.py +23 -23
  17. phoenix/inferences/inferences.py +7 -7
  18. phoenix/inferences/validation.py +1 -1
  19. phoenix/server/api/context.py +0 -20
  20. phoenix/server/api/dataloaders/__init__.py +0 -20
  21. phoenix/server/api/dataloaders/span_descendants.py +3 -2
  22. phoenix/server/api/routers/v1/__init__.py +2 -77
  23. phoenix/server/api/routers/v1/evaluations.py +13 -8
  24. phoenix/server/api/routers/v1/spans.py +5 -9
  25. phoenix/server/api/routers/v1/traces.py +4 -1
  26. phoenix/server/api/schema.py +303 -2
  27. phoenix/server/api/types/Cluster.py +19 -19
  28. phoenix/server/api/types/Dataset.py +63 -282
  29. phoenix/server/api/types/DatasetRole.py +23 -0
  30. phoenix/server/api/types/Dimension.py +29 -30
  31. phoenix/server/api/types/EmbeddingDimension.py +34 -40
  32. phoenix/server/api/types/Event.py +16 -16
  33. phoenix/server/api/{mutations/export_events_mutations.py → types/ExportEventsMutation.py} +14 -17
  34. phoenix/server/api/types/Model.py +42 -43
  35. phoenix/server/api/types/Project.py +12 -26
  36. phoenix/server/api/types/Span.py +2 -79
  37. phoenix/server/api/types/TimeSeries.py +6 -6
  38. phoenix/server/api/types/Trace.py +4 -15
  39. phoenix/server/api/types/UMAPPoints.py +1 -1
  40. phoenix/server/api/types/node.py +111 -5
  41. phoenix/server/api/types/pagination.py +52 -10
  42. phoenix/server/app.py +49 -103
  43. phoenix/server/main.py +27 -49
  44. phoenix/server/openapi/docs.py +0 -3
  45. phoenix/server/static/index.js +1384 -2390
  46. phoenix/server/templates/index.html +0 -1
  47. phoenix/services.py +15 -15
  48. phoenix/session/client.py +23 -611
  49. phoenix/session/session.py +37 -47
  50. phoenix/trace/exporter.py +9 -14
  51. phoenix/trace/fixtures.py +7 -133
  52. phoenix/trace/schemas.py +2 -1
  53. phoenix/trace/span_evaluations.py +3 -3
  54. phoenix/trace/trace_dataset.py +6 -6
  55. phoenix/version.py +1 -1
  56. phoenix/db/insertion/dataset.py +0 -237
  57. phoenix/db/migrations/types.py +0 -29
  58. phoenix/db/migrations/versions/10460e46d750_datasets.py +0 -291
  59. phoenix/experiments/__init__.py +0 -6
  60. phoenix/experiments/evaluators/__init__.py +0 -29
  61. phoenix/experiments/evaluators/base.py +0 -153
  62. phoenix/experiments/evaluators/code_evaluators.py +0 -99
  63. phoenix/experiments/evaluators/llm_evaluators.py +0 -244
  64. phoenix/experiments/evaluators/utils.py +0 -189
  65. phoenix/experiments/functions.py +0 -616
  66. phoenix/experiments/tracing.py +0 -85
  67. phoenix/experiments/types.py +0 -722
  68. phoenix/experiments/utils.py +0 -9
  69. phoenix/server/api/dataloaders/average_experiment_run_latency.py +0 -54
  70. phoenix/server/api/dataloaders/dataset_example_revisions.py +0 -100
  71. phoenix/server/api/dataloaders/dataset_example_spans.py +0 -43
  72. phoenix/server/api/dataloaders/experiment_annotation_summaries.py +0 -85
  73. phoenix/server/api/dataloaders/experiment_error_rates.py +0 -43
  74. phoenix/server/api/dataloaders/experiment_run_counts.py +0 -42
  75. phoenix/server/api/dataloaders/experiment_sequence_number.py +0 -49
  76. phoenix/server/api/dataloaders/project_by_name.py +0 -31
  77. phoenix/server/api/dataloaders/span_projects.py +0 -33
  78. phoenix/server/api/dataloaders/trace_row_ids.py +0 -39
  79. phoenix/server/api/helpers/dataset_helpers.py +0 -179
  80. phoenix/server/api/input_types/AddExamplesToDatasetInput.py +0 -16
  81. phoenix/server/api/input_types/AddSpansToDatasetInput.py +0 -14
  82. phoenix/server/api/input_types/ClearProjectInput.py +0 -15
  83. phoenix/server/api/input_types/CreateDatasetInput.py +0 -12
  84. phoenix/server/api/input_types/DatasetExampleInput.py +0 -14
  85. phoenix/server/api/input_types/DatasetSort.py +0 -17
  86. phoenix/server/api/input_types/DatasetVersionSort.py +0 -16
  87. phoenix/server/api/input_types/DeleteDatasetExamplesInput.py +0 -13
  88. phoenix/server/api/input_types/DeleteDatasetInput.py +0 -7
  89. phoenix/server/api/input_types/DeleteExperimentsInput.py +0 -9
  90. phoenix/server/api/input_types/PatchDatasetExamplesInput.py +0 -35
  91. phoenix/server/api/input_types/PatchDatasetInput.py +0 -14
  92. phoenix/server/api/mutations/__init__.py +0 -13
  93. phoenix/server/api/mutations/auth.py +0 -11
  94. phoenix/server/api/mutations/dataset_mutations.py +0 -520
  95. phoenix/server/api/mutations/experiment_mutations.py +0 -65
  96. phoenix/server/api/mutations/project_mutations.py +0 -47
  97. phoenix/server/api/openapi/__init__.py +0 -0
  98. phoenix/server/api/openapi/main.py +0 -6
  99. phoenix/server/api/openapi/schema.py +0 -16
  100. phoenix/server/api/queries.py +0 -503
  101. phoenix/server/api/routers/v1/dataset_examples.py +0 -178
  102. phoenix/server/api/routers/v1/datasets.py +0 -965
  103. phoenix/server/api/routers/v1/experiment_evaluations.py +0 -65
  104. phoenix/server/api/routers/v1/experiment_runs.py +0 -96
  105. phoenix/server/api/routers/v1/experiments.py +0 -174
  106. phoenix/server/api/types/AnnotatorKind.py +0 -10
  107. phoenix/server/api/types/CreateDatasetPayload.py +0 -8
  108. phoenix/server/api/types/DatasetExample.py +0 -85
  109. phoenix/server/api/types/DatasetExampleRevision.py +0 -34
  110. phoenix/server/api/types/DatasetVersion.py +0 -14
  111. phoenix/server/api/types/ExampleRevisionInterface.py +0 -14
  112. phoenix/server/api/types/Experiment.py +0 -147
  113. phoenix/server/api/types/ExperimentAnnotationSummary.py +0 -13
  114. phoenix/server/api/types/ExperimentComparison.py +0 -19
  115. phoenix/server/api/types/ExperimentRun.py +0 -91
  116. phoenix/server/api/types/ExperimentRunAnnotation.py +0 -57
  117. phoenix/server/api/types/Inferences.py +0 -80
  118. phoenix/server/api/types/InferencesRole.py +0 -23
  119. phoenix/utilities/json.py +0 -61
  120. phoenix/utilities/re.py +0 -50
  121. {arize_phoenix-4.4.4rc6.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/IP_NOTICE +0 -0
  122. {arize_phoenix-4.4.4rc6.dist-info → arize_phoenix-4.5.0.dist-info}/licenses/LICENSE +0 -0
  123. /phoenix/server/api/{helpers/__init__.py → helpers.py} +0 -0
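
The RECORD change above (+58 -122) is where the removed dataset and experiment modules are easiest to see as a whole. As a minimal sketch, assuming both wheel files have been downloaded locally (the filenames below follow standard wheel naming and are illustrative), the same file-level comparison can be reproduced by diffing each wheel's RECORD:

# Hypothetical helper: compare the installed-file lists of the two wheels.
# A wheel is a zip archive; its *.dist-info/RECORD lists every file it installs.
import zipfile

def record_paths(wheel_path: str, dist_info: str) -> set:
    with zipfile.ZipFile(wheel_path) as wheel:
        record = wheel.read(f"{dist_info}/RECORD").decode()
    # Each RECORD row is "path,hash,size"; keep just the path column.
    return {line.split(",")[0] for line in record.splitlines() if line}

old = record_paths("arize_phoenix-4.4.4rc6-py3-none-any.whl", "arize_phoenix-4.4.4rc6.dist-info")
new = record_paths("arize_phoenix-4.5.0-py3-none-any.whl", "arize_phoenix-4.5.0.dist-info")

print("removed:", sorted(old - new))  # e.g. the phoenix/experiments/* modules listed above
print("added:", sorted(new - old))
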
phoenix/session/client.py CHANGED
@@ -1,49 +1,27 @@
-import csv
 import gzip
 import logging
-import re
 import weakref
-from collections import Counter
 from datetime import datetime
-from io import BytesIO, StringIO
-from pathlib import Path
-from typing import (
-    Any,
-    BinaryIO,
-    Dict,
-    Iterable,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
-from urllib.parse import quote, urljoin
+from io import BytesIO
+from typing import Any, List, Optional, Union, cast
+from urllib.parse import urljoin

-import httpx
 import pandas as pd
 import pyarrow as pa
-from httpx import HTTPStatusError, Response
 from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import ExportTraceServiceRequest
 from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
 from opentelemetry.proto.resource.v1.resource_pb2 import Resource
 from opentelemetry.proto.trace.v1.trace_pb2 import ResourceSpans, ScopeSpans
-from pyarrow import ArrowInvalid, Table
-from typing_extensions import TypeAlias, assert_never
+from pyarrow import ArrowInvalid
+from requests import Session

 from phoenix.config import (
-    get_env_client_headers,
     get_env_collector_endpoint,
     get_env_host,
     get_env_port,
     get_env_project_name,
 )
 from phoenix.datetime_utils import normalize_datetime
-from phoenix.db.insertion.dataset import DatasetKeys
-from phoenix.experiments.types import Dataset, Example
 from phoenix.session.data_extractor import DEFAULT_SPAN_LIMIT, TraceDataExtractor
 from phoenix.trace import Evaluations, TraceDataset
 from phoenix.trace.dsl import SpanQuery
@@ -51,8 +29,6 @@ from phoenix.trace.otel import encode_span_to_otlp

 logger = logging.getLogger(__name__)

-DatasetAction: TypeAlias = Literal["create", "append"]
-

 class Client(TraceDataExtractor):
     def __init__(
@@ -60,20 +36,14 @@ class Client(TraceDataExtractor):
         *,
         endpoint: Optional[str] = None,
         warn_if_server_not_running: bool = True,
-        headers: Optional[Mapping[str, str]] = None,
         **kwargs: Any,  # for backward-compatibility
     ):
         """
         Client for connecting to a Phoenix server.

         Args:
-            endpoint (str, optional): Phoenix server endpoint, e.g.
-                http://localhost:6006. If not provided, the endpoint will be
-                inferred from the environment variables.
-
-            headers (Mapping[str, str], optional): Headers to include in each
-                network request. If not provided, the headers will be inferred from
-                the environment variables (if present).
+            endpoint (str, optional): Phoenix server endpoint, e.g. http://localhost:6006. If not
+                provided, the endpoint will be inferred from the environment variables.
         """
         if kwargs.pop("use_active_session_if_available", None) is not None:
             print(
@@ -82,34 +52,17 @@ class Client(TraceDataExtractor):
             )
         if kwargs:
             raise TypeError(f"Unexpected keyword arguments: {', '.join(kwargs)}")
-        headers = headers or get_env_client_headers()
         host = get_env_host()
         if host == "0.0.0.0":
             host = "127.0.0.1"
         base_url = endpoint or get_env_collector_endpoint() or f"http://{host}:{get_env_port()}"
         self._base_url = base_url if base_url.endswith("/") else base_url + "/"
-        self._client = httpx.Client(headers=headers)
-        weakref.finalize(self, self._client.close)
+
+        self._session = Session()
+        weakref.finalize(self, self._session.close)
         if warn_if_server_not_running:
             self._warn_if_phoenix_is_not_running()

-    @property
-    def web_url(self) -> str:
-        """
-        Return the web URL of the Phoenix UI. This is different from the base
-        URL in the cases where there is a proxy like colab
-
-
-        Returns:
-            str: A fully qualified URL to the Phoenix UI.
-        """
-        # Avoid circular import
-        from phoenix.session.session import active_session
-
-        if session := active_session():
-            return session.url
-        return self._base_url
-
     def query_spans(
         self,
         *queries: SpanQuery,
@@ -145,12 +98,9 @@ class Client(TraceDataExtractor):
                 "stop_time is deprecated. Use end_time instead.",
             )
         end_time = end_time or stop_time
-        response = self._client.post(
+        response = self._session.post(
            url=urljoin(self._base_url, "v1/spans"),
-            params={
-                "project_name": project_name,
-                "project-name": project_name,  # for backward-compatibility
-            },
+            params={"project-name": project_name},
            json={
                 "queries": [q.to_dict() for q in queries],
                 "start_time": _to_iso_format(normalize_datetime(start_time)),
@@ -195,12 +145,9 @@ class Client(TraceDataExtractor):
            empty list if no evaluations are found.
         """
         project_name = project_name or get_env_project_name()
-        response = self._client.get(
-            url=urljoin(self._base_url, "v1/evaluations"),
-            params={
-                "project_name": project_name,
-                "project-name": project_name,  # for backward-compatibility
-            },
+        response = self._session.get(
+            urljoin(self._base_url, "v1/evaluations"),
+            params={"project-name": project_name},
         )
         if response.status_code == 404:
             logger.info("No evaluations found.")
@@ -220,7 +167,7 @@ class Client(TraceDataExtractor):

     def _warn_if_phoenix_is_not_running(self) -> None:
         try:
-            self._client.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
+            self._session.get(urljoin(self._base_url, "arize_phoenix_version")).raise_for_status()
         except Exception:
             logger.warning(
                 f"Arize Phoenix is not running on {self._base_url}. Launch Phoenix "
@@ -250,9 +197,9 @@ class Client(TraceDataExtractor):
         headers = {"content-type": "application/x-pandas-arrow"}
         with pa.ipc.new_stream(sink, table.schema) as writer:
             writer.write_table(table)
-        self._client.post(
-            url=urljoin(self._base_url, "v1/evaluations"),
-            content=cast(bytes, sink.getvalue().to_pybytes()),
+        self._session.post(
+            urljoin(self._base_url, "v1/evaluations"),
+            data=cast(bytes, sink.getvalue().to_pybytes()),
             headers=headers,
         ).raise_for_status()

@@ -292,551 +239,16 @@ class Client(TraceDataExtractor):
         ]
         for otlp_span in otlp_spans:
             serialized = otlp_span.SerializeToString()
-            content = gzip.compress(serialized)
-            self._client.post(
-                url=urljoin(self._base_url, "v1/traces"),
-                content=content,
+            data = gzip.compress(serialized)
+            self._session.post(
+                urljoin(self._base_url, "v1/traces"),
+                data=data,
                 headers={
                     "content-type": "application/x-protobuf",
                     "content-encoding": "gzip",
                 },
             ).raise_for_status()

-    def _get_dataset_id_by_name(self, name: str) -> str:
-        """
-        Gets a dataset by name.
-
-        Args:
-            name (str): The name of the dataset.
-            version_id (Optional[str]): The version ID of the dataset. Default None.
-
-        Returns:
-            Dataset: The dataset object.
-        """
-        response = self._client.get(
-            urljoin(self._base_url, "/v1/datasets"),
-            params={"name": name},
-        )
-        response.raise_for_status()
-        if not (records := response.json()["data"]):
-            raise ValueError(f"Failed to query dataset by name: {name}")
-        if len(records) > 1 or not records[0]:
-            raise ValueError(f"Failed to find a single dataset with the given name: {name}")
-        dataset = records[0]
-        return str(dataset["id"])
-
-    def get_dataset(
-        self,
-        *,
-        id: Optional[str] = None,
-        name: Optional[str] = None,
-        version_id: Optional[str] = None,
-    ) -> Dataset:
-        """
-        Gets the dataset for a specific version, or gets the latest version of
-        the dataset if no version is specified.
-
-        Args:
-
-            id (Optional[str]): An ID for the dataset.
-
-            name (Optional[str]): the name for the dataset. If provided, the ID
-                is ignored and the dataset is retrieved by name.
-
-            version_id (Optional[str]): An ID for the version of the dataset, or
-                None.
-
-        Returns:
-            A dataset object.
-        """
-        if name:
-            id = self._get_dataset_id_by_name(name)
-
-        if not id:
-            raise ValueError("Dataset id or name must be provided.")
-
-        response = self._client.get(
-            urljoin(self._base_url, f"/v1/datasets/{quote(id)}/examples"),
-            params={"version_id": version_id} if version_id else None,
-        )
-        response.raise_for_status()
-        data = response.json()["data"]
-        examples = {
-            example["id"]: Example(
-                id=example["id"],
-                input=example["input"],
-                output=example["output"],
-                metadata=example["metadata"],
-                updated_at=datetime.fromisoformat(example["updated_at"]),
-            )
-            for example in data["examples"]
-        }
-        resolved_dataset_id = data["dataset_id"]
-        resolved_version_id = data["version_id"]
-        return Dataset(
-            id=resolved_dataset_id,
-            version_id=resolved_version_id,
-            examples=examples,
-        )
-
-    def get_dataset_versions(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        limit: Optional[int] = 100,
-    ) -> pd.DataFrame:
-        """
-        Get dataset versions as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            limit (Optional[int]): maximum number of versions to return,
-                starting from the most recent version
-
-        Returns:
-            pandas DataFrame
-        """
-        url = urljoin(self._base_url, f"v1/datasets/{dataset_id}/versions")
-        response = httpx.get(url=url, params={"limit": limit})
-        response.raise_for_status()
-        if not (records := response.json()["data"]):
-            return pd.DataFrame()
-        df = pd.DataFrame.from_records(records, index="version_id")
-        df["created_at"] = pd.to_datetime(df.created_at)
-        return df
-
-    def download_dataset_examples(
-        self,
-        dataset_id: str,
-        /,
-        *,
-        dataset_version_id: Optional[str] = None,
-    ) -> pd.DataFrame:
-        """
-        Download dataset examples as pandas DataFrame.
-
-        Args:
-            dataset_id (str): dataset ID
-            dataset_version_id (Optional[str]): dataset version ID, if omitted,
-                the latest version is returned.
-
-        Returns:
-            pandas DataFrame
-        """
-        url = f"v1/datasets/{dataset_id}/csv"
-        response = httpx.get(
-            url=urljoin(self._base_url, url),
-            params={"version_id": dataset_version_id} if dataset_version_id else {},
-        )
-        response.raise_for_status()
-        return pd.read_csv(
-            StringIO(response.content.decode()),
-            index_col="example_id",
-        )
-
-    def upload_dataset(
-        self,
-        *,
-        dataset_name: str,
-        dataframe: Optional[pd.DataFrame] = None,
-        csv_file_path: Optional[Union[str, Path]] = None,
-        input_keys: Iterable[str] = (),
-        output_keys: Iterable[str] = (),
-        metadata_keys: Iterable[str] = (),
-        inputs: Iterable[Mapping[str, Any]] = (),
-        outputs: Iterable[Mapping[str, Any]] = (),
-        metadata: Iterable[Mapping[str, Any]] = (),
-        dataset_description: Optional[str] = None,
-    ) -> Dataset:
-        """
-        Upload examples as dataset to the Phoenix server. If `dataframe` or
-        `csv_file_path` are provided, must also provide `input_keys` (and
-        optionally with `output_keys` or `metadata_keys` or both), which is a
-        list of strings denoting the column names in the dataframe or the csv
-        file. On the other hand, a sequence of dictionaries can also be provided
-        via `inputs` (and optionally with `outputs` or `metadat` or both), each
-        item of which represents a separate example in the dataset.
-
-        Args:
-            dataset_name: (str): Name of the dataset.
-            dataframe (pd.DataFrame): pandas DataFrame.
-            csv_file_path (str | Path): Location of a CSV text file
-            input_keys (Iterable[str]): List of column names used as input keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            output_keys (Iterable[str]): List of column names used as output keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            metadata_keys (Iterable[str]): List of column names used as metadata keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            inputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            outputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            metadata (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            dataset_description: (Optional[str]): Description of the dataset.
-
-        Returns:
-            A Dataset object with the uploaded examples.
-        """
-        if dataframe is not None or csv_file_path is not None:
-            if dataframe is not None and csv_file_path is not None:
-                raise ValueError(
-                    "Please provide either `dataframe` or `csv_file_path`, but not both"
-                )
-            if list(inputs) or list(outputs) or list(metadata):
-                option = "dataframe" if dataframe is not None else "csv_file_path"
-                raise ValueError(
-                    f"Please provide only either `{option}` or list of dictionaries "
-                    f"via `inputs` (with `outputs` and `metadata`) but not both."
-                )
-            table = dataframe if dataframe is not None else csv_file_path
-            assert table is not None  # for type-checker
-            return self._upload_tabular_dataset(
-                table,
-                dataset_name=dataset_name,
-                input_keys=input_keys,
-                output_keys=output_keys,
-                metadata_keys=metadata_keys,
-                dataset_description=dataset_description,
-            )
-        return self._upload_json_dataset(
-            dataset_name=dataset_name,
-            inputs=inputs,
-            outputs=outputs,
-            metadata=metadata,
-            dataset_description=dataset_description,
-        )
-
-    def append_to_dataset(
-        self,
-        *,
-        dataset_name: str,
-        dataframe: Optional[pd.DataFrame] = None,
-        csv_file_path: Optional[Union[str, Path]] = None,
-        input_keys: Iterable[str] = (),
-        output_keys: Iterable[str] = (),
-        metadata_keys: Iterable[str] = (),
-        inputs: Iterable[Mapping[str, Any]] = (),
-        outputs: Iterable[Mapping[str, Any]] = (),
-        metadata: Iterable[Mapping[str, Any]] = (),
-        dataset_description: Optional[str] = None,
-    ) -> Dataset:
-        """
-        Append examples to dataset on the Phoenix server. If `dataframe` or
-        `csv_file_path` are provided, must also provide `input_keys` (and
-        optionally with `output_keys` or `metadata_keys` or both), which is a
-        list of strings denoting the column names in the dataframe or the csv
-        file. On the other hand, a sequence of dictionaries can also be provided
-        via `inputs` (and optionally with `outputs` or `metadat` or both), each
-        item of which represents a separate example in the dataset.
-
-        Args:
-            dataset_name: (str): Name of the dataset.
-            dataframe (pd.DataFrame): pandas DataFrame.
-            csv_file_path (str | Path): Location of a CSV text file
-            input_keys (Iterable[str]): List of column names used as input keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            output_keys (Iterable[str]): List of column names used as output keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            metadata_keys (Iterable[str]): List of column names used as metadata keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            inputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            outputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            metadata (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            dataset_description: (Optional[str]): Description of the dataset.
-
-        Returns:
-            A Dataset object with its examples.
-        """
-        if dataframe is not None or csv_file_path is not None:
-            if dataframe is not None and csv_file_path is not None:
-                raise ValueError(
-                    "Please provide either `dataframe` or `csv_file_path`, but not both"
-                )
-            if list(inputs) or list(outputs) or list(metadata):
-                option = "dataframe" if dataframe is not None else "csv_file_path"
-                raise ValueError(
-                    f"Please provide only either `{option}` or list of dictionaries "
-                    f"via `inputs` (with `outputs` and `metadata`) but not both."
-                )
-            table = dataframe if dataframe is not None else csv_file_path
-            assert table is not None  # for type-checker
-            return self._upload_tabular_dataset(
-                table,
-                dataset_name=dataset_name,
-                input_keys=input_keys,
-                output_keys=output_keys,
-                metadata_keys=metadata_keys,
-                dataset_description=dataset_description,
-                action="append",
-            )
-        return self._upload_json_dataset(
-            dataset_name=dataset_name,
-            inputs=inputs,
-            outputs=outputs,
-            metadata=metadata,
-            dataset_description=dataset_description,
-            action="append",
-        )
-
-    def _upload_tabular_dataset(
-        self,
-        table: Union[str, Path, pd.DataFrame],
-        /,
-        *,
-        dataset_name: str,
-        input_keys: Iterable[str],
-        output_keys: Iterable[str] = (),
-        metadata_keys: Iterable[str] = (),
-        dataset_description: Optional[str] = None,
-        action: DatasetAction = "create",
-    ) -> Dataset:
-        """
-        Upload examples as dataset to the Phoenix server.
-
-        Args:
-            table (str | Path | pd.DataFrame): Location of a CSV text file, or
-                pandas DataFrame.
-            dataset_name: (str): Name of the dataset. Required if action=append.
-            input_keys (Iterable[str]): List of column names used as input keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            output_keys (Iterable[str]): List of column names used as output keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            metadata_keys (Iterable[str]): List of column names used as metadata keys.
-                input_keys, output_keys, metadata_keys must be disjoint, and must
-                exist in CSV column headers.
-            dataset_description: (Optional[str]): Description of the dataset.
-            action: (Literal["create", "append"]): Create new dataset or append to an
-                existing one. If action="append" and dataset does not exist, it'll
-                be created.
-
-        Returns:
-            A Dataset object with the uploaded examples.
-        """
-        if action not in ("create", "append"):
-            raise ValueError(f"Invalid action: {action}")
-        if not dataset_name:
-            raise ValueError("Dataset name must not be blank")
-        input_keys, output_keys, metadata_keys = (
-            (keys,) if isinstance(keys, str) else (keys or ())
-            for keys in (input_keys, output_keys, metadata_keys)
-        )
-        if not any(map(bool, (input_keys, output_keys, metadata_keys))):
-            input_keys, output_keys, metadata_keys = _infer_keys(table)
-        keys = DatasetKeys(
-            frozenset(input_keys),
-            frozenset(output_keys),
-            frozenset(metadata_keys),
-        )
-        if isinstance(table, pd.DataFrame):
-            file = _prepare_pyarrow(table, keys)
-        elif isinstance(table, (str, Path)):
-            file = _prepare_csv(Path(table), keys)
-        else:
-            assert_never(table)
-        print("📤 Uploading dataset...")
-        response = self._client.post(
-            url=urljoin(self._base_url, "v1/datasets/upload"),
-            files={"file": file},
-            data={
-                "action": action,
-                "name": dataset_name,
-                "description": dataset_description,
-                "input_keys[]": sorted(keys.input),
-                "output_keys[]": sorted(keys.output),
-                "metadata_keys[]": sorted(keys.metadata),
-            },
-            params={"sync": True},
-        )
-        return self._process_dataset_upload_response(response)
-
-    def _upload_json_dataset(
-        self,
-        *,
-        dataset_name: str,
-        inputs: Iterable[Mapping[str, Any]],
-        outputs: Iterable[Mapping[str, Any]] = (),
-        metadata: Iterable[Mapping[str, Any]] = (),
-        dataset_description: Optional[str] = None,
-        action: DatasetAction = "create",
-    ) -> Dataset:
-        """
-        Upload examples as dataset to the Phoenix server.
-
-        Args:
-            dataset_name: (str): Name of the dataset
-            inputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            outputs (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            metadata (Iterable[Mapping[str, Any]]): List of dictionaries object each
-                corresponding to an example in the dataset.
-            dataset_description: (Optional[str]): Description of the dataset.
-            action: (Literal["create", "append"]): Create new dataset or append to an
-                existing one. If action="append" and dataset does not exist, it'll
-                be created.
-
-        Returns:
-            A Dataset object with the uploaded examples.
-        """
-        # convert to list to avoid issues with pandas Series
-        inputs, outputs, metadata = list(inputs), list(outputs), list(metadata)
-        if not inputs or not _is_all_dict(inputs):
-            raise ValueError(
-                "`inputs` should be a non-empty sequence containing only dictionary objects"
-            )
-        for name, seq in {"outputs": outputs, "metadata": metadata}.items():
-            if seq and not (len(seq) == len(inputs) and _is_all_dict(seq)):
-                raise ValueError(
-                    f"`{name}` should be a sequence of the same length as `inputs` "
-                    "containing only dictionary objects"
-                )
-        print("📤 Uploading dataset...")
-        response = self._client.post(
-            url=urljoin(self._base_url, "v1/datasets/upload"),
-            headers={"Content-Encoding": "gzip"},
-            json={
-                "action": action,
-                "name": dataset_name,
-                "description": dataset_description,
-                "inputs": inputs,
-                "outputs": outputs,
-                "metadata": metadata,
-            },
-            params={"sync": True},
-        )
-        return self._process_dataset_upload_response(response)
-
-    def _process_dataset_upload_response(self, response: Response) -> Dataset:
-        try:
-            response.raise_for_status()
-        except HTTPStatusError as e:
-            if msg := response.text:
-                raise DatasetUploadError(msg) from e
-            raise
-        data = response.json()["data"]
-        dataset_id = data["dataset_id"]
-        response = self._client.get(
-            url=urljoin(self._base_url, f"v1/datasets/{dataset_id}/examples")
-        )
-        response.raise_for_status()
-        data = response.json()["data"]
-        version_id = data["version_id"]
-        examples = data["examples"]
-        print(f"💾 Examples uploaded: {self.web_url}datasets/{dataset_id}/examples")
-        print(f"🗄️ Dataset version ID: {version_id}")
-
-        return Dataset(
-            id=dataset_id,
-            version_id=version_id,
-            examples={
-                example["id"]: Example(
-                    id=example["id"],
-                    input=example["input"],
-                    output=example["output"],
-                    metadata=example["metadata"],
-                    updated_at=datetime.fromisoformat(example["updated_at"]),
-                )
-                for example in examples
-            },
-        )
-
-
-FileName: TypeAlias = str
-FilePointer: TypeAlias = BinaryIO
-FileType: TypeAlias = str
-FileHeaders: TypeAlias = Dict[str, str]
-
-
-def _get_csv_column_headers(path: Path) -> Tuple[str, ...]:
-    path = path.resolve()
-    if not path.is_file():
-        raise FileNotFoundError(f"File does not exist: {path}")
-    with open(path, "r") as f:
-        rows = csv.reader(f)
-        try:
-            column_headers = tuple(next(rows))
-            _ = next(rows)
-        except StopIteration:
-            raise ValueError("csv file has no data")
-    return column_headers
-
-
-def _prepare_csv(
-    path: Path,
-    keys: DatasetKeys,
-) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
-    column_headers = _get_csv_column_headers(path)
-    (header, freq), *_ = Counter(column_headers).most_common(1)
-    if freq > 1:
-        raise ValueError(f"Duplicated column header in CSV file: {header}")
-    keys.check_differences(frozenset(column_headers))
-    file = BytesIO()
-    with open(path, "rb") as f:
-        file.write(gzip.compress(f.read()))
-    return path.name, file, "text/csv", {"Content-Encoding": "gzip"}
-
-
-def _prepare_pyarrow(
-    df: pd.DataFrame,
-    keys: DatasetKeys,
-) -> Tuple[FileName, FilePointer, FileType, FileHeaders]:
-    if df.empty:
-        raise ValueError("dataframe has no data")
-    (header, freq), *_ = Counter(df.columns).most_common(1)
-    if freq > 1:
-        raise ValueError(f"Duplicated column header in file: {header}")
-    keys.check_differences(frozenset(df.columns))
-    table = Table.from_pandas(df.loc[:, list(keys)])
-    sink = pa.BufferOutputStream()
-    options = pa.ipc.IpcWriteOptions(compression="lz4")
-    with pa.ipc.new_stream(sink, table.schema, options=options) as writer:
-        writer.write_table(table)
-    file = BytesIO(sink.getvalue().to_pybytes())
-    return "pandas", file, "application/x-pandas-pyarrow", {}
-
-
-_response_header = re.compile(r"(?i)(response|answer)s*$")
-
-
-def _infer_keys(
-    table: Union[str, Path, pd.DataFrame],
-) -> Tuple[Tuple[str, ...], Tuple[str, ...], Tuple[str, ...]]:
-    column_headers = (
-        tuple(table.columns)
-        if isinstance(table, pd.DataFrame)
-        else _get_csv_column_headers(Path(table))
-    )
-    for i, header in enumerate(column_headers):
-        if _response_header.search(header):
-            break
-    else:
-        i = len(column_headers)
-    return (
-        column_headers[:i],
-        column_headers[i : i + 1],
-        column_headers[i + 1 :],
-    )
-

 def _to_iso_format(value: Optional[datetime]) -> Optional[str]:
     return value.isoformat() if value else None
-
-
-def _is_all_dict(seq: Sequence[Any]) -> bool:
-    return all(map(lambda obj: isinstance(obj, dict), seq))
-
-
-class DatasetUploadError(Exception): ...
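
Taken together, the client.py changes replace the httpx-based transport with requests.Session, drop the headers keyword and the web_url property, and remove the dataset upload/download helpers that shipped only in the 4.4.4rc6 pre-release. A minimal sketch of what still works against 4.5.0, assuming a locally running Phoenix server at the default address (the project name and query arguments below are illustrative):

# Hedged usage sketch, not shipped with the wheel.
from phoenix.session.client import Client
from phoenix.trace.dsl import SpanQuery

# The endpoint may also be inferred from environment variables when omitted.
client = Client(endpoint="http://localhost:6006")

# query_spans still posts to v1/spans, but the request now carries only the
# "project-name" query parameter (the duplicate "project_name" form is gone).
spans_df = client.query_spans(SpanQuery(), project_name="default")

# Pre-release-only calls such as client.upload_dataset(...) or
# client.get_dataset(...) no longer exist in 4.5.0 and raise AttributeError.
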