arize 8.0.0a22__py3-none-any.whl → 8.0.0a23__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (166)
  1. arize/__init__.py +17 -9
  2. arize/_exporter/client.py +55 -36
  3. arize/_exporter/parsers/tracing_data_parser.py +41 -30
  4. arize/_exporter/validation.py +3 -3
  5. arize/_flight/client.py +207 -76
  6. arize/_generated/api_client/__init__.py +30 -6
  7. arize/_generated/api_client/api/__init__.py +1 -0
  8. arize/_generated/api_client/api/datasets_api.py +864 -190
  9. arize/_generated/api_client/api/experiments_api.py +167 -131
  10. arize/_generated/api_client/api/projects_api.py +1197 -0
  11. arize/_generated/api_client/api_client.py +2 -2
  12. arize/_generated/api_client/configuration.py +42 -34
  13. arize/_generated/api_client/exceptions.py +2 -2
  14. arize/_generated/api_client/models/__init__.py +15 -4
  15. arize/_generated/api_client/models/dataset.py +10 -10
  16. arize/_generated/api_client/models/dataset_example.py +111 -0
  17. arize/_generated/api_client/models/dataset_example_update.py +100 -0
  18. arize/_generated/api_client/models/dataset_version.py +13 -13
  19. arize/_generated/api_client/models/datasets_create_request.py +16 -8
  20. arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
  21. arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
  22. arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
  23. arize/_generated/api_client/models/datasets_list200_response.py +10 -4
  24. arize/_generated/api_client/models/experiment.py +14 -16
  25. arize/_generated/api_client/models/experiment_run.py +108 -0
  26. arize/_generated/api_client/models/experiment_run_create.py +102 -0
  27. arize/_generated/api_client/models/experiments_create_request.py +16 -10
  28. arize/_generated/api_client/models/experiments_list200_response.py +10 -4
  29. arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
  30. arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
  31. arize/_generated/api_client/models/primitive_value.py +172 -0
  32. arize/_generated/api_client/models/problem.py +100 -0
  33. arize/_generated/api_client/models/project.py +99 -0
  34. arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
  35. arize/_generated/api_client/models/projects_list200_response.py +106 -0
  36. arize/_generated/api_client/rest.py +2 -2
  37. arize/_generated/api_client/test/test_dataset.py +4 -2
  38. arize/_generated/api_client/test/test_dataset_example.py +56 -0
  39. arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
  40. arize/_generated/api_client/test/test_dataset_version.py +7 -2
  41. arize/_generated/api_client/test/test_datasets_api.py +27 -13
  42. arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
  43. arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
  44. arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
  45. arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
  46. arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
  47. arize/_generated/api_client/test/test_experiment.py +2 -4
  48. arize/_generated/api_client/test/test_experiment_run.py +56 -0
  49. arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
  50. arize/_generated/api_client/test/test_experiments_api.py +6 -6
  51. arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
  52. arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
  53. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
  54. arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
  55. arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
  56. arize/_generated/api_client/test/test_problem.py +57 -0
  57. arize/_generated/api_client/test/test_project.py +58 -0
  58. arize/_generated/api_client/test/test_projects_api.py +59 -0
  59. arize/_generated/api_client/test/test_projects_create_request.py +54 -0
  60. arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
  61. arize/_generated/api_client_README.md +43 -29
  62. arize/_generated/protocol/flight/flight_pb2.py +400 -0
  63. arize/_lazy.py +27 -19
  64. arize/client.py +268 -55
  65. arize/config.py +365 -116
  66. arize/constants/__init__.py +1 -0
  67. arize/constants/config.py +11 -4
  68. arize/constants/ml.py +6 -4
  69. arize/constants/openinference.py +2 -0
  70. arize/constants/pyarrow.py +2 -0
  71. arize/constants/spans.py +3 -1
  72. arize/datasets/__init__.py +1 -0
  73. arize/datasets/client.py +299 -84
  74. arize/datasets/errors.py +32 -2
  75. arize/datasets/validation.py +18 -8
  76. arize/embeddings/__init__.py +2 -0
  77. arize/embeddings/auto_generator.py +23 -19
  78. arize/embeddings/base_generators.py +89 -36
  79. arize/embeddings/constants.py +2 -0
  80. arize/embeddings/cv_generators.py +26 -4
  81. arize/embeddings/errors.py +27 -5
  82. arize/embeddings/nlp_generators.py +31 -12
  83. arize/embeddings/tabular_generators.py +32 -20
  84. arize/embeddings/usecases.py +12 -2
  85. arize/exceptions/__init__.py +1 -0
  86. arize/exceptions/auth.py +11 -1
  87. arize/exceptions/base.py +29 -4
  88. arize/exceptions/models.py +21 -2
  89. arize/exceptions/parameters.py +31 -0
  90. arize/exceptions/spaces.py +12 -1
  91. arize/exceptions/types.py +86 -7
  92. arize/exceptions/values.py +220 -20
  93. arize/experiments/__init__.py +1 -0
  94. arize/experiments/client.py +389 -285
  95. arize/experiments/evaluators/__init__.py +1 -0
  96. arize/experiments/evaluators/base.py +74 -41
  97. arize/experiments/evaluators/exceptions.py +6 -3
  98. arize/experiments/evaluators/executors.py +121 -73
  99. arize/experiments/evaluators/rate_limiters.py +106 -57
  100. arize/experiments/evaluators/types.py +34 -7
  101. arize/experiments/evaluators/utils.py +65 -27
  102. arize/experiments/functions.py +103 -101
  103. arize/experiments/tracing.py +52 -44
  104. arize/experiments/types.py +56 -31
  105. arize/logging.py +54 -22
  106. arize/models/__init__.py +1 -0
  107. arize/models/batch_validation/__init__.py +1 -0
  108. arize/models/batch_validation/errors.py +543 -65
  109. arize/models/batch_validation/validator.py +339 -300
  110. arize/models/bounded_executor.py +20 -7
  111. arize/models/casting.py +75 -29
  112. arize/models/client.py +326 -107
  113. arize/models/proto.py +95 -40
  114. arize/models/stream_validation.py +42 -14
  115. arize/models/surrogate_explainer/__init__.py +1 -0
  116. arize/models/surrogate_explainer/mimic.py +24 -13
  117. arize/pre_releases.py +43 -0
  118. arize/projects/__init__.py +1 -0
  119. arize/projects/client.py +129 -0
  120. arize/regions.py +40 -0
  121. arize/spans/__init__.py +1 -0
  122. arize/spans/client.py +130 -106
  123. arize/spans/columns.py +13 -0
  124. arize/spans/conversion.py +54 -38
  125. arize/spans/validation/__init__.py +1 -0
  126. arize/spans/validation/annotations/__init__.py +1 -0
  127. arize/spans/validation/annotations/annotations_validation.py +6 -4
  128. arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
  129. arize/spans/validation/annotations/value_validation.py +35 -11
  130. arize/spans/validation/common/__init__.py +1 -0
  131. arize/spans/validation/common/argument_validation.py +33 -8
  132. arize/spans/validation/common/dataframe_form_validation.py +35 -9
  133. arize/spans/validation/common/errors.py +211 -11
  134. arize/spans/validation/common/value_validation.py +80 -13
  135. arize/spans/validation/evals/__init__.py +1 -0
  136. arize/spans/validation/evals/dataframe_form_validation.py +28 -8
  137. arize/spans/validation/evals/evals_validation.py +34 -4
  138. arize/spans/validation/evals/value_validation.py +26 -3
  139. arize/spans/validation/metadata/__init__.py +1 -1
  140. arize/spans/validation/metadata/argument_validation.py +14 -5
  141. arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
  142. arize/spans/validation/metadata/value_validation.py +24 -10
  143. arize/spans/validation/spans/__init__.py +1 -0
  144. arize/spans/validation/spans/dataframe_form_validation.py +34 -13
  145. arize/spans/validation/spans/spans_validation.py +35 -4
  146. arize/spans/validation/spans/value_validation.py +76 -7
  147. arize/types.py +293 -157
  148. arize/utils/__init__.py +1 -0
  149. arize/utils/arrow.py +31 -15
  150. arize/utils/cache.py +34 -6
  151. arize/utils/dataframe.py +19 -2
  152. arize/utils/online_tasks/__init__.py +2 -0
  153. arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
  154. arize/utils/openinference_conversion.py +44 -5
  155. arize/utils/proto.py +10 -0
  156. arize/utils/size.py +5 -3
  157. arize/version.py +3 -1
  158. {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
  159. arize-8.0.0a23.dist-info/RECORD +174 -0
  160. {arize-8.0.0a22.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
  161. arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
  162. arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
  163. arize/_generated/protocol/flight/export_pb2.py +0 -61
  164. arize/_generated/protocol/flight/ingest_pb2.py +0 -365
  165. arize-8.0.0a22.dist-info/RECORD +0 -146
  166. arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
arize/regions.py ADDED
@@ -0,0 +1,40 @@
+ """Region definitions and configuration for Arize deployment zones."""
+
+ from dataclasses import dataclass
+ from enum import StrEnum
+
+ from arize.constants.config import DEFAULT_FLIGHT_PORT
+
+
+ class Region(StrEnum):
+     """Enum representing available Arize deployment regions."""
+
+     US_CENTRAL_1 = "us-central-1a"
+     EU_WEST_1 = "eu-west-1a"
+     CA_CENTRAL_1 = "ca-central-1a"
+     US_EAST_1 = "us-east-1b"
+     UNSPECIFIED = ""
+
+
+ @dataclass(frozen=True)
+ class RegionEndpoints:
+     """Container for region-specific API endpoint hostnames and ports."""
+
+     api_host: str
+     otlp_host: str
+     flight_host: str
+     flight_port: int
+
+
+ def _get_region_endpoints(region: Region) -> RegionEndpoints:
+     return RegionEndpoints(
+         api_host=f"api.{region}.arize.com",
+         otlp_host=f"otlp.{region}.arize.com",
+         flight_host=f"flight.{region}.arize.com",
+         flight_port=DEFAULT_FLIGHT_PORT,
+     )
+
+
+ REGION_ENDPOINTS: dict[Region, RegionEndpoints] = {
+     r: _get_region_endpoints(r) for r in Region if r != Region.UNSPECIFIED
+ }
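
Example (illustrative, not part of the diff). The new module derives every endpoint from the region's string value — Region is a StrEnum (Python 3.11+), so members interpolate as their values. A minimal usage sketch, assuming the module is importable as arize.regions:

# Sketch based on the new module above; Region.EU_WEST_1 formats as "eu-west-1a".
from arize.regions import REGION_ENDPOINTS, Region

endpoints = REGION_ENDPOINTS[Region.EU_WEST_1]
assert endpoints.api_host == "api.eu-west-1a.arize.com"
assert endpoints.flight_host == "flight.eu-west-1a.arize.com"

# UNSPECIFIED is deliberately excluded from the endpoint mapping.
assert Region.UNSPECIFIED not in REGION_ENDPOINTS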
arize/spans/__init__.py CHANGED
@@ -0,0 +1 @@
+ """LLM tracing spans functionality for the Arize SDK."""
arize/spans/client.py CHANGED
@@ -1,3 +1,5 @@
+ """Client implementation for managing spans and traces in the Arize platform."""
+
  # type: ignore[pb2]
  from __future__ import annotations

@@ -6,7 +8,7 @@ import logging
  import re
  from datetime import datetime, timezone
  from functools import partial
- from typing import TYPE_CHECKING, Any, Dict, List
+ from typing import TYPE_CHECKING, Any

  import numpy as np
  import pandas as pd
@@ -16,10 +18,6 @@ from google.protobuf import json_format, message
  from arize._exporter.client import ArizeExportClient
  from arize._flight.client import ArizeFlightClient, FlightPostArrowFileResponse
  from arize._flight.types import FlightRequestType
- from arize._generated.protocol.flight.ingest_pb2 import (
-     WriteSpanAnnotationResponse,
-     WriteSpanEvaluationResponse,
- )
  from arize.constants.spans import DEFAULT_DATETIME_FMT
  from arize.exceptions.base import (
      INVALID_ARROW_CONVERSION_MSG,
@@ -29,9 +27,7 @@ from arize.exceptions.models import MissingProjectNameError
  from arize.exceptions.spaces import MissingSpaceIDError
  from arize.logging import CtxAdapter
  from arize.types import Environments, SimilaritySearchParams
- from arize.utils.arrow import (
-     post_arrow_table,
- )
+ from arize.utils.arrow import post_arrow_table
  from arize.utils.dataframe import (
      remove_extraneous_columns,
      reset_dataframe_index,
@@ -41,13 +37,21 @@ from arize.utils.proto import get_pb_schema_tracing
  if TYPE_CHECKING:
      import requests

+     from arize._generated.protocol.flight import flight_pb2
      from arize.config import SDKConfiguration

  logger = logging.getLogger(__name__)


  class SpansClient:
-     def __init__(self, *, sdk_config: SDKConfiguration):
+     """Client for logging LLM tracing spans and evaluations to Arize."""
+
+     def __init__(self, *, sdk_config: SDKConfiguration) -> None:
+         """Initialize the spans client with SDK configuration.
+
+         Args:
+             sdk_config: SDK configuration containing API endpoints and credentials.
+         """
          self._sdk_config = sdk_config

      def log(
@@ -62,12 +66,14 @@ class SpansClient:
          timeout: float | None = None,
          tmp_dir: str = "",
      ) -> requests.Response:
-         """
-         Logs a pandas dataframe containing LLM tracing data to Arize via a POST request. Returns a
-         :class:`Response` object from the Requests HTTP library to ensure successful delivery of
-         records.
+         """Logs a pandas dataframe containing LLM tracing data to Arize via a POST request.
+
+         Returns a :class:`Response` object from the Requests HTTP library to ensure
+         successful delivery of records.

          Args:
+             space_id (str): The space ID where the project resides.
+             project_name (str): A unique name to identify your project in the Arize platform.
              dataframe (pd.DataFrame): The dataframe containing the LLM traces.
              evals_dataframe (pd.DataFrame, optional): A dataframe containing LLM evaluations data.
                  The evaluations are joined to their corresponding spans via a left outer join, i.e.,
@@ -76,11 +82,10 @@
                  Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
              validate (bool, optional): When set to True, validation is run before sending data.
                  Defaults to True.
-             tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
-                 before sending to Arize.
              timeout (float, optional): You can stop waiting for a response after a given number
                  of seconds with the timeout parameter. Defaults to None.
-             project_name (str, optional): A unique name to identify your project in the Arize platform.
+             tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
+                 before sending to Arize.

          Returns:
              `Response` object
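
Example (illustrative, not part of the diff). The docstring changes above make space_id and project_name the leading, required keyword arguments of log(). A hedged caller sketch — how a SpansClient instance is obtained is not shown in this diff, so spans_client below is assumed:

import pandas as pd

# Hypothetical, minimal spans dataframe; real payloads carry many more columns.
spans_df = pd.DataFrame(
    {
        "context.span_id": ["span-1"],
        "name": ["llm_call"],
    }
)

# spans_client: an already-configured SpansClient (construction not shown here).
response = spans_client.log(
    space_id="your-space-id",
    project_name="your-project",
    dataframe=spans_df,
)
response.raise_for_status()  # log() returns a requests.Response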
@@ -220,12 +225,12 @@
              log.debug("Converting data to Arrow format")
              pa_table = pa.Table.from_pandas(df, preserve_index=False)
          except pa.ArrowInvalid as e:
-             log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+             log.exception(INVALID_ARROW_CONVERSION_MSG)
              raise pa.ArrowInvalid(
-                 f"Error converting to Arrow format: {str(e)}"
+                 f"Error converting to Arrow format: {e!s}"
              ) from e
-         except Exception as e:
-             log.error(f"Unexpected error creating Arrow table: {str(e)}")
+         except Exception:
+             log.exception("Unexpected error creating Arrow table")
              raise

          proto_schema = get_pb_schema_tracing(project_name=project_name)
@@ -262,27 +267,23 @@
          force_http: bool = False,
          timeout: float | None = None,
          tmp_dir: str = "",
-     ) -> WriteSpanEvaluationResponse:
-         """
-         Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
-         The dataframe must contain a column `context.span_id`
-         such that Arize can assign each evaluation to its respective span.
+     ) -> flight_pb2.WriteSpanEvaluationResponse:
+         """Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
+
+         The dataframe must contain a column `context.span_id` such that Arize can assign
+         each evaluation to its respective span.

          Args:
+             space_id (str): The space ID where the project resides.
+             project_name (str): A unique name to identify your project in the Arize platform.
              dataframe (pd.DataFrame): A dataframe containing LLM evaluations data.
-             model_id (str): A unique name to identify your model in the Arize platform.
-                 (Deprecated: Use `project_name` instead.)
-             model_version (str, optional): Used to group a subset of traces a given
-                 model_id to compare and track changes. It should match the model_id of the spans
-                 sent previously, to which evaluations will be assigned. Defaults to None.
              validate (bool, optional): When set to True, validation is run before sending data.
                  Defaults to True.
-             path (str, optional): Temporary directory/file to store the serialized data in binary
-                 before sending to Arize.
+             force_http (bool, optional): Force the use of HTTP for data upload. Defaults to False.
              timeout (float, optional): You can stop waiting for a response after a given number
                  of seconds with the timeout parameter. Defaults to None.
-             project_name (str, optional): A unique name to identify your project in the Arize platform.
-                 Either model_id or project_name must be provided.
+             tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
+                 before sending to Arize.
          """
          from arize.spans.columns import EVAL_COLUMN_PATTERN, SPAN_SPAN_ID_COL
          from arize.spans.validation.evals import evals_validation
@@ -358,12 +359,12 @@
              log.debug("Converting data to Arrow format")
              pa_table = pa.Table.from_pandas(evals_df, preserve_index=False)
          except pa.ArrowInvalid as e:
-             log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+             log.exception(INVALID_ARROW_CONVERSION_MSG)
              raise pa.ArrowInvalid(
-                 f"Error converting to Arrow format: {str(e)}"
+                 f"Error converting to Arrow format: {e!s}"
              ) from e
-         except Exception as e:
-             log.error(f"Unexpected error creating Arrow table: {str(e)}")
+         except Exception:
+             log.exception("Unexpected error creating Arrow table")
              raise

          if force_http:
@@ -395,8 +396,8 @@
          response = None
          with ArizeFlightClient(
              api_key=self._sdk_config.api_key,
-             host=self._sdk_config.flight_server_host,
-             port=self._sdk_config.flight_server_port,
+             host=self._sdk_config.flight_host,
+             port=self._sdk_config.flight_port,
              scheme=self._sdk_config.flight_scheme,
              request_verify=self._sdk_config.request_verify,
              max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -409,8 +410,8 @@
                  request_type=request_type,
              )
          except Exception as e:
-             msg = f"Error during update request: {str(e)}"
-             log.error(msg)
+             msg = f"Error during update request: {e!s}"
+             log.exception(msg)
              raise RuntimeError(msg) from e

          if response is None:
@@ -437,18 +438,18 @@
          project_name: str,
          dataframe: pd.DataFrame,
          validate: bool = True,
-     ) -> WriteSpanAnnotationResponse:
-         """
-         Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
-         The dataframe must contain a column `context.span_id`
-         such that Arize can assign each annotation to its respective span.
-         Annotation columns should follow the pattern `annotation.<name>.<suffix>` where suffix is
-         either `label` or `score`. An optional `annotation.notes` column can be included for
-         free-form text notes.
+     ) -> flight_pb2.WriteSpanAnnotationResponse:
+         """Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
+
+         The dataframe must contain a column `context.span_id` such that Arize can assign
+         each annotation to its respective span. Annotation columns should follow the pattern
+         `annotation.<name>.<suffix>` where suffix is either `label` or `score`. An optional
+         `annotation.notes` column can be included for free-form text notes.

          Args:
-             dataframe (pd.DataFrame): A dataframe containing LLM annotation data.
+             space_id (str): The space ID where the project resides.
              project_name (str): A unique name to identify your project in the Arize platform.
+             dataframe (pd.DataFrame): A dataframe containing LLM annotation data.
              validate (bool, optional): When set to True, validation is run before sending data.
                  Defaults to True.
          """
@@ -588,12 +589,12 @@
              log.debug("Converting data to Arrow format")
              pa_table = pa.Table.from_pandas(anno_df, preserve_index=False)
          except pa.ArrowInvalid as e:
-             log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+             log.exception(INVALID_ARROW_CONVERSION_MSG)
              raise pa.ArrowInvalid(
-                 f"Error converting to Arrow format: {str(e)}"
+                 f"Error converting to Arrow format: {e!s}"
              ) from e
-         except Exception as e:
-             log.error(f"Unexpected error creating Arrow table: {str(e)}")
+         except Exception:
+             log.exception("Unexpected error creating Arrow table")
              raise

          if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
@@ -611,8 +612,8 @@
          response = None
          with ArizeFlightClient(
              api_key=self._sdk_config.api_key,
-             host=self._sdk_config.flight_server_host,
-             port=self._sdk_config.flight_server_port,
+             host=self._sdk_config.flight_host,
+             port=self._sdk_config.flight_port,
              scheme=self._sdk_config.flight_scheme,
              request_verify=self._sdk_config.request_verify,
              max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -625,8 +626,8 @@
                  request_type=request_type,
              )
          except Exception as e:
-             msg = f"Error during update request: {str(e)}"
-             log.error(msg)
+             msg = f"Error during update request: {e!s}"
+             log.exception(msg)
              raise RuntimeError(msg) from e

          if response is None:
@@ -654,9 +655,10 @@
          dataframe: pd.DataFrame,
          patch_document_column_name: str = "patch_document",
          validate: bool = True,
-     ) -> Dict[str, Any]:
-         """
-         Log metadata updates using JSON Merge Patch format. This method is only supported for LLM model types.
+     ) -> dict[str, Any]:
+         """Log metadata updates using JSON Merge Patch format.
+
+         This method is only supported for LLM model types.

          The dataframe must contain a column `context.span_id` to identify spans and either:
          1. A column with JSON patch documents (specified by patch_document_column_name), or
@@ -674,8 +676,9 @@
              Note: This differs from standard JSON Merge Patch where null values remove fields.

          Args:
-             dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
+             space_id: The space ID where the project resides.
              project_name: A unique name to identify your project in the Arize platform.
+             dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
              patch_document_column_name: Name of the column containing JSON patch documents.
                  Defaults to "patch_document".
              validate: When set to True, validation is run before sending data.
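
Example (illustrative, not part of the diff). Worth noting from the docstring above: a null value in a patch sets the field to null instead of removing it, which deviates from standard JSON Merge Patch (RFC 7386). A sketch of the patch-document input shape — the metadata-field-column alternative is also documented, but its column naming convention is not visible in these hunks, so it is omitted here:

import pandas as pd

# One JSON merge-patch document per span; the column name is configurable via
# patch_document_column_name and defaults to "patch_document".
patches_df = pd.DataFrame(
    {
        "context.span_id": ["span-1"],
        # Per the docstring, "reviewer": null sets the field to null rather
        # than deleting it, unlike RFC 7386 semantics.
        "patch_document": ['{"reviewed": true, "reviewer": null}'],
    }
)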
@@ -813,11 +816,10 @@
          )

          # Create a new column for patch documents if we're going to use it
-         if has_metadata_fields or has_patch_document:
-             # Use 'patch_document' as the standardized column name for downstream processing
-             final_patch_column = "patch_document"
-             if final_patch_column not in metadata_df.columns:
-                 metadata_df[final_patch_column] = None
+         # Use 'patch_document' as the standardized column name for downstream processing
+         final_patch_column = "patch_document"
+         if final_patch_column not in metadata_df.columns:
+             metadata_df[final_patch_column] = None

          # Process metadata field columns if they exist
          if has_metadata_fields:
@@ -865,7 +867,7 @@
                  if patch:
                      processed_patches.append(patch)
                  if errors:
-                     validation_errors.append(errors)
+                     validation_errors.extend(errors)

          # If validation is enabled and errors found, raise ValidationFailure
          if validate and validation_errors:
@@ -922,9 +924,11 @@
          metadata_df[final_patch_column] = metadata_df[
              final_patch_column
          ].apply(
-             lambda p: json.dumps(p)
-             if not isinstance(p, float) or not np.isnan(p)
-             else json.dumps({})
+             lambda p: (
+                 json.dumps(p)
+                 if not isinstance(p, float) or not np.isnan(p)
+                 else json.dumps({})
+             )
          )

          # Convert to Arrow table
@@ -932,20 +936,20 @@
              log.debug("Converting data to Arrow format")
              pa_table = pa.Table.from_pandas(metadata_df, preserve_index=False)
          except pa.ArrowInvalid as e:
-             log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+             log.exception(INVALID_ARROW_CONVERSION_MSG)
              raise pa.ArrowInvalid(
-                 f"Error converting to Arrow format: {str(e)}"
+                 f"Error converting to Arrow format: {e!s}"
              ) from e
-         except Exception as e:
-             log.error(f"Unexpected error creating Arrow table: {str(e)}")
+         except Exception:
+             log.exception("Unexpected error creating Arrow table")
              raise

          request_type = FlightRequestType.METADATA
          response = None
          with ArizeFlightClient(
              api_key=self._sdk_config.api_key,
-             host=self._sdk_config.flight_server_host,
-             port=self._sdk_config.flight_server_port,
+             host=self._sdk_config.flight_host,
+             port=self._sdk_config.flight_port,
              scheme=self._sdk_config.flight_scheme,
              request_verify=self._sdk_config.request_verify,
              max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -958,8 +962,8 @@
                  request_type=request_type,
              )
          except Exception as e:
-             msg = f"Error during update request: {str(e)}"
-             log.error(msg)
+             msg = f"Error during update request: {e!s}"
+             log.exception(msg)
              raise RuntimeError(msg) from e

          if response is None:
@@ -987,14 +991,25 @@
          start_time: datetime,
          end_time: datetime,
          where: str = "",
-         columns: List | None = None,
+         columns: list | None = None,
          similarity_search_params: SimilaritySearchParams | None = None,
          stream_chunk_size: int | None = None,
      ) -> pd.DataFrame:
+         """Export span data from Arize to a pandas DataFrame.
+
+         Retrieves trace/span data from the specified project within a time range
+         and returns it as a pandas DataFrame. Supports filtering with SQL-like
+         WHERE clauses and similarity search for semantic retrieval.
+
+         Returns:
+         -------
+             pd.DataFrame: DataFrame containing the requested span data with columns
+                 for span metadata, attributes, events, and any custom fields.
+         """
          with ArizeFlightClient(
              api_key=self._sdk_config.api_key,
-             host=self._sdk_config.flight_server_host,
-             port=self._sdk_config.flight_server_port,
+             host=self._sdk_config.flight_host,
+             port=self._sdk_config.flight_port,
              scheme=self._sdk_config.flight_scheme,
              request_verify=self._sdk_config.request_verify,
              max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -1017,19 +1032,27 @@
      def export_to_parquet(
          self,
          *,
+         path: str,
          space_id: str,
          project_name: str,
          start_time: datetime,
          end_time: datetime,
          where: str = "",
-         columns: List | None = None,
+         columns: list | None = None,
          similarity_search_params: SimilaritySearchParams | None = None,
          stream_chunk_size: int | None = None,
-     ) -> pd.DataFrame:
+     ) -> None:
+         """Export span data from Arize to a Parquet file.
+
+         Retrieves trace/span data from the specified project within a time range
+         and writes it directly to a Parquet file at the specified path. Supports
+         filtering with SQL-like WHERE clauses and similarity search for semantic
+         retrieval. Efficient for large datasets and long-term storage.
+         """
          with ArizeFlightClient(
              api_key=self._sdk_config.api_key,
-             host=self._sdk_config.flight_server_host,
-             port=self._sdk_config.flight_server_port,
+             host=self._sdk_config.flight_host,
+             port=self._sdk_config.flight_port,
              scheme=self._sdk_config.flight_scheme,
              request_verify=self._sdk_config.request_verify,
              max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -1038,6 +1061,7 @@
              flight_client=flight_client,
          )
          return exporter.export_to_parquet(
+             path=path,
              space_id=space_id,
              model_id=project_name,
              environment=Environments.TRACING,
@@ -1050,7 +1074,7 @@
          )


- def _build_patch_document(row):
+ def _build_patch_document(row: pd.Series) -> dict[str, object]:
      # Extract and preserve metadata values with proper types
      patch = {}
      for key in row.index:
@@ -1070,8 +1094,11 @@ def _build_patch_document(row):


  def _process_patch_document(
-     metadata_df, patch_document_column_name, field_patches, row_idx
- ):
+     metadata_df: pd.DataFrame,
+     patch_document_column_name: str,
+     field_patches: pd.DataFrame,
+     row_idx: int,
+ ) -> dict[str, object]:
      # Get the field patch for this row
      field_patch = field_patches.iloc[row_idx]

@@ -1111,15 +1138,14 @@
      explicit_patch = {}

      # Merge patches - explicit patch takes precedence
-     merged_patch = {**field_patch, **explicit_patch}
-     return merged_patch
+     return {**field_patch, **explicit_patch}


  def _ensure_dict_patch(
      metadata_df: pd.DataFrame,
      final_patch_column: str,
      row_idx: int,
- ):
+ ) -> tuple[dict[str, object], list[str]]:
      patch = metadata_df.loc[row_idx, final_patch_column]
      validation_errors = []

@@ -1141,19 +1167,19 @@
          parsed = json.loads(patch)
          if isinstance(parsed, dict):
              return parsed
-         else:
-             error_msg = (
-                 f"Row {row_idx}: JSON must be an object/dictionary, "
-                 f"got {type(parsed).__name__}"
-             )
-             logger.warning(error_msg)
-             validation_errors.append(error_msg)
-             return {}, validation_errors  # if not validate else None
      except json.JSONDecodeError as e:
          error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
          logger.warning(error_msg)
          validation_errors.append(error_msg)
          return {}, validation_errors  # if not validate else None
+     else:
+         error_msg = (
+             f"Row {row_idx}: JSON must be an object/dictionary, "
+             f"got {type(parsed).__name__}"
+         )
+         logger.warning(error_msg)
+         validation_errors.append(error_msg)
+         return {}, validation_errors  # if not validate else None

      # For other types, log warning
      error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
@@ -1165,7 +1191,7 @@
  def _format_note_for_storage(
      note_text: str,
      current_time_ms: int,
- ):
+ ) -> list[str] | None:
      if pd.isna(note_text):
          return None
      note_obj = {
@@ -1225,9 +1251,7 @@
          logger.warning("Flight update response missing counts", extra=metrics)
      else:
          all_processed = int(spans_processed) == int(total_spans)
-         msg = (
-             "✅ All spans processed" if all_processed else "Partial processing"
-         )
+         msg = "All spans processed" if all_processed else "Partial processing"
          logger.info(msg, extra=metrics)

      # Emit individual error lines (structured per-error, easy to aggregate)
@@ -1246,7 +1270,7 @@
      msg: message.Message,
      preserve_names: bool = True,
      use_int_enums: bool = False,
- ):
+ ) -> dict[str, object]:
      return json_format.MessageToDict(
          msg,
          preserving_proto_field_name=preserve_names,
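
Example (illustrative, not part of the diff). Two caller-visible changes in the export methods above: columns is now typed with the builtin list, and export_to_parquet takes a required path keyword and returns None instead of a DataFrame. A hedged sketch of the updated calls, with spans_client assumed to be an already-configured SpansClient:

from datetime import datetime, timezone

window = dict(
    space_id="your-space-id",
    project_name="your-project",
    start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
    end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
)

df = spans_client.export_to_dataframe(**window)

# New in this version: path is required and nothing is returned.
spans_client.export_to_parquet(path="spans.parquet", **window)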
arize/spans/columns.py CHANGED
@@ -1,3 +1,5 @@
+ """Span column definitions and OpenInference semantic conventions."""
+
  from enum import Enum

  import openinference.semconv.trace as oinf
@@ -5,6 +7,8 @@ import opentelemetry.semconv.trace as otel


  class SpanColumnDataType(Enum):
+     """Enum representing supported data types for span columns."""
+
      BOOL = 1
      NUMERIC = 2
      STRING = 3
@@ -15,12 +19,21 @@ class SpanColumnDataType(Enum):


  class SpanColumn:
+     """Configuration for a custom span column with name, data type, and annotation settings."""
+
      def __init__(
          self,
          name: str,
          data_type: SpanColumnDataType,
          required: bool = False,
      ) -> None:
+         """Initialize a span column configuration.
+
+         Args:
+             name: Name of the span column.
+             data_type: Data type of the column values.
+             required: Whether the column is required.
+         """
          self.name = name
          self.required = required
          self.data_type = data_type
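
Example (illustrative, not part of the diff). A minimal construction sketch for the class documented above, assuming it is importable from arize.spans.columns; the column name is a placeholder:

from arize.spans.columns import SpanColumn, SpanColumnDataType

latency_column = SpanColumn(
    name="attributes.llm.latency_ms",  # placeholder name, not a documented column
    data_type=SpanColumnDataType.NUMERIC,
    required=False,
)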