arize 8.0.0a22__py3-none-any.whl → 8.0.0b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. arize/__init__.py +28 -19
  2. arize/_exporter/client.py +56 -37
  3. arize/_exporter/parsers/tracing_data_parser.py +41 -30
  4. arize/_exporter/validation.py +3 -3
  5. arize/_flight/client.py +207 -76
  6. arize/_generated/api_client/__init__.py +30 -6
  7. arize/_generated/api_client/api/__init__.py +1 -0
  8. arize/_generated/api_client/api/datasets_api.py +864 -190
  9. arize/_generated/api_client/api/experiments_api.py +167 -131
  10. arize/_generated/api_client/api/projects_api.py +1197 -0
  11. arize/_generated/api_client/api_client.py +2 -2
  12. arize/_generated/api_client/configuration.py +42 -34
  13. arize/_generated/api_client/exceptions.py +2 -2
  14. arize/_generated/api_client/models/__init__.py +15 -4
  15. arize/_generated/api_client/models/dataset.py +10 -10
  16. arize/_generated/api_client/models/dataset_example.py +111 -0
  17. arize/_generated/api_client/models/dataset_example_update.py +100 -0
  18. arize/_generated/api_client/models/dataset_version.py +13 -13
  19. arize/_generated/api_client/models/datasets_create_request.py +16 -8
  20. arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
  21. arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
  22. arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
  23. arize/_generated/api_client/models/datasets_list200_response.py +10 -4
  24. arize/_generated/api_client/models/experiment.py +14 -16
  25. arize/_generated/api_client/models/experiment_run.py +108 -0
  26. arize/_generated/api_client/models/experiment_run_create.py +102 -0
  27. arize/_generated/api_client/models/experiments_create_request.py +16 -10
  28. arize/_generated/api_client/models/experiments_list200_response.py +10 -4
  29. arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
  30. arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
  31. arize/_generated/api_client/models/primitive_value.py +172 -0
  32. arize/_generated/api_client/models/problem.py +100 -0
  33. arize/_generated/api_client/models/project.py +99 -0
  34. arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
  35. arize/_generated/api_client/models/projects_list200_response.py +106 -0
  36. arize/_generated/api_client/rest.py +2 -2
  37. arize/_generated/api_client/test/test_dataset.py +4 -2
  38. arize/_generated/api_client/test/test_dataset_example.py +56 -0
  39. arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
  40. arize/_generated/api_client/test/test_dataset_version.py +7 -2
  41. arize/_generated/api_client/test/test_datasets_api.py +27 -13
  42. arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
  43. arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
  44. arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
  45. arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
  46. arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
  47. arize/_generated/api_client/test/test_experiment.py +2 -4
  48. arize/_generated/api_client/test/test_experiment_run.py +56 -0
  49. arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
  50. arize/_generated/api_client/test/test_experiments_api.py +6 -6
  51. arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
  52. arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
  53. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
  54. arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
  55. arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
  56. arize/_generated/api_client/test/test_problem.py +57 -0
  57. arize/_generated/api_client/test/test_project.py +58 -0
  58. arize/_generated/api_client/test/test_projects_api.py +59 -0
  59. arize/_generated/api_client/test/test_projects_create_request.py +54 -0
  60. arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
  61. arize/_generated/api_client_README.md +43 -29
  62. arize/_generated/protocol/flight/flight_pb2.py +400 -0
  63. arize/_lazy.py +27 -19
  64. arize/client.py +181 -58
  65. arize/config.py +324 -116
  66. arize/constants/__init__.py +1 -0
  67. arize/constants/config.py +11 -4
  68. arize/constants/ml.py +6 -4
  69. arize/constants/openinference.py +2 -0
  70. arize/constants/pyarrow.py +2 -0
  71. arize/constants/spans.py +3 -1
  72. arize/datasets/__init__.py +1 -0
  73. arize/datasets/client.py +304 -84
  74. arize/datasets/errors.py +32 -2
  75. arize/datasets/validation.py +18 -8
  76. arize/embeddings/__init__.py +2 -0
  77. arize/embeddings/auto_generator.py +23 -19
  78. arize/embeddings/base_generators.py +89 -36
  79. arize/embeddings/constants.py +2 -0
  80. arize/embeddings/cv_generators.py +26 -4
  81. arize/embeddings/errors.py +27 -5
  82. arize/embeddings/nlp_generators.py +43 -18
  83. arize/embeddings/tabular_generators.py +46 -31
  84. arize/embeddings/usecases.py +12 -2
  85. arize/exceptions/__init__.py +1 -0
  86. arize/exceptions/auth.py +11 -1
  87. arize/exceptions/base.py +29 -4
  88. arize/exceptions/models.py +21 -2
  89. arize/exceptions/parameters.py +31 -0
  90. arize/exceptions/spaces.py +12 -1
  91. arize/exceptions/types.py +86 -7
  92. arize/exceptions/values.py +220 -20
  93. arize/experiments/__init__.py +13 -0
  94. arize/experiments/client.py +394 -285
  95. arize/experiments/evaluators/__init__.py +1 -0
  96. arize/experiments/evaluators/base.py +74 -41
  97. arize/experiments/evaluators/exceptions.py +6 -3
  98. arize/experiments/evaluators/executors.py +121 -73
  99. arize/experiments/evaluators/rate_limiters.py +106 -57
  100. arize/experiments/evaluators/types.py +34 -7
  101. arize/experiments/evaluators/utils.py +65 -27
  102. arize/experiments/functions.py +103 -101
  103. arize/experiments/tracing.py +52 -44
  104. arize/experiments/types.py +56 -31
  105. arize/logging.py +54 -22
  106. arize/ml/__init__.py +1 -0
  107. arize/ml/batch_validation/__init__.py +1 -0
  108. arize/{models → ml}/batch_validation/errors.py +545 -67
  109. arize/{models → ml}/batch_validation/validator.py +344 -303
  110. arize/ml/bounded_executor.py +47 -0
  111. arize/{models → ml}/casting.py +118 -108
  112. arize/{models → ml}/client.py +339 -118
  113. arize/{models → ml}/proto.py +97 -42
  114. arize/{models → ml}/stream_validation.py +43 -15
  115. arize/ml/surrogate_explainer/__init__.py +1 -0
  116. arize/{models → ml}/surrogate_explainer/mimic.py +25 -10
  117. arize/{types.py → ml/types.py} +355 -354
  118. arize/pre_releases.py +44 -0
  119. arize/projects/__init__.py +1 -0
  120. arize/projects/client.py +134 -0
  121. arize/regions.py +40 -0
  122. arize/spans/__init__.py +1 -0
  123. arize/spans/client.py +204 -175
  124. arize/spans/columns.py +13 -0
  125. arize/spans/conversion.py +60 -37
  126. arize/spans/validation/__init__.py +1 -0
  127. arize/spans/validation/annotations/__init__.py +1 -0
  128. arize/spans/validation/annotations/annotations_validation.py +6 -4
  129. arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
  130. arize/spans/validation/annotations/value_validation.py +35 -11
  131. arize/spans/validation/common/__init__.py +1 -0
  132. arize/spans/validation/common/argument_validation.py +33 -8
  133. arize/spans/validation/common/dataframe_form_validation.py +35 -9
  134. arize/spans/validation/common/errors.py +211 -11
  135. arize/spans/validation/common/value_validation.py +81 -14
  136. arize/spans/validation/evals/__init__.py +1 -0
  137. arize/spans/validation/evals/dataframe_form_validation.py +28 -8
  138. arize/spans/validation/evals/evals_validation.py +34 -4
  139. arize/spans/validation/evals/value_validation.py +26 -3
  140. arize/spans/validation/metadata/__init__.py +1 -1
  141. arize/spans/validation/metadata/argument_validation.py +14 -5
  142. arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
  143. arize/spans/validation/metadata/value_validation.py +24 -10
  144. arize/spans/validation/spans/__init__.py +1 -0
  145. arize/spans/validation/spans/dataframe_form_validation.py +35 -14
  146. arize/spans/validation/spans/spans_validation.py +35 -4
  147. arize/spans/validation/spans/value_validation.py +78 -8
  148. arize/utils/__init__.py +1 -0
  149. arize/utils/arrow.py +31 -15
  150. arize/utils/cache.py +34 -6
  151. arize/utils/dataframe.py +20 -3
  152. arize/utils/online_tasks/__init__.py +2 -0
  153. arize/utils/online_tasks/dataframe_preprocessor.py +58 -47
  154. arize/utils/openinference_conversion.py +44 -5
  155. arize/utils/proto.py +10 -0
  156. arize/utils/size.py +5 -3
  157. arize/utils/types.py +105 -0
  158. arize/version.py +3 -1
  159. {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/METADATA +13 -6
  160. arize-8.0.0b0.dist-info/RECORD +175 -0
  161. {arize-8.0.0a22.dist-info → arize-8.0.0b0.dist-info}/WHEEL +1 -1
  162. arize-8.0.0b0.dist-info/licenses/LICENSE +176 -0
  163. arize-8.0.0b0.dist-info/licenses/NOTICE +13 -0
  164. arize/_generated/protocol/flight/export_pb2.py +0 -61
  165. arize/_generated/protocol/flight/ingest_pb2.py +0 -365
  166. arize/models/__init__.py +0 -0
  167. arize/models/batch_validation/__init__.py +0 -0
  168. arize/models/bounded_executor.py +0 -34
  169. arize/models/surrogate_explainer/__init__.py +0 -0
  170. arize-8.0.0a22.dist-info/RECORD +0 -146
  171. arize-8.0.0a22.dist-info/licenses/LICENSE.md +0 -12
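
The `arize/{models → ml}` and `arize/{types.py → ml/types.py}` moves in the list above amount to an import-path migration, and the spans diff below shows one concrete instance (`from arize.types import Environments` becomes `from arize.ml.types import Environments`). A minimal before/after sketch, assuming the public import paths mirror the file moves:

# Import-path migration sketch; paths inferred from the file renames above.
# Before (8.0.0a22):
#     from arize.types import Environments
# After (8.0.0b0):
from arize.ml.types import Environments

env = Environments.TRACING  # enum usage itself is unchanged by the move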
arize/spans/client.py CHANGED
@@ -1,3 +1,5 @@
+"""Client implementation for managing spans and traces in the Arize platform."""
+
 # type: ignore[pb2]
 from __future__ import annotations
 
@@ -6,7 +8,7 @@ import logging
 import re
 from datetime import datetime, timezone
 from functools import partial
-from typing import TYPE_CHECKING, Any, Dict, List
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import pandas as pd
@@ -16,10 +18,6 @@ from google.protobuf import json_format, message
 from arize._exporter.client import ArizeExportClient
 from arize._flight.client import ArizeFlightClient, FlightPostArrowFileResponse
 from arize._flight.types import FlightRequestType
-from arize._generated.protocol.flight.ingest_pb2 import (
-    WriteSpanAnnotationResponse,
-    WriteSpanEvaluationResponse,
-)
 from arize.constants.spans import DEFAULT_DATETIME_FMT
 from arize.exceptions.base import (
     INVALID_ARROW_CONVERSION_MSG,
@@ -28,10 +26,8 @@ from arize.exceptions.base import (
 from arize.exceptions.models import MissingProjectNameError
 from arize.exceptions.spaces import MissingSpaceIDError
 from arize.logging import CtxAdapter
-from arize.types import Environments, SimilaritySearchParams
-from arize.utils.arrow import (
-    post_arrow_table,
-)
+from arize.ml.types import Environments
+from arize.utils.arrow import post_arrow_table
 from arize.utils.dataframe import (
     remove_extraneous_columns,
     reset_dataframe_index,
@@ -41,13 +37,25 @@ from arize.utils.proto import get_pb_schema_tracing
 if TYPE_CHECKING:
     import requests
 
+    from arize._generated.protocol.flight import flight_pb2
     from arize.config import SDKConfiguration
 
 logger = logging.getLogger(__name__)
 
 
 class SpansClient:
-    def __init__(self, *, sdk_config: SDKConfiguration):
+    """Client for logging LLM tracing spans and evaluations to Arize.
+
+    This class is primarily intended for internal use within the SDK. Users are
+    highly encouraged to access resource-specific functionality via
+    :class:`arize.ArizeClient`.
+    """
+
+    def __init__(self, *, sdk_config: SDKConfiguration) -> None:
+        """
+        Args:
+            sdk_config: Resolved SDK configuration.
+        """  # noqa: D205, D212
         self._sdk_config = sdk_config
 
     def log(
@@ -62,25 +70,26 @@
         timeout: float | None = None,
         tmp_dir: str = "",
     ) -> requests.Response:
-        """
-        Logs a pandas dataframe containing LLM tracing data to Arize via a POST request. Returns a
-        :class:`Response` object from the Requests HTTP library to ensure successful delivery of
-        records.
+        """Logs a pandas dataframe containing LLM tracing data to Arize via a POST request.
+
+        Returns a :class:`Response` object from the Requests HTTP library to ensure
+        successful delivery of records.
 
         Args:
-            dataframe (pd.DataFrame): The dataframe containing the LLM traces.
-            evals_dataframe (pd.DataFrame, optional): A dataframe containing LLM evaluations data.
+            space_id: The space ID where the project resides.
+            project_name: A unique name to identify your project in the Arize platform.
+            dataframe: The dataframe containing the LLM traces.
+            evals_dataframe: A dataframe containing LLM evaluations data.
                 The evaluations are joined to their corresponding spans via a left outer join, i.e.,
                 using only `context.span_id` from the spans dataframe. Defaults to None.
-            datetime_format (str): format for the timestamp captured in the LLM traces.
+            datetime_format: format for the timestamp captured in the LLM traces.
                 Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
-            validate (bool, optional): When set to True, validation is run before sending data.
+            validate: When set to True, validation is run before sending data.
                 Defaults to True.
-            tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
-                before sending to Arize.
-            timeout (float, optional): You can stop waiting for a response after a given number
+            timeout: You can stop waiting for a response after a given number
                 of seconds with the timeout parameter. Defaults to None.
-            project_name (str, optional): A unique name to identify your project in the Arize platform.
+            tmp_dir: Temporary directory/file to store the serialized data in binary
+                before sending to Arize.
 
         Returns:
             `Response` object
@@ -220,12 +229,12 @@
             log.debug("Converting data to Arrow format")
             pa_table = pa.Table.from_pandas(df, preserve_index=False)
         except pa.ArrowInvalid as e:
-            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            log.exception(INVALID_ARROW_CONVERSION_MSG)
             raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {str(e)}"
+                f"Error converting to Arrow format: {e!s}"
             ) from e
-        except Exception as e:
-            log.error(f"Unexpected error creating Arrow table: {str(e)}")
+        except Exception:
+            log.exception("Unexpected error creating Arrow table")
             raise
 
         proto_schema = get_pb_schema_tracing(project_name=project_name)
@@ -262,27 +271,23 @@
         force_http: bool = False,
         timeout: float | None = None,
         tmp_dir: str = "",
-    ) -> WriteSpanEvaluationResponse:
-        """
-        Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
-        The dataframe must contain a column `context.span_id`
-        such that Arize can assign each evaluation to its respective span.
+    ) -> flight_pb2.WriteSpanEvaluationResponse:
+        """Logs a pandas dataframe containing LLM evaluations data to Arize via a Flight gRPC request.
+
+        The dataframe must contain a column `context.span_id` such that Arize can assign
+        each evaluation to its respective span.
 
         Args:
-            dataframe (pd.DataFrame): A dataframe containing LLM evaluations data.
-            model_id (str): A unique name to identify your model in the Arize platform.
-                (Deprecated: Use `project_name` instead.)
-            model_version (str, optional): Used to group a subset of traces a given
-                model_id to compare and track changes. It should match the model_id of the spans
-                sent previously, to which evaluations will be assigned. Defaults to None.
-            validate (bool, optional): When set to True, validation is run before sending data.
+            space_id: The space ID where the project resides.
+            project_name: A unique name to identify your project in the Arize platform.
+            dataframe: A dataframe containing LLM evaluations data.
+            validate: When set to True, validation is run before sending data.
                 Defaults to True.
-            path (str, optional): Temporary directory/file to store the serialized data in binary
-                before sending to Arize.
-            timeout (float, optional): You can stop waiting for a response after a given number
+            force_http: Force the use of HTTP for data upload. Defaults to False.
+            timeout: You can stop waiting for a response after a given number
                 of seconds with the timeout parameter. Defaults to None.
-            project_name (str, optional): A unique name to identify your project in the Arize platform.
-                Either model_id or project_name must be provided.
+            tmp_dir: Temporary directory/file to store the serialized data in binary
+                before sending to Arize.
         """
         from arize.spans.columns import EVAL_COLUMN_PATTERN, SPAN_SPAN_ID_COL
         from arize.spans.validation.evals import evals_validation
@@ -358,12 +363,12 @@
             log.debug("Converting data to Arrow format")
             pa_table = pa.Table.from_pandas(evals_df, preserve_index=False)
         except pa.ArrowInvalid as e:
-            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            log.exception(INVALID_ARROW_CONVERSION_MSG)
             raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {str(e)}"
+                f"Error converting to Arrow format: {e!s}"
             ) from e
-        except Exception as e:
-            log.error(f"Unexpected error creating Arrow table: {str(e)}")
+        except Exception:
+            log.exception("Unexpected error creating Arrow table")
             raise
 
         if force_http:
@@ -395,8 +400,8 @@
         response = None
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -409,8 +414,8 @@
                 request_type=request_type,
             )
         except Exception as e:
-            msg = f"Error during update request: {str(e)}"
-            log.error(msg)
+            msg = f"Error during update request: {e!s}"
+            log.exception(msg)
             raise RuntimeError(msg) from e
 
         if response is None:
@@ -437,19 +442,19 @@
         project_name: str,
         dataframe: pd.DataFrame,
         validate: bool = True,
-    ) -> WriteSpanAnnotationResponse:
-        """
-        Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
-        The dataframe must contain a column `context.span_id`
-        such that Arize can assign each annotation to its respective span.
-        Annotation columns should follow the pattern `annotation.<name>.<suffix>` where suffix is
-        either `label` or `score`. An optional `annotation.notes` column can be included for
-        free-form text notes.
+    ) -> flight_pb2.WriteSpanAnnotationResponse:
+        """Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.
+
+        The dataframe must contain a column `context.span_id` such that Arize can assign
+        each annotation to its respective span. Annotation columns should follow the pattern
+        `annotation.<name>.<suffix>` where suffix is either `label` or `score`. An optional
+        `annotation.notes` column can be included for free-form text notes.
 
         Args:
-            dataframe (pd.DataFrame): A dataframe containing LLM annotation data.
-            project_name (str): A unique name to identify your project in the Arize platform.
-            validate (bool, optional): When set to True, validation is run before sending data.
+            space_id: The space ID where the project resides.
+            project_name: A unique name to identify your project in the Arize platform.
+            dataframe: A dataframe containing LLM annotation data.
+            validate: When set to True, validation is run before sending data.
                 Defaults to True.
         """
         from arize.spans.columns import (
@@ -588,12 +593,12 @@
             log.debug("Converting data to Arrow format")
             pa_table = pa.Table.from_pandas(anno_df, preserve_index=False)
         except pa.ArrowInvalid as e:
-            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            log.exception(INVALID_ARROW_CONVERSION_MSG)
             raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {str(e)}"
+                f"Error converting to Arrow format: {e!s}"
             ) from e
-        except Exception as e:
-            log.error(f"Unexpected error creating Arrow table: {str(e)}")
+        except Exception:
+            log.exception("Unexpected error creating Arrow table")
             raise
 
         if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
@@ -611,8 +616,8 @@
         response = None
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -625,8 +630,8 @@
                 request_type=request_type,
            )
         except Exception as e:
-            msg = f"Error during update request: {str(e)}"
-            log.error(msg)
+            msg = f"Error during update request: {e!s}"
+            log.exception(msg)
             raise RuntimeError(msg) from e
 
         if response is None:
@@ -654,11 +659,13 @@
         dataframe: pd.DataFrame,
         patch_document_column_name: str = "patch_document",
         validate: bool = True,
-    ) -> Dict[str, Any]:
-        """
-        Log metadata updates using JSON Merge Patch format. This method is only supported for LLM model types.
+    ) -> dict[str, Any]:
+        """Log metadata updates using JSON Merge Patch format.
+
+        This method is only supported for LLM model types.
 
         The dataframe must contain a column `context.span_id` to identify spans and either:
+
         1. A column with JSON patch documents (specified by patch_document_column_name), or
         2. One or more columns with prefix `attributes.metadata.` that will be automatically
            converted to a patch document (e.g., `attributes.metadata.tag` → `{"tag": value}`).
@@ -666,7 +673,8 @@
         If both methods are used, the explicit patch document is applied after the individual field updates.
         The patches will be applied to the `attributes.metadata` field of each span.
 
-        **Type Handling:**
+        Type Handling:
+
         - The client primarily supports string, integer, and float data types.
         - Boolean values are converted to string representations.
         - Nested JSON objects and arrays are serialized to JSON strings during transmission.
@@ -674,20 +682,23 @@
           Note: This differs from standard JSON Merge Patch where null values remove fields.
 
         Args:
-            dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
+            space_id: The space ID where the project resides.
             project_name: A unique name to identify your project in the Arize platform.
+            dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
             patch_document_column_name: Name of the column containing JSON patch documents.
                 Defaults to "patch_document".
             validate: When set to True, validation is run before sending data.
 
         Returns:
             Dictionary containing update results with the following keys:
+
             - spans_processed: Total number of spans in the input dataframe
             - spans_updated: Count of successfully updated span metadata records
             - spans_failed: Count of spans that failed to update
             - errors: List of dictionaries with 'span_id' and 'error_message' keys for each failed span
 
-            Error types from the server include:
+        Error types from the server include:
+
             - parse_failure: Failed to parse JSON metadata
             - patch_failure: Failed to apply JSON patch
             - type_conflict: Type conflict in metadata
@@ -696,58 +707,60 @@
             - druid_rejection: Backend rejected the update
 
         Raises:
-            AuthError: When API key or space ID is missing
-            ValidationFailure: When validation of the dataframe or values fails
-            ImportError: When required tracing dependencies are missing
-            ArrowInvalid: When the dataframe cannot be converted to Arrow format
-            RuntimeError: If the request fails or no response is received
-
-        Example:
-            ```python
-            # Method 1: Using a patch document
-            df = pd.DataFrame(
-                {
-                    "context.span_id": ["span1", "span2"],
-                    "patch_document": [
-                        {"tag": "important"},
-                        {"priority": "high"},
-                    ],
-                }
-            )
-
-            # Method 2: Using direct field columns
-            df = pd.DataFrame(
-                {
-                    "context.span_id": ["span1", "span2"],
-                    "attributes.metadata.tag": ["important", "standard"],
-                    "attributes.metadata.priority": ["high", "medium"],
-                }
-            )
-
-            # Method 3: Combining both approaches
-            df = pd.DataFrame(
-                {
-                    "context.span_id": ["span1"],
-                    "attributes.metadata.tag": ["important"],
-                    "patch_document": [
-                        {"priority": "high"}
-                    ],  # This will override any conflicting fields
-                }
-            )
-
-            # Method 4: Setting fields to null
-            df = pd.DataFrame(
-                {
-                    "context.span_id": ["span1"],
-                    "attributes.metadata.old_field": [
-                        None
-                    ],  # Sets field to JSON null
-                    "patch_document": [
-                        {"other_field": None}
-                    ],  # Also sets field to JSON null
-                }
-            )
-            ```
+            AuthError: When API key or space ID is missing.
+            ValidationFailure: When validation of the dataframe or values fails.
+            ImportError: When required tracing dependencies are missing.
+            ArrowInvalid: When the dataframe cannot be converted to Arrow format.
+            RuntimeError: If the request fails or no response is received.
+
+        Examples:
+            Method 1: Using a patch document
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "context.span_id": ["span1", "span2"],
+            ...         "patch_document": [
+            ...             {"tag": "important"},
+            ...             {"priority": "high"},
+            ...         ],
+            ...     }
+            ... )
+
+            Method 2: Using direct field columns
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "context.span_id": ["span1", "span2"],
+            ...         "attributes.metadata.tag": ["important", "standard"],
+            ...         "attributes.metadata.priority": ["high", "medium"],
+            ...     }
+            ... )
+
+            Method 3: Combining both approaches
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "context.span_id": ["span1"],
+            ...         "attributes.metadata.tag": ["important"],
+            ...         "patch_document": [
+            ...             {"priority": "high"}
+            ...         ],  # Overrides conflicting fields
+            ...     }
+            ... )
+
+            Method 4: Setting fields to null
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "context.span_id": ["span1"],
+            ...         "attributes.metadata.old_field": [
+            ...             None
+            ...         ],  # Sets field to JSON null
+            ...         "patch_document": [
+            ...             {"other_field": None}
+            ...         ],  # Also sets field to JSON null
+            ...     }
+            ... )
         """
         # Import validation modules
         from arize.spans.columns import SPAN_SPAN_ID_COL
@@ -813,11 +826,10 @@
         )
 
         # Create a new column for patch documents if we're going to use it
-        if has_metadata_fields or has_patch_document:
-            # Use 'patch_document' as the standardized column name for downstream processing
-            final_patch_column = "patch_document"
-            if final_patch_column not in metadata_df.columns:
-                metadata_df[final_patch_column] = None
+        # Use 'patch_document' as the standardized column name for downstream processing
+        final_patch_column = "patch_document"
+        if final_patch_column not in metadata_df.columns:
+            metadata_df[final_patch_column] = None
 
         # Process metadata field columns if they exist
         if has_metadata_fields:
@@ -865,7 +877,7 @@
                 if patch:
                     processed_patches.append(patch)
                 if errors:
-                    validation_errors.append(errors)
+                    validation_errors.extend(errors)
 
         # If validation is enabled and errors found, raise ValidationFailure
         if validate and validation_errors:
@@ -922,9 +934,11 @@
             metadata_df[final_patch_column] = metadata_df[
                 final_patch_column
             ].apply(
-                lambda p: json.dumps(p)
-                if not isinstance(p, float) or not np.isnan(p)
-                else json.dumps({})
+                lambda p: (
+                    json.dumps(p)
+                    if not isinstance(p, float) or not np.isnan(p)
+                    else json.dumps({})
+                )
             )
 
         # Convert to Arrow table
@@ -932,20 +946,20 @@
             log.debug("Converting data to Arrow format")
             pa_table = pa.Table.from_pandas(metadata_df, preserve_index=False)
         except pa.ArrowInvalid as e:
-            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            log.exception(INVALID_ARROW_CONVERSION_MSG)
             raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {str(e)}"
+                f"Error converting to Arrow format: {e!s}"
             ) from e
-        except Exception as e:
-            log.error(f"Unexpected error creating Arrow table: {str(e)}")
+        except Exception:
+            log.exception("Unexpected error creating Arrow table")
             raise
 
         request_type = FlightRequestType.METADATA
         response = None
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -958,8 +972,8 @@
                 request_type=request_type,
             )
         except Exception as e:
-            msg = f"Error during update request: {str(e)}"
-            log.error(msg)
+            msg = f"Error during update request: {e!s}"
+            log.exception(msg)
             raise RuntimeError(msg) from e
 
         if response is None:
@@ -987,14 +1001,23 @@
         start_time: datetime,
         end_time: datetime,
         where: str = "",
-        columns: List | None = None,
-        similarity_search_params: SimilaritySearchParams | None = None,
+        columns: list | None = None,
         stream_chunk_size: int | None = None,
     ) -> pd.DataFrame:
+        """Export span data from Arize to a pandas DataFrame.
+
+        Retrieves trace/span data from the specified project within a time range
+        and returns it as a pandas DataFrame. Supports filtering with SQL-like
+        WHERE clauses and similarity search for semantic retrieval.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the requested span data with columns
+                for span metadata, attributes, events, and any custom fields.
+        """
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -1010,26 +1033,32 @@
             end_time=end_time,
             where=where,
             columns=columns,
-            similarity_search_params=similarity_search_params,
             stream_chunk_size=stream_chunk_size,
         )
 
     def export_to_parquet(
         self,
         *,
+        path: str,
         space_id: str,
         project_name: str,
         start_time: datetime,
         end_time: datetime,
         where: str = "",
-        columns: List | None = None,
-        similarity_search_params: SimilaritySearchParams | None = None,
+        columns: list | None = None,
         stream_chunk_size: int | None = None,
-    ) -> pd.DataFrame:
+    ) -> None:
+        """Export span data from Arize to a Parquet file.
+
+        Retrieves trace/span data from the specified project within a time range
+        and writes it directly to a Parquet file at the specified path. Supports
+        filtering with SQL-like WHERE clauses and similarity search for semantic
+        retrieval. Efficient for large datasets and long-term storage.
+        """
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -1038,6 +1067,7 @@
             flight_client=flight_client,
         )
         return exporter.export_to_parquet(
+            path=path,
            space_id=space_id,
            model_id=project_name,
            environment=Environments.TRACING,
@@ -1045,12 +1075,11 @@
             end_time=end_time,
             where=where,
             columns=columns,
-            similarity_search_params=similarity_search_params,
             stream_chunk_size=stream_chunk_size,
         )
 
 
-def _build_patch_document(row):
+def _build_patch_document(row: pd.Series) -> dict[str, object]:
     # Extract and preserve metadata values with proper types
     patch = {}
     for key in row.index:
@@ -1070,8 +1099,11 @@ def _build_patch_document(row):
 
 
 def _process_patch_document(
-    metadata_df, patch_document_column_name, field_patches, row_idx
-):
+    metadata_df: pd.DataFrame,
+    patch_document_column_name: str,
+    field_patches: pd.DataFrame,
+    row_idx: int,
+) -> dict[str, object]:
     # Get the field patch for this row
     field_patch = field_patches.iloc[row_idx]
 
@@ -1111,15 +1143,14 @@ def _process_patch_document(
         explicit_patch = {}
 
     # Merge patches - explicit patch takes precedence
-    merged_patch = {**field_patch, **explicit_patch}
-    return merged_patch
+    return {**field_patch, **explicit_patch}
 
 
 def _ensure_dict_patch(
     metadata_df: pd.DataFrame,
     final_patch_column: str,
     row_idx: int,
-):
+) -> tuple[dict[str, object], list[str]]:
     patch = metadata_df.loc[row_idx, final_patch_column]
     validation_errors = []
 
@@ -1141,19 +1172,19 @@ def _ensure_dict_patch(
             parsed = json.loads(patch)
             if isinstance(parsed, dict):
                 return parsed
-            else:
-                error_msg = (
-                    f"Row {row_idx}: JSON must be an object/dictionary, "
-                    f"got {type(parsed).__name__}"
-                )
-                logger.warning(error_msg)
-                validation_errors.append(error_msg)
-                return {}, validation_errors  # if not validate else None
         except json.JSONDecodeError as e:
             error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
             logger.warning(error_msg)
             validation_errors.append(error_msg)
             return {}, validation_errors  # if not validate else None
+        else:
+            error_msg = (
+                f"Row {row_idx}: JSON must be an object/dictionary, "
+                f"got {type(parsed).__name__}"
+            )
+            logger.warning(error_msg)
+            validation_errors.append(error_msg)
+            return {}, validation_errors  # if not validate else None
 
     # For other types, log warning
     error_msg = f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
@@ -1165,7 +1196,7 @@ def _ensure_dict_patch(
 def _format_note_for_storage(
     note_text: str,
     current_time_ms: int,
-):
+) -> list[str] | None:
     if pd.isna(note_text):
         return None
     note_obj = {
@@ -1225,9 +1256,7 @@ def _log_flight_update_summary(
         logger.warning("Flight update response missing counts", extra=metrics)
     else:
         all_processed = int(spans_processed) == int(total_spans)
-        msg = (
-            "✅ All spans processed" if all_processed else "Partial processing"
-        )
+        msg = "All spans processed" if all_processed else "Partial processing"
         logger.info(msg, extra=metrics)
 
     # Emit individual error lines (structured per-error, easy to aggregate)
@@ -1246,7 +1275,7 @@ def _message_to_dict(
     msg: message.Message,
     preserve_names: bool = True,
     use_int_enums: bool = False,
-):
+) -> dict[str, object]:
     return json_format.MessageToDict(
         msg,
         preserving_proto_field_name=preserve_names,
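
Taken together, the export changes above give `export_to_parquet` a required `path` keyword, drop the `similarity_search_params` argument, and change the return type from a DataFrame to `None`. A minimal calling sketch under those assumptions; the class docstring steers users toward `arize.ArizeClient`, so constructing `SpansClient` directly here is illustrative only, and `sdk_config`, the IDs, and the time range are placeholders:

from datetime import datetime, timezone

from arize.spans.client import SpansClient

client = SpansClient(sdk_config=sdk_config)  # sdk_config: a resolved SDKConfiguration
client.export_to_parquet(
    path="spans.parquet",  # new required keyword in 8.0.0b0
    space_id="your-space-id",
    project_name="your-project",
    start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
    end_time=datetime(2024, 1, 2, tzinfo=timezone.utc),
)  # returns None; the data is written to spans.parquet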