arize 8.0.0a21__py3-none-any.whl → 8.0.0a23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. arize/__init__.py +17 -9
  2. arize/_exporter/client.py +55 -36
  3. arize/_exporter/parsers/tracing_data_parser.py +41 -30
  4. arize/_exporter/validation.py +3 -3
  5. arize/_flight/client.py +208 -77
  6. arize/_generated/api_client/__init__.py +30 -6
  7. arize/_generated/api_client/api/__init__.py +1 -0
  8. arize/_generated/api_client/api/datasets_api.py +864 -190
  9. arize/_generated/api_client/api/experiments_api.py +167 -131
  10. arize/_generated/api_client/api/projects_api.py +1197 -0
  11. arize/_generated/api_client/api_client.py +2 -2
  12. arize/_generated/api_client/configuration.py +42 -34
  13. arize/_generated/api_client/exceptions.py +2 -2
  14. arize/_generated/api_client/models/__init__.py +15 -4
  15. arize/_generated/api_client/models/dataset.py +10 -10
  16. arize/_generated/api_client/models/dataset_example.py +111 -0
  17. arize/_generated/api_client/models/dataset_example_update.py +100 -0
  18. arize/_generated/api_client/models/dataset_version.py +13 -13
  19. arize/_generated/api_client/models/datasets_create_request.py +16 -8
  20. arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
  21. arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
  22. arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
  23. arize/_generated/api_client/models/datasets_list200_response.py +10 -4
  24. arize/_generated/api_client/models/experiment.py +14 -16
  25. arize/_generated/api_client/models/experiment_run.py +108 -0
  26. arize/_generated/api_client/models/experiment_run_create.py +102 -0
  27. arize/_generated/api_client/models/experiments_create_request.py +16 -10
  28. arize/_generated/api_client/models/experiments_list200_response.py +10 -4
  29. arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
  30. arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
  31. arize/_generated/api_client/models/primitive_value.py +172 -0
  32. arize/_generated/api_client/models/problem.py +100 -0
  33. arize/_generated/api_client/models/project.py +99 -0
  34. arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
  35. arize/_generated/api_client/models/projects_list200_response.py +106 -0
  36. arize/_generated/api_client/rest.py +2 -2
  37. arize/_generated/api_client/test/test_dataset.py +4 -2
  38. arize/_generated/api_client/test/test_dataset_example.py +56 -0
  39. arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
  40. arize/_generated/api_client/test/test_dataset_version.py +7 -2
  41. arize/_generated/api_client/test/test_datasets_api.py +27 -13
  42. arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
  43. arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
  44. arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
  45. arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
  46. arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
  47. arize/_generated/api_client/test/test_experiment.py +2 -4
  48. arize/_generated/api_client/test/test_experiment_run.py +56 -0
  49. arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
  50. arize/_generated/api_client/test/test_experiments_api.py +6 -6
  51. arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
  52. arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
  53. arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
  54. arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
  55. arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
  56. arize/_generated/api_client/test/test_problem.py +57 -0
  57. arize/_generated/api_client/test/test_project.py +58 -0
  58. arize/_generated/api_client/test/test_projects_api.py +59 -0
  59. arize/_generated/api_client/test/test_projects_create_request.py +54 -0
  60. arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
  61. arize/_generated/api_client_README.md +43 -29
  62. arize/_generated/protocol/flight/flight_pb2.py +400 -0
  63. arize/_lazy.py +27 -19
  64. arize/client.py +269 -55
  65. arize/config.py +365 -116
  66. arize/constants/__init__.py +1 -0
  67. arize/constants/config.py +11 -4
  68. arize/constants/ml.py +6 -4
  69. arize/constants/openinference.py +2 -0
  70. arize/constants/pyarrow.py +2 -0
  71. arize/constants/spans.py +3 -1
  72. arize/datasets/__init__.py +1 -0
  73. arize/datasets/client.py +299 -84
  74. arize/datasets/errors.py +32 -2
  75. arize/datasets/validation.py +18 -8
  76. arize/embeddings/__init__.py +2 -0
  77. arize/embeddings/auto_generator.py +23 -19
  78. arize/embeddings/base_generators.py +89 -36
  79. arize/embeddings/constants.py +2 -0
  80. arize/embeddings/cv_generators.py +26 -4
  81. arize/embeddings/errors.py +27 -5
  82. arize/embeddings/nlp_generators.py +31 -12
  83. arize/embeddings/tabular_generators.py +32 -20
  84. arize/embeddings/usecases.py +12 -2
  85. arize/exceptions/__init__.py +1 -0
  86. arize/exceptions/auth.py +11 -1
  87. arize/exceptions/base.py +29 -4
  88. arize/exceptions/models.py +21 -2
  89. arize/exceptions/parameters.py +31 -0
  90. arize/exceptions/spaces.py +12 -1
  91. arize/exceptions/types.py +86 -7
  92. arize/exceptions/values.py +220 -20
  93. arize/experiments/__init__.py +1 -0
  94. arize/experiments/client.py +390 -286
  95. arize/experiments/evaluators/__init__.py +1 -0
  96. arize/experiments/evaluators/base.py +74 -41
  97. arize/experiments/evaluators/exceptions.py +6 -3
  98. arize/experiments/evaluators/executors.py +121 -73
  99. arize/experiments/evaluators/rate_limiters.py +106 -57
  100. arize/experiments/evaluators/types.py +34 -7
  101. arize/experiments/evaluators/utils.py +65 -27
  102. arize/experiments/functions.py +103 -101
  103. arize/experiments/tracing.py +52 -44
  104. arize/experiments/types.py +56 -31
  105. arize/logging.py +54 -22
  106. arize/models/__init__.py +1 -0
  107. arize/models/batch_validation/__init__.py +1 -0
  108. arize/models/batch_validation/errors.py +543 -65
  109. arize/models/batch_validation/validator.py +339 -300
  110. arize/models/bounded_executor.py +20 -7
  111. arize/models/casting.py +75 -29
  112. arize/models/client.py +326 -107
  113. arize/models/proto.py +95 -40
  114. arize/models/stream_validation.py +42 -14
  115. arize/models/surrogate_explainer/__init__.py +1 -0
  116. arize/models/surrogate_explainer/mimic.py +24 -13
  117. arize/pre_releases.py +43 -0
  118. arize/projects/__init__.py +1 -0
  119. arize/projects/client.py +129 -0
  120. arize/regions.py +40 -0
  121. arize/spans/__init__.py +1 -0
  122. arize/spans/client.py +130 -106
  123. arize/spans/columns.py +13 -0
  124. arize/spans/conversion.py +54 -38
  125. arize/spans/validation/__init__.py +1 -0
  126. arize/spans/validation/annotations/__init__.py +1 -0
  127. arize/spans/validation/annotations/annotations_validation.py +6 -4
  128. arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
  129. arize/spans/validation/annotations/value_validation.py +35 -11
  130. arize/spans/validation/common/__init__.py +1 -0
  131. arize/spans/validation/common/argument_validation.py +33 -8
  132. arize/spans/validation/common/dataframe_form_validation.py +35 -9
  133. arize/spans/validation/common/errors.py +211 -11
  134. arize/spans/validation/common/value_validation.py +80 -13
  135. arize/spans/validation/evals/__init__.py +1 -0
  136. arize/spans/validation/evals/dataframe_form_validation.py +28 -8
  137. arize/spans/validation/evals/evals_validation.py +34 -4
  138. arize/spans/validation/evals/value_validation.py +26 -3
  139. arize/spans/validation/metadata/__init__.py +1 -1
  140. arize/spans/validation/metadata/argument_validation.py +14 -5
  141. arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
  142. arize/spans/validation/metadata/value_validation.py +24 -10
  143. arize/spans/validation/spans/__init__.py +1 -0
  144. arize/spans/validation/spans/dataframe_form_validation.py +34 -13
  145. arize/spans/validation/spans/spans_validation.py +35 -4
  146. arize/spans/validation/spans/value_validation.py +76 -7
  147. arize/types.py +293 -157
  148. arize/utils/__init__.py +1 -0
  149. arize/utils/arrow.py +31 -15
  150. arize/utils/cache.py +34 -6
  151. arize/utils/dataframe.py +19 -2
  152. arize/utils/online_tasks/__init__.py +2 -0
  153. arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
  154. arize/utils/openinference_conversion.py +44 -5
  155. arize/utils/proto.py +10 -0
  156. arize/utils/size.py +5 -3
  157. arize/version.py +3 -1
  158. {arize-8.0.0a21.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
  159. arize-8.0.0a23.dist-info/RECORD +174 -0
  160. {arize-8.0.0a21.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
  161. arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
  162. arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
  163. arize/_generated/protocol/flight/export_pb2.py +0 -61
  164. arize/_generated/protocol/flight/ingest_pb2.py +0 -365
  165. arize-8.0.0a21.dist-info/RECORD +0 -146
  166. arize-8.0.0a21.dist-info/licenses/LICENSE.md +0 -12
arize/utils/__init__.py CHANGED
@@ -0,0 +1 @@
+ """Utility functions and helper modules for the Arize SDK."""
arize/utils/arrow.py CHANGED
@@ -1,3 +1,5 @@
+ """Apache Arrow utilities for data serialization and file operations."""
+
  # type: ignore[pb2]
  from __future__ import annotations

@@ -5,7 +7,7 @@ import base64
  import logging
  import os
  import tempfile
- from typing import TYPE_CHECKING, Any, Dict
+ from typing import TYPE_CHECKING, Any

  import pyarrow as pa

@@ -23,16 +25,30 @@ def post_arrow_table(
  files_url: str,
  pa_table: pa.Table,
  proto_schema: pb2.Schema,
- headers: Dict[str, str],
+ headers: dict[str, str],
  timeout: float | None,
  verify: bool,
  max_chunksize: int,
  tmp_dir: str = "",
  ) -> requests.Response:
- # We import here to avoid depending onn requests for all arrow utils
+ """Post a PyArrow table to Arize via HTTP file upload.
+
+ Args:
+ files_url: The URL endpoint for file uploads.
+ pa_table: The PyArrow table containing the data.
+ proto_schema: The protobuf schema for the data.
+ headers: HTTP headers for the request.
+ timeout: Request timeout in seconds, or None for no timeout.
+ verify: Whether to verify SSL certificates.
+ max_chunksize: Maximum chunk size for splitting large tables.
+ tmp_dir: Temporary directory for serialization. Defaults to "".
+
+ Returns:
+ The HTTP response from the upload request.
+ """
+ # We import here to avoid depending on requests for all arrow utils
  import requests

- logger.debug("Preparing to log Arrow table via file upload")
  logger.debug(
  "Preparing to log Arrow table via file upload",
  extra={"rows": pa_table.num_rows, "cols": pa_table.num_columns},
@@ -94,20 +110,20 @@ def post_arrow_table(
  tdir.cleanup() # cleaning the entire dir, no need to clean the file
  except Exception as e:
  logger.warning(
- f"Failed to remove temporary directory {tdir.name}: {str(e)}"
+ f"Failed to remove temporary directory {tdir.name}: {e!s}"
  )
  elif cleanup_file:
  try:
  os.remove(outfile)
  except Exception as e:
  logger.warning(
- f"Failed to remove temporary file {outfile}: {str(e)}"
+ f"Failed to remove temporary file {outfile}: {e!s}"
  )


  def _append_to_pyarrow_metadata(
- pa_schema: pa.Schema, new_metadata: Dict[str, Any]
- ):
+ pa_schema: pa.Schema, new_metadata: dict[str, Any]
+ ) -> object:
  # Ensure metadata is handled correctly, even if initially None.
  metadata = pa_schema.metadata
  if metadata is None:
@@ -129,9 +145,10 @@ def _append_to_pyarrow_metadata(
  def _write_arrow_file(
  path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
  ) -> None:
- with pa.OSFile(path, mode="wb") as sink, pa.ipc.RecordBatchStreamWriter(
- sink, pa_schema
- ) as writer:
+ with (
+ pa.OSFile(path, mode="wb") as sink,
+ pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,
+ ):
  writer.write_table(pa_table, max_chunksize)


@@ -145,10 +162,9 @@ def _maybe_log_project_url(response: requests.Response) -> None:


  def _mktemp_in(directory: str) -> str:
- """
- Create a unique temp file path inside `directory` without leaving
- an open file descriptor around (Windows-safe). The file exists on
- disk and is closed; caller can open/write it later.
+ """Create a unique temp file path inside `directory` without leaving an open file descriptor.
+
+ Windows-safe. The file exists on disk and is closed; caller can open/write it later.
  """
  with tempfile.NamedTemporaryFile(
  dir=directory,
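
The updated `_write_arrow_file` switches to the parenthesized multi-context-manager form introduced in Python 3.10. A minimal standalone sketch of that pattern (not the SDK's own code; the file name and table contents are illustrative):

```python
# Minimal sketch of the pattern used by _write_arrow_file (assumes Python 3.10+
# and pyarrow installed); "example.arrow" and the table are illustrative only.
import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "score": [0.1, 0.9, 0.5]})

with (
    pa.OSFile("example.arrow", mode="wb") as sink,
    pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer,
):
    # max_chunksize caps the rows per record batch, mirroring the function's argument.
    writer.write_table(table, max_chunksize=2)
```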
arize/utils/cache.py CHANGED
@@ -1,10 +1,16 @@
+ """Caching utilities for resource management and persistence."""
+
  from __future__ import annotations

  import logging
  from pathlib import Path
+ from typing import TYPE_CHECKING

  import pandas as pd

+ if TYPE_CHECKING:
+ from datetime import datetime
+
  logger = logging.getLogger(__name__)


@@ -12,9 +18,21 @@ def load_cached_resource(
  cache_dir: str,
  resource: str,
  resource_id: str,
- resource_updated_at: str | None,
+ resource_updated_at: datetime | None,
  format: str = "parquet",
  ) -> pd.DataFrame | None:
+ """Load a cached resource from the local cache directory.
+
+ Args:
+ cache_dir: Directory path for cache storage.
+ resource: Resource type name (e.g., "dataset", "experiment").
+ resource_id: Unique identifier for the resource.
+ resource_updated_at: Optional timestamp of last resource update.
+ format: File format for cached data. Defaults to "parquet".
+
+ Returns:
+ The cached DataFrame if found and valid, None otherwise.
+ """
  key = _get_cache_key(resource, resource_id, resource_updated_at)
  filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
  if not filepath.exists():
@@ -30,10 +48,20 @@ def cache_resource(
  cache_dir: str,
  resource: str,
  resource_id: str,
- resource_updated_at: str | None,
+ resource_updated_at: datetime | None,
  resource_data: pd.DataFrame,
  format: str = "parquet",
  ) -> None:
+ """Save a resource to the local cache directory.
+
+ Args:
+ cache_dir: Directory path for cache storage.
+ resource: Resource type name (e.g., "dataset", "experiment").
+ resource_id: Unique identifier for the resource.
+ resource_updated_at: Optional timestamp of last resource update.
+ resource_data: DataFrame containing the resource data.
+ format: File format for cached data. Defaults to "parquet".
+ """
  key = _get_cache_key(resource, resource_id, resource_updated_at)
  filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
  filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -44,12 +72,12 @@ def cache_resource(
  def _get_cache_key(
  resource: str,
  resource_id: str,
- resource_updated_at: str | None,
+ resource_updated_at: datetime | None,
  ) -> str:
  # include updated_at if present to produce a new key when dataset changes
  key = f"{resource}_{resource_id}"
  if resource_updated_at:
- key += f"_{resource_updated_at}"
+ key += f"_{resource_updated_at.strftime('%Y%m%dT%H%M%S')}"
  return key


@@ -58,8 +86,8 @@ def _get_abs_file_path(
  filename: str,
  subdirectory: str | None = None,
  ) -> Path:
- """
- Return an absolute path to a file located under `directory[/subdirectory]/filename`.
+ """Return an absolute path to a file located under `directory[/subdirectory]/filename`.
+
  Expands '~' and resolves relative components.
  """
  base = Path(directory).expanduser()
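
The cache key now folds in the resource's `updated_at` timestamp, so a refreshed dataset or experiment produces a new key and any stale cache file is simply not found. A small illustrative sketch of that keying scheme (hypothetical helper, mirroring the strftime format shown above; not the SDK's internals):

```python
# Hypothetical sketch of timestamp-aware cache keys; not the SDK's internals.
from datetime import datetime


def cache_key(resource: str, resource_id: str, updated_at: datetime | None) -> str:
    key = f"{resource}_{resource_id}"
    if updated_at:
        # A newer updated_at yields a different key, so older cached files are ignored.
        key += f"_{updated_at.strftime('%Y%m%dT%H%M%S')}"
    return key


print(cache_key("dataset", "abc123", datetime(2024, 5, 1, 12, 30)))
# dataset_abc123_20240501T123000
```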
arize/utils/dataframe.py CHANGED
@@ -1,5 +1,6 @@
+ """DataFrame manipulation and validation utilities."""
+
  import re
- from typing import List

  import pandas as pd

@@ -8,6 +9,11 @@ from arize.types import BaseSchema

  # Resets the dataframe index if it is not a RangeIndex
  def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
+ """Reset the DataFrame index in-place if it is not a RangeIndex.
+
+ Args:
+ dataframe: The pandas DataFrame to reset.
+ """
  if not isinstance(dataframe.index, pd.RangeIndex):
  drop = dataframe.index.name in dataframe.columns
  dataframe.reset_index(inplace=True, drop=drop)
@@ -16,9 +22,20 @@ def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
  def remove_extraneous_columns(
  df: pd.DataFrame,
  schema: BaseSchema | None = None,
- column_list: List[str] | None = None,
+ column_list: list[str] | None = None,
  regex: str | None = None,
  ) -> pd.DataFrame:
+ """Filter DataFrame to keep only relevant columns based on schema, list, or regex.
+
+ Args:
+ df: The pandas DataFrame to filter.
+ schema: Optional schema defining used columns. Defaults to None.
+ column_list: Optional explicit list of columns to keep. Defaults to None.
+ regex: Optional regex pattern to match column names. Defaults to None.
+
+ Returns:
+ A filtered DataFrame containing only the relevant columns.
+ """
  relevant_columns = set()
  if schema is not None:
  relevant_columns.update(schema.get_used_columns())
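
`remove_extraneous_columns` can now be driven by a schema, an explicit column list, or a regex over column names. A hedged usage sketch (the import path is assumed from the file location, and the expected output assumes list and regex matches are combined):

```python
# Usage sketch; the import path is assumed from the file location and the
# expected output assumes column_list and regex matches are unioned.
import pandas as pd
from arize.utils.dataframe import remove_extraneous_columns

df = pd.DataFrame(
    {
        "prediction_id": ["a", "b"],
        "eval.quality.score": [0.9, 0.4],
        "scratch_notes": ["x", "y"],
    }
)

trimmed = remove_extraneous_columns(df, column_list=["prediction_id"], regex=r"^eval\.")
print(trimmed.columns.tolist())  # likely ['prediction_id', 'eval.quality.score']
```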
arize/utils/online_tasks/__init__.py CHANGED
@@ -1,3 +1,5 @@
+ """Online task processing utilities for dataframe preprocessing."""
+
  from arize.utils.online_tasks.dataframe_preprocessor import (
  extract_nested_data_to_column,
  )
arize/utils/online_tasks/dataframe_preprocessor.py CHANGED
@@ -1,6 +1,7 @@
+ """DataFrame preprocessing utilities for online tasks."""
+
  import json
  import logging
- from typing import Any, List, Tuple

  import numpy as np
  import pandas as pd
@@ -8,17 +9,34 @@ import pandas as pd
  logger = logging.getLogger(__name__)


+ class ColumnNotFoundError(Exception):
+ """Raised when a specified column is not found in the DataFrame."""
+
+ def __init__(self, attribute: str) -> None:
+ """Initialize with the attribute that couldn't be mapped to a column.
+
+ Args:
+ attribute: The attribute string that has no matching column prefix.
+ """
+ self.attribute = attribute
+ super().__init__(
+ f"No column found in DataFrame for attribute: {attribute}"
+ )
+
+
  def extract_nested_data_to_column(
- attributes: List[str], df: pd.DataFrame
+ attributes: list[str], df: pd.DataFrame
  ) -> pd.DataFrame:
- """
+ """Extract nested attributes from complex data structures into new DataFrame columns.
+
  This function, used in Online Tasks, is typically run on data exported from Arize.
- It prepares the DataFrame by extracting relevant attributes from complex, deeply nested
- data structures, such as those found in LLM outputs or JSON-like records. It helps extract
- specific values from these nested structures by identifying the longest matching column name
- in the DataFrame and recursively accessing the desired attribute path within each row.
- This preprocessing step ensures that the extracted values are available as new columns,
- allowing evaluators to process and assess these values effectively.
+ It prepares the DataFrame by extracting relevant attributes from complex, deeply
+ nested data structures, such as those found in LLM outputs or JSON-like records.
+ It helps extract specific values from these nested structures by identifying the
+ longest matching column name in the DataFrame and recursively accessing the desired
+ attribute path within each row. This preprocessing step ensures that the extracted
+ values are available as new columns, allowing evaluators to process and assess
+ these values effectively.

  For each attributes string in `attributes` (e.g. "attributes.llm.output_messages.0.message.content"),
  1) Find the largest prefix that is actually a column name in `df`. (e.g. "attributes.llm.output_messages")
@@ -37,13 +55,12 @@ def extract_nested_data_to_column(
  5) Log how many rows were dropped and, if zero rows remain, log a message indicating that
  there are no rows satisfying *all* of the queries.
  """
-
  # Make a copy so as not to alter the input df
  result_df = df.copy()

  # Keep track of which new columns we add. Each column name will match each user-inputted attribute
  # (e.g. "attributes.llm.output_messages.0.message.content")
- new_cols: List[str] = []
+ new_cols: list[str] = []

  for attribute in attributes:
  parts = attribute.split(".")
@@ -58,7 +75,7 @@ def extract_nested_data_to_column(
  prefix_len = i

  if prefix_col is None:
- raise Exception("No such column found in DataFrame.")
+ raise ColumnNotFoundError(attribute)

  # 2) The remainder after the prefix
  remainder = ".".join(parts[prefix_len:])
@@ -68,13 +85,14 @@ def extract_nested_data_to_column(
  row: pd.Series,
  prefix_col: str = prefix_col,
  remainder: str = remainder,
- ) -> Any:
+ ) -> object:
  val = row[prefix_col]
  try:
  result = _introspect_arize_attribute(val, remainder)
- return result if result is not None else np.nan
  except Exception:
  return np.nan
+ else:
+ return result if result is not None else np.nan

  result_df[attribute] = result_df.apply(
  apply_introspect_arize_attribute, axis=1
@@ -101,9 +119,9 @@ def extract_nested_data_to_column(
  return result_df


- def _introspect_arize_attribute(value: Any, attribute: str) -> Any:
- """
- Recursively drill into `value` following the dot-delimited `attribute`.
+ def _introspect_arize_attribute(value: object, attribute: str) -> object:
+ """Recursively drill into `value` following the dot-delimited `attribute`.
+
  Example:
  value: [{'message.role': 'assistant', 'message.content': 'The capital of China is Beijing.'}]
  attribute: "0.message.content"
@@ -124,8 +142,8 @@ def _introspect_arize_attribute(value: Any, attribute: str) -> Any:


  def _introspect_arize_attribute_parts(
- current_value: Any, attribute_parts_unprocessed: List[str]
- ) -> Any:
+ current_value: object, attribute_parts_unprocessed: list[str]
+ ) -> object:
  # If no more parts, we return whatever we have
  if not attribute_parts_unprocessed:
  return current_value
@@ -148,10 +166,9 @@


  def _parse_value(
- current_value: Any, attribute_parts_unprocessed: List[str]
- ) -> Tuple[Any, int]:
- """
- Attempt to parse out the next value from `current_value` using the earliest parts:
+ current_value: object, attribute_parts_unprocessed: list[str]
+ ) -> tuple[object, int]:
+ """Attempt to parse out the next value from `current_value` using the earliest parts.

  1) If `attribute_parts_unprocessed[0]` is an integer index and `current_value` is a list/tuple,
  index into it.
@@ -164,7 +181,6 @@ def _parse_value(
  - parsed_value: the found value or None if not found
  - num_parts_processed: how many parts were processed (1 or more)
  """
-
  if not attribute_parts_unprocessed:
  return (None, 0)

@@ -182,35 +198,31 @@
  if isinstance(current_value, (list, tuple)):
  if 0 <= idx < len(current_value):
  return (current_value[idx], num_parts_processed)
- else:
- return (None, num_parts_processed)
- else:
  return (None, num_parts_processed)
+ return (None, num_parts_processed)

  # 2) Try dict approach
  if isinstance(current_value, dict):
  # a) direct match
  if key in current_value:
  return (current_value[key], num_parts_processed)
- else:
- # b) try combining multiple parts to handle dotted key
- for num_parts_processed in range(
- 1, len(attribute_parts_unprocessed)
- ):
- key += "." + attribute_parts_unprocessed[num_parts_processed]
- if key in current_value:
- return (
- current_value[key],
- num_parts_processed + 1,
- )
- return (None, num_parts_processed)
+ # b) try combining multiple parts to handle dotted key
+ for num_parts_processed in range(1, len(attribute_parts_unprocessed)):
+ key += "." + attribute_parts_unprocessed[num_parts_processed]
+ if key in current_value:
+ return (
+ current_value[key],
+ num_parts_processed + 1,
+ )
+ return (None, num_parts_processed)

  # If we get here, we couldn't handle it (not a list or dict or mismatch)
  return (None, num_parts_processed)


- def _ensure_deserialized(val: Any) -> Any:
- """
+ def _ensure_deserialized(val: object) -> object:
+ """Ensure value is deserialized from numpy array or JSON string.
+
  1) If `val` is a numpy array, convert to a Python list.
  2) If `val` is a string, attempt to parse as JSON.
  3) Otherwise return as-is.
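
A short usage sketch of `extract_nested_data_to_column`, following the example attribute path in the docstring above (the re-export from `arize.utils.online_tasks` is shown in the `__init__.py` diff; the data here is made up):

```python
# Usage sketch based on the docstring's example attribute path; data is illustrative.
import pandas as pd
from arize.utils.online_tasks import extract_nested_data_to_column

df = pd.DataFrame(
    {
        "attributes.llm.output_messages": [
            [{"message.role": "assistant", "message.content": "Beijing."}],
            [{"message.role": "assistant", "message.content": "Paris."}],
        ]
    }
)

result = extract_nested_data_to_column(
    attributes=["attributes.llm.output_messages.0.message.content"], df=df
)
# The extracted values land in a new column named after the full attribute path.
print(result["attributes.llm.output_messages.0.message.content"].tolist())
```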
arize/utils/openinference_conversion.py CHANGED
@@ -1,11 +1,24 @@
+ """OpenInference data conversion utilities for column transformations."""
+
  import json
+ import logging

  import pandas as pd

  from arize.constants.openinference import OPEN_INFERENCE_JSON_STR_TYPES

+ logger = logging.getLogger(__name__)
+

  def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:
+ """Convert datetime columns in a DataFrame to milliseconds since epoch.
+
+ Args:
+ df: The pandas DataFrame to convert.
+
+ Returns:
+ The DataFrame with datetime columns converted to integers.
+ """
  for col in df.select_dtypes(
  include=["datetime64[ns]", "datetime64[ns, UTC]"]
  ):
@@ -14,6 +27,14 @@ def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:


  def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:
+ """Convert boolean columns in a DataFrame to string type.
+
+ Args:
+ df: The pandas DataFrame to convert.
+
+ Returns:
+ The DataFrame with boolean columns converted to strings.
+ """
  for col in df.columns:
  if df[col].dtype == "bool":
  df[col] = df[col].astype("string")
@@ -21,33 +42,51 @@ def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:


  def convert_default_columns_to_json_str(df: pd.DataFrame) -> pd.DataFrame:
+ """Convert dictionary values in specific columns to JSON strings.
+
+ Args:
+ df: The pandas DataFrame to convert.
+
+ Returns:
+ The DataFrame with dictionaries in eligible columns converted to JSON strings.
+ """
  for col in df.columns:
  if _should_convert_json(col):
  try:
  df[col] = df[col].apply(
  lambda x: json.dumps(x) if isinstance(x, dict) else x
  )
- except Exception:
+ except Exception as e:
+ logger.debug(
+ f"Failed to convert column '{col}' to JSON string: {e}"
+ )
  continue
  return df


  def convert_json_str_to_dict(df: pd.DataFrame) -> pd.DataFrame:
+ """Convert JSON string values in specific columns to Python dictionaries.
+
+ Args:
+ df: The pandas DataFrame to convert.
+
+ Returns:
+ The DataFrame with JSON strings in eligible columns converted to dictionaries.
+ """
  for col in df.columns:
  if _should_convert_json(col):
  try:
  df[col] = df[col].apply(
  lambda x: json.loads(x) if isinstance(x, str) else x
  )
- except Exception:
+ except Exception as e:
+ logger.debug(f"Failed to parse column '{col}' as JSON: {e}")
  continue
  return df


  def _should_convert_json(col_name: str) -> bool:
- """
- Check if a column should be converted to/from a JSON string/PythonDictionary.
- """
+ """Check if a column should be converted to/from a JSON string/PythonDictionary."""
  is_eval_metadata = col_name.startswith("eval.") and col_name.endswith(
  ".metadata"
  )
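
A brief usage sketch of the two dtype helpers (assuming they are importable from `arize.utils.openinference_conversion`, mirroring the file path):

```python
# Usage sketch; import path assumed from the file location.
import pandas as pd
from arize.utils.openinference_conversion import (
    convert_boolean_columns_to_str,
    convert_datetime_columns_to_int,
)

df = pd.DataFrame(
    {
        "start_time": pd.to_datetime(["2024-05-01T12:00:00"]),
        "is_root_span": [True],
    }
)

df = convert_datetime_columns_to_int(df)  # datetime columns -> milliseconds since epoch
df = convert_boolean_columns_to_str(df)   # bool columns -> pandas "string" dtype
print(df.dtypes)
```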
arize/utils/proto.py CHANGED
@@ -1,3 +1,5 @@
+ """Protocol buffer schema utilities for tracing data."""
+
  # type: ignore[pb2]
  from arize._generated.protocol.rec import public_pb2 as pb2

@@ -5,6 +7,14 @@ from arize._generated.protocol.rec import public_pb2 as pb2
  def get_pb_schema_tracing(
  project_name: str,
  ) -> pb2.Schema:
+ """Create a protobuf schema for LLM tracing data.
+
+ Args:
+ project_name: The name of the project/model.
+
+ Returns:
+ A configured pb2.Schema object for tracing environment.
+ """
  s = pb2.Schema()
  s.constants.model_id = project_name
  s.constants.environment = pb2.Schema.Environment.TRACING
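
A short usage sketch of the documented helper (assuming it is importable from `arize.utils.proto`, mirroring the file path):

```python
# Usage sketch; import path assumed from the file location.
from arize.utils.proto import get_pb_schema_tracing

schema = get_pb_schema_tracing(project_name="my-llm-project")
# The returned schema carries the project id and the TRACING environment constant.
print(schema.constants.model_id)
print(schema.constants.environment)
```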
arize/utils/size.py CHANGED
@@ -1,13 +1,15 @@
+ """Size calculation utilities for payloads and data structures."""
+
  import sys
- from typing import Any, Dict, List
+ from typing import Any

  import pandas as pd


- def get_payload_size_mb(payload: List[Dict[str, Any]] | pd.DataFrame) -> float:
+ def get_payload_size_mb(payload: list[dict[str, Any]] | pd.DataFrame) -> float:
  """Return approximate size of payload in MB."""
  if isinstance(payload, pd.DataFrame):
- # memory_usage(deep=True) sums all columns memory footprint
+ # memory_usage(deep=True) sums all columns' memory footprint
  size_bytes = payload.memory_usage(deep=True).sum()
  elif isinstance(payload, list):
  # sys.getsizeof() gives shallow size; sum all element sizes for rough total
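
A quick usage sketch of `get_payload_size_mb` on both supported payload shapes (import path assumed from the file location):

```python
# Usage sketch; import path assumed from the file location.
import pandas as pd
from arize.utils.size import get_payload_size_mb

records = [{"id": i, "text": "hello " * 10} for i in range(1000)]
df = pd.DataFrame(records)

# List payloads use a shallow sys.getsizeof() sum; DataFrames use deep memory_usage().
print(get_payload_size_mb(records))
print(get_payload_size_mb(df))
```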
arize/version.py CHANGED
@@ -1 +1,3 @@
- __version__ = "8.0.0a21"
+ """Version information for the Arize SDK."""
+
+ __version__ = "8.0.0a23"
{arize-8.0.0a21.dist-info → arize-8.0.0a23.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: arize
- Version: 8.0.0a21
+ Version: 8.0.0a23
  Summary: A helper library to interact with Arize AI APIs
  Project-URL: Homepage, https://arize.com
  Project-URL: Documentation, https://docs.arize.com/arize
@@ -9,8 +9,9 @@ Project-URL: Source, https://github.com/Arize-ai/client_python
  Project-URL: Changelog, https://github.com/Arize-ai/client_python/blob/main/CHANGELOG.md
  Author-email: Arize AI <support@arize.com>
  Maintainer-email: Arize AI <support@arize.com>
- License: BSD
- License-File: LICENSE.md
+ License: Apache-2.0
+ License-File: LICENSE
+ License-File: NOTICE
  Keywords: Arize,Evaluations,Explainability,LLM,Monitoring,Observability,Tracing
  Classifier: Development Status :: 3 - Alpha
  Classifier: Intended Audience :: Developers