arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. arize/__init__.py +9 -2
  2. arize/_client_factory.py +50 -0
  3. arize/_exporter/client.py +18 -17
  4. arize/_exporter/parsers/tracing_data_parser.py +9 -4
  5. arize/_exporter/validation.py +1 -1
  6. arize/_flight/client.py +37 -17
  7. arize/_generated/api_client/api/datasets_api.py +6 -6
  8. arize/_generated/api_client/api/experiments_api.py +6 -6
  9. arize/_generated/api_client/api/projects_api.py +3 -3
  10. arize/_lazy.py +61 -10
  11. arize/client.py +66 -50
  12. arize/config.py +175 -48
  13. arize/constants/config.py +1 -0
  14. arize/constants/ml.py +9 -16
  15. arize/constants/spans.py +5 -10
  16. arize/datasets/client.py +45 -28
  17. arize/datasets/errors.py +1 -1
  18. arize/datasets/validation.py +2 -2
  19. arize/embeddings/auto_generator.py +16 -9
  20. arize/embeddings/base_generators.py +15 -9
  21. arize/embeddings/cv_generators.py +2 -2
  22. arize/embeddings/errors.py +2 -2
  23. arize/embeddings/nlp_generators.py +8 -8
  24. arize/embeddings/tabular_generators.py +6 -6
  25. arize/exceptions/base.py +0 -52
  26. arize/exceptions/config.py +22 -0
  27. arize/exceptions/parameters.py +1 -330
  28. arize/exceptions/values.py +8 -5
  29. arize/experiments/__init__.py +4 -0
  30. arize/experiments/client.py +31 -18
  31. arize/experiments/evaluators/base.py +12 -9
  32. arize/experiments/evaluators/executors.py +16 -7
  33. arize/experiments/evaluators/rate_limiters.py +3 -1
  34. arize/experiments/evaluators/types.py +9 -7
  35. arize/experiments/evaluators/utils.py +7 -5
  36. arize/experiments/functions.py +128 -58
  37. arize/experiments/tracing.py +4 -1
  38. arize/experiments/types.py +34 -31
  39. arize/logging.py +54 -33
  40. arize/ml/batch_validation/errors.py +10 -1004
  41. arize/ml/batch_validation/validator.py +351 -291
  42. arize/ml/bounded_executor.py +25 -6
  43. arize/ml/casting.py +51 -33
  44. arize/ml/client.py +43 -35
  45. arize/ml/proto.py +21 -22
  46. arize/ml/stream_validation.py +64 -27
  47. arize/ml/surrogate_explainer/mimic.py +18 -10
  48. arize/ml/types.py +27 -67
  49. arize/pre_releases.py +10 -6
  50. arize/projects/client.py +9 -4
  51. arize/py.typed +0 -0
  52. arize/regions.py +11 -11
  53. arize/spans/client.py +125 -31
  54. arize/spans/columns.py +32 -36
  55. arize/spans/conversion.py +12 -11
  56. arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
  57. arize/spans/validation/annotations/value_validation.py +11 -14
  58. arize/spans/validation/common/argument_validation.py +3 -3
  59. arize/spans/validation/common/dataframe_form_validation.py +7 -7
  60. arize/spans/validation/common/value_validation.py +11 -14
  61. arize/spans/validation/evals/dataframe_form_validation.py +4 -4
  62. arize/spans/validation/evals/evals_validation.py +6 -6
  63. arize/spans/validation/evals/value_validation.py +1 -1
  64. arize/spans/validation/metadata/argument_validation.py +1 -1
  65. arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
  66. arize/spans/validation/metadata/value_validation.py +23 -1
  67. arize/spans/validation/spans/dataframe_form_validation.py +2 -2
  68. arize/spans/validation/spans/spans_validation.py +6 -6
  69. arize/utils/arrow.py +38 -2
  70. arize/utils/cache.py +2 -2
  71. arize/utils/dataframe.py +4 -4
  72. arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
  73. arize/utils/openinference_conversion.py +10 -10
  74. arize/utils/proto.py +0 -1
  75. arize/utils/types.py +6 -6
  76. arize/version.py +1 -1
  77. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
  78. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
  79. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
  80. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
  81. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
arize/spans/validation/common/value_validation.py CHANGED
@@ -11,6 +11,7 @@ from arize.constants.ml import (
     MAX_FUTURE_YEARS_FROM_CURRENT_TIME,
     MAX_PAST_YEARS_FROM_CURRENT_TIME,
 )
+from arize.exceptions.base import ValidationError
 from arize.exceptions.parameters import InvalidModelVersion, InvalidProjectName
 from arize.spans.columns import (
     SPAN_END_TIME_COL,
@@ -56,7 +57,7 @@ def check_invalid_model_version(
         model_version: The optional model version to validate.

     Returns:
-        List of validation errors if model version is invalid (empty if valid or None).
+        List of validation errors if model version is invalid (empty if valid or :obj:`None`).
     """
     if model_version is None:
         return []
@@ -73,7 +74,7 @@ def check_string_column_value_length(
     max_len: int,
     is_required: bool,
     must_be_json: bool = False,
-) -> list[InvalidMissingValueInColumn | InvalidStringLengthInColumn]:
+) -> list[ValidationError]:
     """Validate string column values are within length bounds and optionally valid JSON.

     Args:
@@ -90,7 +91,7 @@ def check_string_column_value_length(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -129,7 +130,7 @@ def check_string_column_allowed_values(
     col_name: str,
     allowed_values: list[str],
     is_required: bool,
-) -> list[InvalidMissingValueInColumn | InvalidStringValueNotAllowedInColumn]:
+) -> list[ValidationError]:
     """Validate that string column values are within allowed values.

     Args:
@@ -144,7 +145,7 @@ def check_string_column_allowed_values(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
@@ -177,7 +178,7 @@ def check_string_column_allowed_values(
 def check_float_column_valid_numbers(
     df: pd.DataFrame,
     col_name: str,
-) -> list[InvalidFloatValueInColumn]:
+) -> list[ValidationError]:
     """Check that float column contains only finite numbers, no infinity values.

     Args:
@@ -201,11 +202,7 @@ def check_float_column_valid_numbers(

 def check_value_columns_start_end_time(
     df: pd.DataFrame,
-) -> list[
-    InvalidMissingValueInColumn
-    | InvalidTimestampValueInColumn
-    | InvalidStartAndEndTimeValuesInColumn
-]:
+) -> list[ValidationError]:
     """Validate start and end time columns for timestamps and logical ordering.

     Args:
@@ -214,7 +211,7 @@ def check_value_columns_start_end_time(
     Returns:
         List of validation errors for missing values, invalid timestamps, or start > end.
     """
-    errors = []
+    errors: list[ValidationError] = []
     errors += check_value_timestamp(
         df=df,
         col_name=SPAN_START_TIME_COL.name,
@@ -243,7 +240,7 @@ def check_value_timestamp(
     df: pd.DataFrame,
     col_name: str,
     is_required: bool,
-) -> list[InvalidMissingValueInColumn | InvalidTimestampValueInColumn]:
+) -> list[ValidationError]:
     """Validate timestamp column values are within reasonable bounds.

     Args:
@@ -258,7 +255,7 @@ def check_value_timestamp(
     if col_name not in df.columns:
         return []

-    errors = []
+    errors: list[ValidationError] = []
     if is_required and df[col_name].isnull().any():
         errors.append(
             InvalidMissingValueInColumn(
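The pattern running through this file is set up by the new import: return annotations that previously enumerated unions of concrete error classes are widened to the shared ValidationError base, and each accumulator list gains an explicit element type. A minimal sketch of why that helps, using simplified stand-in classes rather than the real ones in arize.exceptions:

from __future__ import annotations


class ValidationError(Exception):
    """Stand-in for the shared base in arize.exceptions.base."""


class InvalidMissingValueInColumn(ValidationError):
    pass


class InvalidTimestampValueInColumn(ValidationError):
    pass


def check_example(values: list[object]) -> list[ValidationError]:
    # Annotating against the base lets one list collect results from
    # heterogeneous checks without spelling out every union member,
    # and adding a new error class never forces an annotation change.
    errors: list[ValidationError] = []
    if any(v is None for v in values):
        errors.append(InvalidMissingValueInColumn("missing value"))
    return errors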
arize/spans/validation/evals/dataframe_form_validation.py CHANGED
@@ -27,10 +27,10 @@ def log_info_dataframe_extra_column_names(
     """Logs informational message about columns that don't follow evaluation naming conventions.

     Args:
-        df: DataFrame to check for extra column names, or None.
+        df: DataFrame to check for extra column names, or :obj:`None`.

     Returns:
-        None.
+        :obj:`None`.
     """
     if df is None:
         return
@@ -57,13 +57,13 @@ def log_info_dataframe_extra_column_names(
 def check_dataframe_column_content_type(
     df: pd.DataFrame,
 ) -> list[InvalidDataFrameColumnContentTypes]:
-    """Validates that evaluation DataFrame columns contain expected data types.
+    """Validates that evaluation :class:`pandas.DataFrame` columns contain expected data types.

     Checks that label columns contain strings, score columns contain numbers,
     and explanation columns contain strings.

     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.

     Returns:
         List of validation errors for columns with incorrect types.
arize/spans/validation/evals/evals_validation.py CHANGED
@@ -55,13 +55,13 @@ def validate_argument_types(
 def validate_dataframe_form(
     evals_dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
-    """Validate the structure and form of an evaluations DataFrame.
+    """Validate the structure and form of an evaluations :class:`pandas.DataFrame`.

     Args:
-        evals_dataframe: The DataFrame containing evaluation data to validate.
+        evals_dataframe: The :class:`pandas.DataFrame` containing evaluation data to validate.

     Returns:
-        List of validation errors found in the DataFrame structure.
+        List of validation errors found in the :class:`pandas.DataFrame` structure.
     """
     df_validation.log_info_dataframe_extra_column_names(evals_dataframe)
     checks = chain(
@@ -84,15 +84,15 @@ def validate_values(
     project_name: str,
     model_version: str | None = None,
 ) -> list[ValidationError]:
-    """Validate the values within an evaluations DataFrame.
+    """Validate the values within an evaluations :class:`pandas.DataFrame`.

     Args:
-        evals_dataframe: The DataFrame containing evaluation data to validate.
+        evals_dataframe: The :class:`pandas.DataFrame` containing evaluation data to validate.
         project_name: The project name associated with the evaluations.
         model_version: Optional model version. Defaults to None.

     Returns:
-        List of validation errors found in DataFrame values.
+        List of validation errors found in :class:`pandas.DataFrame` values.
     """
     checks = chain(
         # Common
arize/spans/validation/evals/value_validation.py CHANGED
@@ -40,7 +40,7 @@ def check_eval_cols(
     Returns:
         List of validation errors found in evaluation columns.
     """
-    checks = []
+    checks: list[list[ValidationError]] = []
     for col in dataframe.columns:
         if col.endswith(EVAL_LABEL_SUFFIX):
             checks.append(
arize/spans/validation/metadata/argument_validation.py CHANGED
@@ -39,7 +39,7 @@ def validate_argument_types(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []

     # Check metadata_dataframe type
     if not isinstance(metadata_dataframe, pd.DataFrame):
arize/spans/validation/metadata/dataframe_form_validation.py CHANGED
@@ -7,7 +7,7 @@ from arize.spans.columns import SPAN_SPAN_ID_COL


 class MetadataFormError(ValidationError):
-    """Raised when metadata DataFrame structure or format is invalid."""
+    """Raised when metadata :class:`pandas.DataFrame` structure or format is invalid."""

     def __init__(self, message: str, resolution: str) -> None:
         """Initialize the exception with metadata form error context.
@@ -41,7 +41,7 @@ def validate_dataframe_form(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []

     # Check for empty dataframe
     if metadata_dataframe.empty:
arize/spans/validation/metadata/value_validation.py CHANGED
@@ -34,6 +34,28 @@ class MetadataValueError(ValidationError):
         return f"{self.message} {self.resolution}"


+class InvalidPatchDocumentFormat(ValidationError):
+    """Raised when patch document format is invalid or cannot be parsed."""
+
+    def __init__(self, row_idx: int, message: str) -> None:
+        """Initialize the exception with patch document format error context.
+
+        Args:
+            row_idx: The row index where the invalid patch was found.
+            message: Detailed error message describing the format issue.
+        """
+        self.row_idx = row_idx
+        self.message = message
+
+    def __repr__(self) -> str:
+        """Return a string representation for debugging and logging."""
+        return "Invalid_Patch_Document_Format"
+
+    def error_message(self) -> str:
+        """Return the error message for this exception."""
+        return f"Row {self.row_idx}: {self.message}"
+
+
 def calculate_json_depth(obj: object, current_depth: int = 1) -> int:
     """Calculate the maximum nesting depth of a JSON object.

@@ -67,7 +89,7 @@ def validate_values(
     Returns:
         A list of validation errors, empty if none found
     """
-    errors = []
+    errors: list[ValidationError] = []

     # Skip validation if span_id column is not present
     if SPAN_SPAN_ID_COL.name not in metadata_dataframe.columns:
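InvalidPatchDocumentFormat is the one genuinely new class in this file. From the definition above, its behavior is easy to exercise; the raising context shown here is an assumption (in the SDK it would come from patch-document validation):

class ValidationError(Exception):
    """Simplified stand-in for arize.exceptions.base.ValidationError."""


class InvalidPatchDocumentFormat(ValidationError):
    """Mirrors the class added in the diff above."""

    def __init__(self, row_idx: int, message: str) -> None:
        self.row_idx = row_idx
        self.message = message

    def __repr__(self) -> str:
        return "Invalid_Patch_Document_Format"

    def error_message(self) -> str:
        return f"Row {self.row_idx}: {self.message}"


err = InvalidPatchDocumentFormat(row_idx=3, message="patch is not a JSON object")
print(repr(err))            # Invalid_Patch_Document_Format
print(err.error_message())  # Row 3: patch is not a JSON object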
arize/spans/validation/spans/dataframe_form_validation.py CHANGED
@@ -50,13 +50,13 @@ def log_info_dataframe_extra_column_names(
 def check_dataframe_column_content_type(
     df: pd.DataFrame,
 ) -> list[InvalidDataFrameColumnContentTypes]:
-    """Validates that span DataFrame columns contain data types matching Open Inference Specification.
+    """Validates span :class:`pandas.DataFrame` columns match OpenInference types.

     Checks that columns have appropriate data types: lists of dicts, dicts, numeric,
     boolean, timestamp, JSON strings, or plain strings based on column specifications.

     Args:
-        df: The DataFrame to validate.
+        df: The :class:`pandas.DataFrame` to validate.

     Returns:
         List of validation errors for columns with incorrect types.
arize/spans/validation/spans/spans_validation.py CHANGED
@@ -56,13 +56,13 @@ def validate_argument_types(
 def validate_dataframe_form(
     spans_dataframe: pd.DataFrame,
 ) -> list[ValidationError]:
-    """Validate the structure and form of a spans DataFrame.
+    """Validate the structure and form of a spans :class:`pandas.DataFrame`.

     Args:
-        spans_dataframe: The DataFrame containing spans data to validate.
+        spans_dataframe: The :class:`pandas.DataFrame` containing spans data to validate.

     Returns:
-        List of validation errors found in the DataFrame structure.
+        List of validation errors found in the :class:`pandas.DataFrame` structure.
     """
     df_validation.log_info_dataframe_extra_column_names(spans_dataframe)
     checks = chain(
@@ -88,15 +88,15 @@ def validate_values(
     project_name: str,
     model_version: str | None = None,
 ) -> list[ValidationError]:
-    """Validate the values within a spans DataFrame.
+    """Validate the values within a spans :class:`pandas.DataFrame`.

     Args:
-        spans_dataframe: The DataFrame containing spans data to validate.
+        spans_dataframe: The :class:`pandas.DataFrame` containing spans data to validate.
         project_name: The project name associated with the spans.
         model_version: Optional model version. Defaults to None.

     Returns:
-        List of validation errors found in DataFrame values.
+        List of validation errors found in :class:`pandas.DataFrame` values.
     """
     checks = chain(
         # Common
arize/utils/arrow.py CHANGED
@@ -1,6 +1,5 @@
 """Apache Arrow utilities for data serialization and file operations."""

-# type: ignore[pb2]
 from __future__ import annotations

 import base64
@@ -38,7 +37,7 @@ def post_arrow_table(
         pa_table: The PyArrow table containing the data.
         proto_schema: The protobuf schema for the data.
         headers: HTTP headers for the request.
-        timeout: Request timeout in seconds, or None for no timeout.
+        timeout: Request timeout in seconds, or :obj:`None` for no timeout.
         verify: Whether to verify SSL certificates.
         max_chunksize: Maximum chunk size for splitting large tables.
         tmp_dir: Temporary directory for serialization. Defaults to "".
@@ -124,6 +123,18 @@ def post_arrow_table(
 def _append_to_pyarrow_metadata(
     pa_schema: pa.Schema, new_metadata: dict[str, Any]
 ) -> object:
+    """Append metadata to a PyArrow schema without overwriting existing keys.
+
+    Args:
+        pa_schema: The PyArrow schema to add metadata to.
+        new_metadata: Dictionary of metadata key-value pairs to append.
+
+    Returns:
+        pa.Schema: A new PyArrow schema with the merged metadata.
+
+    Raises:
+        KeyError: If any keys in new_metadata conflict with existing schema metadata.
+    """
     # Ensure metadata is handled correctly, even if initially None.
     metadata = pa_schema.metadata
     if metadata is None:
@@ -145,6 +156,14 @@ def _append_to_pyarrow_metadata(
 def _write_arrow_file(
     path: str, pa_table: pa.Table, pa_schema: pa.Schema, max_chunksize: int
 ) -> None:
+    """Write a PyArrow table to an Arrow IPC file with specified schema and chunk size.
+
+    Args:
+        path: The file path where the Arrow file will be written.
+        pa_table: The PyArrow table containing the data to write.
+        pa_schema: The PyArrow schema to use for the file.
+        max_chunksize: Maximum number of rows per record batch chunk.
+    """
     with (
         pa.OSFile(path, mode="wb") as sink,
         pa.ipc.RecordBatchStreamWriter(sink, pa_schema) as writer,
@@ -153,6 +172,15 @@ def _write_arrow_file(


 def _maybe_log_project_url(response: requests.Response) -> None:
+    """Attempt to extract and log the Arize project URL from an HTTP response.
+
+    Args:
+        response: The HTTP response object from an Arize API request.
+
+    Notes:
+        Logs success message with URL if extraction succeeds, or warning if it fails.
+        This function never raises exceptions.
+    """
     try:
         url = get_arize_project_url(response)
         if url:
@@ -176,6 +204,14 @@ def _mktemp_in(directory: str) -> str:


 def _filesize(path: str) -> int:
+    """Get the size of a file in bytes.
+
+    Args:
+        path: The file path to check.
+
+    Returns:
+        int: The file size in bytes, or -1 if the file cannot be accessed.
+    """
     try:
         return os.path.getsize(path)
     except Exception:
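The new docstring on _append_to_pyarrow_metadata pins down a contract worth noting: metadata is merged, never overwritten, and key collisions raise KeyError. A sketch of those semantics built only on public pyarrow APIs; this illustrates the documented behavior and is not the SDK's internal implementation:

import pyarrow as pa


def append_metadata(schema: pa.Schema, new_metadata: dict[str, str]) -> pa.Schema:
    existing = schema.metadata or {}
    # pyarrow stores schema metadata as bytes keys and values.
    encoded = {k.encode(): v.encode() for k, v in new_metadata.items()}
    conflicts = existing.keys() & encoded.keys()
    if conflicts:
        raise KeyError(f"metadata keys already present: {conflicts}")
    return schema.with_metadata({**existing, **encoded})


schema = pa.schema([pa.field("x", pa.int64())]).with_metadata({"origin": "test"})
merged = append_metadata(schema, {"sdk_version": "8.0.0b4"})
print(merged.metadata)  # {b'origin': b'test', b'sdk_version': b'8.0.0b4'}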
arize/utils/cache.py CHANGED
@@ -31,7 +31,7 @@ def load_cached_resource(
         format: File format for cached data. Defaults to "parquet".

     Returns:
-        The cached DataFrame if found and valid, None otherwise.
+        The cached :class:`pandas.DataFrame` if found and valid, :obj:`None` otherwise.
     """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
     filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
@@ -59,7 +59,7 @@ def cache_resource(
         resource: Resource type name (e.g., "dataset", "experiment").
         resource_id: Unique identifier for the resource.
         resource_updated_at: Optional timestamp of last resource update.
-        resource_data: DataFrame containing the resource data.
+        resource_data: :class:`pandas.DataFrame` containing the resource data.
         format: File format for cached data. Defaults to "parquet".
     """
     key = _get_cache_key(resource, resource_id, resource_updated_at)
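The two helpers above pair up as a write/read round trip keyed by resource type, id, and update timestamp. A hypothetical call sequence follows; the keyword names are taken from the docstrings and function bodies, but the exact signatures (argument order, required vs. defaulted) are assumptions:

import pandas as pd

from arize.utils.cache import cache_resource, load_cached_resource

df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})

# Write the resource to the on-disk cache (parquet by default).
cache_resource(
    cache_dir="/tmp/arize-cache",
    resource="dataset",
    resource_id="ds-123",
    resource_updated_at=None,
    resource_data=df,
)

# Read it back; returns the cached DataFrame if found and valid, None otherwise.
cached = load_cached_resource(
    cache_dir="/tmp/arize-cache",
    resource="dataset",
    resource_id="ds-123",
    resource_updated_at=None,
)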
arize/utils/dataframe.py CHANGED
@@ -9,10 +9,10 @@ from arize.ml.types import BaseSchema

 # Resets the dataframe index if it is not a RangeIndex
 def reset_dataframe_index(dataframe: pd.DataFrame) -> None:
-    """Reset the DataFrame index in-place if it is not a RangeIndex.
+    """Reset the :class:`pandas.DataFrame` index in-place if it is not a RangeIndex.

     Args:
-        dataframe: The pandas DataFrame to reset.
+        dataframe: The :class:`pandas.DataFrame` to reset.
     """
     if not isinstance(dataframe.index, pd.RangeIndex):
         drop = dataframe.index.name in dataframe.columns
@@ -25,10 +25,10 @@ def remove_extraneous_columns(
     column_list: list[str] | None = None,
     regex: str | None = None,
 ) -> pd.DataFrame:
-    """Filter DataFrame to keep only relevant columns based on schema, list, or regex.
+    """Filter :class:`pandas.DataFrame` to keep only relevant columns based on schema, list, or regex.

     Args:
-        df: The pandas DataFrame to filter.
+        df: The :class:`pandas.DataFrame` to filter.
         schema: Optional schema defining used columns. Defaults to None.
         column_list: Optional explicit list of columns to keep. Defaults to None.
         regex: Optional regex pattern to match column names. Defaults to None.
arize/utils/online_tasks/dataframe_preprocessor.py CHANGED
@@ -10,7 +10,7 @@ logger = logging.getLogger(__name__)


 class ColumnNotFoundError(Exception):
-    """Raised when a specified column is not found in the DataFrame."""
+    """Raised when a specified column is not found in the :class:`pandas.DataFrame`."""

     def __init__(self, attribute: str) -> None:
         """Initialize with the attribute that couldn't be mapped to a column.
@@ -27,13 +27,13 @@ class ColumnNotFoundError(Exception):
 def extract_nested_data_to_column(
     attributes: list[str], df: pd.DataFrame
 ) -> pd.DataFrame:
-    """Extract nested attributes from complex data structures into new DataFrame columns.
+    """Extract nested attributes from complex data structures into new :class:`pandas.DataFrame` columns.

     This function, used in Online Tasks, is typically run on data exported from Arize.
-    It prepares the DataFrame by extracting relevant attributes from complex, deeply
+    It prepares the :class:`pandas.DataFrame` by extracting relevant attributes from complex, deeply
     nested data structures, such as those found in LLM outputs or JSON-like records.
     It helps extract specific values from these nested structures by identifying the
-    longest matching column name in the DataFrame and recursively accessing the desired
+    longest matching column name in the :class:`pandas.DataFrame` and recursively accessing the desired
     attribute path within each row. This preprocessing step ensures that the extracted
     values are available as new columns, allowing evaluators to process and assess
     these values effectively.
@@ -81,9 +81,12 @@ def extract_nested_data_to_column(
         remainder = ".".join(parts[prefix_len:])

         # 3) Apply introspect row-by-row
+        # Type narrowing: prefix_col is guaranteed to be str after the None check above
+        prefix_col_str: str = prefix_col
+
         def apply_introspect_arize_attribute(
-            row: pd.Series,
-            prefix_col: str = prefix_col,
+            row: pd.Series,  # type: ignore[type-arg]
+            prefix_col: str = prefix_col_str,
             remainder: str = remainder,
         ) -> object:
             val = row[prefix_col]
@@ -94,8 +97,9 @@ def extract_nested_data_to_column(
             else:
                 return result if result is not None else np.nan

-        result_df[attribute] = result_df.apply(
-            apply_introspect_arize_attribute, axis=1
+        result_df[attribute] = result_df.apply(  # type: ignore[call-overload]
+            apply_introspect_arize_attribute,
+            axis=1,
         )

         new_cols.append(attribute)
@@ -127,7 +131,7 @@ def _introspect_arize_attribute(value: object, attribute: str) -> object:
         attribute: "0.message.content"
         Returns: 'The capital of China is Beijing.'

-    - Returns None immediately when a key or index is not found
+    - Returns :obj:`None` immediately when a key or index is not found
     - Handles integer parts for lists
     - Parses JSON strings
     - Converts NumPy arrays to lists
@@ -174,10 +178,10 @@ def _parse_value(
     2) Else if `current_value` is a dict, check if `attribute_parts_unprocessed[0]` is a key.
        If not found, try combining `attribute_parts_unprocessed[0] + '.' + attribute_parts_unprocessed[1]`...
        to handle dotted keys in the dict.
-    3) If none match, return (None, 1) to signal "not found, consume 1 part."
+    3) If none match, return (:obj:`None`, 1) to signal "not found, consume 1 part."

     Returns (parsed_value, num_parts_processed):
-    - parsed_value: the found value or None if not found
+    - parsed_value: the found value or :obj:`None` if not found
     - num_parts_processed: how many parts were processed (1 or more)
     """
     if not attribute_parts_unprocessed:
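The prefix_col_str change above is a small but common typing maneuver: after the enclosing None check, rebinding the value to an explicitly typed alias gives the row-wise closure a default argument that type checkers accept as str, and binding it as a default also captures the value eagerly rather than referencing the enclosing variable late. A self-contained sketch of the pattern with made-up data and column names:

import pandas as pd

df = pd.DataFrame({"attributes.llm": [{"content": "hi"}, {"content": "ok"}]})

prefix_col: str | None = "attributes.llm"  # in the SDK this comes from a column search
if prefix_col is None:
    raise LookupError("no matching column")

prefix_col_str: str = prefix_col  # narrowed alias the closure can use as a default


def extract_content(row: pd.Series, col: str = prefix_col_str) -> object:
    # The default binds the current value of prefix_col_str at definition time.
    return row[col].get("content")


df["content"] = df.apply(extract_content, axis=1)
print(df["content"].tolist())  # ['hi', 'ok']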
arize/utils/openinference_conversion.py CHANGED
@@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)


 def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert datetime columns in a DataFrame to milliseconds since epoch.
+    """Convert datetime columns in a :class:`pandas.DataFrame` to milliseconds since epoch.

     Args:
-        df: The pandas DataFrame to convert.
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with datetime columns converted to integers.
+        The :class:`pandas.DataFrame` with datetime columns converted to integers.
     """
     for col in df.select_dtypes(
         include=["datetime64[ns]", "datetime64[ns, UTC]"]
@@ -27,13 +27,13 @@ def convert_datetime_columns_to_int(df: pd.DataFrame) -> pd.DataFrame:


 def convert_boolean_columns_to_str(df: pd.DataFrame) -> pd.DataFrame:
-    """Convert boolean columns in a DataFrame to string type.
+    """Convert boolean columns in a :class:`pandas.DataFrame` to string type.

     Args:
-        df: The pandas DataFrame to convert.
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with boolean columns converted to strings.
+        The :class:`pandas.DataFrame` with boolean columns converted to strings.
     """
     for col in df.columns:
         if df[col].dtype == "bool":
@@ -45,10 +45,10 @@ def convert_default_columns_to_json_str(df: pd.DataFrame) -> pd.DataFrame:
     """Convert dictionary values in specific columns to JSON strings.

     Args:
-        df: The pandas DataFrame to convert.
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with dictionaries in eligible columns converted to JSON strings.
+        The :class:`pandas.DataFrame` with dictionaries in eligible columns converted to JSON strings.
     """
     for col in df.columns:
         if _should_convert_json(col):
@@ -68,10 +68,10 @@ def convert_json_str_to_dict(df: pd.DataFrame) -> pd.DataFrame:
     """Convert JSON string values in specific columns to Python dictionaries.

     Args:
-        df: The pandas DataFrame to convert.
+        df: The :class:`pandas.DataFrame` to convert.

     Returns:
-        The DataFrame with JSON strings in eligible columns converted to dictionaries.
+        The :class:`pandas.DataFrame` with JSON strings in eligible columns converted to dictionaries.
     """
     for col in df.columns:
         if _should_convert_json(col):
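The contract stated by convert_datetime_columns_to_int's docstring (naive and UTC-aware datetime64 columns become milliseconds since epoch) can be checked in a few lines. The conversion body below follows that docstring and is an assumption, not the SDK source:

import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", "2024-01-02"], utc=True)})
for col in df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]):
    df[col] = df[col].astype("int64") // 1_000_000  # epoch ns -> epoch ms
print(df["ts"].tolist())  # [1704067200000, 1704153600000]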
arize/utils/proto.py CHANGED
@@ -1,6 +1,5 @@
 """Protocol buffer schema utilities for tracing data."""

-# type: ignore[pb2]
 from arize._generated.protocol.rec import public_pb2 as pb2

arize/utils/types.py CHANGED
@@ -43,7 +43,7 @@ def is_array_of(arr: Sequence[object], tp: T) -> bool:
     return isinstance(arr, np.ndarray) and all(isinstance(x, tp) for x in arr)


-def is_list_of(lst: Sequence[object], tp: T) -> bool:
+def is_list_of(lst: object, tp: T) -> bool:
     """Check if a value is a list with all elements of a specific type.

     Args:
@@ -70,10 +70,10 @@ def is_iterable_of(lst: Sequence[object], tp: T) -> bool:


 def is_dict_of(
-    d: dict[object, object],
-    key_allowed_types: T,
-    value_allowed_types: T = (),
-    value_list_allowed_types: T = (),
+    d: object,
+    key_allowed_types: type | tuple[type, ...],
+    value_allowed_types: type | tuple[type, ...] = (),
+    value_list_allowed_types: type | tuple[type, ...] = (),
 ) -> bool:
     """Method to check types are valid for dictionary.

@@ -98,7 +98,7 @@ def is_dict_of(
         and all(isinstance(k, key_allowed_types) for k in d)
         and all(
             isinstance(v, value_allowed_types)
-            or any(is_list_of(v, t) for t in value_list_allowed_types)
+            or any(is_list_of(v, t) for t in value_list_allowed_types)  # type: ignore[union-attr]
             for v in d.values()
             if value_allowed_types or value_list_allowed_types
         )
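With is_list_of and is_dict_of now typed against plain object, callers can probe completely unvalidated input without a cast, and isinstance does the narrowing at runtime. A simplified re-implementation to show the behavior; the SDK version differs in details (it iterates value_list_allowed_types and carries the type: ignore shown above):

def is_list_of(lst: object, tp: type | tuple[type, ...]) -> bool:
    return isinstance(lst, list) and all(isinstance(x, tp) for x in lst)


def is_dict_of(
    d: object,
    key_allowed_types: type | tuple[type, ...],
    value_allowed_types: type | tuple[type, ...] = (),
    value_list_allowed_types: type | tuple[type, ...] = (),
) -> bool:
    # Accepting object means no cast at the call site; isinstance narrows here.
    if not isinstance(d, dict):
        return False
    if not all(isinstance(k, key_allowed_types) for k in d):
        return False
    if not (value_allowed_types or value_list_allowed_types):
        return True
    return all(
        (bool(value_allowed_types) and isinstance(v, value_allowed_types))
        or (bool(value_list_allowed_types) and is_list_of(v, value_list_allowed_types))
        for v in d.values()
    )


print(is_dict_of({"a": [1, 2]}, str, value_list_allowed_types=int))  # True
print(is_dict_of({1: "x"}, str))                                     # False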
arize/version.py CHANGED
@@ -1,3 +1,3 @@
 """Version information for the Arize SDK."""

-__version__ = "8.0.0b1"
+__version__ = "8.0.0b4"