arize 8.0.0a21__py3-none-any.whl → 8.0.0a23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +17 -9
- arize/_exporter/client.py +55 -36
- arize/_exporter/parsers/tracing_data_parser.py +41 -30
- arize/_exporter/validation.py +3 -3
- arize/_flight/client.py +208 -77
- arize/_generated/api_client/__init__.py +30 -6
- arize/_generated/api_client/api/__init__.py +1 -0
- arize/_generated/api_client/api/datasets_api.py +864 -190
- arize/_generated/api_client/api/experiments_api.py +167 -131
- arize/_generated/api_client/api/projects_api.py +1197 -0
- arize/_generated/api_client/api_client.py +2 -2
- arize/_generated/api_client/configuration.py +42 -34
- arize/_generated/api_client/exceptions.py +2 -2
- arize/_generated/api_client/models/__init__.py +15 -4
- arize/_generated/api_client/models/dataset.py +10 -10
- arize/_generated/api_client/models/dataset_example.py +111 -0
- arize/_generated/api_client/models/dataset_example_update.py +100 -0
- arize/_generated/api_client/models/dataset_version.py +13 -13
- arize/_generated/api_client/models/datasets_create_request.py +16 -8
- arize/_generated/api_client/models/datasets_examples_insert_request.py +100 -0
- arize/_generated/api_client/models/datasets_examples_list200_response.py +106 -0
- arize/_generated/api_client/models/datasets_examples_update_request.py +102 -0
- arize/_generated/api_client/models/datasets_list200_response.py +10 -4
- arize/_generated/api_client/models/experiment.py +14 -16
- arize/_generated/api_client/models/experiment_run.py +108 -0
- arize/_generated/api_client/models/experiment_run_create.py +102 -0
- arize/_generated/api_client/models/experiments_create_request.py +16 -10
- arize/_generated/api_client/models/experiments_list200_response.py +10 -4
- arize/_generated/api_client/models/experiments_runs_list200_response.py +19 -5
- arize/_generated/api_client/models/{error.py → pagination_metadata.py} +13 -11
- arize/_generated/api_client/models/primitive_value.py +172 -0
- arize/_generated/api_client/models/problem.py +100 -0
- arize/_generated/api_client/models/project.py +99 -0
- arize/_generated/api_client/models/{datasets_list_examples200_response.py → projects_create_request.py} +13 -11
- arize/_generated/api_client/models/projects_list200_response.py +106 -0
- arize/_generated/api_client/rest.py +2 -2
- arize/_generated/api_client/test/test_dataset.py +4 -2
- arize/_generated/api_client/test/test_dataset_example.py +56 -0
- arize/_generated/api_client/test/test_dataset_example_update.py +52 -0
- arize/_generated/api_client/test/test_dataset_version.py +7 -2
- arize/_generated/api_client/test/test_datasets_api.py +27 -13
- arize/_generated/api_client/test/test_datasets_create_request.py +8 -4
- arize/_generated/api_client/test/{test_datasets_list_examples200_response.py → test_datasets_examples_insert_request.py} +19 -15
- arize/_generated/api_client/test/test_datasets_examples_list200_response.py +66 -0
- arize/_generated/api_client/test/test_datasets_examples_update_request.py +61 -0
- arize/_generated/api_client/test/test_datasets_list200_response.py +9 -3
- arize/_generated/api_client/test/test_experiment.py +2 -4
- arize/_generated/api_client/test/test_experiment_run.py +56 -0
- arize/_generated/api_client/test/test_experiment_run_create.py +54 -0
- arize/_generated/api_client/test/test_experiments_api.py +6 -6
- arize/_generated/api_client/test/test_experiments_create_request.py +9 -6
- arize/_generated/api_client/test/test_experiments_list200_response.py +9 -5
- arize/_generated/api_client/test/test_experiments_runs_list200_response.py +15 -5
- arize/_generated/api_client/test/test_pagination_metadata.py +53 -0
- arize/_generated/api_client/test/{test_error.py → test_primitive_value.py} +13 -14
- arize/_generated/api_client/test/test_problem.py +57 -0
- arize/_generated/api_client/test/test_project.py +58 -0
- arize/_generated/api_client/test/test_projects_api.py +59 -0
- arize/_generated/api_client/test/test_projects_create_request.py +54 -0
- arize/_generated/api_client/test/test_projects_list200_response.py +70 -0
- arize/_generated/api_client_README.md +43 -29
- arize/_generated/protocol/flight/flight_pb2.py +400 -0
- arize/_lazy.py +27 -19
- arize/client.py +269 -55
- arize/config.py +365 -116
- arize/constants/__init__.py +1 -0
- arize/constants/config.py +11 -4
- arize/constants/ml.py +6 -4
- arize/constants/openinference.py +2 -0
- arize/constants/pyarrow.py +2 -0
- arize/constants/spans.py +3 -1
- arize/datasets/__init__.py +1 -0
- arize/datasets/client.py +299 -84
- arize/datasets/errors.py +32 -2
- arize/datasets/validation.py +18 -8
- arize/embeddings/__init__.py +2 -0
- arize/embeddings/auto_generator.py +23 -19
- arize/embeddings/base_generators.py +89 -36
- arize/embeddings/constants.py +2 -0
- arize/embeddings/cv_generators.py +26 -4
- arize/embeddings/errors.py +27 -5
- arize/embeddings/nlp_generators.py +31 -12
- arize/embeddings/tabular_generators.py +32 -20
- arize/embeddings/usecases.py +12 -2
- arize/exceptions/__init__.py +1 -0
- arize/exceptions/auth.py +11 -1
- arize/exceptions/base.py +29 -4
- arize/exceptions/models.py +21 -2
- arize/exceptions/parameters.py +31 -0
- arize/exceptions/spaces.py +12 -1
- arize/exceptions/types.py +86 -7
- arize/exceptions/values.py +220 -20
- arize/experiments/__init__.py +1 -0
- arize/experiments/client.py +390 -286
- arize/experiments/evaluators/__init__.py +1 -0
- arize/experiments/evaluators/base.py +74 -41
- arize/experiments/evaluators/exceptions.py +6 -3
- arize/experiments/evaluators/executors.py +121 -73
- arize/experiments/evaluators/rate_limiters.py +106 -57
- arize/experiments/evaluators/types.py +34 -7
- arize/experiments/evaluators/utils.py +65 -27
- arize/experiments/functions.py +103 -101
- arize/experiments/tracing.py +52 -44
- arize/experiments/types.py +56 -31
- arize/logging.py +54 -22
- arize/models/__init__.py +1 -0
- arize/models/batch_validation/__init__.py +1 -0
- arize/models/batch_validation/errors.py +543 -65
- arize/models/batch_validation/validator.py +339 -300
- arize/models/bounded_executor.py +20 -7
- arize/models/casting.py +75 -29
- arize/models/client.py +326 -107
- arize/models/proto.py +95 -40
- arize/models/stream_validation.py +42 -14
- arize/models/surrogate_explainer/__init__.py +1 -0
- arize/models/surrogate_explainer/mimic.py +24 -13
- arize/pre_releases.py +43 -0
- arize/projects/__init__.py +1 -0
- arize/projects/client.py +129 -0
- arize/regions.py +40 -0
- arize/spans/__init__.py +1 -0
- arize/spans/client.py +130 -106
- arize/spans/columns.py +13 -0
- arize/spans/conversion.py +54 -38
- arize/spans/validation/__init__.py +1 -0
- arize/spans/validation/annotations/__init__.py +1 -0
- arize/spans/validation/annotations/annotations_validation.py +6 -4
- arize/spans/validation/annotations/dataframe_form_validation.py +13 -11
- arize/spans/validation/annotations/value_validation.py +35 -11
- arize/spans/validation/common/__init__.py +1 -0
- arize/spans/validation/common/argument_validation.py +33 -8
- arize/spans/validation/common/dataframe_form_validation.py +35 -9
- arize/spans/validation/common/errors.py +211 -11
- arize/spans/validation/common/value_validation.py +80 -13
- arize/spans/validation/evals/__init__.py +1 -0
- arize/spans/validation/evals/dataframe_form_validation.py +28 -8
- arize/spans/validation/evals/evals_validation.py +34 -4
- arize/spans/validation/evals/value_validation.py +26 -3
- arize/spans/validation/metadata/__init__.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +14 -5
- arize/spans/validation/metadata/dataframe_form_validation.py +26 -10
- arize/spans/validation/metadata/value_validation.py +24 -10
- arize/spans/validation/spans/__init__.py +1 -0
- arize/spans/validation/spans/dataframe_form_validation.py +34 -13
- arize/spans/validation/spans/spans_validation.py +35 -4
- arize/spans/validation/spans/value_validation.py +76 -7
- arize/types.py +293 -157
- arize/utils/__init__.py +1 -0
- arize/utils/arrow.py +31 -15
- arize/utils/cache.py +34 -6
- arize/utils/dataframe.py +19 -2
- arize/utils/online_tasks/__init__.py +2 -0
- arize/utils/online_tasks/dataframe_preprocessor.py +53 -41
- arize/utils/openinference_conversion.py +44 -5
- arize/utils/proto.py +10 -0
- arize/utils/size.py +5 -3
- arize/version.py +3 -1
- {arize-8.0.0a21.dist-info → arize-8.0.0a23.dist-info}/METADATA +4 -3
- arize-8.0.0a23.dist-info/RECORD +174 -0
- {arize-8.0.0a21.dist-info → arize-8.0.0a23.dist-info}/WHEEL +1 -1
- arize-8.0.0a23.dist-info/licenses/LICENSE +176 -0
- arize-8.0.0a23.dist-info/licenses/NOTICE +13 -0
- arize/_generated/protocol/flight/export_pb2.py +0 -61
- arize/_generated/protocol/flight/ingest_pb2.py +0 -365
- arize-8.0.0a21.dist-info/RECORD +0 -146
- arize-8.0.0a21.dist-info/licenses/LICENSE.md +0 -12
arize/models/client.py
CHANGED
@@ -1,11 +1,14 @@
+"""Client implementation for managing ML models in the Arize platform."""
+
 # type: ignore[pb2]
 from __future__ import annotations
 
 import copy
 import logging
 import time
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
+from arize._generated.protocol.rec import public_pb2 as pb2
 from arize._lazy import require
 from arize.constants.ml import (
     LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME,
@@ -43,6 +46,7 @@ from arize.types import (
     BaseSchema,
     CorpusSchema,
     Embedding,
+    EmbeddingColumnNames,
     Environments,
     LLMRunMetadata,
     Metrics,
@@ -64,12 +68,7 @@ if TYPE_CHECKING:
     import requests
     from requests_futures.sessions import FuturesSession
 
-    from arize._generated.protocol.rec import public_pb2 as pb2
     from arize.config import SDKConfiguration
-    from arize.types import (
-        EmbeddingColumnNames,
-        Schema,
-    )
 
 
 logger = logging.getLogger(__name__)
@@ -96,7 +95,14 @@ _MIMIC_EXTRA = "mimic-explainer"
 
 
 class MLModelsClient:
-
+    """Client for logging ML model predictions and actuals to Arize."""
+
+    def __init__(self, *, sdk_config: SDKConfiguration) -> None:
+        """Initialize the ML models client with SDK configuration.
+
+        Args:
+            sdk_config: SDK configuration containing API endpoints and credentials.
+        """
         self._sdk_config = sdk_config
 
         # internal cache for the futures session
@@ -114,21 +120,86 @@ class MLModelsClient:
         prediction_timestamp: int | None = None,
         prediction_label: PredictionLabelTypes | None = None,
         actual_label: ActualLabelTypes | None = None,
-        features:
+        features: dict[str, str | bool | float | int | list[str] | TypedValue]
         | None = None,
-        embedding_features:
-        shap_values:
-        tags:
+        embedding_features: dict[str, Embedding] | None = None,
+        shap_values: dict[str, float] | None = None,
+        tags: dict[str, str | bool | float | int | TypedValue] | None = None,
         batch_id: str | None = None,
         prompt: str | Embedding | None = None,
         response: str | Embedding | None = None,
         prompt_template: str | None = None,
         prompt_template_version: str | None = None,
         llm_model_name: str | None = None,
-        llm_params:
+        llm_params: dict[str, str | bool | float | int] | None = None,
         llm_run_metadata: LLMRunMetadata | None = None,
         timeout: float | None = None,
     ) -> cf.Future:
+        """Log a single model prediction or actual to Arize asynchronously.
+
+        This method sends a single prediction, actual, or both to Arize for ML monitoring.
+        The request is made asynchronously and returns a Future that can be used to check
+        the status or retrieve the response.
+
+        Args:
+            space_id: The space ID where the model resides.
+            model_name: A unique name to identify your model in the Arize platform.
+            model_type: The type of model. Supported types: BINARY, MULTI_CLASS, REGRESSION,
+                RANKING, OBJECT_DETECTION. Note: GENERATIVE_LLM is not supported; use the
+                spans module instead.
+            environment: The environment this data belongs to (PRODUCTION, TRAINING, or
+                VALIDATION).
+            model_version: Optional version identifier for the model.
+            prediction_id: Unique identifier for this prediction. If not provided, one
+                will be auto-generated for PRODUCTION environment.
+            prediction_timestamp: Unix timestamp (seconds) for when the prediction was made.
+                If not provided, the current time is used. Must be within 1 year in the
+                future and 2 years in the past from the current time.
+            prediction_label: The prediction output from your model. Type depends on
+                model_type (e.g., string for categorical, float for numeric).
+            actual_label: The ground truth label. Type depends on model_type.
+            features: Dictionary of feature name to feature value. Values can be str, bool,
+                float, int, list[str], or TypedValue.
+            embedding_features: Dictionary of embedding feature name to Embedding object.
+                Maximum 50 embeddings per record. Object detection models support only 1.
+            shap_values: Dictionary of feature name to SHAP value (float) for feature
+                importance/explainability.
+            tags: Dictionary of metadata tags. Tag names cannot end with "_shap" or be
+                reserved names. Values must be under 1000 characters (warning at 100).
+            batch_id: Required for VALIDATION environment; identifies the validation batch.
+            prompt: For generative models, the prompt text or embedding sent to the model.
+            response: For generative models, the response text or embedding from the model.
+            prompt_template: Template used to generate the prompt.
+            prompt_template_version: Version identifier for the prompt template.
+            llm_model_name: Name of the LLM model used (e.g., "gpt-4").
+            llm_params: Dictionary of LLM configuration parameters (e.g., temperature,
+                max_tokens).
+            llm_run_metadata: Metadata about the LLM run including token counts and latency.
+            timeout: Maximum time (in seconds) to wait for the request to complete.
+
+        Returns:
+            A concurrent.futures.Future object representing the async request. Call
+            .result() to block and retrieve the Response object, or check .done() for
+            completion status.
+
+        Raises:
+            ValueError: If model_type is GENERATIVE_LLM, or if validation environment is
+                missing batch_id, or if training/validation environment is missing
+                prediction or actual, or if timestamp is out of range, or if no data
+                is provided (must have prediction_label, actual_label, tags, or shap_values),
+                or if tag names end with "_shap" or exceed length limits.
+            MissingSpaceIDError: If space_id is not provided or empty.
+            MissingModelNameError: If model_name is not provided or empty.
+            InvalidValueType: If features, tags, or other parameters have incorrect types.
+            InvalidNumberOfEmbeddings: If more than 50 embedding features are provided.
+            KeyError: If tag names include reserved names.
+
+        Notes:
+            - Timestamps must be within 1 year future and 2 years past from current time
+            - Tag values are truncated at 1000 characters, with warnings at 100 characters
+            - For GENERATIVE_LLM models, use the spans module or OTEL tracing instead
+            - The Future returned can be monitored for request status asynchronously
+        """
         require(_STREAM_EXTRA, _STREAM_DEPS)
         from arize._generated.protocol.rec import public_pb2 as pb2
         from arize.models.proto import (
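
For orientation, here is a minimal usage sketch of the `log()` API documented above. How `client` (an `MLModelsClient`) is obtained is an assumption, since client construction sits outside this diff, and the argument values are illustrative:

```python
# Hedged sketch of MLModelsClient.log() based on the docstring above.
# `client` is assumed to be an MLModelsClient instance from the SDK.
from arize.types import Environments, ModelTypes

future = client.log(
    space_id="YOUR_SPACE_ID",
    model_name="fraud-detector",
    model_type=ModelTypes.BINARY,          # GENERATIVE_LLM is rejected here
    environment=Environments.PRODUCTION,
    prediction_id="pred-123",
    prediction_label="fraud",
    features={"amount": 120.5, "merchant": "acme"},
    tags={"region": "emea"},
)
response = future.result()  # block on the returned concurrent.futures.Future
```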
@@ -179,16 +250,15 @@ class MLModelsClient:
                 _validate_mapping_key(feat_name, "features")
                 if is_list_of(feat_value, str):
                     continue
-
-
-
-
-
-
-
-
-
-                )
+                val = convert_element(feat_value)
+                if val is not None and not isinstance(
+                    val, (str, bool, float, int)
+                ):
+                    raise InvalidValueType(
+                        f"feature '{feat_name}'",
+                        feat_value,
+                        "one of: bool, int, float, str",
+                    )
 
             # Validate embedding_features type
             if embedding_features:
@@ -247,7 +317,7 @@ class MLModelsClient:
                         f"{MAX_TAG_LENGTH}. The tag {tag_name} with value {tag_value} has "
                         f"{len(str(val))} characters."
                     )
-
+                if len(str(val)) > MAX_TAG_LENGTH_TRUNCATION:
                     logger.warning(
                         get_truncation_warning_message(
                             "tags", MAX_TAG_LENGTH_TRUNCATION
@@ -304,7 +374,7 @@ class MLModelsClient:
         if embedding_features or prompt or response:
            # NOTE: Deep copy is necessary to avoid side effects on the original input dictionary
             combined_embedding_features = (
-
+                dict(embedding_features.items())
                 if embedding_features
                 else {}
             )
@@ -453,7 +523,6 @@ class MLModelsClient:
             indexes=None,
         )
 
-    # TODO(Kiko): Handle sync argument
     def log_batch(
         self,
         *,
@@ -466,12 +535,64 @@ class MLModelsClient:
         model_version: str = "",
         batch_id: str = "",
         validate: bool = True,
-        metrics_validation:
+        metrics_validation: list[Metrics] | None = None,
         surrogate_explainability: bool = False,
         timeout: float | None = None,
         tmp_dir: str = "",
-        sync: bool = False,
     ) -> requests.Response:
+        """Log a batch of model predictions and actuals to Arize from a pandas DataFrame.
+
+        This method uploads multiple records to Arize in a single batch operation using
+        Apache Arrow format for efficient transfer. The dataframe structure is defined
+        by the provided schema which maps dataframe columns to Arize data fields.
+
+        Args:
+            space_id: The space ID where the model resides.
+            model_name: A unique name to identify your model in the Arize platform.
+            model_type: The type of model. Supported types: BINARY, MULTI_CLASS, REGRESSION,
+                RANKING, OBJECT_DETECTION. Note: GENERATIVE_LLM is not supported; use the
+                spans module instead.
+            dataframe: Pandas DataFrame containing the data to upload. Columns should
+                correspond to the schema field mappings.
+            schema: Schema object (Schema or CorpusSchema) that defines the mapping between
+                dataframe columns and Arize data fields (e.g., prediction_label_column_name,
+                feature_column_names, etc.).
+            environment: The environment this data belongs to (PRODUCTION, TRAINING,
+                VALIDATION, or CORPUS).
+            model_version: Optional version identifier for the model.
+            batch_id: Required for VALIDATION environment; identifies the validation batch.
+            validate: When True, performs comprehensive validation before sending data.
+                Includes checks for required fields, data types, and value constraints.
+            metrics_validation: Optional list of metric families to validate against.
+            surrogate_explainability: When True, automatically generates SHAP values using
+                MIMIC surrogate explainer. Requires the 'mimic-explainer' extra. Has no
+                effect if shap_values_column_names is already specified in schema.
+            timeout: Maximum time (in seconds) to wait for the request to complete.
+            tmp_dir: Optional temporary directory to store serialized Arrow data before
+                upload.
+
+        Returns:
+            A requests.Response object from the upload request. Check .status_code for
+            success (200) or error conditions.
+
+        Raises:
+            MissingSpaceIDError: If space_id is not provided or empty.
+            MissingModelNameError: If model_name is not provided or empty.
+            ValueError: If model_type is GENERATIVE_LLM, or if environment is CORPUS with
+                non-CorpusSchema, or if training/validation records are incomplete.
+            ValidationFailure: If validate=True and validation checks fail. Contains list
+                of validation error messages.
+            pa.ArrowInvalid: If the dataframe cannot be converted to Arrow format, typically
+                due to mixed types in columns not specified in the schema.
+
+        Notes:
+            - Categorical dtype columns are automatically converted to string
+            - Extraneous columns not in the schema are removed before upload
+            - Surrogate explainability requires 'mimic-explainer' extra
+            - For GENERATIVE_LLM models, use the spans module or OTEL tracing instead
+            - If logging actuals without predictions, ensure predictions were logged first
+            - Data is sent via Apache Arrow for efficient large batch transfers
+        """
         require(_BATCH_EXTRA, _BATCH_DEPS)
         import pandas.api.types as ptypes
         import pyarrow as pa
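
A minimal batch-logging sketch matching the `log_batch()` docstring above. The `Schema` keyword names are assumptions inferred from the docstring (`prediction_label_column_name` and `feature_column_names` are named there; `prediction_id_column_name` is assumed), and `client` is again an assumed `MLModelsClient`:

```python
import pandas as pd

from arize.types import Environments, ModelTypes, Schema

df = pd.DataFrame(
    {
        "prediction_id": ["a", "b"],
        "prediction_label": ["fraud", "not_fraud"],
        "amount": [120.5, 7.0],
    }
)

schema = Schema(
    prediction_id_column_name="prediction_id",        # assumed field name
    prediction_label_column_name="prediction_label",  # named in the docstring
    feature_column_names=["amount"],                  # named in the docstring
)

response = client.log_batch(
    space_id="YOUR_SPACE_ID",
    model_name="fraud-detector",
    model_type=ModelTypes.BINARY,
    dataframe=df,
    schema=schema,
    environment=Environments.PRODUCTION,
)
assert response.status_code == 200  # per the docstring, 200 indicates success
```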
@@ -506,8 +627,8 @@ class MLModelsClient:
         # Thus we can only offer this functionality with pandas>=1.0.0.
         try:
             dataframe, schema = cast_typed_columns(dataframe, schema)
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Error casting typed columns")
             raise
 
         logger.debug("Performing required validation.")
@@ -546,7 +667,7 @@ class MLModelsClient:
 
         # always validate pd.Category is not present, if yes, convert to string
         has_cat_col = any(
-
+            ptypes.is_categorical_dtype(x) for x in dataframe.dtypes
         )
         if has_cat_col:
             cat_cols = [
@@ -554,7 +675,13 @@ class MLModelsClient:
                 for col_name, col_cat in dataframe.dtypes.items()
                 if col_cat.name == "category"
             ]
-            cat_str_map = dict(
+            cat_str_map = dict(
+                zip(
+                    cat_cols,
+                    ["str"] * len(cat_cols),
+                    strict=True,
+                )
+            )
             dataframe = dataframe.astype(cat_str_map)
 
         if surrogate_explainability:
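
The category handling above is plain pandas; a self-contained equivalent of the conversion the hunk builds with `dict(zip(cat_cols, ["str"] * len(cat_cols), strict=True))`:

```python
import pandas as pd
import pandas.api.types as ptypes

df = pd.DataFrame({"color": pd.Categorical(["red", "blue"]), "n": [1, 2]})

# Collect categorical columns, then cast each one to string, as the hunk does.
cat_cols = [name for name, dtype in df.dtypes.items() if dtype.name == "category"]
df = df.astype(dict(zip(cat_cols, ["str"] * len(cat_cols))))

assert not any(ptypes.is_categorical_dtype(t) for t in df.dtypes)
```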
@@ -588,12 +715,12 @@ class MLModelsClient:
             # error conditions that we're currently not aware of.
             pa_table = pa.Table.from_pandas(dataframe, preserve_index=False)
         except pa.ArrowInvalid as e:
-            logger.
+            logger.exception(INVALID_ARROW_CONVERSION_MSG)
             raise pa.ArrowInvalid(
-                f"Error converting to Arrow format: {
+                f"Error converting to Arrow format: {e!s}"
             ) from e
-        except Exception
-            logger.
+        except Exception:
+            logger.exception("Unexpected error creating Arrow table")
             raise
 
         if validate:
@@ -678,18 +805,53 @@ class MLModelsClient:
         model_version: str = "",
         batch_id: str = "",
         where: str = "",
-        columns:
+        columns: list | None = None,
         similarity_search_params: SimilaritySearchParams | None = None,
         stream_chunk_size: int | None = None,
     ) -> pd.DataFrame:
+        """Export model data from Arize to a pandas DataFrame.
+
+        Retrieves prediction and optional actual data for a model within a specified time
+        range and returns it as a pandas DataFrame for analysis.
+
+        Args:
+            space_id: The space ID where the model resides.
+            model_name: The name of the model to export data from.
+            environment: The environment to export from (PRODUCTION, TRAINING, or VALIDATION).
+            start_time: Start of the time range (inclusive) as a datetime object.
+            end_time: End of the time range (inclusive) as a datetime object.
+            include_actuals: When True, includes actual labels in the export. When False,
+                only predictions are returned.
+            model_version: Optional model version to filter by. Empty string returns all
+                versions.
+            batch_id: Optional batch ID to filter by (for VALIDATION environment).
+            where: Optional SQL-like WHERE clause to filter rows (e.g., "feature_x > 0.5").
+            columns: Optional list of column names to include. If None, all columns are
+                returned.
+            similarity_search_params: Optional parameters for embedding similarity search
+                filtering.
+            stream_chunk_size: Optional chunk size for streaming large result sets.
+
+        Returns:
+            A pandas DataFrame containing the exported data with columns for predictions,
+            actuals (if requested), features, tags, timestamps, and other model metadata.
+
+        Raises:
+            RuntimeError: If the Flight client request fails or returns no response.
+
+        Notes:
+            - Uses Apache Arrow Flight for efficient data transfer
+            - Large exports may benefit from specifying stream_chunk_size
+            - The where clause supports SQL-like filtering syntax
+        """
         require(_BATCH_EXTRA, _BATCH_DEPS)
         from arize._exporter.client import ArizeExportClient
         from arize._flight.client import ArizeFlightClient
 
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.
-            port=self._sdk_config.
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
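
A hedged sketch of calling the DataFrame export documented above. The method's own name sits outside this hunk, so `export_model_to_df` below is a placeholder, and `client` is an assumed `MLModelsClient`:

```python
from datetime import datetime, timedelta, timezone

from arize.types import Environments

end = datetime.now(timezone.utc)
start = end - timedelta(days=7)

df = client.export_model_to_df(      # placeholder name; see the docstring above
    space_id="YOUR_SPACE_ID",
    model_name="fraud-detector",
    environment=Environments.PRODUCTION,
    start_time=start,
    end_time=end,
    include_actuals=True,
    where="amount > 100",            # SQL-like row filter, per the docstring
)
print(df.shape)
```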
@@ -724,18 +886,53 @@ class MLModelsClient:
         model_version: str = "",
         batch_id: str = "",
         where: str = "",
-        columns:
+        columns: list | None = None,
         similarity_search_params: SimilaritySearchParams | None = None,
         stream_chunk_size: int | None = None,
     ) -> pd.DataFrame:
+        """Export model data from Arize to a Parquet file and return as DataFrame.
+
+        Retrieves prediction and optional actual data for a model within a specified time
+        range, saves it as a Parquet file, and returns it as a pandas DataFrame.
+
+        Args:
+            space_id: The space ID where the model resides.
+            model_name: The name of the model to export data from.
+            environment: The environment to export from (PRODUCTION, TRAINING, or VALIDATION).
+            start_time: Start of the time range (inclusive) as a datetime object.
+            end_time: End of the time range (inclusive) as a datetime object.
+            include_actuals: When True, includes actual labels in the export. When False,
+                only predictions are returned.
+            model_version: Optional model version to filter by. Empty string returns all
+                versions.
+            batch_id: Optional batch ID to filter by (for VALIDATION environment).
+            where: Optional SQL-like WHERE clause to filter rows (e.g., "feature_x > 0.5").
+            columns: Optional list of column names to include. If None, all columns are
+                returned.
+            similarity_search_params: Optional parameters for embedding similarity search
+                filtering.
+            stream_chunk_size: Optional chunk size for streaming large result sets.
+
+        Returns:
+            A pandas DataFrame containing the exported data. The data is also saved to a
+            Parquet file by the underlying export client.
+
+        Raises:
+            RuntimeError: If the Flight client request fails or returns no response.
+
+        Notes:
+            - Uses Apache Arrow Flight for efficient data transfer
+            - The Parquet file location is managed by the ArizeExportClient
+            - Large exports may benefit from specifying stream_chunk_size
+        """
         require(_BATCH_EXTRA, _BATCH_DEPS)
         from arize._exporter.client import ArizeExportClient
         from arize._flight.client import ArizeFlightClient
 
         with ArizeFlightClient(
             api_key=self._sdk_config.api_key,
-            host=self._sdk_config.
-            port=self._sdk_config.
+            host=self._sdk_config.flight_host,
+            port=self._sdk_config.flight_port,
             scheme=self._sdk_config.flight_scheme,
             request_verify=self._sdk_config.request_verify,
             max_chunksize=self._sdk_config.pyarrow_max_chunksize,
@@ -759,6 +956,7 @@ class MLModelsClient:
         )
 
     def _ensure_session(self) -> FuturesSession:
+        """Lazily initialize and return the FuturesSession for async streaming requests."""
         from requests_futures.sessions import FuturesSession
 
         session = object.__getattribute__(self, "_session")
@@ -778,10 +976,11 @@ class MLModelsClient:
     def _post(
         self,
         record: pb2.Record,
-        headers:
+        headers: dict[str, str],
         timeout: float | None,
-        indexes:
-    ):
+        indexes: tuple,
+    ) -> object:
+        """Post a record to Arize via async HTTP request with protobuf JSON serialization."""
         from google.protobuf.json_format import MessageToDict
 
         session = self._ensure_session()
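
The `_post` docstring mentions protobuf JSON serialization via `MessageToDict`; a standalone illustration using a generic well-known-type message, since the SDK's actual `pb2.Record` fields are not shown in this diff:

```python
from google.protobuf.json_format import MessageToDict
from google.protobuf.struct_pb2 import Struct

# Struct stands in for pb2.Record here; it is a placeholder message type.
msg = Struct()
msg.update({"prediction_id": "pred-123", "label": "fraud"})

payload = MessageToDict(msg)  # plain dict, ready to send as a JSON body
print(payload)                # {'prediction_id': 'pred-123', 'label': 'fraud'}
```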
@@ -801,9 +1000,10 @@ class MLModelsClient:
         return resp
 
 
-def _validate_mapping_key(key_name: str, name: str):
+def _validate_mapping_key(key_name: str, name: str) -> None:
+    """Validate that a mapping key (feature/tag name) is a string and doesn't end with '_shap'."""
     if not isinstance(key_name, str):
-        raise
+        raise TypeError(
             f"{name} dictionary key {key_name} must be named with string, type used: {type(key_name)}"
         )
     if key_name.endswith("_shap"):
@@ -813,7 +1013,8 @@ def _validate_mapping_key(key_name: str, name: str):
     return
 
 
-def _is_timestamp_in_range(now: int, ts: int):
+def _is_timestamp_in_range(now: int, ts: int) -> bool:
+    """Check if a timestamp is within the acceptable range (1 year future, 2 years past)."""
     max_time = now + (MAX_FUTURE_YEARS_FROM_CURRENT_TIME * 365 * 24 * 60 * 60)
     min_time = now - (MAX_PAST_YEARS_FROM_CURRENT_TIME * 365 * 24 * 60 * 60)
     return min_time <= ts <= max_time
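
The range check above uses the two limits stated in the docstrings (1 year forward, 2 years back); a self-contained version with those values inlined:

```python
import time

MAX_FUTURE_YEARS = 1  # values taken from the docstrings above
MAX_PAST_YEARS = 2

def is_timestamp_in_range(now: int, ts: int) -> bool:
    max_time = now + MAX_FUTURE_YEARS * 365 * 24 * 60 * 60
    min_time = now - MAX_PAST_YEARS * 365 * 24 * 60 * 60
    return min_time <= ts <= max_time

now = int(time.time())
assert is_timestamp_in_range(now, now)
assert not is_timestamp_in_range(now, now + 2 * 365 * 24 * 60 * 60)  # too far ahead
```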
@@ -826,7 +1027,8 @@ def _get_pb_schema(
     model_type: ModelTypes,
     environment: Environments,
     batch_id: str,
-):
+) -> object:
+    """Construct a protocol buffer Schema from the user's Schema for batch logging."""
     s = pb2.Schema()
     s.constants.model_id = model_id
 
@@ -874,48 +1076,52 @@ def _get_pb_schema(
 
     if model_type == ModelTypes.OBJECT_DETECTION:
         if schema.object_detection_prediction_column_names is not None:
-
-
+            obj_det_pred = schema.object_detection_prediction_column_names
+            pred_labels = (
+                s.arrow_schema.prediction_object_detection_label_column_names
             )
-
-
+            pred_labels.bboxes_coordinates_column_name = (
+                obj_det_pred.bounding_boxes_coordinates_column_name
             )
-
-
-
-
-
-
+            pred_labels.bboxes_categories_column_name = (
+                obj_det_pred.categories_column_name
+            )
+            if obj_det_pred.scores_column_name is not None:
+                pred_labels.bboxes_scores_column_name = (
+                    obj_det_pred.scores_column_name
                 )
 
         if schema.semantic_segmentation_prediction_column_names is not None:
-
-
+            seg_pred_cols = schema.semantic_segmentation_prediction_column_names
+            pred_seg_labels = s.arrow_schema.prediction_semantic_segmentation_label_column_names
+            pred_seg_labels.polygons_coordinates_column_name = (
+                seg_pred_cols.polygon_coordinates_column_name
             )
-
-
+            pred_seg_labels.polygons_categories_column_name = (
+                seg_pred_cols.categories_column_name
             )
 
         if schema.instance_segmentation_prediction_column_names is not None:
-
-                schema.instance_segmentation_prediction_column_names
+            inst_seg_pred_cols = (
+                schema.instance_segmentation_prediction_column_names
             )
-            s.arrow_schema.prediction_instance_segmentation_label_column_names
-
+            pred_inst_seg_labels = s.arrow_schema.prediction_instance_segmentation_label_column_names
+            pred_inst_seg_labels.polygons_coordinates_column_name = (
+                inst_seg_pred_cols.polygon_coordinates_column_name
             )
-
-
-
-
-
-
+            pred_inst_seg_labels.polygons_categories_column_name = (
+                inst_seg_pred_cols.categories_column_name
+            )
+            if inst_seg_pred_cols.scores_column_name is not None:
+                pred_inst_seg_labels.polygons_scores_column_name = (
+                    inst_seg_pred_cols.scores_column_name
                 )
             if (
-
+                inst_seg_pred_cols.bounding_boxes_coordinates_column_name
                 is not None
             ):
-
-
+                pred_inst_seg_labels.bboxes_coordinates_column_name = (
+                    inst_seg_pred_cols.bounding_boxes_coordinates_column_name
                 )
 
     if schema.prediction_score_column_name is not None:
@@ -1038,50 +1244,61 @@ def _get_pb_schema(
 
     if model_type == ModelTypes.OBJECT_DETECTION:
         if schema.object_detection_actual_column_names is not None:
-
-
+            obj_det_actual = schema.object_detection_actual_column_names
+            actual_labels = (
+                s.arrow_schema.actual_object_detection_label_column_names
            )
-
-
+            actual_labels.bboxes_coordinates_column_name = (
+                obj_det_actual.bounding_boxes_coordinates_column_name
             )
-
-
-
-
-
-
+            actual_labels.bboxes_categories_column_name = (
+                obj_det_actual.categories_column_name
+            )
+            if obj_det_actual.scores_column_name is not None:
+                actual_labels.bboxes_scores_column_name = (
+                    obj_det_actual.scores_column_name
                 )
 
         if schema.semantic_segmentation_actual_column_names is not None:
-
-
+            sem_seg_actual = schema.semantic_segmentation_actual_column_names
+            sem_seg_labels = (
+                s.arrow_schema.actual_semantic_segmentation_label_column_names
+            )
+            sem_seg_labels.polygons_coordinates_column_name = (
+                sem_seg_actual.polygon_coordinates_column_name
             )
-
-
+            sem_seg_labels.polygons_categories_column_name = (
+                sem_seg_actual.categories_column_name
             )
 
         if schema.instance_segmentation_actual_column_names is not None:
-
-
+            inst_seg_actual = schema.instance_segmentation_actual_column_names
+            inst_seg_labels = (
+                s.arrow_schema.actual_instance_segmentation_label_column_names
            )
-
-
+            inst_seg_labels.polygons_coordinates_column_name = (
+                inst_seg_actual.polygon_coordinates_column_name
+            )
+            inst_seg_labels.polygons_categories_column_name = (
+                inst_seg_actual.categories_column_name
            )
            if (
-
+                inst_seg_actual.bounding_boxes_coordinates_column_name
                is not None
            ):
-
-
+                inst_seg_labels.bboxes_coordinates_column_name = (
+                    inst_seg_actual.bounding_boxes_coordinates_column_name
                )
 
    if model_type == ModelTypes.GENERATIVE_LLM:
        if schema.prompt_template_column_names is not None:
-
-
+            prompt_template_names = schema.prompt_template_column_names
+            arrow_prompt_names = s.arrow_schema.prompt_template_column_names
+            arrow_prompt_names.template_column_name = (
+                prompt_template_names.template_column_name
            )
-
-
+            arrow_prompt_names.template_version_column_name = (
+                prompt_template_names.template_version_column_name
            )
        if schema.llm_config_column_names is not None:
            s.arrow_schema.llm_config_column_names.model_column_name = (
@@ -1114,6 +1331,7 @@ def _get_pb_schema_corpus(
    schema: CorpusSchema,
    model_id: str,
 ) -> pb2.Schema:
+    """Construct a protocol buffer Schema from CorpusSchema for document corpus logging."""
     s = pb2.Schema()
     s.constants.model_id = model_id
     s.constants.environment = pb2.Schema.Environment.CORPUS
@@ -1127,11 +1345,12 @@ def _get_pb_schema_corpus(
             schema.document_version_column_name
         )
     if schema.document_text_embedding_column_names is not None:
-
-        s.arrow_schema.document_column_names.text_column_name
-
-
-
-
-
+        doc_text_emb_cols = schema.document_text_embedding_column_names
+        doc_text_col = s.arrow_schema.document_column_names.text_column_name
+        doc_text_col.vector_column_name = doc_text_emb_cols.vector_column_name
+        doc_text_col.data_column_name = doc_text_emb_cols.data_column_name
+        if doc_text_emb_cols.link_to_data_column_name is not None:
+            doc_text_col.link_to_data_column_name = (
+                doc_text_emb_cols.link_to_data_column_name
+            )
     return s