arize 8.0.0a13__py3-none-any.whl → 8.0.0a14__py3-none-any.whl
- arize/_exporter/client.py +18 -3
- arize/datasets/client.py +88 -83
- arize/{utils → models}/casting.py +12 -12
- arize/models/client.py +330 -5
- arize/{utils → models}/proto.py +1 -369
- arize/spans/client.py +30 -6
- arize/utils/arrow.py +4 -4
- arize/version.py +1 -1
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/METADATA +11 -3
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/RECORD +12 -12
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/WHEEL +0 -0
- {arize-8.0.0a13.dist-info → arize-8.0.0a14.dist-info}/licenses/LICENSE.md +0 -0
arize/_exporter/client.py
CHANGED

@@ -20,7 +20,6 @@ from arize._generated.protocol.flight import export_pb2
 from arize.logging import CtxAdapter
 from arize.types import Environments, SimilaritySearchParams
 from arize.utils.dataframe import reset_dataframe_index
-from arize.utils.proto import get_pb_flight_doput_request

 logger = logging.getLogger(__name__)

@@ -131,7 +130,7 @@ class ArizeExportClient:
         reset_dataframe_index(df)
         return df

-    def …
+    def export_to_parquet(
         self,
         path: str,
         space_id: str,

@@ -285,7 +284,7 @@ class ArizeExportClient:
             end_time=Timestamp(seconds=int(end_time.timestamp())),
             filter_expression=where,
             similarity_search_params=(
-                …
+                _get_pb_similarity_search_params(similarity_search_params)
                 if similarity_search_params
                 else None
             ),

@@ -326,3 +325,19 @@ class ArizeExportClient:
             colour="#008000",
             unit=" row",
         )
+
+
+def _get_pb_similarity_search_params(
+    similarity_params: SimilaritySearchParams,
+) -> export_pb2.SimilaritySearchParams:
+    proto_params = export_pb2.SimilaritySearchParams()
+    proto_params.search_column_name = similarity_params.search_column_name
+    proto_params.threshold = similarity_params.threshold
+    for ref in similarity_params.references:
+        new_ref = proto_params.references.add()
+        new_ref.prediction_id = ref.prediction_id
+        new_ref.reference_column_name = ref.reference_column_name
+        if ref.prediction_timestamp:
+            new_ref.prediction_timestamp.FromDatetime(ref.prediction_timestamp)
+
+    return proto_params
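For orientation, the new module-private `_get_pb_similarity_search_params` helper is a plain attribute-to-protobuf mapping. A minimal sketch of the shape of its input, using stand-in objects rather than the real `arize.types` classes (whose constructors are not shown in this diff; all values below are invented):

from datetime import datetime, timezone
from types import SimpleNamespace

# Stand-ins with the attributes the helper reads; the real classes are
# SimilaritySearchParams and its reference entries from arize.types.
ref = SimpleNamespace(
    prediction_id="pred-123",                  # hypothetical ID
    reference_column_name="embedding_vector",  # hypothetical column
    prediction_timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc),
)
params = SimpleNamespace(
    search_column_name="embedding_vector",
    threshold=0.85,
    references=[ref],
)

# proto = _get_pb_similarity_search_params(params)
# Repeated proto fields are filled via .add(), and datetimes are converted
# with google.protobuf's Timestamp.FromDatetime(), as in the diff above.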
arize/datasets/client.py
CHANGED

@@ -39,99 +39,104 @@ class DatasetsClient:
         name: str,
         space_id: str,
         examples: List[Dict[str, Any]] | pd.DataFrame,
+        force_http: bool = False,
     ):
-        if (
-            …
-            …
-        ):
-            logger.info(
-                f"Uploading {len(examples)} examples via REST may be slow. "
-                "Trying to convert to DataFrame for more efficient upload."
+        if not isinstance(examples, (list, pd.DataFrame)):
+            raise TypeError(
+                "Examples must be a list of dicts or a pandas DataFrame"
             )
-
-            try:
-                data = pd.DataFrame(examples)
-            except Exception as e:
-                logger.warning(
-                    f"Could not convert examples to DataFrame: {e}. "
-                    "Falling back to list upload, which may be less performant."
-                )
-                # If we can’t convert to a dataframe, just use the list
-                data = examples
-        else:
-            # If we have a dataframe or a small list, just use it directly
-            data = examples
-
-        if isinstance(data, list):
-            # If the data is in list format, use the REST endpoint
+        if len(examples) <= REST_LIMIT_DATASET_EXAMPLES or force_http:
             from arize._generated import api_client as gen

+            data = (
+                examples.to_dict(orient="records")
+                if isinstance(examples, pd.DataFrame)
+                else examples
+            )
+
             body = gen.DatasetsCreateRequest(
                 name=name,
                 spaceId=space_id,
                 examples=data,
             )
             return self._api.datasets_create(datasets_create_request=body)
-        … [old lines 75–92 removed; their text is not preserved in this view]
+
+        # If we have too many examples, try to convert to a dataframe
+        # and log via gRPC + flight
+        logger.info(
+            f"Uploading {len(examples)} examples via REST may be slow. "
+            "Trying to convert to DataFrame for more efficient upload via "
+            "gRPC + Flight."
+        )
+        data = (
+            pd.DataFrame(examples) if isinstance(examples, list) else examples
+        )
+        return self._create_dataset_via_flight(
+            name=name,
+            space_id=space_id,
+            examples=data,
+        )
+
+    def _create_dataset_via_flight(
+        self,
+        name: str,
+        space_id: str,
+        examples: pd.DataFrame,
+    ):
+        # Convert datetime columns to int64 (ms since epoch)
+        # TODO(Kiko): Missing validation block
+        # data = _convert_datetime_columns_to_int(data)
+        # df = self._set_default_columns_for_dataset(data)
+        # if convert_dict_to_json:
+        #     df = _convert_default_columns_to_json_str(df)
+        # df = _convert_boolean_columns_to_str(df)
+        # validation_errors = Validator.validate(df)
+        # validation_errors.extend(
+        #     Validator.validate_max_chunk_size(max_chunk_size)
+        # )
+        # if validation_errors:
+        #     raise RuntimeError(
+        #         [e.error_message() for e in validation_errors]
+        #     )
+
+        # Convert to Arrow table
+        try:
+            logger.debug("Converting data to Arrow format")
+            pa_table = pa.Table.from_pandas(examples)
+        except pa.ArrowInvalid as e:
+            logger.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
+            raise pa.ArrowInvalid(
+                f"Error converting to Arrow format: {str(e)}"
+            ) from e
+        except Exception as e:
+            logger.error(f"Unexpected error creating Arrow table: {str(e)}")
+            raise
+
+        response = None
+        with ArizeFlightClient(
+            api_key=self._sdk_config.api_key,
+            host=self._sdk_config.flight_server_host,
+            port=self._sdk_config.flight_server_port,
+            scheme=self._sdk_config.flight_scheme,
+            request_verify=self._sdk_config.request_verify,
+        ) as flight_client:
             try:
-                … [old lines 94–98 removed; their text is not preserved in this view]
-                    f"Error converting to Arrow format: {str(e)}"
-                ) from e
+                response = flight_client.create_dataset(
+                    space_id=space_id,
+                    dataset_name=name,
+                    pa_table=pa_table,
+                )
             except Exception as e:
-                …
-                raise
-
-        response = None
-        with ArizeFlightClient(
-            api_key=self._sdk_config.api_key,
-            host=self._sdk_config.flight_server_host,
-            port=self._sdk_config.flight_server_port,
-            scheme=self._sdk_config.flight_scheme,
-            request_verify=self._sdk_config.request_verify,
-        ) as flight_client:
-            try:
-                response = flight_client.create_dataset(
-                    space_id=space_id,
-                    dataset_name=name,
-                    pa_table=pa_table,
-                )
-            except Exception as e:
-                msg = f"Error during update request: {str(e)}"
-                logger.error(msg)
-                raise RuntimeError(msg) from e
-            if response is None:
-                # This should not happen with proper Flight client implementation,
-                # but we handle it defensively
-                msg = "No response received from flight server during update"
+                msg = f"Error during update request: {str(e)}"
                 logger.error(msg)
-                raise RuntimeError(msg)
-
-                # …
-        … [old lines 131–137 removed; their text is not preserved in this view]
+                raise RuntimeError(msg) from e
+        if response is None:
+            # This should not happen with proper Flight client implementation,
+            # but we handle it defensively
+            msg = "No response received from flight server during update"
+            logger.error(msg)
+            raise RuntimeError(msg)
+        # The response from flightserver is the dataset ID. To return the dataset
+        # object we make a GET query
+        dataset = self.get(dataset_id=response)
+        return dataset
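The net effect of this rewrite is that transport is chosen by payload size rather than input type: small uploads go over REST, large ones over gRPC + Flight, and `force_http=True` pins the REST path. A hedged usage sketch, assuming the public method shown here is named `create` (its name sits above the visible hunk) and that `REST_LIMIT_DATASET_EXAMPLES` is the module-level threshold the branch compares against (its value is not shown):

import pandas as pd

client = ...  # an already-configured DatasetsClient; setup is not shown in this diff

small = [{"input": "hi", "output": "hello"}]        # a few dict examples
big = pd.DataFrame(
    {"input": ["hi"] * 50_000, "output": ["hello"] * 50_000}
)                                                   # many examples

client.create(name="ds-small", space_id="SPACE_ID", examples=small)  # REST
client.create(name="ds-big", space_id="SPACE_ID", examples=big)      # gRPC + Flight
client.create(
    name="ds-forced", space_id="SPACE_ID", examples=big, force_http=True
)  # force_http=True keeps even large uploads on the REST endpoint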
arize/{utils → models}/casting.py
RENAMED

@@ -131,28 +131,28 @@ def cast_typed_columns(
         f = getattr(schema, field_name)
         if f:
             try:
-                validate_typed_columns(field_name, f)
+                _validate_typed_columns(field_name, f)
             except InvalidTypedColumnsError:
                 raise
-            dataframe = cast_columns(dataframe, f)
+            dataframe = _cast_columns(dataframe, f)

     # Now that the dataframe values have been cast to the specified types:
     # for downstream validation to work as expected,
     # feature & tag schema field types should be List[string] of column names.
     # Since Schema is a frozen class, we must construct a new instance.
-    return dataframe, convert_schema_field_types(schema)
+    return dataframe, _convert_schema_field_types(schema)


 def cast_dictionary(d: dict) -> dict:
     cast_dict = {}
     for k, v in d.items():
         if isinstance(v, TypedValue):
-            v = cast_value(v)
+            v = _cast_value(v)
         cast_dict[k] = v
     return cast_dict


-def cast_value(
+def _cast_value(
     typed_value: TypedValue,
 ) -> Union[str, int, float, List[str], None]:
     """

@@ -224,7 +224,7 @@ def _cast_to_str(typed_value: TypedValue) -> Union[str, None]:
         raise CastingError(str(e), typed_value) from e


-def validate_typed_columns(
+def _validate_typed_columns(
     field_name: str, typed_columns: TypedColumns
 ) -> None:
     """

@@ -253,7 +253,7 @@ def validate_typed_columns(
         )


-def cast_columns(
+def _cast_columns(
     dataframe: pd.DataFrame, columns: TypedColumns
 ) -> pd.DataFrame:
     """

@@ -288,7 +288,7 @@ def cast_columns(
         # uses pd.NA for missing values (when storage arg is not configured)
         # In the future, try out pd.convert_dtypes (new in pandas 2.0):
         # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.convert_dtypes.html
-        dataframe = cast_df(dataframe, columns.to_str, "string")
+        dataframe = _cast_df(dataframe, columns.to_str, "string")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -300,7 +300,7 @@ def cast_columns(
     # see https://pandas.pydata.org/docs/reference/api/pandas.Int64Dtype.html
     # uses pd.NA for missing values
     try:
-        dataframe = cast_df(dataframe, columns.to_int, "Int64")
+        dataframe = _cast_df(dataframe, columns.to_int, "Int64")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -312,7 +312,7 @@ def cast_columns(
     # see https://pandas.pydata.org/docs/reference/api/pandas.Float64Dtype.html
     # uses pd.NA for missing values
     try:
-        dataframe = cast_df(dataframe, columns.to_float, "Float64")
+        dataframe = _cast_df(dataframe, columns.to_float, "Float64")
     except Exception as e:
         raise ColumnCastingError(
             error_msg=str(e),

@@ -323,7 +323,7 @@ def cast_columns(
     return dataframe


-def cast_df(
+def _cast_df(
     df: pd.DataFrame, cols: List[str], target_type_str: str
 ) -> pd.DataFrame:
     """

@@ -354,7 +354,7 @@ def cast_df(
     return df.astype({col: target_type_str for col in cols})


-def convert_schema_field_types(
+def _convert_schema_field_types(
     schema: Schema,
 ) -> Schema:
     """