arize 8.0.0a16__py3-none-any.whl → 8.0.0a17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +1 -0
- arize/_flight/client.py +32 -1
- arize/client.py +8 -0
- arize/config.py +14 -0
- arize/constants/config.py +4 -0
- arize/datasets/client.py +77 -56
- arize/experiments/client.py +118 -17
- arize/utils/cache.py +68 -0
- arize/version.py +1 -1
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/METADATA +217 -14
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/RECORD +13 -12
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/WHEEL +0 -0
- {arize-8.0.0a16.dist-info → arize-8.0.0a17.dist-info}/licenses/LICENSE.md +0 -0
arize/__init__.py
CHANGED
|
@@ -87,3 +87,4 @@ def make_to_df(field_name: str):
|
|
|
87
87
|
models.DatasetsList200Response.to_df = make_to_df("datasets") # type: ignore[attr-defined]
|
|
88
88
|
models.DatasetsListExamples200Response.to_df = make_to_df("examples") # type: ignore[attr-defined]
|
|
89
89
|
models.ExperimentsList200Response.to_df = make_to_df("experiments") # type: ignore[attr-defined]
|
|
90
|
+
models.ExperimentsRunsList200Response.to_df = make_to_df("experiment_runs") # type: ignore[attr-defined]
|
arize/_flight/client.py
CHANGED
|
@@ -25,6 +25,7 @@ from arize.utils.proto import get_pb_schema_tracing
|
|
|
25
25
|
from arize.version import __version__
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
28
|
+
import pandas as pd
|
|
28
29
|
import pyarrow as pa
|
|
29
30
|
|
|
30
31
|
|
|
@@ -260,7 +261,7 @@ class ArizeFlightClient:
|
|
|
260
261
|
space_id: str,
|
|
261
262
|
dataset_id: str,
|
|
262
263
|
dataset_version_id: str | None = None,
|
|
263
|
-
):
|
|
264
|
+
) -> pd.DataFrame:
|
|
264
265
|
# TODO(Kiko): Space ID should not be needed,
|
|
265
266
|
# should work on server tech debt to remove this
|
|
266
267
|
doget_request = flight_ing_pb2.DoGetRequest(
|
|
@@ -283,6 +284,36 @@ class ArizeFlightClient:
|
|
|
283
284
|
logger.exception(f"Failed to get dataset id={dataset_id}")
|
|
284
285
|
raise RuntimeError(f"Failed to get dataset id={dataset_id}") from e
|
|
285
286
|
|
|
287
|
+
# ---------- experiment methods ----------
|
|
288
|
+
|
|
289
|
+
def get_experiment_runs(
|
|
290
|
+
self,
|
|
291
|
+
space_id: str,
|
|
292
|
+
experiment_id: str,
|
|
293
|
+
) -> pd.DataFrame:
|
|
294
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
295
|
+
# should work on server tech debt to remove this
|
|
296
|
+
doget_request = flight_ing_pb2.DoGetRequest(
|
|
297
|
+
get_experiment=flight_ing_pb2.GetExperimentRequest(
|
|
298
|
+
space_id=space_id,
|
|
299
|
+
experiment_id=experiment_id,
|
|
300
|
+
)
|
|
301
|
+
)
|
|
302
|
+
descriptor = flight.Ticket(
|
|
303
|
+
json_format.MessageToJson(doget_request).encode("utf-8")
|
|
304
|
+
)
|
|
305
|
+
try:
|
|
306
|
+
reader = self.do_get(descriptor, options=self.call_options)
|
|
307
|
+
# read all data into pandas dataframe
|
|
308
|
+
df = reader.read_all().to_pandas()
|
|
309
|
+
df = convert_json_str_to_dict(df)
|
|
310
|
+
return df
|
|
311
|
+
except Exception as e:
|
|
312
|
+
logger.exception(f"Failed to get experiment id={experiment_id}")
|
|
313
|
+
raise RuntimeError(
|
|
314
|
+
f"Failed to get experiment id={experiment_id}"
|
|
315
|
+
) from e
|
|
316
|
+
|
|
286
317
|
def init_experiment(
|
|
287
318
|
self,
|
|
288
319
|
space_id: str,
|
arize/client.py
CHANGED
|
@@ -12,6 +12,14 @@ if TYPE_CHECKING:
|
|
|
12
12
|
from arize.spans.client import SpansClient
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
# TODO(Kiko): models need to follow resource first pattern
|
|
16
|
+
# - models.DatasetsList200Response
|
|
17
|
+
# - models.DatasetsListExamples200Response
|
|
18
|
+
# - models.ExperimentsList200Response
|
|
19
|
+
# - models.ExperimentsRunsList200Response
|
|
20
|
+
# TODO(Kiko): Root client should have option to clear caches
|
|
21
|
+
# TODO(Kiko): Document caching behavior
|
|
22
|
+
# TODO(Kiko): Force keyword arguments
|
|
15
23
|
# TODO(Kiko): Protobuf versioning is too old
|
|
16
24
|
# TODO(Kiko): Make sure the client has same options as SDKConfiguration
|
|
17
25
|
# TODO(Kiko): It does not make any sense to require space ID in run_experiment, dataset ID should suffice
|
arize/config.py
CHANGED
|
@@ -7,6 +7,8 @@ from typing import Any, Dict
|
|
|
7
7
|
|
|
8
8
|
from arize.constants.config import (
|
|
9
9
|
DEFAULT_API_HOST,
|
|
10
|
+
DEFAULT_ARIZE_DIRECTORY,
|
|
11
|
+
DEFAULT_ENABLE_CACHING,
|
|
10
12
|
DEFAULT_FLIGHT_HOST,
|
|
11
13
|
DEFAULT_FLIGHT_PORT,
|
|
12
14
|
DEFAULT_FLIGHT_TRANSPORT_SCHEME,
|
|
@@ -19,6 +21,8 @@ from arize.constants.config import (
|
|
|
19
21
|
DEFAULT_STREAM_MAX_WORKERS,
|
|
20
22
|
ENV_API_HOST,
|
|
21
23
|
ENV_API_KEY,
|
|
24
|
+
ENV_ARIZE_DIRECTORY,
|
|
25
|
+
ENV_ENABLE_CACHING,
|
|
22
26
|
ENV_FLIGHT_HOST,
|
|
23
27
|
ENV_FLIGHT_PORT,
|
|
24
28
|
ENV_FLIGHT_TRANSPORT_SCHEME,
|
|
@@ -116,6 +120,14 @@ def _max_http_payload_size_mb_factory() -> float:
|
|
|
116
120
|
)
|
|
117
121
|
|
|
118
122
|
|
|
123
|
+
def _arize_dir_factory() -> str:
|
|
124
|
+
return os.getenv(ENV_ARIZE_DIRECTORY, DEFAULT_ARIZE_DIRECTORY)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _enable_cache_factory() -> bool:
|
|
128
|
+
return _parse_bool(os.getenv(ENV_ENABLE_CACHING, DEFAULT_ENABLE_CACHING))
|
|
129
|
+
|
|
130
|
+
|
|
119
131
|
def _mask_secret(secret: str, N: int = 4) -> str:
|
|
120
132
|
"""Show first N chars then '***'; empty string if empty."""
|
|
121
133
|
return f"{secret[:N]}***"
|
|
@@ -147,6 +159,8 @@ class SDKConfiguration:
|
|
|
147
159
|
max_http_payload_size_mb: float = field(
|
|
148
160
|
default_factory=_max_http_payload_size_mb_factory
|
|
149
161
|
)
|
|
162
|
+
arize_direcory: str = field(default_factory=_arize_dir_factory)
|
|
163
|
+
enable_caching: bool = field(default_factory=_enable_cache_factory)
|
|
150
164
|
|
|
151
165
|
# Private, excluded from comparisons & repr
|
|
152
166
|
_headers: Dict[str, str] = field(init=False, repr=False, compare=False)
|
arize/constants/config.py
CHANGED
|
@@ -11,6 +11,8 @@ ENV_PYARROW_MAX_CHUNKSIZE = "ARIZE_MAX_CHUNKSIZE"
|
|
|
11
11
|
ENV_REQUEST_VERIFY = "ARIZE_REQUEST_VERIFY"
|
|
12
12
|
ENV_INSECURE = "ARIZE_INSECURE"
|
|
13
13
|
ENV_MAX_HTTP_PAYLOAD_SIZE_MB = "ARIZE_MAX_HTTP_PAYLOAD_SIZE_MB"
|
|
14
|
+
ENV_ARIZE_DIRECTORY = "ARIZE_DIRECTORY"
|
|
15
|
+
ENV_ENABLE_CACHING = "ARIZE_ENABLE_CACHING"
|
|
14
16
|
|
|
15
17
|
# Server configuration default values
|
|
16
18
|
DEFAULT_API_HOST = "api.arize.com" # NOTE: Must not prefix with https://
|
|
@@ -22,6 +24,8 @@ DEFAULT_PYARROW_MAX_CHUNKSIZE = 10_000
|
|
|
22
24
|
DEFAULT_REQUEST_VERIFY = True
|
|
23
25
|
DEFAULT_INSECURE = False
|
|
24
26
|
DEFAULT_MAX_HTTP_PAYLOAD_SIZE_MB = 100
|
|
27
|
+
DEFAULT_ARIZE_DIRECTORY = "~/.arize"
|
|
28
|
+
DEFAULT_ENABLE_CACHING = True
|
|
25
29
|
|
|
26
30
|
# ML Streaming configuration
|
|
27
31
|
ENV_STREAM_MAX_WORKERS = "ARIZE_STREAM_MAX_WORKERS"
|
arize/datasets/client.py
CHANGED
|
@@ -13,6 +13,7 @@ from arize._generated.api_client import models
|
|
|
13
13
|
from arize.config import SDKConfiguration
|
|
14
14
|
from arize.datasets.validation import validate_dataset_df
|
|
15
15
|
from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
|
|
16
|
+
from arize.utils.cache import cache_resource, load_cached_resource
|
|
16
17
|
from arize.utils.openinference_conversion import (
|
|
17
18
|
convert_boolean_columns_to_str,
|
|
18
19
|
convert_datetime_columns_to_int,
|
|
@@ -22,9 +23,6 @@ from arize.utils.size import get_payload_size_mb
|
|
|
22
23
|
|
|
23
24
|
logger = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
|
-
# TODO(Kiko): Decide based on size of payload instead
|
|
26
|
-
REST_LIMIT_DATASET_EXAMPLES = 0
|
|
27
|
-
|
|
28
26
|
|
|
29
27
|
class DatasetsClient:
|
|
30
28
|
def __init__(self, sdk_config: SDKConfiguration):
|
|
@@ -42,57 +40,8 @@ class DatasetsClient:
|
|
|
42
40
|
self.delete = self._api.datasets_delete
|
|
43
41
|
|
|
44
42
|
# Custom methods
|
|
45
|
-
self.list_examples = self._list_examples
|
|
46
43
|
self.create = self._create_dataset
|
|
47
|
-
|
|
48
|
-
def _list_examples(
|
|
49
|
-
self,
|
|
50
|
-
dataset_id: str,
|
|
51
|
-
dataset_version_id: str = "",
|
|
52
|
-
limit: int = 100,
|
|
53
|
-
all: bool = False,
|
|
54
|
-
):
|
|
55
|
-
if not all:
|
|
56
|
-
return self._api.datasets_list_examples(
|
|
57
|
-
dataset_id=dataset_id,
|
|
58
|
-
dataset_version_id=dataset_version_id,
|
|
59
|
-
limit=limit,
|
|
60
|
-
)
|
|
61
|
-
|
|
62
|
-
# TODO(Kiko): Space ID should not be needed,
|
|
63
|
-
# should work on server tech debt to remove this
|
|
64
|
-
dataset = self.get(dataset_id=dataset_id)
|
|
65
|
-
space_id = dataset.space_id
|
|
66
|
-
|
|
67
|
-
with ArizeFlightClient(
|
|
68
|
-
api_key=self._sdk_config.api_key,
|
|
69
|
-
host=self._sdk_config.flight_server_host,
|
|
70
|
-
port=self._sdk_config.flight_server_port,
|
|
71
|
-
scheme=self._sdk_config.flight_scheme,
|
|
72
|
-
request_verify=self._sdk_config.request_verify,
|
|
73
|
-
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
74
|
-
) as flight_client:
|
|
75
|
-
try:
|
|
76
|
-
response = flight_client.get_dataset_examples(
|
|
77
|
-
space_id=space_id,
|
|
78
|
-
dataset_id=dataset_id,
|
|
79
|
-
dataset_version_id=dataset_version_id,
|
|
80
|
-
)
|
|
81
|
-
except Exception as e:
|
|
82
|
-
msg = f"Error during request: {str(e)}"
|
|
83
|
-
logger.error(msg)
|
|
84
|
-
raise RuntimeError(msg) from e
|
|
85
|
-
if response is None:
|
|
86
|
-
# This should not happen with proper Flight client implementation,
|
|
87
|
-
# but we handle it defensively
|
|
88
|
-
msg = "No response received from flight server during request"
|
|
89
|
-
logger.error(msg)
|
|
90
|
-
raise RuntimeError(msg)
|
|
91
|
-
# The response from flightserver is the dataset ID. To return the dataset
|
|
92
|
-
# object we make a GET query
|
|
93
|
-
return models.DatasetsListExamples200Response(
|
|
94
|
-
examples=response.to_dict(orient="records")
|
|
95
|
-
)
|
|
44
|
+
self.list_examples = self._list_examples
|
|
96
45
|
|
|
97
46
|
def _create_dataset(
|
|
98
47
|
self,
|
|
@@ -203,23 +152,95 @@ class DatasetsClient:
|
|
|
203
152
|
dataset = self.get(dataset_id=response)
|
|
204
153
|
return dataset
|
|
205
154
|
|
|
155
|
+
def _list_examples(
|
|
156
|
+
self,
|
|
157
|
+
dataset_id: str,
|
|
158
|
+
dataset_version_id: str = "",
|
|
159
|
+
limit: int = 100,
|
|
160
|
+
all: bool = False,
|
|
161
|
+
):
|
|
162
|
+
if not all:
|
|
163
|
+
return self._api.datasets_list_examples(
|
|
164
|
+
dataset_id=dataset_id,
|
|
165
|
+
dataset_version_id=dataset_version_id,
|
|
166
|
+
limit=limit,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
dataset = self.get(dataset_id=dataset_id)
|
|
170
|
+
dataset_updated_at = getattr(dataset, "updated_at", None)
|
|
171
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
172
|
+
# should work on server tech debt to remove this
|
|
173
|
+
space_id = dataset.space_id
|
|
174
|
+
|
|
175
|
+
dataset_df = None
|
|
176
|
+
# try to load dataset from cache
|
|
177
|
+
if self._sdk_config.enable_caching:
|
|
178
|
+
dataset_df = load_cached_resource(
|
|
179
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
180
|
+
resource="dataset",
|
|
181
|
+
resource_id=dataset_id,
|
|
182
|
+
resource_updated_at=dataset_updated_at,
|
|
183
|
+
)
|
|
184
|
+
if dataset_df is not None:
|
|
185
|
+
return models.DatasetsListExamples200Response(
|
|
186
|
+
examples=dataset_df.to_dict(orient="records")
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
with ArizeFlightClient(
|
|
190
|
+
api_key=self._sdk_config.api_key,
|
|
191
|
+
host=self._sdk_config.flight_server_host,
|
|
192
|
+
port=self._sdk_config.flight_server_port,
|
|
193
|
+
scheme=self._sdk_config.flight_scheme,
|
|
194
|
+
request_verify=self._sdk_config.request_verify,
|
|
195
|
+
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
196
|
+
) as flight_client:
|
|
197
|
+
try:
|
|
198
|
+
dataset_df = flight_client.get_dataset_examples(
|
|
199
|
+
space_id=space_id,
|
|
200
|
+
dataset_id=dataset_id,
|
|
201
|
+
dataset_version_id=dataset_version_id,
|
|
202
|
+
)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
msg = f"Error during request: {str(e)}"
|
|
205
|
+
logger.error(msg)
|
|
206
|
+
raise RuntimeError(msg) from e
|
|
207
|
+
if dataset_df is None:
|
|
208
|
+
# This should not happen with proper Flight client implementation,
|
|
209
|
+
# but we handle it defensively
|
|
210
|
+
msg = "No response received from flight server during request"
|
|
211
|
+
logger.error(msg)
|
|
212
|
+
raise RuntimeError(msg)
|
|
213
|
+
|
|
214
|
+
# cache dataset for future use
|
|
215
|
+
cache_resource(
|
|
216
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
217
|
+
resource="dataset",
|
|
218
|
+
resource_id=dataset_id,
|
|
219
|
+
resource_updated_at=dataset_updated_at,
|
|
220
|
+
resource_data=dataset_df,
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
return models.DatasetsListExamples200Response(
|
|
224
|
+
examples=dataset_df.to_dict(orient="records")
|
|
225
|
+
)
|
|
226
|
+
|
|
206
227
|
|
|
207
228
|
def _set_default_columns_for_dataset(df: pd.DataFrame) -> pd.DataFrame:
|
|
208
229
|
current_time = int(time.time() * 1000)
|
|
209
230
|
if "created_at" in df.columns:
|
|
210
|
-
if df["created_at"].isnull().values.any():
|
|
231
|
+
if df["created_at"].isnull().values.any(): # type: ignore
|
|
211
232
|
df["created_at"].fillna(current_time, inplace=True)
|
|
212
233
|
else:
|
|
213
234
|
df["created_at"] = current_time
|
|
214
235
|
|
|
215
236
|
if "updated_at" in df.columns:
|
|
216
|
-
if df["updated_at"].isnull().values.any():
|
|
237
|
+
if df["updated_at"].isnull().values.any(): # type: ignore
|
|
217
238
|
df["updated_at"].fillna(current_time, inplace=True)
|
|
218
239
|
else:
|
|
219
240
|
df["updated_at"] = current_time
|
|
220
241
|
|
|
221
242
|
if "id" in df.columns:
|
|
222
|
-
if df["id"].isnull().values.any():
|
|
243
|
+
if df["id"].isnull().values.any(): # type: ignore
|
|
223
244
|
df["id"] = df["id"].apply(
|
|
224
245
|
lambda x: str(uuid.uuid4()) if pd.isnull(x) else x
|
|
225
246
|
)
|
arize/experiments/client.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import hashlib
|
|
3
4
|
import logging
|
|
4
5
|
from typing import TYPE_CHECKING, Any, Dict, List, Tuple
|
|
5
6
|
|
|
@@ -19,6 +20,7 @@ from opentelemetry.trace import Tracer
|
|
|
19
20
|
|
|
20
21
|
from arize._flight.client import ArizeFlightClient
|
|
21
22
|
from arize._flight.types import FlightRequestType
|
|
23
|
+
from arize._generated.api_client import models
|
|
22
24
|
from arize.config import SDKConfiguration
|
|
23
25
|
from arize.exceptions.base import INVALID_ARROW_CONVERSION_MSG
|
|
24
26
|
from arize.experiments.evaluators.base import Evaluators
|
|
@@ -31,6 +33,7 @@ from arize.experiments.types import (
|
|
|
31
33
|
ExperimentTask,
|
|
32
34
|
ExperimentTaskResultFieldNames,
|
|
33
35
|
)
|
|
36
|
+
from arize.utils.cache import cache_resource, load_cached_resource
|
|
34
37
|
from arize.utils.openinference_conversion import (
|
|
35
38
|
convert_boolean_columns_to_str,
|
|
36
39
|
convert_default_columns_to_json_str,
|
|
@@ -57,27 +60,28 @@ class ExperimentsClient:
|
|
|
57
60
|
self._datasets_api = gen.DatasetsApi(
|
|
58
61
|
self._sdk_config.get_generated_client()
|
|
59
62
|
)
|
|
63
|
+
|
|
60
64
|
self.list = self._api.experiments_list
|
|
61
65
|
self.get = self._api.experiments_get
|
|
62
66
|
self.delete = self._api.experiments_delete
|
|
63
|
-
self.list_runs = self._api.experiments_runs_list # REST ?
|
|
64
67
|
|
|
65
68
|
# Custom methods
|
|
66
|
-
self.create = self._create_experiment
|
|
67
69
|
self.run = self._run_experiment
|
|
70
|
+
self.create = self._create_experiment
|
|
71
|
+
self.list_runs = self._api.experiments_runs_list
|
|
68
72
|
|
|
69
73
|
def _run_experiment(
|
|
70
74
|
self,
|
|
71
75
|
name: str,
|
|
72
76
|
dataset_id: str,
|
|
73
77
|
task: ExperimentTask,
|
|
74
|
-
dataset_df: pd.DataFrame | None = None,
|
|
75
78
|
evaluators: Evaluators | None = None,
|
|
76
79
|
dry_run: bool = False,
|
|
80
|
+
dry_run_count: int = 10,
|
|
77
81
|
concurrency: int = 3,
|
|
78
82
|
set_global_tracer_provider: bool = False,
|
|
79
83
|
exit_on_error: bool = False,
|
|
80
|
-
) -> Tuple[
|
|
84
|
+
) -> Tuple[Experiment | None, pd.DataFrame] | None:
|
|
81
85
|
"""
|
|
82
86
|
Run an experiment on a dataset and upload the results.
|
|
83
87
|
|
|
@@ -87,9 +91,6 @@ class ExperimentsClient:
|
|
|
87
91
|
Args:
|
|
88
92
|
experiment_name (str): The name of the experiment.
|
|
89
93
|
task (ExperimentTask): The task to be performed in the experiment.
|
|
90
|
-
dataset_df (Optional[pd.DataFrame], optional): The dataset as a pandas DataFrame.
|
|
91
|
-
If not provided, the dataset will be downloaded using dataset_id or dataset_name.
|
|
92
|
-
Defaults to None.
|
|
93
94
|
dataset_id (Optional[str], optional): The ID of the dataset to use.
|
|
94
95
|
Required if dataset_df and dataset_name are not provided. Defaults to None.
|
|
95
96
|
dataset_name (Optional[str], optional): The name of the dataset to use.
|
|
@@ -116,6 +117,7 @@ class ExperimentsClient:
|
|
|
116
117
|
# should work on server tech debt to remove this
|
|
117
118
|
dataset = self._datasets_api.datasets_get(dataset_id=dataset_id)
|
|
118
119
|
space_id = dataset.space_id
|
|
120
|
+
dataset_updated_at = getattr(dataset, "updated_at", None)
|
|
119
121
|
|
|
120
122
|
with ArizeFlightClient(
|
|
121
123
|
api_key=self._sdk_config.api_key,
|
|
@@ -152,10 +154,20 @@ class ExperimentsClient:
|
|
|
152
154
|
raise RuntimeError(msg)
|
|
153
155
|
experiment_id, trace_model_name = response
|
|
154
156
|
|
|
155
|
-
|
|
157
|
+
dataset_df = None
|
|
158
|
+
# try to load dataset from cache
|
|
159
|
+
if self._sdk_config.enable_caching:
|
|
160
|
+
dataset_df = load_cached_resource(
|
|
161
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
162
|
+
resource="dataset",
|
|
163
|
+
resource_id=dataset_id,
|
|
164
|
+
resource_updated_at=dataset_updated_at,
|
|
165
|
+
)
|
|
166
|
+
|
|
156
167
|
if dataset_df is None:
|
|
168
|
+
# download dataset
|
|
157
169
|
try:
|
|
158
|
-
|
|
170
|
+
dataset_df = flight_client.get_dataset_examples(
|
|
159
171
|
space_id=space_id,
|
|
160
172
|
dataset_id=dataset_id,
|
|
161
173
|
)
|
|
@@ -163,7 +175,7 @@ class ExperimentsClient:
|
|
|
163
175
|
msg = f"Error during request: {str(e)}"
|
|
164
176
|
logger.error(msg)
|
|
165
177
|
raise RuntimeError(msg) from e
|
|
166
|
-
if
|
|
178
|
+
if dataset_df is None:
|
|
167
179
|
# This should not happen with proper Flight client implementation,
|
|
168
180
|
# but we handle it defensively
|
|
169
181
|
msg = (
|
|
@@ -172,13 +184,21 @@ class ExperimentsClient:
|
|
|
172
184
|
logger.error(msg)
|
|
173
185
|
raise RuntimeError(msg)
|
|
174
186
|
|
|
175
|
-
if dataset_df
|
|
187
|
+
if dataset_df.empty:
|
|
176
188
|
raise ValueError(f"Dataset {dataset_id} is empty")
|
|
177
189
|
|
|
178
|
-
|
|
190
|
+
# cache dataset for future use
|
|
191
|
+
cache_resource(
|
|
192
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
193
|
+
resource="dataset",
|
|
194
|
+
resource_id=dataset_id,
|
|
195
|
+
resource_updated_at=dataset_updated_at,
|
|
196
|
+
resource_data=dataset_df,
|
|
197
|
+
)
|
|
198
|
+
|
|
179
199
|
if dry_run:
|
|
180
|
-
# only dry_run experiment on a subset (first
|
|
181
|
-
|
|
200
|
+
# only dry_run experiment on a subset (first N rows) of the dataset
|
|
201
|
+
dataset_df = dataset_df.head(dry_run_count)
|
|
182
202
|
|
|
183
203
|
# trace model and resource for the experiment
|
|
184
204
|
tracer, resource = _get_tracer_resource(
|
|
@@ -193,7 +213,7 @@ class ExperimentsClient:
|
|
|
193
213
|
output_df = run_experiment(
|
|
194
214
|
experiment_name=name,
|
|
195
215
|
experiment_id=experiment_id,
|
|
196
|
-
dataset=
|
|
216
|
+
dataset=dataset_df,
|
|
197
217
|
task=task,
|
|
198
218
|
tracer=tracer,
|
|
199
219
|
resource=resource,
|
|
@@ -204,7 +224,7 @@ class ExperimentsClient:
|
|
|
204
224
|
output_df = convert_default_columns_to_json_str(output_df)
|
|
205
225
|
output_df = convert_boolean_columns_to_str(output_df)
|
|
206
226
|
if dry_run:
|
|
207
|
-
return
|
|
227
|
+
return None, output_df
|
|
208
228
|
|
|
209
229
|
# Convert to Arrow table
|
|
210
230
|
try:
|
|
@@ -241,7 +261,10 @@ class ExperimentsClient:
|
|
|
241
261
|
logger.error(msg)
|
|
242
262
|
raise RuntimeError(msg)
|
|
243
263
|
|
|
244
|
-
|
|
264
|
+
experiment = self.get(
|
|
265
|
+
experiment_id=str(post_resp.experiment_id) # type: ignore
|
|
266
|
+
)
|
|
267
|
+
return experiment, output_df
|
|
245
268
|
|
|
246
269
|
def _create_experiment(
|
|
247
270
|
self,
|
|
@@ -352,6 +375,78 @@ class ExperimentsClient:
|
|
|
352
375
|
experiment_df=experiment_df,
|
|
353
376
|
)
|
|
354
377
|
|
|
378
|
+
def _list_runs(
|
|
379
|
+
self,
|
|
380
|
+
experiment_id: str,
|
|
381
|
+
limit: int = 100,
|
|
382
|
+
all: bool = False,
|
|
383
|
+
):
|
|
384
|
+
if not all:
|
|
385
|
+
return self._api.experiments_runs_list(
|
|
386
|
+
experiment_id=experiment_id,
|
|
387
|
+
limit=limit,
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
experiment = self.get(experiment_id=experiment_id)
|
|
391
|
+
experiment_updated_at = getattr(experiment, "updated_at", None)
|
|
392
|
+
# TODO(Kiko): Space ID should not be needed,
|
|
393
|
+
# should work on server tech debt to remove this
|
|
394
|
+
dataset = self._datasets_api.datasets_get(
|
|
395
|
+
dataset_id=experiment.dataset_id
|
|
396
|
+
)
|
|
397
|
+
space_id = dataset.space_id
|
|
398
|
+
|
|
399
|
+
experiment_df = None
|
|
400
|
+
# try to load dataset from cache
|
|
401
|
+
if self._sdk_config.enable_caching:
|
|
402
|
+
experiment_df = load_cached_resource(
|
|
403
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
404
|
+
resource="experiment",
|
|
405
|
+
resource_id=experiment_id,
|
|
406
|
+
resource_updated_at=experiment_updated_at,
|
|
407
|
+
)
|
|
408
|
+
if experiment_df is not None:
|
|
409
|
+
return models.ExperimentsRunsList200Response(
|
|
410
|
+
experimentRuns=experiment_df.to_dict(orient="records")
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
with ArizeFlightClient(
|
|
414
|
+
api_key=self._sdk_config.api_key,
|
|
415
|
+
host=self._sdk_config.flight_server_host,
|
|
416
|
+
port=self._sdk_config.flight_server_port,
|
|
417
|
+
scheme=self._sdk_config.flight_scheme,
|
|
418
|
+
request_verify=self._sdk_config.request_verify,
|
|
419
|
+
max_chunksize=self._sdk_config.pyarrow_max_chunksize,
|
|
420
|
+
) as flight_client:
|
|
421
|
+
try:
|
|
422
|
+
experiment_df = flight_client.get_experiment_runs(
|
|
423
|
+
space_id=space_id,
|
|
424
|
+
experiment_id=experiment_id,
|
|
425
|
+
)
|
|
426
|
+
except Exception as e:
|
|
427
|
+
msg = f"Error during request: {str(e)}"
|
|
428
|
+
logger.error(msg)
|
|
429
|
+
raise RuntimeError(msg) from e
|
|
430
|
+
if experiment_df is None:
|
|
431
|
+
# This should not happen with proper Flight client implementation,
|
|
432
|
+
# but we handle it defensively
|
|
433
|
+
msg = "No response received from flight server during request"
|
|
434
|
+
logger.error(msg)
|
|
435
|
+
raise RuntimeError(msg)
|
|
436
|
+
|
|
437
|
+
# cache dataset for future use
|
|
438
|
+
cache_resource(
|
|
439
|
+
cache_dir=self._sdk_config.arize_direcory,
|
|
440
|
+
resource="dataset",
|
|
441
|
+
resource_id=experiment_id,
|
|
442
|
+
resource_updated_at=experiment_updated_at,
|
|
443
|
+
resource_data=experiment_df,
|
|
444
|
+
)
|
|
445
|
+
|
|
446
|
+
return models.ExperimentsRunsList200Response(
|
|
447
|
+
experimentRuns=experiment_df.to_dict(orient="records")
|
|
448
|
+
)
|
|
449
|
+
|
|
355
450
|
def _create_experiment_via_flight(
|
|
356
451
|
self,
|
|
357
452
|
name: str,
|
|
@@ -463,3 +558,9 @@ def _get_tracer_resource(
|
|
|
463
558
|
trace.set_tracer_provider(tracer_provider)
|
|
464
559
|
|
|
465
560
|
return tracer_provider.get_tracer(__name__), resource
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _dataset_cache_key(dataset_id: str, dataset_updated_at: str | None) -> str:
|
|
564
|
+
# include updated_at if present to produce a new key when dataset changes
|
|
565
|
+
key_src = f"{dataset_id}:{dataset_updated_at or ''}"
|
|
566
|
+
return hashlib.sha256(key_src.encode("utf-8")).hexdigest()
|
arize/utils/cache.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_cached_resource(
|
|
12
|
+
cache_dir: str,
|
|
13
|
+
resource: str,
|
|
14
|
+
resource_id: str,
|
|
15
|
+
resource_updated_at: str | None,
|
|
16
|
+
format: str = "parquet",
|
|
17
|
+
) -> pd.DataFrame | None:
|
|
18
|
+
key = _get_cache_key(resource, resource_id, resource_updated_at)
|
|
19
|
+
filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
|
|
20
|
+
if not filepath.exists():
|
|
21
|
+
return None
|
|
22
|
+
try:
|
|
23
|
+
return pd.read_parquet(filepath)
|
|
24
|
+
except Exception as e:
|
|
25
|
+
logger.warning(f"Failed to load cached resource from {filepath}: {e}")
|
|
26
|
+
return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def cache_resource(
|
|
30
|
+
cache_dir: str,
|
|
31
|
+
resource: str,
|
|
32
|
+
resource_id: str,
|
|
33
|
+
resource_updated_at: str | None,
|
|
34
|
+
resource_data: pd.DataFrame,
|
|
35
|
+
format: str = "parquet",
|
|
36
|
+
) -> None:
|
|
37
|
+
key = _get_cache_key(resource, resource_id, resource_updated_at)
|
|
38
|
+
filepath = _get_abs_file_path(cache_dir, f"{key}.{format}", resource)
|
|
39
|
+
filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
40
|
+
resource_data.to_parquet(filepath, index=False)
|
|
41
|
+
logger.debug(f"Cached resource to {filepath}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_cache_key(
|
|
45
|
+
resource: str,
|
|
46
|
+
resource_id: str,
|
|
47
|
+
resource_updated_at: str | None,
|
|
48
|
+
) -> str:
|
|
49
|
+
# include updated_at if present to produce a new key when dataset changes
|
|
50
|
+
key = f"{resource}_{resource_id}"
|
|
51
|
+
if resource_updated_at:
|
|
52
|
+
key += f"_{resource_updated_at}"
|
|
53
|
+
return key
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _get_abs_file_path(
|
|
57
|
+
directory: str,
|
|
58
|
+
filename: str,
|
|
59
|
+
subdirectory: str | None = None,
|
|
60
|
+
) -> Path:
|
|
61
|
+
"""
|
|
62
|
+
Return an absolute path to a file located under `directory[/subdirectory]/filename`.
|
|
63
|
+
Expands '~' and resolves relative components.
|
|
64
|
+
"""
|
|
65
|
+
base = Path(directory).expanduser()
|
|
66
|
+
if subdirectory:
|
|
67
|
+
base = base / subdirectory
|
|
68
|
+
return (base / filename).resolve()
|
arize/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "8.0.
|
|
1
|
+
__version__ = "8.0.0a17"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: arize
|
|
3
|
-
Version: 8.0.
|
|
3
|
+
Version: 8.0.0a17
|
|
4
4
|
Summary: A helper library to interact with Arize AI APIs
|
|
5
5
|
Project-URL: Homepage, https://arize.com
|
|
6
6
|
Project-URL: Documentation, https://docs.arize.com/arize
|
|
@@ -99,11 +99,24 @@ Description-Content-Type: text/markdown
|
|
|
99
99
|
- [Operations on Datasets](#operations-on-datasets)
|
|
100
100
|
- [List Datasets](#list-datasets)
|
|
101
101
|
- [Create a Dataset](#create-a-dataset)
|
|
102
|
-
- [Get Dataset
|
|
102
|
+
- [Get Dataset](#get-dataset)
|
|
103
103
|
- [Delete a Dataset](#delete-a-dataset)
|
|
104
|
-
- [
|
|
105
|
-
- [
|
|
106
|
-
|
|
104
|
+
- [List Dataset Examples](#list-dataset-examples)
|
|
105
|
+
- [Operations on Experiments](#operations-on-experiments)
|
|
106
|
+
- [List Experiments](#list-experiments)
|
|
107
|
+
- [Run an Experiment](#run-an-experiment)
|
|
108
|
+
- [Create an Experiment](#create-an-experiment)
|
|
109
|
+
- [Get an Experiment](#get-an-experiment)
|
|
110
|
+
- [Delete an Experiment](#delete-an-experiment)
|
|
111
|
+
- [List Experiment runs](#list-experiment-runs)
|
|
112
|
+
- [SDK Configuration](#sdk-configuration)
|
|
113
|
+
- [Logging](#logging)
|
|
114
|
+
- [In Code](#in-code)
|
|
115
|
+
- [Via Environment Variables](#via-environment-variables)
|
|
116
|
+
- [Caching](#caching)
|
|
117
|
+
- [In Code](#in-code-1)
|
|
118
|
+
- [Via Environment Variables](#via-environment-variables-1)
|
|
119
|
+
- [Clean the cache](#clean-the-cache)
|
|
107
120
|
- [Community](#community)
|
|
108
121
|
|
|
109
122
|
# Overview
|
|
@@ -398,9 +411,9 @@ dataset_list = resp.datasets
|
|
|
398
411
|
# Get the response as a dictionary
|
|
399
412
|
resp_dict = resp.to_dict()
|
|
400
413
|
# Get the response in JSON format
|
|
401
|
-
|
|
414
|
+
resp_json = resp.to_json()
|
|
402
415
|
# Get the response as a pandas dataframe
|
|
403
|
-
|
|
416
|
+
resp_df = resp.to_df()
|
|
404
417
|
```
|
|
405
418
|
|
|
406
419
|
### Create a Dataset
|
|
@@ -430,9 +443,10 @@ If the number of examples (rows in dataframe, items in list) is too large, the c
|
|
|
430
443
|
|
|
431
444
|
```python
|
|
432
445
|
created_dataset = client.datasets.create(
|
|
433
|
-
|
|
446
|
+
space_id="<target-space-id>",
|
|
434
447
|
name="<your-dataset-name>", # Name must be unique within a space
|
|
435
448
|
examples=..., # List of dictionaries or pandas dataframe
|
|
449
|
+
# force_http=... # Optionally pass force_http to create datasets via HTTP instead of gRPC, defaults to False
|
|
436
450
|
)
|
|
437
451
|
```
|
|
438
452
|
|
|
@@ -445,8 +459,7 @@ dataset_dict = create_dataset.to_dict()
|
|
|
445
459
|
dataset_dict = create_dataset.to_json()
|
|
446
460
|
```
|
|
447
461
|
|
|
448
|
-
|
|
449
|
-
### Get Dataset by ID
|
|
462
|
+
### Get Dataset
|
|
450
463
|
|
|
451
464
|
To get a dataset by its ID use `client.datasets.get()`, you can optionally also pass the version ID of a particular version of interest of the dataset. The returned type is `Dataset`.
|
|
452
465
|
|
|
@@ -467,9 +480,167 @@ client.datasets.delete(
|
|
|
467
480
|
)
|
|
468
481
|
```
|
|
469
482
|
|
|
470
|
-
|
|
483
|
+
### List Dataset Examples
|
|
484
|
+
|
|
485
|
+
You can list the examples of a given dataset using `client.datasets.list_examples()` and passing the dataset ID and, optionally, the dataset version ID. You can specify the number of examples desired using the `limit` parameter. If you want a large number of examples, consider using the `all=True` parameter, which will make it so the SDK exports the data using Arrow Flight via gRPC, for increased performance.
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
resp = client.datasets.list_examples(
|
|
489
|
+
dataset_id="<your-dataset-id>",
|
|
490
|
+
dataset_version_id="<your-dataset-version-id>", # Optional, defaults to latest version
|
|
491
|
+
limit=... # number of desired examples. Defaults to 100
|
|
492
|
+
all=... # Whether or not to export all of the examples. Defaults to False
|
|
493
|
+
)
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
The response is an object of type `DatasetsExamplesList200Response`, and you can access the list of examples via its `examples` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
497
|
+
|
|
498
|
+
```python
|
|
499
|
+
# Get the list of examples from the response
|
|
500
|
+
examples_list = resp.examples
|
|
501
|
+
# Get the response as a dictionary
|
|
502
|
+
resp_dict = resp.to_dict()
|
|
503
|
+
# Get the response in JSON format
|
|
504
|
+
resp_json = resp.to_json()
|
|
505
|
+
# Get the response as a pandas dataframe
|
|
506
|
+
resp_df = resp.to_df()
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
## Operations on Experiments
|
|
510
|
+
|
|
511
|
+
### List Experiments
|
|
512
|
+
|
|
513
|
+
You can list all experiments that the user has access to using `client.experiments.list()`. You can use the `limit` parameter to specify the maximum number of experiments desired in the response and you can specify the `dataset_id` to target the list operation to a particular dataset.
|
|
514
|
+
|
|
515
|
+
```python
|
|
516
|
+
resp = client.experiments.list(
|
|
517
|
+
limit=... # Optional
|
|
518
|
+
dataset_id=... # Optional
|
|
519
|
+
)
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
The response is an object of type `ExperimentsList200Response`, and you can access the list of experiments via its `experiments` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
523
|
+
|
|
524
|
+
```python
|
|
525
|
+
# Get the list of experiments from the response
|
|
526
|
+
experiment_list = resp.experiments
|
|
527
|
+
# Get the response as a dictionary
|
|
528
|
+
resp_dict = resp.to_dict()
|
|
529
|
+
# Get the response in JSON format
|
|
530
|
+
resp_json = resp.to_json()
|
|
531
|
+
# Get the response as a pandas dataframe
|
|
532
|
+
resp_df = resp.to_df()
|
|
533
|
+
```
|
|
534
|
+
|
|
535
|
+
### Run an Experiment
|
|
536
|
+
|
|
537
|
+
You can run an experiment on a dataset using `client.experiments.run()` by defining a task, evaluators (optional), and passing the dataset id of the dataset you want to use, together with a name for the experiment. The function will download the entire dataset from Arize (unless cached, see caching section under "SDK Configuration"), execute the task to obtain an output, and perform evaluations (if evaluators were passed). The experiments will also be traced, and these traces will be visible in Arize. The experiment will be created and the data logged into Arize automatically. You can avoid logging to Arize by making `dry_run=True`. The function will return the `Experiment` object (or `None` if `dry_run=True`) together with the dataframe with the experiment data.
|
|
538
|
+
|
|
539
|
+
```python
|
|
540
|
+
experiment, experiment_df = client.experiments.run(
|
|
541
|
+
name="<name-your-experiment>",
|
|
542
|
+
dataset_id="<id-of-dataset-to-use>",
|
|
543
|
+
task=... # The task to be performed in the experiment.
|
|
544
|
+
evaluators=... # Optional: The evaluators to use in the experiment.
|
|
545
|
+
dry_run=..., # If True, the experiment result will not be uploaded to Arize. Defaults to False
|
|
546
|
+
dry_run_count=..., # Number of examples of the dataset to use in the dry run. Defaults to 10
|
|
547
|
+
concurrency=..., # The number of concurrent tasks to run. Defaults to 3.
|
|
548
|
+
set_global_tracer_provider=..., # If True, sets the global tracer provider for the experiment. Defaults to False
|
|
549
|
+
exit_on_error=..., # If True, the experiment will stop running on first occurrence of an error. Defaults to False
|
|
550
|
+
)
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
The `Experiment` object also provides convenience methods similar to `List***` objects:
|
|
554
|
+
|
|
555
|
+
```python
|
|
556
|
+
# Get the response as a dictionary
|
|
557
|
+
experiment_dict = create_experiment.to_dict()
|
|
558
|
+
# Get the response in JSON format
|
|
559
|
+
experiment_json = create_experiment.to_json()
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
### Create an Experiment
|
|
563
|
+
|
|
564
|
+
It is possible that you have run the experiment yourself without the above function, and hence you already have experiment data that you want to send to Arize. In this case, use the `client.experiments.create()` method by passing the runs data (we currently don't support creating an empty experiment). For instance, these are 2 rows of runs, as a list of dictionaries. You can also pass a pandas dataframe for the runs data.
|
|
565
|
+
|
|
566
|
+
> NOTE: If you don't have experiment data and want to run an experiment, see the `client.experiments.run()` section above.
|
|
567
|
+
|
|
568
|
+
```python
|
|
569
|
+
# TODO
|
|
570
|
+
runs = [
|
|
571
|
+
]
|
|
572
|
+
```
|
|
573
|
+
|
|
574
|
+
In addition, you must specify which columns are the `example_id` and the `result`, you can do so by using the `ExperimentTaskResultFieldNames`. Moreover, if you choose to pass evaluation data, you can indicate the evaluation columns using `EvaluationResultFieldNames`:
|
|
575
|
+
|
|
576
|
+
```python
|
|
577
|
+
# TODO
|
|
578
|
+
```
|
|
579
|
+
|
|
580
|
+
If the number of runs (rows in dataframe, items in list) is too large, the client SDK will try to send the data via Arrow Flight via gRPC for better performance. If you want to force the data transfer to HTTP you can use the `force_http` flag. The response is an `Experiment` object.
|
|
581
|
+
|
|
582
|
+
```python
|
|
583
|
+
created_experiment = client.experiments.create(
|
|
584
|
+
name="<your-experiment-name>", # Name must be unique within a dataset
|
|
585
|
+
dataset_id="<desired-dataset-id>",
|
|
586
|
+
experiment_runs=..., # List of dictionaries or pandas dataframe
|
|
587
|
+
task_fields=ExperimentTaskResultFieldNames(...),
|
|
588
|
+
evaluator_columns=... # Optional
|
|
589
|
+
# force_http=... # Optionally pass force_http to create experiments via HTTP instead of gRPC, defaults to False
|
|
590
|
+
)
|
|
591
|
+
```
|
|
592
|
+
|
|
593
|
+
### Get an Experiment
|
|
594
|
+
|
|
595
|
+
To get an experiment by its ID use `client.experiments.get()`. The returned type is `Experiment`.
|
|
596
|
+
|
|
597
|
+
```python
|
|
598
|
+
experiment = client.experiments.get(
|
|
599
|
+
experiment_id=... # The unique identifier of the experiment
|
|
600
|
+
|
|
601
|
+
)
|
|
602
|
+
```
|
|
603
|
+
|
|
604
|
+
### Delete an Experiment
|
|
605
|
+
|
|
606
|
+
To delete an experiment by its ID use `client.experiments.delete()`. The call returns `None` if successful deletion took place, error otherwise.
|
|
607
|
+
|
|
608
|
+
```python
|
|
609
|
+
client.experiments.delete(
|
|
610
|
+
experiment_id=... # The unique identifier of the experiment
|
|
611
|
+
)
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
### List Experiment runs
|
|
471
615
|
|
|
472
|
-
|
|
616
|
+
You can list the runs of a given experiment using `client.experiments.list_runs()` and passing the experiment ID. You can specify the number of runs desired using the `limit` parameter. If you want a large number of runs, consider using the `all=True` parameter, which will make it so the SDK exports the data using Arrow Flight via gRPC, for increased performance.
|
|
617
|
+
|
|
618
|
+
```python
|
|
619
|
+
resp = client.experiments.list_runs(
|
|
620
|
+
experiment_id="<your-experiment-id>",
|
|
621
|
+
limit=... # number of desired runs. Defaults to 100
|
|
622
|
+
all=... # Whether or not to export all of the runs. Defaults to False
|
|
623
|
+
)
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
The response is an object of type `ExperimentsRunsList200Response`, and you can access the list of runs via its `experiment_runs` attribute. In addition, you can transform the response object to a dictionary, to JSON format, or a pandas dataframe.
|
|
627
|
+
|
|
628
|
+
```python
|
|
629
|
+
# Get the list of runs from the response
|
|
630
|
+
run_list = resp.experiment_runs
|
|
631
|
+
# Get the response as a dictionary
|
|
632
|
+
resp_dict = resp.to_dict()
|
|
633
|
+
# Get the response in JSON format
|
|
634
|
+
resp_json = resp.to_json()
|
|
635
|
+
# Get the response as a pandas dataframe
|
|
636
|
+
resp_df = resp.to_df()
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
# SDK Configuration
|
|
640
|
+
|
|
641
|
+
## Logging
|
|
642
|
+
|
|
643
|
+
### In Code
|
|
473
644
|
|
|
474
645
|
You can use `configure_logging` to set up the logging behavior of the Arize package to your needs.
|
|
475
646
|
|
|
@@ -482,14 +653,14 @@ configure_logging(
|
|
|
482
653
|
)
|
|
483
654
|
```
|
|
484
655
|
|
|
485
|
-
|
|
656
|
+
### Via Environment Variables
|
|
486
657
|
|
|
487
658
|
Configure the same options as the section above, via:
|
|
488
659
|
|
|
489
660
|
```python
|
|
490
661
|
import os
|
|
491
662
|
|
|
492
|
-
#
|
|
663
|
+
# Whether or not you want to enable logging altogether
|
|
493
664
|
os.environ["ARIZE_LOG_ENABLE"] = "true"
|
|
494
665
|
# Set up the logging level
|
|
495
666
|
os.environ["ARIZE_LOG_LEVEL"] = "debug"
|
|
@@ -499,6 +670,38 @@ os.environ["ARIZE_LOG_STRUCTURED"] = "false"
|
|
|
499
670
|
|
|
500
671
|
The default behavior of Arize's logs is: enabled, `INFO` level, and not structured.
|
|
501
672
|
|
|
673
|
+
## Caching
|
|
674
|
+
|
|
675
|
+
When downloading big segments of data from Arize, such as a `Dataset` with all of its examples, the SDK will cache the file in `parquet` format under `~/.arize/datasets/dataset_<updated_at_timestamp>.parquet`.
|
|
676
|
+
|
|
677
|
+
### In Code
|
|
678
|
+
|
|
679
|
+
You can disable caching via the `enable_caching` parameter when instantiating the client, and also edit the "arize directory":
|
|
680
|
+
|
|
681
|
+
```python
|
|
682
|
+
client = ArizeClient(
|
|
683
|
+
enable_caching=False, # Optional parameter, defaults to True
|
|
684
|
+
arize_directory="my-desired-directory", # Optional parameter, defaults to ~/.arize
|
|
685
|
+
)
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
### Via Environment Variables
|
|
689
|
+
|
|
690
|
+
You can also configure the above via:
|
|
691
|
+
|
|
692
|
+
```python
|
|
693
|
+
import os
|
|
694
|
+
|
|
695
|
+
# Whether or not you want to enable caching
|
|
696
|
+
os.environ["ARIZE_ENABLE_CACHING"] = "true"
|
|
697
|
+
# Where you want the SDK to store the files
|
|
698
|
+
os.environ["ARIZE_DIRECTORY"] = "~/.arize"
|
|
699
|
+
```
|
|
700
|
+
|
|
701
|
+
### Clean the cache
|
|
702
|
+
|
|
703
|
+
To clean the cache you can directly `rm` the files or directory.
|
|
704
|
+
|
|
502
705
|
# Community
|
|
503
706
|
|
|
504
707
|
Join our community to connect with thousands of AI builders.
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
arize/__init__.py,sha256=
|
|
1
|
+
arize/__init__.py,sha256=G9wbTaZsccUIwntIriIIW74lS1-tHeG58Vt4XV1ZV9s,3002
|
|
2
2
|
arize/_lazy.py,sha256=1Lnm4l42t7W-m2JYCYD-S7ASBOIl0XJkBuli3Ei1VXA,2474
|
|
3
|
-
arize/client.py,sha256
|
|
4
|
-
arize/config.py,sha256=
|
|
3
|
+
arize/client.py,sha256=-SeZloT7qqWRtr1WXS5d2yn7gvpNYYyGE2yjGPvYi74,7236
|
|
4
|
+
arize/config.py,sha256=PDKUkJfGvTxX2NZ5FLxXz1YaXBOuAkyL5eW7kdbZc5A,7909
|
|
5
5
|
arize/logging.py,sha256=OahBaJRG-z5DPqWrj2_rbe2n0r4fMGOrXpxN_4M_i_w,7244
|
|
6
6
|
arize/types.py,sha256=z1yg5-brmTD4kVHDmmTVkYke53JpusXXeOOpdQw7rYg,69508
|
|
7
|
-
arize/version.py,sha256=
|
|
7
|
+
arize/version.py,sha256=dVbZUbQ1PraD-0qvMFzVVGSr1QRGrJYBgb-CUfl0LQc,25
|
|
8
8
|
arize/_exporter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
arize/_exporter/client.py,sha256=k3xS-2wx_UlB5toI5RKBoy1bi3ONIxh4KQy4A4a2Omc,15822
|
|
10
10
|
arize/_exporter/validation.py,sha256=6ROu5p7uaolxQ93lO_Eiwv9NVw_uyi3E5T--C5Klo5Q,1021
|
|
11
11
|
arize/_exporter/parsers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
12
|
arize/_exporter/parsers/tracing_data_parser.py,sha256=zVS-w8t1HJkz-AIC_JCdjPJ7gJXgFpfELfqNM_vK42E,5395
|
|
13
13
|
arize/_flight/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
-
arize/_flight/client.py,sha256=
|
|
14
|
+
arize/_flight/client.py,sha256=14dYkHM0Pi-GP1AeNPQX-RQ3uMmtwRwxoSmR7--1eW0,15499
|
|
15
15
|
arize/_flight/types.py,sha256=GB_4dQu2ElIrcDGAcqhG7oI4g-b0ZdSlbrQkf0TFzVE,194
|
|
16
16
|
arize/_generated/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
arize/_generated/api_client_README.md,sha256=OSAc24mxj4fZB7k0i8DIZ8uoXfn6hGjptO5om6ferRE,5632
|
|
@@ -55,14 +55,14 @@ arize/_generated/protocol/flight/ingest_pb2.py,sha256=-wC5rbLK4yjROQuXOU9c_gPwA4
|
|
|
55
55
|
arize/_generated/protocol/rec/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
56
56
|
arize/_generated/protocol/rec/public_pb2.py,sha256=vgP-yTSZLeomVwfIzcOo6t3i1mPCCNJGgd41ZkfLNng,79898
|
|
57
57
|
arize/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
-
arize/constants/config.py,sha256=
|
|
58
|
+
arize/constants/config.py,sha256=RvvMZrhbMSv3_Do1jKTVGyWt_Pwal82pIL4S9FH0XS4,1518
|
|
59
59
|
arize/constants/ml.py,sha256=X_vtKpt1AdhLoT2DWEyKDSXAVEuzjwGFacIbgUOpB3M,2358
|
|
60
60
|
arize/constants/model_mapping.json,sha256=OPE54rBATzmwRhx0tycsxnGae1jBhtqEmQqQvzleTSc,5725
|
|
61
61
|
arize/constants/openinference.py,sha256=3tVLyUz6ZvE8ht_ZLnndYXFhDjt_ibJbFeBM1PcxIbY,532
|
|
62
62
|
arize/constants/pyarrow.py,sha256=XUZQXQ-431fQYM2ZJy6xRwW4pfABPg7NZspQ5BXAxRc,24
|
|
63
63
|
arize/constants/spans.py,sha256=EfMgbEIK_2EUcvUY5BGnNAbS7bupBKePlI3j2L5T5CE,2532
|
|
64
64
|
arize/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
|
-
arize/datasets/client.py,sha256=
|
|
65
|
+
arize/datasets/client.py,sha256=g4qAWYkteDjcw8EgTdr4XBrtT0JYF7ewD8D-slNxAZ4,8970
|
|
66
66
|
arize/datasets/errors.py,sha256=9hmE7KyBWBSi4FkVQYsI3E-KPgzXaCZc681czNBhS-Q,1685
|
|
67
67
|
arize/datasets/validation.py,sha256=KT_X9bnEMxGbh2o9N3aXwgTMVOQPzz1AW-JyaKxcs48,1336
|
|
68
68
|
arize/embeddings/__init__.py,sha256=6_C8908W_qDixkoBJl1wapgmQCzI8TPLH207kzbYsFA,156
|
|
@@ -83,7 +83,7 @@ arize/exceptions/spaces.py,sha256=C1mbtbUx7bVFnGM7iJg03pttnd-jVl2dnFmO102wXrA,31
|
|
|
83
83
|
arize/exceptions/types.py,sha256=ALzH6S63zbHSno2n6Lp3lRf7Galo-HctrkkDU61fKBo,6050
|
|
84
84
|
arize/exceptions/values.py,sha256=aNAL4P9nN0LOtuHrIARBbty2V0ZtMgBsT1wyz1fB6Kk,18948
|
|
85
85
|
arize/experiments/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
86
|
-
arize/experiments/client.py,sha256=
|
|
86
|
+
arize/experiments/client.py,sha256=VXK2Dl8wOruvX6yLeHyhSMJ-hZIRh9AXdzrYNgxl4pM,22329
|
|
87
87
|
arize/experiments/functions.py,sha256=-6yAumc4ZZxoouEnKXkR8GxFqEFfDBCOOC3j6OAVt40,33833
|
|
88
88
|
arize/experiments/tracing.py,sha256=DGhJrJU2yUchMUVWPr_4PTqmM0VbSiNnRoV08hnN4nU,9660
|
|
89
89
|
arize/experiments/types.py,sha256=EEf0EdjldNX6Hg98bX0E9HtZeu__3Ofy0x9fDqrflAg,12752
|
|
@@ -133,13 +133,14 @@ arize/spans/validation/spans/spans_validation.py,sha256=p6IjbQMtOhotGBfw3axj7yMW
|
|
|
133
133
|
arize/spans/validation/spans/value_validation.py,sha256=H3qV96w6JQNCed_MxhWDas9Jf6vUj6RFabShcwf4jr4,19102
|
|
134
134
|
arize/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
135
135
|
arize/utils/arrow.py,sha256=6kbTY3mPL8oAk9C3sL-vE5dLuQ7bNU74qbRHcSbuIBg,5334
|
|
136
|
+
arize/utils/cache.py,sha256=5KP6D-Dru-HjB7hSwFttUf8B4veXNqK7wq82B4bfECU,1892
|
|
136
137
|
arize/utils/dataframe.py,sha256=I0FloPgNiqlKga32tMOvTE70598QA8Hhrgf-6zjYMAM,1120
|
|
137
138
|
arize/utils/openinference_conversion.py,sha256=i3QBngObcc-LrUWFe_pg9egrFs2pqqbFSncUA-wnqNE,1679
|
|
138
139
|
arize/utils/proto.py,sha256=RfdiXtq2cvIG1IV8W0jz2m-vdrA2CD8f542UUi6GLoY,381
|
|
139
140
|
arize/utils/size.py,sha256=uAM-bs7Jk7fIu6vjQ9khZuJZnpAmFvA3lTXiRT0aJS4,788
|
|
140
141
|
arize/utils/online_tasks/__init__.py,sha256=nDuTLUTYnZaWgyJoYR1P7O8ZKA-Nba7X6tJ9OislbWM,144
|
|
141
142
|
arize/utils/online_tasks/dataframe_preprocessor.py,sha256=YyeeeFu_FwCYImbYvBZvQIH_5TK2lHru8KSfqV893ps,8884
|
|
142
|
-
arize-8.0.
|
|
143
|
-
arize-8.0.
|
|
144
|
-
arize-8.0.
|
|
145
|
-
arize-8.0.
|
|
143
|
+
arize-8.0.0a17.dist-info/METADATA,sha256=FUSvD19Y91lZs32i1d3nDB1oM8Aqv38LTaL2LDlouyE,28471
|
|
144
|
+
arize-8.0.0a17.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
145
|
+
arize-8.0.0a17.dist-info/licenses/LICENSE.md,sha256=8vLN8Gms62NCBorxIv9MUvuK7myueb6_-dhXHPmm4H0,1479
|
|
146
|
+
arize-8.0.0a17.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|