arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. arize/__init__.py +9 -2
  2. arize/_client_factory.py +50 -0
  3. arize/_exporter/client.py +18 -17
  4. arize/_exporter/parsers/tracing_data_parser.py +9 -4
  5. arize/_exporter/validation.py +1 -1
  6. arize/_flight/client.py +37 -17
  7. arize/_generated/api_client/api/datasets_api.py +6 -6
  8. arize/_generated/api_client/api/experiments_api.py +6 -6
  9. arize/_generated/api_client/api/projects_api.py +3 -3
  10. arize/_lazy.py +61 -10
  11. arize/client.py +66 -50
  12. arize/config.py +175 -48
  13. arize/constants/config.py +1 -0
  14. arize/constants/ml.py +9 -16
  15. arize/constants/spans.py +5 -10
  16. arize/datasets/client.py +45 -28
  17. arize/datasets/errors.py +1 -1
  18. arize/datasets/validation.py +2 -2
  19. arize/embeddings/auto_generator.py +16 -9
  20. arize/embeddings/base_generators.py +15 -9
  21. arize/embeddings/cv_generators.py +2 -2
  22. arize/embeddings/errors.py +2 -2
  23. arize/embeddings/nlp_generators.py +8 -8
  24. arize/embeddings/tabular_generators.py +6 -6
  25. arize/exceptions/base.py +0 -52
  26. arize/exceptions/config.py +22 -0
  27. arize/exceptions/parameters.py +1 -330
  28. arize/exceptions/values.py +8 -5
  29. arize/experiments/__init__.py +4 -0
  30. arize/experiments/client.py +31 -18
  31. arize/experiments/evaluators/base.py +12 -9
  32. arize/experiments/evaluators/executors.py +16 -7
  33. arize/experiments/evaluators/rate_limiters.py +3 -1
  34. arize/experiments/evaluators/types.py +9 -7
  35. arize/experiments/evaluators/utils.py +7 -5
  36. arize/experiments/functions.py +128 -58
  37. arize/experiments/tracing.py +4 -1
  38. arize/experiments/types.py +34 -31
  39. arize/logging.py +54 -33
  40. arize/ml/batch_validation/errors.py +10 -1004
  41. arize/ml/batch_validation/validator.py +351 -291
  42. arize/ml/bounded_executor.py +25 -6
  43. arize/ml/casting.py +51 -33
  44. arize/ml/client.py +43 -35
  45. arize/ml/proto.py +21 -22
  46. arize/ml/stream_validation.py +64 -27
  47. arize/ml/surrogate_explainer/mimic.py +18 -10
  48. arize/ml/types.py +27 -67
  49. arize/pre_releases.py +10 -6
  50. arize/projects/client.py +9 -4
  51. arize/py.typed +0 -0
  52. arize/regions.py +11 -11
  53. arize/spans/client.py +125 -31
  54. arize/spans/columns.py +32 -36
  55. arize/spans/conversion.py +12 -11
  56. arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
  57. arize/spans/validation/annotations/value_validation.py +11 -14
  58. arize/spans/validation/common/argument_validation.py +3 -3
  59. arize/spans/validation/common/dataframe_form_validation.py +7 -7
  60. arize/spans/validation/common/value_validation.py +11 -14
  61. arize/spans/validation/evals/dataframe_form_validation.py +4 -4
  62. arize/spans/validation/evals/evals_validation.py +6 -6
  63. arize/spans/validation/evals/value_validation.py +1 -1
  64. arize/spans/validation/metadata/argument_validation.py +1 -1
  65. arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
  66. arize/spans/validation/metadata/value_validation.py +23 -1
  67. arize/spans/validation/spans/dataframe_form_validation.py +2 -2
  68. arize/spans/validation/spans/spans_validation.py +6 -6
  69. arize/utils/arrow.py +38 -2
  70. arize/utils/cache.py +2 -2
  71. arize/utils/dataframe.py +4 -4
  72. arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
  73. arize/utils/openinference_conversion.py +10 -10
  74. arize/utils/proto.py +0 -1
  75. arize/utils/types.py +6 -6
  76. arize/version.py +1 -1
  77. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
  78. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
  79. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
  80. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
  81. {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
--- a/arize/experiments/evaluators/executors.py
+++ b/arize/experiments/evaluators/executors.py
@@ -77,7 +77,7 @@ class Executor(Protocol):
 
     def run(
         self, inputs: Sequence[Any]
-    ) -> tuple[list[object], list[ExecutionDetails]]:
+    ) -> tuple[list[Unset | object], list[ExecutionDetails]]:
         """Execute the generation function on all inputs and return outputs with execution details."""
         ...
 
@@ -94,7 +94,7 @@ class AsyncExecutor(Executor):
 
         concurrency (int, optional): The number of concurrent consumers. Defaults to 3.
 
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar.
+        tqdm_bar_format (str | :obj:`None`, optional): The format string for the progress bar.
            Defaults to None.
 
        max_retries (int, optional): The maximum number of times to retry on exceptions.
@@ -119,6 +119,7 @@ class AsyncExecutor(Executor):
         exit_on_error: bool = True,
         fallback_return_value: Unset | object = _unset,
         termination_signal: signal.Signals = signal.SIGINT,
+        timeout: int = 120,
     ) -> None:
         """Initialize the async executor with configuration parameters.
 
@@ -130,6 +131,7 @@ class AsyncExecutor(Executor):
             exit_on_error: Whether to exit on first error.
             fallback_return_value: Value to return when execution fails.
             termination_signal: Signal to handle for graceful termination.
+            timeout: Timeout for each task in seconds.
         """
         self.generate = generation_fn
         self.fallback_return_value = fallback_return_value
@@ -139,6 +141,7 @@ class AsyncExecutor(Executor):
         self.exit_on_error = exit_on_error
         self.base_priority = 0
         self.termination_signal = termination_signal
+        self.timeout = timeout
 
     async def producer(
         self,
@@ -195,7 +198,7 @@ class AsyncExecutor(Executor):
             )
             done, _pending = await asyncio.wait(
                 [generate_task, termination_event_watcher],
-                timeout=120,
+                timeout=self.timeout,
                 return_when=asyncio.FIRST_COMPLETED,
             )
 
@@ -252,7 +255,7 @@ class AsyncExecutor(Executor):
 
     async def execute(
         self, inputs: Sequence[Any]
-    ) -> tuple[list[object], list[ExecutionDetails]]:
+    ) -> tuple[list[Unset | object], list[ExecutionDetails]]:
         """Execute all inputs asynchronously using producer-consumer pattern."""
         termination_event = asyncio.Event()
 
@@ -329,7 +332,7 @@ class AsyncExecutor(Executor):
 
     def run(
         self, inputs: Sequence[Any]
-    ) -> tuple[list[object], list[ExecutionDetails]]:
+    ) -> tuple[list[Unset | object], list[ExecutionDetails]]:
         """Execute all inputs asynchronously and return outputs with execution details."""
         return asyncio.run(self.execute(inputs))
 
@@ -341,7 +344,7 @@ class SyncExecutor(Executor):
         generation_fn (Callable[[object], Any]): The generation function that takes an input and
             returns an output.
 
-        tqdm_bar_format (Optional[str], optional): The format string for the progress bar. Defaults
+        tqdm_bar_format (str | :obj:`None`, optional): The format string for the progress bar. Defaults
             to None.
 
         max_retries (int, optional): The maximum number of times to retry on exceptions. Defaults to
@@ -403,7 +406,9 @@ class SyncExecutor(Executor):
         else:
             yield
 
-    def run(self, inputs: Sequence[Any]) -> tuple[list[object], list[object]]:
+    def run(
+        self, inputs: Sequence[Any]
+    ) -> tuple[list[Unset | object], list[ExecutionDetails]]:
         """Execute all inputs synchronously and return outputs with execution details."""
         with self._executor_signal_handling(self.termination_signal):
             outputs = [self.fallback_return_value] * len(inputs)
@@ -460,6 +465,7 @@ def get_executor_on_sync_context(
     max_retries: int = 10,
     exit_on_error: bool = True,
     fallback_return_value: Unset | object = _unset,
+    timeout: int = 120,
 ) -> Executor:
     """Get an appropriate executor based on the current threading context.
 
@@ -475,6 +481,7 @@ def get_executor_on_sync_context(
         max_retries: Maximum number of retry attempts. Defaults to 10.
         exit_on_error: Whether to exit on first error. Defaults to True.
         fallback_return_value: Value to return on failure. Defaults to unset.
+        timeout: Timeout for each task in seconds. Defaults to 120.
 
     Returns:
         An Executor instance configured for the current context.
@@ -513,6 +520,7 @@ def get_executor_on_sync_context(
             max_retries=max_retries,
             exit_on_error=exit_on_error,
             fallback_return_value=fallback_return_value,
+            timeout=timeout,
         )
         logger.warning(
             "🐌!! If running inside a notebook, patching the event loop with "
@@ -533,6 +541,7 @@ def get_executor_on_sync_context(
             max_retries=max_retries,
             exit_on_error=exit_on_error,
             fallback_return_value=fallback_return_value,
+            timeout=timeout,
         )
 
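These executors.py hunks make the previously hard-coded 120-second asyncio.wait deadline configurable end to end. A minimal sketch of the new knob against the b4 signature shown above, assuming hypothetical my_sync_task/my_async_task callables and an inputs sequence:

    # Hedged sketch; in b1 the per-task timeout was fixed at 120 seconds
    # inside AsyncExecutor, with no way to override it from callers.
    executor = get_executor_on_sync_context(
        sync_fn=my_sync_task,
        async_fn=my_async_task,
        max_retries=0,
        exit_on_error=False,
        fallback_return_value=None,
        timeout=300,  # seconds per task; defaults to 120
    )
    outputs, details = executor.run(inputs)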
--- a/arize/experiments/evaluators/rate_limiters.py
+++ b/arize/experiments/evaluators/rate_limiters.py
@@ -276,7 +276,9 @@ class RateLimiter:
         """Apply rate limiting to an asynchronous function."""
 
         @wraps(fn)
-        async def wrapper(*args: object, **kwargs: object) -> GenericType:
+        async def wrapper(
+            *args: ParameterSpec.args, **kwargs: ParameterSpec.kwargs
+        ) -> GenericType:
             self._initialize_async_primitives()
             if self._rate_limit_handling_lock is None or not isinstance(
                 self._rate_limit_handling_lock, asyncio.Lock
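The new ParameterSpec.args/ParameterSpec.kwargs annotations indicate the decorator is now typed with a ParamSpec so the wrapped coroutine's signature survives type checking; ParameterSpec and GenericType appear to be the module's own ParamSpec/TypeVar names. A standalone sketch of the standard pattern, with illustrative P/R names rather than Arize's exact code:

    from collections.abc import Awaitable, Callable
    from functools import wraps
    from typing import ParamSpec, TypeVar

    P = ParamSpec("P")
    R = TypeVar("R")

    def rate_limited(fn: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[R]]:
        @wraps(fn)
        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # acquire a rate-limit token here, then delegate unchanged
            return await fn(*args, **kwargs)
        return wrapper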
--- a/arize/experiments/evaluators/types.py
+++ b/arize/experiments/evaluators/types.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -60,10 +60,12 @@ class EvaluationResult:
        if not obj:
            return None
        return cls(
-            score=obj.get("score"),
-            label=obj.get("label"),
-            explanation=obj.get("explanation"),
-            metadata=obj.get("metadata") or {},
+            score=cast("float | None", obj.get("score")),
+            label=cast("str | None", obj.get("label")),
+            explanation=cast("str | None", obj.get("explanation")),
+            metadata=cast(
+                "Mapping[str, JSONSerializable]", obj.get("metadata") or {}
+            ),
        )
 
    def __post_init__(self) -> None:
@@ -94,14 +96,14 @@ EvaluatorOutput = (
 
 @dataclass
 class EvaluationResultFieldNames:
-    """Column names for mapping evaluation results in a DataFrame.
+    """Column names for mapping evaluation results in a :class:`pandas.DataFrame`.
 
     Args:
         score: Optional name of column containing evaluation scores
         label: Optional name of column containing evaluation labels
         explanation: Optional name of column containing evaluation explanations
         metadata: Optional mapping of metadata keys to column names. If a column name
-            is None or empty string, the metadata key will be used as the column name.
+            is :obj:`None` or empty string, the metadata key will be used as the column name.
 
     Examples:
         >>> # Basic usage with score and label columns
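Worth noting: typing.cast is a runtime no-op, so these types.py changes only document the expected shapes for static checkers; behavior is unchanged. A self-contained illustration with made-up values:

    from typing import cast

    obj: dict[str, object] = {"score": 0.87, "label": "relevant"}
    score = cast("float | None", obj.get("score"))  # checker now sees float | None
    assert score == 0.87  # cast returned the value untouched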
--- a/arize/experiments/evaluators/utils.py
+++ b/arize/experiments/evaluators/utils.py
@@ -2,8 +2,8 @@
 
 import functools
 import inspect
-from collections.abc import Callable
-from typing import TYPE_CHECKING
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any
 
 from tqdm.auto import tqdm
 
@@ -154,10 +154,10 @@ def _wrap_coroutine_evaluation_function(
     name: str,
     sig: inspect.Signature,
     convert_to_score: Callable[[object], EvaluationResult],
-) -> Callable[[Callable[..., object]], "Evaluator"]:
+) -> Callable[[Callable[..., Awaitable[object]]], "Evaluator"]:
     from ..evaluators.base import Evaluator
 
-    def wrapper(func: Callable[..., object]) -> "Evaluator":
+    def wrapper(func: Callable[..., Awaitable[object]]) -> "Evaluator":
         class AsyncEvaluator(Evaluator):
             def __init__(self) -> None:
                 self._name = name
@@ -224,9 +224,11 @@ def _default_eval_scorer(result: object) -> EvaluationResult:
     raise ValueError(f"Unsupported evaluation result type: {type(result)}")
 
 
-def printif(condition: bool, *args: object, **kwargs: object) -> None:
+def printif(condition: bool, *args: Any, **kwargs: Any) -> None:  # noqa: ANN401
     """Print to tqdm output if the condition is true.
 
+    Note: *args/**kwargs use Any for proper pass-through to tqdm.write().
+
     Args:
         condition: Whether to print the message.
         *args: Positional arguments to pass to tqdm.write.
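The switch to Callable[..., Awaitable[object]] captures that calling an async def function yields a coroutine to be awaited, not a plain value; the old Callable[..., object] accepted the same functions but lost that fact. A small self-contained illustration:

    import asyncio
    from collections.abc import Awaitable, Callable

    async def grade(answer: str) -> float:
        return 1.0 if answer else 0.0

    # Type-checks: calling `grade` returns a coroutine, which is an Awaitable.
    fn: Callable[..., Awaitable[object]] = grade
    print(asyncio.run(grade("yes")))  # 1.0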
--- a/arize/experiments/functions.py
+++ b/arize/experiments/functions.py
@@ -7,7 +7,14 @@ import json
 import logging
 import traceback
 from binascii import hexlify
-from collections.abc import Awaitable, Callable, Mapping, Sequence
+from collections.abc import (
+    Awaitable,
+    Callable,
+    Coroutine,
+    Iterable,
+    Mapping,
+    Sequence,
+)
 from contextlib import ExitStack
 from copy import deepcopy
 from datetime import date, datetime, time, timedelta, timezone
@@ -34,7 +41,7 @@ from openinference.semconv.trace import (
 )
 from opentelemetry.context import Context
 from opentelemetry.sdk.resources import Resource
-from opentelemetry.trace import Status, StatusCode, Tracer
+from opentelemetry.trace import NoOpTracer, Status, StatusCode, Tracer
 
 if TYPE_CHECKING:
     from opentelemetry.sdk.trace import Span
@@ -48,6 +55,7 @@ from arize.experiments.evaluators.types import (
     EvaluationResult,
     EvaluationResultFieldNames,
     EvaluatorName,
+    JSONSerializable,
 )
 from arize.experiments.evaluators.utils import create_evaluator
 from arize.experiments.tracing import capture_spans, flatten
@@ -64,6 +72,9 @@ RateLimitErrors: TypeAlias = type[BaseException] | Sequence[type[BaseException]]
 
 logger = logging.getLogger(__name__)
 
+# Module-level singleton for no-op tracing
+_NOOP_TRACER = NoOpTracer()
+
 
 def run_experiment(
     experiment_name: str,
@@ -76,23 +87,25 @@ def run_experiment(
     evaluators: Evaluators | None = None,
     concurrency: int = 3,
     exit_on_error: bool = False,
+    timeout: int = 120,
 ) -> pd.DataFrame:
     """Run an experiment on a dataset.
 
     Args:
         experiment_name (str): The name for the experiment.
         experiment_id (str): The ID for the experiment.
-        dataset (pd.DataFrame): The dataset to run the experiment on.
+        dataset (:class:`pandas.DataFrame`): The dataset to run the experiment on.
         task (ExperimentTask): The task to be executed on the dataset.
         tracer (Tracer): Tracer for tracing the experiment.
         resource (Resource): The resource for tracing the experiment.
-        rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
-        evaluators (Optional[Evaluators]): Optional evaluators to assess the task.
+        rate_limit_errors (RateLimitErrors | :obj:`None`): Optional rate limit errors.
+        evaluators (Evaluators | :obj:`None`): Optional evaluators to assess the task.
         concurrency (int): The number of concurrent tasks to run. Default is 3.
         exit_on_error (bool): Whether to exit on error. Default is False.
+        timeout (int): The timeout for each task execution in seconds. Default is 120.
 
     Returns:
-        pd.DataFrame: The results of the experiment.
+        :class:`pandas.DataFrame`: The results of the experiment.
     """
     task_signature = inspect.signature(task)
     _validate_task_signature(task_signature)
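A hedged sketch of a caller using the b4 signature shown above; the dataset, task, tracer, and resource objects are hypothetical placeholders:

    results_df = run_experiment(
        experiment_name="my-experiment",
        experiment_id="exp-123",
        dataset=dataset_df,
        task=my_task,
        tracer=tracer,
        resource=resource,
        concurrency=3,
        timeout=300,  # new in b4: per-task timeout in seconds, default 120
    )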
@@ -114,11 +127,12 @@
         error: BaseException | None = None
         status = Status(StatusCode.OK)
         with ExitStack() as stack:
-            span: Span = stack.enter_context(
+            # Type ignore: OpenTelemetry interface vs implementation type mismatch
+            span: Span = stack.enter_context(  # type: ignore[assignment]
                 cm=tracer.start_as_current_span(
                     name=root_span_name, context=Context()
                 )
-            )  # type:ignore
+            )
             stack.enter_context(capture_spans(resource))
             span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
             try:
@@ -144,9 +158,12 @@
                 raise TypeError(sync_error_message)
             output = _output
 
-            output = jsonify(output)
+            # Type ignore: jsonify returns object but runtime result is JSONSerializable
+            output = jsonify(output)  # type: ignore[assignment]
             if example.input:
-                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore
+                # OpenTelemetry type hints are restrictive, but Arize's tracing layer
+                # accepts JSON-serializable structures which are auto-serialized
+                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore[arg-type]
             else:
                 span.set_attribute(
                     INPUT_VALUE,
@@ -185,9 +202,9 @@
                 else datetime.now(tz=timezone.utc)
             ),
             dataset_example_id=example.id,
-            output=output,  # type:ignore
+            output=output,
             error=repr(error) if error else None,
-            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore
+            trace_id=_str_trace_id(span.get_span_context().trace_id),
         )
 
     async def async_run_experiment(example: Example) -> ExperimentRun:
@@ -195,11 +212,12 @@
         error: BaseException | None = None
         status = Status(StatusCode.OK)
         with ExitStack() as stack:
-            span: Span = stack.enter_context(
+            # Type ignore: OpenTelemetry interface vs implementation type mismatch
+            span: Span = stack.enter_context(  # type: ignore[assignment]
                 cm=tracer.start_as_current_span(
                     name=root_span_name, context=Context()
                 )
-            )  # type:ignore
+            )
             stack.enter_context(capture_spans(resource))
             span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
             try:
@@ -218,9 +236,12 @@
                 )
                 error = exc
                 _print_experiment_error(exc, example_id=example.id, kind="task")
-            output = jsonify(output)
+            # Type ignore: jsonify returns object but runtime result is JSONSerializable
+            output = jsonify(output)  # type: ignore[assignment]
             if example.input:
-                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore
+                # OpenTelemetry type hints are restrictive, but Arize's tracing layer
+                # accepts JSON-serializable structures which are auto-serialized
+                span.set_attribute(INPUT_VALUE, example.input)  # type: ignore[arg-type]
             else:
                 span.set_attribute(
                     INPUT_VALUE,
@@ -259,14 +280,14 @@
                 else datetime.now(tz=timezone.utc)
             ),
             dataset_example_id=example.id,
-            output=output,  # type: ignore
+            output=output,
             error=repr(error) if error else None,
-            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type: ignore
+            trace_id=_str_trace_id(span.get_span_context().trace_id),
         )
 
     _errors: tuple[type[BaseException], ...]
     if not isinstance(rate_limit_errors, Sequence):
-        _errors = (rate_limit_errors,)  # type: ignore
+        _errors = (rate_limit_errors,) if rate_limit_errors is not None else ()
     else:
         _errors = tuple(filter(None, rate_limit_errors))
     rate_limiters = [RateLimiter(rate_limit_error=rle) for rle in _errors]
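The last hunk above also fixes a subtle edge case: in b1, passing rate_limit_errors=None produced _errors = (None,) and hence a RateLimiter(rate_limit_error=None); b4 normalizes None to an empty tuple. A standalone sketch of the new behavior:

    from collections.abc import Sequence

    def normalize(
        errs: type[BaseException] | Sequence[type[BaseException]] | None,
    ) -> tuple[type[BaseException], ...]:
        if not isinstance(errs, Sequence):
            return (errs,) if errs is not None else ()  # b1 returned (None,) here
        return tuple(filter(None, errs))

    assert normalize(None) == ()
    assert normalize(TimeoutError) == (TimeoutError,)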
@@ -282,30 +303,43 @@
     )
 
     executor = get_executor_on_sync_context(
-        sync_fn=rate_limited_sync_run_experiment,
-        async_fn=rate_limited_async_run_experiment,
+        sync_fn=cast(
+            "Callable[[object], Any]", rate_limited_sync_run_experiment
+        ),
+        async_fn=cast(
+            "Callable[[object], Coroutine[Any, Any, Any]]",
+            rate_limited_async_run_experiment,
+        ),
         max_retries=0,
         exit_on_error=exit_on_error,
         fallback_return_value=None,
         tqdm_bar_format=get_tqdm_progress_bar_formatter("running tasks"),
         concurrency=concurrency,
+        timeout=timeout,
     )
 
     runs, _ = executor.run(examples)
-    task_summary = _TaskSummary.from_task_runs(len(dataset), runs)
+    task_summary = _TaskSummary.from_task_runs(
+        len(dataset), cast("list[ExperimentRun | None]", runs)
+    )
 
     if exit_on_error and (None in runs):
         # When exit_on_error is True, the result of a failed task execution is None
         # If any task execution failed, raise an error to exit early
         raise RuntimeError("An error occurred during execution of tasks.")
 
+    # Filter out None values before accessing attributes
+    runs_filtered = [
+        r for r in cast("list[ExperimentRun | None]", runs) if r is not None
+    ]
+
     out_df = pd.DataFrame()
-    out_df["id"] = [run.id for run in runs]
-    out_df["example_id"] = [run.dataset_example_id for run in runs]
-    out_df["result"] = [run.output for run in runs]
-    out_df["result.trace.id"] = [run.trace_id for run in runs]
+    out_df["id"] = [run.id for run in runs_filtered]
+    out_df["example_id"] = [run.dataset_example_id for run in runs_filtered]
+    out_df["result"] = [run.output for run in runs_filtered]  # type: ignore[assignment]
+    out_df["result.trace.id"] = [run.trace_id for run in runs_filtered]
     out_df["result.trace.timestamp"] = [
-        int(run.start_time.timestamp() * 1e3) for run in runs
+        int(run.start_time.timestamp() * 1e3) for run in runs_filtered
     ]
     out_df.set_index("id", inplace=True, drop=False)
     logger.info(f"✅ Task runs completed.\n{task_summary}")
@@ -314,13 +348,14 @@
         eval_results = evaluate_experiment(
             experiment_name=experiment_name,
             examples=examples,
-            experiment_results=runs,
+            experiment_results=cast("Sequence[ExperimentRun]", runs),
             evaluators=evaluators,
             rate_limit_errors=rate_limit_errors,
             concurrency=concurrency,
             tracer=tracer,
             resource=resource,
             exit_on_error=exit_on_error,
+            timeout=timeout,
         )
 
         if exit_on_error and (None in eval_results):
@@ -329,7 +364,7 @@
             )
 
         # group evaluation results by name
-        eval_results_by_name = {}
+        eval_results_by_name: dict[str, list[ExperimentEvaluationRun]] = {}
         for r in eval_results:
             if r is None:
                 continue
@@ -351,7 +386,8 @@
             }
 
             for attr, getter in eval_data.items():
-                out_df[f"eval.{eval_name}.{attr}"] = out_df.index.map(
+                # Type ignore: pandas DataFrame column assignment type is overly restrictive
+                out_df[f"eval.{eval_name}.{attr}"] = out_df.index.map(  # type: ignore[assignment]
                    {r.experiment_run_id: getter(r) for r in eval_res}
                )
            out_df = _add_metadata_to_output_df(out_df, eval_res, eval_name)
@@ -368,9 +404,10 @@ def evaluate_experiment(
     evaluators: Evaluators | None = None,
     rate_limit_errors: RateLimitErrors | None = None,
     concurrency: int = 3,
-    tracer: Tracer | None = None,
+    tracer: Tracer = _NOOP_TRACER,
     resource: Resource | None = None,
     exit_on_error: bool = False,
+    timeout: int = 120,
 ) -> list[ExperimentEvaluationRun]:
     """Evaluate the results of an experiment using the provided evaluators.
 
@@ -379,11 +416,12 @@
         examples (Sequence[Example]): The examples to evaluate.
         experiment_results (Sequence[ExperimentRun]): The results of the experiment.
         evaluators (Evaluators): The evaluators to use for assessment.
-        rate_limit_errors (Optional[RateLimitErrors]): Optional rate limit errors.
+        rate_limit_errors (RateLimitErrors | :obj:`None`): Optional rate limit errors.
         concurrency (int): The number of concurrent tasks to run. Default is 3.
-        tracer (Optional[Tracer]): Optional tracer for tracing the evaluation.
-        resource (Optional[Resource]): Optional resource for the evaluation.
+        tracer (Tracer): Tracer for tracing the evaluation. Defaults to NoOpTracer().
+        resource (Resource | :obj:`None`): Optional resource for the evaluation.
         exit_on_error (bool): Whether to exit on error. Default is False.
+        timeout (int): The timeout for each evaluation in seconds. Default is 120.
 
     Returns:
         List[ExperimentEvaluationRun]: The evaluation results.
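Because tracer now defaults to the module-level _NOOP_TRACER, callers can omit tracing entirely. A hedged sketch with hypothetical examples/runs/evaluator objects:

    eval_runs = evaluate_experiment(
        experiment_name="my-experiment",
        examples=examples,
        experiment_results=runs,
        evaluators=[my_evaluator],
        timeout=300,  # new in b4, default 120
        # tracer omitted: spans go to a NoOpTracer, so the None-handling
        # and type: ignore workarounds removed below are no longer needed
    )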
@@ -419,12 +457,16 @@
         status = Status(StatusCode.OK)
         root_span_name = f"Evaluation: {evaluator.name}"
         with ExitStack() as stack:
-            span: Span = stack.enter_context(
-                tracer.start_as_current_span(  # type:ignore
-                    name=root_span_name, context=Context()
-                )
+            span: Span = cast(
+                "Span",
+                stack.enter_context(
+                    tracer.start_as_current_span(
+                        name=root_span_name, context=Context()
+                    )
+                ),
             )
-            stack.enter_context(capture_spans(resource))  # type:ignore
+            if resource is not None:
+                stack.enter_context(capture_spans(resource))
             span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
             try:
                 result = evaluator.evaluate(
@@ -450,7 +492,15 @@
                 )
             if result:
                 span.set_attributes(
-                    dict(flatten(jsonify(result), recurse_on_sequence=True))
+                    dict(
+                        flatten(
+                            cast(
+                                "Mapping[str, Any] | Iterable[Any]",
+                                jsonify(result),
+                            ),
+                            recurse_on_sequence=True,
+                        )
+                    )
                 )
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
@@ -467,7 +517,7 @@
             annotator_kind=evaluator.kind,
             error=repr(error) if error else None,
             result=result,
-            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type:ignore
+            trace_id=_str_trace_id(span.get_span_context().trace_id),
         )
 
     async def async_eval_run(
@@ -479,12 +529,16 @@
         status = Status(StatusCode.OK)
         root_span_name = f"Evaluation: {evaluator.name}"
         with ExitStack() as stack:
-            span: Span = stack.enter_context(
-                tracer.start_as_current_span(  # type:ignore
-                    name=root_span_name, context=Context()
-                )
+            span: Span = cast(
+                "Span",
+                stack.enter_context(
+                    tracer.start_as_current_span(
+                        name=root_span_name, context=Context()
+                    )
+                ),
             )
-            stack.enter_context(capture_spans(resource))  # type:ignore
+            if resource is not None:
+                stack.enter_context(capture_spans(resource))
             span.set_attribute(METADATA, json.dumps(md, ensure_ascii=False))
             try:
                 result = await evaluator.async_evaluate(
@@ -510,7 +564,15 @@
                 )
             if result:
                 span.set_attributes(
-                    dict(flatten(jsonify(result), recurse_on_sequence=True))
+                    dict(
+                        flatten(
+                            cast(
+                                "Mapping[str, Any] | Iterable[Any]",
+                                jsonify(result),
+                            ),
+                            recurse_on_sequence=True,
+                        )
+                    )
                 )
             span.set_attribute(OPENINFERENCE_SPAN_KIND, root_span_kind)
             span.set_status(status)
@@ -526,7 +588,7 @@
             annotator_kind=evaluator.kind,
             error=repr(error) if error else None,
             result=result,
-            trace_id=_str_trace_id(span.get_span_context().trace_id),  # type:ignore
+            trace_id=_str_trace_id(span.get_span_context().trace_id),
         )
 
     _errors: tuple[type[BaseException], ...]
@@ -547,8 +609,11 @@
     )
 
     executor = get_executor_on_sync_context(
-        rate_limited_sync_evaluate_run,
-        rate_limited_async_evaluate_run,
+        cast("Callable[[object], Any]", rate_limited_sync_evaluate_run),
+        cast(
+            "Callable[[object], Coroutine[Any, Any, Any]]",
+            rate_limited_async_evaluate_run,
+        ),
         max_retries=0,
         exit_on_error=exit_on_error,
         fallback_return_value=None,
@@ -556,16 +621,18 @@
             "running experiment evaluations"
         ),
         concurrency=concurrency,
+        timeout=timeout,
     )
     eval_runs, _ = executor.run(evaluation_input)
-    return eval_runs
+    # Cast: run returns list[Unset | object], but sync/async_eval_run guarantee ExperimentEvaluationRun
+    return cast("list[ExperimentEvaluationRun]", eval_runs)
 
 
 def _add_metadata_to_output_df(
     output_df: pd.DataFrame,
     eval_runs: list[ExperimentEvaluationRun],
     evaluator_name: str,
-) -> object:
+) -> pd.DataFrame:
     for eval_run in eval_runs:
         if eval_run.result is None:
             continue
@@ -596,7 +663,9 @@ def _dataframe_to_examples(dataset: pd.DataFrame) -> list[Example]:
     examples = []
 
     for _, row in dataset.iterrows():
-        example = Example(dataset_row=row.to_dict())
+        example = Example(
+            dataset_row=cast("Mapping[str, JSONSerializable]", row.to_dict())
+        )
         examples.append(example)
     return examples
 
@@ -763,7 +832,8 @@ def get_result_attr(r: object, attr: str, default: object = None) -> object:
     Returns:
         The attribute value if found, otherwise the default value.
     """
-    return getattr(r.result, attr, default) if r.result else default
+    # Type ignore: r typed as object but expected to have result attribute at runtime
+    return getattr(r.result, attr, default) if r.result else default  # type: ignore[attr-defined]
 
 
 def transform_to_experiment_format(
@@ -771,16 +841,16 @@ def transform_to_experiment_format(
     task_fields: ExperimentTaskFieldNames,
     evaluator_fields: dict[str, EvaluationResultFieldNames] | None = None,
 ) -> pd.DataFrame:
-    """Transform a DataFrame to match the format returned by run_experiment().
+    """Transform a :class:`pandas.DataFrame` to match the format returned by run_experiment().
 
     Args:
-        experiment_runs: Input list of dictionaries or DataFrame containing experiment results
+        experiment_runs: Input list of dictionaries or :class:`pandas.DataFrame` containing experiment results
         task_fields: Field name mapping for task results
         evaluator_fields: Dictionary mapping evaluator names (str)
             to their field name mappings (EvaluationResultFieldNames)
 
     Returns:
-        DataFrame in the format matching run_experiment() output
+        :class:`pandas.DataFrame` in the format matching run_experiment() output
     """
     data = (
         experiment_runs
@@ -822,7 +892,7 @@
     evaluator_name: str,
     column_names: EvaluationResultFieldNames,
 ) -> None:
-    """Helper function to add evaluator columns to output DataFrame."""
+    """Helper function to add evaluator columns to output :class:`pandas.DataFrame`."""
     # Add score if specified
     if column_names.score and column_names.score in input_df.columns:
         output_df[f"eval.{evaluator_name}.score"] = input_df[column_names.score]
--- a/arize/experiments/tracing.py
+++ b/arize/experiments/tracing.py
@@ -57,7 +57,10 @@ _ACTIVE_MODIFIER: ContextVar[SpanModifier | None] = ContextVar(
 
 
 def override_span(
-    init: Callable[..., None], span: ReadableSpan, args: object, kwargs: object
+    init: Callable[..., None],
+    span: ReadableSpan,
+    args: tuple[object, ...],
+    kwargs: dict[str, object],
 ) -> None:
     """Override span initialization to apply active span modifiers.
 
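The tightened override_span annotations spell out the shapes a *args/**kwargs intercept actually produces. A hypothetical sketch of such an install site, not shown in this diff, assuming the wrapper is applied by monkey-patching ReadableSpan.__init__:

    original_init = ReadableSpan.__init__

    def patched_init(self: ReadableSpan, *args: object, **kwargs: object) -> None:
        # Positional args arrive as a tuple and keywords as a dict, matching
        # the new args/kwargs annotations on override_span.
        override_span(original_init, self, args, kwargs)

    ReadableSpan.__init__ = patched_init  # type: ignore[method-assign]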