PyPI - aiqa-client - Versions diffs - 0.5.2__py3-none-any.whl → 0.6.1__py3-none-any.whl - Mend

aiqa-client 0.5.2__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

aiqa/__init__.py +8 -2
aiqa/client.py +17 -2
aiqa/constants.py +1 -1
aiqa/experiment_runner.py +248 -77
aiqa/llm_as_judge.py +281 -0
aiqa/span_helpers.py +511 -0
aiqa/tracing.py +169 -561
aiqa/tracing_llm_utils.py +20 -9
aiqa/types.py +61 -0
{aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/METADATA +1 -1
aiqa_client-0.6.1.dist-info/RECORD +17 -0
{aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/WHEEL +1 -1
aiqa_client-0.5.2.dist-info/RECORD +0 -14
{aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/licenses/LICENSE.txt +0 -0
{aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/top_level.txt +0 -0

aiqa/__init__.py CHANGED Viewed

@@ -26,8 +26,8 @@ Example:
     result = my_function()
 """
-from .tracing import (
-    WithTracing,
+from .tracing import WithTracing
+from .span_helpers import (
     flush_tracing,
     set_span_attribute,
     set_span_name,
@@ -39,7 +39,10 @@ from .tracing import (
     extract_trace_context,
     set_conversation_id,
     set_component_tag,
+    set_token_usage,
+    set_provider_and_model,
     get_span,
+    submit_feedback,
 )
 from .client import get_aiqa_client
 from .experiment_runner import ExperimentRunner
@@ -60,7 +63,10 @@ __all__ = [
     "extract_trace_context",
     "set_conversation_id",
     "set_component_tag",
+    "set_token_usage",
+    "set_provider_and_model",
     "get_span",
+    "submit_feedback",
     "VERSION",
 ]

aiqa/client.py CHANGED Viewed

@@ -5,8 +5,10 @@ from functools import lru_cache
 from typing import Optional, TYPE_CHECKING, Any, Dict
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult, SpanExporter as SpanExporterBase
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.trace import SpanContext
 import requests
 from .constants import AIQA_TRACER_NAME, LOG_TAG
@@ -254,11 +256,24 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
         else:
             endpoint = f"{base_url}/v1/traces"
-        # Create OTLP exporter with authentication headers only
+        # Get timeout from environment variable (in seconds)
+        # Supports OTEL_EXPORTER_OTLP_TIMEOUT (standard) or AIQA_EXPORT_TIMEOUT (custom)
+        # Default is 30 seconds (more generous than OTLP default of 10s)
+        timeout = 30.0
+        otlp_timeout = os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT")
+        if otlp_timeout:
+            try:
+                timeout = float(otlp_timeout)
+            except ValueError:
+                logger.warning(f"Invalid OTEL_EXPORTER_OTLP_TIMEOUT value '{otlp_timeout}', using default 30.0")
+        # Create OTLP exporter with authentication headers and timeout
         # The exporter will set Content-Type and other headers automatically
         exporter = OTLPSpanExporter(
             endpoint=endpoint,
             headers=auth_headers if auth_headers else None,
+            timeout=timeout,
         )
         provider.add_span_processor(BatchSpanProcessor(exporter))

aiqa/constants.py CHANGED Viewed

@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """
 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.5.2" # automatically updated by set-version-json.sh
+VERSION = "0.6.1" # automatically updated by set-version-json.sh
 LOG_TAG = "AIQA" # Used in all logging output to identify AIQA messages

aiqa/experiment_runner.py CHANGED Viewed

@@ -4,10 +4,52 @@ ExperimentRunner - runs experiments on datasets and scores results
 import os
 import time
+import asyncio
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
+from .tracing import WithTracing
+from .span_helpers import set_span_attribute, flush_tracing
+from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
+from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
+# Type aliases for engine/scoring functions to improve code completion and clarity
+from typing import TypedDict
+# Function that processes input and parameters to produce an output (sync or async)
+CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
+# Function that scores a given output, using input, example, and parameters (usually async)
+# Returns a dictionary with score/message/etc.
+ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
+def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
+    """Tracing:Filter input - drop most, keep just ids"""
+    if not isinstance(input_data, dict):
+        return {}
+    self_obj = input_data.get("self")
+    if not self_obj:
+        return {}
+    return {
+        "dataset": getattr(self_obj, "dataset_id", None),
+        "experiment": getattr(self_obj, "experiment_id", None),
+    }
+def _filter_input_for_run_example(
+    self: "ExperimentRunner",
+    example: Dict[str, Any],
+    call_my_code: Any = None,
+    score_this_output: Any = None,
+) -> Dict[str, Any]:
+    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
+    result = _filter_input_for_run({"self": self})
+    if isinstance(example, dict):
+        result["example"] = example.get("id")
+    return result
 class ExperimentRunner:
@@ -24,6 +66,7 @@ class ExperimentRunner:
         server_url: Optional[str] = None,
         api_key: Optional[str] = None,
         organisation_id: Optional[str] = None,
+        llm_call_fn: Optional[CallLLMType] = None,
     ):
         """
         Initialize the ExperimentRunner.
@@ -33,7 +76,11 @@ class ExperimentRunner:
             experiment_id: Usually unset, and a fresh experiment is created with a random ID
             server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
-            organisation_id: Organisation ID for the experiment
+            organisation_id: Optional organisation ID for the experiment. If not provided, will be
+                            derived from the dataset when needed.
+            llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+                        raw content string (typically JSON). If not provided, will check for OPENAI_API_KEY
+                        or ANTHROPIC_API_KEY environment variables.
         """
         self.dataset_id = dataset_id
         self.experiment_id = experiment_id
@@ -42,6 +89,8 @@ class ExperimentRunner:
         self.organisation = organisation_id
         self.experiment: Optional[Dict[str, Any]] = None
         self.scores: List[Dict[str, Any]] = []
+        self.llm_call_fn = llm_call_fn
+        self._dataset_cache: Optional[Dict[str, Any]] = None
     def _get_headers(self) -> Dict[str, str]:
         """Build HTTP headers for API requests."""
@@ -54,6 +103,9 @@ class ExperimentRunner:
         Returns:
             The dataset object with metrics and other information
         """
+        if self._dataset_cache is not None:
+            return self._dataset_cache
         response = requests.get(
             f"{self.server_url}/dataset/{self.dataset_id}",
             headers=self._get_headers(),
@@ -62,7 +114,14 @@ class ExperimentRunner:
         if not response.ok:
             raise Exception(format_http_error(response, "fetch dataset"))
-        return response.json()
+        dataset = response.json()
+        self._dataset_cache = dataset
+        # If organisation_id wasn't set, derive it from the dataset
+        if not self.organisation and dataset.get("organisation"):
+            self.organisation = dataset.get("organisation")
+        return dataset
     def get_example_inputs(self, limit: int = 10000) -> List[Dict[str, Any]]:
         """
@@ -108,8 +167,13 @@ class ExperimentRunner:
         Returns:
             The created experiment object
         """
+        # Ensure we have the organisation ID - try to get it from the dataset if not set
+        if not self.organisation:
+            dataset = self.get_dataset()
+            self.organisation = dataset.get("organisation")
         if not self.organisation or not self.dataset_id:
-            raise Exception("Organisation and dataset ID are required to create an experiment")
+            raise Exception("Organisation and dataset ID are required to create an experiment. Organisation can be derived from the dataset or set via organisation_id parameter.")
         if not experiment_setup:
             experiment_setup = {}
@@ -138,19 +202,19 @@ class ExperimentRunner:
         self.experiment = experiment
         return experiment
-    def score_and_store(
+    async def score_and_store(
         self,
-        example: Dict[str, Any],
-        result: Any,
-        scores: Optional[Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
+        example: Example,
+        output: Any,
+        result: Result,
+    ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.
         Args:
             example: The example object
-            result: The output from running the engine on the example
-            scores: Optional pre-computed scores
+            output: The output from running the engine on the example
+            result: The result object for locally calculated scores
         Returns:
             The score result from the server
@@ -158,22 +222,34 @@ class ExperimentRunner:
         # Do we have an experiment ID? If not, we need to create the experiment first
         if not self.experiment_id:
             self.create_experiment()
-        if scores is None:
-            scores = {}
-        print(f"Scoring and storing example: {example['id']}")
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        if result is None:
+            example_id = example.get("id")
+            if not example_id:
+                raise ValueError("Example must have an 'id' field")
+            result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+        scores = result.get("scores") or {}
+        print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")
-        response = requests.post(
-            f"{self.server_url}/experiment/{self.experiment_id}/example/{example['id']}/scoreAndStore",
-            json={
-                "output": result,
-                "traceId": example.get("traceId"),
-                "scores": scores,
-            },
-            headers=self._get_headers(),
-        )
+        # Run synchronous requests.post in a thread pool to avoid blocking
+        def _do_request():
+            return requests.post(
+                f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
+                json={
+                    "output": result,
+                    "traceId": example.get("traceId"),
+                    "scores": scores,
+                },
+                headers=self._get_headers(),
+            )
+        response = await asyncio.to_thread(_do_request)
         if not response.ok:
             raise Exception(format_http_error(response, "score and store"))
@@ -182,12 +258,11 @@ class ExperimentRunner:
         print(f"scoreAndStore response: {json_result}")
         return json_result
+    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
-        engine: Callable[[Any], Union[Any, Awaitable[Any]]],
-        scorer: Optional[
-            Callable[[Any, Dict[str, Any]], Awaitable[Dict[str, Any]]]
-        ] = None,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
     ) -> None:
         """
         Run an engine function on all examples and score the results.
@@ -199,42 +274,43 @@ class ExperimentRunner:
         examples = self.get_example_inputs()
         # Wrap engine to match run_example signature (input, parameters)
-        def wrapped_engine(input_data, parameters):
-            return engine(input_data)
-        # Wrap scorer to match run_example signature (output, example, parameters)
-        async def wrapped_scorer(output, example, parameters):
-            if scorer:
-                return await scorer(output, example)
-            return {}
+        async def wrapped_engine(input_data, parameters):
+            result = call_my_code(input_data, parameters)
+            # Handle async functions
+            if hasattr(result, "__await__"):
+                result = await result
+            return result
         for example in examples:
-            scores = await self.run_example(example, wrapped_engine, wrapped_scorer)
-            if scores:
-                self.scores.append(
-                    {
-                        "example": example,
-                        "result": scores,
-                        "scores": scores,
-                    }
-                )
+            try:
+                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                if scores:
+                    self.scores.append(
+                        {
+                            "example": example,
+                            "result": scores,
+                            "scores": scores,
+                        }
+                    )
+            except Exception as e:
+                print(f"Error processing example {example.get('id', 'unknown')}: {e}")
+                # Continue with next example instead of failing entire run
+    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
-        example: Dict[str, Any],
-        call_my_code: Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]],
-        score_this_output: Optional[
-            Callable[[Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
-        ] = None,
-    ) -> List[Dict[str, Any]]:
+        example: Example,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
+    ) -> List[Result]:
         """
         Run the engine on an example with the given parameters (looping over comparison parameters),
         and score the result. Also calls scoreAndStore to store the result in the server.
         Args:
-            example: The example to run
+            example: The example to run. See Example.ts type
             call_my_code: Function that takes input and parameters, returns output (can be async)
-            score_this_output: Optional function that scores the output given the example and parameters
+            scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters
         Returns:
             One set of scores for each comparison parameter set. If no comparison parameters,
@@ -261,41 +337,87 @@ class ExperimentRunner:
             )
             # Run engine anyway -- this could make sense if it's all about the parameters
+        # Set example.id on the root span (created by @WithTracing decorator)
+        # This ensures the root span from the trace has example=Example.id set
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        set_span_attribute("example", example_id)
         all_scores: List[Dict[str, Any]] = []
+        dataset_metrics = self.get_dataset().get("metrics", [])
+        specific_metrics = example.get("metrics", [])
+        metrics = [*dataset_metrics, *specific_metrics]
         # This loop should not be parallelized - it should run sequentially, one after the other
         # to avoid creating interference between the runs.
         for parameters in parameters_loop:
             parameters_here = {**parameters_fixed, **parameters}
             print(f"Running with parameters: {parameters_here}")
+            # Save original env var values for cleanup
+            original_env_vars: Dict[str, Optional[str]] = {}
             # Set env vars from parameters_here
             for key, value in parameters_here.items():
                 if value:
+                    original_env_vars[key] = os.environ.get(key)
                     os.environ[key] = str(value)
-            start = time.time() * 1000  # milliseconds
-            output = call_my_code(input_data, parameters_here)
-            # Handle async functions
-            if hasattr(output, "__await__"):
-                import asyncio
-                output = await output
-            end = time.time() * 1000  # milliseconds
-            duration = int(end - start)
-            print(f"Output: {output}")
-            scores: Dict[str, Any] = {}
-            if score_this_output:
-                scores = await score_this_output(output, example, parameters_here)
-            scores["duration"] = duration
-            # TODO: this call as async and wait for all to complete before returning
-            print(f"Call scoreAndStore ... for example: {example['id']} with scores: {scores}")
-            result = self.score_and_store(example, output, scores)
-            print(f"scoreAndStore returned: {result}")
-            all_scores.append(result)
+            try:
+                start = time.time() * 1000  # milliseconds
+                output = call_my_code(input_data, parameters_here)
+                # Handle async functions
+                if hasattr(output, "__await__"):
+                    output = await output
+                end = time.time() * 1000  # milliseconds
+                duration = int(end - start)
+                print(f"Output: {output}")
+                # Score it
+                result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+                for metric in metrics:
+                    metric_id = metric.get("id")
+                    if not metric_id:
+                        print(f"Warning: Metric missing 'id' field, skipping: {metric}")
+                        continue
+                    scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                    if scorer:
+                        metric_result = await scorer(input_data, output, metric)
+                    elif metric.get("type") == "llm":
+                        metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                    else:
+                        metric_type = metric.get("type", "unknown")
+                        print(f"Skipping metric: {metric_id} {metric_type} - no scorer")
+                        continue
+                    # Handle None metric_result (e.g., if scoring failed)
+                    if not metric_result:
+                        print(f"Warning: Metric {metric_id} returned None result, skipping")
+                        result["errors"][metric_id] = "Scoring function returned None"
+                        continue
+                    result["scores"][metric_id] = metric_result.get("score")
+                    result["messages"][metric_id] = metric_result.get("message")
+                    result["errors"][metric_id] = metric_result.get("error")
+                # Always add duration to scores as a system metric
+                result["scores"]["duration"] = duration
+                # Flush spans before scoreAndStore to ensure they're indexed in ES
+                # This prevents race condition where scoreAndStore looks up spans before they're indexed
+                await flush_tracing()
+                print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+                result = await self.score_and_store(example, output, result)
+                print(f"scoreAndStore returned: {result}")
+                all_scores.append(result)
+            finally:
+                # Restore original env var values
+                for key, original_value in original_env_vars.items():
+                    if original_value is None:
+                        # Variable didn't exist before, remove it
+                        os.environ.pop(key, None)
+                    else:
+                        # Restore original value
+                        os.environ[key] = original_value
         return all_scores
@@ -306,6 +428,9 @@ class ExperimentRunner:
         Returns:
             Dictionary of metric names to summary statistics
         """
+        if not self.experiment_id:
+            raise ValueError("No experiment ID available. Create an experiment first.")
         response = requests.get(
             f"{self.server_url}/experiment/{self.experiment_id}",
             headers=self._get_headers(),
@@ -317,3 +442,49 @@ class ExperimentRunner:
         experiment2 = response.json()
         return experiment2.get("summary_results", {})
+    async def _score_llm_metric(
+        self,
+        input_data: Any,
+        output: Any,
+        example: Example,
+        metric: Metric,
+    ) -> MetricResult:
+        """
+        Score an LLM metric by fetching model API key from server if needed.
+        Args:
+            input_data: The input data to score
+            output: The output to score
+            example: The example object
+            metric: The metric definition
+        Returns:
+            MetricResult object with score:[0,1], message (optional), and error (optional)
+        """
+        # If model is specified, try to fetch API key from server
+        model_id = metric.get("model")
+        api_key = None
+        provider = metric.get("provider")
+        if model_id:
+            model_data = await get_model_from_server(
+                model_id, self.server_url, self._get_headers()
+            )
+            if model_data:
+                api_key = model_data.get("api_key")
+                # If provider not set in metric, try to get it from model
+                if not provider and model_data.get("provider"):
+                    provider = model_data.get("provider")
+        # Create a custom llm_call_fn if we have an API key from the model
+        llm_call_fn = self.llm_call_fn
+        if api_key and not llm_call_fn:
+            async def _model_llm_call(system_prompt: str, user_message: str) -> str:
+                return await call_llm_fallback(system_prompt, user_message, api_key, provider)
+            llm_call_fn = _model_llm_call
+        return await score_llm_metric_local(
+            input_data, output, example, metric, llm_call_fn
+        )

aiqa-client 0.5.2__py3-none-any.whl → 0.6.1__py3-none-any.whl

aiqa-client 0.5.2__py3-none-any.whl → 0.6.1__py3-none-any.whl