lmnr 0.4.53.dev0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (133)
  1. lmnr/__init__.py +32 -11
  2. lmnr/cli/__init__.py +270 -0
  3. lmnr/cli/datasets.py +371 -0
  4. lmnr/cli/evals.py +111 -0
  5. lmnr/cli/rules.py +42 -0
  6. lmnr/opentelemetry_lib/__init__.py +70 -0
  7. lmnr/opentelemetry_lib/decorators/__init__.py +337 -0
  8. lmnr/opentelemetry_lib/litellm/__init__.py +685 -0
  9. lmnr/opentelemetry_lib/litellm/utils.py +100 -0
  10. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
  11. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
  12. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
  13. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
  14. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
  15. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
  16. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
  17. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
  18. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
  19. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
  20. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
  21. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
  22. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
  23. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +599 -0
  24. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/config.py +9 -0
  25. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
  26. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +330 -0
  27. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
  28. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
  29. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
  30. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
  31. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
  32. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
  33. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
  34. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
  35. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
  36. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +121 -0
  37. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/utils.py +60 -0
  38. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
  39. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
  40. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
  41. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
  42. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
  43. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
  44. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
  45. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
  46. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
  47. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
  48. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
  49. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
  50. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
  51. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
  52. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
  53. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
  54. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
  55. lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
  56. lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +191 -0
  57. lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
  58. lmnr/opentelemetry_lib/tracing/__init__.py +263 -0
  59. lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +516 -0
  60. lmnr/{openllmetry_sdk → opentelemetry_lib}/tracing/attributes.py +21 -8
  61. lmnr/opentelemetry_lib/tracing/context.py +200 -0
  62. lmnr/opentelemetry_lib/tracing/exporter.py +153 -0
  63. lmnr/opentelemetry_lib/tracing/instruments.py +140 -0
  64. lmnr/opentelemetry_lib/tracing/processor.py +193 -0
  65. lmnr/opentelemetry_lib/tracing/span.py +398 -0
  66. lmnr/opentelemetry_lib/tracing/tracer.py +57 -0
  67. lmnr/opentelemetry_lib/tracing/utils.py +62 -0
  68. lmnr/opentelemetry_lib/utils/package_check.py +18 -0
  69. lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
  70. lmnr/sdk/browser/__init__.py +0 -0
  71. lmnr/sdk/browser/background_send_events.py +158 -0
  72. lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
  73. lmnr/sdk/browser/browser_use_otel.py +142 -0
  74. lmnr/sdk/browser/bubus_otel.py +71 -0
  75. lmnr/sdk/browser/cdp_utils.py +518 -0
  76. lmnr/sdk/browser/inject_script.js +514 -0
  77. lmnr/sdk/browser/patchright_otel.py +151 -0
  78. lmnr/sdk/browser/playwright_otel.py +322 -0
  79. lmnr/sdk/browser/pw_utils.py +363 -0
  80. lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
  81. lmnr/sdk/browser/utils.py +70 -0
  82. lmnr/sdk/client/asynchronous/async_client.py +180 -0
  83. lmnr/sdk/client/asynchronous/resources/__init__.py +6 -0
  84. lmnr/sdk/client/asynchronous/resources/base.py +32 -0
  85. lmnr/sdk/client/asynchronous/resources/browser_events.py +41 -0
  86. lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
  87. lmnr/sdk/client/asynchronous/resources/evals.py +266 -0
  88. lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
  89. lmnr/sdk/client/asynchronous/resources/tags.py +83 -0
  90. lmnr/sdk/client/synchronous/resources/__init__.py +6 -0
  91. lmnr/sdk/client/synchronous/resources/base.py +32 -0
  92. lmnr/sdk/client/synchronous/resources/browser_events.py +40 -0
  93. lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
  94. lmnr/sdk/client/synchronous/resources/evals.py +263 -0
  95. lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
  96. lmnr/sdk/client/synchronous/resources/tags.py +83 -0
  97. lmnr/sdk/client/synchronous/sync_client.py +191 -0
  98. lmnr/sdk/datasets/__init__.py +94 -0
  99. lmnr/sdk/datasets/file_utils.py +91 -0
  100. lmnr/sdk/decorators.py +163 -26
  101. lmnr/sdk/eval_control.py +3 -2
  102. lmnr/sdk/evaluations.py +403 -191
  103. lmnr/sdk/laminar.py +1080 -549
  104. lmnr/sdk/log.py +7 -2
  105. lmnr/sdk/types.py +246 -134
  106. lmnr/sdk/utils.py +151 -7
  107. lmnr/version.py +46 -0
  108. {lmnr-0.4.53.dev0.dist-info → lmnr-0.7.26.dist-info}/METADATA +152 -106
  109. lmnr-0.7.26.dist-info/RECORD +116 -0
  110. lmnr-0.7.26.dist-info/WHEEL +4 -0
  111. lmnr-0.7.26.dist-info/entry_points.txt +3 -0
  112. lmnr/cli.py +0 -101
  113. lmnr/openllmetry_sdk/.python-version +0 -1
  114. lmnr/openllmetry_sdk/__init__.py +0 -72
  115. lmnr/openllmetry_sdk/config/__init__.py +0 -9
  116. lmnr/openllmetry_sdk/decorators/base.py +0 -185
  117. lmnr/openllmetry_sdk/instruments.py +0 -38
  118. lmnr/openllmetry_sdk/tracing/__init__.py +0 -1
  119. lmnr/openllmetry_sdk/tracing/content_allow_list.py +0 -24
  120. lmnr/openllmetry_sdk/tracing/context_manager.py +0 -13
  121. lmnr/openllmetry_sdk/tracing/tracing.py +0 -884
  122. lmnr/openllmetry_sdk/utils/in_memory_span_exporter.py +0 -61
  123. lmnr/openllmetry_sdk/utils/package_check.py +0 -7
  124. lmnr/openllmetry_sdk/version.py +0 -1
  125. lmnr/sdk/datasets.py +0 -55
  126. lmnr-0.4.53.dev0.dist-info/LICENSE +0 -75
  127. lmnr-0.4.53.dev0.dist-info/RECORD +0 -33
  128. lmnr-0.4.53.dev0.dist-info/WHEEL +0 -4
  129. lmnr-0.4.53.dev0.dist-info/entry_points.txt +0 -3
  130. /lmnr/{openllmetry_sdk → opentelemetry_lib}/.flake8 +0 -0
  131. /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/__init__.py +0 -0
  132. /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/json_encoder.py +0 -0
  133. /lmnr/{openllmetry_sdk/decorators/__init__.py → py.typed} +0 -0
lmnr/sdk/evaluations.py CHANGED
@@ -1,44 +1,59 @@
  import asyncio
  import re
- import sys
  import uuid

+ from typing import Any
+ from typing_extensions import TypedDict
+
  from tqdm import tqdm
- from typing import Any, Awaitable, Optional, Set, Union

- from ..openllmetry_sdk.instruments import Instruments
- from ..openllmetry_sdk.tracing.attributes import SPAN_TYPE
+ from lmnr.opentelemetry_lib.tracing.instruments import Instruments
+ from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE

- from .datasets import EvaluationDataset
- from .eval_control import EVALUATION_INSTANCE, PREPARE_ONLY
- from .laminar import Laminar as L
- from .log import get_default_logger
- from .types import (
+ from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
+ from lmnr.sdk.client.synchronous.sync_client import LaminarClient
+ from lmnr.sdk.datasets import EvaluationDataset, LaminarDataset
+ from lmnr.sdk.eval_control import EVALUATION_INSTANCES, PREPARE_ONLY
+ from lmnr.sdk.laminar import Laminar as L
+ from lmnr.sdk.log import get_default_logger
+ from lmnr.sdk.types import (
      Datapoint,
+     EvaluationDatapointDatasetLink,
      EvaluationResultDatapoint,
      EvaluatorFunction,
      ExecutorFunction,
      HumanEvaluator,
      Numeric,
      NumericTypes,
+     PartialEvaluationDatapoint,
      SpanType,
      TraceType,
  )
- from .utils import is_async
+ from lmnr.sdk.utils import from_env, is_async, json_dumps

  DEFAULT_BATCH_SIZE = 5
+ MAX_EXPORT_BATCH_SIZE = 64
+
+
+ class EvaluationRunResult(TypedDict):
+     average_scores: dict[str, Numeric]
+     evaluation_id: uuid.UUID
+     project_id: uuid.UUID
+     url: str
+     error_message: str | None


  def get_evaluation_url(
-     project_id: str, evaluation_id: str, base_url: str = "https://www.lmnr.ai"
+     project_id: str, evaluation_id: str, base_url: str | None = None
  ):
+     if not base_url or base_url == "https://api.lmnr.ai":
+         base_url = "https://www.lmnr.ai"
+
      url = base_url
-     if url.endswith("/"):
-         url = url[:-1]
+     url = re.sub(r"\/$", "", url)
      if url.endswith("localhost") or url.endswith("127.0.0.1"):
-         # We best effort assume that the frontend is running on port 3000
-         # TODO: expose the frontend port?
-         url = url + ":3000"
+         # We best effort assume that the frontend is running on port 5667
+         url = url + ":5667"
      return f"{url}/project/{project_id}/evaluations/{evaluation_id}"


@@ -52,13 +67,17 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu

      average_scores = {}
      for key, values in per_score_values.items():
-         average_scores[key] = sum(values) / len(values)
+         scores = [v for v in values if v is not None]
+
+         # If there are no scores, we don't want to include the key in the average scores
+         if len(scores) > 0:
+             average_scores[key] = sum(scores) / len(scores)

      return average_scores


  class EvaluationReporter:
-     def __init__(self, base_url: str = "https://www.lmnr.ai"):
+     def __init__(self, base_url):
          self.base_url = base_url

      def start(self, length: int):
@@ -71,89 +90,107 @@ class EvaluationReporter:
      def update(self, batch_length: int):
          self.cli_progress.update(batch_length)

-     def stopWithError(self, error: Exception):
-         self.cli_progress.close()
-         sys.stderr.write(f"\nError: {error}\n")
+     def stop_with_error(self, error: Exception):
+         if hasattr(self, "cli_progress"):
+             self.cli_progress.close()
+         raise error

      def stop(
          self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
      ):
          self.cli_progress.close()
-         print(
-             f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
-         )
          print("Average scores:")
          for name, score in average_scores.items():
              print(f"{name}: {score}")
-         print("\n")
+         print(
+             f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
+         )


  class Evaluation:
      def __init__(
          self,
-         data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+         data: EvaluationDataset | list[Datapoint | dict],
          executor: Any,
-         evaluators: dict[str, EvaluatorFunction],
-         human_evaluators: list[HumanEvaluator] = [],
-         name: Optional[str] = None,
-         group_id: Optional[str] = None,
-         batch_size: int = DEFAULT_BATCH_SIZE,
-         project_api_key: Optional[str] = None,
-         base_url: Optional[str] = None,
-         http_port: Optional[int] = None,
-         grpc_port: Optional[int] = None,
-         instruments: Optional[Set[Instruments]] = None,
+         evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+         name: str | None = None,
+         group_name: str | None = None,
+         metadata: dict[str, Any] | None = None,
+         concurrency_limit: int = DEFAULT_BATCH_SIZE,
+         project_api_key: str | None = None,
+         base_url: str | None = None,
+         base_http_url: str | None = None,
+         http_port: int | None = None,
+         grpc_port: int | None = None,
+         instruments: (
+             set[Instruments] | list[Instruments] | tuple[Instruments] | None
+         ) = None,
+         disabled_instruments: (
+             set[Instruments] | list[Instruments] | tuple[Instruments] | None
+         ) = None,
+         max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+         trace_export_timeout_seconds: int | None = None,
      ):
          """
-         Initializes an instance of the Evaluations class.
+         Initializes an instance of the Evaluation class.

          Parameters:
-             data (Union[List[EvaluationDatapoint|dict], EvaluationDataset]):\
+             data (list[Datapoint|dict] | EvaluationDataset):\
                  List of data points to evaluate or an evaluation dataset.
-                 `data` is the input to the executor function,
-                 `target` is the input to the evaluator function.
+                 `data` is the input to the executor function.
+                 `target` is the input to the evaluator function.
+                 `metadata` is optional metadata to associate with the\
+                 datapoint.
              executor (Callable[..., Any]): The executor function.\
-                 Takes the data point + any additional arguments\
-                 and returns the output to evaluate.
-             evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
-                 names. Each evaluator function takes the output of the executor\
-                 _and_ the target data, and returns a score. The score can be a\
-                 single number or a dict of string keys and number values.\
-                 If the score is a single number, it will be named after the\
-                 evaluator function. Evaluator function names must contain only\
-                 letters, digits, hyphens, underscores, or spaces.
-             human_evaluators (list[HumanEvaluator], optional):\
-                 [Beta] List of instances of HumanEvaluator. For now, human\
-                 evaluator only holds the queue name.
-                 Defaults to an empty list.
-             name (Optional[str], optional): Optional name of the evaluation.\
+                 Takes the data point + any additional arguments and returns\
+                 the output to evaluate.
+             evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+                 functions and HumanEvaluator instances with names. Each evaluator\
+                 function takes the output of the executor _and_ the target data,\
+                 and returns a score. The score can be a single number or a dict\
+                 of string keys and number values. If the score is a single number,\
+                 it will be named after the evaluator function.\
+                 HumanEvaluator instances create empty spans for manual evaluation.\
+                 Evaluator names must contain only letters, digits, hyphens,\
+                 underscores, or spaces.
+             name (str | None, optional): Optional name of the evaluation.\
                  Used to identify the evaluation in the group.\
                  If not provided, a random name will be generated.
                  Defaults to None.
-             group_id (Optional[str], optional): an identifier to group\
-                 evaluations. Only evaluations within the same group_id can be\
+             group_name (str | None, optional): an identifier to group\
+                 evaluations. Only evaluations within the same group_name can be\
                  visually compared. If not provided, "default" is assigned.
                  Defaults to None
-             batch_size (int, optional): The batch size for evaluation. This many\
-                 data points will be evaluated in parallel.
+             metadata (dict[str, Any] | None): optional metadata to associate with\
+             concurrency_limit (int, optional): The concurrency limit for\
+                 evaluation. This many data points will be evaluated in parallel\
+                 with a pool of workers.
                  Defaults to DEFAULT_BATCH_SIZE.
-             project_api_key (Optional[str], optional): The project API key.\
+             project_api_key (str | None, optional): The project API key.\
                  If not provided, LMNR_PROJECT_API_KEY environment variable is\
                  used.
                  Defaults to an empty string.
-             base_url (Optional[str], optional): The base URL for Laminar API.\
+             base_url (str | None, optional): The base URL for Laminar API.\
                  Useful if self-hosted. Do NOT include the port, use `http_port`\
                  and `grpc_port` instead.
                  Defaults to "https://api.lmnr.ai".
-             http_port (Optional[int], optional): The port for Laminar API\
+             base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+                 Only set this if your Laminar backend HTTP is proxied\
+                 through a different host. If not specified, defaults\
+                 to https://api.lmnr.ai.
+             http_port (int | None, optional): The port for Laminar API\
                  HTTP service. Defaults to 443 if not specified.
-             grpc_port (Optional[int], optional): The port for Laminar API\
+             grpc_port (int | None, optional): The port for Laminar API\
                  gRPC service. Defaults to 8443 if not specified.
-             instruments (Optional[Set[Instruments]], optional): Set of modules\
+             instruments (set[Instruments] | None, optional): Set of modules\
                  to auto-instrument. If None, all available instruments will be\
                  used.
                  See https://docs.lmnr.ai/tracing/automatic-instrumentation
                  Defaults to None.
+             disabled_instruments (set[Instruments] | None, optional): Set of modules\
+                 to disable auto-instrumentations. If None, only modules passed\
+                 as `instruments` will be disabled.
+                 Defaults to None.
          """

          if not evaluators:
@@ -168,7 +205,8 @@ class Evaluation:
                      "underscores, or spaces."
                  )

-         self.is_finished = False
+         base_url = base_url or from_env("LMNR_BASE_URL") or "https://api.lmnr.ai"
+
          self.reporter = EvaluationReporter(base_url)
          if isinstance(data, list):
              self.data = [
@@ -177,212 +215,386 @@ class Evaluation:
              ]
          else:
              self.data = data
+         if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
+             raise ValueError("No data provided. Skipping evaluation")
          self.executor = executor
          self.evaluators = evaluators
-         self.group_id = group_id
+         self.group_name = group_name
          self.name = name
-         self.batch_size = batch_size
+         self.metadata = metadata
+         self.concurrency_limit = concurrency_limit
+         self.batch_size = concurrency_limit
          self._logger = get_default_logger(self.__class__.__name__)
-         self.human_evaluators = human_evaluators
+         self.upload_tasks = []
+         self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
+
+         api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
+         if not api_key and not L.is_initialized():
+             raise ValueError(
+                 "Please pass the project API key to `evaluate`"
+                 " or set the LMNR_PROJECT_API_KEY environment variable"
+                 " in your environment or .env file"
+             )
+         self.project_api_key = api_key
+
+         if L.is_initialized():
+             self.client = AsyncLaminarClient(
+                 base_url=L.get_base_http_url(),
+                 project_api_key=L.get_project_api_key(),
+             )
+         else:
+             self.client = AsyncLaminarClient(
+                 base_url=self.base_http_url,
+                 project_api_key=self.project_api_key,
+             )
+
          L.initialize(
              project_api_key=project_api_key,
              base_url=base_url,
+             base_http_url=self.base_http_url,
              http_port=http_port,
              grpc_port=grpc_port,
              instruments=instruments,
+             disabled_instruments=disabled_instruments,
+             max_export_batch_size=max_export_batch_size,
+             export_timeout_seconds=trace_export_timeout_seconds,
          )

-     async def run(self) -> Awaitable[None]:
-         if self.is_finished:
-             raise Exception("Evaluation is already finished")
+     async def run(self) -> EvaluationRunResult:
          return await self._run()

-     async def _run(self) -> None:
-         self.reporter.start(len(self.data))
+     async def _run(self) -> EvaluationRunResult:
+         if isinstance(self.data, LaminarDataset):
+             self.data.set_client(
+                 LaminarClient(
+                     base_url=self.base_http_url,
+                     project_api_key=self.project_api_key,
+                 )
+             )
+             if not self.data.id:
+                 try:
+                     datasets = await self.client.datasets.get_dataset_by_name(
+                         self.data.name
+                     )
+                     if len(datasets) == 0:
+                         self._logger.warning(f"Dataset {self.data.name} not found")
+                     else:
+                         self.data.id = datasets[0].id
+                 except Exception as e:
+                     # Backward compatibility with old Laminar API (self hosted)
+                     self._logger.warning(f"Error getting dataset {self.data.name}: {e}")

          try:
-             result_datapoints = await self._evaluate_in_batches()
+             evaluation = await self.client.evals.init(
+                 name=self.name, group_name=self.group_name, metadata=self.metadata
+             )
+             evaluation_id = evaluation.id
+             project_id = evaluation.projectId
+             url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
+
+             print(f"Check the results at {url}")
+
+             self.reporter.start(len(self.data))
+             result_datapoints = await self._evaluate_in_batches(evaluation.id)
+             # Wait for all background upload tasks to complete
+             if self.upload_tasks:
+                 self._logger.debug(
+                     f"Waiting for {len(self.upload_tasks)} upload tasks to complete"
+                 )
+                 await asyncio.gather(*self.upload_tasks)
+                 self._logger.debug("All upload tasks completed")
          except Exception as e:
-             self.reporter.stopWithError(e)
-             self.is_finished = True
-             return
-
-         # For now add all human evaluators to all result datapoints
-         # In the future, we will add ways to specify which human evaluators
-         # to add to which result datapoints, e.g. sample some randomly
-         for result_datapoint in result_datapoints:
-             result_datapoint.human_evaluators = self.human_evaluators or {}
-
-         evaluation = await L.create_evaluation(
-             data=result_datapoints, group_id=self.group_id, name=self.name
-         )
+             await self._shutdown()
+             self.reporter.stop_with_error(e)
+
          average_scores = get_average_scores(result_datapoints)
          self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
-         self.is_finished = True
-
-     async def _evaluate_in_batches(self) -> list[EvaluationResultDatapoint]:
-         result_datapoints = []
-         for i in range(0, len(self.data), self.batch_size):
-             batch = (
-                 self.data[i : i + self.batch_size]
-                 if isinstance(self.data, list)
-                 else self.data.slice(i, i + self.batch_size)
-             )
-             batch_datapoints = await self._evaluate_batch(batch)
-             result_datapoints.extend(batch_datapoints)
-             self.reporter.update(len(batch))
-         return result_datapoints
-
-     async def _evaluate_batch(
-         self, batch: list[Datapoint]
+         await self._shutdown()
+         return {
+             "average_scores": average_scores,
+             "evaluation_id": evaluation_id,
+             "project_id": project_id,
+             "url": url,
+             "error_message": None,
+         }
+
+     async def _shutdown(self):
+         # We use flush() instead of shutdown() because multiple evaluations
+         # can be run sequentially in the same process. `shutdown()` would
+         # close the OTLP exporter and we wouldn't be able to export traces in
+         # the next evaluation.
+         L.flush()
+         await self.client.close()
+         if isinstance(self.data, LaminarDataset) and self.data.client:
+             self.data.client.close()
+
+     async def _evaluate_in_batches(
+         self, eval_id: uuid.UUID
      ) -> list[EvaluationResultDatapoint]:
-         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
-         results = await asyncio.gather(*batch_promises)
-         return results
+
+         semaphore = asyncio.Semaphore(self.concurrency_limit)
+         tasks = []
+         data_iter = self.data if isinstance(self.data, list) else range(len(self.data))
+
+         async def evaluate_task(datapoint, index):
+             try:
+                 result = await self._evaluate_datapoint(eval_id, datapoint, index)
+                 self.reporter.update(1)
+                 return index, result
+             finally:
+                 semaphore.release()
+
+         # Create tasks only after acquiring semaphore
+         for idx, item in enumerate(data_iter):
+             await semaphore.acquire()
+             datapoint = item if isinstance(self.data, list) else self.data[item]
+             task = asyncio.create_task(evaluate_task(datapoint, idx))
+             tasks.append(task)
+
+         # Wait for all tasks to complete and preserve order
+         results = await asyncio.gather(*tasks)
+         ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
+
+         return ordered_results

      async def _evaluate_datapoint(
-         self, datapoint: Datapoint
+         self, eval_id: uuid.UUID, datapoint: Datapoint, index: int
      ) -> EvaluationResultDatapoint:
+         evaluation_id = uuid.uuid4()
          with L.start_as_current_span("evaluation") as evaluation_span:
              L._set_trace_type(trace_type=TraceType.EVALUATION)
              evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
              with L.start_as_current_span(
                  "executor", input={"data": datapoint.data}
              ) as executor_span:
-                 executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
-                 output = (
-                     await self.executor(datapoint.data)
-                     if is_async(self.executor)
-                     else self.executor(datapoint.data)
-                 )
-                 L.set_span_output(output)
                  executor_span_id = uuid.UUID(
                      int=executor_span.get_span_context().span_id
                  )
+                 trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
+
+                 partial_datapoint = PartialEvaluationDatapoint(
+                     id=evaluation_id,
+                     data=datapoint.data,
+                     target=datapoint.target,
+                     index=index,
+                     trace_id=trace_id,
+                     executor_span_id=executor_span_id,
+                     metadata=datapoint.metadata,
+                 )
+                 if isinstance(self.data, LaminarDataset):
+                     partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                         dataset_id=self.data.id,
+                         datapoint_id=datapoint.id,
+                         created_at=datapoint.created_at,
+                     )
+                 # First, create datapoint with trace_id so that we can show the dp in the UI
+                 await self.client.evals.save_datapoints(
+                     eval_id, [partial_datapoint], self.group_name
+                 )
+                 executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                 # Run synchronous executors in a thread pool to avoid blocking
+                 if not is_async(self.executor):
+                     loop = asyncio.get_event_loop()
+                     output = await loop.run_in_executor(
+                         None, self.executor, datapoint.data
+                     )
+                 else:
+                     output = await self.executor(datapoint.data)
+
+                 L.set_span_output(output)
              target = datapoint.target

              # Iterate over evaluators
              scores: dict[str, Numeric] = {}
              for evaluator_name, evaluator in self.evaluators.items():
-                 with L.start_as_current_span(
-                     evaluator_name, input={"output": output, "target": target}
-                 ) as evaluator_span:
-                     evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
-                     value = (
-                         await evaluator(output, target)
-                         if is_async(evaluator)
-                         else evaluator(output, target)
-                     )
-                     L.set_span_output(value)
-
-                 # If evaluator returns a single number, use evaluator name as key
-                 if isinstance(value, NumericTypes):
-                     scores[evaluator_name] = value
+                 # Check if evaluator is a HumanEvaluator instance
+                 if isinstance(evaluator, HumanEvaluator):
+                     # Create an empty span for human evaluators
+                     with L.start_as_current_span(
+                         evaluator_name, input={"output": output, "target": target}
+                     ) as human_evaluator_span:
+                         human_evaluator_span.set_attribute(
+                             SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
+                         )
+                         if evaluator.options:
+                             human_evaluator_span.set_attribute(
+                                 HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
+                             )
+                         # Human evaluators don't execute automatically, just create the span
+                         L.set_span_output(None)
+
+                     # We don't want to save the score for human evaluators
+                     scores[evaluator_name] = None
                  else:
-                     scores.update(value)
+                     # Regular evaluator function
+                     with L.start_as_current_span(
+                         evaluator_name, input={"output": output, "target": target}
+                     ) as evaluator_span:
+                         evaluator_span.set_attribute(
+                             SPAN_TYPE, SpanType.EVALUATOR.value
+                         )
+                         if is_async(evaluator):
+                             value = await evaluator(output, target)
+                         else:
+                             loop = asyncio.get_event_loop()
+                             value = await loop.run_in_executor(
+                                 None, evaluator, output, target
+                             )
+                         L.set_span_output(value)
+
+                     # If evaluator returns a single number, use evaluator name as key
+                     if isinstance(value, NumericTypes):
+                         scores[evaluator_name] = value
+                     else:
+                         scores.update(value)

              trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
-             return EvaluationResultDatapoint(
-                 data=datapoint.data,
-                 target=target,
-                 executor_output=output,
-                 scores=scores,
-                 trace_id=trace_id,
-                 executor_span_id=executor_span_id,
+
+             eval_datapoint = EvaluationResultDatapoint(
+                 id=evaluation_id,
+                 data=datapoint.data,
+                 target=target,
+                 executor_output=output,
+                 scores=scores,
+                 trace_id=trace_id,
+                 executor_span_id=executor_span_id,
+                 index=index,
+                 metadata=datapoint.metadata,
+             )
+             if isinstance(self.data, LaminarDataset):
+                 eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                     dataset_id=self.data.id,
+                     datapoint_id=datapoint.id,
+                     created_at=datapoint.created_at,
+                 )
+
+             # Create background upload task without awaiting it
+             upload_task = asyncio.create_task(
+                 self.client.evals.save_datapoints(
+                     eval_id, [eval_datapoint], self.group_name
                  )
+             )
+             self.upload_tasks.append(upload_task)
+
+             return eval_datapoint


  def evaluate(
-     data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+     data: EvaluationDataset | list[Datapoint | dict],
      executor: ExecutorFunction,
-     evaluators: dict[str, EvaluatorFunction],
-     human_evaluators: list[HumanEvaluator] = [],
-     name: Optional[str] = None,
-     group_id: Optional[str] = None,
-     batch_size: int = DEFAULT_BATCH_SIZE,
-     project_api_key: Optional[str] = None,
-     base_url: Optional[str] = None,
-     http_port: Optional[int] = None,
-     grpc_port: Optional[int] = None,
-     instruments: Optional[Set[Instruments]] = None,
- ) -> Optional[Awaitable[None]]:
+     evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+     name: str | None = None,
+     group_name: str | None = None,
+     metadata: dict[str, Any] | None = None,
+     concurrency_limit: int = DEFAULT_BATCH_SIZE,
+     project_api_key: str | None = None,
+     base_url: str | None = None,
+     base_http_url: str | None = None,
+     http_port: int | None = None,
+     grpc_port: int | None = None,
+     instruments: (
+         set[Instruments] | list[Instruments] | tuple[Instruments] | None
+     ) = None,
+     disabled_instruments: (
+         set[Instruments] | list[Instruments] | tuple[Instruments] | None
+     ) = None,
+     max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+     trace_export_timeout_seconds: int | None = None,
+ ) -> EvaluationRunResult | None:
      """
      If added to the file which is called through `lmnr eval` command, then
      registers the evaluation; otherwise, runs the evaluation.

      If there is no event loop, creates it and runs the evaluation until
      completion.
-     If there is an event loop, schedules the evaluation as a task in the
-     event loop and returns an awaitable handle.
+     If there is an event loop, returns an awaitable handle immediately. IMPORTANT:
+     You must await the call to `evaluate`.

      Parameters:
-         data (Union[list[EvaluationDatapoint|dict]], EvaluationDataset]):\
-             List of data points to evaluate or an evaluation dataset.
-             `data` is the input to the executor function,
-             `target` is the input to the evaluator function.
+         data (list[EvaluationDatapoint|dict] | EvaluationDataset):\
+             List of data points to evaluate or an evaluation dataset.
+             `data` is the input to the executor function,
+             `target` is the input to the evaluator function.
          executor (Callable[..., Any]): The executor function.\
-             Takes the data point + any additional arguments\
-             and returns the output to evaluate.
-         evaluators (List[Callable[..., Any]]):
-         evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
-             names. Each evaluator function takes the output of the executor\
-             _and_ the target data, and returns a score. The score can be a\
-             single number or a dict of string keys and number values.\
-             If the score is a single number, it will be named after the\
-             evaluator function. Evaluator function names must contain only\
-             letters, digits, hyphens, underscores, or spaces.
-         human_evaluators (list[HumanEvaluator], optional):\
-             [Beta] List of instances of HumanEvaluator. For now, human\
-             evaluator only holds the queue name.
-             Defaults to an empty list.
-         name (Optional[str], optional): Optional name of the evaluation.\
-             Used to identify the evaluation in the group.\
-             If not provided, a random name will be generated.
-             Defaults to None.
-         group_id (Optional[str], optional): an identifier to group evaluations.\
-             Only evaluations within the same group_id can be\
-             visually compared. If not provided, set to "default".
-             Defaults to None
-         batch_size (int, optional): The batch size for evaluation.
+             Takes the data point + any additional arguments\
+             and returns the output to evaluate.
+         evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+             functions and HumanEvaluator instances with names. Each evaluator\
+             function takes the output of the executor _and_ the target data,\
+             and returns a score. The score can be a single number or a dict\
+             of string keys and number values. If the score is a single number,\
+             it will be named after the evaluator function.\
+             HumanEvaluator instances create empty spans for manual evaluation.\
+             Evaluator function names must contain only letters, digits, hyphens,\
+             underscores, or spaces.
+         name (str | None, optional): Optional name of the evaluation.\
+             Used to identify the evaluation in the group. If not provided, a\
+             random name will be generated.
+             Defaults to None.
+         group_name (str | None, optional): An identifier to group evaluations.\
+             Only evaluations within the same group_name can be visually compared.\
+             If not provided, set to "default".
+             Defaults to None
+         metadata (dict[str, Any] | None, optional): Optional metadata to associate with\
+         concurrency_limit (int, optional): The concurrency limit for evaluation.
              Defaults to DEFAULT_BATCH_SIZE.
-         project_api_key (Optional[str], optional): The project API key.
+         project_api_key (str | None, optional): The project API key.
              Defaults to None.
-         base_url (Optional[str], optional): The base URL for Laminar API.\
+         base_url (str | None, optional): The base URL for Laminar API.\
             Useful if self-hosted elsewhere. Do NOT include the\
             port, use `http_port` and `grpc_port` instead.
             Defaults to "https://api.lmnr.ai".
-         http_port (Optional[int], optional): The port for Laminar API's HTTP\
+         base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+             Only set this if your Laminar backend HTTP is proxied\
+             through a different host. If not specified, defaults\
+             to https://api.lmnr.ai.
+         http_port (int | None, optional): The port for Laminar API's HTTP\
             service. 443 is used if not specified.
             Defaults to None.
-         grpc_port (Optional[int], optional): The port for Laminar API's gRPC\
+         grpc_port (int | None, optional): The port for Laminar API's gRPC\
             service. 8443 is used if not specified.
             Defaults to None.
-         instruments (Optional[Set[Instruments]], optional): Set of modules to\
+         instruments (set[Instruments] | None, optional): Set of modules to\
             auto-instrument. If None, all available instruments\
             will be used.
             Defaults to None.
+         disabled_instruments (set[Instruments] | None, optional): Set of modules\
+             to disable auto-instrumentations. If None, no\
+             If None, only modules passed as `instruments` will be disabled.
+             Defaults to None.
+         trace_export_timeout_seconds (int | None, optional): The timeout for\
+             trace export on OpenTelemetry exporter. Defaults to None.
      """
-
      evaluation = Evaluation(
          data=data,
          executor=executor,
          evaluators=evaluators,
-         group_id=group_id,
-         human_evaluators=human_evaluators,
+         group_name=group_name,
+         metadata=metadata,
          name=name,
-         batch_size=batch_size,
+         concurrency_limit=concurrency_limit,
          project_api_key=project_api_key,
          base_url=base_url,
+         base_http_url=base_http_url,
          http_port=http_port,
          grpc_port=grpc_port,
          instruments=instruments,
+         disabled_instruments=disabled_instruments,
+         max_export_batch_size=max_export_batch_size,
+         trace_export_timeout_seconds=trace_export_timeout_seconds,
      )

      if PREPARE_ONLY.get():
-         EVALUATION_INSTANCE.set(evaluation)
+         existing_evaluations = EVALUATION_INSTANCES.get([])
+         new_evaluations = (existing_evaluations or []) + [evaluation]
+         EVALUATION_INSTANCES.set(new_evaluations)
+         return None
      else:
-         loop = asyncio.get_event_loop()
-         if loop.is_running():
-             return loop.run_until_complete(evaluation.run())
-         else:
+         try:
+             loop = asyncio.get_event_loop()
+             if loop.is_running():
+                 return evaluation.run()
+             else:
+                 return asyncio.run(evaluation.run())
+         except RuntimeError:
              return asyncio.run(evaluation.run())
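
For orientation, here is a minimal sketch of how the reworked `evaluate()` entry point might be called after upgrading to 0.7.x. The parameter names (`group_name`, `concurrency_limit`, `metadata`), the `EvaluationRunResult` return type, and the executor/evaluator contract are taken from the diff above; the root-level `lmnr` import, the datapoint contents, and the executor/evaluator bodies are illustrative assumptions rather than part of this diff.

# Minimal sketch against the 0.7.x signature shown above; the import path and
# the executor/evaluator bodies are assumptions, not taken from this diff.
# Requires LMNR_PROJECT_API_KEY in the environment (or pass project_api_key=...).
from lmnr import evaluate  # assumed re-export from the package root


def my_executor(data: dict) -> str:
    # Executor: receives the datapoint's `data` and returns the output to evaluate.
    return data["question"].strip().lower()  # placeholder for a real agent/model call


def exact_match(output: str, target: dict) -> int:
    # Evaluator: receives the executor output and the datapoint's `target`,
    # and returns a single number (or a dict of named scores).
    return int(output == target["expected"])


result = evaluate(
    data=[
        {"data": {"question": " Hello "}, "target": {"expected": "hello"}},
    ],
    executor=my_executor,
    evaluators={"exact_match": exact_match},
    name="smoke-test",
    group_name="regression",        # replaces the old `group_id`
    concurrency_limit=5,            # replaces the old `batch_size`
    metadata={"commit": "abc123"},  # new in 0.7.x
)

# Outside a running event loop, `evaluate()` runs to completion and returns an
# EvaluationRunResult TypedDict; inside async code you must `await evaluate(...)`.
if result is not None:
    print(result["average_scores"], result["url"])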