lmnr 0.6.16__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. lmnr/__init__.py +6 -15
  2. lmnr/cli/__init__.py +270 -0
  3. lmnr/cli/datasets.py +371 -0
  4. lmnr/{cli.py → cli/evals.py} +20 -102
  5. lmnr/cli/rules.py +42 -0
  6. lmnr/opentelemetry_lib/__init__.py +9 -2
  7. lmnr/opentelemetry_lib/decorators/__init__.py +274 -168
  8. lmnr/opentelemetry_lib/litellm/__init__.py +352 -38
  9. lmnr/opentelemetry_lib/litellm/utils.py +82 -0
  10. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
  11. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
  12. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
  13. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
  14. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
  15. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
  16. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
  17. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
  18. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
  19. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
  20. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
  21. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
  22. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
  23. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +191 -129
  24. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
  25. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +126 -41
  26. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
  27. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
  28. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
  29. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
  30. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
  31. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
  32. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
  33. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
  34. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
  35. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +16 -16
  36. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
  37. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
  38. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
  39. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
  40. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
  41. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
  42. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
  43. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
  44. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
  45. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
  46. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
  47. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
  48. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
  49. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
  50. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
  51. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
  52. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
  53. lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
  54. lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +59 -61
  55. lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
  56. lmnr/opentelemetry_lib/tracing/__init__.py +119 -18
  57. lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +124 -25
  58. lmnr/opentelemetry_lib/tracing/attributes.py +4 -0
  59. lmnr/opentelemetry_lib/tracing/context.py +200 -0
  60. lmnr/opentelemetry_lib/tracing/exporter.py +109 -15
  61. lmnr/opentelemetry_lib/tracing/instruments.py +22 -5
  62. lmnr/opentelemetry_lib/tracing/processor.py +128 -30
  63. lmnr/opentelemetry_lib/tracing/span.py +398 -0
  64. lmnr/opentelemetry_lib/tracing/tracer.py +40 -1
  65. lmnr/opentelemetry_lib/tracing/utils.py +62 -0
  66. lmnr/opentelemetry_lib/utils/package_check.py +9 -0
  67. lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
  68. lmnr/sdk/browser/background_send_events.py +158 -0
  69. lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
  70. lmnr/sdk/browser/browser_use_otel.py +12 -12
  71. lmnr/sdk/browser/bubus_otel.py +71 -0
  72. lmnr/sdk/browser/cdp_utils.py +518 -0
  73. lmnr/sdk/browser/inject_script.js +514 -0
  74. lmnr/sdk/browser/patchright_otel.py +18 -44
  75. lmnr/sdk/browser/playwright_otel.py +104 -187
  76. lmnr/sdk/browser/pw_utils.py +249 -210
  77. lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
  78. lmnr/sdk/browser/utils.py +1 -1
  79. lmnr/sdk/client/asynchronous/async_client.py +47 -15
  80. lmnr/sdk/client/asynchronous/resources/__init__.py +2 -7
  81. lmnr/sdk/client/asynchronous/resources/browser_events.py +1 -0
  82. lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
  83. lmnr/sdk/client/asynchronous/resources/evals.py +122 -18
  84. lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
  85. lmnr/sdk/client/asynchronous/resources/tags.py +4 -10
  86. lmnr/sdk/client/synchronous/resources/__init__.py +2 -2
  87. lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
  88. lmnr/sdk/client/synchronous/resources/evals.py +83 -17
  89. lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
  90. lmnr/sdk/client/synchronous/resources/tags.py +4 -10
  91. lmnr/sdk/client/synchronous/sync_client.py +47 -15
  92. lmnr/sdk/datasets/__init__.py +94 -0
  93. lmnr/sdk/datasets/file_utils.py +91 -0
  94. lmnr/sdk/decorators.py +103 -23
  95. lmnr/sdk/evaluations.py +122 -33
  96. lmnr/sdk/laminar.py +816 -333
  97. lmnr/sdk/log.py +7 -2
  98. lmnr/sdk/types.py +124 -143
  99. lmnr/sdk/utils.py +115 -2
  100. lmnr/version.py +1 -1
  101. {lmnr-0.6.16.dist-info → lmnr-0.7.26.dist-info}/METADATA +71 -78
  102. lmnr-0.7.26.dist-info/RECORD +116 -0
  103. lmnr-0.7.26.dist-info/WHEEL +4 -0
  104. lmnr-0.7.26.dist-info/entry_points.txt +3 -0
  105. lmnr/opentelemetry_lib/tracing/context_properties.py +0 -65
  106. lmnr/sdk/browser/rrweb/rrweb.umd.min.cjs +0 -98
  107. lmnr/sdk/client/asynchronous/resources/agent.py +0 -329
  108. lmnr/sdk/client/synchronous/resources/agent.py +0 -323
  109. lmnr/sdk/datasets.py +0 -60
  110. lmnr-0.6.16.dist-info/LICENSE +0 -75
  111. lmnr-0.6.16.dist-info/RECORD +0 -61
  112. lmnr-0.6.16.dist-info/WHEEL +0 -4
  113. lmnr-0.6.16.dist-info/entry_points.txt +0 -3
@@ -0,0 +1,91 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+ import csv
4
+ import orjson
5
+
6
+ from lmnr.sdk.log import get_default_logger
7
+
8
+ LOG = get_default_logger(__name__, verbose=False)
9
+
10
+
11
def _is_supported_file(file: Path) -> bool:
    """Return True when the file's extension is one the loader can parse."""
    return file.suffix in {".json", ".csv", ".jsonl"}
14
+
15
+
16
def _collect_files(paths: list[Path], recursive: bool = False) -> list[Path]:
    """
    Collect all supported files from the given paths.

    Handles both files and directories. If a path is a directory,
    collects all supported files within it (recursively if specified).

    Directory entries are visited in sorted order so the returned list is
    deterministic across platforms (``Path.iterdir`` yields entries in an
    arbitrary, OS-dependent order).
    """
    collected_files: list[Path] = []

    for path in paths:
        if path.is_file():
            if _is_supported_file(path):
                collected_files.append(path)
            else:
                LOG.warning(f"Skipping unsupported file type: {path}")
        elif path.is_dir():
            # Sort for a stable, reproducible traversal order.
            for item in sorted(path.iterdir()):
                if item.is_file() and _is_supported_file(item):
                    collected_files.append(item)
                elif recursive and item.is_dir():
                    # Recursively collect files from subdirectories
                    collected_files.extend(_collect_files([item], recursive=True))
        else:
            LOG.warning(f"Path does not exist or is not accessible: {path}")

    return collected_files
42
+
43
+
44
def _read_file(file: Path) -> list[dict[str, Any]]:
    """Read data from a single file and return as a list of dictionaries.

    Supports ``.json`` (a single object or a list of objects), ``.csv``
    (one dict per row, keyed by the header), and ``.jsonl`` (one JSON
    object per non-blank line).

    Raises:
        ValueError: if the file extension is not supported.
    """
    if file.suffix == ".json":
        result = orjson.loads(file.read_bytes())
        return result if isinstance(result, list) else [result]
    elif file.suffix == ".csv":
        # Open with newline="" so the csv module performs its own newline
        # handling; feeding read_text().splitlines() to DictReader would
        # corrupt quoted fields that contain embedded newlines.
        with file.open(newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]
    elif file.suffix == ".jsonl":
        return [
            orjson.loads(line) for line in file.read_text().splitlines() if line.strip()
        ]
    else:
        raise ValueError(f"Unsupported file type: {file.suffix}")
60
+
61
+
62
def load_from_paths(paths: list[Path], recursive: bool = False) -> list[dict[str, Any]]:
    """
    Load data from all files in the specified paths.

    First collects all file paths, then reads each file's data.
    """
    files = _collect_files(paths, recursive)

    if not files:
        LOG.warning("No supported files found in the specified paths")
        return []

    LOG.info(f"Found {len(files)} file(s) to read")

    records: list[dict[str, Any]] = []
    for file in files:
        try:
            rows = _read_file(file)
            records.extend(rows)
            LOG.info(f"Read {len(rows)} record(s) from {file}")
        except Exception as e:
            # Surface the failure: log which file broke, then re-raise.
            LOG.error(f"Error reading file {file}: {e}")
            raise

    return records
87
+
88
+
89
def parse_paths(paths: list[str]) -> list[Path]:
    """Convert raw path strings into ``Path`` objects."""
    return list(map(Path, paths))
lmnr/sdk/decorators.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from lmnr.opentelemetry_lib.decorators import (
2
- entity_method,
3
- aentity_method,
4
- json_dumps,
2
+ observe_base,
3
+ async_observe_base,
5
4
  )
6
5
  from opentelemetry.trace import INVALID_SPAN, get_current_span
7
6
 
8
- from typing import Any, Callable, Literal, TypeVar, cast
7
+ from typing import Any, Callable, Coroutine, Literal, TypeVar, overload
9
8
  from typing_extensions import ParamSpec
10
9
 
11
10
  from lmnr.opentelemetry_lib.tracing.attributes import SESSION_ID
12
11
  from lmnr.sdk.log import get_default_logger
12
+ from lmnr.sdk.types import TraceType
13
13
 
14
14
  from .utils import is_async
15
15
 
@@ -19,6 +19,8 @@ P = ParamSpec("P")
19
19
  R = TypeVar("R")
20
20
 
21
21
 
22
+ # Overload for synchronous functions
23
+ @overload
22
24
  def observe(
23
25
  *,
24
26
  name: str | None = None,
@@ -28,9 +30,52 @@ def observe(
28
30
  ignore_output: bool = False,
29
31
  span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
30
32
  ignore_inputs: list[str] | None = None,
33
+ input_formatter: Callable[..., str] | None = None,
34
+ output_formatter: Callable[..., str] | None = None,
31
35
  metadata: dict[str, Any] | None = None,
32
36
  tags: list[str] | None = None,
33
- ) -> Callable[[Callable[P, R]], Callable[P, R]]:
37
+ preserve_global_context: bool = False,
38
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
39
+
40
+
41
+ # Overload for asynchronous functions
42
+ @overload
43
+ def observe(
44
+ *,
45
+ name: str | None = None,
46
+ session_id: str | None = None,
47
+ user_id: str | None = None,
48
+ ignore_input: bool = False,
49
+ ignore_output: bool = False,
50
+ span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
51
+ ignore_inputs: list[str] | None = None,
52
+ input_formatter: Callable[..., str] | None = None,
53
+ output_formatter: Callable[..., str] | None = None,
54
+ metadata: dict[str, Any] | None = None,
55
+ tags: list[str] | None = None,
56
+ preserve_global_context: bool = False,
57
+ ) -> Callable[
58
+ [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]
59
+ ]: ...
60
+
61
+
62
+ # Implementation
63
+ def observe(
64
+ *,
65
+ name: str | None = None,
66
+ session_id: str | None = None,
67
+ user_id: str | None = None,
68
+ ignore_input: bool = False,
69
+ ignore_output: bool = False,
70
+ span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
71
+ ignore_inputs: list[str] | None = None,
72
+ input_formatter: Callable[..., str] | None = None,
73
+ output_formatter: Callable[..., str] | None = None,
74
+ metadata: dict[str, Any] | None = None,
75
+ tags: list[str] | None = None,
76
+ preserve_global_context: bool = False,
77
+ ):
78
+ # Return type is determined by overloads above
34
79
  """The main decorator entrypoint for Laminar. This is used to wrap
35
80
  functions and methods to create spans.
36
81
 
@@ -53,10 +98,24 @@ def observe(
53
98
  def foo(a, b, `sensitive_data`), and you want to ignore the\
54
99
  `sensitive_data` argument, you can pass ["sensitive_data"] to\
55
100
  this argument. Defaults to None.
101
+ input_formatter (Callable[P, str] | None, optional): A custom function\
102
+ to format the input of the wrapped function. This function should\
103
+ accept the same parameters as the wrapped function and return a string.\
104
+ All function arguments are passed to this function. Ignored if\
105
+ `ignore_input` is True. Does not respect `ignore_inputs` argument.
106
+ Defaults to None.
107
+ output_formatter (Callable[[R], str] | None, optional): A custom function\
108
+ to format the output of the wrapped function. This function should\
109
+ accept a single parameter (the return value of the wrapped function)\
110
+ and return a string. Ignored if `ignore_output` is True.\
111
+ Defaults to None.
56
112
  metadata (dict[str, Any] | None, optional): Metadata to associate with\
57
113
  the trace. Must be JSON serializable. Defaults to None.
58
114
  tags (list[str] | None, optional): Tags to associate with the trace.
59
115
  Defaults to None.
116
+ preserve_global_context (bool, optional): Whether to preserve the global\
117
+ OpenTelemetry context. If set to True, Laminar spans will continue\
118
+ traces started in the global context. Defaults to False.
60
119
  Raises:
61
120
  Exception: re-raises the exception if the wrapped function raises an\
62
121
  exception
@@ -65,7 +124,9 @@ def observe(
65
124
  R: Returns the result of the wrapped function
66
125
  """
67
126
 
68
- def decorator(func: Callable) -> Callable:
127
+ def decorator(
128
+ func: Callable[P, R] | Callable[P, Coroutine[Any, Any, R]],
129
+ ) -> Callable[P, R] | Callable[P, Coroutine[Any, Any, R]]:
69
130
  current_span = get_current_span()
70
131
  if current_span != INVALID_SPAN:
71
132
  if session_id is not None:
@@ -75,41 +136,60 @@ def observe(
75
136
  association_properties["session_id"] = session_id
76
137
  if user_id is not None:
77
138
  association_properties["user_id"] = user_id
78
- if metadata is not None:
79
- association_properties.update(
80
- {
81
- f"metadata.{k}": (
82
- v if isinstance(v, (str, int, float, bool)) else json_dumps(v)
83
- )
84
- for k, v in metadata.items()
85
- }
86
- )
139
+ if span_type in ["EVALUATION", "EXECUTOR", "EVALUATOR"]:
140
+ association_properties["trace_type"] = TraceType.EVALUATION.value
87
141
  if tags is not None:
88
142
  if not isinstance(tags, list) or not all(
89
143
  isinstance(tag, str) for tag in tags
90
144
  ):
91
145
  logger.warning("Tags must be a list of strings. Tags will be ignored.")
92
146
  else:
93
- association_properties["tags"] = tags
94
- result = (
95
- aentity_method(
147
+ # list(set(tags)) to deduplicate tags
148
+ association_properties["tags"] = list(set(tags))
149
+ if input_formatter is not None and ignore_input:
150
+ logger.warning(
151
+ f"observe, function {func.__name__}: Input formatter"
152
+ " is ignored because `ignore_input` is True. Specify only one of"
153
+ " `ignore_input` or `input_formatter`."
154
+ )
155
+ if input_formatter is not None and ignore_inputs is not None:
156
+ logger.warning(
157
+ f"observe, function {func.__name__}: Both input formatter and"
158
+ " `ignore_inputs` are specified. Input formatter"
159
+ " will pass all arguments to the formatter regardless of"
160
+ " `ignore_inputs`."
161
+ )
162
+ if output_formatter is not None and ignore_output:
163
+ logger.warning(
164
+ f"observe, function {func.__name__}: Output formatter"
165
+ " is ignored because `ignore_output` is True. Specify only one of"
166
+ " `ignore_output` or `output_formatter`."
167
+ )
168
+ if is_async(func):
169
+ return async_observe_base(
96
170
  name=name,
97
171
  ignore_input=ignore_input,
98
172
  ignore_output=ignore_output,
99
173
  span_type=span_type,
174
+ metadata=metadata,
100
175
  ignore_inputs=ignore_inputs,
176
+ input_formatter=input_formatter,
177
+ output_formatter=output_formatter,
101
178
  association_properties=association_properties,
179
+ preserve_global_context=preserve_global_context,
102
180
  )(func)
103
- if is_async(func)
104
- else entity_method(
181
+ else:
182
+ return observe_base(
105
183
  name=name,
106
184
  ignore_input=ignore_input,
107
185
  ignore_output=ignore_output,
108
186
  span_type=span_type,
187
+ metadata=metadata,
109
188
  ignore_inputs=ignore_inputs,
189
+ input_formatter=input_formatter,
190
+ output_formatter=output_formatter,
110
191
  association_properties=association_properties,
192
+ preserve_global_context=preserve_global_context,
111
193
  )(func)
112
- )
113
- return result
114
194
 
115
- return cast(Callable, decorator)
195
+ return decorator
lmnr/sdk/evaluations.py CHANGED
@@ -2,11 +2,13 @@ import asyncio
2
2
  import re
3
3
  import uuid
4
4
 
5
+ from typing import Any
6
+ from typing_extensions import TypedDict
7
+
5
8
  from tqdm import tqdm
6
- from typing import Any, Awaitable
7
9
 
8
10
  from lmnr.opentelemetry_lib.tracing.instruments import Instruments
9
- from lmnr.opentelemetry_lib.tracing.attributes import SPAN_TYPE
11
+ from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE
10
12
 
11
13
  from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
12
14
  from lmnr.sdk.client.synchronous.sync_client import LaminarClient
@@ -16,6 +18,7 @@ from lmnr.sdk.laminar import Laminar as L
16
18
  from lmnr.sdk.log import get_default_logger
17
19
  from lmnr.sdk.types import (
18
20
  Datapoint,
21
+ EvaluationDatapointDatasetLink,
19
22
  EvaluationResultDatapoint,
20
23
  EvaluatorFunction,
21
24
  ExecutorFunction,
@@ -26,12 +29,20 @@ from lmnr.sdk.types import (
26
29
  SpanType,
27
30
  TraceType,
28
31
  )
29
- from lmnr.sdk.utils import from_env, is_async
32
+ from lmnr.sdk.utils import from_env, is_async, json_dumps
30
33
 
31
34
  DEFAULT_BATCH_SIZE = 5
32
35
  MAX_EXPORT_BATCH_SIZE = 64
33
36
 
34
37
 
38
+ class EvaluationRunResult(TypedDict):
39
+ average_scores: dict[str, Numeric]
40
+ evaluation_id: uuid.UUID
41
+ project_id: uuid.UUID
42
+ url: str
43
+ error_message: str | None
44
+
45
+
35
46
  def get_evaluation_url(
36
47
  project_id: str, evaluation_id: str, base_url: str | None = None
37
48
  ):
@@ -57,7 +68,7 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu
57
68
  average_scores = {}
58
69
  for key, values in per_score_values.items():
59
70
  scores = [v for v in values if v is not None]
60
-
71
+
61
72
  # If there are no scores, we don't want to include the key in the average scores
62
73
  if len(scores) > 0:
63
74
  average_scores[key] = sum(scores) / len(scores)
@@ -79,21 +90,21 @@ class EvaluationReporter:
79
90
  def update(self, batch_length: int):
80
91
  self.cli_progress.update(batch_length)
81
92
 
82
- def stopWithError(self, error: Exception):
83
- self.cli_progress.close()
93
+ def stop_with_error(self, error: Exception):
94
+ if hasattr(self, "cli_progress"):
95
+ self.cli_progress.close()
84
96
  raise error
85
97
 
86
98
  def stop(
87
99
  self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
88
100
  ):
89
101
  self.cli_progress.close()
90
- print(
91
- f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
92
- )
93
102
  print("Average scores:")
94
103
  for name, score in average_scores.items():
95
104
  print(f"{name}: {score}")
96
- print("\n")
105
+ print(
106
+ f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
107
+ )
97
108
 
98
109
 
99
110
  class Evaluation:
@@ -108,9 +119,15 @@ class Evaluation:
108
119
  concurrency_limit: int = DEFAULT_BATCH_SIZE,
109
120
  project_api_key: str | None = None,
110
121
  base_url: str | None = None,
122
+ base_http_url: str | None = None,
111
123
  http_port: int | None = None,
112
124
  grpc_port: int | None = None,
113
- instruments: set[Instruments] | None = None,
125
+ instruments: (
126
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
127
+ ) = None,
128
+ disabled_instruments: (
129
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
130
+ ) = None,
114
131
  max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
115
132
  trace_export_timeout_seconds: int | None = None,
116
133
  ):
@@ -157,6 +174,10 @@ class Evaluation:
157
174
  Useful if self-hosted. Do NOT include the port, use `http_port`\
158
175
  and `grpc_port` instead.
159
176
  Defaults to "https://api.lmnr.ai".
177
+ base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
178
+ Only set this if your Laminar backend HTTP is proxied\
179
+ through a different host. If not specified, defaults\
180
+ to https://api.lmnr.ai.
160
181
  http_port (int | None, optional): The port for Laminar API\
161
182
  HTTP service. Defaults to 443 if not specified.
162
183
  grpc_port (int | None, optional): The port for Laminar API\
@@ -166,6 +187,10 @@ class Evaluation:
166
187
  used.
167
188
  See https://docs.lmnr.ai/tracing/automatic-instrumentation
168
189
  Defaults to None.
190
+ disabled_instruments (set[Instruments] | None, optional): Set of modules\
191
+ to disable auto-instrumentations. If None, only modules passed\
192
+ as `instruments` will be disabled.
193
+ Defaults to None.
169
194
  """
170
195
 
171
196
  if not evaluators:
@@ -190,6 +215,8 @@ class Evaluation:
190
215
  ]
191
216
  else:
192
217
  self.data = data
218
+ if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
219
+ raise ValueError("No data provided. Skipping evaluation")
193
220
  self.executor = executor
194
221
  self.evaluators = evaluators
195
222
  self.group_name = group_name
@@ -199,7 +226,7 @@ class Evaluation:
199
226
  self.batch_size = concurrency_limit
200
227
  self._logger = get_default_logger(self.__class__.__name__)
201
228
  self.upload_tasks = []
202
- self.base_http_url = f"{base_url}:{http_port or 443}"
229
+ self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
203
230
 
204
231
  api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
205
232
  if not api_key and not L.is_initialized():
@@ -224,31 +251,51 @@ class Evaluation:
224
251
  L.initialize(
225
252
  project_api_key=project_api_key,
226
253
  base_url=base_url,
254
+ base_http_url=self.base_http_url,
227
255
  http_port=http_port,
228
256
  grpc_port=grpc_port,
229
257
  instruments=instruments,
258
+ disabled_instruments=disabled_instruments,
230
259
  max_export_batch_size=max_export_batch_size,
231
260
  export_timeout_seconds=trace_export_timeout_seconds,
232
261
  )
233
262
 
234
- async def run(self) -> Awaitable[dict[str, int | float]]:
263
+ async def run(self) -> EvaluationRunResult:
235
264
  return await self._run()
236
265
 
237
- async def _run(self) -> dict[str, int | float]:
266
+ async def _run(self) -> EvaluationRunResult:
238
267
  if isinstance(self.data, LaminarDataset):
239
268
  self.data.set_client(
240
269
  LaminarClient(
241
- self.base_http_url,
242
- self.project_api_key,
270
+ base_url=self.base_http_url,
271
+ project_api_key=self.project_api_key,
243
272
  )
244
273
  )
245
- self.reporter.start(len(self.data))
274
+ if not self.data.id:
275
+ try:
276
+ datasets = await self.client.datasets.get_dataset_by_name(
277
+ self.data.name
278
+ )
279
+ if len(datasets) == 0:
280
+ self._logger.warning(f"Dataset {self.data.name} not found")
281
+ else:
282
+ self.data.id = datasets[0].id
283
+ except Exception as e:
284
+ # Backward compatibility with old Laminar API (self hosted)
285
+ self._logger.warning(f"Error getting dataset {self.data.name}: {e}")
286
+
246
287
  try:
247
288
  evaluation = await self.client.evals.init(
248
289
  name=self.name, group_name=self.group_name, metadata=self.metadata
249
290
  )
250
- result_datapoints = await self._evaluate_in_batches(evaluation.id)
291
+ evaluation_id = evaluation.id
292
+ project_id = evaluation.projectId
293
+ url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
294
+
295
+ print(f"Check the results at {url}")
251
296
 
297
+ self.reporter.start(len(self.data))
298
+ result_datapoints = await self._evaluate_in_batches(evaluation.id)
252
299
  # Wait for all background upload tasks to complete
253
300
  if self.upload_tasks:
254
301
  self._logger.debug(
@@ -257,14 +304,19 @@ class Evaluation:
257
304
  await asyncio.gather(*self.upload_tasks)
258
305
  self._logger.debug("All upload tasks completed")
259
306
  except Exception as e:
260
- self.reporter.stopWithError(e)
261
307
  await self._shutdown()
262
- raise
308
+ self.reporter.stop_with_error(e)
263
309
 
264
310
  average_scores = get_average_scores(result_datapoints)
265
311
  self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
266
312
  await self._shutdown()
267
- return average_scores
313
+ return {
314
+ "average_scores": average_scores,
315
+ "evaluation_id": evaluation_id,
316
+ "project_id": project_id,
317
+ "url": url,
318
+ "error_message": None,
319
+ }
268
320
 
269
321
  async def _shutdown(self):
270
322
  # We use flush() instead of shutdown() because multiple evaluations
@@ -319,6 +371,7 @@ class Evaluation:
319
371
  int=executor_span.get_span_context().span_id
320
372
  )
321
373
  trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
374
+
322
375
  partial_datapoint = PartialEvaluationDatapoint(
323
376
  id=evaluation_id,
324
377
  data=datapoint.data,
@@ -328,6 +381,12 @@ class Evaluation:
328
381
  executor_span_id=executor_span_id,
329
382
  metadata=datapoint.metadata,
330
383
  )
384
+ if isinstance(self.data, LaminarDataset):
385
+ partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
386
+ dataset_id=self.data.id,
387
+ datapoint_id=datapoint.id,
388
+ created_at=datapoint.created_at,
389
+ )
331
390
  # First, create datapoint with trace_id so that we can show the dp in the UI
332
391
  await self.client.evals.save_datapoints(
333
392
  eval_id, [partial_datapoint], self.group_name
@@ -352,22 +411,28 @@ class Evaluation:
352
411
  if isinstance(evaluator, HumanEvaluator):
353
412
  # Create an empty span for human evaluators
354
413
  with L.start_as_current_span(
355
- evaluator_name,
356
- input={"output": output, "target": target}
414
+ evaluator_name, input={"output": output, "target": target}
357
415
  ) as human_evaluator_span:
358
- human_evaluator_span.set_attribute(SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value)
416
+ human_evaluator_span.set_attribute(
417
+ SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
418
+ )
419
+ if evaluator.options:
420
+ human_evaluator_span.set_attribute(
421
+ HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
422
+ )
359
423
  # Human evaluators don't execute automatically, just create the span
360
424
  L.set_span_output(None)
361
-
425
+
362
426
  # We don't want to save the score for human evaluators
363
427
  scores[evaluator_name] = None
364
428
  else:
365
429
  # Regular evaluator function
366
430
  with L.start_as_current_span(
367
- evaluator_name,
368
- input={"output": output, "target": target}
431
+ evaluator_name, input={"output": output, "target": target}
369
432
  ) as evaluator_span:
370
- evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
433
+ evaluator_span.set_attribute(
434
+ SPAN_TYPE, SpanType.EVALUATOR.value
435
+ )
371
436
  if is_async(evaluator):
372
437
  value = await evaluator(output, target)
373
438
  else:
@@ -385,7 +450,7 @@ class Evaluation:
385
450
 
386
451
  trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
387
452
 
388
- datapoint = EvaluationResultDatapoint(
453
+ eval_datapoint = EvaluationResultDatapoint(
389
454
  id=evaluation_id,
390
455
  data=datapoint.data,
391
456
  target=target,
@@ -396,14 +461,22 @@ class Evaluation:
396
461
  index=index,
397
462
  metadata=datapoint.metadata,
398
463
  )
464
+ if isinstance(self.data, LaminarDataset):
465
+ eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
466
+ dataset_id=self.data.id,
467
+ datapoint_id=datapoint.id,
468
+ created_at=datapoint.created_at,
469
+ )
399
470
 
400
471
  # Create background upload task without awaiting it
401
472
  upload_task = asyncio.create_task(
402
- self.client.evals.save_datapoints(eval_id, [datapoint], self.group_name)
473
+ self.client.evals.save_datapoints(
474
+ eval_id, [eval_datapoint], self.group_name
475
+ )
403
476
  )
404
477
  self.upload_tasks.append(upload_task)
405
478
 
406
- return datapoint
479
+ return eval_datapoint
407
480
 
408
481
 
409
482
  def evaluate(
@@ -416,12 +489,18 @@ def evaluate(
416
489
  concurrency_limit: int = DEFAULT_BATCH_SIZE,
417
490
  project_api_key: str | None = None,
418
491
  base_url: str | None = None,
492
+ base_http_url: str | None = None,
419
493
  http_port: int | None = None,
420
494
  grpc_port: int | None = None,
421
- instruments: set[Instruments] | None = None,
495
+ instruments: (
496
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
497
+ ) = None,
498
+ disabled_instruments: (
499
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
500
+ ) = None,
422
501
  max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
423
502
  trace_export_timeout_seconds: int | None = None,
424
- ) -> Awaitable[None] | None:
503
+ ) -> EvaluationRunResult | None:
425
504
  """
426
505
  If added to the file which is called through `lmnr eval` command, then
427
506
  registers the evaluation; otherwise, runs the evaluation.
@@ -465,6 +544,10 @@ def evaluate(
465
544
  Useful if self-hosted elsewhere. Do NOT include the\
466
545
  port, use `http_port` and `grpc_port` instead.
467
546
  Defaults to "https://api.lmnr.ai".
547
+ base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
548
+ Only set this if your Laminar backend HTTP is proxied\
549
+ through a different host. If not specified, defaults\
550
+ to https://api.lmnr.ai.
468
551
  http_port (int | None, optional): The port for Laminar API's HTTP\
469
552
  service. 443 is used if not specified.
470
553
  Defaults to None.
@@ -475,6 +558,10 @@ def evaluate(
475
558
  auto-instrument. If None, all available instruments\
476
559
  will be used.
477
560
  Defaults to None.
561
+ disabled_instruments (set[Instruments] | None, optional): Set of modules\
562
+ to disable auto-instrumentations. If None, no\
563
+ If None, only modules passed as `instruments` will be disabled.
564
+ Defaults to None.
478
565
  trace_export_timeout_seconds (int | None, optional): The timeout for\
479
566
  trace export on OpenTelemetry exporter. Defaults to None.
480
567
  """
@@ -488,9 +575,11 @@ def evaluate(
488
575
  concurrency_limit=concurrency_limit,
489
576
  project_api_key=project_api_key,
490
577
  base_url=base_url,
578
+ base_http_url=base_http_url,
491
579
  http_port=http_port,
492
580
  grpc_port=grpc_port,
493
581
  instruments=instruments,
582
+ disabled_instruments=disabled_instruments,
494
583
  max_export_batch_size=max_export_batch_size,
495
584
  trace_export_timeout_seconds=trace_export_timeout_seconds,
496
585
  )