lmnr 0.4.23__tar.gz → 0.4.25__tar.gz
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- {lmnr-0.4.23 → lmnr-0.4.25}/PKG-INFO +3 -1
- {lmnr-0.4.23 → lmnr-0.4.25}/README.md +1 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/pyproject.toml +3 -2
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/__init__.py +1 -0
- lmnr-0.4.25/src/lmnr/sdk/datasets.py +58 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/evaluations.py +49 -59
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/laminar.py +51 -2
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/log.py +10 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/types.py +7 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/decorators/base.py +4 -6
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/attributes.py +12 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/LICENSE +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/cli.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/decorators.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/utils.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/.flake8 +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/.python-version +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/config/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/decorators/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/instruments.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_and_external_association_properties.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_association_properties.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_manual_report.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_privacy_no_prompts/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_prompt_management/test_prompt_management.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_sdk_initialization/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_tasks/test_task_io_serialization_with_langchain.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_aworkflow.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_streaming_workflow.yaml +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/conftest.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_association_properties.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_manual.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_nested_tasks.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_privacy_no_prompts.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_sdk_initialization.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_tasks.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tests/test_workflows.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/content_allow_list.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/context_manager.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/tracing.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/utils/__init__.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/utils/in_memory_span_exporter.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/utils/json_encoder.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/utils/package_check.py +0 -0
- {lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/version.py +0 -0
{lmnr-0.4.23 → lmnr-0.4.25}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lmnr
-Version: 0.4.23
+Version: 0.4.25
 Summary: Python SDK for Laminar AI
 License: Apache-2.0
 Author: lmnr.ai
@@ -16,6 +16,7 @@ Requires-Dist: argparse (>=1.0,<2.0)
 Requires-Dist: backoff (>=2.0,<3.0)
 Requires-Dist: deprecated (>=1.0,<2.0)
 Requires-Dist: jinja2 (>=3.0,<4.0)
+Requires-Dist: openai (>=1.52.0,<2.0.0)
 Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
 Requires-Dist: opentelemetry-exporter-otlp-proto-grpc (>=1.27.0,<2.0.0)
 Requires-Dist: opentelemetry-exporter-otlp-proto-http (>=1.27.0,<2.0.0)
@@ -246,6 +247,7 @@ You can run evaluations locally by providing executor (part of the logic used in
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. It can be both a function or an `async` function.
 - `evaluators` – Dictionary which maps evaluator names to evaluators. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Each function can produce either a single number or `dict[str, int|float]` of scores. Each evaluator can be both a function or an `async` function.
 - `name` – optional name for the evaluation. Automatically generated if not provided.
+- `group_id` – optional group name for the evaluation. Evaluations within the same group can be compared visually side-by-side
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
```
{lmnr-0.4.23 → lmnr-0.4.25}/README.md

```diff
@@ -188,6 +188,7 @@ You can run evaluations locally by providing executor (part of the logic used in
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. It can be both a function or an `async` function.
 - `evaluators` – Dictionary which maps evaluator names to evaluators. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Each function can produce either a single number or `dict[str, int|float]` of scores. Each evaluator can be both a function or an `async` function.
 - `name` – optional name for the evaluation. Automatically generated if not provided.
+- `group_id` – optional group name for the evaluation. Evaluations within the same group can be compared visually side-by-side
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
```
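Read together, these bullets imply a call shape like the following sketch. The datapoint dict layout (`data`/`target` keys), the executor, and the evaluator are invented for illustration; only the argument names come from the README above, and `evaluate` is assumed to be exported from the top-level `lmnr` package as in the package's other examples.

```python
from lmnr import evaluate  # assumed top-level export

def executor(data: dict) -> str:
    # the logic under evaluation; receives the datapoint's `data` first
    return data["question"].strip().lower()

def exact_match(output: str, target: dict) -> float:
    # receives the executor output first and `target` second; returns a number
    return float(output == target["expected"])

evaluate(
    data=[{"data": {"question": " Hi "}, "target": {"expected": "hi"}}],
    executor=executor,
    evaluators={"exact_match": exact_match},
    name="baseline",        # optional; auto-generated when omitted
    group_id="my-feature",  # runs sharing a group_id are compared side-by-side
)
```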
{lmnr-0.4.23 → lmnr-0.4.25}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "lmnr"
-version = "0.4.23"
+version = "0.4.25"
 description = "Python SDK for Laminar AI"
 authors = [
     { name = "lmnr.ai", email = "founders@lmnr.ai" }
@@ -11,7 +11,7 @@ license = "Apache-2.0"
 
 [tool.poetry]
 name = "lmnr"
-version = "0.4.23"
+version = "0.4.25"
 description = "Python SDK for Laminar AI"
 authors = ["lmnr.ai"]
 readme = "README.md"
@@ -62,6 +62,7 @@ opentelemetry-instrumentation-groq = ">=0.33.1"
 tqdm = "~=4.0"
 argparse = "~=1.0"
 
+openai = "^1.52.0"
 [tool.poetry.group.dev.dependencies]
 autopep8 = "^2.2.0"
 flake8 = "7.0.0"
```
lmnr-0.4.25/src/lmnr/sdk/datasets.py (new file)

```diff
@@ -0,0 +1,58 @@
+from abc import ABC, abstractmethod
+import logging
+
+from .log import get_default_logger
+from .laminar import Laminar as L
+from .types import (
+    Datapoint,
+)
+
+DEFAULT_FETCH_SIZE = 25
+
+
+class EvaluationDataset(ABC):
+    @abstractmethod
+    def __init__(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def __len__(self) -> int:
+        pass
+
+    @abstractmethod
+    def __getitem__(self, idx) -> Datapoint:
+        pass
+
+    def slice(self, start: int, end: int):
+        return [self[i] for i in range(max(start, 0), min(end, len(self)))]
+
+
+class LaminarDataset(EvaluationDataset):
+    def __init__(self, name: str, fetch_size: int = DEFAULT_FETCH_SIZE):
+        self.name = name
+        self._len = None
+        self._fetched_items = []
+        self._offset = 0
+        self._fetch_size = fetch_size
+        self._logger = get_default_logger(self.__class__.__name__, level=logging.DEBUG)
+
+    def _fetch_batch(self):
+        self._logger.debug(
+            f"dataset {self.name}. Fetching batch from {self._offset} to "
+            + f"{self._offset + self._fetch_size}"
+        )
+        resp = L.get_datapoints(self.name, self._offset, self._fetch_size)
+        self._fetched_items += resp.items
+        self._offset = len(self._fetched_items)
+        if self._len is None:
+            self._len = resp.totalCount
+
+    def __len__(self) -> int:
+        if self._len is None:
+            self._fetch_batch()
+        return self._len
+
+    def __getitem__(self, idx) -> Datapoint:
+        if idx >= len(self._fetched_items):
+            self._fetch_batch()
+        return self._fetched_items[idx]
```
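The new module splits the abstract `EvaluationDataset` from its Laminar-backed implementation: subclasses only need `__init__`, `__len__`, and `__getitem__`, and inherit `slice`. A minimal in-memory subclass as illustration; the class name and field values are invented, not part of the package:

```python
from lmnr.sdk.datasets import EvaluationDataset
from lmnr.sdk.types import Datapoint

class InMemoryDataset(EvaluationDataset):
    """Hypothetical subclass wrapping a plain list of Datapoint objects."""

    def __init__(self, points: list[Datapoint]):
        self._points = points

    def __len__(self) -> int:
        return len(self._points)

    def __getitem__(self, idx) -> Datapoint:
        return self._points[idx]

points = [Datapoint(data={"q": "2+2?"}, target={"expected": "4"})]
ds = InMemoryDataset(points)
assert ds.slice(0, 10) == points  # inherited helper clamps the range to len(ds)
```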
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/evaluations.py

```diff
@@ -1,17 +1,18 @@
 import asyncio
 import re
 import sys
-from abc import ABC, abstractmethod
-from contextlib import contextmanager
-from typing import Any, Awaitable, Optional, Set, Union
 import uuid
 
+from contextlib import contextmanager
 from tqdm import tqdm
+from typing import Any, Awaitable, Optional, Set, Union
 
 from ..traceloop_sdk.instruments import Instruments
 from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
 
+from .datasets import EvaluationDataset
 from .laminar import Laminar as L
+from .log import get_default_logger
 from .types import (
     Datapoint,
     EvaluationResultDatapoint,
@@ -84,7 +85,7 @@ class EvaluationReporter:
     ):
         self.cli_progress.close()
         print(
-            f"\nCheck …
+            f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id)}\n"
         )
         print("Average scores:")
         for name, score in average_scores.items():
@@ -92,31 +93,14 @@ class EvaluationReporter:
         print("\n")
 
 
-class EvaluationDataset(ABC):
-    @abstractmethod
-    def __init__(self, *args, **kwargs):
-        pass
-
-    @abstractmethod
-    def __len__(self) -> int:
-        pass
-
-    @abstractmethod
-    def __getitem__(self, idx) -> Datapoint:
-        pass
-
-    def slice(self, start: int, end: int):
-        return [self[i] for i in range(max(start, 0), min(end, len(self)))]
-
-
 class Evaluation:
     def __init__(
         self,
         data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
         executor: Any,
         evaluators: dict[str, EvaluatorFunction],
-        group_id: Optional[str] = None,
         name: Optional[str] = None,
+        group_id: Optional[str] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
         project_api_key: Optional[str] = None,
         base_url: Optional[str] = None,
@@ -135,33 +119,40 @@ class Evaluation:
             executor (Callable[..., Any]): The executor function.\
                 Takes the data point + any additional arguments\
                 and returns the output to evaluate.
-            evaluators (…
-                Each evaluator function takes the output of the executor…
-                the target data, and returns a score. The score can be a\
-                single number or a…
+            evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
+                names. Each evaluator function takes the output of the executor\
+                _and_ the target data, and returns a score. The score can be a\
+                single number or a dict of string keys and number values.\
                 If the score is a single number, it will be named after the\
-                evaluator function.
-            … [10 removed lines truncated in source]
+                evaluator function. Evaluator function names must contain only\
+                letters, digits, hyphens, underscores, or spaces.
+            name (Optional[str], optional): Optional name of the evaluation.\
+                Used to identify the evaluation in the group.\
+                If not provided, a random name will be generated.
+                Defaults to None.
+            group_id (Optional[str], optional): an identifier to group\
+                evaluations. Only evaluations within the same group_id can be\
+                visually compared. If not provided, "default" is assigned.
+                Defaults to None
+            batch_size (int, optional): The batch size for evaluation. This many\
+                data points will be evaluated in parallel.
+                Defaults to DEFAULT_BATCH_SIZE.
+            project_api_key (Optional[str], optional): The project API key.\
+                If not provided, LMNR_PROJECT_API_KEY environment variable is\
+                used.
+                Defaults to an empty string.
             base_url (Optional[str], optional): The base URL for Laminar API.\
-            … [3 removed lines truncated in source]
+                Useful if self-hosted. Do NOT include the port, use `http_port`\
+                and `grpc_port` instead.
+                Defaults to "https://api.lmnr.ai".
             http_port (Optional[int], optional): The port for Laminar API\
-                …
+                HTTP service. Defaults to 443 if not specified.
             grpc_port (Optional[int], optional): The port for Laminar API\
-                …
+                gRPC service. Defaults to 8443 if not specified.
             instruments (Optional[Set[Instruments]], optional): Set of modules\
                 to auto-instrument. If None, all available instruments will be\
                 used.
+                See https://docs.lmnr.ai/tracing/automatic-instrumentation
                 Defaults to None.
         """
 
@@ -191,6 +182,7 @@ class Evaluation:
         self.group_id = group_id
         self.name = name
         self.batch_size = batch_size
+        self._logger = get_default_logger(self.__class__.__name__)
         L.initialize(
             project_api_key=project_api_key,
             base_url=base_url,
@@ -215,7 +207,7 @@ class Evaluation:
         )
 
         try:
-            result_datapoints = await self.…
+            result_datapoints = await self._evaluate_in_batches()
         except Exception as e:
             self.reporter.stopWithError(e)
             self.is_finished = True
@@ -228,7 +220,7 @@ class Evaluation:
         self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
         self.is_finished = True
 
-    async def …
+    async def _evaluate_in_batches(self) -> list[EvaluationResultDatapoint]:
         result_datapoints = []
         for i in range(0, len(self.data), self.batch_size):
             batch = (
@@ -300,8 +292,8 @@ def evaluate(
     data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
     executor: ExecutorFunction,
    evaluators: dict[str, EvaluatorFunction],
-    group_id: Optional[str] = None,
     name: Optional[str] = None,
+    group_id: Optional[str] = None,
     batch_size: int = DEFAULT_BATCH_SIZE,
     project_api_key: Optional[str] = None,
     base_url: Optional[str] = None,
@@ -326,24 +318,22 @@ def evaluate(
        executor (Callable[..., Any]): The executor function.\
            Takes the data point + any additional arguments\
            and returns the output to evaluate.
-        evaluators (List[Callable[..., Any]]): …
-        … [6 removed lines truncated in source]
-        group_id (Optional[str], optional): an identifier to group evaluations.\
-            It is practical to group evaluations that evaluate\
-            the same feature on the same dataset, to be able to\
-            view their comparisons in the same place. If not\
-            provided, defaults to "default".
-            Defaults to None
+        evaluators (List[Callable[..., Any]]):
+        evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
+            names. Each evaluator function takes the output of the executor\
+            _and_ the target data, and returns a score. The score can be a\
+            single number or a dict of string keys and number values.\
+            If the score is a single number, it will be named after the\
+            evaluator function. Evaluator function names must contain only\
+            letters, digits, hyphens, underscores, or spaces.
        name (Optional[str], optional): Optional name of the evaluation.\
            Used to identify the evaluation in the group.\
            If not provided, a random name will be generated.
            Defaults to None.
+        group_id (Optional[str], optional): an identifier to group evaluations.\
+            Only evaluations within the same group_id can be\
+            visually compared. If not provided, set to "default".
+            Defaults to None
        batch_size (int, optional): The batch size for evaluation.
            Defaults to DEFAULT_BATCH_SIZE.
        project_api_key (Optional[str], optional): The project API key.
```
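Since `EvaluationDataset` now lives in `datasets.py`, a hosted dataset can be passed straight to `evaluate`. A sketch, assuming a dataset with this name exists in your Laminar project and that the hypothetical `input` key is present in its datapoints:

```python
from lmnr import evaluate  # assumed top-level export
from lmnr.sdk.datasets import LaminarDataset

# Datapoints are pulled lazily from Laminar, `fetch_size` at a time (25 by default).
data = LaminarDataset("my-hosted-dataset", fetch_size=50)

evaluate(
    data=data,
    executor=lambda data: data["input"],                        # identity-style executor
    evaluators={"length": lambda output, target: len(output)},  # toy numeric score
    group_id="regression-suite",                                # compare runs within this group
)
```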
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/laminar.py

````diff
@@ -24,6 +24,7 @@ import json
 import logging
 import os
 import requests
+import urllib.parse
 import uuid
 
 from lmnr.traceloop_sdk.tracing.attributes import (
@@ -45,6 +46,7 @@ from .log import VerboseColorfulFormatter
 from .types import (
     CreateEvaluationResponse,
     EvaluationResultDatapoint,
+    GetDatapointsResponse,
     PipelineRunError,
     PipelineRunResponse,
     NodeInput,
@@ -284,7 +286,9 @@ class Laminar:
         span_type: Union[Literal["DEFAULT"], Literal["LLM"]] = "DEFAULT",
     ):
         """Start a new span as the current span. Useful for manual
-        instrumentation.
+        instrumentation. If `span_type` is set to `"LLM"`, you should report
+        usage and response attributes manually. See `Laminar.set_span_attributes`
+        for more information.
 
         Usage example:
         ```python
@@ -297,6 +301,9 @@ class Laminar:
             name (str): name of the span
             input (Any, optional): input to the span. Will be sent as an\
                 attribute, so must be json serializable. Defaults to None.
+            span_type (Union[Literal["DEFAULT"], Literal["LLM"]], optional):\
+                type of the span. If you use `"LLM"`, you should report usage\
+                and response attributes manually. Defaults to "DEFAULT".
         """
         with get_tracer() as tracer:
             span_path = get_span_path(name)
@@ -341,6 +348,22 @@ class Laminar:
     ):
         """Set attributes for the current span. Useful for manual
         instrumentation.
+        Example:
+        ```python
+        with L.start_as_current_span(
+            name="my_span_name", input=input["messages"], span_type="LLM"
+        ):
+            response = await my_custom_call_to_openai(input)
+            L.set_span_output(response["choices"][0]["message"]["content"])
+            L.set_span_attributes({
+                Attributes.PROVIDER: 'openai',
+                Attributes.REQUEST_MODEL: input["model"],
+                Attributes.RESPONSE_MODEL: response["model"],
+                Attributes.INPUT_TOKEN_COUNT: response["usage"]["prompt_tokens"],
+                Attributes.OUTPUT_TOKEN_COUNT: response["usage"]["completion_tokens"],
+            })
+        # ...
+        ```
 
         Args:
             attributes (dict[ATTRIBUTES, Any]): attributes to set for the span
@@ -433,10 +456,36 @@ class Laminar:
         try:
             resp_json = response.json()
             raise ValueError(f"Error creating evaluation {json.dumps(resp_json)}")
-        except …
+        except requests.exceptions.RequestException:
             raise ValueError(f"Error creating evaluation {response.text}")
         return CreateEvaluationResponse.model_validate(response.json())
 
+    @classmethod
+    def get_datapoints(
+        cls,
+        dataset_name: str,
+        offset: int,
+        limit: int,
+    ) -> GetDatapointsResponse:
+        params = {"name": dataset_name, "offset": offset, "limit": limit}
+        url = (
+            cls.__base_http_url
+            + "/v1/datasets/datapoints?"
+            + urllib.parse.urlencode(params)
+        )
+        response = requests.get(url, headers=cls._headers())
+        if response.status_code != 200:
+            try:
+                resp_json = response.json()
+                raise ValueError(
+                    f"Error fetching datapoints: [{response.status_code}] {json.dumps(resp_json)}"
+                )
+            except requests.exceptions.RequestException:
+                raise ValueError(
+                    f"Error fetching datapoints: [{response.status_code}] {response.text}"
+                )
+        return GetDatapointsResponse.model_validate(response.json())
+
     @classmethod
     def _headers(cls):
         assert cls.__project_api_key is not None, "Project API key is not set"
````
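`get_datapoints` is the classmethod that `LaminarDataset._fetch_batch` calls. Paging through a dataset by hand could look like the sketch below; the dataset name is hypothetical, and `initialize` must have been called (or `LMNR_PROJECT_API_KEY` set) first:

```python
from lmnr import Laminar as L  # assumed top-level export

L.initialize(project_api_key="<your-project-api-key>")

offset, limit, items = 0, 25, []
while True:
    resp = L.get_datapoints("my-hosted-dataset", offset, limit)
    items.extend(resp.items)
    offset = len(items)  # mirrors LaminarDataset's offset bookkeeping
    if not resp.items or offset >= resp.totalCount:
        break
print(f"fetched {len(items)} of {resp.totalCount} datapoints")
```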
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/log.py

```diff
@@ -37,3 +37,13 @@ class VerboseFormatter(CustomFormatter):
     def format(self, record):
         formatter = logging.Formatter(self.fmt)
         return formatter.format(record)
+
+
+def get_default_logger(name: str, level: int = logging.INFO, propagate: bool = False):
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    console_log_handler = logging.StreamHandler()
+    console_log_handler.setFormatter(VerboseColorfulFormatter())
+    logger.addHandler(console_log_handler)
+    logger.propagate = propagate
+    return logger
```
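Both `LaminarDataset` and `Evaluation` use this helper for their internal logging; it can be called directly as well. A small sketch (note that each call attaches a fresh `StreamHandler`, so reuse the returned logger rather than calling this repeatedly for the same name):

```python
import logging
from lmnr.sdk.log import get_default_logger

logger = get_default_logger("my_component", level=logging.DEBUG)
logger.debug("colorful console output; propagation to the root logger is off by default")
```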
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/sdk/types.py

```diff
@@ -79,6 +79,7 @@ class PipelineRunError(Exception):
 
 EvaluationDatapointData = dict[str, Any]
 EvaluationDatapointTarget = dict[str, Any]
+EvaluationDatapointMetadata = Optional[dict[str, Any]]
 
 
 # EvaluationDatapoint is a single data point in the evaluation
@@ -88,6 +89,7 @@ class Datapoint(pydantic.BaseModel):
     # input to the evaluator function (alongside the executor output).
     # Must be a dict with string keys
     target: EvaluationDatapointTarget
+    metadata: EvaluationDatapointMetadata = pydantic.Field(default=None)
 
 
 ExecutorFunctionReturnType = Any
@@ -153,3 +155,8 @@ class TraceType(Enum):
     DEFAULT = "DEFAULT"
     EVENT = "EVENT"  # must not be set manually
     EVALUATION = "EVALUATION"
+
+
+class GetDatapointsResponse(pydantic.BaseModel):
+    items: list[Datapoint]
+    totalCount: int
```
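`metadata` defaults to `None`, so existing datapoint construction keeps working, and `GetDatapointsResponse` models the payload of the new datapoints endpoint. A sketch with invented values (the `data` field is assumed from the surrounding class definition, which this hunk only partially shows):

```python
from lmnr.sdk.types import Datapoint, GetDatapointsResponse

dp = Datapoint(
    data={"question": "What is 2 + 2?"},
    target={"expected": "4"},
    metadata={"split": "test"},  # new optional field in 0.4.25
)

# pydantic v2 round-trip, as model_validate elsewhere in this diff suggests
resp = GetDatapointsResponse.model_validate(
    {"items": [dp.model_dump()], "totalCount": 1}
)
assert resp.items[0].metadata == {"split": "test"}
```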
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/decorators/base.py

```diff
@@ -1,10 +1,10 @@
 import json
 from functools import wraps
+import logging
 import os
 import pydantic
 import types
 from typing import Any, Optional
-import warnings
 
 from opentelemetry import trace
 from opentelemetry import context as context_api
@@ -28,12 +28,10 @@ class CustomJSONEncoder(JSONEncoder):
 
 def json_dumps(data: dict) -> str:
     try:
-        …
-        warnings.simplefilter("ignore", RuntimeWarning)
-            return json.dumps(data, cls=CustomJSONEncoder)
+        return json.dumps(data, cls=CustomJSONEncoder)
     except Exception:
         # Log the exception and return a placeholder if serialization completely fails
-        …
+        logging.warning("Failed to serialize data to JSON, type: %s", type(data))
         return "{}"  # Return an empty JSON object as a fallback
 
 
@@ -141,7 +139,7 @@ def aentity_method(
 
     try:
         if _should_send_prompts():
-            span.set_attribute(SPAN_OUTPUT, …
+            span.set_attribute(SPAN_OUTPUT, json_dumps(res))
     except TypeError:
         pass
 
```
{lmnr-0.4.23 → lmnr-0.4.25}/src/lmnr/traceloop_sdk/tracing/attributes.py

```diff
@@ -14,10 +14,22 @@ TRACE_TYPE = "trace_type"
 
 # exposed to the user, configurable
 class Attributes(Enum):
+    # == This is the minimum set of attributes for a proper LLM span ==
+    #
     # not SpanAttributes.LLM_USAGE_PROMPT_TOKENS,
     INPUT_TOKEN_COUNT = "gen_ai.usage.input_tokens"
     # not SpanAttributes.LLM_USAGE_COMPLETION_TOKENS,
     OUTPUT_TOKEN_COUNT = "gen_ai.usage.output_tokens"
+    TOTAL_TOKEN_COUNT = SpanAttributes.LLM_USAGE_TOTAL_TOKENS
     PROVIDER = SpanAttributes.LLM_SYSTEM
     REQUEST_MODEL = SpanAttributes.LLM_REQUEST_MODEL
     RESPONSE_MODEL = SpanAttributes.LLM_RESPONSE_MODEL
+    #
+    ## == End of minimum set ==
+    # == Additional attributes ==
+    #
+    INPUT_COST = "gen_ai.usage.input_cost"
+    OUTPUT_COST = "gen_ai.usage.output_cost"
+    TOTAL_COST = "gen_ai.usage.cost"
+    #
+    # == End of additional attributes ==
```
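Together with the `set_span_attributes` docstring example added in `laminar.py`, the expanded enum lets a manually instrumented LLM span report totals and costs as well. A sketch with invented values; the import path is taken from this diff, and the `L` alias assumes `Laminar` is exported from the top-level package:

```python
from lmnr import Laminar as L
from lmnr.traceloop_sdk.tracing.attributes import Attributes

with L.start_as_current_span(
    name="my_llm_call", input=[{"role": "user", "content": "hi"}], span_type="LLM"
):
    # ... call the model, then report the "minimum set" plus the new optional attributes
    L.set_span_attributes({
        Attributes.PROVIDER: "openai",
        Attributes.REQUEST_MODEL: "gpt-4o-mini",
        Attributes.RESPONSE_MODEL: "gpt-4o-mini",
        Attributes.INPUT_TOKEN_COUNT: 12,
        Attributes.OUTPUT_TOKEN_COUNT: 40,
        Attributes.TOTAL_TOKEN_COUNT: 52,  # new in 0.4.25
        Attributes.TOTAL_COST: 0.0001,     # new in 0.4.25; costs are optional extras
    })
```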