lmnr 0.6.16__py3-none-any.whl → 0.7.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lmnr/__init__.py +6 -15
- lmnr/cli/__init__.py +270 -0
- lmnr/cli/datasets.py +371 -0
- lmnr/{cli.py → cli/evals.py} +20 -102
- lmnr/cli/rules.py +42 -0
- lmnr/opentelemetry_lib/__init__.py +9 -2
- lmnr/opentelemetry_lib/decorators/__init__.py +274 -168
- lmnr/opentelemetry_lib/litellm/__init__.py +352 -38
- lmnr/opentelemetry_lib/litellm/utils.py +82 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +191 -129
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +126 -41
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +16 -16
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +59 -61
- lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
- lmnr/opentelemetry_lib/tracing/__init__.py +119 -18
- lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +124 -25
- lmnr/opentelemetry_lib/tracing/attributes.py +4 -0
- lmnr/opentelemetry_lib/tracing/context.py +200 -0
- lmnr/opentelemetry_lib/tracing/exporter.py +109 -15
- lmnr/opentelemetry_lib/tracing/instruments.py +22 -5
- lmnr/opentelemetry_lib/tracing/processor.py +128 -30
- lmnr/opentelemetry_lib/tracing/span.py +398 -0
- lmnr/opentelemetry_lib/tracing/tracer.py +40 -1
- lmnr/opentelemetry_lib/tracing/utils.py +62 -0
- lmnr/opentelemetry_lib/utils/package_check.py +9 -0
- lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
- lmnr/sdk/browser/background_send_events.py +158 -0
- lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
- lmnr/sdk/browser/browser_use_otel.py +12 -12
- lmnr/sdk/browser/bubus_otel.py +71 -0
- lmnr/sdk/browser/cdp_utils.py +518 -0
- lmnr/sdk/browser/inject_script.js +514 -0
- lmnr/sdk/browser/patchright_otel.py +18 -44
- lmnr/sdk/browser/playwright_otel.py +104 -187
- lmnr/sdk/browser/pw_utils.py +249 -210
- lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
- lmnr/sdk/browser/utils.py +1 -1
- lmnr/sdk/client/asynchronous/async_client.py +47 -15
- lmnr/sdk/client/asynchronous/resources/__init__.py +2 -7
- lmnr/sdk/client/asynchronous/resources/browser_events.py +1 -0
- lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
- lmnr/sdk/client/asynchronous/resources/evals.py +122 -18
- lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
- lmnr/sdk/client/asynchronous/resources/tags.py +4 -10
- lmnr/sdk/client/synchronous/resources/__init__.py +2 -2
- lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
- lmnr/sdk/client/synchronous/resources/evals.py +83 -17
- lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
- lmnr/sdk/client/synchronous/resources/tags.py +4 -10
- lmnr/sdk/client/synchronous/sync_client.py +47 -15
- lmnr/sdk/datasets/__init__.py +94 -0
- lmnr/sdk/datasets/file_utils.py +91 -0
- lmnr/sdk/decorators.py +103 -23
- lmnr/sdk/evaluations.py +122 -33
- lmnr/sdk/laminar.py +816 -333
- lmnr/sdk/log.py +7 -2
- lmnr/sdk/types.py +124 -143
- lmnr/sdk/utils.py +115 -2
- lmnr/version.py +1 -1
- {lmnr-0.6.16.dist-info → lmnr-0.7.26.dist-info}/METADATA +71 -78
- lmnr-0.7.26.dist-info/RECORD +116 -0
- lmnr-0.7.26.dist-info/WHEEL +4 -0
- lmnr-0.7.26.dist-info/entry_points.txt +3 -0
- lmnr/opentelemetry_lib/tracing/context_properties.py +0 -65
- lmnr/sdk/browser/rrweb/rrweb.umd.min.cjs +0 -98
- lmnr/sdk/client/asynchronous/resources/agent.py +0 -329
- lmnr/sdk/client/synchronous/resources/agent.py +0 -323
- lmnr/sdk/datasets.py +0 -60
- lmnr-0.6.16.dist-info/LICENSE +0 -75
- lmnr-0.6.16.dist-info/RECORD +0 -61
- lmnr-0.6.16.dist-info/WHEEL +0 -4
- lmnr-0.6.16.dist-info/entry_points.txt +0 -3
lmnr/sdk/datasets/file_utils.py
ADDED
```diff
@@ -0,0 +1,91 @@
+from pathlib import Path
+from typing import Any
+import csv
+import orjson
+
+from lmnr.sdk.log import get_default_logger
+
+LOG = get_default_logger(__name__, verbose=False)
+
+
+def _is_supported_file(file: Path) -> bool:
+    """Check if a file is supported."""
+    return file.suffix in [".json", ".csv", ".jsonl"]
+
+
+def _collect_files(paths: list[Path], recursive: bool = False) -> list[Path]:
+    """
+    Collect all supported files from the given paths.
+
+    Handles both files and directories. If a path is a directory,
+    collects all supported files within it (recursively if specified).
+    """
+    collected_files = []
+
+    for path in paths:
+        if path.is_file():
+            if _is_supported_file(path):
+                collected_files.append(path)
+            else:
+                LOG.warning(f"Skipping unsupported file type: {path}")
+        elif path.is_dir():
+            for item in path.iterdir():
+                if item.is_file() and _is_supported_file(item):
+                    collected_files.append(item)
+                elif recursive and item.is_dir():
+                    # Recursively collect files from subdirectories
+                    collected_files.extend(_collect_files([item], recursive=True))
+        else:
+            LOG.warning(f"Path does not exist or is not accessible: {path}")
+
+    return collected_files
+
+
+def _read_file(file: Path) -> list[dict[str, Any]]:
+    """Read data from a single file and return as a list of dictionaries."""
+    if file.suffix == ".json":
+        result = orjson.loads(file.read_bytes())
+        if isinstance(result, list):
+            return result
+        else:
+            return [result]
+    elif file.suffix == ".csv":
+        return [dict(row) for row in csv.DictReader(file.read_text().splitlines())]
+    elif file.suffix == ".jsonl":
+        return [
+            orjson.loads(line) for line in file.read_text().splitlines() if line.strip()
+        ]
+    else:
+        raise ValueError(f"Unsupported file type: {file.suffix}")
+
+
+def load_from_paths(paths: list[Path], recursive: bool = False) -> list[dict[str, Any]]:
+    """
+    Load data from all files in the specified paths.
+
+    First collects all file paths, then reads each file's data.
+    """
+    files = _collect_files(paths, recursive)
+
+    if not files:
+        LOG.warning("No supported files found in the specified paths")
+        return []
+
+    LOG.info(f"Found {len(files)} file(s) to read")
+
+    result = []
+    for file in files:
+        try:
+            data = _read_file(file)
+            result.extend(data)
+            LOG.info(f"Read {len(data)} record(s) from {file}")
+        except Exception as e:
+            LOG.error(f"Error reading file {file}: {e}")
+            raise
+
+    return result
+
+
+def parse_paths(paths: list[str]) -> list[Path]:
+    """Parse paths."""
+    return [Path(path) for path in paths]
```
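
The new `lmnr/sdk/datasets/file_utils.py` above backs the dataset commands in the new `lmnr/cli/datasets.py`. A minimal usage sketch of the two public helpers; the sample paths are hypothetical:

```python
from lmnr.sdk.datasets.file_utils import load_from_paths, parse_paths

# Turn raw CLI strings into Path objects (paths are hypothetical).
paths = parse_paths(["data/points.jsonl", "data/more/"])

# Reads every .json/.csv/.jsonl file found; recursive=True also descends
# into subdirectories. Records from all files are concatenated in order.
records = load_from_paths(paths, recursive=True)
print(f"Loaded {len(records)} record(s)")
```

Note that unsupported file types are skipped with a warning during collection, so a mixed directory is safe to pass; a read error on a supported file, by contrast, is logged and re-raised.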
lmnr/sdk/decorators.py
CHANGED
```diff
@@ -1,15 +1,15 @@
 from lmnr.opentelemetry_lib.decorators import (
-
-
-    json_dumps,
+    observe_base,
+    async_observe_base,
 )
 from opentelemetry.trace import INVALID_SPAN, get_current_span
 
-from typing import Any, Callable, Literal, TypeVar,
+from typing import Any, Callable, Coroutine, Literal, TypeVar, overload
 from typing_extensions import ParamSpec
 
 from lmnr.opentelemetry_lib.tracing.attributes import SESSION_ID
 from lmnr.sdk.log import get_default_logger
+from lmnr.sdk.types import TraceType
 
 from .utils import is_async
 
@@ -19,6 +19,8 @@ P = ParamSpec("P")
 R = TypeVar("R")
 
 
+# Overload for synchronous functions
+@overload
 def observe(
     *,
     name: str | None = None,
@@ -28,9 +30,52 @@ def observe(
     ignore_output: bool = False,
     span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
     ignore_inputs: list[str] | None = None,
+    input_formatter: Callable[..., str] | None = None,
+    output_formatter: Callable[..., str] | None = None,
     metadata: dict[str, Any] | None = None,
     tags: list[str] | None = None,
-
+    preserve_global_context: bool = False,
+) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
+
+
+# Overload for asynchronous functions
+@overload
+def observe(
+    *,
+    name: str | None = None,
+    session_id: str | None = None,
+    user_id: str | None = None,
+    ignore_input: bool = False,
+    ignore_output: bool = False,
+    span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
+    ignore_inputs: list[str] | None = None,
+    input_formatter: Callable[..., str] | None = None,
+    output_formatter: Callable[..., str] | None = None,
+    metadata: dict[str, Any] | None = None,
+    tags: list[str] | None = None,
+    preserve_global_context: bool = False,
+) -> Callable[
+    [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]
+]: ...
+
+
+# Implementation
+def observe(
+    *,
+    name: str | None = None,
+    session_id: str | None = None,
+    user_id: str | None = None,
+    ignore_input: bool = False,
+    ignore_output: bool = False,
+    span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
+    ignore_inputs: list[str] | None = None,
+    input_formatter: Callable[..., str] | None = None,
+    output_formatter: Callable[..., str] | None = None,
+    metadata: dict[str, Any] | None = None,
+    tags: list[str] | None = None,
+    preserve_global_context: bool = False,
+):
+    # Return type is determined by overloads above
     """The main decorator entrypoint for Laminar. This is used to wrap
     functions and methods to create spans.
 
@@ -53,10 +98,24 @@ def observe(
             def foo(a, b, `sensitive_data`), and you want to ignore the\
             `sensitive_data` argument, you can pass ["sensitive_data"] to\
             this argument. Defaults to None.
+        input_formatter (Callable[P, str] | None, optional): A custom function\
+            to format the input of the wrapped function. This function should\
+            accept the same parameters as the wrapped function and return a string.\
+            All function arguments are passed to this function. Ignored if\
+            `ignore_input` is True. Does not respect `ignore_inputs` argument.
+            Defaults to None.
+        output_formatter (Callable[[R], str] | None, optional): A custom function\
+            to format the output of the wrapped function. This function should\
+            accept a single parameter (the return value of the wrapped function)\
+            and return a string. Ignored if `ignore_output` is True.\
+            Defaults to None.
         metadata (dict[str, Any] | None, optional): Metadata to associate with\
             the trace. Must be JSON serializable. Defaults to None.
         tags (list[str] | None, optional): Tags to associate with the trace.
             Defaults to None.
+        preserve_global_context (bool, optional): Whether to preserve the global\
+            OpenTelemetry context. If set to True, Laminar spans will continue\
+            traces started in the global context. Defaults to False.
     Raises:
         Exception: re-raises the exception if the wrapped function raises an\
             exception
@@ -65,7 +124,9 @@ def observe(
         R: Returns the result of the wrapped function
     """
 
-    def decorator(
+    def decorator(
+        func: Callable[P, R] | Callable[P, Coroutine[Any, Any, R]],
+    ) -> Callable[P, R] | Callable[P, Coroutine[Any, Any, R]]:
         current_span = get_current_span()
         if current_span != INVALID_SPAN:
             if session_id is not None:
@@ -75,41 +136,60 @@ def observe(
                 association_properties["session_id"] = session_id
         if user_id is not None:
             association_properties["user_id"] = user_id
-        if
-        association_properties.
-            {
-                f"metadata.{k}": (
-                    v if isinstance(v, (str, int, float, bool)) else json_dumps(v)
-                )
-                for k, v in metadata.items()
-            }
-        )
+        if span_type in ["EVALUATION", "EXECUTOR", "EVALUATOR"]:
+            association_properties["trace_type"] = TraceType.EVALUATION.value
         if tags is not None:
            if not isinstance(tags, list) or not all(
                 isinstance(tag, str) for tag in tags
             ):
                 logger.warning("Tags must be a list of strings. Tags will be ignored.")
             else:
-
-
-
+                # list(set(tags)) to deduplicate tags
+                association_properties["tags"] = list(set(tags))
+        if input_formatter is not None and ignore_input:
+            logger.warning(
+                f"observe, function {func.__name__}: Input formatter"
+                " is ignored because `ignore_input` is True. Specify only one of"
+                " `ignore_input` or `input_formatter`."
+            )
+        if input_formatter is not None and ignore_inputs is not None:
+            logger.warning(
+                f"observe, function {func.__name__}: Both input formatter and"
+                " `ignore_inputs` are specified. Input formatter"
+                " will pass all arguments to the formatter regardless of"
+                " `ignore_inputs`."
+            )
+        if output_formatter is not None and ignore_output:
+            logger.warning(
+                f"observe, function {func.__name__}: Output formatter"
+                " is ignored because `ignore_output` is True. Specify only one of"
+                " `ignore_output` or `output_formatter`."
+            )
+        if is_async(func):
+            return async_observe_base(
                 name=name,
                 ignore_input=ignore_input,
                 ignore_output=ignore_output,
                 span_type=span_type,
+                metadata=metadata,
                 ignore_inputs=ignore_inputs,
+                input_formatter=input_formatter,
+                output_formatter=output_formatter,
                 association_properties=association_properties,
+                preserve_global_context=preserve_global_context,
             )(func)
-
-
+        else:
+            return observe_base(
                 name=name,
                 ignore_input=ignore_input,
                 ignore_output=ignore_output,
                 span_type=span_type,
+                metadata=metadata,
                 ignore_inputs=ignore_inputs,
+                input_formatter=input_formatter,
+                output_formatter=output_formatter,
                 association_properties=association_properties,
+                preserve_global_context=preserve_global_context,
             )(func)
-        )
-    return result
 
-    return
+    return decorator
```
lmnr/sdk/evaluations.py
CHANGED
```diff
@@ -2,11 +2,13 @@ import asyncio
 import re
 import uuid
 
+from typing import Any
+from typing_extensions import TypedDict
+
 from tqdm import tqdm
-from typing import Any, Awaitable
 
 from lmnr.opentelemetry_lib.tracing.instruments import Instruments
-from lmnr.opentelemetry_lib.tracing.attributes import SPAN_TYPE
+from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE
 
 from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
 from lmnr.sdk.client.synchronous.sync_client import LaminarClient
@@ -16,6 +18,7 @@ from lmnr.sdk.laminar import Laminar as L
 from lmnr.sdk.log import get_default_logger
 from lmnr.sdk.types import (
     Datapoint,
+    EvaluationDatapointDatasetLink,
     EvaluationResultDatapoint,
     EvaluatorFunction,
     ExecutorFunction,
@@ -26,12 +29,20 @@ from lmnr.sdk.types import (
     SpanType,
     TraceType,
 )
-from lmnr.sdk.utils import from_env, is_async
+from lmnr.sdk.utils import from_env, is_async, json_dumps
 
 DEFAULT_BATCH_SIZE = 5
 MAX_EXPORT_BATCH_SIZE = 64
 
 
+class EvaluationRunResult(TypedDict):
+    average_scores: dict[str, Numeric]
+    evaluation_id: uuid.UUID
+    project_id: uuid.UUID
+    url: str
+    error_message: str | None
+
+
 def get_evaluation_url(
     project_id: str, evaluation_id: str, base_url: str | None = None
 ):
@@ -57,7 +68,7 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu
     average_scores = {}
     for key, values in per_score_values.items():
         scores = [v for v in values if v is not None]
-
+
         # If there are no scores, we don't want to include the key in the average scores
         if len(scores) > 0:
             average_scores[key] = sum(scores) / len(scores)
@@ -79,21 +90,21 @@ class EvaluationReporter:
     def update(self, batch_length: int):
         self.cli_progress.update(batch_length)
 
-    def
-        self
+    def stop_with_error(self, error: Exception):
+        if hasattr(self, "cli_progress"):
+            self.cli_progress.close()
         raise error
 
     def stop(
         self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
     ):
         self.cli_progress.close()
-        print(
-            f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
-        )
         print("Average scores:")
         for name, score in average_scores.items():
             print(f"{name}: {score}")
-        print(
+        print(
+            f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
+        )
 
 
 class Evaluation:
@@ -108,9 +119,15 @@ class Evaluation:
         concurrency_limit: int = DEFAULT_BATCH_SIZE,
         project_api_key: str | None = None,
         base_url: str | None = None,
+        base_http_url: str | None = None,
         http_port: int | None = None,
         grpc_port: int | None = None,
-        instruments:
+        instruments: (
+            set[Instruments] | list[Instruments] | tuple[Instruments] | None
+        ) = None,
+        disabled_instruments: (
+            set[Instruments] | list[Instruments] | tuple[Instruments] | None
+        ) = None,
         max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
         trace_export_timeout_seconds: int | None = None,
     ):
@@ -157,6 +174,10 @@ class Evaluation:
                 Useful if self-hosted. Do NOT include the port, use `http_port`\
                 and `grpc_port` instead.
                 Defaults to "https://api.lmnr.ai".
+            base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+                Only set this if your Laminar backend HTTP is proxied\
+                through a different host. If not specified, defaults\
+                to https://api.lmnr.ai.
             http_port (int | None, optional): The port for Laminar API\
                 HTTP service. Defaults to 443 if not specified.
             grpc_port (int | None, optional): The port for Laminar API\
@@ -166,6 +187,10 @@ class Evaluation:
                 used.
                 See https://docs.lmnr.ai/tracing/automatic-instrumentation
                 Defaults to None.
+            disabled_instruments (set[Instruments] | None, optional): Set of modules\
+                to disable auto-instrumentations. If None, only modules passed\
+                as `instruments` will be disabled.
+                Defaults to None.
         """
 
         if not evaluators:
@@ -190,6 +215,8 @@ class Evaluation:
             ]
         else:
             self.data = data
+        if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
+            raise ValueError("No data provided. Skipping evaluation")
         self.executor = executor
         self.evaluators = evaluators
         self.group_name = group_name
@@ -199,7 +226,7 @@ class Evaluation:
         self.batch_size = concurrency_limit
         self._logger = get_default_logger(self.__class__.__name__)
         self.upload_tasks = []
-        self.base_http_url = f"{base_url}:{http_port or 443}"
+        self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
 
         api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
         if not api_key and not L.is_initialized():
@@ -224,31 +251,51 @@ class Evaluation:
             L.initialize(
                 project_api_key=project_api_key,
                 base_url=base_url,
+                base_http_url=self.base_http_url,
                 http_port=http_port,
                 grpc_port=grpc_port,
                 instruments=instruments,
+                disabled_instruments=disabled_instruments,
                 max_export_batch_size=max_export_batch_size,
                 export_timeout_seconds=trace_export_timeout_seconds,
             )
 
-    async def run(self) ->
+    async def run(self) -> EvaluationRunResult:
         return await self._run()
 
-    async def _run(self) ->
+    async def _run(self) -> EvaluationRunResult:
         if isinstance(self.data, LaminarDataset):
             self.data.set_client(
                 LaminarClient(
-                    self.base_http_url,
-                    self.project_api_key,
+                    base_url=self.base_http_url,
+                    project_api_key=self.project_api_key,
                 )
             )
-
+            if not self.data.id:
+                try:
+                    datasets = await self.client.datasets.get_dataset_by_name(
+                        self.data.name
+                    )
+                    if len(datasets) == 0:
+                        self._logger.warning(f"Dataset {self.data.name} not found")
+                    else:
+                        self.data.id = datasets[0].id
+                except Exception as e:
+                    # Backward compatibility with old Laminar API (self hosted)
+                    self._logger.warning(f"Error getting dataset {self.data.name}: {e}")
+
         try:
             evaluation = await self.client.evals.init(
                 name=self.name, group_name=self.group_name, metadata=self.metadata
             )
-
+            evaluation_id = evaluation.id
+            project_id = evaluation.projectId
+            url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
+
+            print(f"Check the results at {url}")
 
+            self.reporter.start(len(self.data))
+            result_datapoints = await self._evaluate_in_batches(evaluation.id)
             # Wait for all background upload tasks to complete
             if self.upload_tasks:
                 self._logger.debug(
@@ -257,14 +304,19 @@ class Evaluation:
             await asyncio.gather(*self.upload_tasks)
             self._logger.debug("All upload tasks completed")
         except Exception as e:
-            self.reporter.stopWithError(e)
             await self._shutdown()
-
+            self.reporter.stop_with_error(e)
 
         average_scores = get_average_scores(result_datapoints)
         self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
         await self._shutdown()
-        return
+        return {
+            "average_scores": average_scores,
+            "evaluation_id": evaluation_id,
+            "project_id": project_id,
+            "url": url,
+            "error_message": None,
+        }
 
     async def _shutdown(self):
         # We use flush() instead of shutdown() because multiple evaluations
@@ -319,6 +371,7 @@ class Evaluation:
             int=executor_span.get_span_context().span_id
         )
         trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
+
         partial_datapoint = PartialEvaluationDatapoint(
             id=evaluation_id,
             data=datapoint.data,
@@ -328,6 +381,12 @@ class Evaluation:
             executor_span_id=executor_span_id,
             metadata=datapoint.metadata,
         )
+        if isinstance(self.data, LaminarDataset):
+            partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                dataset_id=self.data.id,
+                datapoint_id=datapoint.id,
+                created_at=datapoint.created_at,
+            )
         # First, create datapoint with trace_id so that we can show the dp in the UI
         await self.client.evals.save_datapoints(
             eval_id, [partial_datapoint], self.group_name
@@ -352,22 +411,28 @@ class Evaluation:
             if isinstance(evaluator, HumanEvaluator):
                 # Create an empty span for human evaluators
                 with L.start_as_current_span(
-                    evaluator_name,
-                    input={"output": output, "target": target}
+                    evaluator_name, input={"output": output, "target": target}
                 ) as human_evaluator_span:
-                    human_evaluator_span.set_attribute(
+                    human_evaluator_span.set_attribute(
+                        SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
+                    )
+                    if evaluator.options:
+                        human_evaluator_span.set_attribute(
+                            HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
+                        )
                     # Human evaluators don't execute automatically, just create the span
                     L.set_span_output(None)
-
+
                 # We don't want to save the score for human evaluators
                 scores[evaluator_name] = None
             else:
                 # Regular evaluator function
                 with L.start_as_current_span(
-                    evaluator_name,
-                    input={"output": output, "target": target}
+                    evaluator_name, input={"output": output, "target": target}
                 ) as evaluator_span:
-                    evaluator_span.set_attribute(
+                    evaluator_span.set_attribute(
+                        SPAN_TYPE, SpanType.EVALUATOR.value
+                    )
                     if is_async(evaluator):
                         value = await evaluator(output, target)
                     else:
@@ -385,7 +450,7 @@ class Evaluation:
 
         trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
 
-
+        eval_datapoint = EvaluationResultDatapoint(
             id=evaluation_id,
             data=datapoint.data,
             target=target,
@@ -396,14 +461,22 @@ class Evaluation:
             index=index,
             metadata=datapoint.metadata,
         )
+        if isinstance(self.data, LaminarDataset):
+            eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                dataset_id=self.data.id,
+                datapoint_id=datapoint.id,
+                created_at=datapoint.created_at,
+            )
 
         # Create background upload task without awaiting it
         upload_task = asyncio.create_task(
-            self.client.evals.save_datapoints(
+            self.client.evals.save_datapoints(
+                eval_id, [eval_datapoint], self.group_name
+            )
         )
         self.upload_tasks.append(upload_task)
 
-        return
+        return eval_datapoint
 
 
 def evaluate(
@@ -416,12 +489,18 @@ def evaluate(
     concurrency_limit: int = DEFAULT_BATCH_SIZE,
     project_api_key: str | None = None,
     base_url: str | None = None,
+    base_http_url: str | None = None,
     http_port: int | None = None,
     grpc_port: int | None = None,
-    instruments:
+    instruments: (
+        set[Instruments] | list[Instruments] | tuple[Instruments] | None
+    ) = None,
+    disabled_instruments: (
+        set[Instruments] | list[Instruments] | tuple[Instruments] | None
+    ) = None,
     max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
     trace_export_timeout_seconds: int | None = None,
-) ->
+) -> EvaluationRunResult | None:
    """
    If added to the file which is called through `lmnr eval` command, then
    registers the evaluation; otherwise, runs the evaluation.
@@ -465,6 +544,10 @@ def evaluate(
            Useful if self-hosted elsewhere. Do NOT include the\
            port, use `http_port` and `grpc_port` instead.
            Defaults to "https://api.lmnr.ai".
+        base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+            Only set this if your Laminar backend HTTP is proxied\
+            through a different host. If not specified, defaults\
+            to https://api.lmnr.ai.
         http_port (int | None, optional): The port for Laminar API's HTTP\
             service. 443 is used if not specified.
             Defaults to None.
@@ -475,6 +558,10 @@ def evaluate(
            auto-instrument. If None, all available instruments\
            will be used.
            Defaults to None.
+        disabled_instruments (set[Instruments] | None, optional): Set of modules\
+            to disable auto-instrumentations. If None, no\
+            If None, only modules passed as `instruments` will be disabled.
+            Defaults to None.
         trace_export_timeout_seconds (int | None, optional): The timeout for\
             trace export on OpenTelemetry exporter. Defaults to None.
     """
@@ -488,9 +575,11 @@ def evaluate(
         concurrency_limit=concurrency_limit,
         project_api_key=project_api_key,
         base_url=base_url,
+        base_http_url=base_http_url,
         http_port=http_port,
         grpc_port=grpc_port,
         instruments=instruments,
+        disabled_instruments=disabled_instruments,
         max_export_batch_size=max_export_batch_size,
         trace_export_timeout_seconds=trace_export_timeout_seconds,
     )
```