lmnr 0.6.16__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. lmnr/__init__.py +6 -15
  2. lmnr/cli/__init__.py +270 -0
  3. lmnr/cli/datasets.py +371 -0
  4. lmnr/{cli.py → cli/evals.py} +20 -102
  5. lmnr/cli/rules.py +42 -0
  6. lmnr/opentelemetry_lib/__init__.py +9 -2
  7. lmnr/opentelemetry_lib/decorators/__init__.py +274 -168
  8. lmnr/opentelemetry_lib/litellm/__init__.py +352 -38
  9. lmnr/opentelemetry_lib/litellm/utils.py +82 -0
  10. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
  11. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
  12. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
  13. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
  14. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
  15. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
  16. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
  17. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
  18. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
  19. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
  20. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
  21. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
  22. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
  23. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +191 -129
  24. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
  25. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +126 -41
  26. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
  27. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
  28. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
  29. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
  30. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
  31. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
  32. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
  33. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
  34. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
  35. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +16 -16
  36. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
  37. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
  38. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
  39. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
  40. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
  41. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
  42. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
  43. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
  44. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
  45. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
  46. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
  47. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
  48. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
  49. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
  50. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
  51. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
  52. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
  53. lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
  54. lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +59 -61
  55. lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
  56. lmnr/opentelemetry_lib/tracing/__init__.py +119 -18
  57. lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +124 -25
  58. lmnr/opentelemetry_lib/tracing/attributes.py +4 -0
  59. lmnr/opentelemetry_lib/tracing/context.py +200 -0
  60. lmnr/opentelemetry_lib/tracing/exporter.py +109 -15
  61. lmnr/opentelemetry_lib/tracing/instruments.py +22 -5
  62. lmnr/opentelemetry_lib/tracing/processor.py +128 -30
  63. lmnr/opentelemetry_lib/tracing/span.py +398 -0
  64. lmnr/opentelemetry_lib/tracing/tracer.py +40 -1
  65. lmnr/opentelemetry_lib/tracing/utils.py +62 -0
  66. lmnr/opentelemetry_lib/utils/package_check.py +9 -0
  67. lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
  68. lmnr/sdk/browser/background_send_events.py +158 -0
  69. lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
  70. lmnr/sdk/browser/browser_use_otel.py +12 -12
  71. lmnr/sdk/browser/bubus_otel.py +71 -0
  72. lmnr/sdk/browser/cdp_utils.py +518 -0
  73. lmnr/sdk/browser/inject_script.js +514 -0
  74. lmnr/sdk/browser/patchright_otel.py +18 -44
  75. lmnr/sdk/browser/playwright_otel.py +104 -187
  76. lmnr/sdk/browser/pw_utils.py +249 -210
  77. lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
  78. lmnr/sdk/browser/utils.py +1 -1
  79. lmnr/sdk/client/asynchronous/async_client.py +47 -15
  80. lmnr/sdk/client/asynchronous/resources/__init__.py +2 -7
  81. lmnr/sdk/client/asynchronous/resources/browser_events.py +1 -0
  82. lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
  83. lmnr/sdk/client/asynchronous/resources/evals.py +122 -18
  84. lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
  85. lmnr/sdk/client/asynchronous/resources/tags.py +4 -10
  86. lmnr/sdk/client/synchronous/resources/__init__.py +2 -2
  87. lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
  88. lmnr/sdk/client/synchronous/resources/evals.py +83 -17
  89. lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
  90. lmnr/sdk/client/synchronous/resources/tags.py +4 -10
  91. lmnr/sdk/client/synchronous/sync_client.py +47 -15
  92. lmnr/sdk/datasets/__init__.py +94 -0
  93. lmnr/sdk/datasets/file_utils.py +91 -0
  94. lmnr/sdk/decorators.py +103 -23
  95. lmnr/sdk/evaluations.py +122 -33
  96. lmnr/sdk/laminar.py +816 -333
  97. lmnr/sdk/log.py +7 -2
  98. lmnr/sdk/types.py +124 -143
  99. lmnr/sdk/utils.py +115 -2
  100. lmnr/version.py +1 -1
  101. {lmnr-0.6.16.dist-info → lmnr-0.7.26.dist-info}/METADATA +71 -78
  102. lmnr-0.7.26.dist-info/RECORD +116 -0
  103. lmnr-0.7.26.dist-info/WHEEL +4 -0
  104. lmnr-0.7.26.dist-info/entry_points.txt +3 -0
  105. lmnr/opentelemetry_lib/tracing/context_properties.py +0 -65
  106. lmnr/sdk/browser/rrweb/rrweb.umd.min.cjs +0 -98
  107. lmnr/sdk/client/asynchronous/resources/agent.py +0 -329
  108. lmnr/sdk/client/synchronous/resources/agent.py +0 -323
  109. lmnr/sdk/datasets.py +0 -60
  110. lmnr-0.6.16.dist-info/LICENSE +0 -75
  111. lmnr-0.6.16.dist-info/RECORD +0 -61
  112. lmnr-0.6.16.dist-info/WHEEL +0 -4
  113. lmnr-0.6.16.dist-info/entry_points.txt +0 -3
@@ -0,0 +1,91 @@
1
+ from pathlib import Path
2
+ from typing import Any
3
+ import csv
4
+ import orjson
5
+
6
+ from lmnr.sdk.log import get_default_logger
7
+
8
+ LOG = get_default_logger(__name__, verbose=False)
9
+
10
+
11
def _is_supported_file(file: Path) -> bool:
    """Return True when the file's extension is one the loader can parse."""
    return file.suffix in {".json", ".csv", ".jsonl"}
14
+
15
+
16
def _collect_files(paths: list[Path], recursive: bool = False) -> list[Path]:
    """
    Collect all supported files from the given paths.

    Handles both files and directories. If a path is a directory,
    collects all supported files within it (recursively if specified).

    Directory entries are visited in sorted order so the returned list is
    deterministic across platforms (``Path.iterdir`` yields entries in an
    arbitrary, OS-dependent order).
    """
    collected_files: list[Path] = []

    for path in paths:
        if path.is_file():
            if _is_supported_file(path):
                collected_files.append(path)
            else:
                LOG.warning(f"Skipping unsupported file type: {path}")
        elif path.is_dir():
            # Sort for a stable, reproducible traversal order.
            for item in sorted(path.iterdir()):
                if item.is_file() and _is_supported_file(item):
                    collected_files.append(item)
                elif recursive and item.is_dir():
                    # Recursively collect files from subdirectories
                    collected_files.extend(_collect_files([item], recursive=True))
        else:
            LOG.warning(f"Path does not exist or is not accessible: {path}")

    return collected_files
42
+
43
+
44
def _read_file(file: Path) -> list[dict[str, Any]]:
    """Read data from a single file and return as a list of dictionaries.

    Supports ``.json`` (a single object or a list of objects), ``.csv``
    (one dict per row, keyed by the header), and ``.jsonl`` (one JSON
    object per non-blank line).

    Raises:
        ValueError: if the file extension is not supported.
    """
    if file.suffix == ".json":
        result = orjson.loads(file.read_bytes())
        return result if isinstance(result, list) else [result]
    elif file.suffix == ".csv":
        # Open with newline="" so the csv module performs its own newline
        # handling; feeding read_text().splitlines() to DictReader would
        # corrupt quoted fields that contain embedded newlines.
        with file.open(newline="") as f:
            return [dict(row) for row in csv.DictReader(f)]
    elif file.suffix == ".jsonl":
        return [
            orjson.loads(line) for line in file.read_text().splitlines() if line.strip()
        ]
    else:
        raise ValueError(f"Unsupported file type: {file.suffix}")
60
+
61
+
62
def load_from_paths(paths: list[Path], recursive: bool = False) -> list[dict[str, Any]]:
    """
    Load data from all files in the specified paths.

    First collects all file paths, then reads each file's data.
    """
    files = _collect_files(paths, recursive)

    if not files:
        LOG.warning("No supported files found in the specified paths")
        return []

    LOG.info(f"Found {len(files)} file(s) to read")

    records: list[dict[str, Any]] = []
    for file in files:
        try:
            rows = _read_file(file)
            records.extend(rows)
            LOG.info(f"Read {len(rows)} record(s) from {file}")
        except Exception as e:
            # Surface the failure: log which file broke, then re-raise.
            LOG.error(f"Error reading file {file}: {e}")
            raise

    return records
87
+
88
+
89
def parse_paths(paths: list[str]) -> list[Path]:
    """Convert raw path strings into ``Path`` objects."""
    return list(map(Path, paths))
lmnr/sdk/decorators.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from lmnr.opentelemetry_lib.decorators import (
2
- entity_method,
3
- aentity_method,
4
- json_dumps,
2
+ observe_base,
3
+ async_observe_base,
5
4
  )
6
5
  from opentelemetry.trace import INVALID_SPAN, get_current_span
7
6
 
8
- from typing import Any, Callable, Literal, TypeVar, cast
7
+ from typing import Any, Callable, Coroutine, Literal, TypeVar, overload
9
8
  from typing_extensions import ParamSpec
10
9
 
11
10
  from lmnr.opentelemetry_lib.tracing.attributes import SESSION_ID
12
11
  from lmnr.sdk.log import get_default_logger
12
+ from lmnr.sdk.types import TraceType
13
13
 
14
14
  from .utils import is_async
15
15
 
@@ -19,6 +19,8 @@ P = ParamSpec("P")
19
19
  R = TypeVar("R")
20
20
 
21
21
 
22
+ # Overload for synchronous functions
23
+ @overload
22
24
  def observe(
23
25
  *,
24
26
  name: str | None = None,
@@ -28,9 +30,52 @@ def observe(
28
30
  ignore_output: bool = False,
29
31
  span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
30
32
  ignore_inputs: list[str] | None = None,
33
+ input_formatter: Callable[..., str] | None = None,
34
+ output_formatter: Callable[..., str] | None = None,
31
35
  metadata: dict[str, Any] | None = None,
32
36
  tags: list[str] | None = None,
33
- ) -> Callable[[Callable[P, R]], Callable[P, R]]:
37
+ preserve_global_context: bool = False,
38
+ ) -> Callable[[Callable[P, R]], Callable[P, R]]: ...
39
+
40
+
41
+ # Overload for asynchronous functions
42
+ @overload
43
+ def observe(
44
+ *,
45
+ name: str | None = None,
46
+ session_id: str | None = None,
47
+ user_id: str | None = None,
48
+ ignore_input: bool = False,
49
+ ignore_output: bool = False,
50
+ span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
51
+ ignore_inputs: list[str] | None = None,
52
+ input_formatter: Callable[..., str] | None = None,
53
+ output_formatter: Callable[..., str] | None = None,
54
+ metadata: dict[str, Any] | None = None,
55
+ tags: list[str] | None = None,
56
+ preserve_global_context: bool = False,
57
+ ) -> Callable[
58
+ [Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]
59
+ ]: ...
60
+
61
+
62
+ # Implementation
63
+ def observe(
64
+ *,
65
+ name: str | None = None,
66
+ session_id: str | None = None,
67
+ user_id: str | None = None,
68
+ ignore_input: bool = False,
69
+ ignore_output: bool = False,
70
+ span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT",
71
+ ignore_inputs: list[str] | None = None,
72
+ input_formatter: Callable[..., str] | None = None,
73
+ output_formatter: Callable[..., str] | None = None,
74
+ metadata: dict[str, Any] | None = None,
75
+ tags: list[str] | None = None,
76
+ preserve_global_context: bool = False,
77
+ ):
78
+ # Return type is determined by overloads above
34
79
  """The main decorator entrypoint for Laminar. This is used to wrap
35
80
  functions and methods to create spans.
36
81
 
@@ -53,10 +98,24 @@ def observe(
53
98
  def foo(a, b, `sensitive_data`), and you want to ignore the\
54
99
  `sensitive_data` argument, you can pass ["sensitive_data"] to\
55
100
  this argument. Defaults to None.
101
+ input_formatter (Callable[P, str] | None, optional): A custom function\
102
+ to format the input of the wrapped function. This function should\
103
+ accept the same parameters as the wrapped function and return a string.\
104
+ All function arguments are passed to this function. Ignored if\
105
+ `ignore_input` is True. Does not respect `ignore_inputs` argument.
106
+ Defaults to None.
107
+ output_formatter (Callable[[R], str] | None, optional): A custom function\
108
+ to format the output of the wrapped function. This function should\
109
+ accept a single parameter (the return value of the wrapped function)\
110
+ and return a string. Ignored if `ignore_output` is True.\
111
+ Defaults to None.
56
112
  metadata (dict[str, Any] | None, optional): Metadata to associate with\
57
113
  the trace. Must be JSON serializable. Defaults to None.
58
114
  tags (list[str] | None, optional): Tags to associate with the trace.
59
115
  Defaults to None.
116
+ preserve_global_context (bool, optional): Whether to preserve the global\
117
+ OpenTelemetry context. If set to True, Laminar spans will continue\
118
+ traces started in the global context. Defaults to False.
60
119
  Raises:
61
120
  Exception: re-raises the exception if the wrapped function raises an\
62
121
  exception
@@ -65,7 +124,9 @@ def observe(
65
124
  R: Returns the result of the wrapped function
66
125
  """
67
126
 
68
- def decorator(func: Callable) -> Callable:
127
+ def decorator(
128
+ func: Callable[P, R] | Callable[P, Coroutine[Any, Any, R]],
129
+ ) -> Callable[P, R] | Callable[P, Coroutine[Any, Any, R]]:
69
130
  current_span = get_current_span()
70
131
  if current_span != INVALID_SPAN:
71
132
  if session_id is not None:
@@ -75,41 +136,60 @@ def observe(
75
136
  association_properties["session_id"] = session_id
76
137
  if user_id is not None:
77
138
  association_properties["user_id"] = user_id
78
- if metadata is not None:
79
- association_properties.update(
80
- {
81
- f"metadata.{k}": (
82
- v if isinstance(v, (str, int, float, bool)) else json_dumps(v)
83
- )
84
- for k, v in metadata.items()
85
- }
86
- )
139
+ if span_type in ["EVALUATION", "EXECUTOR", "EVALUATOR"]:
140
+ association_properties["trace_type"] = TraceType.EVALUATION.value
87
141
  if tags is not None:
88
142
  if not isinstance(tags, list) or not all(
89
143
  isinstance(tag, str) for tag in tags
90
144
  ):
91
145
  logger.warning("Tags must be a list of strings. Tags will be ignored.")
92
146
  else:
93
- association_properties["tags"] = tags
94
- result = (
95
- aentity_method(
147
+ # list(set(tags)) to deduplicate tags
148
+ association_properties["tags"] = list(set(tags))
149
+ if input_formatter is not None and ignore_input:
150
+ logger.warning(
151
+ f"observe, function {func.__name__}: Input formatter"
152
+ " is ignored because `ignore_input` is True. Specify only one of"
153
+ " `ignore_input` or `input_formatter`."
154
+ )
155
+ if input_formatter is not None and ignore_inputs is not None:
156
+ logger.warning(
157
+ f"observe, function {func.__name__}: Both input formatter and"
158
+ " `ignore_inputs` are specified. Input formatter"
159
+ " will pass all arguments to the formatter regardless of"
160
+ " `ignore_inputs`."
161
+ )
162
+ if output_formatter is not None and ignore_output:
163
+ logger.warning(
164
+ f"observe, function {func.__name__}: Output formatter"
165
+ " is ignored because `ignore_output` is True. Specify only one of"
166
+ " `ignore_output` or `output_formatter`."
167
+ )
168
+ if is_async(func):
169
+ return async_observe_base(
96
170
  name=name,
97
171
  ignore_input=ignore_input,
98
172
  ignore_output=ignore_output,
99
173
  span_type=span_type,
174
+ metadata=metadata,
100
175
  ignore_inputs=ignore_inputs,
176
+ input_formatter=input_formatter,
177
+ output_formatter=output_formatter,
101
178
  association_properties=association_properties,
179
+ preserve_global_context=preserve_global_context,
102
180
  )(func)
103
- if is_async(func)
104
- else entity_method(
181
+ else:
182
+ return observe_base(
105
183
  name=name,
106
184
  ignore_input=ignore_input,
107
185
  ignore_output=ignore_output,
108
186
  span_type=span_type,
187
+ metadata=metadata,
109
188
  ignore_inputs=ignore_inputs,
189
+ input_formatter=input_formatter,
190
+ output_formatter=output_formatter,
110
191
  association_properties=association_properties,
192
+ preserve_global_context=preserve_global_context,
111
193
  )(func)
112
- )
113
- return result
114
194
 
115
- return cast(Callable, decorator)
195
+ return decorator
lmnr/sdk/evaluations.py CHANGED
@@ -2,11 +2,13 @@ import asyncio
2
2
  import re
3
3
  import uuid
4
4
 
5
+ from typing import Any
6
+ from typing_extensions import TypedDict
7
+
5
8
  from tqdm import tqdm
6
- from typing import Any, Awaitable
7
9
 
8
10
  from lmnr.opentelemetry_lib.tracing.instruments import Instruments
9
- from lmnr.opentelemetry_lib.tracing.attributes import SPAN_TYPE
11
+ from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE
10
12
 
11
13
  from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
12
14
  from lmnr.sdk.client.synchronous.sync_client import LaminarClient
@@ -16,6 +18,7 @@ from lmnr.sdk.laminar import Laminar as L
16
18
  from lmnr.sdk.log import get_default_logger
17
19
  from lmnr.sdk.types import (
18
20
  Datapoint,
21
+ EvaluationDatapointDatasetLink,
19
22
  EvaluationResultDatapoint,
20
23
  EvaluatorFunction,
21
24
  ExecutorFunction,
@@ -26,12 +29,20 @@ from lmnr.sdk.types import (
26
29
  SpanType,
27
30
  TraceType,
28
31
  )
29
- from lmnr.sdk.utils import from_env, is_async
32
+ from lmnr.sdk.utils import from_env, is_async, json_dumps
30
33
 
31
34
  DEFAULT_BATCH_SIZE = 5
32
35
  MAX_EXPORT_BATCH_SIZE = 64
33
36
 
34
37
 
38
+ class EvaluationRunResult(TypedDict):
39
+ average_scores: dict[str, Numeric]
40
+ evaluation_id: uuid.UUID
41
+ project_id: uuid.UUID
42
+ url: str
43
+ error_message: str | None
44
+
45
+
35
46
  def get_evaluation_url(
36
47
  project_id: str, evaluation_id: str, base_url: str | None = None
37
48
  ):
@@ -57,7 +68,7 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu
57
68
  average_scores = {}
58
69
  for key, values in per_score_values.items():
59
70
  scores = [v for v in values if v is not None]
60
-
71
+
61
72
  # If there are no scores, we don't want to include the key in the average scores
62
73
  if len(scores) > 0:
63
74
  average_scores[key] = sum(scores) / len(scores)
@@ -79,21 +90,21 @@ class EvaluationReporter:
79
90
  def update(self, batch_length: int):
80
91
  self.cli_progress.update(batch_length)
81
92
 
82
- def stopWithError(self, error: Exception):
83
- self.cli_progress.close()
93
+ def stop_with_error(self, error: Exception):
94
+ if hasattr(self, "cli_progress"):
95
+ self.cli_progress.close()
84
96
  raise error
85
97
 
86
98
  def stop(
87
99
  self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
88
100
  ):
89
101
  self.cli_progress.close()
90
- print(
91
- f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
92
- )
93
102
  print("Average scores:")
94
103
  for name, score in average_scores.items():
95
104
  print(f"{name}: {score}")
96
- print("\n")
105
+ print(
106
+ f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
107
+ )
97
108
 
98
109
 
99
110
  class Evaluation:
@@ -108,9 +119,15 @@ class Evaluation:
108
119
  concurrency_limit: int = DEFAULT_BATCH_SIZE,
109
120
  project_api_key: str | None = None,
110
121
  base_url: str | None = None,
122
+ base_http_url: str | None = None,
111
123
  http_port: int | None = None,
112
124
  grpc_port: int | None = None,
113
- instruments: set[Instruments] | None = None,
125
+ instruments: (
126
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
127
+ ) = None,
128
+ disabled_instruments: (
129
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
130
+ ) = None,
114
131
  max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
115
132
  trace_export_timeout_seconds: int | None = None,
116
133
  ):
@@ -157,6 +174,10 @@ class Evaluation:
157
174
  Useful if self-hosted. Do NOT include the port, use `http_port`\
158
175
  and `grpc_port` instead.
159
176
  Defaults to "https://api.lmnr.ai".
177
+ base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
178
+ Only set this if your Laminar backend HTTP is proxied\
179
+ through a different host. If not specified, defaults\
180
+ to https://api.lmnr.ai.
160
181
  http_port (int | None, optional): The port for Laminar API\
161
182
  HTTP service. Defaults to 443 if not specified.
162
183
  grpc_port (int | None, optional): The port for Laminar API\
@@ -166,6 +187,10 @@ class Evaluation:
166
187
  used.
167
188
  See https://docs.lmnr.ai/tracing/automatic-instrumentation
168
189
  Defaults to None.
190
+ disabled_instruments (set[Instruments] | None, optional): Set of modules\
191
+ to disable auto-instrumentations. If None, only modules passed\
192
+ as `instruments` will be disabled.
193
+ Defaults to None.
169
194
  """
170
195
 
171
196
  if not evaluators:
@@ -190,6 +215,8 @@ class Evaluation:
190
215
  ]
191
216
  else:
192
217
  self.data = data
218
+ if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
219
+ raise ValueError("No data provided. Skipping evaluation")
193
220
  self.executor = executor
194
221
  self.evaluators = evaluators
195
222
  self.group_name = group_name
@@ -199,7 +226,7 @@ class Evaluation:
199
226
  self.batch_size = concurrency_limit
200
227
  self._logger = get_default_logger(self.__class__.__name__)
201
228
  self.upload_tasks = []
202
- self.base_http_url = f"{base_url}:{http_port or 443}"
229
+ self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
203
230
 
204
231
  api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
205
232
  if not api_key and not L.is_initialized():
@@ -224,31 +251,51 @@ class Evaluation:
224
251
  L.initialize(
225
252
  project_api_key=project_api_key,
226
253
  base_url=base_url,
254
+ base_http_url=self.base_http_url,
227
255
  http_port=http_port,
228
256
  grpc_port=grpc_port,
229
257
  instruments=instruments,
258
+ disabled_instruments=disabled_instruments,
230
259
  max_export_batch_size=max_export_batch_size,
231
260
  export_timeout_seconds=trace_export_timeout_seconds,
232
261
  )
233
262
 
234
- async def run(self) -> Awaitable[dict[str, int | float]]:
263
+ async def run(self) -> EvaluationRunResult:
235
264
  return await self._run()
236
265
 
237
- async def _run(self) -> dict[str, int | float]:
266
+ async def _run(self) -> EvaluationRunResult:
238
267
  if isinstance(self.data, LaminarDataset):
239
268
  self.data.set_client(
240
269
  LaminarClient(
241
- self.base_http_url,
242
- self.project_api_key,
270
+ base_url=self.base_http_url,
271
+ project_api_key=self.project_api_key,
243
272
  )
244
273
  )
245
- self.reporter.start(len(self.data))
274
+ if not self.data.id:
275
+ try:
276
+ datasets = await self.client.datasets.get_dataset_by_name(
277
+ self.data.name
278
+ )
279
+ if len(datasets) == 0:
280
+ self._logger.warning(f"Dataset {self.data.name} not found")
281
+ else:
282
+ self.data.id = datasets[0].id
283
+ except Exception as e:
284
+ # Backward compatibility with old Laminar API (self hosted)
285
+ self._logger.warning(f"Error getting dataset {self.data.name}: {e}")
286
+
246
287
  try:
247
288
  evaluation = await self.client.evals.init(
248
289
  name=self.name, group_name=self.group_name, metadata=self.metadata
249
290
  )
250
- result_datapoints = await self._evaluate_in_batches(evaluation.id)
291
+ evaluation_id = evaluation.id
292
+ project_id = evaluation.projectId
293
+ url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
294
+
295
+ print(f"Check the results at {url}")
251
296
 
297
+ self.reporter.start(len(self.data))
298
+ result_datapoints = await self._evaluate_in_batches(evaluation.id)
252
299
  # Wait for all background upload tasks to complete
253
300
  if self.upload_tasks:
254
301
  self._logger.debug(
@@ -257,14 +304,19 @@ class Evaluation:
257
304
  await asyncio.gather(*self.upload_tasks)
258
305
  self._logger.debug("All upload tasks completed")
259
306
  except Exception as e:
260
- self.reporter.stopWithError(e)
261
307
  await self._shutdown()
262
- raise
308
+ self.reporter.stop_with_error(e)
263
309
 
264
310
  average_scores = get_average_scores(result_datapoints)
265
311
  self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
266
312
  await self._shutdown()
267
- return average_scores
313
+ return {
314
+ "average_scores": average_scores,
315
+ "evaluation_id": evaluation_id,
316
+ "project_id": project_id,
317
+ "url": url,
318
+ "error_message": None,
319
+ }
268
320
 
269
321
  async def _shutdown(self):
270
322
  # We use flush() instead of shutdown() because multiple evaluations
@@ -319,6 +371,7 @@ class Evaluation:
319
371
  int=executor_span.get_span_context().span_id
320
372
  )
321
373
  trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
374
+
322
375
  partial_datapoint = PartialEvaluationDatapoint(
323
376
  id=evaluation_id,
324
377
  data=datapoint.data,
@@ -328,6 +381,12 @@ class Evaluation:
328
381
  executor_span_id=executor_span_id,
329
382
  metadata=datapoint.metadata,
330
383
  )
384
+ if isinstance(self.data, LaminarDataset):
385
+ partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
386
+ dataset_id=self.data.id,
387
+ datapoint_id=datapoint.id,
388
+ created_at=datapoint.created_at,
389
+ )
331
390
  # First, create datapoint with trace_id so that we can show the dp in the UI
332
391
  await self.client.evals.save_datapoints(
333
392
  eval_id, [partial_datapoint], self.group_name
@@ -352,22 +411,28 @@ class Evaluation:
352
411
  if isinstance(evaluator, HumanEvaluator):
353
412
  # Create an empty span for human evaluators
354
413
  with L.start_as_current_span(
355
- evaluator_name,
356
- input={"output": output, "target": target}
414
+ evaluator_name, input={"output": output, "target": target}
357
415
  ) as human_evaluator_span:
358
- human_evaluator_span.set_attribute(SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value)
416
+ human_evaluator_span.set_attribute(
417
+ SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
418
+ )
419
+ if evaluator.options:
420
+ human_evaluator_span.set_attribute(
421
+ HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
422
+ )
359
423
  # Human evaluators don't execute automatically, just create the span
360
424
  L.set_span_output(None)
361
-
425
+
362
426
  # We don't want to save the score for human evaluators
363
427
  scores[evaluator_name] = None
364
428
  else:
365
429
  # Regular evaluator function
366
430
  with L.start_as_current_span(
367
- evaluator_name,
368
- input={"output": output, "target": target}
431
+ evaluator_name, input={"output": output, "target": target}
369
432
  ) as evaluator_span:
370
- evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
433
+ evaluator_span.set_attribute(
434
+ SPAN_TYPE, SpanType.EVALUATOR.value
435
+ )
371
436
  if is_async(evaluator):
372
437
  value = await evaluator(output, target)
373
438
  else:
@@ -385,7 +450,7 @@ class Evaluation:
385
450
 
386
451
  trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
387
452
 
388
- datapoint = EvaluationResultDatapoint(
453
+ eval_datapoint = EvaluationResultDatapoint(
389
454
  id=evaluation_id,
390
455
  data=datapoint.data,
391
456
  target=target,
@@ -396,14 +461,22 @@ class Evaluation:
396
461
  index=index,
397
462
  metadata=datapoint.metadata,
398
463
  )
464
+ if isinstance(self.data, LaminarDataset):
465
+ eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
466
+ dataset_id=self.data.id,
467
+ datapoint_id=datapoint.id,
468
+ created_at=datapoint.created_at,
469
+ )
399
470
 
400
471
  # Create background upload task without awaiting it
401
472
  upload_task = asyncio.create_task(
402
- self.client.evals.save_datapoints(eval_id, [datapoint], self.group_name)
473
+ self.client.evals.save_datapoints(
474
+ eval_id, [eval_datapoint], self.group_name
475
+ )
403
476
  )
404
477
  self.upload_tasks.append(upload_task)
405
478
 
406
- return datapoint
479
+ return eval_datapoint
407
480
 
408
481
 
409
482
  def evaluate(
@@ -416,12 +489,18 @@ def evaluate(
416
489
  concurrency_limit: int = DEFAULT_BATCH_SIZE,
417
490
  project_api_key: str | None = None,
418
491
  base_url: str | None = None,
492
+ base_http_url: str | None = None,
419
493
  http_port: int | None = None,
420
494
  grpc_port: int | None = None,
421
- instruments: set[Instruments] | None = None,
495
+ instruments: (
496
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
497
+ ) = None,
498
+ disabled_instruments: (
499
+ set[Instruments] | list[Instruments] | tuple[Instruments] | None
500
+ ) = None,
422
501
  max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
423
502
  trace_export_timeout_seconds: int | None = None,
424
- ) -> Awaitable[None] | None:
503
+ ) -> EvaluationRunResult | None:
425
504
  """
426
505
  If added to the file which is called through `lmnr eval` command, then
427
506
  registers the evaluation; otherwise, runs the evaluation.
@@ -465,6 +544,10 @@ def evaluate(
465
544
  Useful if self-hosted elsewhere. Do NOT include the\
466
545
  port, use `http_port` and `grpc_port` instead.
467
546
  Defaults to "https://api.lmnr.ai".
547
+ base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
548
+ Only set this if your Laminar backend HTTP is proxied\
549
+ through a different host. If not specified, defaults\
550
+ to https://api.lmnr.ai.
468
551
  http_port (int | None, optional): The port for Laminar API's HTTP\
469
552
  service. 443 is used if not specified.
470
553
  Defaults to None.
@@ -475,6 +558,10 @@ def evaluate(
475
558
  auto-instrument. If None, all available instruments\
476
559
  will be used.
477
560
  Defaults to None.
561
+ disabled_instruments (set[Instruments] | None, optional): Set of modules\
562
+ to disable auto-instrumentations. If None, no\
563
+ If None, only modules passed as `instruments` will be disabled.
564
+ Defaults to None.
478
565
  trace_export_timeout_seconds (int | None, optional): The timeout for\
479
566
  trace export on OpenTelemetry exporter. Defaults to None.
480
567
  """
@@ -488,9 +575,11 @@ def evaluate(
488
575
  concurrency_limit=concurrency_limit,
489
576
  project_api_key=project_api_key,
490
577
  base_url=base_url,
578
+ base_http_url=base_http_url,
491
579
  http_port=http_port,
492
580
  grpc_port=grpc_port,
493
581
  instruments=instruments,
582
+ disabled_instruments=disabled_instruments,
494
583
  max_export_batch_size=max_export_batch_size,
495
584
  trace_export_timeout_seconds=trace_export_timeout_seconds,
496
585
  )