lmnr 0.4.53.dev0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (133)
  1. lmnr/__init__.py +32 -11
  2. lmnr/cli/__init__.py +270 -0
  3. lmnr/cli/datasets.py +371 -0
  4. lmnr/cli/evals.py +111 -0
  5. lmnr/cli/rules.py +42 -0
  6. lmnr/opentelemetry_lib/__init__.py +70 -0
  7. lmnr/opentelemetry_lib/decorators/__init__.py +337 -0
  8. lmnr/opentelemetry_lib/litellm/__init__.py +685 -0
  9. lmnr/opentelemetry_lib/litellm/utils.py +100 -0
  10. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/__init__.py +849 -0
  11. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/config.py +13 -0
  12. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_emitter.py +211 -0
  13. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/event_models.py +41 -0
  14. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/span_utils.py +401 -0
  15. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/streaming.py +425 -0
  16. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/utils.py +332 -0
  17. lmnr/opentelemetry_lib/opentelemetry/instrumentation/anthropic/version.py +1 -0
  18. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/__init__.py +451 -0
  19. lmnr/opentelemetry_lib/opentelemetry/instrumentation/claude_agent/proxy.py +144 -0
  20. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_agent/__init__.py +100 -0
  21. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/__init__.py +476 -0
  22. lmnr/opentelemetry_lib/opentelemetry/instrumentation/cua_computer/utils.py +12 -0
  23. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/__init__.py +599 -0
  24. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/config.py +9 -0
  25. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/schema_utils.py +26 -0
  26. lmnr/opentelemetry_lib/opentelemetry/instrumentation/google_genai/utils.py +330 -0
  27. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/__init__.py +488 -0
  28. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/config.py +8 -0
  29. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_emitter.py +143 -0
  30. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/event_models.py +41 -0
  31. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/span_utils.py +229 -0
  32. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/utils.py +92 -0
  33. lmnr/opentelemetry_lib/opentelemetry/instrumentation/groq/version.py +1 -0
  34. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/__init__.py +381 -0
  35. lmnr/opentelemetry_lib/opentelemetry/instrumentation/kernel/utils.py +36 -0
  36. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/__init__.py +121 -0
  37. lmnr/opentelemetry_lib/opentelemetry/instrumentation/langgraph/utils.py +60 -0
  38. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/__init__.py +61 -0
  39. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/__init__.py +472 -0
  40. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/chat_wrappers.py +1185 -0
  41. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/completion_wrappers.py +305 -0
  42. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/config.py +16 -0
  43. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/embeddings_wrappers.py +312 -0
  44. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_emitter.py +100 -0
  45. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/event_models.py +41 -0
  46. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/shared/image_gen_wrappers.py +68 -0
  47. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/utils.py +197 -0
  48. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v0/__init__.py +176 -0
  49. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/__init__.py +368 -0
  50. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/assistant_wrappers.py +325 -0
  51. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/event_handler_wrapper.py +135 -0
  52. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/v1/responses_wrappers.py +786 -0
  53. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openai/version.py +1 -0
  54. lmnr/opentelemetry_lib/opentelemetry/instrumentation/openhands_ai/__init__.py +388 -0
  55. lmnr/opentelemetry_lib/opentelemetry/instrumentation/opentelemetry/__init__.py +69 -0
  56. lmnr/opentelemetry_lib/opentelemetry/instrumentation/skyvern/__init__.py +191 -0
  57. lmnr/opentelemetry_lib/opentelemetry/instrumentation/threading/__init__.py +197 -0
  58. lmnr/opentelemetry_lib/tracing/__init__.py +263 -0
  59. lmnr/opentelemetry_lib/tracing/_instrument_initializers.py +516 -0
  60. lmnr/{openllmetry_sdk → opentelemetry_lib}/tracing/attributes.py +21 -8
  61. lmnr/opentelemetry_lib/tracing/context.py +200 -0
  62. lmnr/opentelemetry_lib/tracing/exporter.py +153 -0
  63. lmnr/opentelemetry_lib/tracing/instruments.py +140 -0
  64. lmnr/opentelemetry_lib/tracing/processor.py +193 -0
  65. lmnr/opentelemetry_lib/tracing/span.py +398 -0
  66. lmnr/opentelemetry_lib/tracing/tracer.py +57 -0
  67. lmnr/opentelemetry_lib/tracing/utils.py +62 -0
  68. lmnr/opentelemetry_lib/utils/package_check.py +18 -0
  69. lmnr/opentelemetry_lib/utils/wrappers.py +11 -0
  70. lmnr/sdk/browser/__init__.py +0 -0
  71. lmnr/sdk/browser/background_send_events.py +158 -0
  72. lmnr/sdk/browser/browser_use_cdp_otel.py +100 -0
  73. lmnr/sdk/browser/browser_use_otel.py +142 -0
  74. lmnr/sdk/browser/bubus_otel.py +71 -0
  75. lmnr/sdk/browser/cdp_utils.py +518 -0
  76. lmnr/sdk/browser/inject_script.js +514 -0
  77. lmnr/sdk/browser/patchright_otel.py +151 -0
  78. lmnr/sdk/browser/playwright_otel.py +322 -0
  79. lmnr/sdk/browser/pw_utils.py +363 -0
  80. lmnr/sdk/browser/recorder/record.umd.min.cjs +84 -0
  81. lmnr/sdk/browser/utils.py +70 -0
  82. lmnr/sdk/client/asynchronous/async_client.py +180 -0
  83. lmnr/sdk/client/asynchronous/resources/__init__.py +6 -0
  84. lmnr/sdk/client/asynchronous/resources/base.py +32 -0
  85. lmnr/sdk/client/asynchronous/resources/browser_events.py +41 -0
  86. lmnr/sdk/client/asynchronous/resources/datasets.py +131 -0
  87. lmnr/sdk/client/asynchronous/resources/evals.py +266 -0
  88. lmnr/sdk/client/asynchronous/resources/evaluators.py +85 -0
  89. lmnr/sdk/client/asynchronous/resources/tags.py +83 -0
  90. lmnr/sdk/client/synchronous/resources/__init__.py +6 -0
  91. lmnr/sdk/client/synchronous/resources/base.py +32 -0
  92. lmnr/sdk/client/synchronous/resources/browser_events.py +40 -0
  93. lmnr/sdk/client/synchronous/resources/datasets.py +131 -0
  94. lmnr/sdk/client/synchronous/resources/evals.py +263 -0
  95. lmnr/sdk/client/synchronous/resources/evaluators.py +85 -0
  96. lmnr/sdk/client/synchronous/resources/tags.py +83 -0
  97. lmnr/sdk/client/synchronous/sync_client.py +191 -0
  98. lmnr/sdk/datasets/__init__.py +94 -0
  99. lmnr/sdk/datasets/file_utils.py +91 -0
  100. lmnr/sdk/decorators.py +163 -26
  101. lmnr/sdk/eval_control.py +3 -2
  102. lmnr/sdk/evaluations.py +403 -191
  103. lmnr/sdk/laminar.py +1080 -549
  104. lmnr/sdk/log.py +7 -2
  105. lmnr/sdk/types.py +246 -134
  106. lmnr/sdk/utils.py +151 -7
  107. lmnr/version.py +46 -0
  108. {lmnr-0.4.53.dev0.dist-info → lmnr-0.7.26.dist-info}/METADATA +152 -106
  109. lmnr-0.7.26.dist-info/RECORD +116 -0
  110. lmnr-0.7.26.dist-info/WHEEL +4 -0
  111. lmnr-0.7.26.dist-info/entry_points.txt +3 -0
  112. lmnr/cli.py +0 -101
  113. lmnr/openllmetry_sdk/.python-version +0 -1
  114. lmnr/openllmetry_sdk/__init__.py +0 -72
  115. lmnr/openllmetry_sdk/config/__init__.py +0 -9
  116. lmnr/openllmetry_sdk/decorators/base.py +0 -185
  117. lmnr/openllmetry_sdk/instruments.py +0 -38
  118. lmnr/openllmetry_sdk/tracing/__init__.py +0 -1
  119. lmnr/openllmetry_sdk/tracing/content_allow_list.py +0 -24
  120. lmnr/openllmetry_sdk/tracing/context_manager.py +0 -13
  121. lmnr/openllmetry_sdk/tracing/tracing.py +0 -884
  122. lmnr/openllmetry_sdk/utils/in_memory_span_exporter.py +0 -61
  123. lmnr/openllmetry_sdk/utils/package_check.py +0 -7
  124. lmnr/openllmetry_sdk/version.py +0 -1
  125. lmnr/sdk/datasets.py +0 -55
  126. lmnr-0.4.53.dev0.dist-info/LICENSE +0 -75
  127. lmnr-0.4.53.dev0.dist-info/RECORD +0 -33
  128. lmnr-0.4.53.dev0.dist-info/WHEEL +0 -4
  129. lmnr-0.4.53.dev0.dist-info/entry_points.txt +0 -3
  130. /lmnr/{openllmetry_sdk → opentelemetry_lib}/.flake8 +0 -0
  131. /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/__init__.py +0 -0
  132. /lmnr/{openllmetry_sdk → opentelemetry_lib}/utils/json_encoder.py +0 -0
  133. /lmnr/{openllmetry_sdk/decorators/__init__.py → py.typed} +0 -0
lmnr/sdk/evaluations.py CHANGED
@@ -1,44 +1,59 @@
  import asyncio
  import re
- import sys
  import uuid

+ from typing import Any
+ from typing_extensions import TypedDict
+
  from tqdm import tqdm
- from typing import Any, Awaitable, Optional, Set, Union

- from ..openllmetry_sdk.instruments import Instruments
- from ..openllmetry_sdk.tracing.attributes import SPAN_TYPE
+ from lmnr.opentelemetry_lib.tracing.instruments import Instruments
+ from lmnr.opentelemetry_lib.tracing.attributes import HUMAN_EVALUATOR_OPTIONS, SPAN_TYPE

- from .datasets import EvaluationDataset
- from .eval_control import EVALUATION_INSTANCE, PREPARE_ONLY
- from .laminar import Laminar as L
- from .log import get_default_logger
- from .types import (
+ from lmnr.sdk.client.asynchronous.async_client import AsyncLaminarClient
+ from lmnr.sdk.client.synchronous.sync_client import LaminarClient
+ from lmnr.sdk.datasets import EvaluationDataset, LaminarDataset
+ from lmnr.sdk.eval_control import EVALUATION_INSTANCES, PREPARE_ONLY
+ from lmnr.sdk.laminar import Laminar as L
+ from lmnr.sdk.log import get_default_logger
+ from lmnr.sdk.types import (
      Datapoint,
+     EvaluationDatapointDatasetLink,
      EvaluationResultDatapoint,
      EvaluatorFunction,
      ExecutorFunction,
      HumanEvaluator,
      Numeric,
      NumericTypes,
+     PartialEvaluationDatapoint,
      SpanType,
      TraceType,
  )
- from .utils import is_async
+ from lmnr.sdk.utils import from_env, is_async, json_dumps

  DEFAULT_BATCH_SIZE = 5
+ MAX_EXPORT_BATCH_SIZE = 64
+
+
+ class EvaluationRunResult(TypedDict):
+     average_scores: dict[str, Numeric]
+     evaluation_id: uuid.UUID
+     project_id: uuid.UUID
+     url: str
+     error_message: str | None


  def get_evaluation_url(
-     project_id: str, evaluation_id: str, base_url: str = "https://www.lmnr.ai"
+     project_id: str, evaluation_id: str, base_url: str | None = None
  ):
+     if not base_url or base_url == "https://api.lmnr.ai":
+         base_url = "https://www.lmnr.ai"
+
      url = base_url
-     if url.endswith("/"):
-         url = url[:-1]
+     url = re.sub(r"\/$", "", url)
      if url.endswith("localhost") or url.endswith("127.0.0.1"):
-         # We best effort assume that the frontend is running on port 3000
-         # TODO: expose the frontend port?
-         url = url + ":3000"
+         # We best effort assume that the frontend is running on port 5667
+         url = url + ":5667"
      return f"{url}/project/{project_id}/evaluations/{evaluation_id}"


@@ -52,13 +67,17 @@ def get_average_scores(results: list[EvaluationResultDatapoint]) -> dict[str, Nu

      average_scores = {}
      for key, values in per_score_values.items():
-         average_scores[key] = sum(values) / len(values)
+         scores = [v for v in values if v is not None]
+
+         # If there are no scores, we don't want to include the key in the average scores
+         if len(scores) > 0:
+             average_scores[key] = sum(scores) / len(scores)

      return average_scores


  class EvaluationReporter:
-     def __init__(self, base_url: str = "https://www.lmnr.ai"):
+     def __init__(self, base_url):
          self.base_url = base_url

      def start(self, length: int):
@@ -71,89 +90,107 @@ class EvaluationReporter:
      def update(self, batch_length: int):
          self.cli_progress.update(batch_length)

-     def stopWithError(self, error: Exception):
-         self.cli_progress.close()
-         sys.stderr.write(f"\nError: {error}\n")
+     def stop_with_error(self, error: Exception):
+         if hasattr(self, "cli_progress"):
+             self.cli_progress.close()
+         raise error

      def stop(
          self, average_scores: dict[str, Numeric], project_id: str, evaluation_id: str
      ):
          self.cli_progress.close()
-         print(
-             f"\nCheck the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
-         )
          print("Average scores:")
          for name, score in average_scores.items():
              print(f"{name}: {score}")
-         print("\n")
+         print(
+             f"Check the results at {get_evaluation_url(project_id, evaluation_id, self.base_url)}\n"
+         )


  class Evaluation:
      def __init__(
          self,
-         data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+         data: EvaluationDataset | list[Datapoint | dict],
          executor: Any,
-         evaluators: dict[str, EvaluatorFunction],
-         human_evaluators: list[HumanEvaluator] = [],
-         name: Optional[str] = None,
-         group_id: Optional[str] = None,
-         batch_size: int = DEFAULT_BATCH_SIZE,
-         project_api_key: Optional[str] = None,
-         base_url: Optional[str] = None,
-         http_port: Optional[int] = None,
-         grpc_port: Optional[int] = None,
-         instruments: Optional[Set[Instruments]] = None,
+         evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+         name: str | None = None,
+         group_name: str | None = None,
+         metadata: dict[str, Any] | None = None,
+         concurrency_limit: int = DEFAULT_BATCH_SIZE,
+         project_api_key: str | None = None,
+         base_url: str | None = None,
+         base_http_url: str | None = None,
+         http_port: int | None = None,
+         grpc_port: int | None = None,
+         instruments: (
+             set[Instruments] | list[Instruments] | tuple[Instruments] | None
+         ) = None,
+         disabled_instruments: (
+             set[Instruments] | list[Instruments] | tuple[Instruments] | None
+         ) = None,
+         max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+         trace_export_timeout_seconds: int | None = None,
      ):
          """
-         Initializes an instance of the Evaluations class.
+         Initializes an instance of the Evaluation class.

          Parameters:
-             data (Union[List[EvaluationDatapoint|dict], EvaluationDataset]):\
+             data (list[Datapoint|dict] | EvaluationDataset):\
                  List of data points to evaluate or an evaluation dataset.
-                 `data` is the input to the executor function,
-                 `target` is the input to the evaluator function.
+                 `data` is the input to the executor function.
+                 `target` is the input to the evaluator function.
+                 `metadata` is optional metadata to associate with the\
+                 datapoint.
              executor (Callable[..., Any]): The executor function.\
-                 Takes the data point + any additional arguments\
-                 and returns the output to evaluate.
-             evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
-                 names. Each evaluator function takes the output of the executor\
-                 _and_ the target data, and returns a score. The score can be a\
-                 single number or a dict of string keys and number values.\
-                 If the score is a single number, it will be named after the\
-                 evaluator function. Evaluator function names must contain only\
-                 letters, digits, hyphens, underscores, or spaces.
-             human_evaluators (list[HumanEvaluator], optional):\
-                 [Beta] List of instances of HumanEvaluator. For now, human\
-                 evaluator only holds the queue name.
-                 Defaults to an empty list.
-             name (Optional[str], optional): Optional name of the evaluation.\
+                 Takes the data point + any additional arguments and returns\
+                 the output to evaluate.
+             evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+                 functions and HumanEvaluator instances with names. Each evaluator\
+                 function takes the output of the executor _and_ the target data,\
+                 and returns a score. The score can be a single number or a dict\
+                 of string keys and number values. If the score is a single number,\
+                 it will be named after the evaluator function.\
+                 HumanEvaluator instances create empty spans for manual evaluation.\
+                 Evaluator names must contain only letters, digits, hyphens,\
+                 underscores, or spaces.
+             name (str | None, optional): Optional name of the evaluation.\
                  Used to identify the evaluation in the group.\
                  If not provided, a random name will be generated.
                  Defaults to None.
-             group_id (Optional[str], optional): an identifier to group\
-                 evaluations. Only evaluations within the same group_id can be\
+             group_name (str | None, optional): an identifier to group\
+                 evaluations. Only evaluations within the same group_name can be\
                  visually compared. If not provided, "default" is assigned.
                  Defaults to None
-             batch_size (int, optional): The batch size for evaluation. This many\
-                 data points will be evaluated in parallel.
+             metadata (dict[str, Any] | None): optional metadata to associate with\
+             concurrency_limit (int, optional): The concurrency limit for\
+                 evaluation. This many data points will be evaluated in parallel\
+                 with a pool of workers.
                  Defaults to DEFAULT_BATCH_SIZE.
-             project_api_key (Optional[str], optional): The project API key.\
+             project_api_key (str | None, optional): The project API key.\
                  If not provided, LMNR_PROJECT_API_KEY environment variable is\
                  used.
                  Defaults to an empty string.
-             base_url (Optional[str], optional): The base URL for Laminar API.\
+             base_url (str | None, optional): The base URL for Laminar API.\
                  Useful if self-hosted. Do NOT include the port, use `http_port`\
                  and `grpc_port` instead.
                  Defaults to "https://api.lmnr.ai".
-             http_port (Optional[int], optional): The port for Laminar API\
+             base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+                 Only set this if your Laminar backend HTTP is proxied\
+                 through a different host. If not specified, defaults\
+                 to https://api.lmnr.ai.
+             http_port (int | None, optional): The port for Laminar API\
                  HTTP service. Defaults to 443 if not specified.
-             grpc_port (Optional[int], optional): The port for Laminar API\
+             grpc_port (int | None, optional): The port for Laminar API\
                  gRPC service. Defaults to 8443 if not specified.
-             instruments (Optional[Set[Instruments]], optional): Set of modules\
+             instruments (set[Instruments] | None, optional): Set of modules\
                  to auto-instrument. If None, all available instruments will be\
                  used.
                  See https://docs.lmnr.ai/tracing/automatic-instrumentation
                  Defaults to None.
+             disabled_instruments (set[Instruments] | None, optional): Set of modules\
+                 to disable auto-instrumentations. If None, only modules passed\
+                 as `instruments` will be disabled.
+                 Defaults to None.
          """

          if not evaluators:
@@ -168,7 +205,8 @@ class Evaluation:
                      "underscores, or spaces."
                  )

-         self.is_finished = False
+         base_url = base_url or from_env("LMNR_BASE_URL") or "https://api.lmnr.ai"
+
          self.reporter = EvaluationReporter(base_url)
          if isinstance(data, list):
              self.data = [
@@ -177,212 +215,386 @@ class Evaluation:
              ]
          else:
              self.data = data
+         if not isinstance(self.data, LaminarDataset) and len(self.data) == 0:
+             raise ValueError("No data provided. Skipping evaluation")
          self.executor = executor
          self.evaluators = evaluators
-         self.group_id = group_id
+         self.group_name = group_name
          self.name = name
-         self.batch_size = batch_size
+         self.metadata = metadata
+         self.concurrency_limit = concurrency_limit
+         self.batch_size = concurrency_limit
          self._logger = get_default_logger(self.__class__.__name__)
-         self.human_evaluators = human_evaluators
+         self.upload_tasks = []
+         self.base_http_url = f"{base_http_url or base_url}:{http_port or 443}"
+
+         api_key = project_api_key or from_env("LMNR_PROJECT_API_KEY")
+         if not api_key and not L.is_initialized():
+             raise ValueError(
+                 "Please pass the project API key to `evaluate`"
+                 " or set the LMNR_PROJECT_API_KEY environment variable"
+                 " in your environment or .env file"
+             )
+         self.project_api_key = api_key
+
+         if L.is_initialized():
+             self.client = AsyncLaminarClient(
+                 base_url=L.get_base_http_url(),
+                 project_api_key=L.get_project_api_key(),
+             )
+         else:
+             self.client = AsyncLaminarClient(
+                 base_url=self.base_http_url,
+                 project_api_key=self.project_api_key,
+             )
+
          L.initialize(
              project_api_key=project_api_key,
              base_url=base_url,
+             base_http_url=self.base_http_url,
              http_port=http_port,
              grpc_port=grpc_port,
              instruments=instruments,
+             disabled_instruments=disabled_instruments,
+             max_export_batch_size=max_export_batch_size,
+             export_timeout_seconds=trace_export_timeout_seconds,
          )

-     async def run(self) -> Awaitable[None]:
-         if self.is_finished:
-             raise Exception("Evaluation is already finished")
+     async def run(self) -> EvaluationRunResult:
          return await self._run()

-     async def _run(self) -> None:
-         self.reporter.start(len(self.data))
+     async def _run(self) -> EvaluationRunResult:
+         if isinstance(self.data, LaminarDataset):
+             self.data.set_client(
+                 LaminarClient(
+                     base_url=self.base_http_url,
+                     project_api_key=self.project_api_key,
+                 )
+             )
+             if not self.data.id:
+                 try:
+                     datasets = await self.client.datasets.get_dataset_by_name(
+                         self.data.name
+                     )
+                     if len(datasets) == 0:
+                         self._logger.warning(f"Dataset {self.data.name} not found")
+                     else:
+                         self.data.id = datasets[0].id
+                 except Exception as e:
+                     # Backward compatibility with old Laminar API (self hosted)
+                     self._logger.warning(f"Error getting dataset {self.data.name}: {e}")

          try:
-             result_datapoints = await self._evaluate_in_batches()
+             evaluation = await self.client.evals.init(
+                 name=self.name, group_name=self.group_name, metadata=self.metadata
+             )
+             evaluation_id = evaluation.id
+             project_id = evaluation.projectId
+             url = get_evaluation_url(project_id, evaluation_id, self.reporter.base_url)
+
+             print(f"Check the results at {url}")
+
+             self.reporter.start(len(self.data))
+             result_datapoints = await self._evaluate_in_batches(evaluation.id)
+             # Wait for all background upload tasks to complete
+             if self.upload_tasks:
+                 self._logger.debug(
+                     f"Waiting for {len(self.upload_tasks)} upload tasks to complete"
+                 )
+                 await asyncio.gather(*self.upload_tasks)
+                 self._logger.debug("All upload tasks completed")
          except Exception as e:
-             self.reporter.stopWithError(e)
-             self.is_finished = True
-             return
-
-         # For now add all human evaluators to all result datapoints
-         # In the future, we will add ways to specify which human evaluators
-         # to add to which result datapoints, e.g. sample some randomly
-         for result_datapoint in result_datapoints:
-             result_datapoint.human_evaluators = self.human_evaluators or {}
-
-         evaluation = await L.create_evaluation(
-             data=result_datapoints, group_id=self.group_id, name=self.name
-         )
+             await self._shutdown()
+             self.reporter.stop_with_error(e)
+
          average_scores = get_average_scores(result_datapoints)
          self.reporter.stop(average_scores, evaluation.projectId, evaluation.id)
-         self.is_finished = True
-
-     async def _evaluate_in_batches(self) -> list[EvaluationResultDatapoint]:
-         result_datapoints = []
-         for i in range(0, len(self.data), self.batch_size):
-             batch = (
-                 self.data[i : i + self.batch_size]
-                 if isinstance(self.data, list)
-                 else self.data.slice(i, i + self.batch_size)
-             )
-             batch_datapoints = await self._evaluate_batch(batch)
-             result_datapoints.extend(batch_datapoints)
-             self.reporter.update(len(batch))
-         return result_datapoints
-
-     async def _evaluate_batch(
-         self, batch: list[Datapoint]
+         await self._shutdown()
+         return {
+             "average_scores": average_scores,
+             "evaluation_id": evaluation_id,
+             "project_id": project_id,
+             "url": url,
+             "error_message": None,
+         }
+
+     async def _shutdown(self):
+         # We use flush() instead of shutdown() because multiple evaluations
+         # can be run sequentially in the same process. `shutdown()` would
+         # close the OTLP exporter and we wouldn't be able to export traces in
+         # the next evaluation.
+         L.flush()
+         await self.client.close()
+         if isinstance(self.data, LaminarDataset) and self.data.client:
+             self.data.client.close()
+
+     async def _evaluate_in_batches(
+         self, eval_id: uuid.UUID
      ) -> list[EvaluationResultDatapoint]:
-         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
-         results = await asyncio.gather(*batch_promises)
-         return results
+
+         semaphore = asyncio.Semaphore(self.concurrency_limit)
+         tasks = []
+         data_iter = self.data if isinstance(self.data, list) else range(len(self.data))
+
+         async def evaluate_task(datapoint, index):
+             try:
+                 result = await self._evaluate_datapoint(eval_id, datapoint, index)
+                 self.reporter.update(1)
+                 return index, result
+             finally:
+                 semaphore.release()
+
+         # Create tasks only after acquiring semaphore
+         for idx, item in enumerate(data_iter):
+             await semaphore.acquire()
+             datapoint = item if isinstance(self.data, list) else self.data[item]
+             task = asyncio.create_task(evaluate_task(datapoint, idx))
+             tasks.append(task)
+
+         # Wait for all tasks to complete and preserve order
+         results = await asyncio.gather(*tasks)
+         ordered_results = [result for _, result in sorted(results, key=lambda x: x[0])]
+
+         return ordered_results

      async def _evaluate_datapoint(
-         self, datapoint: Datapoint
+         self, eval_id: uuid.UUID, datapoint: Datapoint, index: int
      ) -> EvaluationResultDatapoint:
+         evaluation_id = uuid.uuid4()
          with L.start_as_current_span("evaluation") as evaluation_span:
              L._set_trace_type(trace_type=TraceType.EVALUATION)
              evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
              with L.start_as_current_span(
                  "executor", input={"data": datapoint.data}
              ) as executor_span:
-                 executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
-                 output = (
-                     await self.executor(datapoint.data)
-                     if is_async(self.executor)
-                     else self.executor(datapoint.data)
-                 )
-                 L.set_span_output(output)
                  executor_span_id = uuid.UUID(
                      int=executor_span.get_span_context().span_id
                  )
+                 trace_id = uuid.UUID(int=executor_span.get_span_context().trace_id)
+
+                 partial_datapoint = PartialEvaluationDatapoint(
+                     id=evaluation_id,
+                     data=datapoint.data,
+                     target=datapoint.target,
+                     index=index,
+                     trace_id=trace_id,
+                     executor_span_id=executor_span_id,
+                     metadata=datapoint.metadata,
+                 )
+                 if isinstance(self.data, LaminarDataset):
+                     partial_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                         dataset_id=self.data.id,
+                         datapoint_id=datapoint.id,
+                         created_at=datapoint.created_at,
+                     )
+                 # First, create datapoint with trace_id so that we can show the dp in the UI
+                 await self.client.evals.save_datapoints(
+                     eval_id, [partial_datapoint], self.group_name
+                 )
+                 executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                 # Run synchronous executors in a thread pool to avoid blocking
+                 if not is_async(self.executor):
+                     loop = asyncio.get_event_loop()
+                     output = await loop.run_in_executor(
+                         None, self.executor, datapoint.data
+                     )
+                 else:
+                     output = await self.executor(datapoint.data)
+
+                 L.set_span_output(output)
              target = datapoint.target

              # Iterate over evaluators
              scores: dict[str, Numeric] = {}
              for evaluator_name, evaluator in self.evaluators.items():
-                 with L.start_as_current_span(
-                     evaluator_name, input={"output": output, "target": target}
-                 ) as evaluator_span:
-                     evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
-                     value = (
-                         await evaluator(output, target)
-                         if is_async(evaluator)
-                         else evaluator(output, target)
-                     )
-                     L.set_span_output(value)
-
-                 # If evaluator returns a single number, use evaluator name as key
-                 if isinstance(value, NumericTypes):
-                     scores[evaluator_name] = value
+                 # Check if evaluator is a HumanEvaluator instance
+                 if isinstance(evaluator, HumanEvaluator):
+                     # Create an empty span for human evaluators
+                     with L.start_as_current_span(
+                         evaluator_name, input={"output": output, "target": target}
+                     ) as human_evaluator_span:
+                         human_evaluator_span.set_attribute(
+                             SPAN_TYPE, SpanType.HUMAN_EVALUATOR.value
+                         )
+                         if evaluator.options:
+                             human_evaluator_span.set_attribute(
+                                 HUMAN_EVALUATOR_OPTIONS, json_dumps(evaluator.options)
+                             )
+                         # Human evaluators don't execute automatically, just create the span
+                         L.set_span_output(None)
+
+                     # We don't want to save the score for human evaluators
+                     scores[evaluator_name] = None
                  else:
-                     scores.update(value)
+                     # Regular evaluator function
+                     with L.start_as_current_span(
+                         evaluator_name, input={"output": output, "target": target}
+                     ) as evaluator_span:
+                         evaluator_span.set_attribute(
+                             SPAN_TYPE, SpanType.EVALUATOR.value
+                         )
+                         if is_async(evaluator):
+                             value = await evaluator(output, target)
+                         else:
+                             loop = asyncio.get_event_loop()
+                             value = await loop.run_in_executor(
+                                 None, evaluator, output, target
+                             )
+                         L.set_span_output(value)
+
+                     # If evaluator returns a single number, use evaluator name as key
+                     if isinstance(value, NumericTypes):
+                         scores[evaluator_name] = value
+                     else:
+                         scores.update(value)

              trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
-             return EvaluationResultDatapoint(
-                 data=datapoint.data,
-                 target=target,
-                 executor_output=output,
-                 scores=scores,
-                 trace_id=trace_id,
-                 executor_span_id=executor_span_id,
+
+             eval_datapoint = EvaluationResultDatapoint(
+                 id=evaluation_id,
+                 data=datapoint.data,
+                 target=target,
+                 executor_output=output,
+                 scores=scores,
+                 trace_id=trace_id,
+                 executor_span_id=executor_span_id,
+                 index=index,
+                 metadata=datapoint.metadata,
+             )
+             if isinstance(self.data, LaminarDataset):
+                 eval_datapoint.dataset_link = EvaluationDatapointDatasetLink(
+                     dataset_id=self.data.id,
+                     datapoint_id=datapoint.id,
+                     created_at=datapoint.created_at,
+                 )
+
+             # Create background upload task without awaiting it
+             upload_task = asyncio.create_task(
+                 self.client.evals.save_datapoints(
+                     eval_id, [eval_datapoint], self.group_name
                  )
+             )
+             self.upload_tasks.append(upload_task)
+
+             return eval_datapoint


  def evaluate(
-     data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+     data: EvaluationDataset | list[Datapoint | dict],
      executor: ExecutorFunction,
-     evaluators: dict[str, EvaluatorFunction],
-     human_evaluators: list[HumanEvaluator] = [],
-     name: Optional[str] = None,
-     group_id: Optional[str] = None,
-     batch_size: int = DEFAULT_BATCH_SIZE,
-     project_api_key: Optional[str] = None,
-     base_url: Optional[str] = None,
-     http_port: Optional[int] = None,
-     grpc_port: Optional[int] = None,
-     instruments: Optional[Set[Instruments]] = None,
- ) -> Optional[Awaitable[None]]:
+     evaluators: dict[str, EvaluatorFunction | HumanEvaluator],
+     name: str | None = None,
+     group_name: str | None = None,
+     metadata: dict[str, Any] | None = None,
+     concurrency_limit: int = DEFAULT_BATCH_SIZE,
+     project_api_key: str | None = None,
+     base_url: str | None = None,
+     base_http_url: str | None = None,
+     http_port: int | None = None,
+     grpc_port: int | None = None,
+     instruments: (
+         set[Instruments] | list[Instruments] | tuple[Instruments] | None
+     ) = None,
+     disabled_instruments: (
+         set[Instruments] | list[Instruments] | tuple[Instruments] | None
+     ) = None,
+     max_export_batch_size: int | None = MAX_EXPORT_BATCH_SIZE,
+     trace_export_timeout_seconds: int | None = None,
+ ) -> EvaluationRunResult | None:
      """
      If added to the file which is called through `lmnr eval` command, then
      registers the evaluation; otherwise, runs the evaluation.

      If there is no event loop, creates it and runs the evaluation until
      completion.
-     If there is an event loop, schedules the evaluation as a task in the
-     event loop and returns an awaitable handle.
+     If there is an event loop, returns an awaitable handle immediately. IMPORTANT:
+     You must await the call to `evaluate`.

      Parameters:
-         data (Union[list[EvaluationDatapoint|dict]], EvaluationDataset]):\
-             List of data points to evaluate or an evaluation dataset.
-             `data` is the input to the executor function,
-             `target` is the input to the evaluator function.
+         data (list[EvaluationDatapoint|dict] | EvaluationDataset):\
+             List of data points to evaluate or an evaluation dataset.
+             `data` is the input to the executor function,
+             `target` is the input to the evaluator function.
          executor (Callable[..., Any]): The executor function.\
-             Takes the data point + any additional arguments\
-             and returns the output to evaluate.
-         evaluators (List[Callable[..., Any]]):
-         evaluators (dict[str, Callable[..., Any]]): Evaluator functions and\
-             names. Each evaluator function takes the output of the executor\
-             _and_ the target data, and returns a score. The score can be a\
-             single number or a dict of string keys and number values.\
-             If the score is a single number, it will be named after the\
-             evaluator function. Evaluator function names must contain only\
-             letters, digits, hyphens, underscores, or spaces.
-         human_evaluators (list[HumanEvaluator], optional):\
-             [Beta] List of instances of HumanEvaluator. For now, human\
-             evaluator only holds the queue name.
-             Defaults to an empty list.
-         name (Optional[str], optional): Optional name of the evaluation.\
-             Used to identify the evaluation in the group.\
-             If not provided, a random name will be generated.
-             Defaults to None.
-         group_id (Optional[str], optional): an identifier to group evaluations.\
-             Only evaluations within the same group_id can be\
-             visually compared. If not provided, set to "default".
-             Defaults to None
-         batch_size (int, optional): The batch size for evaluation.
+             Takes the data point + any additional arguments\
+             and returns the output to evaluate.
+         evaluators (dict[str, Callable[..., Any] | HumanEvaluator]): Evaluator\
+             functions and HumanEvaluator instances with names. Each evaluator\
+             function takes the output of the executor _and_ the target data,\
+             and returns a score. The score can be a single number or a dict\
+             of string keys and number values. If the score is a single number,\
+             it will be named after the evaluator function.\
+             HumanEvaluator instances create empty spans for manual evaluation.\
+             Evaluator function names must contain only letters, digits, hyphens,\
+             underscores, or spaces.
+         name (str | None, optional): Optional name of the evaluation.\
+             Used to identify the evaluation in the group. If not provided, a\
+             random name will be generated.
+             Defaults to None.
+         group_name (str | None, optional): An identifier to group evaluations.\
+             Only evaluations within the same group_name can be visually compared.\
+             If not provided, set to "default".
+             Defaults to None
+         metadata (dict[str, Any] | None, optional): Optional metadata to associate with\
+         concurrency_limit (int, optional): The concurrency limit for evaluation.
              Defaults to DEFAULT_BATCH_SIZE.
-         project_api_key (Optional[str], optional): The project API key.
+         project_api_key (str | None, optional): The project API key.
              Defaults to None.
-         base_url (Optional[str], optional): The base URL for Laminar API.\
+         base_url (str | None, optional): The base URL for Laminar API.\
             Useful if self-hosted elsewhere. Do NOT include the\
             port, use `http_port` and `grpc_port` instead.
             Defaults to "https://api.lmnr.ai".
-         http_port (Optional[int], optional): The port for Laminar API's HTTP\
+         base_http_url (str | None, optional): The base HTTP URL for Laminar API.\
+             Only set this if your Laminar backend HTTP is proxied\
+             through a different host. If not specified, defaults\
+             to https://api.lmnr.ai.
+         http_port (int | None, optional): The port for Laminar API's HTTP\
             service. 443 is used if not specified.
             Defaults to None.
-         grpc_port (Optional[int], optional): The port for Laminar API's gRPC\
+         grpc_port (int | None, optional): The port for Laminar API's gRPC\
             service. 8443 is used if not specified.
             Defaults to None.
-         instruments (Optional[Set[Instruments]], optional): Set of modules to\
+         instruments (set[Instruments] | None, optional): Set of modules to\
             auto-instrument. If None, all available instruments\
             will be used.
             Defaults to None.
+         disabled_instruments (set[Instruments] | None, optional): Set of modules\
+             to disable auto-instrumentations. If None, no\
+             If None, only modules passed as `instruments` will be disabled.
+             Defaults to None.
+         trace_export_timeout_seconds (int | None, optional): The timeout for\
+             trace export on OpenTelemetry exporter. Defaults to None.
      """
-
      evaluation = Evaluation(
          data=data,
          executor=executor,
          evaluators=evaluators,
-         group_id=group_id,
-         human_evaluators=human_evaluators,
+         group_name=group_name,
+         metadata=metadata,
          name=name,
-         batch_size=batch_size,
+         concurrency_limit=concurrency_limit,
          project_api_key=project_api_key,
          base_url=base_url,
+         base_http_url=base_http_url,
          http_port=http_port,
          grpc_port=grpc_port,
          instruments=instruments,
+         disabled_instruments=disabled_instruments,
+         max_export_batch_size=max_export_batch_size,
+         trace_export_timeout_seconds=trace_export_timeout_seconds,
      )

      if PREPARE_ONLY.get():
-         EVALUATION_INSTANCE.set(evaluation)
+         existing_evaluations = EVALUATION_INSTANCES.get([])
+         new_evaluations = (existing_evaluations or []) + [evaluation]
+         EVALUATION_INSTANCES.set(new_evaluations)
+         return None
      else:
-         loop = asyncio.get_event_loop()
-         if loop.is_running():
-             return loop.run_until_complete(evaluation.run())
-         else:
+         try:
+             loop = asyncio.get_event_loop()
+             if loop.is_running():
+                 return evaluation.run()
+             else:
+                 return asyncio.run(evaluation.run())
+         except RuntimeError:
              return asyncio.run(evaluation.run())
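
For orientation, here is a minimal sketch of how the reworked `evaluate()` entry point might be called after upgrading to 0.7.x. The parameter names (`group_name`, `concurrency_limit`, `metadata`), the `EvaluationRunResult` return type, and the executor/evaluator contract are taken from the diff above; the root-level `lmnr` import, the datapoint contents, and the executor/evaluator bodies are illustrative assumptions rather than part of this diff.

# Minimal sketch against the 0.7.x signature shown above; the import path and
# the executor/evaluator bodies are assumptions, not taken from this diff.
# Requires LMNR_PROJECT_API_KEY in the environment (or pass project_api_key=...).
from lmnr import evaluate  # assumed re-export from the package root


def my_executor(data: dict) -> str:
    # Executor: receives the datapoint's `data` and returns the output to evaluate.
    return data["question"].strip().lower()  # placeholder for a real agent/model call


def exact_match(output: str, target: dict) -> int:
    # Evaluator: receives the executor output and the datapoint's `target`,
    # and returns a single number (or a dict of named scores).
    return int(output == target["expected"])


result = evaluate(
    data=[
        {"data": {"question": " Hello "}, "target": {"expected": "hello"}},
    ],
    executor=my_executor,
    evaluators={"exact_match": exact_match},
    name="smoke-test",
    group_name="regression",        # replaces the old `group_id`
    concurrency_limit=5,            # replaces the old `batch_size`
    metadata={"commit": "abc123"},  # new in 0.7.x
)

# Outside a running event loop, `evaluate()` runs to completion and returns an
# EvaluationRunResult TypedDict; inside async code you must `await evaluate(...)`.
if result is not None:
    print(result["average_scores"], result["url"])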