lmnr 0.4.10__py3-none-any.whl → 0.4.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
lmnr/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from .sdk.evaluations import Evaluation
+from .sdk.evaluations import evaluate
 from .sdk.laminar import Laminar
 from .sdk.types import ChatMessage, PipelineRunError, PipelineRunResponse, NodeInput
 from .sdk.decorators import observe
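
In practical terms, the package's public entry point changes: 0.4.10 exported the Evaluation class from the top-level package, while 0.4.12 exports the evaluate() helper instead. A minimal sketch of the import-level migration (everything other than the lmnr imports is illustrative):

    # 0.4.10: construct and run the class manually
    # from lmnr import Evaluation
    # Evaluation(name="my-eval", data=..., executor=..., evaluators=[...]).run()

    # 0.4.12: import the functional entry point instead
    from lmnr import evaluate
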
lmnr/cli.py ADDED
@@ -0,0 +1,39 @@
+from argparse import ArgumentParser
+import asyncio
+import importlib
+import os
+import sys
+
+from lmnr.sdk.evaluations import set_global_evaluation
+
+
+# TODO: Refactor this code
+async def run_evaluation(args):
+    sys.path.insert(0, os.getcwd())
+
+    with set_global_evaluation(True):
+        file = os.path.abspath(args.file)
+
+        spec = importlib.util.spec_from_file_location("run_eval", file)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+
+        from lmnr.sdk.evaluations import _evaluation
+        evaluation = _evaluation
+        await evaluation.run()
+
+
+def cli():
+    parser = ArgumentParser(
+        prog="lmnr",
+        description="CLI for Laminar",
+    )
+
+    subparsers = parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    parser_eval = subparsers.add_parser("eval", description="Run an evaluation")
+    parser_eval.add_argument("file", help="A file containing the evaluation to run")
+    parser_eval.set_defaults(func=run_evaluation)
+
+    parsed = parser.parse_args()
+    asyncio.run(parsed.func(parsed))
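
Based on the argparse wiring above, the new subcommand is invoked as "lmnr eval <file>". A small sketch of what that command does, driven programmatically; it assumes the lmnr console script points at lmnr.cli:cli (the packaging metadata is not part of this diff) and that my_eval.py is a hypothetical evaluation file that calls evaluate(...):

    # Simulate `lmnr eval my_eval.py` by calling the CLI entry point directly.
    import sys

    from lmnr.cli import cli

    sys.argv = ["lmnr", "eval", "my_eval.py"]  # hypothetical evaluation file
    # cli() parses the arguments, imports my_eval.py under set_global_evaluation(True),
    # and then awaits the registered evaluation's run().
    cli()
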
lmnr/sdk/decorators.py CHANGED
@@ -4,22 +4,20 @@ from lmnr.traceloop_sdk.decorators.base import (
 )
 from opentelemetry.trace import INVALID_SPAN, get_current_span
 
-from typing import Callable, Optional, ParamSpec, TypeVar, cast
+from typing import Callable, Optional, cast
 
+from lmnr.traceloop_sdk.tracing.attributes import SESSION_ID, USER_ID
 from lmnr.traceloop_sdk.tracing.tracing import update_association_properties
 
 from .utils import is_async
 
-P = ParamSpec("P")
-R = TypeVar("R")
-
 
 def observe(
     *,
     name: Optional[str] = None,
     user_id: Optional[str] = None,
     session_id: Optional[str] = None,
-) -> Callable[[Callable[P, R]], Callable[P, R]]:
+) -> Callable[[Callable], Callable]:
     """The main decorator entrypoint for Laminar. This is used to wrap
     functions and methods to create spans.
 
@@ -41,16 +39,16 @@ def observe(
         R: Returns the result of the wrapped function
     """
 
-    def decorator(func: Callable[P, R]) -> Callable[P, R]:
+    def decorator(func: Callable) -> Callable:
         current_span = get_current_span()
         if current_span != INVALID_SPAN:
             if session_id is not None:
                 current_span.set_attribute(
-                    "traceloop.association.properties.session_id", session_id
+                    SESSION_ID, session_id
                 )
             if user_id is not None:
                 current_span.set_attribute(
-                    "traceloop.association.properties.user_id", user_id
+                    USER_ID, user_id
                 )
         association_properties = {}
         if session_id is not None:
@@ -64,4 +62,4 @@ def observe(
             else entity_method(name=name)(func)
         )
 
-    return cast(Callable[P, R], decorator)
+    return cast(Callable, decorator)
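
A minimal usage sketch for the decorator shown above; the function name and body are illustrative, only the observe() signature comes from the diff:

    from lmnr import observe


    @observe(name="handle_request", session_id="session-123", user_id="user-456")
    def handle_request(prompt: str) -> str:
        # Stand-in body; the decorator records a span tagged with the session and user ids.
        return prompt.upper()


    handle_request("hello")

Note that with the ParamSpec/TypeVar annotations dropped, the decorator is now typed as Callable -> Callable, so static type checkers will generally no longer see the wrapped function's original signature.
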
lmnr/sdk/evaluations.py CHANGED
@@ -1,14 +1,78 @@
-from typing import Any, Union
-
-from .types import EvaluationDatapoint
-from .utils import is_async
-from .laminar import Laminar as L
 import asyncio
-
+import sys
 from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from typing import Any, Awaitable, Optional, Set, Union
+import uuid
+
+from tqdm import tqdm
+
+from ..traceloop_sdk.instruments import Instruments
+from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
+
+from .laminar import Laminar as L
+from .types import (
+    CreateEvaluationResponse,
+    Datapoint,
+    EvaluationResultDatapoint,
+    EvaluatorFunction,
+    ExecutorFunction,
+    Numeric,
+    NumericTypes,
+    SpanType,
+    TraceType,
+)
+from .utils import is_async
 
 DEFAULT_BATCH_SIZE = 5
 
+_evaluation = None
+_set_global_evaluation = False
+
+
+@contextmanager
+def set_global_evaluation(set_global_evaluation: bool):
+    global _set_global_evaluation
+    original = _set_global_evaluation
+    try:
+        _set_global_evaluation = set_global_evaluation
+        yield
+    finally:
+        _set_global_evaluation = original
+        pass
+
+
+def get_evaluation_url(project_id: str, evaluation_id: str):
+    return f"https://www.lmnr.ai/project/{project_id}/evaluations/{evaluation_id}"
+
+
+class EvaluationReporter:
+    def __init__(self):
+        pass
+
+    def start(self, name: str, project_id: str, id: str, length: int):
+        print(f"Running evaluation {name}...\n")
+        print(f"Check progress and results at {get_evaluation_url(project_id, id)}\n")
+        self.cli_progress = tqdm(
+            total=length,
+            bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}",
+            ncols=60,
+        )
+
+    def update(self, batch_length: int):
+        self.cli_progress.update(batch_length)
+
+    def stopWithError(self, error: Exception):
+        self.cli_progress.close()
+        sys.stderr.write(f"\nError: {error}\n")
+
+    def stop(self, average_scores: dict[str, Numeric]):
+        self.cli_progress.close()
+        print("\nAverage scores:")
+        for name, score in average_scores.items():
+            print(f"{name}: {score}")
+        print("\n")
+
 
 class EvaluationDataset(ABC):
     @abstractmethod
@@ -20,7 +84,7 @@ class EvaluationDataset(ABC):
         pass
 
     @abstractmethod
-    def __getitem__(self, idx) -> EvaluationDatapoint:
+    def __getitem__(self, idx) -> Datapoint:
         pass
 
     def slice(self, start: int, end: int):
@@ -30,18 +94,21 @@ class EvaluationDataset(ABC):
 class Evaluation:
     def __init__(
         self,
-        name,
-        data: Union[EvaluationDataset, list[Union[EvaluationDatapoint, dict]]],
+        data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
         executor: Any,
-        evaluators: list[Any],
+        evaluators: dict[str, EvaluatorFunction],
+        name: Optional[str] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
-        project_api_key: str = "",
-        base_url: str = "https://api.lmnr.ai",
+        project_api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        http_port: Optional[int] = None,
+        grpc_port: Optional[int] = None,
+        instruments: Optional[Set[Instruments]] = None,
     ):
         """
         Initializes an instance of the Evaluations class.
+
         Parameters:
-            name (str): The name of the evaluation.
             data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
                 `data` is the input to the executor function,
                 `target` is the input to the evaluator function.
@@ -56,46 +123,43 @@ class Evaluation:
                 evaluator function. If the function is anonymous, it will be
                 named `evaluator_${index}`, where index is the index of the
                 evaluator function in the list starting from 1.
+            name (Optional[str], optional): The name of the evaluation.
+                It will be auto-generated if not provided.
             batch_size (int, optional): The batch size for evaluation.
                 Defaults to DEFAULT_BATCH_SIZE.
-            project_api_key (str, optional): The project API key.
+            project_api_key (Optional[str], optional): The project API key.
                 Defaults to an empty string.
-            base_url (str, optional): The base URL for the LMNR API.
+            base_url (Optional[str], optional): The base URL for the Laminar API.
                 Useful if self-hosted elsewhere.
                 Defaults to "https://api.lmnr.ai".
+            http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+                Defaults to 443.
+            instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+                Defaults to None. If None, all available instruments will be used.
         """
 
+        self.is_finished = False
         self.name = name
+        self.reporter = EvaluationReporter()
         self.executor = executor
-        self.evaluators = dict(
-            zip(
-                [
-                    (
-                        e.__name__
-                        if e.__name__ and e.__name__ != "<lambda>"
-                        else f"evaluator_{i+1}"
-                    )
-                    for i, e in enumerate(evaluators)
-                ],
-                evaluators,
-            )
-        )
-        self.evaluator_names = list(self.evaluators.keys())
+        self.evaluators = evaluators
         if isinstance(data, list):
             self.data = [
-                (
-                    EvaluationDatapoint.model_validate(point)
-                    if isinstance(point, dict)
-                    else point
-                )
+                (Datapoint.model_validate(point) if isinstance(point, dict) else point)
                 for point in data
             ]
         else:
             self.data = data
         self.batch_size = batch_size
-        L.initialize(project_api_key=project_api_key, base_url=base_url)
+        L.initialize(
+            project_api_key=project_api_key,
+            base_url=base_url,
+            http_port=http_port,
+            grpc_port=grpc_port,
+            instruments=instruments,
+        )
 
-    def run(self):
+    def run(self) -> Union[None, Awaitable[None]]:
         """Runs the evaluation.
 
         Creates a new evaluation if no evaluation with such name exists, or
@@ -113,16 +177,38 @@ class Evaluation:
         ```
 
         """
+        if self.is_finished:
+            raise Exception("Evaluation is already finished")
+
         loop = asyncio.get_event_loop()
         if loop.is_running():
             return loop.create_task(self._run())
         else:
             return loop.run_until_complete(self._run())
 
-    async def _run(self):
-        response = L.create_evaluation(self.name)
+    async def _run(self) -> None:
+        evaluation = L.create_evaluation(self.name)
+        self.reporter.start(
+            evaluation.name,
+            evaluation.projectId,
+            evaluation.id,
+            len(self.data),
+        )
+
+        try:
+            await self.evaluate_in_batches(evaluation)
+        except Exception as e:
+            L.update_evaluation_status(evaluation.id, "Error")
+            self.reporter.stopWithError(e)
+            self.is_finished = True
+            return
+
+        # If we update with status "Finished", we expect averageScores to be not empty
+        updated_evaluation = L.update_evaluation_status(evaluation.id, "Finished")
+        self.reporter.stop(updated_evaluation.averageScores)
+        self.is_finished = True
 
-        # Process batches sequentially
+    async def evaluate_in_batches(self, evaluation: CreateEvaluationResponse):
         for i in range(0, len(self.data), self.batch_size):
             batch = (
                 self.data[i : i + self.batch_size]
@@ -130,49 +216,132 @@ class Evaluation:
                 else self.data.slice(i, i + self.batch_size)
             )
             try:
-                await self._evaluate_batch(batch)
+                results = await self._evaluate_batch(batch)
+                L.post_evaluation_results(evaluation.id, results)
             except Exception as e:
                 print(f"Error evaluating batch: {e}")
+            finally:
+                self.reporter.update(len(batch))
 
-        try:
-            L.update_evaluation_status(response.name, "Finished")
-            print(f"Evaluation {response.id} complete")
-        except Exception as e:
-            print(f"Error updating evaluation status: {e}")
-
-    async def _evaluate_batch(self, batch: list[EvaluationDatapoint]):
+    async def _evaluate_batch(
+        self, batch: list[Datapoint]
+    ) -> list[EvaluationResultDatapoint]:
         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
         results = await asyncio.gather(*batch_promises)
+        return results
+
+    async def _evaluate_datapoint(
+        self, datapoint: Datapoint
+    ) -> EvaluationResultDatapoint:
+        with L.start_as_current_span("evaluation") as evaluation_span:
+            L._set_trace_type(trace_type=TraceType.EVALUATION)
+            evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
+            with L.start_as_current_span(
+                "executor", input={"data": datapoint.data}
+            ) as executor_span:
+                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                output = (
+                    await self.executor(datapoint.data)
+                    if is_async(self.executor)
+                    else self.executor(datapoint.data)
+                )
+                L.set_span_output(output)
+            target = datapoint.target
 
-        return L.post_evaluation_results(self.name, results)
+            # Iterate over evaluators
+            scores: dict[str, Numeric] = {}
+            for evaluator_name, evaluator in self.evaluators.items():
+                with L.start_as_current_span(
+                    evaluator_name, input={"output": output, "target": target}
+                ) as evaluator_span:
+                    evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
+                    value = (
+                        await evaluator(output, target)
+                        if is_async(evaluator)
+                        else evaluator(output, target)
+                    )
+                    L.set_span_output(value)
 
-    async def _evaluate_datapoint(self, datapoint):
-        output = (
-            await self.executor(datapoint.data)
-            if is_async(self.executor)
-            else self.executor(datapoint.data)
-        )
-        target = datapoint.target
-
-        # Iterate over evaluators
-        scores = {}
-        for evaluator_name in self.evaluator_names:
-            evaluator = self.evaluators[evaluator_name]
-            value = (
-                await evaluator(output, target)
-                if is_async(evaluator)
-                else evaluator(output, target)
+                # If evaluator returns a single number, use evaluator name as key
+                if isinstance(value, NumericTypes):
+                    scores[evaluator_name] = value
+                else:
+                    scores.update(value)
+
+            trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
+            return EvaluationResultDatapoint(
+                data=datapoint.data,
+                target=target,
+                executor_output=output,
+                scores=scores,
+                trace_id=trace_id,
             )
 
-        # If evaluator returns a single number, use evaluator name as key
-        if isinstance(value, (int, float)):
-            scores[evaluator_name] = value
-        else:
-            scores.update(value)
-
-        return {
-            "executorOutput": output,
-            "data": datapoint.data,
-            "target": target,
-            "scores": scores,
-        }
+
+def evaluate(
+    data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+    executor: ExecutorFunction,
+    evaluators: dict[str, EvaluatorFunction],
+    name: Optional[str] = None,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+    project_api_key: Optional[str] = None,
+    base_url: Optional[str] = None,
+    http_port: Optional[int] = None,
+    grpc_port: Optional[int] = None,
+    instruments: Optional[Set[Instruments]] = None,
+) -> Optional[Awaitable[None]]:
+    """
+    If added to the file which is called through lmnr eval command, then simply registers the evaluation.
+    Otherwise, if there is no event loop, creates it and runs the evaluation until completion.
+    If there is an event loop, schedules the evaluation as a task in the event loop and returns an awaitable handle.
+
+    Parameters:
+        data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
+            `data` is the input to the executor function,
+            `target` is the input to the evaluator function.
+        executor (Callable[..., Any]): The executor function.
+            Takes the data point + any additional arguments
+            and returns the output to evaluate.
+        evaluators (List[Callable[..., Any]]): List of evaluator functions.
+            Each evaluator function takes the output of the executor _and_
+            the target data, and returns a score. The score can be a
+            single number or a record of string keys and number values.
+            If the score is a single number, it will be named after the
+            evaluator function. If the function is anonymous, it will be
+            named `evaluator_${index}`, where index is the index of the
+            evaluator function in the list starting from 1.
+        name (Optional[str], optional): The name of the evaluation.
+            It will be auto-generated if not provided.
+        batch_size (int, optional): The batch size for evaluation.
+            Defaults to DEFAULT_BATCH_SIZE.
+        project_api_key (Optional[str], optional): The project API key.
+            Defaults to an empty string.
+        base_url (Optional[str], optional): The base URL for the Laminar API.
+            Useful if self-hosted elsewhere.
+            Defaults to "https://api.lmnr.ai".
+        http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+            Defaults to 443.
+        grpc_port (Optional[int], optional): The port for the Laminar API gRPC service.
+            Defaults to 8443.
+        instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+            Defaults to None. If None, all available instruments will be used.
+    """
+
+    evaluation = Evaluation(
+        data=data,
+        executor=executor,
+        evaluators=evaluators,
+        name=name,
+        batch_size=batch_size,
+        project_api_key=project_api_key,
+        base_url=base_url,
+        http_port=http_port,
+        grpc_port=grpc_port,
+        instruments=instruments,
+    )
+
+    global _evaluation
+    if _set_global_evaluation:
+        _evaluation = evaluation
+    else:
+        return evaluation.run()
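
Putting the pieces together, an evaluation file for the new API would look roughly like the sketch below. It can be run directly (python my_eval.py) or registered via the new CLI (lmnr eval my_eval.py). The dataset, executor, and scoring logic are invented for illustration; only the evaluate() signature and the data/target shape come from the diff above:

    # my_eval.py -- hypothetical example file
    from lmnr import evaluate


    def write_greeting(data: dict) -> str:
        # Stand-in executor; in practice this would call a model or pipeline.
        return f"Hello, {data['name']}!"


    def contains_name(output: str, target: dict) -> int:
        # Returns a single number, stored under the "contains_name" key.
        return int(target["name"] in output)


    evaluate(
        data=[
            {"data": {"name": "Ada"}, "target": {"name": "Ada"}},
            {"data": {"name": "Alan"}, "target": {"name": "Alan"}},
        ],
        executor=write_greeting,
        evaluators={"contains_name": contains_name},
        name="greeting-eval",  # optional; auto-generated if omitted
        project_api_key="<your-project-api-key>",  # placeholder
    )

When executed through lmnr eval, evaluate() only registers the evaluation and the CLI awaits its run(); when executed as a plain script with no running event loop, it runs the evaluation to completion and prints the average scores.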