lmnr 0.4.11__py3-none-any.whl → 0.4.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lmnr/__init__.py +1 -1
- lmnr/cli.py +39 -0
- lmnr/sdk/decorators.py +3 -2
- lmnr/sdk/evaluations.py +245 -76
- lmnr/sdk/laminar.py +81 -44
- lmnr/sdk/types.py +44 -5
- lmnr/sdk/utils.py +4 -5
- lmnr/traceloop_sdk/__init__.py +3 -42
- lmnr/traceloop_sdk/config/__init__.py +0 -4
- lmnr/traceloop_sdk/decorators/base.py +16 -9
- lmnr/traceloop_sdk/tracing/attributes.py +8 -0
- lmnr/traceloop_sdk/tracing/tracing.py +31 -201
- {lmnr-0.4.11.dist-info → lmnr-0.4.12.dist-info}/METADATA +75 -101
- {lmnr-0.4.11.dist-info → lmnr-0.4.12.dist-info}/RECORD +17 -18
- lmnr-0.4.12.dist-info/entry_points.txt +3 -0
- lmnr/traceloop_sdk/metrics/__init__.py +0 -0
- lmnr/traceloop_sdk/metrics/metrics.py +0 -176
- lmnr/traceloop_sdk/tracing/manual.py +0 -57
- lmnr-0.4.11.dist-info/entry_points.txt +0 -3
- {lmnr-0.4.11.dist-info → lmnr-0.4.12.dist-info}/LICENSE +0 -0
- {lmnr-0.4.11.dist-info → lmnr-0.4.12.dist-info}/WHEEL +0 -0
lmnr/__init__.py
CHANGED
lmnr/cli.py
ADDED
@@ -0,0 +1,39 @@
+from argparse import ArgumentParser
+import asyncio
+import importlib
+import os
+import sys
+
+from lmnr.sdk.evaluations import set_global_evaluation
+
+
+# TODO: Refactor this code
+async def run_evaluation(args):
+    sys.path.insert(0, os.getcwd())
+
+    with set_global_evaluation(True):
+        file = os.path.abspath(args.file)
+
+        spec = importlib.util.spec_from_file_location("run_eval", file)
+        mod = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(mod)
+
+        from lmnr.sdk.evaluations import _evaluation
+        evaluation = _evaluation
+        await evaluation.run()
+
+
+def cli():
+    parser = ArgumentParser(
+        prog="lmnr",
+        description="CLI for Laminar",
+    )
+
+    subparsers = parser.add_subparsers(title="subcommands", dest="subcommand")
+
+    parser_eval = subparsers.add_parser("eval", description="Run an evaluation")
+    parser_eval.add_argument("file", help="A file containing the evaluation to run")
+    parser_eval.set_defaults(func=run_evaluation)
+
+    parsed = parser.parse_args()
+    asyncio.run(parsed.func(parsed))
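To illustrate what the new `eval` subcommand expects: it imports the given file under `set_global_evaluation(True)`, so a call to `evaluate(...)` in that file only registers the evaluation, and the CLI then awaits its `run()`. Below is a minimal sketch of such a file; the file name, the executor and evaluator bodies, and the sample datapoints are illustrative assumptions, not part of this package.

```python
# my_eval.py -- hypothetical file passed to `lmnr eval my_eval.py`
from lmnr.sdk.evaluations import evaluate


def executor(data: dict) -> str:
    # Illustrative executor: in practice this would call your model or pipeline.
    return f"The capital of {data['country']} is {data['guess']}."


def exact_match(output: str, target: dict) -> int:
    # Illustrative evaluator: receives the executor output and the datapoint target,
    # and returns a numeric score.
    return int(target["capital"] in output)


# Under `lmnr eval`, this call only registers the evaluation; run directly,
# it executes immediately (a project API key is expected via argument or environment).
evaluate(
    data=[{"data": {"country": "France", "guess": "Paris"}, "target": {"capital": "Paris"}}],
    executor=executor,
    evaluators={"exact_match": exact_match},
)
```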
lmnr/sdk/decorators.py
CHANGED
@@ -6,6 +6,7 @@ from opentelemetry.trace import INVALID_SPAN, get_current_span
 
 from typing import Callable, Optional, cast
 
+from lmnr.traceloop_sdk.tracing.attributes import SESSION_ID, USER_ID
 from lmnr.traceloop_sdk.tracing.tracing import update_association_properties
 
 from .utils import is_async
@@ -43,11 +44,11 @@ def observe(
         if current_span != INVALID_SPAN:
             if session_id is not None:
                 current_span.set_attribute(
-
+                    SESSION_ID, session_id
                 )
             if user_id is not None:
                 current_span.set_attribute(
-
+                    USER_ID, user_id
                 )
         association_properties = {}
         if session_id is not None:
lmnr/sdk/evaluations.py
CHANGED
@@ -1,14 +1,78 @@
-from typing import Any, Union
-
-from .types import EvaluationDatapoint
-from .utils import is_async
-from .laminar import Laminar as L
 import asyncio
-
+import sys
 from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from typing import Any, Awaitable, Optional, Set, Union
+import uuid
+
+from tqdm import tqdm
+
+from ..traceloop_sdk.instruments import Instruments
+from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
+
+from .laminar import Laminar as L
+from .types import (
+    CreateEvaluationResponse,
+    Datapoint,
+    EvaluationResultDatapoint,
+    EvaluatorFunction,
+    ExecutorFunction,
+    Numeric,
+    NumericTypes,
+    SpanType,
+    TraceType,
+)
+from .utils import is_async
 
 DEFAULT_BATCH_SIZE = 5
 
+_evaluation = None
+_set_global_evaluation = False
+
+
+@contextmanager
+def set_global_evaluation(set_global_evaluation: bool):
+    global _set_global_evaluation
+    original = _set_global_evaluation
+    try:
+        _set_global_evaluation = set_global_evaluation
+        yield
+    finally:
+        _set_global_evaluation = original
+        pass
+
+
+def get_evaluation_url(project_id: str, evaluation_id: str):
+    return f"https://www.lmnr.ai/project/{project_id}/evaluations/{evaluation_id}"
+
+
+class EvaluationReporter:
+    def __init__(self):
+        pass
+
+    def start(self, name: str, project_id: str, id: str, length: int):
+        print(f"Running evaluation {name}...\n")
+        print(f"Check progress and results at {get_evaluation_url(project_id, id)}\n")
+        self.cli_progress = tqdm(
+            total=length,
+            bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}",
+            ncols=60,
+        )
+
+    def update(self, batch_length: int):
+        self.cli_progress.update(batch_length)
+
+    def stopWithError(self, error: Exception):
+        self.cli_progress.close()
+        sys.stderr.write(f"\nError: {error}\n")
+
+    def stop(self, average_scores: dict[str, Numeric]):
+        self.cli_progress.close()
+        print("\nAverage scores:")
+        for name, score in average_scores.items():
+            print(f"{name}: {score}")
+        print("\n")
+
 
 class EvaluationDataset(ABC):
     @abstractmethod
@@ -20,7 +84,7 @@ class EvaluationDataset(ABC):
         pass
 
     @abstractmethod
-    def __getitem__(self, idx) ->
+    def __getitem__(self, idx) -> Datapoint:
         pass
 
     def slice(self, start: int, end: int):
@@ -30,18 +94,21 @@ class EvaluationDataset(ABC):
 class Evaluation:
     def __init__(
         self,
-
-        data: Union[EvaluationDataset, list[Union[EvaluationDatapoint, dict]]],
+        data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
         executor: Any,
-        evaluators:
+        evaluators: dict[str, EvaluatorFunction],
+        name: Optional[str] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
-        project_api_key: str =
-        base_url: str =
+        project_api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        http_port: Optional[int] = None,
+        grpc_port: Optional[int] = None,
+        instruments: Optional[Set[Instruments]] = None,
     ):
         """
         Initializes an instance of the Evaluations class.
+
         Parameters:
-            name (str): The name of the evaluation.
             data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
                 `data` is the input to the executor function,
                 `target` is the input to the evaluator function.
@@ -56,46 +123,43 @@ class Evaluation:
                 evaluator function. If the function is anonymous, it will be
                 named `evaluator_${index}`, where index is the index of the
                 evaluator function in the list starting from 1.
+            name (Optional[str], optional): The name of the evaluation.
+                It will be auto-generated if not provided.
             batch_size (int, optional): The batch size for evaluation.
                 Defaults to DEFAULT_BATCH_SIZE.
-            project_api_key (str, optional): The project API key.
+            project_api_key (Optional[str], optional): The project API key.
                 Defaults to an empty string.
-            base_url (str, optional): The base URL for the
+            base_url (Optional[str], optional): The base URL for the Laminar API.
                 Useful if self-hosted elsewhere.
                 Defaults to "https://api.lmnr.ai".
+            http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+                Defaults to 443.
+            instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+                Defaults to None. If None, all available instruments will be used.
         """
 
+        self.is_finished = False
         self.name = name
+        self.reporter = EvaluationReporter()
         self.executor = executor
-        self.evaluators =
-            zip(
-                [
-                    (
-                        e.__name__
-                        if e.__name__ and e.__name__ != "<lambda>"
-                        else f"evaluator_{i+1}"
-                    )
-                    for i, e in enumerate(evaluators)
-                ],
-                evaluators,
-            )
-        )
-        self.evaluator_names = list(self.evaluators.keys())
+        self.evaluators = evaluators
         if isinstance(data, list):
             self.data = [
-                (
-                    EvaluationDatapoint.model_validate(point)
-                    if isinstance(point, dict)
-                    else point
-                )
+                (Datapoint.model_validate(point) if isinstance(point, dict) else point)
                 for point in data
            ]
         else:
             self.data = data
         self.batch_size = batch_size
-        L.initialize(
+        L.initialize(
+            project_api_key=project_api_key,
+            base_url=base_url,
+            http_port=http_port,
+            grpc_port=grpc_port,
+            instruments=instruments,
+        )
 
-    def run(self):
+    def run(self) -> Union[None, Awaitable[None]]:
         """Runs the evaluation.
 
         Creates a new evaluation if no evaluation with such name exists, or
@@ -113,16 +177,38 @@ class Evaluation:
         ```
 
         """
+        if self.is_finished:
+            raise Exception("Evaluation is already finished")
+
         loop = asyncio.get_event_loop()
         if loop.is_running():
             return loop.create_task(self._run())
         else:
             return loop.run_until_complete(self._run())
 
-    async def _run(self):
-
+    async def _run(self) -> None:
+        evaluation = L.create_evaluation(self.name)
+        self.reporter.start(
+            evaluation.name,
+            evaluation.projectId,
+            evaluation.id,
+            len(self.data),
+        )
+
+        try:
+            await self.evaluate_in_batches(evaluation)
+        except Exception as e:
+            L.update_evaluation_status(evaluation.id, "Error")
+            self.reporter.stopWithError(e)
+            self.is_finished = True
+            return
+
+        # If we update with status "Finished", we expect averageScores to be not empty
+        updated_evaluation = L.update_evaluation_status(evaluation.id, "Finished")
+        self.reporter.stop(updated_evaluation.averageScores)
+        self.is_finished = True
 
-
+    async def evaluate_in_batches(self, evaluation: CreateEvaluationResponse):
         for i in range(0, len(self.data), self.batch_size):
             batch = (
                 self.data[i : i + self.batch_size]
@@ -130,49 +216,132 @@ class Evaluation:
                 else self.data.slice(i, i + self.batch_size)
             )
             try:
-                await self._evaluate_batch(batch)
+                results = await self._evaluate_batch(batch)
+                L.post_evaluation_results(evaluation.id, results)
             except Exception as e:
                 print(f"Error evaluating batch: {e}")
+            finally:
+                self.reporter.update(len(batch))
 
-
-
-
-            except Exception as e:
-                print(f"Error updating evaluation status: {e}")
-
-    async def _evaluate_batch(self, batch: list[EvaluationDatapoint]):
+    async def _evaluate_batch(
+        self, batch: list[Datapoint]
+    ) -> list[EvaluationResultDatapoint]:
         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
         results = await asyncio.gather(*batch_promises)
+        return results
+
+    async def _evaluate_datapoint(
+        self, datapoint: Datapoint
+    ) -> EvaluationResultDatapoint:
+        with L.start_as_current_span("evaluation") as evaluation_span:
+            L._set_trace_type(trace_type=TraceType.EVALUATION)
+            evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
+            with L.start_as_current_span(
+                "executor", input={"data": datapoint.data}
+            ) as executor_span:
+                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                output = (
+                    await self.executor(datapoint.data)
+                    if is_async(self.executor)
+                    else self.executor(datapoint.data)
+                )
+                L.set_span_output(output)
+            target = datapoint.target
 
-
+            # Iterate over evaluators
+            scores: dict[str, Numeric] = {}
+            for evaluator_name, evaluator in self.evaluators.items():
+                with L.start_as_current_span(
+                    evaluator_name, input={"output": output, "target": target}
+                ) as evaluator_span:
+                    evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
+                    value = (
+                        await evaluator(output, target)
+                        if is_async(evaluator)
+                        else evaluator(output, target)
+                    )
+                    L.set_span_output(value)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-                await evaluator(output, target)
-                if is_async(evaluator)
-                else evaluator(output, target)
+                # If evaluator returns a single number, use evaluator name as key
+                if isinstance(value, NumericTypes):
+                    scores[evaluator_name] = value
+                else:
+                    scores.update(value)
+
+            trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
+            return EvaluationResultDatapoint(
+                data=datapoint.data,
+                target=target,
+                executor_output=output,
+                scores=scores,
+                trace_id=trace_id,
            )
 
-
-
-
-
-
-
-
-
-
-
-
-
+
+def evaluate(
+    data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
+    executor: ExecutorFunction,
+    evaluators: dict[str, EvaluatorFunction],
+    name: Optional[str] = None,
+    batch_size: int = DEFAULT_BATCH_SIZE,
+    project_api_key: Optional[str] = None,
+    base_url: Optional[str] = None,
+    http_port: Optional[int] = None,
+    grpc_port: Optional[int] = None,
+    instruments: Optional[Set[Instruments]] = None,
+) -> Optional[Awaitable[None]]:
+    """
+    If added to the file which is called through lmnr eval command, then simply registers the evaluation.
+    Otherwise, if there is no event loop, creates it and runs the evaluation until completion.
+    If there is an event loop, schedules the evaluation as a task in the event loop and returns an awaitable handle.
+
+    Parameters:
+        data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
+            `data` is the input to the executor function,
+            `target` is the input to the evaluator function.
+        executor (Callable[..., Any]): The executor function.
+            Takes the data point + any additional arguments
+            and returns the output to evaluate.
+        evaluators (List[Callable[..., Any]]): List of evaluator functions.
+            Each evaluator function takes the output of the executor _and_
+            the target data, and returns a score. The score can be a
+            single number or a record of string keys and number values.
+            If the score is a single number, it will be named after the
+            evaluator function. If the function is anonymous, it will be
+            named `evaluator_${index}`, where index is the index of the
+            evaluator function in the list starting from 1.
+        name (Optional[str], optional): The name of the evaluation.
+            It will be auto-generated if not provided.
+        batch_size (int, optional): The batch size for evaluation.
+            Defaults to DEFAULT_BATCH_SIZE.
+        project_api_key (Optional[str], optional): The project API key.
+            Defaults to an empty string.
+        base_url (Optional[str], optional): The base URL for the Laminar API.
+            Useful if self-hosted elsewhere.
+            Defaults to "https://api.lmnr.ai".
+        http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+            Defaults to 443.
+        grpc_port (Optional[int], optional): The port for the Laminar API gRPC service.
+            Defaults to 8443.
+        instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+            Defaults to None. If None, all available instruments will be used.
    """
 
+    evaluation = Evaluation(
+        data=data,
+        executor=executor,
+        evaluators=evaluators,
+        name=name,
+        batch_size=batch_size,
+        project_api_key=project_api_key,
+        base_url=base_url,
+        http_port=http_port,
+        grpc_port=grpc_port,
+        instruments=instruments,
+    )
+
+    global _evaluation
+    if _set_global_evaluation:
+        _evaluation = evaluation
+    else:
+        return evaluation.run()