lmnr 0.4.12b3__tar.gz → 0.4.12b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/PKG-INFO +17 -12
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/README.md +17 -10
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/pyproject.toml +2 -3
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/decorators.py +3 -2
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/evaluations.py +90 -58
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/laminar.py +32 -10
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/types.py +38 -5
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/utils.py +4 -5
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/__init__.py +3 -29
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/config/__init__.py +0 -4
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/base.py +16 -9
- lmnr-0.4.12b4/src/lmnr/traceloop_sdk/tracing/attributes.py +8 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/tracing.py +31 -142
- lmnr-0.4.12b3/src/lmnr/traceloop_sdk/metrics/__init__.py +0 -0
- lmnr-0.4.12b3/src/lmnr/traceloop_sdk/metrics/metrics.py +0 -176
- lmnr-0.4.12b3/src/lmnr/traceloop_sdk/tracing/manual.py +0 -57
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/LICENSE +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/cli.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/sdk/log.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.flake8 +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.python-version +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/instruments.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_and_external_association_properties.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_association_properties.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_manual_report.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_privacy_no_prompts/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_prompt_management/test_prompt_management.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_sdk_initialization/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_tasks/test_task_io_serialization_with_langchain.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_aworkflow.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_streaming_workflow.yaml +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/conftest.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_association_properties.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_manual.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_nested_tasks.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_privacy_no_prompts.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_sdk_initialization.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_tasks.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_workflows.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/content_allow_list.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/context_manager.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/__init__.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/in_memory_span_exporter.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/json_encoder.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/package_check.py +0 -0
- {lmnr-0.4.12b3 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/version.py +0 -0

--- lmnr-0.4.12b3/PKG-INFO
+++ lmnr-0.4.12b4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lmnr
-Version: 0.4.12b3
+Version: 0.4.12b4
 Summary: Python SDK for Laminar AI
 License: Apache-2.0
 Author: lmnr.ai
@@ -14,7 +14,6 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: argparse (>=1.0,<2.0)
 Requires-Dist: asyncio (>=3.0,<4.0)
 Requires-Dist: backoff (>=2.0,<3.0)
-Requires-Dist: colorama (>=0.4,<0.5)
 Requires-Dist: deprecated (>=1.0,<2.0)
 Requires-Dist: jinja2 (>=3.0,<4.0)
 Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
@@ -197,7 +196,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In
 
 If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.
 
-
+Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
 
 ## Sending events
 
@@ -267,13 +266,14 @@ Evaluation takes in the following parameters:
 - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
 - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
-- `evaluators` – evaluaton logic.
+- `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
 
-### Example
+### Example code
 
 ```python
+from lmnr import evaluate
 from openai import AsyncOpenAI
 import asyncio
 import os
@@ -304,20 +304,25 @@ data = [
 ]
 
 
-def
+def correctness(output, target):
     return 1 if output == target["capital"] else 0
 
 
 # Create an Evaluation instance
-e =
-    name="
+e = evaluate(
+    name="my-evaluation",
     data=data,
     executor=get_capital,
-    evaluators=
+    evaluators={"correctness": correctness},
     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
 )
-
-# Run the evaluation
-asyncio.run(e.run())
 ```
 
+### Running from CLI.
+
+1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+1. Run `lmnr path/to/my/eval.py`
+
+### Running from code
+
+Simply execute the function, e.g. `python3 path/to/my/eval.py`

--- lmnr-0.4.12b3/README.md
+++ lmnr-0.4.12b4/README.md
@@ -137,7 +137,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In
 
 If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.
 
-
+Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
 
 ## Sending events
 
@@ -207,13 +207,14 @@ Evaluation takes in the following parameters:
 - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
 - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
-- `evaluators` – evaluaton logic.
+- `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
 
-### Example
+### Example code
 
 ```python
+from lmnr import evaluate
 from openai import AsyncOpenAI
 import asyncio
 import os
@@ -244,19 +245,25 @@ data = [
 ]
 
 
-def
+def correctness(output, target):
     return 1 if output == target["capital"] else 0
 
 
 # Create an Evaluation instance
-e =
-    name="
+e = evaluate(
+    name="my-evaluation",
     data=data,
     executor=get_capital,
-    evaluators=
+    evaluators={"correctness": correctness},
     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
 )
-
-# Run the evaluation
-asyncio.run(e.run())
 ```
+
+### Running from CLI.
+
+1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+1. Run `lmnr path/to/my/eval.py`
+
+### Running from code
+
+Simply execute the function, e.g. `python3 path/to/my/eval.py`
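
The README change above ties each evaluator to a name and lets an evaluator return either a single number or a `dict[str, int|float]` of scores. A minimal sketch of both shapes, closely following the README example in this diff; the `exactness`/`brevity` evaluators and the datapoint contents are illustrative, not part of the package:

```python
import os

from lmnr import evaluate


def exactness(output, target):
    # Single number: reported under this evaluator's dict key ("exactness").
    return 1 if output == target["capital"] else 0


def brevity(output, target):
    # Dict of scores: each key becomes its own score.
    return {"word_count": len(str(output).split()), "is_short": int(len(str(output)) < 40)}


evaluate(
    name="my-evaluation",
    data=[{"data": {"text": "Paris is the capital of France."}, "target": {"capital": "Paris"}}],
    # Identity-style executor over pre-computed text, per the README's footnote on executors.
    executor=lambda data: data["text"].split()[0],
    evaluators={"exactness": exactness, "brevity": brevity},
    project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
)
```

Because evaluators are now keyed by name in a dict, lambdas no longer need the auto-generated `evaluator_{i+1}` names used in 0.4.12b3.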

--- lmnr-0.4.12b3/pyproject.toml
+++ lmnr-0.4.12b4/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "lmnr"
-version = "0.4.12b3"
+version = "0.4.12b4"
 description = "Python SDK for Laminar AI"
 authors = [
     { name = "lmnr.ai", email = "founders@lmnr.ai" }
@@ -11,7 +11,7 @@ license = "Apache-2.0"
 
 [tool.poetry]
 name = "lmnr"
-version = "0.4.12b3"
+version = "0.4.12b4"
 description = "Python SDK for Laminar AI"
 authors = ["lmnr.ai"]
 readme = "README.md"
@@ -33,7 +33,6 @@ opentelemetry-instrumentation-sqlalchemy = "^0.48b0"
 opentelemetry-instrumentation-urllib3 = "^0.48b0"
 opentelemetry-instrumentation-threading = "^0.48b0"
 opentelemetry-semantic-conventions-ai = "0.4.1"
-colorama = "^0.4"
 tenacity = "~=8.0"
 jinja2 = "~=3.0"
 deprecated = "~=1.0"

--- lmnr-0.4.12b3/src/lmnr/sdk/decorators.py
+++ lmnr-0.4.12b4/src/lmnr/sdk/decorators.py
@@ -6,6 +6,7 @@ from opentelemetry.trace import INVALID_SPAN, get_current_span
 
 from typing import Callable, Optional, cast
 
+from lmnr.traceloop_sdk.tracing.attributes import SESSION_ID, USER_ID
 from lmnr.traceloop_sdk.tracing.tracing import update_association_properties
 
 from .utils import is_async
@@ -43,11 +44,11 @@ def observe(
         if current_span != INVALID_SPAN:
             if session_id is not None:
                 current_span.set_attribute(
-
+                    SESSION_ID, session_id
                 )
             if user_id is not None:
                 current_span.set_attribute(
-
+                    USER_ID, user_id
                 )
         association_properties = {}
         if session_id is not None:
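
For reference, the change above swaps hard-coded attribute keys for the shared `SESSION_ID` / `USER_ID` constants introduced in `tracing/attributes.py`. A minimal sketch of how the decorator parameters visible in this hunk are exercised, assuming `observe` and `Laminar` are importable from the package root as in the README examples; the function and IDs are made up:

```python
import os

from lmnr import Laminar as L, observe

L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"])


# session_id / user_id end up on the current span as the SESSION_ID / USER_ID
# attributes referenced in the hunk above.
@observe(session_id="session-123", user_id="user-456")
def handle_request(prompt: str) -> str:
    return prompt.upper()


handle_request("hello")
```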

--- lmnr-0.4.12b3/src/lmnr/sdk/evaluations.py
+++ lmnr-0.4.12b4/src/lmnr/sdk/evaluations.py
@@ -2,12 +2,26 @@ import asyncio
 import sys
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Awaitable, Optional, Union
+from typing import Any, Awaitable, Optional, Set, Union
+import uuid
 
 from tqdm import tqdm
 
+from ..traceloop_sdk.instruments import Instruments
+from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
+
 from .laminar import Laminar as L
-from .types import
+from .types import (
+    CreateEvaluationResponse,
+    Datapoint,
+    EvaluationResultDatapoint,
+    EvaluatorFunction,
+    ExecutorFunction,
+    Numeric,
+    NumericTypes,
+    SpanType,
+    TraceType,
+)
 from .utils import is_async
 
 DEFAULT_BATCH_SIZE = 5
@@ -39,7 +53,11 @@ class EvaluationReporter:
     def start(self, name: str, project_id: str, id: str, length: int):
         print(f"Running evaluation {name}...\n")
         print(f"Check progress and results at {get_evaluation_url(project_id, id)}\n")
-        self.cli_progress = tqdm(
+        self.cli_progress = tqdm(
+            total=length,
+            bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}",
+            ncols=60,
+        )
 
     def update(self, batch_length: int):
         self.cli_progress.update(batch_length)
@@ -51,7 +69,7 @@ class EvaluationReporter:
     def stop(self, average_scores: dict[str, Numeric]):
         self.cli_progress.close()
         print("\nAverage scores:")
-        for
+        for name, score in average_scores.items():
             print(f"{name}: {score}")
         print("\n")
 
@@ -78,12 +96,14 @@ class Evaluation:
         self,
         data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
         executor: Any,
-        evaluators:
+        evaluators: dict[str, EvaluatorFunction],
         name: Optional[str] = None,
        batch_size: int = DEFAULT_BATCH_SIZE,
         project_api_key: Optional[str] = None,
         base_url: Optional[str] = None,
         http_port: Optional[int] = None,
+        grpc_port: Optional[int] = None,
+        instruments: Optional[Set[Instruments]] = None,
     ):
         """
         Initializes an instance of the Evaluations class.
@@ -114,33 +134,18 @@ class Evaluation:
                 Defaults to "https://api.lmnr.ai".
             http_port (Optional[int], optional): The port for the Laminar API HTTP service.
                 Defaults to 443.
+            instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+                Defaults to None. If None, all available instruments will be used.
         """
 
         self.is_finished = False
         self.name = name
         self.reporter = EvaluationReporter()
         self.executor = executor
-        self.evaluators =
-            zip(
-                [
-                    (
-                        e.__name__
-                        if e.__name__ and e.__name__ != "<lambda>"
-                        else f"evaluator_{i+1}"
-                    )
-                    for i, e in enumerate(evaluators)
-                ],
-                evaluators,
-            )
-        )
-        self.evaluator_names = list(self.evaluators.keys())
+        self.evaluators = evaluators
         if isinstance(data, list):
             self.data = [
-                (
-                    Datapoint.model_validate(point)
-                    if isinstance(point, dict)
-                    else point
-                )
+                (Datapoint.model_validate(point) if isinstance(point, dict) else point)
                 for point in data
             ]
         else:
@@ -150,7 +155,8 @@ class Evaluation:
             project_api_key=project_api_key,
             base_url=base_url,
             http_port=http_port,
-
+            grpc_port=grpc_port,
+            instruments=instruments,
         )
 
     def run(self) -> Union[None, Awaitable[None]]:
@@ -205,7 +211,7 @@ class Evaluation:
     async def evaluate_in_batches(self, evaluation: CreateEvaluationResponse):
         for i in range(0, len(self.data), self.batch_size):
             batch = (
-                self.data[i: i + self.batch_size]
+                self.data[i : i + self.batch_size]
                 if isinstance(self.data, list)
                 else self.data.slice(i, i + self.batch_size)
             )
@@ -217,52 +223,72 @@ class Evaluation:
             finally:
                 self.reporter.update(len(batch))
 
-    async def _evaluate_batch(
+    async def _evaluate_batch(
+        self, batch: list[Datapoint]
+    ) -> list[EvaluationResultDatapoint]:
         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
         results = await asyncio.gather(*batch_promises)
         return results
 
-    async def _evaluate_datapoint(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _evaluate_datapoint(
+        self, datapoint: Datapoint
+    ) -> EvaluationResultDatapoint:
+        with L.start_as_current_span("evaluation") as evaluation_span:
+            L._set_trace_type(trace_type=TraceType.EVALUATION)
+            evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
+            with L.start_as_current_span(
+                "executor", input={"data": datapoint.data}
+            ) as executor_span:
+                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                output = (
+                    await self.executor(datapoint.data)
+                    if is_async(self.executor)
+                    else self.executor(datapoint.data)
+                )
+                L.set_span_output(output)
+            target = datapoint.target
+
+            # Iterate over evaluators
+            scores: dict[str, Numeric] = {}
+            for evaluator_name, evaluator in self.evaluators.items():
+                with L.start_as_current_span(
+                    "evaluator", input={"output": output, "target": target}
+                ) as evaluator_span:
+                    evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
+                    value = (
+                        await evaluator(output, target)
+                        if is_async(evaluator)
+                        else evaluator(output, target)
+                    )
+                    L.set_span_output(value)
+
+                # If evaluator returns a single number, use evaluator name as key
+                if isinstance(value, NumericTypes):
+                    scores[evaluator_name] = value
+                else:
+                    scores.update(value)
+
+            trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
+            return EvaluationResultDatapoint(
+                data=datapoint.data,
+                target=target,
+                executor_output=output,
+                scores=scores,
+                trace_id=trace_id,
             )
 
-            # If evaluator returns a single number, use evaluator name as key
-            if isinstance(value, NumericTypes):
-                scores[evaluator_name] = value
-            else:
-                scores.update(value)
-
-        return EvaluationResultDatapoint(
-            data=datapoint.data,
-            target=target,
-            executorOutput=output,
-            scores=scores,
-        )
-
 
 def evaluate(
     data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
-    executor:
-    evaluators:
+    executor: ExecutorFunction,
+    evaluators: dict[str, EvaluatorFunction],
     name: Optional[str] = None,
     batch_size: int = DEFAULT_BATCH_SIZE,
    project_api_key: Optional[str] = None,
     base_url: Optional[str] = None,
     http_port: Optional[int] = None,
+    grpc_port: Optional[int] = None,
+    instruments: Optional[Set[Instruments]] = None,
 ) -> Optional[Awaitable[None]]:
     """
     If added to the file which is called through lmnr eval command, then simply registers the evaluation.
@@ -295,6 +321,10 @@ def evaluate(
             Defaults to "https://api.lmnr.ai".
         http_port (Optional[int], optional): The port for the Laminar API HTTP service.
             Defaults to 443.
+        grpc_port (Optional[int], optional): The port for the Laminar API gRPC service.
+            Defaults to 8443.
+        instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+            Defaults to None. If None, all available instruments will be used.
     """
 
     evaluation = Evaluation(
@@ -306,6 +336,8 @@ def evaluate(
         project_api_key=project_api_key,
         base_url=base_url,
         http_port=http_port,
+        grpc_port=grpc_port,
+        instruments=instruments,
     )
 
     global _evaluation
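
The evaluations.py changes above thread two new keyword arguments, `grpc_port` and `instruments`, from `evaluate()` through to `L.initialize()`. A hedged sketch of a call that uses them, with made-up datapoints and trivial executor/evaluator callables:

```python
import os

from lmnr import evaluate

evaluate(
    name="my-evaluation",
    data=[{"data": {"question": "2 + 2"}, "target": {"answer": "4"}}],
    executor=lambda data: "4",  # stand-in executor
    evaluators={"correct": lambda output, target: int(output == target["answer"])},
    project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
    grpc_port=8443,      # new in 0.4.12b4; 8443 is the documented default
    instruments=set(),   # new in 0.4.12b4; an empty set disables autoinstrumentation
)
```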

--- lmnr-0.4.12b3/src/lmnr/sdk/laminar.py
+++ lmnr-0.4.12b4/src/lmnr/sdk/laminar.py
@@ -5,7 +5,6 @@ from opentelemetry.trace import (
     get_current_span,
     SpanKind,
 )
-from opentelemetry.semconv_ai import SpanAttributes
 from opentelemetry.util.types import AttributeValue
 from opentelemetry.context.context import Context
 from opentelemetry.util import types
@@ -26,7 +25,17 @@ import os
 import requests
 import uuid
 
-from lmnr.traceloop_sdk.tracing.
+from lmnr.traceloop_sdk.tracing.attributes import (
+    SESSION_ID,
+    SPAN_INPUT,
+    SPAN_OUTPUT,
+    TRACE_TYPE,
+    USER_ID,
+)
+from lmnr.traceloop_sdk.tracing.tracing import (
+    set_association_properties,
+    update_association_properties,
+)
 
 from .log import VerboseColorfulFormatter
 
@@ -37,6 +46,7 @@ from .types import (
     PipelineRunResponse,
     NodeInput,
     PipelineRunRequest,
+    TraceType,
     UpdateEvaluationResponse,
 )
 
@@ -356,8 +366,8 @@ class Laminar:
         ) as span:
             if input is not None:
                 span.set_attribute(
-
-                    json.dumps(
+                    SPAN_INPUT,
+                    json.dumps(input),
                 )
             yield span
 
@@ -371,9 +381,7 @@ class Laminar:
         """
         span = get_current_span()
         if output is not None and span != INVALID_SPAN:
-            span.set_attribute(
-                SpanAttributes.TRACELOOP_ENTITY_OUTPUT, json.dumps(output)
-            )
+            span.set_attribute(SPAN_OUTPUT, json.dumps(output))
 
     @classmethod
     def set_session(
@@ -396,9 +404,23 @@ class Laminar:
         """
         association_properties = {}
         if session_id is not None:
-            association_properties[
+            association_properties[SESSION_ID] = session_id
         if user_id is not None:
-            association_properties[
+            association_properties[USER_ID] = user_id
+        update_association_properties(association_properties)
+
+    @classmethod
+    def _set_trace_type(
+        cls,
+        trace_type: TraceType,
+    ):
+        """Set the trace_type for the current span and the context
+        Args:
+            trace_type (TraceType): Type of the trace
+        """
+        association_properties = {
+            TRACE_TYPE: trace_type.value,
+        }
         update_association_properties(association_properties)
 
     @classmethod
@@ -430,7 +452,7 @@ class Laminar:
     ) -> requests.Response:
         body = {
             "evaluationId": str(evaluation_id),
-            "points": [datapoint.
+            "points": [datapoint.to_dict() for datapoint in data],
         }
         response = requests.post(
             cls.__base_http_url + "/v1/evaluation-datapoints",
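
laminar.py now records span I/O under the `SPAN_INPUT` / `SPAN_OUTPUT` attributes and keeps session data under `SESSION_ID` / `USER_ID`. A small sketch of the public surface shown in these hunks (`set_session`, `start_as_current_span` with an `input` payload, and `set_span_output`); the span name and values are illustrative:

```python
import os

from lmnr import Laminar as L

L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"])

# Associates subsequent spans with a session and user via the SESSION_ID / USER_ID
# association properties set in the hunk above.
L.set_session(session_id="session-123", user_id="user-456")

# start_as_current_span JSON-serializes `input` into SPAN_INPUT;
# set_span_output JSON-serializes its argument into SPAN_OUTPUT.
with L.start_as_current_span("summarize", input={"text": "hello world"}):
    result = "HELLO WORLD"  # stand-in for real work
    L.set_span_output(result)
```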

--- lmnr-0.4.12b3/src/lmnr/sdk/types.py
+++ lmnr-0.4.12b4/src/lmnr/sdk/types.py
@@ -1,10 +1,11 @@
 import datetime
-import
+from enum import Enum
 import pydantic
-import
+import requests
 from typing import Any, Awaitable, Callable, Literal, Optional, Union
+import uuid
 
-from .utils import
+from .utils import serialize
 
 
 class ChatMessage(pydantic.BaseModel):
@@ -37,7 +38,7 @@ class PipelineRunRequest(pydantic.BaseModel):
     def to_dict(self):
         return {
             "inputs": {
-                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
                 for k, v in self.inputs.items()
             },
             "pipeline": self.pipeline,
@@ -125,5 +126,37 @@ UpdateEvaluationResponse = CreateEvaluationResponse
 class EvaluationResultDatapoint(pydantic.BaseModel):
     data: EvaluationDatapointData
     target: EvaluationDatapointTarget
-
+    executor_output: ExecutorFunctionReturnType
     scores: dict[str, Numeric]
+    trace_id: uuid.UUID
+
+    # uuid is not serializable by default, so we need to convert it to a string
+    def to_dict(self):
+        return {
+            "data": {
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+                for k, v in self.data.items()
+            },
+            "target": {
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+                for k, v in self.target.items()
+            },
+            "executorOutput": serialize(self.executor_output),
+            "scores": self.scores,
+            "traceId": str(self.trace_id),
+        }
+
+
+class SpanType(Enum):
+    DEFAULT = "DEFAULT"
+    LLM = "LLM"
+    PIPELINE = "PIPELINE"  # must not be set manually
+    EXECUTOR = "EXECUTOR"
+    EVALUATOR = "EVALUATOR"
+    EVALUATION = "EVALUATION"
+
+
+class TraceType(Enum):
+    DEFAULT = "DEFAULT"
+    EVENT = "EVENT"  # must not be set manually
+    EVALUATION = "EVALUATION"
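
types.py adds a `trace_id` field and a camelCase `to_dict()` to `EvaluationResultDatapoint`, which laminar.py now uses when posting to `/v1/evaluation-datapoints`. A sketch of what that serialization looks like, assuming the datapoint fields accept plain dicts (they are iterated with `.items()` inside `to_dict()`) and that the class is importable from `lmnr.sdk.types`; the values are made up:

```python
import uuid

from lmnr.sdk.types import EvaluationResultDatapoint

point = EvaluationResultDatapoint(
    data={"country": "France"},
    target={"capital": "Paris"},
    executor_output="Paris",
    scores={"correctness": 1},
    trace_id=uuid.uuid4(),
)

# to_dict() stringifies the UUID and emits camelCase keys
# ("executorOutput", "traceId") for the API payload.
print(point.to_dict())
```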

--- lmnr-0.4.12b3/src/lmnr/sdk/utils.py
+++ lmnr-0.4.12b4/src/lmnr/sdk/utils.py
@@ -1,5 +1,4 @@
 import asyncio
-import copy
 import datetime
 import dataclasses
 import enum
@@ -50,7 +49,7 @@ def is_iterator(o: typing.Any) -> bool:
     return hasattr(o, "__iter__") and hasattr(o, "__next__")
 
 
-def
+def serialize(obj: typing.Any) -> dict[str, typing.Any]:
     def to_dict_inner(o: typing.Any):
         if isinstance(o, (datetime.datetime, datetime.date)):
             return o.strftime("%Y-%m-%dT%H:%M:%S.%f%z")
@@ -59,7 +58,7 @@ def to_dict(obj: typing.Any) -> dict[str, typing.Any]:
         elif isinstance(o, (int, float, str, bool)):
             return o
         elif isinstance(o, uuid.UUID):
-            return str(o)  # same as in return, but explicit
+            return str(o)  # same as in final return, but explicit
         elif isinstance(o, enum.Enum):
             return o.value
         elif dataclasses.is_dataclass(o):
@@ -90,11 +89,11 @@ def get_input_from_func_args(
 ) -> dict[str, typing.Any]:
     # Remove implicitly passed "self" or "cls" argument for
     # instance or class methods
-    res = copy
+    res = func_kwargs.copy()
     for i, k in enumerate(inspect.signature(func).parameters.keys()):
         if is_method and k in ["self", "cls"]:
             continue
         # If param has default value, then it's not present in func args
-        if len(func_args)
+        if i < len(func_args):
            res[k] = func_args[i]
     return res