lmnr 0.4.12b2__tar.gz → 0.4.12b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/PKG-INFO +17 -12
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/README.md +17 -10
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/pyproject.toml +2 -3
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/decorators.py +3 -2
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/evaluations.py +131 -74
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/laminar.py +33 -11
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/types.py +38 -5
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/utils.py +4 -5
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/__init__.py +3 -29
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/config/__init__.py +0 -4
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/base.py +16 -9
- lmnr-0.4.12b4/src/lmnr/traceloop_sdk/tracing/attributes.py +8 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/tracing.py +31 -142
- lmnr-0.4.12b2/src/lmnr/traceloop_sdk/metrics/__init__.py +0 -0
- lmnr-0.4.12b2/src/lmnr/traceloop_sdk/metrics/metrics.py +0 -176
- lmnr-0.4.12b2/src/lmnr/traceloop_sdk/tracing/manual.py +0 -57
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/LICENSE +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/cli.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/log.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.flake8 +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.python-version +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/instruments.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_and_external_association_properties.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_association_properties.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_manual_report.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_privacy_no_prompts/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_prompt_management/test_prompt_management.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_sdk_initialization/test_resource_attributes.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_tasks/test_task_io_serialization_with_langchain.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_aworkflow.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_workflow.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_streaming_workflow.yaml +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/conftest.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_association_properties.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_manual.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_nested_tasks.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_privacy_no_prompts.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_sdk_initialization.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_tasks.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_workflows.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/content_allow_list.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/context_manager.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/__init__.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/in_memory_span_exporter.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/json_encoder.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/package_check.py +0 -0
- {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/version.py +0 -0
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lmnr
-Version: 0.4.12b2
+Version: 0.4.12b4
 Summary: Python SDK for Laminar AI
 License: Apache-2.0
 Author: lmnr.ai
@@ -14,7 +14,6 @@ Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: argparse (>=1.0,<2.0)
 Requires-Dist: asyncio (>=3.0,<4.0)
 Requires-Dist: backoff (>=2.0,<3.0)
-Requires-Dist: colorama (>=0.4,<0.5)
 Requires-Dist: deprecated (>=1.0,<2.0)
 Requires-Dist: jinja2 (>=3.0,<4.0)
 Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
@@ -197,7 +196,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In
 
 If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.
 
-
+Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
 
 ## Sending events
 
@@ -267,13 +266,14 @@ Evaluation takes in the following parameters:
 - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
 - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
-- `evaluators` – evaluaton logic.
+- `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
 
-### Example
+### Example code
 
 ```python
+from lmnr import evaluate
 from openai import AsyncOpenAI
 import asyncio
 import os
@@ -304,20 +304,25 @@ data = [
 ]
 
 
-def
+def correctness(output, target):
     return 1 if output == target["capital"] else 0
 
 
 # Create an Evaluation instance
-e =
-    name="
+e = evaluate(
+    name="my-evaluation",
     data=data,
     executor=get_capital,
-    evaluators=
+    evaluators={"correctness": correctness},
     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
 )
-
-# Run the evaluation
-asyncio.run(e.run())
 ```
 
+### Running from CLI.
+
+1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+1. Run `lmnr path/to/my/eval.py`
+
+### Running from code
+
+Simply execute the function, e.g. `python3 path/to/my/eval.py`
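The updated `evaluators` description above says each evaluator receives the executor output and the target and may return either a single number or a `dict[str, int|float]` of scores. A minimal sketch of how that might look in practice, using a trivial stand-in executor instead of the README's OpenAI call (the `extra_checks` evaluator and its score names are illustrative, not part of the package):

```python
import os
from lmnr import evaluate

data = [
    {"data": {"country": "France"}, "target": {"capital": "Paris"}},
    {"data": {"country": "Germany"}, "target": {"capital": "Berlin"}},
]

# Trivial stand-in executor so the sketch is self-contained.
def get_capital(data: dict) -> str:
    return {"France": "Paris", "Germany": "Berlin"}.get(data["country"], "")

# Single-number evaluator: recorded under its dict key ("correctness").
def correctness(output, target):
    return 1 if output == target["capital"] else 0

# Dict-returning evaluator: each key becomes its own score for the datapoint.
def extra_checks(output, target) -> dict[str, float]:
    return {
        "non_empty": 1.0 if output else 0.0,
        "starts_upper": 1.0 if output[:1].isupper() else 0.0,
    }

evaluate(
    name="my-evaluation",
    data=data,
    executor=get_capital,
    evaluators={"correctness": correctness, "extra_checks": extra_checks},
    project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
)
```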
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/README.md

@@ -137,7 +137,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In
 
 If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.
 
-
+Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
 
 ## Sending events
 
@@ -207,13 +207,14 @@ Evaluation takes in the following parameters:
 - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
 - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
 - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
-- `evaluators` – evaluaton logic.
+- `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.
 
 \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.
 
-### Example
+### Example code
 
 ```python
+from lmnr import evaluate
 from openai import AsyncOpenAI
 import asyncio
 import os
@@ -244,19 +245,25 @@ data = [
 ]
 
 
-def
+def correctness(output, target):
     return 1 if output == target["capital"] else 0
 
 
 # Create an Evaluation instance
-e =
-    name="
+e = evaluate(
+    name="my-evaluation",
     data=data,
     executor=get_capital,
-    evaluators=
+    evaluators={"correctness": correctness},
     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
 )
-
-# Run the evaluation
-asyncio.run(e.run())
 ```
+
+### Running from CLI.
+
+1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+1. Run `lmnr path/to/my/eval.py`
+
+### Running from code
+
+Simply execute the function, e.g. `python3 path/to/my/eval.py`
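The "Running from code" note above, together with the `evaluate` docstring later in this diff, says the call runs to completion when no event loop exists and otherwise returns an awaitable handle. A rough sketch of the second case (the stand-in executor, evaluator, and placeholder API key are illustrative, not from the package):

```python
import asyncio
from lmnr import evaluate

async def main():
    # Inside a running event loop, evaluate() is expected to schedule the run
    # as a task and hand back an awaitable handle instead of blocking.
    handle = evaluate(
        data=[{"data": {"country": "Italy"}, "target": {"capital": "Rome"}}],
        executor=lambda d: "Rome",  # stand-in executor
        evaluators={
            "correctness": lambda output, target: 1 if output == target["capital"] else 0
        },
        project_api_key="<LMNR_PROJECT_API_KEY>",  # placeholder
    )
    if handle is not None:
        await handle

asyncio.run(main())
```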
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "lmnr"
-version = "0.4.12b2"
+version = "0.4.12b4"
 description = "Python SDK for Laminar AI"
 authors = [
     { name = "lmnr.ai", email = "founders@lmnr.ai" }
@@ -11,7 +11,7 @@ license = "Apache-2.0"
 
 [tool.poetry]
 name = "lmnr"
-version = "0.4.12b2"
+version = "0.4.12b4"
 description = "Python SDK for Laminar AI"
 authors = ["lmnr.ai"]
 readme = "README.md"
@@ -33,7 +33,6 @@ opentelemetry-instrumentation-sqlalchemy = "^0.48b0"
 opentelemetry-instrumentation-urllib3 = "^0.48b0"
 opentelemetry-instrumentation-threading = "^0.48b0"
 opentelemetry-semantic-conventions-ai = "0.4.1"
-colorama = "^0.4"
 tenacity = "~=8.0"
 jinja2 = "~=3.0"
 deprecated = "~=1.0"
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/decorators.py

@@ -6,6 +6,7 @@ from opentelemetry.trace import INVALID_SPAN, get_current_span
 
 from typing import Callable, Optional, cast
 
+from lmnr.traceloop_sdk.tracing.attributes import SESSION_ID, USER_ID
 from lmnr.traceloop_sdk.tracing.tracing import update_association_properties
 
 from .utils import is_async
@@ -43,11 +44,11 @@ def observe(
         if current_span != INVALID_SPAN:
             if session_id is not None:
                 current_span.set_attribute(
-
+                    SESSION_ID, session_id
                 )
             if user_id is not None:
                 current_span.set_attribute(
-
+                    USER_ID, user_id
                 )
         association_properties = {}
         if session_id is not None:
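For context on the decorator change above: when `session_id` or `user_id` is passed, `observe` now tags the current span with the shared `SESSION_ID` / `USER_ID` attribute constants instead of hard-coded strings. A usage sketch, assuming `observe` and `Laminar` are exported from the top-level `lmnr` package (as in the README examples) and that both ids are keyword arguments of the decorator:

```python
import os
from lmnr import Laminar as L, observe

L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"])

# session_id / user_id end up on the current span via the SESSION_ID / USER_ID
# constants from lmnr.traceloop_sdk.tracing.attributes (per the diff above).
@observe(session_id="session-123", user_id="user-42")
def handle_request(prompt: str) -> str:
    return prompt.upper()

handle_request("hello")
```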
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/evaluations.py

@@ -2,12 +2,26 @@ import asyncio
 import sys
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Awaitable, Optional, Union
+from typing import Any, Awaitable, Optional, Set, Union
+import uuid
 
 from tqdm import tqdm
 
+from ..traceloop_sdk.instruments import Instruments
+from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
+
 from .laminar import Laminar as L
-from .types import
+from .types import (
+    CreateEvaluationResponse,
+    Datapoint,
+    EvaluationResultDatapoint,
+    EvaluatorFunction,
+    ExecutorFunction,
+    Numeric,
+    NumericTypes,
+    SpanType,
+    TraceType,
+)
 from .utils import is_async
 
 DEFAULT_BATCH_SIZE = 5
@@ -39,7 +53,11 @@ class EvaluationReporter:
     def start(self, name: str, project_id: str, id: str, length: int):
         print(f"Running evaluation {name}...\n")
         print(f"Check progress and results at {get_evaluation_url(project_id, id)}\n")
-        self.cli_progress = tqdm(
+        self.cli_progress = tqdm(
+            total=length,
+            bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}",
+            ncols=60,
+        )
 
     def update(self, batch_length: int):
         self.cli_progress.update(batch_length)
@@ -51,7 +69,7 @@ class EvaluationReporter:
     def stop(self, average_scores: dict[str, Numeric]):
         self.cli_progress.close()
         print("\nAverage scores:")
-        for
+        for name, score in average_scores.items():
             print(f"{name}: {score}")
         print("\n")
 
@@ -76,20 +94,21 @@ class EvaluationDataset(ABC):
 class Evaluation:
     def __init__(
         self,
-        name: str,
         data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
         executor: Any,
-        evaluators:
+        evaluators: dict[str, EvaluatorFunction],
+        name: Optional[str] = None,
         batch_size: int = DEFAULT_BATCH_SIZE,
         project_api_key: Optional[str] = None,
         base_url: Optional[str] = None,
         http_port: Optional[int] = None,
+        grpc_port: Optional[int] = None,
+        instruments: Optional[Set[Instruments]] = None,
     ):
         """
         Initializes an instance of the Evaluations class.
 
         Parameters:
-            name (str): The name of the evaluation.
            data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
                `data` is the input to the executor function,
                `target` is the input to the evaluator function.
@@ -104,6 +123,8 @@ class Evaluation:
                evaluator function. If the function is anonymous, it will be
                named `evaluator_${index}`, where index is the index of the
                evaluator function in the list starting from 1.
+            name (Optional[str], optional): The name of the evaluation.
+                It will be auto-generated if not provided.
            batch_size (int, optional): The batch size for evaluation.
                Defaults to DEFAULT_BATCH_SIZE.
            project_api_key (Optional[str], optional): The project API key.
@@ -113,33 +134,18 @@ class Evaluation:
                Defaults to "https://api.lmnr.ai".
            http_port (Optional[int], optional): The port for the Laminar API HTTP service.
                Defaults to 443.
+            instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+                Defaults to None. If None, all available instruments will be used.
         """
 
         self.is_finished = False
         self.name = name
         self.reporter = EvaluationReporter()
         self.executor = executor
-        self.evaluators =
-            zip(
-                [
-                    (
-                        e.__name__
-                        if e.__name__ and e.__name__ != "<lambda>"
-                        else f"evaluator_{i+1}"
-                    )
-                    for i, e in enumerate(evaluators)
-                ],
-                evaluators,
-            )
-        )
-        self.evaluator_names = list(self.evaluators.keys())
+        self.evaluators = evaluators
         if isinstance(data, list):
             self.data = [
-                (
-                    Datapoint.model_validate(point)
-                    if isinstance(point, dict)
-                    else point
-                )
+                (Datapoint.model_validate(point) if isinstance(point, dict) else point)
                 for point in data
             ]
         else:
@@ -149,7 +155,8 @@ class Evaluation:
             project_api_key=project_api_key,
             base_url=base_url,
             http_port=http_port,
-
+            grpc_port=grpc_port,
+            instruments=instruments,
         )
 
     def run(self) -> Union[None, Awaitable[None]]:
@@ -204,7 +211,7 @@ class Evaluation:
     async def evaluate_in_batches(self, evaluation: CreateEvaluationResponse):
         for i in range(0, len(self.data), self.batch_size):
             batch = (
-                self.data[i: i + self.batch_size]
+                self.data[i : i + self.batch_size]
                 if isinstance(self.data, list)
                 else self.data.slice(i, i + self.batch_size)
             )
@@ -216,71 +223,121 @@ class Evaluation:
         finally:
             self.reporter.update(len(batch))
 
-    async def _evaluate_batch(
+    async def _evaluate_batch(
+        self, batch: list[Datapoint]
+    ) -> list[EvaluationResultDatapoint]:
         batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
         results = await asyncio.gather(*batch_promises)
         return results
 
-    async def _evaluate_datapoint(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _evaluate_datapoint(
+        self, datapoint: Datapoint
+    ) -> EvaluationResultDatapoint:
+        with L.start_as_current_span("evaluation") as evaluation_span:
+            L._set_trace_type(trace_type=TraceType.EVALUATION)
+            evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
+            with L.start_as_current_span(
+                "executor", input={"data": datapoint.data}
+            ) as executor_span:
+                executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+                output = (
+                    await self.executor(datapoint.data)
+                    if is_async(self.executor)
+                    else self.executor(datapoint.data)
+                )
+                L.set_span_output(output)
+            target = datapoint.target
+
+            # Iterate over evaluators
+            scores: dict[str, Numeric] = {}
+            for evaluator_name, evaluator in self.evaluators.items():
+                with L.start_as_current_span(
+                    "evaluator", input={"output": output, "target": target}
+                ) as evaluator_span:
+                    evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
+                    value = (
+                        await evaluator(output, target)
+                        if is_async(evaluator)
+                        else evaluator(output, target)
+                    )
+                    L.set_span_output(value)
+
+                # If evaluator returns a single number, use evaluator name as key
+                if isinstance(value, NumericTypes):
+                    scores[evaluator_name] = value
+                else:
+                    scores.update(value)
+
+            trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
+            return EvaluationResultDatapoint(
+                data=datapoint.data,
+                target=target,
+                executor_output=output,
+                scores=scores,
+                trace_id=trace_id,
            )
 
-            # If evaluator returns a single number, use evaluator name as key
-            if isinstance(value, NumericTypes):
-                scores[evaluator_name] = value
-            else:
-                scores.update(value)
-
-            return EvaluationResultDatapoint(
-                data=datapoint.data,
-                target=target,
-                executorOutput=output,
-                scores=scores,
-            )
-
 
 def evaluate(
-    name: str,
     data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
-    executor:
-    evaluators:
+    executor: ExecutorFunction,
+    evaluators: dict[str, EvaluatorFunction],
+    name: Optional[str] = None,
     batch_size: int = DEFAULT_BATCH_SIZE,
     project_api_key: Optional[str] = None,
     base_url: Optional[str] = None,
     http_port: Optional[int] = None,
+    grpc_port: Optional[int] = None,
+    instruments: Optional[Set[Instruments]] = None,
 ) -> Optional[Awaitable[None]]:
     """
-
-
-    If `_set_global_evaluation` is `True`, sets the global evaluation to be run in another part of the program.
-
-    Otherwise, if there is no event loop, runs the evaluation in the current thread until completion.
+    If added to the file which is called through lmnr eval command, then simply registers the evaluation.
+    Otherwise, if there is no event loop, creates it and runs the evaluation until completion.
     If there is an event loop, schedules the evaluation as a task in the event loop and returns an awaitable handle.
+
+    Parameters:
+        data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
+            `data` is the input to the executor function,
+            `target` is the input to the evaluator function.
+        executor (Callable[..., Any]): The executor function.
+            Takes the data point + any additional arguments
+            and returns the output to evaluate.
+        evaluators (List[Callable[..., Any]]): List of evaluator functions.
+            Each evaluator function takes the output of the executor _and_
+            the target data, and returns a score. The score can be a
+            single number or a record of string keys and number values.
+            If the score is a single number, it will be named after the
+            evaluator function. If the function is anonymous, it will be
+            named `evaluator_${index}`, where index is the index of the
+            evaluator function in the list starting from 1.
+        name (Optional[str], optional): The name of the evaluation.
+            It will be auto-generated if not provided.
+        batch_size (int, optional): The batch size for evaluation.
+            Defaults to DEFAULT_BATCH_SIZE.
+        project_api_key (Optional[str], optional): The project API key.
+            Defaults to an empty string.
+        base_url (Optional[str], optional): The base URL for the Laminar API.
+            Useful if self-hosted elsewhere.
+            Defaults to "https://api.lmnr.ai".
+        http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+            Defaults to 443.
+        grpc_port (Optional[int], optional): The port for the Laminar API gRPC service.
+            Defaults to 8443.
+        instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+            Defaults to None. If None, all available instruments will be used.
     """
 
     evaluation = Evaluation(
-
-
-
-
-        batch_size,
-        project_api_key,
-        base_url,
-        http_port,
+        data=data,
+        executor=executor,
+        evaluators=evaluators,
+        name=name,
+        batch_size=batch_size,
+        project_api_key=project_api_key,
+        base_url=base_url,
+        http_port=http_port,
+        grpc_port=grpc_port,
+        instruments=instruments,
     )
 
     global _evaluation
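One detail worth calling out in the new `_evaluate_datapoint`: the OpenTelemetry trace id, a 128-bit integer on the span context, is converted to a `uuid.UUID` via `uuid.UUID(int=...)` and stored on the result datapoint. A standalone illustration of that conversion (the integer value here is made up):

```python
import uuid

# OpenTelemetry span contexts expose trace_id as a 128-bit integer.
otel_trace_id = 0x1234567890ABCDEF1234567890ABCDEF  # made-up example value

# Same conversion the diff performs before building EvaluationResultDatapoint.
trace_uuid = uuid.UUID(int=otel_trace_id)

print(trace_uuid)       # 12345678-90ab-cdef-1234-567890abcdef
print(str(trace_uuid))  # the string later sent as "traceId" by to_dict()
```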
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/laminar.py

@@ -5,7 +5,6 @@ from opentelemetry.trace import (
     get_current_span,
     SpanKind,
 )
-from opentelemetry.semconv_ai import SpanAttributes
 from opentelemetry.util.types import AttributeValue
 from opentelemetry.context.context import Context
 from opentelemetry.util import types
@@ -26,7 +25,17 @@ import os
 import requests
 import uuid
 
-from lmnr.traceloop_sdk.tracing.
+from lmnr.traceloop_sdk.tracing.attributes import (
+    SESSION_ID,
+    SPAN_INPUT,
+    SPAN_OUTPUT,
+    TRACE_TYPE,
+    USER_ID,
+)
+from lmnr.traceloop_sdk.tracing.tracing import (
+    set_association_properties,
+    update_association_properties,
+)
 
 from .log import VerboseColorfulFormatter
 
@@ -37,6 +46,7 @@ from .types import (
     PipelineRunResponse,
     NodeInput,
     PipelineRunRequest,
+    TraceType,
     UpdateEvaluationResponse,
 )
 
@@ -356,8 +366,8 @@ class Laminar:
         ) as span:
             if input is not None:
                 span.set_attribute(
-
-                    json.dumps(
+                    SPAN_INPUT,
+                    json.dumps(input),
                 )
             yield span
 
@@ -371,9 +381,7 @@ class Laminar:
         """
         span = get_current_span()
         if output is not None and span != INVALID_SPAN:
-            span.set_attribute(
-                SpanAttributes.TRACELOOP_ENTITY_OUTPUT, json.dumps(output)
-            )
+            span.set_attribute(SPAN_OUTPUT, json.dumps(output))
 
     @classmethod
     def set_session(
@@ -396,9 +404,23 @@ class Laminar:
         """
         association_properties = {}
         if session_id is not None:
-            association_properties[
+            association_properties[SESSION_ID] = session_id
         if user_id is not None:
-            association_properties[
+            association_properties[USER_ID] = user_id
+        update_association_properties(association_properties)
+
+    @classmethod
+    def _set_trace_type(
+        cls,
+        trace_type: TraceType,
+    ):
+        """Set the trace_type for the current span and the context
+        Args:
+            trace_type (TraceType): Type of the trace
+        """
+        association_properties = {
+            TRACE_TYPE: trace_type.value,
+        }
         update_association_properties(association_properties)
 
     @classmethod
@@ -410,7 +432,7 @@ class Laminar:
         set_association_properties(props)
 
     @classmethod
-    def create_evaluation(cls, name: str) -> CreateEvaluationResponse:
+    def create_evaluation(cls, name: Optional[str]) -> CreateEvaluationResponse:
         response = requests.post(
             cls.__base_http_url + "/v1/evaluations",
             data=json.dumps({"name": name}),
@@ -430,7 +452,7 @@ class Laminar:
     ) -> requests.Response:
         body = {
             "evaluationId": str(evaluation_id),
-            "points": [datapoint.
+            "points": [datapoint.to_dict() for datapoint in data],
         }
         response = requests.post(
             cls.__base_http_url + "/v1/evaluation-datapoints",
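The `set_session` change above routes `session_id` / `user_id` through the shared attribute constants, and the new private `_set_trace_type` does the same for `TRACE_TYPE`. A rough usage sketch, assuming `set_session` accepts both ids as keyword arguments (its full signature is truncated in this diff):

```python
import os
from lmnr import Laminar as L

L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"])

# Both ids are stored as association properties (SESSION_ID / USER_ID) and
# applied to spans created afterwards in this context.
L.set_session(session_id="session-123", user_id="user-42")

# ... traced application code runs here ...

# _set_trace_type(TraceType.EVALUATION) is the internal counterpart used by
# the evaluation runner; normal application code does not call it.
```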
{lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/types.py

@@ -1,10 +1,11 @@
 import datetime
-import
+from enum import Enum
 import pydantic
-import
+import requests
 from typing import Any, Awaitable, Callable, Literal, Optional, Union
+import uuid
 
-from .utils import
+from .utils import serialize
 
 
 class ChatMessage(pydantic.BaseModel):
@@ -37,7 +38,7 @@ class PipelineRunRequest(pydantic.BaseModel):
     def to_dict(self):
         return {
             "inputs": {
-                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
                 for k, v in self.inputs.items()
             },
             "pipeline": self.pipeline,
@@ -125,5 +126,37 @@ UpdateEvaluationResponse = CreateEvaluationResponse
 class EvaluationResultDatapoint(pydantic.BaseModel):
     data: EvaluationDatapointData
     target: EvaluationDatapointTarget
-
+    executor_output: ExecutorFunctionReturnType
     scores: dict[str, Numeric]
+    trace_id: uuid.UUID
+
+    # uuid is not serializable by default, so we need to convert it to a string
+    def to_dict(self):
+        return {
+            "data": {
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+                for k, v in self.data.items()
+            },
+            "target": {
+                k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+                for k, v in self.target.items()
+            },
+            "executorOutput": serialize(self.executor_output),
+            "scores": self.scores,
+            "traceId": str(self.trace_id),
+        }
+
+
+class SpanType(Enum):
+    DEFAULT = "DEFAULT"
+    LLM = "LLM"
+    PIPELINE = "PIPELINE"  # must not be set manually
+    EXECUTOR = "EXECUTOR"
+    EVALUATOR = "EVALUATOR"
+    EVALUATION = "EVALUATION"
+
+
+class TraceType(Enum):
+    DEFAULT = "DEFAULT"
+    EVENT = "EVENT"  # must not be set manually
+    EVALUATION = "EVALUATION"