lmnr 0.4.12b2__tar.gz → 0.4.12b4__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (53)
  1. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/PKG-INFO +17 -12
  2. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/README.md +17 -10
  3. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/pyproject.toml +2 -3
  4. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/decorators.py +3 -2
  5. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/evaluations.py +131 -74
  6. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/laminar.py +33 -11
  7. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/types.py +38 -5
  8. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/utils.py +4 -5
  9. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/__init__.py +3 -29
  10. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/config/__init__.py +0 -4
  11. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/base.py +16 -9
  12. lmnr-0.4.12b4/src/lmnr/traceloop_sdk/tracing/attributes.py +8 -0
  13. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/tracing.py +31 -142
  14. lmnr-0.4.12b2/src/lmnr/traceloop_sdk/metrics/__init__.py +0 -0
  15. lmnr-0.4.12b2/src/lmnr/traceloop_sdk/metrics/metrics.py +0 -176
  16. lmnr-0.4.12b2/src/lmnr/traceloop_sdk/tracing/manual.py +0 -57
  17. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/LICENSE +0 -0
  18. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/__init__.py +0 -0
  19. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/cli.py +0 -0
  20. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/__init__.py +0 -0
  21. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/sdk/log.py +0 -0
  22. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.flake8 +0 -0
  23. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/.python-version +0 -0
  24. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/decorators/__init__.py +0 -0
  25. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/instruments.py +0 -0
  26. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/__init__.py +0 -0
  27. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_and_external_association_properties.yaml +0 -0
  28. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_association_properties/test_langchain_association_properties.yaml +0 -0
  29. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_manual_report.yaml +0 -0
  30. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_manual/test_resource_attributes.yaml +0 -0
  31. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_privacy_no_prompts/test_simple_workflow.yaml +0 -0
  32. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_prompt_management/test_prompt_management.yaml +0 -0
  33. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_sdk_initialization/test_resource_attributes.yaml +0 -0
  34. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_tasks/test_task_io_serialization_with_langchain.yaml +0 -0
  35. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_aworkflow.yaml +0 -0
  36. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_simple_workflow.yaml +0 -0
  37. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/cassettes/test_workflows/test_streaming_workflow.yaml +0 -0
  38. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/conftest.py +0 -0
  39. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_association_properties.py +0 -0
  40. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_manual.py +0 -0
  41. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_nested_tasks.py +0 -0
  42. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_privacy_no_prompts.py +0 -0
  43. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_sdk_initialization.py +0 -0
  44. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_tasks.py +0 -0
  45. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tests/test_workflows.py +0 -0
  46. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/__init__.py +0 -0
  47. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/content_allow_list.py +0 -0
  48. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/tracing/context_manager.py +0 -0
  49. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/__init__.py +0 -0
  50. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/in_memory_span_exporter.py +0 -0
  51. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/json_encoder.py +0 -0
  52. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/utils/package_check.py +0 -0
  53. {lmnr-0.4.12b2 → lmnr-0.4.12b4}/src/lmnr/traceloop_sdk/version.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lmnr
- Version: 0.4.12b2
+ Version: 0.4.12b4
  Summary: Python SDK for Laminar AI
  License: Apache-2.0
  Author: lmnr.ai
@@ -14,7 +14,6 @@ Classifier: Programming Language :: Python :: 3.12
  Requires-Dist: argparse (>=1.0,<2.0)
  Requires-Dist: asyncio (>=3.0,<4.0)
  Requires-Dist: backoff (>=2.0,<3.0)
- Requires-Dist: colorama (>=0.4,<0.5)
  Requires-Dist: deprecated (>=1.0,<2.0)
  Requires-Dist: jinja2 (>=3.0,<4.0)
  Requires-Dist: opentelemetry-api (>=1.27.0,<2.0.0)
@@ -197,7 +196,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In

  If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.

- Majority of the autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
+ Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).

  ## Sending events

@@ -267,13 +266,14 @@ Evaluation takes in the following parameters:
  - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
  - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
  - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
- - `evaluators` – evaluaton logic. List of functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Each function can produce either a single number or `dict[str, int|float]` of scores.
+ - `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.

  \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.

- ### Example
+ ### Example code

  ```python
+ from lmnr import evaluate
  from openai import AsyncOpenAI
  import asyncio
  import os
@@ -304,20 +304,25 @@ data = [
  ]


- def evaluator_A(output, target):
+ def correctness(output, target):
  return 1 if output == target["capital"] else 0


  # Create an Evaluation instance
- e = Evaluation(
- name="py-evaluation-async",
+ e = evaluate(
+ name="my-evaluation",
  data=data,
  executor=get_capital,
- evaluators=[evaluator_A],
+ evaluators={"correctness": correctness},
  project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
  )
-
- # Run the evaluation
- asyncio.run(e.run())
  ```

+ ### Running from CLI.
+
+ 1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+ 1. Run `lmnr path/to/my/eval.py`
+
+ ### Running from code
+
+ Simply execute the function, e.g. `python3 path/to/my/eval.py`
@@ -137,7 +137,7 @@ L.initialize(project_api_key=os.environ["LMNR_PROJECT_API_KEY"], instruments={In

  If you want to fully disable any kind of autoinstrumentation, pass an empty set as `instruments=set()` to `.initialize()`.

- Majority of the autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).
+ Autoinstrumentations are provided by Traceloop's [OpenLLMetry](https://github.com/traceloop/openllmetry).

  ## Sending events

@@ -207,13 +207,14 @@ Evaluation takes in the following parameters:
  - `name` – the name of your evaluation. If no such evaluation exists in the project, it will be created. Otherwise, data will be pushed to the existing evaluation
  - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible
  - `executor` – the logic you want to evaluate. This function must take `data` as the first argument, and produce any output. *
- - `evaluators` – evaluaton logic. List of functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Each function can produce either a single number or `dict[str, int|float]` of scores.
+ - `evaluators` – evaluaton logic. Functions that take output of executor as the first argument, `target` as the second argument and produce a numeric scores. Pass a dict from evaluator name to a function. Each function can produce either a single number or `dict[str, int|float]` of scores.

  \* If you already have the outputs of executors you want to evaluate, you can specify the executor as an identity function, that takes in `data` and returns only needed value(s) from it.

- ### Example
+ ### Example code

  ```python
+ from lmnr import evaluate
  from openai import AsyncOpenAI
  import asyncio
  import os
@@ -244,19 +245,25 @@ data = [
  ]


- def evaluator_A(output, target):
+ def correctness(output, target):
  return 1 if output == target["capital"] else 0


  # Create an Evaluation instance
- e = Evaluation(
- name="py-evaluation-async",
+ e = evaluate(
+ name="my-evaluation",
  data=data,
  executor=get_capital,
- evaluators=[evaluator_A],
+ evaluators={"correctness": correctness},
  project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
  )
-
- # Run the evaluation
- asyncio.run(e.run())
  ```
+
+ ### Running from CLI.
+
+ 1. Make sure `lmnr` is installed in a venv. CLI does not work with a global env
+ 1. Run `lmnr path/to/my/eval.py`
+
+ ### Running from code
+
+ Simply execute the function, e.g. `python3 path/to/my/eval.py`
@@ -1,6 +1,6 @@
  [project]
  name = "lmnr"
- version = "0.4.12b2"
+ version = "0.4.12b4"
  description = "Python SDK for Laminar AI"
  authors = [
  { name = "lmnr.ai", email = "founders@lmnr.ai" }
@@ -11,7 +11,7 @@ license = "Apache-2.0"

  [tool.poetry]
  name = "lmnr"
- version = "0.4.12b2"
+ version = "0.4.12b4"
  description = "Python SDK for Laminar AI"
  authors = ["lmnr.ai"]
  readme = "README.md"
@@ -33,7 +33,6 @@ opentelemetry-instrumentation-sqlalchemy = "^0.48b0"
  opentelemetry-instrumentation-urllib3 = "^0.48b0"
  opentelemetry-instrumentation-threading = "^0.48b0"
  opentelemetry-semantic-conventions-ai = "0.4.1"
- colorama = "^0.4"
  tenacity = "~=8.0"
  jinja2 = "~=3.0"
  deprecated = "~=1.0"
@@ -6,6 +6,7 @@ from opentelemetry.trace import INVALID_SPAN, get_current_span

  from typing import Callable, Optional, cast

+ from lmnr.traceloop_sdk.tracing.attributes import SESSION_ID, USER_ID
  from lmnr.traceloop_sdk.tracing.tracing import update_association_properties

  from .utils import is_async
@@ -43,11 +44,11 @@ def observe(
  if current_span != INVALID_SPAN:
  if session_id is not None:
  current_span.set_attribute(
- "traceloop.association.properties.session_id", session_id
+ SESSION_ID, session_id
  )
  if user_id is not None:
  current_span.set_attribute(
- "traceloop.association.properties.user_id", user_id
+ USER_ID, user_id
  )
  association_properties = {}
  if session_id is not None:
@@ -2,12 +2,26 @@ import asyncio
  import sys
  from abc import ABC, abstractmethod
  from contextlib import contextmanager
- from typing import Any, Awaitable, Optional, Union
+ from typing import Any, Awaitable, Optional, Set, Union
+ import uuid

  from tqdm import tqdm

+ from ..traceloop_sdk.instruments import Instruments
+ from ..traceloop_sdk.tracing.attributes import SPAN_TYPE
+
  from .laminar import Laminar as L
- from .types import CreateEvaluationResponse, Datapoint, EvaluationResultDatapoint, Numeric, NumericTypes
+ from .types import (
+ CreateEvaluationResponse,
+ Datapoint,
+ EvaluationResultDatapoint,
+ EvaluatorFunction,
+ ExecutorFunction,
+ Numeric,
+ NumericTypes,
+ SpanType,
+ TraceType,
+ )
  from .utils import is_async

  DEFAULT_BATCH_SIZE = 5
@@ -39,7 +53,11 @@ class EvaluationReporter:
  def start(self, name: str, project_id: str, id: str, length: int):
  print(f"Running evaluation {name}...\n")
  print(f"Check progress and results at {get_evaluation_url(project_id, id)}\n")
- self.cli_progress = tqdm(total=length, bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}", ncols=60)
+ self.cli_progress = tqdm(
+ total=length,
+ bar_format="{bar} {percentage:3.0f}% | ETA: {remaining}s | {n_fmt}/{total_fmt}",
+ ncols=60,
+ )

  def update(self, batch_length: int):
  self.cli_progress.update(batch_length)
@@ -51,7 +69,7 @@ class EvaluationReporter:
  def stop(self, average_scores: dict[str, Numeric]):
  self.cli_progress.close()
  print("\nAverage scores:")
- for (name, score) in average_scores.items():
+ for name, score in average_scores.items():
  print(f"{name}: {score}")
  print("\n")

@@ -76,20 +94,21 @@ class EvaluationDataset(ABC):
  class Evaluation:
  def __init__(
  self,
- name: str,
  data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
  executor: Any,
- evaluators: list[Any],
+ evaluators: dict[str, EvaluatorFunction],
+ name: Optional[str] = None,
  batch_size: int = DEFAULT_BATCH_SIZE,
  project_api_key: Optional[str] = None,
  base_url: Optional[str] = None,
  http_port: Optional[int] = None,
+ grpc_port: Optional[int] = None,
+ instruments: Optional[Set[Instruments]] = None,
  ):
  """
  Initializes an instance of the Evaluations class.

  Parameters:
- name (str): The name of the evaluation.
  data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
  `data` is the input to the executor function,
  `target` is the input to the evaluator function.
@@ -104,6 +123,8 @@ class Evaluation:
  evaluator function. If the function is anonymous, it will be
  named `evaluator_${index}`, where index is the index of the
  evaluator function in the list starting from 1.
+ name (Optional[str], optional): The name of the evaluation.
+ It will be auto-generated if not provided.
  batch_size (int, optional): The batch size for evaluation.
  Defaults to DEFAULT_BATCH_SIZE.
  project_api_key (Optional[str], optional): The project API key.
@@ -113,33 +134,18 @@ class Evaluation:
  Defaults to "https://api.lmnr.ai".
  http_port (Optional[int], optional): The port for the Laminar API HTTP service.
  Defaults to 443.
+ instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+ Defaults to None. If None, all available instruments will be used.
  """

  self.is_finished = False
  self.name = name
  self.reporter = EvaluationReporter()
  self.executor = executor
- self.evaluators = dict(
- zip(
- [
- (
- e.__name__
- if e.__name__ and e.__name__ != "<lambda>"
- else f"evaluator_{i+1}"
- )
- for i, e in enumerate(evaluators)
- ],
- evaluators,
- )
- )
- self.evaluator_names = list(self.evaluators.keys())
+ self.evaluators = evaluators
  if isinstance(data, list):
  self.data = [
- (
- Datapoint.model_validate(point)
- if isinstance(point, dict)
- else point
- )
+ (Datapoint.model_validate(point) if isinstance(point, dict) else point)
  for point in data
  ]
  else:
@@ -149,7 +155,8 @@ class Evaluation:
  project_api_key=project_api_key,
  base_url=base_url,
  http_port=http_port,
- instruments=set(),
+ grpc_port=grpc_port,
+ instruments=instruments,
  )

  def run(self) -> Union[None, Awaitable[None]]:
@@ -204,7 +211,7 @@ class Evaluation:
  async def evaluate_in_batches(self, evaluation: CreateEvaluationResponse):
  for i in range(0, len(self.data), self.batch_size):
  batch = (
- self.data[i: i + self.batch_size]
+ self.data[i : i + self.batch_size]
  if isinstance(self.data, list)
  else self.data.slice(i, i + self.batch_size)
  )
@@ -216,71 +223,121 @@ class Evaluation:
  finally:
  self.reporter.update(len(batch))

- async def _evaluate_batch(self, batch: list[Datapoint]) -> list[EvaluationResultDatapoint]:
+ async def _evaluate_batch(
+ self, batch: list[Datapoint]
+ ) -> list[EvaluationResultDatapoint]:
  batch_promises = [self._evaluate_datapoint(datapoint) for datapoint in batch]
  results = await asyncio.gather(*batch_promises)
  return results

- async def _evaluate_datapoint(self, datapoint) -> EvaluationResultDatapoint:
- output = (
- await self.executor(datapoint.data)
- if is_async(self.executor)
- else self.executor(datapoint.data)
- )
- target = datapoint.target
-
- # Iterate over evaluators
- scores: dict[str, Numeric] = {}
- for evaluator_name in self.evaluator_names:
- evaluator = self.evaluators[evaluator_name]
- value = (
- await evaluator(output, target)
- if is_async(evaluator)
- else evaluator(output, target)
+ async def _evaluate_datapoint(
+ self, datapoint: Datapoint
+ ) -> EvaluationResultDatapoint:
+ with L.start_as_current_span("evaluation") as evaluation_span:
+ L._set_trace_type(trace_type=TraceType.EVALUATION)
+ evaluation_span.set_attribute(SPAN_TYPE, SpanType.EVALUATION.value)
+ with L.start_as_current_span(
+ "executor", input={"data": datapoint.data}
+ ) as executor_span:
+ executor_span.set_attribute(SPAN_TYPE, SpanType.EXECUTOR.value)
+ output = (
+ await self.executor(datapoint.data)
+ if is_async(self.executor)
+ else self.executor(datapoint.data)
+ )
+ L.set_span_output(output)
+ target = datapoint.target
+
+ # Iterate over evaluators
+ scores: dict[str, Numeric] = {}
+ for evaluator_name, evaluator in self.evaluators.items():
+ with L.start_as_current_span(
+ "evaluator", input={"output": output, "target": target}
+ ) as evaluator_span:
+ evaluator_span.set_attribute(SPAN_TYPE, SpanType.EVALUATOR.value)
+ value = (
+ await evaluator(output, target)
+ if is_async(evaluator)
+ else evaluator(output, target)
+ )
+ L.set_span_output(value)
+
+ # If evaluator returns a single number, use evaluator name as key
+ if isinstance(value, NumericTypes):
+ scores[evaluator_name] = value
+ else:
+ scores.update(value)
+
+ trace_id = uuid.UUID(int=evaluation_span.get_span_context().trace_id)
+ return EvaluationResultDatapoint(
+ data=datapoint.data,
+ target=target,
+ executor_output=output,
+ scores=scores,
+ trace_id=trace_id,
  )

- # If evaluator returns a single number, use evaluator name as key
- if isinstance(value, NumericTypes):
- scores[evaluator_name] = value
- else:
- scores.update(value)
-
- return EvaluationResultDatapoint(
- data=datapoint.data,
- target=target,
- executorOutput=output,
- scores=scores,
- )
-

  def evaluate(
- name: str,
  data: Union[EvaluationDataset, list[Union[Datapoint, dict]]],
- executor: Any,
- evaluators: list[Any],
+ executor: ExecutorFunction,
+ evaluators: dict[str, EvaluatorFunction],
+ name: Optional[str] = None,
  batch_size: int = DEFAULT_BATCH_SIZE,
  project_api_key: Optional[str] = None,
  base_url: Optional[str] = None,
  http_port: Optional[int] = None,
+ grpc_port: Optional[int] = None,
+ instruments: Optional[Set[Instruments]] = None,
  ) -> Optional[Awaitable[None]]:
  """
- Run evaluation.
-
- If `_set_global_evaluation` is `True`, sets the global evaluation to be run in another part of the program.
-
- Otherwise, if there is no event loop, runs the evaluation in the current thread until completion.
+ If added to the file which is called through lmnr eval command, then simply registers the evaluation.
+ Otherwise, if there is no event loop, creates it and runs the evaluation until completion.
  If there is an event loop, schedules the evaluation as a task in the event loop and returns an awaitable handle.
+
+ Parameters:
+ data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
+ `data` is the input to the executor function,
+ `target` is the input to the evaluator function.
+ executor (Callable[..., Any]): The executor function.
+ Takes the data point + any additional arguments
+ and returns the output to evaluate.
+ evaluators (List[Callable[..., Any]]): List of evaluator functions.
+ Each evaluator function takes the output of the executor _and_
+ the target data, and returns a score. The score can be a
+ single number or a record of string keys and number values.
+ If the score is a single number, it will be named after the
+ evaluator function. If the function is anonymous, it will be
+ named `evaluator_${index}`, where index is the index of the
+ evaluator function in the list starting from 1.
+ name (Optional[str], optional): The name of the evaluation.
+ It will be auto-generated if not provided.
+ batch_size (int, optional): The batch size for evaluation.
+ Defaults to DEFAULT_BATCH_SIZE.
+ project_api_key (Optional[str], optional): The project API key.
+ Defaults to an empty string.
+ base_url (Optional[str], optional): The base URL for the Laminar API.
+ Useful if self-hosted elsewhere.
+ Defaults to "https://api.lmnr.ai".
+ http_port (Optional[int], optional): The port for the Laminar API HTTP service.
+ Defaults to 443.
+ grpc_port (Optional[int], optional): The port for the Laminar API gRPC service.
+ Defaults to 8443.
+ instruments (Optional[Set[Instruments]], optional): Set of modules to auto-instrument.
+ Defaults to None. If None, all available instruments will be used.
  """

  evaluation = Evaluation(
- name,
- data,
- executor,
- evaluators,
- batch_size,
- project_api_key,
- base_url,
- http_port,
+ data=data,
+ executor=executor,
+ evaluators=evaluators,
+ name=name,
+ batch_size=batch_size,
+ project_api_key=project_api_key,
+ base_url=base_url,
+ http_port=http_port,
+ grpc_port=grpc_port,
+ instruments=instruments,
  )

  global _evaluation
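Taken together, the `evaluations.py` changes make `name` optional, turn `evaluators` into a dict from evaluator name to function, and wrap the executor and each evaluator in their own spans. A short usage sketch against the new `evaluate()` signature; the dataset and scoring logic below are made up for illustration:

```python
from lmnr import evaluate

data = [
    {"data": {"country": "Germany"}, "target": {"capital": "Berlin"}},
    {"data": {"country": "Japan"}, "target": {"capital": "Tokyo"}},
]


def lookup_capital(data):
    # Placeholder executor; a real one would call an LLM or a Laminar pipeline.
    return {"Germany": "Berlin", "Japan": "Tokyo"}[data["country"]]


def exact_match(output, target):
    # Evaluators receive the executor output and the datapoint's target.
    return 1 if output == target["capital"] else 0


# `name` may be omitted and is auto-generated; each score is reported under its
# dict key ("exact_match" here). Run the file with `python3` directly, or register
# it through the `lmnr` CLI as described in the README changes above.
evaluate(
    data=data,
    executor=lookup_capital,
    evaluators={"exact_match": exact_match},
    project_api_key="<LMNR_PROJECT_API_KEY>",  # placeholder
)
```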
@@ -5,7 +5,6 @@ from opentelemetry.trace import (
  get_current_span,
  SpanKind,
  )
- from opentelemetry.semconv_ai import SpanAttributes
  from opentelemetry.util.types import AttributeValue
  from opentelemetry.context.context import Context
  from opentelemetry.util import types
@@ -26,7 +25,17 @@ import os
  import requests
  import uuid

- from lmnr.traceloop_sdk.tracing.tracing import set_association_properties, update_association_properties
+ from lmnr.traceloop_sdk.tracing.attributes import (
+ SESSION_ID,
+ SPAN_INPUT,
+ SPAN_OUTPUT,
+ TRACE_TYPE,
+ USER_ID,
+ )
+ from lmnr.traceloop_sdk.tracing.tracing import (
+ set_association_properties,
+ update_association_properties,
+ )

  from .log import VerboseColorfulFormatter

@@ -37,6 +46,7 @@ from .types import (
  PipelineRunResponse,
  NodeInput,
  PipelineRunRequest,
+ TraceType,
  UpdateEvaluationResponse,
  )

@@ -356,8 +366,8 @@ class Laminar:
  ) as span:
  if input is not None:
  span.set_attribute(
- SpanAttributes.TRACELOOP_ENTITY_INPUT,
- json.dumps({"input": input}),
+ SPAN_INPUT,
+ json.dumps(input),
  )
  yield span

@@ -371,9 +381,7 @@ class Laminar:
  """
  span = get_current_span()
  if output is not None and span != INVALID_SPAN:
- span.set_attribute(
- SpanAttributes.TRACELOOP_ENTITY_OUTPUT, json.dumps(output)
- )
+ span.set_attribute(SPAN_OUTPUT, json.dumps(output))

  @classmethod
  def set_session(
@@ -396,9 +404,23 @@ class Laminar:
  """
  association_properties = {}
  if session_id is not None:
- association_properties["session_id"] = session_id
+ association_properties[SESSION_ID] = session_id
  if user_id is not None:
- association_properties["user_id"] = user_id
+ association_properties[USER_ID] = user_id
+ update_association_properties(association_properties)
+
+ @classmethod
+ def _set_trace_type(
+ cls,
+ trace_type: TraceType,
+ ):
+ """Set the trace_type for the current span and the context
+ Args:
+ trace_type (TraceType): Type of the trace
+ """
+ association_properties = {
+ TRACE_TYPE: trace_type.value,
+ }
  update_association_properties(association_properties)

  @classmethod
@@ -410,7 +432,7 @@ class Laminar:
  set_association_properties(props)

  @classmethod
- def create_evaluation(cls, name: str) -> CreateEvaluationResponse:
+ def create_evaluation(cls, name: Optional[str]) -> CreateEvaluationResponse:
  response = requests.post(
  cls.__base_http_url + "/v1/evaluations",
  data=json.dumps({"name": name}),
@@ -430,7 +452,7 @@ class Laminar:
  ) -> requests.Response:
  body = {
  "evaluationId": str(evaluation_id),
- "points": [datapoint.model_dump() for datapoint in data],
+ "points": [datapoint.to_dict() for datapoint in data],
  }
  response = requests.post(
  cls.__base_http_url + "/v1/evaluation-datapoints",
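On the `laminar.py` side, span input/output and the session association properties now go through the shared attribute constants, and a private `_set_trace_type` helper lets the evaluator mark whole traces as evaluations. A sketch of how the public pieces fit together from application code, assuming the top-level `Laminar` export the README already uses; the span name and values are illustrative:

```python
from lmnr import Laminar as L

L.initialize(project_api_key="<LMNR_PROJECT_API_KEY>")  # placeholder key

with L.start_as_current_span("chat-turn", input={"question": "Capital of France?"}) as span:
    # Both ids end up as association properties keyed by the new SESSION_ID / USER_ID constants.
    L.set_session(session_id="session-123", user_id="user-456")
    answer = "Paris"  # placeholder for real model output
    # Recorded under SPAN_OUTPUT instead of SpanAttributes.TRACELOOP_ENTITY_OUTPUT.
    L.set_span_output(answer)
```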
@@ -1,10 +1,11 @@
  import datetime
- import requests
+ from enum import Enum
  import pydantic
- import uuid
+ import requests
  from typing import Any, Awaitable, Callable, Literal, Optional, Union
+ import uuid

- from .utils import to_dict
+ from .utils import serialize


  class ChatMessage(pydantic.BaseModel):
@@ -37,7 +38,7 @@ class PipelineRunRequest(pydantic.BaseModel):
  def to_dict(self):
  return {
  "inputs": {
- k: v.model_dump() if isinstance(v, pydantic.BaseModel) else to_dict(v)
+ k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
  for k, v in self.inputs.items()
  },
  "pipeline": self.pipeline,
@@ -125,5 +126,37 @@ UpdateEvaluationResponse = CreateEvaluationResponse
  class EvaluationResultDatapoint(pydantic.BaseModel):
  data: EvaluationDatapointData
  target: EvaluationDatapointTarget
- executorOutput: ExecutorFunctionReturnType
+ executor_output: ExecutorFunctionReturnType
  scores: dict[str, Numeric]
+ trace_id: uuid.UUID
+
+ # uuid is not serializable by default, so we need to convert it to a string
+ def to_dict(self):
+ return {
+ "data": {
+ k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+ for k, v in self.data.items()
+ },
+ "target": {
+ k: v.model_dump() if isinstance(v, pydantic.BaseModel) else serialize(v)
+ for k, v in self.target.items()
+ },
+ "executorOutput": serialize(self.executor_output),
+ "scores": self.scores,
+ "traceId": str(self.trace_id),
+ }
+
+
+ class SpanType(Enum):
+ DEFAULT = "DEFAULT"
+ LLM = "LLM"
+ PIPELINE = "PIPELINE" # must not be set manually
+ EXECUTOR = "EXECUTOR"
+ EVALUATOR = "EVALUATOR"
+ EVALUATION = "EVALUATION"
+
+
+ class TraceType(Enum):
+ DEFAULT = "DEFAULT"
+ EVENT = "EVENT" # must not be set manually
+ EVALUATION = "EVALUATION"
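The new `trace_id: uuid.UUID` field is also why `EvaluationResultDatapoint` gains a hand-written `to_dict()` and why `laminar.py` stops calling `model_dump()` when posting datapoints: `json.dumps` cannot encode a raw `UUID`. A self-contained illustration of the failure mode; the `Point` model is hypothetical, not part of the SDK:

```python
import json
import uuid

import pydantic


class Point(pydantic.BaseModel):
    # Stand-in for EvaluationResultDatapoint's trace_id field.
    trace_id: uuid.UUID


p = Point(trace_id=uuid.uuid4())

try:
    # model_dump() keeps the UUID object, so json.dumps raises TypeError.
    json.dumps(p.model_dump())
except TypeError as err:
    print("model_dump() is not JSON-safe here:", err)

# The to_dict() approach stringifies the UUID before serialization.
print(json.dumps({"traceId": str(p.trace_id)}))
```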