lmnr 0.4.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: lmnr
- Version: 0.4.0
+ Version: 0.4.1
  Summary: Python SDK for Laminar AI
  License: Apache-2.0
  Author: lmnr.ai
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: asyncio (>=3.4.3,<4.0.0)
  Requires-Dist: backoff (>=2.2.1,<3.0.0)
  Requires-Dist: pydantic (>=2.7.4,<3.0.0)
  Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
@@ -149,3 +150,65 @@ PipelineRunResponse(
  )
  ```

+ ## Running offline evaluations on your data
+
+ You can evaluate your code with your own data and send it to Laminar using the `Evaluation` class.
+
+ `Evaluation` takes the following parameters:
+ - `name` – the name of your evaluation. If no evaluation with this name exists in the project, it will be created; otherwise, data will be pushed to the existing evaluation.
+ - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible.
+ - `executor` – the logic you want to evaluate. This function must take `data` as the first argument and can produce any output. *
+ - `evaluators` – evaluation logic. A list of functions that take the executor output as the first argument and `target` as the second argument, and produce numeric scores. Each function can return either a single number or a `dict[str, int|float]` of scores.
+
+ \* If you already have the outputs you want to evaluate, you can specify the executor as an identity function that takes in `data` and returns only the needed value(s) from it.
+
+ ### Example
+
+ ```python
+ from openai import AsyncOpenAI
+ import asyncio
+ import os
+
+ from lmnr import Evaluation
+
+ openai_client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
+
+ async def get_capital(data):
+     country = data["country"]
+     response = await openai_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {
+                 "role": "user",
+                 "content": f"What is the capital of {country}? Just name the "
+                 "city and nothing else",
+             },
+         ],
+     )
+     return response.choices[0].message.content.strip()
+
+
+ # Evaluation data
+ data = [
+     {"data": {"country": "Canada"}, "target": {"capital": "Ottawa"}},
+     {"data": {"country": "Germany"}, "target": {"capital": "Berlin"}},
+     {"data": {"country": "Tanzania"}, "target": {"capital": "Dodoma"}},
+ ]
+
+
+ def evaluator_A(output, target):
+     return 1 if output == target["capital"] else 0
+
+
+ # Create an Evaluation instance
+ e = Evaluation(
+     name="py-evaluation-async",
+     data=data,
+     executor=get_capital,
+     evaluators=[evaluator_A],
+     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
+ )
+
+ # Run the evaluation
+ asyncio.run(e.run())
+ ```
+
@@ -128,3 +128,65 @@ PipelineRunResponse(
  run_id='53b012d5-5759-48a6-a9c5-0011610e3669'
  )
  ```
+
+ ## Running offline evaluations on your data
+
+ You can evaluate your code with your own data and send it to Laminar using the `Evaluation` class.
+
+ `Evaluation` takes the following parameters:
+ - `name` – the name of your evaluation. If no evaluation with this name exists in the project, it will be created; otherwise, data will be pushed to the existing evaluation.
+ - `data` – an array of `EvaluationDatapoint` objects, where each `EvaluationDatapoint` has two keys: `target` and `data`, each containing a key-value object. Alternatively, you can pass in dictionaries, and we will instantiate `EvaluationDatapoint`s with pydantic if possible.
+ - `executor` – the logic you want to evaluate. This function must take `data` as the first argument and can produce any output. *
+ - `evaluators` – evaluation logic. A list of functions that take the executor output as the first argument and `target` as the second argument, and produce numeric scores. Each function can return either a single number or a `dict[str, int|float]` of scores.
+
+ \* If you already have the outputs you want to evaluate, you can specify the executor as an identity function that takes in `data` and returns only the needed value(s) from it.
+
+ ### Example
+
+ ```python
+ from openai import AsyncOpenAI
+ import asyncio
+ import os
+
+ from lmnr import Evaluation
+
+ openai_client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
+
+ async def get_capital(data):
+     country = data["country"]
+     response = await openai_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant."},
+             {
+                 "role": "user",
+                 "content": f"What is the capital of {country}? Just name the "
+                 "city and nothing else",
+             },
+         ],
+     )
+     return response.choices[0].message.content.strip()
+
+
+ # Evaluation data
+ data = [
+     {"data": {"country": "Canada"}, "target": {"capital": "Ottawa"}},
+     {"data": {"country": "Germany"}, "target": {"capital": "Berlin"}},
+     {"data": {"country": "Tanzania"}, "target": {"capital": "Dodoma"}},
+ ]
+
+
+ def evaluator_A(output, target):
+     return 1 if output == target["capital"] else 0
+
+
+ # Create an Evaluation instance
+ e = Evaluation(
+     name="py-evaluation-async",
+     data=data,
+     executor=get_capital,
+     evaluators=[evaluator_A],
+     project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
+ )
+
+ # Run the evaluation
+ asyncio.run(e.run())
+ ```
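The `evaluators` description above notes that an evaluator may return a `dict[str, int|float]` of named scores, and the footnote suggests an identity executor when outputs are already computed; neither appears in the packaged example. A minimal illustrative sketch (not part of the released README; the function and key names here are assumptions):

```python
# Hypothetical datapoints whose outputs were computed ahead of time.
precomputed = [
    {
        "data": {"country": "Canada", "precomputed_answer": "Ottawa"},
        "target": {"capital": "Ottawa"},
    },
]


def identity_executor(data):
    # Identity-style executor: just return the precomputed value from `data`.
    return data["precomputed_answer"]


def correctness_and_length(output, target):
    # Multi-criteria evaluator returning a dict[str, int | float] of named scores.
    return {
        "exact_match": 1 if output == target["capital"] else 0,
        "output_length": len(output),
    }
```

Scores returned as a dict are merged by key into the datapoint's scores, which is exactly what `_evaluate_batch` in `lmnr/sdk/evaluations.py` does further down in this diff.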
@@ -1,6 +1,6 @@
  [project]
  name = "lmnr"
- version = "0.4.0"
+ version = "0.4.1"
  description = "Python SDK for Laminar AI"
  authors = [
  { name = "lmnr.ai", email = "founders@lmnr.ai" }
@@ -11,7 +11,7 @@ license = "Apache-2.0"

  [tool.poetry]
  name = "lmnr"
- version = "0.4.0"
+ version = "0.4.1"
  description = "Python SDK for Laminar AI"
  authors = ["lmnr.ai"]
  readme = "README.md"
@@ -24,6 +24,7 @@ requests = "^2.32.3"
  python-dotenv = "^1.0.1"
  backoff = "^2.2.1"
  traceloop-sdk = "^0.29.2"
+ asyncio = "^3.4.3"

  [tool.poetry.group.dev.dependencies]
  black = "^24.8.0"
@@ -1,3 +1,4 @@
+ from .sdk.evaluations import Evaluation
  from .sdk.laminar import Laminar
  from .sdk.types import ChatMessage, PipelineRunError, PipelineRunResponse, NodeInput
  from .sdk.decorators import observe
@@ -0,0 +1,163 @@
+ from typing import Union
+
+ from .utils import is_async
+ from .types import EvaluatorFunction, ExecutorFunction, EvaluationDatapoint, Numeric
+ from .laminar import Laminar as L
+ import asyncio
+
+ from abc import ABC, abstractmethod
+
+ DEFAULT_BATCH_SIZE = 5
+
+
+ class EvaluationDataset(ABC):
+     @abstractmethod
+     def __init__(self, *args, **kwargs):
+         pass
+
+     @abstractmethod
+     def __len__(self) -> int:
+         pass
+
+     @abstractmethod
+     def __getitem__(self, idx) -> EvaluationDatapoint:
+         pass
+
+     def slice(self, start: int, end: int):
+         return [self[i] for i in range(max(start, 0), min(end, len(self)))]
+
+
+ class Evaluation:
+     def __init__(
+         self,
+         name,
+         data: Union[EvaluationDataset, list[Union[EvaluationDatapoint, dict]]],
+         executor: ExecutorFunction,
+         evaluators: list[EvaluatorFunction],
+         batch_size: int = DEFAULT_BATCH_SIZE,
+         project_api_key: str = "",
+         base_url: str = "https://api.lmnr.ai",
+     ):
+         """
+         Initializes an instance of the Evaluations class.
+         Parameters:
+             name (str): The name of the evaluation.
+             data (Union[List[Union[EvaluationDatapoint, dict]], EvaluationDataset]): List of data points to evaluate or an evaluation dataset.
+                 `data` is the input to the executor function,
+                 `target` is the input to the evaluator function.
+             executor (Callable[..., Any]): The executor function.
+                 Takes the data point + any additional arguments
+                 and returns the output to evaluate.
+             evaluators (List[Callable[..., Any]]): List of evaluator functions.
+                 Each evaluator function takes the output of the executor _and_
+                 the target data, and returns a score. The score can be a
+                 single number or a record of string keys and number values.
+                 If the score is a single number, it will be named after the
+                 evaluator function. If the function is anonymous, it will be
+                 named `evaluator_${index}`, where index is the index of the
+                 evaluator function in the list starting from 1.
+             batch_size (int, optional): The batch size for evaluation.
+                 Defaults to DEFAULT_BATCH_SIZE.
+             project_api_key (str, optional): The project API key.
+                 Defaults to an empty string.
+             base_url (str, optional): The base URL for the LMNR API.
+                 Useful if self-hosted elsewhere.
+                 Defaults to "https://api.lmnr.ai".
+         """
+
+         self.name = name
+         self.executor = executor
+         self.evaluators = dict(
+             zip(
+                 [
+                     (
+                         e.__name__
+                         if e.__name__ and e.__name__ != "<lambda>"
+                         else f"evaluator_{i+1}"
+                     )
+                     for i, e in enumerate(evaluators)
+                 ],
+                 evaluators,
+             )
+         )
+         self.evaluator_names = list(self.evaluators.keys())
+         if isinstance(data, list):
+             self.data = [
+                 (
+                     EvaluationDatapoint.model_validate(point)
+                     if isinstance(point, dict)
+                     else point
+                 )
+                 for point in data
+             ]
+         else:
+             self.data = data
+         self.batch_size = batch_size
+         L.initialize(project_api_key=project_api_key, base_url=base_url)
+
+     async def run(self):
+         """Runs the evaluation.
+
+         Creates a new evaluation if no evaluation with such name exists, or
+         adds data to an existing one otherwise. Evaluates data points in
+         batches of `self.batch_size`. The executor
+         function is called on each data point to get the output,
+         and then evaluate it by each evaluator function.
+         """
+         response = L.create_evaluation(self.name)
+         batch_promises = []
+
+         for i in range(0, len(self.data), self.batch_size):
+             batch = (
+                 self.data[i : i + self.batch_size]
+                 if isinstance(self.data, list)
+                 else self.data.slice(i, i + self.batch_size)
+             )
+             batch_promises.append(self._evaluate_batch(batch))
+
+         try:
+             await asyncio.gather(*batch_promises)
+             L.update_evaluation_status(response.name, "Finished")
+             print(f"Evaluation {response.id} complete")
+         except Exception as e:
+             print(f"Error evaluating batch: {e}")
+
+     async def _evaluate_batch(self, batch: list[EvaluationDatapoint]):
+         results = []
+         for datapoint in batch:
+             output = (
+                 await self.executor(datapoint.data)
+                 if is_async(self.executor)
+                 else self.executor(datapoint.data)
+             )
+             target = datapoint.target
+
+             # iterate in order of evaluators
+             scores = {}
+             for evaluator_name in self.evaluator_names:
+                 evaluator = self.evaluators[evaluator_name]
+                 value = (
+                     await evaluator(output, target)
+                     if is_async(evaluator)
+                     else evaluator(output, target)
+                 )
+
+                 # if the evaluator returns a single number,
+                 # use the evaluator name as the key
+                 if isinstance(value, Numeric):
+                     scores[evaluator_name] = value
+                 else:
+                     # if the evaluator returns an object,
+                     # use the object keys as the keys
+                     scores.update(value)
+
+             results.append(
+                 {
+                     "executorOutput": output,
+                     "data": datapoint.data,
+                     "target": target,
+                     "scores": scores,
+                 }
+             )
+
+         return L.post_evaluation_results(self.name, results)
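`Evaluation` accepts either a plain list or an `EvaluationDataset`, and the abstract class above only requires `__len__` and `__getitem__` (plus the inherited `slice`). A small hedged sketch of a custom dataset; `InMemoryDataset` and its `rows` field are illustrative names, not part of the SDK:

```python
from lmnr.sdk.evaluations import EvaluationDataset
from lmnr.sdk.types import EvaluationDatapoint


class InMemoryDataset(EvaluationDataset):
    """Hypothetical dataset wrapping a list of raw dicts."""

    def __init__(self, rows: list[dict]):
        self.rows = rows

    def __len__(self) -> int:
        return len(self.rows)

    def __getitem__(self, idx) -> EvaluationDatapoint:
        # Convert one raw row into the EvaluationDatapoint model on access.
        row = self.rows[idx]
        return EvaluationDatapoint(data=row["data"], target=row["target"])
```

With a dataset like this, `Evaluation.run` batches through `self.data.slice(i, i + self.batch_size)` instead of list slicing, so the rest of the flow is unchanged.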
@@ -25,6 +25,8 @@ import uuid
  from .log import VerboseColorfulFormatter

  from .types import (
+     CreateEvaluationResponse,
+     EvaluationResultDatapoint,
      PipelineRunError,
      PipelineRunResponse,
      NodeInput,
@@ -372,6 +374,71 @@ class Laminar:
          props.pop("user_id", None)
          Traceloop.set_association_properties(props)

+     @classmethod
+     def create_evaluation(cls, name: str) -> CreateEvaluationResponse:
+         response = requests.post(
+             cls.__base_url + "/v1/evaluations",
+             data=json.dumps({"name": name}),
+             headers=cls._headers(),
+         )
+         if response.status_code != 200:
+             try:
+                 resp_json = response.json()
+                 raise ValueError(f"Error creating evaluation {json.dumps(resp_json)}")
+             except Exception:
+                 raise ValueError(f"Error creating evaluation {response.text}")
+         return CreateEvaluationResponse.model_validate(response.json())
+
+     @classmethod
+     def post_evaluation_results(
+         cls, evaluation_name: str, data: list[EvaluationResultDatapoint]
+     ) -> requests.Response:
+         body = {
+             "name": evaluation_name,
+             "points": data,
+         }
+         response = requests.post(
+             cls.__base_url + "/v1/evaluation-datapoints",
+             data=json.dumps(body),
+             headers=cls._headers(),
+         )
+         if response.status_code != 200:
+             try:
+                 resp_json = response.json()
+                 raise ValueError(
+                     f"Failed to send evaluation results. Response: {json.dumps(resp_json)}"
+                 )
+             except Exception:
+                 raise ValueError(
+                     f"Failed to send evaluation results. Error: {response.text}"
+                 )
+         return response
+
+     @classmethod
+     def update_evaluation_status(
+         cls, evaluation_name: str, status: str
+     ) -> requests.Response:
+         body = {
+             "name": evaluation_name,
+             "status": status,
+         }
+         response = requests.put(
+             cls.__base_url + "/v1/evaluations/",
+             data=json.dumps(body),
+             headers=cls._headers(),
+         )
+         if response.status_code != 200:
+             try:
+                 resp_json = response.json()
+                 raise ValueError(
+                     f"Failed to send evaluation status. Response: {json.dumps(resp_json)}"
+                 )
+             except Exception:
+                 raise ValueError(
+                     f"Failed to send evaluation status. Error: {response.text}"
+                 )
+         return response
+
      @classmethod
      def _headers(cls):
          return {
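These are the three endpoints that `Evaluation.run` drives in sequence: create the evaluation, post datapoint results, then mark the status. A rough sketch of calling them directly; this bypasses the `Evaluation` helper, the evaluation name and values are made up, and the datapoint dict simply mirrors the shape `_evaluate_batch` posts, camelCase `executorOutput` included:

```python
import os

from lmnr import Laminar

# The Evaluation constructor calls Laminar.initialize() the same way.
Laminar.initialize(
    project_api_key=os.environ["LMNR_PROJECT_API_KEY"],
    base_url="https://api.lmnr.ai",
)

evaluation = Laminar.create_evaluation("manual-eval")

Laminar.post_evaluation_results(
    evaluation.name,
    [
        {
            "data": {"country": "Canada"},
            "target": {"capital": "Ottawa"},
            "executorOutput": "Ottawa",
            "scores": {"evaluator_A": 1},
        }
    ],
)

Laminar.update_evaluation_status(evaluation.name, "Finished")
```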
@@ -0,0 +1,123 @@
+ import datetime
+ import requests
+ import pydantic
+ import uuid
+ from typing import Any, Awaitable, Callable, Literal, Optional, TypeAlias, Union
+
+ from .utils import to_dict
+
+
+ class ChatMessage(pydantic.BaseModel):
+     role: str
+     content: str
+
+
+ class ConditionedValue(pydantic.BaseModel):
+     condition: str
+     value: "NodeInput"
+
+
+ Numeric: TypeAlias = Union[int, float]
+ NodeInput: TypeAlias = Union[str, list[ChatMessage], ConditionedValue, Numeric, bool]
+ PipelineOutput: TypeAlias = Union[NodeInput]
+
+
+ class PipelineRunRequest(pydantic.BaseModel):
+     inputs: dict[str, NodeInput]
+     pipeline: str
+     env: dict[str, str] = pydantic.Field(default_factory=dict)
+     metadata: dict[str, str] = pydantic.Field(default_factory=dict)
+     stream: bool = pydantic.Field(default=False)
+     parent_span_id: Optional[uuid.UUID] = pydantic.Field(default=None)
+     trace_id: Optional[uuid.UUID] = pydantic.Field(default=None)
+
+     # uuid is not serializable by default, so we need to convert it to a string
+     def to_dict(self):
+         return {
+             "inputs": {
+                 k: v.model_dump() if isinstance(v, pydantic.BaseModel) else to_dict(v)
+                 for k, v in self.inputs.items()
+             },
+             "pipeline": self.pipeline,
+             "env": self.env,
+             "metadata": self.metadata,
+             "stream": self.stream,
+             "parentSpanId": str(self.parent_span_id) if self.parent_span_id else None,
+             "traceId": str(self.trace_id) if self.trace_id else None,
+         }
+
+
+ class PipelineRunResponse(pydantic.BaseModel):
+     outputs: dict[str, dict[str, PipelineOutput]]
+     run_id: str
+
+
+ class PipelineRunError(Exception):
+     error_code: str
+     error_message: str
+
+     def __init__(self, response: requests.Response):
+         try:
+             resp_json = response.json()
+             self.error_code = resp_json["error_code"]
+             self.error_message = resp_json["error_message"]
+             super().__init__(self.error_message)
+         except Exception:
+             super().__init__(response.text)
+
+     def __str__(self) -> str:
+         try:
+             return str(
+                 {"error_code": self.error_code, "error_message": self.error_message}
+             )
+         except Exception:
+             return super().__str__()
+
+
+ EvaluationDatapointData: TypeAlias = dict[str, Any]
+ EvaluationDatapointTarget: TypeAlias = dict[str, Any]
+
+
+ # EvaluationDatapoint is a single data point in the evaluation
+ class EvaluationDatapoint(pydantic.BaseModel):
+     # input to the executor function. Must be a dict with string keys
+     data: EvaluationDatapointData
+     # input to the evaluator function (alongside the executor output).
+     # Must be a dict with string keys
+     target: EvaluationDatapointTarget
+
+
+ ExecutorFunctionReturnType: TypeAlias = Any
+ EvaluatorFunctionReturnType: TypeAlias = Union[Numeric, dict[str, Numeric]]
+
+ ExecutorFunction: TypeAlias = Callable[
+     [EvaluationDatapointData, *tuple[Any, ...], dict[str, Any]],
+     Union[ExecutorFunctionReturnType, Awaitable[ExecutorFunctionReturnType]],
+ ]
+
+ # EvaluatorFunction is a function that takes the output of the executor and the
+ # target data, and returns a score. The score can be a single number or a
+ # record of string keys and number values. The latter is useful for evaluating
+ # multiple criteria in one go instead of running multiple evaluators.
+ EvaluatorFunction: TypeAlias = Callable[
+     [ExecutorFunctionReturnType, *tuple[Any, ...], dict[str, Any]],
+     Union[EvaluatorFunctionReturnType, Awaitable[EvaluatorFunctionReturnType]],
+ ]
+
+ EvaluationStatus: TypeAlias = Literal["Started", "Finished", "Error"]
+
+
+ class CreateEvaluationResponse(pydantic.BaseModel):
+     id: uuid.UUID
+     createdAt: datetime.datetime
+     name: str
+     status: EvaluationStatus
+     projectId: uuid.UUID
+     metadata: Optional[dict[str, Any]] = None
+
+
+ class EvaluationResultDatapoint(pydantic.BaseModel):
+     data: EvaluationDatapointData
+     target: EvaluationDatapointTarget
+     executor_output: ExecutorFunctionReturnType
+     scores: dict[str, Numeric]
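The README states that plain dicts are instantiated into `EvaluationDatapoint`s with pydantic; a short sketch of that coercion using the model above (the values are arbitrary):

```python
from lmnr.sdk.types import EvaluationDatapoint

# Mirrors Evaluation.__init__, which calls model_validate on each dict datapoint.
point = EvaluationDatapoint.model_validate(
    {"data": {"country": "Canada"}, "target": {"capital": "Ottawa"}}
)

assert point.data["country"] == "Canada"
assert point.target["capital"] == "Ottawa"
```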
@@ -1,71 +0,0 @@
- import requests
- import pydantic
- import uuid
- from typing import Optional, Union
-
- from .utils import to_dict
-
-
- class ChatMessage(pydantic.BaseModel):
-     role: str
-     content: str
-
-
- class ConditionedValue(pydantic.BaseModel):
-     condition: str
-     value: "NodeInput"
-
-
- NodeInput = Union[str, list[ChatMessage], ConditionedValue] # TypeAlias
-
-
- class PipelineRunRequest(pydantic.BaseModel):
-     inputs: dict[str, NodeInput]
-     pipeline: str
-     env: dict[str, str] = pydantic.Field(default_factory=dict)
-     metadata: dict[str, str] = pydantic.Field(default_factory=dict)
-     stream: bool = pydantic.Field(default=False)
-     parent_span_id: Optional[uuid.UUID] = pydantic.Field(default=None)
-     trace_id: Optional[uuid.UUID] = pydantic.Field(default=None)
-
-     # uuid is not serializable by default, so we need to convert it to a string
-     def to_dict(self):
-         return {
-             "inputs": {
-                 k: v.model_dump() if isinstance(v, pydantic.BaseModel) else to_dict(v)
-                 for k, v in self.inputs.items()
-             },
-             "pipeline": self.pipeline,
-             "env": self.env,
-             "metadata": self.metadata,
-             "stream": self.stream,
-             "parentSpanId": str(self.parent_span_id) if self.parent_span_id else None,
-             "traceId": str(self.trace_id) if self.trace_id else None,
-         }
-
-
- class PipelineRunResponse(pydantic.BaseModel):
-     outputs: dict[str, dict[str, NodeInput]]
-     run_id: str
-
-
- class PipelineRunError(Exception):
-     error_code: str
-     error_message: str
-
-     def __init__(self, response: requests.Response):
-         try:
-             resp_json = response.json()
-             self.error_code = resp_json["error_code"]
-             self.error_message = resp_json["error_message"]
-             super().__init__(self.error_message)
-         except Exception:
-             super().__init__(response.text)
-
-     def __str__(self) -> str:
-         try:
-             return str(
-                 {"error_code": self.error_code, "error_message": self.error_message}
-             )
-         except Exception:
-             return super().__str__()
5 files without changes