langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/dspy/__init__.py +4 -32
- langwatch/evaluation/__init__.py +535 -7
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/experiment/experiment.py +912 -0
- langwatch/experiment/platform_run.py +435 -0
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/RECORD +11 -9
- langwatch/evaluation/evaluation.py +0 -484
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/WHEEL +0 -0
|
@@ -1,484 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
import asyncio
|
|
3
|
-
from contextlib import contextmanager
|
|
4
|
-
import json
|
|
5
|
-
import threading
|
|
6
|
-
import time
|
|
7
|
-
import traceback
|
|
8
|
-
import httpx
|
|
9
|
-
import pandas as pd
|
|
10
|
-
from opentelemetry import trace
|
|
11
|
-
from opentelemetry.trace import Span
|
|
12
|
-
from pydantic import BaseModel, Field
|
|
13
|
-
from typing import (
|
|
14
|
-
Any,
|
|
15
|
-
Callable,
|
|
16
|
-
Dict,
|
|
17
|
-
Hashable,
|
|
18
|
-
Iterable,
|
|
19
|
-
Iterator,
|
|
20
|
-
List,
|
|
21
|
-
Literal,
|
|
22
|
-
Optional,
|
|
23
|
-
TypeVar,
|
|
24
|
-
TypedDict,
|
|
25
|
-
Sized,
|
|
26
|
-
Union,
|
|
27
|
-
cast,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
31
|
-
from tqdm.auto import tqdm
|
|
32
|
-
|
|
33
|
-
import langwatch
|
|
34
|
-
from langwatch.attributes import AttributeKey
|
|
35
|
-
from langwatch.domain import Money, TypedValueJson
|
|
36
|
-
from langwatch.telemetry.tracing import LangWatchTrace
|
|
37
|
-
from langwatch.utils.exceptions import better_raise_for_status
|
|
38
|
-
from langwatch.utils.transformation import SerializableWithStringFallback
|
|
39
|
-
|
|
40
|
-
from coolname import generate_slug # type: ignore
|
|
41
|
-
import urllib.parse
|
|
42
|
-
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
|
|
43
|
-
|
|
44
|
-
# Module-level OpenTelemetry tracer for spans created in this file.
_tracer = trace.get_tracer(__name__)

# Type of the items yielded back by Evaluation.loop().
ItemT = TypeVar("ItemT")
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class EvaluationResult(BaseModel):
    """A single evaluator result for one dataset entry.

    One instance maps to one row of the ``evaluations`` list sent to the
    batch ``log_results`` endpoint (the API still calls ``data`` ``inputs``
    there — see ``Evaluation._send_batch``).
    """

    name: str
    evaluator: str
    trace_id: str
    status: Literal["processed", "error", "skipped"]
    data: Optional[Dict[str, Any]] = None
    # Fix: description used to be the "No description provided" placeholder.
    score: Optional[float] = Field(
        default=None, description="Numeric score produced by the evaluator"
    )
    passed: Optional[bool] = None
    details: Optional[str] = Field(
        default=None, description="Short human-readable description of the result"
    )
    index: Optional[int] = None
    label: Optional[str] = None
    cost: Optional[float] = None
    duration: Optional[int] = None
    error_type: Optional[str] = None
    traceback: Optional[List[str]] = Field(
        description="Traceback information for debugging", default=None
    )
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class Batch(TypedDict):
    """Accumulated, not-yet-flushed payload: dataset rows plus their results."""

    dataset: List[BatchEntry]
    evaluations: List[EvaluationResult]
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
class BatchEntry(BaseModel):
    """One dataset row as reported to the batch ``log_results`` endpoint."""

    index: int
    entry: Any
    duration: int
    error: Optional[str] = None
    trace_id: str
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
class IterationInfo(TypedDict):
    """Bookkeeping for one loop iteration: its trace, timing and any failure."""

    index: int
    trace: LangWatchTrace
    item: Any
    duration: int
    error: Optional[Exception]
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
class Evaluation:
    """Drives a batch evaluation run and streams its results to LangWatch."""

    _executor: ThreadPoolExecutor
    _futures: List[Future[Any]]
    _current_index: int
    _current_item: Any

    def __init__(self, name: str, *, run_id: Optional[str] = None):
        # Fall back to random human-readable slugs when not provided.
        self.name: str = name or generate_slug(3)
        self.experiment_slug: str = self.name
        self.run_id: str = run_id or generate_slug(3)

        self.created_at_nano: int = int(time.time() * 1000)
        self.total: int = 0
        self.progress: int = 0
        self._futures: List[Future[Any]] = []

        # State for batching results and sending them in the background.
        self.lock = threading.Lock()
        self.batch: Batch = {"dataset": [], "evaluations": []}
        self.last_sent = 0
        self.debounce_interval = 1  # seconds between background sends
        self.threads: List[threading.Thread] = []
        self.initialized = False
|
|
113
|
-
|
|
114
|
-
def init(self):
    """Register the experiment with the LangWatch backend.

    Validates the API key, creates/fetches the experiment, stores the
    server-assigned slug and prints the URL where results can be followed.

    Raises:
        ValueError: when no API key is configured, or the key is rejected (401).
    """
    if not langwatch.get_api_key():
        raise ValueError(
            "API key was not detected, please set LANGWATCH_API_KEY or call langwatch.login() to login"
        )
    langwatch.ensure_setup()

    payload = {
        "experiment_name": self.name,
        "experiment_slug": self.experiment_slug,
        "experiment_type": "BATCH_EVALUATION_V2",
    }
    with httpx.Client(timeout=60) as client:
        response = client.post(
            f"{langwatch.get_endpoint()}/api/experiment/init",
            headers={"X-Auth-Token": langwatch.get_api_key() or ""},
            json=payload,
        )
        if response.status_code == 401:
            # Drop the stored key so the user is prompted to login again.
            langwatch.setup(api_key=None)
            raise ValueError(
                "API key is not valid, please try to login again with langwatch.login()"
            )
        better_raise_for_status(response)
        body = response.json()
        experiment_path = body["path"]
        self.experiment_slug = body["slug"]

    url_encoded_run_id = urllib.parse.quote(self.run_id)
    print(
        f"Follow the results at: {langwatch.get_endpoint()}{experiment_path}?runId={url_encoded_run_id}"
    )
    self.initialized = True
|
|
146
|
-
|
|
147
|
-
def loop(
    self,
    iterable: Union[Iterable[ItemT], pd.DataFrame],
    *,
    threads: int = 4,
    total: Optional[int] = None,
) -> Iterable[ItemT]:
    """Yield every item of `iterable`, tracking progress and traces.

    Runs sequentially unless the caller schedules work via `submit()`, in
    which case progress advances as the submitted futures complete. On any
    failure the run is marked as stopped server-side before re-raising.
    """
    if not self.initialized:
        self.init()

    try:
        item_count = total
        if not item_count:
            item_count = (
                len(cast(Sized, iterable)) if hasattr(iterable, "__len__") else None
            )
        # pandas' iterrows() generator has no __len__; materialize it so the
        # progress bar gets a total.
        if item_count is None and "DataFrame.iterrows" in str(iterable):
            iterable = cast(Iterable[ItemT], list(iterable))
            item_count = len(cast(Sized, iterable))
        pbar = tqdm(total=item_count, desc="Evaluating")

        # Support passing a DataFrame directly instead of df.iterrows().
        if isinstance(iterable, pd.DataFrame):
            iterable = cast(Iterable[ItemT], iterable.iterrows())  # type: ignore

        with ThreadPoolExecutor(max_workers=threads) as executor:
            self._executor = executor
            for index, item in enumerate(iterable):
                self._current_index = index
                self._current_item = item

                with self._execute_item_iteration(index, item, in_thread=False):
                    yield item
                # Sequential mode (nothing submitted): progress advances here.
                if not self._futures:
                    pbar.update(1)

            # Parallel mode: progress advances as each future finishes.
            # (as_completed over an empty list simply yields nothing.)
            for _ in as_completed(self._futures):
                pbar.update(1)

            executor.submit(self._wait_for_completion).result()
        pbar.close()

    except Exception as e:
        # Report the run as stopped before propagating the failure.
        Evaluation._log_results(
            langwatch.get_api_key() or "",
            {
                "experiment_slug": self.experiment_slug,
                "run_id": self.run_id,
                "timestamps": {
                    "finished_at": int(time.time() * 1000),
                    "stopped_at": int(time.time() * 1000),
                },
            },
        )
        raise e
|
|
209
|
-
|
|
210
|
-
def submit(self, func: Callable[..., Any], /, *args: Any, **kwargs: Any):
    """Schedule `func` on the evaluation's thread pool and return its Future.

    Coroutine functions are executed via asyncio.run inside the worker
    thread; the call is wrapped in the same per-iteration context the
    sequential loop uses so tracing and batching behave identically.
    """
    # Snapshot the loop position now: the loop moves on before the task runs.
    index_snapshot = self._current_index
    item_snapshot = self._current_item

    def task():
        with self._execute_item_iteration(
            index_snapshot, item_snapshot, in_thread=True
        ):
            if asyncio.iscoroutinefunction(func):
                outcome = asyncio.run(func(*args, **kwargs))
            else:
                outcome = func(*args, **kwargs)

        return outcome

    future = self._executor.submit(task)
    self._futures.append(future)
    return future
|
|
228
|
-
|
|
229
|
-
@contextmanager
def _execute_item_iteration(
    self,
    index: int,
    item: Any,
    in_thread: bool = False,
) -> Iterator[Any]:
    """Wrap one loop iteration: open a trace, time the body, record errors,
    and feed the finished iteration into the batch.

    `iteration` stays None while a parallel loop is still only collecting
    evaluation.submit() calls; when in_thread, the real iteration info is
    built and recorded. Exceptions from the body are captured and printed,
    not re-raised.
    """
    if in_thread or not self._futures:
        iteration: Optional[IterationInfo] = IterationInfo(
            trace=langwatch.trace(
                name="evaluation.loop_iteration",
                metadata={
                    "thread_id": self.run_id,
                    "loop.index": str(index),
                },
            ),
            index=index,
            item=item,
            duration=0,
            error=None,
        )
        iteration["trace"].__enter__()
    else:
        iteration = None

    started = time.time()
    try:
        yield
    except Exception as e:
        if iteration is not None:
            iteration["error"] = e
        print(f"\n[Evaluation Error] index={index}")
        traceback.print_exc()

    if iteration is not None:
        try:
            iteration["duration"] = int((time.time() - started) * 1000)

            # The first (outer) iteration of a parallel loop is not a data
            # point: rename its trace instead of adding it to the batch.
            if not in_thread and len(self._futures) > 0:
                iteration["trace"].update(name="evaluation.loop")
            else:
                self._add_to_batch(iteration)

            if iteration["error"] is not None:
                iteration["trace"].update(error=iteration["error"])
        finally:
            iteration["trace"].__exit__(None, None, None)
|
|
286
|
-
|
|
287
|
-
def _add_to_batch(self, iteration: IterationInfo):
    """Serialize one finished iteration into the pending batch.

    The dataset entry is extracted from the item with the first strategy
    that applies: to_dict(), __dict__, the same two on the second element of
    a (index, row) tuple (pandas iterrows), and finally a JSON-string
    fallback. Triggers a background send when the debounce window elapsed.
    """
    item = iteration["item"]
    if hasattr(item, "to_dict"):
        entry: Any = item.to_dict()
    elif hasattr(item, "__dict__"):
        entry = item.__dict__
    elif type(item) == tuple and hasattr(item[1], "to_dict"):
        entry = item[1].to_dict()
    elif type(item) == tuple and hasattr(item[1], "__dict__"):
        entry = item[1].__dict__
    else:
        entry = {"entry": json.dumps(item, cls=SerializableWithStringFallback)}

    with self.lock:
        self.batch["dataset"].append(
            BatchEntry(
                index=iteration["index"],
                entry=entry,
                duration=iteration["duration"],
                error=str(iteration["error"]) if iteration["error"] else None,
                trace_id=iteration["trace"].trace_id or "",
            )
        )

    if time.time() - self.last_sent >= self.debounce_interval:
        self._send_batch()
|
|
324
|
-
|
|
325
|
-
def _send_batch(self, finished: bool = False):
    """Flush the accumulated batch to the backend on a background thread.

    No-op when there is nothing to send, unless `finished` is set — the
    final call must still report the finished_at timestamp. Clears the
    batch and resets the debounce clock after handing off to the sender
    thread.
    """
    with self.lock:
        if (
            len(self.batch["dataset"]) == 0
            and len(self.batch["evaluations"]) == 0
            and not finished
        ):
            return

        # TODO: it is called `inputs` on the api still, unfortunately, so we need to map data back to inputs
        evaluations = []
        for eval in self.batch["evaluations"]:
            eval_ = eval.model_dump(exclude_none=True, exclude_unset=True)
            # Fix: "data" can be absent after exclude_none/exclude_unset;
            # the previous unconditional eval_["data"] read raised KeyError.
            if "data" in eval_:
                eval_["inputs"] = eval_.pop("data")
            evaluations.append(eval_)

        body = {
            "experiment_slug": self.experiment_slug,
            "name": f"{self.name}",
            "run_id": self.run_id,
            "dataset": [
                entry.model_dump(exclude_none=True, exclude_unset=True)
                for entry in self.batch["dataset"]
            ],
            "evaluations": evaluations,
            "progress": self.progress,
            "total": self.total,
            "timestamps": {
                "created_at": self.created_at_nano,
            },
        }

        if finished:
            if not isinstance(body["timestamps"], dict):
                body["timestamps"] = {}
            body["timestamps"]["finished_at"] = int(time.time() * 1000)

        # Send on a separate thread so the evaluation loop never blocks on HTTP.
        thread = threading.Thread(
            target=Evaluation._log_results,
            args=(langwatch.get_api_key(), body),
        )
        thread.start()
        self.threads.append(thread)

        # Clear the batch and update the last sent time
        self.batch = {"dataset": [], "evaluations": []}
        self.last_sent = time.time()
|
|
375
|
-
|
|
376
|
-
@classmethod
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    reraise=True,
)
def _log_results(cls, api_key: str, body: Dict[str, Any]):
    """POST one batch of results, retrying up to 3 times with backoff.

    The body is serialized with SerializableWithStringFallback so values
    without a native JSON representation degrade to strings.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    response = httpx.post(
        f"{langwatch.get_endpoint()}/api/evaluations/batch/log_results",
        headers=headers,
        data=json.dumps(body, cls=SerializableWithStringFallback),  # type: ignore
        timeout=60,
    )
    better_raise_for_status(response)
|
|
393
|
-
|
|
394
|
-
def _wait_for_completion(self):
    """Flush the final batch and block until all sender threads finish.

    Fix: the previous implementation wrapped the blocking thread.join()
    calls in an async function executed via asyncio.run(); the event loop
    added nothing (join blocks it anyway, and nothing else was awaited),
    so the joins are now done synchronously.
    """
    # Send any remaining batch
    self._send_batch(finished=True)

    for thread in self.threads:
        thread.join()
|
|
404
|
-
|
|
405
|
-
def log(
    self,
    metric: str,
    index: Union[int, Hashable],
    data: Optional[Dict[str, Any]] = None,
    score: Optional[float] = None,
    passed: Optional[bool] = None,
    label: Optional[str] = None,
    details: Optional[str] = None,
    status: Literal["processed", "error", "skipped"] = "processed",
    duration: Optional[int] = None,
    cost: Optional[Money] = None,
    error: Optional[Exception] = None,
):
    """Record one evaluation result for the dataset entry at `index`.

    Builds an EvaluationResult bound to the current trace and appends it to
    the pending batch. When `error` is given and `details`/`status` are not,
    the error's message, type and traceback are captured.

    Raises:
        ValueError: when `index` cannot be coerced to an integer.
    """
    # Fix: `data` previously used a shared mutable default ({}).
    if data is None:
        data = {}

    try:
        index_ = int(cast(Any, index))
    except Exception:
        raise ValueError(f"Index must be an integer, got {index}")

    eval = EvaluationResult(
        # Bind the result to the currently active OTel trace.
        trace_id=format(
            trace.get_current_span().get_span_context().trace_id,
            "x",
        ),
        name=metric,
        evaluator=metric,
        status=status if status else "error" if error else "processed",
        data=data,
        score=score,
        passed=passed,
        index=index_,
        label=label,
        cost=cost.amount if cost else None,
        duration=duration,
        details=details if details else str(error) if error else None,
        error_type=type(error).__name__ if error else None,
        traceback=(
            list(traceback.TracebackException.from_exception(error).format())
            if error
            else None
        ),
    )

    with self.lock:
        self.batch["evaluations"].append(eval)
|
|
450
|
-
|
|
451
|
-
def run(
    self,
    evaluator_id: str,
    index: Union[int, Hashable],
    data: Dict[str, Any],
    settings: Dict[str, Any],
    name: Optional[str] = None,
    as_guardrail: bool = False,
):
    """Run a hosted LangWatch evaluator against `data` and log its result.

    The wall-clock duration of the evaluator call is measured locally and
    reported along with the evaluator's own status/score/details.
    """
    started = time.time()
    result = langwatch.evaluations.evaluate(
        span=langwatch.get_current_span(),
        slug=evaluator_id,
        name=name or evaluator_id,
        settings=settings,
        as_guardrail=as_guardrail,
        data=data,
    )
    elapsed_ms = int((time.time() - started) * 1000)

    self.log(
        metric=name or evaluator_id,
        index=index,
        data=data,
        status=result.status,
        score=result.score,
        passed=result.passed,
        details=result.details,
        label=result.label,
        duration=elapsed_ms,
        cost=result.cost,
    )
|
|
File without changes
|