langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,484 +0,0 @@
1
- from __future__ import annotations
2
- import asyncio
3
- from contextlib import contextmanager
4
- import json
5
- import threading
6
- import time
7
- import traceback
8
- import httpx
9
- import pandas as pd
10
- from opentelemetry import trace
11
- from opentelemetry.trace import Span
12
- from pydantic import BaseModel, Field
13
- from typing import (
14
- Any,
15
- Callable,
16
- Dict,
17
- Hashable,
18
- Iterable,
19
- Iterator,
20
- List,
21
- Literal,
22
- Optional,
23
- TypeVar,
24
- TypedDict,
25
- Sized,
26
- Union,
27
- cast,
28
- )
29
-
30
- from tenacity import retry, stop_after_attempt, wait_exponential
31
- from tqdm.auto import tqdm
32
-
33
- import langwatch
34
- from langwatch.attributes import AttributeKey
35
- from langwatch.domain import Money, TypedValueJson
36
- from langwatch.telemetry.tracing import LangWatchTrace
37
- from langwatch.utils.exceptions import better_raise_for_status
38
- from langwatch.utils.transformation import SerializableWithStringFallback
39
-
40
- from coolname import generate_slug # type: ignore
41
- import urllib.parse
42
- from concurrent.futures import Future, ThreadPoolExecutor, as_completed
43
-
44
# Module-level OpenTelemetry tracer, named after this module.
_tracer = trace.get_tracer(__name__)

# Generic type of the items yielded back out of Evaluation.loop().
ItemT = TypeVar("ItemT")
47
-
48
-
49
class EvaluationResult(BaseModel):
    """Single evaluator outcome for one dataset entry, queued for the batch API."""

    # Display name of the metric (same value as `evaluator` when created via `log()`).
    name: str
    evaluator: str
    # Hex-formatted OpenTelemetry trace id tying this result to its loop iteration.
    trace_id: str
    status: Literal["processed", "error", "skipped"]
    # Evaluator input payload; remapped to the legacy `inputs` key before upload.
    data: Optional[Dict[str, Any]] = None
    score: Optional[float] = Field(default=None, description="No description provided")
    passed: Optional[bool] = None
    details: Optional[str] = Field(
        default=None, description="Short human-readable description of the result"
    )
    # Index of the dataset entry this evaluation refers to.
    index: Optional[int] = None
    label: Optional[str] = None
    # Monetary cost of running the evaluator (amount only), if any.
    cost: Optional[float] = None
    # Evaluator run duration in milliseconds.
    duration: Optional[int] = None
    # Exception class name when status is "error".
    error_type: Optional[str] = None
    traceback: Optional[List[str]] = Field(
        description="Traceback information for debugging", default=None
    )
68
-
69
-
70
class Batch(TypedDict):
    """Accumulated results awaiting upload; drained by `Evaluation._send_batch`."""

    dataset: List[BatchEntry]
    evaluations: List[EvaluationResult]
73
-
74
-
75
class BatchEntry(BaseModel):
    """One processed dataset row, as reported to the batch log endpoint."""

    # Position of the row in the evaluated dataset.
    index: int
    # JSON-serializable representation of the original loop item.
    entry: Any
    # Iteration duration in milliseconds.
    duration: int
    # Stringified exception if the iteration failed, else None.
    error: Optional[str] = None
    trace_id: str
81
-
82
-
83
class IterationInfo(TypedDict):
    """Mutable bookkeeping for a single loop iteration (trace, timing, error)."""

    index: int
    trace: LangWatchTrace
    item: Any
    # Filled in after the iteration completes: elapsed time in milliseconds.
    duration: int
    # Exception raised by user code during the iteration, if any.
    error: Optional[Exception]
89
-
90
-
91
class Evaluation:
    """Client-side batch evaluation runner.

    Iterates over a dataset (optionally fanning work out to a thread pool via
    ``submit()``), traces each iteration with LangWatch, and streams dataset
    entries plus evaluation results to the LangWatch batch API, debounced to
    at most one upload per ``debounce_interval`` seconds.
    """

    _executor: ThreadPoolExecutor
    _futures: List[Future[Any]]
    _current_index: int
    _current_item: Any

    def __init__(self, name: str, *, run_id: Optional[str] = None):
        # Fall back to random human-readable slugs when no name/run_id is given.
        self.name: str = name or generate_slug(3)
        self.experiment_slug: str = self.name
        self.run_id: str = run_id or generate_slug(3)
        self.total: int = 0
        self.progress: int = 0
        self.created_at_nano: int = int(time.time() * 1000)
        self._futures: List[Future[Any]] = []

        # Sending results: batched, debounced, uploaded on background threads.
        self.lock = threading.Lock()
        self.batch: Batch = {"dataset": [], "evaluations": []}
        self.last_sent = 0
        self.debounce_interval = 1  # seconds between batch uploads
        self.threads: List[threading.Thread] = []
        self.initialized = False

    def init(self):
        """Register the experiment with the LangWatch API and print the results URL.

        Raises:
            ValueError: if no API key is configured, or the key is rejected (401).
        """
        if not langwatch.get_api_key():
            raise ValueError(
                "API key was not detected, please set LANGWATCH_API_KEY or call langwatch.login() to login"
            )
        langwatch.ensure_setup()

        with httpx.Client(timeout=60) as client:
            response = client.post(
                f"{langwatch.get_endpoint()}/api/experiment/init",
                headers={"X-Auth-Token": langwatch.get_api_key() or ""},
                json={
                    "experiment_name": self.name,
                    "experiment_slug": self.experiment_slug,
                    "experiment_type": "BATCH_EVALUATION_V2",
                },
            )
            if response.status_code == 401:
                # Drop the bad key so the next setup()/login() starts clean.
                langwatch.setup(api_key=None)
                raise ValueError(
                    "API key is not valid, please try to login again with langwatch.login()"
                )
            better_raise_for_status(response)
            response_json = response.json()
            experiment_path = response_json["path"]
            # The server may normalize the slug; keep its canonical version.
            self.experiment_slug = response_json["slug"]

        url_encoded_run_id = urllib.parse.quote(self.run_id)
        print(
            f"Follow the results at: {langwatch.get_endpoint()}{experiment_path}?runId={url_encoded_run_id}"
        )
        self.initialized = True

    def loop(
        self,
        iterable: Union[Iterable[ItemT], pd.DataFrame],
        *,
        threads: int = 4,
        total: Optional[int] = None,
    ) -> Iterable[ItemT]:
        """Yield each item of ``iterable``, tracing and reporting every iteration.

        Args:
            iterable: dataset rows, or a pandas DataFrame (iterated via iterrows()).
            threads: worker threads used for ``submit()``-based parallel loops.
            total: explicit item count for the progress bar when ``iterable``
                has no ``__len__``.
        """
        if not self.initialized:
            self.init()

        try:
            total_ = (
                total
                if total
                else (
                    len(cast(Sized, iterable)) if hasattr(iterable, "__len__") else None
                )
            )
            # DataFrame.iterrows() generators have no len(); materialize to count.
            if total_ is None and "DataFrame.iterrows" in str(iterable):
                iterable = cast(Iterable[ItemT], list(iterable))
                total_ = len(cast(Sized, iterable))
            progress_bar = tqdm(total=total_, desc="Evaluating")

            # Supports direct pandas df being passed in
            if isinstance(iterable, pd.DataFrame):
                iterable = cast(Iterable[ItemT], iterable.iterrows())  # type: ignore

            with ThreadPoolExecutor(max_workers=threads) as executor:
                self._executor = executor
                for index, item in enumerate(iterable):
                    # Stash for submit(), which snapshots them per call.
                    self._current_index = index
                    self._current_item = item

                    with self._execute_item_iteration(
                        index,
                        item,
                        in_thread=False,
                    ):
                        yield item
                    # Sequential mode: progress advances with the loop itself.
                    if len(self._futures) == 0:
                        progress_bar.update(1)

                # Parallel mode: progress advances as submitted futures complete.
                if len(self._futures) > 0:
                    for _ in as_completed(self._futures):
                        progress_bar.update(1)

                executor.submit(self._wait_for_completion).result()
            progress_bar.close()

        except Exception:
            # Mark the run as stopped server-side before propagating.
            Evaluation._log_results(
                langwatch.get_api_key() or "",
                {
                    "experiment_slug": self.experiment_slug,
                    "run_id": self.run_id,
                    "timestamps": {
                        "finished_at": int(time.time() * 1000),
                        "stopped_at": int(time.time() * 1000),
                    },
                },
            )
            # Bare raise preserves the original traceback unchanged.
            raise

    def submit(self, func: Callable[..., Any], /, *args: Any, **kwargs: Any):
        """Run ``func(*args, **kwargs)`` on the worker pool for the current item.

        Coroutine functions are executed via ``asyncio.run`` inside the worker
        thread. Returns the resulting ``Future``.
        """
        # Bind the current loop position now; the worker executes later.
        _current_index = self._current_index
        _current_item = self._current_item

        def wrapper():
            with self._execute_item_iteration(
                _current_index, _current_item, in_thread=True
            ):
                if asyncio.iscoroutinefunction(func):
                    func_result = asyncio.run(func(*args, **kwargs))
                else:
                    func_result = func(*args, **kwargs)

            return func_result

        future = self._executor.submit(wrapper)
        self._futures.append(future)
        return future

    @contextmanager
    def _execute_item_iteration(
        self,
        index: int,
        item: Any,
        in_thread: bool = False,
    ) -> Iterator[Any]:
        """Wrap one loop iteration in a LangWatch trace and record its outcome."""
        # Iteration will be None if we find ourselves in a parallel loop, but still
        # in the phase of collecting the evaluation.submit() processes. When in_thread,
        # then it's when we actually collect the iteration info.
        iteration = (
            IterationInfo(
                trace=langwatch.trace(
                    name="evaluation.loop_iteration",
                    metadata={
                        "thread_id": self.run_id,
                        "loop.index": str(index),
                    },
                ),
                index=index,
                item=item,
                duration=0,
                error=None,
            )
            if in_thread or len(self._futures) == 0
            else None
        )

        if iteration is not None:
            iteration["trace"].__enter__()

        start_time = time.time()
        try:
            yield
        except Exception as e:
            # Record but don't propagate: one bad row must not kill the run.
            if iteration is not None:
                iteration["error"] = e
            print(f"\n[Evaluation Error] index={index}")
            traceback.print_exc()

        if iteration is not None:
            try:
                iteration["duration"] = int((time.time() - start_time) * 1000)

                # If we just started the parallel loop, we need to skip the first iteration
                # from being added to the batch and change the trace name
                if not in_thread and len(self._futures) > 0:
                    iteration["trace"].update(name="evaluation.loop")
                else:
                    self._add_to_batch(iteration)

                if iteration["error"] is not None:
                    iteration["trace"].update(error=iteration["error"])
            finally:
                # Always close the trace span, even if batching fails.
                # (The previous `except Exception as e: raise e` was a no-op.)
                iteration["trace"].__exit__(None, None, None)

    @staticmethod
    def _serialize_entry(item: Any) -> Any:
        """Best-effort conversion of a loop item into a JSON-friendly value.

        Handles pandas-like objects (``to_dict``), plain objects (``__dict__``),
        and ``(index, row)`` tuples from ``DataFrame.iterrows()``; anything else
        is JSON-dumped with a string fallback.
        """
        if hasattr(item, "to_dict"):
            return item.to_dict()
        if hasattr(item, "__dict__"):
            return item.__dict__
        # iterrows() yields (index, row); guard the length so a short tuple
        # falls through to the JSON fallback instead of raising IndexError.
        if type(item) is tuple and len(item) > 1:
            row = item[1]
            if hasattr(row, "to_dict"):
                return row.to_dict()
            if hasattr(row, "__dict__"):
                return row.__dict__
        return {"entry": json.dumps(item, cls=SerializableWithStringFallback)}

    def _add_to_batch(self, iteration: IterationInfo):
        """Queue a finished iteration for upload, flushing if debounce elapsed."""
        entry: Any = Evaluation._serialize_entry(iteration["item"])
        with self.lock:
            self.batch["dataset"].append(
                BatchEntry(
                    index=iteration["index"],
                    entry=entry,
                    duration=iteration["duration"],
                    error=str(iteration["error"]) if iteration["error"] else None,
                    trace_id=iteration["trace"].trace_id or "",
                )
            )

        if time.time() - self.last_sent >= self.debounce_interval:
            self._send_batch()

    def _send_batch(self, finished: bool = False):
        """Upload queued entries/evaluations on a background thread.

        Args:
            finished: also stamp ``finished_at`` and send even if the batch is empty.
        """
        with self.lock:
            if (
                len(self.batch["dataset"]) == 0
                and len(self.batch["evaluations"]) == 0
                and not finished
            ):
                return

            # TODO: it is called `inputs` on the api still, unfortunately, so we need to map data back to inputs
            evaluations = []
            for eval in self.batch["evaluations"]:
                eval_ = eval.model_dump(exclude_none=True, exclude_unset=True)
                # "data" may be dropped by exclude_none/exclude_unset; check
                # before remapping (the old code indexed it first and could
                # raise KeyError when the key was absent).
                if "data" in eval_:
                    eval_["inputs"] = eval_.pop("data")
                evaluations.append(eval_)

            body = {
                "experiment_slug": self.experiment_slug,
                "name": f"{self.name}",
                "run_id": self.run_id,
                "dataset": [
                    entry.model_dump(exclude_none=True, exclude_unset=True)
                    for entry in self.batch["dataset"]
                ],
                "evaluations": evaluations,
                "progress": self.progress,
                "total": self.total,
                "timestamps": {
                    "created_at": self.created_at_nano,
                },
            }

            if finished:
                if not isinstance(body["timestamps"], dict):
                    body["timestamps"] = {}
                body["timestamps"]["finished_at"] = int(time.time() * 1000)

            # Start a new thread to send the batch
            thread = threading.Thread(
                target=Evaluation._log_results,
                args=(langwatch.get_api_key(), body),
            )
            thread.start()
            self.threads.append(thread)

            # Clear the batch and update the last sent time
            self.batch = {"dataset": [], "evaluations": []}
            self.last_sent = time.time()

    @classmethod
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        reraise=True,
    )
    def _log_results(cls, api_key: str, body: Dict[str, Any]):
        """POST one batch to the log_results endpoint, retrying up to 3 times."""
        response = httpx.post(
            f"{langwatch.get_endpoint()}/api/evaluations/batch/log_results",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            data=json.dumps(body, cls=SerializableWithStringFallback),  # type: ignore
            timeout=60,
        )
        better_raise_for_status(response)

    def _wait_for_completion(self):
        """Flush the final batch and block until all sender threads finish."""
        self._send_batch(finished=True)
        # The previous asyncio.run(...) wrapper added nothing: thread.join()
        # blocks the calling thread regardless, so plain joins are equivalent.
        for thread in self.threads:
            thread.join()

    def log(
        self,
        metric: str,
        index: Union[int, Hashable],
        data: Optional[Dict[str, Any]] = None,
        score: Optional[float] = None,
        passed: Optional[bool] = None,
        label: Optional[str] = None,
        details: Optional[str] = None,
        status: Literal["processed", "error", "skipped"] = "processed",
        duration: Optional[int] = None,
        cost: Optional[Money] = None,
        error: Optional[Exception] = None,
    ):
        """Record one evaluation result for the dataset entry at ``index``.

        Args:
            metric: evaluator/metric name shown in the results UI.
            index: dataset entry index; must be coercible to int.
            data: evaluator inputs (defaults to an empty dict).
            error: when given, fills details/error_type/traceback automatically.

        Raises:
            ValueError: when ``index`` cannot be coerced to an int.
        """
        try:
            index_ = int(cast(Any, index))
        except Exception:
            raise ValueError(f"Index must be an integer, got {index}")

        eval = EvaluationResult(
            # Current OTel trace id, hex-formatted, ties the result to its span.
            trace_id=format(
                trace.get_current_span().get_span_context().trace_id,
                "x",
            ),
            name=metric,
            evaluator=metric,
            status=status if status else "error" if error else "processed",
            # None sentinel replaces the old mutable `{}` default argument;
            # behavior for callers is unchanged.
            data=data if data is not None else {},
            score=score,
            passed=passed,
            index=index_,
            label=label,
            cost=cost.amount if cost else None,
            duration=duration,
            details=details if details else str(error) if error else None,
            error_type=type(error).__name__ if error else None,
            traceback=(
                list(traceback.TracebackException.from_exception(error).format())
                if error
                else None
            ),
        )

        with self.lock:
            self.batch["evaluations"].append(eval)

    def run(
        self,
        evaluator_id: str,
        index: Union[int, Hashable],
        data: Dict[str, Any],
        settings: Dict[str, Any],
        name: Optional[str] = None,
        as_guardrail: bool = False,
    ):
        """Execute a hosted LangWatch evaluator on ``data`` and log its result."""
        duration: Optional[int] = None

        start_time = time.time()
        result = langwatch.evaluations.evaluate(
            span=langwatch.get_current_span(),
            slug=evaluator_id,
            name=name or evaluator_id,
            settings=settings,
            as_guardrail=as_guardrail,
            data=data,
        )
        duration = int((time.time() - start_time) * 1000)

        self.log(
            metric=name or evaluator_id,
            index=index,
            data=data,
            status=result.status,
            score=result.score,
            passed=result.passed,
            details=result.details,
            label=result.label,
            duration=duration,
            cost=result.cost,
        )