langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/dspy/__init__.py +4 -32
- langwatch/evaluation/__init__.py +535 -7
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/experiment/experiment.py +912 -0
- langwatch/experiment/platform_run.py +435 -0
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/RECORD +11 -9
- langwatch/evaluation/evaluation.py +0 -484
- {langwatch-0.8.1.dist-info → langwatch-0.10.0.dist-info}/WHEEL +0 -0
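The bulk of this release is the new `langwatch/experiment/experiment.py` module, whose full contents appear in the hunk below. As an orientation aid, here is a minimal usage sketch assembled from the docstrings inside that file. It is a sketch, not the package's documented entry point: instantiating `Experiment` directly and the `call_model()` helper are assumptions for illustration, since the new `langwatch/evaluation/__init__.py` and `langwatch/experiment/__init__.py` files (also added in this release, not shown in this hunk) may expose their own factories.

```python
# Minimal sketch assembled from the docstrings in experiment.py below.
# Assumptions: Experiment is instantiated directly (the package may instead
# expose a factory in langwatch.evaluation / langwatch.experiment), and
# call_model() stands in for a real LLM call. Requires LANGWATCH_API_KEY.
import pandas as pd
from langwatch.experiment.experiment import Experiment


def call_model(question: str) -> str:
    return "stub answer"  # placeholder for an actual model call


df = pd.DataFrame(
    {"question": ["What is LangWatch?"], "expected_output": ["An LLM observability platform"]}
)
evaluation = Experiment("my-experiment")

for index, row in evaluation.loop(df.iterrows()):

    def task(index, row):
        # Each target gets its own trace and its own per-target dataset entry
        with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
            response = call_model(row["question"])
            evaluation.log_response(response)  # stored in the `predicted` field
            evaluation.log("quality", index=index, score=0.95)

    evaluation.submit(task, index, row)
```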
langwatch/experiment/experiment.py (new file)
@@ -0,0 +1,912 @@
from __future__ import annotations
import asyncio
from contextlib import contextmanager
from contextvars import ContextVar
from dataclasses import dataclass
import json
import threading
import time
import traceback
import httpx
import pandas as pd
from opentelemetry import trace, context as otel_context
from opentelemetry.trace import Span
from pydantic import BaseModel, Field
from typing import (
    Any,
    Callable,
    Dict,
    Hashable,
    Iterable,
    Iterator,
    List,
    Literal,
    Optional,
    TypeVar,
    TypedDict,
    Sized,
    Union,
    cast,
)

from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm.auto import tqdm

import langwatch
from langwatch.attributes import AttributeKey
from langwatch.domain import Money, TypedValueJson
from langwatch.telemetry.tracing import LangWatchTrace
from langwatch.utils.exceptions import better_raise_for_status
from langwatch.utils.transformation import SerializableWithStringFallback

from coolname import generate_slug  # type: ignore
import urllib.parse
from concurrent.futures import Future, ThreadPoolExecutor, as_completed

_tracer = trace.get_tracer(__name__)


@dataclass
class TargetContext:
    """Context for the current target() execution."""

    target_id: str
    index: int
    trace_id: str
    predicted: Optional[Dict[str, Any]] = None  # Set via log_response()


@dataclass
class IterationContext:
    """Context for the current iteration (index + item)."""

    index: int
    item: Any


# ContextVar for target context isolation (works across threads)
_target_context: ContextVar[Optional[TargetContext]] = ContextVar(
    "_target_context", default=None
)

# ContextVar for iteration context (index + item) - thread-safe
_iteration_context: ContextVar[Optional[IterationContext]] = ContextVar(
    "_iteration_context", default=None
)

ItemT = TypeVar("ItemT")


class EvaluationResult(BaseModel):
    name: str
    evaluator: str
    trace_id: str
    status: Literal["processed", "error", "skipped"]
    data: Optional[Dict[str, Any]] = None
    score: Optional[float] = Field(default=None, description="No description provided")
    passed: Optional[bool] = None
    details: Optional[str] = Field(
        default=None, description="Short human-readable description of the result"
    )
    index: Optional[int] = None
    label: Optional[str] = None
    cost: Optional[float] = None
    duration: Optional[int] = None
    error_type: Optional[str] = None
    traceback: Optional[List[str]] = Field(
        description="Traceback information for debugging", default=None
    )
    target_id: Optional[str] = Field(
        default=None, description="ID of the target this evaluation is for"
    )


class TargetInfo(BaseModel):
    """Represents a registered target with its metadata."""

    id: str
    name: str
    type: Literal["prompt", "agent", "custom"] = "custom"
    metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None


class Batch(TypedDict):
    dataset: List[BatchEntry]
    evaluations: List[EvaluationResult]
    targets: List[TargetInfo]


class BatchEntry(BaseModel):
    index: int
    entry: Any
    duration: int
    error: Optional[str] = None
    trace_id: str
    target_id: Optional[str] = None
    cost: Optional[float] = None
    predicted: Optional[Dict[str, Any]] = None


class IterationInfo(TypedDict):
    index: int
    trace: LangWatchTrace
    item: Any
    duration: int
    error: Optional[Exception]

class Experiment:
    _executor: ThreadPoolExecutor
    _futures: List[Future[Any]]
    _current_index: int
    _current_item: Any

    def __init__(self, name: str, *, run_id: Optional[str] = None):
        self.name: str = name or generate_slug(3)
        self.experiment_slug: str = self.name
        self.run_id: str = run_id or generate_slug(3)
        self.total: int = 0
        self.progress: int = 0
        self.created_at_nano: int = int(time.time() * 1000)
        self._futures: List[Future[Any]] = []

        # Sending results
        self.lock = threading.Lock()
        self.batch: Batch = {"dataset": [], "evaluations": [], "targets": []}
        self.last_sent = 0
        self.debounce_interval = 1  # 1 second
        self.threads: List[threading.Thread] = []
        self.initialized = False

        # Target registry - tracks registered targets and their metadata
        self._targets: Dict[str, TargetInfo] = {}

        # Track whether with_target() was used in the current iteration
        # If so, we don't create row-level dataset entries
        self._current_iteration_used_with_target = False

        # Track whether target() has EVER been used in this evaluation
        # Once set to True, we stop creating iteration-level traces
        self._evaluation_uses_targets: bool = False

        # Store the active iteration trace so target() can close it early
        self._active_iteration_trace: Optional[LangWatchTrace] = None

    def init(self):
        if not langwatch.get_api_key():
            raise ValueError(
                "API key was not detected, please set LANGWATCH_API_KEY or call langwatch.login() to login"
            )
        langwatch.ensure_setup()

        with httpx.Client(timeout=60) as client:
            response = client.post(
                f"{langwatch.get_endpoint()}/api/experiment/init",
                headers={"X-Auth-Token": langwatch.get_api_key() or ""},
                json={
                    "experiment_name": self.name,
                    "experiment_slug": self.experiment_slug,
                    "experiment_type": "BATCH_EVALUATION_V2",
                },
            )
            if response.status_code == 401:
                langwatch.setup(api_key=None)
                raise ValueError(
                    "API key is not valid, please try to login again with langwatch.login()"
                )
            better_raise_for_status(response)
            response_json = response.json()
            experiment_path = response_json["path"]
            self.experiment_slug = response_json["slug"]

        url_encoded_run_id = urllib.parse.quote(self.run_id)
        print(
            f"Follow the results at: {langwatch.get_endpoint()}{experiment_path}?runId={url_encoded_run_id}"
        )
        self.initialized = True

    def loop(
        self,
        iterable: Union[Iterable[ItemT], pd.DataFrame],
        *,
        threads: int = 4,
        total: Optional[int] = None,
    ) -> Iterable[ItemT]:
        if not self.initialized:
            self.init()

        try:
            total_ = (
                total
                if total
                else (
                    len(cast(Sized, iterable)) if hasattr(iterable, "__len__") else None
                )
            )
            if total_ is None and "DataFrame.iterrows" in str(iterable):
                iterable = cast(Iterable[ItemT], list(iterable))
                total_ = len(cast(Sized, iterable))
            progress_bar = tqdm(total=total_, desc="Evaluating")

            # Supports direct pandas df being passed in
            if isinstance(iterable, pd.DataFrame):
                iterable = cast(Iterable[ItemT], iterable.iterrows())  # type: ignore

            with ThreadPoolExecutor(max_workers=threads) as executor:
                self._executor = executor
                for index, item in enumerate(iterable):
                    self._current_index = index
                    self._current_item = item

                    with self._execute_item_iteration(
                        index,
                        item,
                        in_thread=False,
                    ):
                        yield item
                    if len(self._futures) == 0:
                        progress_bar.update(1)

                if len(self._futures) > 0:
                    for _ in as_completed(self._futures):
                        progress_bar.update(1)

                executor.submit(self._wait_for_completion).result()
            progress_bar.close()

        except Exception as e:
            Experiment._log_results(
                langwatch.get_api_key() or "",
                {
                    "experiment_slug": self.experiment_slug,
                    "run_id": self.run_id,
                    "timestamps": {
                        "finished_at": int(time.time() * 1000),
                        "stopped_at": int(time.time() * 1000),
                    },
                },
            )
            raise e

    def submit(self, func: Callable[..., Any], /, *args: Any, **kwargs: Any):
        _current_index = self._current_index
        _current_item = self._current_item

        def wrapper():
            with self._execute_item_iteration(
                _current_index, _current_item, in_thread=True
            ):
                if asyncio.iscoroutinefunction(func):
                    func_result = asyncio.run(func(*args, **kwargs))
                else:
                    func_result = func(*args, **kwargs)

                return func_result

        future = self._executor.submit(wrapper)
        self._futures.append(future)
        return future

    @contextmanager
    def _execute_item_iteration(
        self,
        index: int,
        item: Any,
        in_thread: bool = False,
    ) -> Iterator[Any]:
        # Reset with_target tracking for this iteration
        self._current_iteration_used_with_target = False

        # Set iteration context (thread-safe via contextvars)
        # This allows target() to access index/item without race conditions
        iter_ctx = IterationContext(index=index, item=item)
        iter_token = _iteration_context.set(iter_ctx)

        # Determine if we should create an iteration trace:
        # - Don't create if evaluation uses targets (each target creates its own trace)
        # - Don't create if we're collecting submit() calls (not in_thread yet)
        should_create_iteration_trace = (
            not self._evaluation_uses_targets
            and (in_thread or len(self._futures) == 0)
        )

        iteration: Optional[IterationInfo] = None
        if should_create_iteration_trace:
            iteration = IterationInfo(
                trace=langwatch.trace(
                    name="evaluation.loop_iteration",
                    metadata={
                        "thread_id": self.run_id,
                        "loop.index": str(index),
                    },
                ),
                index=index,
                item=item,
                duration=0,
                error=None,
            )
            iteration["trace"].__enter__()
            # Store for target() to potentially close early
            self._active_iteration_trace = iteration["trace"]

        start_time = time.time()
        try:
            yield
        except Exception as e:
            if iteration is not None:
                iteration["error"] = e
            print(f"\n[Evaluation Error] index={index}")
            traceback.print_exc()
        finally:
            # Reset iteration context
            _iteration_context.reset(iter_token)

            # Handle iteration trace cleanup
            # Note: If target() was used, it may have already closed the trace
            if iteration is not None and not self._evaluation_uses_targets:
                try:
                    iteration["duration"] = int((time.time() - start_time) * 1000)

                    # If we just started the parallel loop, we need to skip the first iteration
                    # from being added to the batch and change the trace name
                    if not in_thread and len(self._futures) > 0:
                        iteration["trace"].update(name="evaluation.loop")
                    # Only add row-level entry if with_target was NOT used
                    # When with_target is used, it creates per-target dataset entries instead
                    elif not self._current_iteration_used_with_target:
                        self._add_to_batch(iteration)

                    if iteration["error"] is not None:
                        iteration["trace"].update(error=iteration["error"])
                except Exception as e:
                    raise e
                finally:
                    iteration["trace"].__exit__(None, None, None)

            # Clear active iteration trace reference
            self._active_iteration_trace = None

    def _add_to_batch(self, iteration: IterationInfo):
        entry: Any = (
            iteration["item"].to_dict()
            if hasattr(iteration["item"], "to_dict")
            else (
                iteration["item"].__dict__
                if hasattr(iteration["item"], "__dict__")
                else (
                    iteration["item"][1].to_dict()
                    if type(iteration["item"]) == tuple
                    and hasattr(iteration["item"][1], "to_dict")
                    else (
                        iteration["item"][1].__dict__
                        if type(iteration["item"]) == tuple
                        and hasattr(iteration["item"][1], "__dict__")
                        else {
                            "entry": json.dumps(
                                iteration["item"], cls=SerializableWithStringFallback
                            )
                        }
                    )
                )
            )
        )
        with self.lock:
            self.batch["dataset"].append(
                BatchEntry(
                    index=iteration["index"],
                    entry=entry,
                    duration=iteration["duration"],
                    error=str(iteration["error"]) if iteration["error"] else None,
                    trace_id=iteration["trace"].trace_id or "",
                )
            )

        if time.time() - self.last_sent >= self.debounce_interval:
            self._send_batch()

    def _send_batch(self, finished: bool = False):
        with self.lock:
            if (
                len(self.batch["dataset"]) == 0
                and len(self.batch["evaluations"]) == 0
                and len(self.batch["targets"]) == 0
                and not finished
            ):
                return

            # TODO: it is called `inputs` on the api still, unfortunately, so we need to map data back to inputs
            evaluations = []
            for eval in self.batch["evaluations"]:
                eval_ = eval.model_dump(exclude_none=True, exclude_unset=True)
                eval_["inputs"] = eval_["data"]
                if "data" in eval_:
                    del eval_["data"]
                evaluations.append(eval_)

            # Build targets array for API
            targets = [
                target.model_dump(exclude_none=True, exclude_unset=True)
                for target in self.batch["targets"]
            ]

            body: Dict[str, Any] = {
                "experiment_slug": self.experiment_slug,
                "name": f"{self.name}",
                "run_id": self.run_id,
                "dataset": [
                    entry.model_dump(exclude_none=True, exclude_unset=True)
                    for entry in self.batch["dataset"]
                ],
                "evaluations": evaluations,
                "progress": self.progress,
                "total": self.total,
                "timestamps": {
                    "created_at": self.created_at_nano,
                },
            }

            # Only include targets if we have any
            if len(targets) > 0:
                body["targets"] = targets

            if finished:
                if not isinstance(body["timestamps"], dict):
                    body["timestamps"] = {}
                body["timestamps"]["finished_at"] = int(time.time() * 1000)

            # Start a new thread to send the batch
            thread = threading.Thread(
                target=Experiment._log_results,
                args=(langwatch.get_api_key(), body),
            )
            thread.start()
            self.threads.append(thread)

            # Clear the batch and update the last sent time
            self.batch = {"dataset": [], "evaluations": [], "targets": []}
            self.last_sent = time.time()

    @classmethod
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10),
        reraise=True,
    )
    def _log_results(cls, api_key: str, body: Dict[str, Any]):
        response = httpx.post(
            f"{langwatch.get_endpoint()}/api/evaluations/batch/log_results",
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            data=json.dumps(body, cls=SerializableWithStringFallback),  # type: ignore
            timeout=60,
        )
        better_raise_for_status(response)

    def _wait_for_completion(self):
        async def wait_for_completion(self: Experiment):
            # Send any remaining batch
            self._send_batch(finished=True)

            for thread in self.threads:
                await asyncio.sleep(0)
                thread.join()

        asyncio.run(wait_for_completion(self))

    def _register_target(
        self,
        target: str,
        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
    ) -> str:
        """
        Register a target with its metadata. Returns the target ID.

        If the target was already registered:
        - If no new metadata is provided, the existing target is used
        - If new metadata is provided and differs from existing, raises an error

        Args:
            target: The target name/ID
            metadata: Optional metadata for this target (model, temperature, etc.)

        Returns:
            The target ID
        """
        with self.lock:
            if target in self._targets:
                existing = self._targets[target]
                if metadata is not None:
                    # Check if metadata matches
                    existing_meta = existing.metadata or {}
                    if existing_meta != metadata:
                        raise ValueError(
                            f"Target '{target}' was previously registered with different metadata.\n"
                            f"Original: {existing_meta}\n"
                            f"New: {metadata}\n"
                            f"If you want to use different metadata, please use a different target name."
                        )
                return target

            # Register new target
            target_info = TargetInfo(
                id=target,
                name=target,
                type="custom",
                metadata=metadata,
            )
            self._targets[target] = target_info
            self.batch["targets"].append(target_info)
            return target

    @contextmanager
    def target(
        self,
        name: str,
        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
    ) -> Iterator[None]:
        """
        Context manager for executing code within a target context.

        Creates a dataset entry for this specific target execution, capturing
        duration automatically. This enables proper per-target latency tracking
        when comparing multiple models/configurations.

        Each target() call creates its own independent trace, allowing you to
        view execution details separately for each model/configuration.

        Inside this context, log() calls will automatically use this target
        unless an explicit target is provided.

        Args:
            name: Unique identifier for the target
            metadata: Optional metadata for comparison (e.g., {"model": "gpt-4"})

        Example:
            ```python
            for index, row in evaluation.loop(df.iterrows()):
                def task(index, row):
                    # Compare GPT-4 and Claude
                    with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
                        response = call_gpt4(row["question"])
                        # target auto-inferred, use data= to record output
                        evaluation.log("quality", index=index, score=0.95,
                                       data={"output": response})

                    with evaluation.target("claude", {"model": "anthropic/claude"}):
                        response = call_claude(row["question"])
                        evaluation.log("quality", index=index, score=0.85,
                                       data={"output": response})

                evaluation.submit(task, index, row)
            ```
        """
        # On FIRST target() call ever in this evaluation:
        # - Set flag to skip creating iteration-level traces going forward
        # - Close the active iteration trace if any (it won't have useful content)
        if not self._evaluation_uses_targets:
            self._evaluation_uses_targets = True
            # Close the active iteration trace early
            if self._active_iteration_trace is not None:
                self._active_iteration_trace.__exit__(None, None, None)
                self._active_iteration_trace = None

        # Mark that target() was used in this iteration (for dataset entry logic)
        self._current_iteration_used_with_target = True

        # Register target
        self._register_target(name, metadata)

        # Get index and item from iteration context (thread-safe via contextvars)
        # This prevents race conditions when multiple threads are running evaluations
        iter_ctx = _iteration_context.get()
        if iter_ctx is not None:
            index = iter_ctx.index
            current_item = iter_ctx.item
        else:
            # Fallback to instance variables (for backwards compatibility / direct usage)
            index = self._current_index
            current_item = self._current_item

        target_trace: Optional[LangWatchTrace] = None
        start_time = time.time()
        error_occurred: Optional[Exception] = None
        trace_id = ""

        # Set up context for log() inference
        ctx = TargetContext(
            target_id=name,
            index=index,
            trace_id="",  # Will be set after entering trace
        )
        target_context_token = _target_context.set(ctx)

        try:
            # Create an INDEPENDENT root trace for this target
            # We use a new tracer without any parent context to get a unique trace_id
            # The key is using the tracer directly with context=None to prevent
            # parent context inheritance
            from opentelemetry.sdk.trace import TracerProvider
            from opentelemetry.trace import INVALID_SPAN_CONTEXT

            tracer = trace.get_tracer("langwatch-evaluation")

            # Start a new root span with no parent by passing an empty context
            # This ensures each target gets a unique trace_id
            root_context = otel_context.Context()

            with tracer.start_as_current_span(
                f"evaluation.target.{name}",
                context=root_context,
                attributes={
                    "evaluation.run_id": self.run_id,
                    "evaluation.index": index,
                    "evaluation.target": name,
                },
            ) as span:
                span_context = span.get_span_context()
                trace_id = format(span_context.trace_id, "032x")
                ctx.trace_id = trace_id

                try:
                    yield
                except Exception as e:
                    error_occurred = e
                    raise

        except Exception as e:
            if error_occurred is None:
                error_occurred = e
            raise
        finally:
            duration_ms = int((time.time() - start_time) * 1000)

            # Create dataset entry for this target
            # Use the captured current_item, NOT self._current_item (which may have changed)
            entry_data: Any = (
                current_item.to_dict()
                if hasattr(current_item, "to_dict")
                else (
                    current_item.__dict__
                    if hasattr(current_item, "__dict__")
                    else (
                        current_item[1].to_dict()
                        if type(current_item) == tuple
                        and hasattr(current_item[1], "to_dict")
                        else (
                            current_item[1].__dict__
                            if type(current_item) == tuple
                            and hasattr(current_item[1], "__dict__")
                            else {
                                "entry": json.dumps(
                                    current_item, cls=SerializableWithStringFallback
                                )
                            }
                        )
                    )
                )
            )

            # Get predicted output from context (set via log_response())
            predicted = ctx.predicted

            batch_entry = BatchEntry(
                index=index,
                entry=entry_data,
                duration=duration_ms,
                error=str(error_occurred) if error_occurred else None,
                trace_id=trace_id,
                target_id=name,
                predicted=predicted,
            )

            with self.lock:
                self.batch["dataset"].append(batch_entry)

            # Reset target context
            _target_context.reset(target_context_token)

            # Schedule send
            if time.time() - self.last_sent >= self.debounce_interval:
                self._send_batch()

    def log_response(self, response: Union[str, Dict[str, Any]]) -> None:
        """
        Log the model's response/output for the current target.

        Must be called inside a `target()` context. The response will be stored
        in the dataset entry's `predicted` field, which is displayed in the
        results table.

        Args:
            response: The model's output. Can be a string (will be wrapped as
                {"output": response}) or a dict with named outputs.

        Example:
            ```python
            with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
                response = call_gpt4(row["question"])
                evaluation.log_response(response)  # Store the output
                evaluation.log("quality", index=index, score=0.95)  # Log metrics
            ```

        Raises:
            RuntimeError: If called outside of a target() context.
        """
        ctx = _target_context.get()
        if ctx is None:
            raise RuntimeError(
                "log_response() must be called inside a target() context. "
                "Example: with evaluation.target('my-target'): evaluation.log_response(response)"
            )

        # Normalize response to dict format
        if isinstance(response, str):
            ctx.predicted = {"output": response}
        elif isinstance(response, dict):
            ctx.predicted = response
        else:
            # Try to convert to string for other types
            ctx.predicted = {"output": str(response)}

    def log(
        self,
        metric: str,
        index: Union[int, Hashable],
        data: Dict[str, Any] = {},
        score: Optional[float] = None,
        passed: Optional[bool] = None,
        label: Optional[str] = None,
        details: Optional[str] = None,
        status: Literal["processed", "error", "skipped"] = "processed",
        duration: Optional[int] = None,
        cost: Optional[Money] = None,
        error: Optional[Exception] = None,
        target: Optional[str] = None,
        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
    ):
        """
        Log an evaluation metric result.

        Args:
            metric: Name of the metric being logged
            index: Row index in the dataset (must be an integer)
            data: Additional data/inputs for the evaluation
            score: Numeric score (0-1 typically)
            passed: Whether the evaluation passed
            label: Label/category for the result
            details: Human-readable description of the result
            status: Status of the evaluation ("processed", "error", "skipped")
            duration: Duration in milliseconds
            cost: Cost of the evaluation
            error: Exception if an error occurred
            target: Optional target name for multi-target comparisons.
                First call with a target name registers it with the provided metadata.
                Subsequent calls with the same target can omit metadata.
                If called inside with_target(), the target is auto-inferred from context.
            metadata: Optional metadata for the target (model, temperature, etc.).
                Only used on the first call for each target.
                Raises error if conflicting metadata is provided for same target.
        """
        try:
            index_ = int(cast(Any, index))
        except Exception:
            raise ValueError(f"Index must be an integer, got {index}")

        # Get target context (if inside with_target)
        ctx = _target_context.get()

        # Use context target if not explicitly provided
        effective_target = target if target is not None else (ctx.target_id if ctx else None)

        # Register target if provided (explicit or from context)
        target_id: Optional[str] = None
        if effective_target is not None:
            target_id = self._register_target(effective_target, metadata)

        # Use trace_id from context if available
        trace_id = (
            ctx.trace_id
            if ctx
            else format(trace.get_current_span().get_span_context().trace_id, "x")
        )

        eval = EvaluationResult(
            trace_id=trace_id,
            name=metric,
            evaluator=metric,
            status=status if status else "error" if error else "processed",
            data=data,
            score=score,
            passed=passed,
            index=index_,
            label=label,
            cost=cost.amount if cost else None,
            duration=duration,
            details=details if details else str(error) if error else None,
            error_type=type(error).__name__ if error else None,
            traceback=(
                list(traceback.TracebackException.from_exception(error).format())
                if error
                else None
            ),
            target_id=target_id,
        )

        with self.lock:
            self.batch["evaluations"].append(eval)

    def evaluate(
        self,
        evaluator_id: str,
        index: Union[int, Hashable],
        data: Dict[str, Any],
        settings: Dict[str, Any],
        name: Optional[str] = None,
        as_guardrail: bool = False,
    ):
        """
        Run an evaluator on the current row.

        Args:
            evaluator_id: The evaluator type/slug (e.g., "langevals/exact_match", "ragas/faithfulness")
            index: The row index for this evaluation
            data: Data to pass to the evaluator (e.g., {"input": ..., "output": ..., "expected_output": ...})
            settings: Evaluator-specific settings
            name: Optional display name for the evaluation (defaults to evaluator_id)
            as_guardrail: Whether to run as a guardrail (stricter pass/fail)
        """
        duration: Optional[int] = None

        start_time = time.time()
        result = langwatch.evaluations.evaluate(
            span=langwatch.get_current_span(),
            slug=evaluator_id,
            name=name or evaluator_id,
            settings=settings,
            as_guardrail=as_guardrail,
            data=data,
        )
        duration = int((time.time() - start_time) * 1000)

        self.log(
            metric=name or evaluator_id,
            index=index,
            data=data,
            status=result.status,
            score=result.score,
            passed=result.passed,
            details=result.details,
            label=result.label,
            duration=duration,
            cost=result.cost,
        )

    def run(
        self,
        evaluator_id: str,
        index: Union[int, Hashable],
        data: Dict[str, Any],
        settings: Dict[str, Any],
        name: Optional[str] = None,
        as_guardrail: bool = False,
    ):
        """
        Deprecated: Use `evaluate()` instead.
        """
        import warnings

        warnings.warn(
            "evaluation.run() is deprecated, use evaluation.evaluate() instead",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.evaluate(
            evaluator_id=evaluator_id,
            index=index,
            data=data,
            settings=settings,
            name=name,
            as_guardrail=as_guardrail,
        )
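For completeness, here is a sketch of the `evaluate()` helper defined above, continuing the example from the top of this diff: it runs a LangWatch-hosted evaluator on the current row and records the result through `log()`. The evaluator slug and data keys are taken from the method's docstring; the empty `settings` dict is a placeholder assumption, since real settings depend on the evaluator chosen.

```python
# Continues the sketch from the top of this diff (same `evaluation`, `df`,
# and call_model()). Evaluator slug and data keys come from the evaluate()
# docstring; settings={} is a placeholder assumption.
for index, row in evaluation.loop(df.iterrows()):

    def task(index, row):
        output = call_model(row["question"])
        evaluation.evaluate(
            "langevals/exact_match",
            index=index,
            data={"output": output, "expected_output": row["expected_output"]},
            settings={},
        )

    evaluation.submit(task, index, row)
```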