langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

@@ -0,0 +1,912 @@
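+ """Batch evaluation loop for LangWatch experiments.
+ 
+ Defines the Experiment class, which iterates over a dataset, runs targets and
+ evaluators, and streams dataset entries and evaluation results to the LangWatch
+ API in debounced batches.
+ """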
+ from __future__ import annotations
+ 
+ import asyncio
+ import json
+ import threading
+ import time
+ import traceback
+ import urllib.parse
+ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
+ from contextlib import contextmanager
+ from contextvars import ContextVar
+ from dataclasses import dataclass
+ from typing import (
+     Any,
+     Callable,
+     Dict,
+     Hashable,
+     Iterable,
+     Iterator,
+     List,
+     Literal,
+     Optional,
+     Sized,
+     TypedDict,
+     TypeVar,
+     Union,
+     cast,
+ )
+ 
+ import httpx
+ import pandas as pd
+ from opentelemetry import trace, context as otel_context
+ from pydantic import BaseModel, Field
+ from tenacity import retry, stop_after_attempt, wait_exponential
+ from tqdm.auto import tqdm
+ 
+ import langwatch
+ from langwatch.domain import Money
+ from langwatch.telemetry.tracing import LangWatchTrace
+ from langwatch.utils.exceptions import better_raise_for_status
+ from langwatch.utils.transformation import SerializableWithStringFallback
+ 
+ from coolname import generate_slug  # type: ignore
+ 
+ 
+ @dataclass
+ class TargetContext:
+     """Context for the current target() execution."""
+ 
+     target_id: str
+     index: int
+     trace_id: str
+     predicted: Optional[Dict[str, Any]] = None  # Set via log_response()
+ 
+ 
+ @dataclass
+ class IterationContext:
+     """Context for the current iteration (index + item)."""
+ 
+     index: int
+     item: Any
+ 
+ 
+ # ContextVar for target context isolation (works across threads)
+ _target_context: ContextVar[Optional[TargetContext]] = ContextVar(
+     "_target_context", default=None
+ )
+ 
+ # ContextVar for iteration context (index + item) - thread-safe
+ _iteration_context: ContextVar[Optional[IterationContext]] = ContextVar(
+     "_iteration_context", default=None
+ )
+ 
+ ItemT = TypeVar("ItemT")
+ 
+ 
+ class EvaluationResult(BaseModel):
+     name: str
+     evaluator: str
+     trace_id: str
+     status: Literal["processed", "error", "skipped"]
+     data: Optional[Dict[str, Any]] = None
+     score: Optional[float] = Field(
+         default=None, description="Numeric score for the evaluation (typically 0-1)"
+     )
+     passed: Optional[bool] = None
+     details: Optional[str] = Field(
+         default=None, description="Short human-readable description of the result"
+     )
+     index: Optional[int] = None
+     label: Optional[str] = None
+     cost: Optional[float] = None
+     duration: Optional[int] = None
+     error_type: Optional[str] = None
+     traceback: Optional[List[str]] = Field(
+         default=None, description="Traceback information for debugging"
+     )
+     target_id: Optional[str] = Field(
+         default=None, description="ID of the target this evaluation is for"
+     )
+ 
+ 
+ class TargetInfo(BaseModel):
+     """Represents a registered target with its metadata."""
+ 
+     id: str
+     name: str
+     type: Literal["prompt", "agent", "custom"] = "custom"
+     metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None
+ 
+ 
+ class BatchEntry(BaseModel):
+     index: int
+     entry: Any
+     duration: int
+     error: Optional[str] = None
+     trace_id: str
+     target_id: Optional[str] = None
+     cost: Optional[float] = None
+     predicted: Optional[Dict[str, Any]] = None
+ 
+ 
+ class Batch(TypedDict):
+     dataset: List[BatchEntry]
+     evaluations: List[EvaluationResult]
+     targets: List[TargetInfo]
+ 
+ 
+ class IterationInfo(TypedDict):
+     index: int
+     trace: LangWatchTrace
+     item: Any
+     duration: int
+     error: Optional[Exception]
+ 
+ 
+ class Experiment:
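+     """
+     Tracks a single batch-evaluation run: iterates over a dataset, collects
+     per-row entries and metric results, and streams them to the LangWatch API
+     in debounced batches from background threads.
+     """
+ 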
+     _executor: ThreadPoolExecutor
+     _futures: List[Future[Any]]
+     _current_index: int
+     _current_item: Any
+ 
+     def __init__(self, name: str, *, run_id: Optional[str] = None):
+         self.name: str = name or generate_slug(3)
+         self.experiment_slug: str = self.name
+         self.run_id: str = run_id or generate_slug(3)
+         self.total: int = 0
+         self.progress: int = 0
+         self.created_at_ms: int = int(time.time() * 1000)
+         self._futures: List[Future[Any]] = []
+ 
+         # Sending results
+         self.lock = threading.Lock()
+         self.batch: Batch = {"dataset": [], "evaluations": [], "targets": []}
+         self.last_sent = 0
+         self.debounce_interval = 1  # 1 second
+         self.threads: List[threading.Thread] = []
+         self.initialized = False
+ 
+         # Target registry - tracks registered targets and their metadata
+         self._targets: Dict[str, TargetInfo] = {}
+ 
+         # Track whether target() was used in the current iteration
+         # If so, we don't create row-level dataset entries
+         self._current_iteration_used_with_target = False
+ 
+         # Track whether target() has EVER been used in this evaluation
+         # Once set to True, we stop creating iteration-level traces
+         self._evaluation_uses_targets: bool = False
+ 
+         # Store the active iteration trace so target() can close it early
+         self._active_iteration_trace: Optional[LangWatchTrace] = None
+ 
+     def init(self):
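+         """
+         Create or look up the experiment on the LangWatch server and print the
+         URL where results for this run can be followed. Called automatically by
+         loop() on first use; requires a configured API key.
+         """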
+         if not langwatch.get_api_key():
+             raise ValueError(
+                 "API key was not detected, please set LANGWATCH_API_KEY or call langwatch.login() to login"
+             )
+         langwatch.ensure_setup()
+ 
+         with httpx.Client(timeout=60) as client:
+             response = client.post(
+                 f"{langwatch.get_endpoint()}/api/experiment/init",
+                 headers={"X-Auth-Token": langwatch.get_api_key() or ""},
+                 json={
+                     "experiment_name": self.name,
+                     "experiment_slug": self.experiment_slug,
+                     "experiment_type": "BATCH_EVALUATION_V2",
+                 },
+             )
+             if response.status_code == 401:
+                 langwatch.setup(api_key=None)
+                 raise ValueError(
+                     "API key is not valid, please try to login again with langwatch.login()"
+                 )
+             better_raise_for_status(response)
+             response_json = response.json()
+             experiment_path = response_json["path"]
+             self.experiment_slug = response_json["slug"]
+ 
+         url_encoded_run_id = urllib.parse.quote(self.run_id)
+         print(
+             f"Follow the results at: {langwatch.get_endpoint()}{experiment_path}?runId={url_encoded_run_id}"
+         )
+         self.initialized = True
+ 
+     def loop(
+         self,
+         iterable: Union[Iterable[ItemT], pd.DataFrame],
+         *,
+         threads: int = 4,
+         total: Optional[int] = None,
+     ) -> Iterable[ItemT]:
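+         """
+         Iterate over a dataset while reporting progress and streaming results to
+         LangWatch. Wrap your dataset with this generator and call log() (and
+         optionally submit() for parallelism) inside the loop body.
+ 
+         Example (illustrative; assumes a DataFrame with "question" and
+         "expected_output" columns and a user-defined `my_agent` function):
+             ```python
+             evaluation = Experiment("my-experiment")
+             for index, row in evaluation.loop(df.iterrows()):
+                 response = my_agent(row["question"])
+                 evaluation.log(
+                     "exact_match",
+                     index=index,
+                     passed=response == row["expected_output"],
+                     data={"output": response},
+                 )
+             ```
+         """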
+         if not self.initialized:
+             self.init()
+ 
+         try:
+             total_ = (
+                 total
+                 if total
+                 else (
+                     len(cast(Sized, iterable)) if hasattr(iterable, "__len__") else None
+                 )
+             )
+             # DataFrame.iterrows() returns a generator with no __len__; detect it
+             # by name and materialize it so the progress bar can show a total
+             if total_ is None and "DataFrame.iterrows" in str(iterable):
+                 iterable = cast(Iterable[ItemT], list(iterable))
+                 total_ = len(cast(Sized, iterable))
+             progress_bar = tqdm(total=total_, desc="Evaluating")
+ 
+             # Supports a pandas DataFrame being passed in directly
+             if isinstance(iterable, pd.DataFrame):
+                 iterable = cast(Iterable[ItemT], iterable.iterrows())  # type: ignore
+ 
+             with ThreadPoolExecutor(max_workers=threads) as executor:
+                 self._executor = executor
+                 for index, item in enumerate(iterable):
+                     self._current_index = index
+                     self._current_item = item
+ 
+                     with self._execute_item_iteration(
+                         index,
+                         item,
+                         in_thread=False,
+                     ):
+                         yield item
+                     if len(self._futures) == 0:
+                         progress_bar.update(1)
+ 
+                 if len(self._futures) > 0:
+                     for _ in as_completed(self._futures):
+                         progress_bar.update(1)
+ 
+                 executor.submit(self._wait_for_completion).result()
+             progress_bar.close()
+ 
+         except Exception:
+             # Mark the run as stopped on the server before propagating the error
+             Experiment._log_results(
+                 langwatch.get_api_key() or "",
+                 {
+                     "experiment_slug": self.experiment_slug,
+                     "run_id": self.run_id,
+                     "timestamps": {
+                         "finished_at": int(time.time() * 1000),
+                         "stopped_at": int(time.time() * 1000),
+                     },
+                 },
+             )
+             raise
+ 
+     def submit(self, func: Callable[..., Any], /, *args: Any, **kwargs: Any):
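+         """
+         Schedule `func` on the evaluation's thread pool so iterations run in
+         parallel. The current loop index and item are captured at submission
+         time, so logs from the task are attributed to the correct row.
+ 
+         Illustrative usage (assumes it is called inside a loop() body, with a
+         user-defined `task` function):
+             ```python
+             for index, row in evaluation.loop(df.iterrows(), threads=8):
+                 def task(index, row):
+                     response = my_agent(row["question"])
+                     evaluation.log("quality", index=index, score=0.9,
+                                    data={"output": response})
+                 evaluation.submit(task, index, row)
+             ```
+         """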
+         _current_index = self._current_index
+         _current_item = self._current_item
+ 
+         def wrapper():
+             with self._execute_item_iteration(
+                 _current_index, _current_item, in_thread=True
+             ):
+                 if asyncio.iscoroutinefunction(func):
+                     func_result = asyncio.run(func(*args, **kwargs))
+                 else:
+                     func_result = func(*args, **kwargs)
+ 
+                 return func_result
+ 
+         future = self._executor.submit(wrapper)
+         self._futures.append(future)
+         return future
+ 
+     @contextmanager
+     def _execute_item_iteration(
+         self,
+         index: int,
+         item: Any,
+         in_thread: bool = False,
+     ) -> Iterator[Any]:
+         # Reset target() usage tracking for this iteration
+         self._current_iteration_used_with_target = False
+ 
+         # Set iteration context (thread-safe via contextvars)
+         # This allows target() to access index/item without race conditions
+         iter_ctx = IterationContext(index=index, item=item)
+         iter_token = _iteration_context.set(iter_ctx)
+ 
+         # Determine if we should create an iteration trace:
+         # - Don't create if evaluation uses targets (each target creates its own trace)
+         # - Don't create if we're collecting submit() calls (not in_thread yet)
+         should_create_iteration_trace = not self._evaluation_uses_targets and (
+             in_thread or len(self._futures) == 0
+         )
+ 
+         iteration: Optional[IterationInfo] = None
+         if should_create_iteration_trace:
+             iteration = IterationInfo(
+                 trace=langwatch.trace(
+                     name="evaluation.loop_iteration",
+                     metadata={
+                         "thread_id": self.run_id,
+                         "loop.index": str(index),
+                     },
+                 ),
+                 index=index,
+                 item=item,
+                 duration=0,
+                 error=None,
+             )
+             iteration["trace"].__enter__()
+             # Store for target() to potentially close early
+             self._active_iteration_trace = iteration["trace"]
+ 
+         start_time = time.time()
+         try:
+             yield
+         except Exception as e:
+             if iteration is not None:
+                 iteration["error"] = e
+             print(f"\n[Evaluation Error] index={index}")
+             traceback.print_exc()
+         finally:
+             # Reset iteration context
+             _iteration_context.reset(iter_token)
+ 
+             # Handle iteration trace cleanup
+             # Note: If target() was used, it may have already closed the trace
+             if iteration is not None and not self._evaluation_uses_targets:
+                 try:
+                     iteration["duration"] = int((time.time() - start_time) * 1000)
+ 
+                     # If we just started the parallel loop, we need to skip the first iteration
+                     # from being added to the batch and change the trace name
+                     if not in_thread and len(self._futures) > 0:
+                         iteration["trace"].update(name="evaluation.loop")
+                     # Only add a row-level entry if target() was NOT used
+                     # When target() is used, it creates per-target dataset entries instead
+                     elif not self._current_iteration_used_with_target:
+                         self._add_to_batch(iteration)
+ 
+                     if iteration["error"] is not None:
+                         iteration["trace"].update(error=iteration["error"])
+                 finally:
+                     iteration["trace"].__exit__(None, None, None)
+ 
+             # Clear active iteration trace reference
+             self._active_iteration_trace = None
+ 
+     def _add_to_batch(self, iteration: IterationInfo):
+         item = iteration["item"]
+         # Flatten the item into a dict entry; (index, row) tuples from
+         # DataFrame.iterrows() are unwrapped to the row itself
+         if hasattr(item, "to_dict"):
+             entry: Any = item.to_dict()
+         elif hasattr(item, "__dict__"):
+             entry = item.__dict__
+         elif type(item) is tuple and hasattr(item[1], "to_dict"):
+             entry = item[1].to_dict()
+         elif type(item) is tuple and hasattr(item[1], "__dict__"):
+             entry = item[1].__dict__
+         else:
+             entry = {
+                 "entry": json.dumps(item, cls=SerializableWithStringFallback)
+             }
+ 
+         with self.lock:
+             self.batch["dataset"].append(
+                 BatchEntry(
+                     index=iteration["index"],
+                     entry=entry,
+                     duration=iteration["duration"],
+                     error=str(iteration["error"]) if iteration["error"] else None,
+                     trace_id=iteration["trace"].trace_id or "",
+                 )
+             )
+ 
+         if time.time() - self.last_sent >= self.debounce_interval:
+             self._send_batch()
+ 
+     def _send_batch(self, finished: bool = False):
+         with self.lock:
+             if (
+                 len(self.batch["dataset"]) == 0
+                 and len(self.batch["evaluations"]) == 0
+                 and len(self.batch["targets"]) == 0
+                 and not finished
+             ):
+                 return
+ 
+             # TODO: it is still called `inputs` on the API, unfortunately, so we
+             # need to map `data` back to `inputs`
+             evaluations = []
+             for evaluation in self.batch["evaluations"]:
+                 eval_ = evaluation.model_dump(exclude_none=True, exclude_unset=True)
+                 eval_["inputs"] = eval_.pop("data", {})
+                 evaluations.append(eval_)
+ 
+             # Build targets array for API
+             targets = [
+                 target.model_dump(exclude_none=True, exclude_unset=True)
+                 for target in self.batch["targets"]
+             ]
+ 
+             body: Dict[str, Any] = {
+                 "experiment_slug": self.experiment_slug,
+                 "name": self.name,
+                 "run_id": self.run_id,
+                 "dataset": [
+                     entry.model_dump(exclude_none=True, exclude_unset=True)
+                     for entry in self.batch["dataset"]
+                 ],
+                 "evaluations": evaluations,
+                 "progress": self.progress,
+                 "total": self.total,
+                 "timestamps": {
+                     "created_at": self.created_at_ms,
+                 },
+             }
+ 
+             # Only include targets if we have any
+             if len(targets) > 0:
+                 body["targets"] = targets
+ 
+             if finished:
+                 body["timestamps"]["finished_at"] = int(time.time() * 1000)
+ 
+             # Start a new thread to send the batch
+             thread = threading.Thread(
+                 target=Experiment._log_results,
+                 args=(langwatch.get_api_key(), body),
+             )
+             thread.start()
+             self.threads.append(thread)
+ 
+             # Clear the batch and update the last sent time
+             self.batch = {"dataset": [], "evaluations": [], "targets": []}
+             self.last_sent = time.time()
+ 
+     @classmethod
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=2, max=10),
+         reraise=True,
+     )
+     def _log_results(cls, api_key: str, body: Dict[str, Any]):
+         response = httpx.post(
+             f"{langwatch.get_endpoint()}/api/evaluations/batch/log_results",
+             headers={
+                 "Authorization": f"Bearer {api_key}",
+                 "Content-Type": "application/json",
+             },
+             content=json.dumps(body, cls=SerializableWithStringFallback),
+             timeout=60,
+         )
+         better_raise_for_status(response)
+ 
+     def _wait_for_completion(self):
+         async def wait_for_completion(self: Experiment):
+             # Send any remaining batch
+             self._send_batch(finished=True)
+ 
+             for thread in self.threads:
+                 await asyncio.sleep(0)
+                 thread.join()
+ 
+         asyncio.run(wait_for_completion(self))
+ 
+     def _register_target(
+         self,
+         target: str,
+         metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+     ) -> str:
+         """
+         Register a target with its metadata. Returns the target ID.
+ 
+         If the target was already registered:
+         - If no new metadata is provided, the existing target is used
+         - If new metadata is provided and differs from the existing, raises an error
+ 
+         Args:
+             target: The target name/ID
+             metadata: Optional metadata for this target (model, temperature, etc.)
+ 
+         Returns:
+             The target ID
+         """
+         with self.lock:
+             if target in self._targets:
+                 existing = self._targets[target]
+                 if metadata is not None:
+                     # Check if metadata matches
+                     existing_meta = existing.metadata or {}
+                     if existing_meta != metadata:
+                         raise ValueError(
+                             f"Target '{target}' was previously registered with different metadata.\n"
+                             f"Original: {existing_meta}\n"
+                             f"New: {metadata}\n"
+                             f"If you want to use different metadata, please use a different target name."
+                         )
+                 return target
+ 
+             # Register new target
+             target_info = TargetInfo(
+                 id=target,
+                 name=target,
+                 type="custom",
+                 metadata=metadata,
+             )
+             self._targets[target] = target_info
+             self.batch["targets"].append(target_info)
+             return target
+ 
+     @contextmanager
+     def target(
+         self,
+         name: str,
+         metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+     ) -> Iterator[None]:
+         """
+         Context manager for executing code within a target context.
+ 
+         Creates a dataset entry for this specific target execution, capturing
+         duration automatically. This enables proper per-target latency tracking
+         when comparing multiple models/configurations.
+ 
+         Each target() call creates its own independent trace, allowing you to
+         view execution details separately for each model/configuration.
+ 
+         Inside this context, log() calls will automatically use this target
+         unless an explicit target is provided.
+ 
+         Args:
+             name: Unique identifier for the target
+             metadata: Optional metadata for comparison (e.g., {"model": "gpt-4"})
+ 
+         Example:
+             ```python
+             for index, row in evaluation.loop(df.iterrows()):
+                 def task(index, row):
+                     # Compare GPT-4 and Claude
+                     with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+                         response = call_gpt4(row["question"])
+                         # target auto-inferred, use data= to record output
+                         evaluation.log("quality", index=index, score=0.95,
+                                        data={"output": response})
+ 
+                     with evaluation.target("claude", {"model": "anthropic/claude"}):
+                         response = call_claude(row["question"])
+                         evaluation.log("quality", index=index, score=0.85,
+                                        data={"output": response})
+ 
+                 evaluation.submit(task, index, row)
+             ```
+         """
+         # On the FIRST target() call ever in this evaluation:
+         # - Set flag to skip creating iteration-level traces going forward
+         # - Close the active iteration trace if any (it won't have useful content)
+         if not self._evaluation_uses_targets:
+             self._evaluation_uses_targets = True
+             # Close the active iteration trace early
+             if self._active_iteration_trace is not None:
+                 self._active_iteration_trace.__exit__(None, None, None)
+                 self._active_iteration_trace = None
+ 
+         # Mark that target() was used in this iteration (for dataset entry logic)
+         self._current_iteration_used_with_target = True
+ 
+         # Register target
+         self._register_target(name, metadata)
+ 
+         # Get index and item from the iteration context (thread-safe via contextvars)
+         # This prevents race conditions when multiple threads are running evaluations
+         iter_ctx = _iteration_context.get()
+         if iter_ctx is not None:
+             index = iter_ctx.index
+             current_item = iter_ctx.item
+         else:
+             # Fallback to instance variables (for backwards compatibility / direct usage)
+             index = self._current_index
+             current_item = self._current_item
+ 
+         start_time = time.time()
+         error_occurred: Optional[Exception] = None
+         trace_id = ""
+ 
+         # Set up context for log() inference
+         ctx = TargetContext(
+             target_id=name,
+             index=index,
+             trace_id="",  # Will be set after entering the trace
+         )
+         target_context_token = _target_context.set(ctx)
+ 
+         try:
+             # Create an INDEPENDENT root trace for this target: start the span
+             # with a fresh, empty context so no parent is inherited and each
+             # target gets its own unique trace_id
+             tracer = trace.get_tracer("langwatch-evaluation")
+             root_context = otel_context.Context()
+ 
+             with tracer.start_as_current_span(
+                 f"evaluation.target.{name}",
+                 context=root_context,
+                 attributes={
+                     "evaluation.run_id": self.run_id,
+                     "evaluation.index": index,
+                     "evaluation.target": name,
+                 },
+             ) as span:
+                 span_context = span.get_span_context()
+                 trace_id = format(span_context.trace_id, "032x")
+                 ctx.trace_id = trace_id
+ 
+                 try:
+                     yield
+                 except Exception as e:
+                     error_occurred = e
+                     raise
+ 
+         except Exception as e:
+             if error_occurred is None:
+                 error_occurred = e
+             raise
+         finally:
+             duration_ms = int((time.time() - start_time) * 1000)
+ 
+             # Create the dataset entry for this target
+             # Use the captured current_item, NOT self._current_item (which may have changed)
+             if hasattr(current_item, "to_dict"):
+                 entry_data: Any = current_item.to_dict()
+             elif hasattr(current_item, "__dict__"):
+                 entry_data = current_item.__dict__
+             elif type(current_item) is tuple and hasattr(current_item[1], "to_dict"):
+                 entry_data = current_item[1].to_dict()
+             elif type(current_item) is tuple and hasattr(current_item[1], "__dict__"):
+                 entry_data = current_item[1].__dict__
+             else:
+                 entry_data = {
+                     "entry": json.dumps(
+                         current_item, cls=SerializableWithStringFallback
+                     )
+                 }
+ 
+             # Get the predicted output from context (set via log_response())
+             predicted = ctx.predicted
+ 
+             batch_entry = BatchEntry(
+                 index=index,
+                 entry=entry_data,
+                 duration=duration_ms,
+                 error=str(error_occurred) if error_occurred else None,
+                 trace_id=trace_id,
+                 target_id=name,
+                 predicted=predicted,
+             )
+ 
+             with self.lock:
+                 self.batch["dataset"].append(batch_entry)
+ 
+             # Reset target context
+             _target_context.reset(target_context_token)
+ 
+             # Schedule send
+             if time.time() - self.last_sent >= self.debounce_interval:
+                 self._send_batch()
+ 
+     def log_response(self, response: Union[str, Dict[str, Any]]) -> None:
+         """
+         Log the model's response/output for the current target.
+ 
+         Must be called inside a `target()` context. The response will be stored
+         in the dataset entry's `predicted` field, which is displayed in the
+         results table.
+ 
+         Args:
+             response: The model's output. Can be a string (will be wrapped as
+                 {"output": response}) or a dict with named outputs.
+ 
+         Example:
+             ```python
+             with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+                 response = call_gpt4(row["question"])
+                 evaluation.log_response(response)  # Store the output
+                 evaluation.log("quality", index=index, score=0.95)  # Log metrics
+             ```
+ 
+         Raises:
+             RuntimeError: If called outside of a target() context.
+         """
+         ctx = _target_context.get()
+         if ctx is None:
+             raise RuntimeError(
+                 "log_response() must be called inside a target() context. "
+                 "Example: with evaluation.target('my-target'): evaluation.log_response(response)"
+             )
+ 
+         # Normalize response to dict format
+         if isinstance(response, str):
+             ctx.predicted = {"output": response}
+         elif isinstance(response, dict):
+             ctx.predicted = response
+         else:
+             # Fall back to the string representation for other types
+             ctx.predicted = {"output": str(response)}
+ 
+     def log(
+         self,
+         metric: str,
+         index: Union[int, Hashable],
+         data: Optional[Dict[str, Any]] = None,
+         score: Optional[float] = None,
+         passed: Optional[bool] = None,
+         label: Optional[str] = None,
+         details: Optional[str] = None,
+         status: Literal["processed", "error", "skipped"] = "processed",
+         duration: Optional[int] = None,
+         cost: Optional[Money] = None,
+         error: Optional[Exception] = None,
+         target: Optional[str] = None,
+         metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+     ):
+         """
+         Log an evaluation metric result.
+ 
+         Args:
+             metric: Name of the metric being logged
+             index: Row index in the dataset (must be coercible to an integer)
+             data: Additional data/inputs for the evaluation
+             score: Numeric score (typically 0-1)
+             passed: Whether the evaluation passed
+             label: Label/category for the result
+             details: Human-readable description of the result
+             status: Status of the evaluation ("processed", "error", "skipped")
+             duration: Duration in milliseconds
+             cost: Cost of the evaluation
+             error: Exception if an error occurred
+             target: Optional target name for multi-target comparisons.
+                 The first call with a target name registers it with the provided
+                 metadata; subsequent calls with the same target can omit it.
+                 If called inside target(), the target is auto-inferred from context.
+             metadata: Optional metadata for the target (model, temperature, etc.).
+                 Only used on the first call for each target.
+                 Raises an error if conflicting metadata is provided for the same target.
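+ 
+         Example (illustrative):
+             ```python
+             evaluation.log(
+                 "exact_match",
+                 index=index,
+                 passed=response == row["expected_output"],
+                 data={"output": response},
+             )
+             ```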
+         """
+         try:
+             index_ = int(cast(Any, index))
+         except Exception:
+             raise ValueError(f"Index must be an integer, got {index}")
+ 
+         # Get the target context (if inside target())
+         ctx = _target_context.get()
+ 
+         # Use the context's target if not explicitly provided
+         effective_target = (
+             target if target is not None else (ctx.target_id if ctx else None)
+         )
+ 
+         # Register the target if provided (explicitly or from context)
+         target_id: Optional[str] = None
+         if effective_target is not None:
+             target_id = self._register_target(effective_target, metadata)
+ 
+         # Use the trace_id from context if available
+         trace_id = (
+             ctx.trace_id
+             if ctx
+             else format(trace.get_current_span().get_span_context().trace_id, "032x")
+         )
+ 
+         eval_result = EvaluationResult(
+             trace_id=trace_id,
+             name=metric,
+             evaluator=metric,
+             status=status if status else "error" if error else "processed",
+             data=data or {},
+             score=score,
+             passed=passed,
+             index=index_,
+             label=label,
+             cost=cost.amount if cost else None,
+             duration=duration,
+             details=details if details else str(error) if error else None,
+             error_type=type(error).__name__ if error else None,
+             traceback=(
+                 list(traceback.TracebackException.from_exception(error).format())
+                 if error
+                 else None
+             ),
+             target_id=target_id,
+         )
+ 
+         with self.lock:
+             self.batch["evaluations"].append(eval_result)
+ 
+     def evaluate(
+         self,
+         evaluator_id: str,
+         index: Union[int, Hashable],
+         data: Dict[str, Any],
+         settings: Dict[str, Any],
+         name: Optional[str] = None,
+         as_guardrail: bool = False,
+     ):
+         """
+         Run an evaluator on the current row.
+ 
+         Args:
+             evaluator_id: The evaluator type/slug (e.g., "langevals/exact_match", "ragas/faithfulness")
+             index: The row index for this evaluation
+             data: Data to pass to the evaluator (e.g., {"input": ..., "output": ..., "expected_output": ...})
+             settings: Evaluator-specific settings
+             name: Optional display name for the evaluation (defaults to evaluator_id)
+             as_guardrail: Whether to run as a guardrail (stricter pass/fail)
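+ 
+         Example (illustrative; evaluator availability depends on your setup):
+             ```python
+             evaluation.evaluate(
+                 "langevals/exact_match",
+                 index=index,
+                 data={"output": response, "expected_output": row["expected_output"]},
+                 settings={},
+             )
+             ```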
+         """
+         start_time = time.time()
+         result = langwatch.evaluations.evaluate(
+             span=langwatch.get_current_span(),
+             slug=evaluator_id,
+             name=name or evaluator_id,
+             settings=settings,
+             as_guardrail=as_guardrail,
+             data=data,
+         )
+         duration = int((time.time() - start_time) * 1000)
+ 
+         self.log(
+             metric=name or evaluator_id,
+             index=index,
+             data=data,
+             status=result.status,
+             score=result.score,
+             passed=result.passed,
+             details=result.details,
+             label=result.label,
+             duration=duration,
+             cost=result.cost,
+         )
+ 
+     def run(
+         self,
+         evaluator_id: str,
+         index: Union[int, Hashable],
+         data: Dict[str, Any],
+         settings: Dict[str, Any],
+         name: Optional[str] = None,
+         as_guardrail: bool = False,
+     ):
+         """
+         Deprecated: Use `evaluate()` instead.
+         """
+         import warnings
+ 
+         warnings.warn(
+             "evaluation.run() is deprecated, use evaluation.evaluate() instead",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.evaluate(
+             evaluator_id=evaluator_id,
+             index=index,
+             data=data,
+             settings=settings,
+             name=name,
+             as_guardrail=as_guardrail,
+         )