langwatch 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
langwatch/__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for LangWatch."""
 
- __version__ = "0.8.1" # x-release-please-version
+ __version__ = "0.9.0" # x-release-please-version
langwatch/dspy/__init__.py CHANGED
@@ -737,10 +737,6 @@ class DSPyTracer:
  dspy.Module.__original_call__ = dspy.Module.__call__ # type: ignore
  dspy.Module.__call__ = self.patched_module_call()
 
- if not hasattr(dspy.Predict, "__original_forward__"):
- dspy.Predict.__original_forward__ = dspy.Predict.forward # type: ignore
- dspy.Predict.forward = self.patched_predict_forward()
-
  language_model_classes = dspy.LM.__subclasses__()
  for lm in language_model_classes:
  if not hasattr(lm, "__original_basic_request__") and hasattr(
@@ -776,7 +772,7 @@ class DSPyTracer:
  def patched_module_call(self):
  self_ = self
 
- @langwatch.span(ignore_missing_trace_warning=True, type="module")
+ @langwatch.span(ignore_missing_trace_warning=True, type="module", capture_output=False)
  def __call__(self: dspy.Module, *args, **kwargs):
  span = self_.safe_get_current_span()
  signature = (
@@ -801,34 +797,10 @@ class DSPyTracer:
 
  return __call__
 
- def patched_predict_forward(self):
- self_ = self
-
- @langwatch.span(ignore_missing_trace_warning=True, type="module")
- def forward(self: dspy.Predict, **kwargs):
- span = self_.safe_get_current_span()
- signature = kwargs.get("signature", self.signature)
-
- if span and signature and hasattr(signature, "__name__"):
- span.update(name=f"{self.__class__.__name__}({signature.__name__})")
- elif span:
- span.update(name=f"{self.__class__.__name__}.forward")
-
- prediction = self.__class__.__original_forward__(self, **kwargs) # type: ignore
-
- if span and isinstance(prediction, dspy.Prediction):
- span.update(output=prediction._store) # type: ignore
- elif span:
- span.update(output=prediction) # type: ignore
-
- return prediction
-
- return forward
-
  def patched_language_model_call(self):
  self_ = self
 
- @langwatch.span(ignore_missing_trace_warning=True, type="llm")
+ @langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
  def call(self: dspy.LM, prompt=None, messages=None, **kwargs):
  all_kwargs = self.kwargs | kwargs
  model = self.model
@@ -895,7 +867,7 @@ class DSPyTracer:
  def patched_legacy_language_model_request(self):
  self_ = self
 
- @langwatch.span(ignore_missing_trace_warning=True, type="llm")
+ @langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
  def basic_request(self: dspy.LM, prompt, **kwargs):
  all_kwargs = self.kwargs | kwargs
  model = all_kwargs.get("model", None)
@@ -947,7 +919,7 @@ class DSPyTracer:
  ) is not getattr(dspy.Retrieve, "forward", None):
  return self.__class__.__original_forward__(self, *args, **kwargs) # type: ignore
 
- @langwatch.span(ignore_missing_trace_warning=True, type="rag")
+ @langwatch.span(ignore_missing_trace_warning=True, type="rag", capture_output=False)
  def forward(self, *args, **kwargs):
  result = self.__class__.__original_forward__(self, *args, **kwargs) # type: ignore
 
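The behavioral change in this file is that the `dspy.Predict.forward` patch is dropped and the tracing decorators now pass `capture_output=False`, so span outputs are set explicitly rather than captured from return values. A minimal sketch of that decorator pattern outside the DSPy integration, assuming the `@langwatch.span()` keyword arguments used above and that `langwatch.get_current_span()` is available in this version of the SDK:

```python
import langwatch

# Sketch only: mirrors the decorator usage in the patched DSPy methods above.
# capture_output=False prevents the span from auto-recording the return value,
# so a curated output can be attached to the span explicitly instead.
@langwatch.span(type="module", capture_output=False, ignore_missing_trace_warning=True)
def summarize(text: str) -> str:
    summary = text[:100]  # placeholder "module" work
    langwatch.get_current_span().update(output={"summary": summary})
    return summary
```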
langwatch/evaluation/__init__.py CHANGED
@@ -1,9 +1,36 @@
  from typing import Optional
  from langwatch.evaluation.evaluation import Evaluation
- from .evaluation import Evaluation
+ from langwatch.evaluation.platform_run import (
+ evaluate,
+ run, # Deprecated, kept for backwards compatibility
+ EvaluationRunResult,
+ EvaluationRunSummary,
+ EvaluationNotFoundError,
+ EvaluationTimeoutError,
+ EvaluationRunFailedError,
+ EvaluationsApiError,
+ TargetStats,
+ EvaluatorStats,
+ )
 
 
  def init(name: str, *, run_id: Optional[str] = None) -> Evaluation:
  evaluation = Evaluation(name, run_id=run_id)
  evaluation.init()
  return evaluation
+
+
+ __all__ = [
+ "init",
+ "evaluate",
+ "run", # Deprecated
+ "Evaluation",
+ "EvaluationRunResult",
+ "EvaluationRunSummary",
+ "EvaluationNotFoundError",
+ "EvaluationTimeoutError",
+ "EvaluationRunFailedError",
+ "EvaluationsApiError",
+ "TargetStats",
+ "EvaluatorStats",
+ ]
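Given the exports above, a hedged sketch of how a script might consume the new platform-run API and its error types (the slug is a placeholder; the exception attributes follow the definitions in `platform_run.py` further down):

```python
from langwatch.evaluation import (
    evaluate,
    EvaluationNotFoundError,
    EvaluationRunFailedError,
    EvaluationTimeoutError,
)

try:
    # "my-evaluation-slug" is a placeholder for an evaluation configured in LangWatch
    result = evaluate("my-evaluation-slug", timeout=900.0)
    result.print_summary()
except EvaluationNotFoundError as err:
    print(f"No evaluation found for slug: {err.slug}")
except EvaluationTimeoutError as err:
    print(f"Run {err.run_id} timed out at {err.progress}/{err.total} cells")
except EvaluationRunFailedError as err:
    print(f"Run {err.run_id} failed: {err.error_message}")
```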
langwatch/evaluation/evaluation.py CHANGED
@@ -1,13 +1,15 @@
  from __future__ import annotations
  import asyncio
  from contextlib import contextmanager
+ from contextvars import ContextVar
+ from dataclasses import dataclass
  import json
  import threading
  import time
  import traceback
  import httpx
  import pandas as pd
- from opentelemetry import trace
+ from opentelemetry import trace, context as otel_context
  from opentelemetry.trace import Span
  from pydantic import BaseModel, Field
  from typing import (
@@ -43,6 +45,35 @@ from concurrent.futures import Future, ThreadPoolExecutor, as_completed
 
  _tracer = trace.get_tracer(__name__)
 
+
+ @dataclass
+ class TargetContext:
+ """Context for the current target() execution."""
+
+ target_id: str
+ index: int
+ trace_id: str
+ predicted: Optional[Dict[str, Any]] = None # Set via log_response()
+
+
+ @dataclass
+ class IterationContext:
+ """Context for the current iteration (index + item)."""
+
+ index: int
+ item: Any
+
+
+ # ContextVar for target context isolation (works across threads)
+ _target_context: ContextVar[Optional[TargetContext]] = ContextVar(
+ "_target_context", default=None
+ )
+
+ # ContextVar for iteration context (index + item) - thread-safe
+ _iteration_context: ContextVar[Optional[IterationContext]] = ContextVar(
+ "_iteration_context", default=None
+ )
+
  ItemT = TypeVar("ItemT")
 
 
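For readers unfamiliar with `contextvars`: a value set on a `ContextVar` in one thread is not visible from another, which is what lets the iteration and target contexts above coexist across worker threads without locking. A standalone, standard-library-only illustration (not LangWatch code):

```python
import threading
from contextvars import ContextVar
from typing import Optional

current_index: ContextVar[Optional[int]] = ContextVar("current_index", default=None)

def worker(index: int) -> None:
    token = current_index.set(index)  # visible only within this thread's context
    try:
        assert current_index.get() == index
    finally:
        current_index.reset(token)

threads = [threading.Thread(target=worker, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

assert current_index.get() is None  # the main thread never sees the workers' values
```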
@@ -65,11 +96,24 @@ class EvaluationResult(BaseModel):
  traceback: Optional[List[str]] = Field(
  description="Traceback information for debugging", default=None
  )
+ target_id: Optional[str] = Field(
+ default=None, description="ID of the target this evaluation is for"
+ )
+
+
+ class TargetInfo(BaseModel):
+ """Represents a registered target with its metadata."""
+
+ id: str
+ name: str
+ type: Literal["prompt", "agent", "custom"] = "custom"
+ metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None
 
 
  class Batch(TypedDict):
  dataset: List[BatchEntry]
  evaluations: List[EvaluationResult]
+ targets: List[TargetInfo]
 
 
  class BatchEntry(BaseModel):
@@ -78,6 +122,9 @@ class BatchEntry(BaseModel):
  duration: int
  error: Optional[str] = None
  trace_id: str
+ target_id: Optional[str] = None
+ cost: Optional[float] = None
+ predicted: Optional[Dict[str, Any]] = None
 
 
  class IterationInfo(TypedDict):
@@ -105,12 +152,26 @@ class Evaluation:
 
  # Sending results
  self.lock = threading.Lock()
- self.batch: Batch = {"dataset": [], "evaluations": []}
+ self.batch: Batch = {"dataset": [], "evaluations": [], "targets": []}
  self.last_sent = 0
  self.debounce_interval = 1 # 1 second
  self.threads: List[threading.Thread] = []
  self.initialized = False
 
+ # Target registry - tracks registered targets and their metadata
+ self._targets: Dict[str, TargetInfo] = {}
+
+ # Track whether with_target() was used in the current iteration
+ # If so, we don't create row-level dataset entries
+ self._current_iteration_used_with_target = False
+
+ # Track whether target() has EVER been used in this evaluation
+ # Once set to True, we stop creating iteration-level traces
+ self._evaluation_uses_targets: bool = False
+
+ # Store the active iteration trace so target() can close it early
+ self._active_iteration_trace: Optional[LangWatchTrace] = None
+
  def init(self):
  if not langwatch.get_api_key():
  raise ValueError(
@@ -233,11 +294,25 @@ class Evaluation:
  item: Any,
  in_thread: bool = False,
  ) -> Iterator[Any]:
- # Iteration will be None if we find ourselves in a parallel loop, but still
- # in the phase of collecting the evaluation.submit() processes. When in_thread,
- # then it's when we actually collect the iteration info.
- iteration = (
- IterationInfo(
+ # Reset with_target tracking for this iteration
+ self._current_iteration_used_with_target = False
+
+ # Set iteration context (thread-safe via contextvars)
+ # This allows target() to access index/item without race conditions
+ iter_ctx = IterationContext(index=index, item=item)
+ iter_token = _iteration_context.set(iter_ctx)
+
+ # Determine if we should create an iteration trace:
+ # - Don't create if evaluation uses targets (each target creates its own trace)
+ # - Don't create if we're collecting submit() calls (not in_thread yet)
+ should_create_iteration_trace = (
+ not self._evaluation_uses_targets
+ and (in_thread or len(self._futures) == 0)
+ )
+
+ iteration: Optional[IterationInfo] = None
+ if should_create_iteration_trace:
+ iteration = IterationInfo(
  trace=langwatch.trace(
  name="evaluation.loop_iteration",
  metadata={
@@ -250,12 +325,9 @@ class Evaluation:
  duration=0,
  error=None,
  )
- if in_thread or len(self._futures) == 0
- else None
- )
-
- if iteration is not None:
  iteration["trace"].__enter__()
+ # Store for target() to potentially close early
+ self._active_iteration_trace = iteration["trace"]
 
  start_time = time.time()
  try:
@@ -265,8 +337,13 @@ class Evaluation:
  iteration["error"] = e
  print(f"\n[Evaluation Error] index={index}")
  traceback.print_exc()
+ finally:
+ # Reset iteration context
+ _iteration_context.reset(iter_token)
 
- if iteration is not None:
+ # Handle iteration trace cleanup
+ # Note: If target() was used, it may have already closed the trace
+ if iteration is not None and not self._evaluation_uses_targets:
  try:
  iteration["duration"] = int((time.time() - start_time) * 1000)
 
@@ -274,7 +351,9 @@ class Evaluation:
  # from being added to the batch and change the trace name
  if not in_thread and len(self._futures) > 0:
  iteration["trace"].update(name="evaluation.loop")
- else:
+ # Only add row-level entry if with_target was NOT used
+ # When with_target is used, it creates per-target dataset entries instead
+ elif not self._current_iteration_used_with_target:
  self._add_to_batch(iteration)
 
  if iteration["error"] is not None:
@@ -284,6 +363,9 @@ class Evaluation:
  finally:
  iteration["trace"].__exit__(None, None, None)
 
+ # Clear active iteration trace reference
+ self._active_iteration_trace = None
+
  def _add_to_batch(self, iteration: IterationInfo):
  entry: Any = (
  iteration["item"].to_dict()
@@ -327,6 +409,7 @@ class Evaluation:
  if (
  len(self.batch["dataset"]) == 0
  and len(self.batch["evaluations"]) == 0
+ and len(self.batch["targets"]) == 0
  and not finished
  ):
  return
@@ -340,7 +423,13 @@ class Evaluation:
  del eval_["data"]
  evaluations.append(eval_)
 
- body = {
+ # Build targets array for API
+ targets = [
+ target.model_dump(exclude_none=True, exclude_unset=True)
+ for target in self.batch["targets"]
+ ]
+
+ body: Dict[str, Any] = {
  "experiment_slug": self.experiment_slug,
  "name": f"{self.name}",
  "run_id": self.run_id,
@@ -356,6 +445,10 @@ class Evaluation:
  },
  }
 
+ # Only include targets if we have any
+ if len(targets) > 0:
+ body["targets"] = targets
+
  if finished:
  if not isinstance(body["timestamps"], dict):
  body["timestamps"] = {}
@@ -370,7 +463,7 @@ class Evaluation:
  self.threads.append(thread)
 
  # Clear the batch and update the last sent time
- self.batch = {"dataset": [], "evaluations": []}
+ self.batch = {"dataset": [], "evaluations": [], "targets": []}
  self.last_sent = time.time()
 
  @classmethod
@@ -402,6 +495,261 @@ class Evaluation:
 
  asyncio.run(wait_for_completion(self))
 
+ def _register_target(
+ self,
+ target: str,
+ metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+ ) -> str:
+ """
+ Register a target with its metadata. Returns the target ID.
+
+ If the target was already registered:
+ - If no new metadata is provided, the existing target is used
+ - If new metadata is provided and differs from existing, raises an error
+
+ Args:
+ target: The target name/ID
+ metadata: Optional metadata for this target (model, temperature, etc.)
+
+ Returns:
+ The target ID
+ """
+ with self.lock:
+ if target in self._targets:
+ existing = self._targets[target]
+ if metadata is not None:
+ # Check if metadata matches
+ existing_meta = existing.metadata or {}
+ if existing_meta != metadata:
+ raise ValueError(
+ f"Target '{target}' was previously registered with different metadata.\n"
+ f"Original: {existing_meta}\n"
+ f"New: {metadata}\n"
+ f"If you want to use different metadata, please use a different target name."
+ )
+ return target
+
+ # Register new target
+ target_info = TargetInfo(
+ id=target,
+ name=target,
+ type="custom",
+ metadata=metadata,
+ )
+ self._targets[target] = target_info
+ self.batch["targets"].append(target_info)
+ return target
+
+ @contextmanager
+ def target(
+ self,
+ name: str,
+ metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+ ) -> Iterator[None]:
+ """
+ Context manager for executing code within a target context.
+
+ Creates a dataset entry for this specific target execution, capturing
+ duration automatically. This enables proper per-target latency tracking
+ when comparing multiple models/configurations.
+
+ Each target() call creates its own independent trace, allowing you to
+ view execution details separately for each model/configuration.
+
+ Inside this context, log() calls will automatically use this target
+ unless an explicit target is provided.
+
+ Args:
+ name: Unique identifier for the target
+ metadata: Optional metadata for comparison (e.g., {"model": "gpt-4"})
+
+ Example:
+ ```python
+ for index, row in evaluation.loop(df.iterrows()):
+ def task(index, row):
+ # Compare GPT-4 and Claude
+ with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+ response = call_gpt4(row["question"])
+ # target auto-inferred, use data= to record output
+ evaluation.log("quality", index=index, score=0.95,
+ data={"output": response})
+
+ with evaluation.target("claude", {"model": "anthropic/claude"}):
+ response = call_claude(row["question"])
+ evaluation.log("quality", index=index, score=0.85,
+ data={"output": response})
+
+ evaluation.submit(task, index, row)
+ ```
+ """
+ # On FIRST target() call ever in this evaluation:
+ # - Set flag to skip creating iteration-level traces going forward
+ # - Close the active iteration trace if any (it won't have useful content)
+ if not self._evaluation_uses_targets:
+ self._evaluation_uses_targets = True
+ # Close the active iteration trace early
+ if self._active_iteration_trace is not None:
+ self._active_iteration_trace.__exit__(None, None, None)
+ self._active_iteration_trace = None
+
+ # Mark that target() was used in this iteration (for dataset entry logic)
+ self._current_iteration_used_with_target = True
+
+ # Register target
+ self._register_target(name, metadata)
+
+ # Get index and item from iteration context (thread-safe via contextvars)
+ # This prevents race conditions when multiple threads are running evaluations
+ iter_ctx = _iteration_context.get()
+ if iter_ctx is not None:
+ index = iter_ctx.index
+ current_item = iter_ctx.item
+ else:
+ # Fallback to instance variables (for backwards compatibility / direct usage)
+ index = self._current_index
+ current_item = self._current_item
+
+ target_trace: Optional[LangWatchTrace] = None
+ start_time = time.time()
+ error_occurred: Optional[Exception] = None
+ trace_id = ""
+
+ # Set up context for log() inference
+ ctx = TargetContext(
+ target_id=name,
+ index=index,
+ trace_id="", # Will be set after entering trace
+ )
+ target_context_token = _target_context.set(ctx)
+
+ try:
+ # Create an INDEPENDENT root trace for this target
+ # We use a new tracer without any parent context to get a unique trace_id
+ # The key is using the tracer directly with context=None to prevent
+ # parent context inheritance
+ from opentelemetry.sdk.trace import TracerProvider
+ from opentelemetry.trace import INVALID_SPAN_CONTEXT
+
+ tracer = trace.get_tracer("langwatch-evaluation")
+
+ # Start a new root span with no parent by passing an empty context
+ # This ensures each target gets a unique trace_id
+ root_context = otel_context.Context()
+
+ with tracer.start_as_current_span(
+ f"evaluation.target.{name}",
+ context=root_context,
+ attributes={
+ "evaluation.run_id": self.run_id,
+ "evaluation.index": index,
+ "evaluation.target": name,
+ },
+ ) as span:
+ span_context = span.get_span_context()
+ trace_id = format(span_context.trace_id, "032x")
+ ctx.trace_id = trace_id
+
+ try:
+ yield
+ except Exception as e:
+ error_occurred = e
+ raise
+
+ except Exception as e:
+ if error_occurred is None:
+ error_occurred = e
+ raise
+ finally:
+ duration_ms = int((time.time() - start_time) * 1000)
+
+ # Create dataset entry for this target
+ # Use the captured current_item, NOT self._current_item (which may have changed)
+ entry_data: Any = (
+ current_item.to_dict()
+ if hasattr(current_item, "to_dict")
+ else (
+ current_item.__dict__
+ if hasattr(current_item, "__dict__")
+ else (
+ current_item[1].to_dict()
+ if type(current_item) == tuple
+ and hasattr(current_item[1], "to_dict")
+ else (
+ current_item[1].__dict__
+ if type(current_item) == tuple
+ and hasattr(current_item[1], "__dict__")
+ else {
+ "entry": json.dumps(
+ current_item, cls=SerializableWithStringFallback
+ )
+ }
+ )
+ )
+ )
+ )
+
+ # Get predicted output from context (set via log_response())
+ predicted = ctx.predicted
+
+ batch_entry = BatchEntry(
+ index=index,
+ entry=entry_data,
+ duration=duration_ms,
+ error=str(error_occurred) if error_occurred else None,
+ trace_id=trace_id,
+ target_id=name,
+ predicted=predicted,
+ )
+
+ with self.lock:
+ self.batch["dataset"].append(batch_entry)
+
+ # Reset target context
+ _target_context.reset(target_context_token)
+
+ # Schedule send
+ if time.time() - self.last_sent >= self.debounce_interval:
+ self._send_batch()
+
+ def log_response(self, response: Union[str, Dict[str, Any]]) -> None:
+ """
+ Log the model's response/output for the current target.
+
+ Must be called inside a `target()` context. The response will be stored
+ in the dataset entry's `predicted` field, which is displayed in the
+ results table.
+
+ Args:
+ response: The model's output. Can be a string (will be wrapped as
+ {"output": response}) or a dict with named outputs.
+
+ Example:
+ ```python
+ with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+ response = call_gpt4(row["question"])
+ evaluation.log_response(response) # Store the output
+ evaluation.log("quality", index=index, score=0.95) # Log metrics
+ ```
+
+ Raises:
+ RuntimeError: If called outside of a target() context.
+ """
+ ctx = _target_context.get()
+ if ctx is None:
+ raise RuntimeError(
+ "log_response() must be called inside a target() context. "
+ "Example: with evaluation.target('my-target'): evaluation.log_response(response)"
+ )
+
+ # Normalize response to dict format
+ if isinstance(response, str):
+ ctx.predicted = {"output": response}
+ elif isinstance(response, dict):
+ ctx.predicted = response
+ else:
+ # Try to convert to string for other types
+ ctx.predicted = {"output": str(response)}
+
  def log(
  self,
  metric: str,
@@ -415,17 +763,57 @@ class Evaluation:
  duration: Optional[int] = None,
  cost: Optional[Money] = None,
  error: Optional[Exception] = None,
+ target: Optional[str] = None,
+ metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
  ):
+ """
+ Log an evaluation metric result.
+
+ Args:
+ metric: Name of the metric being logged
+ index: Row index in the dataset (must be an integer)
+ data: Additional data/inputs for the evaluation
+ score: Numeric score (0-1 typically)
+ passed: Whether the evaluation passed
+ label: Label/category for the result
+ details: Human-readable description of the result
+ status: Status of the evaluation ("processed", "error", "skipped")
+ duration: Duration in milliseconds
+ cost: Cost of the evaluation
+ error: Exception if an error occurred
+ target: Optional target name for multi-target comparisons.
+ First call with a target name registers it with the provided metadata.
+ Subsequent calls with the same target can omit metadata.
+ If called inside with_target(), the target is auto-inferred from context.
+ metadata: Optional metadata for the target (model, temperature, etc.).
+ Only used on the first call for each target.
+ Raises error if conflicting metadata is provided for same target.
+ """
  try:
  index_ = int(cast(Any, index))
  except Exception:
  raise ValueError(f"Index must be an integer, got {index}")
 
+ # Get target context (if inside with_target)
+ ctx = _target_context.get()
+
+ # Use context target if not explicitly provided
+ effective_target = target if target is not None else (ctx.target_id if ctx else None)
+
+ # Register target if provided (explicit or from context)
+ target_id: Optional[str] = None
+ if effective_target is not None:
+ target_id = self._register_target(effective_target, metadata)
+
+ # Use trace_id from context if available
+ trace_id = (
+ ctx.trace_id
+ if ctx
+ else format(trace.get_current_span().get_span_context().trace_id, "x")
+ )
+
  eval = EvaluationResult(
- trace_id=format(
- trace.get_current_span().get_span_context().trace_id,
- "x",
- ),
+ trace_id=trace_id,
  name=metric,
  evaluator=metric,
  status=status if status else "error" if error else "processed",
@@ -443,6 +831,7 @@ class Evaluation:
  if error
  else None
  ),
+ target_id=target_id,
  )
 
  with self.lock:
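Pulling the `target()`, `log_response()`, and `log()` docstrings above together, a hedged end-to-end sketch of the new multi-target comparison flow (assumes `LANGWATCH_API_KEY` is configured; the model calls are stubs, and the `loop`/`submit` usage follows the example embedded in the `target()` docstring):

```python
import langwatch
import pandas as pd

def call_gpt4(question: str) -> str:
    return f"gpt-4 answer to: {question}"   # stub standing in for a real model call

def call_claude(question: str) -> str:
    return f"claude answer to: {question}"  # stub standing in for a real model call

df = pd.DataFrame({"question": ["What is LangWatch?"]})
evaluation = langwatch.evaluation.init("multi-target-comparison")

for index, row in evaluation.loop(df.iterrows()):
    def task(index, row):
        # Each target() block gets its own trace and per-target dataset entry.
        with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
            response = call_gpt4(row["question"])
            evaluation.log_response(response)  # stored as the target's `predicted` output
            evaluation.log("quality", index=index, score=0.95)

        with evaluation.target("claude", {"model": "anthropic/claude"}):
            response = call_claude(row["question"])
            evaluation.log_response(response)
            evaluation.log("quality", index=index, score=0.85)

    evaluation.submit(task, index, row)
```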
langwatch/evaluation/platform_run.py ADDED
@@ -0,0 +1,462 @@
+ """
+ Runner for platform-configured evaluations (Evaluations V3).
+
+ This module provides the `run()` function to execute evaluations that are
+ configured in the LangWatch platform from CI/CD pipelines or scripts.
+ """
+
+ from dataclasses import dataclass, field
+ from typing import Callable, List, Literal, Optional
+ from urllib.parse import urlparse, urlunparse
+ import sys
+ import time
+ import httpx
+
+ import langwatch
+ from langwatch.state import get_api_key, get_endpoint
+
+
+ def _replace_url_domain(url: str, new_base: str) -> str:
+ """Replace the domain/scheme of a URL with a new base URL, preserving the path."""
+ if not url:
+ return url
+
+ parsed_url = urlparse(url)
+ parsed_new_base = urlparse(new_base)
+
+ # Replace scheme and netloc with new base, keep path/query/fragment
+ return urlunparse((
+ parsed_new_base.scheme,
+ parsed_new_base.netloc,
+ parsed_url.path,
+ parsed_url.params,
+ parsed_url.query,
+ parsed_url.fragment,
+ ))
+
+
+ class EvaluationNotFoundError(Exception):
+ """Raised when evaluation slug doesn't exist."""
+
+ def __init__(self, slug: str):
+ self.slug = slug
+ super().__init__(f"Evaluation not found: {slug}")
+
+
+ class EvaluationTimeoutError(Exception):
+ """Raised when evaluation run times out."""
+
+ def __init__(self, run_id: str, progress: int, total: int):
+ self.run_id = run_id
+ self.progress = progress
+ self.total = total
+ super().__init__(
+ f"Evaluation run timed out: {run_id} ({progress}/{total} completed)"
+ )
+
+
+ class EvaluationRunFailedError(Exception):
+ """Raised when evaluation run fails."""
+
+ def __init__(self, run_id: str, error: str):
+ self.run_id = run_id
+ self.error_message = error
+ super().__init__(f"Evaluation run failed: {error}")
+
+
+ class EvaluationsApiError(Exception):
+ """Raised for other API errors."""
+
+ def __init__(self, message: str, status_code: int):
+ self.status_code = status_code
+ super().__init__(message)
+
+
+ @dataclass
+ class TargetStats:
+ """Statistics for a single target."""
+
+ target_id: str
+ name: str
+ passed: int
+ failed: int
+ avg_latency: float
+ total_cost: float
+
+
+ @dataclass
+ class EvaluatorStats:
+ """Statistics for a single evaluator."""
+
+ evaluator_id: str
+ name: str
+ passed: int
+ failed: int
+ pass_rate: float
+ avg_score: Optional[float] = None
+
+
+ @dataclass
+ class EvaluationRunSummary:
+ """Summary of a completed evaluation run."""
+
+ run_id: str
+ total_cells: int
+ completed_cells: int
+ failed_cells: int
+ duration: int
+ run_url: str = ""
+ targets: List[TargetStats] = field(default_factory=list)
+ evaluators: List[EvaluatorStats] = field(default_factory=list)
+ total_passed: int = 0
+ total_failed: int = 0
+ pass_rate: float = 0.0
+ total_cost: float = 0.0
+
+
+ @dataclass
+ class EvaluationRunResult:
+ """Result of running a platform evaluation."""
+
+ run_id: str
+ status: Literal["completed", "failed", "stopped"]
+ passed: int
+ failed: int
+ pass_rate: float
+ duration: int
+ run_url: str
+ summary: EvaluationRunSummary
+
+ def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
+ """
+ Print a CI-friendly summary and optionally exit with code 1 on failure.
+
+ Args:
+ exit_on_failure: If True, calls sys.exit(1) when there are failures.
+ If False, never exits.
+ If None (default), auto-detects: exits in scripts/CI, doesn't exit in notebooks.
+ """
+ _print_summary(self)
+
+ # Auto-detect: don't exit in notebooks, exit in scripts/CI
+ should_exit = exit_on_failure if exit_on_failure is not None else not _is_notebook()
+
+ if should_exit and self.failed > 0:
+ sys.exit(1)
+
+
+ def _is_notebook() -> bool:
+ """Detect if running in a Jupyter notebook."""
+ try:
+ from IPython import get_ipython # type: ignore
+
+ shell = get_ipython().__class__.__name__
+ if shell == "ZMQInteractiveShell":
+ return True # Jupyter notebook or qtconsole
+ elif shell == "TerminalInteractiveShell":
+ return False # Terminal running IPython
+ else:
+ return False
+ except (ImportError, AttributeError, NameError):
+ return False
+
+
+ def evaluate(
+ slug: str,
+ *,
+ poll_interval: float = 2.0,
+ timeout: float = 600.0,
+ on_progress: Optional[Callable[[int, int], None]] = None,
+ api_key: Optional[str] = None,
+ ) -> EvaluationRunResult:
+ """
+ Run a platform-configured evaluation and wait for completion.
+
+ This runs an Evaluation that you have configured in the LangWatch platform.
+ The evaluation will execute all targets and evaluators defined in the configuration.
+
+ Args:
+ slug: The slug of the evaluation to run (found in the evaluation URL)
+ poll_interval: Seconds between status checks (default: 2.0)
+ timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
+ on_progress: Optional callback for progress updates (completed, total)
+ api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)
+
+ Returns:
+ EvaluationRunResult with pass rate and summary. Call result.print_summary()
+ to display results and exit with code 1 on failure.
+
+ Raises:
+ EvaluationNotFoundError: If the evaluation slug doesn't exist
+ EvaluationTimeoutError: If the evaluation doesn't complete within timeout
+ EvaluationRunFailedError: If the evaluation fails
+ EvaluationsApiError: For other API errors
+
+ Example:
+ ```python
+ import langwatch
+
+ result = langwatch.evaluation.evaluate("my-evaluation-slug")
+ result.print_summary()
+ ```
+ """
+ langwatch.ensure_setup()
+
+ effective_api_key = api_key or get_api_key()
+ endpoint = get_endpoint()
+
+ if not effective_api_key:
+ raise ValueError(
+ "API key not set. Set LANGWATCH_API_KEY environment variable or pass api_key parameter."
+ )
+
+ # Start the run
+ start_response = _start_run(slug, endpoint, effective_api_key)
+ run_id = start_response["runId"]
+ total = start_response.get("total", 0)
+
+ # Use the run URL from API but replace domain with configured endpoint
+ api_run_url = start_response.get("runUrl", "")
+ run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""
+
+ print(f"Started evaluation run: {run_id}")
+ if run_url:
+ print(f"Follow live: {run_url}")
+
+ # Track last progress for change detection
+ last_progress = 0
+
+ # Print initial progress
+ if total > 0:
+ print(f"Progress: 0/{total} (0%)", end="", flush=True)
+ if on_progress:
+ on_progress(0, total)
+
+ # Poll until complete
+ start_time = time.time()
+ while True:
+ if time.time() - start_time > timeout:
+ print() # Newline after progress
+ status = _get_run_status(run_id, endpoint, effective_api_key)
+ raise EvaluationTimeoutError(
+ run_id, status.get("progress", 0), status.get("total", 0)
+ )
+
+ time.sleep(poll_interval)
+
+ status = _get_run_status(run_id, endpoint, effective_api_key)
+ progress = status.get("progress", 0)
+ total = status.get("total", total)
+
+ # Update progress display if changed
+ if progress != last_progress and total > 0:
+ percentage = (progress / total) * 100
+ # Use carriage return to overwrite the line
+ print(f"\rProgress: {progress}/{total} ({percentage:.0f}%)", end="", flush=True)
+ last_progress = progress
+
+ if on_progress:
+ on_progress(progress, total)
+
+ run_status = status.get("status")
+
+ if run_status == "completed":
+ print() # Newline after progress
+ summary_data = status.get("summary", {})
+ return _build_result(run_id, "completed", summary_data, run_url)
+
+ if run_status == "failed":
+ print() # Newline after progress
+ raise EvaluationRunFailedError(
+ run_id, status.get("error", "Unknown error")
+ )
+
+ if run_status == "stopped":
+ print() # Newline after progress
+ summary_data = status.get("summary", {})
+ return _build_result(run_id, "stopped", summary_data, run_url)
+
+
+ def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
+ """Start an evaluation run."""
+ with httpx.Client(timeout=60) as client:
+ response = client.post(
+ f"{endpoint}/api/evaluations/v3/{slug}/run",
+ headers={"X-Auth-Token": api_key},
+ )
+
+ if response.status_code == 404:
+ raise EvaluationNotFoundError(slug)
+ if response.status_code == 401:
+ raise EvaluationsApiError("Unauthorized - check your API key", 401)
+ if not response.is_success:
+ error_body = response.json() if response.content else {}
+ raise EvaluationsApiError(
+ error_body.get("error", f"Failed to start evaluation: {response.status_code}"),
+ response.status_code,
+ )
+
+ return response.json()
+
+
+ def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
+ """Get the status of a run."""
+ with httpx.Client(timeout=60) as client:
+ response = client.get(
+ f"{endpoint}/api/evaluations/v3/runs/{run_id}",
+ headers={"X-Auth-Token": api_key},
+ )
+
+ if response.status_code == 404:
+ raise EvaluationsApiError(f"Run not found: {run_id}", 404)
+ if response.status_code == 401:
+ raise EvaluationsApiError("Unauthorized - check your API key", 401)
+ if not response.is_success:
+ error_body = response.json() if response.content else {}
+ raise EvaluationsApiError(
+ error_body.get("error", f"Failed to get run status: {response.status_code}"),
+ response.status_code,
+ )
+
+ return response.json()
+
+
+ def _build_result(
+ run_id: str,
+ status: Literal["completed", "failed", "stopped"],
+ summary_data: dict,
+ run_url: str,
+ ) -> EvaluationRunResult:
+ """Build the result object from API response."""
+ total_cells = summary_data.get("totalCells", 0)
+ completed_cells = summary_data.get("completedCells", 0)
+ failed_cells = summary_data.get("failedCells", 0)
+ duration = summary_data.get("duration", 0)
+
+ total_passed = summary_data.get("totalPassed", completed_cells - failed_cells)
+ total_failed = summary_data.get("totalFailed", failed_cells)
+ pass_rate = summary_data.get(
+ "passRate",
+ (total_passed / completed_cells * 100) if completed_cells > 0 else 0.0,
+ )
+
+ # Parse targets
+ targets: List[TargetStats] = []
+ for t in summary_data.get("targets", []):
+ targets.append(
+ TargetStats(
+ target_id=t.get("targetId", ""),
+ name=t.get("name", ""),
+ passed=t.get("passed", 0),
+ failed=t.get("failed", 0),
+ avg_latency=t.get("avgLatency", 0),
+ total_cost=t.get("totalCost", 0),
+ )
+ )
+
+ # Parse evaluators
+ evaluators: List[EvaluatorStats] = []
+ for e in summary_data.get("evaluators", []):
+ evaluators.append(
+ EvaluatorStats(
+ evaluator_id=e.get("evaluatorId", ""),
+ name=e.get("name", ""),
+ passed=e.get("passed", 0),
+ failed=e.get("failed", 0),
+ pass_rate=e.get("passRate", 0),
+ avg_score=e.get("avgScore"),
+ )
+ )
+
+ summary = EvaluationRunSummary(
+ run_id=run_id,
+ total_cells=total_cells,
+ completed_cells=completed_cells,
+ failed_cells=failed_cells,
+ duration=duration,
+ run_url=run_url, # Always use the endpoint-based URL we constructed
+ targets=targets,
+ evaluators=evaluators,
+ total_passed=total_passed,
+ total_failed=total_failed,
+ pass_rate=pass_rate,
+ total_cost=summary_data.get("totalCost", 0),
+ )
+
+ return EvaluationRunResult(
+ run_id=run_id,
+ status=status,
+ passed=total_passed,
+ failed=total_failed,
+ pass_rate=pass_rate,
+ duration=duration,
+ run_url=summary.run_url,
+ summary=summary,
+ )
+
+
+ def _print_summary(result: EvaluationRunResult) -> None:
+ """Print a CI-friendly summary of the evaluation results."""
+ summary = result.summary
+
+ print("\n" + "═" * 60)
+ print(" EVALUATION RESULTS")
+ print("═" * 60)
+ print(f" Run ID: {result.run_id}")
+ print(f" Status: {result.status.upper()}")
+ print(f" Duration: {result.duration / 1000:.1f}s")
+ print("─" * 60)
+ print(f" Passed: {result.passed}")
+ print(f" Failed: {result.failed}")
+ print(f" Pass Rate: {result.pass_rate:.1f}%")
+
+ if summary.targets:
+ print("─" * 60)
+ print(" TARGETS:")
+ for target in summary.targets:
+ print(f" {target.name}: {target.passed} passed, {target.failed} failed")
+ if target.avg_latency:
+ print(f" Avg latency: {target.avg_latency:.0f}ms")
+ if target.total_cost:
+ print(f" Total cost: ${target.total_cost:.4f}")
+
+ if summary.evaluators:
+ print("─" * 60)
+ print(" EVALUATORS:")
+ for evaluator in summary.evaluators:
+ print(f" {evaluator.name}: {evaluator.pass_rate:.1f}% pass rate")
+ if evaluator.avg_score is not None:
+ print(f" Avg score: {evaluator.avg_score:.2f}")
+
+ print("─" * 60)
+ print(f" View details: {result.run_url}")
+ print("═" * 60 + "\n")
+
+
+ def run(
+ slug: str,
+ *,
+ poll_interval: float = 2.0,
+ timeout: float = 600.0,
+ on_progress: Optional[Callable[[int, int], None]] = None,
+ api_key: Optional[str] = None,
+ ) -> EvaluationRunResult:
+ """
+ Deprecated: Use `evaluate()` instead.
+
+ Run a platform-configured evaluation and wait for completion.
+ """
+ import warnings
+
+ warnings.warn(
+ "langwatch.evaluation.run() is deprecated, use langwatch.evaluation.evaluate() instead",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return evaluate(
+ slug,
+ poll_interval=poll_interval,
+ timeout=timeout,
+ on_progress=on_progress,
+ api_key=api_key,
+ )
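For the new module as a whole, a hedged CI-style usage sketch based on the `evaluate()` docstring above (the slug and progress hook are illustrative; assumes `LANGWATCH_API_KEY` is set in the environment):

```python
import langwatch

def show_progress(completed: int, total: int) -> None:
    # Custom progress hook layered on top of the built-in console progress line.
    print(f"\n{completed}/{total} cells evaluated")

result = langwatch.evaluation.evaluate(
    "my-evaluation-slug",      # placeholder slug taken from the evaluation's URL
    poll_interval=5.0,
    timeout=1200.0,
    on_progress=show_progress,
)

# Prints the summary table and exits with code 1 on failures when run as a script/CI job.
result.print_summary(exit_on_failure=True)
```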
langwatch-0.8.1.dist-info/METADATA → langwatch-0.9.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langwatch
- Version: 0.8.1
+ Version: 0.9.0
  Summary: LangWatch Python SDK, for monitoring your LLMs
  Author-email: Langwatch Engineers <engineering@langwatch.ai>
  License: MIT
langwatch-0.8.1.dist-info/RECORD → langwatch-0.9.0.dist-info/RECORD
@@ -1,5 +1,5 @@
  langwatch/__init__.py,sha256=GMq4SV2Tz2i0JD05shqnw2lBW5cgMx4Zzo141hp106k,4266
- langwatch/__version__.py,sha256=l2r_v6gqH58S38dAeIr-BCiWrh25Ql4biGJMjTpZZ1o,91
+ langwatch/__version__.py,sha256=sympc_lD0EH1ffjgsP80P8i4Sqm2XBcIgblEeQTq6bs,91
  langwatch/attributes.py,sha256=nXdI_G85wQQCAdAcwjCiLYdEYj3wATmfgCmhlf6dVIk,3910
  langwatch/batch_evaluation.py,sha256=Y_S3teXpHV07U-vvJYyV1PB6d0CgyFM_rTzPp6GnEBo,16165
  langwatch/client.py,sha256=WTNcYSik7kZ2kH-qGDnhbMTosc8e_Xhab_lZlfh5TC8,25559
@@ -15,9 +15,10 @@ langwatch/tracer.py,sha256=t5FOdP1es9H_pPGqGUBLXCyEln0tTi4m4M9b6WxCrPU,975
  langwatch/types.py,sha256=h6r3tNTzWqENx-9j_JPmOMZfFoKq9SNpEtxpAACk2G0,3114
  langwatch/dataset/__init__.py,sha256=hZBcbjXuBO2qE5osJtd9wIE9f45F6-jpNTrne5nk4eE,2606
  langwatch/domain/__init__.py,sha256=gSCOV3WkRhp_--9D1vxw7BYpnMRbpGh-2NbsXd4KZC0,6074
- langwatch/dspy/__init__.py,sha256=F35iLwiznMJPgXLVYOvybjDWxdYlSN4vn3EzxC27Awc,34054
- langwatch/evaluation/__init__.py,sha256=Jy7PW5VQbMoDGdOLRlQmDEvo_9TDkBLmrLrfocxddLM,281
- langwatch/evaluation/evaluation.py,sha256=hmtY7rfgJm4TbTEMUP_x89B2L_Jyi7aNGhjNUxw1N4A,16112
+ langwatch/dspy/__init__.py,sha256=wp8AmobV8XGVWOI8MQFmXPHu-8Wq3wvjB6YiHQm9Fdg,33007
+ langwatch/evaluation/__init__.py,sha256=dctG-Ec0N_Or2Ta0XW6liYtdpMZN3ZtRXqUoeG5ksnk,870
+ langwatch/evaluation/evaluation.py,sha256=MqMiGlsPIS5zqN1wKfhEs6mIGLRwB452iqDTSQFbtYo,31735
+ langwatch/evaluation/platform_run.py,sha256=cwuRNtG99nhvqGL-YoOwdvEH3x-hDaVUzl7Vx9orjPo,14546
  langwatch/exporters/filterable_batch_span_exporter.py,sha256=MlhZjui56XD6p2sa8kEGyr-Hb3wqudknngmemnB4Twg,2142
  langwatch/generated/langwatch_rest_api_client/__init__.py,sha256=8r-9pAj7fK7vnVX3mT0y_zS4B9ZRqD6RZiBo5fPra60,156
  langwatch/generated/langwatch_rest_api_client/client.py,sha256=o_mdLqyBCQstu5tS1WZFwqIEbGwkvWQ7eQjuCJw_5VY,12419
@@ -415,6 +416,6 @@ langwatch/utils/initialization.py,sha256=1KoZmkHOvGEVF0j-4t4xRQdA_2C_SPiF7qFXqEG
  langwatch/utils/module.py,sha256=KLBNOK3mA9gCSifCcQX_lOtU48BJQDWvFKtF6NMvwVA,688
  langwatch/utils/transformation.py,sha256=76MGXyrYTxM0Yri36NJqLK-XxL4BBYdmKWAXXlw3D4Q,7690
  langwatch/utils/utils.py,sha256=ZCOSie4o9LdJ7odshNfCNjmgwgQ27ojc5ENqt1rXuSs,596
- langwatch-0.8.1.dist-info/METADATA,sha256=osaR4n3f3-Uo3PhYP_Dox70Dgs5fiCBnOEpu4LAhTVQ,13192
- langwatch-0.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- langwatch-0.8.1.dist-info/RECORD,,
+ langwatch-0.9.0.dist-info/METADATA,sha256=JtLLtVbyy0iau3ySelLpMO4RpjrQAEyhd72J9NkxHl8,13192
+ langwatch-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ langwatch-0.9.0.dist-info/RECORD,,