langwatch 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__version__.py +1 -1
- langwatch/dspy/__init__.py +4 -32
- langwatch/evaluation/__init__.py +28 -1
- langwatch/evaluation/evaluation.py +409 -20
- langwatch/evaluation/platform_run.py +462 -0
- {langwatch-0.8.1.dist-info → langwatch-0.9.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.1.dist-info → langwatch-0.9.0.dist-info}/RECORD +8 -7
- {langwatch-0.8.1.dist-info → langwatch-0.9.0.dist-info}/WHEEL +0 -0
langwatch/__version__.py
CHANGED
langwatch/dspy/__init__.py
CHANGED
@@ -737,10 +737,6 @@ class DSPyTracer:
        dspy.Module.__original_call__ = dspy.Module.__call__  # type: ignore
        dspy.Module.__call__ = self.patched_module_call()

-        if not hasattr(dspy.Predict, "__original_forward__"):
-            dspy.Predict.__original_forward__ = dspy.Predict.forward  # type: ignore
-            dspy.Predict.forward = self.patched_predict_forward()
-
        language_model_classes = dspy.LM.__subclasses__()
        for lm in language_model_classes:
            if not hasattr(lm, "__original_basic_request__") and hasattr(
@@ -776,7 +772,7 @@ class DSPyTracer:
    def patched_module_call(self):
        self_ = self

-        @langwatch.span(ignore_missing_trace_warning=True, type="module")
+        @langwatch.span(ignore_missing_trace_warning=True, type="module", capture_output=False)
        def __call__(self: dspy.Module, *args, **kwargs):
            span = self_.safe_get_current_span()
            signature = (
@@ -801,34 +797,10 @@ class DSPyTracer:

        return __call__

-    def patched_predict_forward(self):
-        self_ = self
-
-        @langwatch.span(ignore_missing_trace_warning=True, type="module")
-        def forward(self: dspy.Predict, **kwargs):
-            span = self_.safe_get_current_span()
-            signature = kwargs.get("signature", self.signature)
-
-            if span and signature and hasattr(signature, "__name__"):
-                span.update(name=f"{self.__class__.__name__}({signature.__name__})")
-            elif span:
-                span.update(name=f"{self.__class__.__name__}.forward")
-
-            prediction = self.__class__.__original_forward__(self, **kwargs)  # type: ignore
-
-            if span and isinstance(prediction, dspy.Prediction):
-                span.update(output=prediction._store)  # type: ignore
-            elif span:
-                span.update(output=prediction)  # type: ignore
-
-            return prediction
-
-        return forward
-
    def patched_language_model_call(self):
        self_ = self

-        @langwatch.span(ignore_missing_trace_warning=True, type="llm")
+        @langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
        def call(self: dspy.LM, prompt=None, messages=None, **kwargs):
            all_kwargs = self.kwargs | kwargs
            model = self.model
@@ -895,7 +867,7 @@ class DSPyTracer:
    def patched_legacy_language_model_request(self):
        self_ = self

-        @langwatch.span(ignore_missing_trace_warning=True, type="llm")
+        @langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
        def basic_request(self: dspy.LM, prompt, **kwargs):
            all_kwargs = self.kwargs | kwargs
            model = all_kwargs.get("model", None)
@@ -947,7 +919,7 @@ class DSPyTracer:
        ) is not getattr(dspy.Retrieve, "forward", None):
            return self.__class__.__original_forward__(self, *args, **kwargs)  # type: ignore

-        @langwatch.span(ignore_missing_trace_warning=True, type="rag")
+        @langwatch.span(ignore_missing_trace_warning=True, type="rag", capture_output=False)
        def forward(self, *args, **kwargs):
            result = self.__class__.__original_forward__(self, *args, **kwargs)  # type: ignore

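Net effect of the changes above: DSPy module, LLM, and RAG spans are now opened with `capture_output=False`, and `dspy.Predict.forward` is no longer patched separately. As a rough illustration of the decorator shape the tracer now applies (not taken from the diff; `my_dspy_step` is a hypothetical stand-in for a patched DSPy callable):

```python
import langwatch

# The span is created even without an enclosing trace, and its return value is
# NOT captured automatically; the tracer decides what to record via span.update().
@langwatch.span(ignore_missing_trace_warning=True, type="module", capture_output=False)
def my_dspy_step(question: str) -> str:
    return f"answer to {question}"
```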
langwatch/evaluation/__init__.py
CHANGED
@@ -1,9 +1,36 @@
from typing import Optional
from langwatch.evaluation.evaluation import Evaluation
-from .evaluation import
+from langwatch.evaluation.platform_run import (
+    evaluate,
+    run,  # Deprecated, kept for backwards compatibility
+    EvaluationRunResult,
+    EvaluationRunSummary,
+    EvaluationNotFoundError,
+    EvaluationTimeoutError,
+    EvaluationRunFailedError,
+    EvaluationsApiError,
+    TargetStats,
+    EvaluatorStats,
+)


def init(name: str, *, run_id: Optional[str] = None) -> Evaluation:
    evaluation = Evaluation(name, run_id=run_id)
    evaluation.init()
    return evaluation
+
+
+__all__ = [
+    "init",
+    "evaluate",
+    "run",  # Deprecated
+    "Evaluation",
+    "EvaluationRunResult",
+    "EvaluationRunSummary",
+    "EvaluationNotFoundError",
+    "EvaluationTimeoutError",
+    "EvaluationRunFailedError",
+    "EvaluationsApiError",
+    "TargetStats",
+    "EvaluatorStats",
+]
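With these re-exports, the platform runner and its result and error types become importable straight from `langwatch.evaluation`. A minimal sketch of the intended entry point, mirroring the docstring examples later in this diff ("my-evaluation-slug" is a placeholder and `LANGWATCH_API_KEY` is assumed to be set in the environment):

```python
import langwatch

# Run an evaluation configured in the LangWatch platform and report the outcome.
result = langwatch.evaluation.evaluate("my-evaluation-slug")
result.print_summary()  # outside notebooks this exits with code 1 when there are failures

# langwatch.evaluation.run(...) still works but now emits a DeprecationWarning.
```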
langwatch/evaluation/evaluation.py
CHANGED
@@ -1,13 +1,15 @@
from __future__ import annotations
import asyncio
from contextlib import contextmanager
+from contextvars import ContextVar
+from dataclasses import dataclass
import json
import threading
import time
import traceback
import httpx
import pandas as pd
-from opentelemetry import trace
+from opentelemetry import trace, context as otel_context
from opentelemetry.trace import Span
from pydantic import BaseModel, Field
from typing import (
@@ -43,6 +45,35 @@ from concurrent.futures import Future, ThreadPoolExecutor, as_completed

_tracer = trace.get_tracer(__name__)

+
+@dataclass
+class TargetContext:
+    """Context for the current target() execution."""
+
+    target_id: str
+    index: int
+    trace_id: str
+    predicted: Optional[Dict[str, Any]] = None  # Set via log_response()
+
+
+@dataclass
+class IterationContext:
+    """Context for the current iteration (index + item)."""
+
+    index: int
+    item: Any
+
+
+# ContextVar for target context isolation (works across threads)
+_target_context: ContextVar[Optional[TargetContext]] = ContextVar(
+    "_target_context", default=None
+)
+
+# ContextVar for iteration context (index + item) - thread-safe
+_iteration_context: ContextVar[Optional[IterationContext]] = ContextVar(
+    "_iteration_context", default=None
+)
+
ItemT = TypeVar("ItemT")


@@ -65,11 +96,24 @@ class EvaluationResult(BaseModel):
    traceback: Optional[List[str]] = Field(
        description="Traceback information for debugging", default=None
    )
+    target_id: Optional[str] = Field(
+        default=None, description="ID of the target this evaluation is for"
+    )
+
+
+class TargetInfo(BaseModel):
+    """Represents a registered target with its metadata."""
+
+    id: str
+    name: str
+    type: Literal["prompt", "agent", "custom"] = "custom"
+    metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None


class Batch(TypedDict):
    dataset: List[BatchEntry]
    evaluations: List[EvaluationResult]
+    targets: List[TargetInfo]


class BatchEntry(BaseModel):
@@ -78,6 +122,9 @@ class BatchEntry(BaseModel):
    duration: int
    error: Optional[str] = None
    trace_id: str
+    target_id: Optional[str] = None
+    cost: Optional[float] = None
+    predicted: Optional[Dict[str, Any]] = None


class IterationInfo(TypedDict):
@@ -105,12 +152,26 @@ class Evaluation:

        # Sending results
        self.lock = threading.Lock()
-        self.batch: Batch = {"dataset": [], "evaluations": []}
+        self.batch: Batch = {"dataset": [], "evaluations": [], "targets": []}
        self.last_sent = 0
        self.debounce_interval = 1  # 1 second
        self.threads: List[threading.Thread] = []
        self.initialized = False

+        # Target registry - tracks registered targets and their metadata
+        self._targets: Dict[str, TargetInfo] = {}
+
+        # Track whether with_target() was used in the current iteration
+        # If so, we don't create row-level dataset entries
+        self._current_iteration_used_with_target = False
+
+        # Track whether target() has EVER been used in this evaluation
+        # Once set to True, we stop creating iteration-level traces
+        self._evaluation_uses_targets: bool = False
+
+        # Store the active iteration trace so target() can close it early
+        self._active_iteration_trace: Optional[LangWatchTrace] = None
+
    def init(self):
        if not langwatch.get_api_key():
            raise ValueError(
@@ -233,11 +294,25 @@ class Evaluation:
        item: Any,
        in_thread: bool = False,
    ) -> Iterator[Any]:
-        #
-
-
-        iteration
-
+        # Reset with_target tracking for this iteration
+        self._current_iteration_used_with_target = False
+
+        # Set iteration context (thread-safe via contextvars)
+        # This allows target() to access index/item without race conditions
+        iter_ctx = IterationContext(index=index, item=item)
+        iter_token = _iteration_context.set(iter_ctx)
+
+        # Determine if we should create an iteration trace:
+        # - Don't create if evaluation uses targets (each target creates its own trace)
+        # - Don't create if we're collecting submit() calls (not in_thread yet)
+        should_create_iteration_trace = (
+            not self._evaluation_uses_targets
+            and (in_thread or len(self._futures) == 0)
+        )
+
+        iteration: Optional[IterationInfo] = None
+        if should_create_iteration_trace:
+            iteration = IterationInfo(
                trace=langwatch.trace(
                    name="evaluation.loop_iteration",
                    metadata={
@@ -250,12 +325,9 @@ class Evaluation:
                duration=0,
                error=None,
            )
-            if in_thread or len(self._futures) == 0
-            else None
-        )
-
-        if iteration is not None:
            iteration["trace"].__enter__()
+            # Store for target() to potentially close early
+            self._active_iteration_trace = iteration["trace"]

        start_time = time.time()
        try:
@@ -265,8 +337,13 @@ class Evaluation:
                iteration["error"] = e
            print(f"\n[Evaluation Error] index={index}")
            traceback.print_exc()
+        finally:
+            # Reset iteration context
+            _iteration_context.reset(iter_token)

-
+            # Handle iteration trace cleanup
+            # Note: If target() was used, it may have already closed the trace
+            if iteration is not None and not self._evaluation_uses_targets:
                try:
                    iteration["duration"] = int((time.time() - start_time) * 1000)

@@ -274,7 +351,9 @@ class Evaluation:
                    # from being added to the batch and change the trace name
                    if not in_thread and len(self._futures) > 0:
                        iteration["trace"].update(name="evaluation.loop")
-
+                    # Only add row-level entry if with_target was NOT used
+                    # When with_target is used, it creates per-target dataset entries instead
+                    elif not self._current_iteration_used_with_target:
                        self._add_to_batch(iteration)

                    if iteration["error"] is not None:
@@ -284,6 +363,9 @@ class Evaluation:
                finally:
                    iteration["trace"].__exit__(None, None, None)

+                    # Clear active iteration trace reference
+                    self._active_iteration_trace = None
+
    def _add_to_batch(self, iteration: IterationInfo):
        entry: Any = (
            iteration["item"].to_dict()
@@ -327,6 +409,7 @@ class Evaluation:
        if (
            len(self.batch["dataset"]) == 0
            and len(self.batch["evaluations"]) == 0
+            and len(self.batch["targets"]) == 0
            and not finished
        ):
            return
@@ -340,7 +423,13 @@ class Evaluation:
                del eval_["data"]
            evaluations.append(eval_)

-
+        # Build targets array for API
+        targets = [
+            target.model_dump(exclude_none=True, exclude_unset=True)
+            for target in self.batch["targets"]
+        ]
+
+        body: Dict[str, Any] = {
            "experiment_slug": self.experiment_slug,
            "name": f"{self.name}",
            "run_id": self.run_id,
@@ -356,6 +445,10 @@ class Evaluation:
            },
        }

+        # Only include targets if we have any
+        if len(targets) > 0:
+            body["targets"] = targets
+
        if finished:
            if not isinstance(body["timestamps"], dict):
                body["timestamps"] = {}
@@ -370,7 +463,7 @@ class Evaluation:
        self.threads.append(thread)

        # Clear the batch and update the last sent time
-        self.batch = {"dataset": [], "evaluations": []}
+        self.batch = {"dataset": [], "evaluations": [], "targets": []}
        self.last_sent = time.time()

    @classmethod
@@ -402,6 +495,261 @@ class Evaluation:

        asyncio.run(wait_for_completion(self))

+    def _register_target(
+        self,
+        target: str,
+        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+    ) -> str:
+        """
+        Register a target with its metadata. Returns the target ID.
+
+        If the target was already registered:
+        - If no new metadata is provided, the existing target is used
+        - If new metadata is provided and differs from existing, raises an error
+
+        Args:
+            target: The target name/ID
+            metadata: Optional metadata for this target (model, temperature, etc.)
+
+        Returns:
+            The target ID
+        """
+        with self.lock:
+            if target in self._targets:
+                existing = self._targets[target]
+                if metadata is not None:
+                    # Check if metadata matches
+                    existing_meta = existing.metadata or {}
+                    if existing_meta != metadata:
+                        raise ValueError(
+                            f"Target '{target}' was previously registered with different metadata.\n"
+                            f"Original: {existing_meta}\n"
+                            f"New: {metadata}\n"
+                            f"If you want to use different metadata, please use a different target name."
+                        )
+                return target
+
+            # Register new target
+            target_info = TargetInfo(
+                id=target,
+                name=target,
+                type="custom",
+                metadata=metadata,
+            )
+            self._targets[target] = target_info
+            self.batch["targets"].append(target_info)
+            return target
+
+    @contextmanager
+    def target(
+        self,
+        name: str,
+        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
+    ) -> Iterator[None]:
+        """
+        Context manager for executing code within a target context.
+
+        Creates a dataset entry for this specific target execution, capturing
+        duration automatically. This enables proper per-target latency tracking
+        when comparing multiple models/configurations.
+
+        Each target() call creates its own independent trace, allowing you to
+        view execution details separately for each model/configuration.
+
+        Inside this context, log() calls will automatically use this target
+        unless an explicit target is provided.
+
+        Args:
+            name: Unique identifier for the target
+            metadata: Optional metadata for comparison (e.g., {"model": "gpt-4"})
+
+        Example:
+            ```python
+            for index, row in evaluation.loop(df.iterrows()):
+                def task(index, row):
+                    # Compare GPT-4 and Claude
+                    with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+                        response = call_gpt4(row["question"])
+                        # target auto-inferred, use data= to record output
+                        evaluation.log("quality", index=index, score=0.95,
+                                       data={"output": response})
+
+                    with evaluation.target("claude", {"model": "anthropic/claude"}):
+                        response = call_claude(row["question"])
+                        evaluation.log("quality", index=index, score=0.85,
+                                       data={"output": response})
+
+                evaluation.submit(task, index, row)
+            ```
+        """
+        # On FIRST target() call ever in this evaluation:
+        # - Set flag to skip creating iteration-level traces going forward
+        # - Close the active iteration trace if any (it won't have useful content)
+        if not self._evaluation_uses_targets:
+            self._evaluation_uses_targets = True
+            # Close the active iteration trace early
+            if self._active_iteration_trace is not None:
+                self._active_iteration_trace.__exit__(None, None, None)
+                self._active_iteration_trace = None
+
+        # Mark that target() was used in this iteration (for dataset entry logic)
+        self._current_iteration_used_with_target = True
+
+        # Register target
+        self._register_target(name, metadata)
+
+        # Get index and item from iteration context (thread-safe via contextvars)
+        # This prevents race conditions when multiple threads are running evaluations
+        iter_ctx = _iteration_context.get()
+        if iter_ctx is not None:
+            index = iter_ctx.index
+            current_item = iter_ctx.item
+        else:
+            # Fallback to instance variables (for backwards compatibility / direct usage)
+            index = self._current_index
+            current_item = self._current_item
+
+        target_trace: Optional[LangWatchTrace] = None
+        start_time = time.time()
+        error_occurred: Optional[Exception] = None
+        trace_id = ""
+
+        # Set up context for log() inference
+        ctx = TargetContext(
+            target_id=name,
+            index=index,
+            trace_id="",  # Will be set after entering trace
+        )
+        target_context_token = _target_context.set(ctx)
+
+        try:
+            # Create an INDEPENDENT root trace for this target
+            # We use a new tracer without any parent context to get a unique trace_id
+            # The key is using the tracer directly with context=None to prevent
+            # parent context inheritance
+            from opentelemetry.sdk.trace import TracerProvider
+            from opentelemetry.trace import INVALID_SPAN_CONTEXT
+
+            tracer = trace.get_tracer("langwatch-evaluation")
+
+            # Start a new root span with no parent by passing an empty context
+            # This ensures each target gets a unique trace_id
+            root_context = otel_context.Context()
+
+            with tracer.start_as_current_span(
+                f"evaluation.target.{name}",
+                context=root_context,
+                attributes={
+                    "evaluation.run_id": self.run_id,
+                    "evaluation.index": index,
+                    "evaluation.target": name,
+                },
+            ) as span:
+                span_context = span.get_span_context()
+                trace_id = format(span_context.trace_id, "032x")
+                ctx.trace_id = trace_id
+
+                try:
+                    yield
+                except Exception as e:
+                    error_occurred = e
+                    raise
+
+        except Exception as e:
+            if error_occurred is None:
+                error_occurred = e
+            raise
+        finally:
+            duration_ms = int((time.time() - start_time) * 1000)
+
+            # Create dataset entry for this target
+            # Use the captured current_item, NOT self._current_item (which may have changed)
+            entry_data: Any = (
+                current_item.to_dict()
+                if hasattr(current_item, "to_dict")
+                else (
+                    current_item.__dict__
+                    if hasattr(current_item, "__dict__")
+                    else (
+                        current_item[1].to_dict()
+                        if type(current_item) == tuple
+                        and hasattr(current_item[1], "to_dict")
+                        else (
+                            current_item[1].__dict__
+                            if type(current_item) == tuple
+                            and hasattr(current_item[1], "__dict__")
+                            else {
+                                "entry": json.dumps(
+                                    current_item, cls=SerializableWithStringFallback
+                                )
+                            }
+                        )
+                    )
+                )
+            )
+
+            # Get predicted output from context (set via log_response())
+            predicted = ctx.predicted
+
+            batch_entry = BatchEntry(
+                index=index,
+                entry=entry_data,
+                duration=duration_ms,
+                error=str(error_occurred) if error_occurred else None,
+                trace_id=trace_id,
+                target_id=name,
+                predicted=predicted,
+            )
+
+            with self.lock:
+                self.batch["dataset"].append(batch_entry)
+
+            # Reset target context
+            _target_context.reset(target_context_token)
+
+            # Schedule send
+            if time.time() - self.last_sent >= self.debounce_interval:
+                self._send_batch()
+
+    def log_response(self, response: Union[str, Dict[str, Any]]) -> None:
+        """
+        Log the model's response/output for the current target.
+
+        Must be called inside a `target()` context. The response will be stored
+        in the dataset entry's `predicted` field, which is displayed in the
+        results table.
+
+        Args:
+            response: The model's output. Can be a string (will be wrapped as
+                {"output": response}) or a dict with named outputs.
+
+        Example:
+            ```python
+            with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
+                response = call_gpt4(row["question"])
+                evaluation.log_response(response)  # Store the output
+                evaluation.log("quality", index=index, score=0.95)  # Log metrics
+            ```
+
+        Raises:
+            RuntimeError: If called outside of a target() context.
+        """
+        ctx = _target_context.get()
+        if ctx is None:
+            raise RuntimeError(
+                "log_response() must be called inside a target() context. "
+                "Example: with evaluation.target('my-target'): evaluation.log_response(response)"
+            )
+
+        # Normalize response to dict format
+        if isinstance(response, str):
+            ctx.predicted = {"output": response}
+        elif isinstance(response, dict):
+            ctx.predicted = response
+        else:
+            # Try to convert to string for other types
+            ctx.predicted = {"output": str(response)}
+
    def log(
        self,
        metric: str,
@@ -415,17 +763,57 @@ class Evaluation:
        duration: Optional[int] = None,
        cost: Optional[Money] = None,
        error: Optional[Exception] = None,
+        target: Optional[str] = None,
+        metadata: Optional[Dict[str, Union[str, int, float, bool]]] = None,
    ):
+        """
+        Log an evaluation metric result.
+
+        Args:
+            metric: Name of the metric being logged
+            index: Row index in the dataset (must be an integer)
+            data: Additional data/inputs for the evaluation
+            score: Numeric score (0-1 typically)
+            passed: Whether the evaluation passed
+            label: Label/category for the result
+            details: Human-readable description of the result
+            status: Status of the evaluation ("processed", "error", "skipped")
+            duration: Duration in milliseconds
+            cost: Cost of the evaluation
+            error: Exception if an error occurred
+            target: Optional target name for multi-target comparisons.
+                First call with a target name registers it with the provided metadata.
+                Subsequent calls with the same target can omit metadata.
+                If called inside with_target(), the target is auto-inferred from context.
+            metadata: Optional metadata for the target (model, temperature, etc.).
+                Only used on the first call for each target.
+                Raises error if conflicting metadata is provided for same target.
+        """
        try:
            index_ = int(cast(Any, index))
        except Exception:
            raise ValueError(f"Index must be an integer, got {index}")

+        # Get target context (if inside with_target)
+        ctx = _target_context.get()
+
+        # Use context target if not explicitly provided
+        effective_target = target if target is not None else (ctx.target_id if ctx else None)
+
+        # Register target if provided (explicit or from context)
+        target_id: Optional[str] = None
+        if effective_target is not None:
+            target_id = self._register_target(effective_target, metadata)
+
+        # Use trace_id from context if available
+        trace_id = (
+            ctx.trace_id
+            if ctx
+            else format(trace.get_current_span().get_span_context().trace_id, "x")
+        )
+
        eval = EvaluationResult(
-            trace_id=
-                trace.get_current_span().get_span_context().trace_id,
-                "x",
-            ),
+            trace_id=trace_id,
            name=metric,
            evaluator=metric,
            status=status if status else "error" if error else "processed",
@@ -443,6 +831,7 @@ class Evaluation:
                if error
                else None
            ),
+            target_id=target_id,
        )

        with self.lock:
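Taken together, the new loop/target/log_response/log pieces are meant to compare several models on the same dataset rows, with one independent trace and one dataset entry per target. A condensed sketch assembled from the docstring examples in this diff (the dataframe, the call_gpt4/call_claude callables, and the hard-coded scores are placeholders):

```python
import langwatch
import pandas as pd

df = pd.DataFrame({"question": ["What is LangWatch?"]})
call_gpt4 = call_claude = lambda q: f"answer to {q}"  # stand-ins for real model calls

evaluation = langwatch.evaluation.init("multi-target-comparison")

for index, row in evaluation.loop(df.iterrows()):
    def task(index, row):
        # Each target() opens its own root trace and records its own duration.
        with evaluation.target("gpt-4", {"model": "openai/gpt-4"}):
            response = call_gpt4(row["question"])
            evaluation.log_response(response)                   # stored as "predicted"
            evaluation.log("quality", index=index, score=0.95)  # target inferred from context

        with evaluation.target("claude", {"model": "anthropic/claude"}):
            response = call_claude(row["question"])
            evaluation.log_response(response)
            evaluation.log("quality", index=index, score=0.85)

    evaluation.submit(task, index, row)
```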
langwatch/evaluation/platform_run.py
ADDED
@@ -0,0 +1,462 @@
+"""
+Runner for platform-configured evaluations (Evaluations V3).
+
+This module provides the `run()` function to execute evaluations that are
+configured in the LangWatch platform from CI/CD pipelines or scripts.
+"""
+
+from dataclasses import dataclass, field
+from typing import Callable, List, Literal, Optional
+from urllib.parse import urlparse, urlunparse
+import sys
+import time
+import httpx
+
+import langwatch
+from langwatch.state import get_api_key, get_endpoint
+
+
+def _replace_url_domain(url: str, new_base: str) -> str:
+    """Replace the domain/scheme of a URL with a new base URL, preserving the path."""
+    if not url:
+        return url
+
+    parsed_url = urlparse(url)
+    parsed_new_base = urlparse(new_base)
+
+    # Replace scheme and netloc with new base, keep path/query/fragment
+    return urlunparse((
+        parsed_new_base.scheme,
+        parsed_new_base.netloc,
+        parsed_url.path,
+        parsed_url.params,
+        parsed_url.query,
+        parsed_url.fragment,
+    ))
+
+
+class EvaluationNotFoundError(Exception):
+    """Raised when evaluation slug doesn't exist."""
+
+    def __init__(self, slug: str):
+        self.slug = slug
+        super().__init__(f"Evaluation not found: {slug}")
+
+
+class EvaluationTimeoutError(Exception):
+    """Raised when evaluation run times out."""
+
+    def __init__(self, run_id: str, progress: int, total: int):
+        self.run_id = run_id
+        self.progress = progress
+        self.total = total
+        super().__init__(
+            f"Evaluation run timed out: {run_id} ({progress}/{total} completed)"
+        )
+
+
+class EvaluationRunFailedError(Exception):
+    """Raised when evaluation run fails."""
+
+    def __init__(self, run_id: str, error: str):
+        self.run_id = run_id
+        self.error_message = error
+        super().__init__(f"Evaluation run failed: {error}")
+
+
+class EvaluationsApiError(Exception):
+    """Raised for other API errors."""
+
+    def __init__(self, message: str, status_code: int):
+        self.status_code = status_code
+        super().__init__(message)
+
+
+@dataclass
+class TargetStats:
+    """Statistics for a single target."""
+
+    target_id: str
+    name: str
+    passed: int
+    failed: int
+    avg_latency: float
+    total_cost: float
+
+
+@dataclass
+class EvaluatorStats:
+    """Statistics for a single evaluator."""
+
+    evaluator_id: str
+    name: str
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_score: Optional[float] = None
+
+
+@dataclass
+class EvaluationRunSummary:
+    """Summary of a completed evaluation run."""
+
+    run_id: str
+    total_cells: int
+    completed_cells: int
+    failed_cells: int
+    duration: int
+    run_url: str = ""
+    targets: List[TargetStats] = field(default_factory=list)
+    evaluators: List[EvaluatorStats] = field(default_factory=list)
+    total_passed: int = 0
+    total_failed: int = 0
+    pass_rate: float = 0.0
+    total_cost: float = 0.0
+
+
+@dataclass
+class EvaluationRunResult:
+    """Result of running a platform evaluation."""
+
+    run_id: str
+    status: Literal["completed", "failed", "stopped"]
+    passed: int
+    failed: int
+    pass_rate: float
+    duration: int
+    run_url: str
+    summary: EvaluationRunSummary
+
+    def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
+        """
+        Print a CI-friendly summary and optionally exit with code 1 on failure.
+
+        Args:
+            exit_on_failure: If True, calls sys.exit(1) when there are failures.
+                If False, never exits.
+                If None (default), auto-detects: exits in scripts/CI, doesn't exit in notebooks.
+        """
+        _print_summary(self)
+
+        # Auto-detect: don't exit in notebooks, exit in scripts/CI
+        should_exit = exit_on_failure if exit_on_failure is not None else not _is_notebook()
+
+        if should_exit and self.failed > 0:
+            sys.exit(1)
+
+
+def _is_notebook() -> bool:
+    """Detect if running in a Jupyter notebook."""
+    try:
+        from IPython import get_ipython  # type: ignore
+
+        shell = get_ipython().__class__.__name__
+        if shell == "ZMQInteractiveShell":
+            return True  # Jupyter notebook or qtconsole
+        elif shell == "TerminalInteractiveShell":
+            return False  # Terminal running IPython
+        else:
+            return False
+    except (ImportError, AttributeError, NameError):
+        return False
+
+
+def evaluate(
+    slug: str,
+    *,
+    poll_interval: float = 2.0,
+    timeout: float = 600.0,
+    on_progress: Optional[Callable[[int, int], None]] = None,
+    api_key: Optional[str] = None,
+) -> EvaluationRunResult:
+    """
+    Run a platform-configured evaluation and wait for completion.
+
+    This runs an Evaluation that you have configured in the LangWatch platform.
+    The evaluation will execute all targets and evaluators defined in the configuration.
+
+    Args:
+        slug: The slug of the evaluation to run (found in the evaluation URL)
+        poll_interval: Seconds between status checks (default: 2.0)
+        timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
+        on_progress: Optional callback for progress updates (completed, total)
+        api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)
+
+    Returns:
+        EvaluationRunResult with pass rate and summary. Call result.print_summary()
+        to display results and exit with code 1 on failure.
+
+    Raises:
+        EvaluationNotFoundError: If the evaluation slug doesn't exist
+        EvaluationTimeoutError: If the evaluation doesn't complete within timeout
+        EvaluationRunFailedError: If the evaluation fails
+        EvaluationsApiError: For other API errors
+
+    Example:
+        ```python
+        import langwatch
+
+        result = langwatch.evaluation.evaluate("my-evaluation-slug")
+        result.print_summary()
+        ```
+    """
+    langwatch.ensure_setup()
+
+    effective_api_key = api_key or get_api_key()
+    endpoint = get_endpoint()
+
+    if not effective_api_key:
+        raise ValueError(
+            "API key not set. Set LANGWATCH_API_KEY environment variable or pass api_key parameter."
+        )
+
+    # Start the run
+    start_response = _start_run(slug, endpoint, effective_api_key)
+    run_id = start_response["runId"]
+    total = start_response.get("total", 0)
+
+    # Use the run URL from API but replace domain with configured endpoint
+    api_run_url = start_response.get("runUrl", "")
+    run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""
+
+    print(f"Started evaluation run: {run_id}")
+    if run_url:
+        print(f"Follow live: {run_url}")
+
+    # Track last progress for change detection
+    last_progress = 0
+
+    # Print initial progress
+    if total > 0:
+        print(f"Progress: 0/{total} (0%)", end="", flush=True)
+    if on_progress:
+        on_progress(0, total)
+
+    # Poll until complete
+    start_time = time.time()
+    while True:
+        if time.time() - start_time > timeout:
+            print()  # Newline after progress
+            status = _get_run_status(run_id, endpoint, effective_api_key)
+            raise EvaluationTimeoutError(
+                run_id, status.get("progress", 0), status.get("total", 0)
+            )
+
+        time.sleep(poll_interval)
+
+        status = _get_run_status(run_id, endpoint, effective_api_key)
+        progress = status.get("progress", 0)
+        total = status.get("total", total)
+
+        # Update progress display if changed
+        if progress != last_progress and total > 0:
+            percentage = (progress / total) * 100
+            # Use carriage return to overwrite the line
+            print(f"\rProgress: {progress}/{total} ({percentage:.0f}%)", end="", flush=True)
+            last_progress = progress
+
+            if on_progress:
+                on_progress(progress, total)
+
+        run_status = status.get("status")
+
+        if run_status == "completed":
+            print()  # Newline after progress
+            summary_data = status.get("summary", {})
+            return _build_result(run_id, "completed", summary_data, run_url)
+
+        if run_status == "failed":
+            print()  # Newline after progress
+            raise EvaluationRunFailedError(
+                run_id, status.get("error", "Unknown error")
+            )
+
+        if run_status == "stopped":
+            print()  # Newline after progress
+            summary_data = status.get("summary", {})
+            return _build_result(run_id, "stopped", summary_data, run_url)
+
+
+def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
+    """Start an evaluation run."""
+    with httpx.Client(timeout=60) as client:
+        response = client.post(
+            f"{endpoint}/api/evaluations/v3/{slug}/run",
+            headers={"X-Auth-Token": api_key},
+        )
+
+        if response.status_code == 404:
+            raise EvaluationNotFoundError(slug)
+        if response.status_code == 401:
+            raise EvaluationsApiError("Unauthorized - check your API key", 401)
+        if not response.is_success:
+            error_body = response.json() if response.content else {}
+            raise EvaluationsApiError(
+                error_body.get("error", f"Failed to start evaluation: {response.status_code}"),
+                response.status_code,
+            )
+
+        return response.json()
+
+
+def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
+    """Get the status of a run."""
+    with httpx.Client(timeout=60) as client:
+        response = client.get(
+            f"{endpoint}/api/evaluations/v3/runs/{run_id}",
+            headers={"X-Auth-Token": api_key},
+        )
+
+        if response.status_code == 404:
+            raise EvaluationsApiError(f"Run not found: {run_id}", 404)
+        if response.status_code == 401:
+            raise EvaluationsApiError("Unauthorized - check your API key", 401)
+        if not response.is_success:
+            error_body = response.json() if response.content else {}
+            raise EvaluationsApiError(
+                error_body.get("error", f"Failed to get run status: {response.status_code}"),
+                response.status_code,
+            )
+
+        return response.json()
+
+
+def _build_result(
+    run_id: str,
+    status: Literal["completed", "failed", "stopped"],
+    summary_data: dict,
+    run_url: str,
+) -> EvaluationRunResult:
+    """Build the result object from API response."""
+    total_cells = summary_data.get("totalCells", 0)
+    completed_cells = summary_data.get("completedCells", 0)
+    failed_cells = summary_data.get("failedCells", 0)
+    duration = summary_data.get("duration", 0)
+
+    total_passed = summary_data.get("totalPassed", completed_cells - failed_cells)
+    total_failed = summary_data.get("totalFailed", failed_cells)
+    pass_rate = summary_data.get(
+        "passRate",
+        (total_passed / completed_cells * 100) if completed_cells > 0 else 0.0,
+    )
+
+    # Parse targets
+    targets: List[TargetStats] = []
+    for t in summary_data.get("targets", []):
+        targets.append(
+            TargetStats(
+                target_id=t.get("targetId", ""),
+                name=t.get("name", ""),
+                passed=t.get("passed", 0),
+                failed=t.get("failed", 0),
+                avg_latency=t.get("avgLatency", 0),
+                total_cost=t.get("totalCost", 0),
+            )
+        )
+
+    # Parse evaluators
+    evaluators: List[EvaluatorStats] = []
+    for e in summary_data.get("evaluators", []):
+        evaluators.append(
+            EvaluatorStats(
+                evaluator_id=e.get("evaluatorId", ""),
+                name=e.get("name", ""),
+                passed=e.get("passed", 0),
+                failed=e.get("failed", 0),
+                pass_rate=e.get("passRate", 0),
+                avg_score=e.get("avgScore"),
+            )
+        )
+
+    summary = EvaluationRunSummary(
+        run_id=run_id,
+        total_cells=total_cells,
+        completed_cells=completed_cells,
+        failed_cells=failed_cells,
+        duration=duration,
+        run_url=run_url,  # Always use the endpoint-based URL we constructed
+        targets=targets,
+        evaluators=evaluators,
+        total_passed=total_passed,
+        total_failed=total_failed,
+        pass_rate=pass_rate,
+        total_cost=summary_data.get("totalCost", 0),
+    )
+
+    return EvaluationRunResult(
+        run_id=run_id,
+        status=status,
+        passed=total_passed,
+        failed=total_failed,
+        pass_rate=pass_rate,
+        duration=duration,
+        run_url=summary.run_url,
+        summary=summary,
+    )
+
+
+def _print_summary(result: EvaluationRunResult) -> None:
+    """Print a CI-friendly summary of the evaluation results."""
+    summary = result.summary
+
+    print("\n" + "═" * 60)
+    print(" EVALUATION RESULTS")
+    print("═" * 60)
+    print(f" Run ID: {result.run_id}")
+    print(f" Status: {result.status.upper()}")
+    print(f" Duration: {result.duration / 1000:.1f}s")
+    print("─" * 60)
+    print(f" Passed: {result.passed}")
+    print(f" Failed: {result.failed}")
+    print(f" Pass Rate: {result.pass_rate:.1f}%")
+
+    if summary.targets:
+        print("─" * 60)
+        print(" TARGETS:")
+        for target in summary.targets:
+            print(f" {target.name}: {target.passed} passed, {target.failed} failed")
+            if target.avg_latency:
+                print(f" Avg latency: {target.avg_latency:.0f}ms")
+            if target.total_cost:
+                print(f" Total cost: ${target.total_cost:.4f}")
+
+    if summary.evaluators:
+        print("─" * 60)
+        print(" EVALUATORS:")
+        for evaluator in summary.evaluators:
+            print(f" {evaluator.name}: {evaluator.pass_rate:.1f}% pass rate")
+            if evaluator.avg_score is not None:
+                print(f" Avg score: {evaluator.avg_score:.2f}")
+
+    print("─" * 60)
+    print(f" View details: {result.run_url}")
+    print("═" * 60 + "\n")
+
+
+def run(
+    slug: str,
+    *,
+    poll_interval: float = 2.0,
+    timeout: float = 600.0,
+    on_progress: Optional[Callable[[int, int], None]] = None,
+    api_key: Optional[str] = None,
+) -> EvaluationRunResult:
+    """
+    Deprecated: Use `evaluate()` instead.
+
+    Run a platform-configured evaluation and wait for completion.
+    """
+    import warnings
+
+    warnings.warn(
+        "langwatch.evaluation.run() is deprecated, use langwatch.evaluation.evaluate() instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return evaluate(
+        slug,
+        poll_interval=poll_interval,
+        timeout=timeout,
+        on_progress=on_progress,
+        api_key=api_key,
+    )
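For CI pipelines, the exceptions and the on_progress hook defined above allow a thin wrapper script. A hedged sketch (the "nightly-regression" slug and the chosen timeout are placeholders; the API key is read from LANGWATCH_API_KEY):

```python
import sys

from langwatch.evaluation import (
    evaluate,
    EvaluationNotFoundError,
    EvaluationRunFailedError,
    EvaluationTimeoutError,
)


def show_progress(completed: int, total: int) -> None:
    # Called by evaluate() with (completed, total) as the run progresses.
    print(f"{completed}/{total} cells done")


try:
    result = evaluate("nightly-regression", timeout=900.0, on_progress=show_progress)
except (EvaluationNotFoundError, EvaluationTimeoutError, EvaluationRunFailedError) as err:
    print(f"Evaluation did not complete: {err}")
    sys.exit(1)

result.print_summary(exit_on_failure=True)  # exits with code 1 if any cells failed
```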
{langwatch-0.8.1.dist-info → langwatch-0.9.0.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
langwatch/__init__.py,sha256=GMq4SV2Tz2i0JD05shqnw2lBW5cgMx4Zzo141hp106k,4266
-langwatch/__version__.py,sha256=
+langwatch/__version__.py,sha256=sympc_lD0EH1ffjgsP80P8i4Sqm2XBcIgblEeQTq6bs,91
langwatch/attributes.py,sha256=nXdI_G85wQQCAdAcwjCiLYdEYj3wATmfgCmhlf6dVIk,3910
langwatch/batch_evaluation.py,sha256=Y_S3teXpHV07U-vvJYyV1PB6d0CgyFM_rTzPp6GnEBo,16165
langwatch/client.py,sha256=WTNcYSik7kZ2kH-qGDnhbMTosc8e_Xhab_lZlfh5TC8,25559
@@ -15,9 +15,10 @@ langwatch/tracer.py,sha256=t5FOdP1es9H_pPGqGUBLXCyEln0tTi4m4M9b6WxCrPU,975
langwatch/types.py,sha256=h6r3tNTzWqENx-9j_JPmOMZfFoKq9SNpEtxpAACk2G0,3114
langwatch/dataset/__init__.py,sha256=hZBcbjXuBO2qE5osJtd9wIE9f45F6-jpNTrne5nk4eE,2606
langwatch/domain/__init__.py,sha256=gSCOV3WkRhp_--9D1vxw7BYpnMRbpGh-2NbsXd4KZC0,6074
-langwatch/dspy/__init__.py,sha256=
-langwatch/evaluation/__init__.py,sha256=
-langwatch/evaluation/evaluation.py,sha256=
+langwatch/dspy/__init__.py,sha256=wp8AmobV8XGVWOI8MQFmXPHu-8Wq3wvjB6YiHQm9Fdg,33007
+langwatch/evaluation/__init__.py,sha256=dctG-Ec0N_Or2Ta0XW6liYtdpMZN3ZtRXqUoeG5ksnk,870
+langwatch/evaluation/evaluation.py,sha256=MqMiGlsPIS5zqN1wKfhEs6mIGLRwB452iqDTSQFbtYo,31735
+langwatch/evaluation/platform_run.py,sha256=cwuRNtG99nhvqGL-YoOwdvEH3x-hDaVUzl7Vx9orjPo,14546
langwatch/exporters/filterable_batch_span_exporter.py,sha256=MlhZjui56XD6p2sa8kEGyr-Hb3wqudknngmemnB4Twg,2142
langwatch/generated/langwatch_rest_api_client/__init__.py,sha256=8r-9pAj7fK7vnVX3mT0y_zS4B9ZRqD6RZiBo5fPra60,156
langwatch/generated/langwatch_rest_api_client/client.py,sha256=o_mdLqyBCQstu5tS1WZFwqIEbGwkvWQ7eQjuCJw_5VY,12419
@@ -415,6 +416,6 @@ langwatch/utils/initialization.py,sha256=1KoZmkHOvGEVF0j-4t4xRQdA_2C_SPiF7qFXqEG
langwatch/utils/module.py,sha256=KLBNOK3mA9gCSifCcQX_lOtU48BJQDWvFKtF6NMvwVA,688
langwatch/utils/transformation.py,sha256=76MGXyrYTxM0Yri36NJqLK-XxL4BBYdmKWAXXlw3D4Q,7690
langwatch/utils/utils.py,sha256=ZCOSie4o9LdJ7odshNfCNjmgwgQ27ojc5ENqt1rXuSs,596
-langwatch-0.
-langwatch-0.
-langwatch-0.
+langwatch-0.9.0.dist-info/METADATA,sha256=JtLLtVbyy0iau3ySelLpMO4RpjrQAEyhd72J9NkxHl8,13192
+langwatch-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+langwatch-0.9.0.dist-info/RECORD,,
{langwatch-0.8.1.dist-info → langwatch-0.9.0.dist-info}/WHEEL
File without changes