langwatch 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__version__.py +1 -1
- langwatch/batch_evaluation.py +5 -4
- langwatch/dspy/__init__.py +7 -34
- langwatch/evaluation/__init__.py +28 -1
- langwatch/evaluation/evaluation.py +412 -22
- langwatch/evaluation/platform_run.py +462 -0
- langwatch/evaluations.py +3 -2
- langwatch/login.py +2 -1
- langwatch/telemetry/tracing.py +3 -2
- langwatch/utils/exceptions.py +22 -1
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/RECORD +13 -12
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/WHEEL +0 -0
langwatch/__version__.py
CHANGED
langwatch/batch_evaluation.py
CHANGED
|
@@ -24,6 +24,7 @@ from tqdm import tqdm
|
|
|
24
24
|
import pandas as pd
|
|
25
25
|
|
|
26
26
|
from langwatch.types import Money
|
|
27
|
+
from langwatch.utils.exceptions import better_raise_for_status
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
class EvaluationResult(BaseModel):
|
|
@@ -150,7 +151,7 @@ class BatchEvaluation:
|
|
|
150
151
|
raise ValueError(
|
|
151
152
|
"API key is not valid, please try to login again with langwatch.login()"
|
|
152
153
|
)
|
|
153
|
-
response.raise_for_status()
|
|
154
|
+
better_raise_for_status(response)
|
|
154
155
|
experiment_path = response.json()["path"]
|
|
155
156
|
self.experiment_slug = response.json()["slug"]
|
|
156
157
|
|
|
@@ -368,7 +369,7 @@ class BatchEvaluation:
|
|
|
368
369
|
json=body,
|
|
369
370
|
timeout=60,
|
|
370
371
|
)
|
|
371
|
-
response.raise_for_status()
|
|
372
|
+
better_raise_for_status(response)
|
|
372
373
|
|
|
373
374
|
def wait_for_completion(self):
|
|
374
375
|
async def wait_for_completion(self):
|
|
@@ -414,7 +415,7 @@ async def run_evaluation(
|
|
|
414
415
|
|
|
415
416
|
async with httpx.AsyncClient(timeout=900) as client:
|
|
416
417
|
response = await client.post(**request_params)
|
|
417
|
-
response.raise_for_status()
|
|
418
|
+
better_raise_for_status(response)
|
|
418
419
|
|
|
419
420
|
result = response.json()
|
|
420
421
|
|
|
@@ -462,7 +463,7 @@ def get_dataset(
|
|
|
462
463
|
|
|
463
464
|
with httpx.Client(timeout=300) as client:
|
|
464
465
|
response = client.get(**request_params)
|
|
465
|
-
response.raise_for_status()
|
|
466
|
+
better_raise_for_status(response)
|
|
466
467
|
|
|
467
468
|
result = response.json()
|
|
468
469
|
|
langwatch/dspy/__init__.py
CHANGED
|
@@ -5,6 +5,7 @@ import time
|
|
|
5
5
|
import warnings
|
|
6
6
|
import dspy
|
|
7
7
|
from typing import Callable, List, Optional, Any, Type, Union
|
|
8
|
+
from langwatch.utils.exceptions import better_raise_for_status
|
|
8
9
|
from langwatch.utils.transformation import truncate_object_recursively
|
|
9
10
|
from langwatch.telemetry.tracing import LangWatchTrace
|
|
10
11
|
from typing_extensions import TypedDict
|
|
@@ -193,7 +194,7 @@ class LangWatchDSPy:
|
|
|
193
194
|
raise ValueError(
|
|
194
195
|
"API key is not valid, please try to login again with langwatch.login()"
|
|
195
196
|
)
|
|
196
|
-
response.raise_for_status()
|
|
197
|
+
better_raise_for_status(response)
|
|
197
198
|
|
|
198
199
|
if optimizer and evaluator:
|
|
199
200
|
raise ValueError("You can only provide an optimizer or an evaluator, not both.")
|
|
@@ -386,7 +387,7 @@ class LangWatchDSPy:
|
|
|
386
387
|
data=json.dumps(data), # type: ignore
|
|
387
388
|
timeout=60,
|
|
388
389
|
)
|
|
389
|
-
response.raise_for_status()
|
|
390
|
+
better_raise_for_status(response)
|
|
390
391
|
self.steps_buffer = []
|
|
391
392
|
|
|
392
393
|
def tracer(self, trace: LangWatchTrace):
|
|
@@ -736,10 +737,6 @@ class DSPyTracer:
|
|
|
736
737
|
dspy.Module.__original_call__ = dspy.Module.__call__ # type: ignore
|
|
737
738
|
dspy.Module.__call__ = self.patched_module_call()
|
|
738
739
|
|
|
739
|
-
if not hasattr(dspy.Predict, "__original_forward__"):
|
|
740
|
-
dspy.Predict.__original_forward__ = dspy.Predict.forward # type: ignore
|
|
741
|
-
dspy.Predict.forward = self.patched_predict_forward()
|
|
742
|
-
|
|
743
740
|
language_model_classes = dspy.LM.__subclasses__()
|
|
744
741
|
for lm in language_model_classes:
|
|
745
742
|
if not hasattr(lm, "__original_basic_request__") and hasattr(
|
|
@@ -775,7 +772,7 @@ class DSPyTracer:
|
|
|
775
772
|
def patched_module_call(self):
|
|
776
773
|
self_ = self
|
|
777
774
|
|
|
778
|
-
@langwatch.span(ignore_missing_trace_warning=True, type="module")
|
|
775
|
+
@langwatch.span(ignore_missing_trace_warning=True, type="module", capture_output=False)
|
|
779
776
|
def __call__(self: dspy.Module, *args, **kwargs):
|
|
780
777
|
span = self_.safe_get_current_span()
|
|
781
778
|
signature = (
|
|
@@ -800,34 +797,10 @@ class DSPyTracer:
|
|
|
800
797
|
|
|
801
798
|
return __call__
|
|
802
799
|
|
|
803
|
-
def patched_predict_forward(self):
|
|
804
|
-
self_ = self
|
|
805
|
-
|
|
806
|
-
@langwatch.span(ignore_missing_trace_warning=True, type="module")
|
|
807
|
-
def forward(self: dspy.Predict, **kwargs):
|
|
808
|
-
span = self_.safe_get_current_span()
|
|
809
|
-
signature = kwargs.get("signature", self.signature)
|
|
810
|
-
|
|
811
|
-
if span and signature and hasattr(signature, "__name__"):
|
|
812
|
-
span.update(name=f"{self.__class__.__name__}({signature.__name__})")
|
|
813
|
-
elif span:
|
|
814
|
-
span.update(name=f"{self.__class__.__name__}.forward")
|
|
815
|
-
|
|
816
|
-
prediction = self.__class__.__original_forward__(self, **kwargs) # type: ignore
|
|
817
|
-
|
|
818
|
-
if span and isinstance(prediction, dspy.Prediction):
|
|
819
|
-
span.update(output=prediction._store) # type: ignore
|
|
820
|
-
elif span:
|
|
821
|
-
span.update(output=prediction) # type: ignore
|
|
822
|
-
|
|
823
|
-
return prediction
|
|
824
|
-
|
|
825
|
-
return forward
|
|
826
|
-
|
|
827
800
|
def patched_language_model_call(self):
|
|
828
801
|
self_ = self
|
|
829
802
|
|
|
830
|
-
@langwatch.span(ignore_missing_trace_warning=True, type="llm")
|
|
803
|
+
@langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
|
|
831
804
|
def call(self: dspy.LM, prompt=None, messages=None, **kwargs):
|
|
832
805
|
all_kwargs = self.kwargs | kwargs
|
|
833
806
|
model = self.model
|
|
@@ -894,7 +867,7 @@ class DSPyTracer:
|
|
|
894
867
|
def patched_legacy_language_model_request(self):
|
|
895
868
|
self_ = self
|
|
896
869
|
|
|
897
|
-
@langwatch.span(ignore_missing_trace_warning=True, type="llm")
|
|
870
|
+
@langwatch.span(ignore_missing_trace_warning=True, type="llm", capture_output=False)
|
|
898
871
|
def basic_request(self: dspy.LM, prompt, **kwargs):
|
|
899
872
|
all_kwargs = self.kwargs | kwargs
|
|
900
873
|
model = all_kwargs.get("model", None)
|
|
@@ -946,7 +919,7 @@ class DSPyTracer:
|
|
|
946
919
|
) is not getattr(dspy.Retrieve, "forward", None):
|
|
947
920
|
return self.__class__.__original_forward__(self, *args, **kwargs) # type: ignore
|
|
948
921
|
|
|
949
|
-
@langwatch.span(ignore_missing_trace_warning=True, type="rag")
|
|
922
|
+
@langwatch.span(ignore_missing_trace_warning=True, type="rag", capture_output=False)
|
|
950
923
|
def forward(self, *args, **kwargs):
|
|
951
924
|
result = self.__class__.__original_forward__(self, *args, **kwargs) # type: ignore
|
|
952
925
|
|
langwatch/evaluation/__init__.py
CHANGED
|
@@ -1,9 +1,36 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
from langwatch.evaluation.evaluation import Evaluation
|
|
3
|
-
from .evaluation import
|
|
3
|
+
from langwatch.evaluation.platform_run import (
|
|
4
|
+
evaluate,
|
|
5
|
+
run, # Deprecated, kept for backwards compatibility
|
|
6
|
+
EvaluationRunResult,
|
|
7
|
+
EvaluationRunSummary,
|
|
8
|
+
EvaluationNotFoundError,
|
|
9
|
+
EvaluationTimeoutError,
|
|
10
|
+
EvaluationRunFailedError,
|
|
11
|
+
EvaluationsApiError,
|
|
12
|
+
TargetStats,
|
|
13
|
+
EvaluatorStats,
|
|
14
|
+
)
|
|
4
15
|
|
|
5
16
|
|
|
6
17
|
def init(name: str, *, run_id: Optional[str] = None) -> Evaluation:
|
|
7
18
|
evaluation = Evaluation(name, run_id=run_id)
|
|
8
19
|
evaluation.init()
|
|
9
20
|
return evaluation
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"init",
|
|
25
|
+
"evaluate",
|
|
26
|
+
"run", # Deprecated
|
|
27
|
+
"Evaluation",
|
|
28
|
+
"EvaluationRunResult",
|
|
29
|
+
"EvaluationRunSummary",
|
|
30
|
+
"EvaluationNotFoundError",
|
|
31
|
+
"EvaluationTimeoutError",
|
|
32
|
+
"EvaluationRunFailedError",
|
|
33
|
+
"EvaluationsApiError",
|
|
34
|
+
"TargetStats",
|
|
35
|
+
"EvaluatorStats",
|
|
36
|
+
]
|