langwatch 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/client.py +16 -0
- langwatch/domain/__init__.py +3 -0
- langwatch/dspy/__init__.py +67 -34
- langwatch/evaluation/__init__.py +518 -17
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/{evaluation/evaluation.py → experiment/experiment.py} +44 -5
- langwatch/{evaluation → experiment}/platform_run.py +40 -67
- langwatch/litellm.py +7 -0
- langwatch/openai.py +61 -34
- langwatch/prompts/local_loader.py +12 -0
- langwatch/prompts/prompt_facade.py +10 -3
- langwatch/types.py +5 -0
- langwatch/utils/initialization.py +12 -2
- langwatch/utils/utils.py +3 -1
- {langwatch-0.9.0.dist-info → langwatch-0.10.1.dist-info}/METADATA +1 -1
- {langwatch-0.9.0.dist-info → langwatch-0.10.1.dist-info}/RECORD +20 -19
- {langwatch-0.9.0.dist-info → langwatch-0.10.1.dist-info}/WHEEL +0 -0
langwatch/__init__.py
CHANGED
|
@@ -17,7 +17,8 @@ from typing import TYPE_CHECKING
|
|
|
17
17
|
# Type hints for IntelliSense (only imported for typing)
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
19
|
import langwatch.evaluations as evaluations
|
|
20
|
-
import langwatch.evaluation as evaluation
|
|
20
|
+
import langwatch.experiment as experiment
|
|
21
|
+
import langwatch.evaluation as evaluation # Deprecated, use experiment
|
|
21
22
|
import langwatch.dataset as dataset
|
|
22
23
|
import langwatch.dspy as dspy
|
|
23
24
|
import langwatch.langchain as langchain
|
|
@@ -41,7 +42,8 @@ def _api_key():
|
|
|
41
42
|
# Lazy loading configuration
|
|
42
43
|
_LAZY_MODULES = {
|
|
43
44
|
"evaluations": "langwatch.evaluations",
|
|
44
|
-
"evaluation": "langwatch.evaluation",
|
|
45
|
+
"experiment": "langwatch.experiment",
|
|
46
|
+
"evaluation": "langwatch.evaluation", # Deprecated, use experiment
|
|
45
47
|
"dataset": "langwatch.dataset",
|
|
46
48
|
"dspy": "langwatch.dspy", # Special handling
|
|
47
49
|
"langchain": "langwatch.langchain", # Special handling
|
|
@@ -150,7 +152,8 @@ __all__ = [
|
|
|
150
152
|
"ensure_setup",
|
|
151
153
|
"get_current_trace",
|
|
152
154
|
"get_current_span",
|
|
153
|
-
"evaluation",
|
|
155
|
+
"experiment",
|
|
156
|
+
"evaluation", # Deprecated, use experiment
|
|
154
157
|
"dataset",
|
|
155
158
|
"evaluations",
|
|
156
159
|
"langchain",
|
langwatch/__version__.py
CHANGED
langwatch/client.py
CHANGED
|
@@ -50,6 +50,7 @@ class Client(LangWatchClientProtocol):
|
|
|
50
50
|
_registered_instrumentors: ClassVar[
|
|
51
51
|
dict[opentelemetry.trace.TracerProvider, set[BaseInstrumentor]]
|
|
52
52
|
] = {}
|
|
53
|
+
_prompts_path: ClassVar[Optional[str]] = None
|
|
53
54
|
|
|
54
55
|
# Regular attributes for protocol compatibility
|
|
55
56
|
base_attributes: BaseAttributes
|
|
@@ -69,6 +70,7 @@ class Client(LangWatchClientProtocol):
|
|
|
69
70
|
span_exclude_rules: Optional[List[SpanProcessingExcludeRule]] = None,
|
|
70
71
|
ignore_global_tracer_provider_override_warning: Optional[bool] = None,
|
|
71
72
|
skip_open_telemetry_setup: Optional[bool] = None,
|
|
73
|
+
prompts_path: Optional[str] = None,
|
|
72
74
|
) -> "Client":
|
|
73
75
|
"""Ensure only one instance of Client exists (singleton pattern)."""
|
|
74
76
|
if cls._instance is None:
|
|
@@ -88,6 +90,7 @@ class Client(LangWatchClientProtocol):
|
|
|
88
90
|
span_exclude_rules: Optional[List[SpanProcessingExcludeRule]] = None,
|
|
89
91
|
ignore_global_tracer_provider_override_warning: Optional[bool] = None,
|
|
90
92
|
skip_open_telemetry_setup: Optional[bool] = None,
|
|
93
|
+
prompts_path: Optional[str] = None,
|
|
91
94
|
):
|
|
92
95
|
"""
|
|
93
96
|
Initialize the LangWatch tracing client.
|
|
@@ -140,6 +143,8 @@ class Client(LangWatchClientProtocol):
|
|
|
140
143
|
)
|
|
141
144
|
if skip_open_telemetry_setup is not None:
|
|
142
145
|
Client._skip_open_telemetry_setup = skip_open_telemetry_setup
|
|
146
|
+
if prompts_path is not None:
|
|
147
|
+
Client._prompts_path = prompts_path
|
|
143
148
|
if base_attributes is not None:
|
|
144
149
|
Client._base_attributes = base_attributes
|
|
145
150
|
# Ensure required SDK attributes remain present after reconfiguration
|
|
@@ -215,6 +220,9 @@ class Client(LangWatchClientProtocol):
|
|
|
215
220
|
if skip_open_telemetry_setup is not None:
|
|
216
221
|
Client._skip_open_telemetry_setup = skip_open_telemetry_setup
|
|
217
222
|
|
|
223
|
+
if prompts_path is not None:
|
|
224
|
+
Client._prompts_path = prompts_path
|
|
225
|
+
|
|
218
226
|
if base_attributes is not None:
|
|
219
227
|
Client._base_attributes = base_attributes
|
|
220
228
|
elif not Client._base_attributes:
|
|
@@ -284,6 +292,7 @@ class Client(LangWatchClientProtocol):
|
|
|
284
292
|
span_exclude_rules: Optional[List[SpanProcessingExcludeRule]] = None,
|
|
285
293
|
ignore_global_tracer_provider_override_warning: Optional[bool] = None,
|
|
286
294
|
skip_open_telemetry_setup: Optional[bool] = None,
|
|
295
|
+
prompts_path: Optional[str] = None,
|
|
287
296
|
) -> "Client":
|
|
288
297
|
"""Create or get the singleton instance of the LangWatch client. Internal use only."""
|
|
289
298
|
if cls._instance is None:
|
|
@@ -299,6 +308,7 @@ class Client(LangWatchClientProtocol):
|
|
|
299
308
|
span_exclude_rules=span_exclude_rules,
|
|
300
309
|
ignore_global_tracer_provider_override_warning=ignore_global_tracer_provider_override_warning,
|
|
301
310
|
skip_open_telemetry_setup=skip_open_telemetry_setup,
|
|
311
|
+
prompts_path=prompts_path,
|
|
302
312
|
)
|
|
303
313
|
return cls._instance
|
|
304
314
|
|
|
@@ -327,6 +337,7 @@ class Client(LangWatchClientProtocol):
|
|
|
327
337
|
cls._skip_open_telemetry_setup = False
|
|
328
338
|
cls._tracer_provider = None
|
|
329
339
|
cls._rest_api_client = None
|
|
340
|
+
cls._prompts_path = None
|
|
330
341
|
cls._registered_instrumentors.clear()
|
|
331
342
|
|
|
332
343
|
@classmethod
|
|
@@ -416,6 +427,11 @@ class Client(LangWatchClientProtocol):
|
|
|
416
427
|
"""Get whether OpenTelemetry setup is skipped."""
|
|
417
428
|
return Client._skip_open_telemetry_setup
|
|
418
429
|
|
|
430
|
+
@property
|
|
431
|
+
def prompts_path(self) -> Optional[str]:
|
|
432
|
+
"""Get the base path for local prompt files."""
|
|
433
|
+
return Client._prompts_path
|
|
434
|
+
|
|
419
435
|
@disable_sending.setter
|
|
420
436
|
def disable_sending(self, value: bool) -> None:
|
|
421
437
|
"""Set whether sending is disabled. Spans are still created; the exporter conditionally drops them."""
|
langwatch/domain/__init__.py
CHANGED
|
@@ -43,6 +43,7 @@ class ChatMessage(TypedDict, total=False):
|
|
|
43
43
|
tool_calls: Optional[List[ToolCall]]
|
|
44
44
|
tool_call_id: Optional[str]
|
|
45
45
|
name: Optional[str]
|
|
46
|
+
reasoning_content: Optional[str]
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
class TypedValueChatMessages(TypedDict):
|
|
@@ -156,6 +157,7 @@ SpanTypes = Literal[
|
|
|
156
157
|
class SpanMetrics(TypedDict, total=False):
|
|
157
158
|
prompt_tokens: Optional[int]
|
|
158
159
|
completion_tokens: Optional[int]
|
|
160
|
+
reasoning_tokens: Optional[int]
|
|
159
161
|
cost: Optional[float]
|
|
160
162
|
first_token_ms: Optional[int]
|
|
161
163
|
|
|
@@ -179,6 +181,7 @@ class SpanParams(TypedDict, total=False):
|
|
|
179
181
|
functions: Optional[List[Dict[str, Any]]]
|
|
180
182
|
user: Optional[str]
|
|
181
183
|
response_format: Optional[Union[Dict[str, Any], BaseModel]]
|
|
184
|
+
reasoning_effort: Optional[str]
|
|
182
185
|
|
|
183
186
|
|
|
184
187
|
class BaseSpan(TypedDict):
|
langwatch/dspy/__init__.py
CHANGED
|
@@ -6,7 +6,8 @@ import warnings
|
|
|
6
6
|
import dspy
|
|
7
7
|
from typing import Callable, List, Optional, Any, Type, Union
|
|
8
8
|
from langwatch.utils.exceptions import better_raise_for_status
|
|
9
|
-
from langwatch.utils.transformation import truncate_object_recursively
|
|
9
|
+
from langwatch.utils.transformation import SerializableWithStringFallback, truncate_object_recursively
|
|
10
|
+
from langwatch.utils.utils import safe_get
|
|
10
11
|
from langwatch.telemetry.tracing import LangWatchTrace
|
|
11
12
|
from typing_extensions import TypedDict
|
|
12
13
|
import langwatch
|
|
@@ -824,6 +825,7 @@ class DSPyTracer:
|
|
|
824
825
|
"functions",
|
|
825
826
|
"user",
|
|
826
827
|
"response_format",
|
|
828
|
+
"reasoning_effort",
|
|
827
829
|
]
|
|
828
830
|
for param in params:
|
|
829
831
|
if all_kwargs.get(param):
|
|
@@ -842,23 +844,44 @@ class DSPyTracer:
|
|
|
842
844
|
|
|
843
845
|
result = self.__class__.__original_call__(self, prompt, messages, **kwargs) # type: ignore
|
|
844
846
|
|
|
847
|
+
history = self.history[-1] if len(self.history) > 0 else None
|
|
848
|
+
|
|
845
849
|
if span:
|
|
846
|
-
|
|
850
|
+
# Capture full message from history (includes reasoning_content) instead of just result
|
|
851
|
+
choices = safe_get(history, "response", "choices")
|
|
852
|
+
|
|
853
|
+
if choices and len(choices) > 0:
|
|
854
|
+
messages_output = []
|
|
855
|
+
for choice in choices:
|
|
856
|
+
msg = safe_get(choice, "message")
|
|
857
|
+
if msg is not None:
|
|
858
|
+
# Convert Pydantic model to dict if needed
|
|
859
|
+
if hasattr(msg, "model_dump"):
|
|
860
|
+
msg = msg.model_dump(exclude_unset=True)
|
|
861
|
+
elif hasattr(msg, "dict"):
|
|
862
|
+
msg = msg.dict(exclude_unset=True)
|
|
863
|
+
messages_output.append(msg)
|
|
864
|
+
if messages_output:
|
|
865
|
+
span.update(output=messages_output)
|
|
866
|
+
else:
|
|
867
|
+
span.update(output=result)
|
|
868
|
+
else:
|
|
869
|
+
span.update(output=result)
|
|
847
870
|
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
"completion_tokens": history["usage"]["completion_tokens"],
|
|
859
|
-
"prompt_tokens": history["usage"]["prompt_tokens"],
|
|
860
|
-
}
|
|
871
|
+
completion_tokens = safe_get(history, "usage", "completion_tokens")
|
|
872
|
+
prompt_tokens = safe_get(history, "usage", "prompt_tokens")
|
|
873
|
+
if span and completion_tokens is not None and prompt_tokens is not None:
|
|
874
|
+
metrics = {
|
|
875
|
+
"completion_tokens": completion_tokens,
|
|
876
|
+
"prompt_tokens": prompt_tokens,
|
|
877
|
+
}
|
|
878
|
+
# Capture reasoning_tokens if available
|
|
879
|
+
reasoning_tokens = safe_get(
|
|
880
|
+
history, "usage", "completion_tokens_details", "reasoning_tokens"
|
|
861
881
|
)
|
|
882
|
+
if reasoning_tokens is not None:
|
|
883
|
+
metrics["reasoning_tokens"] = reasoning_tokens
|
|
884
|
+
span.update(metrics=metrics)
|
|
862
885
|
|
|
863
886
|
return result
|
|
864
887
|
|
|
@@ -884,26 +907,36 @@ class DSPyTracer:
|
|
|
884
907
|
|
|
885
908
|
result = self.__class__.__original_basic_request__(self, prompt, **kwargs) # type: ignore
|
|
886
909
|
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
910
|
+
# Capture full messages from choices (includes reasoning_content)
|
|
911
|
+
choices = safe_get(result, "choices")
|
|
912
|
+
if span and choices and len(choices) > 0:
|
|
913
|
+
messages_output = []
|
|
914
|
+
for choice in choices:
|
|
915
|
+
msg = safe_get(choice, "message")
|
|
916
|
+
if msg is not None:
|
|
917
|
+
# Convert Pydantic model to dict if needed
|
|
918
|
+
if hasattr(msg, "model_dump"):
|
|
919
|
+
msg = msg.model_dump(exclude_unset=True)
|
|
920
|
+
elif hasattr(msg, "dict"):
|
|
921
|
+
msg = msg.dict(exclude_unset=True)
|
|
922
|
+
messages_output.append(msg)
|
|
923
|
+
if messages_output:
|
|
924
|
+
span.update(output=messages_output)
|
|
925
|
+
|
|
926
|
+
completion_tokens = safe_get(result, "usage", "completion_tokens")
|
|
927
|
+
prompt_tokens = safe_get(result, "usage", "prompt_tokens")
|
|
928
|
+
if span and completion_tokens is not None and prompt_tokens is not None:
|
|
929
|
+
metrics = {
|
|
930
|
+
"completion_tokens": completion_tokens,
|
|
931
|
+
"prompt_tokens": prompt_tokens,
|
|
932
|
+
}
|
|
933
|
+
# Capture reasoning_tokens if available
|
|
934
|
+
reasoning_tokens = safe_get(
|
|
935
|
+
result, "usage", "completion_tokens_details", "reasoning_tokens"
|
|
906
936
|
)
|
|
937
|
+
if reasoning_tokens is not None:
|
|
938
|
+
metrics["reasoning_tokens"] = reasoning_tokens
|
|
939
|
+
span.update(metrics=metrics)
|
|
907
940
|
|
|
908
941
|
return result
|
|
909
942
|
|