langwatch 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/evaluation/__init__.py +518 -17
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/{evaluation/evaluation.py → experiment/experiment.py} +44 -5
- langwatch/{evaluation → experiment}/platform_run.py +40 -67
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/METADATA +1 -1
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/RECORD +10 -9
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/WHEEL +0 -0
langwatch/evaluations.py
CHANGED
|
@@ -1,63 +1,54 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
"""
|
|
2
|
+
langwatch.evaluations - DEPRECATED, use langwatch.evaluation instead.
|
|
3
|
+
|
|
4
|
+
This module is kept for backward compatibility. All functionality has moved
|
|
5
|
+
to langwatch.evaluation (singular).
|
|
6
|
+
|
|
7
|
+
Example migration:
|
|
8
|
+
# Old (deprecated)
|
|
9
|
+
from langwatch.evaluations import evaluate
|
|
10
|
+
result = evaluate("presidio/pii_detection", input="test", output="response")
|
|
11
|
+
|
|
12
|
+
# New (recommended)
|
|
13
|
+
import langwatch
|
|
14
|
+
result = langwatch.evaluation.evaluate(
|
|
15
|
+
"presidio/pii_detection",
|
|
16
|
+
data={"input": "test", "output": "response"}
|
|
17
|
+
)
|
|
18
|
+
"""
|
|
19
|
+
import warnings
|
|
20
|
+
from typing import Any, Dict, List, Literal, Optional, Union, TYPE_CHECKING
|
|
3
21
|
from uuid import UUID
|
|
4
|
-
|
|
22
|
+
|
|
5
23
|
from deprecated import deprecated
|
|
6
24
|
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
25
|
+
# Re-export everything from the evaluation module
|
|
26
|
+
from langwatch.evaluation import (
|
|
27
|
+
# Types
|
|
28
|
+
BasicEvaluateData,
|
|
29
|
+
EvaluationResultModel,
|
|
30
|
+
# Internal functions
|
|
31
|
+
_prepare_data as _new_prepare_data,
|
|
32
|
+
_handle_response,
|
|
33
|
+
_handle_exception,
|
|
34
|
+
_add_evaluation as _new_add_evaluation,
|
|
35
|
+
)
|
|
17
36
|
|
|
18
37
|
from langwatch.types import (
|
|
19
38
|
Conversation,
|
|
20
|
-
Evaluation,
|
|
21
|
-
EvaluationResult,
|
|
22
39
|
EvaluationTimestamps,
|
|
23
40
|
Money,
|
|
24
41
|
MoneyDict,
|
|
25
|
-
SpanMetrics,
|
|
26
42
|
RAGChunk,
|
|
27
|
-
TypedValueEvaluationResult,
|
|
28
|
-
TypedValueGuardrailResult,
|
|
29
|
-
TypedValueJson,
|
|
30
|
-
)
|
|
31
|
-
from langwatch.utils.exceptions import capture_exception
|
|
32
|
-
from langwatch.utils.transformation import (
|
|
33
|
-
SerializableWithStringFallback,
|
|
34
43
|
)
|
|
35
44
|
|
|
36
45
|
if TYPE_CHECKING:
|
|
37
46
|
from langwatch.telemetry.tracing import LangWatchTrace
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class BasicEvaluateData(BaseModel):
|
|
41
|
-
input: Optional[str] = None
|
|
42
|
-
output: Optional[str] = None
|
|
43
|
-
expected_output: Optional[str] = None
|
|
44
|
-
contexts: Optional[Union[List[RAGChunk], List[str]]] = None
|
|
45
|
-
expected_contexts: Optional[Union[List[RAGChunk], List[str]]] = None
|
|
46
|
-
conversation: Optional[Conversation] = None
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class EvaluationResultModel(BaseModel):
|
|
50
|
-
status: Literal["processed", "skipped", "error"]
|
|
51
|
-
passed: Optional[bool] = None
|
|
52
|
-
score: Optional[float] = None
|
|
53
|
-
details: Optional[str] = None
|
|
54
|
-
label: Optional[str] = None
|
|
55
|
-
cost: Optional[Money] = None
|
|
56
|
-
error_type: Optional[str] = None
|
|
47
|
+
from langwatch.telemetry.span import LangWatchSpan
|
|
57
48
|
|
|
58
49
|
|
|
59
50
|
@deprecated(
|
|
60
|
-
reason="Please use
|
|
51
|
+
reason="Please use `langwatch.evaluation.evaluate()` instead."
|
|
61
52
|
)
|
|
62
53
|
def evaluate(
|
|
63
54
|
slug: str,
|
|
@@ -74,45 +65,85 @@ def evaluate(
|
|
|
74
65
|
span: Optional["LangWatchSpan"] = None,
|
|
75
66
|
api_key: Optional[str] = None,
|
|
76
67
|
data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
|
|
77
|
-
) -> EvaluationResultModel:
|
|
68
|
+
) -> EvaluationResultModel:
|
|
69
|
+
"""
|
|
70
|
+
Deprecated: Use langwatch.evaluation.evaluate() instead.
|
|
71
|
+
|
|
72
|
+
The new API uses a simpler signature:
|
|
73
|
+
langwatch.evaluation.evaluate(slug, data, name, settings, as_guardrail)
|
|
74
|
+
"""
|
|
75
|
+
import langwatch.evaluation as evaluation
|
|
76
|
+
|
|
78
77
|
if trace:
|
|
79
|
-
warn(
|
|
80
|
-
"The `trace` argument is deprecated and will be removed
|
|
78
|
+
warnings.warn(
|
|
79
|
+
"The `trace` argument is deprecated and will be removed.",
|
|
80
|
+
DeprecationWarning,
|
|
81
81
|
stacklevel=2,
|
|
82
82
|
)
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
output=output,
|
|
92
|
-
expected_output=expected_output,
|
|
93
|
-
contexts=contexts,
|
|
94
|
-
expected_contexts=expected_contexts,
|
|
95
|
-
conversation=conversation,
|
|
96
|
-
settings=settings,
|
|
97
|
-
span=span,
|
|
98
|
-
as_guardrail=as_guardrail,
|
|
99
|
-
api_key=api_key,
|
|
100
|
-
data=data,
|
|
101
|
-
)
|
|
102
|
-
try:
|
|
103
|
-
with httpx.Client(timeout=900) as client:
|
|
104
|
-
response = client.post(**request_params)
|
|
105
|
-
better_raise_for_status(response, cls=EvaluatorException)
|
|
106
|
-
except Exception as e:
|
|
107
|
-
return _handle_exception(e, span, as_guardrail)
|
|
108
|
-
|
|
109
|
-
return _handle_response(response.json(), span, as_guardrail)
|
|
84
|
+
# Build data dict from legacy arguments
|
|
85
|
+
data_dict: Dict[str, Any] = {}
|
|
86
|
+
if data:
|
|
87
|
+
if isinstance(data, BasicEvaluateData):
|
|
88
|
+
data_dict = data.model_dump(exclude_unset=True, exclude_none=True)
|
|
89
|
+
else:
|
|
90
|
+
data_dict = data
|
|
110
91
|
|
|
111
|
-
|
|
92
|
+
# Map legacy positional arguments to data dict
|
|
93
|
+
if input is not None:
|
|
94
|
+
warnings.warn(
|
|
95
|
+
"The `input` argument is deprecated. Use `data={'input': ...}` instead.",
|
|
96
|
+
DeprecationWarning,
|
|
97
|
+
stacklevel=2,
|
|
98
|
+
)
|
|
99
|
+
data_dict["input"] = input
|
|
100
|
+
if output is not None:
|
|
101
|
+
warnings.warn(
|
|
102
|
+
"The `output` argument is deprecated. Use `data={'output': ...}` instead.",
|
|
103
|
+
DeprecationWarning,
|
|
104
|
+
stacklevel=2,
|
|
105
|
+
)
|
|
106
|
+
data_dict["output"] = output
|
|
107
|
+
if expected_output is not None:
|
|
108
|
+
warnings.warn(
|
|
109
|
+
"The `expected_output` argument is deprecated. Use `data={'expected_output': ...}` instead.",
|
|
110
|
+
DeprecationWarning,
|
|
111
|
+
stacklevel=2,
|
|
112
|
+
)
|
|
113
|
+
data_dict["expected_output"] = expected_output
|
|
114
|
+
if contexts is not None:
|
|
115
|
+
warnings.warn(
|
|
116
|
+
"The `contexts` argument is deprecated. Use `data={'contexts': ...}` instead.",
|
|
117
|
+
DeprecationWarning,
|
|
118
|
+
stacklevel=2,
|
|
119
|
+
)
|
|
120
|
+
data_dict["contexts"] = contexts
|
|
121
|
+
if expected_contexts is not None:
|
|
122
|
+
warnings.warn(
|
|
123
|
+
"The `expected_contexts` argument is deprecated. Use `data={'expected_contexts': ...}` instead.",
|
|
124
|
+
DeprecationWarning,
|
|
125
|
+
stacklevel=2,
|
|
126
|
+
)
|
|
127
|
+
data_dict["expected_contexts"] = expected_contexts
|
|
128
|
+
if conversation is not None:
|
|
129
|
+
warnings.warn(
|
|
130
|
+
"The `conversation` argument is deprecated. Use `data={'conversation': ...}` instead.",
|
|
131
|
+
DeprecationWarning,
|
|
132
|
+
stacklevel=2,
|
|
133
|
+
)
|
|
134
|
+
data_dict["conversation"] = conversation
|
|
135
|
+
|
|
136
|
+
return evaluation.evaluate(
|
|
137
|
+
slug=slug,
|
|
138
|
+
data=data_dict,
|
|
139
|
+
name=name,
|
|
140
|
+
settings=settings,
|
|
141
|
+
as_guardrail=as_guardrail,
|
|
142
|
+
)
|
|
112
143
|
|
|
113
144
|
|
|
114
145
|
@deprecated(
|
|
115
|
-
reason="Please use
|
|
146
|
+
reason="Please use `langwatch.evaluation.async_evaluate()` instead."
|
|
116
147
|
)
|
|
117
148
|
async def async_evaluate(
|
|
118
149
|
slug: str,
|
|
@@ -129,219 +160,87 @@ async def async_evaluate(
|
|
|
129
160
|
span: Optional["LangWatchSpan"] = None,
|
|
130
161
|
api_key: Optional[str] = None,
|
|
131
162
|
data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
|
|
132
|
-
) -> EvaluationResultModel:
|
|
163
|
+
) -> EvaluationResultModel:
|
|
164
|
+
"""
|
|
165
|
+
Deprecated: Use langwatch.evaluation.async_evaluate() instead.
|
|
166
|
+
|
|
167
|
+
The new API uses a simpler signature:
|
|
168
|
+
langwatch.evaluation.async_evaluate(slug, data, name, settings, as_guardrail)
|
|
169
|
+
"""
|
|
170
|
+
import langwatch.evaluation as evaluation
|
|
171
|
+
|
|
133
172
|
if trace:
|
|
134
|
-
warn(
|
|
135
|
-
"The `trace` argument is deprecated and will be removed
|
|
173
|
+
warnings.warn(
|
|
174
|
+
"The `trace` argument is deprecated and will be removed.",
|
|
175
|
+
DeprecationWarning,
|
|
136
176
|
stacklevel=2,
|
|
137
177
|
)
|
|
138
178
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
output=output,
|
|
147
|
-
expected_output=expected_output,
|
|
148
|
-
contexts=contexts,
|
|
149
|
-
expected_contexts=expected_contexts,
|
|
150
|
-
conversation=conversation,
|
|
151
|
-
settings=settings,
|
|
152
|
-
span=span,
|
|
153
|
-
as_guardrail=as_guardrail,
|
|
154
|
-
api_key=api_key,
|
|
155
|
-
data=data,
|
|
156
|
-
)
|
|
157
|
-
try:
|
|
158
|
-
async with httpx.AsyncClient(timeout=900) as client:
|
|
159
|
-
response = await client.post(**request_params)
|
|
160
|
-
better_raise_for_status(response)
|
|
161
|
-
except Exception as e:
|
|
162
|
-
return _handle_exception(e, span, as_guardrail)
|
|
163
|
-
|
|
164
|
-
return _handle_response(response.json(), span, as_guardrail)
|
|
165
|
-
|
|
166
|
-
raise ValueError("Async evaluate failed due to issue creating span")
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def _prepare_data(
|
|
170
|
-
slug: str,
|
|
171
|
-
name: Optional[str],
|
|
172
|
-
input: Optional[str],
|
|
173
|
-
output: Optional[str],
|
|
174
|
-
expected_output: Optional[str],
|
|
175
|
-
contexts: Optional[Union[List[RAGChunk], List[str]]] = None,
|
|
176
|
-
expected_contexts: Optional[Union[List[RAGChunk], List[str]]] = None,
|
|
177
|
-
conversation: Optional[Conversation] = None,
|
|
178
|
-
settings: Optional[Dict[str, Any]] = None,
|
|
179
|
-
trace_id: Optional[Union[str, UUID]] = None,
|
|
180
|
-
span_id: Optional[Union[str, UUID]] = None,
|
|
181
|
-
span: Optional["LangWatchSpan"] = None,
|
|
182
|
-
as_guardrail: bool = False,
|
|
183
|
-
api_key: Optional[str] = None,
|
|
184
|
-
data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
|
|
185
|
-
):
|
|
186
|
-
trace_data: Dict[str, Any] = {}
|
|
187
|
-
|
|
188
|
-
span_ctx = get_current_span().get_span_context()
|
|
189
|
-
if span_ctx and span_ctx.is_valid:
|
|
190
|
-
trace_data["trace_id"] = format(span_ctx.trace_id, "x")
|
|
191
|
-
trace_data["span_id"] = format(span_ctx.span_id, "x")
|
|
179
|
+
# Build data dict from legacy arguments
|
|
180
|
+
data_dict: Dict[str, Any] = {}
|
|
181
|
+
if data:
|
|
182
|
+
if isinstance(data, BasicEvaluateData):
|
|
183
|
+
data_dict = data.model_dump(exclude_unset=True, exclude_none=True)
|
|
184
|
+
else:
|
|
185
|
+
data_dict = data
|
|
192
186
|
|
|
193
|
-
|
|
194
|
-
**trace_data,
|
|
195
|
-
**(
|
|
196
|
-
data.model_dump(exclude_unset=True, exclude_none=True)
|
|
197
|
-
if isinstance(data, BasicEvaluateData)
|
|
198
|
-
else data or {}
|
|
199
|
-
),
|
|
200
|
-
}
|
|
187
|
+
# Map legacy positional arguments to data dict
|
|
201
188
|
if input is not None:
|
|
202
|
-
warn(
|
|
203
|
-
"
|
|
189
|
+
warnings.warn(
|
|
190
|
+
"The `input` argument is deprecated. Use `data={'input': ...}` instead.",
|
|
191
|
+
DeprecationWarning,
|
|
204
192
|
stacklevel=2,
|
|
205
193
|
)
|
|
206
|
-
|
|
194
|
+
data_dict["input"] = input
|
|
207
195
|
if output is not None:
|
|
208
|
-
warn(
|
|
209
|
-
"
|
|
196
|
+
warnings.warn(
|
|
197
|
+
"The `output` argument is deprecated. Use `data={'output': ...}` instead.",
|
|
198
|
+
DeprecationWarning,
|
|
210
199
|
stacklevel=2,
|
|
211
200
|
)
|
|
212
|
-
|
|
201
|
+
data_dict["output"] = output
|
|
213
202
|
if expected_output is not None:
|
|
214
|
-
warn(
|
|
215
|
-
"
|
|
203
|
+
warnings.warn(
|
|
204
|
+
"The `expected_output` argument is deprecated. Use `data={'expected_output': ...}` instead.",
|
|
205
|
+
DeprecationWarning,
|
|
216
206
|
stacklevel=2,
|
|
217
207
|
)
|
|
218
|
-
|
|
208
|
+
data_dict["expected_output"] = expected_output
|
|
219
209
|
if contexts is not None:
|
|
220
|
-
warn(
|
|
221
|
-
"
|
|
210
|
+
warnings.warn(
|
|
211
|
+
"The `contexts` argument is deprecated. Use `data={'contexts': ...}` instead.",
|
|
212
|
+
DeprecationWarning,
|
|
222
213
|
stacklevel=2,
|
|
223
214
|
)
|
|
224
|
-
|
|
215
|
+
data_dict["contexts"] = contexts
|
|
225
216
|
if expected_contexts is not None:
|
|
226
|
-
warn(
|
|
227
|
-
"
|
|
217
|
+
warnings.warn(
|
|
218
|
+
"The `expected_contexts` argument is deprecated. Use `data={'expected_contexts': ...}` instead.",
|
|
219
|
+
DeprecationWarning,
|
|
228
220
|
stacklevel=2,
|
|
229
221
|
)
|
|
230
|
-
|
|
222
|
+
data_dict["expected_contexts"] = expected_contexts
|
|
231
223
|
if conversation is not None:
|
|
232
|
-
warn(
|
|
233
|
-
"
|
|
224
|
+
warnings.warn(
|
|
225
|
+
"The `conversation` argument is deprecated. Use `data={'conversation': ...}` instead.",
|
|
226
|
+
DeprecationWarning,
|
|
234
227
|
stacklevel=2,
|
|
235
228
|
)
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
if span_id is not None:
|
|
245
|
-
warn(
|
|
246
|
-
"The `span_id` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Until that happens, the `span_id` will be mapped to `deprecated.span_id` in the data.",
|
|
247
|
-
stacklevel=2,
|
|
248
|
-
)
|
|
249
|
-
dataDict["deprecated.span_id"] = str(span_id)
|
|
250
|
-
|
|
251
|
-
if span:
|
|
252
|
-
span.update(
|
|
253
|
-
input=TypedValueJson(type="json", value=dataDict),
|
|
254
|
-
params=settings, # type: ignore
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
client = get_instance()
|
|
258
|
-
|
|
259
|
-
return {
|
|
260
|
-
"url": get_endpoint() + f"/api/evaluations/{slug}/evaluate",
|
|
261
|
-
"json": {
|
|
262
|
-
"trace_id": (
|
|
263
|
-
None
|
|
264
|
-
if client and client.disable_sending
|
|
265
|
-
else (
|
|
266
|
-
format(span_ctx.trace_id, "x")
|
|
267
|
-
if span_ctx and span_ctx.is_valid
|
|
268
|
-
else None
|
|
269
|
-
)
|
|
270
|
-
),
|
|
271
|
-
"span_id": (
|
|
272
|
-
None
|
|
273
|
-
if client and client.disable_sending
|
|
274
|
-
else (
|
|
275
|
-
format(span_ctx.span_id, "x")
|
|
276
|
-
if span_ctx and span_ctx.is_valid
|
|
277
|
-
else None
|
|
278
|
-
)
|
|
279
|
-
),
|
|
280
|
-
"name": name,
|
|
281
|
-
"data": dataDict,
|
|
282
|
-
"settings": settings,
|
|
283
|
-
"as_guardrail": as_guardrail,
|
|
284
|
-
},
|
|
285
|
-
"headers": {"X-Auth-Token": get_api_key()},
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
def _handle_response(
|
|
290
|
-
response: Dict[str, Any],
|
|
291
|
-
span: Optional["LangWatchSpan"] = None,
|
|
292
|
-
as_guardrail: bool = False,
|
|
293
|
-
) -> EvaluationResult:
|
|
294
|
-
result = EvaluationResultModel.model_validate(response)
|
|
295
|
-
if span:
|
|
296
|
-
span.update(
|
|
297
|
-
output=(
|
|
298
|
-
TypedValueGuardrailResult(
|
|
299
|
-
type="guardrail_result",
|
|
300
|
-
value=cast(
|
|
301
|
-
EvaluationResult,
|
|
302
|
-
result.model_dump(exclude_unset=True, exclude_none=True),
|
|
303
|
-
),
|
|
304
|
-
)
|
|
305
|
-
if as_guardrail
|
|
306
|
-
else TypedValueEvaluationResult(
|
|
307
|
-
type="evaluation_result",
|
|
308
|
-
value=cast(
|
|
309
|
-
EvaluationResult,
|
|
310
|
-
result.model_dump(exclude_unset=True, exclude_none=True),
|
|
311
|
-
),
|
|
312
|
-
)
|
|
313
|
-
)
|
|
314
|
-
)
|
|
315
|
-
if result.cost:
|
|
316
|
-
span.update(
|
|
317
|
-
metrics=SpanMetrics(
|
|
318
|
-
cost=result.cost.amount,
|
|
319
|
-
)
|
|
320
|
-
)
|
|
321
|
-
|
|
322
|
-
return result
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
def _handle_exception(
|
|
326
|
-
e: Exception, span: Optional["LangWatchSpan"] = None, as_guardrail: bool = False
|
|
327
|
-
):
|
|
328
|
-
response: Dict[str, Any] = {
|
|
329
|
-
"status": "error",
|
|
330
|
-
"details": repr(e),
|
|
331
|
-
}
|
|
332
|
-
if as_guardrail:
|
|
333
|
-
response["passed"] = True
|
|
334
|
-
return _handle_response(
|
|
335
|
-
response,
|
|
336
|
-
span,
|
|
337
|
-
as_guardrail,
|
|
229
|
+
data_dict["conversation"] = conversation
|
|
230
|
+
|
|
231
|
+
return await evaluation.async_evaluate(
|
|
232
|
+
slug=slug,
|
|
233
|
+
data=data_dict,
|
|
234
|
+
name=name,
|
|
235
|
+
settings=settings,
|
|
236
|
+
as_guardrail=as_guardrail,
|
|
338
237
|
)
|
|
339
238
|
|
|
340
239
|
|
|
341
240
|
@deprecated(
|
|
342
|
-
reason="Please use the new `langwatch.evaluation` module instead.
|
|
241
|
+
reason="Please use the new `langwatch.evaluation` module instead."
|
|
343
242
|
)
|
|
344
|
-
def _add_evaluation(
|
|
243
|
+
def _add_evaluation(
|
|
345
244
|
*,
|
|
346
245
|
span: Optional["LangWatchSpan"] = None,
|
|
347
246
|
evaluation_id: Optional[str] = None,
|
|
@@ -357,99 +256,30 @@ def _add_evaluation( # type: ignore
|
|
|
357
256
|
error: Optional[Exception] = None,
|
|
358
257
|
timestamps: Optional[EvaluationTimestamps] = None,
|
|
359
258
|
):
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
elif isinstance(cost, float) or isinstance(cost, int):
|
|
379
|
-
evaluation_result["cost"] = {"currency": "USD", "amount": cost}
|
|
380
|
-
else:
|
|
381
|
-
evaluation_result["cost"] = cost
|
|
382
|
-
|
|
383
|
-
eval_span_created = False
|
|
384
|
-
eval_span = span
|
|
385
|
-
|
|
386
|
-
if not span or span.type != "evaluation":
|
|
387
|
-
eval_span = langwatch.span(
|
|
388
|
-
type="evaluation", span_context=span.get_span_context() if span else None
|
|
389
|
-
)
|
|
390
|
-
eval_span_created = True
|
|
391
|
-
|
|
392
|
-
try:
|
|
393
|
-
eval_span.update(
|
|
394
|
-
name=name,
|
|
395
|
-
output=TypedValueEvaluationResult(
|
|
396
|
-
type="evaluation_result",
|
|
397
|
-
value=evaluation_result,
|
|
398
|
-
),
|
|
399
|
-
error=error,
|
|
400
|
-
timestamps=(
|
|
401
|
-
SpanTimestamps(
|
|
402
|
-
started_at=(
|
|
403
|
-
timestamps["started_at"]
|
|
404
|
-
if "started_at" in timestamps and timestamps["started_at"]
|
|
405
|
-
else cast(int, None)
|
|
406
|
-
),
|
|
407
|
-
finished_at=(
|
|
408
|
-
timestamps["finished_at"]
|
|
409
|
-
if "finished_at" in timestamps and timestamps["finished_at"]
|
|
410
|
-
else cast(int, None)
|
|
411
|
-
),
|
|
412
|
-
)
|
|
413
|
-
if timestamps
|
|
414
|
-
else None
|
|
415
|
-
),
|
|
416
|
-
)
|
|
417
|
-
if "cost" in evaluation_result and evaluation_result["cost"]:
|
|
418
|
-
eval_span.update(
|
|
419
|
-
metrics=SpanMetrics(cost=evaluation_result["cost"]["amount"])
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
span_id = None
|
|
423
|
-
span_ctx = eval_span.get_span_context()
|
|
424
|
-
if span_ctx and span_ctx.is_valid:
|
|
425
|
-
span_id = format(span_ctx.span_id, "x")
|
|
426
|
-
|
|
427
|
-
evaluation = Evaluation(
|
|
428
|
-
evaluation_id=evaluation_id or str(PKSUID("eval")),
|
|
429
|
-
span_id=span_id,
|
|
430
|
-
name=name,
|
|
431
|
-
type=type,
|
|
432
|
-
is_guardrail=is_guardrail,
|
|
433
|
-
status=status,
|
|
434
|
-
passed=passed,
|
|
435
|
-
score=score,
|
|
436
|
-
label=label,
|
|
437
|
-
details=details,
|
|
438
|
-
error=capture_exception(error) if error else None,
|
|
439
|
-
timestamps=timestamps,
|
|
440
|
-
)
|
|
259
|
+
"""
|
|
260
|
+
Deprecated: Use langwatch.evaluation._add_evaluation() instead.
|
|
261
|
+
"""
|
|
262
|
+
return _new_add_evaluation(
|
|
263
|
+
span=span,
|
|
264
|
+
evaluation_id=evaluation_id,
|
|
265
|
+
name=name,
|
|
266
|
+
type=type,
|
|
267
|
+
is_guardrail=is_guardrail,
|
|
268
|
+
status=status,
|
|
269
|
+
passed=passed,
|
|
270
|
+
score=score,
|
|
271
|
+
label=label,
|
|
272
|
+
details=details,
|
|
273
|
+
cost=cost,
|
|
274
|
+
error=error,
|
|
275
|
+
timestamps=timestamps,
|
|
276
|
+
)
|
|
441
277
|
|
|
442
|
-
span.add_event(
|
|
443
|
-
AttributeKey.LangWatchEventEvaluationCustom,
|
|
444
|
-
{
|
|
445
|
-
"json_encoded_event": json.dumps(
|
|
446
|
-
evaluation,
|
|
447
|
-
cls=SerializableWithStringFallback,
|
|
448
|
-
),
|
|
449
|
-
},
|
|
450
|
-
)
|
|
451
278
|
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
279
|
+
__all__ = [
|
|
280
|
+
"evaluate",
|
|
281
|
+
"async_evaluate",
|
|
282
|
+
"BasicEvaluateData",
|
|
283
|
+
"EvaluationResultModel",
|
|
284
|
+
"_add_evaluation",
|
|
285
|
+
]
|