langwatch 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff compares publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- langwatch/__init__.py +6 -3
- langwatch/__version__.py +1 -1
- langwatch/evaluation/__init__.py +518 -17
- langwatch/evaluations.py +183 -353
- langwatch/experiment/__init__.py +108 -0
- langwatch/{evaluation/evaluation.py → experiment/experiment.py} +44 -5
- langwatch/{evaluation → experiment}/platform_run.py +40 -67
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/METADATA +1 -1
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/RECORD +10 -9
- {langwatch-0.9.0.dist-info → langwatch-0.10.0.dist-info}/WHEEL +0 -0
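
The headline change in 0.10.0 is visible in the file list above: the batch-experiment code moves from `langwatch/evaluation/` to a new `langwatch/experiment/` package, while `langwatch/evaluation/__init__.py` is rebuilt around online evaluations and guardrails and keeps deprecated `init()`/`run()` wrappers. A minimal migration sketch, assuming `langwatch.experiment.init()` keeps the signature of the deprecated wrapper shown further down and that the SDK is already configured with an API key (the experiment name is a placeholder):

```python
import langwatch

# 0.9.0-style call: still works in 0.10.0, but now emits a DeprecationWarning
experiment = langwatch.evaluation.init("my-experiment")

# 0.10.0 preferred call
experiment = langwatch.experiment.init("my-experiment")
```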
langwatch/__init__.py
CHANGED
````diff
@@ -17,7 +17,8 @@ from typing import TYPE_CHECKING
 # Type hints for IntelliSense (only imported for typing)
 if TYPE_CHECKING:
     import langwatch.evaluations as evaluations
-    import langwatch.
+    import langwatch.experiment as experiment
+    import langwatch.evaluation as evaluation  # Deprecated, use experiment
     import langwatch.dataset as dataset
     import langwatch.dspy as dspy
     import langwatch.langchain as langchain
@@ -41,7 +42,8 @@ def _api_key():
 # Lazy loading configuration
 _LAZY_MODULES = {
     "evaluations": "langwatch.evaluations",
-    "
+    "experiment": "langwatch.experiment",
+    "evaluation": "langwatch.evaluation",  # Deprecated, use experiment
     "dataset": "langwatch.dataset",
     "dspy": "langwatch.dspy",  # Special handling
     "langchain": "langwatch.langchain",  # Special handling
@@ -150,7 +152,8 @@ __all__ = [
     "ensure_setup",
     "get_current_trace",
     "get_current_span",
-    "
+    "experiment",
+    "evaluation",  # Deprecated, use experiment
     "dataset",
     "evaluations",
     "langchain",
````
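
The hunk above only registers the new modules in `_LAZY_MODULES`; the loader that consumes the registry sits outside this diff. As a rough illustration only (not the SDK's actual implementation), a registry like this is commonly resolved through a PEP 562 module-level `__getattr__`, along these lines:

```python
# Hypothetical sketch of how a _LAZY_MODULES registry is typically consumed;
# the real loader in langwatch/__init__.py is not part of this diff.
import importlib

_LAZY_MODULES = {
    "experiment": "langwatch.experiment",
    "evaluation": "langwatch.evaluation",  # Deprecated, use experiment
}


def __getattr__(name: str):
    module_path = _LAZY_MODULES.get(name)
    if module_path is None:
        raise AttributeError(f"module 'langwatch' has no attribute {name!r}")
    return importlib.import_module(module_path)
```

With this pattern, `langwatch.experiment` is only imported on first attribute access, so `import langwatch` stays cheap and the deprecated `evaluation` module is not loaded unless someone still uses it.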
langwatch/__version__.py
CHANGED
langwatch/evaluation/__init__.py
CHANGED
````diff
@@ -1,36 +1,537 @@
-
-
-
-
-
-
-
-
-
-
-
+"""
+langwatch.evaluation - Online Evaluations and Guardrails API
+
+This module provides the ability to run evaluators and guardrails in real-time
+against LLM inputs/outputs.
+
+Example:
+    ```python
+    import langwatch
+
+    # Run a guardrail
+    guardrail = langwatch.evaluation.evaluate(
+        "presidio/pii_detection",
+        data={"input": user_input, "output": generated_response},
+        name="PII Detection",
+        as_guardrail=True,
+    )
+
+    if not guardrail.passed:
+        return "I'm sorry, I can't do that."
+    ```
+
+This module also provides backward compatibility for the deprecated evaluation/experiment API.
+For batch experiments, use `langwatch.experiment` instead.
+"""
+import json
+import warnings
+from typing import Any, Dict, List, Literal, Optional, Union, cast, TYPE_CHECKING
+from uuid import UUID
+
+import httpx
+import langwatch
+from langwatch.domain import SpanTimestamps
+from pksuid import PKSUID
+from langwatch.telemetry.span import LangWatchSpan
+from langwatch.telemetry.context import get_current_span
+from langwatch.state import get_api_key, get_endpoint, get_instance
+from langwatch.attributes import AttributeKey
+from langwatch.utils.exceptions import EvaluatorException, better_raise_for_status
+from pydantic import BaseModel
+
+from langwatch.types import (
+    Conversation,
+    Evaluation as _EvaluationTypedDict,
+    EvaluationResult,
+    EvaluationTimestamps,
+    Money,
+    MoneyDict,
+    SpanMetrics,
+    RAGChunk,
+    TypedValueEvaluationResult,
+    TypedValueGuardrailResult,
+    TypedValueJson,
+)
+from langwatch.utils.exceptions import capture_exception
+from langwatch.utils.transformation import (
+    SerializableWithStringFallback,
+)
+
+# Re-export from experiment module for backward compatibility
+from langwatch.experiment.experiment import Experiment as _Experiment
+from langwatch.experiment.platform_run import (
+    run as _experiment_run,
+    ExperimentRunResult,
+    ExperimentRunSummary,
+    ExperimentNotFoundError,
+    ExperimentTimeoutError,
+    ExperimentRunFailedError,
+    ExperimentsApiError,
     TargetStats,
     EvaluatorStats,
 )
 
+if TYPE_CHECKING:
+    from langwatch.telemetry.tracing import LangWatchTrace
+
+
+# ============================================================================
+# Online Evaluation / Guardrail Types
+# ============================================================================
+
+class BasicEvaluateData(BaseModel):
+    """Helper class for structuring evaluation data."""
+
+    input: Optional[str] = None
+    output: Optional[str] = None
+    expected_output: Optional[str] = None
+    contexts: Optional[Union[List[RAGChunk], List[str]]] = None
+    expected_contexts: Optional[Union[List[RAGChunk], List[str]]] = None
+    conversation: Optional[Conversation] = None
+
+
+class EvaluationResultModel(BaseModel):
+    """Result model returned from running an evaluator."""
+
+    status: Literal["processed", "skipped", "error"]
+    passed: Optional[bool] = None
+    score: Optional[float] = None
+    details: Optional[str] = None
+    label: Optional[str] = None
+    cost: Optional[Money] = None
+    error_type: Optional[str] = None
+
+
+# ============================================================================
+# Online Evaluation / Guardrail Functions
+# ============================================================================
+
+
+def evaluate(
+    slug: str,
+    data: Union[BasicEvaluateData, Dict[str, Any]],
+    name: Optional[str] = None,
+    settings: Optional[Dict[str, Any]] = None,
+    as_guardrail: bool = False,
+) -> EvaluationResultModel:
+    """
+    Run an evaluator or guardrail against provided data.
+
+    Creates an OpenTelemetry span attached to the current trace context,
+    calls the LangWatch evaluation API, and returns the result.
+
+    Args:
+        slug: The evaluator slug (e.g., "presidio/pii_detection", "langevals/llm_boolean")
+        data: Data to pass to the evaluator (input, output, contexts, etc.)
+        name: Human-readable name for this evaluation
+        settings: Evaluator-specific settings
+        as_guardrail: Whether to run as a guardrail (affects error handling)
+
+    Returns:
+        EvaluationResultModel with status, passed, score, details, label, and cost
+
+    Example:
+        ```python
+        import langwatch
+
+        # Run as a guardrail (synchronous evaluation that can block responses)
+        guardrail = langwatch.evaluation.evaluate(
+            "presidio/pii_detection",
+            data={"input": user_input, "output": generated_response},
+            name="PII Detection Guardrail",
+            as_guardrail=True,
+        )
+
+        if not guardrail.passed:
+            print("PII detected:", guardrail.details)
+            return "Sorry, I cannot process that request."
+        ```
+    """
+    with langwatch.span(
+        name=name or slug, type="guardrail" if as_guardrail else "evaluation"
+    ) as span:
+        request_params = _prepare_data(
+            slug=slug,
+            name=name,
+            data=data,
+            settings=settings,
+            span=span,
+            as_guardrail=as_guardrail,
+        )
+        try:
+            with httpx.Client(timeout=900) as client:
+                response = client.post(**request_params)
+                better_raise_for_status(response, cls=EvaluatorException)
+        except Exception as e:
+            return _handle_exception(e, span, as_guardrail)
+
+        return _handle_response(response.json(), span, as_guardrail)
+
+    raise ValueError("Evaluate failed due to issue creating span")
+
+
+async def async_evaluate(
+    slug: str,
+    data: Union[BasicEvaluateData, Dict[str, Any]],
+    name: Optional[str] = None,
+    settings: Optional[Dict[str, Any]] = None,
+    as_guardrail: bool = False,
+) -> EvaluationResultModel:
+    """
+    Async version of evaluate().
+
+    Run an evaluator or guardrail against provided data asynchronously.
+
+    Args:
+        slug: The evaluator slug (e.g., "presidio/pii_detection", "langevals/llm_boolean")
+        data: Data to pass to the evaluator (input, output, contexts, etc.)
+        name: Human-readable name for this evaluation
+        settings: Evaluator-specific settings
+        as_guardrail: Whether to run as a guardrail (affects error handling)
+
+    Returns:
+        EvaluationResultModel with status, passed, score, details, label, and cost
+
+    Example:
+        ```python
+        import langwatch
+
+        # Run as an online evaluation (async scoring for monitoring)
+        result = await langwatch.evaluation.async_evaluate(
+            "langevals/llm_boolean",
+            data={"input": question, "output": response},
+            name="Quality Check",
+            settings={"prompt": "Check if the response answers the question."},
+        )
 
-
-
-
-
+        print("Score:", result.score)
+        ```
+    """
+    with langwatch.span(
+        name=name or slug, type="guardrail" if as_guardrail else "evaluation"
+    ) as span:
+        request_params = _prepare_data(
+            slug=slug,
+            name=name,
+            data=data,
+            settings=settings,
+            span=span,
+            as_guardrail=as_guardrail,
+        )
+        try:
+            async with httpx.AsyncClient(timeout=900) as client:
+                response = await client.post(**request_params)
+                better_raise_for_status(response)
+        except Exception as e:
+            return _handle_exception(e, span, as_guardrail)
+
+        return _handle_response(response.json(), span, as_guardrail)
+
+    raise ValueError("Async evaluate failed due to issue creating span")
+
+
+def _prepare_data(
+    slug: str,
+    name: Optional[str],
+    data: Union[BasicEvaluateData, Dict[str, Any]],
+    settings: Optional[Dict[str, Any]] = None,
+    span: Optional["LangWatchSpan"] = None,
+    as_guardrail: bool = False,
+):
+    """Prepare request data for the evaluation API."""
+    trace_data: Dict[str, Any] = {}
+
+    span_ctx = get_current_span().get_span_context()
+    if span_ctx and span_ctx.is_valid:
+        trace_data["trace_id"] = format(span_ctx.trace_id, "x")
+        trace_data["span_id"] = format(span_ctx.span_id, "x")
+
+    dataDict: Dict[str, Any] = {
+        **trace_data,
+        **(
+            data.model_dump(exclude_unset=True, exclude_none=True)
+            if isinstance(data, BasicEvaluateData)
+            else data or {}
+        ),
+    }
+
+    if span:
+        span.update(
+            input=TypedValueJson(type="json", value=dataDict),
+            params=settings,  # type: ignore
+        )
+
+    client = get_instance()
+
+    return {
+        "url": get_endpoint() + f"/api/evaluations/{slug}/evaluate",
+        "json": {
+            "trace_id": (
+                None
+                if client and client.disable_sending
+                else (
+                    format(span_ctx.trace_id, "x")
+                    if span_ctx and span_ctx.is_valid
+                    else None
+                )
+            ),
+            "span_id": (
+                None
+                if client and client.disable_sending
+                else (
+                    format(span_ctx.span_id, "x")
+                    if span_ctx and span_ctx.is_valid
+                    else None
+                )
+            ),
+            "name": name,
+            "data": dataDict,
+            "settings": settings,
+            "as_guardrail": as_guardrail,
+        },
+        "headers": {"X-Auth-Token": get_api_key()},
+    }
+
+
+def _handle_response(
+    response: Dict[str, Any],
+    span: Optional["LangWatchSpan"] = None,
+    as_guardrail: bool = False,
+) -> EvaluationResultModel:
+    """Handle API response and update span."""
+    result = EvaluationResultModel.model_validate(response)
+    if span:
+        span.update(
+            output=(
+                TypedValueGuardrailResult(
+                    type="guardrail_result",
+                    value=cast(
+                        EvaluationResult,
+                        result.model_dump(exclude_unset=True, exclude_none=True),
+                    ),
+                )
+                if as_guardrail
+                else TypedValueEvaluationResult(
+                    type="evaluation_result",
+                    value=cast(
+                        EvaluationResult,
+                        result.model_dump(exclude_unset=True, exclude_none=True),
+                    ),
+                )
+            )
+        )
+        if result.cost:
+            span.update(
+                metrics=SpanMetrics(
+                    cost=result.cost.amount,
+                )
+            )
+
+    return result
+
+
+def _handle_exception(
+    e: Exception, span: Optional["LangWatchSpan"] = None, as_guardrail: bool = False
+) -> EvaluationResultModel:
+    """Handle exceptions during evaluation."""
+    response: Dict[str, Any] = {
+        "status": "error",
+        "details": repr(e),
+    }
+    if as_guardrail:
+        response["passed"] = True
+    return _handle_response(
+        response,
+        span,
+        as_guardrail,
+    )
+
+
+def _add_evaluation(  # type: ignore
+    *,
+    span: Optional["LangWatchSpan"] = None,
+    evaluation_id: Optional[str] = None,
+    name: str,
+    type: Optional[str] = None,
+    is_guardrail: Optional[bool] = None,
+    status: Literal["processed", "skipped", "error"] = "processed",
+    passed: Optional[bool] = None,
+    score: Optional[float] = None,
+    label: Optional[str] = None,
+    details: Optional[str] = None,
+    cost: Optional[Union[Money, MoneyDict, float]] = None,
+    error: Optional[Exception] = None,
+    timestamps: Optional[EvaluationTimestamps] = None,
+):
+    """Add a manual evaluation result to a span."""
+    if not span or not span.trace:
+        raise ValueError("No span or trace found, could not add evaluation to span")
+
+    evaluation_result = EvaluationResult(status=status)
+    if passed is not None:
+        evaluation_result["passed"] = passed
+    if score is not None:
+        evaluation_result["score"] = score
+    if label is not None:
+        evaluation_result["label"] = label
+    if details is not None:
+        evaluation_result["details"] = details
+    if cost is not None:
+        if isinstance(cost, Money):
+            evaluation_result["cost"] = {
+                "currency": cost.currency,
+                "amount": cost.amount,
+            }
+        elif isinstance(cost, float) or isinstance(cost, int):
+            evaluation_result["cost"] = {"currency": "USD", "amount": cost}
+        else:
+            evaluation_result["cost"] = cost
+
+    eval_span_created = False
+    eval_span = span
+
+    if not span or span.type != "evaluation":
+        eval_span = langwatch.span(
+            type="evaluation", span_context=span.get_span_context() if span else None
+        )
+        eval_span_created = True
+
+    try:
+        eval_span.update(
+            name=name,
+            output=TypedValueEvaluationResult(
+                type="evaluation_result",
+                value=evaluation_result,
+            ),
+            error=error,
+            timestamps=(
+                SpanTimestamps(
+                    started_at=(
+                        timestamps["started_at"]
+                        if "started_at" in timestamps and timestamps["started_at"]
+                        else cast(int, None)
+                    ),
+                    finished_at=(
+                        timestamps["finished_at"]
+                        if "finished_at" in timestamps and timestamps["finished_at"]
+                        else cast(int, None)
+                    ),
+                )
+                if timestamps
+                else None
+            ),
+        )
+        if "cost" in evaluation_result and evaluation_result["cost"]:
+            eval_span.update(
+                metrics=SpanMetrics(cost=evaluation_result["cost"]["amount"])
+            )
+
+        span_id = None
+        span_ctx = eval_span.get_span_context()
+        if span_ctx and span_ctx.is_valid:
+            span_id = format(span_ctx.span_id, "x")
+
+        evaluation = _EvaluationTypedDict(
+            evaluation_id=evaluation_id or str(PKSUID("eval")),
+            span_id=span_id,
+            name=name,
+            type=type,
+            is_guardrail=is_guardrail,
+            status=status,
+            passed=passed,
+            score=score,
+            label=label,
+            details=details,
+            error=capture_exception(error) if error else None,
+            timestamps=timestamps,
+        )
+
+        span.add_event(
+            AttributeKey.LangWatchEventEvaluationCustom,
+            {
+                "json_encoded_event": json.dumps(
+                    evaluation,
+                    cls=SerializableWithStringFallback,
+                ),
+            },
+        )
+
+    finally:
+        # If the span was created by the function, we need to end it
+        if eval_span_created:
+            eval_span.end()
+
+
+# ============================================================================
+# Deprecated Backward Compatibility for Experiment API
+# ============================================================================
+
+# Deprecated aliases for old names
+EvaluationRunResult = ExperimentRunResult
+EvaluationRunSummary = ExperimentRunSummary
+EvaluationNotFoundError = ExperimentNotFoundError
+EvaluationTimeoutError = ExperimentTimeoutError
+EvaluationRunFailedError = ExperimentRunFailedError
+EvaluationsApiError = ExperimentsApiError
+
+# Keep Evaluation as alias to Experiment for backward compatibility
+Evaluation = _Experiment
+
+
+def run(*args, **kwargs) -> ExperimentRunResult:
+    """
+    Deprecated: Use langwatch.experiment.run() instead.
+
+    This function runs a platform-configured experiment.
+    """
+    warnings.warn(
+        "langwatch.evaluation.run() is deprecated, use langwatch.experiment.run() instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return _experiment_run(*args, **kwargs)
+
+
+def init(name: str, *, run_id: Optional[str] = None) -> _Experiment:
+    """
+    Deprecated: Use langwatch.experiment.init() instead.
+
+    This function initializes an SDK-defined experiment.
+    """
+    warnings.warn(
+        "langwatch.evaluation.init() is deprecated, use langwatch.experiment.init() instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    experiment = _Experiment(name, run_id=run_id)
+    experiment.init()
+    return experiment
 
 
 __all__ = [
-
+    # Online Evaluation / Guardrails API (new, recommended)
     "evaluate",
-    "
+    "async_evaluate",
+    "BasicEvaluateData",
+    "EvaluationResultModel",
+    # Deprecated experiment compatibility
+    "init",
+    "run",
     "Evaluation",
+    # Old names (deprecated)
     "EvaluationRunResult",
     "EvaluationRunSummary",
     "EvaluationNotFoundError",
     "EvaluationTimeoutError",
     "EvaluationRunFailedError",
     "EvaluationsApiError",
+    # New experiment names (prefer langwatch.experiment)
+    "ExperimentRunResult",
+    "ExperimentRunSummary",
+    "ExperimentNotFoundError",
+    "ExperimentTimeoutError",
+    "ExperimentRunFailedError",
+    "ExperimentsApiError",
     "TargetStats",
     "EvaluatorStats",
 ]
````
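
Putting the rewritten `langwatch.evaluation` module together, here is a short usage sketch based on the functions added above; the evaluator slugs and settings are taken from the docstring examples in the diff, and it assumes the SDK is already configured with a LangWatch API key:

```python
import langwatch
from langwatch.evaluation import BasicEvaluateData

# Assumes LangWatch is already set up (API key / endpoint configured).
data = BasicEvaluateData(
    input="What is the capital of France?",
    output="The capital of France is Paris.",
)

# Online evaluation: records a score on the current trace.
result = langwatch.evaluation.evaluate(
    "langevals/llm_boolean",
    data=data,
    name="Answer correctness",
    settings={"prompt": "Check if the response answers the question."},
)
print(result.status, result.passed, result.score)

# Guardrail mode: note that _handle_exception fails open (passed=True),
# so an evaluator error does not block the user-facing response.
guardrail = langwatch.evaluation.evaluate(
    "presidio/pii_detection",
    data={"input": data.input, "output": data.output},
    name="PII Detection",
    as_guardrail=True,
)
if not guardrail.passed:
    print("Blocked:", guardrail.details)
```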