langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langwatch/evaluations.py CHANGED
@@ -1,63 +1,54 @@
1
- import json
2
- from typing import Any, Dict, List, Literal, Optional, Union, cast, TYPE_CHECKING
1
+ """
2
+ langwatch.evaluations - DEPRECATED, use langwatch.evaluation instead.
3
+
4
+ This module is kept for backward compatibility. All functionality has moved
5
+ to langwatch.evaluation (singular).
6
+
7
+ Example migration:
8
+ # Old (deprecated)
9
+ from langwatch.evaluations import evaluate
10
+ result = evaluate("presidio/pii_detection", input="test", output="response")
11
+
12
+ # New (recommended)
13
+ import langwatch
14
+ result = langwatch.evaluation.evaluate(
15
+ "presidio/pii_detection",
16
+ data={"input": "test", "output": "response"}
17
+ )
18
+ """
19
+ import warnings
20
+ from typing import Any, Dict, List, Literal, Optional, Union, TYPE_CHECKING
3
21
  from uuid import UUID
4
- from warnings import warn
22
+
5
23
  from deprecated import deprecated
6
24
 
7
- import httpx
8
- import langwatch
9
- from langwatch.domain import SpanTimestamps
10
- from pksuid import PKSUID
11
- from langwatch.telemetry.span import LangWatchSpan
12
- from langwatch.telemetry.context import get_current_span
13
- from langwatch.state import get_api_key, get_endpoint, get_instance
14
- from langwatch.attributes import AttributeKey
15
- from langwatch.utils.exceptions import EvaluatorException, better_raise_for_status
16
- from pydantic import BaseModel
25
+ # Re-export the shared types and internal helpers from the evaluation module
26
+ from langwatch.evaluation import (
27
+ # Types
28
+ BasicEvaluateData,
29
+ EvaluationResultModel,
30
+ # Internal functions
31
+ _prepare_data as _new_prepare_data,
32
+ _handle_response,
33
+ _handle_exception,
34
+ _add_evaluation as _new_add_evaluation,
35
+ )
17
36
 
18
37
  from langwatch.types import (
19
38
  Conversation,
20
- Evaluation,
21
- EvaluationResult,
22
39
  EvaluationTimestamps,
23
40
  Money,
24
41
  MoneyDict,
25
- SpanMetrics,
26
42
  RAGChunk,
27
- TypedValueEvaluationResult,
28
- TypedValueGuardrailResult,
29
- TypedValueJson,
30
- )
31
- from langwatch.utils.exceptions import capture_exception
32
- from langwatch.utils.transformation import (
33
- SerializableWithStringFallback,
34
43
  )
35
44
 
36
45
  if TYPE_CHECKING:
37
46
  from langwatch.telemetry.tracing import LangWatchTrace
38
-
39
-
40
- class BasicEvaluateData(BaseModel):
41
- input: Optional[str] = None
42
- output: Optional[str] = None
43
- expected_output: Optional[str] = None
44
- contexts: Optional[Union[List[RAGChunk], List[str]]] = None
45
- expected_contexts: Optional[Union[List[RAGChunk], List[str]]] = None
46
- conversation: Optional[Conversation] = None
47
-
48
-
49
- class EvaluationResultModel(BaseModel):
50
- status: Literal["processed", "skipped", "error"]
51
- passed: Optional[bool] = None
52
- score: Optional[float] = None
53
- details: Optional[str] = None
54
- label: Optional[str] = None
55
- cost: Optional[Money] = None
56
- error_type: Optional[str] = None
47
+ from langwatch.telemetry.span import LangWatchSpan
57
48
 
58
49
 
59
50
  @deprecated(
60
- reason="Please use the new `langwatch.evaluation` module instead. TODO: Link to migration guide"
51
+ reason="Please use `langwatch.evaluation.evaluate()` instead."
61
52
  )
62
53
  def evaluate(
63
54
  slug: str,
@@ -74,45 +65,85 @@ def evaluate(
74
65
  span: Optional["LangWatchSpan"] = None,
75
66
  api_key: Optional[str] = None,
76
67
  data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
77
- ) -> EvaluationResultModel: # type: ignore
68
+ ) -> EvaluationResultModel:
69
+ """
70
+ Deprecated: Use langwatch.evaluation.evaluate() instead.
71
+
72
+ The new API uses a simpler signature:
73
+ langwatch.evaluation.evaluate(slug, data, name, settings, as_guardrail)
74
+ """
75
+ import langwatch.evaluation as evaluation
76
+
78
77
  if trace:
79
- warn(
80
- "The `trace` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Supplying this argument will have no effect. Please use the `span` argument instead.",
78
+ warnings.warn(
79
+ "The `trace` argument is deprecated and will be removed.",
80
+ DeprecationWarning,
81
81
  stacklevel=2,
82
82
  )
83
83
 
84
- with langwatch.span(
85
- name=name or slug, type="guardrail" if as_guardrail else "evaluation"
86
- ) as span:
87
- request_params = _prepare_data(
88
- slug=slug,
89
- name=name,
90
- input=input,
91
- output=output,
92
- expected_output=expected_output,
93
- contexts=contexts,
94
- expected_contexts=expected_contexts,
95
- conversation=conversation,
96
- settings=settings,
97
- span=span,
98
- as_guardrail=as_guardrail,
99
- api_key=api_key,
100
- data=data,
101
- )
102
- try:
103
- with httpx.Client(timeout=900) as client:
104
- response = client.post(**request_params)
105
- better_raise_for_status(response, cls=EvaluatorException)
106
- except Exception as e:
107
- return _handle_exception(e, span, as_guardrail)
108
-
109
- return _handle_response(response.json(), span, as_guardrail)
84
+ # Build data dict from legacy arguments
85
+ data_dict: Dict[str, Any] = {}
86
+ if data:
87
+ if isinstance(data, BasicEvaluateData):
88
+ data_dict = data.model_dump(exclude_unset=True, exclude_none=True)
89
+ else:
90
+ data_dict = data
110
91
 
111
- raise ValueError("Evaluate failed due to issue creating span")
92
+ # Map legacy per-field arguments into the data dict (they override matching keys from `data`)
93
+ if input is not None:
94
+ warnings.warn(
95
+ "The `input` argument is deprecated. Use `data={'input': ...}` instead.",
96
+ DeprecationWarning,
97
+ stacklevel=2,
98
+ )
99
+ data_dict["input"] = input
100
+ if output is not None:
101
+ warnings.warn(
102
+ "The `output` argument is deprecated. Use `data={'output': ...}` instead.",
103
+ DeprecationWarning,
104
+ stacklevel=2,
105
+ )
106
+ data_dict["output"] = output
107
+ if expected_output is not None:
108
+ warnings.warn(
109
+ "The `expected_output` argument is deprecated. Use `data={'expected_output': ...}` instead.",
110
+ DeprecationWarning,
111
+ stacklevel=2,
112
+ )
113
+ data_dict["expected_output"] = expected_output
114
+ if contexts is not None:
115
+ warnings.warn(
116
+ "The `contexts` argument is deprecated. Use `data={'contexts': ...}` instead.",
117
+ DeprecationWarning,
118
+ stacklevel=2,
119
+ )
120
+ data_dict["contexts"] = contexts
121
+ if expected_contexts is not None:
122
+ warnings.warn(
123
+ "The `expected_contexts` argument is deprecated. Use `data={'expected_contexts': ...}` instead.",
124
+ DeprecationWarning,
125
+ stacklevel=2,
126
+ )
127
+ data_dict["expected_contexts"] = expected_contexts
128
+ if conversation is not None:
129
+ warnings.warn(
130
+ "The `conversation` argument is deprecated. Use `data={'conversation': ...}` instead.",
131
+ DeprecationWarning,
132
+ stacklevel=2,
133
+ )
134
+ data_dict["conversation"] = conversation
135
+
136
+ return evaluation.evaluate(
137
+ slug=slug,
138
+ data=data_dict,
139
+ name=name,
140
+ settings=settings,
141
+ as_guardrail=as_guardrail,
142
+ )
112
143
 
113
144
 
114
145
  @deprecated(
115
- reason="Please use the new `langwatch.evaluation` module instead. TODO: Link to migration guide"
146
+ reason="Please use `langwatch.evaluation.async_evaluate()` instead."
116
147
  )
117
148
  async def async_evaluate(
118
149
  slug: str,
@@ -129,219 +160,87 @@ async def async_evaluate(
129
160
  span: Optional["LangWatchSpan"] = None,
130
161
  api_key: Optional[str] = None,
131
162
  data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
132
- ) -> EvaluationResultModel: # type: ignore
163
+ ) -> EvaluationResultModel:
164
+ """
165
+ Deprecated: Use langwatch.evaluation.async_evaluate() instead.
166
+
167
+ The new API uses a simpler signature:
168
+ langwatch.evaluation.async_evaluate(slug, data, name, settings, as_guardrail)
169
+ """
170
+ import langwatch.evaluation as evaluation
171
+
133
172
  if trace:
134
- warn(
135
- "The `trace` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Supplying this argument will have no effect. Please use the `span` argument instead.",
173
+ warnings.warn(
174
+ "The `trace` argument is deprecated and will be removed.",
175
+ DeprecationWarning,
136
176
  stacklevel=2,
137
177
  )
138
178
 
139
- with langwatch.span(
140
- name=name or slug, type="guardrail" if as_guardrail else "evaluation"
141
- ) as span:
142
- request_params = _prepare_data(
143
- slug=slug,
144
- name=name,
145
- input=input,
146
- output=output,
147
- expected_output=expected_output,
148
- contexts=contexts,
149
- expected_contexts=expected_contexts,
150
- conversation=conversation,
151
- settings=settings,
152
- span=span,
153
- as_guardrail=as_guardrail,
154
- api_key=api_key,
155
- data=data,
156
- )
157
- try:
158
- async with httpx.AsyncClient(timeout=900) as client:
159
- response = await client.post(**request_params)
160
- better_raise_for_status(response)
161
- except Exception as e:
162
- return _handle_exception(e, span, as_guardrail)
163
-
164
- return _handle_response(response.json(), span, as_guardrail)
165
-
166
- raise ValueError("Async evaluate failed due to issue creating span")
167
-
168
-
169
- def _prepare_data(
170
- slug: str,
171
- name: Optional[str],
172
- input: Optional[str],
173
- output: Optional[str],
174
- expected_output: Optional[str],
175
- contexts: Optional[Union[List[RAGChunk], List[str]]] = None,
176
- expected_contexts: Optional[Union[List[RAGChunk], List[str]]] = None,
177
- conversation: Optional[Conversation] = None,
178
- settings: Optional[Dict[str, Any]] = None,
179
- trace_id: Optional[Union[str, UUID]] = None,
180
- span_id: Optional[Union[str, UUID]] = None,
181
- span: Optional["LangWatchSpan"] = None,
182
- as_guardrail: bool = False,
183
- api_key: Optional[str] = None,
184
- data: Optional[Union[BasicEvaluateData, Dict[str, Any]]] = None,
185
- ):
186
- trace_data: Dict[str, Any] = {}
187
-
188
- span_ctx = get_current_span().get_span_context()
189
- if span_ctx and span_ctx.is_valid:
190
- trace_data["trace_id"] = format(span_ctx.trace_id, "x")
191
- trace_data["span_id"] = format(span_ctx.span_id, "x")
179
+ # Build data dict from legacy arguments
180
+ data_dict: Dict[str, Any] = {}
181
+ if data:
182
+ if isinstance(data, BasicEvaluateData):
183
+ data_dict = data.model_dump(exclude_unset=True, exclude_none=True)
184
+ else:
185
+ data_dict = data
192
186
 
193
- dataDict: Dict[str, Any] = {
194
- **trace_data,
195
- **(
196
- data.model_dump(exclude_unset=True, exclude_none=True)
197
- if isinstance(data, BasicEvaluateData)
198
- else data or {}
199
- ),
200
- }
187
+ # Map legacy per-field arguments into the data dict (they override matching keys from `data`)
201
188
  if input is not None:
202
- warn(
203
- "For the `evaluate` or `async_evaluate` function, the `input` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `input` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
189
+ warnings.warn(
190
+ "The `input` argument is deprecated. Use `data={'input': ...}` instead.",
191
+ DeprecationWarning,
204
192
  stacklevel=2,
205
193
  )
206
- dataDict["input"] = input
194
+ data_dict["input"] = input
207
195
  if output is not None:
208
- warn(
209
- "For the `evaluate` or `async_evaluate` function, the `output` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `output` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
196
+ warnings.warn(
197
+ "The `output` argument is deprecated. Use `data={'output': ...}` instead.",
198
+ DeprecationWarning,
210
199
  stacklevel=2,
211
200
  )
212
- dataDict["output"] = output
201
+ data_dict["output"] = output
213
202
  if expected_output is not None:
214
- warn(
215
- "For the `evaluate` or `async_evaluate` function, the `expected_output` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `expected_output` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
203
+ warnings.warn(
204
+ "The `expected_output` argument is deprecated. Use `data={'expected_output': ...}` instead.",
205
+ DeprecationWarning,
216
206
  stacklevel=2,
217
207
  )
218
- dataDict["expected_output"] = expected_output
208
+ data_dict["expected_output"] = expected_output
219
209
  if contexts is not None:
220
- warn(
221
- "For the `evaluate` or `async_evaluate` function, the `contexts` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `contexts` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
210
+ warnings.warn(
211
+ "The `contexts` argument is deprecated. Use `data={'contexts': ...}` instead.",
212
+ DeprecationWarning,
222
213
  stacklevel=2,
223
214
  )
224
- dataDict["contexts"] = contexts
215
+ data_dict["contexts"] = contexts
225
216
  if expected_contexts is not None:
226
- warn(
227
- "For the `evaluate` or `async_evaluate` function, the `expected_contexts` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `expected_contexts` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
217
+ warnings.warn(
218
+ "The `expected_contexts` argument is deprecated. Use `data={'expected_contexts': ...}` instead.",
219
+ DeprecationWarning,
228
220
  stacklevel=2,
229
221
  )
230
- dataDict["expected_contexts"] = expected_contexts
222
+ data_dict["expected_contexts"] = expected_contexts
231
223
  if conversation is not None:
232
- warn(
233
- "For the `evaluate` or `async_evaluate` function, the `conversation` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Please use the `data` argument instead, you can use the `conversation` key in the `data` argument, or use the helper class `BasicEvaluateData`.",
224
+ warnings.warn(
225
+ "The `conversation` argument is deprecated. Use `data={'conversation': ...}` instead.",
226
+ DeprecationWarning,
234
227
  stacklevel=2,
235
228
  )
236
- dataDict["conversation"] = conversation
237
-
238
- if trace_id is not None:
239
- warn(
240
- "The `trace_id` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Until that happens, the `trace_id` will be mapped to `deprecated.trace_id` in the data.",
241
- stacklevel=2,
242
- )
243
- dataDict["deprecated.trace_id"] = str(trace_id)
244
- if span_id is not None:
245
- warn(
246
- "The `span_id` argument is deprecated and will be removed in a future version. Future versions of the SDK will not support it. Until that happens, the `span_id` will be mapped to `deprecated.span_id` in the data.",
247
- stacklevel=2,
248
- )
249
- dataDict["deprecated.span_id"] = str(span_id)
250
-
251
- if span:
252
- span.update(
253
- input=TypedValueJson(type="json", value=dataDict),
254
- params=settings, # type: ignore
255
- )
256
-
257
- client = get_instance()
258
-
259
- return {
260
- "url": get_endpoint() + f"/api/evaluations/{slug}/evaluate",
261
- "json": {
262
- "trace_id": (
263
- None
264
- if client and client.disable_sending
265
- else (
266
- format(span_ctx.trace_id, "x")
267
- if span_ctx and span_ctx.is_valid
268
- else None
269
- )
270
- ),
271
- "span_id": (
272
- None
273
- if client and client.disable_sending
274
- else (
275
- format(span_ctx.span_id, "x")
276
- if span_ctx and span_ctx.is_valid
277
- else None
278
- )
279
- ),
280
- "name": name,
281
- "data": dataDict,
282
- "settings": settings,
283
- "as_guardrail": as_guardrail,
284
- },
285
- "headers": {"X-Auth-Token": get_api_key()},
286
- }
287
-
288
-
289
- def _handle_response(
290
- response: Dict[str, Any],
291
- span: Optional["LangWatchSpan"] = None,
292
- as_guardrail: bool = False,
293
- ) -> EvaluationResult:
294
- result = EvaluationResultModel.model_validate(response)
295
- if span:
296
- span.update(
297
- output=(
298
- TypedValueGuardrailResult(
299
- type="guardrail_result",
300
- value=cast(
301
- EvaluationResult,
302
- result.model_dump(exclude_unset=True, exclude_none=True),
303
- ),
304
- )
305
- if as_guardrail
306
- else TypedValueEvaluationResult(
307
- type="evaluation_result",
308
- value=cast(
309
- EvaluationResult,
310
- result.model_dump(exclude_unset=True, exclude_none=True),
311
- ),
312
- )
313
- )
314
- )
315
- if result.cost:
316
- span.update(
317
- metrics=SpanMetrics(
318
- cost=result.cost.amount,
319
- )
320
- )
321
-
322
- return result
323
-
324
-
325
- def _handle_exception(
326
- e: Exception, span: Optional["LangWatchSpan"] = None, as_guardrail: bool = False
327
- ):
328
- response: Dict[str, Any] = {
329
- "status": "error",
330
- "details": repr(e),
331
- }
332
- if as_guardrail:
333
- response["passed"] = True
334
- return _handle_response(
335
- response,
336
- span,
337
- as_guardrail,
229
+ data_dict["conversation"] = conversation
230
+
231
+ return await evaluation.async_evaluate(
232
+ slug=slug,
233
+ data=data_dict,
234
+ name=name,
235
+ settings=settings,
236
+ as_guardrail=as_guardrail,
338
237
  )
339
238
 
340
239
 
341
240
  @deprecated(
342
- reason="Please use the new `langwatch.evaluation` module instead. TODO: Link to migration guide"
241
+ reason="Please use the new `langwatch.evaluation` module instead."
343
242
  )
344
- def _add_evaluation( # type: ignore
243
+ def _add_evaluation(
345
244
  *,
346
245
  span: Optional["LangWatchSpan"] = None,
347
246
  evaluation_id: Optional[str] = None,
@@ -357,99 +256,30 @@ def _add_evaluation( # type: ignore
357
256
  error: Optional[Exception] = None,
358
257
  timestamps: Optional[EvaluationTimestamps] = None,
359
258
  ):
360
- if not span or not span.trace:
361
- raise ValueError("No span or trace found, could not add evaluation to span")
362
-
363
- evaluation_result = EvaluationResult(status=status)
364
- if passed is not None:
365
- evaluation_result["passed"] = passed
366
- if score is not None:
367
- evaluation_result["score"] = score
368
- if label is not None:
369
- evaluation_result["label"] = label
370
- if details is not None:
371
- evaluation_result["details"] = details
372
- if cost is not None:
373
- if isinstance(cost, Money):
374
- evaluation_result["cost"] = {
375
- "currency": cost.currency,
376
- "amount": cost.amount,
377
- }
378
- elif isinstance(cost, float) or isinstance(cost, int):
379
- evaluation_result["cost"] = {"currency": "USD", "amount": cost}
380
- else:
381
- evaluation_result["cost"] = cost
382
-
383
- eval_span_created = False
384
- eval_span = span
385
-
386
- if not span or span.type != "evaluation":
387
- eval_span = langwatch.span(
388
- type="evaluation", span_context=span.get_span_context() if span else None
389
- )
390
- eval_span_created = True
391
-
392
- try:
393
- eval_span.update(
394
- name=name,
395
- output=TypedValueEvaluationResult(
396
- type="evaluation_result",
397
- value=evaluation_result,
398
- ),
399
- error=error,
400
- timestamps=(
401
- SpanTimestamps(
402
- started_at=(
403
- timestamps["started_at"]
404
- if "started_at" in timestamps and timestamps["started_at"]
405
- else cast(int, None)
406
- ),
407
- finished_at=(
408
- timestamps["finished_at"]
409
- if "finished_at" in timestamps and timestamps["finished_at"]
410
- else cast(int, None)
411
- ),
412
- )
413
- if timestamps
414
- else None
415
- ),
416
- )
417
- if "cost" in evaluation_result and evaluation_result["cost"]:
418
- eval_span.update(
419
- metrics=SpanMetrics(cost=evaluation_result["cost"]["amount"])
420
- )
421
-
422
- span_id = None
423
- span_ctx = eval_span.get_span_context()
424
- if span_ctx and span_ctx.is_valid:
425
- span_id = format(span_ctx.span_id, "x")
426
-
427
- evaluation = Evaluation(
428
- evaluation_id=evaluation_id or str(PKSUID("eval")),
429
- span_id=span_id,
430
- name=name,
431
- type=type,
432
- is_guardrail=is_guardrail,
433
- status=status,
434
- passed=passed,
435
- score=score,
436
- label=label,
437
- details=details,
438
- error=capture_exception(error) if error else None,
439
- timestamps=timestamps,
440
- )
259
+ """
260
+ Deprecated: Use langwatch.evaluation._add_evaluation() instead.
261
+ """
262
+ return _new_add_evaluation(
263
+ span=span,
264
+ evaluation_id=evaluation_id,
265
+ name=name,
266
+ type=type,
267
+ is_guardrail=is_guardrail,
268
+ status=status,
269
+ passed=passed,
270
+ score=score,
271
+ label=label,
272
+ details=details,
273
+ cost=cost,
274
+ error=error,
275
+ timestamps=timestamps,
276
+ )
441
277
 
442
- span.add_event(
443
- AttributeKey.LangWatchEventEvaluationCustom,
444
- {
445
- "json_encoded_event": json.dumps(
446
- evaluation,
447
- cls=SerializableWithStringFallback,
448
- ),
449
- },
450
- )
451
278
 
452
- finally:
453
- # If the span was created by the function, we need to end it
454
- if eval_span_created:
455
- eval_span.end()
279
+ __all__ = [
280
+ "evaluate",
281
+ "async_evaluate",
282
+ "BasicEvaluateData",
283
+ "EvaluationResultModel",
284
+ "_add_evaluation",
285
+ ]