langwatch 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langwatch/__version__.py +1 -1
- langwatch/batch_evaluation.py +5 -4
- langwatch/dspy/__init__.py +7 -34
- langwatch/evaluation/__init__.py +28 -1
- langwatch/evaluation/evaluation.py +412 -22
- langwatch/evaluation/platform_run.py +462 -0
- langwatch/evaluations.py +3 -2
- langwatch/login.py +2 -1
- langwatch/telemetry/tracing.py +3 -2
- langwatch/utils/exceptions.py +22 -1
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/METADATA +1 -1
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/RECORD +13 -12
- {langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/WHEEL +0 -0
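
The headline addition is `langwatch/evaluation/platform_run.py` (shown in full below), which lets CI pipelines trigger evaluations configured in the LangWatch platform via `langwatch.evaluation.evaluate()`. A minimal sketch of how a CI script might use it, based on the signatures in that file; the slug and the overridden timings are placeholders:

```python
# Minimal CI sketch against the new 0.9.0 API (see platform_run.py below).
# "my-evaluation-slug" is a placeholder; LANGWATCH_API_KEY is read from the
# environment unless api_key= is passed explicitly.
import langwatch

result = langwatch.evaluation.evaluate(
    "my-evaluation-slug",
    poll_interval=5.0,   # check run status every 5 seconds instead of 2
    timeout=900.0,       # allow up to 15 minutes instead of the default 10
)

# Prints the results table; outside notebooks it exits with code 1 if any cell failed.
result.print_summary()
```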
langwatch/evaluation/platform_run.py
ADDED

@@ -0,0 +1,462 @@
+"""
+Runner for platform-configured evaluations (Evaluations V3).
+
+This module provides the `run()` function to execute evaluations that are
+configured in the LangWatch platform from CI/CD pipelines or scripts.
+"""
+
+from dataclasses import dataclass, field
+from typing import Callable, List, Literal, Optional
+from urllib.parse import urlparse, urlunparse
+import sys
+import time
+import httpx
+
+import langwatch
+from langwatch.state import get_api_key, get_endpoint
+
+
+def _replace_url_domain(url: str, new_base: str) -> str:
+    """Replace the domain/scheme of a URL with a new base URL, preserving the path."""
+    if not url:
+        return url
+
+    parsed_url = urlparse(url)
+    parsed_new_base = urlparse(new_base)
+
+    # Replace scheme and netloc with new base, keep path/query/fragment
+    return urlunparse((
+        parsed_new_base.scheme,
+        parsed_new_base.netloc,
+        parsed_url.path,
+        parsed_url.params,
+        parsed_url.query,
+        parsed_url.fragment,
+    ))
+
+
+class EvaluationNotFoundError(Exception):
+    """Raised when evaluation slug doesn't exist."""
+
+    def __init__(self, slug: str):
+        self.slug = slug
+        super().__init__(f"Evaluation not found: {slug}")
+
+
+class EvaluationTimeoutError(Exception):
+    """Raised when evaluation run times out."""
+
+    def __init__(self, run_id: str, progress: int, total: int):
+        self.run_id = run_id
+        self.progress = progress
+        self.total = total
+        super().__init__(
+            f"Evaluation run timed out: {run_id} ({progress}/{total} completed)"
+        )
+
+
+class EvaluationRunFailedError(Exception):
+    """Raised when evaluation run fails."""
+
+    def __init__(self, run_id: str, error: str):
+        self.run_id = run_id
+        self.error_message = error
+        super().__init__(f"Evaluation run failed: {error}")
+
+
+class EvaluationsApiError(Exception):
+    """Raised for other API errors."""
+
+    def __init__(self, message: str, status_code: int):
+        self.status_code = status_code
+        super().__init__(message)
+
+
+@dataclass
+class TargetStats:
+    """Statistics for a single target."""
+
+    target_id: str
+    name: str
+    passed: int
+    failed: int
+    avg_latency: float
+    total_cost: float
+
+
+@dataclass
+class EvaluatorStats:
+    """Statistics for a single evaluator."""
+
+    evaluator_id: str
+    name: str
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_score: Optional[float] = None
+
+
+@dataclass
+class EvaluationRunSummary:
+    """Summary of a completed evaluation run."""
+
+    run_id: str
+    total_cells: int
+    completed_cells: int
+    failed_cells: int
+    duration: int
+    run_url: str = ""
+    targets: List[TargetStats] = field(default_factory=list)
+    evaluators: List[EvaluatorStats] = field(default_factory=list)
+    total_passed: int = 0
+    total_failed: int = 0
+    pass_rate: float = 0.0
+    total_cost: float = 0.0
+
+
+@dataclass
+class EvaluationRunResult:
+    """Result of running a platform evaluation."""
+
+    run_id: str
+    status: Literal["completed", "failed", "stopped"]
+    passed: int
+    failed: int
+    pass_rate: float
+    duration: int
+    run_url: str
+    summary: EvaluationRunSummary
+
+    def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
+        """
+        Print a CI-friendly summary and optionally exit with code 1 on failure.
+
+        Args:
+            exit_on_failure: If True, calls sys.exit(1) when there are failures.
+                If False, never exits.
+                If None (default), auto-detects: exits in scripts/CI, doesn't exit in notebooks.
+        """
+        _print_summary(self)
+
+        # Auto-detect: don't exit in notebooks, exit in scripts/CI
+        should_exit = exit_on_failure if exit_on_failure is not None else not _is_notebook()
+
+        if should_exit and self.failed > 0:
+            sys.exit(1)
+
+
+def _is_notebook() -> bool:
+    """Detect if running in a Jupyter notebook."""
+    try:
+        from IPython import get_ipython  # type: ignore
+
+        shell = get_ipython().__class__.__name__
+        if shell == "ZMQInteractiveShell":
+            return True  # Jupyter notebook or qtconsole
+        elif shell == "TerminalInteractiveShell":
+            return False  # Terminal running IPython
+        else:
+            return False
+    except (ImportError, AttributeError, NameError):
+        return False
+
+
+def evaluate(
+    slug: str,
+    *,
+    poll_interval: float = 2.0,
+    timeout: float = 600.0,
+    on_progress: Optional[Callable[[int, int], None]] = None,
+    api_key: Optional[str] = None,
+) -> EvaluationRunResult:
+    """
+    Run a platform-configured evaluation and wait for completion.
+
+    This runs an Evaluation that you have configured in the LangWatch platform.
+    The evaluation will execute all targets and evaluators defined in the configuration.
+
+    Args:
+        slug: The slug of the evaluation to run (found in the evaluation URL)
+        poll_interval: Seconds between status checks (default: 2.0)
+        timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
+        on_progress: Optional callback for progress updates (completed, total)
+        api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)
+
+    Returns:
+        EvaluationRunResult with pass rate and summary. Call result.print_summary()
+        to display results and exit with code 1 on failure.
+
+    Raises:
+        EvaluationNotFoundError: If the evaluation slug doesn't exist
+        EvaluationTimeoutError: If the evaluation doesn't complete within timeout
+        EvaluationRunFailedError: If the evaluation fails
+        EvaluationsApiError: For other API errors
+
+    Example:
+        ```python
+        import langwatch
+
+        result = langwatch.evaluation.evaluate("my-evaluation-slug")
+        result.print_summary()
+        ```
+    """
+    langwatch.ensure_setup()
+
+    effective_api_key = api_key or get_api_key()
+    endpoint = get_endpoint()
+
+    if not effective_api_key:
+        raise ValueError(
+            "API key not set. Set LANGWATCH_API_KEY environment variable or pass api_key parameter."
+        )
+
+    # Start the run
+    start_response = _start_run(slug, endpoint, effective_api_key)
+    run_id = start_response["runId"]
+    total = start_response.get("total", 0)
+
+    # Use the run URL from API but replace domain with configured endpoint
+    api_run_url = start_response.get("runUrl", "")
+    run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""
+
+    print(f"Started evaluation run: {run_id}")
+    if run_url:
+        print(f"Follow live: {run_url}")
+
+    # Track last progress for change detection
+    last_progress = 0
+
+    # Print initial progress
+    if total > 0:
+        print(f"Progress: 0/{total} (0%)", end="", flush=True)
+        if on_progress:
+            on_progress(0, total)
+
+    # Poll until complete
+    start_time = time.time()
+    while True:
+        if time.time() - start_time > timeout:
+            print()  # Newline after progress
+            status = _get_run_status(run_id, endpoint, effective_api_key)
+            raise EvaluationTimeoutError(
+                run_id, status.get("progress", 0), status.get("total", 0)
+            )
+
+        time.sleep(poll_interval)
+
+        status = _get_run_status(run_id, endpoint, effective_api_key)
+        progress = status.get("progress", 0)
+        total = status.get("total", total)
+
+        # Update progress display if changed
+        if progress != last_progress and total > 0:
+            percentage = (progress / total) * 100
+            # Use carriage return to overwrite the line
+            print(f"\rProgress: {progress}/{total} ({percentage:.0f}%)", end="", flush=True)
+            last_progress = progress
+
+            if on_progress:
+                on_progress(progress, total)
+
+        run_status = status.get("status")
+
+        if run_status == "completed":
+            print()  # Newline after progress
+            summary_data = status.get("summary", {})
+            return _build_result(run_id, "completed", summary_data, run_url)
+
+        if run_status == "failed":
+            print()  # Newline after progress
+            raise EvaluationRunFailedError(
+                run_id, status.get("error", "Unknown error")
+            )
+
+        if run_status == "stopped":
+            print()  # Newline after progress
+            summary_data = status.get("summary", {})
+            return _build_result(run_id, "stopped", summary_data, run_url)
+
+
+def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
+    """Start an evaluation run."""
+    with httpx.Client(timeout=60) as client:
+        response = client.post(
+            f"{endpoint}/api/evaluations/v3/{slug}/run",
+            headers={"X-Auth-Token": api_key},
+        )
+
+        if response.status_code == 404:
+            raise EvaluationNotFoundError(slug)
+        if response.status_code == 401:
+            raise EvaluationsApiError("Unauthorized - check your API key", 401)
+        if not response.is_success:
+            error_body = response.json() if response.content else {}
+            raise EvaluationsApiError(
+                error_body.get("error", f"Failed to start evaluation: {response.status_code}"),
+                response.status_code,
+            )
+
+        return response.json()
+
+
+def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
+    """Get the status of a run."""
+    with httpx.Client(timeout=60) as client:
+        response = client.get(
+            f"{endpoint}/api/evaluations/v3/runs/{run_id}",
+            headers={"X-Auth-Token": api_key},
+        )
+
+        if response.status_code == 404:
+            raise EvaluationsApiError(f"Run not found: {run_id}", 404)
+        if response.status_code == 401:
+            raise EvaluationsApiError("Unauthorized - check your API key", 401)
+        if not response.is_success:
+            error_body = response.json() if response.content else {}
+            raise EvaluationsApiError(
+                error_body.get("error", f"Failed to get run status: {response.status_code}"),
+                response.status_code,
+            )
+
+        return response.json()
+
+
+def _build_result(
+    run_id: str,
+    status: Literal["completed", "failed", "stopped"],
+    summary_data: dict,
+    run_url: str,
+) -> EvaluationRunResult:
+    """Build the result object from API response."""
+    total_cells = summary_data.get("totalCells", 0)
+    completed_cells = summary_data.get("completedCells", 0)
+    failed_cells = summary_data.get("failedCells", 0)
+    duration = summary_data.get("duration", 0)
+
+    total_passed = summary_data.get("totalPassed", completed_cells - failed_cells)
+    total_failed = summary_data.get("totalFailed", failed_cells)
+    pass_rate = summary_data.get(
+        "passRate",
+        (total_passed / completed_cells * 100) if completed_cells > 0 else 0.0,
+    )
+
+    # Parse targets
+    targets: List[TargetStats] = []
+    for t in summary_data.get("targets", []):
+        targets.append(
+            TargetStats(
+                target_id=t.get("targetId", ""),
+                name=t.get("name", ""),
+                passed=t.get("passed", 0),
+                failed=t.get("failed", 0),
+                avg_latency=t.get("avgLatency", 0),
+                total_cost=t.get("totalCost", 0),
+            )
+        )
+
+    # Parse evaluators
+    evaluators: List[EvaluatorStats] = []
+    for e in summary_data.get("evaluators", []):
+        evaluators.append(
+            EvaluatorStats(
+                evaluator_id=e.get("evaluatorId", ""),
+                name=e.get("name", ""),
+                passed=e.get("passed", 0),
+                failed=e.get("failed", 0),
+                pass_rate=e.get("passRate", 0),
+                avg_score=e.get("avgScore"),
+            )
+        )
+
+    summary = EvaluationRunSummary(
+        run_id=run_id,
+        total_cells=total_cells,
+        completed_cells=completed_cells,
+        failed_cells=failed_cells,
+        duration=duration,
+        run_url=run_url,  # Always use the endpoint-based URL we constructed
+        targets=targets,
+        evaluators=evaluators,
+        total_passed=total_passed,
+        total_failed=total_failed,
+        pass_rate=pass_rate,
+        total_cost=summary_data.get("totalCost", 0),
+    )
+
+    return EvaluationRunResult(
+        run_id=run_id,
+        status=status,
+        passed=total_passed,
+        failed=total_failed,
+        pass_rate=pass_rate,
+        duration=duration,
+        run_url=summary.run_url,
+        summary=summary,
+    )
+
+
+def _print_summary(result: EvaluationRunResult) -> None:
+    """Print a CI-friendly summary of the evaluation results."""
+    summary = result.summary
+
+    print("\n" + "═" * 60)
+    print(" EVALUATION RESULTS")
+    print("═" * 60)
+    print(f" Run ID: {result.run_id}")
+    print(f" Status: {result.status.upper()}")
+    print(f" Duration: {result.duration / 1000:.1f}s")
+    print("─" * 60)
+    print(f" Passed: {result.passed}")
+    print(f" Failed: {result.failed}")
+    print(f" Pass Rate: {result.pass_rate:.1f}%")
+
+    if summary.targets:
+        print("─" * 60)
+        print(" TARGETS:")
+        for target in summary.targets:
+            print(f" {target.name}: {target.passed} passed, {target.failed} failed")
+            if target.avg_latency:
+                print(f" Avg latency: {target.avg_latency:.0f}ms")
+            if target.total_cost:
+                print(f" Total cost: ${target.total_cost:.4f}")
+
+    if summary.evaluators:
+        print("─" * 60)
+        print(" EVALUATORS:")
+        for evaluator in summary.evaluators:
+            print(f" {evaluator.name}: {evaluator.pass_rate:.1f}% pass rate")
+            if evaluator.avg_score is not None:
+                print(f" Avg score: {evaluator.avg_score:.2f}")
+
+    print("─" * 60)
+    print(f" View details: {result.run_url}")
+    print("═" * 60 + "\n")
+
+
+def run(
+    slug: str,
+    *,
+    poll_interval: float = 2.0,
+    timeout: float = 600.0,
+    on_progress: Optional[Callable[[int, int], None]] = None,
+    api_key: Optional[str] = None,
+) -> EvaluationRunResult:
+    """
+    Deprecated: Use `evaluate()` instead.
+
+    Run a platform-configured evaluation and wait for completion.
+    """
+    import warnings
+
+    warnings.warn(
+        "langwatch.evaluation.run() is deprecated, use langwatch.evaluation.evaluate() instead",
+        DeprecationWarning,
+        stacklevel=2,
+    )
+    return evaluate(
+        slug,
+        poll_interval=poll_interval,
+        timeout=timeout,
+        on_progress=on_progress,
+        api_key=api_key,
+    )
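
Because `evaluate()` raises typed exceptions for the common failure modes, a pipeline can distinguish a bad slug from a timeout or a failed run. A hedged sketch of that error handling, importing the exception classes directly from the module above (whether they are also re-exported from `langwatch.evaluation` is not shown in this diff; the slug is a placeholder):

```python
# Illustrative error handling around evaluate().
import sys

import langwatch
from langwatch.evaluation.platform_run import (
    EvaluationNotFoundError,
    EvaluationRunFailedError,
    EvaluationTimeoutError,
)


def log_progress(completed: int, total: int) -> None:
    # Invoked at the start and whenever the completed-cell count changes.
    print(f"\n[ci] {completed}/{total} cells done")


try:
    result = langwatch.evaluation.evaluate("my-evaluation-slug", on_progress=log_progress)
except EvaluationNotFoundError as err:
    sys.exit(f"No evaluation with slug {err.slug!r}")
except EvaluationTimeoutError as err:
    sys.exit(f"Timed out at {err.progress}/{err.total} cells (run {err.run_id})")
except EvaluationRunFailedError as err:
    sys.exit(f"Run {err.run_id} failed: {err.error_message}")

result.print_summary(exit_on_failure=True)
```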
langwatch/evaluations.py
CHANGED

@@ -12,6 +12,7 @@ from langwatch.telemetry.span import LangWatchSpan
 from langwatch.telemetry.context import get_current_span
 from langwatch.state import get_api_key, get_endpoint, get_instance
 from langwatch.attributes import AttributeKey
+from langwatch.utils.exceptions import EvaluatorException, better_raise_for_status
 from pydantic import BaseModel

 from langwatch.types import (

@@ -101,7 +102,7 @@ def evaluate(
     try:
         with httpx.Client(timeout=900) as client:
             response = client.post(**request_params)
-            response
+            better_raise_for_status(response, cls=EvaluatorException)
     except Exception as e:
         return _handle_exception(e, span, as_guardrail)


@@ -156,7 +157,7 @@ async def async_evaluate(
     try:
         async with httpx.AsyncClient(timeout=900) as client:
             response = await client.post(**request_params)
-            response
+            better_raise_for_status(response)
     except Exception as e:
         return _handle_exception(e, span, as_guardrail)

langwatch/login.py
CHANGED

@@ -1,6 +1,7 @@
 import httpx

 import langwatch
+from langwatch.utils.exceptions import better_raise_for_status
 from .state import get_api_key, get_endpoint
 from getpass import getpass


@@ -25,7 +26,7 @@ def login(relogin=False):
     )
     if response.status_code == 401:
         raise ValueError("API key is not valid, please try to login again")
-    response
+    better_raise_for_status(response)

     langwatch.setup(api_key=api_key)
     print("LangWatch API key set")
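
The change to `login()` follows the same pattern: the dangling `response` expression after the 401 check was a no-op, so other HTTP failures during key validation were silently ignored; they now raise with the server's error message. For reference, the function can be called directly (assuming it prompts for the key via `getpass` when none is stored, as the imports suggest):

```python
# Interactive (re-)login sketch; login() lives in langwatch/login.py.
from langwatch.login import login

login(relogin=True)  # pass relogin=True to replace an already-configured API key
```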
langwatch/telemetry/tracing.py
CHANGED

@@ -6,6 +6,7 @@ import httpx
 import threading
 from deprecated import deprecated
 from langwatch.attributes import AttributeKey
+from langwatch.utils.exceptions import better_raise_for_status
 from langwatch.utils.transformation import (
     SerializableWithStringFallback,
     convert_typed_values,

@@ -300,7 +301,7 @@ class LangWatchTrace:
             headers={"X-Auth-Token": get_api_key()},
             timeout=15,
         )
-        response
+        better_raise_for_status(response)
         path = response.json()["path"]
         return f"{endpoint}{path}"


@@ -317,7 +318,7 @@ class LangWatchTrace:
             headers={"X-Auth-Token": get_api_key()},
             timeout=15,
         )
-        response
+        better_raise_for_status(response)

     def update(
         self,
langwatch/utils/exceptions.py
CHANGED

@@ -1,7 +1,8 @@
 import traceback
-from typing import List, cast
+from typing import List, cast, Type

 from langwatch.domain import ErrorCapture
+import httpx


 def capture_exception(err: BaseException):

@@ -12,3 +13,23 @@ def capture_exception(err: BaseException):
     except:  # python 3.10+
         string_stacktrace = traceback.format_exception(err)  # type: ignore
     return ErrorCapture(message=repr(err), stacktrace=string_stacktrace)
+
+
+class EvaluatorException(Exception):
+    pass
+
+
+def better_raise_for_status(response: httpx.Response, cls: Type[BaseException] = httpx.HTTPStatusError) -> None:
+    try:
+        response.raise_for_status()
+    except httpx.HTTPStatusError as http_err:
+        try:
+            json = response.json()
+        except Exception:
+            raise http_err
+
+        if "error" in json:
+            error = json["error"]
+            raise cls(f"{response.status_code} {error}") from http_err
+        else:
+            raise http_err
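
This helper is the point of all the `response` → `better_raise_for_status(response)` replacements above: a bare `response` expression statement does nothing, and `httpx` never raises on 4xx/5xx by itself, so API errors were previously swallowed. The sketch below shows the behaviour with a faked LangWatch-style error body; the URL and payload are made up, and `httpx.MockTransport` is used only to avoid real network calls:

```python
import httpx

from langwatch.utils.exceptions import EvaluatorException, better_raise_for_status

# Fake a 422 response carrying an {"error": ...} body, as the LangWatch API does.
transport = httpx.MockTransport(
    lambda request: httpx.Response(422, json={"error": "missing 'input' field"})
)

with httpx.Client(transport=transport, base_url="http://testserver") as client:
    response = client.post("/api/evaluations/example-evaluator/evaluate")

    response  # old pattern: a bare expression statement, the error goes unnoticed

    try:
        # new pattern, as wired into evaluations.py above
        better_raise_for_status(response, cls=EvaluatorException)
    except EvaluatorException as err:
        print(err)  # -> 422 missing 'input' field
```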
{langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/RECORD
CHANGED

@@ -1,13 +1,13 @@
 langwatch/__init__.py,sha256=GMq4SV2Tz2i0JD05shqnw2lBW5cgMx4Zzo141hp106k,4266
-langwatch/__version__.py,sha256=
+langwatch/__version__.py,sha256=sympc_lD0EH1ffjgsP80P8i4Sqm2XBcIgblEeQTq6bs,91
 langwatch/attributes.py,sha256=nXdI_G85wQQCAdAcwjCiLYdEYj3wATmfgCmhlf6dVIk,3910
-langwatch/batch_evaluation.py,sha256=
+langwatch/batch_evaluation.py,sha256=Y_S3teXpHV07U-vvJYyV1PB6d0CgyFM_rTzPp6GnEBo,16165
 langwatch/client.py,sha256=WTNcYSik7kZ2kH-qGDnhbMTosc8e_Xhab_lZlfh5TC8,25559
-langwatch/evaluations.py,sha256=
+langwatch/evaluations.py,sha256=W-nxhcbQ4Niyx949LjHjTehw74pj31dUVs5rjkQUBLo,16951
 langwatch/guardrails.py,sha256=4d320HyklXPUVszF34aWsDKGzuvPggcDM_f45_eJTnc,1352
 langwatch/langchain.py,sha256=HjbBBIDwwt688g0W4K0EsZGuGBbo1Mv5LQ-7Mkn56iQ,18726
 langwatch/litellm.py,sha256=mPcw5cLykt0SQf9bTNSoT7elMx4gj-wZ_K2PC14Bw50,11998
-langwatch/login.py,sha256=
+langwatch/login.py,sha256=o0DxYVMhMCRHeopyF0qlj_U4o6yD1rL8QjfKvKnHv0s,965
 langwatch/openai.py,sha256=h_NCIwJ0qs57PS-R7gQZsnf2_EBAahlYQMuqS9-Cj3Q,25139
 langwatch/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langwatch/state.py,sha256=qXvPAjO90jdokCU6tPSwjHIac4QU_5N0pSd9dfmc9kY,1204

@@ -15,9 +15,10 @@ langwatch/tracer.py,sha256=t5FOdP1es9H_pPGqGUBLXCyEln0tTi4m4M9b6WxCrPU,975
 langwatch/types.py,sha256=h6r3tNTzWqENx-9j_JPmOMZfFoKq9SNpEtxpAACk2G0,3114
 langwatch/dataset/__init__.py,sha256=hZBcbjXuBO2qE5osJtd9wIE9f45F6-jpNTrne5nk4eE,2606
 langwatch/domain/__init__.py,sha256=gSCOV3WkRhp_--9D1vxw7BYpnMRbpGh-2NbsXd4KZC0,6074
-langwatch/dspy/__init__.py,sha256=
-langwatch/evaluation/__init__.py,sha256=
-langwatch/evaluation/evaluation.py,sha256=
+langwatch/dspy/__init__.py,sha256=wp8AmobV8XGVWOI8MQFmXPHu-8Wq3wvjB6YiHQm9Fdg,33007
+langwatch/evaluation/__init__.py,sha256=dctG-Ec0N_Or2Ta0XW6liYtdpMZN3ZtRXqUoeG5ksnk,870
+langwatch/evaluation/evaluation.py,sha256=MqMiGlsPIS5zqN1wKfhEs6mIGLRwB452iqDTSQFbtYo,31735
+langwatch/evaluation/platform_run.py,sha256=cwuRNtG99nhvqGL-YoOwdvEH3x-hDaVUzl7Vx9orjPo,14546
 langwatch/exporters/filterable_batch_span_exporter.py,sha256=MlhZjui56XD6p2sa8kEGyr-Hb3wqudknngmemnB4Twg,2142
 langwatch/generated/langwatch_rest_api_client/__init__.py,sha256=8r-9pAj7fK7vnVX3mT0y_zS4B9ZRqD6RZiBo5fPra60,156
 langwatch/generated/langwatch_rest_api_client/client.py,sha256=o_mdLqyBCQstu5tS1WZFwqIEbGwkvWQ7eQjuCJw_5VY,12419

@@ -405,16 +406,16 @@ langwatch/prompts/types/structures.py,sha256=cB94bn-qhFgHHYXcrmJV6Bk9idk5ZmyfXhF
 langwatch/telemetry/context.py,sha256=q0hUG9PM3aifIr6ZRuuNNbsGtcAImu9Pv2XTKUp3CGc,4029
 langwatch/telemetry/sampling.py,sha256=XDf6ZoXiwpHaHDYd_dDszSqH8_9-CHFNsGAZWOW1VYk,1327
 langwatch/telemetry/span.py,sha256=g-RGWfQk4Q3b2TpipiHqjEV7rwmidaUHp54q51UxQ6s,32801
-langwatch/telemetry/tracing.py,sha256=
+langwatch/telemetry/tracing.py,sha256=R5e0F5Gea72Otusxj5ceafs_-KuFFylNRjcGNkLAhTQ,27697
 langwatch/telemetry/types.py,sha256=Q9H7nT3GMK1aluRB7CCX8BR7VFKrQY_vdFdyF4Yc98U,501
 langwatch/telemetry/__tests__/test_tracing.py,sha256=Px2vcpbRWBgwwaXzw3MgRfkcL-If2LmPAwaFN1sLyvY,3350
 langwatch/utils/__init__.py,sha256=3rqQTgzEtmICJW_KSPuLa5q8p5udxt5SRi28Z2vZB10,138
 langwatch/utils/capture.py,sha256=uVKPqHCm-o8CpabsUfhqbNFr5sgUHzcKnBadvL2oIwI,1172
-langwatch/utils/exceptions.py,sha256=
+langwatch/utils/exceptions.py,sha256=b-dorrnQ9XBJcijLLNJP9LRQzdOZGEiyQ3f8GcA1kgk,1046
 langwatch/utils/initialization.py,sha256=1KoZmkHOvGEVF0j-4t4xRQdA_2C_SPiF7qFXqEG4Nkw,4553
 langwatch/utils/module.py,sha256=KLBNOK3mA9gCSifCcQX_lOtU48BJQDWvFKtF6NMvwVA,688
 langwatch/utils/transformation.py,sha256=76MGXyrYTxM0Yri36NJqLK-XxL4BBYdmKWAXXlw3D4Q,7690
 langwatch/utils/utils.py,sha256=ZCOSie4o9LdJ7odshNfCNjmgwgQ27ojc5ENqt1rXuSs,596
-langwatch-0.
-langwatch-0.
-langwatch-0.
+langwatch-0.9.0.dist-info/METADATA,sha256=JtLLtVbyy0iau3ySelLpMO4RpjrQAEyhd72J9NkxHl8,13192
+langwatch-0.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+langwatch-0.9.0.dist-info/RECORD,,

{langwatch-0.8.0.dist-info → langwatch-0.9.0.dist-info}/WHEEL
File without changes