langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
1
+ """
2
+ Runner for platform-configured experiments (Experiments Workbench).
3
+
4
+ This module provides the `run()` function to execute evaluations that are
5
+ configured in the LangWatch platform from CI/CD pipelines or scripts.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, List, Literal, Optional
10
+ from urllib.parse import urlparse, urlunparse
11
+ import sys
12
+ import time
13
+ import httpx
14
+
15
+ import langwatch
16
+ from langwatch.state import get_api_key, get_endpoint
17
+
18
+
19
+ def _replace_url_domain(url: str, new_base: str) -> str:
20
+ """Replace the domain/scheme of a URL with a new base URL, preserving the path."""
21
+ if not url:
22
+ return url
23
+
24
+ parsed_url = urlparse(url)
25
+ parsed_new_base = urlparse(new_base)
26
+
27
+ # Replace scheme and netloc with new base, keep path/query/fragment
28
+ return urlunparse((
29
+ parsed_new_base.scheme,
30
+ parsed_new_base.netloc,
31
+ parsed_url.path,
32
+ parsed_url.params,
33
+ parsed_url.query,
34
+ parsed_url.fragment,
35
+ ))
36
+
37
+
38
class ExperimentNotFoundError(Exception):
    """Raised when no experiment exists for the given slug."""

    def __init__(self, slug: str):
        # Keep the slug available for programmatic handling by callers.
        self.slug = slug
        message = f"Evaluation not found: {slug}"
        super().__init__(message)
44
+
45
+
46
class ExperimentTimeoutError(Exception):
    """Raised when waiting for an experiment run exceeds the timeout."""

    def __init__(self, run_id: str, progress: int, total: int):
        # Expose the run id and last-seen progress so callers can report partial results.
        self.run_id = run_id
        self.progress = progress
        self.total = total
        message = f"Evaluation run timed out: {run_id} ({progress}/{total} completed)"
        super().__init__(message)
56
+
57
+
58
class ExperimentRunFailedError(Exception):
    """Raised when an experiment run ends in a failed state."""

    def __init__(self, run_id: str, error: str):
        super().__init__(f"Evaluation run failed: {error}")
        self.run_id = run_id
        # Stored under a distinct name to avoid clashing with Exception internals.
        self.error_message = error
65
+
66
+
67
class ExperimentsApiError(Exception):
    """Raised for API errors not covered by a more specific exception."""

    def __init__(self, message: str, status_code: int):
        super().__init__(message)
        # HTTP status code of the failed request, for callers that branch on it.
        self.status_code = status_code
73
+
74
+
75
@dataclass
class TargetStats:
    """Per-target aggregate statistics from an experiment run summary."""

    target_id: str  # Platform identifier of the target ("targetId" in the API payload)
    name: str  # Human-readable target name shown in summaries
    passed: int  # Number of cells that passed for this target
    failed: int  # Number of cells that failed for this target
    avg_latency: float  # Average latency; printed as milliseconds in the summary
    total_cost: float  # Total cost for this target; printed with a "$" prefix
85
+
86
+
87
@dataclass
class EvaluatorStats:
    """Per-evaluator aggregate statistics from an experiment run summary."""

    evaluator_id: str  # Platform identifier ("evaluatorId" in the API payload)
    name: str  # Human-readable evaluator name shown in summaries
    passed: int  # Number of evaluations that passed
    failed: int  # Number of evaluations that failed
    pass_rate: float  # Pass percentage ("passRate" from the API), printed as "%"
    avg_score: Optional[float] = None  # Mean score; None when the evaluator reports no score
97
+
98
+
99
@dataclass
class ExperimentRunSummary:
    """Summary of a completed experiment run."""

    run_id: str  # Identifier of the run this summary describes
    total_cells: int  # Total number of cells in the run
    completed_cells: int  # Cells that finished executing
    failed_cells: int  # Cells that ended in failure
    duration: int  # Run duration; printed as seconds via duration / 1000, so presumably ms
    run_url: str = ""  # Endpoint-based URL for viewing the run in the platform
    targets: List[TargetStats] = field(default_factory=list)  # Per-target breakdown
    evaluators: List[EvaluatorStats] = field(default_factory=list)  # Per-evaluator breakdown
    total_passed: int = 0  # Total passed cells across the run
    total_failed: int = 0  # Total failed cells across the run
    pass_rate: float = 0.0  # Overall pass percentage (0-100)
    total_cost: float = 0.0  # Total cost of the run; printed with a "$" prefix
115
+
116
+
117
@dataclass
class ExperimentRunResult:
    """Result of running a platform evaluation."""

    run_id: str  # Identifier of the run
    status: Literal["completed", "failed", "stopped"]  # Terminal state the run ended in
    passed: int  # Total passed cells
    failed: int  # Total failed cells
    pass_rate: float  # Overall pass percentage (0-100)
    duration: int  # Run duration; printed as seconds via duration / 1000
    run_url: str  # Endpoint-based URL for viewing the run
    summary: ExperimentRunSummary  # Full per-target / per-evaluator breakdown

    def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
        """
        Print a CI-friendly summary and optionally exit with code 1 on failure.

        Args:
            exit_on_failure: If True, calls sys.exit(1) when there are failures.
                If False, never exits.
                If None (default), auto-detects: exits in scripts/CI, doesn't exit in notebooks.
        """
        _print_summary(self)

        # Auto-detect: don't exit in notebooks, exit in scripts/CI
        should_exit = exit_on_failure if exit_on_failure is not None else not _is_notebook()

        # Non-zero exit makes CI pipelines fail when any cell failed.
        if should_exit and self.failed > 0:
            sys.exit(1)
146
+
147
+
148
+ def _is_notebook() -> bool:
149
+ """Detect if running in a Jupyter notebook."""
150
+ try:
151
+ from IPython import get_ipython # type: ignore
152
+
153
+ shell = get_ipython().__class__.__name__
154
+ if shell == "ZMQInteractiveShell":
155
+ return True # Jupyter notebook or qtconsole
156
+ elif shell == "TerminalInteractiveShell":
157
+ return False # Terminal running IPython
158
+ else:
159
+ return False
160
+ except (ImportError, AttributeError, NameError):
161
+ return False
162
+
163
+
164
def run(
    slug: str,
    *,
    poll_interval: float = 2.0,
    timeout: float = 600.0,
    on_progress: Optional[Callable[[int, int], None]] = None,
    api_key: Optional[str] = None,
) -> ExperimentRunResult:
    """
    Run a platform-configured experiment and wait for completion.

    This runs an Experiment that you have configured in the LangWatch platform.
    The experiment will execute all targets and evaluators defined in the configuration.

    Args:
        slug: The slug of the experiment to run (found in the experiment URL)
        poll_interval: Seconds between status checks (default: 2.0)
        timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
        on_progress: Optional callback for progress updates (completed, total)
        api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)

    Returns:
        ExperimentRunResult with pass rate and summary. Call result.print_summary()
        to display results and exit with code 1 on failure.

    Raises:
        ValueError: If no API key is configured
        ExperimentNotFoundError: If the experiment slug doesn't exist
        ExperimentTimeoutError: If the experiment doesn't complete within timeout
        ExperimentRunFailedError: If the experiment fails
        ExperimentsApiError: For other API errors

    Example:
        ```python
        import langwatch

        result = langwatch.experiment.run("my-experiment-slug")
        result.print_summary()
        ```
    """
    langwatch.ensure_setup()

    effective_api_key = api_key or get_api_key()
    endpoint = get_endpoint()

    if not effective_api_key:
        raise ValueError(
            "API key not set. Set LANGWATCH_API_KEY environment variable or pass api_key parameter."
        )

    # Start the run
    start_response = _start_run(slug, endpoint, effective_api_key)
    run_id = start_response["runId"]
    total = start_response.get("total", 0)

    # Use the run URL from API but replace domain with configured endpoint
    # (the API may return a URL on a different host than the one configured here)
    api_run_url = start_response.get("runUrl", "")
    run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""

    print(f"Started experiment run: {run_id}")
    if run_url:
        print(f"Follow live: {run_url}")

    # Track last progress for change detection
    last_progress = 0

    # Print initial progress
    if total > 0:
        print(f"Progress: 0/{total} (0%)", end="", flush=True)
    if on_progress:
        on_progress(0, total)

    # Poll until complete
    start_time = time.time()
    while True:
        # Check the deadline first; fetch a final status so the error can
        # report how far the run got before timing out.
        if time.time() - start_time > timeout:
            print()  # Newline after progress
            status = _get_run_status(run_id, endpoint, effective_api_key)
            raise ExperimentTimeoutError(
                run_id, status.get("progress", 0), status.get("total", 0)
            )

        time.sleep(poll_interval)

        status = _get_run_status(run_id, endpoint, effective_api_key)
        progress = status.get("progress", 0)
        # total may be refined by the server as the run is planned out
        total = status.get("total", total)

        # Update progress display if changed
        if progress != last_progress and total > 0:
            percentage = (progress / total) * 100
            # Use carriage return to overwrite the line
            print(f"\rProgress: {progress}/{total} ({percentage:.0f}%)", end="", flush=True)
            last_progress = progress

        # NOTE: the callback is invoked on every poll, even when progress is unchanged.
        if on_progress:
            on_progress(progress, total)

        run_status = status.get("status")

        if run_status == "completed":
            print()  # Newline after progress
            summary_data = status.get("summary", {})
            return _build_result(run_id, "completed", summary_data, run_url)

        if run_status == "failed":
            print()  # Newline after progress
            raise ExperimentRunFailedError(
                run_id, status.get("error", "Unknown error")
            )

        if run_status == "stopped":
            print()  # Newline after progress
            summary_data = status.get("summary", {})
            return _build_result(run_id, "stopped", summary_data, run_url)
278
+
279
+
280
def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
    """Kick off an experiment run via the platform API and return its JSON payload."""
    url = f"{endpoint}/api/evaluations/v3/{slug}/run"
    headers = {"X-Auth-Token": api_key}
    with httpx.Client(timeout=60) as client:
        response = client.post(url, headers=headers)

        # Map well-known statuses onto the specific exception types.
        if response.status_code == 404:
            raise ExperimentNotFoundError(slug)
        if response.status_code == 401:
            raise ExperimentsApiError("Unauthorized - check your API key", 401)
        if not response.is_success:
            error_body = response.json() if response.content else {}
            message = error_body.get(
                "error", f"Failed to start evaluation: {response.status_code}"
            )
            raise ExperimentsApiError(message, response.status_code)

        return response.json()
300
+
301
+
302
def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
    """Fetch the current status payload for a run from the platform API."""
    url = f"{endpoint}/api/evaluations/v3/runs/{run_id}"
    headers = {"X-Auth-Token": api_key}
    with httpx.Client(timeout=60) as client:
        response = client.get(url, headers=headers)

        # Map well-known statuses onto the specific exception types.
        if response.status_code == 404:
            raise ExperimentsApiError(f"Run not found: {run_id}", 404)
        if response.status_code == 401:
            raise ExperimentsApiError("Unauthorized - check your API key", 401)
        if not response.is_success:
            error_body = response.json() if response.content else {}
            message = error_body.get(
                "error", f"Failed to get run status: {response.status_code}"
            )
            raise ExperimentsApiError(message, response.status_code)

        return response.json()
322
+
323
+
324
def _build_result(
    run_id: str,
    status: Literal["completed", "failed", "stopped"],
    summary_data: dict,
    run_url: str,
) -> "ExperimentRunResult":
    """Build the result object from the API's run-summary payload.

    Args:
        run_id: Identifier of the run the summary belongs to.
        status: Terminal status the run ended in.
        summary_data: The "summary" object from the run-status API response
            (camelCase keys); every field falls back to a sensible default
            when missing.
        run_url: Endpoint-based URL for viewing the run (already rewritten
            to the configured endpoint's domain).

    Returns:
        A fully populated ExperimentRunResult.
    """
    total_cells = summary_data.get("totalCells", 0)
    completed_cells = summary_data.get("completedCells", 0)
    failed_cells = summary_data.get("failedCells", 0)
    duration = summary_data.get("duration", 0)

    # Derive pass/fail totals when the API doesn't provide them explicitly.
    total_passed = summary_data.get("totalPassed", completed_cells - failed_cells)
    total_failed = summary_data.get("totalFailed", failed_cells)
    pass_rate = summary_data.get(
        "passRate",
        (total_passed / completed_cells * 100) if completed_cells > 0 else 0.0,
    )

    # Map camelCase API entries onto typed per-target stats.
    targets: List[TargetStats] = [
        TargetStats(
            target_id=t.get("targetId", ""),
            name=t.get("name", ""),
            passed=t.get("passed", 0),
            failed=t.get("failed", 0),
            avg_latency=t.get("avgLatency", 0),
            total_cost=t.get("totalCost", 0),
        )
        for t in summary_data.get("targets", [])
    ]

    # Same for per-evaluator stats; avgScore may legitimately be absent.
    evaluators: List[EvaluatorStats] = [
        EvaluatorStats(
            evaluator_id=e.get("evaluatorId", ""),
            name=e.get("name", ""),
            passed=e.get("passed", 0),
            failed=e.get("failed", 0),
            pass_rate=e.get("passRate", 0),
            avg_score=e.get("avgScore"),
        )
        for e in summary_data.get("evaluators", [])
    ]

    summary = ExperimentRunSummary(
        run_id=run_id,
        total_cells=total_cells,
        completed_cells=completed_cells,
        failed_cells=failed_cells,
        duration=duration,
        run_url=run_url,  # Always use the endpoint-based URL we constructed
        targets=targets,
        evaluators=evaluators,
        total_passed=total_passed,
        total_failed=total_failed,
        pass_rate=pass_rate,
        total_cost=summary_data.get("totalCost", 0),
    )

    return ExperimentRunResult(
        run_id=run_id,
        status=status,
        passed=total_passed,
        failed=total_failed,
        pass_rate=pass_rate,
        duration=duration,
        run_url=summary.run_url,
        summary=summary,
    )
396
+
397
+
398
+ def _print_summary(result: ExperimentRunResult) -> None:
399
+ """Print a CI-friendly summary of the experiment results."""
400
+ summary = result.summary
401
+
402
+ print("\n" + "═" * 60)
403
+ print(" EXPERIMENT RESULTS")
404
+ print("═" * 60)
405
+ print(f" Run ID: {result.run_id}")
406
+ print(f" Status: {result.status.upper()}")
407
+ print(f" Duration: {result.duration / 1000:.1f}s")
408
+ print("─" * 60)
409
+ print(f" Passed: {result.passed}")
410
+ print(f" Failed: {result.failed}")
411
+ print(f" Pass Rate: {result.pass_rate:.1f}%")
412
+
413
+ if summary.targets:
414
+ print("─" * 60)
415
+ print(" TARGETS:")
416
+ for target in summary.targets:
417
+ print(f" {target.name}: {target.passed} passed, {target.failed} failed")
418
+ if target.avg_latency:
419
+ print(f" Avg latency: {target.avg_latency:.0f}ms")
420
+ if target.total_cost:
421
+ print(f" Total cost: ${target.total_cost:.4f}")
422
+
423
+ if summary.evaluators:
424
+ print("─" * 60)
425
+ print(" EVALUATORS:")
426
+ for evaluator in summary.evaluators:
427
+ print(f" {evaluator.name}: {evaluator.pass_rate:.1f}% pass rate")
428
+ if evaluator.avg_score is not None:
429
+ print(f" Avg score: {evaluator.avg_score:.2f}")
430
+
431
+ print("─" * 60)
432
+ print(f" View details: {result.run_url}")
433
+ print("═" * 60 + "\n")
434
+
435
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langwatch
3
- Version: 0.8.1
3
+ Version: 0.10.0
4
4
  Summary: LangWatch Python SDK, for monitoring your LLMs
5
5
  Author-email: Langwatch Engineers <engineering@langwatch.ai>
6
6
  License: MIT
@@ -1,9 +1,9 @@
1
- langwatch/__init__.py,sha256=GMq4SV2Tz2i0JD05shqnw2lBW5cgMx4Zzo141hp106k,4266
2
- langwatch/__version__.py,sha256=l2r_v6gqH58S38dAeIr-BCiWrh25Ql4biGJMjTpZZ1o,91
1
+ langwatch/__init__.py,sha256=VGkLDw3h8hOtzyaTMObWupGTQosn4E17Dk5zcfDmy7g,4462
2
+ langwatch/__version__.py,sha256=o5DKyCSbKMRVAiPvoV2UuvLgN1NaTqfqaz2GumN3K3A,92
3
3
  langwatch/attributes.py,sha256=nXdI_G85wQQCAdAcwjCiLYdEYj3wATmfgCmhlf6dVIk,3910
4
4
  langwatch/batch_evaluation.py,sha256=Y_S3teXpHV07U-vvJYyV1PB6d0CgyFM_rTzPp6GnEBo,16165
5
5
  langwatch/client.py,sha256=WTNcYSik7kZ2kH-qGDnhbMTosc8e_Xhab_lZlfh5TC8,25559
6
- langwatch/evaluations.py,sha256=W-nxhcbQ4Niyx949LjHjTehw74pj31dUVs5rjkQUBLo,16951
6
+ langwatch/evaluations.py,sha256=-rvlpw8J3-0lMn9tdnte1Z3qHpuE60DGB3gmI8VMexQ,8983
7
7
  langwatch/guardrails.py,sha256=4d320HyklXPUVszF34aWsDKGzuvPggcDM_f45_eJTnc,1352
8
8
  langwatch/langchain.py,sha256=HjbBBIDwwt688g0W4K0EsZGuGBbo1Mv5LQ-7Mkn56iQ,18726
9
9
  langwatch/litellm.py,sha256=mPcw5cLykt0SQf9bTNSoT7elMx4gj-wZ_K2PC14Bw50,11998
@@ -15,9 +15,11 @@ langwatch/tracer.py,sha256=t5FOdP1es9H_pPGqGUBLXCyEln0tTi4m4M9b6WxCrPU,975
15
15
  langwatch/types.py,sha256=h6r3tNTzWqENx-9j_JPmOMZfFoKq9SNpEtxpAACk2G0,3114
16
16
  langwatch/dataset/__init__.py,sha256=hZBcbjXuBO2qE5osJtd9wIE9f45F6-jpNTrne5nk4eE,2606
17
17
  langwatch/domain/__init__.py,sha256=gSCOV3WkRhp_--9D1vxw7BYpnMRbpGh-2NbsXd4KZC0,6074
18
- langwatch/dspy/__init__.py,sha256=F35iLwiznMJPgXLVYOvybjDWxdYlSN4vn3EzxC27Awc,34054
19
- langwatch/evaluation/__init__.py,sha256=Jy7PW5VQbMoDGdOLRlQmDEvo_9TDkBLmrLrfocxddLM,281
20
- langwatch/evaluation/evaluation.py,sha256=hmtY7rfgJm4TbTEMUP_x89B2L_Jyi7aNGhjNUxw1N4A,16112
18
+ langwatch/dspy/__init__.py,sha256=wp8AmobV8XGVWOI8MQFmXPHu-8Wq3wvjB6YiHQm9Fdg,33007
19
+ langwatch/evaluation/__init__.py,sha256=8SOSZZbSzXa1jL-9Zlyt0f9u5sOA_TrO1J61ueASBLI,16980
20
+ langwatch/experiment/__init__.py,sha256=nv2OfoNMMZwUA9KfozW2ZNaR1-J1LCmU4NykjGfe9is,3001
21
+ langwatch/experiment/experiment.py,sha256=5xj58FKVC0y_LxgfwjJZP9lDp7tZ9FUUbERBtui_nC8,33026
22
+ langwatch/experiment/platform_run.py,sha256=qiy_bwp786TbkH4HIlZVlJPmCtQlStAq9vUdG4-3VdU,13850
21
23
  langwatch/exporters/filterable_batch_span_exporter.py,sha256=MlhZjui56XD6p2sa8kEGyr-Hb3wqudknngmemnB4Twg,2142
22
24
  langwatch/generated/langwatch_rest_api_client/__init__.py,sha256=8r-9pAj7fK7vnVX3mT0y_zS4B9ZRqD6RZiBo5fPra60,156
23
25
  langwatch/generated/langwatch_rest_api_client/client.py,sha256=o_mdLqyBCQstu5tS1WZFwqIEbGwkvWQ7eQjuCJw_5VY,12419
@@ -415,6 +417,6 @@ langwatch/utils/initialization.py,sha256=1KoZmkHOvGEVF0j-4t4xRQdA_2C_SPiF7qFXqEG
415
417
  langwatch/utils/module.py,sha256=KLBNOK3mA9gCSifCcQX_lOtU48BJQDWvFKtF6NMvwVA,688
416
418
  langwatch/utils/transformation.py,sha256=76MGXyrYTxM0Yri36NJqLK-XxL4BBYdmKWAXXlw3D4Q,7690
417
419
  langwatch/utils/utils.py,sha256=ZCOSie4o9LdJ7odshNfCNjmgwgQ27ojc5ENqt1rXuSs,596
418
- langwatch-0.8.1.dist-info/METADATA,sha256=osaR4n3f3-Uo3PhYP_Dox70Dgs5fiCBnOEpu4LAhTVQ,13192
419
- langwatch-0.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
420
- langwatch-0.8.1.dist-info/RECORD,,
420
+ langwatch-0.10.0.dist-info/METADATA,sha256=ZiqO5NpMuwsb60yAVPverSp8GXlMyIctxaDgO3DSRoA,13193
421
+ langwatch-0.10.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
422
+ langwatch-0.10.0.dist-info/RECORD,,