langwatch 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,435 @@
1
+ """
2
+ Runner for platform-configured experiments (Experiments Workbench).
3
+
4
+ This module provides the `run()` function to execute evaluations that are
5
+ configured in the LangWatch platform from CI/CD pipelines or scripts.
6
+ """
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, List, Literal, Optional
10
+ from urllib.parse import urlparse, urlunparse
11
+ import sys
12
+ import time
13
+ import httpx
14
+
15
+ import langwatch
16
+ from langwatch.state import get_api_key, get_endpoint
17
+
18
+
19
+ def _replace_url_domain(url: str, new_base: str) -> str:
20
+ """Replace the domain/scheme of a URL with a new base URL, preserving the path."""
21
+ if not url:
22
+ return url
23
+
24
+ parsed_url = urlparse(url)
25
+ parsed_new_base = urlparse(new_base)
26
+
27
+ # Replace scheme and netloc with new base, keep path/query/fragment
28
+ return urlunparse((
29
+ parsed_new_base.scheme,
30
+ parsed_new_base.netloc,
31
+ parsed_url.path,
32
+ parsed_url.params,
33
+ parsed_url.query,
34
+ parsed_url.fragment,
35
+ ))
36
+
37
+
38
class ExperimentNotFoundError(Exception):
    """Raised when no experiment exists for the given slug."""

    def __init__(self, slug: str):
        # Keep the slug available for programmatic handling by callers.
        self.slug = slug
        message = f"Evaluation not found: {slug}"
        super().__init__(message)
44
+
45
+
46
class ExperimentTimeoutError(Exception):
    """Raised when waiting for an experiment run exceeds the timeout."""

    def __init__(self, run_id: str, progress: int, total: int):
        # Expose the run id and last-seen progress so callers can report partial results.
        self.run_id = run_id
        self.progress = progress
        self.total = total
        message = f"Evaluation run timed out: {run_id} ({progress}/{total} completed)"
        super().__init__(message)
56
+
57
+
58
class ExperimentRunFailedError(Exception):
    """Raised when an experiment run ends in a failed state."""

    def __init__(self, run_id: str, error: str):
        super().__init__(f"Evaluation run failed: {error}")
        self.run_id = run_id
        # Stored under a distinct name to avoid clashing with Exception internals.
        self.error_message = error
65
+
66
+
67
class ExperimentsApiError(Exception):
    """Raised for API errors not covered by a more specific exception."""

    def __init__(self, message: str, status_code: int):
        super().__init__(message)
        # HTTP status code of the failed request, for callers that branch on it.
        self.status_code = status_code
73
+
74
+
75
@dataclass
class TargetStats:
    """Per-target aggregate statistics from an experiment run summary."""

    target_id: str  # Platform identifier of the target ("targetId" in the API payload)
    name: str  # Human-readable target name shown in summaries
    passed: int  # Number of cells that passed for this target
    failed: int  # Number of cells that failed for this target
    avg_latency: float  # Average latency; printed as milliseconds in the summary
    total_cost: float  # Total cost for this target; printed with a "$" prefix
85
+
86
+
87
@dataclass
class EvaluatorStats:
    """Per-evaluator aggregate statistics from an experiment run summary."""

    evaluator_id: str  # Platform identifier ("evaluatorId" in the API payload)
    name: str  # Human-readable evaluator name shown in summaries
    passed: int  # Number of evaluations that passed
    failed: int  # Number of evaluations that failed
    pass_rate: float  # Pass percentage ("passRate" from the API), printed as "%"
    avg_score: Optional[float] = None  # Mean score; None when the evaluator reports no score
97
+
98
+
99
@dataclass
class ExperimentRunSummary:
    """Summary of a completed experiment run."""

    run_id: str  # Identifier of the run this summary describes
    total_cells: int  # Total number of cells in the run
    completed_cells: int  # Cells that finished executing
    failed_cells: int  # Cells that ended in failure
    duration: int  # Run duration; printed as seconds via duration / 1000, so presumably ms
    run_url: str = ""  # Endpoint-based URL for viewing the run in the platform
    targets: List[TargetStats] = field(default_factory=list)  # Per-target breakdown
    evaluators: List[EvaluatorStats] = field(default_factory=list)  # Per-evaluator breakdown
    total_passed: int = 0  # Total passed cells across the run
    total_failed: int = 0  # Total failed cells across the run
    pass_rate: float = 0.0  # Overall pass percentage (0-100)
    total_cost: float = 0.0  # Total cost of the run; printed with a "$" prefix
115
+
116
+
117
@dataclass
class ExperimentRunResult:
    """Result of running a platform evaluation."""

    run_id: str  # Identifier of the run
    status: Literal["completed", "failed", "stopped"]  # Terminal state the run ended in
    passed: int  # Total passed cells
    failed: int  # Total failed cells
    pass_rate: float  # Overall pass percentage (0-100)
    duration: int  # Run duration; printed as seconds via duration / 1000
    run_url: str  # Endpoint-based URL for viewing the run
    summary: ExperimentRunSummary  # Full per-target / per-evaluator breakdown

    def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
        """
        Print a CI-friendly summary and optionally exit with code 1 on failure.

        Args:
            exit_on_failure: If True, calls sys.exit(1) when there are failures.
                If False, never exits.
                If None (default), auto-detects: exits in scripts/CI, doesn't exit in notebooks.
        """
        _print_summary(self)

        # Auto-detect: don't exit in notebooks, exit in scripts/CI
        should_exit = exit_on_failure if exit_on_failure is not None else not _is_notebook()

        # Non-zero exit makes CI pipelines fail when any cell failed.
        if should_exit and self.failed > 0:
            sys.exit(1)
146
+
147
+
148
+ def _is_notebook() -> bool:
149
+ """Detect if running in a Jupyter notebook."""
150
+ try:
151
+ from IPython import get_ipython # type: ignore
152
+
153
+ shell = get_ipython().__class__.__name__
154
+ if shell == "ZMQInteractiveShell":
155
+ return True # Jupyter notebook or qtconsole
156
+ elif shell == "TerminalInteractiveShell":
157
+ return False # Terminal running IPython
158
+ else:
159
+ return False
160
+ except (ImportError, AttributeError, NameError):
161
+ return False
162
+
163
+
164
def run(
    slug: str,
    *,
    poll_interval: float = 2.0,
    timeout: float = 600.0,
    on_progress: Optional[Callable[[int, int], None]] = None,
    api_key: Optional[str] = None,
) -> ExperimentRunResult:
    """
    Run a platform-configured experiment and wait for completion.

    This runs an Experiment that you have configured in the LangWatch platform.
    The experiment will execute all targets and evaluators defined in the configuration.

    Args:
        slug: The slug of the experiment to run (found in the experiment URL)
        poll_interval: Seconds between status checks (default: 2.0)
        timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
        on_progress: Optional callback for progress updates (completed, total)
        api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)

    Returns:
        ExperimentRunResult with pass rate and summary. Call result.print_summary()
        to display results and exit with code 1 on failure.

    Raises:
        ValueError: If no API key is configured
        ExperimentNotFoundError: If the experiment slug doesn't exist
        ExperimentTimeoutError: If the experiment doesn't complete within timeout
        ExperimentRunFailedError: If the experiment fails
        ExperimentsApiError: For other API errors

    Example:
        ```python
        import langwatch

        result = langwatch.experiment.run("my-experiment-slug")
        result.print_summary()
        ```
    """
    langwatch.ensure_setup()

    effective_api_key = api_key or get_api_key()
    endpoint = get_endpoint()

    if not effective_api_key:
        raise ValueError(
            "API key not set. Set LANGWATCH_API_KEY environment variable or pass api_key parameter."
        )

    # Start the run
    start_response = _start_run(slug, endpoint, effective_api_key)
    run_id = start_response["runId"]
    total = start_response.get("total", 0)

    # Use the run URL from API but replace domain with configured endpoint
    # (the API may return a URL on a different host than the one configured here)
    api_run_url = start_response.get("runUrl", "")
    run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""

    print(f"Started experiment run: {run_id}")
    if run_url:
        print(f"Follow live: {run_url}")

    # Track last progress for change detection
    last_progress = 0

    # Print initial progress
    if total > 0:
        print(f"Progress: 0/{total} (0%)", end="", flush=True)
    if on_progress:
        on_progress(0, total)

    # Poll until complete
    start_time = time.time()
    while True:
        # Check the deadline first; fetch a final status so the error can
        # report how far the run got before timing out.
        if time.time() - start_time > timeout:
            print()  # Newline after progress
            status = _get_run_status(run_id, endpoint, effective_api_key)
            raise ExperimentTimeoutError(
                run_id, status.get("progress", 0), status.get("total", 0)
            )

        time.sleep(poll_interval)

        status = _get_run_status(run_id, endpoint, effective_api_key)
        progress = status.get("progress", 0)
        # total may be refined by the server as the run is planned out
        total = status.get("total", total)

        # Update progress display if changed
        if progress != last_progress and total > 0:
            percentage = (progress / total) * 100
            # Use carriage return to overwrite the line
            print(f"\rProgress: {progress}/{total} ({percentage:.0f}%)", end="", flush=True)
            last_progress = progress

        # NOTE: the callback is invoked on every poll, even when progress is unchanged.
        if on_progress:
            on_progress(progress, total)

        run_status = status.get("status")

        if run_status == "completed":
            print()  # Newline after progress
            summary_data = status.get("summary", {})
            return _build_result(run_id, "completed", summary_data, run_url)

        if run_status == "failed":
            print()  # Newline after progress
            raise ExperimentRunFailedError(
                run_id, status.get("error", "Unknown error")
            )

        if run_status == "stopped":
            print()  # Newline after progress
            summary_data = status.get("summary", {})
            return _build_result(run_id, "stopped", summary_data, run_url)
278
+
279
+
280
def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
    """Kick off an experiment run via the platform API and return its JSON payload."""
    url = f"{endpoint}/api/evaluations/v3/{slug}/run"
    headers = {"X-Auth-Token": api_key}
    with httpx.Client(timeout=60) as client:
        response = client.post(url, headers=headers)

        # Map well-known statuses onto the specific exception types.
        if response.status_code == 404:
            raise ExperimentNotFoundError(slug)
        if response.status_code == 401:
            raise ExperimentsApiError("Unauthorized - check your API key", 401)
        if not response.is_success:
            error_body = response.json() if response.content else {}
            message = error_body.get(
                "error", f"Failed to start evaluation: {response.status_code}"
            )
            raise ExperimentsApiError(message, response.status_code)

        return response.json()
300
+
301
+
302
def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
    """Fetch the current status payload for a run from the platform API."""
    url = f"{endpoint}/api/evaluations/v3/runs/{run_id}"
    headers = {"X-Auth-Token": api_key}
    with httpx.Client(timeout=60) as client:
        response = client.get(url, headers=headers)

        # Map well-known statuses onto the specific exception types.
        if response.status_code == 404:
            raise ExperimentsApiError(f"Run not found: {run_id}", 404)
        if response.status_code == 401:
            raise ExperimentsApiError("Unauthorized - check your API key", 401)
        if not response.is_success:
            error_body = response.json() if response.content else {}
            message = error_body.get(
                "error", f"Failed to get run status: {response.status_code}"
            )
            raise ExperimentsApiError(message, response.status_code)

        return response.json()
322
+
323
+
324
def _build_result(
    run_id: str,
    status: Literal["completed", "failed", "stopped"],
    summary_data: dict,
    run_url: str,
) -> "ExperimentRunResult":
    """Build the result object from the API's run-summary payload.

    Args:
        run_id: Identifier of the run the summary belongs to.
        status: Terminal status the run ended in.
        summary_data: The "summary" object from the run-status API response
            (camelCase keys); every field falls back to a sensible default
            when missing.
        run_url: Endpoint-based URL for viewing the run (already rewritten
            to the configured endpoint's domain).

    Returns:
        A fully populated ExperimentRunResult.
    """
    total_cells = summary_data.get("totalCells", 0)
    completed_cells = summary_data.get("completedCells", 0)
    failed_cells = summary_data.get("failedCells", 0)
    duration = summary_data.get("duration", 0)

    # Derive pass/fail totals when the API doesn't provide them explicitly.
    total_passed = summary_data.get("totalPassed", completed_cells - failed_cells)
    total_failed = summary_data.get("totalFailed", failed_cells)
    pass_rate = summary_data.get(
        "passRate",
        (total_passed / completed_cells * 100) if completed_cells > 0 else 0.0,
    )

    # Map camelCase API entries onto typed per-target stats.
    targets: List[TargetStats] = [
        TargetStats(
            target_id=t.get("targetId", ""),
            name=t.get("name", ""),
            passed=t.get("passed", 0),
            failed=t.get("failed", 0),
            avg_latency=t.get("avgLatency", 0),
            total_cost=t.get("totalCost", 0),
        )
        for t in summary_data.get("targets", [])
    ]

    # Same for per-evaluator stats; avgScore may legitimately be absent.
    evaluators: List[EvaluatorStats] = [
        EvaluatorStats(
            evaluator_id=e.get("evaluatorId", ""),
            name=e.get("name", ""),
            passed=e.get("passed", 0),
            failed=e.get("failed", 0),
            pass_rate=e.get("passRate", 0),
            avg_score=e.get("avgScore"),
        )
        for e in summary_data.get("evaluators", [])
    ]

    summary = ExperimentRunSummary(
        run_id=run_id,
        total_cells=total_cells,
        completed_cells=completed_cells,
        failed_cells=failed_cells,
        duration=duration,
        run_url=run_url,  # Always use the endpoint-based URL we constructed
        targets=targets,
        evaluators=evaluators,
        total_passed=total_passed,
        total_failed=total_failed,
        pass_rate=pass_rate,
        total_cost=summary_data.get("totalCost", 0),
    )

    return ExperimentRunResult(
        run_id=run_id,
        status=status,
        passed=total_passed,
        failed=total_failed,
        pass_rate=pass_rate,
        duration=duration,
        run_url=summary.run_url,
        summary=summary,
    )
396
+
397
+
398
+ def _print_summary(result: ExperimentRunResult) -> None:
399
+ """Print a CI-friendly summary of the experiment results."""
400
+ summary = result.summary
401
+
402
+ print("\n" + "═" * 60)
403
+ print(" EXPERIMENT RESULTS")
404
+ print("═" * 60)
405
+ print(f" Run ID: {result.run_id}")
406
+ print(f" Status: {result.status.upper()}")
407
+ print(f" Duration: {result.duration / 1000:.1f}s")
408
+ print("─" * 60)
409
+ print(f" Passed: {result.passed}")
410
+ print(f" Failed: {result.failed}")
411
+ print(f" Pass Rate: {result.pass_rate:.1f}%")
412
+
413
+ if summary.targets:
414
+ print("─" * 60)
415
+ print(" TARGETS:")
416
+ for target in summary.targets:
417
+ print(f" {target.name}: {target.passed} passed, {target.failed} failed")
418
+ if target.avg_latency:
419
+ print(f" Avg latency: {target.avg_latency:.0f}ms")
420
+ if target.total_cost:
421
+ print(f" Total cost: ${target.total_cost:.4f}")
422
+
423
+ if summary.evaluators:
424
+ print("─" * 60)
425
+ print(" EVALUATORS:")
426
+ for evaluator in summary.evaluators:
427
+ print(f" {evaluator.name}: {evaluator.pass_rate:.1f}% pass rate")
428
+ if evaluator.avg_score is not None:
429
+ print(f" Avg score: {evaluator.avg_score:.2f}")
430
+
431
+ print("─" * 60)
432
+ print(f" View details: {result.run_url}")
433
+ print("═" * 60 + "\n")
434
+
435
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langwatch
3
- Version: 0.8.1
3
+ Version: 0.10.0
4
4
  Summary: LangWatch Python SDK, for monitoring your LLMs
5
5
  Author-email: Langwatch Engineers <engineering@langwatch.ai>
6
6
  License: MIT
@@ -1,9 +1,9 @@
1
- langwatch/__init__.py,sha256=GMq4SV2Tz2i0JD05shqnw2lBW5cgMx4Zzo141hp106k,4266
2
- langwatch/__version__.py,sha256=l2r_v6gqH58S38dAeIr-BCiWrh25Ql4biGJMjTpZZ1o,91
1
+ langwatch/__init__.py,sha256=VGkLDw3h8hOtzyaTMObWupGTQosn4E17Dk5zcfDmy7g,4462
2
+ langwatch/__version__.py,sha256=o5DKyCSbKMRVAiPvoV2UuvLgN1NaTqfqaz2GumN3K3A,92
3
3
  langwatch/attributes.py,sha256=nXdI_G85wQQCAdAcwjCiLYdEYj3wATmfgCmhlf6dVIk,3910
4
4
  langwatch/batch_evaluation.py,sha256=Y_S3teXpHV07U-vvJYyV1PB6d0CgyFM_rTzPp6GnEBo,16165
5
5
  langwatch/client.py,sha256=WTNcYSik7kZ2kH-qGDnhbMTosc8e_Xhab_lZlfh5TC8,25559
6
- langwatch/evaluations.py,sha256=W-nxhcbQ4Niyx949LjHjTehw74pj31dUVs5rjkQUBLo,16951
6
+ langwatch/evaluations.py,sha256=-rvlpw8J3-0lMn9tdnte1Z3qHpuE60DGB3gmI8VMexQ,8983
7
7
  langwatch/guardrails.py,sha256=4d320HyklXPUVszF34aWsDKGzuvPggcDM_f45_eJTnc,1352
8
8
  langwatch/langchain.py,sha256=HjbBBIDwwt688g0W4K0EsZGuGBbo1Mv5LQ-7Mkn56iQ,18726
9
9
  langwatch/litellm.py,sha256=mPcw5cLykt0SQf9bTNSoT7elMx4gj-wZ_K2PC14Bw50,11998
@@ -15,9 +15,11 @@ langwatch/tracer.py,sha256=t5FOdP1es9H_pPGqGUBLXCyEln0tTi4m4M9b6WxCrPU,975
15
15
  langwatch/types.py,sha256=h6r3tNTzWqENx-9j_JPmOMZfFoKq9SNpEtxpAACk2G0,3114
16
16
  langwatch/dataset/__init__.py,sha256=hZBcbjXuBO2qE5osJtd9wIE9f45F6-jpNTrne5nk4eE,2606
17
17
  langwatch/domain/__init__.py,sha256=gSCOV3WkRhp_--9D1vxw7BYpnMRbpGh-2NbsXd4KZC0,6074
18
- langwatch/dspy/__init__.py,sha256=F35iLwiznMJPgXLVYOvybjDWxdYlSN4vn3EzxC27Awc,34054
19
- langwatch/evaluation/__init__.py,sha256=Jy7PW5VQbMoDGdOLRlQmDEvo_9TDkBLmrLrfocxddLM,281
20
- langwatch/evaluation/evaluation.py,sha256=hmtY7rfgJm4TbTEMUP_x89B2L_Jyi7aNGhjNUxw1N4A,16112
18
+ langwatch/dspy/__init__.py,sha256=wp8AmobV8XGVWOI8MQFmXPHu-8Wq3wvjB6YiHQm9Fdg,33007
19
+ langwatch/evaluation/__init__.py,sha256=8SOSZZbSzXa1jL-9Zlyt0f9u5sOA_TrO1J61ueASBLI,16980
20
+ langwatch/experiment/__init__.py,sha256=nv2OfoNMMZwUA9KfozW2ZNaR1-J1LCmU4NykjGfe9is,3001
21
+ langwatch/experiment/experiment.py,sha256=5xj58FKVC0y_LxgfwjJZP9lDp7tZ9FUUbERBtui_nC8,33026
22
+ langwatch/experiment/platform_run.py,sha256=qiy_bwp786TbkH4HIlZVlJPmCtQlStAq9vUdG4-3VdU,13850
21
23
  langwatch/exporters/filterable_batch_span_exporter.py,sha256=MlhZjui56XD6p2sa8kEGyr-Hb3wqudknngmemnB4Twg,2142
22
24
  langwatch/generated/langwatch_rest_api_client/__init__.py,sha256=8r-9pAj7fK7vnVX3mT0y_zS4B9ZRqD6RZiBo5fPra60,156
23
25
  langwatch/generated/langwatch_rest_api_client/client.py,sha256=o_mdLqyBCQstu5tS1WZFwqIEbGwkvWQ7eQjuCJw_5VY,12419
@@ -415,6 +417,6 @@ langwatch/utils/initialization.py,sha256=1KoZmkHOvGEVF0j-4t4xRQdA_2C_SPiF7qFXqEG
415
417
  langwatch/utils/module.py,sha256=KLBNOK3mA9gCSifCcQX_lOtU48BJQDWvFKtF6NMvwVA,688
416
418
  langwatch/utils/transformation.py,sha256=76MGXyrYTxM0Yri36NJqLK-XxL4BBYdmKWAXXlw3D4Q,7690
417
419
  langwatch/utils/utils.py,sha256=ZCOSie4o9LdJ7odshNfCNjmgwgQ27ojc5ENqt1rXuSs,596
418
- langwatch-0.8.1.dist-info/METADATA,sha256=osaR4n3f3-Uo3PhYP_Dox70Dgs5fiCBnOEpu4LAhTVQ,13192
419
- langwatch-0.8.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
420
- langwatch-0.8.1.dist-info/RECORD,,
420
+ langwatch-0.10.0.dist-info/METADATA,sha256=ZiqO5NpMuwsb60yAVPverSp8GXlMyIctxaDgO3DSRoA,13193
421
+ langwatch-0.10.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
422
+ langwatch-0.10.0.dist-info/RECORD,,