langwatch 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,108 @@
+ """
+ langwatch.experiment - Run experiments on LangWatch platform or via SDK.
+
+ This module provides two ways to run experiments:
+
+ 1. Platform experiments (CI/CD):
+    Run experiments configured in the LangWatch platform UI.
+
+    ```python
+    import langwatch
+
+    result = langwatch.experiment.run("my-experiment-slug")
+    result.print_summary()
+    ```
+
+ 2. SDK-defined experiments:
+    Define and run experiments programmatically.
+
+    ```python
+    import langwatch
+
+    experiment = langwatch.experiment.init("my-experiment")
+
+    for index, row in experiment.loop(df.iterrows(), threads=4):
+        async def task(index, row):
+            result = await my_agent(row["input"])
+            experiment.evaluate(
+                "langevals/exact_match",
+                index=index,
+                data={"output": result, "expected_output": row["expected"]},
+                settings={},
+            )
+        experiment.submit(task, index, row)
+    ```
+ """
+ from typing import Optional
+
+ # Re-export the Experiment class for SDK-defined experiments
+ from langwatch.experiment.experiment import Experiment
+
+ # Re-export the platform run function and related types
+ from langwatch.experiment.platform_run import (
+     run,
+     ExperimentRunResult,
+     ExperimentRunSummary,
+     ExperimentNotFoundError,
+     ExperimentTimeoutError,
+     ExperimentRunFailedError,
+     ExperimentsApiError,
+     TargetStats,
+     EvaluatorStats,
+ )
+
+
+ def init(name: str, *, run_id: Optional[str] = None) -> Experiment:
+     """
+     Initialize an SDK-defined experiment.
+
+     This creates an Experiment instance that you can use to run evaluators
+     programmatically using datasets and custom logic.
+
+     Args:
+         name: Name for this experiment run
+         run_id: Optional custom run ID (auto-generated if not provided)
+
+     Returns:
+         Experiment instance with methods:
+         - loop(): Iterate over dataset rows with parallel execution
+         - evaluate(): Run an evaluator on the current row
+         - log(): Log custom metrics
+         - submit(): Submit async tasks
+
+     Example:
+         ```python
+         import langwatch
+
+         experiment = langwatch.experiment.init("my-experiment")
+
+         for index, row in experiment.loop(df.iterrows(), threads=4):
+             async def task(index, row):
+                 result = await my_agent(row["input"])
+                 experiment.evaluate(
+                     "langevals/exact_match",
+                     index=index,
+                     data={"output": result, "expected_output": row["expected"]},
+                     settings={},
+                 )
+             experiment.submit(task, index, row)
+         ```
+     """
+     experiment = Experiment(name, run_id=run_id)
+     experiment.init()
+     return experiment
+
+
+ __all__ = [
+     "init",
+     "run",
+     "Experiment",
+     "ExperimentRunResult",
+     "ExperimentRunSummary",
+     "ExperimentNotFoundError",
+     "ExperimentTimeoutError",
+     "ExperimentRunFailedError",
+     "ExperimentsApiError",
+     "TargetStats",
+     "EvaluatorStats",
+ ]
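For CI scripts, the exception types re-exported above make the failure modes explicit. Below is a minimal sketch built only on what this module exports (`run`, `print_summary()`, and the exception classes); the exit codes chosen here are illustrative, not SDK behavior.

```python
import sys

import langwatch
from langwatch.experiment import (
    ExperimentNotFoundError,
    ExperimentRunFailedError,
    ExperimentTimeoutError,
)

try:
    # Start the platform-configured experiment and block until it finishes
    result = langwatch.experiment.run("my-experiment-slug")
except ExperimentNotFoundError as e:
    print(f"No experiment with slug '{e.slug}' on the platform")
    sys.exit(2)  # illustrative exit code for configuration errors
except (ExperimentTimeoutError, ExperimentRunFailedError) as e:
    print(f"Experiment did not complete: {e}")
    sys.exit(2)

# Documented above to print results and exit with code 1 on failure
result.print_summary()
```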
@@ -135,7 +135,7 @@ class IterationInfo(TypedDict):
      error: Optional[Exception]
 
 
- class Evaluation:
+ class Experiment:
      _executor: ThreadPoolExecutor
      _futures: List[Future[Any]]
      _current_index: int
@@ -255,7 +255,7 @@ class Evaluation:
              progress_bar.close()
 
          except Exception as e:
-             Evaluation._log_results(
+             Experiment._log_results(
                  langwatch.get_api_key() or "",
                  {
                      "experiment_slug": self.experiment_slug,
@@ -456,7 +456,7 @@ class Evaluation:
 
          # Start a new thread to send the batch
          thread = threading.Thread(
-             target=Evaluation._log_results,
+             target=Experiment._log_results,
              args=(langwatch.get_api_key(), body),
          )
          thread.start()
@@ -485,7 +485,7 @@ class Evaluation:
          better_raise_for_status(response)
 
      def _wait_for_completion(self):
-         async def wait_for_completion(self: Evaluation):
+         async def wait_for_completion(self: Experiment):
              # Send any remaining batch
              self._send_batch(finished=True)
 
@@ -837,7 +837,7 @@ class Evaluation:
          with self.lock:
              self.batch["evaluations"].append(eval)
 
-     def run(
+     def evaluate(
          self,
          evaluator_id: str,
         index: Union[int, Hashable],
@@ -846,6 +846,17 @@ class Evaluation:
          name: Optional[str] = None,
          as_guardrail: bool = False,
      ):
+         """
+         Run an evaluator on the current row.
+
+         Args:
+             evaluator_id: The evaluator type/slug (e.g., "langevals/exact_match", "ragas/faithfulness")
+             index: The row index for this evaluation
+             data: Data to pass to the evaluator (e.g., {"input": ..., "output": ..., "expected_output": ...})
+             settings: Evaluator-specific settings
+             name: Optional display name for the evaluation (defaults to evaluator_id)
+             as_guardrail: Whether to run as a guardrail (stricter pass/fail)
+         """
          duration: Optional[int] = None
 
          start_time = time.time()
@@ -871,3 +882,31 @@ class Evaluation:
              duration=duration,
              cost=result.cost,
          )
+
+     def run(
+         self,
+         evaluator_id: str,
+         index: Union[int, Hashable],
+         data: Dict[str, Any],
+         settings: Dict[str, Any],
+         name: Optional[str] = None,
+         as_guardrail: bool = False,
+     ):
+         """
+         Deprecated: Use `evaluate()` instead.
+         """
+         import warnings
+
+         warnings.warn(
+             "evaluation.run() is deprecated, use evaluation.evaluate() instead",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.evaluate(
+             evaluator_id=evaluator_id,
+             index=index,
+             data=data,
+             settings=settings,
+             name=name,
+             as_guardrail=as_guardrail,
+         )
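The hunks above rename the SDK class from `Evaluation` to `Experiment` and its per-row method from `run()` to `evaluate()`, keeping `run()` as a deprecated alias that forwards to `evaluate()` with identical arguments. A migration sketch following the loop pattern from the module docstring, assuming pandas is available; the dataset and `my_agent` are placeholders.

```python
import langwatch
import pandas as pd

# Illustrative dataset and agent; swap in your own.
df = pd.DataFrame([{"input": "What is the answer?", "expected": "42"}])

async def my_agent(question: str) -> str:
    return "42"

experiment = langwatch.experiment.init("migration-example")

for index, row in experiment.loop(df.iterrows(), threads=2):
    async def task(index, row):
        output = await my_agent(row["input"])
        # 0.9.x name: experiment.run(...) -- still accepted, but now emits a
        # DeprecationWarning and forwards to evaluate() with the same arguments.
        experiment.evaluate(
            "langevals/exact_match",
            index=index,
            data={"output": output, "expected_output": row["expected"]},
            settings={},
        )
    experiment.submit(task, index, row)
```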
@@ -1,5 +1,5 @@
  """
- Runner for platform-configured evaluations (Evaluations V3).
+ Runner for platform-configured experiments (Experiments Workbench).
 
  This module provides the `run()` function to execute evaluations that are
  configured in the LangWatch platform from CI/CD pipelines or scripts.
@@ -35,16 +35,16 @@ def _replace_url_domain(url: str, new_base: str) -> str:
      ))
 
 
- class EvaluationNotFoundError(Exception):
-     """Raised when evaluation slug doesn't exist."""
+ class ExperimentNotFoundError(Exception):
+     """Raised when experiment slug doesn't exist."""
 
      def __init__(self, slug: str):
          self.slug = slug
          super().__init__(f"Evaluation not found: {slug}")
 
 
- class EvaluationTimeoutError(Exception):
-     """Raised when evaluation run times out."""
+ class ExperimentTimeoutError(Exception):
+     """Raised when experiment run times out."""
 
      def __init__(self, run_id: str, progress: int, total: int):
          self.run_id = run_id
@@ -55,8 +55,8 @@ class EvaluationTimeoutError(Exception):
          )
 
 
- class EvaluationRunFailedError(Exception):
-     """Raised when evaluation run fails."""
+ class ExperimentRunFailedError(Exception):
+     """Raised when experiment run fails."""
 
      def __init__(self, run_id: str, error: str):
          self.run_id = run_id
@@ -64,7 +64,7 @@ class EvaluationRunFailedError(Exception):
          super().__init__(f"Evaluation run failed: {error}")
 
 
- class EvaluationsApiError(Exception):
+ class ExperimentsApiError(Exception):
      """Raised for other API errors."""
 
      def __init__(self, message: str, status_code: int):
@@ -97,8 +97,8 @@ class EvaluatorStats:
 
 
  @dataclass
- class EvaluationRunSummary:
-     """Summary of a completed evaluation run."""
+ class ExperimentRunSummary:
+     """Summary of a completed experiment run."""
 
      run_id: str
      total_cells: int
@@ -115,7 +115,7 @@ class EvaluationRunSummary:
 
 
  @dataclass
- class EvaluationRunResult:
+ class ExperimentRunResult:
      """Result of running a platform evaluation."""
 
      run_id: str
@@ -125,7 +125,7 @@ class EvaluationRunResult:
      pass_rate: float
      duration: int
      run_url: str
-     summary: EvaluationRunSummary
+     summary: ExperimentRunSummary
 
      def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
          """
@@ -161,42 +161,42 @@ def _is_notebook() -> bool:
      return False
 
 
- def evaluate(
+ def run(
      slug: str,
      *,
      poll_interval: float = 2.0,
      timeout: float = 600.0,
      on_progress: Optional[Callable[[int, int], None]] = None,
      api_key: Optional[str] = None,
- ) -> EvaluationRunResult:
+ ) -> ExperimentRunResult:
      """
-     Run a platform-configured evaluation and wait for completion.
+     Run a platform-configured experiment and wait for completion.
 
-     This runs an Evaluation that you have configured in the LangWatch platform.
-     The evaluation will execute all targets and evaluators defined in the configuration.
+     This runs an Experiment that you have configured in the LangWatch platform.
+     The experiment will execute all targets and evaluators defined in the configuration.
 
      Args:
-         slug: The slug of the evaluation to run (found in the evaluation URL)
+         slug: The slug of the experiment to run (found in the experiment URL)
          poll_interval: Seconds between status checks (default: 2.0)
          timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
          on_progress: Optional callback for progress updates (completed, total)
          api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)
 
      Returns:
-         EvaluationRunResult with pass rate and summary. Call result.print_summary()
+         ExperimentRunResult with pass rate and summary. Call result.print_summary()
          to display results and exit with code 1 on failure.
 
      Raises:
-         EvaluationNotFoundError: If the evaluation slug doesn't exist
-         EvaluationTimeoutError: If the evaluation doesn't complete within timeout
-         EvaluationRunFailedError: If the evaluation fails
-         EvaluationsApiError: For other API errors
+         ExperimentNotFoundError: If the experiment slug doesn't exist
+         ExperimentTimeoutError: If the experiment doesn't complete within timeout
+         ExperimentRunFailedError: If the experiment fails
+         ExperimentsApiError: For other API errors
 
      Example:
          ```python
         import langwatch
 
-         result = langwatch.evaluation.evaluate("my-evaluation-slug")
+         result = langwatch.experiment.run("my-experiment-slug")
          result.print_summary()
          ```
      """
@@ -219,7 +219,7 @@ def evaluate(
      api_run_url = start_response.get("runUrl", "")
      run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""
 
-     print(f"Started evaluation run: {run_id}")
+     print(f"Started experiment run: {run_id}")
      if run_url:
          print(f"Follow live: {run_url}")
 
@@ -238,7 +238,7 @@ def evaluate(
          if time.time() - start_time > timeout:
              print()  # Newline after progress
              status = _get_run_status(run_id, endpoint, effective_api_key)
-             raise EvaluationTimeoutError(
+             raise ExperimentTimeoutError(
                  run_id, status.get("progress", 0), status.get("total", 0)
              )
 
@@ -267,7 +267,7 @@ def evaluate(
 
          if run_status == "failed":
              print()  # Newline after progress
-             raise EvaluationRunFailedError(
+             raise ExperimentRunFailedError(
                  run_id, status.get("error", "Unknown error")
              )
 
@@ -278,7 +278,7 @@ def evaluate(
 
 
  def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
-     """Start an evaluation run."""
+     """Start an experiment run."""
      with httpx.Client(timeout=60) as client:
          response = client.post(
              f"{endpoint}/api/evaluations/v3/{slug}/run",
@@ -286,12 +286,12 @@ def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
          )
 
          if response.status_code == 404:
-             raise EvaluationNotFoundError(slug)
+             raise ExperimentNotFoundError(slug)
          if response.status_code == 401:
-             raise EvaluationsApiError("Unauthorized - check your API key", 401)
+             raise ExperimentsApiError("Unauthorized - check your API key", 401)
          if not response.is_success:
              error_body = response.json() if response.content else {}
-             raise EvaluationsApiError(
+             raise ExperimentsApiError(
                  error_body.get("error", f"Failed to start evaluation: {response.status_code}"),
                  response.status_code,
              )
@@ -308,12 +308,12 @@ def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
          )
 
          if response.status_code == 404:
-             raise EvaluationsApiError(f"Run not found: {run_id}", 404)
+             raise ExperimentsApiError(f"Run not found: {run_id}", 404)
          if response.status_code == 401:
-             raise EvaluationsApiError("Unauthorized - check your API key", 401)
+             raise ExperimentsApiError("Unauthorized - check your API key", 401)
          if not response.is_success:
              error_body = response.json() if response.content else {}
-             raise EvaluationsApiError(
+             raise ExperimentsApiError(
                  error_body.get("error", f"Failed to get run status: {response.status_code}"),
                  response.status_code,
              )
@@ -326,7 +326,7 @@ def _build_result(
      status: Literal["completed", "failed", "stopped"],
      summary_data: dict,
      run_url: str,
- ) -> EvaluationRunResult:
+ ) -> ExperimentRunResult:
      """Build the result object from API response."""
      total_cells = summary_data.get("totalCells", 0)
      completed_cells = summary_data.get("completedCells", 0)
@@ -368,7 +368,7 @@ def _build_result(
          )
      )
 
-     summary = EvaluationRunSummary(
+     summary = ExperimentRunSummary(
          run_id=run_id,
          total_cells=total_cells,
          completed_cells=completed_cells,
@@ -383,7 +383,7 @@ def _build_result(
          total_cost=summary_data.get("totalCost", 0),
      )
 
-     return EvaluationRunResult(
+     return ExperimentRunResult(
          run_id=run_id,
          status=status,
          passed=total_passed,
@@ -395,12 +395,12 @@ def _build_result(
      )
 
 
- def _print_summary(result: EvaluationRunResult) -> None:
-     """Print a CI-friendly summary of the evaluation results."""
+ def _print_summary(result: ExperimentRunResult) -> None:
+     """Print a CI-friendly summary of the experiment results."""
      summary = result.summary
 
      print("\n" + "═" * 60)
-     print(" EVALUATION RESULTS")
+     print(" EXPERIMENT RESULTS")
      print("═" * 60)
      print(f" Run ID: {result.run_id}")
      print(f" Status: {result.status.upper()}")
@@ -433,30 +433,3 @@ def _print_summary(result: EvaluationRunResult) -> None:
      print("═" * 60 + "\n")
 
 
- def run(
-     slug: str,
-     *,
-     poll_interval: float = 2.0,
-     timeout: float = 600.0,
-     on_progress: Optional[Callable[[int, int], None]] = None,
-     api_key: Optional[str] = None,
- ) -> EvaluationRunResult:
-     """
-     Deprecated: Use `evaluate()` instead.
-
-     Run a platform-configured evaluation and wait for completion.
-     """
-     import warnings
-
-     warnings.warn(
-         "langwatch.evaluation.run() is deprecated, use langwatch.evaluation.evaluate() instead",
-         DeprecationWarning,
-         stacklevel=2,
-     )
-     return evaluate(
-         slug,
-         poll_interval=poll_interval,
-         timeout=timeout,
-         on_progress=on_progress,
-         api_key=api_key,
-     )
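The renamed platform `run()` keeps the polling controls from the old `evaluate()` signature (`poll_interval`, `timeout`, `on_progress`, `api_key`). A small sketch wiring them up; the interval and timeout values are arbitrary choices for illustration.

```python
import langwatch


def report_progress(completed: int, total: int) -> None:
    # Called on each status poll with (completed, total), per the docstring above.
    print(f"progress: {completed}/{total}")


result = langwatch.experiment.run(
    "my-experiment-slug",
    poll_interval=5.0,   # check status every 5 seconds instead of the default 2
    timeout=1800.0,      # allow 30 minutes before ExperimentTimeoutError is raised
    on_progress=report_progress,
)
result.print_summary()
```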
langwatch/litellm.py CHANGED
@@ -246,6 +246,9 @@ class LiteLLMPatch:
                  SpanMetrics(
                      prompt_tokens=safe_get(usage, "prompt_tokens"),
                      completion_tokens=safe_get(usage, "completion_tokens"),
+                     reasoning_tokens=safe_get(
+                         usage, "completion_tokens_details", "reasoning_tokens"
+                     ),
                  )
                  if usage
                  else SpanMetrics()
@@ -281,6 +284,9 @@ class LiteLLMPatch:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -338,6 +344,7 @@ class LiteLLMPatch:
              "functions",
              "user",
              "response_format",
+             "reasoning_effort",
          ]
          for param in params:
              if kwargs.get(param):
langwatch/openai.py CHANGED
@@ -296,6 +296,9 @@ class OpenAICompletionTracer:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -336,22 +339,31 @@ class OpenAICompletionTracer:
              if len(outputs) == 0
              else outputs[0] if len(outputs) == 1 else {"type": "list", "value": outputs}
          )
-         params = SpanParams(
-             temperature=kwargs.get("temperature", 1.0),
-             stream=kwargs.get("stream", False),
-         )
-         functions = kwargs.get("functions", None)
-         if functions:
-             params["functions"] = functions
-         tools = kwargs.get("tools", None)
-         if tools:
-             params["tools"] = tools
-         tool_choice = kwargs.get("tool_choice", None)
-         if tool_choice:
-             params["tool_choice"] = tool_choice
-         response_format = kwargs.get("response_format", None)
-         if response_format:
-             params["response_format"] = response_format
+         span_params = SpanParams()
+         param_names = [
+             "frequency_penalty",
+             "logit_bias",
+             "logprobs",
+             "top_logprobs",
+             "max_tokens",
+             "n",
+             "presence_penalty",
+             "seed",
+             "stop",
+             "stream",
+             "temperature",
+             "top_p",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+             "functions",
+             "user",
+             "response_format",
+             "reasoning_effort",
+         ]
+         for param in param_names:
+             if kwargs.get(param) is not None:
+                 span_params[param] = kwargs.get(param)
 
          vendor = (
              "azure"
@@ -367,7 +379,7 @@ class OpenAICompletionTracer:
              ),
              output=output,
              error=error,
-             params=params,
+             params=span_params,
              metrics=metrics,
              timestamps=timestamps,
          )
@@ -611,6 +623,9 @@ class OpenAIChatCompletionTracer:
                  SpanMetrics(
                      prompt_tokens=usage.prompt_tokens if usage else None,
                      completion_tokens=usage.completion_tokens if usage else None,
+                     reasoning_tokens=safe_get(
+                         usage, "completion_tokens_details", "reasoning_tokens"
+                     ),
                  )
                  if usage
                  else SpanMetrics()
@@ -643,6 +658,9 @@ class OpenAIChatCompletionTracer:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -683,22 +701,31 @@ class OpenAIChatCompletionTracer:
              if len(outputs) == 0
              else outputs[0] if len(outputs) == 1 else {"type": "list", "value": outputs}
          )
-         params = SpanParams(
-             temperature=kwargs.get("temperature", 1.0),
-             stream=kwargs.get("stream", False),
-         )
-         functions = kwargs.get("functions", None)
-         if functions:
-             params["functions"] = functions
-         tools = kwargs.get("tools", None)
-         if tools:
-             params["tools"] = tools
-         tool_choice = kwargs.get("tool_choice", None)
-         if tool_choice:
-             params["tool_choice"] = tool_choice
-         response_format = kwargs.get("response_format", None)
-         if response_format:
-             params["response_format"] = response_format
+         span_params = SpanParams()
+         param_names = [
+             "frequency_penalty",
+             "logit_bias",
+             "logprobs",
+             "top_logprobs",
+             "max_tokens",
+             "n",
+             "presence_penalty",
+             "seed",
+             "stop",
+             "stream",
+             "temperature",
+             "top_p",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+             "functions",
+             "user",
+             "response_format",
+             "reasoning_effort",
+         ]
+         for param in param_names:
+             if kwargs.get(param) is not None:
+                 span_params[param] = kwargs.get(param)
 
          vendor = (
              "azure"
@@ -714,7 +741,7 @@ class OpenAIChatCompletionTracer:
              ),
              output=output,
              error=error,
-             params=params,
+             params=span_params,
              metrics=metrics,
              timestamps=timestamps,
          )
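Both the LiteLLM and OpenAI tracers now record `reasoning_tokens` by reading `usage.completion_tokens_details.reasoning_tokens` from the provider response, and `reasoning_effort` joins the captured request params. A quick way to inspect the raw field the tracers pick up, assuming a recent `openai` client and a reasoning-capable model; the model name is illustrative and the field is absent for non-reasoning models.

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

response = client.chat.completions.create(
    model="o4-mini",          # illustrative reasoning model
    reasoning_effort="low",   # now also captured as a span param by the tracers
    messages=[{"role": "user", "content": "Think step by step: what is 17 * 23?"}],
)

# reasoning_tokens is the value the patched tracers store in SpanMetrics
details = response.usage.completion_tokens_details
print("completion tokens:", response.usage.completion_tokens)
print("reasoning tokens:", details.reasoning_tokens if details else None)
```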
@@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
  class LocalPromptLoader:
      """Loads prompts from local files in CLI format."""
 
+     _warned_no_prompts_path: bool = False
+
      def __init__(self, base_path: Optional[Path] = None):
          """Initialize with base path (defaults to current working directory at load time)."""
          self._base_path = base_path
@@ -43,6 +45,16 @@ class LocalPromptLoader:
          # Check if prompts.json exists
          prompts_json_path = self.base_path / "prompts.json"
          if not prompts_json_path.exists():
+             # Warn once if no prompts_path was configured and prompts.json doesn't exist
+             if self._base_path is None and not LocalPromptLoader._warned_no_prompts_path:
+                 LocalPromptLoader._warned_no_prompts_path = True
+                 warnings.warn(
+                     f"No prompts.json found at {prompts_json_path}. "
+                     f"If you have local prompt files, configure the path with "
+                     f"langwatch.setup(prompts_path='/path/to/prompts') or ensure "
+                     f"prompts.json is in the current working directory.",
+                     UserWarning,
+                 )
              logger.debug(
                  f"No prompts.json found at {prompts_json_path}, falling back to API"
              )
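The new one-time warning points users at `langwatch.setup(prompts_path=...)`. A minimal sketch of configuring the loader so the warning never fires, assuming the usual `setup()` entry point; the path and key below are placeholders.

```python
import langwatch

# Point the local prompt loader at the directory containing prompts.json
# (placeholder path). Without this, the loader now warns once and falls
# back to fetching prompts from the LangWatch API.
langwatch.setup(
    api_key="your-langwatch-api-key",  # placeholder credential
    prompts_path="./prompts",
)
```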