npm - langwatch - Versions diffs - 0.11.0 → 0.13.0 - Mend

langwatch 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

package/dist/index.d.mts CHANGED

@@ -1,6 +1,6 @@
 import { L as Logger, C as ConsoleLogger, N as NoOpLogger } from './index-D7rKIGrO.mjs';
-export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-CVrmD0bz.mjs';
-import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-Kts5RGLY.mjs';
+export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-Ck58nRkT.mjs';
+import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-5h2Im4pl.mjs';
 import openApiCreateClient from 'openapi-fetch';
 import { z } from 'zod';
 export { l as attributes } from './types-DRiQaKFG.mjs';
@@ -405,7 +405,7 @@ type DatasetsFacadeConfig = {
  * const dataset = await langwatch.datasets.get("my-dataset");
  *
  * // Use with evaluation
- * const evaluation = langwatch.evaluation.init("my-experiment");
+ * const evaluation = langwatch.experiments.init("my-experiment");
  * await evaluation.run(dataset.entries.map(e => e.entry), async ({ item, index }) => {
  *   const output = await myLLM(item.input);
  *   await evaluation.evaluate("my-evaluator", {
@@ -447,16 +447,16 @@ declare class DatasetsFacade {
 }
 /**
- * Types for the Evaluation API
+ * Types for the Experiments API
  *
- * These types define the structure for batch evaluations, including
+ * These types define the structure for batch experiments, including
  * logging metrics, running evaluators, and managing targets.
  */
 /**
  * Status of an evaluation result
  */
-type EvaluationStatus = "processed" | "error" | "skipped";
+type EvaluationStatus$1 = "processed" | "error" | "skipped";
 /**
  * Target types for batch evaluations
  */
@@ -503,11 +503,11 @@ type TargetInfo = z.infer<typeof targetInfoSchema>;
 /**
  * Result of an evaluation
  */
-type EvaluationResult = z.infer<typeof evaluationResultSchema>;
+type EvaluationResult$1 = z.infer<typeof evaluationResultSchema>;
 /**
- * Options for initializing an evaluation
+ * Options for initializing an experiment
  */
-type EvaluationInitOptions = {
+type ExperimentInitOptions = {
     /** Custom run ID (auto-generated if not provided) */
     runId?: string;
     /** Number of parallel threads for submit() */
@@ -533,7 +533,7 @@ type LogOptions = {
     /** Human-readable description of the result */
     details?: string;
     /** Status of the evaluation */
-    status?: EvaluationStatus;
+    status?: EvaluationStatus$1;
     /** Duration in milliseconds */
     duration?: number;
     /** Cost amount in USD */
@@ -551,7 +551,7 @@ type LogOptions = {
 /**
  * Options for the evaluate() method (built-in evaluators)
  */
-type EvaluateOptions = {
+type EvaluateOptions$1 = {
     /**
      * Row index in the dataset.
      * Optional when called inside withTarget() - will be auto-inferred from context.
@@ -627,9 +627,9 @@ type TargetResult<R> = {
 };
 /**
- * Evaluation - Main class for running batch evaluations
+ * Experiment - Main class for running batch experiments
  *
- * Provides a clean API for running evaluations over datasets with:
+ * Provides a clean API for running experiments over datasets with:
  * - Automatic tracing per iteration
  * - Parallel execution with concurrency control
  * - Batched result sending
@@ -638,9 +638,9 @@ type TargetResult<R> = {
  */
 /**
- * Evaluation session for running batch evaluations
+ * Experiment session for running batch experiments
  */
-declare class Evaluation {
+declare class Experiment {
     readonly name: string;
     readonly runId: string;
     readonly experimentSlug: string;
@@ -671,7 +671,7 @@ declare class Evaluation {
         endpoint: string;
         apiKey: string;
         logger: Logger;
-    } & EvaluationInitOptions): Promise<Evaluation>;
+    } & ExperimentInitOptions): Promise<Experiment>;
     /**
      * Initialize the evaluation by creating/getting the experiment
      */
@@ -743,7 +743,7 @@ declare class Evaluation {
      * });
      * ```
      */
-    evaluate(evaluatorSlug: string, options: EvaluateOptions): Promise<void>;
+    evaluate(evaluatorSlug: string, options: EvaluateOptions$1): Promise<void>;
     /**
      * Execute code within a target context with automatic tracing
      *
@@ -811,63 +811,187 @@ declare class Evaluation {
 }
 /**
- * EvaluationFacade - Entry point for the evaluation API
+ * Types for platform-configured experiments (Experiments Workbench)
+ */
+/**
+ * Summary of a completed experiment run
+ */
+type ExperimentRunSummary = {
+    runId?: string;
+    totalCells?: number;
+    completedCells?: number;
+    failedCells?: number;
+    duration?: number;
+    runUrl?: string;
+    timestamps?: {
+        startedAt: number;
+        finishedAt?: number;
+        stoppedAt?: number;
+    };
+    targets?: Array<{
+        targetId: string;
+        name: string;
+        passed: number;
+        failed: number;
+        avgLatency: number;
+        totalCost: number;
+    }>;
+    evaluators?: Array<{
+        evaluatorId: string;
+        name: string;
+        passed: number;
+        failed: number;
+        passRate: number;
+        avgScore?: number;
+    }>;
+    totalPassed?: number;
+    totalFailed?: number;
+    passRate?: number;
+    totalCost?: number;
+};
+/**
+ * Options for running a platform experiment
+ */
+type RunExperimentOptions = {
+    /**
+     * Polling interval in milliseconds (default: 2000)
+     */
+    pollInterval?: number;
+    /**
+     * Maximum time to wait for completion in milliseconds (default: 600000 = 10 minutes)
+     */
+    timeout?: number;
+    /**
+     * Callback for progress updates
+     */
+    onProgress?: (progress: number, total: number) => void;
+};
+/**
+ * Final result of a platform experiment run
+ */
+type ExperimentRunResult = {
+    runId: string;
+    status: "completed" | "failed" | "stopped";
+    passed: number;
+    failed: number;
+    passRate: number;
+    duration: number;
+    runUrl: string;
+    summary: ExperimentRunSummary;
+    /**
+     * Print a CI-friendly summary of the results
+     * @param exitOnFailure - If true (default), calls process.exit(1) when there are failures
+     */
+    printSummary: (exitOnFailure?: boolean) => void;
+};
+/**
+ * ExperimentsFacade - Entry point for the experiments API
  *
- * Provides the `init()` method to create evaluation sessions.
+ * Provides:
+ * - `init()` method to create experiment sessions (SDK-defined experiments)
+ * - `run()` method to execute platform-configured experiments (Experiments Workbench)
  */
-type EvaluationFacadeConfig = {
+type ExperimentsFacadeConfig = {
     langwatchApiClient: LangwatchApiClient;
     endpoint: string;
     apiKey: string;
     logger: Logger;
 };
 /**
- * Facade for creating evaluation sessions
+ * Facade for creating experiment sessions and running platform-configured experiments
  */
-declare class EvaluationFacade {
+declare class ExperimentsFacade {
     private readonly config;
-    constructor(config: EvaluationFacadeConfig);
+    constructor(config: ExperimentsFacadeConfig);
     /**
-     * Initialize a new evaluation session
+     * Initialize a new experiment session (SDK-defined)
      *
      * @param name - Name of the experiment (used as slug)
      * @param options - Optional configuration
-     * @returns An initialized Evaluation instance
+     * @returns An initialized Experiment instance
      *
      * @example
      * ```typescript
-     * const evaluation = await langwatch.evaluation.init('my-experiment');
+     * const experiment = await langwatch.experiments.init('my-experiment');
      *
-     * await evaluation.run(dataset, async ({ item, index }) => {
+     * await experiment.run(dataset, async ({ item, index }) => {
      *   const response = await myAgent(item.question);
-     *   evaluation.log('accuracy', { index, score: 0.95 });
+     *   experiment.log('accuracy', { index, score: 0.95 });
      * });
      * ```
      */
-    init(name: string, options?: EvaluationInitOptions): Promise<Evaluation>;
+    init(name: string, options?: ExperimentInitOptions): Promise<Experiment>;
+    /**
+     * Run a platform-configured experiment (Experiments Workbench)
+     *
+     * This runs an experiment that was configured in the LangWatch platform.
+     * The method automatically prints a summary and exits with code 1 on failure
+     * (unless `exitOnFailure: false` is passed).
+     *
+     * @param slug - The slug of the experiment (found in the experiment URL)
+     * @param options - Optional configuration
+     * @returns The experiment results including pass rate and summary
+     *
+     * @example
+     * ```typescript
+     * import { LangWatch } from "langwatch";
+     *
+     * const langwatch = new LangWatch();
+     *
+     * const result = await langwatch.experiments.run("my-experiment-slug");
+     * result.printSummary();
+     * ```
+     */
+    run(slug: string, options?: RunExperimentOptions): Promise<ExperimentRunResult>;
+    /**
+     * Run an experiment and wait for completion using polling
+     */
+    private runWithPolling;
+    /**
+     * Start an experiment run
+     */
+    private startRun;
+    /**
+     * Get the status of a run
+     */
+    private getRunStatus;
+    /**
+     * Build the result object from API response
+     */
+    private buildResult;
+    /**
+     * Print a CI-friendly summary of the experiment results
+     */
+    private printSummary;
+    private sleep;
+    /**
+     * Replace the domain of a URL with a new base URL, preserving the path
+     */
+    private replaceUrlDomain;
 }
 /**
- * Errors for the Evaluation API
+ * Errors for the Experiments API
  */
 /**
- * Base error for evaluation-related issues
+ * Base error for experiment-related issues
  */
-declare class EvaluationError extends Error {
+declare class ExperimentError extends Error {
     constructor(message: string);
 }
 /**
  * Thrown when initialization fails
  */
-declare class EvaluationInitError extends EvaluationError {
+declare class ExperimentInitError extends ExperimentError {
     readonly cause?: Error | undefined;
     constructor(message: string, cause?: Error | undefined);
 }
 /**
  * Thrown when API calls fail
  */
-declare class EvaluationApiError extends EvaluationError {
+declare class ExperimentApiError extends ExperimentError {
     readonly statusCode?: number | undefined;
     readonly cause?: Error | undefined;
     constructor(message: string, statusCode?: number | undefined, cause?: Error | undefined);
@@ -875,7 +999,7 @@ declare class EvaluationApiError extends EvaluationError {
 /**
  * Thrown when target metadata conflicts
  */
-declare class TargetMetadataConflictError extends EvaluationError {
+declare class TargetMetadataConflictError extends ExperimentError {
     readonly targetName: string;
     readonly existingMetadata: Record<string, unknown>;
     readonly newMetadata: Record<string, unknown>;
@@ -884,12 +1008,164 @@ declare class TargetMetadataConflictError extends EvaluationError {
 /**
  * Thrown when an evaluator call fails
  */
-declare class EvaluatorError extends EvaluationError {
+declare class EvaluatorError extends ExperimentError {
     readonly evaluatorSlug: string;
     readonly cause?: Error | undefined;
     constructor(evaluatorSlug: string, message: string, cause?: Error | undefined);
 }
+/**
+ * Types for the Evaluations API (Online Evaluations / Guardrails)
+ *
+ * These types define the structure for running evaluators and guardrails
+ * in real-time against LLM inputs/outputs.
+ */
+/**
+ * Status of an evaluation result
+ */
+type EvaluationStatus = "processed" | "skipped" | "error";
+/**
+ * Cost information from an evaluation
+ */
+type EvaluationCost = {
+    currency: string;
+    amount: number;
+};
+/**
+ * Result returned from running an evaluator
+ */
+type EvaluationResult = {
+    /** Status of the evaluation */
+    status: EvaluationStatus;
+    /** Whether the evaluation passed (for guardrails) */
+    passed?: boolean;
+    /** Numeric score (typically 0-1) */
+    score?: number;
+    /** Human-readable details about the result */
+    details?: string;
+    /** Label/category for the result */
+    label?: string;
+    /** Cost of running the evaluation */
+    cost?: EvaluationCost;
+};
+/**
+ * Options for the evaluate() method
+ */
+type EvaluateOptions = {
+    /** Data to pass to the evaluator (input, output, contexts, etc.) */
+    data: Record<string, unknown>;
+    /** Human-readable name for this evaluation */
+    name?: string;
+    /** Evaluator-specific settings */
+    settings?: Record<string, unknown>;
+    /** Whether to run as a guardrail (affects error handling) */
+    asGuardrail?: boolean;
+};
+/**
+ * EvaluationsFacade - Entry point for the Evaluations API (Online Evaluations / Guardrails)
+ *
+ * Provides an API for running evaluators and guardrails in real-time against LLM inputs/outputs.
+ *
+ * @example
+ * ```typescript
+ * const langwatch = new LangWatch({ apiKey: "your-api-key" });
+ *
+ * // Run a guardrail
+ * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+ *   data: { input: userInput, output: generatedResponse },
+ *   name: "PII Detection",
+ *   asGuardrail: true,
+ *   settings: {},
+ * });
+ *
+ * if (!guardrail.passed) {
+ *   return "I'm sorry, I can't do that.";
+ * }
+ * ```
+ */
+type EvaluationsFacadeConfig = {
+    endpoint: string;
+    apiKey: string;
+    logger: Logger;
+};
+declare class EvaluationsFacade {
+    #private;
+    constructor(config: EvaluationsFacadeConfig);
+    /**
+     * Run an evaluator or guardrail against provided data
+     *
+     * Creates an OpenTelemetry span attached to the current trace context,
+     * calls the LangWatch evaluation API, and returns the result.
+     *
+     * @param slug - The evaluator slug (e.g., "presidio/pii_detection", "langevals/llm_boolean")
+     * @param options - Evaluation options including data, name, settings, and asGuardrail flag
+     * @returns The evaluation result with status, passed, score, details, label, and cost
+     *
+     * @example
+     * ```typescript
+     * // Run as a guardrail (synchronous evaluation that can block responses)
+     * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+     *   data: { input: userInput, output: generatedResponse },
+     *   name: "PII Detection Guardrail",
+     *   asGuardrail: true,
+     * });
+     *
+     * if (!guardrail.passed) {
+     *   console.log("PII detected:", guardrail.details);
+     *   return "Sorry, I cannot process that request.";
+     * }
+     * ```
+     *
+     * @example
+     * ```typescript
+     * // Run as an online evaluation (async scoring for monitoring)
+     * const result = await langwatch.evaluations.evaluate("langevals/llm_boolean", {
+     *   data: { input: question, output: response },
+     *   name: "Quality Check",
+     *   settings: { prompt: "Check if the response answers the question." },
+     * });
+     *
+     * console.log("Score:", result.score);
+     * console.log("Details:", result.details);
+     * ```
+     */
+    evaluate: (slug: string, options: EvaluateOptions) => Promise<EvaluationResult>;
+}
+/**
+ * Error classes for the Evaluations API
+ */
+/**
+ * Base error for evaluation operations
+ */
+declare class EvaluationError extends Error {
+    constructor(message: string);
+}
+/**
+ * Error when an evaluator call fails
+ */
+declare class EvaluatorCallError extends EvaluationError {
+    readonly evaluatorSlug: string;
+    readonly statusCode?: number;
+    constructor(evaluatorSlug: string, message: string, statusCode?: number);
+}
+/**
+ * Error when evaluator is not found
+ */
+declare class EvaluatorNotFoundError extends EvaluationError {
+    readonly evaluatorSlug: string;
+    constructor(evaluatorSlug: string);
+}
+/**
+ * Error from the evaluations API
+ */
+declare class EvaluationsApiError extends EvaluationError {
+    readonly statusCode: number;
+    constructor(message: string, statusCode: number);
+}
 interface GetTraceParams {
     includeSpans?: boolean;
 }
@@ -913,8 +1189,41 @@ declare class LangWatch {
     private readonly config;
     readonly prompts: PromptsFacade;
     readonly traces: TracesFacade;
-    readonly evaluation: EvaluationFacade;
     readonly datasets: DatasetsFacade;
+    /**
+     * Run experiments on LangWatch platform or via SDK.
+     *
+     * Platform experiments (CI/CD):
+     * ```typescript
+     * const result = await langwatch.experiments.run("my-experiment-slug");
+     * result.printSummary();
+     * ```
+     *
+     * SDK-defined experiments:
+     * ```typescript
+     * const experiment = await langwatch.experiments.init("my-experiment");
+     * // ... run evaluators using experiment.evaluate()
+     * ```
+     */
+    readonly experiments: ExperimentsFacade;
+    /**
+     * Run evaluators and guardrails in real-time (Online Evaluations).
+     *
+     * @example
+     * ```typescript
+     * // Run a guardrail
+     * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+     *   data: { input: userInput, output: generatedResponse },
+     *   name: "PII Detection",
+     *   asGuardrail: true,
+     * });
+     *
+     * if (!guardrail.passed) {
+     *   return "I'm sorry, I can't do that.";
+     * }
+     * ```
+     */
+    readonly evaluations: EvaluationsFacade;
     constructor(options?: LangWatchConstructorOptions);
     get apiClient(): LangwatchApiClient;
 }
@@ -924,4 +1233,4 @@ declare const logger: {
     NoOpLogger: typeof NoOpLogger;
 };
-export { type EvaluateOptions, Evaluation, EvaluationApiError, EvaluationError, EvaluationFacade, EvaluationInitError, type EvaluationInitOptions, type EvaluationResult, type EvaluationStatus, EvaluatorError, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };
+export { type EvaluateOptions, type EvaluationCost, EvaluationError, type EvaluationResult, type EvaluationStatus, EvaluationsApiError, EvaluationsFacade, EvaluatorCallError, EvaluatorError, EvaluatorNotFoundError, Experiment, ExperimentApiError, ExperimentError, type EvaluateOptions$1 as ExperimentEvaluateOptions, type EvaluationResult$1 as ExperimentEvaluationResult, type EvaluationStatus$1 as ExperimentEvaluationStatus, ExperimentInitError, type ExperimentInitOptions, ExperimentsFacade, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };