npm - langwatch - Versions diffs - 0.12.0 → 0.14.0 - Mend

langwatch 0.12.0 → 0.14.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (71)

package/dist/index.d.mts CHANGED Viewed

@@ -1,6 +1,6 @@
 import { L as Logger, C as ConsoleLogger, N as NoOpLogger } from './index-D7rKIGrO.mjs';
-export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-Ck58nRkT.mjs';
-import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-5h2Im4pl.mjs';
+export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-Dl15eRjo.mjs';
+import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-ExKeJEM0.mjs';
 import openApiCreateClient from 'openapi-fetch';
 import { z } from 'zod';
 export { l as attributes } from './types-DRiQaKFG.mjs';
@@ -405,7 +405,7 @@ type DatasetsFacadeConfig = {
  * const dataset = await langwatch.datasets.get("my-dataset");
  *
  * // Use with evaluation
- * const evaluation = langwatch.evaluation.init("my-experiment");
+ * const evaluation = langwatch.experiments.init("my-experiment");
  * await evaluation.run(dataset.entries.map(e => e.entry), async ({ item, index }) => {
  *   const output = await myLLM(item.input);
  *   await evaluation.evaluate("my-evaluator", {
@@ -447,16 +447,16 @@ declare class DatasetsFacade {
 }
 /**
- * Types for the Evaluation API
+ * Types for the Experiments API
  *
- * These types define the structure for batch evaluations, including
+ * These types define the structure for batch experiments, including
  * logging metrics, running evaluators, and managing targets.
  */
 /**
  * Status of an evaluation result
  */
-type EvaluationStatus = "processed" | "error" | "skipped";
+type EvaluationStatus$1 = "processed" | "error" | "skipped";
 /**
  * Target types for batch evaluations
  */
@@ -503,11 +503,11 @@ type TargetInfo = z.infer<typeof targetInfoSchema>;
 /**
  * Result of an evaluation
  */
-type EvaluationResult = z.infer<typeof evaluationResultSchema>;
+type EvaluationResult$1 = z.infer<typeof evaluationResultSchema>;
 /**
- * Options for initializing an evaluation
+ * Options for initializing an experiment
  */
-type EvaluationInitOptions = {
+type ExperimentInitOptions = {
     /** Custom run ID (auto-generated if not provided) */
     runId?: string;
     /** Number of parallel threads for submit() */
@@ -533,7 +533,7 @@ type LogOptions = {
     /** Human-readable description of the result */
     details?: string;
     /** Status of the evaluation */
-    status?: EvaluationStatus;
+    status?: EvaluationStatus$1;
     /** Duration in milliseconds */
     duration?: number;
     /** Cost amount in USD */
@@ -551,7 +551,7 @@ type LogOptions = {
 /**
  * Options for the evaluate() method (built-in evaluators)
  */
-type EvaluateOptions = {
+type EvaluateOptions$1 = {
     /**
      * Row index in the dataset.
      * Optional when called inside withTarget() - will be auto-inferred from context.
@@ -627,9 +627,9 @@ type TargetResult<R> = {
 };
 /**
- * Evaluation - Main class for running batch evaluations
+ * Experiment - Main class for running batch experiments
  *
- * Provides a clean API for running evaluations over datasets with:
+ * Provides a clean API for running experiments over datasets with:
  * - Automatic tracing per iteration
  * - Parallel execution with concurrency control
  * - Batched result sending
@@ -638,9 +638,9 @@ type TargetResult<R> = {
  */
 /**
- * Evaluation session for running batch evaluations
+ * Experiment session for running batch experiments
  */
-declare class Evaluation {
+declare class Experiment {
     readonly name: string;
     readonly runId: string;
     readonly experimentSlug: string;
@@ -671,7 +671,7 @@ declare class Evaluation {
         endpoint: string;
         apiKey: string;
         logger: Logger;
-    } & EvaluationInitOptions): Promise<Evaluation>;
+    } & ExperimentInitOptions): Promise<Experiment>;
     /**
      * Initialize the evaluation by creating/getting the experiment
      */
@@ -743,7 +743,7 @@ declare class Evaluation {
      * });
      * ```
      */
-    evaluate(evaluatorSlug: string, options: EvaluateOptions): Promise<void>;
+    evaluate(evaluatorSlug: string, options: EvaluateOptions$1): Promise<void>;
     /**
      * Execute code within a target context with automatic tracing
      *
@@ -811,12 +811,12 @@ declare class Evaluation {
 }
 /**
- * Types for platform-configured evaluations (Evaluations V3)
+ * Types for platform-configured experiments (Experiments Workbench)
  */
 /**
- * Summary of a completed evaluation run
+ * Summary of a completed experiment run
  */
-type EvaluationRunSummary = {
+type ExperimentRunSummary = {
     runId?: string;
     totalCells?: number;
     completedCells?: number;
@@ -850,9 +850,9 @@ type EvaluationRunSummary = {
     totalCost?: number;
 };
 /**
- * Options for running a platform evaluation
+ * Options for running a platform experiment
  */
-type RunEvaluationOptions = {
+type RunExperimentOptions = {
     /**
      * Polling interval in milliseconds (default: 2000)
      */
@@ -867,9 +867,9 @@ type RunEvaluationOptions = {
     onProgress?: (progress: number, total: number) => void;
 };
 /**
- * Final result of a platform evaluation run
+ * Final result of a platform experiment run
  */
-type EvaluationRunResult = {
+type ExperimentRunResult = {
     runId: string;
     status: "completed" | "failed" | "stopped";
     passed: number;
@@ -877,7 +877,7 @@ type EvaluationRunResult = {
     passRate: number;
     duration: number;
     runUrl: string;
-    summary: EvaluationRunSummary;
+    summary: ExperimentRunSummary;
     /**
      * Print a CI-friendly summary of the results
      * @param exitOnFailure - If true (default), calls process.exit(1) when there are failures
@@ -886,53 +886,53 @@ type EvaluationRunResult = {
 };
 /**
- * EvaluationFacade - Entry point for the evaluation API
+ * ExperimentsFacade - Entry point for the experiments API
  *
  * Provides:
- * - `init()` method to create evaluation sessions (SDK-defined evaluations)
- * - `run()` method to execute platform-configured evaluations (Evaluations V3)
+ * - `init()` method to create experiment sessions (SDK-defined experiments)
+ * - `run()` method to execute platform-configured experiments (Experiments Workbench)
  */
-type EvaluationFacadeConfig = {
+type ExperimentsFacadeConfig = {
     langwatchApiClient: LangwatchApiClient;
     endpoint: string;
     apiKey: string;
     logger: Logger;
 };
 /**
- * Facade for creating evaluation sessions and running platform-configured evaluations
+ * Facade for creating experiment sessions and running platform-configured experiments
  */
-declare class EvaluationFacade {
+declare class ExperimentsFacade {
     private readonly config;
-    constructor(config: EvaluationFacadeConfig);
+    constructor(config: ExperimentsFacadeConfig);
     /**
-     * Initialize a new evaluation session (SDK-defined)
+     * Initialize a new experiment session (SDK-defined)
      *
      * @param name - Name of the experiment (used as slug)
      * @param options - Optional configuration
-     * @returns An initialized Evaluation instance
+     * @returns An initialized Experiment instance
      *
      * @example
      * ```typescript
-     * const evaluation = await langwatch.evaluation.init('my-experiment');
+     * const experiment = await langwatch.experiments.init('my-experiment');
      *
-     * await evaluation.run(dataset, async ({ item, index }) => {
+     * await experiment.run(dataset, async ({ item, index }) => {
      *   const response = await myAgent(item.question);
-     *   evaluation.log('accuracy', { index, score: 0.95 });
+     *   experiment.log('accuracy', { index, score: 0.95 });
      * });
      * ```
      */
-    init(name: string, options?: EvaluationInitOptions): Promise<Evaluation>;
+    init(name: string, options?: ExperimentInitOptions): Promise<Experiment>;
     /**
-     * Run a platform-configured evaluation (Evaluations V3)
+     * Run a platform-configured experiment (Experiments Workbench)
      *
-     * This runs an evaluation that was configured in the LangWatch platform.
+     * This runs an experiment that was configured in the LangWatch platform.
      * The method automatically prints a summary and exits with code 1 on failure
      * (unless `exitOnFailure: false` is passed).
      *
-     * @param slug - The slug of the evaluation (found in the evaluation URL)
+     * @param slug - The slug of the experiment (found in the experiment URL)
      * @param options - Optional configuration
-     * @returns The evaluation results including pass rate and summary
+     * @returns The experiment results including pass rate and summary
      *
      * @example
      * ```typescript
@@ -940,17 +940,17 @@ declare class EvaluationFacade {
      *
      * const langwatch = new LangWatch();
      *
-     * const result = await langwatch.evaluation.run("my-evaluation-slug");
+     * const result = await langwatch.experiments.run("my-experiment-slug");
      * result.printSummary();
      * ```
      */
-    run(slug: string, options?: RunEvaluationOptions): Promise<EvaluationRunResult>;
+    run(slug: string, options?: RunExperimentOptions): Promise<ExperimentRunResult>;
     /**
-     * Run an evaluation and wait for completion using polling
+     * Run an experiment and wait for completion using polling
      */
     private runWithPolling;
     /**
-     * Start an evaluation run
+     * Start an experiment run
      */
     private startRun;
     /**
@@ -962,7 +962,7 @@ declare class EvaluationFacade {
      */
     private buildResult;
     /**
-     * Print a CI-friendly summary of the evaluation results
+     * Print a CI-friendly summary of the experiment results
      */
     private printSummary;
     private sleep;
@@ -973,25 +973,25 @@ declare class EvaluationFacade {
 }
 /**
- * Errors for the Evaluation API
+ * Errors for the Experiments API
  */
 /**
- * Base error for evaluation-related issues
+ * Base error for experiment-related issues
  */
-declare class EvaluationError extends Error {
+declare class ExperimentError extends Error {
     constructor(message: string);
 }
 /**
  * Thrown when initialization fails
  */
-declare class EvaluationInitError extends EvaluationError {
+declare class ExperimentInitError extends ExperimentError {
     readonly cause?: Error | undefined;
     constructor(message: string, cause?: Error | undefined);
 }
 /**
  * Thrown when API calls fail
  */
-declare class EvaluationApiError extends EvaluationError {
+declare class ExperimentApiError extends ExperimentError {
     readonly statusCode?: number | undefined;
     readonly cause?: Error | undefined;
     constructor(message: string, statusCode?: number | undefined, cause?: Error | undefined);
@@ -999,7 +999,7 @@ declare class EvaluationApiError extends EvaluationError {
 /**
  * Thrown when target metadata conflicts
  */
-declare class TargetMetadataConflictError extends EvaluationError {
+declare class TargetMetadataConflictError extends ExperimentError {
     readonly targetName: string;
     readonly existingMetadata: Record<string, unknown>;
     readonly newMetadata: Record<string, unknown>;
@@ -1008,12 +1008,164 @@ declare class TargetMetadataConflictError extends EvaluationError {
 /**
  * Thrown when an evaluator call fails
  */
-declare class EvaluatorError extends EvaluationError {
+declare class EvaluatorError extends ExperimentError {
     readonly evaluatorSlug: string;
     readonly cause?: Error | undefined;
     constructor(evaluatorSlug: string, message: string, cause?: Error | undefined);
 }
+/**
+ * Types for the Evaluations API (Online Evaluations / Guardrails)
+ *
+ * These types define the structure for running evaluators and guardrails
+ * in real-time against LLM inputs/outputs.
+ */
+/**
+ * Status of an evaluation result
+ */
+type EvaluationStatus = "processed" | "skipped" | "error";
+/**
+ * Cost information from an evaluation
+ */
+type EvaluationCost = {
+    currency: string;
+    amount: number;
+};
+/**
+ * Result returned from running an evaluator
+ */
+type EvaluationResult = {
+    /** Status of the evaluation */
+    status: EvaluationStatus;
+    /** Whether the evaluation passed (for guardrails) */
+    passed?: boolean;
+    /** Numeric score (typically 0-1) */
+    score?: number;
+    /** Human-readable details about the result */
+    details?: string;
+    /** Label/category for the result */
+    label?: string;
+    /** Cost of running the evaluation */
+    cost?: EvaluationCost;
+};
+/**
+ * Options for the evaluate() method
+ */
+type EvaluateOptions = {
+    /** Data to pass to the evaluator (input, output, contexts, etc.) */
+    data: Record<string, unknown>;
+    /** Human-readable name for this evaluation */
+    name?: string;
+    /** Evaluator-specific settings */
+    settings?: Record<string, unknown>;
+    /** Whether to run as a guardrail (affects error handling) */
+    asGuardrail?: boolean;
+};
+/**
+ * EvaluationsFacade - Entry point for the Evaluations API (Online Evaluations / Guardrails)
+ *
+ * Provides an API for running evaluators and guardrails in real-time against LLM inputs/outputs.
+ *
+ * @example
+ * ```typescript
+ * const langwatch = new LangWatch({ apiKey: "your-api-key" });
+ *
+ * // Run a guardrail
+ * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+ *   data: { input: userInput, output: generatedResponse },
+ *   name: "PII Detection",
+ *   asGuardrail: true,
+ *   settings: {},
+ * });
+ *
+ * if (!guardrail.passed) {
+ *   return "I'm sorry, I can't do that.";
+ * }
+ * ```
+ */
+type EvaluationsFacadeConfig = {
+    endpoint: string;
+    apiKey: string;
+    logger: Logger;
+};
+declare class EvaluationsFacade {
+    #private;
+    constructor(config: EvaluationsFacadeConfig);
+    /**
+     * Run an evaluator or guardrail against provided data
+     *
+     * Creates an OpenTelemetry span attached to the current trace context,
+     * calls the LangWatch evaluation API, and returns the result.
+     *
+     * @param slug - The evaluator slug (e.g., "presidio/pii_detection", "langevals/llm_boolean")
+     * @param options - Evaluation options including data, name, settings, and asGuardrail flag
+     * @returns The evaluation result with status, passed, score, details, label, and cost
+     *
+     * @example
+     * ```typescript
+     * // Run as a guardrail (synchronous evaluation that can block responses)
+     * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+     *   data: { input: userInput, output: generatedResponse },
+     *   name: "PII Detection Guardrail",
+     *   asGuardrail: true,
+     * });
+     *
+     * if (!guardrail.passed) {
+     *   console.log("PII detected:", guardrail.details);
+     *   return "Sorry, I cannot process that request.";
+     * }
+     * ```
+     *
+     * @example
+     * ```typescript
+     * // Run as an online evaluation (async scoring for monitoring)
+     * const result = await langwatch.evaluations.evaluate("langevals/llm_boolean", {
+     *   data: { input: question, output: response },
+     *   name: "Quality Check",
+     *   settings: { prompt: "Check if the response answers the question." },
+     * });
+     *
+     * console.log("Score:", result.score);
+     * console.log("Details:", result.details);
+     * ```
+     */
+    evaluate: (slug: string, options: EvaluateOptions) => Promise<EvaluationResult>;
+}
+/**
+ * Error classes for the Evaluations API
+ */
+/**
+ * Base error for evaluation operations
+ */
+declare class EvaluationError extends Error {
+    constructor(message: string);
+}
+/**
+ * Error when an evaluator call fails
+ */
+declare class EvaluatorCallError extends EvaluationError {
+    readonly evaluatorSlug: string;
+    readonly statusCode?: number;
+    constructor(evaluatorSlug: string, message: string, statusCode?: number);
+}
+/**
+ * Error when evaluator is not found
+ */
+declare class EvaluatorNotFoundError extends EvaluationError {
+    readonly evaluatorSlug: string;
+    constructor(evaluatorSlug: string);
+}
+/**
+ * Error from the evaluations API
+ */
+declare class EvaluationsApiError extends EvaluationError {
+    readonly statusCode: number;
+    constructor(message: string, statusCode: number);
+}
 interface GetTraceParams {
     includeSpans?: boolean;
 }
@@ -1037,15 +1189,76 @@ declare class LangWatch {
     private readonly config;
     readonly prompts: PromptsFacade;
     readonly traces: TracesFacade;
-    readonly evaluation: EvaluationFacade;
     readonly datasets: DatasetsFacade;
+    /**
+     * Run experiments on LangWatch platform or via SDK.
+     *
+     * Platform experiments (CI/CD):
+     * ```typescript
+     * const result = await langwatch.experiments.run("my-experiment-slug");
+     * result.printSummary();
+     * ```
+     *
+     * SDK-defined experiments:
+     * ```typescript
+     * const experiment = await langwatch.experiments.init("my-experiment");
+     * // ... run evaluators using experiment.evaluate()
+     * ```
+     */
+    readonly experiments: ExperimentsFacade;
+    /**
+     * Run evaluators and guardrails in real-time (Online Evaluations).
+     *
+     * @example
+     * ```typescript
+     * // Run a guardrail
+     * const guardrail = await langwatch.evaluations.evaluate("presidio/pii_detection", {
+     *   data: { input: userInput, output: generatedResponse },
+     *   name: "PII Detection",
+     *   asGuardrail: true,
+     * });
+     *
+     * if (!guardrail.passed) {
+     *   return "I'm sorry, I can't do that.";
+     * }
+     * ```
+     */
+    readonly evaluations: EvaluationsFacade;
     constructor(options?: LangWatchConstructorOptions);
     get apiClient(): LangwatchApiClient;
 }
+type EvaluatorResponse = NonNullable<paths["/api/evaluators"]["get"]["responses"]["200"]["content"]["application/json"]>[number];
+type EvaluatorField = EvaluatorResponse["fields"][number];
+/**
+ * Service for retrieving evaluator resources via the LangWatch API.
+ *
+ * Provides read-only access to project evaluators with computed fields.
+ */
+declare class EvaluatorsApiService {
+    private readonly apiClient;
+    constructor(config?: Pick<InternalConfig, "langwatchApiClient">);
+    private handleApiError;
+    /**
+     * Fetches all evaluators for the project.
+     */
+    getAll(): Promise<EvaluatorResponse[]>;
+    /**
+     * Fetches a single evaluator by its ID or slug.
+     */
+    get(idOrSlug: string): Promise<EvaluatorResponse>;
+}
+declare class EvaluatorsApiError extends Error {
+    readonly operation: string;
+    readonly originalError?: unknown | undefined;
+    constructor(message: string, operation: string, originalError?: unknown | undefined);
+}
 declare const logger: {
     ConsoleLogger: typeof ConsoleLogger;
     NoOpLogger: typeof NoOpLogger;
 };
-export { type EvaluateOptions, Evaluation, EvaluationApiError, EvaluationError, EvaluationFacade, EvaluationInitError, type EvaluationInitOptions, type EvaluationResult, type EvaluationStatus, EvaluatorError, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };
+export { type EvaluateOptions, type EvaluationCost, EvaluationError, type EvaluationResult, type EvaluationStatus, EvaluationsApiError, EvaluationsFacade, EvaluatorCallError, EvaluatorError, type EvaluatorField, EvaluatorNotFoundError, type EvaluatorResponse, EvaluatorsApiError, EvaluatorsApiService, Experiment, ExperimentApiError, ExperimentError, type EvaluateOptions$1 as ExperimentEvaluateOptions, type EvaluationResult$1 as ExperimentEvaluationResult, type EvaluationStatus$1 as ExperimentEvaluationStatus, ExperimentInitError, type ExperimentInitOptions, ExperimentsFacade, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };