langwatch 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. package/dist/{add-V2V2U6OX.js → add-UB5U3K3M.js} +11 -11
  2. package/dist/add-UB5U3K3M.js.map +1 -0
  3. package/dist/{add-KNE3HWRY.mjs → add-XV5SUAXF.mjs} +8 -8
  4. package/dist/add-XV5SUAXF.mjs.map +1 -0
  5. package/dist/{chunk-XBHIDR76.mjs → chunk-556ZFJMK.mjs} +3 -3
  6. package/dist/{chunk-CKIZDPIJ.js → chunk-5MQQRSVM.js} +1 -1
  7. package/dist/{chunk-CKIZDPIJ.js.map → chunk-5MQQRSVM.js.map} +1 -1
  8. package/dist/{chunk-SNDTNU3T.js → chunk-ASTAIRXG.js} +2 -2
  9. package/dist/{chunk-SNDTNU3T.js.map → chunk-ASTAIRXG.js.map} +1 -1
  10. package/dist/{chunk-FISQBF2P.js → chunk-D4H6PR6H.js} +153 -76
  11. package/dist/chunk-D4H6PR6H.js.map +1 -0
  12. package/dist/{chunk-WHPBZSTS.mjs → chunk-IIUI2XYW.mjs} +2 -2
  13. package/dist/{chunk-2UTO2QPL.js → chunk-JQYW7RY7.js} +17 -17
  14. package/dist/{chunk-2UTO2QPL.js.map → chunk-JQYW7RY7.js.map} +1 -1
  15. package/dist/{chunk-SMUOMBKY.mjs → chunk-LKE6DMUP.mjs} +2 -2
  16. package/dist/chunk-LKE6DMUP.mjs.map +1 -0
  17. package/dist/{chunk-NQ7TYHRT.js → chunk-N7PJJMU2.js} +2 -2
  18. package/dist/chunk-N7PJJMU2.js.map +1 -0
  19. package/dist/{chunk-A43BYF5Q.js → chunk-ONXIZKC6.js} +11 -11
  20. package/dist/{chunk-A43BYF5Q.js.map → chunk-ONXIZKC6.js.map} +1 -1
  21. package/dist/{chunk-I2SOBPAF.mjs → chunk-RSIPLYVA.mjs} +1 -1
  22. package/dist/{chunk-I2SOBPAF.mjs.map → chunk-RSIPLYVA.mjs.map} +1 -1
  23. package/dist/{chunk-6VUZPNOC.mjs → chunk-WZ7FYUHN.mjs} +139 -62
  24. package/dist/chunk-WZ7FYUHN.mjs.map +1 -0
  25. package/dist/{chunk-FEL5FLHA.mjs → chunk-ZEPKV5YO.mjs} +2 -2
  26. package/dist/cli/index.js +6 -6
  27. package/dist/cli/index.js.map +1 -1
  28. package/dist/cli/index.mjs +6 -6
  29. package/dist/cli/index.mjs.map +1 -1
  30. package/dist/{implementation-TF91Gn0l.d.ts → implementation-CPxv2BdW.d.ts} +1 -1
  31. package/dist/{implementation-pq0g2B5y.d.mts → implementation-CVrmD0bz.d.mts} +1 -1
  32. package/dist/index.d.mts +581 -31
  33. package/dist/index.d.ts +581 -31
  34. package/dist/index.js +977 -17
  35. package/dist/index.js.map +1 -1
  36. package/dist/index.mjs +969 -9
  37. package/dist/index.mjs.map +1 -1
  38. package/dist/{list-VWXENE3O.js → list-DUNP46AD.js} +10 -10
  39. package/dist/{list-VWXENE3O.js.map → list-DUNP46AD.js.map} +1 -1
  40. package/dist/{list-4BFJIHYB.mjs → list-T4QS6CT2.mjs} +7 -7
  41. package/dist/{login-6PM2MUZS.js → login-3H27NIOD.js} +4 -4
  42. package/dist/{login-6PM2MUZS.js.map → login-3H27NIOD.js.map} +1 -1
  43. package/dist/{login-CEO47GSW.mjs → login-T2ET7TKH.mjs} +3 -3
  44. package/dist/login-T2ET7TKH.mjs.map +1 -0
  45. package/dist/observability-sdk/index.d.mts +3 -3
  46. package/dist/observability-sdk/index.d.ts +3 -3
  47. package/dist/observability-sdk/index.js +6 -6
  48. package/dist/observability-sdk/index.js.map +1 -1
  49. package/dist/observability-sdk/index.mjs +10 -10
  50. package/dist/observability-sdk/instrumentation/langchain/index.d.mts +1 -1
  51. package/dist/observability-sdk/instrumentation/langchain/index.d.ts +1 -1
  52. package/dist/observability-sdk/instrumentation/langchain/index.js +16 -16
  53. package/dist/observability-sdk/instrumentation/langchain/index.mjs +2 -2
  54. package/dist/observability-sdk/setup/node/index.js +14 -14
  55. package/dist/observability-sdk/setup/node/index.js.map +1 -1
  56. package/dist/observability-sdk/setup/node/index.mjs +4 -4
  57. package/dist/observability-sdk/setup/node/index.mjs.map +1 -1
  58. package/dist/{remove-5YFEQXTZ.mjs → remove-F5RM4775.mjs} +7 -7
  59. package/dist/{remove-KVWGJOJY.js → remove-V4JL5Z4U.js} +9 -9
  60. package/dist/{remove-KVWGJOJY.js.map → remove-V4JL5Z4U.js.map} +1 -1
  61. package/dist/{sync-6BHY2J72.js → sync-DIOKWE6R.js} +11 -11
  62. package/dist/sync-DIOKWE6R.js.map +1 -0
  63. package/dist/{sync-3P54PWWR.mjs → sync-VGWOLOLJ.mjs} +9 -9
  64. package/dist/sync-VGWOLOLJ.mjs.map +1 -0
  65. package/dist/{types-DhEYjnRD.d.mts → types-Kts5RGLY.d.mts} +15 -1
  66. package/dist/{types-CAQOMGrf.d.ts → types-usU5mTCX.d.ts} +15 -1
  67. package/package.json +3 -2
  68. package/dist/add-KNE3HWRY.mjs.map +0 -1
  69. package/dist/add-V2V2U6OX.js.map +0 -1
  70. package/dist/chunk-6VUZPNOC.mjs.map +0 -1
  71. package/dist/chunk-FISQBF2P.js.map +0 -1
  72. package/dist/chunk-NQ7TYHRT.js.map +0 -1
  73. package/dist/chunk-SMUOMBKY.mjs.map +0 -1
  74. package/dist/login-CEO47GSW.mjs.map +0 -1
  75. package/dist/sync-3P54PWWR.mjs.map +0 -1
  76. package/dist/sync-6BHY2J72.js.map +0 -1
  77. /package/dist/{chunk-XBHIDR76.mjs.map → chunk-556ZFJMK.mjs.map} +0 -0
  78. /package/dist/{chunk-WHPBZSTS.mjs.map → chunk-IIUI2XYW.mjs.map} +0 -0
  79. /package/dist/{chunk-FEL5FLHA.mjs.map → chunk-ZEPKV5YO.mjs.map} +0 -0
  80. /package/dist/{list-4BFJIHYB.mjs.map → list-T4QS6CT2.mjs.map} +0 -0
  81. /package/dist/{remove-5YFEQXTZ.mjs.map → remove-F5RM4775.mjs.map} +0 -0
package/dist/index.d.mts CHANGED
@@ -1,9 +1,9 @@
1
1
  import { L as Logger, C as ConsoleLogger, N as NoOpLogger } from './index-D7rKIGrO.mjs';
2
- export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-pq0g2B5y.mjs';
3
- export { l as attributes } from './types-DRiQaKFG.mjs';
4
- import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt } from './types-DhEYjnRD.mjs';
2
+ export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-CVrmD0bz.mjs';
3
+ import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-Kts5RGLY.mjs';
5
4
  import openApiCreateClient from 'openapi-fetch';
6
5
  import { z } from 'zod';
6
+ export { l as attributes } from './types-DRiQaKFG.mjs';
7
7
  import '@opentelemetry/sdk-trace-base';
8
8
  import '@opentelemetry/exporter-trace-otlp-http';
9
9
  import '@opentelemetry/core';
@@ -53,23 +53,23 @@ declare class PromptsApiService {
53
53
  private readonly apiClient;
54
54
  constructor(config?: Pick<InternalConfig, "langwatchApiClient">);
55
55
  /**
56
- * Handles API errors by throwing a PromptsError with operation context.
56
+ * Handles API errors by throwing a PromptsApiError with operation context.
57
57
  * @param operation Description of the operation being performed.
58
58
  * @param error The error object returned from the API client.
59
- * @throws {PromptsError}
59
+ * @throws {PromptsApiError}
60
60
  */
61
61
  private handleApiError;
62
62
  /**
63
63
  * Fetches all prompts from the API.
64
64
  * @returns Array of raw PromptResponse data.
65
- * @throws {PromptsError} If the API call fails.
65
+ * @throws {PromptsApiError} If the API call fails.
66
66
  */
67
67
  getAll(): Promise<PromptResponse[]>;
68
68
  /**
69
69
  * Fetches a single prompt by its ID.
70
70
  * @param id The prompt's unique identifier.
71
71
  * @returns Raw PromptResponse data.
72
- * @throws {PromptsError} If the API call fails.
72
+ * @throws {PromptsApiError} If the API call fails.
73
73
  */
74
74
  get: (id: string, options?: {
75
75
  version?: string;
@@ -78,14 +78,14 @@ declare class PromptsApiService {
78
78
  * Validates if a prompt exists.
79
79
  * @param id The prompt's unique identifier.
80
80
  * @returns True if prompt exists, false otherwise.
81
- * @throws {PromptsError} If the API call fails (not 404).
81
+ * @throws {PromptsApiError} If the API call fails (not 404).
82
82
  */
83
83
  exists(id: string): Promise<boolean>;
84
84
  /**
85
85
  * Creates a new prompt.
86
86
  * @param params The prompt creation payload, matching the OpenAPI schema.
87
87
  * @returns Raw PromptResponse data of the created prompt.
88
- * @throws {PromptsError} If the API call fails.
88
+ * @throws {PromptsApiError} If the API call fails.
89
89
  */
90
90
  create(params: CreatePromptBody): Promise<PromptResponse>;
91
91
  /**
@@ -93,13 +93,13 @@ declare class PromptsApiService {
93
93
  * @param id The prompt's unique identifier.
94
94
  * @param params The update payload, matching the OpenAPI schema.
95
95
  * @returns Raw PromptResponse data of the updated prompt.
96
- * @throws {PromptsError} If the API call fails.
96
+ * @throws {PromptsApiError} If the API call fails.
97
97
  */
98
98
  update(id: string, params: UpdatePromptBody): Promise<PromptResponse>;
99
99
  /**
100
100
  * Deletes a prompt by its ID.
101
101
  * @param id The prompt's unique identifier.
102
- * @throws {PromptsError} If the API call fails.
102
+ * @throws {PromptsApiError} If the API call fails.
103
103
  */
104
104
  delete(id: string): Promise<{
105
105
  success: boolean;
@@ -108,7 +108,7 @@ declare class PromptsApiService {
108
108
  * Fetches all versions for a given prompt.
109
109
  * @param id The prompt's unique identifier.
110
110
  * @returns Array of raw PromptResponse data for each version.
111
- * @throws {PromptsError} If the API call fails.
111
+ * @throws {PromptsApiError} If the API call fails.
112
112
  */
113
113
  getVersions(id: string): Promise<PromptResponse[]>;
114
114
  /**
@@ -116,7 +116,7 @@ declare class PromptsApiService {
116
116
  * @param handle The prompt's handle/identifier.
117
117
  * @param config Local prompt configuration.
118
118
  * @returns Object with created flag and raw PromptResponse data.
119
- * @throws {PromptsError} If the API call fails.
119
+ * @throws {PromptsApiError} If the API call fails.
120
120
  */
121
121
  upsert(handle: string, config: {
122
122
  model: string;
@@ -134,6 +134,7 @@ declare class PromptsApiService {
134
134
  }>;
135
135
  /**
136
136
  * Sync a prompt with local content, handling conflicts and version management
137
+ * You probably don't need to use this method directly.
137
138
  */
138
139
  sync(params: {
139
140
  name: string;
@@ -277,6 +278,17 @@ declare class LocalPromptsService {
277
278
  private convertToPromptData;
278
279
  }
279
280
 
281
+ /**
282
+ * Options for fetching a prompt.
283
+ */
284
+ interface GetPromptOptions {
285
+ /** Specific version to fetch */
286
+ version?: string;
287
+ /** Fetch policy to use */
288
+ fetchPolicy?: FetchPolicy;
289
+ /** Cache TTL in minutes (only used with CACHE_TTL policy) */
290
+ cacheTtlMinutes?: number;
291
+ }
280
292
  interface PromptsFacadeDependencies {
281
293
  promptsApiService: PromptsApiService;
282
294
  localPromptsService: LocalPromptsService;
@@ -285,9 +297,10 @@ interface PromptsFacadeDependencies {
285
297
  * Facade for prompt operations in the LangWatch SDK.
286
298
  * Provides a simplified interface for common prompt management tasks.
287
299
  */
288
- declare class PromptsFacade {
300
+ declare class PromptsFacade implements Pick<PromptsApiService, "sync" | "delete"> {
289
301
  private readonly promptsApiService;
290
302
  private readonly localPromptsService;
303
+ private readonly cache;
291
304
  constructor(config: InternalConfig & PromptsFacadeDependencies);
292
305
  /**
293
306
  * Creates a new prompt.
@@ -303,9 +316,15 @@ declare class PromptsFacade {
303
316
  * @returns The Prompt instance.
304
317
  * @throws {PromptsError} If the prompt is not found or the API call fails.
305
318
  */
306
- get(handleOrId: string, options?: {
307
- version?: string;
308
- }): Promise<Prompt>;
319
+ get(handleOrId: string, options?: GetPromptOptions): Promise<Prompt>;
320
+ private getMaterializedFirst;
321
+ private getAlwaysFetch;
322
+ private getMaterializedOnly;
323
+ /**
324
+ * Builds a cache key that includes both handle and version to prevent collisions.
325
+ */
326
+ private buildCacheKey;
327
+ private getCacheTtl;
309
328
  /**
310
329
  * Retrieves all prompts.
311
330
  * @returns Array of Prompt instances.
@@ -320,26 +339,555 @@ declare class PromptsFacade {
320
339
  * @throws {PromptsError} If the API call fails.
321
340
  */
322
341
  update(handleOrId: string, newData: UpdatePromptBody): Promise<Prompt>;
323
- /**
324
- * Deletes a prompt by handle or ID.
325
- * @param handleOrId The prompt's handle or unique identifier.
326
- * @throws {PromptsError} If the API call fails.
327
- */
328
- delete(handleOrId: string): Promise<{
342
+ get delete(): (id: string) => Promise<{
329
343
  success: boolean;
330
344
  }>;
331
345
  /**
332
- * Syncs a prompt with the server.
333
- * @param params The sync parameters.
334
- * @returns The sync result.
335
- * @throws {PromptsError} If the API call fails.
346
+ * Delegated method to the prompts API service.
336
347
  */
337
- sync(params: {
348
+ get sync(): (params: {
338
349
  name: string;
339
- configData: any;
350
+ configData: ConfigData;
340
351
  localVersion?: number;
341
352
  commitMessage?: string;
342
- }): Promise<SyncResult>;
353
+ }) => Promise<SyncResult>;
354
+ }
355
+
356
+ /**
357
+ * Types for the Dataset API
358
+ */
359
+ /**
360
+ * A single entry in a dataset
361
+ */
362
+ type DatasetEntry<T extends Record<string, unknown> = Record<string, unknown>> = {
363
+ /** Unique identifier for this entry */
364
+ id: string;
365
+ /** The dataset this entry belongs to */
366
+ datasetId: string;
367
+ /** The project this entry belongs to */
368
+ projectId: string;
369
+ /** The actual data for this entry */
370
+ entry: T;
371
+ /** When this entry was created */
372
+ createdAt: string;
373
+ /** When this entry was last updated */
374
+ updatedAt: string;
375
+ };
376
+ /**
377
+ * A dataset containing multiple entries
378
+ */
379
+ type Dataset<T extends Record<string, unknown> = Record<string, unknown>> = {
380
+ /** Array of dataset entries */
381
+ entries: DatasetEntry<T>[];
382
+ };
383
+ /**
384
+ * Options for getting a dataset
385
+ */
386
+ type GetDatasetOptions = {
387
+ /** Skip tracing for this operation */
388
+ ignoreTracing?: boolean;
389
+ };
390
+
391
+ type DatasetsFacadeConfig = {
392
+ langwatchApiClient: LangwatchApiClient;
393
+ logger: Logger;
394
+ };
395
+ /**
396
+ * Facade for dataset operations
397
+ *
398
+ * Provides a simple interface for fetching datasets from LangWatch.
399
+ *
400
+ * @example
401
+ * ```typescript
402
+ * const langwatch = new LangWatch({ apiKey: "your-api-key" });
403
+ *
404
+ * // Get a dataset by slug or ID
405
+ * const dataset = await langwatch.datasets.get("my-dataset");
406
+ *
407
+ * // Use with evaluation
408
+ * const evaluation = langwatch.evaluation.init("my-experiment");
409
+ * await evaluation.run(dataset.entries.map(e => e.entry), async ({ item, index }) => {
410
+ * const output = await myLLM(item.input);
411
+ * await evaluation.evaluate("my-evaluator", {
412
+ * data: { input: item.input, output, expected_output: item.expected_output },
413
+ * settings: {}
414
+ * });
415
+ * });
416
+ * ```
417
+ */
418
+ declare class DatasetsFacade {
419
+ #private;
420
+ constructor(config: DatasetsFacadeConfig);
421
+ /**
422
+ * Fetches a dataset by its slug or ID
423
+ *
424
+ * @param slugOrId - The slug or ID of the dataset to fetch
425
+ * @param options - Optional configuration
426
+ * @returns The dataset with all entries
427
+ *
428
+ * @example
429
+ * ```typescript
430
+ * // Get dataset by slug
431
+ * const dataset = await langwatch.datasets.get("product-qa");
432
+ *
433
+ * // Get dataset by ID
434
+ * const dataset = await langwatch.datasets.get("ds_abc123");
435
+ *
436
+ * // Typed dataset
437
+ * type MyDatasetEntry = { input: string; expected_output: string; };
438
+ * const dataset = await langwatch.datasets.get<MyDatasetEntry>("my-dataset");
439
+ *
440
+ * // Iterate over entries
441
+ * for (const entry of dataset.entries) {
442
+ * console.log(entry.entry.input); // typed as string
443
+ * }
444
+ * ```
445
+ */
446
+ get: <T extends Record<string, unknown> = Record<string, unknown>>(slugOrId: string, options?: GetDatasetOptions) => Promise<Dataset<T>>;
447
+ }
448
+
449
+ /**
450
+ * Types for the Evaluation API
451
+ *
452
+ * These types define the structure for batch evaluations, including
453
+ * logging metrics, running evaluators, and managing targets.
454
+ */
455
+
456
+ /**
457
+ * Status of an evaluation result
458
+ */
459
+ type EvaluationStatus = "processed" | "error" | "skipped";
460
+ /**
461
+ * Target types for batch evaluations
462
+ */
463
+ type TargetType = "prompt" | "agent" | "custom";
464
+ /**
465
+ * Metadata for targets - used for comparison charts
466
+ */
467
+ type TargetMetadata = Record<string, string | number | boolean>;
468
+ declare const targetInfoSchema: z.ZodObject<{
469
+ id: z.ZodString;
470
+ name: z.ZodString;
471
+ type: z.ZodDefault<z.ZodEnum<{
472
+ agent: "agent";
473
+ custom: "custom";
474
+ prompt: "prompt";
475
+ }>>;
476
+ metadata: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodNumber, z.ZodBoolean]>>>>;
477
+ }, z.core.$strip>;
478
+ declare const evaluationResultSchema: z.ZodObject<{
479
+ name: z.ZodString;
480
+ evaluator: z.ZodString;
481
+ trace_id: z.ZodString;
482
+ status: z.ZodEnum<{
483
+ error: "error";
484
+ processed: "processed";
485
+ skipped: "skipped";
486
+ }>;
487
+ data: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
488
+ score: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
489
+ passed: z.ZodOptional<z.ZodNullable<z.ZodBoolean>>;
490
+ details: z.ZodOptional<z.ZodNullable<z.ZodString>>;
491
+ index: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
492
+ label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
493
+ cost: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
494
+ duration: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
495
+ error_type: z.ZodOptional<z.ZodNullable<z.ZodString>>;
496
+ traceback: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodString>>>;
497
+ target_id: z.ZodOptional<z.ZodNullable<z.ZodString>>;
498
+ }, z.core.$strip>;
499
+ /**
500
+ * Information about a registered target
501
+ */
502
+ type TargetInfo = z.infer<typeof targetInfoSchema>;
503
+ /**
504
+ * Result of an evaluation
505
+ */
506
+ type EvaluationResult = z.infer<typeof evaluationResultSchema>;
507
+ /**
508
+ * Options for initializing an evaluation
509
+ */
510
+ type EvaluationInitOptions = {
511
+ /** Custom run ID (auto-generated if not provided) */
512
+ runId?: string;
513
+ /** Number of parallel threads for submit() */
514
+ threads?: number;
515
+ };
516
+ /**
517
+ * Options for the log() method
518
+ */
519
+ type LogOptions = {
520
+ /**
521
+ * Row index in the dataset.
522
+ * Optional when called inside withTarget() - will be auto-inferred from context.
523
+ */
524
+ index?: number;
525
+ /** Additional data/inputs for the evaluation */
526
+ data?: Record<string, unknown>;
527
+ /** Numeric score (typically 0-1) */
528
+ score?: number;
529
+ /** Whether the evaluation passed */
530
+ passed?: boolean;
531
+ /** Label/category for the result */
532
+ label?: string;
533
+ /** Human-readable description of the result */
534
+ details?: string;
535
+ /** Status of the evaluation */
536
+ status?: EvaluationStatus;
537
+ /** Duration in milliseconds */
538
+ duration?: number;
539
+ /** Cost amount in USD */
540
+ cost?: number;
541
+ /** Error if one occurred */
542
+ error?: Error;
543
+ /**
544
+ * Target name for multi-target comparisons.
545
+ * Optional when called inside withTarget() - will be auto-inferred from context.
546
+ */
547
+ target?: string;
548
+ /** Metadata for the target (only used on first call per target) */
549
+ metadata?: TargetMetadata;
550
+ };
551
+ /**
552
+ * Options for the evaluate() method (built-in evaluators)
553
+ */
554
+ type EvaluateOptions = {
555
+ /**
556
+ * Row index in the dataset.
557
+ * Optional when called inside withTarget() - will be auto-inferred from context.
558
+ */
559
+ index?: number;
560
+ /** Data to pass to the evaluator */
561
+ data: Record<string, unknown>;
562
+ /** Evaluator settings */
563
+ settings?: Record<string, unknown>;
564
+ /** Human-readable name for the evaluation */
565
+ name?: string;
566
+ /** Whether to run as a guardrail */
567
+ asGuardrail?: boolean;
568
+ /**
569
+ * Target name for multi-target comparisons.
570
+ * Optional when called inside withTarget() - will be auto-inferred from context.
571
+ */
572
+ target?: string;
573
+ /** Metadata for the target */
574
+ metadata?: TargetMetadata;
575
+ };
576
+ /**
577
+ * Context passed to the run() callback
578
+ */
579
+ type RunContext<T> = {
580
+ /** Current index in the dataset */
581
+ index: number;
582
+ /** The dataset item */
583
+ item: T;
584
+ /** The span for this iteration (for custom instrumentation) */
585
+ span: LangWatchSpan;
586
+ };
587
+ /**
588
+ * Options for the run() method
589
+ */
590
+ type RunOptions = {
591
+ /** Number of concurrent executions (default: 4) */
592
+ concurrency?: number;
593
+ };
594
+ /**
595
+ * Callback function for run()
596
+ */
597
+ type RunCallback<T> = (context: RunContext<T>) => Promise<void> | void;
598
+ /**
599
+ * Context passed to the withTarget() callback
600
+ */
601
+ type TargetContext = {
602
+ /** The LangWatch span for this target execution */
603
+ span: LangWatchSpan;
604
+ /** The trace ID for this target execution */
605
+ traceId: string;
606
+ /** The span ID for this target execution */
607
+ spanId: string;
608
+ };
609
+ /**
610
+ * Callback function for withTarget()
611
+ */
612
+ type TargetCallback<R> = (context: TargetContext) => Promise<R> | R;
613
+ /**
614
+ * Result from withTarget() including captured metrics
615
+ */
616
+ type TargetResult<R> = {
617
+ /** The return value from the callback */
618
+ result: R;
619
+ /** Duration in milliseconds (automatically captured) */
620
+ duration: number;
621
+ /** Cost in USD (captured from span if available) */
622
+ cost?: number;
623
+ /** The trace ID for this execution */
624
+ traceId: string;
625
+ /** The span ID for this execution */
626
+ spanId: string;
627
+ };
628
+
629
+ /**
630
+ * Evaluation - Main class for running batch evaluations
631
+ *
632
+ * Provides a clean API for running evaluations over datasets with:
633
+ * - Automatic tracing per iteration
634
+ * - Parallel execution with concurrency control
635
+ * - Batched result sending
636
+ * - Built-in evaluator support
637
+ * - Multi-target comparison with withTarget() context isolation
638
+ */
639
+
640
+ /**
641
+ * Evaluation session for running batch evaluations
642
+ */
643
+ declare class Evaluation {
644
+ readonly name: string;
645
+ readonly runId: string;
646
+ readonly experimentSlug: string;
647
+ private readonly apiClient;
648
+ private readonly endpoint;
649
+ private readonly apiKey;
650
+ private readonly logger;
651
+ private readonly concurrency;
652
+ private initialized;
653
+ private createdAtMs;
654
+ private total;
655
+ private progress;
656
+ private batch;
657
+ private lastSentMs;
658
+ private pendingFlush;
659
+ private flushTimeout;
660
+ private targets;
661
+ private currentTraceId;
662
+ private currentIndex;
663
+ private iterationUsedWithTarget;
664
+ private evaluationUsesTargets;
665
+ private constructor();
666
+ /**
667
+ * Initialize an evaluation session
668
+ */
669
+ static init(name: string, options: {
670
+ apiClient: LangwatchApiClient;
671
+ endpoint: string;
672
+ apiKey: string;
673
+ logger: Logger;
674
+ } & EvaluationInitOptions): Promise<Evaluation>;
675
+ /**
676
+ * Initialize the evaluation by creating/getting the experiment
677
+ */
678
+ private initialize;
679
+ /**
680
+ * Run evaluation over a dataset with a callback
681
+ *
682
+ * @param dataset - Array of items to evaluate
683
+ * @param callback - Function called for each item with { item, index, span }
684
+ * @param options - Concurrency options
685
+ *
686
+ * @example
687
+ * ```typescript
688
+ * await evaluation.run(dataset, async ({ item, index, span }) => {
689
+ * const response = await myAgent(item.question);
690
+ * evaluation.log('accuracy', { index, score: 0.95 });
691
+ * }, { concurrency: 4 });
692
+ * ```
693
+ */
694
+ run<T>(dataset: T[], callback: RunCallback<T>, options?: RunOptions): Promise<void>;
695
+ /**
696
+ * Execute a single item in the dataset
697
+ */
698
+ private executeItem;
699
+ /**
700
+ * Log a custom metric result
701
+ *
702
+ * @param metric - Name of the metric
703
+ * @param options - Metric options including index, score, passed, etc.
704
+ *
705
+ * If called inside a withTarget() block, the target and index are automatically
706
+ * inferred from the context and don't need to be specified.
707
+ *
708
+ * @example
709
+ * ```typescript
710
+ * // Explicit target (outside withTarget)
711
+ * evaluation.log('accuracy', { index, score: 0.95, target: 'gpt-4' });
712
+ *
713
+ * // Implicit target (inside withTarget)
714
+ * await evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
715
+ * evaluation.log('accuracy', { score: 0.95 }); // target and index auto-inferred
716
+ * });
717
+ * ```
718
+ */
719
+ log(metric: string, options: LogOptions): void;
720
+ /**
721
+ * Run a built-in evaluator
722
+ *
723
+ * @param evaluatorSlug - The evaluator identifier (e.g., 'ragas/faithfulness')
724
+ * @param options - Evaluator options including data and settings
725
+ *
726
+ * If called inside a withTarget() block, the target and index are automatically
727
+ * inferred from the context and don't need to be specified.
728
+ *
729
+ * @example
730
+ * ```typescript
731
+ * // Inside withTarget() - target and index auto-inferred
732
+ * await evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
733
+ * await evaluation.evaluate('ragas/faithfulness', {
734
+ * data: { input, output, contexts },
735
+ * });
736
+ * });
737
+ *
738
+ * // Or explicit index/target
739
+ * await evaluation.evaluate('ragas/faithfulness', {
740
+ * index,
741
+ * data: { input, output, contexts },
742
+ * target: 'gpt-4',
743
+ * });
744
+ * ```
745
+ */
746
+ evaluate(evaluatorSlug: string, options: EvaluateOptions): Promise<void>;
747
+ /**
748
+ * Execute code within a target context with automatic tracing
749
+ *
750
+ * Creates a new span for this target execution and sets up context
751
+ * so that log() calls inside the callback automatically use this target.
752
+ * Duration and output are captured automatically.
753
+ *
754
+ * This creates a dataset entry per target (like Evaluations V3), enabling
755
+ * proper per-target latency and cost tracking.
756
+ *
757
+ * @param targetName - Unique identifier for the target
758
+ * @param metadata - Optional metadata for comparison (e.g., { model: 'gpt-4' })
759
+ * @param callback - Function to execute within the target context
760
+ * @returns The callback result along with captured metrics
761
+ *
762
+ * @example
763
+ * ```typescript
764
+ * await evaluation.run(dataset, async ({ item, index }) => {
765
+ * // Compare GPT-4 and Claude on the same input
766
+ * const [gpt4Result, claudeResult] = await Promise.all([
767
+ * evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
768
+ * const response = await openai.chat(item.question);
769
+ * evaluation.log('quality', { score: 0.95 }); // target auto-inferred
770
+ * return response;
771
+ * }),
772
+ * evaluation.withTarget('claude-3', { model: 'anthropic/claude-3' }, async () => {
773
+ * const response = await anthropic.messages(item.question);
774
+ * evaluation.log('quality', { score: 0.85 }); // target auto-inferred
775
+ * return response;
776
+ * }),
777
+ * ]);
778
+ * });
779
+ * ```
780
+ */
781
+ withTarget<R>(targetName: string, metadata: TargetMetadata | null, callback: TargetCallback<R>): Promise<TargetResult<R>>;
782
+ withTarget<R>(targetName: string, callback: TargetCallback<R>): Promise<TargetResult<R>>;
783
+ /**
784
+ * Register a target for multi-target comparison
785
+ */
786
+ private registerTarget;
787
+ /**
788
+ * Schedule a debounced send
789
+ */
790
+ private scheduleSend;
791
+ /**
792
+ * Send current batch to the API
793
+ */
794
+ private sendBatch;
795
+ /**
796
+ * Flush all pending data
797
+ */
798
+ private flush;
799
+ /**
800
+ * Serialize a dataset item for the API
801
+ */
802
+ private serializeItem;
803
+ /**
804
+ * Get trace ID from current OpenTelemetry context
805
+ */
806
+ private getTraceIdFromContext;
807
+ /**
808
+ * Get span ID from current OpenTelemetry context
809
+ */
810
+ private getSpanIdFromContext;
811
+ }
812
+
813
+ /**
814
+ * EvaluationFacade - Entry point for the evaluation API
815
+ *
816
+ * Provides the `init()` method to create evaluation sessions.
817
+ */
818
+
819
+ type EvaluationFacadeConfig = {
820
+ langwatchApiClient: LangwatchApiClient;
821
+ endpoint: string;
822
+ apiKey: string;
823
+ logger: Logger;
824
+ };
825
+ /**
826
+ * Facade for creating evaluation sessions
827
+ */
828
+ declare class EvaluationFacade {
829
+ private readonly config;
830
+ constructor(config: EvaluationFacadeConfig);
831
+ /**
832
+ * Initialize a new evaluation session
833
+ *
834
+ * @param name - Name of the experiment (used as slug)
835
+ * @param options - Optional configuration
836
+ * @returns An initialized Evaluation instance
837
+ *
838
+ * @example
839
+ * ```typescript
840
+ * const evaluation = await langwatch.evaluation.init('my-experiment');
841
+ *
842
+ * await evaluation.run(dataset, async ({ item, index }) => {
843
+ * const response = await myAgent(item.question);
844
+ * evaluation.log('accuracy', { index, score: 0.95 });
845
+ * });
846
+ * ```
847
+ */
848
+ init(name: string, options?: EvaluationInitOptions): Promise<Evaluation>;
849
+ }
850
+
851
+ /**
852
+ * Errors for the Evaluation API
853
+ */
854
+ /**
855
+ * Base error for evaluation-related issues
856
+ */
857
+ declare class EvaluationError extends Error {
858
+ constructor(message: string);
859
+ }
860
+ /**
861
+ * Thrown when initialization fails
862
+ */
863
+ declare class EvaluationInitError extends EvaluationError {
864
+ readonly cause?: Error | undefined;
865
+ constructor(message: string, cause?: Error | undefined);
866
+ }
867
+ /**
868
+ * Thrown when API calls fail
869
+ */
870
+ declare class EvaluationApiError extends EvaluationError {
871
+ readonly statusCode?: number | undefined;
872
+ readonly cause?: Error | undefined;
873
+ constructor(message: string, statusCode?: number | undefined, cause?: Error | undefined);
874
+ }
875
+ /**
876
+ * Thrown when target metadata conflicts
877
+ */
878
+ declare class TargetMetadataConflictError extends EvaluationError {
879
+ readonly targetName: string;
880
+ readonly existingMetadata: Record<string, unknown>;
881
+ readonly newMetadata: Record<string, unknown>;
882
+ constructor(targetName: string, existingMetadata: Record<string, unknown>, newMetadata: Record<string, unknown>);
883
+ }
884
+ /**
885
+ * Thrown when an evaluator call fails
886
+ */
887
+ declare class EvaluatorError extends EvaluationError {
888
+ readonly evaluatorSlug: string;
889
+ readonly cause?: Error | undefined;
890
+ constructor(evaluatorSlug: string, message: string, cause?: Error | undefined);
343
891
  }
344
892
 
345
893
  interface GetTraceParams {
@@ -365,6 +913,8 @@ declare class LangWatch {
365
913
  private readonly config;
366
914
  readonly prompts: PromptsFacade;
367
915
  readonly traces: TracesFacade;
916
+ readonly evaluation: EvaluationFacade;
917
+ readonly datasets: DatasetsFacade;
368
918
  constructor(options?: LangWatchConstructorOptions);
369
919
  get apiClient(): LangwatchApiClient;
370
920
  }
@@ -374,4 +924,4 @@ declare const logger: {
374
924
  NoOpLogger: typeof NoOpLogger;
375
925
  };
376
926
 
377
- export { LangWatch, logger };
927
+ export { type EvaluateOptions, Evaluation, EvaluationApiError, EvaluationError, EvaluationFacade, EvaluationInitError, type EvaluationInitOptions, type EvaluationResult, type EvaluationStatus, EvaluatorError, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };