langwatch 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{add-CXUS4ZSQ.js → add-UB5U3K3M.js} +11 -11
- package/dist/add-UB5U3K3M.js.map +1 -0
- package/dist/{add-5FRWEQ32.mjs → add-XV5SUAXF.mjs} +8 -8
- package/dist/add-XV5SUAXF.mjs.map +1 -0
- package/dist/{chunk-VGVWXKVM.mjs → chunk-556ZFJMK.mjs} +3 -3
- package/dist/{chunk-CKIZDPIJ.js → chunk-5MQQRSVM.js} +1 -1
- package/dist/{chunk-CKIZDPIJ.js.map → chunk-5MQQRSVM.js.map} +1 -1
- package/dist/{chunk-SNDTNU3T.js → chunk-ASTAIRXG.js} +2 -2
- package/dist/{chunk-SNDTNU3T.js.map → chunk-ASTAIRXG.js.map} +1 -1
- package/dist/{chunk-NM5OKM7F.js → chunk-D4H6PR6H.js} +21 -20
- package/dist/chunk-D4H6PR6H.js.map +1 -0
- package/dist/{chunk-WHPBZSTS.mjs → chunk-IIUI2XYW.mjs} +2 -2
- package/dist/{chunk-BTCJWUS5.js → chunk-JQYW7RY7.js} +17 -17
- package/dist/{chunk-BTCJWUS5.js.map → chunk-JQYW7RY7.js.map} +1 -1
- package/dist/{chunk-W6FD5ZO3.mjs → chunk-LKE6DMUP.mjs} +2 -2
- package/dist/{chunk-W6FD5ZO3.mjs.map → chunk-LKE6DMUP.mjs.map} +1 -1
- package/dist/{chunk-YWO3NE5A.js → chunk-N7PJJMU2.js} +2 -2
- package/dist/{chunk-YWO3NE5A.js.map → chunk-N7PJJMU2.js.map} +1 -1
- package/dist/{chunk-A43BYF5Q.js → chunk-ONXIZKC6.js} +11 -11
- package/dist/{chunk-A43BYF5Q.js.map → chunk-ONXIZKC6.js.map} +1 -1
- package/dist/{chunk-I2SOBPAF.mjs → chunk-RSIPLYVA.mjs} +1 -1
- package/dist/{chunk-I2SOBPAF.mjs.map → chunk-RSIPLYVA.mjs.map} +1 -1
- package/dist/{chunk-I3X7VMSP.mjs → chunk-WZ7FYUHN.mjs} +8 -7
- package/dist/chunk-WZ7FYUHN.mjs.map +1 -0
- package/dist/{chunk-FEL5FLHA.mjs → chunk-ZEPKV5YO.mjs} +2 -2
- package/dist/cli/index.js +6 -6
- package/dist/cli/index.mjs +6 -6
- package/dist/index.d.mts +541 -2
- package/dist/index.d.ts +541 -2
- package/dist/index.js +975 -17
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +967 -9
- package/dist/index.mjs.map +1 -1
- package/dist/{list-K6E3OGYH.js → list-DUNP46AD.js} +10 -10
- package/dist/{list-K6E3OGYH.js.map → list-DUNP46AD.js.map} +1 -1
- package/dist/{list-DQ6XLQCK.mjs → list-T4QS6CT2.mjs} +7 -7
- package/dist/{login-HX7WMLPL.js → login-3H27NIOD.js} +4 -4
- package/dist/{login-HX7WMLPL.js.map → login-3H27NIOD.js.map} +1 -1
- package/dist/{login-TJ2NCUAJ.mjs → login-T2ET7TKH.mjs} +3 -3
- package/dist/login-T2ET7TKH.mjs.map +1 -0
- package/dist/observability-sdk/index.js +6 -6
- package/dist/observability-sdk/index.js.map +1 -1
- package/dist/observability-sdk/index.mjs +10 -10
- package/dist/observability-sdk/instrumentation/langchain/index.js +16 -16
- package/dist/observability-sdk/instrumentation/langchain/index.mjs +2 -2
- package/dist/observability-sdk/setup/node/index.js +13 -13
- package/dist/observability-sdk/setup/node/index.mjs +3 -3
- package/dist/{remove-5ZOYQTF4.mjs → remove-F5RM4775.mjs} +7 -7
- package/dist/{remove-45A7WUCB.js → remove-V4JL5Z4U.js} +9 -9
- package/dist/{remove-45A7WUCB.js.map → remove-V4JL5Z4U.js.map} +1 -1
- package/dist/{sync-BE7XZC2A.js → sync-DIOKWE6R.js} +11 -11
- package/dist/sync-DIOKWE6R.js.map +1 -0
- package/dist/{sync-LL6TTFMS.mjs → sync-VGWOLOLJ.mjs} +9 -9
- package/dist/sync-VGWOLOLJ.mjs.map +1 -0
- package/package.json +125 -142
- package/dist/add-5FRWEQ32.mjs.map +0 -1
- package/dist/add-CXUS4ZSQ.js.map +0 -1
- package/dist/chunk-I3X7VMSP.mjs.map +0 -1
- package/dist/chunk-NM5OKM7F.js.map +0 -1
- package/dist/login-TJ2NCUAJ.mjs.map +0 -1
- package/dist/sync-BE7XZC2A.js.map +0 -1
- package/dist/sync-LL6TTFMS.mjs.map +0 -1
- /package/dist/{chunk-VGVWXKVM.mjs.map → chunk-556ZFJMK.mjs.map} +0 -0
- /package/dist/{chunk-WHPBZSTS.mjs.map → chunk-IIUI2XYW.mjs.map} +0 -0
- /package/dist/{chunk-FEL5FLHA.mjs.map → chunk-ZEPKV5YO.mjs.map} +0 -0
- /package/dist/{list-DQ6XLQCK.mjs.map → list-T4QS6CT2.mjs.map} +0 -0
- /package/dist/{remove-5ZOYQTF4.mjs.map → remove-F5RM4775.mjs.map} +0 -0
package/dist/index.d.mts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { L as Logger, C as ConsoleLogger, N as NoOpLogger } from './index-D7rKIGrO.mjs';
|
|
2
2
|
export { F as FilterableBatchSpanProcessor, L as LangWatchExporter, S as SpanProcessingExcludeRule, g as getLangWatchLogger, d as getLangWatchTracer } from './implementation-CVrmD0bz.mjs';
|
|
3
|
-
import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy } from './types-Kts5RGLY.mjs';
|
|
3
|
+
import { p as paths, P as PromptResponse, g as CreatePromptBody, U as UpdatePromptBody, h as PromptData, i as Prompt, F as FetchPolicy, L as LangWatchSpan } from './types-Kts5RGLY.mjs';
|
|
4
4
|
import openApiCreateClient from 'openapi-fetch';
|
|
5
5
|
import { z } from 'zod';
|
|
6
6
|
export { l as attributes } from './types-DRiQaKFG.mjs';
|
|
@@ -353,6 +353,543 @@ declare class PromptsFacade implements Pick<PromptsApiService, "sync" | "delete"
|
|
|
353
353
|
}) => Promise<SyncResult>;
|
|
354
354
|
}
|
|
355
355
|
|
|
356
|
+
/**
|
|
357
|
+
* Types for the Dataset API
|
|
358
|
+
*/
|
|
359
|
+
/**
|
|
360
|
+
* A single entry in a dataset
|
|
361
|
+
*/
|
|
362
|
+
type DatasetEntry<T extends Record<string, unknown> = Record<string, unknown>> = {
|
|
363
|
+
/** Unique identifier for this entry */
|
|
364
|
+
id: string;
|
|
365
|
+
/** The dataset this entry belongs to */
|
|
366
|
+
datasetId: string;
|
|
367
|
+
/** The project this entry belongs to */
|
|
368
|
+
projectId: string;
|
|
369
|
+
/** The actual data for this entry */
|
|
370
|
+
entry: T;
|
|
371
|
+
/** When this entry was created */
|
|
372
|
+
createdAt: string;
|
|
373
|
+
/** When this entry was last updated */
|
|
374
|
+
updatedAt: string;
|
|
375
|
+
};
|
|
376
|
+
/**
|
|
377
|
+
* A dataset containing multiple entries
|
|
378
|
+
*/
|
|
379
|
+
type Dataset<T extends Record<string, unknown> = Record<string, unknown>> = {
|
|
380
|
+
/** Array of dataset entries */
|
|
381
|
+
entries: DatasetEntry<T>[];
|
|
382
|
+
};
|
|
383
|
+
/**
|
|
384
|
+
* Options for getting a dataset
|
|
385
|
+
*/
|
|
386
|
+
type GetDatasetOptions = {
|
|
387
|
+
/** Skip tracing for this operation */
|
|
388
|
+
ignoreTracing?: boolean;
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
type DatasetsFacadeConfig = {
|
|
392
|
+
langwatchApiClient: LangwatchApiClient;
|
|
393
|
+
logger: Logger;
|
|
394
|
+
};
|
|
395
|
+
/**
|
|
396
|
+
* Facade for dataset operations
|
|
397
|
+
*
|
|
398
|
+
* Provides a simple interface for fetching datasets from LangWatch.
|
|
399
|
+
*
|
|
400
|
+
* @example
|
|
401
|
+
* ```typescript
|
|
402
|
+
* const langwatch = new LangWatch({ apiKey: "your-api-key" });
|
|
403
|
+
*
|
|
404
|
+
* // Get a dataset by slug or ID
|
|
405
|
+
* const dataset = await langwatch.datasets.get("my-dataset");
|
|
406
|
+
*
|
|
407
|
+
* // Use with evaluation
|
|
408
|
+
* const evaluation = langwatch.evaluation.init("my-experiment");
|
|
409
|
+
* await evaluation.run(dataset.entries.map(e => e.entry), async ({ item, index }) => {
|
|
410
|
+
* const output = await myLLM(item.input);
|
|
411
|
+
* await evaluation.evaluate("my-evaluator", {
|
|
412
|
+
* data: { input: item.input, output, expected_output: item.expected_output },
|
|
413
|
+
* settings: {}
|
|
414
|
+
* });
|
|
415
|
+
* });
|
|
416
|
+
* ```
|
|
417
|
+
*/
|
|
418
|
+
declare class DatasetsFacade {
|
|
419
|
+
#private;
|
|
420
|
+
constructor(config: DatasetsFacadeConfig);
|
|
421
|
+
/**
|
|
422
|
+
* Fetches a dataset by its slug or ID
|
|
423
|
+
*
|
|
424
|
+
* @param slugOrId - The slug or ID of the dataset to fetch
|
|
425
|
+
* @param options - Optional configuration
|
|
426
|
+
* @returns The dataset with all entries
|
|
427
|
+
*
|
|
428
|
+
* @example
|
|
429
|
+
* ```typescript
|
|
430
|
+
* // Get dataset by slug
|
|
431
|
+
* const dataset = await langwatch.datasets.get("product-qa");
|
|
432
|
+
*
|
|
433
|
+
* // Get dataset by ID
|
|
434
|
+
* const dataset = await langwatch.datasets.get("ds_abc123");
|
|
435
|
+
*
|
|
436
|
+
* // Typed dataset
|
|
437
|
+
* type MyDatasetEntry = { input: string; expected_output: string; };
|
|
438
|
+
* const dataset = await langwatch.datasets.get<MyDatasetEntry>("my-dataset");
|
|
439
|
+
*
|
|
440
|
+
* // Iterate over entries
|
|
441
|
+
* for (const entry of dataset.entries) {
|
|
442
|
+
* console.log(entry.entry.input); // typed as string
|
|
443
|
+
* }
|
|
444
|
+
* ```
|
|
445
|
+
*/
|
|
446
|
+
get: <T extends Record<string, unknown> = Record<string, unknown>>(slugOrId: string, options?: GetDatasetOptions) => Promise<Dataset<T>>;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
/**
|
|
450
|
+
* Types for the Evaluation API
|
|
451
|
+
*
|
|
452
|
+
* These types define the structure for batch evaluations, including
|
|
453
|
+
* logging metrics, running evaluators, and managing targets.
|
|
454
|
+
*/
|
|
455
|
+
|
|
456
|
+
/**
|
|
457
|
+
* Status of an evaluation result
|
|
458
|
+
*/
|
|
459
|
+
type EvaluationStatus = "processed" | "error" | "skipped";
|
|
460
|
+
/**
|
|
461
|
+
* Target types for batch evaluations
|
|
462
|
+
*/
|
|
463
|
+
type TargetType = "prompt" | "agent" | "custom";
|
|
464
|
+
/**
|
|
465
|
+
* Metadata for targets - used for comparison charts
|
|
466
|
+
*/
|
|
467
|
+
type TargetMetadata = Record<string, string | number | boolean>;
|
|
468
|
+
declare const targetInfoSchema: z.ZodObject<{
|
|
469
|
+
id: z.ZodString;
|
|
470
|
+
name: z.ZodString;
|
|
471
|
+
type: z.ZodDefault<z.ZodEnum<{
|
|
472
|
+
agent: "agent";
|
|
473
|
+
custom: "custom";
|
|
474
|
+
prompt: "prompt";
|
|
475
|
+
}>>;
|
|
476
|
+
metadata: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnion<readonly [z.ZodString, z.ZodNumber, z.ZodBoolean]>>>>;
|
|
477
|
+
}, z.core.$strip>;
|
|
478
|
+
declare const evaluationResultSchema: z.ZodObject<{
|
|
479
|
+
name: z.ZodString;
|
|
480
|
+
evaluator: z.ZodString;
|
|
481
|
+
trace_id: z.ZodString;
|
|
482
|
+
status: z.ZodEnum<{
|
|
483
|
+
error: "error";
|
|
484
|
+
processed: "processed";
|
|
485
|
+
skipped: "skipped";
|
|
486
|
+
}>;
|
|
487
|
+
data: z.ZodOptional<z.ZodNullable<z.ZodRecord<z.ZodString, z.ZodUnknown>>>;
|
|
488
|
+
score: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
489
|
+
passed: z.ZodOptional<z.ZodNullable<z.ZodBoolean>>;
|
|
490
|
+
details: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
491
|
+
index: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
492
|
+
label: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
493
|
+
cost: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
494
|
+
duration: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
|
|
495
|
+
error_type: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
496
|
+
traceback: z.ZodOptional<z.ZodNullable<z.ZodArray<z.ZodString>>>;
|
|
497
|
+
target_id: z.ZodOptional<z.ZodNullable<z.ZodString>>;
|
|
498
|
+
}, z.core.$strip>;
|
|
499
|
+
/**
|
|
500
|
+
* Information about a registered target
|
|
501
|
+
*/
|
|
502
|
+
type TargetInfo = z.infer<typeof targetInfoSchema>;
|
|
503
|
+
/**
|
|
504
|
+
* Result of an evaluation
|
|
505
|
+
*/
|
|
506
|
+
type EvaluationResult = z.infer<typeof evaluationResultSchema>;
|
|
507
|
+
/**
|
|
508
|
+
* Options for initializing an evaluation
|
|
509
|
+
*/
|
|
510
|
+
type EvaluationInitOptions = {
|
|
511
|
+
/** Custom run ID (auto-generated if not provided) */
|
|
512
|
+
runId?: string;
|
|
513
|
+
/** Number of parallel threads for submit() */
|
|
514
|
+
threads?: number;
|
|
515
|
+
};
|
|
516
|
+
/**
|
|
517
|
+
* Options for the log() method
|
|
518
|
+
*/
|
|
519
|
+
type LogOptions = {
|
|
520
|
+
/**
|
|
521
|
+
* Row index in the dataset.
|
|
522
|
+
* Optional when called inside withTarget() - will be auto-inferred from context.
|
|
523
|
+
*/
|
|
524
|
+
index?: number;
|
|
525
|
+
/** Additional data/inputs for the evaluation */
|
|
526
|
+
data?: Record<string, unknown>;
|
|
527
|
+
/** Numeric score (typically 0-1) */
|
|
528
|
+
score?: number;
|
|
529
|
+
/** Whether the evaluation passed */
|
|
530
|
+
passed?: boolean;
|
|
531
|
+
/** Label/category for the result */
|
|
532
|
+
label?: string;
|
|
533
|
+
/** Human-readable description of the result */
|
|
534
|
+
details?: string;
|
|
535
|
+
/** Status of the evaluation */
|
|
536
|
+
status?: EvaluationStatus;
|
|
537
|
+
/** Duration in milliseconds */
|
|
538
|
+
duration?: number;
|
|
539
|
+
/** Cost amount in USD */
|
|
540
|
+
cost?: number;
|
|
541
|
+
/** Error if one occurred */
|
|
542
|
+
error?: Error;
|
|
543
|
+
/**
|
|
544
|
+
* Target name for multi-target comparisons.
|
|
545
|
+
* Optional when called inside withTarget() - will be auto-inferred from context.
|
|
546
|
+
*/
|
|
547
|
+
target?: string;
|
|
548
|
+
/** Metadata for the target (only used on first call per target) */
|
|
549
|
+
metadata?: TargetMetadata;
|
|
550
|
+
};
|
|
551
|
+
/**
|
|
552
|
+
* Options for the evaluate() method (built-in evaluators)
|
|
553
|
+
*/
|
|
554
|
+
type EvaluateOptions = {
|
|
555
|
+
/**
|
|
556
|
+
* Row index in the dataset.
|
|
557
|
+
* Optional when called inside withTarget() - will be auto-inferred from context.
|
|
558
|
+
*/
|
|
559
|
+
index?: number;
|
|
560
|
+
/** Data to pass to the evaluator */
|
|
561
|
+
data: Record<string, unknown>;
|
|
562
|
+
/** Evaluator settings */
|
|
563
|
+
settings?: Record<string, unknown>;
|
|
564
|
+
/** Human-readable name for the evaluation */
|
|
565
|
+
name?: string;
|
|
566
|
+
/** Whether to run as a guardrail */
|
|
567
|
+
asGuardrail?: boolean;
|
|
568
|
+
/**
|
|
569
|
+
* Target name for multi-target comparisons.
|
|
570
|
+
* Optional when called inside withTarget() - will be auto-inferred from context.
|
|
571
|
+
*/
|
|
572
|
+
target?: string;
|
|
573
|
+
/** Metadata for the target */
|
|
574
|
+
metadata?: TargetMetadata;
|
|
575
|
+
};
|
|
576
|
+
/**
|
|
577
|
+
* Context passed to the run() callback
|
|
578
|
+
*/
|
|
579
|
+
type RunContext<T> = {
|
|
580
|
+
/** Current index in the dataset */
|
|
581
|
+
index: number;
|
|
582
|
+
/** The dataset item */
|
|
583
|
+
item: T;
|
|
584
|
+
/** The span for this iteration (for custom instrumentation) */
|
|
585
|
+
span: LangWatchSpan;
|
|
586
|
+
};
|
|
587
|
+
/**
|
|
588
|
+
* Options for the run() method
|
|
589
|
+
*/
|
|
590
|
+
type RunOptions = {
|
|
591
|
+
/** Number of concurrent executions (default: 4) */
|
|
592
|
+
concurrency?: number;
|
|
593
|
+
};
|
|
594
|
+
/**
|
|
595
|
+
* Callback function for run()
|
|
596
|
+
*/
|
|
597
|
+
type RunCallback<T> = (context: RunContext<T>) => Promise<void> | void;
|
|
598
|
+
/**
|
|
599
|
+
* Context passed to the withTarget() callback
|
|
600
|
+
*/
|
|
601
|
+
type TargetContext = {
|
|
602
|
+
/** The LangWatch span for this target execution */
|
|
603
|
+
span: LangWatchSpan;
|
|
604
|
+
/** The trace ID for this target execution */
|
|
605
|
+
traceId: string;
|
|
606
|
+
/** The span ID for this target execution */
|
|
607
|
+
spanId: string;
|
|
608
|
+
};
|
|
609
|
+
/**
|
|
610
|
+
* Callback function for withTarget()
|
|
611
|
+
*/
|
|
612
|
+
type TargetCallback<R> = (context: TargetContext) => Promise<R> | R;
|
|
613
|
+
/**
|
|
614
|
+
* Result from withTarget() including captured metrics
|
|
615
|
+
*/
|
|
616
|
+
type TargetResult<R> = {
|
|
617
|
+
/** The return value from the callback */
|
|
618
|
+
result: R;
|
|
619
|
+
/** Duration in milliseconds (automatically captured) */
|
|
620
|
+
duration: number;
|
|
621
|
+
/** Cost in USD (captured from span if available) */
|
|
622
|
+
cost?: number;
|
|
623
|
+
/** The trace ID for this execution */
|
|
624
|
+
traceId: string;
|
|
625
|
+
/** The span ID for this execution */
|
|
626
|
+
spanId: string;
|
|
627
|
+
};
|
|
628
|
+
|
|
629
|
+
/**
|
|
630
|
+
* Evaluation - Main class for running batch evaluations
|
|
631
|
+
*
|
|
632
|
+
* Provides a clean API for running evaluations over datasets with:
|
|
633
|
+
* - Automatic tracing per iteration
|
|
634
|
+
* - Parallel execution with concurrency control
|
|
635
|
+
* - Batched result sending
|
|
636
|
+
* - Built-in evaluator support
|
|
637
|
+
* - Multi-target comparison with withTarget() context isolation
|
|
638
|
+
*/
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* Evaluation session for running batch evaluations
|
|
642
|
+
*/
|
|
643
|
+
declare class Evaluation {
|
|
644
|
+
readonly name: string;
|
|
645
|
+
readonly runId: string;
|
|
646
|
+
readonly experimentSlug: string;
|
|
647
|
+
private readonly apiClient;
|
|
648
|
+
private readonly endpoint;
|
|
649
|
+
private readonly apiKey;
|
|
650
|
+
private readonly logger;
|
|
651
|
+
private readonly concurrency;
|
|
652
|
+
private initialized;
|
|
653
|
+
private createdAtMs;
|
|
654
|
+
private total;
|
|
655
|
+
private progress;
|
|
656
|
+
private batch;
|
|
657
|
+
private lastSentMs;
|
|
658
|
+
private pendingFlush;
|
|
659
|
+
private flushTimeout;
|
|
660
|
+
private targets;
|
|
661
|
+
private currentTraceId;
|
|
662
|
+
private currentIndex;
|
|
663
|
+
private iterationUsedWithTarget;
|
|
664
|
+
private evaluationUsesTargets;
|
|
665
|
+
private constructor();
|
|
666
|
+
/**
|
|
667
|
+
* Initialize an evaluation session
|
|
668
|
+
*/
|
|
669
|
+
static init(name: string, options: {
|
|
670
|
+
apiClient: LangwatchApiClient;
|
|
671
|
+
endpoint: string;
|
|
672
|
+
apiKey: string;
|
|
673
|
+
logger: Logger;
|
|
674
|
+
} & EvaluationInitOptions): Promise<Evaluation>;
|
|
675
|
+
/**
|
|
676
|
+
* Initialize the evaluation by creating/getting the experiment
|
|
677
|
+
*/
|
|
678
|
+
private initialize;
|
|
679
|
+
/**
|
|
680
|
+
* Run evaluation over a dataset with a callback
|
|
681
|
+
*
|
|
682
|
+
* @param dataset - Array of items to evaluate
|
|
683
|
+
* @param callback - Function called for each item with { item, index, span }
|
|
684
|
+
* @param options - Concurrency options
|
|
685
|
+
*
|
|
686
|
+
* @example
|
|
687
|
+
* ```typescript
|
|
688
|
+
* await evaluation.run(dataset, async ({ item, index, span }) => {
|
|
689
|
+
* const response = await myAgent(item.question);
|
|
690
|
+
* evaluation.log('accuracy', { index, score: 0.95 });
|
|
691
|
+
* }, { concurrency: 4 });
|
|
692
|
+
* ```
|
|
693
|
+
*/
|
|
694
|
+
run<T>(dataset: T[], callback: RunCallback<T>, options?: RunOptions): Promise<void>;
|
|
695
|
+
/**
|
|
696
|
+
* Execute a single item in the dataset
|
|
697
|
+
*/
|
|
698
|
+
private executeItem;
|
|
699
|
+
/**
|
|
700
|
+
* Log a custom metric result
|
|
701
|
+
*
|
|
702
|
+
* @param metric - Name of the metric
|
|
703
|
+
* @param options - Metric options including index, score, passed, etc.
|
|
704
|
+
*
|
|
705
|
+
* If called inside a withTarget() block, the target and index are automatically
|
|
706
|
+
* inferred from the context and don't need to be specified.
|
|
707
|
+
*
|
|
708
|
+
* @example
|
|
709
|
+
* ```typescript
|
|
710
|
+
* // Explicit target (outside withTarget)
|
|
711
|
+
* evaluation.log('accuracy', { index, score: 0.95, target: 'gpt-4' });
|
|
712
|
+
*
|
|
713
|
+
* // Implicit target (inside withTarget)
|
|
714
|
+
* await evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
|
|
715
|
+
* evaluation.log('accuracy', { score: 0.95 }); // target and index auto-inferred
|
|
716
|
+
* });
|
|
717
|
+
* ```
|
|
718
|
+
*/
|
|
719
|
+
log(metric: string, options: LogOptions): void;
|
|
720
|
+
/**
|
|
721
|
+
* Run a built-in evaluator
|
|
722
|
+
*
|
|
723
|
+
* @param evaluatorSlug - The evaluator identifier (e.g., 'ragas/faithfulness')
|
|
724
|
+
* @param options - Evaluator options including data and settings
|
|
725
|
+
*
|
|
726
|
+
* If called inside a withTarget() block, the target and index are automatically
|
|
727
|
+
* inferred from the context and don't need to be specified.
|
|
728
|
+
*
|
|
729
|
+
* @example
|
|
730
|
+
* ```typescript
|
|
731
|
+
* // Inside withTarget() - target and index auto-inferred
|
|
732
|
+
* await evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
|
|
733
|
+
* await evaluation.evaluate('ragas/faithfulness', {
|
|
734
|
+
* data: { input, output, contexts },
|
|
735
|
+
* });
|
|
736
|
+
* });
|
|
737
|
+
*
|
|
738
|
+
* // Or explicit index/target
|
|
739
|
+
* await evaluation.evaluate('ragas/faithfulness', {
|
|
740
|
+
* index,
|
|
741
|
+
* data: { input, output, contexts },
|
|
742
|
+
* target: 'gpt-4',
|
|
743
|
+
* });
|
|
744
|
+
* ```
|
|
745
|
+
*/
|
|
746
|
+
evaluate(evaluatorSlug: string, options: EvaluateOptions): Promise<void>;
|
|
747
|
+
/**
|
|
748
|
+
* Execute code within a target context with automatic tracing
|
|
749
|
+
*
|
|
750
|
+
* Creates a new span for this target execution and sets up context
|
|
751
|
+
* so that log() calls inside the callback automatically use this target.
|
|
752
|
+
* Duration and output are captured automatically.
|
|
753
|
+
*
|
|
754
|
+
* This creates a dataset entry per target (like Evaluations V3), enabling
|
|
755
|
+
* proper per-target latency and cost tracking.
|
|
756
|
+
*
|
|
757
|
+
* @param targetName - Unique identifier for the target
|
|
758
|
+
* @param metadata - Optional metadata for comparison (e.g., { model: 'gpt-4' })
|
|
759
|
+
* @param callback - Function to execute within the target context
|
|
760
|
+
* @returns The callback result along with captured metrics
|
|
761
|
+
*
|
|
762
|
+
* @example
|
|
763
|
+
* ```typescript
|
|
764
|
+
* await evaluation.run(dataset, async ({ item, index }) => {
|
|
765
|
+
* // Compare GPT-4 and Claude on the same input
|
|
766
|
+
* const [gpt4Result, claudeResult] = await Promise.all([
|
|
767
|
+
* evaluation.withTarget('gpt-4', { model: 'openai/gpt-4' }, async () => {
|
|
768
|
+
* const response = await openai.chat(item.question);
|
|
769
|
+
* evaluation.log('quality', { score: 0.95 }); // target auto-inferred
|
|
770
|
+
* return response;
|
|
771
|
+
* }),
|
|
772
|
+
* evaluation.withTarget('claude-3', { model: 'anthropic/claude-3' }, async () => {
|
|
773
|
+
* const response = await anthropic.messages(item.question);
|
|
774
|
+
* evaluation.log('quality', { score: 0.85 }); // target auto-inferred
|
|
775
|
+
* return response;
|
|
776
|
+
* }),
|
|
777
|
+
* ]);
|
|
778
|
+
* });
|
|
779
|
+
* ```
|
|
780
|
+
*/
|
|
781
|
+
withTarget<R>(targetName: string, metadata: TargetMetadata | null, callback: TargetCallback<R>): Promise<TargetResult<R>>;
|
|
782
|
+
withTarget<R>(targetName: string, callback: TargetCallback<R>): Promise<TargetResult<R>>;
|
|
783
|
+
/**
|
|
784
|
+
* Register a target for multi-target comparison
|
|
785
|
+
*/
|
|
786
|
+
private registerTarget;
|
|
787
|
+
/**
|
|
788
|
+
* Schedule a debounced send
|
|
789
|
+
*/
|
|
790
|
+
private scheduleSend;
|
|
791
|
+
/**
|
|
792
|
+
* Send current batch to the API
|
|
793
|
+
*/
|
|
794
|
+
private sendBatch;
|
|
795
|
+
/**
|
|
796
|
+
* Flush all pending data
|
|
797
|
+
*/
|
|
798
|
+
private flush;
|
|
799
|
+
/**
|
|
800
|
+
* Serialize a dataset item for the API
|
|
801
|
+
*/
|
|
802
|
+
private serializeItem;
|
|
803
|
+
/**
|
|
804
|
+
* Get trace ID from current OpenTelemetry context
|
|
805
|
+
*/
|
|
806
|
+
private getTraceIdFromContext;
|
|
807
|
+
/**
|
|
808
|
+
* Get span ID from current OpenTelemetry context
|
|
809
|
+
*/
|
|
810
|
+
private getSpanIdFromContext;
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
/**
|
|
814
|
+
* EvaluationFacade - Entry point for the evaluation API
|
|
815
|
+
*
|
|
816
|
+
* Provides the `init()` method to create evaluation sessions.
|
|
817
|
+
*/
|
|
818
|
+
|
|
819
|
+
type EvaluationFacadeConfig = {
|
|
820
|
+
langwatchApiClient: LangwatchApiClient;
|
|
821
|
+
endpoint: string;
|
|
822
|
+
apiKey: string;
|
|
823
|
+
logger: Logger;
|
|
824
|
+
};
|
|
825
|
+
/**
|
|
826
|
+
* Facade for creating evaluation sessions
|
|
827
|
+
*/
|
|
828
|
+
declare class EvaluationFacade {
|
|
829
|
+
private readonly config;
|
|
830
|
+
constructor(config: EvaluationFacadeConfig);
|
|
831
|
+
/**
|
|
832
|
+
* Initialize a new evaluation session
|
|
833
|
+
*
|
|
834
|
+
* @param name - Name of the experiment (used as slug)
|
|
835
|
+
* @param options - Optional configuration
|
|
836
|
+
* @returns An initialized Evaluation instance
|
|
837
|
+
*
|
|
838
|
+
* @example
|
|
839
|
+
* ```typescript
|
|
840
|
+
* const evaluation = await langwatch.evaluation.init('my-experiment');
|
|
841
|
+
*
|
|
842
|
+
* await evaluation.run(dataset, async ({ item, index }) => {
|
|
843
|
+
* const response = await myAgent(item.question);
|
|
844
|
+
* evaluation.log('accuracy', { index, score: 0.95 });
|
|
845
|
+
* });
|
|
846
|
+
* ```
|
|
847
|
+
*/
|
|
848
|
+
init(name: string, options?: EvaluationInitOptions): Promise<Evaluation>;
|
|
849
|
+
}
|
|
850
|
+
|
|
851
|
+
/**
|
|
852
|
+
* Errors for the Evaluation API
|
|
853
|
+
*/
|
|
854
|
+
/**
|
|
855
|
+
* Base error for evaluation-related issues
|
|
856
|
+
*/
|
|
857
|
+
declare class EvaluationError extends Error {
|
|
858
|
+
constructor(message: string);
|
|
859
|
+
}
|
|
860
|
+
/**
|
|
861
|
+
* Thrown when initialization fails
|
|
862
|
+
*/
|
|
863
|
+
declare class EvaluationInitError extends EvaluationError {
|
|
864
|
+
readonly cause?: Error | undefined;
|
|
865
|
+
constructor(message: string, cause?: Error | undefined);
|
|
866
|
+
}
|
|
867
|
+
/**
|
|
868
|
+
* Thrown when API calls fail
|
|
869
|
+
*/
|
|
870
|
+
declare class EvaluationApiError extends EvaluationError {
|
|
871
|
+
readonly statusCode?: number | undefined;
|
|
872
|
+
readonly cause?: Error | undefined;
|
|
873
|
+
constructor(message: string, statusCode?: number | undefined, cause?: Error | undefined);
|
|
874
|
+
}
|
|
875
|
+
/**
|
|
876
|
+
* Thrown when target metadata conflicts
|
|
877
|
+
*/
|
|
878
|
+
declare class TargetMetadataConflictError extends EvaluationError {
|
|
879
|
+
readonly targetName: string;
|
|
880
|
+
readonly existingMetadata: Record<string, unknown>;
|
|
881
|
+
readonly newMetadata: Record<string, unknown>;
|
|
882
|
+
constructor(targetName: string, existingMetadata: Record<string, unknown>, newMetadata: Record<string, unknown>);
|
|
883
|
+
}
|
|
884
|
+
/**
|
|
885
|
+
* Thrown when an evaluator call fails
|
|
886
|
+
*/
|
|
887
|
+
declare class EvaluatorError extends EvaluationError {
|
|
888
|
+
readonly evaluatorSlug: string;
|
|
889
|
+
readonly cause?: Error | undefined;
|
|
890
|
+
constructor(evaluatorSlug: string, message: string, cause?: Error | undefined);
|
|
891
|
+
}
|
|
892
|
+
|
|
356
893
|
interface GetTraceParams {
|
|
357
894
|
includeSpans?: boolean;
|
|
358
895
|
}
|
|
@@ -376,6 +913,8 @@ declare class LangWatch {
|
|
|
376
913
|
private readonly config;
|
|
377
914
|
readonly prompts: PromptsFacade;
|
|
378
915
|
readonly traces: TracesFacade;
|
|
916
|
+
readonly evaluation: EvaluationFacade;
|
|
917
|
+
readonly datasets: DatasetsFacade;
|
|
379
918
|
constructor(options?: LangWatchConstructorOptions);
|
|
380
919
|
get apiClient(): LangwatchApiClient;
|
|
381
920
|
}
|
|
@@ -385,4 +924,4 @@ declare const logger: {
|
|
|
385
924
|
NoOpLogger: typeof NoOpLogger;
|
|
386
925
|
};
|
|
387
926
|
|
|
388
|
-
export { FetchPolicy, type GetPromptOptions, LangWatch, logger };
|
|
927
|
+
export { type EvaluateOptions, Evaluation, EvaluationApiError, EvaluationError, EvaluationFacade, EvaluationInitError, type EvaluationInitOptions, type EvaluationResult, type EvaluationStatus, EvaluatorError, FetchPolicy, type GetPromptOptions, LangWatch, type LogOptions, type RunCallback, type RunContext, type RunOptions, type TargetInfo, type TargetMetadata, TargetMetadataConflictError, type TargetType, logger };
|