@fallom/trace 0.2.15 → 0.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -590,9 +590,159 @@ async function datasetFromFallom(datasetKey, version, config) {
590
590
  );
591
591
  return items;
592
592
  }
593
+ var EvaluationDataset;
593
594
  var init_helpers = __esm({
594
595
  "src/evals/helpers.ts"() {
595
596
  "use strict";
597
/**
 * In-memory evaluation dataset.
 *
 * Holds "goldens" (inputs with optional expected outputs) and "test cases"
 * (inputs paired with the actual output produced by an LLM app). Goldens can
 * be added manually or pulled from Fallom; test cases can be added manually
 * or generated by running every golden through an LLM callable.
 */
EvaluationDataset = class {
  /** Start empty: no goldens, no test cases, no Fallom dataset recorded. */
  constructor() {
    this._goldens = [];
    this._testCases = [];
    this._datasetKey = null;
    this._datasetName = null;
    this._version = null;
  }

  /** List of golden records (inputs with optional expected outputs). */
  get goldens() {
    return this._goldens;
  }

  /** List of test cases (inputs with actual outputs from your LLM). */
  get testCases() {
    return this._testCases;
  }

  /** The Fallom dataset key if pulled from Fallom. */
  get datasetKey() {
    return this._datasetKey;
  }

  /**
   * Pull a dataset from Fallom, replacing any goldens currently held.
   *
   * @param alias - The dataset key/alias in Fallom
   * @param version - Specific version to pull (default: latest). Both
   *   `undefined` and `null` mean "latest".
   * @returns Self for chaining
   * @throws Error if evals are not initialized, the dataset is not found
   *   (404), access is denied (403), or the request otherwise fails.
   */
  async pull(alias, version) {
    // Core module is loaded lazily so its state is read at call time.
    const {
      _apiKey: apiKey,
      _baseUrl: baseUrl,
      _initialized: initialized
    } = await Promise.resolve().then(() => (init_core(), core_exports));
    if (!initialized) {
      throw new Error("Fallom evals not initialized. Call evals.init() first.");
    }

    const params = new URLSearchParams({ include_entries: "true" });
    // `!= null` so an explicit null is not sent to the API as "version=null".
    if (version != null) {
      params.set("version", String(version));
    }

    const url = `${baseUrl}/api/datasets/${encodeURIComponent(alias)}?${params}`;
    const response = await fetch(url, {
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json"
      }
    });
    if (response.status === 404) {
      throw new Error(`Dataset '${alias}' not found`);
    }
    if (response.status === 403) {
      throw new Error(`Access denied to dataset '${alias}'`);
    }
    if (!response.ok) {
      throw new Error(`Failed to fetch dataset: ${response.statusText}`);
    }

    const data = await response.json();
    this._datasetKey = alias;
    this._datasetName = data.dataset?.name || alias;
    // `??` rather than `||` so that a version number of 0 is kept.
    this._version = data.version?.version ?? null;
    this._goldens = (data.entries || []).map((entry) => ({
      input: entry.input || "",
      expectedOutput: entry.output,
      systemMessage: entry.systemMessage,
      metadata: entry.metadata
    }));
    console.log(
      `\u2713 Pulled dataset '${this._datasetName}' (version ${this._version}) with ${this._goldens.length} goldens`
    );
    return this;
  }

  /**
   * Add a golden record manually.
   * @param golden - A Golden object
   * @returns Self for chaining
   */
  addGolden(golden) {
    this._goldens.push(golden);
    return this;
  }

  /**
   * Add multiple golden records.
   * @param goldens - Array of Golden objects
   * @returns Self for chaining
   */
  addGoldens(goldens) {
    // Push one at a time: spreading a very large array into a single call
    // can exceed the engine's argument limit.
    for (const golden of goldens) {
      this._goldens.push(golden);
    }
    return this;
  }

  /**
   * Add a test case with actual LLM output.
   * @param testCase - An LLMTestCase object
   * @returns Self for chaining
   */
  addTestCase(testCase) {
    this._testCases.push(testCase);
    return this;
  }

  /**
   * Add multiple test cases.
   * @param testCases - Array of LLMTestCase objects
   * @returns Self for chaining
   */
  addTestCases(testCases) {
    // Same reasoning as addGoldens: avoid spreading huge arrays.
    for (const testCase of testCases) {
      this._testCases.push(testCase);
    }
    return this;
  }

  /**
   * Automatically generate test cases by running all goldens through your LLM app.
   * Goldens are processed sequentially, one awaited call at a time, and each
   * result is appended to the existing test cases.
   *
   * @param llmApp - A callable that takes messages and returns response
   *   (an object whose `content` is used as the actual output)
   * @param options - Configuration options. `includeContext` (default false)
   *   takes the test case context from the response instead of the golden.
   * @returns Self for chaining
   * @throws TypeError if `llmApp` resolves to null/undefined.
   */
  async generateTestCases(llmApp, options = {}) {
    const { includeContext = false } = options ?? {};
    console.log(`Generating test cases for ${this._goldens.length} goldens...`);
    for (let i = 0; i < this._goldens.length; i++) {
      const golden = this._goldens[i];
      const messages = [];
      if (golden.systemMessage) {
        messages.push({ role: "system", content: golden.systemMessage });
      }
      messages.push({ role: "user", content: golden.input });

      const response = await llmApp(messages);
      if (response == null) {
        // Fail with an actionable message instead of an opaque
        // "cannot read properties of undefined".
        throw new TypeError(
          `llmApp returned no response for golden ${i + 1}; expected an object with a 'content' field`
        );
      }

      this._testCases.push({
        input: golden.input,
        actualOutput: response.content,
        expectedOutput: golden.expectedOutput,
        systemMessage: golden.systemMessage,
        context: includeContext ? response.context : golden.context,
        metadata: golden.metadata
      });

      // Manually added goldens may carry a non-string (or missing) input;
      // coerce for the log preview only so logging can never abort the run.
      const preview = String(golden.input ?? "").slice(0, 50);
      console.log(
        `  [${i + 1}/${this._goldens.length}] Generated output for: ${preview}...`
      );
    }
    console.log(`\u2713 Generated ${this._testCases.length} test cases`);
    return this;
  }

  /** Clear all test cases (useful for re-running with different LLM). */
  clearTestCases() {
    this._testCases = [];
    return this;
  }

  /** Return the number of goldens. */
  get length() {
    return this._goldens.length;
  }
};
596
746
  }
597
747
  });
598
748
 
@@ -707,9 +857,22 @@ async function evaluate(options) {
707
857
  name,
708
858
  description,
709
859
  verbose = true,
860
+ testCases,
710
861
  _skipUpload = false
711
862
  } = options;
712
- const dataset = await resolveDataset(datasetInput);
863
+ let dataset;
864
+ if (testCases !== void 0 && testCases.length > 0) {
865
+ dataset = testCases.map((tc) => ({
866
+ input: tc.input,
867
+ output: tc.actualOutput,
868
+ systemMessage: tc.systemMessage,
869
+ metadata: tc.metadata
870
+ }));
871
+ } else if (datasetInput !== void 0) {
872
+ dataset = await resolveDataset(datasetInput);
873
+ } else {
874
+ throw new Error("Either 'dataset' or 'testCases' must be provided");
875
+ }
713
876
  for (const m of metrics) {
714
877
  if (typeof m === "string" && !AVAILABLE_METRICS.includes(m)) {
715
878
  throw new Error(
@@ -775,6 +938,9 @@ async function compareModels(options) {
775
938
  description,
776
939
  verbose = true
777
940
  } = options;
941
+ if (!datasetInput) {
942
+ throw new Error("'dataset' is required for compareModels()");
943
+ }
778
944
  const dataset = await resolveDataset(datasetInput);
779
945
  const results = {};
780
946
  if (includeProduction) {
@@ -2050,6 +2216,54 @@ function clearPromptContext() {
2050
2216
  promptContext = null;
2051
2217
  }
2052
2218
 
2219
+ // src/trace/wrappers/shared-utils.ts
2220
// Property names whose values carry prompt/completion content rather than
// metadata. Built once as a Set: the replacer below runs for every property
// JSON.stringify visits, so rebuilding and scanning an array per call is waste.
const SANITIZE_CONTENT_KEYS = new Set([
  "text",
  "content",
  "message",
  "messages",
  "object",
  "prompt",
  "system",
  "input",
  "output",
  "response",
  "toolCalls",
  "toolResults",
  "steps",
  "reasoning",
  "rawResponse",
  "rawCall",
  "body",
  "candidates",
  "parts"
]);

/**
 * JSON.stringify replacer that keeps metadata but strips payload content.
 *
 * - Under a content key: strings, arrays and objects are replaced by a short
 *   placeholder (with length where available); other primitives pass through.
 * - Any string starting with "data:image/" or longer than 1000 characters is
 *   replaced by a placeholder.
 * - Binary values (any typed array / DataView / ArrayBuffer, or the
 *   `{ type: "Buffer", ... }` shape a Node Buffer takes after toJSON())
 *   are replaced by a placeholder.
 *
 * @param key - Property name being serialised ("" for the root value)
 * @param value - Value for that property
 * @returns The value to serialise in its place
 */
function sanitizeMetadataOnly(key, value) {
  if (SANITIZE_CONTENT_KEYS.has(key)) {
    if (typeof value === "string") {
      return `[content omitted: ${value.length} chars]`;
    }
    if (Array.isArray(value)) {
      return `[content omitted: ${value.length} items]`;
    }
    if (typeof value === "object" && value !== null) {
      return "[content omitted]";
    }
  }
  if (typeof value === "string") {
    if (value.startsWith("data:image/")) {
      return "[base64 image omitted]";
    }
    if (value.length > 1e3) {
      return `[large string omitted: ${value.length} chars]`;
    }
  }
  // ArrayBuffer.isView covers every typed array and DataView, not just
  // Uint8Array; a bare ArrayBuffer would otherwise serialise as "{}".
  if (
    ArrayBuffer.isView(value) ||
    value instanceof ArrayBuffer ||
    (value && value.type === "Buffer")
  ) {
    return "[binary data omitted]";
  }
  return value;
}
2266
+
2053
2267
  // src/trace/wrappers/openai.ts
2054
2268
  function wrapOpenAI(client, sessionCtx) {
2055
2269
  const originalCreate = client.chat.completions.create.bind(
@@ -2097,6 +2311,13 @@ function wrapOpenAI(client, sessionCtx) {
2097
2311
  if (response?.usage) {
2098
2312
  attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
2099
2313
  }
2314
+ try {
2315
+ attributes["fallom.raw.metadata"] = JSON.stringify(
2316
+ response,
2317
+ sanitizeMetadataOnly
2318
+ );
2319
+ } catch {
2320
+ }
2100
2321
  const waterfallTimings = {
2101
2322
  requestStart: 0,
2102
2323
  requestEnd: endTime - startTime,
@@ -2223,6 +2444,13 @@ function wrapAnthropic(client, sessionCtx) {
2223
2444
  if (response?.usage) {
2224
2445
  attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
2225
2446
  }
2447
+ try {
2448
+ attributes["fallom.raw.metadata"] = JSON.stringify(
2449
+ response,
2450
+ sanitizeMetadataOnly
2451
+ );
2452
+ } catch {
2453
+ }
2226
2454
  const waterfallTimings = {
2227
2455
  requestStart: 0,
2228
2456
  requestEnd: endTime - startTime,
@@ -2343,6 +2571,13 @@ function wrapGoogleAI(model, sessionCtx) {
2343
2571
  if (result?.usageMetadata) {
2344
2572
  attributes["fallom.raw.usage"] = JSON.stringify(result.usageMetadata);
2345
2573
  }
2574
+ try {
2575
+ attributes["fallom.raw.metadata"] = JSON.stringify(
2576
+ result,
2577
+ sanitizeMetadataOnly
2578
+ );
2579
+ } catch {
2580
+ }
2346
2581
  const waterfallTimings = {
2347
2582
  requestStart: 0,
2348
2583
  requestEnd: endTime - startTime,
@@ -2539,6 +2774,13 @@ function createGenerateTextWrapper(aiModule, sessionCtx, debug = false) {
2539
2774
  result.experimental_providerMetadata
2540
2775
  );
2541
2776
  }
2777
+ try {
2778
+ attributes["fallom.raw.metadata"] = JSON.stringify(
2779
+ result,
2780
+ sanitizeMetadataOnly
2781
+ );
2782
+ } catch {
2783
+ }
2542
2784
  const totalDurationMs = endTime - startTime;
2543
2785
  const sortedToolTimings = Array.from(toolTimings.values()).sort(
2544
2786
  (a, b) => a.startTime - b.startTime
@@ -2867,6 +3109,10 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
2867
3109
  if (firstTokenTime) {
2868
3110
  attributes["fallom.time_to_first_token_ms"] = firstTokenTime - startTime;
2869
3111
  }
3112
+ try {
3113
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
3114
+ } catch {
3115
+ }
2870
3116
  const totalDurationMs = endTime - startTime;
2871
3117
  const sortedToolTimings = Array.from(toolTimings.values()).sort(
2872
3118
  (a, b) => a.startTime - b.startTime
@@ -3072,6 +3318,10 @@ function createGenerateObjectWrapper(aiModule, sessionCtx, debug = false) {
3072
3318
  result.experimental_providerMetadata
3073
3319
  );
3074
3320
  }
3321
+ try {
3322
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
3323
+ } catch {
3324
+ }
3075
3325
  const promptCtx = getPromptContext();
3076
3326
  sendTrace({
3077
3327
  config_key: ctx.configKey,
@@ -3191,6 +3441,10 @@ function createStreamObjectWrapper(aiModule, sessionCtx, debug = false) {
3191
3441
  if (providerMetadata) {
3192
3442
  attributes["fallom.raw.providerMetadata"] = JSON.stringify(providerMetadata);
3193
3443
  }
3444
+ try {
3445
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
3446
+ } catch {
3447
+ }
3194
3448
  const promptCtx = getPromptContext();
3195
3449
  sendTrace({
3196
3450
  config_key: ctx.configKey,
@@ -3287,6 +3541,13 @@ function wrapMastraAgent(agent, sessionCtx) {
3287
3541
  attributes["fallom.raw.request"] = JSON.stringify(input);
3288
3542
  attributes["fallom.raw.response"] = JSON.stringify(result);
3289
3543
  }
3544
+ try {
3545
+ attributes["fallom.raw.metadata"] = JSON.stringify(
3546
+ result,
3547
+ sanitizeMetadataOnly
3548
+ );
3549
+ } catch {
3550
+ }
3290
3551
  sendTrace({
3291
3552
  config_key: ctx.configKey,
3292
3553
  session_id: ctx.sessionId,
@@ -3543,6 +3804,7 @@ var evals_exports = {};
3543
3804
  __export(evals_exports, {
3544
3805
  AVAILABLE_METRICS: () => AVAILABLE_METRICS,
3545
3806
  DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
3807
+ EvaluationDataset: () => EvaluationDataset,
3546
3808
  METRIC_PROMPTS: () => METRIC_PROMPTS,
3547
3809
  compareModels: () => compareModels,
3548
3810
  createCustomModel: () => createCustomModel,
package/dist/index.mjs CHANGED
@@ -5,6 +5,7 @@ import {
5
5
  import {
6
6
  AVAILABLE_METRICS,
7
7
  DEFAULT_JUDGE_MODEL,
8
+ EvaluationDataset,
8
9
  METRIC_PROMPTS,
9
10
  compareModels,
10
11
  createCustomModel,
@@ -18,7 +19,7 @@ import {
18
19
  init as init2,
19
20
  isCustomMetric,
20
21
  uploadResultsPublic
21
- } from "./chunk-2NGJF2JZ.mjs";
22
+ } from "./chunk-3HBKT4HK.mjs";
22
23
  import {
23
24
  __export
24
25
  } from "./chunk-7P6ASYW6.mjs";
@@ -1055,6 +1056,54 @@ function clearPromptContext() {
1055
1056
  promptContext = null;
1056
1057
  }
1057
1058
 
1059
+ // src/trace/wrappers/shared-utils.ts
1060
// Property names whose values carry prompt/completion content rather than
// metadata. Built once as a Set: the replacer below runs for every property
// JSON.stringify visits, so rebuilding and scanning an array per call is waste.
const SANITIZE_CONTENT_KEYS = new Set([
  "text",
  "content",
  "message",
  "messages",
  "object",
  "prompt",
  "system",
  "input",
  "output",
  "response",
  "toolCalls",
  "toolResults",
  "steps",
  "reasoning",
  "rawResponse",
  "rawCall",
  "body",
  "candidates",
  "parts"
]);

/**
 * JSON.stringify replacer that keeps metadata but strips payload content.
 *
 * - Under a content key: strings, arrays and objects are replaced by a short
 *   placeholder (with length where available); other primitives pass through.
 * - Any string starting with "data:image/" or longer than 1000 characters is
 *   replaced by a placeholder.
 * - Binary values (any typed array / DataView / ArrayBuffer, or the
 *   `{ type: "Buffer", ... }` shape a Node Buffer takes after toJSON())
 *   are replaced by a placeholder.
 *
 * @param key - Property name being serialised ("" for the root value)
 * @param value - Value for that property
 * @returns The value to serialise in its place
 */
function sanitizeMetadataOnly(key, value) {
  if (SANITIZE_CONTENT_KEYS.has(key)) {
    if (typeof value === "string") {
      return `[content omitted: ${value.length} chars]`;
    }
    if (Array.isArray(value)) {
      return `[content omitted: ${value.length} items]`;
    }
    if (typeof value === "object" && value !== null) {
      return "[content omitted]";
    }
  }
  if (typeof value === "string") {
    if (value.startsWith("data:image/")) {
      return "[base64 image omitted]";
    }
    if (value.length > 1e3) {
      return `[large string omitted: ${value.length} chars]`;
    }
  }
  // ArrayBuffer.isView covers every typed array and DataView, not just
  // Uint8Array; a bare ArrayBuffer would otherwise serialise as "{}".
  if (
    ArrayBuffer.isView(value) ||
    value instanceof ArrayBuffer ||
    (value && value.type === "Buffer")
  ) {
    return "[binary data omitted]";
  }
  return value;
}
1106
+
1058
1107
  // src/trace/wrappers/openai.ts
1059
1108
  function wrapOpenAI(client, sessionCtx) {
1060
1109
  const originalCreate = client.chat.completions.create.bind(
@@ -1102,6 +1151,13 @@ function wrapOpenAI(client, sessionCtx) {
1102
1151
  if (response?.usage) {
1103
1152
  attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
1104
1153
  }
1154
+ try {
1155
+ attributes["fallom.raw.metadata"] = JSON.stringify(
1156
+ response,
1157
+ sanitizeMetadataOnly
1158
+ );
1159
+ } catch {
1160
+ }
1105
1161
  const waterfallTimings = {
1106
1162
  requestStart: 0,
1107
1163
  requestEnd: endTime - startTime,
@@ -1228,6 +1284,13 @@ function wrapAnthropic(client, sessionCtx) {
1228
1284
  if (response?.usage) {
1229
1285
  attributes["fallom.raw.usage"] = JSON.stringify(response.usage);
1230
1286
  }
1287
+ try {
1288
+ attributes["fallom.raw.metadata"] = JSON.stringify(
1289
+ response,
1290
+ sanitizeMetadataOnly
1291
+ );
1292
+ } catch {
1293
+ }
1231
1294
  const waterfallTimings = {
1232
1295
  requestStart: 0,
1233
1296
  requestEnd: endTime - startTime,
@@ -1348,6 +1411,13 @@ function wrapGoogleAI(model, sessionCtx) {
1348
1411
  if (result?.usageMetadata) {
1349
1412
  attributes["fallom.raw.usage"] = JSON.stringify(result.usageMetadata);
1350
1413
  }
1414
+ try {
1415
+ attributes["fallom.raw.metadata"] = JSON.stringify(
1416
+ result,
1417
+ sanitizeMetadataOnly
1418
+ );
1419
+ } catch {
1420
+ }
1351
1421
  const waterfallTimings = {
1352
1422
  requestStart: 0,
1353
1423
  requestEnd: endTime - startTime,
@@ -1544,6 +1614,13 @@ function createGenerateTextWrapper(aiModule, sessionCtx, debug = false) {
1544
1614
  result.experimental_providerMetadata
1545
1615
  );
1546
1616
  }
1617
+ try {
1618
+ attributes["fallom.raw.metadata"] = JSON.stringify(
1619
+ result,
1620
+ sanitizeMetadataOnly
1621
+ );
1622
+ } catch {
1623
+ }
1547
1624
  const totalDurationMs = endTime - startTime;
1548
1625
  const sortedToolTimings = Array.from(toolTimings.values()).sort(
1549
1626
  (a, b) => a.startTime - b.startTime
@@ -1872,6 +1949,10 @@ function createStreamTextWrapper(aiModule, sessionCtx, debug = false) {
1872
1949
  if (firstTokenTime) {
1873
1950
  attributes["fallom.time_to_first_token_ms"] = firstTokenTime - startTime;
1874
1951
  }
1952
+ try {
1953
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
1954
+ } catch {
1955
+ }
1875
1956
  const totalDurationMs = endTime - startTime;
1876
1957
  const sortedToolTimings = Array.from(toolTimings.values()).sort(
1877
1958
  (a, b) => a.startTime - b.startTime
@@ -2077,6 +2158,10 @@ function createGenerateObjectWrapper(aiModule, sessionCtx, debug = false) {
2077
2158
  result.experimental_providerMetadata
2078
2159
  );
2079
2160
  }
2161
+ try {
2162
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
2163
+ } catch {
2164
+ }
2080
2165
  const promptCtx = getPromptContext();
2081
2166
  sendTrace({
2082
2167
  config_key: ctx.configKey,
@@ -2196,6 +2281,10 @@ function createStreamObjectWrapper(aiModule, sessionCtx, debug = false) {
2196
2281
  if (providerMetadata) {
2197
2282
  attributes["fallom.raw.providerMetadata"] = JSON.stringify(providerMetadata);
2198
2283
  }
2284
+ try {
2285
+ attributes["fallom.raw.metadata"] = JSON.stringify(result, sanitizeMetadataOnly);
2286
+ } catch {
2287
+ }
2199
2288
  const promptCtx = getPromptContext();
2200
2289
  sendTrace({
2201
2290
  config_key: ctx.configKey,
@@ -2292,6 +2381,13 @@ function wrapMastraAgent(agent, sessionCtx) {
2292
2381
  attributes["fallom.raw.request"] = JSON.stringify(input);
2293
2382
  attributes["fallom.raw.response"] = JSON.stringify(result);
2294
2383
  }
2384
+ try {
2385
+ attributes["fallom.raw.metadata"] = JSON.stringify(
2386
+ result,
2387
+ sanitizeMetadataOnly
2388
+ );
2389
+ } catch {
2390
+ }
2295
2391
  sendTrace({
2296
2392
  config_key: ctx.configKey,
2297
2393
  session_id: ctx.sessionId,
@@ -2545,6 +2641,7 @@ var evals_exports = {};
2545
2641
  __export(evals_exports, {
2546
2642
  AVAILABLE_METRICS: () => AVAILABLE_METRICS,
2547
2643
  DEFAULT_JUDGE_MODEL: () => DEFAULT_JUDGE_MODEL,
2644
+ EvaluationDataset: () => EvaluationDataset,
2548
2645
  METRIC_PROMPTS: () => METRIC_PROMPTS,
2549
2646
  compareModels: () => compareModels,
2550
2647
  createCustomModel: () => createCustomModel,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@fallom/trace",
3
- "version": "0.2.15",
3
+ "version": "0.2.17",
4
4
  "description": "Model A/B testing and tracing for LLM applications. Zero latency, production-ready.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",