@fallom/trace 0.2.25 → 0.2.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -423,7 +423,7 @@ function datasetFromTraces(traces) {
423
423
  return items;
424
424
  }
425
425
  async function datasetFromFallom(datasetKey, version, config) {
426
- const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-Q3IHBEHB.mjs").then(
426
+ const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-NTEI2B5Z.mjs").then(
427
427
  (m) => ({
428
428
  _apiKey: config?._apiKey ?? m._apiKey,
429
429
  _baseUrl: config?._baseUrl ?? m._baseUrl,
@@ -496,7 +496,7 @@ var EvaluationDataset = class {
496
496
  * @returns Self for chaining
497
497
  */
498
498
  async pull(alias, version) {
499
- const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-Q3IHBEHB.mjs");
499
+ const { _apiKey: _apiKey2, _baseUrl: _baseUrl2, _initialized: _initialized2 } = await import("./core-NTEI2B5Z.mjs");
500
500
  if (!_initialized2) {
501
501
  throw new Error("Fallom evals not initialized. Call evals.init() first.");
502
502
  }
@@ -698,13 +698,22 @@ async function evaluate(options) {
698
698
  _skipUpload = false
699
699
  } = options;
700
700
  let dataset;
701
+ let testCaseExtras = /* @__PURE__ */ new Map();
701
702
  if (testCases !== void 0 && testCases.length > 0) {
702
- dataset = testCases.map((tc) => ({
703
- input: tc.input,
704
- output: tc.actualOutput,
705
- systemMessage: tc.systemMessage,
706
- metadata: tc.metadata
707
- }));
703
+ dataset = testCases.map((tc, idx) => {
704
+ if (tc.expectedOutput || tc.context) {
705
+ testCaseExtras.set(idx, {
706
+ expectedOutput: tc.expectedOutput,
707
+ context: tc.context
708
+ });
709
+ }
710
+ return {
711
+ input: tc.input,
712
+ output: tc.actualOutput,
713
+ systemMessage: tc.systemMessage,
714
+ metadata: tc.metadata
715
+ };
716
+ });
708
717
  } else if (datasetInput !== void 0) {
709
718
  dataset = await resolveDataset(datasetInput);
710
719
  } else {
@@ -723,10 +732,14 @@ async function evaluate(options) {
723
732
  for (let i = 0; i < dataset.length; i++) {
724
733
  const item = dataset[i];
725
734
  if (verbose) console.log(`Evaluating item ${i + 1}/${dataset.length}...`);
735
+ const extras = testCaseExtras.get(i);
726
736
  const result = {
727
737
  input: item.input,
728
738
  output: item.output,
729
739
  systemMessage: item.systemMessage,
740
+ expectedOutput: extras?.expectedOutput,
741
+ context: extras?.context,
742
+ metadata: item.metadata,
730
743
  model: "production",
731
744
  isProduction: true,
732
745
  reasoning: {}
@@ -826,6 +839,7 @@ async function compareModels(options) {
826
839
  input: item.input,
827
840
  output,
828
841
  systemMessage: item.systemMessage,
842
+ metadata: item.metadata,
829
843
  model: model.name,
830
844
  isProduction: false,
831
845
  reasoning: {},
@@ -937,6 +951,9 @@ async function uploadResults(results, name, description, judgeModel, verbose) {
937
951
  results: allResults.map((r) => ({
938
952
  input: r.input,
939
953
  system_message: r.systemMessage,
954
+ expected_output: r.expectedOutput,
955
+ context: r.context,
956
+ metadata: r.metadata,
940
957
  model: r.model,
941
958
  output: r.output,
942
959
  is_production: r.isProduction,