axiom 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin.cjs CHANGED
@@ -390,6 +390,7 @@ var loadPullCommand = (program2) => {
390
390
 
391
391
  // src/cli/commands/eval.command.ts
392
392
  var import_commander3 = require("commander");
393
+ var import_nanoid = require("nanoid");
393
394
 
394
395
  // ../../node_modules/.pnpm/tinyrainbow@2.0.0/node_modules/tinyrainbow/dist/chunk-BVHSVHOK.js
395
396
  var f = {
@@ -476,6 +477,7 @@ var r = process.env.FORCE_TTY !== void 0 || (0, import_tty.isatty)(1);
476
477
  var u = p(r);
477
478
 
478
479
  // src/evals/run-vitest.ts
480
+ var import_node_path3 = __toESM(require("path"), 1);
479
481
  var import_node = require("vitest/node");
480
482
 
481
483
  // src/evals/context/storage.ts
@@ -593,7 +595,280 @@ var import_api5 = require("@opentelemetry/api");
593
595
 
594
596
  // src/otel/semconv/attributes.ts
595
597
  var import_semantic_conventions = require("@opentelemetry/semantic-conventions");
598
+
599
+ // src/otel/semconv/eval_proposal.ts
600
+ var ATTR_EVAL_ID = "eval.id";
601
+ var ATTR_EVAL_NAME = "eval.name";
602
+ var ATTR_EVAL_VERSION = "eval.version";
603
+ var ATTR_EVAL_TYPE = "eval.type";
604
+ var ATTR_EVAL_TAGS = "eval.tags";
605
+ var ATTR_EVAL_BASELINE_ID = "eval.baseline.id";
606
+ var ATTR_EVAL_BASELINE_NAME = "eval.baseline.name";
607
+ var ATTR_EVAL_METADATA = "eval.metadata";
608
+ var ATTR_EVAL_COLLECTION_ID = "eval.collection.id";
609
+ var ATTR_EVAL_COLLECTION_SIZE = "eval.collection.size";
610
+ var ATTR_EVAL_COLLECTION_NAME = "eval.collection.name";
611
+ var ATTR_EVAL_CONFIG_FLAGS = "eval.config.flags";
612
+ var ATTR_EVAL_CASE_INDEX = "eval.case.index";
613
+ var ATTR_EVAL_CASE_INPUT = "eval.case.input";
614
+ var ATTR_EVAL_CASE_OUTPUT = "eval.case.output";
615
+ var ATTR_EVAL_CASE_EXPECTED = "eval.case.expected";
616
+ var ATTR_EVAL_CASE_SCORES = "eval.case.scores";
617
+ var ATTR_EVAL_CASE_METADATA = "eval.case.metadata";
618
+ var ATTR_EVAL_TASK_OUTPUT = "eval.task.output";
619
+ var ATTR_EVAL_TASK_NAME = "eval.task.name";
620
+ var ATTR_EVAL_TASK_TYPE = "eval.task.type";
621
+ var ATTR_EVAL_RUN_ID = "eval.run.id";
622
+ var ATTR_EVAL_SCORE_NAME = "eval.score.name";
623
+ var ATTR_EVAL_SCORE_VALUE = "eval.score.value";
624
+ var ATTR_EVAL_SCORE_THRESHOLD = "eval.score.threshold";
625
+ var ATTR_EVAL_SCORE_PASSED = "eval.score.passed";
626
+ var ATTR_EVAL_SCORE_METADATA = "eval.score.metadata";
627
+ var ATTR_EVAL_USER_NAME = "eval.user.name";
628
+ var ATTR_EVAL_USER_EMAIL = "eval.user.email";
629
+
630
+ // src/otel/semconv/attributes.ts
596
631
  var import_incubating = require("@opentelemetry/semantic-conventions/incubating");
632
+ var ATTR_AXIOM_GEN_AI_SCHEMA_URL = "axiom.gen_ai.schema_url";
633
+ var ATTR_AXIOM_GEN_AI_SDK_NAME = "axiom.gen_ai.sdk.name";
634
+ var ATTR_AXIOM_GEN_AI_SDK_VERSION = "axiom.gen_ai.sdk.version";
635
+ var ATTR_GEN_AI_CAPABILITY_NAME = "gen_ai.capability.name";
636
+ var ATTR_GEN_AI_STEP_NAME = "gen_ai.step.name";
637
+ var ATTR_GEN_AI_TOOL_ARGUMENTS = "gen_ai.tool.arguments";
638
+ var ATTR_GEN_AI_TOOL_MESSAGE = "gen_ai.tool.message";
639
+ var GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI = "assemblyai";
640
+ var GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS = "cerebras";
641
+ var GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM = "deepgram";
642
+ var GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA = "deepinfra";
643
+ var GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS = "elevenlabs";
644
+ var GEN_AI_PROVIDER_NAME_VALUE_FAL = "fal";
645
+ var GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS = "fireworks";
646
+ var GEN_AI_PROVIDER_NAME_VALUE_GLADIA = "gladia";
647
+ var GEN_AI_PROVIDER_NAME_VALUE_HUME = "hume";
648
+ var GEN_AI_PROVIDER_NAME_VALUE_LMNT = "lmnt";
649
+ var GEN_AI_PROVIDER_NAME_VALUE_LUMA = "luma";
650
+ var GEN_AI_PROVIDER_NAME_VALUE_REPLICATE = "replicate";
651
+ var GEN_AI_PROVIDER_NAME_VALUE_REVAI = "revai";
652
+ var GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI = "togetherai";
653
+ var GEN_AI_PROVIDER_NAME_VALUE_VERCEL = "vercel";
654
+ var Attr = {
655
+ __EXPERIMENTAL_Flag: (flagName) => `flag.${flagName}`,
656
+ __EXPERIMENTAL_Fact: (factName) => `fact.${factName}`,
657
+ Axiom: {
658
+ GenAI: {
659
+ SchemaURL: ATTR_AXIOM_GEN_AI_SCHEMA_URL,
660
+ SDK: {
661
+ Name: ATTR_AXIOM_GEN_AI_SDK_NAME,
662
+ Version: ATTR_AXIOM_GEN_AI_SDK_VERSION
663
+ }
664
+ }
665
+ },
666
+ GenAI: {
667
+ PromptMetadata: {
668
+ ID: "axiom.gen_ai.prompt.id",
669
+ Name: "axiom.gen_ai.prompt.name",
670
+ Slug: "axiom.gen_ai.prompt.slug",
671
+ Version: "axiom.gen_ai.prompt.version"
672
+ },
673
+ /**
674
+ * These two are used to identify the span
675
+ */
676
+ Capability: {
677
+ Name: ATTR_GEN_AI_CAPABILITY_NAME
678
+ },
679
+ Step: {
680
+ Name: ATTR_GEN_AI_STEP_NAME
681
+ },
682
+ Provider: {
683
+ Name: import_incubating.ATTR_GEN_AI_PROVIDER_NAME,
684
+ Name_Values: {
685
+ Anthropic: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_ANTHROPIC,
686
+ AssemblyAI: GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI,
687
+ AWSBedrock: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AWS_BEDROCK,
688
+ AzureAIInference: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_INFERENCE,
689
+ AzureAIOpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_OPENAI,
690
+ Cerebras: GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS,
691
+ Cohere: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_COHERE,
692
+ Deepgram: GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM,
693
+ DeepInfra: GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA,
694
+ Deepseek: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_DEEPSEEK,
695
+ ElevenLabs: GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS,
696
+ Fal: GEN_AI_PROVIDER_NAME_VALUE_FAL,
697
+ Fireworks: GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS,
698
+ GCPGemini: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEMINI,
699
+ GCPGenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEN_AI,
700
+ GCPVertexAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_VERTEX_AI,
701
+ Gladia: GEN_AI_PROVIDER_NAME_VALUE_GLADIA,
702
+ Groq: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GROQ,
703
+ Hume: GEN_AI_PROVIDER_NAME_VALUE_HUME,
704
+ IBMWatsonxAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_IBM_WATSONX_AI,
705
+ Lmnt: GEN_AI_PROVIDER_NAME_VALUE_LMNT,
706
+ Luma: GEN_AI_PROVIDER_NAME_VALUE_LUMA,
707
+ MistralAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_MISTRAL_AI,
708
+ OpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_OPENAI,
709
+ Perplexity: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_PERPLEXITY,
710
+ Replicate: GEN_AI_PROVIDER_NAME_VALUE_REPLICATE,
711
+ RevAI: GEN_AI_PROVIDER_NAME_VALUE_REVAI,
712
+ TogetherAI: GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI,
713
+ Vercel: GEN_AI_PROVIDER_NAME_VALUE_VERCEL,
714
+ XAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_X_AI
715
+ }
716
+ },
717
+ /**
718
+ * Regular attributes
719
+ */
720
+ Agent: {
721
+ Description: import_incubating.ATTR_GEN_AI_AGENT_DESCRIPTION,
722
+ // not yet used by axiom-ai
723
+ ID: import_incubating.ATTR_GEN_AI_AGENT_ID,
724
+ // not yet used by axiom-ai
725
+ Name: import_incubating.ATTR_GEN_AI_AGENT_NAME
726
+ // not yet used by axiom-ai
727
+ },
728
+ Conversation: {
729
+ ID: import_incubating.ATTR_GEN_AI_CONVERSATION_ID
730
+ // not yet used by axiom-ai, anyway probably needs to be provided by user
731
+ },
732
+ Input: {
733
+ Messages: import_incubating.ATTR_GEN_AI_INPUT_MESSAGES
734
+ },
735
+ Operation: {
736
+ Name: import_incubating.ATTR_GEN_AI_OPERATION_NAME,
737
+ Name_Values: {
738
+ /**
739
+ * Note that "text_completion" is deprecated in favor of "chat" for both OpenAI and Anthropic
740
+ */
741
+ Chat: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CHAT,
742
+ CreateAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT,
743
+ Embeddings: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS,
744
+ ExecuteTool: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL,
745
+ GenerateContent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT,
746
+ InvokeAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT
747
+ }
748
+ },
749
+ Output: {
750
+ Messages: import_incubating.ATTR_GEN_AI_OUTPUT_MESSAGES,
751
+ Type: import_incubating.ATTR_GEN_AI_OUTPUT_TYPE,
752
+ Type_Values: {
753
+ Text: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_TEXT,
754
+ Json: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_JSON,
755
+ Image: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_IMAGE,
756
+ Speech: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_SPEECH
757
+ }
758
+ },
759
+ /**
760
+ * The provider that is hosting the model, eg AWS Bedrock
761
+ * There doesn't seem to be a semconv for this
762
+ */
763
+ Request: {
764
+ ChoiceCount: import_incubating.ATTR_GEN_AI_REQUEST_CHOICE_COUNT,
765
+ // not yet used by axiom-ai
766
+ EncodingFormats: import_incubating.ATTR_GEN_AI_REQUEST_ENCODING_FORMATS,
767
+ // not yet used by axiom-ai
768
+ FrequencyPenalty: import_incubating.ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY,
769
+ MaxTokens: import_incubating.ATTR_GEN_AI_REQUEST_MAX_TOKENS,
770
+ /**
771
+ * The model you asked for
772
+ */
773
+ Model: import_incubating.ATTR_GEN_AI_REQUEST_MODEL,
774
+ PresencePenalty: import_incubating.ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY,
775
+ Seed: import_incubating.ATTR_GEN_AI_REQUEST_SEED,
776
+ StopSequences: import_incubating.ATTR_GEN_AI_REQUEST_STOP_SEQUENCES,
777
+ Temperature: import_incubating.ATTR_GEN_AI_REQUEST_TEMPERATURE,
778
+ TopK: import_incubating.ATTR_GEN_AI_REQUEST_TOP_K,
779
+ TopP: import_incubating.ATTR_GEN_AI_REQUEST_TOP_P
780
+ },
781
+ Response: {
782
+ FinishReasons: import_incubating.ATTR_GEN_AI_RESPONSE_FINISH_REASONS,
783
+ ID: import_incubating.ATTR_GEN_AI_RESPONSE_ID,
784
+ /**
785
+ * The model that was actually used (might be different bc routing) - only ever get this from the response, otherwise omit
786
+ */
787
+ Model: import_incubating.ATTR_GEN_AI_RESPONSE_MODEL
788
+ // somehow not landing on the span for google models? check up on this...
789
+ },
790
+ Tool: {
791
+ CallID: import_incubating.ATTR_GEN_AI_TOOL_CALL_ID,
792
+ Description: import_incubating.ATTR_GEN_AI_TOOL_DESCRIPTION,
793
+ Name: import_incubating.ATTR_GEN_AI_TOOL_NAME,
794
+ Type: import_incubating.ATTR_GEN_AI_TOOL_TYPE,
795
+ /**
796
+ * Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
797
+ * But we at least want to give users THE OPTION to put them on the tool spans themselves as well
798
+ * Because it enables a lot of things with querying
799
+ * @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
800
+ */
801
+ Arguments: ATTR_GEN_AI_TOOL_ARGUMENTS,
802
+ /**
803
+ * Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
804
+ * But we at least want to give users THE OPTION to put them on the tool spans themselves as well
805
+ * Because it enables a lot of things with querying
806
+ * @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
807
+ */
808
+ Message: ATTR_GEN_AI_TOOL_MESSAGE
809
+ },
810
+ Usage: {
811
+ InputTokens: import_incubating.ATTR_GEN_AI_USAGE_INPUT_TOKENS,
812
+ OutputTokens: import_incubating.ATTR_GEN_AI_USAGE_OUTPUT_TOKENS
813
+ }
814
+ },
815
+ Eval: {
816
+ ID: ATTR_EVAL_ID,
817
+ Name: ATTR_EVAL_NAME,
818
+ Version: ATTR_EVAL_VERSION,
819
+ Type: ATTR_EVAL_TYPE,
820
+ Baseline: {
821
+ ID: ATTR_EVAL_BASELINE_ID,
822
+ Name: ATTR_EVAL_BASELINE_NAME
823
+ },
824
+ Tags: ATTR_EVAL_TAGS,
825
+ Metadata: ATTR_EVAL_METADATA,
826
+ Collection: {
827
+ ID: ATTR_EVAL_COLLECTION_ID,
828
+ Name: ATTR_EVAL_COLLECTION_NAME,
829
+ Size: ATTR_EVAL_COLLECTION_SIZE
830
+ },
831
+ Config: {
832
+ Flags: ATTR_EVAL_CONFIG_FLAGS
833
+ },
834
+ Run: {
835
+ ID: ATTR_EVAL_RUN_ID
836
+ },
837
+ Case: {
838
+ Index: ATTR_EVAL_CASE_INDEX,
839
+ Input: ATTR_EVAL_CASE_INPUT,
840
+ Output: ATTR_EVAL_CASE_OUTPUT,
841
+ Expected: ATTR_EVAL_CASE_EXPECTED,
842
+ Scores: ATTR_EVAL_CASE_SCORES,
843
+ Metadata: ATTR_EVAL_CASE_METADATA
844
+ },
845
+ Task: {
846
+ Output: ATTR_EVAL_TASK_OUTPUT,
847
+ Name: ATTR_EVAL_TASK_NAME,
848
+ Type: ATTR_EVAL_TASK_TYPE
849
+ },
850
+ Score: {
851
+ Name: ATTR_EVAL_SCORE_NAME,
852
+ Value: ATTR_EVAL_SCORE_VALUE,
853
+ Threshold: ATTR_EVAL_SCORE_THRESHOLD,
854
+ Passed: ATTR_EVAL_SCORE_PASSED,
855
+ Metadata: ATTR_EVAL_SCORE_METADATA
856
+ },
857
+ User: {
858
+ Name: ATTR_EVAL_USER_NAME,
859
+ Email: ATTR_EVAL_USER_EMAIL
860
+ }
861
+ },
862
+ Error: {
863
+ Type: import_semantic_conventions.ATTR_ERROR_TYPE,
864
+ Message: import_incubating.ATTR_ERROR_MESSAGE
865
+ },
866
+ HTTP: {
867
+ Response: {
868
+ StatusCode: import_semantic_conventions.ATTR_HTTP_RESPONSE_STATUS_CODE
869
+ }
870
+ }
871
+ };
597
872
 
598
873
  // src/otel/startActiveSpan.ts
599
874
  var import_api2 = require("@opentelemetry/api");
@@ -604,7 +879,7 @@ var import_api4 = require("@opentelemetry/api");
604
879
  // package.json
605
880
  var package_default = {
606
881
  name: "axiom",
607
- version: "0.23.0",
882
+ version: "0.25.0",
608
883
  type: "module",
609
884
  author: "Axiom, Inc.",
610
885
  contributors: [
@@ -826,47 +1101,40 @@ function resolveAxiomConnection(config) {
826
1101
 
827
1102
  // src/evals/eval.service.ts
828
1103
  var findEvaluationCases = async (evalId, config) => {
829
- try {
830
- const { dataset, url, token } = resolveAxiomConnection(config);
831
- const apl = `['${dataset}'] | where trace_id == "${evalId}" | order by _time`;
832
- const headers = new Headers({
833
- Authorization: `Bearer ${token}`,
834
- "Content-Type": "application/json"
835
- });
836
- const resp = await fetch(`${url}/v1/datasets/_apl?format=legacy`, {
837
- headers,
838
- method: "POST",
839
- body: JSON.stringify({ apl })
840
- });
841
- const payload = await resp.json();
842
- if (!resp.ok) {
843
- console.log(payload);
844
- return void 0;
845
- }
846
- if (payload.matches.length) {
847
- return buildSpanTree(payload.matches);
848
- }
849
- } catch (err) {
850
- console.log(err);
851
- return void 0;
1104
+ const { dataset, url, token } = resolveAxiomConnection(config);
1105
+ const apl = `['${dataset}'] | where trace_id == "${evalId}" | order by _time`;
1106
+ const headers = new Headers({
1107
+ Authorization: `Bearer ${token}`,
1108
+ "Content-Type": "application/json"
1109
+ });
1110
+ const resp = await fetch(`${url}/v1/datasets/_apl?format=legacy`, {
1111
+ headers,
1112
+ method: "POST",
1113
+ body: JSON.stringify({ apl })
1114
+ });
1115
+ const payload = await resp.json();
1116
+ if (!resp.ok) {
1117
+ throw new Error(`Failed to query evaluation cases: ${payload.message || resp.statusText}`);
852
1118
  }
1119
+ return payload.matches.length ? buildSpanTree(payload.matches) : null;
853
1120
  };
854
1121
  var mapSpanToEval = (span) => {
855
- const flagConfigRaw = span.data.attributes["eval.config.flags"] ?? span.data.attributes.custom["eval.config.flags"];
1122
+ const flagConfigRaw = span.data.attributes[Attr.Eval.Config.Flags] ?? span.data.attributes.custom[Attr.Eval.Config.Flags];
856
1123
  return {
857
- id: span.data.attributes.custom["eval.id"],
858
- name: span.data.attributes.custom["eval.name"],
859
- type: span.data.attributes.custom["eval.type"],
860
- version: span.data.attributes.custom["eval.version"],
1124
+ id: span.data.attributes.custom[Attr.Eval.ID],
1125
+ name: span.data.attributes.custom[Attr.Eval.Name],
1126
+ type: span.data.attributes.custom[Attr.Eval.Type],
1127
+ version: span.data.attributes.custom[Attr.Eval.Version],
861
1128
  collection: {
862
- name: span.data.attributes.custom["eval.collection.name"],
863
- size: span.data.attributes.custom["eval.collection.size"]
1129
+ name: span.data.attributes.custom[Attr.Eval.Collection.Name],
1130
+ size: span.data.attributes.custom[Attr.Eval.Collection.Size]
864
1131
  },
865
1132
  baseline: {
866
- id: span.data.attributes.custom["eval.baseline.id"],
867
- name: span.data.attributes.custom["eval.baseline.name"]
1133
+ id: span.data.attributes.custom[Attr.Eval.Baseline.ID],
1134
+ name: span.data.attributes.custom[Attr.Eval.Baseline.Name]
868
1135
  },
869
1136
  prompt: {
1137
+ // TODO: do we still want this?
870
1138
  model: span.data.attributes.custom["eval.prompt.model"],
871
1139
  params: span.data.attributes.custom["eval.prompt.params"]
872
1140
  },
@@ -874,10 +1142,10 @@ var mapSpanToEval = (span) => {
874
1142
  status: span.data.status.code,
875
1143
  traceId: span.data.trace_id,
876
1144
  runAt: span._time,
877
- tags: span.data.attributes.custom["eval.tags"].length ? JSON.parse(span.data.attributes.custom["eval.tags"]) : [],
1145
+ tags: span.data.attributes.custom[Attr.Eval.Tags].length ? JSON.parse(span.data.attributes.custom[Attr.Eval.Tags]) : [],
878
1146
  user: {
879
- name: span.data.attributes.custom["eval.user.name"],
880
- email: span.data.attributes.custom["eval.user.email"]
1147
+ name: span.data.attributes.custom[Attr.Eval.User.Name],
1148
+ email: span.data.attributes.custom[Attr.Eval.User.Email]
881
1149
  },
882
1150
  cases: [],
883
1151
  flagConfig: flagConfigRaw ? JSON.parse(flagConfigRaw) : void 0
@@ -892,19 +1160,17 @@ var mapSpanToCase = (item) => {
892
1160
  } else {
893
1161
  duration = d;
894
1162
  }
895
- const runtimeFlagsRaw = data.attributes.custom["eval.case.config.runtime_flags"];
896
1163
  return {
897
- index: data.attributes.custom["eval.case.index"],
898
- input: data.attributes.custom["eval.case.input"],
899
- output: data.attributes.custom["eval.case.output"],
900
- expected: data.attributes.custom["eval.case.expected"],
1164
+ index: data.attributes.custom[Attr.Eval.Case.Index],
1165
+ input: data.attributes.custom[Attr.Eval.Case.Input],
1166
+ output: data.attributes.custom[Attr.Eval.Case.Output],
1167
+ expected: data.attributes.custom[Attr.Eval.Case.Expected],
901
1168
  duration,
902
1169
  status: data.status.code,
903
- scores: data.attributes.custom["eval.case.scores"] ? JSON.parse(data.attributes.custom["eval.case.scores"]) : {},
1170
+ scores: data.attributes.custom[Attr.Eval.Case.Scores] ? JSON.parse(data.attributes.custom[Attr.Eval.Case.Scores]) : {},
904
1171
  runAt: item._time,
905
1172
  spanId: data.span_id,
906
- traceId: data.trace_id,
907
- runtimeFlags: runtimeFlagsRaw ? JSON.parse(runtimeFlagsRaw) : void 0
1173
+ traceId: data.trace_id
908
1174
  };
909
1175
  };
910
1176
  var buildSpanTree = (spans) => {
@@ -966,10 +1232,10 @@ var buildSpanTree = (spans) => {
966
1232
  );
967
1233
  caseData.scores = {};
968
1234
  scoreSpans.forEach((score) => {
969
- const name = score.data.attributes.custom["eval.score.name"];
1235
+ const name = score.data.attributes.custom[Attr.Eval.Score.Name];
970
1236
  caseData.scores[name] = {
971
1237
  name,
972
- value: score.data.attributes.custom["eval.score.value"],
1238
+ value: score.data.attributes.custom[Attr.Eval.Score.Value],
973
1239
  metadata: {
974
1240
  error: score.data.attributes.error
975
1241
  }
@@ -1394,7 +1660,11 @@ function calculateFlagDiff(suite) {
1394
1660
  }
1395
1661
  return diffs;
1396
1662
  }
1397
- function printFinalReport({ suiteData }) {
1663
+ function printFinalReport({
1664
+ suiteData,
1665
+ config,
1666
+ registrationStatus
1667
+ }) {
1398
1668
  console.log("");
1399
1669
  console.log(u.bgBlue(u.white(" FINAL EVALUATION REPORT ")));
1400
1670
  console.log("");
@@ -1404,8 +1674,28 @@ function printFinalReport({ suiteData }) {
1404
1674
  printSuiteBox({ suite, scorerAverages, calculateBaselineScorerAverage, flagDiff });
1405
1675
  console.log("");
1406
1676
  }
1407
- console.log("View full report:");
1408
- console.log("https://app.axiom.co/evaluations/run/<run-id>");
1677
+ const runId = suiteData[0]?.runId;
1678
+ const orgId = suiteData[0]?.orgId;
1679
+ const anyRegistered = registrationStatus.some((s2) => s2.registered);
1680
+ const anyFailed = registrationStatus.some((s2) => !s2.registered);
1681
+ if (anyRegistered && orgId && config?.consoleEndpointUrl) {
1682
+ console.log("View full report:");
1683
+ console.log(`${config.consoleEndpointUrl}/${orgId}/ai-engineering/evaluations?runId=${runId}`);
1684
+ } else {
1685
+ console.log("Results not available in Axiom UI (registration failed)");
1686
+ }
1687
+ if (anyFailed) {
1688
+ console.log("");
1689
+ for (const status of registrationStatus) {
1690
+ if (!status.registered) {
1691
+ console.log(u.yellow(`\u26A0\uFE0F Warning: Failed to register "${status.name}" with Axiom`));
1692
+ if (status.error) {
1693
+ console.log(u.dim(` Error: ${status.error}`));
1694
+ }
1695
+ console.log(u.dim(` Results for this evaluation will not be available in the Axiom UI.`));
1696
+ }
1697
+ }
1698
+ }
1409
1699
  }
1410
1700
 
1411
1701
  // src/cli/errors.ts
@@ -1434,14 +1724,19 @@ var AxiomReporter = class {
1434
1724
  __publicField(this, "_suiteData", []);
1435
1725
  __publicField(this, "_baselines", /* @__PURE__ */ new Map());
1436
1726
  __publicField(this, "_printedFlagOverrides", false);
1727
+ __publicField(this, "_config");
1437
1728
  }
1438
1729
  onTestRunStart() {
1439
1730
  this.start = performance.now();
1440
1731
  this.startTime = (/* @__PURE__ */ new Date()).getTime();
1732
+ const config = getAxiomConfig();
1733
+ if (config) {
1734
+ this._config = resolveAxiomConnection(config);
1735
+ }
1441
1736
  }
1442
1737
  async onTestSuiteReady(_testSuite) {
1443
1738
  const meta = _testSuite.meta();
1444
- if (_testSuite.state() === "skipped") {
1739
+ if (_testSuite.state() === "skipped" || !meta?.evaluation) {
1445
1740
  return;
1446
1741
  }
1447
1742
  if (!this._printedFlagOverrides) {
@@ -1473,7 +1768,7 @@ var AxiomReporter = class {
1473
1768
  }
1474
1769
  async onTestSuiteResult(testSuite) {
1475
1770
  const meta = testSuite.meta();
1476
- if (testSuite.state() === "skipped") {
1771
+ if (testSuite.state() === "skipped" || !meta?.evaluation) {
1477
1772
  return;
1478
1773
  }
1479
1774
  const durationSeconds = Number((performance.now() - this.start) / 1e3).toFixed(2);
@@ -1509,8 +1804,11 @@ var AxiomReporter = class {
1509
1804
  baseline: suiteBaseline || null,
1510
1805
  configFlags: meta.evaluation.configFlags,
1511
1806
  flagConfig: meta.evaluation.flagConfig,
1807
+ runId: meta.evaluation.runId,
1808
+ orgId: meta.evaluation.orgId,
1512
1809
  cases,
1513
- outOfScopeFlags: meta.evaluation.outOfScopeFlags
1810
+ outOfScopeFlags: meta.evaluation.outOfScopeFlags,
1811
+ registrationStatus: meta.evaluation.registrationStatus
1514
1812
  });
1515
1813
  printEvalNameAndFileName(testSuite, meta);
1516
1814
  printBaselineNameAndVersion(meta);
@@ -1526,8 +1824,15 @@ var AxiomReporter = class {
1526
1824
  if (shouldClear) {
1527
1825
  process.stdout.write("\x1B[2J\x1B[0f");
1528
1826
  }
1827
+ const registrationStatus = this._suiteData.map((suite) => ({
1828
+ name: suite.name,
1829
+ registered: suite.registrationStatus?.status === "success",
1830
+ error: suite.registrationStatus?.status === "failed" ? suite.registrationStatus.error : void 0
1831
+ }));
1529
1832
  printFinalReport({
1530
- suiteData: this._suiteData
1833
+ suiteData: this._suiteData,
1834
+ config: this._config,
1835
+ registrationStatus
1531
1836
  });
1532
1837
  const DEBUG = process.env.AXIOM_DEBUG === "true";
1533
1838
  if (DEBUG && this._endOfRunConfigEnd) {
@@ -1693,11 +1998,11 @@ function setupEvalProvider(connection) {
1693
1998
  axiomProvider = new import_sdk_trace_node.NodeTracerProvider({
1694
1999
  resource: (0, import_resources.resourceFromAttributes)({
1695
2000
  ["service.name"]: "axiom",
1696
- ["service.version"]: "0.23.0"
2001
+ ["service.version"]: "0.25.0"
1697
2002
  }),
1698
2003
  spanProcessors: [processor]
1699
2004
  });
1700
- axiomTracer = axiomProvider.getTracer("axiom", "0.23.0");
2005
+ axiomTracer = axiomProvider.getTracer("axiom", "0.25.0");
1701
2006
  }
1702
2007
  async function initInstrumentation(config) {
1703
2008
  if (initialized) {
@@ -1709,7 +2014,7 @@ async function initInstrumentation(config) {
1709
2014
  }
1710
2015
  initializationPromise = (async () => {
1711
2016
  if (!config.enabled) {
1712
- axiomTracer = import_api10.trace.getTracer("axiom", "0.23.0");
2017
+ axiomTracer = import_api10.trace.getTracer("axiom", "0.25.0");
1713
2018
  initialized = true;
1714
2019
  return;
1715
2020
  }
@@ -1772,10 +2077,32 @@ var flush = async () => {
1772
2077
  };
1773
2078
 
1774
2079
  // src/evals/run-vitest.ts
2080
+ var printCollectedEvals = (result, rootDir) => {
2081
+ if (!result.testModules || result.testModules.length === 0) {
2082
+ console.log(u.yellow("\nNo evaluations found\n"));
2083
+ return;
2084
+ }
2085
+ console.log(u.bold("\nFound evaluations:\n"));
2086
+ let totalEvals = 0;
2087
+ let totalCases = 0;
2088
+ for (const module2 of result.testModules) {
2089
+ const relativePath = import_node_path3.default.relative(rootDir, module2.moduleId);
2090
+ for (const suite of module2.children.suites()) {
2091
+ totalEvals++;
2092
+ const caseCount = suite.children.size;
2093
+ totalCases += caseCount;
2094
+ console.log(u.green(`\u2713 ${suite.name} (${caseCount} cases)`));
2095
+ console.log(u.dim(` ${relativePath}`));
2096
+ console.log("");
2097
+ }
2098
+ }
2099
+ console.log(u.bold(`Total: ${totalEvals} evaluations, ${totalCases} test cases
2100
+ `));
2101
+ };
1775
2102
  var runVitest = async (dir, opts) => {
1776
2103
  setAxiomConfig(opts.config);
1777
2104
  await initInstrumentation({
1778
- enabled: !opts.debug,
2105
+ enabled: !opts.debug && !opts.list,
1779
2106
  config: opts.config
1780
2107
  });
1781
2108
  const providedConfig = {
@@ -1789,6 +2116,9 @@ var runVitest = async (dir, opts) => {
1789
2116
  if (opts.debug) {
1790
2117
  console.log(u.bgWhite(u.blackBright(" Debug mode enabled ")));
1791
2118
  }
2119
+ if (opts.list) {
2120
+ console.log(u.bgWhite(u.blackBright(" List mode ")));
2121
+ }
1792
2122
  const vi = await (0, import_node.createVitest)("test", {
1793
2123
  root: dir ? dir : process.cwd(),
1794
2124
  mode: "test",
@@ -1808,10 +2138,18 @@ var runVitest = async (dir, opts) => {
1808
2138
  provide: {
1809
2139
  baseline: opts.baseline,
1810
2140
  debug: opts.debug,
2141
+ list: opts.list,
1811
2142
  overrides: opts.overrides,
1812
- axiomConfig: providedConfig
2143
+ axiomConfig: providedConfig,
2144
+ runId: opts.runId
1813
2145
  }
1814
2146
  });
2147
+ if (opts.list) {
2148
+ const result = await vi.collect();
2149
+ printCollectedEvals(result, dir || process.cwd());
2150
+ await vi.close();
2151
+ process.exit(0);
2152
+ }
1815
2153
  await vi.start();
1816
2154
  const dispose = (0, import_node.registerConsoleShortcuts)(vi, process.stdin, process.stdout);
1817
2155
  if (!vi.shouldKeepServer()) {
@@ -1868,6 +2206,7 @@ function isGlob(str) {
1868
2206
  }
1869
2207
 
1870
2208
  // src/cli/commands/eval.command.ts
2209
+ var createRunId = (0, import_nanoid.customAlphabet)("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", 10);
1871
2210
  var loadEvalCommand = (program2, flagOverrides = {}) => {
1872
2211
  return program2.addCommand(
1873
2212
  new import_commander3.Command("eval").description("run evals locally").addArgument(
@@ -1875,7 +2214,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
1875
2214
  ".",
1876
2215
  "any *.eval.ts file in current directory"
1877
2216
  )
1878
- ).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).action(async (target, options) => {
2217
+ ).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).option("--list", "list evaluations and test cases without running them", false).action(async (target, options) => {
1879
2218
  try {
1880
2219
  if (options.debug) {
1881
2220
  process.env.AXIOM_DEBUG = "true";
@@ -1909,6 +2248,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
1909
2248
  );
1910
2249
  console.log("");
1911
2250
  }
2251
+ const runId = createRunId();
1912
2252
  await runEvalWithContext(flagOverrides, async () => {
1913
2253
  return runVitest(".", {
1914
2254
  watch: options.watch,
@@ -1917,8 +2257,10 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
1917
2257
  exclude,
1918
2258
  testNamePattern,
1919
2259
  debug: options.debug,
2260
+ list: options.list,
1920
2261
  overrides: flagOverrides,
1921
- config
2262
+ config,
2263
+ runId
1922
2264
  });
1923
2265
  });
1924
2266
  } catch (error) {
@@ -1937,7 +2279,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
1937
2279
  // src/cli/utils/parse-flag-overrides.ts
1938
2280
  var import_zod5 = require("zod");
1939
2281
  var import_node_fs2 = require("fs");
1940
- var import_node_path3 = require("path");
2282
+ var import_node_path4 = require("path");
1941
2283
  var FLAG_RE = /^--flag\.([^=]+)(?:=(.*))?$/;
1942
2284
  var CONFIG_RE = /^--flags-config(?:=(.*))?$/;
1943
2285
  function ensureNoSpaceSeparatedSyntax(flagName, value, nextToken, flagType) {
@@ -1966,8 +2308,8 @@ function coerceValue(raw) {
1966
2308
  return raw;
1967
2309
  }
1968
2310
  }
1969
- function loadConfigFile(path3) {
1970
- const abs = (0, import_node_path3.resolve)(process.cwd(), path3);
2311
+ function loadConfigFile(path4) {
2312
+ const abs = (0, import_node_path4.resolve)(process.cwd(), path4);
1971
2313
  try {
1972
2314
  const contents = (0, import_node_fs2.readFileSync)(abs, "utf8");
1973
2315
  const parsed = JSON.parse(contents);
@@ -1979,7 +2321,7 @@ function loadConfigFile(path3) {
1979
2321
  }
1980
2322
  return parsed;
1981
2323
  } catch (err) {
1982
- console.error(`\u274C Could not read or parse flags config "${path3}": ${err.message}`);
2324
+ console.error(`\u274C Could not read or parse flags config "${path4}": ${err.message}`);
1983
2325
  process.exit(1);
1984
2326
  }
1985
2327
  }
@@ -2042,7 +2384,7 @@ var import_commander4 = require("commander");
2042
2384
  var loadVersionCommand = (program2) => {
2043
2385
  return program2.addCommand(
2044
2386
  new import_commander4.Command("version").description("cli version").action(() => {
2045
- console.log("0.23.0");
2387
+ console.log("0.25.0");
2046
2388
  })
2047
2389
  );
2048
2390
  };
@@ -2052,7 +2394,7 @@ var { loadEnvConfig } = import_env.default;
2052
2394
  loadEnvConfig(process.cwd());
2053
2395
  var { cleanedArgv, overrides } = extractOverrides(process.argv.slice(2));
2054
2396
  var program = new import_commander5.Command();
2055
- program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.23.0");
2397
+ program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.25.0");
2056
2398
  loadPushCommand(program);
2057
2399
  loadPullCommand(program);
2058
2400
  loadEvalCommand(program, overrides);