axiom 0.24.0 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin.cjs +340 -34
- package/dist/bin.cjs.map +1 -1
- package/dist/bin.js +43 -9
- package/dist/bin.js.map +1 -1
- package/dist/{chunk-6E6HEZTE.js → chunk-BSZFDG3O.js} +12 -5
- package/dist/chunk-BSZFDG3O.js.map +1 -0
- package/dist/{chunk-CW7MNTNT.js → chunk-JGAXOVPZ.js} +28 -28
- package/dist/chunk-JGAXOVPZ.js.map +1 -0
- package/dist/evals.cjs +45 -37
- package/dist/evals.cjs.map +1 -1
- package/dist/evals.d.cts +1 -0
- package/dist/evals.d.ts +1 -0
- package/dist/evals.js +10 -9
- package/dist/evals.js.map +1 -1
- package/dist/index.cjs +11 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-6E6HEZTE.js.map +0 -1
- package/dist/chunk-CW7MNTNT.js.map +0 -1
package/dist/bin.cjs
CHANGED
|
@@ -477,6 +477,7 @@ var r = process.env.FORCE_TTY !== void 0 || (0, import_tty.isatty)(1);
|
|
|
477
477
|
var u = p(r);
|
|
478
478
|
|
|
479
479
|
// src/evals/run-vitest.ts
|
|
480
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
480
481
|
var import_node = require("vitest/node");
|
|
481
482
|
|
|
482
483
|
// src/evals/context/storage.ts
|
|
@@ -594,7 +595,280 @@ var import_api5 = require("@opentelemetry/api");
|
|
|
594
595
|
|
|
595
596
|
// src/otel/semconv/attributes.ts
|
|
596
597
|
var import_semantic_conventions = require("@opentelemetry/semantic-conventions");
|
|
598
|
+
|
|
599
|
+
// src/otel/semconv/eval_proposal.ts
|
|
600
|
+
var ATTR_EVAL_ID = "eval.id";
|
|
601
|
+
var ATTR_EVAL_NAME = "eval.name";
|
|
602
|
+
var ATTR_EVAL_VERSION = "eval.version";
|
|
603
|
+
var ATTR_EVAL_TYPE = "eval.type";
|
|
604
|
+
var ATTR_EVAL_TAGS = "eval.tags";
|
|
605
|
+
var ATTR_EVAL_BASELINE_ID = "eval.baseline.id";
|
|
606
|
+
var ATTR_EVAL_BASELINE_NAME = "eval.baseline.name";
|
|
607
|
+
var ATTR_EVAL_METADATA = "eval.metadata";
|
|
608
|
+
var ATTR_EVAL_COLLECTION_ID = "eval.collection.id";
|
|
609
|
+
var ATTR_EVAL_COLLECTION_SIZE = "eval.collection.size";
|
|
610
|
+
var ATTR_EVAL_COLLECTION_NAME = "eval.collection.name";
|
|
611
|
+
var ATTR_EVAL_CONFIG_FLAGS = "eval.config.flags";
|
|
612
|
+
var ATTR_EVAL_CASE_INDEX = "eval.case.index";
|
|
613
|
+
var ATTR_EVAL_CASE_INPUT = "eval.case.input";
|
|
614
|
+
var ATTR_EVAL_CASE_OUTPUT = "eval.case.output";
|
|
615
|
+
var ATTR_EVAL_CASE_EXPECTED = "eval.case.expected";
|
|
616
|
+
var ATTR_EVAL_CASE_SCORES = "eval.case.scores";
|
|
617
|
+
var ATTR_EVAL_CASE_METADATA = "eval.case.metadata";
|
|
618
|
+
var ATTR_EVAL_TASK_OUTPUT = "eval.task.output";
|
|
619
|
+
var ATTR_EVAL_TASK_NAME = "eval.task.name";
|
|
620
|
+
var ATTR_EVAL_TASK_TYPE = "eval.task.type";
|
|
621
|
+
var ATTR_EVAL_RUN_ID = "eval.run.id";
|
|
622
|
+
var ATTR_EVAL_SCORE_NAME = "eval.score.name";
|
|
623
|
+
var ATTR_EVAL_SCORE_VALUE = "eval.score.value";
|
|
624
|
+
var ATTR_EVAL_SCORE_THRESHOLD = "eval.score.threshold";
|
|
625
|
+
var ATTR_EVAL_SCORE_PASSED = "eval.score.passed";
|
|
626
|
+
var ATTR_EVAL_SCORE_METADATA = "eval.score.metadata";
|
|
627
|
+
var ATTR_EVAL_USER_NAME = "eval.user.name";
|
|
628
|
+
var ATTR_EVAL_USER_EMAIL = "eval.user.email";
|
|
629
|
+
|
|
630
|
+
// src/otel/semconv/attributes.ts
|
|
597
631
|
var import_incubating = require("@opentelemetry/semantic-conventions/incubating");
|
|
632
|
+
var ATTR_AXIOM_GEN_AI_SCHEMA_URL = "axiom.gen_ai.schema_url";
|
|
633
|
+
var ATTR_AXIOM_GEN_AI_SDK_NAME = "axiom.gen_ai.sdk.name";
|
|
634
|
+
var ATTR_AXIOM_GEN_AI_SDK_VERSION = "axiom.gen_ai.sdk.version";
|
|
635
|
+
var ATTR_GEN_AI_CAPABILITY_NAME = "gen_ai.capability.name";
|
|
636
|
+
var ATTR_GEN_AI_STEP_NAME = "gen_ai.step.name";
|
|
637
|
+
var ATTR_GEN_AI_TOOL_ARGUMENTS = "gen_ai.tool.arguments";
|
|
638
|
+
var ATTR_GEN_AI_TOOL_MESSAGE = "gen_ai.tool.message";
|
|
639
|
+
var GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI = "assemblyai";
|
|
640
|
+
var GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS = "cerebras";
|
|
641
|
+
var GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM = "deepgram";
|
|
642
|
+
var GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA = "deepinfra";
|
|
643
|
+
var GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS = "elevenlabs";
|
|
644
|
+
var GEN_AI_PROVIDER_NAME_VALUE_FAL = "fal";
|
|
645
|
+
var GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS = "fireworks";
|
|
646
|
+
var GEN_AI_PROVIDER_NAME_VALUE_GLADIA = "gladia";
|
|
647
|
+
var GEN_AI_PROVIDER_NAME_VALUE_HUME = "hume";
|
|
648
|
+
var GEN_AI_PROVIDER_NAME_VALUE_LMNT = "lmnt";
|
|
649
|
+
var GEN_AI_PROVIDER_NAME_VALUE_LUMA = "luma";
|
|
650
|
+
var GEN_AI_PROVIDER_NAME_VALUE_REPLICATE = "replicate";
|
|
651
|
+
var GEN_AI_PROVIDER_NAME_VALUE_REVAI = "revai";
|
|
652
|
+
var GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI = "togetherai";
|
|
653
|
+
var GEN_AI_PROVIDER_NAME_VALUE_VERCEL = "vercel";
|
|
654
|
+
var Attr = {
|
|
655
|
+
__EXPERIMENTAL_Flag: (flagName) => `flag.${flagName}`,
|
|
656
|
+
__EXPERIMENTAL_Fact: (factName) => `fact.${factName}`,
|
|
657
|
+
Axiom: {
|
|
658
|
+
GenAI: {
|
|
659
|
+
SchemaURL: ATTR_AXIOM_GEN_AI_SCHEMA_URL,
|
|
660
|
+
SDK: {
|
|
661
|
+
Name: ATTR_AXIOM_GEN_AI_SDK_NAME,
|
|
662
|
+
Version: ATTR_AXIOM_GEN_AI_SDK_VERSION
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
},
|
|
666
|
+
GenAI: {
|
|
667
|
+
PromptMetadata: {
|
|
668
|
+
ID: "axiom.gen_ai.prompt.id",
|
|
669
|
+
Name: "axiom.gen_ai.prompt.name",
|
|
670
|
+
Slug: "axiom.gen_ai.prompt.slug",
|
|
671
|
+
Version: "axiom.gen_ai.prompt.version"
|
|
672
|
+
},
|
|
673
|
+
/**
|
|
674
|
+
* These two are used to identify the span
|
|
675
|
+
*/
|
|
676
|
+
Capability: {
|
|
677
|
+
Name: ATTR_GEN_AI_CAPABILITY_NAME
|
|
678
|
+
},
|
|
679
|
+
Step: {
|
|
680
|
+
Name: ATTR_GEN_AI_STEP_NAME
|
|
681
|
+
},
|
|
682
|
+
Provider: {
|
|
683
|
+
Name: import_incubating.ATTR_GEN_AI_PROVIDER_NAME,
|
|
684
|
+
Name_Values: {
|
|
685
|
+
Anthropic: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_ANTHROPIC,
|
|
686
|
+
AssemblyAI: GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI,
|
|
687
|
+
AWSBedrock: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AWS_BEDROCK,
|
|
688
|
+
AzureAIInference: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_INFERENCE,
|
|
689
|
+
AzureAIOpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_OPENAI,
|
|
690
|
+
Cerebras: GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS,
|
|
691
|
+
Cohere: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_COHERE,
|
|
692
|
+
Deepgram: GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM,
|
|
693
|
+
DeepInfra: GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA,
|
|
694
|
+
Deepseek: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_DEEPSEEK,
|
|
695
|
+
ElevenLabs: GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS,
|
|
696
|
+
Fal: GEN_AI_PROVIDER_NAME_VALUE_FAL,
|
|
697
|
+
Fireworks: GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS,
|
|
698
|
+
GCPGemini: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEMINI,
|
|
699
|
+
GCPGenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEN_AI,
|
|
700
|
+
GCPVertexAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_VERTEX_AI,
|
|
701
|
+
Gladia: GEN_AI_PROVIDER_NAME_VALUE_GLADIA,
|
|
702
|
+
Groq: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GROQ,
|
|
703
|
+
Hume: GEN_AI_PROVIDER_NAME_VALUE_HUME,
|
|
704
|
+
IBMWatsonxAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_IBM_WATSONX_AI,
|
|
705
|
+
Lmnt: GEN_AI_PROVIDER_NAME_VALUE_LMNT,
|
|
706
|
+
Luma: GEN_AI_PROVIDER_NAME_VALUE_LUMA,
|
|
707
|
+
MistralAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_MISTRAL_AI,
|
|
708
|
+
OpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_OPENAI,
|
|
709
|
+
Perplexity: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_PERPLEXITY,
|
|
710
|
+
Replicate: GEN_AI_PROVIDER_NAME_VALUE_REPLICATE,
|
|
711
|
+
RevAI: GEN_AI_PROVIDER_NAME_VALUE_REVAI,
|
|
712
|
+
TogetherAI: GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI,
|
|
713
|
+
Vercel: GEN_AI_PROVIDER_NAME_VALUE_VERCEL,
|
|
714
|
+
XAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_X_AI
|
|
715
|
+
}
|
|
716
|
+
},
|
|
717
|
+
/**
|
|
718
|
+
* Regular attributes
|
|
719
|
+
*/
|
|
720
|
+
Agent: {
|
|
721
|
+
Description: import_incubating.ATTR_GEN_AI_AGENT_DESCRIPTION,
|
|
722
|
+
// not yet used by axiom-ai
|
|
723
|
+
ID: import_incubating.ATTR_GEN_AI_AGENT_ID,
|
|
724
|
+
// not yet used by axiom-ai
|
|
725
|
+
Name: import_incubating.ATTR_GEN_AI_AGENT_NAME
|
|
726
|
+
// not yet used by axiom-ai
|
|
727
|
+
},
|
|
728
|
+
Conversation: {
|
|
729
|
+
ID: import_incubating.ATTR_GEN_AI_CONVERSATION_ID
|
|
730
|
+
// not yet used by axiom-ai, anyway probably needs to be provided by user
|
|
731
|
+
},
|
|
732
|
+
Input: {
|
|
733
|
+
Messages: import_incubating.ATTR_GEN_AI_INPUT_MESSAGES
|
|
734
|
+
},
|
|
735
|
+
Operation: {
|
|
736
|
+
Name: import_incubating.ATTR_GEN_AI_OPERATION_NAME,
|
|
737
|
+
Name_Values: {
|
|
738
|
+
/**
|
|
739
|
+
* Note that "text_completion" is deprecated in favor of "chat" for both OpenAI and Anthropic
|
|
740
|
+
*/
|
|
741
|
+
Chat: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CHAT,
|
|
742
|
+
CreateAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT,
|
|
743
|
+
Embeddings: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS,
|
|
744
|
+
ExecuteTool: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL,
|
|
745
|
+
GenerateContent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT,
|
|
746
|
+
InvokeAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT
|
|
747
|
+
}
|
|
748
|
+
},
|
|
749
|
+
Output: {
|
|
750
|
+
Messages: import_incubating.ATTR_GEN_AI_OUTPUT_MESSAGES,
|
|
751
|
+
Type: import_incubating.ATTR_GEN_AI_OUTPUT_TYPE,
|
|
752
|
+
Type_Values: {
|
|
753
|
+
Text: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_TEXT,
|
|
754
|
+
Json: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_JSON,
|
|
755
|
+
Image: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_IMAGE,
|
|
756
|
+
Speech: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_SPEECH
|
|
757
|
+
}
|
|
758
|
+
},
|
|
759
|
+
/**
|
|
760
|
+
* The provider that is hosting the model, eg AWS Bedrock
|
|
761
|
+
* There doesn't seem to be a semconv for this
|
|
762
|
+
*/
|
|
763
|
+
Request: {
|
|
764
|
+
ChoiceCount: import_incubating.ATTR_GEN_AI_REQUEST_CHOICE_COUNT,
|
|
765
|
+
// not yet used by axiom-ai
|
|
766
|
+
EncodingFormats: import_incubating.ATTR_GEN_AI_REQUEST_ENCODING_FORMATS,
|
|
767
|
+
// not yet used by axiom-ai
|
|
768
|
+
FrequencyPenalty: import_incubating.ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY,
|
|
769
|
+
MaxTokens: import_incubating.ATTR_GEN_AI_REQUEST_MAX_TOKENS,
|
|
770
|
+
/**
|
|
771
|
+
* The model you asked for
|
|
772
|
+
*/
|
|
773
|
+
Model: import_incubating.ATTR_GEN_AI_REQUEST_MODEL,
|
|
774
|
+
PresencePenalty: import_incubating.ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY,
|
|
775
|
+
Seed: import_incubating.ATTR_GEN_AI_REQUEST_SEED,
|
|
776
|
+
StopSequences: import_incubating.ATTR_GEN_AI_REQUEST_STOP_SEQUENCES,
|
|
777
|
+
Temperature: import_incubating.ATTR_GEN_AI_REQUEST_TEMPERATURE,
|
|
778
|
+
TopK: import_incubating.ATTR_GEN_AI_REQUEST_TOP_K,
|
|
779
|
+
TopP: import_incubating.ATTR_GEN_AI_REQUEST_TOP_P
|
|
780
|
+
},
|
|
781
|
+
Response: {
|
|
782
|
+
FinishReasons: import_incubating.ATTR_GEN_AI_RESPONSE_FINISH_REASONS,
|
|
783
|
+
ID: import_incubating.ATTR_GEN_AI_RESPONSE_ID,
|
|
784
|
+
/**
|
|
785
|
+
* The model that was actually used (might be different bc routing) - only ever get this from the response, otherwise omit
|
|
786
|
+
*/
|
|
787
|
+
Model: import_incubating.ATTR_GEN_AI_RESPONSE_MODEL
|
|
788
|
+
// somehow not landing on the span for google models? check up on this...
|
|
789
|
+
},
|
|
790
|
+
Tool: {
|
|
791
|
+
CallID: import_incubating.ATTR_GEN_AI_TOOL_CALL_ID,
|
|
792
|
+
Description: import_incubating.ATTR_GEN_AI_TOOL_DESCRIPTION,
|
|
793
|
+
Name: import_incubating.ATTR_GEN_AI_TOOL_NAME,
|
|
794
|
+
Type: import_incubating.ATTR_GEN_AI_TOOL_TYPE,
|
|
795
|
+
/**
|
|
796
|
+
* Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
|
|
797
|
+
* But we at least want to give users THE OPTION to put them on the tool spans themselves as well
|
|
798
|
+
* Because it enables a lot of things with querying
|
|
799
|
+
* @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
|
|
800
|
+
*/
|
|
801
|
+
Arguments: ATTR_GEN_AI_TOOL_ARGUMENTS,
|
|
802
|
+
/**
|
|
803
|
+
* Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
|
|
804
|
+
* But we at least want to give users THE OPTION to put them on the tool spans themselves as well
|
|
805
|
+
* Because it enables a lot of things with querying
|
|
806
|
+
* @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
|
|
807
|
+
*/
|
|
808
|
+
Message: ATTR_GEN_AI_TOOL_MESSAGE
|
|
809
|
+
},
|
|
810
|
+
Usage: {
|
|
811
|
+
InputTokens: import_incubating.ATTR_GEN_AI_USAGE_INPUT_TOKENS,
|
|
812
|
+
OutputTokens: import_incubating.ATTR_GEN_AI_USAGE_OUTPUT_TOKENS
|
|
813
|
+
}
|
|
814
|
+
},
|
|
815
|
+
Eval: {
|
|
816
|
+
ID: ATTR_EVAL_ID,
|
|
817
|
+
Name: ATTR_EVAL_NAME,
|
|
818
|
+
Version: ATTR_EVAL_VERSION,
|
|
819
|
+
Type: ATTR_EVAL_TYPE,
|
|
820
|
+
Baseline: {
|
|
821
|
+
ID: ATTR_EVAL_BASELINE_ID,
|
|
822
|
+
Name: ATTR_EVAL_BASELINE_NAME
|
|
823
|
+
},
|
|
824
|
+
Tags: ATTR_EVAL_TAGS,
|
|
825
|
+
Metadata: ATTR_EVAL_METADATA,
|
|
826
|
+
Collection: {
|
|
827
|
+
ID: ATTR_EVAL_COLLECTION_ID,
|
|
828
|
+
Name: ATTR_EVAL_COLLECTION_NAME,
|
|
829
|
+
Size: ATTR_EVAL_COLLECTION_SIZE
|
|
830
|
+
},
|
|
831
|
+
Config: {
|
|
832
|
+
Flags: ATTR_EVAL_CONFIG_FLAGS
|
|
833
|
+
},
|
|
834
|
+
Run: {
|
|
835
|
+
ID: ATTR_EVAL_RUN_ID
|
|
836
|
+
},
|
|
837
|
+
Case: {
|
|
838
|
+
Index: ATTR_EVAL_CASE_INDEX,
|
|
839
|
+
Input: ATTR_EVAL_CASE_INPUT,
|
|
840
|
+
Output: ATTR_EVAL_CASE_OUTPUT,
|
|
841
|
+
Expected: ATTR_EVAL_CASE_EXPECTED,
|
|
842
|
+
Scores: ATTR_EVAL_CASE_SCORES,
|
|
843
|
+
Metadata: ATTR_EVAL_CASE_METADATA
|
|
844
|
+
},
|
|
845
|
+
Task: {
|
|
846
|
+
Output: ATTR_EVAL_TASK_OUTPUT,
|
|
847
|
+
Name: ATTR_EVAL_TASK_NAME,
|
|
848
|
+
Type: ATTR_EVAL_TASK_TYPE
|
|
849
|
+
},
|
|
850
|
+
Score: {
|
|
851
|
+
Name: ATTR_EVAL_SCORE_NAME,
|
|
852
|
+
Value: ATTR_EVAL_SCORE_VALUE,
|
|
853
|
+
Threshold: ATTR_EVAL_SCORE_THRESHOLD,
|
|
854
|
+
Passed: ATTR_EVAL_SCORE_PASSED,
|
|
855
|
+
Metadata: ATTR_EVAL_SCORE_METADATA
|
|
856
|
+
},
|
|
857
|
+
User: {
|
|
858
|
+
Name: ATTR_EVAL_USER_NAME,
|
|
859
|
+
Email: ATTR_EVAL_USER_EMAIL
|
|
860
|
+
}
|
|
861
|
+
},
|
|
862
|
+
Error: {
|
|
863
|
+
Type: import_semantic_conventions.ATTR_ERROR_TYPE,
|
|
864
|
+
Message: import_incubating.ATTR_ERROR_MESSAGE
|
|
865
|
+
},
|
|
866
|
+
HTTP: {
|
|
867
|
+
Response: {
|
|
868
|
+
StatusCode: import_semantic_conventions.ATTR_HTTP_RESPONSE_STATUS_CODE
|
|
869
|
+
}
|
|
870
|
+
}
|
|
871
|
+
};
|
|
598
872
|
|
|
599
873
|
// src/otel/startActiveSpan.ts
|
|
600
874
|
var import_api2 = require("@opentelemetry/api");
|
|
@@ -605,7 +879,7 @@ var import_api4 = require("@opentelemetry/api");
|
|
|
605
879
|
// package.json
|
|
606
880
|
var package_default = {
|
|
607
881
|
name: "axiom",
|
|
608
|
-
version: "0.24.0",
|
|
882
|
+
version: "0.25.0",
|
|
609
883
|
type: "module",
|
|
610
884
|
author: "Axiom, Inc.",
|
|
611
885
|
contributors: [
|
|
@@ -845,21 +1119,22 @@ var findEvaluationCases = async (evalId, config) => {
|
|
|
845
1119
|
return payload.matches.length ? buildSpanTree(payload.matches) : null;
|
|
846
1120
|
};
|
|
847
1121
|
var mapSpanToEval = (span) => {
|
|
848
|
-
const flagConfigRaw = span.data.attributes[
|
|
1122
|
+
const flagConfigRaw = span.data.attributes[Attr.Eval.Config.Flags] ?? span.data.attributes.custom[Attr.Eval.Config.Flags];
|
|
849
1123
|
return {
|
|
850
|
-
id: span.data.attributes.custom[
|
|
851
|
-
name: span.data.attributes.custom[
|
|
852
|
-
type: span.data.attributes.custom[
|
|
853
|
-
version: span.data.attributes.custom[
|
|
1124
|
+
id: span.data.attributes.custom[Attr.Eval.ID],
|
|
1125
|
+
name: span.data.attributes.custom[Attr.Eval.Name],
|
|
1126
|
+
type: span.data.attributes.custom[Attr.Eval.Type],
|
|
1127
|
+
version: span.data.attributes.custom[Attr.Eval.Version],
|
|
854
1128
|
collection: {
|
|
855
|
-
name: span.data.attributes.custom[
|
|
856
|
-
size: span.data.attributes.custom[
|
|
1129
|
+
name: span.data.attributes.custom[Attr.Eval.Collection.Name],
|
|
1130
|
+
size: span.data.attributes.custom[Attr.Eval.Collection.Size]
|
|
857
1131
|
},
|
|
858
1132
|
baseline: {
|
|
859
|
-
id: span.data.attributes.custom[
|
|
860
|
-
name: span.data.attributes.custom[
|
|
1133
|
+
id: span.data.attributes.custom[Attr.Eval.Baseline.ID],
|
|
1134
|
+
name: span.data.attributes.custom[Attr.Eval.Baseline.Name]
|
|
861
1135
|
},
|
|
862
1136
|
prompt: {
|
|
1137
|
+
// TODO: do we still want this?
|
|
863
1138
|
model: span.data.attributes.custom["eval.prompt.model"],
|
|
864
1139
|
params: span.data.attributes.custom["eval.prompt.params"]
|
|
865
1140
|
},
|
|
@@ -867,10 +1142,10 @@ var mapSpanToEval = (span) => {
|
|
|
867
1142
|
status: span.data.status.code,
|
|
868
1143
|
traceId: span.data.trace_id,
|
|
869
1144
|
runAt: span._time,
|
|
870
|
-
tags: span.data.attributes.custom[
|
|
1145
|
+
tags: span.data.attributes.custom[Attr.Eval.Tags].length ? JSON.parse(span.data.attributes.custom[Attr.Eval.Tags]) : [],
|
|
871
1146
|
user: {
|
|
872
|
-
name: span.data.attributes.custom[
|
|
873
|
-
email: span.data.attributes.custom[
|
|
1147
|
+
name: span.data.attributes.custom[Attr.Eval.User.Name],
|
|
1148
|
+
email: span.data.attributes.custom[Attr.Eval.User.Email]
|
|
874
1149
|
},
|
|
875
1150
|
cases: [],
|
|
876
1151
|
flagConfig: flagConfigRaw ? JSON.parse(flagConfigRaw) : void 0
|
|
@@ -885,19 +1160,17 @@ var mapSpanToCase = (item) => {
|
|
|
885
1160
|
} else {
|
|
886
1161
|
duration = d;
|
|
887
1162
|
}
|
|
888
|
-
const runtimeFlagsRaw = data.attributes.custom["eval.case.config.runtime_flags"];
|
|
889
1163
|
return {
|
|
890
|
-
index: data.attributes.custom[
|
|
891
|
-
input: data.attributes.custom[
|
|
892
|
-
output: data.attributes.custom[
|
|
893
|
-
expected: data.attributes.custom[
|
|
1164
|
+
index: data.attributes.custom[Attr.Eval.Case.Index],
|
|
1165
|
+
input: data.attributes.custom[Attr.Eval.Case.Input],
|
|
1166
|
+
output: data.attributes.custom[Attr.Eval.Case.Output],
|
|
1167
|
+
expected: data.attributes.custom[Attr.Eval.Case.Expected],
|
|
894
1168
|
duration,
|
|
895
1169
|
status: data.status.code,
|
|
896
|
-
scores: data.attributes.custom[
|
|
1170
|
+
scores: data.attributes.custom[Attr.Eval.Case.Scores] ? JSON.parse(data.attributes.custom[Attr.Eval.Case.Scores]) : {},
|
|
897
1171
|
runAt: item._time,
|
|
898
1172
|
spanId: data.span_id,
|
|
899
|
-
traceId: data.trace_id
|
|
900
|
-
runtimeFlags: runtimeFlagsRaw ? JSON.parse(runtimeFlagsRaw) : void 0
|
|
1173
|
+
traceId: data.trace_id
|
|
901
1174
|
};
|
|
902
1175
|
};
|
|
903
1176
|
var buildSpanTree = (spans) => {
|
|
@@ -959,10 +1232,10 @@ var buildSpanTree = (spans) => {
|
|
|
959
1232
|
);
|
|
960
1233
|
caseData.scores = {};
|
|
961
1234
|
scoreSpans.forEach((score) => {
|
|
962
|
-
const name = score.data.attributes.custom[
|
|
1235
|
+
const name = score.data.attributes.custom[Attr.Eval.Score.Name];
|
|
963
1236
|
caseData.scores[name] = {
|
|
964
1237
|
name,
|
|
965
|
-
value: score.data.attributes.custom[
|
|
1238
|
+
value: score.data.attributes.custom[Attr.Eval.Score.Value],
|
|
966
1239
|
metadata: {
|
|
967
1240
|
error: score.data.attributes.error
|
|
968
1241
|
}
|
|
@@ -1725,11 +1998,11 @@ function setupEvalProvider(connection) {
|
|
|
1725
1998
|
axiomProvider = new import_sdk_trace_node.NodeTracerProvider({
|
|
1726
1999
|
resource: (0, import_resources.resourceFromAttributes)({
|
|
1727
2000
|
["service.name"]: "axiom",
|
|
1728
|
-
["service.version"]: "0.24.0"
|
|
2001
|
+
["service.version"]: "0.25.0"
|
|
1729
2002
|
}),
|
|
1730
2003
|
spanProcessors: [processor]
|
|
1731
2004
|
});
|
|
1732
|
-
axiomTracer = axiomProvider.getTracer("axiom", "0.24.0");
|
|
2005
|
+
axiomTracer = axiomProvider.getTracer("axiom", "0.25.0");
|
|
1733
2006
|
}
|
|
1734
2007
|
async function initInstrumentation(config) {
|
|
1735
2008
|
if (initialized) {
|
|
@@ -1741,7 +2014,7 @@ async function initInstrumentation(config) {
|
|
|
1741
2014
|
}
|
|
1742
2015
|
initializationPromise = (async () => {
|
|
1743
2016
|
if (!config.enabled) {
|
|
1744
|
-
axiomTracer = import_api10.trace.getTracer("axiom", "0.24.0");
|
|
2017
|
+
axiomTracer = import_api10.trace.getTracer("axiom", "0.25.0");
|
|
1745
2018
|
initialized = true;
|
|
1746
2019
|
return;
|
|
1747
2020
|
}
|
|
@@ -1804,10 +2077,32 @@ var flush = async () => {
|
|
|
1804
2077
|
};
|
|
1805
2078
|
|
|
1806
2079
|
// src/evals/run-vitest.ts
|
|
2080
|
+
var printCollectedEvals = (result, rootDir) => {
|
|
2081
|
+
if (!result.testModules || result.testModules.length === 0) {
|
|
2082
|
+
console.log(u.yellow("\nNo evaluations found\n"));
|
|
2083
|
+
return;
|
|
2084
|
+
}
|
|
2085
|
+
console.log(u.bold("\nFound evaluations:\n"));
|
|
2086
|
+
let totalEvals = 0;
|
|
2087
|
+
let totalCases = 0;
|
|
2088
|
+
for (const module2 of result.testModules) {
|
|
2089
|
+
const relativePath = import_node_path3.default.relative(rootDir, module2.moduleId);
|
|
2090
|
+
for (const suite of module2.children.suites()) {
|
|
2091
|
+
totalEvals++;
|
|
2092
|
+
const caseCount = suite.children.size;
|
|
2093
|
+
totalCases += caseCount;
|
|
2094
|
+
console.log(u.green(`\u2713 ${suite.name} (${caseCount} cases)`));
|
|
2095
|
+
console.log(u.dim(` ${relativePath}`));
|
|
2096
|
+
console.log("");
|
|
2097
|
+
}
|
|
2098
|
+
}
|
|
2099
|
+
console.log(u.bold(`Total: ${totalEvals} evaluations, ${totalCases} test cases
|
|
2100
|
+
`));
|
|
2101
|
+
};
|
|
1807
2102
|
var runVitest = async (dir, opts) => {
|
|
1808
2103
|
setAxiomConfig(opts.config);
|
|
1809
2104
|
await initInstrumentation({
|
|
1810
|
-
enabled: !opts.debug,
|
|
2105
|
+
enabled: !opts.debug && !opts.list,
|
|
1811
2106
|
config: opts.config
|
|
1812
2107
|
});
|
|
1813
2108
|
const providedConfig = {
|
|
@@ -1821,6 +2116,9 @@ var runVitest = async (dir, opts) => {
|
|
|
1821
2116
|
if (opts.debug) {
|
|
1822
2117
|
console.log(u.bgWhite(u.blackBright(" Debug mode enabled ")));
|
|
1823
2118
|
}
|
|
2119
|
+
if (opts.list) {
|
|
2120
|
+
console.log(u.bgWhite(u.blackBright(" List mode ")));
|
|
2121
|
+
}
|
|
1824
2122
|
const vi = await (0, import_node.createVitest)("test", {
|
|
1825
2123
|
root: dir ? dir : process.cwd(),
|
|
1826
2124
|
mode: "test",
|
|
@@ -1840,11 +2138,18 @@ var runVitest = async (dir, opts) => {
|
|
|
1840
2138
|
provide: {
|
|
1841
2139
|
baseline: opts.baseline,
|
|
1842
2140
|
debug: opts.debug,
|
|
2141
|
+
list: opts.list,
|
|
1843
2142
|
overrides: opts.overrides,
|
|
1844
2143
|
axiomConfig: providedConfig,
|
|
1845
2144
|
runId: opts.runId
|
|
1846
2145
|
}
|
|
1847
2146
|
});
|
|
2147
|
+
if (opts.list) {
|
|
2148
|
+
const result = await vi.collect();
|
|
2149
|
+
printCollectedEvals(result, dir || process.cwd());
|
|
2150
|
+
await vi.close();
|
|
2151
|
+
process.exit(0);
|
|
2152
|
+
}
|
|
1848
2153
|
await vi.start();
|
|
1849
2154
|
const dispose = (0, import_node.registerConsoleShortcuts)(vi, process.stdin, process.stdout);
|
|
1850
2155
|
if (!vi.shouldKeepServer()) {
|
|
@@ -1909,7 +2214,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
|
|
|
1909
2214
|
".",
|
|
1910
2215
|
"any *.eval.ts file in current directory"
|
|
1911
2216
|
)
|
|
1912
|
-
).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).action(async (target, options) => {
|
|
2217
|
+
).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).option("--list", "list evaluations and test cases without running them", false).action(async (target, options) => {
|
|
1913
2218
|
try {
|
|
1914
2219
|
if (options.debug) {
|
|
1915
2220
|
process.env.AXIOM_DEBUG = "true";
|
|
@@ -1952,6 +2257,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
|
|
|
1952
2257
|
exclude,
|
|
1953
2258
|
testNamePattern,
|
|
1954
2259
|
debug: options.debug,
|
|
2260
|
+
list: options.list,
|
|
1955
2261
|
overrides: flagOverrides,
|
|
1956
2262
|
config,
|
|
1957
2263
|
runId
|
|
@@ -1973,7 +2279,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
|
|
|
1973
2279
|
// src/cli/utils/parse-flag-overrides.ts
|
|
1974
2280
|
var import_zod5 = require("zod");
|
|
1975
2281
|
var import_node_fs2 = require("fs");
|
|
1976
|
-
var import_node_path3 = require("path");
|
|
2282
|
+
var import_node_path4 = require("path");
|
|
1977
2283
|
var FLAG_RE = /^--flag\.([^=]+)(?:=(.*))?$/;
|
|
1978
2284
|
var CONFIG_RE = /^--flags-config(?:=(.*))?$/;
|
|
1979
2285
|
function ensureNoSpaceSeparatedSyntax(flagName, value, nextToken, flagType) {
|
|
@@ -2002,8 +2308,8 @@ function coerceValue(raw) {
|
|
|
2002
2308
|
return raw;
|
|
2003
2309
|
}
|
|
2004
2310
|
}
|
|
2005
|
-
function loadConfigFile(path3) {
|
|
2006
|
-
const abs = (0, import_node_path3.resolve)(process.cwd(), path3);
|
|
2311
|
+
function loadConfigFile(path4) {
|
|
2312
|
+
const abs = (0, import_node_path4.resolve)(process.cwd(), path4);
|
|
2007
2313
|
try {
|
|
2008
2314
|
const contents = (0, import_node_fs2.readFileSync)(abs, "utf8");
|
|
2009
2315
|
const parsed = JSON.parse(contents);
|
|
@@ -2015,7 +2321,7 @@ function loadConfigFile(path3) {
|
|
|
2015
2321
|
}
|
|
2016
2322
|
return parsed;
|
|
2017
2323
|
} catch (err) {
|
|
2018
|
-
console.error(`\u274C Could not read or parse flags config "${path3}": ${err.message}`);
|
|
2324
|
+
console.error(`\u274C Could not read or parse flags config "${path4}": ${err.message}`);
|
|
2019
2325
|
process.exit(1);
|
|
2020
2326
|
}
|
|
2021
2327
|
}
|
|
@@ -2078,7 +2384,7 @@ var import_commander4 = require("commander");
|
|
|
2078
2384
|
var loadVersionCommand = (program2) => {
|
|
2079
2385
|
return program2.addCommand(
|
|
2080
2386
|
new import_commander4.Command("version").description("cli version").action(() => {
|
|
2081
|
-
console.log("0.24.0");
|
|
2387
|
+
console.log("0.25.0");
|
|
2082
2388
|
})
|
|
2083
2389
|
);
|
|
2084
2390
|
};
|
|
@@ -2088,7 +2394,7 @@ var { loadEnvConfig } = import_env.default;
|
|
|
2088
2394
|
loadEnvConfig(process.cwd());
|
|
2089
2395
|
var { cleanedArgv, overrides } = extractOverrides(process.argv.slice(2));
|
|
2090
2396
|
var program = new import_commander5.Command();
|
|
2091
|
-
program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.24.0");
|
|
2397
|
+
program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.25.0");
|
|
2092
2398
|
loadPushCommand(program);
|
|
2093
2399
|
loadPullCommand(program);
|
|
2094
2400
|
loadEvalCommand(program, overrides);
|