npm - axiom - Versions diffs - 0.23.0 → 0.25.0 - Mend

axiom 0.23.0 → 0.25.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/bin.cjs +408 -66
package/dist/bin.cjs.map +1 -1
package/dist/bin.js +50 -11
package/dist/bin.js.map +1 -1
package/dist/{chunk-B2XK7HHK.js → chunk-BSZFDG3O.js} +16 -4
package/dist/chunk-BSZFDG3O.js.map +1 -0
package/dist/{chunk-CSMTIO7U.js → chunk-JGAXOVPZ.js} +107 -84
package/dist/chunk-JGAXOVPZ.js.map +1 -0
package/dist/evals.cjs +164 -97
package/dist/evals.cjs.map +1 -1
package/dist/evals.d.cts +12 -0
package/dist/evals.d.ts +12 -0
package/dist/evals.js +48 -15
package/dist/evals.js.map +1 -1
package/dist/index.cjs +15 -3
package/dist/index.cjs.map +1 -1
package/dist/index.js +1 -1
package/package.json +1 -1
package/dist/chunk-B2XK7HHK.js.map +0 -1
package/dist/chunk-CSMTIO7U.js.map +0 -1

package/dist/bin.cjs CHANGED Viewed

@@ -390,6 +390,7 @@ var loadPullCommand = (program2) => {
 // src/cli/commands/eval.command.ts
 var import_commander3 = require("commander");
+var import_nanoid = require("nanoid");
 // ../../node_modules/.pnpm/tinyrainbow@2.0.0/node_modules/tinyrainbow/dist/chunk-BVHSVHOK.js
 var f = {
@@ -476,6 +477,7 @@ var r = process.env.FORCE_TTY !== void 0 || (0, import_tty.isatty)(1);
 var u = p(r);
 // src/evals/run-vitest.ts
+var import_node_path3 = __toESM(require("path"), 1);
 var import_node = require("vitest/node");
 // src/evals/context/storage.ts
@@ -593,7 +595,280 @@ var import_api5 = require("@opentelemetry/api");
 // src/otel/semconv/attributes.ts
 var import_semantic_conventions = require("@opentelemetry/semantic-conventions");
+// src/otel/semconv/eval_proposal.ts
+var ATTR_EVAL_ID = "eval.id";
+var ATTR_EVAL_NAME = "eval.name";
+var ATTR_EVAL_VERSION = "eval.version";
+var ATTR_EVAL_TYPE = "eval.type";
+var ATTR_EVAL_TAGS = "eval.tags";
+var ATTR_EVAL_BASELINE_ID = "eval.baseline.id";
+var ATTR_EVAL_BASELINE_NAME = "eval.baseline.name";
+var ATTR_EVAL_METADATA = "eval.metadata";
+var ATTR_EVAL_COLLECTION_ID = "eval.collection.id";
+var ATTR_EVAL_COLLECTION_SIZE = "eval.collection.size";
+var ATTR_EVAL_COLLECTION_NAME = "eval.collection.name";
+var ATTR_EVAL_CONFIG_FLAGS = "eval.config.flags";
+var ATTR_EVAL_CASE_INDEX = "eval.case.index";
+var ATTR_EVAL_CASE_INPUT = "eval.case.input";
+var ATTR_EVAL_CASE_OUTPUT = "eval.case.output";
+var ATTR_EVAL_CASE_EXPECTED = "eval.case.expected";
+var ATTR_EVAL_CASE_SCORES = "eval.case.scores";
+var ATTR_EVAL_CASE_METADATA = "eval.case.metadata";
+var ATTR_EVAL_TASK_OUTPUT = "eval.task.output";
+var ATTR_EVAL_TASK_NAME = "eval.task.name";
+var ATTR_EVAL_TASK_TYPE = "eval.task.type";
+var ATTR_EVAL_RUN_ID = "eval.run.id";
+var ATTR_EVAL_SCORE_NAME = "eval.score.name";
+var ATTR_EVAL_SCORE_VALUE = "eval.score.value";
+var ATTR_EVAL_SCORE_THRESHOLD = "eval.score.threshold";
+var ATTR_EVAL_SCORE_PASSED = "eval.score.passed";
+var ATTR_EVAL_SCORE_METADATA = "eval.score.metadata";
+var ATTR_EVAL_USER_NAME = "eval.user.name";
+var ATTR_EVAL_USER_EMAIL = "eval.user.email";
+// src/otel/semconv/attributes.ts
 var import_incubating = require("@opentelemetry/semantic-conventions/incubating");
+var ATTR_AXIOM_GEN_AI_SCHEMA_URL = "axiom.gen_ai.schema_url";
+var ATTR_AXIOM_GEN_AI_SDK_NAME = "axiom.gen_ai.sdk.name";
+var ATTR_AXIOM_GEN_AI_SDK_VERSION = "axiom.gen_ai.sdk.version";
+var ATTR_GEN_AI_CAPABILITY_NAME = "gen_ai.capability.name";
+var ATTR_GEN_AI_STEP_NAME = "gen_ai.step.name";
+var ATTR_GEN_AI_TOOL_ARGUMENTS = "gen_ai.tool.arguments";
+var ATTR_GEN_AI_TOOL_MESSAGE = "gen_ai.tool.message";
+var GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI = "assemblyai";
+var GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS = "cerebras";
+var GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM = "deepgram";
+var GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA = "deepinfra";
+var GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS = "elevenlabs";
+var GEN_AI_PROVIDER_NAME_VALUE_FAL = "fal";
+var GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS = "fireworks";
+var GEN_AI_PROVIDER_NAME_VALUE_GLADIA = "gladia";
+var GEN_AI_PROVIDER_NAME_VALUE_HUME = "hume";
+var GEN_AI_PROVIDER_NAME_VALUE_LMNT = "lmnt";
+var GEN_AI_PROVIDER_NAME_VALUE_LUMA = "luma";
+var GEN_AI_PROVIDER_NAME_VALUE_REPLICATE = "replicate";
+var GEN_AI_PROVIDER_NAME_VALUE_REVAI = "revai";
+var GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI = "togetherai";
+var GEN_AI_PROVIDER_NAME_VALUE_VERCEL = "vercel";
+var Attr = {
+  __EXPERIMENTAL_Flag: (flagName) => `flag.${flagName}`,
+  __EXPERIMENTAL_Fact: (factName) => `fact.${factName}`,
+  Axiom: {
+    GenAI: {
+      SchemaURL: ATTR_AXIOM_GEN_AI_SCHEMA_URL,
+      SDK: {
+        Name: ATTR_AXIOM_GEN_AI_SDK_NAME,
+        Version: ATTR_AXIOM_GEN_AI_SDK_VERSION
+      }
+    }
+  },
+  GenAI: {
+    PromptMetadata: {
+      ID: "axiom.gen_ai.prompt.id",
+      Name: "axiom.gen_ai.prompt.name",
+      Slug: "axiom.gen_ai.prompt.slug",
+      Version: "axiom.gen_ai.prompt.version"
+    },
+    /**
+     * These two are used to identify the span
+     */
+    Capability: {
+      Name: ATTR_GEN_AI_CAPABILITY_NAME
+    },
+    Step: {
+      Name: ATTR_GEN_AI_STEP_NAME
+    },
+    Provider: {
+      Name: import_incubating.ATTR_GEN_AI_PROVIDER_NAME,
+      Name_Values: {
+        Anthropic: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_ANTHROPIC,
+        AssemblyAI: GEN_AI_PROVIDER_NAME_VALUE_ASSEMBLYAI,
+        AWSBedrock: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AWS_BEDROCK,
+        AzureAIInference: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_INFERENCE,
+        AzureAIOpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_AZURE_AI_OPENAI,
+        Cerebras: GEN_AI_PROVIDER_NAME_VALUE_CEREBRAS,
+        Cohere: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_COHERE,
+        Deepgram: GEN_AI_PROVIDER_NAME_VALUE_DEEPGRAM,
+        DeepInfra: GEN_AI_PROVIDER_NAME_VALUE_DEEPINFRA,
+        Deepseek: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_DEEPSEEK,
+        ElevenLabs: GEN_AI_PROVIDER_NAME_VALUE_ELEVENLABS,
+        Fal: GEN_AI_PROVIDER_NAME_VALUE_FAL,
+        Fireworks: GEN_AI_PROVIDER_NAME_VALUE_FIREWORKS,
+        GCPGemini: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEMINI,
+        GCPGenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_GEN_AI,
+        GCPVertexAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GCP_VERTEX_AI,
+        Gladia: GEN_AI_PROVIDER_NAME_VALUE_GLADIA,
+        Groq: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_GROQ,
+        Hume: GEN_AI_PROVIDER_NAME_VALUE_HUME,
+        IBMWatsonxAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_IBM_WATSONX_AI,
+        Lmnt: GEN_AI_PROVIDER_NAME_VALUE_LMNT,
+        Luma: GEN_AI_PROVIDER_NAME_VALUE_LUMA,
+        MistralAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_MISTRAL_AI,
+        OpenAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_OPENAI,
+        Perplexity: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_PERPLEXITY,
+        Replicate: GEN_AI_PROVIDER_NAME_VALUE_REPLICATE,
+        RevAI: GEN_AI_PROVIDER_NAME_VALUE_REVAI,
+        TogetherAI: GEN_AI_PROVIDER_NAME_VALUE_TOGETHERAI,
+        Vercel: GEN_AI_PROVIDER_NAME_VALUE_VERCEL,
+        XAI: import_incubating.GEN_AI_PROVIDER_NAME_VALUE_X_AI
+      }
+    },
+    /**
+     * Regular attributes
+     */
+    Agent: {
+      Description: import_incubating.ATTR_GEN_AI_AGENT_DESCRIPTION,
+      // not yet used by axiom-ai
+      ID: import_incubating.ATTR_GEN_AI_AGENT_ID,
+      // not yet used by axiom-ai
+      Name: import_incubating.ATTR_GEN_AI_AGENT_NAME
+      // not yet used by axiom-ai
+    },
+    Conversation: {
+      ID: import_incubating.ATTR_GEN_AI_CONVERSATION_ID
+      // not yet used by axiom-ai, anyway probably needs to be provided by user
+    },
+    Input: {
+      Messages: import_incubating.ATTR_GEN_AI_INPUT_MESSAGES
+    },
+    Operation: {
+      Name: import_incubating.ATTR_GEN_AI_OPERATION_NAME,
+      Name_Values: {
+        /**
+         * Note that "text_completion" is deprecated in favor of "chat" for both OpenAI and Anthropic
+         */
+        Chat: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CHAT,
+        CreateAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT,
+        Embeddings: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS,
+        ExecuteTool: import_incubating.GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL,
+        GenerateContent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT,
+        InvokeAgent: import_incubating.GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT
+      }
+    },
+    Output: {
+      Messages: import_incubating.ATTR_GEN_AI_OUTPUT_MESSAGES,
+      Type: import_incubating.ATTR_GEN_AI_OUTPUT_TYPE,
+      Type_Values: {
+        Text: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_TEXT,
+        Json: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_JSON,
+        Image: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_IMAGE,
+        Speech: import_incubating.GEN_AI_OUTPUT_TYPE_VALUE_SPEECH
+      }
+    },
+    /**
+     * The provider that is hosting the model, eg AWS Bedrock
+     * There doesn't seem to be a semconv for this
+     */
+    Request: {
+      ChoiceCount: import_incubating.ATTR_GEN_AI_REQUEST_CHOICE_COUNT,
+      // not yet used by axiom-ai
+      EncodingFormats: import_incubating.ATTR_GEN_AI_REQUEST_ENCODING_FORMATS,
+      // not yet used by axiom-ai
+      FrequencyPenalty: import_incubating.ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY,
+      MaxTokens: import_incubating.ATTR_GEN_AI_REQUEST_MAX_TOKENS,
+      /**
+       * The model you asked for
+       */
+      Model: import_incubating.ATTR_GEN_AI_REQUEST_MODEL,
+      PresencePenalty: import_incubating.ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY,
+      Seed: import_incubating.ATTR_GEN_AI_REQUEST_SEED,
+      StopSequences: import_incubating.ATTR_GEN_AI_REQUEST_STOP_SEQUENCES,
+      Temperature: import_incubating.ATTR_GEN_AI_REQUEST_TEMPERATURE,
+      TopK: import_incubating.ATTR_GEN_AI_REQUEST_TOP_K,
+      TopP: import_incubating.ATTR_GEN_AI_REQUEST_TOP_P
+    },
+    Response: {
+      FinishReasons: import_incubating.ATTR_GEN_AI_RESPONSE_FINISH_REASONS,
+      ID: import_incubating.ATTR_GEN_AI_RESPONSE_ID,
+      /**
+       * The model that was actually used (might be different bc routing) - only ever get this from the response, otherwise omit
+       */
+      Model: import_incubating.ATTR_GEN_AI_RESPONSE_MODEL
+      // somehow not landing on the span for google models? check up on this...
+    },
+    Tool: {
+      CallID: import_incubating.ATTR_GEN_AI_TOOL_CALL_ID,
+      Description: import_incubating.ATTR_GEN_AI_TOOL_DESCRIPTION,
+      Name: import_incubating.ATTR_GEN_AI_TOOL_NAME,
+      Type: import_incubating.ATTR_GEN_AI_TOOL_TYPE,
+      /**
+       * Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
+       * But we at least want to give users THE OPTION to put them on the tool spans themselves as well
+       * Because it enables a lot of things with querying
+       * @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
+       */
+      Arguments: ATTR_GEN_AI_TOOL_ARGUMENTS,
+      /**
+       * Note, OTel Semantic Convention suggest only putting tool inputs/outputs on the parent chat span
+       * But we at least want to give users THE OPTION to put them on the tool spans themselves as well
+       * Because it enables a lot of things with querying
+       * @see https://github.com/open-telemetry/semantic-conventions/releases/tag/v1.37.0
+       */
+      Message: ATTR_GEN_AI_TOOL_MESSAGE
+    },
+    Usage: {
+      InputTokens: import_incubating.ATTR_GEN_AI_USAGE_INPUT_TOKENS,
+      OutputTokens: import_incubating.ATTR_GEN_AI_USAGE_OUTPUT_TOKENS
+    }
+  },
+  Eval: {
+    ID: ATTR_EVAL_ID,
+    Name: ATTR_EVAL_NAME,
+    Version: ATTR_EVAL_VERSION,
+    Type: ATTR_EVAL_TYPE,
+    Baseline: {
+      ID: ATTR_EVAL_BASELINE_ID,
+      Name: ATTR_EVAL_BASELINE_NAME
+    },
+    Tags: ATTR_EVAL_TAGS,
+    Metadata: ATTR_EVAL_METADATA,
+    Collection: {
+      ID: ATTR_EVAL_COLLECTION_ID,
+      Name: ATTR_EVAL_COLLECTION_NAME,
+      Size: ATTR_EVAL_COLLECTION_SIZE
+    },
+    Config: {
+      Flags: ATTR_EVAL_CONFIG_FLAGS
+    },
+    Run: {
+      ID: ATTR_EVAL_RUN_ID
+    },
+    Case: {
+      Index: ATTR_EVAL_CASE_INDEX,
+      Input: ATTR_EVAL_CASE_INPUT,
+      Output: ATTR_EVAL_CASE_OUTPUT,
+      Expected: ATTR_EVAL_CASE_EXPECTED,
+      Scores: ATTR_EVAL_CASE_SCORES,
+      Metadata: ATTR_EVAL_CASE_METADATA
+    },
+    Task: {
+      Output: ATTR_EVAL_TASK_OUTPUT,
+      Name: ATTR_EVAL_TASK_NAME,
+      Type: ATTR_EVAL_TASK_TYPE
+    },
+    Score: {
+      Name: ATTR_EVAL_SCORE_NAME,
+      Value: ATTR_EVAL_SCORE_VALUE,
+      Threshold: ATTR_EVAL_SCORE_THRESHOLD,
+      Passed: ATTR_EVAL_SCORE_PASSED,
+      Metadata: ATTR_EVAL_SCORE_METADATA
+    },
+    User: {
+      Name: ATTR_EVAL_USER_NAME,
+      Email: ATTR_EVAL_USER_EMAIL
+    }
+  },
+  Error: {
+    Type: import_semantic_conventions.ATTR_ERROR_TYPE,
+    Message: import_incubating.ATTR_ERROR_MESSAGE
+  },
+  HTTP: {
+    Response: {
+      StatusCode: import_semantic_conventions.ATTR_HTTP_RESPONSE_STATUS_CODE
+    }
+  }
+};
 // src/otel/startActiveSpan.ts
 var import_api2 = require("@opentelemetry/api");
@@ -604,7 +879,7 @@ var import_api4 = require("@opentelemetry/api");
 // package.json
 var package_default = {
   name: "axiom",
-  version: "0.23.0",
+  version: "0.25.0",
   type: "module",
   author: "Axiom, Inc.",
   contributors: [
@@ -826,47 +1101,40 @@ function resolveAxiomConnection(config) {
 // src/evals/eval.service.ts
 var findEvaluationCases = async (evalId, config) => {
-  try {
-    const { dataset, url, token } = resolveAxiomConnection(config);
-    const apl = `['${dataset}'] | where trace_id == "${evalId}" | order by _time`;
-    const headers = new Headers({
-      Authorization: `Bearer ${token}`,
-      "Content-Type": "application/json"
-    });
-    const resp = await fetch(`${url}/v1/datasets/_apl?format=legacy`, {
-      headers,
-      method: "POST",
-      body: JSON.stringify({ apl })
-    });
-    const payload = await resp.json();
-    if (!resp.ok) {
-      console.log(payload);
-      return void 0;
-    }
-    if (payload.matches.length) {
-      return buildSpanTree(payload.matches);
-    }
-  } catch (err) {
-    console.log(err);
-    return void 0;
+  const { dataset, url, token } = resolveAxiomConnection(config);
+  const apl = `['${dataset}'] | where trace_id == "${evalId}" | order by _time`;
+  const headers = new Headers({
+    Authorization: `Bearer ${token}`,
+    "Content-Type": "application/json"
+  });
+  const resp = await fetch(`${url}/v1/datasets/_apl?format=legacy`, {
+    headers,
+    method: "POST",
+    body: JSON.stringify({ apl })
+  });
+  const payload = await resp.json();
+  if (!resp.ok) {
+    throw new Error(`Failed to query evaluation cases: ${payload.message || resp.statusText}`);
   }
+  return payload.matches.length ? buildSpanTree(payload.matches) : null;
 };
 var mapSpanToEval = (span) => {
-  const flagConfigRaw = span.data.attributes["eval.config.flags"] ?? span.data.attributes.custom["eval.config.flags"];
+  const flagConfigRaw = span.data.attributes[Attr.Eval.Config.Flags] ?? span.data.attributes.custom[Attr.Eval.Config.Flags];
   return {
-    id: span.data.attributes.custom["eval.id"],
-    name: span.data.attributes.custom["eval.name"],
-    type: span.data.attributes.custom["eval.type"],
-    version: span.data.attributes.custom["eval.version"],
+    id: span.data.attributes.custom[Attr.Eval.ID],
+    name: span.data.attributes.custom[Attr.Eval.Name],
+    type: span.data.attributes.custom[Attr.Eval.Type],
+    version: span.data.attributes.custom[Attr.Eval.Version],
     collection: {
-      name: span.data.attributes.custom["eval.collection.name"],
-      size: span.data.attributes.custom["eval.collection.size"]
+      name: span.data.attributes.custom[Attr.Eval.Collection.Name],
+      size: span.data.attributes.custom[Attr.Eval.Collection.Size]
     },
     baseline: {
-      id: span.data.attributes.custom["eval.baseline.id"],
-      name: span.data.attributes.custom["eval.baseline.name"]
+      id: span.data.attributes.custom[Attr.Eval.Baseline.ID],
+      name: span.data.attributes.custom[Attr.Eval.Baseline.Name]
     },
     prompt: {
+      // TODO: do we still want this?
       model: span.data.attributes.custom["eval.prompt.model"],
       params: span.data.attributes.custom["eval.prompt.params"]
     },
@@ -874,10 +1142,10 @@ var mapSpanToEval = (span) => {
     status: span.data.status.code,
     traceId: span.data.trace_id,
     runAt: span._time,
-    tags: span.data.attributes.custom["eval.tags"].length ? JSON.parse(span.data.attributes.custom["eval.tags"]) : [],
+    tags: span.data.attributes.custom[Attr.Eval.Tags].length ? JSON.parse(span.data.attributes.custom[Attr.Eval.Tags]) : [],
     user: {
-      name: span.data.attributes.custom["eval.user.name"],
-      email: span.data.attributes.custom["eval.user.email"]
+      name: span.data.attributes.custom[Attr.Eval.User.Name],
+      email: span.data.attributes.custom[Attr.Eval.User.Email]
     },
     cases: [],
     flagConfig: flagConfigRaw ? JSON.parse(flagConfigRaw) : void 0
@@ -892,19 +1160,17 @@ var mapSpanToCase = (item) => {
   } else {
     duration = d;
   }
-  const runtimeFlagsRaw = data.attributes.custom["eval.case.config.runtime_flags"];
   return {
-    index: data.attributes.custom["eval.case.index"],
-    input: data.attributes.custom["eval.case.input"],
-    output: data.attributes.custom["eval.case.output"],
-    expected: data.attributes.custom["eval.case.expected"],
+    index: data.attributes.custom[Attr.Eval.Case.Index],
+    input: data.attributes.custom[Attr.Eval.Case.Input],
+    output: data.attributes.custom[Attr.Eval.Case.Output],
+    expected: data.attributes.custom[Attr.Eval.Case.Expected],
     duration,
     status: data.status.code,
-    scores: data.attributes.custom["eval.case.scores"] ? JSON.parse(data.attributes.custom["eval.case.scores"]) : {},
+    scores: data.attributes.custom[Attr.Eval.Case.Scores] ? JSON.parse(data.attributes.custom[Attr.Eval.Case.Scores]) : {},
     runAt: item._time,
     spanId: data.span_id,
-    traceId: data.trace_id,
-    runtimeFlags: runtimeFlagsRaw ? JSON.parse(runtimeFlagsRaw) : void 0
+    traceId: data.trace_id
   };
 };
 var buildSpanTree = (spans) => {
@@ -966,10 +1232,10 @@ var buildSpanTree = (spans) => {
     );
     caseData.scores = {};
     scoreSpans.forEach((score) => {
-      const name = score.data.attributes.custom["eval.score.name"];
+      const name = score.data.attributes.custom[Attr.Eval.Score.Name];
       caseData.scores[name] = {
         name,
-        value: score.data.attributes.custom["eval.score.value"],
+        value: score.data.attributes.custom[Attr.Eval.Score.Value],
         metadata: {
           error: score.data.attributes.error
         }
@@ -1394,7 +1660,11 @@ function calculateFlagDiff(suite) {
   }
   return diffs;
 }
-function printFinalReport({ suiteData }) {
+function printFinalReport({
+  suiteData,
+  config,
+  registrationStatus
+}) {
   console.log("");
   console.log(u.bgBlue(u.white(" FINAL EVALUATION REPORT ")));
   console.log("");
@@ -1404,8 +1674,28 @@ function printFinalReport({ suiteData }) {
     printSuiteBox({ suite, scorerAverages, calculateBaselineScorerAverage, flagDiff });
     console.log("");
   }
-  console.log("View full report:");
-  console.log("https://app.axiom.co/evaluations/run/<run-id>");
+  const runId = suiteData[0]?.runId;
+  const orgId = suiteData[0]?.orgId;
+  const anyRegistered = registrationStatus.some((s2) => s2.registered);
+  const anyFailed = registrationStatus.some((s2) => !s2.registered);
+  if (anyRegistered && orgId && config?.consoleEndpointUrl) {
+    console.log("View full report:");
+    console.log(`${config.consoleEndpointUrl}/${orgId}/ai-engineering/evaluations?runId=${runId}`);
+  } else {
+    console.log("Results not available in Axiom UI (registration failed)");
+  }
+  if (anyFailed) {
+    console.log("");
+    for (const status of registrationStatus) {
+      if (!status.registered) {
+        console.log(u.yellow(`\u26A0\uFE0F  Warning: Failed to register "${status.name}" with Axiom`));
+        if (status.error) {
+          console.log(u.dim(`   Error: ${status.error}`));
+        }
+        console.log(u.dim(`   Results for this evaluation will not be available in the Axiom UI.`));
+      }
+    }
+  }
 }
 // src/cli/errors.ts
@@ -1434,14 +1724,19 @@ var AxiomReporter = class {
     __publicField(this, "_suiteData", []);
     __publicField(this, "_baselines", /* @__PURE__ */ new Map());
     __publicField(this, "_printedFlagOverrides", false);
+    __publicField(this, "_config");
   }
   onTestRunStart() {
     this.start = performance.now();
     this.startTime = (/* @__PURE__ */ new Date()).getTime();
+    const config = getAxiomConfig();
+    if (config) {
+      this._config = resolveAxiomConnection(config);
+    }
   }
   async onTestSuiteReady(_testSuite) {
     const meta = _testSuite.meta();
-    if (_testSuite.state() === "skipped") {
+    if (_testSuite.state() === "skipped" || !meta?.evaluation) {
       return;
     }
     if (!this._printedFlagOverrides) {
@@ -1473,7 +1768,7 @@ var AxiomReporter = class {
   }
   async onTestSuiteResult(testSuite) {
     const meta = testSuite.meta();
-    if (testSuite.state() === "skipped") {
+    if (testSuite.state() === "skipped" || !meta?.evaluation) {
       return;
     }
     const durationSeconds = Number((performance.now() - this.start) / 1e3).toFixed(2);
@@ -1509,8 +1804,11 @@ var AxiomReporter = class {
       baseline: suiteBaseline || null,
       configFlags: meta.evaluation.configFlags,
       flagConfig: meta.evaluation.flagConfig,
+      runId: meta.evaluation.runId,
+      orgId: meta.evaluation.orgId,
       cases,
-      outOfScopeFlags: meta.evaluation.outOfScopeFlags
+      outOfScopeFlags: meta.evaluation.outOfScopeFlags,
+      registrationStatus: meta.evaluation.registrationStatus
     });
     printEvalNameAndFileName(testSuite, meta);
     printBaselineNameAndVersion(meta);
@@ -1526,8 +1824,15 @@ var AxiomReporter = class {
     if (shouldClear) {
       process.stdout.write("\x1B[2J\x1B[0f");
     }
+    const registrationStatus = this._suiteData.map((suite) => ({
+      name: suite.name,
+      registered: suite.registrationStatus?.status === "success",
+      error: suite.registrationStatus?.status === "failed" ? suite.registrationStatus.error : void 0
+    }));
     printFinalReport({
-      suiteData: this._suiteData
+      suiteData: this._suiteData,
+      config: this._config,
+      registrationStatus
     });
     const DEBUG = process.env.AXIOM_DEBUG === "true";
     if (DEBUG && this._endOfRunConfigEnd) {
@@ -1693,11 +1998,11 @@ function setupEvalProvider(connection) {
   axiomProvider = new import_sdk_trace_node.NodeTracerProvider({
     resource: (0, import_resources.resourceFromAttributes)({
       ["service.name"]: "axiom",
-      ["service.version"]: "0.23.0"
+      ["service.version"]: "0.25.0"
     }),
     spanProcessors: [processor]
   });
-  axiomTracer = axiomProvider.getTracer("axiom", "0.23.0");
+  axiomTracer = axiomProvider.getTracer("axiom", "0.25.0");
 }
 async function initInstrumentation(config) {
   if (initialized) {
@@ -1709,7 +2014,7 @@ async function initInstrumentation(config) {
   }
   initializationPromise = (async () => {
     if (!config.enabled) {
-      axiomTracer = import_api10.trace.getTracer("axiom", "0.23.0");
+      axiomTracer = import_api10.trace.getTracer("axiom", "0.25.0");
       initialized = true;
       return;
     }
@@ -1772,10 +2077,32 @@ var flush = async () => {
 };
 // src/evals/run-vitest.ts
+var printCollectedEvals = (result, rootDir) => {
+  if (!result.testModules || result.testModules.length === 0) {
+    console.log(u.yellow("\nNo evaluations found\n"));
+    return;
+  }
+  console.log(u.bold("\nFound evaluations:\n"));
+  let totalEvals = 0;
+  let totalCases = 0;
+  for (const module2 of result.testModules) {
+    const relativePath = import_node_path3.default.relative(rootDir, module2.moduleId);
+    for (const suite of module2.children.suites()) {
+      totalEvals++;
+      const caseCount = suite.children.size;
+      totalCases += caseCount;
+      console.log(u.green(`\u2713 ${suite.name} (${caseCount} cases)`));
+      console.log(u.dim(`  ${relativePath}`));
+      console.log("");
+    }
+  }
+  console.log(u.bold(`Total: ${totalEvals} evaluations, ${totalCases} test cases
+`));
+};
 var runVitest = async (dir, opts) => {
   setAxiomConfig(opts.config);
   await initInstrumentation({
-    enabled: !opts.debug,
+    enabled: !opts.debug && !opts.list,
     config: opts.config
   });
   const providedConfig = {
@@ -1789,6 +2116,9 @@ var runVitest = async (dir, opts) => {
   if (opts.debug) {
     console.log(u.bgWhite(u.blackBright(" Debug mode enabled ")));
   }
+  if (opts.list) {
+    console.log(u.bgWhite(u.blackBright(" List mode ")));
+  }
   const vi = await (0, import_node.createVitest)("test", {
     root: dir ? dir : process.cwd(),
     mode: "test",
@@ -1808,10 +2138,18 @@ var runVitest = async (dir, opts) => {
     provide: {
       baseline: opts.baseline,
       debug: opts.debug,
+      list: opts.list,
       overrides: opts.overrides,
-      axiomConfig: providedConfig
+      axiomConfig: providedConfig,
+      runId: opts.runId
     }
   });
+  if (opts.list) {
+    const result = await vi.collect();
+    printCollectedEvals(result, dir || process.cwd());
+    await vi.close();
+    process.exit(0);
+  }
   await vi.start();
   const dispose = (0, import_node.registerConsoleShortcuts)(vi, process.stdin, process.stdout);
   if (!vi.shouldKeepServer()) {
@@ -1868,6 +2206,7 @@ function isGlob(str) {
 }
 // src/cli/commands/eval.command.ts
+var createRunId = (0, import_nanoid.customAlphabet)("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ", 10);
 var loadEvalCommand = (program2, flagOverrides = {}) => {
   return program2.addCommand(
     new import_commander3.Command("eval").description("run evals locally").addArgument(
@@ -1875,7 +2214,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
         ".",
         "any *.eval.ts file in current directory"
       )
-    ).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).action(async (target, options) => {
+    ).option("-w, --watch true", "keep server running and watch for changes", false).option("-t, --token <TOKEN>", "axiom token", process.env.AXIOM_TOKEN).option("-d, --dataset <DATASET>", "axiom dataset name", process.env.AXIOM_DATASET).option("-u, --url <AXIOM URL>", "axiom url", process.env.AXIOM_URL ?? "https://api.axiom.co").option("-b, --baseline <BASELINE ID>", "id of baseline evaluation to compare against").option("--debug", "run locally without sending to Axiom or loading baselines", false).option("--list", "list evaluations and test cases without running them", false).action(async (target, options) => {
       try {
         if (options.debug) {
           process.env.AXIOM_DEBUG = "true";
@@ -1909,6 +2248,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
           );
           console.log("");
         }
+        const runId = createRunId();
         await runEvalWithContext(flagOverrides, async () => {
           return runVitest(".", {
             watch: options.watch,
@@ -1917,8 +2257,10 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
             exclude,
             testNamePattern,
             debug: options.debug,
+            list: options.list,
             overrides: flagOverrides,
-            config
+            config,
+            runId
           });
         });
       } catch (error) {
@@ -1937,7 +2279,7 @@ var loadEvalCommand = (program2, flagOverrides = {}) => {
 // src/cli/utils/parse-flag-overrides.ts
 var import_zod5 = require("zod");
 var import_node_fs2 = require("fs");
-var import_node_path3 = require("path");
+var import_node_path4 = require("path");
 var FLAG_RE = /^--flag\.([^=]+)(?:=(.*))?$/;
 var CONFIG_RE = /^--flags-config(?:=(.*))?$/;
 function ensureNoSpaceSeparatedSyntax(flagName, value, nextToken, flagType) {
@@ -1966,8 +2308,8 @@ function coerceValue(raw) {
     return raw;
   }
 }
-function loadConfigFile(path3) {
-  const abs = (0, import_node_path3.resolve)(process.cwd(), path3);
+function loadConfigFile(path4) {
+  const abs = (0, import_node_path4.resolve)(process.cwd(), path4);
   try {
     const contents = (0, import_node_fs2.readFileSync)(abs, "utf8");
     const parsed = JSON.parse(contents);
@@ -1979,7 +2321,7 @@ function loadConfigFile(path3) {
     }
     return parsed;
   } catch (err) {
-    console.error(`\u274C Could not read or parse flags config "${path3}": ${err.message}`);
+    console.error(`\u274C Could not read or parse flags config "${path4}": ${err.message}`);
     process.exit(1);
   }
 }
@@ -2042,7 +2384,7 @@ var import_commander4 = require("commander");
 var loadVersionCommand = (program2) => {
   return program2.addCommand(
     new import_commander4.Command("version").description("cli version").action(() => {
-      console.log("0.23.0");
+      console.log("0.25.0");
     })
   );
 };
@@ -2052,7 +2394,7 @@ var { loadEnvConfig } = import_env.default;
 loadEnvConfig(process.cwd());
 var { cleanedArgv, overrides } = extractOverrides(process.argv.slice(2));
 var program = new import_commander5.Command();
-program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.23.0");
+program.name("axiom").description("Axiom's CLI to manage your objects and run evals").version("0.25.0");
 loadPushCommand(program);
 loadPullCommand(program);
 loadEvalCommand(program, overrides);