@elizaos/plugin-training 2.0.3-beta.6 → 2.0.3-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/backends/native.d.ts +96 -0
- package/dist/backends/native.d.ts.map +1 -0
- package/dist/backends/native.js +308 -0
- package/dist/backends/native.js.map +1 -0
- package/dist/cli/train.d.ts +22 -0
- package/dist/cli/train.d.ts.map +1 -0
- package/dist/cli/train.js +219 -0
- package/dist/cli/train.js.map +1 -0
- package/dist/core/action-benchmark-runner.d.ts +55 -0
- package/dist/core/action-benchmark-runner.d.ts.map +1 -0
- package/dist/core/action-benchmark-runner.js +341 -0
- package/dist/core/action-benchmark-runner.js.map +1 -0
- package/dist/core/artifact-store.d.ts +72 -0
- package/dist/core/artifact-store.d.ts.map +1 -0
- package/dist/core/artifact-store.js +50 -0
- package/dist/core/artifact-store.js.map +1 -0
- package/dist/core/benchmark-matrix-artifact.d.ts +102 -0
- package/dist/core/benchmark-matrix-artifact.d.ts.map +1 -0
- package/dist/core/benchmark-matrix-artifact.js +381 -0
- package/dist/core/benchmark-matrix-artifact.js.map +1 -0
- package/dist/core/benchmark-vs-cerebras-runner.d.ts +37 -0
- package/dist/core/benchmark-vs-cerebras-runner.d.ts.map +1 -0
- package/dist/core/benchmark-vs-cerebras-runner.js +151 -0
- package/dist/core/benchmark-vs-cerebras-runner.js.map +1 -0
- package/dist/core/cerebras-eval-model.d.ts +54 -0
- package/dist/core/cerebras-eval-model.d.ts.map +1 -0
- package/dist/core/cerebras-eval-model.js +249 -0
- package/dist/core/cerebras-eval-model.js.map +1 -0
- package/dist/core/cli.d.ts +15 -0
- package/dist/core/cli.d.ts.map +1 -0
- package/dist/core/cli.js +1003 -0
- package/dist/core/cli.js.map +1 -0
- package/dist/core/context-audit.d.ts +51 -0
- package/dist/core/context-audit.d.ts.map +1 -0
- package/dist/core/context-audit.js +166 -0
- package/dist/core/context-audit.js.map +1 -0
- package/dist/core/context-catalog.d.ts +47 -0
- package/dist/core/context-catalog.d.ts.map +1 -0
- package/dist/core/context-catalog.js +269 -0
- package/dist/core/context-catalog.js.map +1 -0
- package/dist/core/context-types.d.ts +3 -0
- package/dist/core/context-types.d.ts.map +1 -0
- package/dist/core/context-types.js +18 -0
- package/dist/core/context-types.js.map +1 -0
- package/dist/core/dataset-generator.d.ts +135 -0
- package/dist/core/dataset-generator.d.ts.map +1 -0
- package/dist/core/dataset-generator.js +895 -0
- package/dist/core/dataset-generator.js.map +1 -0
- package/dist/core/eliza1-benchmark-recipe.d.ts +18 -0
- package/dist/core/eliza1-benchmark-recipe.d.ts.map +1 -0
- package/dist/core/eliza1-benchmark-recipe.js +64 -0
- package/dist/core/eliza1-benchmark-recipe.js.map +1 -0
- package/dist/core/eliza1-bundle-stager.d.ts +57 -0
- package/dist/core/eliza1-bundle-stager.d.ts.map +1 -0
- package/dist/core/eliza1-bundle-stager.js +149 -0
- package/dist/core/eliza1-bundle-stager.js.map +1 -0
- package/dist/core/ensure-cron-job.d.ts +53 -0
- package/dist/core/ensure-cron-job.d.ts.map +1 -0
- package/dist/core/ensure-cron-job.js +51 -0
- package/dist/core/ensure-cron-job.js.map +1 -0
- package/dist/core/eval-comparison-artifact.d.ts +72 -0
- package/dist/core/eval-comparison-artifact.d.ts.map +1 -0
- package/dist/core/eval-comparison-artifact.js +281 -0
- package/dist/core/eval-comparison-artifact.js.map +1 -0
- package/dist/core/feed-generation-runner.d.ts +37 -0
- package/dist/core/feed-generation-runner.d.ts.map +1 -0
- package/dist/core/feed-generation-runner.js +232 -0
- package/dist/core/feed-generation-runner.js.map +1 -0
- package/dist/core/html-escape.d.ts +5 -0
- package/dist/core/html-escape.d.ts.map +1 -0
- package/dist/core/html-escape.js +11 -0
- package/dist/core/html-escape.js.map +1 -0
- package/dist/core/huggingface-dataset-ingest.d.ts +52 -0
- package/dist/core/huggingface-dataset-ingest.d.ts.map +1 -0
- package/dist/core/huggingface-dataset-ingest.js +134 -0
- package/dist/core/huggingface-dataset-ingest.js.map +1 -0
- package/dist/core/index.d.ts +29 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +204 -0
- package/dist/core/index.js.map +1 -0
- package/dist/core/privacy-filter.d.ts +95 -0
- package/dist/core/privacy-filter.d.ts.map +1 -0
- package/dist/core/privacy-filter.js +324 -0
- package/dist/core/privacy-filter.js.map +1 -0
- package/dist/core/promotion-gate.d.ts +117 -0
- package/dist/core/promotion-gate.d.ts.map +1 -0
- package/dist/core/promotion-gate.js +85 -0
- package/dist/core/promotion-gate.js.map +1 -0
- package/dist/core/promotion-persist.d.ts +116 -0
- package/dist/core/promotion-persist.d.ts.map +1 -0
- package/dist/core/promotion-persist.js +93 -0
- package/dist/core/promotion-persist.js.map +1 -0
- package/dist/core/prompt-compare.d.ts +99 -0
- package/dist/core/prompt-compare.d.ts.map +1 -0
- package/dist/core/prompt-compare.js +210 -0
- package/dist/core/prompt-compare.js.map +1 -0
- package/dist/core/replay-validator.d.ts +136 -0
- package/dist/core/replay-validator.d.ts.map +1 -0
- package/dist/core/replay-validator.js +312 -0
- package/dist/core/replay-validator.js.map +1 -0
- package/dist/core/roleplay-executor.d.ts +123 -0
- package/dist/core/roleplay-executor.d.ts.map +1 -0
- package/dist/core/roleplay-executor.js +675 -0
- package/dist/core/roleplay-executor.js.map +1 -0
- package/dist/core/roleplay-trajectories.d.ts +54 -0
- package/dist/core/roleplay-trajectories.d.ts.map +1 -0
- package/dist/core/roleplay-trajectories.js +88 -0
- package/dist/core/roleplay-trajectories.js.map +1 -0
- package/dist/core/scenario-blueprints.d.ts +62 -0
- package/dist/core/scenario-blueprints.d.ts.map +1 -0
- package/dist/core/scenario-blueprints.js +850 -0
- package/dist/core/scenario-blueprints.js.map +1 -0
- package/dist/core/scenario-runner.d.ts +36 -0
- package/dist/core/scenario-runner.d.ts.map +1 -0
- package/dist/core/scenario-runner.js +216 -0
- package/dist/core/scenario-runner.js.map +1 -0
- package/dist/core/skill-scoring-cron.d.ts +57 -0
- package/dist/core/skill-scoring-cron.d.ts.map +1 -0
- package/dist/core/skill-scoring-cron.js +180 -0
- package/dist/core/skill-scoring-cron.js.map +1 -0
- package/dist/core/test-trajectory-collector.d.ts +37 -0
- package/dist/core/test-trajectory-collector.d.ts.map +1 -0
- package/dist/core/test-trajectory-collector.js +225 -0
- package/dist/core/test-trajectory-collector.js.map +1 -0
- package/dist/core/track-c-queue-task.d.ts +37 -0
- package/dist/core/track-c-queue-task.d.ts.map +1 -0
- package/dist/core/track-c-queue-task.js +104 -0
- package/dist/core/track-c-queue-task.js.map +1 -0
- package/dist/core/training-analysis-index.d.ts +104 -0
- package/dist/core/training-analysis-index.d.ts.map +1 -0
- package/dist/core/training-analysis-index.js +3297 -0
- package/dist/core/training-analysis-index.js.map +1 -0
- package/dist/core/training-collection-runner.d.ts +508 -0
- package/dist/core/training-collection-runner.d.ts.map +1 -0
- package/dist/core/training-collection-runner.js +2299 -0
- package/dist/core/training-collection-runner.js.map +1 -0
- package/dist/core/training-config.d.ts +52 -0
- package/dist/core/training-config.d.ts.map +1 -0
- package/dist/core/training-config.js +117 -0
- package/dist/core/training-config.js.map +1 -0
- package/dist/core/training-orchestrator.d.ts +112 -0
- package/dist/core/training-orchestrator.d.ts.map +1 -0
- package/dist/core/training-orchestrator.js +729 -0
- package/dist/core/training-orchestrator.js.map +1 -0
- package/dist/core/training-readiness-report.d.ts +52 -0
- package/dist/core/training-readiness-report.d.ts.map +1 -0
- package/dist/core/training-readiness-report.js +765 -0
- package/dist/core/training-readiness-report.js.map +1 -0
- package/dist/core/trajectory-consumer.d.ts +15 -0
- package/dist/core/trajectory-consumer.d.ts.map +1 -0
- package/dist/core/trajectory-consumer.js +61 -0
- package/dist/core/trajectory-consumer.js.map +1 -0
- package/dist/core/trajectory-export-bundle.d.ts +95 -0
- package/dist/core/trajectory-export-bundle.d.ts.map +1 -0
- package/dist/core/trajectory-export-bundle.js +561 -0
- package/dist/core/trajectory-export-bundle.js.map +1 -0
- package/dist/core/trajectory-export-cron.d.ts +57 -0
- package/dist/core/trajectory-export-cron.d.ts.map +1 -0
- package/dist/core/trajectory-export-cron.js +170 -0
- package/dist/core/trajectory-export-cron.js.map +1 -0
- package/dist/core/trajectory-hf-upload.d.ts +50 -0
- package/dist/core/trajectory-hf-upload.d.ts.map +1 -0
- package/dist/core/trajectory-hf-upload.js +111 -0
- package/dist/core/trajectory-hf-upload.js.map +1 -0
- package/dist/core/trajectory-task-datasets.d.ts +62 -0
- package/dist/core/trajectory-task-datasets.d.ts.map +1 -0
- package/dist/core/trajectory-task-datasets.js +427 -0
- package/dist/core/trajectory-task-datasets.js.map +1 -0
- package/dist/core/wait-for-service.d.ts +25 -0
- package/dist/core/wait-for-service.d.ts.map +1 -0
- package/dist/core/wait-for-service.js +19 -0
- package/dist/core/wait-for-service.js.map +1 -0
- package/dist/core/workspace-runtime.d.ts +4 -0
- package/dist/core/workspace-runtime.d.ts.map +1 -0
- package/dist/core/workspace-runtime.js +25 -0
- package/dist/core/workspace-runtime.js.map +1 -0
- package/dist/dspy/artifact.d.ts +54 -0
- package/dist/dspy/artifact.d.ts.map +1 -0
- package/dist/dspy/artifact.js +61 -0
- package/dist/dspy/artifact.js.map +1 -0
- package/dist/dspy/chain-of-thought.d.ts +27 -0
- package/dist/dspy/chain-of-thought.d.ts.map +1 -0
- package/dist/dspy/chain-of-thought.js +43 -0
- package/dist/dspy/chain-of-thought.js.map +1 -0
- package/dist/dspy/examples.d.ts +72 -0
- package/dist/dspy/examples.d.ts.map +1 -0
- package/dist/dspy/examples.js +105 -0
- package/dist/dspy/examples.js.map +1 -0
- package/dist/dspy/index.d.ts +15 -0
- package/dist/dspy/index.d.ts.map +1 -0
- package/dist/dspy/index.js +40 -0
- package/dist/dspy/index.js.map +1 -0
- package/dist/dspy/lm-adapter.d.ts +100 -0
- package/dist/dspy/lm-adapter.d.ts.map +1 -0
- package/dist/dspy/lm-adapter.js +81 -0
- package/dist/dspy/lm-adapter.js.map +1 -0
- package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts +23 -0
- package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts.map +1 -0
- package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js +85 -0
- package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js.map +1 -0
- package/dist/dspy/optimizers/dspy-copro.d.ts +29 -0
- package/dist/dspy/optimizers/dspy-copro.d.ts.map +1 -0
- package/dist/dspy/optimizers/dspy-copro.js +141 -0
- package/dist/dspy/optimizers/dspy-copro.js.map +1 -0
- package/dist/dspy/optimizers/dspy-mipro.d.ts +37 -0
- package/dist/dspy/optimizers/dspy-mipro.d.ts.map +1 -0
- package/dist/dspy/optimizers/dspy-mipro.js +194 -0
- package/dist/dspy/optimizers/dspy-mipro.js.map +1 -0
- package/dist/dspy/optimizers/index.d.ts +5 -0
- package/dist/dspy/optimizers/index.d.ts.map +1 -0
- package/dist/dspy/optimizers/index.js +11 -0
- package/dist/dspy/optimizers/index.js.map +1 -0
- package/dist/dspy/optimizers/types.d.ts +39 -0
- package/dist/dspy/optimizers/types.d.ts.map +1 -0
- package/dist/dspy/optimizers/types.js +1 -0
- package/dist/dspy/optimizers/types.js.map +1 -0
- package/dist/dspy/predict.d.ts +49 -0
- package/dist/dspy/predict.d.ts.map +1 -0
- package/dist/dspy/predict.js +73 -0
- package/dist/dspy/predict.js.map +1 -0
- package/dist/dspy/signature.d.ts +88 -0
- package/dist/dspy/signature.d.ts.map +1 -0
- package/dist/dspy/signature.js +205 -0
- package/dist/dspy/signature.js.map +1 -0
- package/dist/index.d.ts +15 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +15 -0
- package/dist/index.js.map +1 -0
- package/dist/optimizers/bootstrap-fewshot.d.ts +42 -0
- package/dist/optimizers/bootstrap-fewshot.d.ts.map +1 -0
- package/dist/optimizers/bootstrap-fewshot.js +92 -0
- package/dist/optimizers/bootstrap-fewshot.js.map +1 -0
- package/dist/optimizers/gepa.d.ts +63 -0
- package/dist/optimizers/gepa.d.ts.map +1 -0
- package/dist/optimizers/gepa.js +232 -0
- package/dist/optimizers/gepa.js.map +1 -0
- package/dist/optimizers/index.d.ts +7 -0
- package/dist/optimizers/index.d.ts.map +1 -0
- package/dist/optimizers/index.js +51 -0
- package/dist/optimizers/index.js.map +1 -0
- package/dist/optimizers/instruction-search.d.ts +39 -0
- package/dist/optimizers/instruction-search.d.ts.map +1 -0
- package/dist/optimizers/instruction-search.js +108 -0
- package/dist/optimizers/instruction-search.js.map +1 -0
- package/dist/optimizers/prompt-evolution.d.ts +39 -0
- package/dist/optimizers/prompt-evolution.d.ts.map +1 -0
- package/dist/optimizers/prompt-evolution.js +101 -0
- package/dist/optimizers/prompt-evolution.js.map +1 -0
- package/dist/optimizers/scoring.d.ts +139 -0
- package/dist/optimizers/scoring.d.ts.map +1 -0
- package/dist/optimizers/scoring.js +299 -0
- package/dist/optimizers/scoring.js.map +1 -0
- package/dist/optimizers/types.d.ts +105 -0
- package/dist/optimizers/types.d.ts.map +1 -0
- package/dist/optimizers/types.js +1 -0
- package/dist/optimizers/types.js.map +1 -0
- package/dist/register-runtime.d.ts +3 -0
- package/dist/register-runtime.d.ts.map +1 -0
- package/dist/register-runtime.js +60 -0
- package/dist/register-runtime.js.map +1 -0
- package/dist/register-terminal-view.d.ts +15 -0
- package/dist/register-terminal-view.d.ts.map +1 -0
- package/dist/register-terminal-view.js +31 -0
- package/dist/register-terminal-view.js.map +1 -0
- package/dist/routes/experience-routes.d.ts +21 -0
- package/dist/routes/experience-routes.d.ts.map +1 -0
- package/dist/routes/experience-routes.js +513 -0
- package/dist/routes/experience-routes.js.map +1 -0
- package/dist/routes/index.d.ts +5 -0
- package/dist/routes/index.d.ts.map +1 -0
- package/dist/routes/index.js +17 -0
- package/dist/routes/index.js.map +1 -0
- package/dist/routes/training-routes.d.ts +10 -0
- package/dist/routes/training-routes.d.ts.map +1 -0
- package/dist/routes/training-routes.js +1239 -0
- package/dist/routes/training-routes.js.map +1 -0
- package/dist/routes/training-vast-routes.d.ts +35 -0
- package/dist/routes/training-vast-routes.d.ts.map +1 -0
- package/dist/routes/training-vast-routes.js +249 -0
- package/dist/routes/training-vast-routes.js.map +1 -0
- package/dist/routes/trajectory-routes.d.ts +19 -0
- package/dist/routes/trajectory-routes.d.ts.map +1 -0
- package/dist/routes/trajectory-routes.js +1122 -0
- package/dist/routes/trajectory-routes.js.map +1 -0
- package/dist/services/index.d.ts +9 -0
- package/dist/services/index.d.ts.map +1 -0
- package/dist/services/index.js +63 -0
- package/dist/services/index.js.map +1 -0
- package/dist/services/training-backend-check.d.ts +8 -0
- package/dist/services/training-backend-check.d.ts.map +1 -0
- package/dist/services/training-backend-check.js +31 -0
- package/dist/services/training-backend-check.js.map +1 -0
- package/dist/services/training-service-like.d.ts +40 -0
- package/dist/services/training-service-like.d.ts.map +1 -0
- package/dist/services/training-service-like.js +1 -0
- package/dist/services/training-service-like.js.map +1 -0
- package/dist/services/training-service-registry.d.ts +4 -0
- package/dist/services/training-service-registry.d.ts.map +1 -0
- package/dist/services/training-service-registry.js +12 -0
- package/dist/services/training-service-registry.js.map +1 -0
- package/dist/services/training-service.d.ts +59 -0
- package/dist/services/training-service.d.ts.map +1 -0
- package/dist/services/training-service.js +154 -0
- package/dist/services/training-service.js.map +1 -0
- package/dist/services/training-trigger.d.ts +177 -0
- package/dist/services/training-trigger.d.ts.map +1 -0
- package/dist/services/training-trigger.js +300 -0
- package/dist/services/training-trigger.js.map +1 -0
- package/dist/services/training-vast-service.d.ts +149 -0
- package/dist/services/training-vast-service.d.ts.map +1 -0
- package/dist/services/training-vast-service.js +648 -0
- package/dist/services/training-vast-service.js.map +1 -0
- package/dist/services/vast-inference-stats.d.ts +37 -0
- package/dist/services/vast-inference-stats.d.ts.map +1 -0
- package/dist/services/vast-inference-stats.js +81 -0
- package/dist/services/vast-inference-stats.js.map +1 -0
- package/dist/services/vast-job-store.d.ts +74 -0
- package/dist/services/vast-job-store.d.ts.map +1 -0
- package/dist/services/vast-job-store.js +194 -0
- package/dist/services/vast-job-store.js.map +1 -0
- package/dist/services/vast-subprocess.d.ts +27 -0
- package/dist/services/vast-subprocess.d.ts.map +1 -0
- package/dist/services/vast-subprocess.js +78 -0
- package/dist/services/vast-subprocess.js.map +1 -0
- package/dist/setup-routes.d.ts +17 -0
- package/dist/setup-routes.d.ts.map +1 -0
- package/dist/setup-routes.js +319 -0
- package/dist/setup-routes.js.map +1 -0
- package/dist/ui/FineTuningSpatialView.d.ts +49 -0
- package/dist/ui/FineTuningSpatialView.d.ts.map +1 -0
- package/dist/ui/FineTuningSpatialView.js +154 -0
- package/dist/ui/FineTuningSpatialView.js.map +1 -0
- package/dist/ui/FineTuningView.d.ts +7 -0
- package/dist/ui/FineTuningView.d.ts.map +1 -0
- package/dist/ui/FineTuningView.helpers.d.ts +17 -0
- package/dist/ui/FineTuningView.helpers.d.ts.map +1 -0
- package/dist/ui/FineTuningView.helpers.js +30 -0
- package/dist/ui/FineTuningView.helpers.js.map +1 -0
- package/dist/ui/FineTuningView.interact.d.ts +2 -0
- package/dist/ui/FineTuningView.interact.d.ts.map +1 -0
- package/dist/ui/FineTuningView.interact.js +300 -0
- package/dist/ui/FineTuningView.interact.js.map +1 -0
- package/dist/ui/FineTuningView.js +4653 -0
- package/dist/ui/FineTuningView.js.map +1 -0
- package/dist/ui/fine-tuning-panels.d.ts +100 -0
- package/dist/ui/fine-tuning-panels.d.ts.map +1 -0
- package/dist/ui/fine-tuning-panels.helpers.d.ts +19 -0
- package/dist/ui/fine-tuning-panels.helpers.d.ts.map +1 -0
- package/dist/ui/fine-tuning-panels.helpers.js +77 -0
- package/dist/ui/fine-tuning-panels.helpers.js.map +1 -0
- package/dist/ui/fine-tuning-panels.js +928 -0
- package/dist/ui/fine-tuning-panels.js.map +1 -0
- package/dist/ui/index.d.ts +5 -0
- package/dist/ui/index.d.ts.map +1 -0
- package/dist/ui/index.js +5 -0
- package/dist/ui/index.js.map +1 -0
- package/dist/ui/training-view-bundle.d.ts +3 -0
- package/dist/ui/training-view-bundle.d.ts.map +1 -0
- package/dist/ui/training-view-bundle.js +7 -0
- package/dist/ui/training-view-bundle.js.map +1 -0
- package/dist/views/bundle.js +5312 -0
- package/dist/views/bundle.js.map +1 -0
- package/package.json +7 -7
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
import { mkdir, writeFile } from "node:fs/promises";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import { join } from "node:path";
|
|
4
|
+
import {
|
|
5
|
+
canonicalElizaOneTierSort,
|
|
6
|
+
normalizeElizaOneBenchmarkTier
|
|
7
|
+
} from "./eliza1-benchmark-recipe.js";
|
|
8
|
+
import { EVAL_COMPARISON_ARTIFACT_SCHEMA } from "./eval-comparison-artifact.js";
|
|
9
|
+
import { trainingStateRoot } from "./training-config.js";
|
|
10
|
+
// Schema id and version for benchmark matrix artifacts.
const BENCHMARK_MATRIX_ARTIFACT_SCHEMA = "eliza_benchmark_matrix_artifact";
const BENCHMARK_MATRIX_ARTIFACT_VERSION = 1;

// Schema id carried by action-selection benchmark report payloads.
const ACTION_BENCHMARK_REPORT_SCHEMA = "eliza_action_selection_benchmark_report";

// Default benchmark ids applied when an artifact source does not name one.
const ACTION_SELECTION_BENCHMARK_ID = "eliza_harness_action_selection";
const LOCAL_EVAL_COMPARISON_BENCHMARK_ID = "eliza_harness_local_eval_comparison";

// Tier labels of the matrix, listed smallest to largest.
const ELIZA_ONE_MATRIX_TIERS = ["0b", "2b", "4b", "9b", "27b"];
|
|
22
|
+
/**
 * Narrow an arbitrary value to a non-array object.
 *
 * @param {unknown} value - Candidate value.
 * @returns {object|null} `value` itself when it is a truthy object that is
 *   not an array; `null` for everything else.
 */
function asRecord(value) {
  if (!value) return null;
  if (typeof value !== "object") return null;
  if (Array.isArray(value)) return null;
  return value;
}
|
|
25
|
+
/**
 * Read a value as a non-blank string.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string|null} The trimmed text when `value` is a string containing
 *   at least one non-whitespace character; otherwise `null`.
 */
function asString(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
|
|
28
|
+
/**
 * Read a value as a finite number.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number|null} `value` when it is a number primitive that is finite;
 *   `null` for NaN, infinities and non-numbers.
 */
function asNumber(value) {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return null;
  }
  return value;
}
|
|
31
|
+
/**
 * Assert that a score is a finite number.
 *
 * @param {unknown} value - Score to validate.
 * @returns {number} The same value, unchanged.
 * @throws {Error} When `Number.isFinite(value)` is false.
 */
function finiteScore(value) {
  if (Number.isFinite(value)) {
    return value;
  }
  throw new Error(`score must be finite; got ${value}`);
}
|
|
36
|
+
/**
 * Round a metric to six decimal places.
 *
 * @param {number|null} value - Metric to round, or `null` when unavailable.
 * @returns {number|null} `null` when `value` is `null`; otherwise the number
 *   parsed back from `value.toFixed(6)`.
 */
function roundMetric(value) {
  if (value === null) {
    return null;
  }
  const fixed = value.toFixed(6);
  return Number(fixed);
}
|
|
39
|
+
/**
 * Relative change from `base` to `next`, expressed in percent of |base|.
 *
 * @param {number|null} base - Starting value.
 * @param {number|null} next - Value being compared against `base`.
 * @returns {number|null} `null` when either input is `null` or `base` is 0
 *   (the ratio is undefined); otherwise `(next - base) / |base| * 100`.
 */
function percentDelta(base, next) {
  const missingInput = base === null || next === null;
  if (missingInput || base === 0) {
    return null;
  }
  const change = next - base;
  return change / Math.abs(base) * 100;
}
|
|
43
|
+
/**
 * Tell whether a normalized row was produced by a dry run.
 *
 * @param {object|null|undefined} row - Row with `metrics` and `raw` objects.
 * @returns {boolean} `false` for a missing row; otherwise `true` when any of
 *   `row.metrics.dryRun`, `row.raw.dryRun` or `row.raw.source.dryRun` is
 *   strictly `true`.
 */
function isDryRunRow(row) {
  if (!row) {
    return false;
  }
  // `raw.source` is only consulted when it is a non-array object.
  const embeddedSource = asRecord(row.raw.source);
  const flags = [row.metrics.dryRun, row.raw.dryRun, embeddedSource?.dryRun];
  return flags.some((flag) => flag === true);
}
|
|
48
|
+
/**
 * Resolve the tier label for a model.
 *
 * An explicit tier always wins: it is passed through
 * `normalizeElizaOneBenchmarkTier`, and the trimmed explicit value is used
 * as-is when the normalizer returns nothing. Without an explicit tier the
 * label is inferred from the lower-cased model id.
 *
 * The id is matched on token boundaries rather than by plain substring, so
 * unrelated parameter counts are no longer misread (previously "llama-70b"
 * resolved to "0b", "qwen-72b" to "2b", "phi-14b" to "4b").
 *
 * @param {string} modelId - Model identifier to inspect.
 * @param {unknown} explicit - Caller-supplied tier, if any.
 * @returns {string|null} The tier label, or `null` when none can be inferred.
 */
function inferTier(modelId, explicit) {
  const tier = asString(explicit);
  if (tier) return normalizeElizaOneBenchmarkTier(tier) ?? tier;
  const normalized = modelId.toLowerCase();
  // Same priority order as before (largest first). Each label must not be
  // preceded by a digit or decimal point ("70b", "1.2b") nor followed by a
  // letter or digit ("2bit", hex-like fragments such as "0bc").
  const tierPatterns = [
    ["27b", /(?<![\d.])27b(?![a-z\d])/],
    ["9b", /(?<![\d.])9b(?![a-z\d])/],
    ["4b", /(?<![\d.])4b(?![a-z\d])/],
    ["2b", /(?<![\d.])2b(?![a-z\d])/],
    ["0b", /(?<![\d.])0b(?![a-z\d])/]
  ];
  for (const [label, pattern] of tierPatterns) {
    if (pattern.test(normalized)) return label;
  }
  return null;
}
|
|
59
|
+
/**
 * Convert an input row into the normalized shape used by the matrix.
 *
 * @param {object} row - Raw row with `modelId`, `benchmark`, `score`,
 *   `variant` and optional `tier`, `provider`, `datasetVersion`,
 *   `codeCommit`, `ts`, `metrics`, `raw`.
 * @returns {object} Normalized row; optional text fields become trimmed
 *   strings or `null`, `ts` defaults to `null`, `metrics`/`raw` to `{}`.
 * @throws {Error} When `row.score` is not a finite number.
 */
function normalizeRow(row) {
  const { modelId, benchmark, variant } = row;
  const score = finiteScore(row.score);
  // A reference row without an explicit tier stays tier-less instead of
  // having a tier guessed from its model id.
  const isUntieredReference = variant === "reference" && !asString(row.tier);
  const tier = isUntieredReference ? null : inferTier(modelId, row.tier);
  return {
    modelId,
    benchmark,
    score,
    variant,
    tier,
    provider: asString(row.provider),
    datasetVersion: asString(row.datasetVersion),
    codeCommit: asString(row.codeCommit),
    ts: row.ts ?? null,
    metrics: row.metrics ?? {},
    raw: row.raw ?? {}
  };
}
|
|
75
|
+
/**
 * Choose the model id that acts as the reference for the matrix.
 *
 * @param {object[]} rows - Normalized rows.
 * @param {string|null|undefined} explicit - Caller-chosen reference id; used
 *   verbatim when truthy.
 * @returns {string|null} The explicit id, else the model id of the first row
 *   whose variant is "reference", else that of the first row whose provider is
 *   "cerebras", else `null`.
 */
function selectReferenceModelId(rows, explicit) {
  if (explicit) {
    return explicit;
  }
  const candidates = [
    (row) => row.variant === "reference",
    (row) => row.provider === "cerebras"
  ];
  for (const matches of candidates) {
    const modelId = rows.find(matches)?.modelId;
    if (modelId !== null && modelId !== undefined) {
      return modelId;
    }
  }
  return null;
}
|
|
79
|
+
/**
 * Find the row for one (benchmark, variant, tier) cell of the matrix.
 *
 * @param {object[]} rows - Normalized rows.
 * @param {string} benchmark - Benchmark id to match.
 * @param {string} variant - "base", "trained" or "reference".
 * @param {string|null} tier - Tier to match.
 * @returns {object|null} The first matching row. For the "reference" variant
 *   a row with `tier === null` is used as a fallback when no row matches the
 *   requested tier. `null` when nothing matches.
 */
function scoreFor(rows, benchmark, variant, tier) {
  const matchesCell = (row, wantedTier) =>
    row.benchmark === benchmark && row.variant === variant && row.tier === wantedTier;
  const exact = rows.find((row) => matchesCell(row, tier));
  if (exact !== undefined) {
    return exact;
  }
  if (variant !== "reference") {
    return null;
  }
  // Tier-less reference rows apply to every tier.
  return rows.find((row) => matchesCell(row, null)) ?? null;
}
|
|
91
|
+
/**
 * Build one comparison record per (tier, benchmark) cell that has data.
 *
 * Tiers come from rows whose `tier` is not `null` and are ordered with
 * `canonicalElizaOneTierSort`; benchmarks are ordered with the default sort.
 * Cells without a base, trained or reference row are skipped.
 *
 * @param {object[]} rows - Normalized rows.
 * @param {string|null} referenceModelId - Reference id reported for cells that
 *   have no reference row of their own.
 * @returns {object[]} Comparison records holding model ids, scores, absolute
 *   and percent deltas (rounded via `roundMetric`) and a `dryRun` flag that is
 *   true when any participating row is a dry run.
 */
function buildComparisons(rows, referenceModelId) {
  const tierSet = new Set();
  const benchmarkSet = new Set();
  for (const row of rows) {
    if (row.tier !== null) tierSet.add(row.tier);
    benchmarkSet.add(row.benchmark);
  }
  const tiers = [...tierSet].sort(canonicalElizaOneTierSort);
  const benchmarks = [...benchmarkSet].sort();

  const scoreOf = (row) => row?.score ?? null;
  // Absolute score gap, available only when both rows exist.
  const scoreGap = (later, earlier) => later && earlier ? later.score - earlier.score : null;

  const comparisons = [];
  for (const tier of tiers) {
    for (const benchmark of benchmarks) {
      const [base, trained, reference] = ["base", "trained", "reference"].map(
        (variant) => scoreFor(rows, benchmark, variant, tier)
      );
      if (base || trained || reference) {
        comparisons.push({
          tier,
          benchmark,
          baseModelId: base?.modelId ?? null,
          trainedModelId: trained?.modelId ?? null,
          referenceModelId: reference?.modelId ?? referenceModelId,
          baseScore: scoreOf(base),
          trainedScore: scoreOf(trained),
          referenceScore: scoreOf(reference),
          improvementAbsolute: roundMetric(scoreGap(trained, base)),
          improvementPercent: roundMetric(percentDelta(scoreOf(base), scoreOf(trained))),
          trainedVsReferenceAbsolute: roundMetric(scoreGap(trained, reference)),
          trainedVsReferencePercent: roundMetric(percentDelta(scoreOf(reference), scoreOf(trained))),
          dryRun: [base, trained, reference].some((row) => isDryRunRow(row))
        });
      }
    }
  }
  return comparisons;
}
|
|
135
|
+
/**
 * Replace every ":" and "." in a timestamp string with "-".
 *
 * @param {string} value - Timestamp text.
 * @returns {string} The text with colons and dots replaced by dashes.
 */
function safeTimestamp(value) {
  const separators = /[:.]/g;
  return value.replace(separators, "-");
}
|
|
138
|
+
/**
 * Convert an action-selection benchmark report into a single matrix row.
 *
 * Values given on `source` take precedence over those embedded in
 * `payload.source`. The score is `payload.summary.accuracy`; a dry run with no
 * accuracy scores 0. Up to eight entries of `payload.results` are kept as case
 * samples under `raw.caseSamples`.
 *
 * @param {object} payload - Parsed report payload.
 * @param {object} source - Artifact descriptor (`path` plus optional
 *   overrides such as `modelId`, `variant`, `benchmark`, `tier`, `provider`,
 *   `datasetVersion`, `codeCommit`, `useMocks`).
 * @returns {object[]} An array holding exactly one row.
 * @throws {Error} When model id or variant cannot be resolved, or when the
 *   accuracy is missing and the report is not a dry run.
 */
function rowFromActionBenchmarkArtifact(payload, source) {
  const reportSource = asRecord(payload.source) ?? {};
  // Descriptor override first, then the value embedded in the report.
  const pick = (key) => source[key] ?? asString(reportSource[key]) ?? void 0;
  const firstString = (...candidates) => {
    for (const candidate of candidates) {
      const text = asString(candidate);
      if (text !== null) return text;
    }
    return null;
  };

  const embeddedVariant = reportSource.variant;
  const acceptedVariants = ["reference", "base", "trained"];
  const modelId = pick("modelId");
  const variant = source.variant ?? (acceptedVariants.includes(embeddedVariant) ? embeddedVariant : void 0);
  if (!(modelId && variant)) {
    throw new Error(
      `Action benchmark artifact ${source.path} requires modelId and variant`
    );
  }

  const summary = asRecord(payload.summary) ?? {};
  const dryRun = [payload.dryRun, reportSource.dryRun].some((flag) => flag === true);
  const useMocks = [source.useMocks, reportSource.useMocks, payload.useMocks].some((flag) => flag === true);

  let score = asNumber(summary.accuracy);
  if (score === null && dryRun) {
    score = 0;
  }
  if (score === null) {
    throw new Error(
      `Action benchmark artifact ${source.path} missing accuracy`
    );
  }

  const caseSamples = [];
  if (Array.isArray(payload.results)) {
    for (const entry of payload.results) {
      if (caseSamples.length === 8) break;
      const result = asRecord(entry);
      if (result === null) continue;
      caseSamples.push({
        caseId: asString(result.caseId),
        prompt: firstString(result.prompt, result.input, result.userPrompt),
        expectedAction: asString(result.expectedAction),
        actualAction: asString(result.actualAction),
        pass: result.pass === true,
        response: firstString(result.response, result.output, result.finalResponse, result.failureReason),
        latencyMs: asNumber(result.latencyMs),
        trajectoryPath: asString(result.trajectoryPath)
      });
    }
  }

  const row = {
    modelId,
    variant,
    benchmark: pick("benchmark") ?? ACTION_SELECTION_BENCHMARK_ID,
    score,
    tier: pick("tier"),
    provider: pick("provider"),
    datasetVersion: pick("datasetVersion"),
    codeCommit: pick("codeCommit"),
    ts: asString(payload.generatedAt) ?? void 0,
    metrics: {
      plannerAccuracy: summary.plannerAccuracy,
      executionAccuracy: summary.executionAccuracy,
      total: summary.total,
      passed: summary.passed,
      failed: summary.failed,
      latency: summary.latency,
      failureModes: payload.failureModes,
      dryRun,
      useMocks
    },
    raw: {
      artifactPath: source.path,
      schema: payload.schema,
      source: payload.source,
      caseSamples,
      dryRun,
      useMocks
    }
  };
  return [row];
}
|
|
200
|
+
/**
 * Convert an eval-comparison artifact into base and trained matrix rows.
 *
 * A row is emitted for a variant only when both its model id and its finite
 * score (`metrics.baseScore` / `metrics.trainedScore`) are available. When
 * `source.variant` names a variant, `source.modelId` overrides the id embedded
 * in `payload.models` for that variant. If the descriptor names the variant
 * but carries no `modelId`, the embedded id is used instead of silently
 * dropping the row.
 *
 * @param {object} payload - Parsed eval-comparison payload.
 * @param {object} source - Artifact descriptor (`path` plus optional
 *   `benchmark`, `variant`, `modelId`, `tier`, `provider`, `datasetVersion`,
 *   `codeCommit`).
 * @returns {object[]} Zero, one or two rows, base first.
 */
function rowsFromEvalComparisonArtifact(payload, source) {
  const models = asRecord(payload.models) ?? {};
  const metrics = asRecord(payload.metrics) ?? {};
  const benchmark = source.benchmark ?? LOCAL_EVAL_COMPARISON_BENCHMARK_ID;
  const ts = asString(payload.generatedAt) ?? void 0;

  // Descriptor id wins for the variant it targets; otherwise use the
  // id recorded inside the artifact.
  const resolveModelId = (variant) => {
    const embedded = asString(models[variant]);
    return source.variant === variant ? source.modelId ?? embedded : embedded;
  };

  // Shared row shape; only the variant-specific fields differ.
  const makeRow = (variant, modelId, score, variantMetrics) => ({
    modelId,
    variant,
    benchmark,
    score,
    tier: source.tier,
    provider: source.provider,
    datasetVersion: source.datasetVersion,
    codeCommit: source.codeCommit,
    ts,
    metrics: variantMetrics,
    raw: {
      artifactPath: source.path,
      schema: payload.schema
    }
  });

  const rows = [];
  const baseModelId = resolveModelId("base");
  const baseScore = asNumber(metrics.baseScore);
  if (baseModelId && baseScore !== null) {
    rows.push(
      makeRow("base", baseModelId, baseScore, {
        latencyMs: metrics.baseLatencyMs,
        promptCount: metrics.promptCount
      })
    );
  }
  const trainedModelId = resolveModelId("trained");
  const trainedScore = asNumber(metrics.trainedScore);
  if (trainedModelId && trainedScore !== null) {
    rows.push(
      makeRow("trained", trainedModelId, trainedScore, {
        latencyMs: metrics.trainedLatencyMs,
        promptCount: metrics.promptCount,
        improvementAbsolute: metrics.improvementAbsolute,
        improvementPercent: metrics.improvementPercent
      })
    );
  }
  return rows;
}
|
|
255
|
+
/**
 * Re-read the rows of an existing benchmark matrix artifact.
 *
 * Entries of `payload.rows` that are not non-array objects are ignored.
 * Descriptor values on `source` override the per-row benchmark, tier,
 * provider, dataset version and code commit.
 *
 * @param {object} payload - Parsed matrix artifact.
 * @param {object} source - Artifact descriptor (`path` plus optional
 *   overrides).
 * @returns {object[]} One row per usable entry; empty when `payload.rows` is
 *   not an array.
 * @throws {Error} When an entry lacks a model id, benchmark or finite score,
 *   or has a variant other than "reference", "base" or "trained".
 */
function rowsFromBenchmarkMatrixArtifact(payload, source) {
  if (!Array.isArray(payload.rows)) {
    return [];
  }
  const knownVariants = ["reference", "base", "trained"];
  const converted = [];
  for (const entry of payload.rows) {
    const row = asRecord(entry);
    if (row === null) continue;
    const modelId = asString(row.modelId);
    const benchmark = asString(row.benchmark);
    const { variant } = row;
    const score = asNumber(row.score);
    const isValid = modelId && benchmark && score !== null && knownVariants.includes(variant);
    if (!isValid) {
      throw new Error(
        `Benchmark matrix artifact ${source.path} has an invalid row`
      );
    }
    // Provenance fields are written last so they replace any stale values
    // carried in the row's own `raw` object.
    const raw = {
      ...(asRecord(row.raw) ?? {}),
      artifactPath: source.path,
      schema: payload.schema
    };
    converted.push({
      modelId,
      benchmark: source.benchmark ?? benchmark,
      score,
      variant,
      tier: source.tier ?? asString(row.tier) ?? void 0,
      provider: source.provider ?? asString(row.provider) ?? void 0,
      datasetVersion: source.datasetVersion ?? asString(row.datasetVersion) ?? void 0,
      codeCommit: source.codeCommit ?? asString(row.codeCommit) ?? void 0,
      ts: row.ts,
      metrics: asRecord(row.metrics) ?? {},
      raw
    });
  }
  return converted;
}
|
|
286
|
+
/**
 * Converts one parsed artifact into matrix row inputs, choosing the converter
 * from `payload.schema`.
 *
 * @param {Record<string, unknown>} payload - Parsed artifact JSON.
 * @param {object} source - Artifact descriptor, forwarded to the converter;
 *   `path` is used in the error message.
 * @returns {object[]} Rows produced by the matching converter.
 * @throws {Error} When `payload.schema` is none of the three supported schemas.
 */
function buildBenchmarkMatrixRowsFromArtifactPayload(payload, source) {
  switch (payload.schema) {
    case ACTION_BENCHMARK_REPORT_SCHEMA:
      return rowFromActionBenchmarkArtifact(payload, source);
    case EVAL_COMPARISON_ARTIFACT_SCHEMA:
      return rowsFromEvalComparisonArtifact(payload, source);
    case BENCHMARK_MATRIX_ARTIFACT_SCHEMA:
      return rowsFromBenchmarkMatrixArtifact(payload, source);
    default:
      throw new Error(`Unsupported benchmark artifact schema in ${source.path}`);
  }
}
|
|
298
|
+
/**
 * Reads each artifact file in the given order and concatenates the matrix rows
 * derived from it.
 *
 * Files are read one at a time, so the first failing artifact aborts the run.
 *
 * @param {object[]} artifacts - Artifact descriptors; each `path` is read as
 *   UTF-8 JSON and the descriptor is forwarded to the row converter.
 * @returns {Promise<object[]>} Rows from all artifacts, in artifact order.
 * @throws {SyntaxError} When a file is not valid JSON; the message names the
 *   file and the original parse error is attached as `cause`.
 * @throws {Error} When the parsed JSON is not a plain object, or the row
 *   converter rejects the payload. File-system errors from `readFile`
 *   propagate unchanged.
 */
async function buildBenchmarkMatrixRowsFromArtifacts(artifacts) {
  const rows = [];
  for (const source of artifacts) {
    const text = await readFile(source.path, "utf-8");
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch (err) {
      // JSON.parse only reports the offending token/position. Name the file so
      // one bad artifact among many can be found; keep the SyntaxError type so
      // existing `instanceof SyntaxError` handling still matches.
      throw new SyntaxError(
        `Artifact ${source.path} is not valid JSON: ${err.message}`,
        { cause: err }
      );
    }
    const payload = asRecord(parsed);
    if (!payload) {
      throw new Error(`Artifact ${source.path} must be a JSON object`);
    }
    rows.push(...buildBenchmarkMatrixRowsFromArtifactPayload(payload, source));
  }
  return rows;
}
|
|
308
|
+
/**
 * Assembles the in-memory benchmark-matrix artifact from raw row inputs.
 *
 * Rows are normalized first; the reference model id, the distinct tiers, the
 * distinct benchmarks and the comparisons are all derived from the normalized
 * rows. Rows with a falsy tier do not contribute to `tiers`.
 *
 * @param {object} input - `rows` is required; `generatedAt` defaults to the
 *   current time as an ISO string and `source` defaults to
 *   `{ kind: "training_benchmark_matrix" }`; `referenceModelId` is forwarded
 *   to the reference-model selection.
 * @returns {object} The artifact: schema, version, generatedAt, source,
 *   referenceModelId, tiers, benchmarks, counts, rows, comparisons.
 */
function buildBenchmarkMatrixArtifactPayload(input) {
  const rows = input.rows.map(normalizeRow);
  const referenceModelId = selectReferenceModelId(rows, input.referenceModelId);

  // Sets keep first-seen order, which is the order handed to the sorts below.
  const seenTiers = new Set();
  const seenBenchmarks = new Set();
  for (const cell of rows) {
    if (cell.tier) {
      seenTiers.add(cell.tier);
    }
    seenBenchmarks.add(cell.benchmark);
  }
  const tiers = [...seenTiers].sort(canonicalElizaOneTierSort);
  const benchmarks = [...seenBenchmarks].sort();

  const comparisons = buildComparisons(rows, referenceModelId);
  const generatedAt = input.generatedAt ?? new Date().toISOString();
  const source = input.source ?? { kind: "training_benchmark_matrix" };
  const counts = {
    rows: rows.length,
    comparisons: comparisons.length,
    tiers: tiers.length,
    benchmarks: benchmarks.length
  };

  // Key order is part of the serialized artifact, so it matches the schema order.
  return {
    schema: BENCHMARK_MATRIX_ARTIFACT_SCHEMA,
    version: BENCHMARK_MATRIX_ARTIFACT_VERSION,
    generatedAt,
    source,
    referenceModelId,
    tiers,
    benchmarks,
    counts,
    rows,
    comparisons
  };
}
|
|
338
|
+
/**
 * Builds the benchmark-matrix artifact and writes it to
 * `<outputDir>/benchmark-matrix.json` as 2-space-indented JSON followed by a
 * newline.
 *
 * When `input.outputDir` is null or undefined the directory is
 * `<trainingStateRoot()>/benchmarks/<safeTimestamp(generatedAt)>`. The
 * directory is created recursively before the file is written.
 *
 * @param {object} input - Matrix input; see buildBenchmarkMatrixArtifactPayload.
 * @returns {Promise<{outputDir: string, artifactPath: string, artifact: object}>}
 *   Where the artifact was written and the artifact itself.
 */
async function writeBenchmarkMatrixArtifact(input) {
  const artifact = buildBenchmarkMatrixArtifactPayload(input);

  let outputDir = input.outputDir;
  if (outputDir == null) {
    const stamp = safeTimestamp(artifact.generatedAt);
    outputDir = join(trainingStateRoot(), "benchmarks", stamp);
  }
  await mkdir(outputDir, { recursive: true });

  const artifactPath = join(outputDir, "benchmark-matrix.json");
  const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
  await writeFile(artifactPath, serialized, "utf-8");

  return { outputDir, artifactPath, artifact };
}
|
|
355
|
+
/**
 * Loads rows from the listed artifact files, then builds and writes a
 * benchmark-matrix artifact from them.
 *
 * When `input.source` is null or undefined, the written artifact's `source`
 * records the kind "training_benchmark_matrix_from_artifacts" together with
 * the paths of the input artifacts.
 *
 * @param {object} input - `artifacts` (descriptors with a `path`), plus
 *   optional `outputDir`, `generatedAt`, `referenceModelId` and `source`.
 * @returns {Promise<object>} The result of writeBenchmarkMatrixArtifact.
 */
async function writeBenchmarkMatrixArtifactFromArtifacts(input) {
  const rows = await buildBenchmarkMatrixRowsFromArtifacts(input.artifacts);

  // Read the pass-through options only after the rows have been loaded.
  const { outputDir, generatedAt, referenceModelId } = input;
  const source = input.source ?? {
    kind: "training_benchmark_matrix_from_artifacts",
    artifacts: input.artifacts.map(({ path }) => path)
  };

  return writeBenchmarkMatrixArtifact({
    rows,
    outputDir,
    generatedAt,
    referenceModelId,
    source
  });
}
|
|
368
|
+
export {
|
|
369
|
+
ACTION_BENCHMARK_REPORT_SCHEMA,
|
|
370
|
+
ACTION_SELECTION_BENCHMARK_ID,
|
|
371
|
+
BENCHMARK_MATRIX_ARTIFACT_SCHEMA,
|
|
372
|
+
BENCHMARK_MATRIX_ARTIFACT_VERSION,
|
|
373
|
+
ELIZA_ONE_MATRIX_TIERS,
|
|
374
|
+
LOCAL_EVAL_COMPARISON_BENCHMARK_ID,
|
|
375
|
+
buildBenchmarkMatrixArtifactPayload,
|
|
376
|
+
buildBenchmarkMatrixRowsFromArtifactPayload,
|
|
377
|
+
buildBenchmarkMatrixRowsFromArtifacts,
|
|
378
|
+
writeBenchmarkMatrixArtifact,
|
|
379
|
+
writeBenchmarkMatrixArtifactFromArtifacts
|
|
380
|
+
};
|
|
381
|
+
//# sourceMappingURL=benchmark-matrix-artifact.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/core/benchmark-matrix-artifact.ts"],"sourcesContent":["import { mkdir, writeFile } from \"node:fs/promises\";\nimport { readFile } from \"node:fs/promises\";\nimport { join } from \"node:path\";\nimport {\n canonicalElizaOneTierSort,\n normalizeElizaOneBenchmarkTier,\n} from \"./eliza1-benchmark-recipe.js\";\nimport { EVAL_COMPARISON_ARTIFACT_SCHEMA } from \"./eval-comparison-artifact.js\";\nimport { trainingStateRoot } from \"./training-config.js\";\n\nexport const BENCHMARK_MATRIX_ARTIFACT_SCHEMA =\n \"eliza_benchmark_matrix_artifact\";\nexport const BENCHMARK_MATRIX_ARTIFACT_VERSION = 1;\nexport const ACTION_BENCHMARK_REPORT_SCHEMA =\n \"eliza_action_selection_benchmark_report\";\nexport const ACTION_SELECTION_BENCHMARK_ID = \"eliza_harness_action_selection\";\nexport const LOCAL_EVAL_COMPARISON_BENCHMARK_ID =\n \"eliza_harness_local_eval_comparison\";\n\nexport const ELIZA_ONE_MATRIX_TIERS = [\n \"0b\",\n \"2b\",\n \"4b\",\n \"9b\",\n \"27b\",\n] as const;\n\nexport type ElizaOneMatrixTier = (typeof ELIZA_ONE_MATRIX_TIERS)[number];\nexport type BenchmarkMatrixVariant = \"reference\" | \"base\" | \"trained\";\n\nexport interface BenchmarkMatrixRowInput {\n modelId: string;\n benchmark: string;\n score: number;\n variant: BenchmarkMatrixVariant;\n tier?: string;\n provider?: string;\n datasetVersion?: string;\n codeCommit?: string;\n ts?: number | string;\n metrics?: Record<string, unknown>;\n raw?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixInput {\n rows: BenchmarkMatrixRowInput[];\n outputDir?: string;\n generatedAt?: string;\n referenceModelId?: string;\n source?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixArtifactSource {\n path: string;\n modelId?: string;\n benchmark?: string;\n variant?: BenchmarkMatrixVariant;\n tier?: string;\n provider?: string;\n datasetVersion?: string;\n codeCommit?: string;\n useMocks?: boolean;\n}\n\nexport interface BenchmarkMatrixFromArtifactsInput {\n 
artifacts: BenchmarkMatrixArtifactSource[];\n outputDir?: string;\n generatedAt?: string;\n referenceModelId?: string;\n source?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixCell {\n modelId: string;\n benchmark: string;\n score: number;\n variant: BenchmarkMatrixVariant;\n tier: string | null;\n provider: string | null;\n datasetVersion: string | null;\n codeCommit: string | null;\n ts: number | string | null;\n metrics: Record<string, unknown>;\n raw: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixComparison {\n tier: string;\n benchmark: string;\n baseModelId: string | null;\n trainedModelId: string | null;\n referenceModelId: string | null;\n baseScore: number | null;\n trainedScore: number | null;\n referenceScore: number | null;\n improvementAbsolute: number | null;\n improvementPercent: number | null;\n trainedVsReferenceAbsolute: number | null;\n trainedVsReferencePercent: number | null;\n dryRun: boolean;\n}\n\nexport interface BenchmarkMatrixArtifact {\n schema: typeof BENCHMARK_MATRIX_ARTIFACT_SCHEMA;\n version: typeof BENCHMARK_MATRIX_ARTIFACT_VERSION;\n generatedAt: string;\n source: Record<string, unknown>;\n referenceModelId: string | null;\n tiers: string[];\n benchmarks: string[];\n counts: {\n rows: number;\n comparisons: number;\n tiers: number;\n benchmarks: number;\n };\n rows: BenchmarkMatrixCell[];\n comparisons: BenchmarkMatrixComparison[];\n}\n\nexport interface BenchmarkMatrixArtifactResult {\n outputDir: string;\n artifactPath: string;\n artifact: BenchmarkMatrixArtifact;\n}\n\nfunction asRecord(value: unknown): Record<string, unknown> | null {\n return value && typeof value === \"object\" && !Array.isArray(value)\n ? (value as Record<string, unknown>)\n : null;\n}\n\nfunction asString(value: unknown): string | null {\n return typeof value === \"string\" && value.trim().length > 0\n ? 
value.trim()\n : null;\n}\n\nfunction asNumber(value: unknown): number | null {\n return typeof value === \"number\" && Number.isFinite(value) ? value : null;\n}\n\nfunction finiteScore(value: number): number {\n if (!Number.isFinite(value))\n throw new Error(`score must be finite; got ${value}`);\n return value;\n}\n\nfunction roundMetric(value: number | null): number | null {\n return value === null ? null : Number(value.toFixed(6));\n}\n\nfunction percentDelta(base: number | null, next: number | null): number | null {\n if (base === null || next === null || base === 0) return null;\n return ((next - base) / Math.abs(base)) * 100;\n}\n\nfunction isDryRunRow(row: BenchmarkMatrixCell | null | undefined): boolean {\n if (!row) return false;\n const rawSource = asRecord(row.raw.source);\n return (\n row.metrics.dryRun === true ||\n row.raw.dryRun === true ||\n rawSource?.dryRun === true\n );\n}\n\nfunction inferTier(modelId: string, explicit?: string): string | null {\n const tier = asString(explicit);\n if (tier) return normalizeElizaOneBenchmarkTier(tier) ?? tier;\n const normalized = modelId.toLowerCase();\n if (normalized.includes(\"27b\")) return \"27b\";\n if (normalized.includes(\"9b\")) return \"9b\";\n if (normalized.includes(\"4b\")) return \"4b\";\n if (normalized.includes(\"2b\")) return \"2b\";\n if (normalized.includes(\"0b\")) return \"0b\";\n return null;\n}\n\nfunction normalizeRow(row: BenchmarkMatrixRowInput): BenchmarkMatrixCell {\n const explicitReferenceTier = asString(row.tier);\n return {\n modelId: row.modelId,\n benchmark: row.benchmark,\n score: finiteScore(row.score),\n variant: row.variant,\n tier:\n row.variant === \"reference\" && !explicitReferenceTier\n ? null\n : inferTier(row.modelId, row.tier),\n provider: asString(row.provider),\n datasetVersion: asString(row.datasetVersion),\n codeCommit: asString(row.codeCommit),\n ts: row.ts ?? null,\n metrics: row.metrics ?? {},\n raw: row.raw ?? 
{},\n };\n}\n\nfunction selectReferenceModelId(\n rows: readonly BenchmarkMatrixCell[],\n explicit?: string,\n): string | null {\n if (explicit) return explicit;\n return (\n rows.find((row) => row.variant === \"reference\")?.modelId ??\n rows.find((row) => row.provider === \"cerebras\")?.modelId ??\n null\n );\n}\n\nfunction scoreFor(\n rows: readonly BenchmarkMatrixCell[],\n benchmark: string,\n variant: BenchmarkMatrixVariant,\n tier?: string,\n): BenchmarkMatrixCell | null {\n if (variant === \"reference\") {\n return (\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === \"reference\" &&\n row.tier === tier,\n ) ??\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === \"reference\" &&\n row.tier === null,\n ) ??\n null\n );\n }\n return (\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === variant &&\n row.tier === tier,\n ) ?? null\n );\n}\n\nfunction buildComparisons(\n rows: readonly BenchmarkMatrixCell[],\n referenceModelId: string | null,\n): BenchmarkMatrixComparison[] {\n const tiers = Array.from(\n new Set(\n rows\n .map((row) => row.tier)\n .filter((tier): tier is string => tier !== null),\n ),\n ).sort(canonicalElizaOneTierSort);\n const benchmarks = Array.from(\n new Set(rows.map((row) => row.benchmark)),\n ).sort();\n const comparisons: BenchmarkMatrixComparison[] = [];\n for (const tier of tiers) {\n for (const benchmark of benchmarks) {\n const base = scoreFor(rows, benchmark, \"base\", tier);\n const trained = scoreFor(rows, benchmark, \"trained\", tier);\n const reference = scoreFor(rows, benchmark, \"reference\", tier);\n if (!base && !trained && !reference) continue;\n const dryRun =\n isDryRunRow(base) || isDryRunRow(trained) || isDryRunRow(reference);\n comparisons.push({\n tier,\n benchmark,\n baseModelId: base?.modelId ?? null,\n trainedModelId: trained?.modelId ?? null,\n referenceModelId: reference?.modelId ?? referenceModelId,\n baseScore: base?.score ?? 
null,\n trainedScore: trained?.score ?? null,\n referenceScore: reference?.score ?? null,\n improvementAbsolute: roundMetric(\n base && trained ? trained.score - base.score : null,\n ),\n improvementPercent: roundMetric(\n percentDelta(base?.score ?? null, trained?.score ?? null),\n ),\n trainedVsReferenceAbsolute: roundMetric(\n trained && reference ? trained.score - reference.score : null,\n ),\n trainedVsReferencePercent: roundMetric(\n percentDelta(reference?.score ?? null, trained?.score ?? null),\n ),\n dryRun,\n });\n }\n }\n return comparisons;\n}\n\nfunction safeTimestamp(value: string): string {\n return value.replace(/[:.]/g, \"-\");\n}\n\nfunction rowFromActionBenchmarkArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const reportSource = asRecord(payload.source) ?? {};\n const embeddedVariant = reportSource.variant;\n const modelId = source.modelId ?? asString(reportSource.modelId) ?? undefined;\n const variant =\n source.variant ??\n (embeddedVariant === \"reference\" ||\n embeddedVariant === \"base\" ||\n embeddedVariant === \"trained\"\n ? embeddedVariant\n : undefined);\n if (!modelId || !variant) {\n throw new Error(\n `Action benchmark artifact ${source.path} requires modelId and variant`,\n );\n }\n const summary = asRecord(payload.summary) ?? {};\n const dryRun = payload.dryRun === true || reportSource.dryRun === true;\n const useMocks =\n source.useMocks === true ||\n reportSource.useMocks === true ||\n payload.useMocks === true;\n const score = asNumber(summary.accuracy) ?? (dryRun ? 0 : null);\n if (score === null) {\n throw new Error(\n `Action benchmark artifact ${source.path} missing accuracy`,\n );\n }\n const caseSamples = Array.isArray(payload.results)\n ? 
payload.results\n .map(asRecord)\n .filter((result): result is Record<string, unknown> => result !== null)\n .slice(0, 8)\n .map((result) => ({\n caseId: asString(result.caseId),\n prompt:\n asString(result.prompt) ??\n asString(result.input) ??\n asString(result.userPrompt),\n expectedAction: asString(result.expectedAction),\n actualAction: asString(result.actualAction),\n pass: result.pass === true,\n response:\n asString(result.response) ??\n asString(result.output) ??\n asString(result.finalResponse) ??\n asString(result.failureReason),\n latencyMs: asNumber(result.latencyMs),\n trajectoryPath: asString(result.trajectoryPath),\n }))\n : [];\n return [\n {\n modelId,\n variant,\n benchmark:\n source.benchmark ??\n asString(reportSource.benchmark) ??\n ACTION_SELECTION_BENCHMARK_ID,\n score,\n tier: source.tier ?? asString(reportSource.tier) ?? undefined,\n provider: source.provider ?? asString(reportSource.provider) ?? undefined,\n datasetVersion:\n source.datasetVersion ??\n asString(reportSource.datasetVersion) ??\n undefined,\n codeCommit:\n source.codeCommit ?? asString(reportSource.codeCommit) ?? undefined,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n plannerAccuracy: summary.plannerAccuracy,\n executionAccuracy: summary.executionAccuracy,\n total: summary.total,\n passed: summary.passed,\n failed: summary.failed,\n latency: summary.latency,\n failureModes: payload.failureModes,\n dryRun,\n useMocks,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n source: payload.source,\n caseSamples,\n dryRun,\n useMocks,\n },\n },\n ];\n}\n\nfunction rowsFromEvalComparisonArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const models = asRecord(payload.models) ?? {};\n const metrics = asRecord(payload.metrics) ?? {};\n const benchmark = source.benchmark ?? LOCAL_EVAL_COMPARISON_BENCHMARK_ID;\n const baseModelId =\n source.variant === \"base\" ? 
source.modelId : asString(models.base);\n const trainedModelId =\n source.variant === \"trained\" ? source.modelId : asString(models.trained);\n const rows: BenchmarkMatrixRowInput[] = [];\n const baseScore = asNumber(metrics.baseScore);\n if (baseModelId && baseScore !== null) {\n rows.push({\n modelId: baseModelId,\n variant: \"base\",\n benchmark,\n score: baseScore,\n tier: source.tier,\n provider: source.provider,\n datasetVersion: source.datasetVersion,\n codeCommit: source.codeCommit,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n latencyMs: metrics.baseLatencyMs,\n promptCount: metrics.promptCount,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n },\n });\n }\n const trainedScore = asNumber(metrics.trainedScore);\n if (trainedModelId && trainedScore !== null) {\n rows.push({\n modelId: trainedModelId,\n variant: \"trained\",\n benchmark,\n score: trainedScore,\n tier: source.tier,\n provider: source.provider,\n datasetVersion: source.datasetVersion,\n codeCommit: source.codeCommit,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n latencyMs: metrics.trainedLatencyMs,\n promptCount: metrics.promptCount,\n improvementAbsolute: metrics.improvementAbsolute,\n improvementPercent: metrics.improvementPercent,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n },\n });\n }\n return rows;\n}\n\nfunction rowsFromBenchmarkMatrixArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const rows = Array.isArray(payload.rows)\n ? 
payload.rows\n .map(asRecord)\n .filter((row): row is Record<string, unknown> => row !== null)\n : [];\n return rows.map((row) => {\n const modelId = asString(row.modelId);\n const benchmark = asString(row.benchmark);\n const variant = row.variant;\n const score = asNumber(row.score);\n if (\n !modelId ||\n !benchmark ||\n score === null ||\n (variant !== \"reference\" && variant !== \"base\" && variant !== \"trained\")\n ) {\n throw new Error(\n `Benchmark matrix artifact ${source.path} has an invalid row`,\n );\n }\n return {\n modelId,\n benchmark: source.benchmark ?? benchmark,\n score,\n variant,\n tier: source.tier ?? asString(row.tier) ?? undefined,\n provider: source.provider ?? asString(row.provider) ?? undefined,\n datasetVersion:\n source.datasetVersion ?? asString(row.datasetVersion) ?? undefined,\n codeCommit: source.codeCommit ?? asString(row.codeCommit) ?? undefined,\n ts: row.ts as number | string | undefined,\n metrics: asRecord(row.metrics) ?? {},\n raw: {\n ...(asRecord(row.raw) ?? 
{}),\n artifactPath: source.path,\n schema: payload.schema,\n },\n };\n });\n}\n\nexport function buildBenchmarkMatrixRowsFromArtifactPayload(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n if (payload.schema === ACTION_BENCHMARK_REPORT_SCHEMA) {\n return rowFromActionBenchmarkArtifact(payload, source);\n }\n if (payload.schema === EVAL_COMPARISON_ARTIFACT_SCHEMA) {\n return rowsFromEvalComparisonArtifact(payload, source);\n }\n if (payload.schema === BENCHMARK_MATRIX_ARTIFACT_SCHEMA) {\n return rowsFromBenchmarkMatrixArtifact(payload, source);\n }\n throw new Error(`Unsupported benchmark artifact schema in ${source.path}`);\n}\n\nexport async function buildBenchmarkMatrixRowsFromArtifacts(\n artifacts: BenchmarkMatrixArtifactSource[],\n): Promise<BenchmarkMatrixRowInput[]> {\n const rows: BenchmarkMatrixRowInput[] = [];\n for (const source of artifacts) {\n const payload = asRecord(JSON.parse(await readFile(source.path, \"utf-8\")));\n if (!payload)\n throw new Error(`Artifact ${source.path} must be a JSON object`);\n rows.push(...buildBenchmarkMatrixRowsFromArtifactPayload(payload, source));\n }\n return rows;\n}\n\nexport function buildBenchmarkMatrixArtifactPayload(\n input: BenchmarkMatrixInput,\n): BenchmarkMatrixArtifact {\n const rows = input.rows.map(normalizeRow);\n const referenceModelId = selectReferenceModelId(rows, input.referenceModelId);\n const tiers = Array.from(\n new Set(\n rows.map((row) => row.tier).filter((tier): tier is string => !!tier),\n ),\n ).sort(canonicalElizaOneTierSort);\n const benchmarks = Array.from(\n new Set(rows.map((row) => row.benchmark)),\n ).sort();\n const comparisons = buildComparisons(rows, referenceModelId);\n return {\n schema: BENCHMARK_MATRIX_ARTIFACT_SCHEMA,\n version: BENCHMARK_MATRIX_ARTIFACT_VERSION,\n generatedAt: input.generatedAt ?? new Date().toISOString(),\n source: input.source ?? 
{ kind: \"training_benchmark_matrix\" },\n referenceModelId,\n tiers,\n benchmarks,\n counts: {\n rows: rows.length,\n comparisons: comparisons.length,\n tiers: tiers.length,\n benchmarks: benchmarks.length,\n },\n rows,\n comparisons,\n };\n}\n\nexport async function writeBenchmarkMatrixArtifact(\n input: BenchmarkMatrixInput,\n): Promise<BenchmarkMatrixArtifactResult> {\n const artifact = buildBenchmarkMatrixArtifactPayload(input);\n const outputDir =\n input.outputDir ??\n join(\n trainingStateRoot(),\n \"benchmarks\",\n safeTimestamp(artifact.generatedAt),\n );\n await mkdir(outputDir, { recursive: true });\n const artifactPath = join(outputDir, \"benchmark-matrix.json\");\n await writeFile(\n artifactPath,\n `${JSON.stringify(artifact, null, 2)}\\n`,\n \"utf-8\",\n );\n return { outputDir, artifactPath, artifact };\n}\n\nexport async function writeBenchmarkMatrixArtifactFromArtifacts(\n input: BenchmarkMatrixFromArtifactsInput,\n): Promise<BenchmarkMatrixArtifactResult> {\n const rows = await buildBenchmarkMatrixRowsFromArtifacts(input.artifacts);\n return writeBenchmarkMatrixArtifact({\n rows,\n outputDir: input.outputDir,\n generatedAt: input.generatedAt,\n referenceModelId: input.referenceModelId,\n source: input.source ?? 
{\n kind: \"training_benchmark_matrix_from_artifacts\",\n artifacts: input.artifacts.map((artifact) => artifact.path),\n },\n });\n}\n"],"mappings":"AAAA,SAAS,OAAO,iBAAiB;AACjC,SAAS,gBAAgB;AACzB,SAAS,YAAY;AACrB;AAAA,EACE;AAAA,EACA;AAAA,OACK;AACP,SAAS,uCAAuC;AAChD,SAAS,yBAAyB;AAE3B,MAAM,mCACX;AACK,MAAM,oCAAoC;AAC1C,MAAM,iCACX;AACK,MAAM,gCAAgC;AACtC,MAAM,qCACX;AAEK,MAAM,yBAAyB;AAAA,EACpC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAqGA,SAAS,SAAS,OAAgD;AAChE,SAAO,SAAS,OAAO,UAAU,YAAY,CAAC,MAAM,QAAQ,KAAK,IAC5D,QACD;AACN;AAEA,SAAS,SAAS,OAA+B;AAC/C,SAAO,OAAO,UAAU,YAAY,MAAM,KAAK,EAAE,SAAS,IACtD,MAAM,KAAK,IACX;AACN;AAEA,SAAS,SAAS,OAA+B;AAC/C,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK,IAAI,QAAQ;AACvE;AAEA,SAAS,YAAY,OAAuB;AAC1C,MAAI,CAAC,OAAO,SAAS,KAAK;AACxB,UAAM,IAAI,MAAM,6BAA6B,KAAK,EAAE;AACtD,SAAO;AACT;AAEA,SAAS,YAAY,OAAqC;AACxD,SAAO,UAAU,OAAO,OAAO,OAAO,MAAM,QAAQ,CAAC,CAAC;AACxD;AAEA,SAAS,aAAa,MAAqB,MAAoC;AAC7E,MAAI,SAAS,QAAQ,SAAS,QAAQ,SAAS,EAAG,QAAO;AACzD,UAAS,OAAO,QAAQ,KAAK,IAAI,IAAI,IAAK;AAC5C;AAEA,SAAS,YAAY,KAAsD;AACzE,MAAI,CAAC,IAAK,QAAO;AACjB,QAAM,YAAY,SAAS,IAAI,IAAI,MAAM;AACzC,SACE,IAAI,QAAQ,WAAW,QACvB,IAAI,IAAI,WAAW,QACnB,WAAW,WAAW;AAE1B;AAEA,SAAS,UAAU,SAAiB,UAAkC;AACpE,QAAM,OAAO,SAAS,QAAQ;AAC9B,MAAI,KAAM,QAAO,+BAA+B,IAAI,KAAK;AACzD,QAAM,aAAa,QAAQ,YAAY;AACvC,MAAI,WAAW,SAAS,KAAK,EAAG,QAAO;AACvC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,SAAO;AACT;AAEA,SAAS,aAAa,KAAmD;AACvE,QAAM,wBAAwB,SAAS,IAAI,IAAI;AAC/C,SAAO;AAAA,IACL,SAAS,IAAI;AAAA,IACb,WAAW,IAAI;AAAA,IACf,OAAO,YAAY,IAAI,KAAK;AAAA,IAC5B,SAAS,IAAI;AAAA,IACb,MACE,IAAI,YAAY,eAAe,CAAC,wBAC5B,OACA,UAAU,IAAI,SAAS,IAAI,IAAI;AAAA,IACrC,UAAU,SAAS,IAAI,QAAQ;AAAA,IAC/B,gBAAgB,SAAS,IAAI,cAAc;AAAA,IAC3C,YAAY,SAAS,IAAI,UAAU;AAAA,IACnC,IAAI,IAAI,MAAM;AAAA,IACd,SAAS,IAAI,WAAW,CAAC;AAAA,IACzB,KAAK,IAAI,OAAO,CAAC;AAAA,EACnB;AACF;AAEA,SAAS,uBACP,MACA,UACe;AACf,MAAI,SAAU,QAAO;AACrB,SACE,KAAK,KAAK,CAAC,QAAQ,IAAI,YAAY,WAAW,GAAG,WACjD,KAAK,KAA
K,CAAC,QAAQ,IAAI,aAAa,UAAU,GAAG,WACjD;AAEJ;AAEA,SAAS,SACP,MACA,WACA,SACA,MAC4B;AAC5B,MAAI,YAAY,aAAa;AAC3B,WACE,KAAK;AAAA,MACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,eAChB,IAAI,SAAS;AAAA,IACjB,KACA,KAAK;AAAA,MACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,eAChB,IAAI,SAAS;AAAA,IACjB,KACA;AAAA,EAEJ;AACA,SACE,KAAK;AAAA,IACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,WAChB,IAAI,SAAS;AAAA,EACjB,KAAK;AAET;AAEA,SAAS,iBACP,MACA,kBAC6B;AAC7B,QAAM,QAAQ,MAAM;AAAA,IAClB,IAAI;AAAA,MACF,KACG,IAAI,CAAC,QAAQ,IAAI,IAAI,EACrB,OAAO,CAAC,SAAyB,SAAS,IAAI;AAAA,IACnD;AAAA,EACF,EAAE,KAAK,yBAAyB;AAChC,QAAM,aAAa,MAAM;AAAA,IACvB,IAAI,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,SAAS,CAAC;AAAA,EAC1C,EAAE,KAAK;AACP,QAAM,cAA2C,CAAC;AAClD,aAAW,QAAQ,OAAO;AACxB,eAAW,aAAa,YAAY;AAClC,YAAM,OAAO,SAAS,MAAM,WAAW,QAAQ,IAAI;AACnD,YAAM,UAAU,SAAS,MAAM,WAAW,WAAW,IAAI;AACzD,YAAM,YAAY,SAAS,MAAM,WAAW,aAAa,IAAI;AAC7D,UAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,UAAW;AACrC,YAAM,SACJ,YAAY,IAAI,KAAK,YAAY,OAAO,KAAK,YAAY,SAAS;AACpE,kBAAY,KAAK;AAAA,QACf;AAAA,QACA;AAAA,QACA,aAAa,MAAM,WAAW;AAAA,QAC9B,gBAAgB,SAAS,WAAW;AAAA,QACpC,kBAAkB,WAAW,WAAW;AAAA,QACxC,WAAW,MAAM,SAAS;AAAA,QAC1B,cAAc,SAAS,SAAS;AAAA,QAChC,gBAAgB,WAAW,SAAS;AAAA,QACpC,qBAAqB;AAAA,UACnB,QAAQ,UAAU,QAAQ,QAAQ,KAAK,QAAQ;AAAA,QACjD;AAAA,QACA,oBAAoB;AAAA,UAClB,aAAa,MAAM,SAAS,MAAM,SAAS,SAAS,IAAI;AAAA,QAC1D;AAAA,QACA,4BAA4B;AAAA,UAC1B,WAAW,YAAY,QAAQ,QAAQ,UAAU,QAAQ;AAAA,QAC3D;AAAA,QACA,2BAA2B;AAAA,UACzB,aAAa,WAAW,SAAS,MAAM,SAAS,SAAS,IAAI;AAAA,QAC/D;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,cAAc,OAAuB;AAC5C,SAAO,MAAM,QAAQ,SAAS,GAAG;AACnC;AAEA,SAAS,+BACP,SACA,QAC2B;AAC3B,QAAM,eAAe,SAAS,QAAQ,MAAM,KAAK,CAAC;AAClD,QAAM,kBAAkB,aAAa;AACrC,QAAM,UAAU,OAAO,WAAW,SAAS,aAAa,OAAO,KAAK;AACpE,QAAM,UACJ,OAAO,YACN,oBAAoB,eACrB,oBAAoB,UACpB,oBAAoB,YAChB,kBACA;AACN,MAAI,CAAC,WAAW,CAAC,SAAS;AACxB,UAAM,IAAI;AAAA,MACR,6BAA6B,OAAO,IAAI;AAAA,IAC1C;AAAA,EACF;AACA,QAAM,UAAU,SAAS,QAAQ,OAAO,KAAK,CAAC;AAC9C,QAAM,SAAS,QAAQ,WAAW,QAAQ,aAAa,WAAW;AAClE,QAAM,WACJ,OAAO,aAAa,QACpB,aAAa,aAAa,QAC1B,QAAQ,aAAa;AACvB,QAAM,QAAQ
,SAAS,QAAQ,QAAQ,MAAM,SAAS,IAAI;AAC1D,MAAI,UAAU,MAAM;AAClB,UAAM,IAAI;AAAA,MACR,6BAA6B,OAAO,IAAI;AAAA,IAC1C;AAAA,EACF;AACA,QAAM,cAAc,MAAM,QAAQ,QAAQ,OAAO,IAC7C,QAAQ,QACL,IAAI,QAAQ,EACZ,OAAO,CAAC,WAA8C,WAAW,IAAI,EACrE,MAAM,GAAG,CAAC,EACV,IAAI,CAAC,YAAY;AAAA,IAChB,QAAQ,SAAS,OAAO,MAAM;AAAA,IAC9B,QACE,SAAS,OAAO,MAAM,KACtB,SAAS,OAAO,KAAK,KACrB,SAAS,OAAO,UAAU;AAAA,IAC5B,gBAAgB,SAAS,OAAO,cAAc;AAAA,IAC9C,cAAc,SAAS,OAAO,YAAY;AAAA,IAC1C,MAAM,OAAO,SAAS;AAAA,IACtB,UACE,SAAS,OAAO,QAAQ,KACxB,SAAS,OAAO,MAAM,KACtB,SAAS,OAAO,aAAa,KAC7B,SAAS,OAAO,aAAa;AAAA,IAC/B,WAAW,SAAS,OAAO,SAAS;AAAA,IACpC,gBAAgB,SAAS,OAAO,cAAc;AAAA,EAChD,EAAE,IACJ,CAAC;AACL,SAAO;AAAA,IACL;AAAA,MACE;AAAA,MACA;AAAA,MACA,WACE,OAAO,aACP,SAAS,aAAa,SAAS,KAC/B;AAAA,MACF;AAAA,MACA,MAAM,OAAO,QAAQ,SAAS,aAAa,IAAI,KAAK;AAAA,MACpD,UAAU,OAAO,YAAY,SAAS,aAAa,QAAQ,KAAK;AAAA,MAChE,gBACE,OAAO,kBACP,SAAS,aAAa,cAAc,KACpC;AAAA,MACF,YACE,OAAO,cAAc,SAAS,aAAa,UAAU,KAAK;AAAA,MAC5D,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,iBAAiB,QAAQ;AAAA,QACzB,mBAAmB,QAAQ;AAAA,QAC3B,OAAO,QAAQ;AAAA,QACf,QAAQ,QAAQ;AAAA,QAChB,QAAQ,QAAQ;AAAA,QAChB,SAAS,QAAQ;AAAA,QACjB,cAAc,QAAQ;AAAA,QACtB;AAAA,QACA;AAAA,MACF;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,QAChB,QAAQ,QAAQ;AAAA,QAChB;AAAA,QACA;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,+BACP,SACA,QAC2B;AAC3B,QAAM,SAAS,SAAS,QAAQ,MAAM,KAAK,CAAC;AAC5C,QAAM,UAAU,SAAS,QAAQ,OAAO,KAAK,CAAC;AAC9C,QAAM,YAAY,OAAO,aAAa;AACtC,QAAM,cACJ,OAAO,YAAY,SAAS,OAAO,UAAU,SAAS,OAAO,IAAI;AACnE,QAAM,iBACJ,OAAO,YAAY,YAAY,OAAO,UAAU,SAAS,OAAO,OAAO;AACzE,QAAM,OAAkC,CAAC;AACzC,QAAM,YAAY,SAAS,QAAQ,SAAS;AAC5C,MAAI,eAAe,cAAc,MAAM;AACrC,SAAK,KAAK;AAAA,MACR,SAAS;AAAA,MACT,SAAS;AAAA,MACT;AAAA,MACA,OAAO;AAAA,MACP,MAAM,OAAO;AAAA,MACb,UAAU,OAAO;AAAA,MACjB,gBAAgB,OAAO;AAAA,MACvB,YAAY,OAAO;AAAA,MACnB,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,MACvB;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF,CAAC;AAAA,EACH;AACA,QAAM,eAAe,SAAS,QAAQ,YAAY;AAClD,MAA
I,kBAAkB,iBAAiB,MAAM;AAC3C,SAAK,KAAK;AAAA,MACR,SAAS;AAAA,MACT,SAAS;AAAA,MACT;AAAA,MACA,OAAO;AAAA,MACP,MAAM,OAAO;AAAA,MACb,UAAU,OAAO;AAAA,MACjB,gBAAgB,OAAO;AAAA,MACvB,YAAY,OAAO;AAAA,MACnB,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,QACrB,qBAAqB,QAAQ;AAAA,QAC7B,oBAAoB,QAAQ;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF,CAAC;AAAA,EACH;AACA,SAAO;AACT;AAEA,SAAS,gCACP,SACA,QAC2B;AAC3B,QAAM,OAAO,MAAM,QAAQ,QAAQ,IAAI,IACnC,QAAQ,KACL,IAAI,QAAQ,EACZ,OAAO,CAAC,QAAwC,QAAQ,IAAI,IAC/D,CAAC;AACL,SAAO,KAAK,IAAI,CAAC,QAAQ;AACvB,UAAM,UAAU,SAAS,IAAI,OAAO;AACpC,UAAM,YAAY,SAAS,IAAI,SAAS;AACxC,UAAM,UAAU,IAAI;AACpB,UAAM,QAAQ,SAAS,IAAI,KAAK;AAChC,QACE,CAAC,WACD,CAAC,aACD,UAAU,QACT,YAAY,eAAe,YAAY,UAAU,YAAY,WAC9D;AACA,YAAM,IAAI;AAAA,QACR,6BAA6B,OAAO,IAAI;AAAA,MAC1C;AAAA,IACF;AACA,WAAO;AAAA,MACL;AAAA,MACA,WAAW,OAAO,aAAa;AAAA,MAC/B;AAAA,MACA;AAAA,MACA,MAAM,OAAO,QAAQ,SAAS,IAAI,IAAI,KAAK;AAAA,MAC3C,UAAU,OAAO,YAAY,SAAS,IAAI,QAAQ,KAAK;AAAA,MACvD,gBACE,OAAO,kBAAkB,SAAS,IAAI,cAAc,KAAK;AAAA,MAC3D,YAAY,OAAO,cAAc,SAAS,IAAI,UAAU,KAAK;AAAA,MAC7D,IAAI,IAAI;AAAA,MACR,SAAS,SAAS,IAAI,OAAO,KAAK,CAAC;AAAA,MACnC,KAAK;AAAA,QACH,GAAI,SAAS,IAAI,GAAG,KAAK,CAAC;AAAA,QAC1B,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AACH;AAEO,SAAS,4CACd,SACA,QAC2B;AAC3B,MAAI,QAAQ,WAAW,gCAAgC;AACrD,WAAO,+BAA+B,SAAS,MAAM;AAAA,EACvD;AACA,MAAI,QAAQ,WAAW,iCAAiC;AACtD,WAAO,+BAA+B,SAAS,MAAM;AAAA,EACvD;AACA,MAAI,QAAQ,WAAW,kCAAkC;AACvD,WAAO,gCAAgC,SAAS,MAAM;AAAA,EACxD;AACA,QAAM,IAAI,MAAM,4CAA4C,OAAO,IAAI,EAAE;AAC3E;AAEA,eAAsB,sCACpB,WACoC;AACpC,QAAM,OAAkC,CAAC;AACzC,aAAW,UAAU,WAAW;AAC9B,UAAM,UAAU,SAAS,KAAK,MAAM,MAAM,SAAS,OAAO,MAAM,OAAO,CAAC,CAAC;AACzE,QAAI,CAAC;AACH,YAAM,IAAI,MAAM,YAAY,OAAO,IAAI,wBAAwB;AACjE,SAAK,KAAK,GAAG,4CAA4C,SAAS,MAAM,CAAC;AAAA,EAC3E;AACA,SAAO;AACT;AAEO,SAAS,oCACd,OACyB;AACzB,QAAM,OAAO,MAAM,KAAK,IAAI,YAAY;AACxC,QAAM,mBAAmB,uBAAuB,MAAM,MAAM,gBAAgB;AAC5E,QAAM,QAAQ,MAAM;AAAA,IAClB,IAAI;AAAA,MACF,KAAK,IAAI,CAAC,QAAQ,IAAI
,IAAI,EAAE,OAAO,CAAC,SAAyB,CAAC,CAAC,IAAI;AAAA,IACrE;AAAA,EACF,EAAE,KAAK,yBAAyB;AAChC,QAAM,aAAa,MAAM;AAAA,IACvB,IAAI,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,SAAS,CAAC;AAAA,EAC1C,EAAE,KAAK;AACP,QAAM,cAAc,iBAAiB,MAAM,gBAAgB;AAC3D,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS;AAAA,IACT,aAAa,MAAM,gBAAe,oBAAI,KAAK,GAAE,YAAY;AAAA,IACzD,QAAQ,MAAM,UAAU,EAAE,MAAM,4BAA4B;AAAA,IAC5D;AAAA,IACA;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,MACN,MAAM,KAAK;AAAA,MACX,aAAa,YAAY;AAAA,MACzB,OAAO,MAAM;AAAA,MACb,YAAY,WAAW;AAAA,IACzB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEA,eAAsB,6BACpB,OACwC;AACxC,QAAM,WAAW,oCAAoC,KAAK;AAC1D,QAAM,YACJ,MAAM,aACN;AAAA,IACE,kBAAkB;AAAA,IAClB;AAAA,IACA,cAAc,SAAS,WAAW;AAAA,EACpC;AACF,QAAM,MAAM,WAAW,EAAE,WAAW,KAAK,CAAC;AAC1C,QAAM,eAAe,KAAK,WAAW,uBAAuB;AAC5D,QAAM;AAAA,IACJ;AAAA,IACA,GAAG,KAAK,UAAU,UAAU,MAAM,CAAC,CAAC;AAAA;AAAA,IACpC;AAAA,EACF;AACA,SAAO,EAAE,WAAW,cAAc,SAAS;AAC7C;AAEA,eAAsB,0CACpB,OACwC;AACxC,QAAM,OAAO,MAAM,sCAAsC,MAAM,SAAS;AACxE,SAAO,6BAA6B;AAAA,IAClC;AAAA,IACA,WAAW,MAAM;AAAA,IACjB,aAAa,MAAM;AAAA,IACnB,kBAAkB,MAAM;AAAA,IACxB,QAAQ,MAAM,UAAU;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,MAAM,UAAU,IAAI,CAAC,aAAa,SAAS,IAAI;AAAA,IAC5D;AAAA,EACF,CAAC;AACH;","names":[]}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
 * Benchmark selector for a benchmark-vs-Cerebras run: one of three named
 * benchmark ids, or "all".
 * NOTE(review): "all" presumably selects every benchmark — confirm against the
 * runner implementation.
 */
export type BenchmarkVsCerebrasBenchmark = "eliza_harness_action_selection" | "clawbench" | "hermes" | "all";
|
|
2
|
+
/**
 * Options accepted by `runBenchmarkVsCerebras` and
 * `buildBenchmarkVsCerebrasArgs`. Every field is optional.
 * NOTE(review): defaults and per-field semantics are not visible in this
 * declaration file — confirm against the runner implementation before relying
 * on them.
 */
export interface BenchmarkVsCerebrasRunOptions {
|
|
3
|
+
trainingRoot?: string;
|
|
4
|
+
python?: string;
|
|
5
|
+
tiers?: string;
|
|
6
|
+
benchmark?: BenchmarkVsCerebrasBenchmark;
|
|
7
|
+
variants?: "trained" | "base" | "both";
|
|
8
|
+
cerebrasModel?: string;
|
|
9
|
+
maxSamples?: number;
|
|
10
|
+
outputDir?: string;
|
|
11
|
+
checkpointsDir?: string;
|
|
12
|
+
trainedModelPath?: string;
|
|
13
|
+
dryRun?: boolean;
|
|
14
|
+
resultsDb?: string;
|
|
15
|
+
datasetVersion?: string;
|
|
16
|
+
codeCommit?: string;
|
|
17
|
+
matrixOutputDir?: string;
|
|
18
|
+
}
|
|
19
|
+
/**
 * Value resolved by the promise returned from `runBenchmarkVsCerebras`.
 * `matrixOutputDir`, `matrixArtifactPath` and `resultsDb` may be null; all
 * other fields are always present.
 * NOTE(review): `command`, `stdout`, `stderr` and `exitCode` look like the
 * record of a spawned process — confirm against the runner implementation.
 */
export interface BenchmarkVsCerebrasRunResult {
|
|
20
|
+
trainingRoot: string;
|
|
21
|
+
outputDir: string;
|
|
22
|
+
matrixOutputDir: string | null;
|
|
23
|
+
matrixArtifactPath: string | null;
|
|
24
|
+
resultsDb: string | null;
|
|
25
|
+
command: string[];
|
|
26
|
+
stdout: string;
|
|
27
|
+
stderr: string;
|
|
28
|
+
exitCode: number;
|
|
29
|
+
}
|
|
30
|
+
/**
 * Produces a tier-list string from an optional string value.
 * NOTE(review): the normalization applied and the result for `undefined` are
 * not visible in this declaration — confirm against the implementation.
 */
export declare function benchmarkVsCerebrasTierList(value: string | undefined): string;
|
|
31
|
+
/**
 * Builds the argument vector for `scripts/benchmark_vs_cerebras.py`: the script
 * path under `resolved.trainingRoot` comes first, followed by the CLI flags
 * derived from `options` and the `resolved` directories. The interpreter itself
 * is not part of the returned array.
 */
export declare function buildBenchmarkVsCerebrasArgs(options: BenchmarkVsCerebrasRunOptions, resolved: {
|
|
32
|
+
trainingRoot: string;
|
|
33
|
+
outputDir: string;
|
|
34
|
+
matrixOutputDir?: string;
|
|
35
|
+
}): string[];
|
|
36
|
+
/**
 * Creates the output directories, spawns the Python benchmark script with
 * `trainingRoot` as its working directory and resolves with the captured
 * output. Rejects when the script exits with a non-zero code.
 */
export declare function runBenchmarkVsCerebras(options: BenchmarkVsCerebrasRunOptions): Promise<BenchmarkVsCerebrasRunResult>;
|
|
37
|
+
//# sourceMappingURL=benchmark-vs-cerebras-runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark-vs-cerebras-runner.d.ts","sourceRoot":"","sources":["../../src/core/benchmark-vs-cerebras-runner.ts"],"names":[],"mappings":"AAMA,MAAM,MAAM,4BAA4B,GACpC,gCAAgC,GAChC,WAAW,GACX,QAAQ,GACR,KAAK,CAAC;AAEV,MAAM,WAAW,6BAA6B;IAC5C,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,4BAA4B,CAAC;IACzC,QAAQ,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,4BAA4B;IAC3C,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB;AAkDD,wBAAgB,2BAA2B,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,CAI7E;AA6BD,wBAAgB,4BAA4B,CAC1C,OAAO,EAAE,6BAA6B,EACtC,QAAQ,EAAE;IACR,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B,GACA,MAAM,EAAE,CAsCV;AAED,wBAAsB,sBAAsB,CAC1C,OAAO,EAAE,6BAA6B,GACrC,OAAO,CAAC,4BAA4B,CAAC,CAsCvC"}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { spawn } from "node:child_process";
|
|
2
|
+
import { mkdir } from "node:fs/promises";
|
|
3
|
+
import { join, resolve } from "node:path";
|
|
4
|
+
import { ELIZA_ONE_BENCHMARK_TIER_LIST } from "./eliza1-benchmark-recipe.js";
|
|
5
|
+
import { trainingStateRoot } from "./training-config.js";
|
|
6
|
+
/**
 * Rewrites a timestamp so it can be used as a directory name: every ":" and
 * "." is swapped for "-".
 *
 * @param {string} value - Timestamp text, e.g. an ISO-8601 string.
 * @returns {string} The same text with ":" and "." replaced by "-".
 */
function safeTimestamp(value) {
  const pieces = value.split(/[:.]/);
  return pieces.join("-");
}
|
|
9
|
+
/**
 * Accepted tier aliases (lower-case, "-" separated) mapped to the canonical
 * training tier key that is handed to the benchmark script.
 */
const TRAINING_TIER_KEYS = {
  "2b": "gemma4-e2b",
  "4b": "gemma4-e4b",
  "9b": "gemma4-12b",
  "27b": "gemma4-31b",
  "eliza-1-2b": "gemma4-e2b",
  "eliza-1-4b": "gemma4-e4b",
  "eliza-1-9b": "gemma4-12b",
  "eliza-1-27b": "gemma4-31b",
  "gemma4-e2b": "gemma4-e2b",
  "gemma4-e4b": "gemma4-e4b",
  "gemma4-12b": "gemma4-12b",
  "gemma4-31b": "gemma4-31b",
  "gemma-4-e2b": "gemma4-e2b",
  "gemma-4-e4b": "gemma4-e4b",
  "gemma-4-12b": "gemma4-12b",
  "gemma-4-31b": "gemma4-31b",
  "google/gemma-4-e2b": "gemma4-e2b",
  "google/gemma-4-e4b": "gemma4-e4b",
  "google/gemma-4-12b": "gemma4-12b",
  "google/gemma-4-31b": "gemma4-31b",
  "google-gemma-4-e2b": "gemma4-e2b",
  "google-gemma-4-e4b": "gemma4-e4b",
  "google-gemma-4-12b": "gemma4-12b",
  "google-gemma-4-31b": "gemma4-31b"
};

/** Matches a standalone "qwen" token, optionally followed by a version such as "3" or "2.5". */
const RETIRED_QWEN_TIER_ALIAS_RE = /\bqwen(?:\d+(?:\.\d+)?)?\b/i;

/**
 * Maps one user-supplied tier name to its canonical training tier key.
 *
 * The name is trimmed, lower-cased and has "_" replaced by "-" before the
 * alias table is consulted; a second lookup is made with "/" replaced by "-".
 * Names that match no alias are returned trimmed but otherwise untouched.
 *
 * @param {string} value - Raw tier name.
 * @returns {string} Canonical tier key, or the trimmed input when unknown.
 * @throws {Error} When the name contains a retired Qwen alias.
 */
function normalizeTrainingTierKey(value) {
  const trimmed = value.trim();
  const key = trimmed.toLowerCase().replace(/_/g, "-");
  if (RETIRED_QWEN_TIER_ALIAS_RE.test(key)) {
    throw new Error(
      `Qwen tier aliases are retired; use an active Gemma 4 tier key instead (${ELIZA_ONE_BENCHMARK_TIER_LIST}).`
    );
  }
  // Own-property check: a bare index would also find inherited members, so a
  // tier literally named "constructor" would come back as a function.
  const isAlias = (alias) =>
    Object.prototype.hasOwnProperty.call(TRAINING_TIER_KEYS, alias);
  if (isAlias(key)) return TRAINING_TIER_KEYS[key];
  const dashed = key.replace(/\//g, "-");
  if (isAlias(dashed)) return TRAINING_TIER_KEYS[dashed];
  return trimmed;
}
|
|
46
|
+
/**
 * Turns a comma-separated tier list into the value passed to `--tiers`.
 *
 * - undefined / blank input selects the default tier list;
 * - "all" (any casing) is passed through as "all";
 * - every other entry is normalized to its canonical tier key, empty entries
 *   are dropped and repeated tiers are collapsed (first occurrence wins), so
 *   two aliases of the same tier do not schedule it twice;
 * - input that contains only separators is treated like blank input instead
 *   of producing an empty tier argument.
 *
 * @param {string | undefined} value - Raw tier list.
 * @returns {string} "all" or canonical tier keys joined by ",".
 * @throws {Error} When an entry is a retired Qwen alias.
 */
function benchmarkVsCerebrasTierList(value) {
  const requested = value?.trim();
  const raw = requested || ELIZA_ONE_BENCHMARK_TIER_LIST;
  if (raw.toLowerCase() === "all") return "all";
  const tiers = [
    ...new Set(raw.split(",").map(normalizeTrainingTierKey).filter(Boolean))
  ];
  // Only recurse when the caller supplied something; the default list is
  // returned as-is even if it normalizes to nothing, which rules out a loop.
  if (tiers.length === 0 && requested) {
    return benchmarkVsCerebrasTierList(undefined);
  }
  return tiers.join(",");
}
|
|
51
|
+
/**
 * Runs a child process to completion and gathers everything it printed.
 *
 * stdin is ignored; stdout and stderr are decoded as UTF-8 and accumulated in
 * memory. The promise settles once the child's streams have closed.
 *
 * @param {string} command - Executable to launch.
 * @param {string[]} args - Arguments for the executable.
 * @param {string} cwd - Working directory of the child.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Captured output; `exitCode` is 1 when the child reported no exit code.
 *   Rejects with the spawn error when the process cannot be started.
 */
function collectProcess(command, args, cwd) {
  return new Promise((resolvePromise, reject) => {
    const spawnOptions = { cwd, stdio: ["ignore", "pipe", "pipe"] };
    const child = spawn(command, args, spawnOptions);
    const captured = { stdout: [], stderr: [] };
    for (const streamName of ["stdout", "stderr"]) {
      const stream = child[streamName];
      stream.setEncoding("utf-8");
      stream.on("data", (chunk) => {
        captured[streamName].push(chunk);
      });
    }
    child.on("error", reject);
    child.on("close", (code) => {
      const exitCode = code === null || code === undefined ? 1 : code;
      resolvePromise({
        stdout: captured.stdout.join(""),
        stderr: captured.stderr.join(""),
        exitCode
      });
    });
  });
}
|
|
73
|
+
/**
 * Builds the argument vector for `scripts/benchmark_vs_cerebras.py`.
 *
 * The first element is the script path below `resolved.trainingRoot`; the
 * interpreter is not included. Mandatory flags are always emitted with their
 * defaults, optional flags only when the matching option is set.
 *
 * @param {object} options - Run options (see BenchmarkVsCerebrasRunOptions).
 * @param {{trainingRoot: string, outputDir: string, matrixOutputDir?: string}} resolved
 *   Directories already decided by the caller.
 * @returns {string[]} Script path followed by its CLI flags.
 * @throws {Error} When `options.tiers` contains a retired Qwen alias.
 */
function buildBenchmarkVsCerebrasArgs(options, resolved) {
  const scriptPath = join(
    resolved.trainingRoot,
    "scripts",
    "benchmark_vs_cerebras.py"
  );
  // NaN and +/-Infinity are of type "number" too; they must not reach the
  // command line as the literal text "NaN" / "Infinity", so they fall back to
  // the default sample count just like a missing value does.
  const requestedSamples = options.maxSamples;
  const maxSamples =
    typeof requestedSamples === "number" && Number.isFinite(requestedSamples)
      ? Math.max(1, Math.floor(requestedSamples))
      : 50;
  const args = [
    scriptPath,
    "--tiers",
    benchmarkVsCerebrasTierList(options.tiers),
    "--benchmark",
    options.benchmark ?? "eliza_harness_action_selection",
    "--variants",
    options.variants ?? "trained",
    "--cerebras-model",
    options.cerebrasModel ?? "gpt-oss-120b",
    "--max-samples",
    String(maxSamples),
    "--output-dir",
    resolved.outputDir
  ];
  if (options.checkpointsDir) {
    args.push("--checkpoints-dir", options.checkpointsDir);
  }
  if (options.trainedModelPath) {
    args.push("--trained-model-path", options.trainedModelPath);
  }
  if (options.dryRun) {
    args.push("--dry-run");
  }
  if (options.resultsDb) {
    args.push("--results-db", options.resultsDb);
  }
  if (options.datasetVersion) {
    args.push("--dataset-version", options.datasetVersion);
  }
  if (options.codeCommit) {
    args.push("--code-commit", options.codeCommit);
  }
  if (resolved.matrixOutputDir) {
    args.push("--matrix-output-dir", resolved.matrixOutputDir);
  }
  return args;
}
|
|
110
|
+
/**
 * Runs `scripts/benchmark_vs_cerebras.py` and reports what it produced.
 *
 * Output directories default to timestamped folders below the training state
 * root. They are created before the script starts and are always converted to
 * absolute paths: the child runs with `trainingRoot` as its working directory,
 * so a relative path would otherwise be created relative to this process but
 * interpreted relative to `trainingRoot` by the script.
 *
 * @param {object} options - Run options (see BenchmarkVsCerebrasRunOptions).
 * @returns {Promise<object>} Directories used, the spawned command line and
 *   the captured stdout / stderr / exit code of the script.
 * @throws {Error} When the script exits with a non-zero code, or when the
 *   directories cannot be created or the interpreter cannot be spawned.
 */
async function runBenchmarkVsCerebras(options) {
  const trainingRoot = resolve(
    options.trainingRoot ?? join(process.cwd(), "packages", "training")
  );
  const stamp = safeTimestamp(new Date().toISOString());
  const outputDir = resolve(
    options.outputDir ?? join(trainingStateRoot(), "benchmarks", "runs", stamp)
  );
  const matrixOutputDir = resolve(
    options.matrixOutputDir ??
      join(trainingStateRoot(), "benchmarks", "matrices", stamp)
  );
  await mkdir(outputDir, { recursive: true });
  await mkdir(matrixOutputDir, { recursive: true });
  const args = buildBenchmarkVsCerebrasArgs(options, {
    trainingRoot,
    outputDir,
    matrixOutputDir
  });
  const python = options.python ?? "python3";
  const proc = await collectProcess(python, args, trainingRoot);
  if (proc.exitCode !== 0) {
    // Prefer stderr for the diagnostic; fall back to stdout when it is empty.
    throw new Error(
      `benchmark_vs_cerebras.py exited with code ${proc.exitCode}: ${proc.stderr || proc.stdout}`
    );
  }
  return {
    trainingRoot,
    outputDir,
    matrixOutputDir,
    matrixArtifactPath: join(matrixOutputDir, "benchmark-matrix.json"),
    resultsDb: options.resultsDb ?? null,
    command: [python, ...args],
    stdout: proc.stdout,
    stderr: proc.stderr,
    exitCode: proc.exitCode
  };
}
|
|
146
|
+
export {
|
|
147
|
+
benchmarkVsCerebrasTierList,
|
|
148
|
+
buildBenchmarkVsCerebrasArgs,
|
|
149
|
+
runBenchmarkVsCerebras
|
|
150
|
+
};
|
|
151
|
+
//# sourceMappingURL=benchmark-vs-cerebras-runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/core/benchmark-vs-cerebras-runner.ts"],"sourcesContent":["import { spawn } from \"node:child_process\";\nimport { mkdir } from \"node:fs/promises\";\nimport { join, resolve } from \"node:path\";\nimport { ELIZA_ONE_BENCHMARK_TIER_LIST } from \"./eliza1-benchmark-recipe.js\";\nimport { trainingStateRoot } from \"./training-config.js\";\n\nexport type BenchmarkVsCerebrasBenchmark =\n | \"eliza_harness_action_selection\"\n | \"clawbench\"\n | \"hermes\"\n | \"all\";\n\nexport interface BenchmarkVsCerebrasRunOptions {\n trainingRoot?: string;\n python?: string;\n tiers?: string;\n benchmark?: BenchmarkVsCerebrasBenchmark;\n variants?: \"trained\" | \"base\" | \"both\";\n cerebrasModel?: string;\n maxSamples?: number;\n outputDir?: string;\n checkpointsDir?: string;\n trainedModelPath?: string;\n dryRun?: boolean;\n resultsDb?: string;\n datasetVersion?: string;\n codeCommit?: string;\n matrixOutputDir?: string;\n}\n\nexport interface BenchmarkVsCerebrasRunResult {\n trainingRoot: string;\n outputDir: string;\n matrixOutputDir: string | null;\n matrixArtifactPath: string | null;\n resultsDb: string | null;\n command: string[];\n stdout: string;\n stderr: string;\n exitCode: number;\n}\n\nfunction safeTimestamp(value: string): string {\n return value.replace(/[:.]/g, \"-\");\n}\n\nconst TRAINING_TIER_KEYS: Record<string, string> = {\n \"2b\": \"gemma4-e2b\",\n \"4b\": \"gemma4-e4b\",\n \"9b\": \"gemma4-12b\",\n \"27b\": \"gemma4-31b\",\n \"eliza-1-2b\": \"gemma4-e2b\",\n \"eliza-1-4b\": \"gemma4-e4b\",\n \"eliza-1-9b\": \"gemma4-12b\",\n \"eliza-1-27b\": \"gemma4-31b\",\n \"gemma4-e2b\": \"gemma4-e2b\",\n \"gemma4-e4b\": \"gemma4-e4b\",\n \"gemma4-12b\": \"gemma4-12b\",\n \"gemma4-31b\": \"gemma4-31b\",\n \"gemma-4-e2b\": \"gemma4-e2b\",\n \"gemma-4-e4b\": \"gemma4-e4b\",\n \"gemma-4-12b\": \"gemma4-12b\",\n \"gemma-4-31b\": \"gemma4-31b\",\n \"google/gemma-4-e2b\": \"gemma4-e2b\",\n \"google/gemma-4-e4b\": \"gemma4-e4b\",\n 
\"google/gemma-4-12b\": \"gemma4-12b\",\n \"google/gemma-4-31b\": \"gemma4-31b\",\n \"google-gemma-4-e2b\": \"gemma4-e2b\",\n \"google-gemma-4-e4b\": \"gemma4-e4b\",\n \"google-gemma-4-12b\": \"gemma4-12b\",\n \"google-gemma-4-31b\": \"gemma4-31b\",\n};\n\nconst RETIRED_QWEN_TIER_ALIAS_RE = /\\bqwen(?:\\d+(?:\\.\\d+)?)?\\b/i;\n\nfunction normalizeTrainingTierKey(value: string): string {\n const trimmed = value.trim();\n const key = trimmed.toLowerCase().replace(/_/g, \"-\");\n if (RETIRED_QWEN_TIER_ALIAS_RE.test(key)) {\n throw new Error(\n `Qwen tier aliases are retired; use an active Gemma 4 tier key instead (${ELIZA_ONE_BENCHMARK_TIER_LIST}).`,\n );\n }\n return (\n TRAINING_TIER_KEYS[key] ??\n TRAINING_TIER_KEYS[key.replace(/\\//g, \"-\")] ??\n trimmed\n );\n}\n\nexport function benchmarkVsCerebrasTierList(value: string | undefined): string {\n const raw = value?.trim() || ELIZA_ONE_BENCHMARK_TIER_LIST;\n if (raw.toLowerCase() === \"all\") return \"all\";\n return raw.split(\",\").map(normalizeTrainingTierKey).filter(Boolean).join(\",\");\n}\n\nfunction collectProcess(\n command: string,\n args: string[],\n cwd: string,\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolvePromise, reject) => {\n const child = spawn(command, args, {\n cwd,\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n let stdout = \"\";\n let stderr = \"\";\n child.stdout.setEncoding(\"utf-8\");\n child.stderr.setEncoding(\"utf-8\");\n child.stdout.on(\"data\", (chunk) => {\n stdout += chunk;\n });\n child.stderr.on(\"data\", (chunk) => {\n stderr += chunk;\n });\n child.on(\"error\", reject);\n child.on(\"close\", (code) => {\n resolvePromise({ stdout, stderr, exitCode: code ?? 
1 });\n });\n });\n}\n\nexport function buildBenchmarkVsCerebrasArgs(\n options: BenchmarkVsCerebrasRunOptions,\n resolved: {\n trainingRoot: string;\n outputDir: string;\n matrixOutputDir?: string;\n },\n): string[] {\n const scriptPath = join(\n resolved.trainingRoot,\n \"scripts\",\n \"benchmark_vs_cerebras.py\",\n );\n const args = [\n scriptPath,\n \"--tiers\",\n benchmarkVsCerebrasTierList(options.tiers),\n \"--benchmark\",\n options.benchmark ?? \"eliza_harness_action_selection\",\n \"--variants\",\n options.variants ?? \"trained\",\n \"--cerebras-model\",\n options.cerebrasModel ?? \"gpt-oss-120b\",\n \"--max-samples\",\n String(\n typeof options.maxSamples === \"number\"\n ? Math.max(1, Math.floor(options.maxSamples))\n : 50,\n ),\n \"--output-dir\",\n resolved.outputDir,\n ];\n if (options.checkpointsDir)\n args.push(\"--checkpoints-dir\", options.checkpointsDir);\n if (options.trainedModelPath)\n args.push(\"--trained-model-path\", options.trainedModelPath);\n if (options.dryRun) args.push(\"--dry-run\");\n if (options.resultsDb) args.push(\"--results-db\", options.resultsDb);\n if (options.datasetVersion)\n args.push(\"--dataset-version\", options.datasetVersion);\n if (options.codeCommit) args.push(\"--code-commit\", options.codeCommit);\n if (resolved.matrixOutputDir) {\n args.push(\"--matrix-output-dir\", resolved.matrixOutputDir);\n }\n return args;\n}\n\nexport async function runBenchmarkVsCerebras(\n options: BenchmarkVsCerebrasRunOptions,\n): Promise<BenchmarkVsCerebrasRunResult> {\n const trainingRoot = resolve(\n options.trainingRoot ?? join(process.cwd(), \"packages\", \"training\"),\n );\n const stamp = safeTimestamp(new Date().toISOString());\n const outputDir =\n options.outputDir ?? 
join(trainingStateRoot(), \"benchmarks\", \"runs\", stamp);\n const matrixOutputDir =\n options.matrixOutputDir ??\n join(trainingStateRoot(), \"benchmarks\", \"matrices\", stamp);\n await mkdir(outputDir, { recursive: true });\n await mkdir(matrixOutputDir, { recursive: true });\n const args = buildBenchmarkVsCerebrasArgs(options, {\n trainingRoot,\n outputDir,\n matrixOutputDir,\n });\n const proc = await collectProcess(\n options.python ?? \"python3\",\n args,\n trainingRoot,\n );\n if (proc.exitCode !== 0) {\n throw new Error(\n `benchmark_vs_cerebras.py exited with code ${proc.exitCode}: ${proc.stderr || proc.stdout}`,\n );\n }\n return {\n trainingRoot,\n outputDir,\n matrixOutputDir,\n matrixArtifactPath: join(matrixOutputDir, \"benchmark-matrix.json\"),\n resultsDb: options.resultsDb ?? null,\n command: [options.python ?? \"python3\", ...args],\n stdout: proc.stdout,\n stderr: proc.stderr,\n exitCode: proc.exitCode,\n };\n}\n"],"mappings":"AAAA,SAAS,aAAa;AACtB,SAAS,aAAa;AACtB,SAAS,MAAM,eAAe;AAC9B,SAAS,qCAAqC;AAC9C,SAAS,yBAAyB;AAsClC,SAAS,cAAc,OAAuB;AAC5C,SAAO,MAAM,QAAQ,SAAS,GAAG;AACnC;AAEA,MAAM,qBAA6C;AAAA,EACjD,MAAM;AAAA,EACN,MAAM;AAAA,EACN,MAAM;AAAA,EACN,OAAO;AAAA,EACP,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,eAAe;AAAA,EACf,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,eAAe;AAAA,EACf,eAAe;AAAA,EACf,eAAe;AAAA,EACf,eAAe;AAAA,EACf,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AACxB;AAEA,MAAM,6BAA6B;AAEnC,SAAS,yBAAyB,OAAuB;AACvD,QAAM,UAAU,MAAM,KAAK;AAC3B,QAAM,MAAM,QAAQ,YAAY,EAAE,QAAQ,MAAM,GAAG;AACnD,MAAI,2BAA2B,KAAK,GAAG,GAAG;AACxC,UAAM,IAAI;AAAA,MACR,0EAA0E,6BAA6B;AAAA,IACzG;AAAA,EACF;AACA,SACE,mBAAmB,GAAG,KACtB,mBAAmB,IAAI,QAAQ,OAAO,GAAG,CAAC,KAC1C;AAEJ;AAEO,SAAS,4BAA4B,OAAmC;AAC7E,QAAM,MAAM,OAAO,KAAK,KAAK;AAC7B,MAAI,IAAI,YAAY,MAAM,MAAO,QAAO;AACxC,SAAO,IAAI,MAAM,GAAG,EAAE,IAAI,wBAAwB,EAAE,OAAO,OAAO,EAAE,KAAK,GAAG;AAC9E;AAEA,SAAS,eACP,SACA,MACA,KAC+D;AAC/D
,SAAO,IAAI,QAAQ,CAAC,gBAAgB,WAAW;AAC7C,UAAM,QAAQ,MAAM,SAAS,MAAM;AAAA,MACjC;AAAA,MACA,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,IAClC,CAAC;AACD,QAAI,SAAS;AACb,QAAI,SAAS;AACb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,UAAU;AACjC,gBAAU;AAAA,IACZ,CAAC;AACD,UAAM,OAAO,GAAG,QAAQ,CAAC,UAAU;AACjC,gBAAU;AAAA,IACZ,CAAC;AACD,UAAM,GAAG,SAAS,MAAM;AACxB,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,qBAAe,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,IACxD,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,6BACd,SACA,UAKU;AACV,QAAM,aAAa;AAAA,IACjB,SAAS;AAAA,IACT;AAAA,IACA;AAAA,EACF;AACA,QAAM,OAAO;AAAA,IACX;AAAA,IACA;AAAA,IACA,4BAA4B,QAAQ,KAAK;AAAA,IACzC;AAAA,IACA,QAAQ,aAAa;AAAA,IACrB;AAAA,IACA,QAAQ,YAAY;AAAA,IACpB;AAAA,IACA,QAAQ,iBAAiB;AAAA,IACzB;AAAA,IACA;AAAA,MACE,OAAO,QAAQ,eAAe,WAC1B,KAAK,IAAI,GAAG,KAAK,MAAM,QAAQ,UAAU,CAAC,IAC1C;AAAA,IACN;AAAA,IACA;AAAA,IACA,SAAS;AAAA,EACX;AACA,MAAI,QAAQ;AACV,SAAK,KAAK,qBAAqB,QAAQ,cAAc;AACvD,MAAI,QAAQ;AACV,SAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAC5D,MAAI,QAAQ,OAAQ,MAAK,KAAK,WAAW;AACzC,MAAI,QAAQ,UAAW,MAAK,KAAK,gBAAgB,QAAQ,SAAS;AAClE,MAAI,QAAQ;AACV,SAAK,KAAK,qBAAqB,QAAQ,cAAc;AACvD,MAAI,QAAQ,WAAY,MAAK,KAAK,iBAAiB,QAAQ,UAAU;AACrE,MAAI,SAAS,iBAAiB;AAC5B,SAAK,KAAK,uBAAuB,SAAS,eAAe;AAAA,EAC3D;AACA,SAAO;AACT;AAEA,eAAsB,uBACpB,SACuC;AACvC,QAAM,eAAe;AAAA,IACnB,QAAQ,gBAAgB,KAAK,QAAQ,IAAI,GAAG,YAAY,UAAU;AAAA,EACpE;AACA,QAAM,QAAQ,eAAc,oBAAI,KAAK,GAAE,YAAY,CAAC;AACpD,QAAM,YACJ,QAAQ,aAAa,KAAK,kBAAkB,GAAG,cAAc,QAAQ,KAAK;AAC5E,QAAM,kBACJ,QAAQ,mBACR,KAAK,kBAAkB,GAAG,cAAc,YAAY,KAAK;AAC3D,QAAM,MAAM,WAAW,EAAE,WAAW,KAAK,CAAC;AAC1C,QAAM,MAAM,iBAAiB,EAAE,WAAW,KAAK,CAAC;AAChD,QAAM,OAAO,6BAA6B,SAAS;AAAA,IACjD;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACD,QAAM,OAAO,MAAM;AAAA,IACjB,QAAQ,UAAU;AAAA,IAClB;AAAA,IACA;AAAA,EACF;AACA,MAAI,KAAK,aAAa,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,6CAA6C,KAAK,QAAQ,KAAK,KAAK,UAAU,KAAK,MAAM;AAAA,IAC3F;AAAA,EACF;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,oBAAoB,KAAK,iBAAiB,uBAAuB;AAAA,IACjE,WAAW,QAAQ,aAAa;AAAA,IAChC,SAAS,CAAC,QAAQ,UAAU,WAAW,GAAG,IAAI;AAAA,IAC9C,QAAQ,KAAK;AA
AA,IACb,QAAQ,KAAK;AAAA,IACb,UAAU,KAAK;AAAA,EACjB;AACF;","names":[]}
|