npm - @elizaos/training - Versions diffs - 2.0.0-alpha.21 → 2.0.0-alpha.22 - Mend

@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

package/.turbo/turbo-lint.log +2 -0
package/.turbo/turbo-typecheck.log +1 -0
package/dist/.tsbuildinfo +1 -0
package/dist/adapter.js +59 -0
package/dist/archetypes/ArchetypeConfigService.js +510 -0
package/dist/archetypes/derive-archetype.js +196 -0
package/dist/archetypes/index.js +7 -0
package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
package/dist/benchmark/BenchmarkDataViewer.js +197 -0
package/dist/benchmark/BenchmarkHistoryService.js +135 -0
package/dist/benchmark/BenchmarkRunner.js +483 -0
package/dist/benchmark/BenchmarkValidator.js +158 -0
package/dist/benchmark/FastEvalRunner.js +133 -0
package/dist/benchmark/MetricsValidator.js +104 -0
package/dist/benchmark/MetricsVisualizer.js +775 -0
package/dist/benchmark/ModelBenchmarkService.js +433 -0
package/dist/benchmark/ModelRegistry.js +122 -0
package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
package/dist/benchmark/SimulationA2AInterface.js +683 -0
package/dist/benchmark/SimulationEngine.js +522 -0
package/dist/benchmark/TaskRunner.js +60 -0
package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
package/dist/benchmark/index.js +23 -0
package/dist/benchmark/parseSimulationMetrics.js +86 -0
package/dist/benchmark/simulation-types.js +1 -0
package/dist/dependencies.js +197 -0
package/dist/generation/TrajectoryGenerator.js +244 -0
package/dist/generation/index.js +6 -0
package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
package/dist/huggingface/index.js +9 -0
package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
package/dist/index.js +41 -0
package/dist/init-training.js +43 -0
package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
package/dist/metrics/index.js +7 -0
package/dist/metrics/types.js +21 -0
package/dist/rubrics/__tests__/index.test.js +150 -0
package/dist/rubrics/ass-kisser.js +83 -0
package/dist/rubrics/degen.js +78 -0
package/dist/rubrics/goody-twoshoes.js +82 -0
package/dist/rubrics/index.js +184 -0
package/dist/rubrics/information-trader.js +82 -0
package/dist/rubrics/infosec.js +99 -0
package/dist/rubrics/liar.js +102 -0
package/dist/rubrics/perps-trader.js +85 -0
package/dist/rubrics/researcher.js +79 -0
package/dist/rubrics/scammer.js +80 -0
package/dist/rubrics/social-butterfly.js +71 -0
package/dist/rubrics/super-predictor.js +95 -0
package/dist/rubrics/trader.js +65 -0
package/dist/scoring/ArchetypeScoringService.js +301 -0
package/dist/scoring/JudgePromptBuilder.js +401 -0
package/dist/scoring/LLMJudgeCache.js +263 -0
package/dist/scoring/index.js +8 -0
package/dist/training/AutomationPipeline.js +714 -0
package/dist/training/BenchmarkService.js +370 -0
package/dist/training/ConfigValidator.js +153 -0
package/dist/training/MarketOutcomesTracker.js +142 -0
package/dist/training/ModelDeployer.js +128 -0
package/dist/training/ModelFetcher.js +48 -0
package/dist/training/ModelSelectionService.js +248 -0
package/dist/training/ModelUsageVerifier.js +106 -0
package/dist/training/MultiModelOrchestrator.js +349 -0
package/dist/training/RLModelConfig.js +295 -0
package/dist/training/RewardBackpropagationService.js +117 -0
package/dist/training/RulerScoringService.js +450 -0
package/dist/training/TrainingMonitor.js +108 -0
package/dist/training/TrajectoryRecorder.js +281 -0
package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
package/dist/training/index.js +30 -0
package/dist/training/logRLConfig.js +29 -0
package/dist/training/pipeline.js +80 -0
package/dist/training/storage/ModelStorageService.js +190 -0
package/dist/training/storage/TrainingDataArchiver.js +136 -0
package/dist/training/storage/index.js +7 -0
package/dist/training/types.js +6 -0
package/dist/training/window-utils.js +100 -0
package/dist/utils/index.js +73 -0
package/dist/utils/logger.js +55 -0
package/dist/utils/snowflake.js +15 -0
package/dist/utils/synthetic-detector.js +67 -0
package/package.json +2 -2
package/research-output/training-runs/training-run-1773742857616.json +38 -0
package/research-output/training-runs/training-run-1773742946977.json +38 -0
package/research-output/training-runs/training-run-1773743278891.json +38 -0
package/research-output/training-runs/training-run-1773743409754.json +38 -0
package/research-output/training-runs/training-run-1773743651086.json +38 -0
package/research-output/training-runs/training-run-1773743782883.json +38 -0

package/dist/training/ModelDeployer.js ADDED Viewed

@@ -0,0 +1,128 @@
+/**
+ * Model Deployer Service
+ *
+ * Automatically deploys trained models to agents.
+ * Handles gradual rollout and rollback if needed.
+ */
+import { getTrainingDataAdapter } from "../adapter";
+import { getAgentRuntimeManager } from "../dependencies";
+import { logger } from "../utils/logger";
+export class ModelDeployer {
+    deploymentStatus = new Map();
+    /**
+     * Deploy model to agents
+     */
+    async deploy(options) {
+        const da = getTrainingDataAdapter();
+        logger.info("Starting model deployment", {
+            version: options.modelVersion,
+            strategy: options.strategy,
+        });
+        const model = await da.getModelByVersion(options.modelVersion);
+        if (!model) {
+            throw new Error(`Model ${options.modelVersion} not found`);
+        }
+        const strategy = options.strategy === "immediate" ? "all" : options.strategy;
+        const targetAgents = await da.getAgentUsers({
+            strategy,
+            rolloutPercentage: options.rolloutPercentage,
+            testAgentIds: options.testAgentIds,
+        });
+        logger.info(`Deploying to ${targetAgents.length} agents`);
+        const deploymentId = `deploy-${Date.now()}`;
+        this.deploymentStatus.set(deploymentId, {
+            deploymentId,
+            modelVersion: options.modelVersion,
+            status: "in_progress",
+            agentsUpdated: 0,
+            agentsFailed: 0,
+            performance: {
+                rolloutSuccessRate: 0,
+                runtimeResetFailures: 0,
+            },
+            startedAt: new Date(),
+            completedAt: null,
+        });
+        await da.updateModelStatus(model.modelId, "deployed", {
+            deployedAt: new Date(),
+            agentsUsing: targetAgents.length,
+        });
+        // Clear agent runtimes so they pick up the new model.
+        const runtimeManager = getAgentRuntimeManager();
+        let runtimesReset = 0;
+        let runtimeResetFailures = 0;
+        for (const agent of targetAgents) {
+            try {
+                await runtimeManager.resetRuntime(agent.id);
+                runtimesReset++;
+            }
+            catch (err) {
+                runtimeResetFailures++;
+                logger.warn("Failed to reset runtime for agent", {
+                    agentId: agent.id,
+                    error: err instanceof Error ? err.message : String(err),
+                });
+            }
+        }
+        logger.info("Model deployed successfully", {
+            version: options.modelVersion,
+            agentsUpdated: targetAgents.length,
+            deploymentId,
+            runtimesReset,
+        });
+        const successRate = targetAgents.length > 0 ? runtimesReset / targetAgents.length : 0;
+        this.deploymentStatus.set(deploymentId, {
+            deploymentId,
+            modelVersion: options.modelVersion,
+            status: runtimeResetFailures > 0 ? "degraded" : "deployed",
+            agentsUpdated: runtimesReset,
+            agentsFailed: runtimeResetFailures,
+            performance: {
+                rolloutSuccessRate: successRate,
+                runtimeResetFailures,
+            },
+            startedAt: this.deploymentStatus.get(deploymentId)?.startedAt ?? new Date(),
+            completedAt: new Date(),
+        });
+        return {
+            success: runtimeResetFailures === 0,
+            agentsUpdated: runtimesReset,
+            deploymentId,
+            error: runtimeResetFailures > 0
+                ? `${runtimeResetFailures} agent runtimes failed to reset`
+                : undefined,
+        };
+    }
+    /**
+     * Rollback to previous model version
+     */
+    async rollback(currentVersion, targetVersion) {
+        logger.info("Rolling back model", {
+            from: currentVersion,
+            to: targetVersion,
+        });
+        return await this.deploy({
+            modelVersion: targetVersion,
+            strategy: "immediate",
+        });
+    }
+    /**
+     * Get deployment status
+     */
+    async getDeploymentStatus(deploymentId) {
+        const status = this.deploymentStatus.get(deploymentId);
+        if (!status)
+            return null;
+        return {
+            status: status.status,
+            agentsUpdated: status.agentsUpdated,
+            agentsFailed: status.agentsFailed,
+            performance: {
+                rolloutSuccessRate: status.performance.rolloutSuccessRate,
+                runtimeResetFailures: status.performance.runtimeResetFailures,
+            },
+        };
+    }
+}
+// Singleton
+export const modelDeployer = new ModelDeployer();

package/dist/training/ModelFetcher.js ADDED Viewed

@@ -0,0 +1,48 @@
+/**
+ * Model Fetcher
+ *
+ * Fetches trained RL models from the database for inference.
+ */
+import { getTrainingDataAdapter } from "../adapter";
+import { logger } from "../utils/logger";
+/**
+ * Get the latest RL model from database
+ */
+export async function getLatestRLModel() {
+    // Adapter returns the most recently created model.
+    // Original query filtered to status IN ('ready', 'deployed').
+    const adapter = getTrainingDataAdapter();
+    const model = await adapter.getLatestModel();
+    if (!model) {
+        return null;
+    }
+    // Skip models that aren't ready or deployed
+    if (model.status !== "ready" && model.status !== "deployed") {
+        return null;
+    }
+    const rlModelId = model.storagePath || model.modelId;
+    if (!rlModelId || rlModelId.trim().length === 0) {
+        logger.error("Model has no storagePath or modelId", {
+            modelId: model.modelId,
+            storagePath: model.storagePath,
+        }, "ModelFetcher");
+        return null;
+    }
+    if (!model.baseModel || model.baseModel.trim().length === 0) {
+        logger.error("Model has no baseModel", {
+            modelId: model.modelId,
+        }, "ModelFetcher");
+        return null;
+    }
+    return {
+        version: model.version,
+        modelId: rlModelId,
+        modelPath: rlModelId,
+        metadata: {
+            avgReward: model.avgReward ?? undefined,
+            benchmarkScore: model.benchmarkScore ?? undefined,
+            baseModel: model.baseModel,
+            trainedAt: model.createdAt,
+        },
+    };
+}

package/dist/training/ModelSelectionService.js ADDED Viewed

@@ -0,0 +1,248 @@
+/**
+ * Model Selection Service
+ *
+ * Determines which base model to use for training based on:
+ * 1. Number of available training bundles
+ * 2. Existence of trained models
+ * 3. Performance of previous models
+ */
+import { getTrainingDataAdapter } from "../adapter";
+import { logger } from "../utils/logger";
+export class ModelSelectionService {
+    /** Default base model - uses Qwen3-4B-128K (4B params, 128K context). Scale up via MODEL_TIER or AVAILABLE_VRAM_GB env vars */
+    BASE_MODEL = process.env.BASE_MODEL || "unsloth/Qwen3-4B-128K";
+    BUNDLE_THRESHOLD = 1000;
+    MIN_BUNDLES_FOR_TRAINING = 100;
+    MAX_TRAINING_EXAMPLES = 2000;
+    /**
+     * Select base model for training
+     *
+     * Determines which model to use as the base for training based on available
+     * training data and existing model performance.
+     *
+     * Decision tree:
+     * 1. No models exist? → Force first model from base
+     * 2. < 100 bundles? → Wait (not ready) - throws error
+     * 3. < 1000 bundles? → Train from base model
+     * 4. ≥ 1000 bundles? → Train from best performing model
+     *
+     * @returns ModelSelectionResult with selected model and strategy
+     * @throws Error if insufficient training data (< 100 bundles)
+     *
+     * @example
+     * ```typescript
+     * const result = await modelSelectionService.selectBaseModel();
+     * console.log(`Strategy: ${result.strategy}`);
+     * console.log(`Model: ${result.modelPath}`);
+     * ```
+     */
+    async selectBaseModel() {
+        logger.info("Selecting base model for training...", undefined, "ModelSelectionService");
+        // Count available training bundles (always fetch for accurate metrics)
+        const bundleCount = await this.countTrainingBundles();
+        // Check if any models exist
+        const forceFirst = await this.shouldForceFirstModel();
+        if (forceFirst) {
+            logger.info("No models exist - forcing first model creation", undefined, "ModelSelectionService");
+            return {
+                modelId: this.BASE_MODEL,
+                modelPath: this.BASE_MODEL,
+                strategy: "force_first",
+                reason: "No trained models exist - creating first model from base",
+                metadata: {
+                    baseModel: this.BASE_MODEL,
+                    bundleCount, // Use actual count, not 0
+                },
+            };
+        }
+        logger.info(`Found ${bundleCount} training bundles`, undefined, "ModelSelectionService");
+        // Not enough data yet
+        if (bundleCount < this.MIN_BUNDLES_FOR_TRAINING) {
+            throw new Error(`Insufficient training data: ${bundleCount} bundles ` +
+                `(need ${this.MIN_BUNDLES_FOR_TRAINING} minimum)`);
+        }
+        // Less than threshold: train from base model
+        if (bundleCount < this.BUNDLE_THRESHOLD) {
+            logger.info(`Bundle count ${bundleCount} < ${this.BUNDLE_THRESHOLD} - using base model`, undefined, "ModelSelectionService");
+            return {
+                modelId: this.BASE_MODEL,
+                modelPath: this.BASE_MODEL,
+                strategy: "base",
+                reason: `Training from base model (${bundleCount} bundles < ${this.BUNDLE_THRESHOLD} threshold)`,
+                metadata: {
+                    bundleCount,
+                    baseModel: this.BASE_MODEL,
+                },
+            };
+        }
+        // Above threshold: train from best performing model
+        const bestModel = await this.getBestPerformingModel();
+        if (!bestModel) {
+            logger.warn("No best model found despite bundle threshold - using base model", undefined, "ModelSelectionService");
+            return {
+                modelId: this.BASE_MODEL,
+                modelPath: this.BASE_MODEL,
+                strategy: "base",
+                reason: "No previous models available - using base model",
+                metadata: {
+                    bundleCount,
+                    baseModel: this.BASE_MODEL,
+                },
+            };
+        }
+        logger.info(`Bundle count ${bundleCount} ≥ ${this.BUNDLE_THRESHOLD} - continuing from best model`, {
+            bestModelId: bestModel.modelId,
+            bestScore: bestModel.benchmarkScore,
+        }, "ModelSelectionService");
+        // Use storagePath for model path (e.g., HuggingFace URL)
+        const modelStoragePath = bestModel.storagePath || bestModel.modelId;
+        return {
+            modelId: bestModel.modelId,
+            modelPath: modelStoragePath,
+            strategy: "continue",
+            reason: `Continuing from best model (score: ${bestModel.benchmarkScore?.toFixed(3) || "N/A"})`,
+            metadata: {
+                bundleCount,
+                bestModelScore: bestModel.benchmarkScore || undefined,
+                baseModel: bestModel.baseModel,
+            },
+        };
+    }
+    /**
+     * Get best performing model based on benchmark scores
+     *
+     * Finds the trained model with the highest benchmark score that is
+     * ready or deployed. Used for continuing training from a strong baseline.
+     *
+     * @returns Best performing model record, or null if none found
+     *
+     * @remarks
+     * Only considers models with status 'ready' or 'deployed' and
+     * non-null benchmark scores.
+     */
+    async getBestPerformingModel() {
+        const model = await getTrainingDataAdapter().getBestBenchmarkedModel();
+        if (!model) {
+            logger.warn("No benchmarked models found", undefined, "ModelSelectionService");
+            return null;
+        }
+        logger.info("Found best performing model", {
+            modelId: model.modelId,
+            version: model.version,
+            benchmarkScore: model.benchmarkScore,
+            avgReward: model.avgReward,
+        }, "ModelSelectionService");
+        return model;
+    }
+    /**
+     * Count available training bundles
+     *
+     * A "bundle" is a trajectory that:
+     * - Is marked as training data
+     * - Has been scored (aiJudgeReward IS NOT NULL)
+     * - Has not been used in training yet
+     * - Has valid steps data (not 'null' or '[]')
+     *
+     * @returns Number of available training bundles
+     */
+    async countTrainingBundles() {
+        return await getTrainingDataAdapter().countScoredTrajectoriesReady();
+    }
+    /**
+     * Check if we should force first model creation
+     *
+     * Returns true if no trained models exist yet, indicating we should
+     * create the first model from the base model.
+     *
+     * @returns True if no models exist, false otherwise
+     */
+    async shouldForceFirstModel() {
+        const modelCount = await this.countTrainedModels();
+        return modelCount === 0;
+    }
+    /**
+     * Count existing trained models
+     */
+    async countTrainedModels() {
+        return await getTrainingDataAdapter().countActiveModels();
+    }
+    /**
+     * Get training data limit based on bundle count
+     *
+     * Determines how many trajectories to use for training:
+     * - < 1000 bundles: Use all available (returns null)
+     * - ≥ 1000 bundles: Cap at 2000 most recent
+     *
+     * @returns Limit number (2000) or null to use all available
+     */
+    async getTrainingDataLimit() {
+        const bundleCount = await this.countTrainingBundles();
+        if (bundleCount < this.BUNDLE_THRESHOLD) {
+            return null; // Use all available
+        }
+        return this.MAX_TRAINING_EXAMPLES; // Cap at 2000
+    }
+    /**
+     * Get trajectories for training (with optional limit)
+     *
+     * Retrieves scored trajectories that haven't been used in training yet.
+     * Orders by most recent first to prioritize fresh data.
+     *
+     * @param limit - Optional limit on number of trajectories to return
+     * @returns Array of training trajectories
+     *
+     * @remarks
+     * Filters to only include:
+     * - isTrainingData: true
+     * - usedInTraining: false
+     * - aiJudgeReward: not null
+     * - Valid stepsJson (not 'null' or '[]')
+     */
+    async getTrainingTrajectories(limit) {
+        const result = await getTrainingDataAdapter().getTrainingTrajectories(limit ?? undefined);
+        logger.info(`Retrieved ${result.length} trajectories for training`, { limit, available: result.length }, "ModelSelectionService");
+        return result;
+    }
+    /**
+     * Get model selection summary for logging/monitoring
+     *
+     * Provides a comprehensive summary of the current model selection state,
+     * including bundle counts, model availability, and recommendations.
+     *
+     * @returns Summary object with counts, best model info, and recommendation
+     *
+     * @example
+     * ```typescript
+     * const summary = await modelSelectionService.getSelectionSummary();
+     * console.log(`Bundles: ${summary.bundleCount}`);
+     * console.log(`Recommendation: ${summary.recommendation}`);
+     * ```
+     */
+    async getSelectionSummary() {
+        const bundleCount = await this.countTrainingBundles();
+        const trainedModelCount = await this.countTrainedModels();
+        const bestModel = await this.getBestPerformingModel();
+        let recommendation = "";
+        if (trainedModelCount === 0) {
+            recommendation = "Force first model creation";
+        }
+        else if (bundleCount < this.MIN_BUNDLES_FOR_TRAINING) {
+            recommendation = "Not ready - need more data";
+        }
+        else if (bundleCount < this.BUNDLE_THRESHOLD) {
+            recommendation = "Train from base model";
+        }
+        else {
+            recommendation = "Train from best performing model";
+        }
+        return {
+            bundleCount,
+            trainedModelCount,
+            bestModel: bestModel?.modelId || null,
+            bestScore: bestModel?.benchmarkScore || null,
+            recommendation,
+        };
+    }
+}
+// Export singleton instance
+export const modelSelectionService = new ModelSelectionService();

package/dist/training/ModelUsageVerifier.js ADDED Viewed

@@ -0,0 +1,106 @@
+/**
+ * Model Usage Verifier
+ *
+ * Verifies that agents are using the correct models.
+ * Provides assertions and logging for model usage verification.
+ */
+import { getLlmLogAdapter, getTrainingDataAdapter } from "../adapter";
+import { logger } from "../utils/logger";
+export class ModelUsageVerifier {
+    /**
+     * Verify an agent's model usage
+     *
+     * Checks the agent's runtime configuration to determine which model
+     * is being used.
+     *
+     * @param agentUserId - Unique identifier for the agent
+     * @param runtime - Agent runtime to verify
+     * @returns ModelUsageStats with model information and inference count
+     */
+    static async verifyAgentModelUsage(agentUserId, runtime) {
+        const character = runtime.character;
+        const settings = character?.settings;
+        // Check for different model providers
+        const groqModel = String(settings?.GROQ_LARGE_MODEL || settings?.GROQ_SMALL_MODEL || "");
+        const claudeModel = String(settings?.CLAUDE_MODEL || "");
+        const openaiModel = String(settings?.OPENAI_MODEL || "");
+        let modelUsed;
+        let modelSource;
+        if (claudeModel) {
+            modelUsed = claudeModel;
+            modelSource = "claude";
+        }
+        else if (openaiModel) {
+            modelUsed = openaiModel;
+            modelSource = "openai";
+        }
+        else if (groqModel) {
+            modelUsed = groqModel;
+            modelSource = "groq";
+        }
+        else {
+            modelUsed = "unknown";
+            modelSource = "unknown";
+        }
+        // Count inferences from logs (using trajectoryId)
+        const trajectoryIds = await getTrainingDataAdapter().getTrajectoryIdsByAgent(agentUserId);
+        const twentyFourHoursAgo = new Date(Date.now() - 24 * 60 * 60 * 1000);
+        let inferenceCount = 0;
+        const llmAdapter = getLlmLogAdapter();
+        if (llmAdapter && trajectoryIds.length > 0) {
+            inferenceCount = await llmAdapter.countRecentLLMCalls(trajectoryIds, twentyFourHoursAgo);
+        }
+        return {
+            agentId: agentUserId,
+            modelUsed,
+            modelSource,
+            inferenceCount,
+        };
+    }
+    /**
+     * Verify multiple agents
+     */
+    static async verifyMultipleAgents(agentUserIds, runtimes) {
+        const details = [];
+        const errors = [];
+        for (const agentId of agentUserIds) {
+            const runtime = runtimes.get(agentId);
+            if (!runtime) {
+                errors.push(`Runtime not found for agent ${agentId}`);
+                continue;
+            }
+            const stats = await ModelUsageVerifier.verifyAgentModelUsage(agentId, runtime);
+            details.push(stats);
+        }
+        return {
+            success: details.length > 0,
+            agentsChecked: details.length,
+            details,
+            errors,
+        };
+    }
+    /**
+     * Assert that an agent is using a model
+     */
+    static async assertModelUsage(agentUserId, runtime) {
+        const stats = await ModelUsageVerifier.verifyAgentModelUsage(agentUserId, runtime);
+        if (stats.modelSource === "unknown") {
+            throw new Error(`Agent ${agentUserId} has no configured model. ` +
+                `Using: ${stats.modelUsed}`);
+        }
+        logger.info("Model usage verified", {
+            agentId: agentUserId,
+            model: stats.modelUsed,
+            source: stats.modelSource,
+        }, "ModelUsageVerifier");
+    }
+    /**
+     * Get model usage summary
+     */
+    static async getModelUsageSummary() {
+        const agents = await getTrainingDataAdapter().getAgentUsers();
+        return {
+            totalAgents: agents.length,
+        };
+    }
+}