npm - @elizaos/training - Versions diffs - 2.0.0-alpha.76 → 2.0.0-alpha.78 - Mend

@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (89)

package/package.json +2 -2
package/.turbo/turbo-lint.log +0 -3
package/.turbo/turbo-typecheck.log +0 -1
package/dist/.tsbuildinfo +0 -1
package/dist/adapter.js +0 -59
package/dist/archetypes/ArchetypeConfigService.js +0 -510
package/dist/archetypes/derive-archetype.js +0 -196
package/dist/archetypes/index.js +0 -7
package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
package/dist/benchmark/BenchmarkDataViewer.js +0 -197
package/dist/benchmark/BenchmarkHistoryService.js +0 -135
package/dist/benchmark/BenchmarkRunner.js +0 -483
package/dist/benchmark/BenchmarkValidator.js +0 -158
package/dist/benchmark/FastEvalRunner.js +0 -133
package/dist/benchmark/MetricsValidator.js +0 -104
package/dist/benchmark/MetricsVisualizer.js +0 -775
package/dist/benchmark/ModelBenchmarkService.js +0 -433
package/dist/benchmark/ModelRegistry.js +0 -122
package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
package/dist/benchmark/SimulationA2AInterface.js +0 -683
package/dist/benchmark/SimulationEngine.js +0 -522
package/dist/benchmark/TaskRunner.js +0 -60
package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
package/dist/benchmark/index.js +0 -23
package/dist/benchmark/parseSimulationMetrics.js +0 -86
package/dist/benchmark/simulation-types.js +0 -1
package/dist/dependencies.js +0 -197
package/dist/generation/TrajectoryGenerator.js +0 -244
package/dist/generation/index.js +0 -6
package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
package/dist/huggingface/index.js +0 -9
package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
package/dist/index.js +0 -41
package/dist/init-training.js +0 -43
package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
package/dist/metrics/index.js +0 -7
package/dist/metrics/types.js +0 -21
package/dist/rubrics/__tests__/index.test.js +0 -150
package/dist/rubrics/ass-kisser.js +0 -83
package/dist/rubrics/degen.js +0 -78
package/dist/rubrics/goody-twoshoes.js +0 -82
package/dist/rubrics/index.js +0 -184
package/dist/rubrics/information-trader.js +0 -82
package/dist/rubrics/infosec.js +0 -99
package/dist/rubrics/liar.js +0 -102
package/dist/rubrics/perps-trader.js +0 -85
package/dist/rubrics/researcher.js +0 -79
package/dist/rubrics/scammer.js +0 -80
package/dist/rubrics/social-butterfly.js +0 -71
package/dist/rubrics/super-predictor.js +0 -95
package/dist/rubrics/trader.js +0 -65
package/dist/scoring/ArchetypeScoringService.js +0 -301
package/dist/scoring/JudgePromptBuilder.js +0 -401
package/dist/scoring/LLMJudgeCache.js +0 -263
package/dist/scoring/index.js +0 -8
package/dist/training/AutomationPipeline.js +0 -714
package/dist/training/BenchmarkService.js +0 -370
package/dist/training/ConfigValidator.js +0 -153
package/dist/training/MarketOutcomesTracker.js +0 -142
package/dist/training/ModelDeployer.js +0 -128
package/dist/training/ModelFetcher.js +0 -48
package/dist/training/ModelSelectionService.js +0 -248
package/dist/training/ModelUsageVerifier.js +0 -106
package/dist/training/MultiModelOrchestrator.js +0 -349
package/dist/training/RLModelConfig.js +0 -295
package/dist/training/RewardBackpropagationService.js +0 -117
package/dist/training/RulerScoringService.js +0 -450
package/dist/training/TrainingMonitor.js +0 -108
package/dist/training/TrajectoryRecorder.js +0 -281
package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
package/dist/training/index.js +0 -30
package/dist/training/logRLConfig.js +0 -29
package/dist/training/pipeline.js +0 -80
package/dist/training/storage/ModelStorageService.js +0 -190
package/dist/training/storage/TrainingDataArchiver.js +0 -136
package/dist/training/storage/index.js +0 -7
package/dist/training/types.js +0 -6
package/dist/training/window-utils.js +0 -100
package/dist/utils/index.js +0 -73
package/dist/utils/logger.js +0 -55
package/dist/utils/snowflake.js +0 -15
package/dist/utils/synthetic-detector.js +0 -67
package/vitest.config.ts +0 -8

package/dist/scoring/ArchetypeScoringService.js DELETED Viewed

@@ -1,301 +0,0 @@
-/**
- * ArchetypeScoringService
- *
- * Scores trajectories using LLM-as-judge with archetype-specific rubrics.
- * Supports both single trajectory scoring and RULER-style relative comparison.
- *
- * @packageDocumentation
- */
-import { getTrainingDataAdapter } from "../adapter";
-import { getLLMCaller } from "../dependencies";
-import { trajectoryMetricsExtractor } from "../metrics";
-import { hasCustomRubric } from "../rubrics";
-import { logger, splitIntoBatches } from "../utils";
-import { judgePromptBuilder, } from "./JudgePromptBuilder";
-const DEFAULT_OPTIONS = { // defaults spread underneath caller-supplied options in scoreTrajectory / scoreTrajectoryGroup
-    includeActionDetails: false, // forwarded to judgePromptBuilder.buildSinglePrompt
-    saveToDatabase: true, // when true, each score is persisted via updateTrajectoryScore
-};
-/**
- * Service for scoring trajectories with archetype-aware evaluation.
- */
-export class ArchetypeScoringService {
-    minGroupSize = 2;
-    maxGroupSize = 8;
-    /**
-     * Score a single trajectory.
-     * @param trajectoryId - ID of the trajectory to score
-     * @param options - Scoring options
-     * @returns The score or null if trajectory not found
-     */
-    async scoreTrajectory(trajectoryId, options = {}) {
-        const opts = { ...DEFAULT_OPTIONS, ...options };
-        const traj = await getTrainingDataAdapter().getTrajectoryById(trajectoryId);
-        if (!traj) {
-            logger.warn("Trajectory not found", { trajectoryId }, "ArchetypeScoring");
-            return null;
-        }
-        const archetype = traj.archetype || opts.archetype || "default";
-        const steps = JSON.parse(traj.stepsJson);
-        const metrics = trajectoryMetricsExtractor.extractFromRaw({
-            trajectoryId: traj.trajectoryId,
-            agentId: traj.agentId,
-            stepsJson: traj.stepsJson,
-            scenarioId: traj.scenarioId || undefined,
-            finalPnL: traj.finalPnL || undefined,
-        });
-        if (!metrics) {
-            throw new Error(`Failed to extract metrics for trajectory ${trajectoryId}`);
-        }
-        const context = {
-            trajectoryId: traj.trajectoryId,
-            agentId: traj.agentId,
-            archetype,
-            steps,
-            metrics,
-            finalPnL: traj.finalPnL || undefined,
-            episodeLength: traj.episodeLength,
-            totalReward: traj.totalReward,
-        };
-        const { system, user } = judgePromptBuilder.buildSinglePrompt(context, {
-            includeActionDetails: opts.includeActionDetails,
-        });
-        const response = await this.callSingleJudge(system, user);
-        if (!response) {
-            throw new Error(`Judge returned no response for trajectory ${trajectoryId}`);
-        }
-        const score = {
-            trajectoryId: traj.trajectoryId,
-            agentId: traj.agentId,
-            archetype,
-            score: Math.max(0, Math.min(1, response.score)),
-            reasoning: response.reasoning,
-            strengths: response.strengths || [],
-            weaknesses: response.weaknesses || [],
-            metrics,
-            scoredAt: new Date(),
-        };
-        if (opts.saveToDatabase) {
-            await getTrainingDataAdapter().updateTrajectoryScore(trajectoryId, score.score, score.reasoning);
-        }
-        logger.info("Scored trajectory", {
-            trajectoryId,
-            archetype: score.archetype,
-            score: score.score,
-        }, "ArchetypeScoring");
-        return score;
-    }
-    /**
-     * Score multiple trajectories using RULER comparison.
-     * @param trajectoryIds - IDs of trajectories to score
-     * @param options - Scoring options
-     * @returns Array of scores
-     */
-    async scoreTrajectoryGroup(trajectoryIds, options = {}) {
-        const opts = { ...DEFAULT_OPTIONS, ...options };
-        if (trajectoryIds.length < this.minGroupSize) {
-            logger.warn("Group too small for RULER scoring", {
-                size: trajectoryIds.length,
-                minRequired: this.minGroupSize,
-            }, "ArchetypeScoring");
-            return [];
-        }
-        const trajResults = await getTrainingDataAdapter().getTrajectoriesByIds(trajectoryIds);
-        if (trajResults.length < this.minGroupSize) {
-            logger.warn("Not enough valid trajectories", {
-                requested: trajectoryIds.length,
-                found: trajResults.length,
-            }, "ArchetypeScoring");
-            return [];
-        }
-        const contexts = [];
-        const fallbackArchetype = opts.archetype || "default";
-        for (const traj of trajResults) {
-            const steps = JSON.parse(traj.stepsJson);
-            const archetype = traj.archetype || fallbackArchetype;
-            const metrics = trajectoryMetricsExtractor.extractFromRaw({
-                trajectoryId: traj.trajectoryId,
-                agentId: traj.agentId,
-                stepsJson: traj.stepsJson,
-                scenarioId: traj.scenarioId || undefined,
-                finalPnL: traj.finalPnL || undefined,
-            });
-            if (!metrics) {
-                throw new Error(`Failed to extract metrics for trajectory ${traj.trajectoryId}`);
-            }
-            contexts.push({
-                trajectoryId: traj.trajectoryId,
-                agentId: traj.agentId,
-                archetype,
-                steps,
-                metrics,
-                finalPnL: traj.finalPnL || undefined,
-                episodeLength: traj.episodeLength,
-                totalReward: traj.totalReward,
-            });
-        }
-        const batches = splitIntoBatches(contexts, this.maxGroupSize);
-        const scores = [];
-        for (const batch of batches) {
-            const scenarioId = batch[0]?.archetype || "unknown";
-            const { system, user } = judgePromptBuilder.buildComparisonPrompt(batch, scenarioId);
-            const response = await this.callComparisonJudge(system, user);
-            if (!response) {
-                throw new Error("Judge returned no response for batch");
-            }
-            for (let i = 0; i < batch.length; i++) {
-                const ctx = batch[i];
-                if (!ctx)
-                    continue;
-                const expectedId = `trajectory-${i + 1}`;
-                const scoreData = response.scores.find((s) => s.trajectory_id === expectedId);
-                if (!scoreData) {
-                    throw new Error(`Missing score for ${expectedId}`);
-                }
-                const score = {
-                    trajectoryId: ctx.trajectoryId,
-                    agentId: ctx.agentId,
-                    archetype: ctx.archetype || "default",
-                    score: Math.max(0, Math.min(1, scoreData.score)),
-                    reasoning: scoreData.explanation,
-                    strengths: [],
-                    weaknesses: [],
-                    metrics: ctx.metrics,
-                    scoredAt: new Date(),
-                };
-                scores.push(score);
-                if (opts.saveToDatabase) {
-                    await getTrainingDataAdapter().updateTrajectoryScore(ctx.trajectoryId, score.score, score.reasoning);
-                }
-            }
-        }
-        logger.info("Scored trajectory group", {
-            requested: trajectoryIds.length,
-            scored: scores.length,
-        }, "ArchetypeScoring");
-        return scores;
-    }
-    /**
-     * Score trajectories by archetype.
-     * @param archetype - Archetype to use for scoring
-     * @param trajectoryIds - IDs to score
-     * @returns Count of scored and errors
-     */
-    async scoreByArchetype(archetype, trajectoryIds) {
-        if (!hasCustomRubric(archetype)) {
-            logger.warn("No custom rubric for archetype, using default", { archetype }, "ArchetypeScoring");
-        }
-        if (trajectoryIds.length === 0) {
-            return { scored: 0, errors: 0 };
-        }
-        const scores = await this.scoreTrajectoryGroup(trajectoryIds, {
-            archetype,
-            saveToDatabase: true,
-        });
-        return {
-            scored: scores.length,
-            errors: trajectoryIds.length - scores.length,
-        };
-    }
-    /**
-     * Score all unscored trajectories.
-     * @param archetype - Default archetype to use
-     * @param limit - Maximum trajectories to score
-     * @returns Count of scored and errors
-     */
-    async scoreUnscoredTrajectories(archetype = "default", limit = 100) {
-        const unscoredResult = await getTrainingDataAdapter().getUnscoredTrajectories({ limit });
-        if (unscoredResult.length === 0) {
-            logger.info("No unscored trajectories found", {}, "ArchetypeScoring");
-            return { scored: 0, errors: 0 };
-        }
-        const trajectoryIds = unscoredResult.map((r) => r.trajectoryId);
-        return this.scoreByArchetype(archetype, trajectoryIds);
-    }
-    /**
-     * Score trajectories in parallel with rate limiting.
-     * @param trajectoryIds - IDs to score
-     * @param options - Scoring options
-     * @param concurrency - Maximum concurrent calls
-     * @returns Array of scores
-     */
-    async scoreTrajectoriesParallel(trajectoryIds, options = {}, concurrency = 5) {
-        const results = [];
-        const batches = splitIntoBatches(trajectoryIds, concurrency);
-        logger.info("Starting parallel scoring", {
-            total: trajectoryIds.length,
-            batches: batches.length,
-            concurrency,
-        }, "ArchetypeScoring");
-        for (let i = 0; i < batches.length; i++) {
-            const batch = batches[i] ?? [];
-            const batchPromises = batch.map((id) => this.scoreTrajectory(id, options));
-            const batchResults = await Promise.all(batchPromises);
-            for (const result of batchResults) {
-                if (result) {
-                    results.push(result);
-                }
-            }
-            if (i < batches.length - 1) {
-                await new Promise((resolve) => setTimeout(resolve, 100));
-            }
-        }
-        logger.info("Parallel scoring complete", {
-            requested: trajectoryIds.length,
-            scored: results.length,
-        }, "ArchetypeScoring");
-        return results;
-    }
-    /**
-     * Call LLM judge for single trajectory.
-     */
-    async callSingleJudge(system, user) {
-        const llmCaller = getLLMCaller();
-        const prompt = `${user}\n\nReturn ONLY valid JSON, no other text.`;
-        const response = await llmCaller.callGroqDirect({
-            prompt,
-            system,
-            modelSize: "large",
-            temperature: 0.3,
-            maxTokens: 1000,
-            actionType: "archetype_score_trajectory",
-        });
-        return this.parseJudgeResponse(response);
-    }
-    /**
-     * Call LLM judge for trajectory comparison.
-     */
-    async callComparisonJudge(system, user) {
-        const llmCaller = getLLMCaller();
-        const prompt = `${user}\n\nReturn ONLY valid JSON, no other text.`;
-        const response = await llmCaller.callGroqDirect({
-            prompt,
-            system,
-            modelSize: "large",
-            temperature: 0.3,
-            maxTokens: 2000,
-            actionType: "archetype_ruler_score",
-        });
-        return this.parseJudgeResponse(response);
-    }
-    /**
-     * Parse JSON response from judge.
-     */
-    parseJudgeResponse(response) {
-        const jsonText = response
-            .trim()
-            .replace(/```json\n?/g, "")
-            .replace(/```\n?/g, "")
-            .trim();
-        const jsonMatch = jsonText.match(/\{[\s\S]*\}/);
-        if (!jsonMatch) {
-            logger.error("No JSON found in response", {
-                preview: response.substring(0, 200),
-            }, "ArchetypeScoring");
-            return null;
-        }
-        return JSON.parse(jsonMatch[0]);
-    }
-}
-/** Shared module-level instance of ArchetypeScoringService, created when this module is first imported. */
-export const archetypeScoringService = new ArchetypeScoringService();