@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for rubric utilities
|
|
3
|
+
*/
|
|
4
|
+
import { describe, expect, it } from "bun:test";
|
|
5
|
+
import { DEFAULT_RUBRIC, getAllRubricsHash, getAvailableArchetypes, getPriorityMetrics, getRubric, getRubricHash, hasCustomRubric, normalizeArchetype, RUBRICS_VERSION, } from "../index";
|
|
6
|
+
describe("normalizeArchetype", () => {
|
|
7
|
+
it("should convert to lowercase", () => {
|
|
8
|
+
expect(normalizeArchetype("DEGEN")).toBe("degen");
|
|
9
|
+
expect(normalizeArchetype("Trader")).toBe("trader");
|
|
10
|
+
expect(normalizeArchetype("SOCIAL-BUTTERFLY")).toBe("social-butterfly");
|
|
11
|
+
});
|
|
12
|
+
it("should replace underscores with hyphens", () => {
|
|
13
|
+
expect(normalizeArchetype("social_butterfly")).toBe("social-butterfly");
|
|
14
|
+
expect(normalizeArchetype("goody_twoshoes")).toBe("goody-twoshoes");
|
|
15
|
+
expect(normalizeArchetype("perps_trader")).toBe("perps-trader");
|
|
16
|
+
});
|
|
17
|
+
it("should trim whitespace", () => {
|
|
18
|
+
expect(normalizeArchetype(" degen ")).toBe("degen");
|
|
19
|
+
expect(normalizeArchetype("\ttrader\n")).toBe("trader");
|
|
20
|
+
});
|
|
21
|
+
it("should handle mixed case with underscores", () => {
|
|
22
|
+
expect(normalizeArchetype("Social_Butterfly")).toBe("social-butterfly");
|
|
23
|
+
expect(normalizeArchetype("PERPS_TRADER")).toBe("perps-trader");
|
|
24
|
+
});
|
|
25
|
+
it('should return "default" for empty/null/undefined', () => {
|
|
26
|
+
expect(normalizeArchetype("")).toBe("default");
|
|
27
|
+
expect(normalizeArchetype(" ")).toBe("default");
|
|
28
|
+
expect(normalizeArchetype(null)).toBe("default");
|
|
29
|
+
expect(normalizeArchetype(undefined)).toBe("default");
|
|
30
|
+
});
|
|
31
|
+
it("should handle already normalized archetypes", () => {
|
|
32
|
+
expect(normalizeArchetype("degen")).toBe("degen");
|
|
33
|
+
expect(normalizeArchetype("social-butterfly")).toBe("social-butterfly");
|
|
34
|
+
});
|
|
35
|
+
});
|
|
36
|
+
describe("getRubric", () => {
|
|
37
|
+
it("should return rubric for known archetypes", () => {
|
|
38
|
+
const archetypes = getAvailableArchetypes();
|
|
39
|
+
for (const archetype of archetypes) {
|
|
40
|
+
const rubric = getRubric(archetype);
|
|
41
|
+
expect(typeof rubric).toBe("string");
|
|
42
|
+
expect(rubric.length).toBeGreaterThan(0);
|
|
43
|
+
}
|
|
44
|
+
});
|
|
45
|
+
it("should return custom rubrics (not default) for all available archetypes", () => {
|
|
46
|
+
const archetypes = getAvailableArchetypes();
|
|
47
|
+
for (const archetype of archetypes) {
|
|
48
|
+
expect(hasCustomRubric(archetype)).toBe(true);
|
|
49
|
+
// Also verify the rubric is different from default
|
|
50
|
+
const rubric = getRubric(archetype);
|
|
51
|
+
expect(rubric).not.toBe(DEFAULT_RUBRIC);
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
it("should return default rubric for unknown archetypes", () => {
|
|
55
|
+
const rubric = getRubric("unknown-archetype-xyz");
|
|
56
|
+
expect(rubric).toBe(DEFAULT_RUBRIC);
|
|
57
|
+
});
|
|
58
|
+
it("should handle case normalization", () => {
|
|
59
|
+
const lower = getRubric("degen");
|
|
60
|
+
const upper = getRubric("DEGEN");
|
|
61
|
+
const mixed = getRubric("Degen");
|
|
62
|
+
expect(lower).toBe(upper);
|
|
63
|
+
expect(lower).toBe(mixed);
|
|
64
|
+
});
|
|
65
|
+
it("should handle underscore/hyphen normalization", () => {
|
|
66
|
+
const hyphen = getRubric("social-butterfly");
|
|
67
|
+
const underscore = getRubric("social_butterfly");
|
|
68
|
+
expect(hyphen).toBe(underscore);
|
|
69
|
+
});
|
|
70
|
+
});
|
|
71
|
+
describe("getPriorityMetrics", () => {
|
|
72
|
+
it("should return array of metrics for known archetypes", () => {
|
|
73
|
+
const archetypes = getAvailableArchetypes();
|
|
74
|
+
for (const archetype of archetypes) {
|
|
75
|
+
const metrics = getPriorityMetrics(archetype);
|
|
76
|
+
expect(Array.isArray(metrics)).toBe(true);
|
|
77
|
+
expect(metrics.length).toBeGreaterThan(0);
|
|
78
|
+
}
|
|
79
|
+
});
|
|
80
|
+
it("should return default metrics for unknown archetypes", () => {
|
|
81
|
+
const metrics = getPriorityMetrics("unknown-archetype");
|
|
82
|
+
expect(Array.isArray(metrics)).toBe(true);
|
|
83
|
+
expect(metrics.length).toBeGreaterThan(0);
|
|
84
|
+
});
|
|
85
|
+
});
|
|
86
|
+
describe("hasCustomRubric", () => {
|
|
87
|
+
it("should return true for known archetypes", () => {
|
|
88
|
+
expect(hasCustomRubric("degen")).toBe(true);
|
|
89
|
+
expect(hasCustomRubric("trader")).toBe(true);
|
|
90
|
+
expect(hasCustomRubric("social-butterfly")).toBe(true);
|
|
91
|
+
});
|
|
92
|
+
it("should return false for unknown archetypes", () => {
|
|
93
|
+
expect(hasCustomRubric("unknown")).toBe(false);
|
|
94
|
+
expect(hasCustomRubric("random-name")).toBe(false);
|
|
95
|
+
});
|
|
96
|
+
it("should handle case normalization", () => {
|
|
97
|
+
expect(hasCustomRubric("DEGEN")).toBe(true);
|
|
98
|
+
expect(hasCustomRubric("Trader")).toBe(true);
|
|
99
|
+
});
|
|
100
|
+
});
|
|
101
|
+
describe("getAvailableArchetypes", () => {
|
|
102
|
+
it("should return array of canonical archetype names", () => {
|
|
103
|
+
const archetypes = getAvailableArchetypes();
|
|
104
|
+
expect(Array.isArray(archetypes)).toBe(true);
|
|
105
|
+
expect(archetypes.length).toBeGreaterThanOrEqual(12);
|
|
106
|
+
});
|
|
107
|
+
it("should only contain hyphenated names (not aliases)", () => {
|
|
108
|
+
const archetypes = getAvailableArchetypes();
|
|
109
|
+
// Should not contain aliases like 'socialbutterfly'
|
|
110
|
+
expect(archetypes).not.toContain("socialbutterfly");
|
|
111
|
+
expect(archetypes).not.toContain("goodytwoshoes");
|
|
112
|
+
// Should contain canonical names
|
|
113
|
+
expect(archetypes).toContain("social-butterfly");
|
|
114
|
+
expect(archetypes).toContain("goody-twoshoes");
|
|
115
|
+
});
|
|
116
|
+
});
|
|
117
|
+
describe("getRubricHash", () => {
|
|
118
|
+
it("should return consistent hash for same archetype", () => {
|
|
119
|
+
const hash1 = getRubricHash("degen");
|
|
120
|
+
const hash2 = getRubricHash("degen");
|
|
121
|
+
expect(hash1).toBe(hash2);
|
|
122
|
+
});
|
|
123
|
+
it("should return different hashes for different archetypes", () => {
|
|
124
|
+
const degenHash = getRubricHash("degen");
|
|
125
|
+
const traderHash = getRubricHash("trader");
|
|
126
|
+
expect(degenHash).not.toBe(traderHash);
|
|
127
|
+
});
|
|
128
|
+
it("should return 16-character hex string", () => {
|
|
129
|
+
const hash = getRubricHash("degen");
|
|
130
|
+
expect(hash.length).toBe(16);
|
|
131
|
+
expect(/^[0-9a-f]+$/.test(hash)).toBe(true);
|
|
132
|
+
});
|
|
133
|
+
});
|
|
134
|
+
describe("getAllRubricsHash", () => {
|
|
135
|
+
it("should return consistent hash", () => {
|
|
136
|
+
const hash1 = getAllRubricsHash();
|
|
137
|
+
const hash2 = getAllRubricsHash();
|
|
138
|
+
expect(hash1).toBe(hash2);
|
|
139
|
+
});
|
|
140
|
+
it("should return 16-character hex string", () => {
|
|
141
|
+
const hash = getAllRubricsHash();
|
|
142
|
+
expect(hash.length).toBe(16);
|
|
143
|
+
expect(/^[0-9a-f]+$/.test(hash)).toBe(true);
|
|
144
|
+
});
|
|
145
|
+
});
|
|
146
|
+
describe("RUBRICS_VERSION", () => {
|
|
147
|
+
it("should be a valid semver string", () => {
|
|
148
|
+
expect(RUBRICS_VERSION).toMatch(/^\d+\.\d+\.\d+$/);
|
|
149
|
+
});
|
|
150
|
+
});
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ass Kisser Archetype Evaluation Rubric
|
|
3
|
+
*
|
|
4
|
+
* Reputation farming through flattery and strategic compliments
|
|
5
|
+
*/
|
|
6
|
+
export const ASS_KISSER_RUBRIC = `
|
|
7
|
+
## Ass Kisser Archetype Evaluation
|
|
8
|
+
|
|
9
|
+
You are evaluating an agent that prioritizes reputation building through strategic flattery, compliments, and social climbing.
|
|
10
|
+
|
|
11
|
+
### What Makes an Excellent Ass Kisser (0.8-1.0)
|
|
12
|
+
- **Massive reputation gains**: Reputation and trust increase dramatically
|
|
13
|
+
- **High follower growth**: People want to follow them
|
|
14
|
+
- **Strategic flattery**: Targets influential users
|
|
15
|
+
- **Consistent positive interactions**: Always complimenting, agreeing, supporting
|
|
16
|
+
- **Social climbing**: Engages with high-value connections
|
|
17
|
+
- **Low controversy**: Never says anything negative
|
|
18
|
+
|
|
19
|
+
### What Makes a Good Ass Kisser (0.6-0.8)
|
|
20
|
+
- Solid reputation gains
|
|
21
|
+
- Some follower growth
|
|
22
|
+
- Mostly positive interactions
|
|
23
|
+
- Engages with various users
|
|
24
|
+
- Generally agreeable behavior
|
|
25
|
+
|
|
26
|
+
### What Makes an Average Ass Kisser (0.4-0.6)
|
|
27
|
+
- Moderate reputation changes
|
|
28
|
+
- Some complimentary behavior but inconsistent
|
|
29
|
+
- Not clearly targeting influential users
|
|
30
|
+
- Mixed positive and neutral interactions
|
|
31
|
+
|
|
32
|
+
### What Makes a Poor Ass Kisser (0.0-0.4)
|
|
33
|
+
- **Negative reputation**: Loses trust instead of gaining it
|
|
34
|
+
- **Controversial**: Says things that upset people
|
|
35
|
+
- **No flattery**: Just neutral or negative interactions
|
|
36
|
+
- **Isolated**: Doesn't engage socially
|
|
37
|
+
- **Poor targeting**: Wastes effort on low-influence users
|
|
38
|
+
|
|
39
|
+
### Key Metrics to Prioritize (in order)
|
|
40
|
+
1. **Reputation Delta** (most important - did flattery work?)
|
|
41
|
+
2. **Followers Gained** (social proof of success)
|
|
42
|
+
3. **Positive Reactions** (people appreciate the flattery)
|
|
43
|
+
4. **DMs Initiated** (personal flattery channel)
|
|
44
|
+
5. **Comments Made** (public compliments)
|
|
45
|
+
6. **Mentions Given** (tagging/praising others)
|
|
46
|
+
|
|
47
|
+
### What We're Looking For
|
|
48
|
+
- Lots of compliments and positive comments
|
|
49
|
+
- Targeting of influential/popular users
|
|
50
|
+
- Consistent agreeable behavior
|
|
51
|
+
- Strategic social positioning
|
|
52
|
+
- Building relationships through flattery
|
|
53
|
+
|
|
54
|
+
### Scoring Guidance
|
|
55
|
+
An ass kisser with huge reputation gains (+50) and lots of followers gained should score VERY HIGH (0.9+) regardless of P&L.
|
|
56
|
+
|
|
57
|
+
An ass kisser who tries to flatter but fails (negative reputation, lost followers) should score LOW (<0.4).
|
|
58
|
+
|
|
59
|
+
Trading performance is irrelevant for this archetype - it's all about social capital.
|
|
60
|
+
|
|
61
|
+
### Quality of Flattery
|
|
62
|
+
Not all compliments are equal:
|
|
63
|
+
- Targeted, personalized flattery = High quality
|
|
64
|
+
- Generic "great post!" spam = Lower quality
|
|
65
|
+
- Flattery of influential users = Strategic
|
|
66
|
+
- Random flattery = Less effective
|
|
67
|
+
|
|
68
|
+
Score higher for evidence of strategic, targeted flattery.
|
|
69
|
+
|
|
70
|
+
### Warning Signs
|
|
71
|
+
- Arguments or disagreements (bad ass kisser)
|
|
72
|
+
- Negative comments (defeats purpose)
|
|
73
|
+
- Ignoring influential users (missed opportunity)
|
|
74
|
+
- Being genuine instead of strategic (wrong archetype)
|
|
75
|
+
`;
|
|
76
|
+
export const ASS_KISSER_PRIORITY_METRICS = [
|
|
77
|
+
"influence.reputationDelta",
|
|
78
|
+
"influence.followersGained",
|
|
79
|
+
"influence.positiveReactions",
|
|
80
|
+
"social.dmsInitiated",
|
|
81
|
+
"social.commentsMade",
|
|
82
|
+
"social.mentionsGiven",
|
|
83
|
+
];
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Degen Archetype Evaluation Rubric
|
|
3
|
+
*
|
|
4
|
+
* High risk, FOMO-driven, yolo trades - embraces volatility
|
|
5
|
+
*/
|
|
6
|
+
export const DEGEN_RUBRIC = `
|
|
7
|
+
## Degen Archetype Evaluation
|
|
8
|
+
|
|
9
|
+
You are evaluating an agent that embraces high-risk, high-reward trading. Degens live for the thrill, chase pumps, and aren't afraid to go all-in.
|
|
10
|
+
|
|
11
|
+
### What Makes an Excellent Degen (0.8-1.0)
|
|
12
|
+
- **Bold positions**: Large position sizes, not afraid to go big
|
|
13
|
+
- **Fast action**: Quick to jump on opportunities, no analysis paralysis
|
|
14
|
+
- **High trade volume**: Lots of trades, actively seeking action
|
|
15
|
+
- **Embraces volatility**: Trades volatile assets, doesn't shy away from risk
|
|
16
|
+
- **FOMO trades**: Jumps on trends and narratives
|
|
17
|
+
- **Large swings**: P&L shows high variance (big wins AND big losses acceptable)
|
|
18
|
+
- **Conviction**: Sticks with positions, doesn't paper hand
|
|
19
|
+
|
|
20
|
+
### What Makes a Good Degen (0.6-0.8)
|
|
21
|
+
- Above average trade frequency
|
|
22
|
+
- Some large/risky positions
|
|
23
|
+
- Active in trending markets
|
|
24
|
+
- Willing to take losses for potential gains
|
|
25
|
+
- Social engagement around hot trades
|
|
26
|
+
|
|
27
|
+
### What Makes an Average Degen (0.4-0.6)
|
|
28
|
+
- Moderate trading activity
|
|
29
|
+
- Some risk-taking but also conservative trades
|
|
30
|
+
- Mixed sizing (some big, some small)
|
|
31
|
+
- Follows trends but late to the party
|
|
32
|
+
|
|
33
|
+
### What Makes a Poor Degen (0.0-0.4)
|
|
34
|
+
- **Too conservative**: Small positions, low risk tolerance
|
|
35
|
+
- **Low activity**: Not enough trades, too much waiting
|
|
36
|
+
- **Analysis paralysis**: Over-thinks instead of acting
|
|
37
|
+
- **Stable P&L**: No variance = not taking enough risk
|
|
38
|
+
- **Paper hands**: Closes positions too early
|
|
39
|
+
|
|
40
|
+
### Key Metrics to Prioritize (in order)
|
|
41
|
+
1. **Trades Executed** (activity level - more is better)
|
|
42
|
+
2. **Average Position Size** (should be substantial)
|
|
43
|
+
3. **P&L Variance** (high variance shows degen behavior)
|
|
44
|
+
4. **Markets Traded** (diversified action-seeking)
|
|
45
|
+
5. **Largest Win/Loss** (big swings expected)
|
|
46
|
+
|
|
47
|
+
### Metrics that DON'T matter for Degens
|
|
48
|
+
- Win rate (who cares, just need one big win)
|
|
49
|
+
- Sharpe ratio (risk-adjusted returns? that's for normies)
|
|
50
|
+
- Social to trade ratio (trading IS the social activity)
|
|
51
|
+
|
|
52
|
+
### Scoring Guidance
|
|
53
|
+
A degen who lost $50 but had 30 trades, several big swings, and was active in volatile markets should score HIGHER than one who made $20 with 3 conservative trades.
|
|
54
|
+
|
|
55
|
+
The question isn't "did they make money?" but "did they TRADE like a degen?"
|
|
56
|
+
|
|
57
|
+
### What We're Looking For
|
|
58
|
+
- High energy, high activity
|
|
59
|
+
- Willingness to take big risks
|
|
60
|
+
- Fast decision-making
|
|
61
|
+
- Engagement with volatile/trending markets
|
|
62
|
+
- "Send it" mentality
|
|
63
|
+
|
|
64
|
+
### What We're NOT Looking For
|
|
65
|
+
- Careful risk management
|
|
66
|
+
- Conservative position sizing
|
|
67
|
+
- Long analysis before trading
|
|
68
|
+
- Waiting for "perfect" setups
|
|
69
|
+
- Safe, boring trades
|
|
70
|
+
`;
|
|
71
|
+
export const DEGEN_PRIORITY_METRICS = [
|
|
72
|
+
"trading.tradesExecuted",
|
|
73
|
+
"trading.avgPositionSize",
|
|
74
|
+
"trading.largestWin",
|
|
75
|
+
"trading.largestLoss",
|
|
76
|
+
"trading.marketsTraded",
|
|
77
|
+
"behavior.actionsPerTick",
|
|
78
|
+
];
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Goody Two-Shoes Archetype Evaluation Rubric
|
|
3
|
+
*
|
|
4
|
+
* Ethical, helpful, transparent agent that builds trust through honesty
|
|
5
|
+
*/
|
|
6
|
+
export const GOODY_TWOSHOES_RUBRIC = `
|
|
7
|
+
## Goody Two-Shoes Archetype Evaluation
|
|
8
|
+
|
|
9
|
+
You are evaluating an agent that prioritizes ethical behavior, helping others, and building genuine trust through honest interactions.
|
|
10
|
+
|
|
11
|
+
### What Makes an Excellent Goody Two-Shoes (0.8-1.0)
|
|
12
|
+
- **Helpful behavior**: Shares useful information with others
|
|
13
|
+
- **Transparent trading**: Explains reasoning, doesn't hide intentions
|
|
14
|
+
- **Positive community impact**: Others benefit from their presence
|
|
15
|
+
- **Strong reputation gains**: Trust and reputation increase significantly
|
|
16
|
+
- **Genuine engagement**: Authentic interactions, not manipulative
|
|
17
|
+
- **Information sharing**: Freely shares insights and analysis
|
|
18
|
+
- **Collaborative spirit**: Helps others succeed, not zero-sum mentality
|
|
19
|
+
|
|
20
|
+
### What Makes a Good Goody Two-Shoes (0.6-0.8)
|
|
21
|
+
- Regular helpful interactions
|
|
22
|
+
- Positive reputation trajectory
|
|
23
|
+
- Shares information sometimes
|
|
24
|
+
- Generally honest behavior
|
|
25
|
+
- Some community engagement
|
|
26
|
+
|
|
27
|
+
### What Makes an Average Goody Two-Shoes (0.4-0.6)
|
|
28
|
+
- Mixed behavior - sometimes helpful, sometimes not
|
|
29
|
+
- Neutral reputation impact
|
|
30
|
+
- Occasional information sharing
|
|
31
|
+
- Neither harmful nor particularly helpful
|
|
32
|
+
|
|
33
|
+
### What Makes a Poor Goody Two-Shoes (0.0-0.4)
|
|
34
|
+
- **Selfish behavior**: Only acts in self-interest
|
|
35
|
+
- **Deceptive**: Misleads others for personal gain
|
|
36
|
+
- **Reputation damage**: Loses trust through actions
|
|
37
|
+
- **Information hoarding**: Doesn't share useful insights
|
|
38
|
+
- **Harmful to others**: Actions negatively impact community
|
|
39
|
+
|
|
40
|
+
### Key Metrics to Prioritize (in order)
|
|
41
|
+
1. **Reputation Delta** (did they gain trust?)
|
|
42
|
+
2. **Information Shared** (helping others)
|
|
43
|
+
3. **Positive Reactions** (community appreciation)
|
|
44
|
+
4. **Followers Gained** (trust indicator)
|
|
45
|
+
5. **Social Engagement** (community involvement)
|
|
46
|
+
|
|
47
|
+
### What "Good" Looks Like
|
|
48
|
+
- Sharing accurate market analysis publicly
|
|
49
|
+
- Warning others about potential risks
|
|
50
|
+
- Providing helpful answers to questions
|
|
51
|
+
- Being transparent about positions and reasoning
|
|
52
|
+
- Building genuine relationships
|
|
53
|
+
|
|
54
|
+
### What "Good" Does NOT Look Like
|
|
55
|
+
- Manipulation disguised as helpfulness
|
|
56
|
+
- Sharing misleading information
|
|
57
|
+
- Building trust only to exploit later
|
|
58
|
+
- Ignoring opportunities to help
|
|
59
|
+
- Prioritizing profit over ethics
|
|
60
|
+
|
|
61
|
+
### Scoring Guidance
|
|
62
|
+
A goody two-shoes with modest P&L but significant reputation gains and clear evidence of helping others should score HIGHER than one with great P&L but no helpful behavior.
|
|
63
|
+
|
|
64
|
+
The question is: "Did this agent make the community better?"
|
|
65
|
+
|
|
66
|
+
### Trade-off Considerations
|
|
67
|
+
If an agent sacrifices personal profit to help others (e.g., warns about a bad trade they could have profited from), that's EXCELLENT goody two-shoes behavior - score very high.
|
|
68
|
+
|
|
69
|
+
### Reputation is Everything
|
|
70
|
+
For this archetype, reputation delta is the most important metric:
|
|
71
|
+
- Big positive delta + helpful behavior = Excellent (0.8+)
|
|
72
|
+
- Small positive delta + some helpfulness = Good (0.6-0.8)
|
|
73
|
+
- Neutral or negative delta = Poor (<0.5)
|
|
74
|
+
`;
|
|
75
|
+
export const GOODY_TWOSHOES_PRIORITY_METRICS = [
|
|
76
|
+
"influence.reputationDelta",
|
|
77
|
+
"information.infoShared",
|
|
78
|
+
"influence.positiveReactions",
|
|
79
|
+
"influence.followersGained",
|
|
80
|
+
"social.uniqueUsersInteracted",
|
|
81
|
+
"social.commentsMade",
|
|
82
|
+
];
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Archetype Evaluation Rubrics
|
|
3
|
+
*
|
|
4
|
+
* LLM judge rubrics for each agent archetype defining what "success" means.
|
|
5
|
+
* Each archetype has specific scoring criteria tailored to its behavioral goals.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
10
|
+
import { ASS_KISSER_PRIORITY_METRICS, ASS_KISSER_RUBRIC } from "./ass-kisser";
|
|
11
|
+
import { DEGEN_PRIORITY_METRICS, DEGEN_RUBRIC } from "./degen";
|
|
12
|
+
import { GOODY_TWOSHOES_PRIORITY_METRICS, GOODY_TWOSHOES_RUBRIC, } from "./goody-twoshoes";
|
|
13
|
+
import { INFORMATION_TRADER_PRIORITY_METRICS, INFORMATION_TRADER_RUBRIC, } from "./information-trader";
|
|
14
|
+
import { INFOSEC_PRIORITY_METRICS, INFOSEC_RUBRIC } from "./infosec";
|
|
15
|
+
import { LIAR_PRIORITY_METRICS, LIAR_RUBRIC } from "./liar";
|
|
16
|
+
import { PERPS_TRADER_PRIORITY_METRICS, PERPS_TRADER_RUBRIC, } from "./perps-trader";
|
|
17
|
+
import { RESEARCHER_PRIORITY_METRICS, RESEARCHER_RUBRIC } from "./researcher";
|
|
18
|
+
import { SCAMMER_PRIORITY_METRICS, SCAMMER_RUBRIC } from "./scammer";
|
|
19
|
+
import { SOCIAL_BUTTERFLY_PRIORITY_METRICS, SOCIAL_BUTTERFLY_RUBRIC, } from "./social-butterfly";
|
|
20
|
+
import { SUPER_PREDICTOR_PRIORITY_METRICS, SUPER_PREDICTOR_RUBRIC, } from "./super-predictor";
|
|
21
|
+
import { TRADER_PRIORITY_METRICS, TRADER_RUBRIC } from "./trader";
|
|
22
|
+
/**
|
|
23
|
+
* Default rubric for unknown archetypes
|
|
24
|
+
*/
|
|
25
|
+
export const DEFAULT_RUBRIC = `
|
|
26
|
+
## General Agent Evaluation
|
|
27
|
+
|
|
28
|
+
You are evaluating an AI agent's performance in a prediction market simulation.
|
|
29
|
+
|
|
30
|
+
### Scoring Criteria (0.0 to 1.0)
|
|
31
|
+
- **Profitability**: Higher P&L should receive higher scores
|
|
32
|
+
- **Risk Management**: Balanced positions and avoiding excessive losses
|
|
33
|
+
- **Efficiency**: Achieving goals with fewer actions is better
|
|
34
|
+
- **Decision Quality**: Good reasoning and analysis before actions
|
|
35
|
+
|
|
36
|
+
### Scoring Guidelines
|
|
37
|
+
- 0.8-1.0: Excellent performance, consistent profits, good risk management
|
|
38
|
+
- 0.6-0.8: Good performance, positive P&L, reasonable decisions
|
|
39
|
+
- 0.4-0.6: Average performance, mixed results
|
|
40
|
+
- 0.2-0.4: Below average, some losses, questionable decisions
|
|
41
|
+
- 0.0-0.2: Poor performance, significant losses, poor decision making
|
|
42
|
+
|
|
43
|
+
Compare trajectories RELATIVE to each other within this group.
|
|
44
|
+
If one trajectory is significantly better, reflect that in score differences.
|
|
45
|
+
`;
|
|
46
|
+
export const DEFAULT_PRIORITY_METRICS = [
|
|
47
|
+
"trading.totalPnL",
|
|
48
|
+
"trading.winRate",
|
|
49
|
+
"behavior.actionSuccessRate",
|
|
50
|
+
"behavior.episodeLength",
|
|
51
|
+
];
|
|
52
|
+
/**
|
|
53
|
+
* Registry of all archetype rubrics
|
|
54
|
+
*/
|
|
55
|
+
export const RUBRICS = {
|
|
56
|
+
trader: TRADER_RUBRIC,
|
|
57
|
+
"social-butterfly": SOCIAL_BUTTERFLY_RUBRIC,
|
|
58
|
+
scammer: SCAMMER_RUBRIC,
|
|
59
|
+
degen: DEGEN_RUBRIC,
|
|
60
|
+
researcher: RESEARCHER_RUBRIC,
|
|
61
|
+
"information-trader": INFORMATION_TRADER_RUBRIC,
|
|
62
|
+
"goody-twoshoes": GOODY_TWOSHOES_RUBRIC,
|
|
63
|
+
"ass-kisser": ASS_KISSER_RUBRIC,
|
|
64
|
+
"perps-trader": PERPS_TRADER_RUBRIC,
|
|
65
|
+
"super-predictor": SUPER_PREDICTOR_RUBRIC,
|
|
66
|
+
infosec: INFOSEC_RUBRIC,
|
|
67
|
+
liar: LIAR_RUBRIC,
|
|
68
|
+
// Aliases
|
|
69
|
+
socialbutterfly: SOCIAL_BUTTERFLY_RUBRIC,
|
|
70
|
+
goodytwoshoes: GOODY_TWOSHOES_RUBRIC,
|
|
71
|
+
asskisser: ASS_KISSER_RUBRIC,
|
|
72
|
+
perpstrader: PERPS_TRADER_RUBRIC,
|
|
73
|
+
superpredictor: SUPER_PREDICTOR_RUBRIC,
|
|
74
|
+
informationtrader: INFORMATION_TRADER_RUBRIC,
|
|
75
|
+
};
|
|
76
|
+
/**
|
|
77
|
+
* Priority metrics for each archetype
|
|
78
|
+
*/
|
|
79
|
+
export const PRIORITY_METRICS = {
|
|
80
|
+
trader: TRADER_PRIORITY_METRICS,
|
|
81
|
+
"social-butterfly": SOCIAL_BUTTERFLY_PRIORITY_METRICS,
|
|
82
|
+
scammer: SCAMMER_PRIORITY_METRICS,
|
|
83
|
+
degen: DEGEN_PRIORITY_METRICS,
|
|
84
|
+
researcher: RESEARCHER_PRIORITY_METRICS,
|
|
85
|
+
"information-trader": INFORMATION_TRADER_PRIORITY_METRICS,
|
|
86
|
+
"goody-twoshoes": GOODY_TWOSHOES_PRIORITY_METRICS,
|
|
87
|
+
"ass-kisser": ASS_KISSER_PRIORITY_METRICS,
|
|
88
|
+
"perps-trader": PERPS_TRADER_PRIORITY_METRICS,
|
|
89
|
+
"super-predictor": SUPER_PREDICTOR_PRIORITY_METRICS,
|
|
90
|
+
infosec: INFOSEC_PRIORITY_METRICS,
|
|
91
|
+
liar: LIAR_PRIORITY_METRICS,
|
|
92
|
+
};
|
|
93
|
+
/**
|
|
94
|
+
* Valid archetype names (canonical names plus their hyphen-less aliases) for whitelist validation
|
|
95
|
+
* Derived from RUBRICS keys to maintain single source of truth
|
|
96
|
+
*/
|
|
97
|
+
export const VALID_ARCHETYPES = new Set(Object.keys(RUBRICS));
|
|
98
|
+
/**
 * Normalize an archetype string to canonical format (lowercase, hyphens).
 *
 * Surrounding whitespace is trimmed, the text is lowercased and every
 * underscore is replaced with a hyphen. Empty, whitespace-only, null,
 * undefined and non-string inputs all yield 'default'.
 *
 * Note: Does NOT validate against the whitelist - use sanitizeArchetype() for that.
 *
 * @param {string | null | undefined} archetype - Raw archetype name.
 * @returns {string} The normalized name, or 'default' when there is nothing to normalize.
 */
export function normalizeArchetype(archetype) {
    // Non-string values (numbers, objects from untyped JSON, ...) have no
    // trim()/toLowerCase(); treat them like a missing archetype instead of
    // throwing a TypeError.
    if (typeof archetype !== "string") {
        return "default";
    }
    const trimmed = archetype.trim();
    if (trimmed === "") {
        return "default";
    }
    return trimmed.toLowerCase().replace(/_/g, "-");
}
|
|
109
|
+
/**
 * Check whether an archetype is on the allowed whitelist.
 *
 * The name is normalized first; 'default' is always accepted, anything else
 * must be a member of VALID_ARCHETYPES. Intended as a guard against prompt
 * injection through malicious archetype strings.
 *
 * @param {string | null | undefined} archetype - Raw archetype name.
 * @returns {boolean} True when the normalized name is 'default' or whitelisted.
 */
export function isValidArchetype(archetype) {
    const key = normalizeArchetype(archetype);
    if (key === "default") {
        return true;
    }
    return VALID_ARCHETYPES.has(key);
}
|
|
117
|
+
/**
 * Sanitize an archetype for safe use in LLM prompts.
 *
 * @param {string | null | undefined} archetype - Raw archetype name.
 * @returns {string} The normalized archetype when it is whitelisted,
 *   'default' otherwise.
 */
export function sanitizeArchetype(archetype) {
    const key = normalizeArchetype(archetype);
    // A key of "default" needs no special case: whether or not it is in the
    // whitelist, the value handed back is "default".
    return VALID_ARCHETYPES.has(key) ? key : "default";
}
|
|
128
|
+
/**
 * Get the rubric for an archetype.
 *
 * The name is normalized with normalizeArchetype() and looked up among the
 * keys defined directly on RUBRICS (canonical names and aliases).
 *
 * @param {string | null | undefined} archetype - Raw archetype name.
 * @returns {string} The archetype's rubric, or DEFAULT_RUBRIC when the
 *   archetype has no (non-empty) rubric of its own.
 */
export function getRubric(archetype) {
    const normalized = normalizeArchetype(archetype);
    // Own-property check: a bare `RUBRICS[normalized]` lookup also resolves
    // names inherited from Object.prototype (e.g. "constructor") and would
    // hand back a non-string value instead of the default rubric.
    const isRegistered = Object.prototype.hasOwnProperty.call(RUBRICS, normalized);
    const rubric = isRegistered ? RUBRICS[normalized] : undefined;
    return rubric || DEFAULT_RUBRIC;
}
|
|
135
|
+
/**
 * Get priority metrics for an archetype.
 *
 * The name is normalized with normalizeArchetype(). PRIORITY_METRICS is keyed
 * by canonical (hyphenated) names only, so a hyphen-less alias such as
 * "socialbutterfly" is resolved to the canonical key whose hyphen-stripped
 * form matches it. This keeps the result in step with getRubric(), which
 * accepts those aliases.
 *
 * @param {string | null | undefined} archetype - Raw archetype name.
 * @returns {string[]} The archetype's metric paths, or
 *   DEFAULT_PRIORITY_METRICS when the archetype is unknown.
 */
export function getPriorityMetrics(archetype) {
    const normalized = normalizeArchetype(archetype);
    // Own-property check so inherited names (e.g. "constructor") are not
    // mistaken for registered archetypes.
    const isCanonical = Object.prototype.hasOwnProperty.call(PRIORITY_METRICS, normalized);
    const key = isCanonical
        ? normalized
        : Object.keys(PRIORITY_METRICS).find((name) => name.replace(/-/g, "") === normalized);
    const metrics = key === undefined ? undefined : PRIORITY_METRICS[key];
    return metrics || DEFAULT_PRIORITY_METRICS;
}
|
|
142
|
+
/**
 * Check if an archetype has a custom rubric.
 *
 * @param {string | null | undefined} archetype - Raw archetype name
 *   (normalized with normalizeArchetype() before the lookup).
 * @returns {boolean} True when RUBRICS itself defines an entry for the
 *   normalized name (canonical name or alias).
 */
export function hasCustomRubric(archetype) {
    const normalized = normalizeArchetype(archetype);
    // `in` would also report keys inherited from Object.prototype
    // (e.g. "constructor"); only entries defined on RUBRICS itself count.
    return Object.prototype.hasOwnProperty.call(RUBRICS, normalized);
}
|
|
149
|
+
/**
|
|
150
|
+
* Canonical archetype names (with hyphens, no aliases)
|
|
151
|
+
* Single source of truth - derived from PRIORITY_METRICS keys which only contains canonical names
|
|
152
|
+
*/
|
|
153
|
+
export const CANONICAL_ARCHETYPES = Object.keys(PRIORITY_METRICS);
|
|
154
|
+
/**
 * List every available archetype by its canonical name (aliases excluded).
 *
 * @returns {string[]} A fresh copy of CANONICAL_ARCHETYPES, so callers may
 *   mutate the result without affecting the shared list.
 */
export function getAvailableArchetypes() {
    return Array.from(CANONICAL_ARCHETYPES);
}
|
|
161
|
+
// Re-export individual rubrics
|
|
162
|
+
export { ASS_KISSER_RUBRIC, DEGEN_RUBRIC, GOODY_TWOSHOES_RUBRIC, INFORMATION_TRADER_RUBRIC, INFOSEC_RUBRIC, LIAR_RUBRIC, PERPS_TRADER_RUBRIC, RESEARCHER_RUBRIC, SCAMMER_RUBRIC, SOCIAL_BUTTERFLY_RUBRIC, SUPER_PREDICTOR_RUBRIC, TRADER_RUBRIC, };
|
|
163
|
+
/**
 * Rubrics version string.
 *
 * Increment it when rubrics change significantly; it is used for cache
 * invalidation. It is maintained by hand and is independent of the
 * content-derived hashes returned by getRubricHash() / getAllRubricsHash().
 */
export const RUBRICS_VERSION = "1.0.0";
|
|
168
|
+
/**
 * Compute a short fingerprint of the rubric that getRubric() returns for an
 * archetype. Used for cache invalidation when a specific rubric changes.
 *
 * @param archetype - Archetype name, forwarded unchanged to getRubric().
 * @returns The first 16 hex characters of the rubric's SHA-256 digest.
 */
export function getRubricHash(archetype) {
    const hasher = createHash("sha256");
    hasher.update(getRubric(archetype));
    const fullDigest = hasher.digest("hex");
    // Keep only a 16-character prefix of the 64-character hex digest.
    return fullDigest.substring(0, 16);
}
|
|
176
|
+
/**
 * Compute a short fingerprint over every rubric at once, for detecting that
 * any rubric changed.
 *
 * The RUBRICS values are sorted before hashing; per the original note this is
 * done to match the Python implementation for cross-language consistency.
 *
 * @returns The first 16 hex characters of the SHA-256 digest of the sorted
 *   rubric texts joined with "::", immediately followed by DEFAULT_RUBRIC.
 */
export function getAllRubricsHash() {
    const rubricTexts = Object.values(RUBRICS);
    rubricTexts.sort();
    // DEFAULT_RUBRIC is appended directly, with no "::" separator before it;
    // that layout is part of the hashed payload and must stay as is.
    const payload = rubricTexts.join("::") + DEFAULT_RUBRIC;
    const hasher = createHash("sha256");
    hasher.update(payload);
    return hasher.digest("hex").substring(0, 16);
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
 * Information Trader Archetype Evaluation Rubric
 *
 * Archetype summary: gathers intel through social channels and trades on the
 * resulting information advantage.
 *
 * The value is Markdown text addressed to an evaluator ("You are evaluating an
 * agent ..."). It defines four score bands (0.8-1.0, 0.6-0.8, 0.4-0.6,
 * 0.0-0.4), an ordered list of key metrics, the information-to-trade pipeline
 * to look for, scoring guidance, and common failure modes. The literal starts
 * and ends with a newline.
 *
 * NOTE(review): treat the text as runtime data, not commentary. Presumably it
 * is registered in the archetype rubric map whose contents are hashed for
 * cache invalidation, so any edit (including whitespace) would change those
 * hashes; confirm against the index module before editing.
 */
export const INFORMATION_TRADER_RUBRIC = `
## Information Trader Archetype Evaluation

You are evaluating an agent that combines social intelligence with trading, gathering information through conversations and relationships to gain trading edges.

### What Makes an Excellent Information Trader (0.8-1.0)
- **Social intelligence for trading**: Gathers info through DMs and group chats
- **Timing correlation**: Trades happen AFTER receiving information
- **Positive P&L from info edge**: Profits come from information advantage
- **Strategic networking**: Connects with informed sources
- **Information synthesis**: Combines social intel with market data
- **Balanced activity**: Active in both social and trading (ratio ~1.0)
- **Asks good questions**: Requests specific information

### What Makes a Good Information Trader (0.6-0.8)
- Active in group chats for market intel
- Some DM conversations with other traders
- Trading activity correlates with info received
- Reasonable P&L with evidence of info-driven trades
- Social to trade ratio between 0.5-1.5

### What Makes an Average Information Trader (0.4-0.6)
- Some social activity but not clearly for intel
- Trades don't clearly follow information received
- Either too social (not trading on info) or too trading-focused (not gathering info)
- Mixed results without clear information edge

### What Makes a Poor Information Trader (0.0-0.4)
- **No social intel gathering**: Trades blind
- **Pure social, no trading**: Gathers info but doesn't act on it
- **Pure trading, no social**: Misses information advantage
- **Bad timing**: Trades BEFORE gathering relevant info
- **Ignores information**: Has access but doesn't use it

### Key Metrics to Prioritize (in order)
1. **P&L** (must convert info to profit)
2. **Group Chats Joined** (information sources)
3. **DMs with users** (private intel channels)
4. **Social to Trade Ratio** (should be balanced ~0.8-1.2)
5. **Info Requests Sent** (actively seeking intel)
6. **Win Rate** (info should improve accuracy)

### The Information → Trade Pipeline
Look for this pattern:
1. Join group chat or start DM
2. Gather information (ask questions, observe)
3. Analyze/synthesize intel
4. Execute trade based on information
5. Profit from edge

If this pipeline is evident, score high. If trades are random or info gathering doesn't lead to trades, score low.

### Scoring Guidance
An information trader with $80 P&L who clearly gathered intel from 5 group chats before trading should score HIGHER than one with $150 P&L who just traded technically without social engagement.

The key question: Did they USE social connections for trading advantage?

### Common Failure Modes
- **The Socializer**: Lots of chat activity but never trades (wrong archetype)
- **The Lone Wolf**: Great trading but no social intel (wrong archetype)
- **The Bad Timer**: Gets info but trades too late/early
- **The Ignorer**: Receives intel but doesn't act on it

### Balance is Key
The information trader must balance both sides:
- Too much social, not enough trading = Social Butterfly, not Info Trader
- Too much trading, not enough social = Trader, not Info Trader
- Balance with info-to-trade pipeline = Excellent Info Trader
`;
|
|
75
|
+
/**
 * Priority metrics for the information-trader archetype, most important first.
 *
 * Each entry is a dotted "<group>.<metric>" key. The six entries line up, in
 * order, with the "Key Metrics to Prioritize" list in
 * INFORMATION_TRADER_RUBRIC: P&L, group chats joined, DMs, social-to-trade
 * ratio, info requests sent, win rate.
 *
 * NOTE(review): presumably these keys address fields of a metrics object
 * produced elsewhere in the package; confirm the key names against it.
 */
export const INFORMATION_TRADER_PRIORITY_METRICS = [
    "trading.totalPnL",
    "social.groupChatsJoined",
    "social.dmsInitiated",
    "behavior.socialToTradeRatio",
    "information.infoRequestsSent",
    "trading.winRate",
];
|