@holoscript/framework 6.0.3 → 6.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1 -2
- package/ROADMAP.md +68 -66
- package/dist/{InvisibleWallet-BB6tFvRA.d.cts → InvisibleWallet-EFiuaLn3.d.cts} +1 -1
- package/dist/{OrchestratorAgent-BvWgf9uw.d.cts → OrchestratorAgent-CrLDGNL6.d.cts} +1 -1
- package/dist/agents/index.cjs +11 -10
- package/dist/agents/index.d.cts +4 -16
- package/dist/ai/index.cjs +2 -2
- package/dist/behavior.cjs +10 -0
- package/dist/economy/index.cjs +4 -4
- package/dist/economy/index.d.cts +2 -2
- package/dist/index.cjs +33 -11
- package/dist/index.d.cts +3 -3
- package/dist/swarm/index.cjs +3 -0
- package/package.json +14 -9
- package/src/__tests__/bounty-marketplace.test.ts +53 -21
- package/src/__tests__/delegation.test.ts +1 -4
- package/src/__tests__/done-log-audit.test.ts +38 -46
- package/src/__tests__/framework.test.ts +172 -53
- package/src/__tests__/goal-synthesizer.test.ts +9 -6
- package/src/__tests__/presence.test.ts +1 -1
- package/src/__tests__/protocol-agent.test.ts +12 -11
- package/src/__tests__/revenue-splitter.test.ts +22 -15
- package/src/__tests__/scenario-driven-todo.test.ts +55 -35
- package/src/__tests__/self-improve.test.ts +28 -9
- package/src/__tests__/service-lifecycle.test.ts +9 -3
- package/src/__tests__/skill-router.test.ts +3 -3
- package/src/agents/CulturalMemory.ts +6 -6
- package/src/agents/DelegationTraceHooks.ts +560 -0
- package/src/agents/FederatedRegistryAdapter.ts +1 -1
- package/src/agents/NormEngine.ts +3 -8
- package/src/agents/OrchestratorAgent.ts +1 -1
- package/src/agents/TaskDelegationService.ts +5 -9
- package/src/agents/__tests__/AgentWalletRegistry.test.ts +5 -4
- package/src/agents/__tests__/CrossRealityHandoff.test.ts +9 -3
- package/src/agents/__tests__/DelegationTraceHooks.test.ts +390 -0
- package/src/agents/__tests__/TaskDelegationService.test.ts +4 -2
- package/src/agents/spatial-comms/Layer1RealTime.ts +36 -19
- package/src/agents/spatial-comms/Layer2A2A.ts +1 -3
- package/src/agents/spatial-comms/Layer3MCP.ts +13 -4
- package/src/agents/spatial-comms/ProtocolTypes.ts +5 -2
- package/src/agents/spatial-comms/examples/multi-agent-world-creation.ts +2 -2
- package/src/ai/HoloScriptGenerator.ts +2 -2
- package/src/ai/__tests__/PerceptionSystem.prod.test.ts +1 -1
- package/src/ai/__tests__/PerceptionSystem.test.ts +14 -14
- package/src/ai/__tests__/SteeringBehaviors.prod.test.ts +1 -1
- package/src/ai/index.ts +5 -1
- package/src/board/audit.ts +17 -6
- package/src/board/board-ops.ts +45 -15
- package/src/board/board-types.ts +94 -20
- package/src/delegation.ts +5 -3
- package/src/distributed-claimer.ts +13 -2
- package/src/economy/BountyManager.ts +40 -18
- package/src/economy/KnowledgeMarketplace.ts +27 -8
- package/src/economy/PaymentWebhookService.ts +0 -1
- package/src/economy/RevenueSplitter.ts +2 -4
- package/src/economy/UnifiedBudgetOptimizer.ts +8 -9
- package/src/economy/_core-stubs.ts +1 -1
- package/src/economy/x402-facilitator.ts +17 -8
- package/src/index.ts +16 -12
- package/src/knowledge/__tests__/knowledge-consolidator.test.ts +138 -89
- package/src/knowledge/__tests__/knowledge-store-vector.test.ts +59 -16
- package/src/knowledge/brain.ts +7 -7
- package/src/knowledge/consolidation.ts +16 -16
- package/src/knowledge/knowledge-consolidator.ts +60 -30
- package/src/knowledge/knowledge-store.ts +83 -45
- package/src/learning/ProceduralCompiler.ts +6 -1
- package/src/learning/learning/MemoryConsolidator.ts +102 -0
- package/src/learning/learning/MemoryScorer.ts +69 -0
- package/src/learning/learning/ProceduralCompiler.ts +45 -0
- package/src/learning/learning/SemanticClusterer.ts +66 -0
- package/src/llm/llm-adapter.ts +24 -10
- package/src/mesh/index.ts +37 -17
- package/src/protocol/goal-synthesizer.ts +24 -34
- package/src/protocol/implementations.ts +91 -22
- package/src/protocol/micro-phase-decomposer.ts +25 -17
- package/src/protocol/micro-step-decomposer.test.ts +104 -39
- package/src/protocol-agent.test.ts +17 -7
- package/src/protocol-agent.ts +45 -42
- package/src/self-improve/absorb-scanner.ts +9 -6
- package/src/self-improve/evolution-engine.ts +36 -18
- package/src/self-improve/framework-absorber.ts +21 -16
- package/src/self-improve/index.ts +2 -10
- package/src/self-improve/prompt-optimizer.ts +31 -19
- package/src/self-improve/test-generator.ts +16 -12
- package/src/skill-router.ts +7 -6
- package/src/swarm/messaging/GossipProtocol.ts +1 -1
- package/src/swarm/messaging/__tests__/BroadcastChannel.prod.test.ts +31 -9
- package/src/swarm/messaging/__tests__/GossipProtocol.prod.test.ts +21 -7
- package/src/swarm/messaging/__tests__/SwarmEventBus.prod.test.ts +24 -8
- package/src/swarm/messaging/__tests__/SwarmEventBus.test.ts +6 -2
- package/src/team.ts +277 -122
- package/src/training/scripts/generate-spatial-dataset.ts +1 -1
- package/src/training/training/LRScheduler.ts +377 -0
- package/src/training/training/QualityScoringPipeline.ts +139 -0
- package/src/training/training/SoftDedup.ts +461 -0
- package/src/training/training/SparsityMonitor.ts +685 -0
- package/src/training/training/SparsityMonitorTypes.ts +209 -0
- package/src/training/training/SpatialTrainingDataGenerator.ts +1526 -0
- package/src/training/training/SpatialTrainingDataTypes.ts +216 -0
- package/src/training/training/TrainingPipelineConfig.ts +215 -0
- package/src/training/training/__tests__/CorpusValidation.test.ts +87 -0
- package/src/training/training/__tests__/LRScheduler.test.ts +592 -0
- package/src/training/training/__tests__/SoftDedup.test.ts +415 -0
- package/src/training/training/__tests__/SparsityMonitor.test.ts +1623 -0
- package/src/training/training/__tests__/SpatialCorpusValidation.test.ts +72 -0
- package/src/training/training/__tests__/SpatialTrainingDataGenerator.test.ts +1244 -0
- package/src/training/training/__tests__/TrainingMonkeyIntegration.test.ts +897 -0
- package/src/training/training/__tests__/TrainingPipelineConfig.test.ts +202 -0
- package/src/training/training/__tests__/schema.test.ts +72 -0
- package/src/training/training/__tests__/training-constants.test.ts +106 -0
- package/src/training/training/__tests__/trait-mappings.test.ts +81 -0
- package/src/training/training/constants.ts +94 -0
- package/src/training/training/index.ts +17 -0
- package/src/training/training/schema.ts +147 -0
- package/src/training/training/scripts/generate-novel-use-cases-dataset.ts +272 -0
- package/src/training/training/scripts/generate-spatial-dataset.ts +521 -0
- package/src/training/training/trainingmonkey/TrainingMonkeyIntegration.ts +477 -0
- package/src/training/training/trainingmonkey/TrainingMonkeyTypes.ts +230 -0
- package/src/training/training/trainingmonkey/index.ts +26 -0
- package/src/training/training/trait-mappings.ts +157 -0
- package/src/types.ts +2 -7
- package/ALL-test-results.json +0 -1
- package/LICENSE +0 -21
- package/dist/AgentManifest-CB4xM-Ma.d.ts +0 -704
- package/dist/BehaviorTree-BrBFECv5.d.ts +0 -103
- package/dist/InvisibleWallet-rtRrBOA8.d.ts +0 -1732
- package/dist/OrchestratorAgent-Q_CbVTmO.d.ts +0 -798
- package/dist/agents/index.d.ts +0 -1788
- package/dist/agents/index.js +0 -4695
- package/dist/ai/index.d.ts +0 -1753
- package/dist/ai/index.js +0 -5244
- package/dist/behavior.d.ts +0 -130
- package/dist/behavior.js +0 -407
- package/dist/economy/index.d.ts +0 -747
- package/dist/economy/index.js +0 -3617
- package/dist/implementations-D9T3un9D.d.ts +0 -236
- package/dist/index.d.ts +0 -1729
- package/dist/index.js +0 -24277
- package/dist/learning/index.d.ts +0 -104
- package/dist/learning/index.js +0 -189
- package/dist/negotiation/index.d.ts +0 -610
- package/dist/negotiation/index.js +0 -931
- package/dist/skills/index.d.ts +0 -289
- package/dist/skills/index.js +0 -1079
- package/dist/swarm/index.d.ts +0 -2433
- package/dist/swarm/index.js +0 -5221
- package/dist/training/index.d.ts +0 -1734
- package/dist/training/index.js +0 -2687
- package/extract-failures.js +0 -10
- package/src/training/training/data/novel-use-cases.jsonl +0 -153
- package/src/training/training/data/spatial-reasoning-10k.jsonl +0 -9354
- package/src/types/core-stubs.d.ts +0 -113
- package/test-output.txt +0 -0
- package/test-result.json +0 -1
- package/tsc-errors.txt +0 -4
- package/tsc_output.txt +0 -0
- package/typescript-errors-2.txt +0 -0
- package/typescript-errors.txt +0 -22
- package/vitest-log-utf8.txt +0 -268
- package/vitest-log.txt +0 -0
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { SoftDedup, createSoftDedup, DEFAULT_SOFTDEDUP_CONFIG } from '../SoftDedup';
|
|
3
|
+
import type { SoftDedupConfig, SoftDedupResult } from '../SoftDedup';
|
|
4
|
+
|
|
5
|
+
// =============================================================================
|
|
6
|
+
// TEST DATA
|
|
7
|
+
// =============================================================================
|
|
8
|
+
|
|
9
|
+
const UNIQUE_EXAMPLES = [
|
|
10
|
+
'composition MyScene { orb Player { Grabbable {} Physics { mass: 10 } } }',
|
|
11
|
+
'world Arena { orb Enemy { Animation { clip: "attack" duration: 2.0 } } }',
|
|
12
|
+
'composition Garden { orb Tree { GaussianSplat { resolution: 512 } } }',
|
|
13
|
+
'world Ocean { orb Fish { NPC { behavior: "patrol" speed: 3.0 } } }',
|
|
14
|
+
'composition Castle { orb Knight { Tradeable { value: 100 } } }',
|
|
15
|
+
];
|
|
16
|
+
|
|
17
|
+
const DUPLICATE_HEAVY_EXAMPLES = [
|
|
18
|
+
'composition Scene { orb A { Grabbable {} } }',
|
|
19
|
+
'composition Scene { orb B { Grabbable {} } }',
|
|
20
|
+
'composition Scene { orb C { Grabbable {} } }',
|
|
21
|
+
'composition Scene { orb D { Grabbable {} } }',
|
|
22
|
+
'composition Scene { orb E { Grabbable {} } }',
|
|
23
|
+
'composition Scene { orb F { Grabbable {} } }',
|
|
24
|
+
'composition Scene { orb G { Grabbable {} } }',
|
|
25
|
+
'composition Scene { orb H { Grabbable {} } }',
|
|
26
|
+
'world UniqueWorld { orb Special { Physics { mass: 999 gravity: true } } }',
|
|
27
|
+
];
|
|
28
|
+
|
|
29
|
+
// =============================================================================
|
|
30
|
+
// TESTS
|
|
31
|
+
// =============================================================================
|
|
32
|
+
|
|
33
|
+
describe('SoftDedup', () => {
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// CONSTRUCTION & CONFIGURATION
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
describe('constructor', () => {
|
|
39
|
+
it('uses default config when no overrides provided', () => {
|
|
40
|
+
const dedup = new SoftDedup();
|
|
41
|
+
const config = dedup.getConfig();
|
|
42
|
+
expect(config).toEqual(DEFAULT_SOFTDEDUP_CONFIG);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it('merges partial config with defaults', () => {
|
|
46
|
+
const dedup = new SoftDedup({ temperature: 0.5, wordLevel: true });
|
|
47
|
+
const config = dedup.getConfig();
|
|
48
|
+
expect(config.temperature).toBe(0.5);
|
|
49
|
+
expect(config.wordLevel).toBe(true);
|
|
50
|
+
expect(config.minWeight).toBe(DEFAULT_SOFTDEDUP_CONFIG.minWeight);
|
|
51
|
+
expect(config.ngramSizes).toEqual(DEFAULT_SOFTDEDUP_CONFIG.ngramSizes);
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
it('throws on invalid minWeight (<= 0)', () => {
|
|
55
|
+
expect(() => new SoftDedup({ minWeight: 0 })).toThrow('minWeight');
|
|
56
|
+
expect(() => new SoftDedup({ minWeight: -0.5 })).toThrow('minWeight');
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
it('throws on invalid minWeight (> 1)', () => {
|
|
60
|
+
expect(() => new SoftDedup({ minWeight: 1.5 })).toThrow('minWeight');
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
it('throws on invalid maxWeight (< minWeight)', () => {
|
|
64
|
+
expect(() => new SoftDedup({ minWeight: 0.5, maxWeight: 0.3 })).toThrow('maxWeight');
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('throws on invalid maxWeight (> 1)', () => {
|
|
68
|
+
expect(() => new SoftDedup({ maxWeight: 1.5 })).toThrow('maxWeight');
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
it('throws on invalid temperature (<= 0)', () => {
|
|
72
|
+
expect(() => new SoftDedup({ temperature: 0 })).toThrow('temperature');
|
|
73
|
+
expect(() => new SoftDedup({ temperature: -1 })).toThrow('temperature');
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
it('throws on invalid commonThresholdPercentile', () => {
|
|
77
|
+
expect(() => new SoftDedup({ commonThresholdPercentile: -0.1 })).toThrow(
|
|
78
|
+
'commonThresholdPercentile'
|
|
79
|
+
);
|
|
80
|
+
expect(() => new SoftDedup({ commonThresholdPercentile: 1.5 })).toThrow(
|
|
81
|
+
'commonThresholdPercentile'
|
|
82
|
+
);
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
it('throws on empty ngramSizes', () => {
|
|
86
|
+
expect(() => new SoftDedup({ ngramSizes: [] })).toThrow('ngramSizes');
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
it('throws on non-integer ngramSizes', () => {
|
|
90
|
+
expect(() => new SoftDedup({ ngramSizes: [2.5] })).toThrow('positive integer');
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
it('throws on zero ngramSize', () => {
|
|
94
|
+
expect(() => new SoftDedup({ ngramSizes: [0] })).toThrow('positive integer');
|
|
95
|
+
});
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
// EDGE CASES
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
|
|
102
|
+
describe('edge cases', () => {
|
|
103
|
+
it('returns empty array for empty dataset', () => {
|
|
104
|
+
const dedup = new SoftDedup();
|
|
105
|
+
const results = dedup.process([]);
|
|
106
|
+
expect(results).toEqual([]);
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
it('returns max weight for single example', () => {
|
|
110
|
+
const dedup = new SoftDedup();
|
|
111
|
+
const results = dedup.process(['hello world']);
|
|
112
|
+
expect(results).toHaveLength(1);
|
|
113
|
+
expect(results[0].samplingWeight).toBe(1.0);
|
|
114
|
+
expect(results[0].commonnessScore).toBe(0);
|
|
115
|
+
expect(results[0].index).toBe(0);
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
it('handles empty string examples', () => {
|
|
119
|
+
const dedup = new SoftDedup();
|
|
120
|
+
const results = dedup.process(['', '']);
|
|
121
|
+
expect(results).toHaveLength(2);
|
|
122
|
+
// Empty strings produce no n-grams -> max weight
|
|
123
|
+
for (const r of results) {
|
|
124
|
+
expect(r.samplingWeight).toBe(1.0);
|
|
125
|
+
expect(r.ngramStats.totalNgrams).toBe(0);
|
|
126
|
+
}
|
|
127
|
+
});
|
|
128
|
+
|
|
129
|
+
it('handles very short strings (shorter than min n-gram size)', () => {
|
|
130
|
+
const dedup = new SoftDedup({ ngramSizes: [5] });
|
|
131
|
+
const results = dedup.process(['ab', 'cd']);
|
|
132
|
+
expect(results).toHaveLength(2);
|
|
133
|
+
// Strings shorter than n=5 produce no n-grams
|
|
134
|
+
for (const r of results) {
|
|
135
|
+
expect(r.ngramStats.totalNgrams).toBe(0);
|
|
136
|
+
expect(r.samplingWeight).toBe(1.0);
|
|
137
|
+
}
|
|
138
|
+
});
|
|
139
|
+
|
|
140
|
+
it('handles identical examples (maximum commonness)', () => {
|
|
141
|
+
const text = 'composition Scene { orb Player { Grabbable {} } }';
|
|
142
|
+
const dedup = new SoftDedup();
|
|
143
|
+
const results = dedup.process([text, text, text, text, text]);
|
|
144
|
+
expect(results).toHaveLength(5);
|
|
145
|
+
|
|
146
|
+
// All identical -> all should have the same (low) weight
|
|
147
|
+
const weights = results.map((r) => r.samplingWeight);
|
|
148
|
+
expect(new Set(weights).size).toBe(1); // All same weight
|
|
149
|
+
});
|
|
150
|
+
|
|
151
|
+
it('handles whitespace-only examples', () => {
  const dedup = new SoftDedup({ ngramSizes: [3] });
  const results = dedup.process([' ', ' ']);
  expect(results).toHaveLength(2);

  // The two inputs are byte-identical, so they must be scored identically
  // regardless of whether the whitespace yields any n-grams at this size.
  // Previously this test only checked the result length, so a regression in
  // whitespace handling would have gone unnoticed.
  expect(results[0].samplingWeight).toBe(results[1].samplingWeight);
  expect(results[0].commonnessScore).toBe(results[1].commonnessScore);
});
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
// ---------------------------------------------------------------------------
|
|
160
|
+
// CORE FUNCTIONALITY
|
|
161
|
+
// ---------------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
describe('process', () => {
|
|
164
|
+
it('assigns higher weights to unique examples', () => {
|
|
165
|
+
const dedup = new SoftDedup();
|
|
166
|
+
const results = dedup.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
167
|
+
|
|
168
|
+
// The unique world example (last one) should have a higher weight
|
|
169
|
+
// than the template-based ones
|
|
170
|
+
const templateWeights = results.slice(0, -1).map((r) => r.samplingWeight);
|
|
171
|
+
const uniqueWeight = results[results.length - 1].samplingWeight;
|
|
172
|
+
|
|
173
|
+
const avgTemplateWeight = templateWeights.reduce((a, b) => a + b, 0) / templateWeights.length;
|
|
174
|
+
|
|
175
|
+
expect(uniqueWeight).toBeGreaterThanOrEqual(avgTemplateWeight);
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it('produces weights in [minWeight, maxWeight] range', () => {
|
|
179
|
+
const dedup = new SoftDedup({ minWeight: 0.2, maxWeight: 0.9 });
|
|
180
|
+
const results = dedup.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
181
|
+
|
|
182
|
+
for (const r of results) {
|
|
183
|
+
expect(r.samplingWeight).toBeGreaterThanOrEqual(0.2);
|
|
184
|
+
expect(r.samplingWeight).toBeLessThanOrEqual(0.9);
|
|
185
|
+
}
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
it('preserves correct indices', () => {
|
|
189
|
+
const dedup = new SoftDedup();
|
|
190
|
+
const results = dedup.process(UNIQUE_EXAMPLES);
|
|
191
|
+
|
|
192
|
+
for (let i = 0; i < results.length; i++) {
|
|
193
|
+
expect(results[i].index).toBe(i);
|
|
194
|
+
}
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
it('commonness scores are in [0, 1] range', () => {
|
|
198
|
+
const dedup = new SoftDedup();
|
|
199
|
+
const results = dedup.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
200
|
+
|
|
201
|
+
for (const r of results) {
|
|
202
|
+
expect(r.commonnessScore).toBeGreaterThanOrEqual(0);
|
|
203
|
+
expect(r.commonnessScore).toBeLessThanOrEqual(1);
|
|
204
|
+
}
|
|
205
|
+
});
|
|
206
|
+
|
|
207
|
+
it('n-gram stats are consistent', () => {
|
|
208
|
+
const dedup = new SoftDedup();
|
|
209
|
+
const results = dedup.process(UNIQUE_EXAMPLES);
|
|
210
|
+
|
|
211
|
+
for (const r of results) {
|
|
212
|
+
expect(r.ngramStats.commonNgrams).toBeLessThanOrEqual(r.ngramStats.totalNgrams);
|
|
213
|
+
expect(r.ngramStats.commonRatio).toBeGreaterThanOrEqual(0);
|
|
214
|
+
expect(r.ngramStats.commonRatio).toBeLessThanOrEqual(1);
|
|
215
|
+
|
|
216
|
+
if (r.ngramStats.totalNgrams > 0) {
|
|
217
|
+
expect(r.ngramStats.commonRatio).toBeCloseTo(
|
|
218
|
+
r.ngramStats.commonNgrams / r.ngramStats.totalNgrams,
|
|
219
|
+
10
|
|
220
|
+
);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
});
|
|
224
|
+
});
|
|
225
|
+
|
|
226
|
+
// ---------------------------------------------------------------------------
|
|
227
|
+
// WORD-LEVEL N-GRAMS
|
|
228
|
+
// ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
describe('word-level n-grams', () => {
|
|
231
|
+
it('supports word-level tokenization', () => {
|
|
232
|
+
const dedup = new SoftDedup({
|
|
233
|
+
wordLevel: true,
|
|
234
|
+
ngramSizes: [2, 3],
|
|
235
|
+
});
|
|
236
|
+
|
|
237
|
+
const results = dedup.process([
|
|
238
|
+
'composition Scene orb Player Grabbable',
|
|
239
|
+
'composition Scene orb Player Grabbable',
|
|
240
|
+
'world Arena orb Enemy Physics mass gravity',
|
|
241
|
+
]);
|
|
242
|
+
|
|
243
|
+
expect(results).toHaveLength(3);
|
|
244
|
+
// Word-level should still detect duplicates
|
|
245
|
+
expect(results[0].samplingWeight).toBe(results[1].samplingWeight);
|
|
246
|
+
});
|
|
247
|
+
|
|
248
|
+
it('handles single-word examples with word-level n-grams', () => {
|
|
249
|
+
const dedup = new SoftDedup({
|
|
250
|
+
wordLevel: true,
|
|
251
|
+
ngramSizes: [2],
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
const results = dedup.process(['hello', 'world']);
|
|
255
|
+
// Single words can't form bigrams -> no n-grams -> max weight
|
|
256
|
+
for (const r of results) {
|
|
257
|
+
expect(r.ngramStats.totalNgrams).toBe(0);
|
|
258
|
+
expect(r.samplingWeight).toBe(1.0);
|
|
259
|
+
}
|
|
260
|
+
});
|
|
261
|
+
});
|
|
262
|
+
|
|
263
|
+
// ---------------------------------------------------------------------------
|
|
264
|
+
// TEMPERATURE SCALING
|
|
265
|
+
// ---------------------------------------------------------------------------
|
|
266
|
+
|
|
267
|
+
describe('temperature scaling', () => {
|
|
268
|
+
it('lower temperature produces more extreme weights', () => {
|
|
269
|
+
const lowTemp = new SoftDedup({ temperature: 0.3 });
|
|
270
|
+
const highTemp = new SoftDedup({ temperature: 2.0 });
|
|
271
|
+
|
|
272
|
+
const lowResults = lowTemp.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
273
|
+
const highResults = highTemp.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
274
|
+
|
|
275
|
+
// Low temperature should have larger weight variance
|
|
276
|
+
const lowWeights = lowResults.map((r) => r.samplingWeight);
|
|
277
|
+
const highWeights = highResults.map((r) => r.samplingWeight);
|
|
278
|
+
|
|
279
|
+
const lowVariance = computeVariance(lowWeights);
|
|
280
|
+
const highVariance = computeVariance(highWeights);
|
|
281
|
+
|
|
282
|
+
// Low temperature should produce more spread-out weights
|
|
283
|
+
// (higher variance or at least not lower)
|
|
284
|
+
expect(lowVariance).toBeGreaterThanOrEqual(highVariance - 0.01);
|
|
285
|
+
});
|
|
286
|
+
});
|
|
287
|
+
|
|
288
|
+
// ---------------------------------------------------------------------------
|
|
289
|
+
// STATISTICS
|
|
290
|
+
// ---------------------------------------------------------------------------
|
|
291
|
+
|
|
292
|
+
describe('computeStats', () => {
|
|
293
|
+
it('returns zero stats for empty results', () => {
|
|
294
|
+
const dedup = new SoftDedup();
|
|
295
|
+
const stats = dedup.computeStats([]);
|
|
296
|
+
|
|
297
|
+
expect(stats.totalExamples).toBe(0);
|
|
298
|
+
expect(stats.meanWeight).toBe(0);
|
|
299
|
+
expect(stats.medianWeight).toBe(0);
|
|
300
|
+
expect(stats.effectiveDatasetSize).toBe(0);
|
|
301
|
+
expect(stats.reductionRatio).toBe(0);
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
it('computes correct stats for uniform weights', () => {
|
|
305
|
+
const dedup = new SoftDedup();
|
|
306
|
+
const results = dedup.process(UNIQUE_EXAMPLES);
|
|
307
|
+
const stats = dedup.computeStats(results);
|
|
308
|
+
|
|
309
|
+
expect(stats.totalExamples).toBe(5);
|
|
310
|
+
expect(stats.meanWeight).toBeGreaterThan(0);
|
|
311
|
+
expect(stats.meanWeight).toBeLessThanOrEqual(1);
|
|
312
|
+
expect(stats.effectiveDatasetSize).toBeLessThanOrEqual(5);
|
|
313
|
+
expect(stats.reductionRatio).toBeGreaterThanOrEqual(0);
|
|
314
|
+
expect(stats.reductionRatio).toBeLessThanOrEqual(1);
|
|
315
|
+
});
|
|
316
|
+
|
|
317
|
+
it('reports positive reduction ratio for duplicate-heavy datasets', () => {
|
|
318
|
+
const dedup = new SoftDedup();
|
|
319
|
+
const results = dedup.process(DUPLICATE_HEAVY_EXAMPLES);
|
|
320
|
+
const stats = dedup.computeStats(results);
|
|
321
|
+
|
|
322
|
+
// With duplicates, effective size should be less than total
|
|
323
|
+
expect(stats.effectiveDatasetSize).toBeLessThanOrEqual(stats.totalExamples);
|
|
324
|
+
expect(stats.reductionRatio).toBeGreaterThanOrEqual(0);
|
|
325
|
+
});
|
|
326
|
+
|
|
327
|
+
it('computes correct median for even-length arrays', () => {
|
|
328
|
+
const dedup = new SoftDedup();
|
|
329
|
+
const results: SoftDedupResult[] = [
|
|
330
|
+
{
|
|
331
|
+
index: 0,
|
|
332
|
+
commonnessScore: 0,
|
|
333
|
+
samplingWeight: 0.2,
|
|
334
|
+
ngramStats: { totalNgrams: 10, commonNgrams: 0, commonRatio: 0 },
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
index: 1,
|
|
338
|
+
commonnessScore: 0,
|
|
339
|
+
samplingWeight: 0.8,
|
|
340
|
+
ngramStats: { totalNgrams: 10, commonNgrams: 0, commonRatio: 0 },
|
|
341
|
+
},
|
|
342
|
+
];
|
|
343
|
+
const stats = dedup.computeStats(results);
|
|
344
|
+
expect(stats.medianWeight).toBe(0.5); // (0.2 + 0.8) / 2
|
|
345
|
+
});
|
|
346
|
+
|
|
347
|
+
it('computes correct median for odd-length arrays', () => {
|
|
348
|
+
const dedup = new SoftDedup();
|
|
349
|
+
const results: SoftDedupResult[] = [
|
|
350
|
+
{
|
|
351
|
+
index: 0,
|
|
352
|
+
commonnessScore: 0,
|
|
353
|
+
samplingWeight: 0.2,
|
|
354
|
+
ngramStats: { totalNgrams: 10, commonNgrams: 0, commonRatio: 0 },
|
|
355
|
+
},
|
|
356
|
+
{
|
|
357
|
+
index: 1,
|
|
358
|
+
commonnessScore: 0,
|
|
359
|
+
samplingWeight: 0.5,
|
|
360
|
+
ngramStats: { totalNgrams: 10, commonNgrams: 0, commonRatio: 0 },
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
index: 2,
|
|
364
|
+
commonnessScore: 0,
|
|
365
|
+
samplingWeight: 0.9,
|
|
366
|
+
ngramStats: { totalNgrams: 10, commonNgrams: 0, commonRatio: 0 },
|
|
367
|
+
},
|
|
368
|
+
];
|
|
369
|
+
const stats = dedup.computeStats(results);
|
|
370
|
+
expect(stats.medianWeight).toBe(0.5);
|
|
371
|
+
});
|
|
372
|
+
});
|
|
373
|
+
|
|
374
|
+
// ---------------------------------------------------------------------------
|
|
375
|
+
// FACTORY FUNCTION
|
|
376
|
+
// ---------------------------------------------------------------------------
|
|
377
|
+
|
|
378
|
+
describe('createSoftDedup', () => {
|
|
379
|
+
it('creates a SoftDedup instance with defaults', () => {
|
|
380
|
+
const dedup = createSoftDedup();
|
|
381
|
+
expect(dedup).toBeInstanceOf(SoftDedup);
|
|
382
|
+
expect(dedup.getConfig()).toEqual(DEFAULT_SOFTDEDUP_CONFIG);
|
|
383
|
+
});
|
|
384
|
+
|
|
385
|
+
it('creates a SoftDedup instance with overrides', () => {
|
|
386
|
+
const dedup = createSoftDedup({ temperature: 2.0 });
|
|
387
|
+
expect(dedup.getConfig().temperature).toBe(2.0);
|
|
388
|
+
});
|
|
389
|
+
});
|
|
390
|
+
|
|
391
|
+
// ---------------------------------------------------------------------------
|
|
392
|
+
// DEFAULT CONFIG
|
|
393
|
+
// ---------------------------------------------------------------------------
|
|
394
|
+
|
|
395
|
+
describe('DEFAULT_SOFTDEDUP_CONFIG', () => {
|
|
396
|
+
it('has expected default values', () => {
|
|
397
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.ngramSizes).toEqual([3, 5, 7]);
|
|
398
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.wordLevel).toBe(false);
|
|
399
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.minWeight).toBe(0.1);
|
|
400
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.maxWeight).toBe(1.0);
|
|
401
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.temperature).toBe(1.0);
|
|
402
|
+
expect(DEFAULT_SOFTDEDUP_CONFIG.commonThresholdPercentile).toBe(0.7);
|
|
403
|
+
});
|
|
404
|
+
});
|
|
405
|
+
});
|
|
406
|
+
|
|
407
|
+
// =============================================================================
|
|
408
|
+
// HELPERS
|
|
409
|
+
// =============================================================================
|
|
410
|
+
|
|
411
|
+
/**
 * Population variance of a list of numbers.
 *
 * Divides the summed squared deviations by `values.length` (not `length - 1`),
 * so a single-element list has variance 0.
 *
 * @param values - Numbers to measure; the array is not modified.
 * @returns The population variance, or 0 when `values` is empty.
 */
function computeVariance(values: readonly number[]): number {
  const count = values.length;
  // Guard the empty case explicitly: 0 / 0 would otherwise yield NaN.
  if (count === 0) return 0;

  const mean = values.reduce((sum, v) => sum + v, 0) / count;
  const squaredDeviationSum = values.reduce((acc, v) => acc + (v - mean) ** 2, 0);
  return squaredDeviationSum / count;
}
|