@holoscript/framework 6.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ALL-test-results.json +1 -0
- package/CHANGELOG.md +8 -0
- package/LICENSE +21 -0
- package/ROADMAP.md +175 -0
- package/dist/AgentManifest-CB4xM-Ma.d.cts +704 -0
- package/dist/AgentManifest-CB4xM-Ma.d.ts +704 -0
- package/dist/BehaviorTree-BrBFECv5.d.cts +103 -0
- package/dist/BehaviorTree-BrBFECv5.d.ts +103 -0
- package/dist/InvisibleWallet-BB6tFvRA.d.cts +1732 -0
- package/dist/InvisibleWallet-rtRrBOA8.d.ts +1732 -0
- package/dist/OrchestratorAgent-BvWgf9uw.d.cts +798 -0
- package/dist/OrchestratorAgent-Q_CbVTmO.d.ts +798 -0
- package/dist/agents/index.cjs +4790 -0
- package/dist/agents/index.d.cts +1788 -0
- package/dist/agents/index.d.ts +1788 -0
- package/dist/agents/index.js +4695 -0
- package/dist/ai/index.cjs +5347 -0
- package/dist/ai/index.d.cts +1753 -0
- package/dist/ai/index.d.ts +1753 -0
- package/dist/ai/index.js +5244 -0
- package/dist/behavior.cjs +449 -0
- package/dist/behavior.d.cts +130 -0
- package/dist/behavior.d.ts +130 -0
- package/dist/behavior.js +407 -0
- package/dist/economy/index.cjs +3659 -0
- package/dist/economy/index.d.cts +747 -0
- package/dist/economy/index.d.ts +747 -0
- package/dist/economy/index.js +3617 -0
- package/dist/implementations-D9T3un9D.d.cts +236 -0
- package/dist/implementations-D9T3un9D.d.ts +236 -0
- package/dist/index.cjs +24550 -0
- package/dist/index.d.cts +1729 -0
- package/dist/index.d.ts +1729 -0
- package/dist/index.js +24277 -0
- package/dist/learning/index.cjs +219 -0
- package/dist/learning/index.d.cts +104 -0
- package/dist/learning/index.d.ts +104 -0
- package/dist/learning/index.js +189 -0
- package/dist/negotiation/index.cjs +970 -0
- package/dist/negotiation/index.d.cts +610 -0
- package/dist/negotiation/index.d.ts +610 -0
- package/dist/negotiation/index.js +931 -0
- package/dist/skills/index.cjs +1118 -0
- package/dist/skills/index.d.cts +289 -0
- package/dist/skills/index.d.ts +289 -0
- package/dist/skills/index.js +1079 -0
- package/dist/swarm/index.cjs +5268 -0
- package/dist/swarm/index.d.cts +2433 -0
- package/dist/swarm/index.d.ts +2433 -0
- package/dist/swarm/index.js +5221 -0
- package/dist/training/index.cjs +2745 -0
- package/dist/training/index.d.cts +1734 -0
- package/dist/training/index.d.ts +1734 -0
- package/dist/training/index.js +2687 -0
- package/extract-failures.js +10 -0
- package/package.json +82 -0
- package/src/__tests__/bounty-marketplace.test.ts +374 -0
- package/src/__tests__/delegation.test.ts +144 -0
- package/src/__tests__/distributed-claimer.test.ts +147 -0
- package/src/__tests__/done-log-audit.test.ts +342 -0
- package/src/__tests__/framework.test.ts +865 -0
- package/src/__tests__/goal-synthesizer.test.ts +236 -0
- package/src/__tests__/presence.test.ts +223 -0
- package/src/__tests__/protocol-agent.test.ts +254 -0
- package/src/__tests__/revenue-splitter.test.ts +114 -0
- package/src/__tests__/scenario-driven-todo.test.ts +197 -0
- package/src/__tests__/self-improve.test.ts +349 -0
- package/src/__tests__/service-lifecycle.test.ts +237 -0
- package/src/__tests__/skill-router.test.ts +121 -0
- package/src/agents/AgentManifest.ts +493 -0
- package/src/agents/AgentRegistry.ts +475 -0
- package/src/agents/AgentTypes.ts +585 -0
- package/src/agents/AgentWalletRegistry.ts +83 -0
- package/src/agents/AuthenticatedCRDT.ts +388 -0
- package/src/agents/CapabilityMatcher.ts +453 -0
- package/src/agents/CrossRealityHandoff.ts +305 -0
- package/src/agents/CulturalMemory.ts +454 -0
- package/src/agents/FederatedRegistryAdapter.ts +429 -0
- package/src/agents/NormEngine.ts +450 -0
- package/src/agents/OrchestratorAgent.ts +414 -0
- package/src/agents/SkillWorkflowEngine.ts +472 -0
- package/src/agents/TaskDelegationService.ts +551 -0
- package/src/agents/__tests__/AgentManifest.prod.test.ts +134 -0
- package/src/agents/__tests__/AgentManifest.test.ts +182 -0
- package/src/agents/__tests__/AgentModule.test.ts +864 -0
- package/src/agents/__tests__/AgentRegistry.prod.test.ts +125 -0
- package/src/agents/__tests__/AgentRegistry.test.ts +148 -0
- package/src/agents/__tests__/AgentTypes.test.ts +534 -0
- package/src/agents/__tests__/AgentWalletRegistry.test.ts +152 -0
- package/src/agents/__tests__/AuthenticatedCRDT.test.ts +558 -0
- package/src/agents/__tests__/CapabilityMatcher.prod.test.ts +117 -0
- package/src/agents/__tests__/CapabilityMatcher.test.ts +178 -0
- package/src/agents/__tests__/CrossRealityHandoff.test.ts +402 -0
- package/src/agents/__tests__/CulturalMemory.test.ts +200 -0
- package/src/agents/__tests__/FederatedRegistryAdapter.test.ts +409 -0
- package/src/agents/__tests__/NormEngine.test.ts +276 -0
- package/src/agents/__tests__/OrchestratorAgent.test.ts +182 -0
- package/src/agents/__tests__/SkillWorkflowEngine.test.ts +357 -0
- package/src/agents/__tests__/TaskDelegationService.test.ts +446 -0
- package/src/agents/index.ts +107 -0
- package/src/agents/spatial-comms/Layer1RealTime.ts +621 -0
- package/src/agents/spatial-comms/Layer2A2A.ts +661 -0
- package/src/agents/spatial-comms/Layer3MCP.ts +651 -0
- package/src/agents/spatial-comms/ProtocolTypes.ts +543 -0
- package/src/agents/spatial-comms/SpatialCommClient.ts +483 -0
- package/src/agents/spatial-comms/__tests__/performance-benchmark.test.ts +465 -0
- package/src/agents/spatial-comms/examples/multi-agent-world-creation.ts +409 -0
- package/src/agents/spatial-comms/index.ts +66 -0
- package/src/ai/AIAdapter.ts +313 -0
- package/src/ai/AICopilot.ts +331 -0
- package/src/ai/AIOutputValidator.ts +203 -0
- package/src/ai/BTNodes.ts +239 -0
- package/src/ai/BehaviorSelector.ts +135 -0
- package/src/ai/BehaviorTree.ts +153 -0
- package/src/ai/Blackboard.ts +165 -0
- package/src/ai/GenerationAnalytics.ts +461 -0
- package/src/ai/GenerationCache.ts +265 -0
- package/src/ai/GoalPlanner.ts +165 -0
- package/src/ai/HoloScriptGenerator.ts +580 -0
- package/src/ai/InfluenceMap.ts +180 -0
- package/src/ai/NavMesh.ts +168 -0
- package/src/ai/PerceptionSystem.ts +178 -0
- package/src/ai/PromptTemplates.ts +453 -0
- package/src/ai/SemanticSearchService.ts +80 -0
- package/src/ai/StateMachine.ts +196 -0
- package/src/ai/SteeringBehavior.ts +150 -0
- package/src/ai/SteeringBehaviors.ts +244 -0
- package/src/ai/TrainingDataGenerator.ts +1082 -0
- package/src/ai/UtilityAI.ts +145 -0
- package/src/ai/__tests__/AIAdapter.prod.test.ts +259 -0
- package/src/ai/__tests__/AIAdapter.test.ts +109 -0
- package/src/ai/__tests__/AICopilot.prod.test.ts +341 -0
- package/src/ai/__tests__/AICopilot.test.ts +178 -0
- package/src/ai/__tests__/AIOutputValidator.prod.test.ts +226 -0
- package/src/ai/__tests__/AIOutputValidator.test.ts +138 -0
- package/src/ai/__tests__/BTNodes.prod.test.ts +391 -0
- package/src/ai/__tests__/BTNodes.test.ts +263 -0
- package/src/ai/__tests__/BehaviorSelector.prod.test.ts +129 -0
- package/src/ai/__tests__/BehaviorSelector.test.ts +132 -0
- package/src/ai/__tests__/BehaviorTree.prod.test.ts +266 -0
- package/src/ai/__tests__/BehaviorTree.test.ts +216 -0
- package/src/ai/__tests__/Blackboard.prod.test.ts +339 -0
- package/src/ai/__tests__/Blackboard.test.ts +183 -0
- package/src/ai/__tests__/GenerationAnalytics.prod.test.ts +141 -0
- package/src/ai/__tests__/GenerationAnalytics.test.ts +165 -0
- package/src/ai/__tests__/GenerationCache.prod.test.ts +144 -0
- package/src/ai/__tests__/GenerationCache.test.ts +171 -0
- package/src/ai/__tests__/GoalPlanner.prod.test.ts +189 -0
- package/src/ai/__tests__/GoalPlanner.test.ts +137 -0
- package/src/ai/__tests__/GoalPlannerDepth.prod.test.ts +217 -0
- package/src/ai/__tests__/HoloScriptGenerator.test.ts +125 -0
- package/src/ai/__tests__/InfluenceMap.prod.test.ts +146 -0
- package/src/ai/__tests__/InfluenceMap.test.ts +149 -0
- package/src/ai/__tests__/NavMesh.prod.test.ts +141 -0
- package/src/ai/__tests__/NavMesh.test.ts +159 -0
- package/src/ai/__tests__/PerceptionSystem.prod.test.ts +135 -0
- package/src/ai/__tests__/PerceptionSystem.test.ts +250 -0
- package/src/ai/__tests__/PromptTemplates.prod.test.ts +313 -0
- package/src/ai/__tests__/PromptTemplates.test.ts +146 -0
- package/src/ai/__tests__/SemanticSearch.test.ts +37 -0
- package/src/ai/__tests__/StateMachine.prod.test.ts +162 -0
- package/src/ai/__tests__/StateMachine.test.ts +163 -0
- package/src/ai/__tests__/SteeringBehavior.prod.test.ts +251 -0
- package/src/ai/__tests__/SteeringBehavior.test.ts +135 -0
- package/src/ai/__tests__/SteeringBehaviors.prod.test.ts +133 -0
- package/src/ai/__tests__/SteeringBehaviors.test.ts +151 -0
- package/src/ai/__tests__/TrainingDataGenerator.prod.test.ts +286 -0
- package/src/ai/__tests__/TrainingDataGenerator.test.ts +286 -0
- package/src/ai/__tests__/UtilityAI.prod.test.ts +207 -0
- package/src/ai/__tests__/UtilityAI.test.ts +155 -0
- package/src/ai/__tests__/adapters.prod.test.ts +263 -0
- package/src/ai/__tests__/adapters.test.ts +320 -0
- package/src/ai/adapters.ts +1585 -0
- package/src/ai/index.ts +130 -0
- package/src/behavior/BehaviorPresets.ts +140 -0
- package/src/behavior/BehaviorTree.ts +236 -0
- package/src/behavior/StateMachine.ts +176 -0
- package/src/behavior/StateTrait.ts +67 -0
- package/src/behavior/index.ts +8 -0
- package/src/behavior.ts +8 -0
- package/src/board/audit.ts +284 -0
- package/src/board/board-ops.ts +336 -0
- package/src/board/board-types.ts +302 -0
- package/src/board/index.ts +69 -0
- package/src/define-agent.ts +46 -0
- package/src/define-team.ts +33 -0
- package/src/delegation.ts +265 -0
- package/src/distributed-claimer.ts +228 -0
- package/src/economy/AgentBudgetEnforcer.ts +464 -0
- package/src/economy/BountyManager.ts +185 -0
- package/src/economy/CreatorRevenueAggregator.ts +460 -0
- package/src/economy/InvisibleWallet.ts +82 -0
- package/src/economy/KnowledgeMarketplace.ts +193 -0
- package/src/economy/PaymentWebhookService.ts +512 -0
- package/src/economy/RevenueSplitter.ts +156 -0
- package/src/economy/SubscriptionManager.ts +546 -0
- package/src/economy/UnifiedBudgetOptimizer.ts +635 -0
- package/src/economy/UsageMeter.ts +440 -0
- package/src/economy/_core-stubs.ts +219 -0
- package/src/economy/index.ts +100 -0
- package/src/economy/x402-facilitator.ts +1978 -0
- package/src/index.ts +348 -0
- package/src/knowledge/__tests__/knowledge-consolidator.test.ts +444 -0
- package/src/knowledge/__tests__/knowledge-store-vector.test.ts +291 -0
- package/src/knowledge/brain.ts +167 -0
- package/src/knowledge/consolidation.ts +581 -0
- package/src/knowledge/knowledge-consolidator.ts +510 -0
- package/src/knowledge/knowledge-store.ts +616 -0
- package/src/learning/MemoryConsolidator.ts +102 -0
- package/src/learning/MemoryScorer.ts +69 -0
- package/src/learning/ProceduralCompiler.ts +45 -0
- package/src/learning/SemanticClusterer.ts +66 -0
- package/src/learning/index.ts +8 -0
- package/src/llm/llm-adapter.ts +159 -0
- package/src/mesh/index.ts +309 -0
- package/src/negotiation/NegotiationProtocol.ts +694 -0
- package/src/negotiation/NegotiationTypes.ts +473 -0
- package/src/negotiation/VotingMechanisms.ts +691 -0
- package/src/negotiation/index.ts +49 -0
- package/src/protocol/goal-synthesizer.ts +317 -0
- package/src/protocol/implementations.ts +474 -0
- package/src/protocol/micro-phase-decomposer.ts +299 -0
- package/src/protocol/micro-step-decomposer.test.ts +306 -0
- package/src/protocol-agent.test.ts +353 -0
- package/src/protocol-agent.ts +670 -0
- package/src/self-improve/absorb-scanner.ts +252 -0
- package/src/self-improve/evolution-engine.ts +149 -0
- package/src/self-improve/framework-absorber.ts +214 -0
- package/src/self-improve/index.ts +50 -0
- package/src/self-improve/prompt-optimizer.ts +212 -0
- package/src/self-improve/test-generator.ts +175 -0
- package/src/skill-router.ts +186 -0
- package/src/skills/index.ts +5 -0
- package/src/skills/skill-md-bridge.ts +1699 -0
- package/src/swarm/ACOEngine.ts +261 -0
- package/src/swarm/CollectiveIntelligence.ts +383 -0
- package/src/swarm/ContributionSynthesizer.ts +481 -0
- package/src/swarm/LeaderElection.ts +393 -0
- package/src/swarm/PSOEngine.ts +206 -0
- package/src/swarm/QuorumPolicy.ts +173 -0
- package/src/swarm/SwarmCoordinator.ts +335 -0
- package/src/swarm/SwarmManager.ts +442 -0
- package/src/swarm/SwarmMembership.ts +456 -0
- package/src/swarm/VotingRound.ts +255 -0
- package/src/swarm/__tests__/ACOEngine.prod.test.ts +164 -0
- package/src/swarm/__tests__/ACOEngine.test.ts +117 -0
- package/src/swarm/__tests__/CollectiveIntelligence.prod.test.ts +296 -0
- package/src/swarm/__tests__/CollectiveIntelligence.test.ts +457 -0
- package/src/swarm/__tests__/ContributionSynthesizer.prod.test.ts +269 -0
- package/src/swarm/__tests__/ContributionSynthesizer.test.ts +254 -0
- package/src/swarm/__tests__/LeaderElection.prod.test.ts +196 -0
- package/src/swarm/__tests__/LeaderElection.test.ts +151 -0
- package/src/swarm/__tests__/PSOEngine.prod.test.ts +162 -0
- package/src/swarm/__tests__/PSOEngine.test.ts +106 -0
- package/src/swarm/__tests__/QuorumPolicy.prod.test.ts +216 -0
- package/src/swarm/__tests__/QuorumPolicy.test.ts +177 -0
- package/src/swarm/__tests__/SwarmCoordinator.prod.test.ts +186 -0
- package/src/swarm/__tests__/SwarmCoordinator.test.ts +167 -0
- package/src/swarm/__tests__/SwarmManager.prod.test.ts +308 -0
- package/src/swarm/__tests__/SwarmManager.test.ts +373 -0
- package/src/swarm/__tests__/SwarmMembership.prod.test.ts +273 -0
- package/src/swarm/__tests__/SwarmMembership.test.ts +264 -0
- package/src/swarm/__tests__/VotingRound.prod.test.ts +233 -0
- package/src/swarm/__tests__/VotingRound.test.ts +174 -0
- package/src/swarm/analytics/SwarmInspector.ts +476 -0
- package/src/swarm/analytics/SwarmMetrics.ts +449 -0
- package/src/swarm/analytics/__tests__/SwarmInspector.prod.test.ts +366 -0
- package/src/swarm/analytics/__tests__/SwarmInspector.test.ts +454 -0
- package/src/swarm/analytics/__tests__/SwarmMetrics.prod.test.ts +254 -0
- package/src/swarm/analytics/__tests__/SwarmMetrics.test.ts +370 -0
- package/src/swarm/analytics/index.ts +7 -0
- package/src/swarm/index.ts +69 -0
- package/src/swarm/messaging/BroadcastChannel.ts +509 -0
- package/src/swarm/messaging/GossipProtocol.ts +565 -0
- package/src/swarm/messaging/SwarmEventBus.ts +443 -0
- package/src/swarm/messaging/__tests__/BroadcastChannel.prod.test.ts +331 -0
- package/src/swarm/messaging/__tests__/BroadcastChannel.test.ts +333 -0
- package/src/swarm/messaging/__tests__/GossipProtocol.prod.test.ts +356 -0
- package/src/swarm/messaging/__tests__/GossipProtocol.test.ts +437 -0
- package/src/swarm/messaging/__tests__/SwarmEventBus.prod.test.ts +191 -0
- package/src/swarm/messaging/__tests__/SwarmEventBus.test.ts +247 -0
- package/src/swarm/messaging/index.ts +8 -0
- package/src/swarm/spatial/FlockingBehavior.ts +462 -0
- package/src/swarm/spatial/FormationController.ts +500 -0
- package/src/swarm/spatial/Vector3.ts +170 -0
- package/src/swarm/spatial/ZoneClaiming.ts +509 -0
- package/src/swarm/spatial/__tests__/FlockingBehavior.prod.test.ts +239 -0
- package/src/swarm/spatial/__tests__/FlockingBehavior.test.ts +298 -0
- package/src/swarm/spatial/__tests__/FormationController.prod.test.ts +240 -0
- package/src/swarm/spatial/__tests__/FormationController.test.ts +297 -0
- package/src/swarm/spatial/__tests__/Vector3.prod.test.ts +283 -0
- package/src/swarm/spatial/__tests__/Vector3.test.ts +224 -0
- package/src/swarm/spatial/__tests__/ZoneClaiming.prod.test.ts +246 -0
- package/src/swarm/spatial/__tests__/ZoneClaiming.test.ts +374 -0
- package/src/swarm/spatial/index.ts +28 -0
- package/src/team.ts +1245 -0
- package/src/training/LRScheduler.ts +377 -0
- package/src/training/QualityScoringPipeline.ts +139 -0
- package/src/training/SoftDedup.ts +461 -0
- package/src/training/SparsityMonitor.ts +685 -0
- package/src/training/SparsityMonitorTypes.ts +209 -0
- package/src/training/SpatialTrainingDataGenerator.ts +1526 -0
- package/src/training/SpatialTrainingDataTypes.ts +216 -0
- package/src/training/TrainingPipelineConfig.ts +215 -0
- package/src/training/constants.ts +94 -0
- package/src/training/index.ts +138 -0
- package/src/training/schema.ts +147 -0
- package/src/training/scripts/generate-novel-use-cases-dataset.ts +272 -0
- package/src/training/scripts/generate-spatial-dataset.ts +521 -0
- package/src/training/training/data/novel-use-cases.jsonl +153 -0
- package/src/training/training/data/spatial-reasoning-10k.jsonl +9354 -0
- package/src/training/trainingmonkey/TrainingMonkeyIntegration.ts +477 -0
- package/src/training/trainingmonkey/TrainingMonkeyTypes.ts +230 -0
- package/src/training/trainingmonkey/index.ts +26 -0
- package/src/training/trait-mappings.ts +157 -0
- package/src/types/core-stubs.d.ts +113 -0
- package/src/types.ts +304 -0
- package/test-output.txt +0 -0
- package/test-result.json +1 -0
- package/tsc-errors.txt +4 -0
- package/tsc_output.txt +0 -0
- package/tsconfig.json +14 -0
- package/tsup-learning-esm.config.ts +12 -0
- package/tsup.config.ts +21 -0
- package/typescript-errors-2.txt +0 -0
- package/typescript-errors.txt +22 -0
- package/vitest-log-utf8.txt +268 -0
- package/vitest-log.txt +0 -0
- package/vitest.config.ts +8 -0
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SoftDedup - Soft Deduplication via N-gram Commonness Scoring
|
|
3
|
+
*
|
|
4
|
+
* Instead of hard-deleting duplicate training examples, SoftDedup computes
|
|
5
|
+
* n-gram commonness scores and assigns sampling weights. Examples with
|
|
6
|
+
* high-frequency n-grams (template-generated / near-duplicate content)
|
|
7
|
+
* receive lower sampling weights, reducing their influence during training
|
|
8
|
+
* without discarding them entirely.
|
|
9
|
+
*
|
|
10
|
+
* Based on training rule W.008:
|
|
11
|
+
* "Reweight duplicates instead of deleting them. SoftDedup uses n-gram
|
|
12
|
+
* commonness scores to reduce sampling weight of high-frequency data.
|
|
13
|
+
* 26% faster training, +1.77% accuracy vs hard dedup alone."
|
|
14
|
+
*
|
|
15
|
+
* Pipeline position: Quality Filter -> Hard Dedup (W.004) -> SoftDedup (W.008)
|
|
16
|
+
*
|
|
17
|
+
* @module training/SoftDedup
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
// =============================================================================
|
|
21
|
+
// TYPES
|
|
22
|
+
// =============================================================================
|
|
23
|
+
|
|
24
|
+
/**
 * Tunable parameters for a {@link SoftDedup} run.
 *
 * Every field is required here; callers normally pass a `Partial` of this
 * shape and let the defaults fill in the rest.
 */
export interface SoftDedupConfig {
  /**
   * Window lengths (in characters, or in words when `wordLevel` is true)
   * for which n-grams are extracted. Short windows pick up local repetition,
   * long windows pick up structural repetition.
   * Each entry must be a positive integer and the list must be non-empty.
   * Default: [3, 5, 7].
   */
  ngramSizes: number[];

  /**
   * When true, n-grams are built from whitespace-separated words; when false
   * they are built from the lower-cased characters of the text.
   * Default: false (character-level).
   */
  wordLevel: boolean;

  /**
   * Floor applied to every sampling weight so that even the most repetitive
   * example is never excluded outright.
   * Must lie in (0, 1]. Default: 0.1.
   */
  minWeight: number;

  /**
   * Ceiling applied to every sampling weight; examples with no common
   * n-grams receive exactly this weight.
   * Must lie in [minWeight, 1]. Default: 1.0.
   */
  maxWeight: number;

  /**
   * Divisor of the commonness score inside the exponential decay.
   * Larger values flatten the weights towards `maxWeight`; smaller values
   * penalise common examples more strongly.
   * Must be greater than 0. Default: 1.0.
   */
  temperature: number;

  /**
   * Position (as a fraction of the sorted n-gram frequency list) from which
   * the "common" frequency cut-off is read.
   * Must lie in [0, 1]. Default: 0.7.
   */
  commonThresholdPercentile: number;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Per-example output of {@link SoftDedup.process}.
 */
export interface SoftDedupResult {
  /** Position of the example within the array passed to `process`. */
  index: number;

  /**
   * Fraction of the example's n-grams that were classified as common,
   * from 0 (none) to 1 (all).
   */
  commonnessScore: number;

  /** Sampling weight derived from the score, clamped to [minWeight, maxWeight]. */
  samplingWeight: number;

  /** Raw n-gram counts backing the score. */
  ngramStats: NgramStats;
}
|
|
88
|
+
|
|
89
|
+
/**
 * N-gram counts for one example.
 */
export interface NgramStats {
  /** How many n-grams were extracted from the example (all sizes combined). */
  totalNgrams: number;

  /** How many of those n-grams met the corpus "common" frequency cut-off. */
  commonNgrams: number;

  /** `commonNgrams / totalNgrams`, in [0, 1]; 0 when no n-grams were extracted. */
  commonRatio: number;
}
|
|
102
|
+
|
|
103
|
+
/**
 * Summary of the sampling weights produced by one SoftDedup run.
 */
export interface SoftDedupStats {
  /** Number of results that were summarised. */
  totalExamples: number;

  /** Arithmetic mean of the sampling weights. */
  meanWeight: number;

  /** Median of the sampling weights. */
  medianWeight: number;

  /** Population standard deviation of the sampling weights. */
  stdWeight: number;

  /** Count of examples whose weight equals the configured minimum. */
  atMinWeight: number;

  /** Count of examples whose weight equals the configured maximum. */
  atMaxWeight: number;

  /** Sum of all sampling weights, i.e. the weighted size of the dataset. */
  effectiveDatasetSize: number;

  /** `1 - effectiveDatasetSize / totalExamples`. */
  reductionRatio: number;

  /**
   * Number of distinct n-grams in the corpus.
   * `SoftDedup.computeStats` always reports 0 here; the caller must fill it in.
   */
  uniqueNgramsInCorpus: number;

  /**
   * Absolute frequency at or above which an n-gram counts as common.
   * `SoftDedup.computeStats` always reports 0 here; the caller must fill it in.
   */
  commonThresholdFrequency: number;
}
|
|
137
|
+
|
|
138
|
+
// =============================================================================
|
|
139
|
+
// DEFAULT CONFIGURATION
|
|
140
|
+
// =============================================================================
|
|
141
|
+
|
|
142
|
+
/**
 * Baseline configuration used for any field the caller does not override.
 *
 * According to the original authors these values were tuned for
 * HoloScript/Brittney training datasets (920K-1.5M examples).
 */
export const DEFAULT_SOFTDEDUP_CONFIG: SoftDedupConfig = {
  // Character windows of length 3, 5 and 7.
  ngramSizes: [3, 5, 7],
  // Character-level n-grams.
  wordLevel: false,
  // Weights are clamped to [0.1, 1.0].
  minWeight: 0.1,
  maxWeight: 1.0,
  // Unscaled exponential decay.
  temperature: 1.0,
  // Cut-off read at the 70% position of the sorted frequency list.
  commonThresholdPercentile: 0.7,
};
|
|
154
|
+
|
|
155
|
+
// =============================================================================
|
|
156
|
+
// SOFT DEDUP CLASS
|
|
157
|
+
// =============================================================================
|
|
158
|
+
|
|
159
|
+
/**
 * SoftDedup processor for training data.
 *
 * Builds a corpus-wide n-gram frequency table, classifies n-grams whose
 * frequency reaches a percentile-derived cut-off as "common", and turns the
 * share of common n-grams in each example into a sampling weight. Intended
 * to run AFTER hard dedup (W.004).
 *
 * @example
 * ```ts
 * const dedup = new SoftDedup();
 * const results = dedup.process([
 *   'composition MyScene { orb Player { Grabbable {} } }',
 *   'composition MyScene { orb Player { Grabbable {} } }', // duplicate
 *   'world Arena { orb Enemy { Physics { mass: 10 } } }',  // mostly unique
 * ]);
 *
 * // results[0] and results[1] are downweighted (about 0.37 when every
 * // n-gram is common and temperature is 1.0); results[2] keeps a higher weight.
 * ```
 */
export class SoftDedup {
  private config: SoftDedupConfig;

  /**
   * @param config - Overrides merged on top of `DEFAULT_SOFTDEDUP_CONFIG`.
   * @throws Error if the merged configuration is invalid.
   */
  constructor(config: Partial<SoftDedupConfig> = {}) {
    const merged = { ...DEFAULT_SOFTDEDUP_CONFIG, ...config };
    // Copy the array so that neither the shared defaults nor the caller's
    // array is aliased by this instance (later mutation would skip validation).
    this.config = { ...merged, ngramSizes: [...merged.ngramSizes] };
    this.validateConfig();
  }

  /**
   * Process a dataset of text examples and compute sampling weights.
   *
   * @param examples - Array of text strings (training examples)
   * @returns One result per input, in input order. An empty input yields an
   *   empty array; a single example always receives `maxWeight`.
   */
  process(examples: string[]): SoftDedupResult[] {
    if (examples.length === 0) {
      return [];
    }

    // A lone example has nothing to be compared against.
    if (examples.length === 1) {
      return [
        {
          index: 0,
          commonnessScore: 0,
          samplingWeight: this.config.maxWeight,
          ngramStats: {
            totalNgrams: this.extractNgrams(examples[0]).length,
            commonNgrams: 0,
            commonRatio: 0,
          },
        },
      ];
    }

    // Step 1: corpus-wide n-gram frequency map
    const corpusFrequencies = this.buildCorpusFrequencies(examples);

    // Step 2: frequency at or above which an n-gram counts as common
    const threshold = this.computeThreshold(corpusFrequencies);

    // Step 3: score each example
    return examples.map((example, index): SoftDedupResult => {
      const ngrams = this.extractNgrams(example);
      const totalNgrams = ngrams.length;

      // Text shorter than every configured window: treat as unique.
      if (totalNgrams === 0) {
        return {
          index,
          commonnessScore: 0,
          samplingWeight: this.config.maxWeight,
          ngramStats: { totalNgrams: 0, commonNgrams: 0, commonRatio: 0 },
        };
      }

      let commonCount = 0;
      for (const ngram of ngrams) {
        const freq = corpusFrequencies.get(ngram) ?? 0;
        if (freq >= threshold) {
          commonCount++;
        }
      }

      // The commonness score is simply the share of common n-grams.
      const commonRatio = commonCount / totalNgrams;
      const commonnessScore = commonRatio;

      return {
        index,
        commonnessScore,
        samplingWeight: this.commonnessToWeight(commonnessScore),
        ngramStats: {
          totalNgrams,
          commonNgrams: commonCount,
          commonRatio,
        },
      };
    });
  }

  /**
   * Compute aggregate statistics for a set of SoftDedup results.
   *
   * `uniqueNgramsInCorpus` and `commonThresholdFrequency` are always returned
   * as 0: they cannot be derived from the results alone and are left for the
   * caller to fill in.
   *
   * @param results - Output of {@link SoftDedup.process}
   * @returns Summary statistics; all zeros for an empty input.
   */
  computeStats(results: SoftDedupResult[]): SoftDedupStats {
    if (results.length === 0) {
      return {
        totalExamples: 0,
        meanWeight: 0,
        medianWeight: 0,
        stdWeight: 0,
        atMinWeight: 0,
        atMaxWeight: 0,
        effectiveDatasetSize: 0,
        reductionRatio: 0,
        uniqueNgramsInCorpus: 0,
        commonThresholdFrequency: 0,
      };
    }

    const weights = results.map((r) => r.samplingWeight);
    const totalExamples = results.length;
    const sum = weights.reduce((a, b) => a + b, 0);
    const meanWeight = sum / totalExamples;

    // Median (sort a copy so the weights keep input order)
    const sorted = [...weights].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    const medianWeight =
      sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];

    // Population standard deviation
    const variance = weights.reduce((acc, w) => acc + (w - meanWeight) ** 2, 0) / totalExamples;
    const stdWeight = Math.sqrt(variance);

    // Count extremes (with small epsilon for floating point)
    const epsilon = 1e-9;
    const atMinWeight = weights.filter((w) => Math.abs(w - this.config.minWeight) < epsilon).length;
    const atMaxWeight = weights.filter((w) => Math.abs(w - this.config.maxWeight) < epsilon).length;

    const effectiveDatasetSize = sum;
    const reductionRatio = 1 - effectiveDatasetSize / totalExamples;

    return {
      totalExamples,
      meanWeight,
      medianWeight,
      stdWeight,
      atMinWeight,
      atMaxWeight,
      effectiveDatasetSize,
      reductionRatio,
      uniqueNgramsInCorpus: 0, // filled by caller if needed
      commonThresholdFrequency: 0, // filled by caller if needed
    };
  }

  /**
   * Get a copy of the current configuration.
   *
   * The returned object and its `ngramSizes` array are fresh copies, so
   * mutating them does not affect this instance.
   */
  getConfig(): Readonly<SoftDedupConfig> {
    return { ...this.config, ngramSizes: [...this.config.ngramSizes] };
  }

  // ===========================================================================
  // INTERNAL METHODS
  // ===========================================================================

  /**
   * Extract n-grams from a text string for every configured size.
   *
   * Word-level mode splits on whitespace and keeps the original casing;
   * character-level mode lower-cases the text first. N-grams are emitted
   * grouped by size, in the order the sizes are configured.
   */
  private extractNgrams(text: string): string[] {
    const { ngramSizes, wordLevel } = this.config;
    const ngrams: string[] = [];

    if (wordLevel) {
      // Tokenise once, not once per n-gram size.
      const words = text.split(/\s+/).filter((w) => w.length > 0);
      for (const n of ngramSizes) {
        for (let i = 0; i <= words.length - n; i++) {
          ngrams.push(words.slice(i, i + n).join(' '));
        }
      }
    } else {
      // Normalise once, not once per n-gram size.
      const normalized = text.toLowerCase();
      for (const n of ngramSizes) {
        for (let i = 0; i <= normalized.length - n; i++) {
          ngrams.push(normalized.substring(i, i + n));
        }
      }
    }

    return ngrams;
  }

  /**
   * Build a frequency map of all n-grams across the entire corpus.
   * Every occurrence counts, including repeats within one example.
   */
  private buildCorpusFrequencies(examples: string[]): Map<string, number> {
    const frequencies = new Map<string, number>();

    for (const example of examples) {
      for (const ngram of this.extractNgrams(example)) {
        frequencies.set(ngram, (frequencies.get(ngram) ?? 0) + 1);
      }
    }

    return frequencies;
  }

  /**
   * Compute the frequency at or above which an n-gram is considered "common".
   *
   * Reads the configured percentile position from the ascending list of
   * per-n-gram frequencies, and never returns less than 2, so an n-gram seen
   * only once is never common. Returns 1 for an empty map.
   */
  private computeThreshold(frequencies: Map<string, number>): number {
    if (frequencies.size === 0) {
      return 1;
    }

    const freqValues = Array.from(frequencies.values()).sort((a, b) => a - b);
    const percentileIndex = Math.floor(freqValues.length * this.config.commonThresholdPercentile);
    // A percentile of exactly 1 would index one past the end.
    const clampedIndex = Math.min(percentileIndex, freqValues.length - 1);

    return Math.max(freqValues[clampedIndex], 2); // At least frequency 2 to be "common"
  }

  /**
   * Convert a commonness score (0-1) to a sampling weight.
   *
   * Uses exponential decay with temperature scaling:
   *   weight = maxWeight * exp(-commonnessScore / temperature)
   * and then clamps the result to [minWeight, maxWeight].
   *
   * Because the score never exceeds 1, the unclamped weight never drops below
   * `maxWeight * exp(-1 / temperature)`; with the default temperature of 1.0
   * that is about 0.37 * maxWeight.
   */
  private commonnessToWeight(commonnessScore: number): number {
    const { minWeight, maxWeight, temperature } = this.config;

    // Exponential decay: high commonness -> low weight
    const rawWeight = maxWeight * Math.exp(-commonnessScore / temperature);

    return Math.max(minWeight, Math.min(maxWeight, rawWeight));
  }

  /**
   * Validate configuration parameters.
   *
   * Guards are written as negated range checks so that `NaN` (for which
   * every comparison is false) is rejected instead of slipping through.
   *
   * @throws Error if configuration is invalid
   */
  private validateConfig(): void {
    const { minWeight, maxWeight, temperature, commonThresholdPercentile, ngramSizes } =
      this.config;

    if (!(minWeight > 0 && minWeight <= 1)) {
      throw new Error(`SoftDedup: minWeight must be in (0, 1], got ${minWeight}`);
    }

    if (!(maxWeight >= minWeight && maxWeight <= 1)) {
      throw new Error(`SoftDedup: maxWeight must be in [minWeight, 1], got ${maxWeight}`);
    }

    if (!(temperature > 0)) {
      throw new Error(`SoftDedup: temperature must be > 0, got ${temperature}`);
    }

    if (!(commonThresholdPercentile >= 0 && commonThresholdPercentile <= 1)) {
      throw new Error(
        `SoftDedup: commonThresholdPercentile must be in [0, 1], got ${commonThresholdPercentile}`
      );
    }

    if (ngramSizes.length === 0) {
      throw new Error('SoftDedup: ngramSizes must have at least one entry');
    }

    for (const n of ngramSizes) {
      if (n < 1 || !Number.isInteger(n)) {
        throw new Error(`SoftDedup: each ngramSize must be a positive integer, got ${n}`);
      }
    }
  }
}
|
|
443
|
+
|
|
444
|
+
// =============================================================================
|
|
445
|
+
// FACTORY FUNCTION
|
|
446
|
+
// =============================================================================
|
|
447
|
+
|
|
448
|
+
/**
 * Convenience factory: builds a {@link SoftDedup} from partial overrides.
 *
 * Equivalent to `new SoftDedup(config)`; any configuration error raised by
 * the constructor propagates unchanged.
 *
 * @param config - Fields to override; omitted fields use the defaults.
 * @returns A ready-to-use SoftDedup processor.
 *
 * @example
 * ```ts
 * const dedup = createSoftDedup({ wordLevel: true, temperature: 0.5 });
 * const results = dedup.process(myDataset);
 * const stats = dedup.computeStats(results);
 * console.log(`Effective dataset size: ${stats.effectiveDatasetSize}`);
 * ```
 */
export function createSoftDedup(config: Partial<SoftDedupConfig> = {}): SoftDedup {
  const processor = new SoftDedup(config);
  return processor;
}
|