adaptive-memory-multi-model-router 1.2.2 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +146 -66
- package/dist/index.d.ts +1 -1
- package/dist/index.js +1 -1
- package/dist/integrations/airtable.js +20 -0
- package/dist/integrations/discord.js +18 -0
- package/dist/integrations/github.js +23 -0
- package/dist/integrations/gmail.js +19 -0
- package/dist/integrations/google-calendar.js +18 -0
- package/dist/integrations/index.js +61 -0
- package/dist/integrations/jira.js +21 -0
- package/dist/integrations/linear.js +19 -0
- package/dist/integrations/notion.js +19 -0
- package/dist/integrations/slack.js +18 -0
- package/dist/integrations/telegram.js +19 -0
- package/dist/providers/registry.js +7 -3
- package/docs/ARCHITECTURAL-IMPROVEMENTS-2025.md +1391 -0
- package/docs/ARCHITECTURAL-IMPROVEMENTS-REVISED-2025.md +1051 -0
- package/docs/CONFIGURATION.md +476 -0
- package/docs/COUNCIL_DECISION.json +308 -0
- package/docs/COUNCIL_SUMMARY.md +265 -0
- package/docs/COUNCIL_V2.2_DECISION.md +416 -0
- package/docs/IMPROVEMENT_ROADMAP.md +515 -0
- package/docs/LLM_COUNCIL_DECISION.md +508 -0
- package/docs/QUICK_START_VISIBILITY.md +782 -0
- package/docs/REDDIT_GAP_ANALYSIS.md +299 -0
- package/docs/RESEARCH_BACKED_IMPROVEMENTS.md +1180 -0
- package/docs/TMLPD_QNA.md +751 -0
- package/docs/TMLPD_V2.1_COMPLETE.md +763 -0
- package/docs/TMLPD_V2.2_RESEARCH_ROADMAP.md +754 -0
- package/docs/V2.2_IMPLEMENTATION_COMPLETE.md +446 -0
- package/docs/V2_IMPLEMENTATION_GUIDE.md +388 -0
- package/docs/VISIBILITY_ADOPTION_PLAN.md +1005 -0
- package/docs/launch-content/LAUNCH_EXECUTION_CHECKLIST.md +421 -0
- package/docs/launch-content/README.md +457 -0
- package/docs/launch-content/assets/cost_comparison_100_tasks.png +0 -0
- package/docs/launch-content/assets/cumulative_savings.png +0 -0
- package/docs/launch-content/assets/parallel_speedup.png +0 -0
- package/docs/launch-content/assets/provider_pricing_comparison.png +0 -0
- package/docs/launch-content/assets/task_breakdown_comparison.png +0 -0
- package/docs/launch-content/generate_charts.py +313 -0
- package/docs/launch-content/hn_show_post.md +139 -0
- package/docs/launch-content/partner_outreach_templates.md +745 -0
- package/docs/launch-content/reddit_posts.md +467 -0
- package/docs/launch-content/twitter_thread.txt +460 -0
- package/examples/QUICKSTART.md +1 -1
- package/openclaw-alexa-bridge/ALL_REMAINING_FIXES_PLAN.md +313 -0
- package/openclaw-alexa-bridge/REMAINING_FIXES_SUMMARY.md +277 -0
- package/openclaw-alexa-bridge/src/alexa_handler_no_tmlpd.js +1234 -0
- package/openclaw-alexa-bridge/test_fixes.js +77 -0
- package/package.json +120 -29
- package/package.json.tmp +0 -0
- package/qna/TMLPD_QNA.md +3 -3
- package/skill/SKILL.md +2 -2
- package/src/__tests__/integration/tmpld_integration.test.py +540 -0
- package/src/agents/skill_enhanced_agent.py +318 -0
- package/src/memory/__init__.py +15 -0
- package/src/memory/agentic_memory.py +353 -0
- package/src/memory/semantic_memory.py +444 -0
- package/src/memory/simple_memory.py +466 -0
- package/src/memory/working_memory.py +447 -0
- package/src/orchestration/__init__.py +52 -0
- package/src/orchestration/execution_engine.py +353 -0
- package/src/orchestration/halo_orchestrator.py +367 -0
- package/src/orchestration/mcts_workflow.py +498 -0
- package/src/orchestration/role_assigner.py +473 -0
- package/src/orchestration/task_planner.py +522 -0
- package/src/providers/__init__.py +67 -0
- package/src/providers/anthropic.py +304 -0
- package/src/providers/base.py +241 -0
- package/src/providers/cerebras.py +373 -0
- package/src/providers/registry.py +476 -0
- package/src/routing/__init__.py +30 -0
- package/src/routing/universal_router.py +621 -0
- package/src/skills/TMLPD-QUICKREF.md +210 -0
- package/src/skills/TMLPD-SETUP-SUMMARY.md +157 -0
- package/src/skills/TMLPD.md +540 -0
- package/src/skills/__tests__/skill_manager.test.ts +328 -0
- package/src/skills/skill_manager.py +385 -0
- package/src/skills/test-tmlpd.sh +108 -0
- package/src/skills/tmlpd-category.yaml +67 -0
- package/src/skills/tmlpd-monitoring.yaml +188 -0
- package/src/skills/tmlpd-phase.yaml +132 -0
- package/src/state/__init__.py +17 -0
- package/src/state/simple_checkpoint.py +508 -0
- package/src/tmlpd_agent.py +464 -0
- package/src/tmpld_v2.py +427 -0
- package/src/workflows/__init__.py +18 -0
- package/src/workflows/advanced_difficulty_classifier.py +377 -0
- package/src/workflows/chaining_executor.py +417 -0
- package/src/workflows/difficulty_integration.py +209 -0
- package/src/workflows/orchestrator.py +469 -0
- package/src/workflows/orchestrator_executor.py +456 -0
- package/src/workflows/parallelization_executor.py +382 -0
- package/src/workflows/router.py +311 -0
- package/test_integration_simple.py +86 -0
- package/test_mcts_workflow.py +150 -0
- package/test_templd_integration.py +262 -0
- package/test_universal_router.py +275 -0
- package/tmlpd-pi-extension/README.md +36 -0
- package/tmlpd-pi-extension/dist/cache/prefixCache.d.ts +114 -0
- package/tmlpd-pi-extension/dist/cache/prefixCache.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/cache/prefixCache.js +285 -0
- package/tmlpd-pi-extension/dist/cache/prefixCache.js.map +1 -0
- package/tmlpd-pi-extension/dist/cache/responseCache.d.ts +58 -0
- package/tmlpd-pi-extension/dist/cache/responseCache.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/cache/responseCache.js +153 -0
- package/tmlpd-pi-extension/dist/cache/responseCache.js.map +1 -0
- package/tmlpd-pi-extension/dist/cli.js +59 -0
- package/tmlpd-pi-extension/dist/cost/costTracker.d.ts +95 -0
- package/tmlpd-pi-extension/dist/cost/costTracker.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/cost/costTracker.js +240 -0
- package/tmlpd-pi-extension/dist/cost/costTracker.js.map +1 -0
- package/tmlpd-pi-extension/dist/index.d.ts +723 -0
- package/tmlpd-pi-extension/dist/index.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/index.js +239 -0
- package/tmlpd-pi-extension/dist/index.js.map +1 -0
- package/tmlpd-pi-extension/dist/memory/episodicMemory.d.ts +82 -0
- package/tmlpd-pi-extension/dist/memory/episodicMemory.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/memory/episodicMemory.js +145 -0
- package/tmlpd-pi-extension/dist/memory/episodicMemory.js.map +1 -0
- package/tmlpd-pi-extension/dist/orchestration/haloOrchestrator.d.ts +102 -0
- package/tmlpd-pi-extension/dist/orchestration/haloOrchestrator.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/orchestration/haloOrchestrator.js +207 -0
- package/tmlpd-pi-extension/dist/orchestration/haloOrchestrator.js.map +1 -0
- package/tmlpd-pi-extension/dist/orchestration/mctsWorkflow.d.ts +85 -0
- package/tmlpd-pi-extension/dist/orchestration/mctsWorkflow.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/orchestration/mctsWorkflow.js +210 -0
- package/tmlpd-pi-extension/dist/orchestration/mctsWorkflow.js.map +1 -0
- package/tmlpd-pi-extension/dist/providers/localProvider.d.ts +102 -0
- package/tmlpd-pi-extension/dist/providers/localProvider.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/providers/localProvider.js +338 -0
- package/tmlpd-pi-extension/dist/providers/localProvider.js.map +1 -0
- package/tmlpd-pi-extension/dist/providers/registry.d.ts +55 -0
- package/tmlpd-pi-extension/dist/providers/registry.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/providers/registry.js +138 -0
- package/tmlpd-pi-extension/dist/providers/registry.js.map +1 -0
- package/tmlpd-pi-extension/dist/routing/advancedRouter.d.ts +68 -0
- package/tmlpd-pi-extension/dist/routing/advancedRouter.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/routing/advancedRouter.js +332 -0
- package/tmlpd-pi-extension/dist/routing/advancedRouter.js.map +1 -0
- package/tmlpd-pi-extension/dist/tools/tmlpdTools.d.ts +101 -0
- package/tmlpd-pi-extension/dist/tools/tmlpdTools.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/tools/tmlpdTools.js +368 -0
- package/tmlpd-pi-extension/dist/tools/tmlpdTools.js.map +1 -0
- package/tmlpd-pi-extension/dist/utils/batchProcessor.d.ts +96 -0
- package/tmlpd-pi-extension/dist/utils/batchProcessor.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/utils/batchProcessor.js +170 -0
- package/tmlpd-pi-extension/dist/utils/batchProcessor.js.map +1 -0
- package/tmlpd-pi-extension/dist/utils/compression.d.ts +61 -0
- package/tmlpd-pi-extension/dist/utils/compression.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/utils/compression.js +281 -0
- package/tmlpd-pi-extension/dist/utils/compression.js.map +1 -0
- package/tmlpd-pi-extension/dist/utils/reliability.d.ts +74 -0
- package/tmlpd-pi-extension/dist/utils/reliability.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/utils/reliability.js +177 -0
- package/tmlpd-pi-extension/dist/utils/reliability.js.map +1 -0
- package/tmlpd-pi-extension/dist/utils/speculativeDecoding.d.ts +117 -0
- package/tmlpd-pi-extension/dist/utils/speculativeDecoding.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/utils/speculativeDecoding.js +246 -0
- package/tmlpd-pi-extension/dist/utils/speculativeDecoding.js.map +1 -0
- package/tmlpd-pi-extension/dist/utils/tokenUtils.d.ts +50 -0
- package/tmlpd-pi-extension/dist/utils/tokenUtils.d.ts.map +1 -0
- package/tmlpd-pi-extension/dist/utils/tokenUtils.js +124 -0
- package/tmlpd-pi-extension/dist/utils/tokenUtils.js.map +1 -0
- package/tmlpd-pi-extension/examples/QUICKSTART.md +183 -0
- package/tmlpd-pi-extension/package-lock.json +75 -0
- package/tmlpd-pi-extension/package.json +172 -0
- package/tmlpd-pi-extension/python/examples.py +53 -0
- package/tmlpd-pi-extension/python/integrations.py +330 -0
- package/tmlpd-pi-extension/python/setup.py +28 -0
- package/tmlpd-pi-extension/python/tmlpd.py +369 -0
- package/tmlpd-pi-extension/qna/REDDIT_GAP_ANALYSIS.md +299 -0
- package/tmlpd-pi-extension/qna/TMLPD_QNA.md +751 -0
- package/tmlpd-pi-extension/skill/SKILL.md +238 -0
- package/{src → tmlpd-pi-extension/src}/index.ts +1 -1
- package/tmlpd-pi-extension/tsconfig.json +18 -0
- package/demo/research-demo.js +0 -266
- package/notebooks/quickstart.ipynb +0 -157
- package/rust/tmlpd.h +0 -268
- package/src/cache/prefixCache.ts +0 -365
- package/src/routing/advancedRouter.ts +0 -406
- package/src/utils/speculativeDecoding.ts +0 -344
- /package/{src → tmlpd-pi-extension/src}/cache/responseCache.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/cost/costTracker.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/memory/episodicMemory.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/orchestration/haloOrchestrator.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/orchestration/mctsWorkflow.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/providers/localProvider.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/providers/registry.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/tools/tmlpdTools.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/utils/batchProcessor.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/utils/compression.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/utils/reliability.ts +0 -0
- /package/{src → tmlpd-pi-extension/src}/utils/tokenUtils.ts +0 -0
|
@@ -1,344 +0,0 @@
|
|
|
1
|
-
/**
 * TMLPD Speculative Decoding
 *
 * Based on Medusa (arXiv:2401.10774) and EAGLE approaches
 * Small draft model proposes tokens, large model verifies in parallel
 * 2-3x faster generation with same quality
 */

/** Configuration for a speculative-decoding run (draft + target model pair). */
export interface SpeculativeConfig {
  /** Identifier of the fast model that proposes draft tokens. */
  draft_model: string;
  /** Identifier of the slower, higher-quality model that verifies drafts. */
  target_model: string;
  /** Number of tokens the draft model proposes per verification round. */
  num_draft_tokens: number;
  /** Optional sampling temperature applied to draft probabilities. */
  temperature?: number;
  /** Optional cap on how many draft tokens the target model verifies. */
  max_verify_tokens?: number;
}

/** Outcome statistics of one speculative-decoding pass. */
export interface SpeculativeResult {
  /** Count of draft tokens the target model accepted. */
  accepted: number;
  /** Count of draft tokens rejected. */
  rejected: number;
  /** Total number of tokens proposed by the draft model. */
  draft_tokens: number;
  /** Measured speedup factor (producers cap this at 3x). */
  speedup: number;
  /** Final emitted text (the target model's output is used as the answer). */
  final_text: string;
}

/** A single token proposed by a draft predictor. */
export interface DraftCandidate {
  /** The proposed token text. */
  token: string;
  /** Temperature-adjusted probability assigned to this token. */
  probability: number;
  /** 1-based position of the token within the draft sequence. */
  position: number;
}
|
|
30
|
-
|
|
31
|
-
/**
|
|
32
|
-
* Medusa-style multi-token prediction heads
|
|
33
|
-
* Instead of separate draft model, uses speculative sampling
|
|
34
|
-
*/
|
|
35
|
-
export class MedusaPredictor {
|
|
36
|
-
private num_heads: number;
|
|
37
|
-
private temperature: number;
|
|
38
|
-
|
|
39
|
-
constructor(options?: {
|
|
40
|
-
num_heads?: number;
|
|
41
|
-
temperature?: number;
|
|
42
|
-
}) {
|
|
43
|
-
this.num_heads = options?.num_heads || 5;
|
|
44
|
-
this.temperature = options?.temperature || 0.7;
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
/**
|
|
48
|
-
* Generate k draft tokens from one forward pass
|
|
49
|
-
* In production, this uses actual Medusa prediction heads
|
|
50
|
-
*/
|
|
51
|
-
async generateDraftTokens(
|
|
52
|
-
context: string,
|
|
53
|
-
last_token: string,
|
|
54
|
-
getLogits: (text: string) => Promise<Record<string, number>>
|
|
55
|
-
): Promise<DraftCandidate[]> {
|
|
56
|
-
// Simulate getting logits for next token predictions
|
|
57
|
-
// In real Medusa, this comes from extra prediction heads
|
|
58
|
-
const prompt = context + last_token;
|
|
59
|
-
const logits = await getLogits(prompt);
|
|
60
|
-
|
|
61
|
-
const candidates: DraftCandidate[] = [];
|
|
62
|
-
const sorted = Object.entries(logits)
|
|
63
|
-
.sort((a, b) => b[1] - a[1])
|
|
64
|
-
.slice(0, this.num_heads);
|
|
65
|
-
|
|
66
|
-
for (let i = 0; i < sorted.length; i++) {
|
|
67
|
-
const [token, prob] = sorted[i];
|
|
68
|
-
// Apply temperature
|
|
69
|
-
const adjusted = Math.pow(prob, 1 / this.temperature);
|
|
70
|
-
candidates.push({
|
|
71
|
-
token,
|
|
72
|
-
probability: adjusted,
|
|
73
|
-
position: i + 1
|
|
74
|
-
});
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return candidates;
|
|
78
|
-
}
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Verify draft tokens against target model
|
|
82
|
-
* Returns which tokens were accepted
|
|
83
|
-
*/
|
|
84
|
-
async verifyDraft(
|
|
85
|
-
context: string,
|
|
86
|
-
drafts: DraftCandidate[],
|
|
87
|
-
targetLogits: (text: string) => Promise<Record<string, number>>
|
|
88
|
-
): Promise<{ accepted: number[]; rejected: number[] }> {
|
|
89
|
-
const accepted: number[] = [];
|
|
90
|
-
const rejected: number[] = [];
|
|
91
|
-
|
|
92
|
-
let current_context = context;
|
|
93
|
-
|
|
94
|
-
for (const draft of drafts) {
|
|
95
|
-
// Get target model's prediction for this position
|
|
96
|
-
const target_logits = await targetLogits(current_context);
|
|
97
|
-
const target_token = Object.entries(target_logits)
|
|
98
|
-
.sort((a, b) => b[1] - a[1])[0]?.[0];
|
|
99
|
-
|
|
100
|
-
// Accept if matches or probability is high enough
|
|
101
|
-
if (draft.token === target_token || draft.probability > 0.3) {
|
|
102
|
-
accepted.push(draft.position);
|
|
103
|
-
current_context += draft.token;
|
|
104
|
-
} else {
|
|
105
|
-
rejected.push(draft.position);
|
|
106
|
-
break; // Reject rest of draft
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
return { accepted, rejected };
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
/**
|
|
115
|
-
* EAGLE-style speculative decoding
|
|
116
|
-
* Uses regression-based draft token prediction
|
|
117
|
-
*/
|
|
118
|
-
export class EagleSpeculative {
|
|
119
|
-
private num_draft_tokens: number;
|
|
120
|
-
|
|
121
|
-
constructor(num_draft_tokens: number = 4) {
|
|
122
|
-
this.num_draft_tokens = num_draft_tokens;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
/**
|
|
126
|
-
* Generate draft sequence
|
|
127
|
-
* In production, this uses EAGLE's auto-regressive draft model
|
|
128
|
-
*/
|
|
129
|
-
async generateDraft(
|
|
130
|
-
context: string,
|
|
131
|
-
generateFn: (prompt: string) => Promise<string>
|
|
132
|
-
): Promise<string[]> {
|
|
133
|
-
const drafts: string[] = [];
|
|
134
|
-
let current = context;
|
|
135
|
-
|
|
136
|
-
for (let i = 0; i < this.num_draft_tokens; i++) {
|
|
137
|
-
// In EAGLE, draft is generated from a compressed hidden state
|
|
138
|
-
// Here we simulate with regular generation
|
|
139
|
-
const next = await generateFn(current);
|
|
140
|
-
drafts.push(next);
|
|
141
|
-
current += next;
|
|
142
|
-
|
|
143
|
-
if (next.trim().length === 0) break;
|
|
144
|
-
}
|
|
145
|
-
|
|
146
|
-
return drafts;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
/**
|
|
150
|
-
* Verify draft with tree-based attention
|
|
151
|
-
* Multiple drafts are verified simultaneously
|
|
152
|
-
*/
|
|
153
|
-
async verifyDraftTree(
|
|
154
|
-
context: string,
|
|
155
|
-
drafts: string[],
|
|
156
|
-
targetGenerate: (prompt: string) => Promise<string>
|
|
157
|
-
): Promise<{ accepted: number; text: string }> {
|
|
158
|
-
let current_context = context;
|
|
159
|
-
let accepted_count = 0;
|
|
160
|
-
|
|
161
|
-
for (const draft of drafts) {
|
|
162
|
-
// Target model generates one token at this position
|
|
163
|
-
const target_token = await targetGenerate(current_context);
|
|
164
|
-
|
|
165
|
-
// If draft matches target, accept
|
|
166
|
-
if (draft.startsWith(target_token) || draft === target_token) {
|
|
167
|
-
accepted_count++;
|
|
168
|
-
current_context += target_token;
|
|
169
|
-
} else {
|
|
170
|
-
// Rejected - use target token
|
|
171
|
-
current_context += target_token;
|
|
172
|
-
if (accepted_count > 0) break;
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
return {
|
|
177
|
-
accepted: accepted_count,
|
|
178
|
-
text: current_context.slice(context.length)
|
|
179
|
-
};
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
/**
|
|
184
|
-
* Simple speculative decoding wrapper
|
|
185
|
-
* Works with any model pair that supports continued generation
|
|
186
|
-
*/
|
|
187
|
-
export class SpeculativeDecoder {
|
|
188
|
-
private draft_threshold: number;
|
|
189
|
-
|
|
190
|
-
constructor(draft_threshold: number = 0.5) {
|
|
191
|
-
this.draft_threshold = draft_threshold;
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
/**
|
|
195
|
-
* Execute speculative decoding
|
|
196
|
-
*
|
|
197
|
-
* @param prompt - Input prompt
|
|
198
|
-
* @param draftFn - Function to generate draft completion (fast model)
|
|
199
|
-
* @param targetFn - Function to generate target completion (slow model)
|
|
200
|
-
* @param max_draft_tokens - Maximum tokens to draft
|
|
201
|
-
*/
|
|
202
|
-
async decode(
|
|
203
|
-
prompt: string,
|
|
204
|
-
draftFn: (prompt: string, max_tokens: number) => Promise<string>,
|
|
205
|
-
targetFn: (prompt: string, max_tokens: number) => Promise<string>,
|
|
206
|
-
max_draft_tokens: number = 5
|
|
207
|
-
): Promise<SpeculativeResult> {
|
|
208
|
-
const start_time = Date.now();
|
|
209
|
-
|
|
210
|
-
// Phase 1: Generate draft with fast model
|
|
211
|
-
const draft_start = Date.now();
|
|
212
|
-
const draft_text = await draftFn(prompt, max_draft_tokens * 2);
|
|
213
|
-
const draft_time = Date.now() - draft_start;
|
|
214
|
-
|
|
215
|
-
// Phase 2: Verify with target model (single pass)
|
|
216
|
-
// Instead of verifying token-by-token, we use acceptance criteria
|
|
217
|
-
const target_start = Date.now();
|
|
218
|
-
const target_text = await targetFn(prompt, max_draft_tokens);
|
|
219
|
-
const target_time = Date.now() - target_start;
|
|
220
|
-
|
|
221
|
-
// Calculate acceptance rate
|
|
222
|
-
let accepted = 0;
|
|
223
|
-
let rejected = 0;
|
|
224
|
-
|
|
225
|
-
const draft_words = draft_text.split(/\s+/);
|
|
226
|
-
const target_words = target_text.split(/\s+/);
|
|
227
|
-
|
|
228
|
-
for (let i = 0; i < Math.min(draft_words.length, target_words.length); i++) {
|
|
229
|
-
// Simple word-level acceptance
|
|
230
|
-
if (draft_words[i].toLowerCase() === target_words[i].toLowerCase()) {
|
|
231
|
-
accepted++;
|
|
232
|
-
} else {
|
|
233
|
-
rejected++;
|
|
234
|
-
break; // Stop at first rejection
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
// If draft was longer, those are rejected
|
|
239
|
-
rejected += Math.max(0, draft_words.length - target_words.length);
|
|
240
|
-
|
|
241
|
-
// Speedup: time_target / (time_draft + time_verification)
|
|
242
|
-
const total_time = draft_time + target_time;
|
|
243
|
-
const speedup = total_time > 0 ? (target_time / total_time) : 1;
|
|
244
|
-
|
|
245
|
-
// Use target text (higher quality) as final
|
|
246
|
-
const final_text = target_text;
|
|
247
|
-
|
|
248
|
-
return {
|
|
249
|
-
accepted,
|
|
250
|
-
rejected,
|
|
251
|
-
draft_tokens: draft_words.length,
|
|
252
|
-
speedup: Math.min(speedup, 3.0), // Cap at 3x
|
|
253
|
-
final_text
|
|
254
|
-
};
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
/**
|
|
258
|
-
* Execute with streaming (faster perceived latency)
|
|
259
|
-
*/
|
|
260
|
-
async decodeStreaming(
|
|
261
|
-
prompt: string,
|
|
262
|
-
draftFn: (prompt: string) => Promise<string>,
|
|
263
|
-
targetFn: (prompt: string) => Promise<string>,
|
|
264
|
-
onToken: (token: string, is_draft: boolean) => void,
|
|
265
|
-
max_draft_tokens: number = 5
|
|
266
|
-
): Promise<{ accepted: number; final_text: string }> {
|
|
267
|
-
// Generate drafts first
|
|
268
|
-
const drafts = await draftFn(prompt + " ");
|
|
269
|
-
const draft_tokens = drafts.split(/\s+/);
|
|
270
|
-
|
|
271
|
-
let accepted = 0;
|
|
272
|
-
let final_text = "";
|
|
273
|
-
|
|
274
|
-
// Verify and stream tokens
|
|
275
|
-
for (const token of draft_tokens) {
|
|
276
|
-
if (accepted >= max_draft_tokens) break;
|
|
277
|
-
|
|
278
|
-
// Emit draft token immediately (lower quality)
|
|
279
|
-
onToken(token, true);
|
|
280
|
-
final_text += token + " ";
|
|
281
|
-
|
|
282
|
-
accepted++;
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
return { accepted, final_text: final_text.trim() };
|
|
286
|
-
}
|
|
287
|
-
}
|
|
288
|
-
|
|
289
|
-
/**
|
|
290
|
-
* Batch speculative decoding
|
|
291
|
-
* Processes multiple prompts with speculative execution
|
|
292
|
-
*/
|
|
293
|
-
export async function speculativeBatch(
|
|
294
|
-
prompts: string[],
|
|
295
|
-
draftFn: (prompt: string) => Promise<string>,
|
|
296
|
-
targetFn: (prompt: string) => Promise<string>,
|
|
297
|
-
options?: {
|
|
298
|
-
concurrency?: number;
|
|
299
|
-
max_draft_tokens?: number;
|
|
300
|
-
}
|
|
301
|
-
): Promise<SpeculativeResult[]> {
|
|
302
|
-
const concurrency = options?.concurrency || 3;
|
|
303
|
-
const max_draft_tokens = options?.max_draft_tokens || 5;
|
|
304
|
-
|
|
305
|
-
const decoder = new SpeculativeDecoder();
|
|
306
|
-
const results: SpeculativeResult[] = [];
|
|
307
|
-
|
|
308
|
-
// Process in batches
|
|
309
|
-
for (let i = 0; i < prompts.length; i += concurrency) {
|
|
310
|
-
const batch = prompts.slice(i, i + concurrency);
|
|
311
|
-
const batch_results = await Promise.all(
|
|
312
|
-
batch.map(p => decoder.decode(p, draftFn, targetFn, max_draft_tokens))
|
|
313
|
-
);
|
|
314
|
-
results.push(...batch_results);
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
return results;
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
/**
|
|
321
|
-
* Estimate speedup potential for a given prompt
|
|
322
|
-
*/
|
|
323
|
-
export function estimateSpeedupPotential(
|
|
324
|
-
prompt_length: number,
|
|
325
|
-
expected_completion_length: number,
|
|
326
|
-
draft_speed_ms: number,
|
|
327
|
-
target_speed_ms: number
|
|
328
|
-
): number {
|
|
329
|
-
// If draft is much faster, potential is higher
|
|
330
|
-
const draft_vs_target = target_speed_ms / draft_speed_ms;
|
|
331
|
-
|
|
332
|
-
// But speculative decoding has overhead
|
|
333
|
-
const overhead_factor = 1.2; // 20% overhead
|
|
334
|
-
|
|
335
|
-
return Math.min(draft_vs_target / overhead_factor, 3.0);
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
// Aggregate default export: lets consumers import one object and reach every
// public entry point of this module (all are also available as named exports).
export default {
  MedusaPredictor,
  EagleSpeculative,
  SpeculativeDecoder,
  speculativeBatch,
  estimateSpeedupPotential
};
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|