thumbgate 1.16.12 → 1.16.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.well-known/mcp/server-card.json +1 -1
  4. package/README.md +3 -1
  5. package/adapters/claude/.mcp.json +2 -2
  6. package/adapters/mcp/server-stdio.js +26 -1
  7. package/adapters/opencode/opencode.json +1 -1
  8. package/bin/cli.js +420 -1
  9. package/config/gate-templates.json +372 -0
  10. package/config/mcp-allowlists.json +25 -0
  11. package/config/model-candidates.json +59 -2
  12. package/config/model-tiers.json +4 -1
  13. package/package.json +79 -22
  14. package/public/compare.html +6 -0
  15. package/public/index.html +144 -11
  16. package/public/numbers.html +11 -11
  17. package/public/pro.html +22 -24
  18. package/scripts/agent-design-governance.js +211 -0
  19. package/scripts/agent-reasoning-traces.js +683 -0
  20. package/scripts/agent-reward-model.js +438 -0
  21. package/scripts/agent-stack-survival-audit.js +231 -0
  22. package/scripts/ai-engineering-stack-guardrails.js +256 -0
  23. package/scripts/billing.js +16 -4
  24. package/scripts/chatgpt-ads-readiness-pack.js +195 -0
  25. package/scripts/cli-schema.js +277 -0
  26. package/scripts/code-graph-guardrails.js +176 -0
  27. package/scripts/deepseek-v4-runtime-guardrails.js +253 -0
  28. package/scripts/gemini-embedding-policy.js +198 -0
  29. package/scripts/inference-cache-policy.js +39 -0
  30. package/scripts/judge-reward-function.js +396 -0
  31. package/scripts/llm-behavior-monitor.js +251 -0
  32. package/scripts/long-running-agent-context-guardrails.js +176 -0
  33. package/scripts/multimodal-retrieval-plan.js +31 -11
  34. package/scripts/oss-pr-opportunity-scout.js +240 -0
  35. package/scripts/proactive-agent-eval-guardrails.js +230 -0
  36. package/scripts/profile-router.js +5 -4
  37. package/scripts/prompting-operating-system.js +273 -0
  38. package/scripts/proxy-pointer-rag-guardrails.js +189 -0
  39. package/scripts/rag-precision-guardrails.js +202 -0
  40. package/scripts/rate-limiter.js +1 -1
  41. package/scripts/reasoning-efficiency-guardrails.js +176 -0
  42. package/scripts/reward-hacking-guardrails.js +251 -0
  43. package/scripts/seo-gsd.js +1201 -11
  44. package/scripts/single-use-credential-gate.js +182 -0
  45. package/scripts/structured-prompt-driven.js +226 -0
  46. package/scripts/telemetry-analytics.js +31 -6
  47. package/scripts/tool-registry.js +92 -0
  48. package/scripts/upstream-contribution-engine.js +379 -0
  49. package/scripts/vector-store.js +119 -4
  50. package/src/api/server.js +333 -100
  51. package/scripts/agents-sdk-sandbox-plan.js +0 -57
  52. package/scripts/ai-org-governance.js +0 -98
  53. package/scripts/artifact-agent-plan.js +0 -81
  54. package/scripts/enterprise-agent-rollout.js +0 -34
  55. package/scripts/experience-replay-governance.js +0 -69
  56. package/scripts/inference-economics.js +0 -53
  57. package/scripts/knowledge-layer-plan.js +0 -108
  58. package/scripts/memory-store-governance.js +0 -60
  59. package/scripts/post-training-governance.js +0 -34
  60. package/scripts/production-agent-readiness.js +0 -40
  61. package/scripts/scaling-law-claims.js +0 -60
  62. package/scripts/student-consistent-training.js +0 -73
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ const { listGateTemplates } = require('./gate-templates');
5
+
6
+ const CATEGORY = 'Sparse Attention Runtime Safety';
7
+
8
/**
 * Coerce a loosely typed flag value into a strict boolean.
 *
 * Real booleans pass through unchanged, `null`/`undefined` count as false,
 * and anything else is stringified, trimmed and matched case-insensitively
 * against the accepted "truthy" spellings: 1, true, yes, on.
 *
 * @param {*} value - Raw flag value.
 * @returns {boolean} The normalized flag.
 */
function normalizeBoolean(value) {
  if (typeof value === 'boolean') return value;
  if (value == null) return false;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
13
+
14
/**
 * Convert a raw option value to a finite number.
 *
 * @param {*} value - Raw option value.
 * @returns {number|null} The parsed number, or `null` when the value is
 *   undefined, null, the empty string, or does not convert to a finite number.
 */
function toNumber(value) {
  const isBlank = value === undefined || value === null || value === '';
  if (isBlank) return null;
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
19
+
20
/**
 * Normalize raw CLI/programmatic options into the canonical guardrail shape.
 *
 * Each setting accepts one or more aliases (for example `context-tokens` or
 * `context`). Boolean flags go through `normalizeBoolean`, numeric settings
 * through `toNumber`, and string settings are trimmed with a default applied
 * when blank.
 *
 * @param {Object} [options={}] - Raw options keyed by flag name.
 * @returns {Object} Normalized options; unset numeric metrics are `null`.
 */
function normalizeOptions(options = {}) {
  // Pick the first alias that carries a value, then convert it. A plain `||`
  // chain discards a numeric 0, which turned a measured "0 tokens/sec" into
  // "not provided" and hid the throughput regression. Every other falsy value
  // still falls through to the next alias exactly as `||` did.
  const firstNumeric = (...candidates) => {
    const chosen = candidates.find((candidate) => candidate === 0 || Boolean(candidate));
    return toNumber(chosen === undefined ? candidates[candidates.length - 1] : chosen);
  };

  const training = normalizeBoolean(options.training || options.rl || options['verified-rl']);
  const kvOffload = normalizeBoolean(options['kv-offload'] || options['cpu-kv-offload'] || options.hisparse);

  return {
    workload: String(options.workload || options.name || 'deepseek-v4-runtime').trim() || 'deepseek-v4-runtime',
    model: String(options.model || 'deepseek-v4-flash').trim() || 'deepseek-v4-flash',
    engine: String(options.engine || 'sglang').trim() || 'sglang',
    contextTokens: firstNumeric(options['context-tokens'], options.context),
    // A zero or missing target is not meaningful, so it still defaults to 1M.
    targetContextTokens: firstNumeric(options['target-context-tokens'], options.target) || 1000000,
    baselineThroughput: firstNumeric(options['baseline-throughput'], options['baseline-tps']),
    newThroughput: firstNumeric(options['new-throughput'], options['new-tps']),
    hybridAttention: normalizeBoolean(options['hybrid-attention'] || options.hybrid),
    prefixCache: normalizeBoolean(options['prefix-cache'] || options.shadowradix),
    cacheCoherenceEval: normalizeBoolean(options['cache-coherence-eval'] || options['cache-eval']),
    speculativeDecoding: normalizeBoolean(options['speculative-decoding'] || options.speculative || options.mtp || options.eagle),
    acceptLength: firstNumeric(options['accept-length'], options['spec-accept-length']),
    kvOffload,
    training,
    rolloutReplay: normalizeBoolean(options['rollout-replay'] || options.r3),
    indexerReplay: normalizeBoolean(options['indexer-replay']),
    trainInferenceDrift: firstNumeric(options['train-inference-drift'], options.drift),
    precisionMode: String(options['precision-mode'] || options.precision || '').trim().toLowerCase(),
    deterministic: normalizeBoolean(options.deterministic || options['deterministic-kernels']),
    numericalSpikes: normalizeBoolean(options['numerical-spikes'] || options['kl-spikes']),
  };
}
46
+
47
/**
 * Compute the throughput regression between baseline and new measurements.
 *
 * @param {Object} options - Object carrying `baselineThroughput` and `newThroughput`.
 * @returns {number|null} Percentage drop rounded to two decimals (negative
 *   when throughput improved), or `null` when either value is `null` or the
 *   baseline is zero or negative.
 */
function throughputDropPercent(options) {
  const { baselineThroughput: before, newThroughput: after } = options;
  const measurable = before !== null && after !== null && !(before <= 0);
  if (!measurable) return null;
  const fraction = (before - after) / before;
  return Number((fraction * 100).toFixed(2));
}
51
+
52
/**
 * Decide whether the workload counts as long-context.
 *
 * Uses the declared context size when it is truthy, otherwise the target
 * context size, and compares it against a 128000-token floor.
 *
 * @param {Object} options - Object carrying `contextTokens` and `targetContextTokens`.
 * @returns {boolean} True when the effective context is at least 128000 tokens.
 */
function isLongContext(options) {
  const LONG_CONTEXT_FLOOR = 128000;
  const effective = options.contextTokens ? options.contextTokens : options.targetContextTokens;
  return effective >= LONG_CONTEXT_FLOOR;
}
56
+
57
/**
 * Report whether the precision mode names a reduced or mixed precision path.
 *
 * @param {Object} options - Object carrying `precisionMode`.
 * @returns {boolean} True when the mode contains "fp4", "fp8", "mxfp" or "mixed".
 */
function usesMixedPrecision(options) {
  const reducedPrecisionPattern = /fp4|fp8|mxfp|mixed/;
  return reducedPrecisionPattern.test(options.precisionMode);
}
60
+
61
/**
 * Decide whether a gate template should be recommended for these options.
 *
 * Each known template id has its own trigger condition; unknown ids are never
 * recommended.
 *
 * @param {Object} template - Gate template; only `template.id` is read.
 * @param {Object} options - Normalized options (see `normalizeOptions`).
 * @returns {boolean} True when the template applies to the described workload.
 */
function templateApplicability(template, options) {
  const drop = throughputDropPercent(options);

  switch (template.id) {
    case 'require-hybrid-prefix-cache-coherence-eval': {
      // Exposed when hybrid attention or long context is in play; covered only
      // when both the prefix cache and its coherence eval are present.
      const exposed = options.hybridAttention || isLongContext(options);
      const covered = options.prefixCache && options.cacheCoherenceEval;
      return exposed && !covered;
    }
    case 'checkpoint-speculative-decoding-acceptance': {
      const acceptanceUnproven = options.acceptLength === null || options.acceptLength < 2;
      return options.speculativeDecoding && (acceptanceUnproven || !options.cacheCoherenceEval);
    }
    case 'require-long-context-kv-offload-capacity-plan':
      return isLongContext(options) && !options.kvOffload;
    case 'require-rollout-routing-and-indexer-replay': {
      const replayIncomplete = !options.rolloutReplay || !options.indexerReplay;
      const driftExceeded = options.trainInferenceDrift !== null && options.trainInferenceDrift > 0.05;
      return options.training && (replayIncomplete || driftExceeded);
    }
    case 'checkpoint-mixed-precision-determinism':
      return (usesMixedPrecision(options) || options.numericalSpikes) && !options.deterministic;
    case 'checkpoint-long-context-throughput-regression':
      // Only a measured drop above 10% triggers the regression checkpoint.
      return drop !== null && drop > 10;
    default:
      return false;
  }
}
83
+
84
/**
 * Collect every runtime signal that applies to the normalized options.
 *
 * Detectors run in a fixed order, and those that return `null` are left out.
 *
 * @param {Object} options - Normalized options (see `normalizeOptions`).
 * @returns {Object[]} Signal descriptors in detector order.
 */
function buildSignals(options) {
  const drop = throughputDropPercent(options);
  const detectors = [
    () => hybridAttentionSignal(options),
    () => speculativeDecodingSignal(options),
    () => longContextSignal(options, drop),
    () => verifiedReplaySignal(options),
    () => mixedPrecisionSignal(options),
  ];

  const signals = [];
  for (const detect of detectors) {
    const signal = detect();
    if (signal) signals.push(signal);
  }
  return signals;
}
94
+
95
/**
 * Describe the hybrid-attention / prefix-cache signal, when relevant.
 *
 * @param {Object} options - Normalized options.
 * @returns {Object|null} Signal descriptor, or `null` when none of hybrid
 *   attention, long context or prefix caching applies.
 */
function hybridAttentionSignal(options) {
  const relevant = options.hybridAttention || isLongContext(options) || options.prefixCache;
  if (!relevant) return null;

  const values = [];
  if (options.hybridAttention) values.push('hybrid attention');
  values.push(options.prefixCache ? 'prefix cache enabled' : 'prefix cache missing');
  values.push(options.cacheCoherenceEval ? 'coherence eval present' : 'missing coherence eval');
  if (options.contextTokens !== null) values.push(`${options.contextTokens} context tokens`);

  return {
    id: 'hybrid_attention_cache',
    label: 'Hybrid attention prefix cache',
    values,
    risk: 'SWA, compressed KV, and compression-state pools can drift unless cache lifetime and reuse are verified.',
  };
}
109
+
110
/**
 * Describe the speculative-decoding signal, when relevant.
 *
 * @param {Object} options - Normalized options.
 * @returns {Object|null} Signal descriptor, or `null` when speculative
 *   decoding is off and no accept length was supplied.
 */
function speculativeDecodingSignal(options) {
  const hasAcceptLength = options.acceptLength !== null;
  if (!options.speculativeDecoding && !hasAcceptLength) return null;

  const declaration = options.speculativeDecoding
    ? 'speculative decoding enabled'
    : 'speculative decoding not declared';
  const acceptance = hasAcceptLength
    ? `${options.acceptLength} accept length`
    : 'accept length missing';

  return {
    id: 'speculative_decoding',
    label: 'Speculative decoding rollout',
    values: [declaration, acceptance],
    risk: 'Draft-token metadata and rollback paths can make throughput claims look good while correctness or acceptance collapses.',
  };
}
122
+
123
/**
 * Describe the long-context capacity signal, when relevant.
 *
 * @param {Object} options - Normalized options.
 * @param {number|null} drop - Throughput drop percentage, or `null` when unmeasured.
 * @returns {Object|null} Signal descriptor, or `null` for short-context workloads.
 */
function longContextSignal(options, drop) {
  if (!isLongContext(options)) return null;

  const values = [
    `${options.contextTokens || options.targetContextTokens} token context target`,
    options.kvOffload ? 'KV offload present' : 'KV offload missing',
  ];
  if (drop !== null) values.push(`${drop}% throughput drop`);

  return {
    id: 'long_context_capacity',
    label: 'Long-context capacity plan',
    values,
    risk: 'Long-context serving can hit memory ceilings or hidden throughput regressions without capacity and benchmark gates.',
  };
}
136
+
137
/**
 * Describe the RL replay-safety signal for training workloads.
 *
 * @param {Object} options - Normalized options.
 * @returns {Object|null} Signal descriptor, or `null` when `options.training` is falsy.
 */
function verifiedReplaySignal(options) {
  if (!options.training) return null;

  const values = [
    `rollout replay ${options.rolloutReplay ? 'present' : 'missing'}`,
    `indexer replay ${options.indexerReplay ? 'present' : 'missing'}`,
  ];
  if (options.trainInferenceDrift !== null) {
    values.push(`${options.trainInferenceDrift} train-inference drift`);
  }

  return {
    id: 'verified_rl_replay',
    label: 'Verified RL replay safety',
    values,
    risk: 'Sparse routing and indexer decisions must be replayed or training can optimize against a different path than rollout served.',
  };
}
150
+
151
/**
 * Describe the mixed-precision determinism signal, when relevant.
 *
 * @param {Object} options - Normalized options.
 * @returns {Object|null} Signal descriptor, or `null` when neither a reduced
 *   precision mode nor numerical spikes were reported.
 */
function mixedPrecisionSignal(options) {
  const relevant = usesMixedPrecision(options) || options.numericalSpikes;
  if (!relevant) return null;

  const values = [
    options.precisionMode || 'precision mode unspecified',
    options.deterministic ? 'determinism enabled' : 'determinism missing',
  ];
  if (options.numericalSpikes) values.push('numerical spikes observed');

  return {
    id: 'mixed_precision_determinism',
    label: 'Mixed precision determinism',
    values,
    risk: 'FP4/FP8 rollout and training can introduce silent numerical drift without deterministic and FP32-sensitive-path checks.',
  };
}
164
+
165
/**
 * Build the DeepSeek-V4 runtime guardrails report.
 *
 * Normalizes the raw options, loads the gate templates in this module's
 * category, marks each one as recommended or not, and bundles the detected
 * signals, key metrics and static next actions into one report object.
 *
 * @param {Object} [rawOptions={}] - Raw CLI/programmatic options.
 * @param {string} [templatesPath] - Passed through to `listGateTemplates`
 *   (presumably an override for the templates file location; confirm there).
 * @returns {Object} Report with `status` of 'actionable' when at least one
 *   template is recommended, otherwise 'ready'.
 */
function buildDeepSeekV4RuntimeGuardrailsPlan(rawOptions = {}, templatesPath) {
  const options = normalizeOptions(rawOptions);

  const templates = [];
  for (const template of listGateTemplates(templatesPath)) {
    if (template.category !== CATEGORY) continue;
    templates.push({
      ...template,
      recommended: templateApplicability(template, options),
    });
  }

  const signals = buildSignals(options);
  const recommendedTemplateCount = templates.filter((entry) => entry.recommended).length;
  const status = recommendedTemplateCount > 0 ? 'actionable' : 'ready';

  const metrics = {
    contextTokens: options.contextTokens,
    targetContextTokens: options.targetContextTokens,
    baselineThroughput: options.baselineThroughput,
    newThroughput: options.newThroughput,
    throughputDropPercent: throughputDropPercent(options),
    acceptLength: options.acceptLength,
    trainInferenceDrift: options.trainInferenceDrift,
  };

  const summary = {
    signalCount: signals.length,
    templateCount: templates.length,
    recommendedTemplateCount,
  };

  const nextActions = [
    'Benchmark DeepSeek-V4 behind the same ThumbGate eval harness before changing routing defaults.',
    'Require cache-coherence and rollback evidence before enabling hybrid prefix caching or speculative decoding.',
    'Keep long-context memory and throughput budgets explicit before raising context windows.',
    'For RL or fine-tuning, require rollout-routing replay, indexer replay, and train-inference drift checks.',
    'Treat FP4/FP8 or mixed-precision paths as gated rollouts until deterministic and sensitive-FP32 checks pass.',
  ];

  return {
    name: 'thumbgate-deepseek-v4-runtime-guardrails',
    status,
    workload: options.workload,
    model: options.model,
    engine: options.engine,
    metrics,
    summary,
    signals,
    templates,
    nextActions,
    exampleCommand: 'npx thumbgate deepseek-v4-runtime-guardrails --context-tokens=900000 --hybrid-attention --speculative-decoding --accept-length=1.4 --precision-mode=fp8 --training --json',
  };
}
208
+
209
/**
 * Render a guardrails report as human-readable text.
 *
 * Sections appear in this order: header and summary, optional metric lines
 * (only for metrics that are not `null`), detected signals (when any),
 * recommended templates, next actions and the example command.
 *
 * @param {Object} report - Report produced by `buildDeepSeekV4RuntimeGuardrailsPlan`.
 * @returns {string} Newline-joined text ending with a trailing newline.
 */
function formatDeepSeekV4RuntimeGuardrailsPlan(report) {
  const { metrics, summary } = report;
  const out = [];

  out.push('');
  out.push('ThumbGate DeepSeek-V4 Runtime Guardrails');
  out.push('-'.repeat(43));
  out.push(`Status : ${report.status}`);
  out.push(`Workload: ${report.workload}`);
  out.push(`Model : ${report.model}`);
  out.push(`Engine : ${report.engine}`);
  out.push(`Signals : ${summary.signalCount}`);
  out.push(`Templates: ${summary.recommendedTemplateCount}/${summary.templateCount} recommended`);

  // Optional metric lines: [label, value, suffix]; skipped when the value is null.
  const optionalMetrics = [
    ['Context tokens', metrics.contextTokens, ''],
    ['Throughput drop', metrics.throughputDropPercent, '%'],
    ['Spec accept length', metrics.acceptLength, ''],
    ['Train/inference drift', metrics.trainInferenceDrift, ''],
  ];
  for (const [label, value, suffix] of optionalMetrics) {
    if (value !== null) out.push(`${label}: ${value}${suffix}`);
  }

  if (report.signals.length > 0) {
    out.push('', 'Detected runtime signals:');
    report.signals.forEach((signal) => {
      out.push(` - ${signal.label}: ${signal.values.join(', ')}`);
      out.push(` Risk: ${signal.risk}`);
    });
  }

  out.push('', 'Recommended templates:');
  const recommended = report.templates.filter((template) => template.recommended);
  if (recommended.length === 0) {
    out.push(' - No sparse-attention runtime risks were passed.');
  }
  recommended.forEach((template) => {
    out.push(` - ${template.id} [${template.defaultAction}]`);
    out.push(` ${template.roi}`);
  });

  out.push('', 'Next actions:');
  report.nextActions.forEach((action) => {
    out.push(` - ${action}`);
  });

  out.push('', `Example: ${report.exampleCommand}`, '');
  return `${out.join('\n')}\n`;
}
247
+
248
+ module.exports = {
249
+ buildDeepSeekV4RuntimeGuardrailsPlan,
250
+ formatDeepSeekV4RuntimeGuardrailsPlan,
251
+ normalizeOptions,
252
+ throughputDropPercent,
253
+ };
@@ -0,0 +1,198 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
/** Default Gemini embedding model id used when no override is supplied. */
const GEMINI_EMBEDDING_2_MODEL = 'gemini-embedding-2';

/** Output sizes this module snaps requested dimensionalities to. */
const RECOMMENDED_OUTPUT_DIMENSIONS = [3072, 1536, 768];

/** Dimensionality used when none (or an invalid one) is requested. */
const DEFAULT_OUTPUT_DIMENSIONALITY = 768;

/**
 * Multimodal input limits surfaced verbatim in the config and rollout plan.
 * NOTE(review): values are not enforced in this file; confirm against the
 * provider documentation before relying on them.
 */
const MULTIMODAL_LIMITS = Object.freeze({
  maxTextTokens: 8192,
  maxImages: 6,
  maxVideoSeconds: 120,
  maxAudioSeconds: 180,
  maxPdfPages: 6,
  languages: '100+',
});

/** Task names listed as asymmetric (query and document embedded differently). */
const ASYMMETRIC_TASKS = new Set(['question answering', 'fact checking', 'code retrieval', 'search result']);

/** Task names where every input is embedded with the same query-style prefix. */
const SYMMETRIC_TASKS = new Set(['anomaly detection', 'classification', 'clustering', 'sentence similarity']);

/** Task-type identifiers returned by `resolveGeminiTaskType`. */
const GEMINI_TASK_TYPES = Object.freeze({
  query: 'RETRIEVAL_QUERY',
  document: 'RETRIEVAL_DOCUMENT',
  classification: 'CLASSIFICATION',
  clustering: 'CLUSTERING',
  sentenceSimilarity: 'SEMANTIC_SIMILARITY',
});
38
+
39
/**
 * Canonicalize a task name: lower-case it, turn runs of `_` or `-` into a
 * single space, collapse whitespace, and trim.
 *
 * Trimming happens last so that leading or trailing separators (for example
 * `'_sentence-similarity_'`) do not leave stray spaces that stop the result
 * from matching the known task names.
 *
 * @param {*} task - Raw task name; falsy values use `fallback`.
 * @param {string} [fallback='code retrieval'] - Task used when `task` is
 *   missing or normalizes to an empty string.
 * @returns {string} The normalized task name, or `fallback`.
 */
function normalizeTask(task, fallback = 'code retrieval') {
  const normalized = String(task || fallback)
    .toLowerCase()
    .replace(/[_-]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return normalized || fallback;
}
47
+
48
/**
 * Normalize an embedding kind to one of 'query', 'document' or 'symmetric'.
 *
 * @param {*} kind - Raw kind; falsy or unrecognized values become 'document'.
 * @returns {string} The normalized kind.
 */
function normalizeEmbeddingKind(kind) {
  const knownKinds = ['query', 'document', 'symmetric'];
  const candidate = String(kind || 'document').trim().toLowerCase();
  return knownKinds.includes(candidate) ? candidate : 'document';
}
55
+
56
/**
 * Check whether a task name, after normalization, is one of the symmetric tasks.
 *
 * @param {*} task - Raw task name.
 * @returns {boolean} True when the normalized task is in `SYMMETRIC_TASKS`.
 */
function isSymmetricTask(task) {
  const canonicalTask = normalizeTask(task);
  return SYMMETRIC_TASKS.has(canonicalTask);
}
59
+
60
/**
 * Build the prefixed text sent to the embedding model.
 *
 * Queries, and any input for a symmetric task, use the form
 * `task: <task> | query: <text>`. Documents use `title: <title> | text: <text>`.
 *
 * @param {Object} [params={}]
 * @param {*} [params.content] - Text to embed; blank content yields ''.
 * @param {string} [params.kind='document'] - 'query', 'document' or 'symmetric'.
 * @param {string} [params.task='code retrieval'] - Task name; normalized before use.
 * @param {string} [params.title='none'] - Document title; blank becomes 'none'.
 * @returns {string} The prefixed text, or '' when there is no content.
 */
function prepareEmbeddingText({ content, kind = 'document', task = 'code retrieval', title = 'none' } = {}) {
  const body = String(content || '').trim();
  const taskName = normalizeTask(task);
  // A symmetric task overrides whatever kind the caller asked for.
  const effectiveKind = isSymmetricTask(taskName) ? 'symmetric' : normalizeEmbeddingKind(kind);

  if (body === '') return '';

  const isDocument = effectiveKind !== 'query' && effectiveKind !== 'symmetric';
  if (isDocument) {
    const documentTitle = String(title || 'none').trim() || 'none';
    return `title: ${documentTitle} | text: ${body}`;
  }
  return `task: ${taskName} | query: ${body}`;
}
74
+
75
/**
 * Map a task and embedding kind to a task-type identifier.
 *
 * Classification, clustering and sentence-similarity tasks map directly.
 * Any other task is resolved from the embedding kind.
 *
 * @param {Object} [params={}]
 * @param {string} [params.kind='document'] - Embedding kind.
 * @param {string} [params.task='code retrieval'] - Task name; normalized before use.
 * @returns {string|undefined} A value from `GEMINI_TASK_TYPES`, or `undefined`
 *   when the kind is 'symmetric' and the task has no direct mapping.
 */
function resolveGeminiTaskType({ kind = 'document', task = 'code retrieval' } = {}) {
  switch (normalizeTask(task)) {
    case 'classification':
      return GEMINI_TASK_TYPES.classification;
    case 'clustering':
      return GEMINI_TASK_TYPES.clustering;
    case 'sentence similarity':
      return GEMINI_TASK_TYPES.sentenceSimilarity;
    default:
      break;
  }

  switch (normalizeEmbeddingKind(kind)) {
    case 'query':
      return GEMINI_TASK_TYPES.query;
    case 'document':
      return GEMINI_TASK_TYPES.document;
    default:
      return undefined;
  }
}
86
+
87
/**
 * Turn a model id into a `models/<id>` resource name.
 *
 * @param {*} model - Model id; falsy or blank values use the default model.
 * @returns {string} The id prefixed with `models/` unless it already is.
 */
function resolveGeminiModelResource(model) {
  const modelId = String(model || GEMINI_EMBEDDING_2_MODEL).trim() || GEMINI_EMBEDDING_2_MODEL;
  if (modelId.startsWith('models/')) return modelId;
  return `models/${modelId}`;
}
91
+
92
/**
 * Parse a loosely typed boolean setting.
 *
 * @param {*} value - Raw value (boolean, string, number, ...).
 * @param {boolean} [fallback=false] - Returned for null, undefined, '' and
 *   any spelling that is not recognized.
 * @returns {boolean} True for 1/true/yes/on, false for 0/false/no/off
 *   (case-insensitive, trimmed); otherwise `fallback`.
 */
function parseBoolean(value, fallback = false) {
  if (value == null || value === '') return fallback;
  if (typeof value === 'boolean') return value;

  switch (String(value).trim().toLowerCase()) {
    case '1':
    case 'true':
    case 'yes':
    case 'on':
      return true;
    case '0':
    case 'false':
    case 'no':
    case 'off':
      return false;
    default:
      return fallback;
  }
}
100
+
101
/**
 * Parse a value as a positive integer, rounding fractions down.
 *
 * The positivity check runs after flooring, so a fraction between 0 and 1
 * yields `fallback` rather than 0.
 *
 * @param {*} value - Raw value to convert with `Number()`.
 * @param {*} fallback - Returned when the value is not finite or floors below 1.
 * @returns {*} An integer >= 1, or `fallback`.
 */
function parsePositiveInteger(value, fallback) {
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return fallback;
  const whole = Math.floor(parsed);
  return whole >= 1 ? whole : fallback;
}
106
+
107
/**
 * Snap a requested embedding size to one of the recommended dimensions.
 *
 * Invalid or missing values use the default. A recommended size is returned
 * as is. Any other size resolves to the numerically closest recommended
 * size; the default wins ties because only a strictly closer candidate
 * replaces it.
 *
 * @param {*} value - Requested dimensionality.
 * @returns {number} One of `RECOMMENDED_OUTPUT_DIMENSIONS`.
 */
function normalizeOutputDimensionality(value) {
  const requested = parsePositiveInteger(value, DEFAULT_OUTPUT_DIMENSIONALITY);
  if (RECOMMENDED_OUTPUT_DIMENSIONS.includes(requested)) return requested;

  let nearest = DEFAULT_OUTPUT_DIMENSIONALITY;
  for (const candidate of RECOMMENDED_OUTPUT_DIMENSIONS) {
    if (Math.abs(candidate - requested) < Math.abs(nearest - requested)) {
      nearest = candidate;
    }
  }
  return nearest;
}
114
+
115
/**
 * Derive the embedding configuration from environment variables.
 *
 * Gemini is enabled when `THUMBGATE_GEMINI_EMBEDDINGS` parses as true or the
 * provider variable equals 'gemini'. Otherwise the provider is 'local'.
 *
 * @param {Object} [env=process.env] - Environment-like key/value map.
 * @returns {Object} Config with provider, model, API key and base URL, output
 *   dimensionality, fallback flag, default task and the static limits and
 *   recommended dimensions.
 */
function resolveGeminiEmbeddingConfig(env = process.env) {
  const rawProvider = env.THUMBGATE_EMBED_PROVIDER || env.THUMBGATE_EMBEDDING_PROVIDER || 'local';
  const provider = String(rawProvider).trim().toLowerCase();
  const explicitGemini = parseBoolean(env.THUMBGATE_GEMINI_EMBEDDINGS, false);
  const apiKey = env.GEMINI_API_KEY || env.GOOGLE_API_KEY || env.GOOGLE_GENERATIVE_AI_API_KEY || '';
  const enabled = explicitGemini || provider === 'gemini';

  const model = String(env.THUMBGATE_GEMINI_EMBED_MODEL || GEMINI_EMBEDDING_2_MODEL).trim() || GEMINI_EMBEDDING_2_MODEL;
  const apiBaseUrl = trimTrailingSlashes(
    env.THUMBGATE_GEMINI_API_BASE_URL || 'https://generativelanguage.googleapis.com/v1beta',
  );
  const requestedDimensions = env.THUMBGATE_GEMINI_EMBED_DIM || env.THUMBGATE_EMBED_DIM;

  return {
    enabled,
    provider: enabled ? 'gemini' : 'local',
    model,
    apiKey,
    apiBaseUrl,
    outputDimensionality: normalizeOutputDimensionality(requestedDimensions),
    fallbackToLocal: parseBoolean(env.THUMBGATE_GEMINI_EMBED_FALLBACK_LOCAL, true),
    defaultTask: normalizeTask(env.THUMBGATE_GEMINI_EMBED_TASK || 'code retrieval'),
    multimodalLimits: MULTIMODAL_LIMITS,
    recommendedOutputDimensions: RECOMMENDED_OUTPUT_DIMENSIONS,
  };
}
136
+
137
/**
 * Remove every trailing '/' from the string form of a value.
 *
 * @param {*} value - Value to stringify and trim.
 * @returns {string} The string without trailing slashes.
 */
function trimTrailingSlashes(value) {
  const text = String(value);
  // Walk back from the end to the last character that is not a slash.
  let end = text.length;
  while (end > 0 && text[end - 1] === '/') end -= 1;
  return text.slice(0, end);
}
142
+
143
/**
 * Build a rollout plan for adopting Gemini embeddings.
 *
 * @param {Object} [args={}]
 * @param {*} [args.corpusItems] - Number of items to embed; defaults to 5000.
 * @param {*} [args.outputDimensionality] - Requested vector size; snapped to a
 *   recommended dimension. `args.maxEmbeddingDim` is used when this is falsy.
 * @param {string} [args.task] - Task name; defaults to 'code retrieval'.
 * @param {*} [args.useBatchApi] - Batch API toggle; defaults to enabled.
 *   Accepts booleans and the spellings understood by `parseBoolean`, so
 *   'false', '0', 'no' and 'off' disable it.
 * @returns {Object} Plan with model, sizing estimate, prompt prefixes, API
 *   hints, limits, economics notes and rollout steps.
 */
function buildGeminiEmbeddingRolloutPlan(args = {}) {
  const corpusItems = parsePositiveInteger(args.corpusItems, 5000);
  const outputDimensionality = normalizeOutputDimensionality(args.outputDimensionality || args.maxEmbeddingDim);
  const task = normalizeTask(args.task || 'code retrieval');
  // `!== false` treated string flags such as 'false' as enabled. Parse the
  // flag like the other boolean settings in this module, defaulting to true.
  const useBatchApi = parseBoolean(args.useBatchApi, true);
  // 4 bytes per float32 component, reported in MiB.
  const vectorMb = Number(((corpusItems * outputDimensionality * 4) / (1024 * 1024)).toFixed(2));

  return {
    model: GEMINI_EMBEDDING_2_MODEL,
    task,
    outputDimensionality,
    corpusItems,
    estimatedFloat32Mb: vectorMb,
    taskPrefixes: {
      query: prepareEmbeddingText({ kind: 'query', task, content: '{query}' }),
      document: prepareEmbeddingText({ kind: 'document', task, title: '{title}', content: '{content}' }),
      symmetric: prepareEmbeddingText({ kind: 'symmetric', task: 'classification', content: '{content}' }),
    },
    apiHints: {
      queryTaskType: resolveGeminiTaskType({ kind: 'query', task }),
      documentTaskType: resolveGeminiTaskType({ kind: 'document', task }),
      modelResource: resolveGeminiModelResource(GEMINI_EMBEDDING_2_MODEL),
    },
    modalityLimits: MULTIMODAL_LIMITS,
    economics: {
      recommendedDimensions: RECOMMENDED_OUTPUT_DIMENSIONS,
      storageDefault: outputDimensionality,
      batchApi: useBatchApi ? 'Use for offline re-indexing; Google positions Batch API embeddings at 50% of default price.' : 'Skip Batch API only for latency-sensitive incremental writes.',
    },
    rolloutSteps: [
      'Keep local embeddings as the default offline path.',
      'Enable Gemini Embedding 2 only when a Gemini API key is present.',
      'Use task-specific query/document prefixes at index and retrieval time.',
      'Start at 768 dimensions, then benchmark 1536 only if recall misses show up.',
      'Use Batch API for full re-indexes and online embed_content for fresh feedback events.',
    ],
  };
}
181
+
182
+ module.exports = {
183
+ ASYMMETRIC_TASKS,
184
+ DEFAULT_OUTPUT_DIMENSIONALITY,
185
+ GEMINI_EMBEDDING_2_MODEL,
186
+ MULTIMODAL_LIMITS,
187
+ RECOMMENDED_OUTPUT_DIMENSIONS,
188
+ SYMMETRIC_TASKS,
189
+ GEMINI_TASK_TYPES,
190
+ buildGeminiEmbeddingRolloutPlan,
191
+ isSymmetricTask,
192
+ normalizeOutputDimensionality,
193
+ normalizeTask,
194
+ prepareEmbeddingText,
195
+ resolveGeminiEmbeddingConfig,
196
+ resolveGeminiModelResource,
197
+ resolveGeminiTaskType,
198
+ };
@@ -66,7 +66,46 @@ function evaluateCacheCandidate(candidate = {}) {
66
66
  };
67
67
  }
68
68
 
69
/**
 * Plan whether depth-wise (cross-layer) KV cache sharing is worth piloting.
 *
 * @param {Object} [options={}]
 * @param {*} [options.layerCount] - Model depth. Missing or non-numeric
 *   values count as 0, so they are flagged as too shallow rather than
 *   slipping past the check as NaN.
 * @param {*} [options.cacheBudgetRatio] - KV budget relative to a full cache.
 *   Missing or non-numeric values count as 1 (unconstrained).
 * @param {boolean} [options.trainingAdapted] - Model was trained or fine-tuned
 *   for sharing; `options.randomCrossLayerAttention` also counts.
 * @param {boolean} [options.latencySensitive]
 * @param {boolean} [options.unknownHardware]
 * @param {boolean} [options.dataConstrained]
 * @returns {Object} Plan with `decision` ('pilot' or 'research'), the list of
 *   issues, the target shared-layer ratio, the estimated memory reduction,
 *   deployment modes, recommended workload and rollout gates.
 */
function planDepthWiseKvSharing(options = {}) {
  const parsedLayerCount = Number(options.layerCount || 0);
  const layerCount = Number.isFinite(parsedLayerCount) ? parsedLayerCount : 0;
  const parsedBudgetRatio = Number(options.cacheBudgetRatio || 1);
  const cacheBudgetRatio = Number.isFinite(parsedBudgetRatio) ? parsedBudgetRatio : 1;
  const trainingAdapted = Boolean(options.trainingAdapted || options.randomCrossLayerAttention);
  const latencySensitive = Boolean(options.latencySensitive);
  const unknownHardware = Boolean(options.unknownHardware);
  const dataConstrained = Boolean(options.dataConstrained);
  const issues = [];

  if (layerCount < 12) issues.push('model_too_shallow_for_depth_sharing_roi');
  if (!trainingAdapted) issues.push('requires_training_or_finetune_adaptation');
  if (cacheBudgetRatio >= 0.9) issues.push('kv_memory_budget_not_constrained');
  if (latencySensitive && !trainingAdapted) issues.push('avoid_runtime_only_cross_layer_sharing_for_ttfb');

  // Tighter budgets justify sharing a larger fraction of layers.
  let targetSharedLayerRatio = 0;
  if (cacheBudgetRatio <= 0.5) {
    targetSharedLayerRatio = 0.5;
  } else if (cacheBudgetRatio <= 0.75) {
    targetSharedLayerRatio = 0.25;
  }

  const estimatedKvMemoryReduction = Number((targetSharedLayerRatio * 0.9).toFixed(2));
  const deploymentModes = [
    'full-kv-cache',
    targetSharedLayerRatio >= 0.25 ? 'share-every-fourth-layer' : null,
    targetSharedLayerRatio >= 0.5 ? 'share-every-other-layer' : null,
  ].filter(Boolean);

  // An unconstrained budget alone does not block a pilot; any other issue does.
  const hasBlockingIssue = issues.some((issue) => issue !== 'kv_memory_budget_not_constrained');

  return {
    decision: hasBlockingIssue ? 'research' : 'pilot',
    issues,
    technique: 'stochastic-kv-routing-depth-wise-cache-sharing',
    targetSharedLayerRatio,
    estimatedKvMemoryReduction,
    deploymentModes,
    recommendedWorkload: unknownHardware || dataConstrained ? 'adaptive-serving-pilot' : 'benchmark-before-rollout',
    gates: [
      'compare quality against full-cache baseline',
      'measure time-to-first-token and tokens/sec',
      'block rollout if golden eval pass rate regresses',
    ],
  };
}
106
+
69
107
  module.exports = {
70
108
  buildInferenceCachePolicy,
71
109
  evaluateCacheCandidate,
110
+ planDepthWiseKvSharing,
72
111
  };