thumbgate 1.16.20 → 1.16.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +3 -2
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bench/programbench-smoke.json +71 -0
- package/bench/thumbgate-bench.json +131 -0
- package/bin/cli.js +79 -2
- package/config/pro/constraints-pro.json +57 -0
- package/config/pro/prevention-rules-pro.md +27 -0
- package/config/pro/reminders-pro.json +38 -0
- package/config/pro/thompson-presets.json +38 -0
- package/package.json +16 -8
- package/public/dashboard.html +1 -1
- package/public/guide.html +5 -3
- package/public/index.html +43 -31
- package/public/lessons.html +1 -1
- package/public/numbers.html +45 -32
- package/public/pro.html +31 -88
- package/scripts/billing.js +3 -3
- package/scripts/gate-stats.js +29 -8
- package/scripts/harness-selector.js +188 -0
- package/scripts/rag-precision-guardrails.js +63 -1
- package/scripts/rate-limiter.js +1 -1
- package/scripts/reasoning-efficiency-guardrails.js +73 -1
- package/scripts/thumbgate-bench.js +707 -0
- package/src/api/server.js +66 -13
|
@@ -11,6 +11,13 @@ function normalizeBoolean(value) {
|
|
|
11
11
|
return /^(1|true|yes|on)$/i.test(String(value).trim());
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
+
/**
 * Tri-state boolean parser for optional flags.
 *
 * @param {*} value - Raw option value (boolean, string, number, or absent).
 * @returns {boolean|null} `null` when the value is undefined, null, or the
 *   empty string; the value itself when it is already a boolean; otherwise
 *   `true` only if its trimmed string form is 1/true/yes/on
 *   (case-insensitive), and `false` for anything else.
 */
function normalizeOptionalBoolean(value) {
  // Absent input is kept distinct from an explicit "false".
  const isMissing = value === undefined || value === null || value === '';
  if (isMissing) {
    return null;
  }

  // Real booleans pass straight through.
  if (typeof value === 'boolean') {
    return value;
  }

  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
|
|
20
|
+
|
|
14
21
|
function toNumber(value) {
|
|
15
22
|
if (value === undefined || value === null || value === '') return null;
|
|
16
23
|
const num = Number(value);
|
|
@@ -28,6 +35,16 @@ function normalizeOptions(options = {}) {
|
|
|
28
35
|
lowConfidenceSteps: toNumber(options['low-confidence-steps']),
|
|
29
36
|
highConfidenceFailures: toNumber(options['high-confidence-failures']),
|
|
30
37
|
truncationFailures: normalizeBoolean(options['truncation-failures']),
|
|
38
|
+
promptTokens: toNumber(options['prompt-tokens'] || options['context-tokens'] || options.context),
|
|
39
|
+
outputTokens: toNumber(options['output-tokens'] || options.output),
|
|
40
|
+
ttftMs: toNumber(options['ttft-ms'] || options['time-to-first-token-ms'] || options.ttft),
|
|
41
|
+
tokensPerSecond: toNumber(options['tokens-per-second'] || options.tps),
|
|
42
|
+
kvCache: normalizeOptionalBoolean(options['kv-cache'] ?? options.kvcache),
|
|
43
|
+
kvCacheHitRate: toNumber(options['kv-cache-hit-rate'] || options['kv-hit-rate']),
|
|
44
|
+
quantized: normalizeBoolean(options.quantized || options.quantization),
|
|
45
|
+
qualityDelta: toNumber(options['quality-delta'] || options['quantized-quality-delta']),
|
|
46
|
+
prefillBudgetMs: toNumber(options['prefill-budget-ms'] || options['ttft-budget-ms']),
|
|
47
|
+
decodeBudgetTps: toNumber(options['decode-budget-tps'] || options['tokens-per-second-budget']),
|
|
31
48
|
};
|
|
32
49
|
}
|
|
33
50
|
|
|
@@ -89,6 +106,49 @@ function buildSignals(options) {
|
|
|
89
106
|
risk: 'failed rollouts may reflect verifier noise or truncation rather than bad reasoning',
|
|
90
107
|
});
|
|
91
108
|
}
|
|
109
|
+
if (options.promptTokens !== null || options.ttftMs !== null || options.prefillBudgetMs !== null) {
|
|
110
|
+
const overBudget = options.ttftMs !== null &&
|
|
111
|
+
options.prefillBudgetMs !== null &&
|
|
112
|
+
options.ttftMs > options.prefillBudgetMs;
|
|
113
|
+
signals.push({
|
|
114
|
+
id: 'prefill_decode_split',
|
|
115
|
+
label: 'Inference prefill/decode budget',
|
|
116
|
+
values: [
|
|
117
|
+
options.promptTokens !== null ? `${options.promptTokens} prompt tokens` : null,
|
|
118
|
+
options.ttftMs !== null ? `${options.ttftMs}ms TTFT` : null,
|
|
119
|
+
options.prefillBudgetMs !== null ? `${options.prefillBudgetMs}ms prefill budget` : null,
|
|
120
|
+
options.tokensPerSecond !== null ? `${options.tokensPerSecond} tokens/sec decode` : null,
|
|
121
|
+
options.decodeBudgetTps !== null ? `${options.decodeBudgetTps} tokens/sec decode budget` : null,
|
|
122
|
+
overBudget ? 'TTFT over budget' : null,
|
|
123
|
+
].filter(Boolean),
|
|
124
|
+
risk: 'long prompts raise prefill cost while slow decode can make cheap models miss user-facing latency budgets',
|
|
125
|
+
});
|
|
126
|
+
}
|
|
127
|
+
if ((options.promptTokens !== null && options.promptTokens >= 32000) || options.kvCacheHitRate !== null || options.kvCache === false) {
|
|
128
|
+
const lowHitRate = options.kvCacheHitRate !== null && options.kvCacheHitRate < 0.8;
|
|
129
|
+
signals.push({
|
|
130
|
+
id: 'kv_cache_policy',
|
|
131
|
+
label: 'KV cache policy',
|
|
132
|
+
values: [
|
|
133
|
+
options.kvCache === true ? 'KV cache enabled' : null,
|
|
134
|
+
options.kvCache === false ? 'KV cache missing' : null,
|
|
135
|
+
options.kvCacheHitRate !== null ? `${options.kvCacheHitRate} hit rate` : null,
|
|
136
|
+
lowHitRate ? 'low cache hit rate' : null,
|
|
137
|
+
].filter(Boolean),
|
|
138
|
+
risk: 'uncached long-context workloads can make repeated agent calls materially more expensive and slower',
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
if (options.quantized || options.qualityDelta !== null) {
|
|
142
|
+
signals.push({
|
|
143
|
+
id: 'quantization_rollout',
|
|
144
|
+
label: 'Quantized runtime rollout',
|
|
145
|
+
values: [
|
|
146
|
+
options.quantized ? 'quantized runtime' : null,
|
|
147
|
+
options.qualityDelta !== null ? `${options.qualityDelta} quality delta` : 'missing quality delta',
|
|
148
|
+
].filter(Boolean),
|
|
149
|
+
risk: 'quantization can cut inference cost only if quality and latency are measured before production routing',
|
|
150
|
+
});
|
|
151
|
+
}
|
|
92
152
|
return signals;
|
|
93
153
|
}
|
|
94
154
|
|
|
@@ -114,6 +174,16 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
|
|
|
114
174
|
baselineAccuracy: options.baselineAccuracy,
|
|
115
175
|
compressedAccuracy: options.compressedAccuracy,
|
|
116
176
|
accuracyDelta: accuracyDelta(options),
|
|
177
|
+
promptTokens: options.promptTokens,
|
|
178
|
+
outputTokens: options.outputTokens,
|
|
179
|
+
ttftMs: options.ttftMs,
|
|
180
|
+
tokensPerSecond: options.tokensPerSecond,
|
|
181
|
+
kvCache: options.kvCache,
|
|
182
|
+
kvCacheHitRate: options.kvCacheHitRate,
|
|
183
|
+
quantized: options.quantized,
|
|
184
|
+
qualityDelta: options.qualityDelta,
|
|
185
|
+
prefillBudgetMs: options.prefillBudgetMs,
|
|
186
|
+
decodeBudgetTps: options.decodeBudgetTps,
|
|
117
187
|
},
|
|
118
188
|
summary: {
|
|
119
189
|
signalCount: signals.length,
|
|
@@ -127,8 +197,10 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
|
|
|
127
197
|
'Inspect low-confidence steps even when the final rollout is correct.',
|
|
128
198
|
'Inspect high-confidence failed rollouts for truncation or verifier noise before penalizing the trace.',
|
|
129
199
|
'Route cheaper compressed reasoning only after accuracy and efficiency both clear the gate.',
|
|
200
|
+
'Track TTFT, decode throughput, KV-cache hit rate, and prompt-token count separately for every agent runtime.',
|
|
201
|
+
'Route quantized runtimes only after latency savings clear quality and verifier baselines.',
|
|
130
202
|
],
|
|
131
|
-
exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --
|
|
203
|
+
exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --prompt-tokens=120000 --ttft-ms=1800 --prefill-budget-ms=800 --kv-cache=false --quantized --quality-delta=-0.03 --json',
|
|
132
204
|
};
|
|
133
205
|
}
|
|
134
206
|
|