thumbgate 1.16.20 → 1.16.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/.well-known/mcp/server-card.json +1 -1
- package/README.md +3 -2
- package/adapters/claude/.mcp.json +2 -2
- package/adapters/mcp/server-stdio.js +1 -1
- package/adapters/opencode/opencode.json +1 -1
- package/bench/programbench-smoke.json +71 -0
- package/bench/thumbgate-bench.json +131 -0
- package/bin/cli.js +64 -1
- package/package.json +16 -8
- package/public/dashboard.html +1 -1
- package/public/guide.html +5 -3
- package/public/index.html +43 -31
- package/public/lessons.html +1 -1
- package/public/numbers.html +11 -11
- package/public/pro.html +31 -88
- package/scripts/billing.js +3 -3
- package/scripts/harness-selector.js +188 -0
- package/scripts/rag-precision-guardrails.js +63 -1
- package/scripts/rate-limiter.js +1 -1
- package/scripts/reasoning-efficiency-guardrails.js +73 -1
- package/scripts/thumbgate-bench.js +707 -0
- package/src/api/server.js +66 -13
|
@@ -253,6 +253,190 @@ function buildHarnessOptimizationAudit(options = {}) {
|
|
|
253
253
|
return scoreHarnessAudit(inputs, options);
|
|
254
254
|
}
|
|
255
255
|
|
|
256
|
+
/**
 * Coerce a loosely typed flag value into a strict boolean.
 *
 * Real booleans pass through unchanged and `undefined`/`null` count as false.
 * Anything else is stringified, trimmed, and accepted only when it matches
 * 1/true/yes/on (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @returns {boolean} The normalized flag.
 */
function normalizeBoolean(value) {
  if (typeof value === 'boolean') return value;
  if (value === undefined || value === null) return false;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
|
|
261
|
+
|
|
262
|
+
/**
 * Coerce an optional flag into a boolean, substituting `fallback` when the
 * flag was not supplied.
 *
 * "Not supplied" means `undefined`, `null`, or the empty string. Real booleans
 * pass through; every other value is stringified, trimmed, and accepted only
 * when it matches 1/true/yes/on (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @param {*} [fallback=true] - Value returned when `value` is missing.
 * @returns {*} The normalized boolean, or `fallback` for a missing value.
 */
function normalizeOptionalBoolean(value, fallback = true) {
  const isMissing = value === undefined || value === null || value === '';
  if (isMissing) return fallback;
  if (typeof value === 'boolean') return value;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
|
|
268
|
+
|
|
269
|
+
/**
 * Parse an optional numeric option.
 *
 * Missing input (`undefined`, `null`, empty or whitespace-only string) and
 * booleans yield `null`, as does anything that does not convert to a finite
 * number.
 *
 * @param {*} value - Raw option value (number, numeric string, ...).
 * @returns {number|null} The finite number, or `null` when there is no usable value.
 */
function toNumber(value) {
  // Number(true) is 1 and Number(false) is 0; a boolean here is a flag given
  // without a value, not a measurement, so treat it as missing.
  if (value === undefined || value === null || typeof value === 'boolean') return null;
  // Number('   ') is 0, so blank strings must be rejected before conversion,
  // the same way the empty string is.
  const candidate = typeof value === 'string' ? value.trim() : value;
  if (candidate === '') return null;
  const num = Number(candidate);
  return Number.isFinite(num) ? num : null;
}
|
|
274
|
+
|
|
275
|
+
/**
 * Build the "model-harness fit" audit report from CLI-style options.
 *
 * Scoring starts at 100 and deducts:
 *   - 15 when the same model is being run across different harnesses,
 *   - 12 for every parity control explicitly reported as false,
 *   - the handoff drift rounded up, capped at 20, when drift is positive.
 * The clamped score maps to `portable` (>= 85), `watch` (>= 65) or
 * `native-required`.
 *
 * @param {object} [options={}] - Raw options. Reads `native-harness`/`native`,
 *   `generic-harness`/`generic`, `same-model-different-harness`/`same-model`/
 *   `crossHarness`, the five `*-parity` / `state-isolation` flags, and
 *   `handoff-drift`/`handoff-drift-percent`.
 * @returns {object} Report with `name`, `status`, `score`, harness labels,
 *   `controls`, `metrics`, `signals`, and static `recommendations`.
 */
function buildHarnessFitAudit(options = {}) {
  const nativeHarness = String(options['native-harness'] || options.native || 'native').trim() || 'native';
  const genericHarness = String(options['generic-harness'] || options.generic || 'generic').trim() || 'generic';
  const sameModelDifferentHarness = normalizeBoolean(options['same-model-different-harness'] || options['same-model'] || options.crossHarness);
  // Each control is normalized without an explicit fallback, so the helper's
  // default applies to flags the caller did not supply.
  const controls = {
    toolSchemaParity: normalizeOptionalBoolean(options['tool-schema-parity']),
    permissionParity: normalizeOptionalBoolean(options['permission-parity']),
    stateIsolation: normalizeOptionalBoolean(options['state-isolation']),
    patchLoopParity: normalizeOptionalBoolean(options['patch-loop-parity']),
    verificationParity: normalizeOptionalBoolean(options['verification-parity']),
  };
  // A numeric 0 is a real measurement ("no drift"). Plain `||` would discard it
  // and fall through to the alias, so keep it explicitly; every other falsy
  // value still defers to `handoff-drift-percent`.
  const primaryDrift = options['handoff-drift'];
  const handoffDrift = toNumber(primaryDrift === 0 ? primaryDrift : primaryDrift || options['handoff-drift-percent']);
  // Only controls that are strictly false count as parity gaps.
  const gaps = Object.entries(controls)
    .filter(([, value]) => value === false)
    .map(([key]) => key);

  let score = 100;
  if (sameModelDifferentHarness) score -= 15;
  score -= gaps.length * 12;
  if (handoffDrift !== null && handoffDrift > 0) score -= Math.min(20, Math.ceil(handoffDrift));

  const signals = [];
  if (sameModelDifferentHarness || gaps.length > 0) {
    signals.push({
      id: 'model_harness_fit',
      label: 'Same model, different harness',
      values: [
        `${nativeHarness} vs ${genericHarness}`,
        sameModelDifferentHarness ? 'same model run across harnesses' : null,
        ...gaps.map((gap) => `${gap} gap`),
      ].filter(Boolean),
      risk: 'model quality can change when tool schemas, permissions, state, patch loops, or verification differ by harness',
    });
  }
  if (handoffDrift !== null && handoffDrift > 0) {
    signals.push({
      id: 'handoff_drift',
      label: 'Cross-harness handoff drift',
      values: [`${handoffDrift}% drift`],
      risk: 'handoffs between generic and native harnesses can lose task state or weaken verification',
    });
  }

  // Many gaps can push the raw score below zero; clamp to the 0..100 range.
  const normalizedScore = Math.max(0, Math.min(100, score));
  return {
    name: 'thumbgate-model-harness-fit-audit',
    status: normalizedScore >= 85 ? 'portable' : normalizedScore >= 65 ? 'watch' : 'native-required',
    score: normalizedScore,
    nativeHarness,
    genericHarness,
    controls,
    metrics: { sameModelDifferentHarness, handoffDrift },
    signals,
    recommendations: [
      'Benchmark the same task, same model, and same repository in native and generic harnesses before standardizing.',
      'Require parity proof for tool schemas, permissions, state isolation, patch application, and verification loops.',
      'Use the native harness for production edits when parity gaps remain; reserve generic harnesses for exploration and read-only analysis.',
    ],
  };
}
|
|
335
|
+
|
|
336
|
+
/**
 * Render a harness-fit audit report as plain text for terminal output.
 *
 * Layout: blank line, title, rule, status/score/harness/signal-count summary,
 * an optional risk section (only when the report has signals), then the
 * recommendations. The result ends with two newlines.
 *
 * @param {object} report - Reads `status`, `score`, `nativeHarness`,
 *   `genericHarness`, `signals` and `recommendations`.
 * @returns {string} The formatted multi-line report.
 */
function formatHarnessFitAudit(report) {
  const { signals, recommendations } = report;

  const summary = [
    '',
    'ThumbGate Model-Harness Fit Audit',
    '-'.repeat(37),
    `Status : ${report.status}`,
    `Score : ${report.score}/100`,
    `Harness: ${report.nativeHarness} vs ${report.genericHarness}`,
    `Signals: ${signals.length}`,
  ];

  // The risk section is omitted entirely when nothing was detected.
  const risks = signals.length > 0
    ? [
      '',
      'Detected harness-fit risks:',
      ...signals.flatMap((signal) => [
        ` - ${signal.label}: ${signal.values.join(', ')}`,
        ` Risk: ${signal.risk}`,
      ]),
    ]
    : [];

  const advice = ['', 'Recommendations:', ...recommendations.map((recommendation) => ` - ${recommendation}`)];

  return `${[...summary, ...risks, ...advice].join('\n')}\n\n`;
}
|
|
357
|
+
|
|
358
|
+
/**
 * Build the governance report for a solver-backed agent workflow.
 *
 * Scoring starts at 100 and deducts:
 *   - 8 when the workflow is multi-agent,
 *   - 13 for every governance control explicitly reported as false,
 *   - 10 when the solver's input data is more than 24 hours old.
 * The clamped score maps to `ready` (>= 85), `approval-required` (>= 65) or
 * `blocked`.
 *
 * @param {object} [options={}] - Raw options. Reads `solver`/`solver-engine`,
 *   `multi-agent`/`multiAgent`/`agentic`, the six governance control flags,
 *   and `data-freshness-hours`/`freshness-hours`.
 * @returns {object} Report with `name`, `status`, `score`, `solver`,
 *   `controls`, `metrics`, `signals`, and static `recommendations`.
 */
function buildSolverWorkflowGovernance(options = {}) {
  const solver = String(options.solver || options['solver-engine'] || 'solver').trim() || 'solver';
  const multiAgent = normalizeBoolean(options['multi-agent'] || options.multiAgent || options.agentic);
  // Each control is normalized without an explicit fallback, so the helper's
  // default applies to flags the caller did not supply.
  const controls = {
    objectiveDefined: normalizeOptionalBoolean(options['objective-defined']),
    constraintsDefined: normalizeOptionalBoolean(options['constraints-defined']),
    scenarioReplay: normalizeOptionalBoolean(options['scenario-replay']),
    approvalGate: normalizeOptionalBoolean(options['approval-gate']),
    rollbackPlan: normalizeOptionalBoolean(options['rollback-plan']),
    solverProvenance: normalizeOptionalBoolean(options['solver-provenance']),
  };
  // A numeric 0 means "data is current". Plain `||` would discard it and let
  // the `freshness-hours` alias override it (possibly with a stale value), so
  // keep it explicitly; every other falsy value still defers to the alias.
  const primaryFreshness = options['data-freshness-hours'];
  const dataFreshnessHours = toNumber(primaryFreshness === 0 ? primaryFreshness : primaryFreshness || options['freshness-hours']);
  // Only controls that are strictly false count as governance gaps.
  const gaps = Object.entries(controls)
    .filter(([, value]) => value === false)
    .map(([key]) => key);

  let score = 100;
  if (multiAgent) score -= 8;
  score -= gaps.length * 13;
  if (dataFreshnessHours !== null && dataFreshnessHours > 24) score -= 10;

  const signals = [];
  if (multiAgent || gaps.length > 0) {
    signals.push({
      id: 'solver_workflow_governance',
      label: 'Solver-backed agent workflow',
      values: [
        solver,
        multiAgent ? 'multi-agent orchestration' : null,
        ...gaps.map((gap) => `${gap} gap`),
      ].filter(Boolean),
      risk: 'natural-language-to-optimization workflows need objective, constraint, replay, approval, rollback, and provenance gates',
    });
  }
  if (dataFreshnessHours !== null && dataFreshnessHours > 24) {
    signals.push({
      id: 'solver_data_freshness',
      label: 'Solver data freshness',
      values: [`${dataFreshnessHours}h old`],
      risk: 'optimization results can look mathematically valid while using stale operational data',
    });
  }

  // Many gaps can push the raw score below zero; clamp to the 0..100 range.
  const normalizedScore = Math.max(0, Math.min(100, score));
  return {
    name: 'thumbgate-solver-workflow-governance',
    status: normalizedScore >= 85 ? 'ready' : normalizedScore >= 65 ? 'approval-required' : 'blocked',
    score: normalizedScore,
    solver,
    controls,
    metrics: { multiAgent, dataFreshnessHours },
    signals,
    recommendations: [
      'Capture the objective function, hard constraints, soft constraints, and data freshness before invoking the solver.',
      'Replay at least one baseline scenario and one counterfactual before approving optimized actions.',
      'Require human approval and rollback evidence before solver output changes supply chain, routing, scheduling, or pricing decisions.',
    ],
  };
}
|
|
417
|
+
|
|
418
|
+
/**
 * Render a solver workflow governance report as plain text for terminal output.
 *
 * Layout: blank line, title, rule, status/score/solver/signal-count summary,
 * an optional risk section (only when the report has signals), then the
 * recommendations. The result ends with two newlines.
 *
 * @param {object} report - Reads `status`, `score`, `solver`, `signals` and
 *   `recommendations`.
 * @returns {string} The formatted multi-line report.
 */
function formatSolverWorkflowGovernance(report) {
  const output = [];
  const emit = (...entries) => {
    output.push(...entries);
  };

  emit(
    '',
    'ThumbGate Solver Workflow Governance',
    '-'.repeat(38),
    `Status: ${report.status}`,
    `Score : ${report.score}/100`,
    `Solver: ${report.solver}`,
    `Signals: ${report.signals.length}`,
  );

  // The risk section is omitted entirely when nothing was detected.
  if (report.signals.length > 0) {
    emit('', 'Detected solver workflow risks:');
    report.signals.forEach((signal) => {
      emit(` - ${signal.label}: ${signal.values.join(', ')}`, ` Risk: ${signal.risk}`);
    });
  }

  emit('', 'Recommendations:');
  report.recommendations.forEach((recommendation) => {
    emit(` - ${recommendation}`);
  });

  return `${output.join('\n')}\n\n`;
}
|
|
439
|
+
|
|
256
440
|
// ---------------------------------------------------------------------------
|
|
257
441
|
// Internal helpers
|
|
258
442
|
// ---------------------------------------------------------------------------
|
|
@@ -284,6 +468,10 @@ module.exports = {
|
|
|
284
468
|
collectDefaultHarnessAuditInputs,
|
|
285
469
|
scoreHarnessAudit,
|
|
286
470
|
buildHarnessOptimizationAudit,
|
|
471
|
+
buildHarnessFitAudit,
|
|
472
|
+
formatHarnessFitAudit,
|
|
473
|
+
buildSolverWorkflowGovernance,
|
|
474
|
+
formatSolverWorkflowGovernance,
|
|
287
475
|
extractCommandText,
|
|
288
476
|
HARNESSES,
|
|
289
477
|
DEPLOY_PATTERNS,
|
|
@@ -34,6 +34,14 @@ function normalizeOptions(options = {}) {
|
|
|
34
34
|
embeddingFineTune: normalizeBoolean(options['embedding-finetune'] || options['embedding-fine-tune'] || options['fine-tune']),
|
|
35
35
|
structuralNearMisses: normalizeBoolean(options['structural-near-misses'] || options['near-misses']),
|
|
36
36
|
verifier: normalizeBoolean(options.verifier || options.reranker || options['second-stage']),
|
|
37
|
+
hybridRetrieval: normalizeBoolean(options['hybrid-retrieval'] || options.hybrid),
|
|
38
|
+
denseRetrieval: normalizeBoolean(options.dense || options['dense-retrieval'] || options.embeddings),
|
|
39
|
+
sparseRetrieval: normalizeBoolean(options.sparse || options['sparse-retrieval'] || options.keyword),
|
|
40
|
+
reranker: normalizeBoolean(options.reranker || options.rerank),
|
|
41
|
+
sourceGrounding: normalizeBoolean(options['source-grounding'] || options.grounding || options.citations),
|
|
42
|
+
aclFilter: normalizeBoolean(options['acl-filter'] || options.acl || options['access-control']),
|
|
43
|
+
freshnessWindowHours: toNumber(options['freshness-window-hours'] || options['freshness-hours']),
|
|
44
|
+
scaleCorpusDocuments: toNumber(options['scale-corpus-documents'] || options['corpus-documents'] || options.documents),
|
|
37
45
|
latencyMs: toNumber(options['latency-ms'] || options.latency),
|
|
38
46
|
latencyBudgetMs: toNumber(options['latency-budget-ms'] || options['latency-budget']),
|
|
39
47
|
agenticPipeline: normalizeBoolean(options.agentic || options['agentic-pipeline']),
|
|
@@ -69,6 +77,8 @@ function buildSignals(options) {
|
|
|
69
77
|
precisionTuningSignal(options, drop),
|
|
70
78
|
ragCascadeSignal(options),
|
|
71
79
|
verifierLatencySignal(options),
|
|
80
|
+
hybridScaleSignal(options),
|
|
81
|
+
retrievalGovernanceSignal(options),
|
|
72
82
|
].filter(Boolean);
|
|
73
83
|
}
|
|
74
84
|
|
|
@@ -113,6 +123,48 @@ function verifierLatencySignal(options) {
|
|
|
113
123
|
};
|
|
114
124
|
}
|
|
115
125
|
|
|
126
|
+
/**
 * Produce the "hybrid retrieval scale wall" signal.
 *
 * The signal fires when hybrid retrieval is intended (flagged directly, or
 * both dense and sparse retrieval are set) or when the corpus holds at least
 * 100000 documents. Its values list what is in place first, then every control
 * that is missing.
 *
 * @param {object} options - Normalized options; reads `hybridRetrieval`,
 *   `denseRetrieval`, `sparseRetrieval`, `reranker`, `sourceGrounding`,
 *   `aclFilter` and `scaleCorpusDocuments` (compared against `null`).
 * @returns {object|null} The signal, or `null` when neither trigger applies.
 */
function hybridScaleSignal(options) {
  const {
    hybridRetrieval,
    denseRetrieval,
    sparseRetrieval,
    reranker,
    sourceGrounding,
    aclFilter,
    scaleCorpusDocuments,
  } = options;

  const hasCorpusSize = scaleCorpusDocuments !== null;
  const wantsHybrid = hybridRetrieval || (denseRetrieval && sparseRetrieval);
  const isLargeCorpus = hasCorpusSize && scaleCorpusDocuments >= 100000;
  if (!wantsHybrid && !isLargeCorpus) return null;

  // What is configured comes first, in a fixed order...
  const values = [];
  if (hybridRetrieval) values.push('hybrid retrieval');
  if (denseRetrieval) values.push('dense retrieval');
  if (sparseRetrieval) values.push('sparse retrieval');
  if (hasCorpusSize) values.push(`${scaleCorpusDocuments} documents`);

  // ...followed by one entry per control that is not in place.
  if (!denseRetrieval) values.push('dense recall unmeasured');
  if (!sparseRetrieval) values.push('sparse recall unmeasured');
  if (!reranker) values.push('missing reranker');
  if (!sourceGrounding) values.push('missing source grounding');
  if (!aclFilter) values.push('missing ACL filter');

  return {
    id: 'hybrid_retrieval_scale_wall',
    label: 'Hybrid retrieval scale wall',
    values,
    risk: 'scaled RAG needs dense, sparse, reranking, grounding, and access-control evidence instead of vector-only correctness',
  };
}
|
|
150
|
+
|
|
151
|
+
/**
 * Produce the "retrieval governance gap" signal.
 *
 * Returns `null` when the pipeline is not agentic and both source grounding
 * and the ACL filter are enforced, or when none of the triggers (agentic
 * pipeline, hybrid retrieval, a known corpus size) applies. Otherwise it
 * collects the controls that are actually missing: source grounding, ACL
 * filtering, and a freshness window. A configured freshness window is not a
 * gap; it is appended to the values as context only when a real gap exists.
 *
 * @param {object} options - Normalized options; reads `agenticPipeline`,
 *   `hybridRetrieval`, `scaleCorpusDocuments`, `sourceGrounding`, `aclFilter`
 *   and `freshnessWindowHours` (the last two numeric fields are compared
 *   against `null`).
 * @returns {object|null} The signal, or `null` when there is nothing to report.
 */
function retrievalGovernanceSignal(options) {
  if (!options.agenticPipeline && options.sourceGrounding && options.aclFilter) return null;
  if (!(options.agenticPipeline || options.hybridRetrieval || options.scaleCorpusDocuments !== null)) return null;

  const freshnessMissing = options.freshnessWindowHours === null;
  const gaps = [
    options.sourceGrounding ? null : 'source evidence not enforced',
    options.aclFilter ? null : 'access control not enforced',
    freshnessMissing ? 'freshness window missing' : null,
  ].filter(Boolean);
  // Previously the freshness entry was always a non-empty string, which made
  // this guard unreachable and flagged fully governed pipelines as having a gap.
  if (gaps.length === 0) return null;

  return {
    id: 'retrieval_governance_gap',
    label: 'Retrieval governance gap',
    // Keep the configured window visible next to the real gaps, in the same
    // trailing position it has always had.
    values: freshnessMissing ? gaps : [...gaps, `${options.freshnessWindowHours}h freshness window`],
    risk: 'agentic retrieval output can leak stale or unauthorized context into downstream actions',
  };
}
|
|
167
|
+
|
|
116
168
|
function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
|
|
117
169
|
const options = normalizeOptions(rawOptions);
|
|
118
170
|
const templates = listGateTemplates(templatesPath)
|
|
@@ -135,6 +187,14 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
|
|
|
135
187
|
recallDropPercent: recallDropPercent(options),
|
|
136
188
|
baselinePrecision: options.baselinePrecision,
|
|
137
189
|
newPrecision: options.newPrecision,
|
|
190
|
+
hybridRetrieval: options.hybridRetrieval,
|
|
191
|
+
denseRetrieval: options.denseRetrieval,
|
|
192
|
+
sparseRetrieval: options.sparseRetrieval,
|
|
193
|
+
reranker: options.reranker,
|
|
194
|
+
sourceGrounding: options.sourceGrounding,
|
|
195
|
+
aclFilter: options.aclFilter,
|
|
196
|
+
freshnessWindowHours: options.freshnessWindowHours,
|
|
197
|
+
scaleCorpusDocuments: options.scaleCorpusDocuments,
|
|
138
198
|
latencyMs: options.latencyMs,
|
|
139
199
|
latencyBudgetMs: options.latencyBudgetMs,
|
|
140
200
|
},
|
|
@@ -150,8 +210,10 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
|
|
|
150
210
|
'Block embedding or threshold changes when recall drops without an approved rollback plan.',
|
|
151
211
|
'Use a second-stage verifier or reranker for structural near misses such as negation and role reversal.',
|
|
152
212
|
'Attach verifier latency budgets before routing the retrieval output into autonomous agent actions.',
|
|
213
|
+
'Measure dense recall, sparse recall, reranked relevance, source grounding, ACL filtering, and freshness as separate production gates.',
|
|
214
|
+
'Treat the retrieval layer as the agent ground truth: every autonomous action should carry source evidence and access-control proof.',
|
|
153
215
|
],
|
|
154
|
-
exampleCommand: 'npx thumbgate rag-precision-guardrails --
|
|
216
|
+
exampleCommand: 'npx thumbgate rag-precision-guardrails --hybrid-retrieval --dense --sparse --scale-corpus-documents=1000000 --agentic --json',
|
|
155
217
|
};
|
|
156
218
|
}
|
|
157
219
|
|
package/scripts/rate-limiter.js
CHANGED
|
@@ -36,7 +36,7 @@ const PAYWALL_MESSAGES = {
|
|
|
36
36
|
prevention_rules: 'Free tier includes 1 prevention rule. Your agents need more protection — upgrade to Pro for unlimited rules.',
|
|
37
37
|
recall: 'Recall is a Pro feature. Your past feedback is stored locally — upgrade to search and reuse it.',
|
|
38
38
|
search_lessons: 'Lesson search is a Pro feature. Upgrade to find patterns in your agent\'s mistakes.',
|
|
39
|
-
default: 'This feature requires Pro. Start
|
|
39
|
+
default: 'This feature requires Pro. Start Pro — card required; billed today.',
|
|
40
40
|
};
|
|
41
41
|
|
|
42
42
|
function isProTier(authContext) {
|
|
@@ -11,6 +11,13 @@ function normalizeBoolean(value) {
|
|
|
11
11
|
return /^(1|true|yes|on)$/i.test(String(value).trim());
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
+
/**
 * Coerce an optional flag into a tri-state value: `true`, `false`, or `null`
 * when the flag was not supplied.
 *
 * "Not supplied" means `undefined`, `null`, or the empty string. Real booleans
 * pass through; every other value is stringified, trimmed, and accepted only
 * when it matches 1/true/yes/on (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @returns {boolean|null} The normalized flag, or `null` for a missing value.
 */
function normalizeOptionalBoolean(value) {
  const isMissing = value === undefined || value === null || value === '';
  if (isMissing) return null;
  if (typeof value === 'boolean') return value;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
|
|
20
|
+
|
|
14
21
|
function toNumber(value) {
|
|
15
22
|
if (value === undefined || value === null || value === '') return null;
|
|
16
23
|
const num = Number(value);
|
|
@@ -28,6 +35,16 @@ function normalizeOptions(options = {}) {
|
|
|
28
35
|
lowConfidenceSteps: toNumber(options['low-confidence-steps']),
|
|
29
36
|
highConfidenceFailures: toNumber(options['high-confidence-failures']),
|
|
30
37
|
truncationFailures: normalizeBoolean(options['truncation-failures']),
|
|
38
|
+
promptTokens: toNumber(options['prompt-tokens'] || options['context-tokens'] || options.context),
|
|
39
|
+
outputTokens: toNumber(options['output-tokens'] || options.output),
|
|
40
|
+
ttftMs: toNumber(options['ttft-ms'] || options['time-to-first-token-ms'] || options.ttft),
|
|
41
|
+
tokensPerSecond: toNumber(options['tokens-per-second'] || options.tps),
|
|
42
|
+
kvCache: normalizeOptionalBoolean(options['kv-cache'] ?? options.kvcache),
|
|
43
|
+
kvCacheHitRate: toNumber(options['kv-cache-hit-rate'] || options['kv-hit-rate']),
|
|
44
|
+
quantized: normalizeBoolean(options.quantized || options.quantization),
|
|
45
|
+
qualityDelta: toNumber(options['quality-delta'] || options['quantized-quality-delta']),
|
|
46
|
+
prefillBudgetMs: toNumber(options['prefill-budget-ms'] || options['ttft-budget-ms']),
|
|
47
|
+
decodeBudgetTps: toNumber(options['decode-budget-tps'] || options['tokens-per-second-budget']),
|
|
31
48
|
};
|
|
32
49
|
}
|
|
33
50
|
|
|
@@ -89,6 +106,49 @@ function buildSignals(options) {
|
|
|
89
106
|
risk: 'failed rollouts may reflect verifier noise or truncation rather than bad reasoning',
|
|
90
107
|
});
|
|
91
108
|
}
|
|
109
|
+
if (options.promptTokens !== null || options.ttftMs !== null || options.prefillBudgetMs !== null) {
|
|
110
|
+
const overBudget = options.ttftMs !== null &&
|
|
111
|
+
options.prefillBudgetMs !== null &&
|
|
112
|
+
options.ttftMs > options.prefillBudgetMs;
|
|
113
|
+
signals.push({
|
|
114
|
+
id: 'prefill_decode_split',
|
|
115
|
+
label: 'Inference prefill/decode budget',
|
|
116
|
+
values: [
|
|
117
|
+
options.promptTokens !== null ? `${options.promptTokens} prompt tokens` : null,
|
|
118
|
+
options.ttftMs !== null ? `${options.ttftMs}ms TTFT` : null,
|
|
119
|
+
options.prefillBudgetMs !== null ? `${options.prefillBudgetMs}ms prefill budget` : null,
|
|
120
|
+
options.tokensPerSecond !== null ? `${options.tokensPerSecond} tokens/sec decode` : null,
|
|
121
|
+
options.decodeBudgetTps !== null ? `${options.decodeBudgetTps} tokens/sec decode budget` : null,
|
|
122
|
+
overBudget ? 'TTFT over budget' : null,
|
|
123
|
+
].filter(Boolean),
|
|
124
|
+
risk: 'long prompts raise prefill cost while slow decode can make cheap models miss user-facing latency budgets',
|
|
125
|
+
});
|
|
126
|
+
}
|
|
127
|
+
if ((options.promptTokens !== null && options.promptTokens >= 32000) || options.kvCacheHitRate !== null || options.kvCache === false) {
|
|
128
|
+
const lowHitRate = options.kvCacheHitRate !== null && options.kvCacheHitRate < 0.8;
|
|
129
|
+
signals.push({
|
|
130
|
+
id: 'kv_cache_policy',
|
|
131
|
+
label: 'KV cache policy',
|
|
132
|
+
values: [
|
|
133
|
+
options.kvCache === true ? 'KV cache enabled' : null,
|
|
134
|
+
options.kvCache === false ? 'KV cache missing' : null,
|
|
135
|
+
options.kvCacheHitRate !== null ? `${options.kvCacheHitRate} hit rate` : null,
|
|
136
|
+
lowHitRate ? 'low cache hit rate' : null,
|
|
137
|
+
].filter(Boolean),
|
|
138
|
+
risk: 'uncached long-context workloads can make repeated agent calls materially more expensive and slower',
|
|
139
|
+
});
|
|
140
|
+
}
|
|
141
|
+
if (options.quantized || options.qualityDelta !== null) {
|
|
142
|
+
signals.push({
|
|
143
|
+
id: 'quantization_rollout',
|
|
144
|
+
label: 'Quantized runtime rollout',
|
|
145
|
+
values: [
|
|
146
|
+
options.quantized ? 'quantized runtime' : null,
|
|
147
|
+
options.qualityDelta !== null ? `${options.qualityDelta} quality delta` : 'missing quality delta',
|
|
148
|
+
].filter(Boolean),
|
|
149
|
+
risk: 'quantization can cut inference cost only if quality and latency are measured before production routing',
|
|
150
|
+
});
|
|
151
|
+
}
|
|
92
152
|
return signals;
|
|
93
153
|
}
|
|
94
154
|
|
|
@@ -114,6 +174,16 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
|
|
|
114
174
|
baselineAccuracy: options.baselineAccuracy,
|
|
115
175
|
compressedAccuracy: options.compressedAccuracy,
|
|
116
176
|
accuracyDelta: accuracyDelta(options),
|
|
177
|
+
promptTokens: options.promptTokens,
|
|
178
|
+
outputTokens: options.outputTokens,
|
|
179
|
+
ttftMs: options.ttftMs,
|
|
180
|
+
tokensPerSecond: options.tokensPerSecond,
|
|
181
|
+
kvCache: options.kvCache,
|
|
182
|
+
kvCacheHitRate: options.kvCacheHitRate,
|
|
183
|
+
quantized: options.quantized,
|
|
184
|
+
qualityDelta: options.qualityDelta,
|
|
185
|
+
prefillBudgetMs: options.prefillBudgetMs,
|
|
186
|
+
decodeBudgetTps: options.decodeBudgetTps,
|
|
117
187
|
},
|
|
118
188
|
summary: {
|
|
119
189
|
signalCount: signals.length,
|
|
@@ -127,8 +197,10 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
|
|
|
127
197
|
'Inspect low-confidence steps even when the final rollout is correct.',
|
|
128
198
|
'Inspect high-confidence failed rollouts for truncation or verifier noise before penalizing the trace.',
|
|
129
199
|
'Route cheaper compressed reasoning only after accuracy and efficiency both clear the gate.',
|
|
200
|
+
'Track TTFT, decode throughput, KV-cache hit rate, and prompt-token count separately for every agent runtime.',
|
|
201
|
+
'Route quantized runtimes only after latency savings clear quality and verifier baselines.',
|
|
130
202
|
],
|
|
131
|
-
exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --
|
|
203
|
+
exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --prompt-tokens=120000 --ttft-ms=1800 --prefill-budget-ms=800 --kv-cache=false --quantized --quality-delta=-0.03 --json',
|
|
132
204
|
};
|
|
133
205
|
}
|
|
134
206
|
|