thumbgate 1.16.20 → 1.16.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -253,6 +253,190 @@ function buildHarnessOptimizationAudit(options = {}) {
253
253
  return scoreHarnessAudit(inputs, options);
254
254
  }
255
255
 
256
/**
 * Coerces a loosely-typed flag value into a strict boolean.
 *
 * Real booleans pass through unchanged, `undefined`/`null` become `false`,
 * and anything else is stringified, trimmed, and accepted only when it reads
 * `1`, `true`, `yes`, or `on` (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @returns {boolean} The normalized flag.
 */
function normalizeBoolean(value) {
  if (typeof value === 'boolean') return value;
  if (value == null) return false;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
261
+
262
/**
 * Coerces an optional flag into a boolean, substituting `fallback` when the
 * flag was not supplied.
 *
 * "Not supplied" means `undefined`, `null`, or the empty string. Real booleans
 * pass through; every other value is stringified, trimmed, and accepted only
 * when it reads `1`, `true`, `yes`, or `on` (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @param {*} [fallback=true] - Value returned when the flag is absent.
 * @returns {*} The normalized boolean, or `fallback` when absent.
 */
function normalizeOptionalBoolean(value, fallback = true) {
  const isAbsent = value == null || value === '';
  if (isAbsent) return fallback;
  if (typeof value === 'boolean') return value;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
268
+
269
/**
 * Parses an optional numeric option.
 *
 * @param {*} value - Raw option value (number, numeric string, or absent).
 * @returns {number|null} The finite number, or `null` when the value is
 *   absent, blank, or not a finite number.
 */
function toNumber(value) {
  if (value === undefined || value === null) return null;
  // Number('   ') evaluates to 0, so a whitespace-only string must be treated
  // like the empty string; otherwise a blank flag is reported as a real 0.
  const candidate = typeof value === 'string' ? value.trim() : value;
  if (candidate === '') return null;
  const num = Number(candidate);
  return Number.isFinite(num) ? num : null;
}
274
+
275
/**
 * Builds the model-harness fit audit report.
 *
 * Starts from a score of 100 and subtracts 15 when the same model is run
 * across harnesses, 12 for every parity control that is explicitly false, and
 * the rounded-up handoff drift (capped at 20). The clamped score maps to a
 * status of `portable` (>= 85), `watch` (>= 65), or `native-required`.
 *
 * @param {object} [options={}] - Raw options keyed by CLI-style flag names.
 * @returns {object} Report with name, status, score, harness names, controls,
 *   metrics, signals, and recommendations.
 */
function buildHarnessFitAudit(options = {}) {
  const nativeHarness = String(options['native-harness'] || options.native || 'native').trim() || 'native';
  const genericHarness = String(options['generic-harness'] || options.generic || 'generic').trim() || 'generic';
  const sameModelDifferentHarness = normalizeBoolean(
    options['same-model-different-harness'] || options['same-model'] || options.crossHarness
  );

  // Each parity control defaults to satisfied; only an explicit false is a gap.
  const controlFlags = [
    ['toolSchemaParity', 'tool-schema-parity'],
    ['permissionParity', 'permission-parity'],
    ['stateIsolation', 'state-isolation'],
    ['patchLoopParity', 'patch-loop-parity'],
    ['verificationParity', 'verification-parity'],
  ];
  const controls = {};
  const gaps = [];
  for (const [controlName, flag] of controlFlags) {
    const satisfied = normalizeOptionalBoolean(options[flag]);
    controls[controlName] = satisfied;
    if (satisfied === false) gaps.push(controlName);
  }

  const handoffDrift = toNumber(options['handoff-drift'] || options['handoff-drift-percent']);
  const hasDrift = handoffDrift !== null && handoffDrift > 0;

  const crossHarnessPenalty = sameModelDifferentHarness ? 15 : 0;
  const gapPenalty = gaps.length * 12;
  const driftPenalty = hasDrift ? Math.min(20, Math.ceil(handoffDrift)) : 0;
  const rawScore = 100 - crossHarnessPenalty - gapPenalty - driftPenalty;
  const normalizedScore = Math.max(0, Math.min(100, rawScore));

  const signals = [];
  if (sameModelDifferentHarness || gaps.length > 0) {
    const values = [`${nativeHarness} vs ${genericHarness}`];
    if (sameModelDifferentHarness) values.push('same model run across harnesses');
    for (const gap of gaps) values.push(`${gap} gap`);
    signals.push({
      id: 'model_harness_fit',
      label: 'Same model, different harness',
      values,
      risk: 'model quality can change when tool schemas, permissions, state, patch loops, or verification differ by harness',
    });
  }
  if (hasDrift) {
    signals.push({
      id: 'handoff_drift',
      label: 'Cross-harness handoff drift',
      values: [`${handoffDrift}% drift`],
      risk: 'handoffs between generic and native harnesses can lose task state or weaken verification',
    });
  }

  let status = 'native-required';
  if (normalizedScore >= 85) {
    status = 'portable';
  } else if (normalizedScore >= 65) {
    status = 'watch';
  }

  return {
    name: 'thumbgate-model-harness-fit-audit',
    status,
    score: normalizedScore,
    nativeHarness,
    genericHarness,
    controls,
    metrics: { sameModelDifferentHarness, handoffDrift },
    signals,
    recommendations: [
      'Benchmark the same task, same model, and same repository in native and generic harnesses before standardizing.',
      'Require parity proof for tool schemas, permissions, state isolation, patch application, and verification loops.',
      'Use the native harness for production edits when parity gaps remain; reserve generic harnesses for exploration and read-only analysis.',
    ],
  };
}
335
+
336
/**
 * Renders a harness-fit audit report as plain text.
 *
 * Output is a header (status, score, harness pair, signal count), an optional
 * list of detected risks, and the recommendations, joined with newlines and
 * terminated by two trailing newlines.
 *
 * @param {object} report - Report with `status`, `score`, `nativeHarness`,
 *   `genericHarness`, `signals`, and `recommendations`.
 * @returns {string} The formatted report.
 */
function formatHarnessFitAudit(report) {
  const { status, score, nativeHarness, genericHarness, signals, recommendations } = report;

  const output = [
    '',
    'ThumbGate Model-Harness Fit Audit',
    '-'.repeat(37),
    `Status : ${status}`,
    `Score : ${score}/100`,
    `Harness: ${nativeHarness} vs ${genericHarness}`,
    `Signals: ${signals.length}`,
  ];

  if (signals.length > 0) {
    output.push('', 'Detected harness-fit risks:');
    for (const { label, values, risk } of signals) {
      output.push(` - ${label}: ${values.join(', ')}`, ` Risk: ${risk}`);
    }
  }

  output.push('', 'Recommendations:');
  for (const item of recommendations) {
    output.push(` - ${item}`);
  }

  const text = output.join('\n');
  return `${text}\n\n`;
}
357
+
358
/**
 * Builds the solver workflow governance report.
 *
 * Starts from a score of 100 and subtracts 8 for multi-agent orchestration,
 * 13 for every governance control that is explicitly false, and 10 when the
 * solver data is more than 24 hours old. The clamped score maps to a status
 * of `ready` (>= 85), `approval-required` (>= 65), or `blocked`.
 *
 * @param {object} [options={}] - Raw options keyed by CLI-style flag names.
 * @returns {object} Report with name, status, score, solver, controls,
 *   metrics, signals, and recommendations.
 */
function buildSolverWorkflowGovernance(options = {}) {
  const solver = String(options.solver || options['solver-engine'] || 'solver').trim() || 'solver';
  const multiAgent = normalizeBoolean(options['multi-agent'] || options.multiAgent || options.agentic);

  // Each governance control defaults to satisfied; only an explicit false is a gap.
  const controlFlags = [
    ['objectiveDefined', 'objective-defined'],
    ['constraintsDefined', 'constraints-defined'],
    ['scenarioReplay', 'scenario-replay'],
    ['approvalGate', 'approval-gate'],
    ['rollbackPlan', 'rollback-plan'],
    ['solverProvenance', 'solver-provenance'],
  ];
  const controls = {};
  const gaps = [];
  for (const [controlName, flag] of controlFlags) {
    const satisfied = normalizeOptionalBoolean(options[flag]);
    controls[controlName] = satisfied;
    if (satisfied === false) gaps.push(controlName);
  }

  const dataFreshnessHours = toNumber(options['data-freshness-hours'] || options['freshness-hours']);
  const staleData = dataFreshnessHours !== null && dataFreshnessHours > 24;

  const penalty = (multiAgent ? 8 : 0) + gaps.length * 13 + (staleData ? 10 : 0);
  const normalizedScore = Math.max(0, Math.min(100, 100 - penalty));

  const signals = [];
  if (multiAgent || gaps.length > 0) {
    const values = [solver];
    if (multiAgent) values.push('multi-agent orchestration');
    for (const gap of gaps) values.push(`${gap} gap`);
    signals.push({
      id: 'solver_workflow_governance',
      label: 'Solver-backed agent workflow',
      values,
      risk: 'natural-language-to-optimization workflows need objective, constraint, replay, approval, rollback, and provenance gates',
    });
  }
  if (staleData) {
    signals.push({
      id: 'solver_data_freshness',
      label: 'Solver data freshness',
      values: [`${dataFreshnessHours}h old`],
      risk: 'optimization results can look mathematically valid while using stale operational data',
    });
  }

  let status = 'blocked';
  if (normalizedScore >= 85) {
    status = 'ready';
  } else if (normalizedScore >= 65) {
    status = 'approval-required';
  }

  return {
    name: 'thumbgate-solver-workflow-governance',
    status,
    score: normalizedScore,
    solver,
    controls,
    metrics: { multiAgent, dataFreshnessHours },
    signals,
    recommendations: [
      'Capture the objective function, hard constraints, soft constraints, and data freshness before invoking the solver.',
      'Replay at least one baseline scenario and one counterfactual before approving optimized actions.',
      'Require human approval and rollback evidence before solver output changes supply chain, routing, scheduling, or pricing decisions.',
    ],
  };
}
417
+
418
/**
 * Renders a solver workflow governance report as plain text.
 *
 * Output is a header (status, score, solver, signal count), an optional list
 * of detected risks, and the recommendations, joined with newlines and
 * terminated by two trailing newlines.
 *
 * @param {object} report - Report with `status`, `score`, `solver`,
 *   `signals`, and `recommendations`.
 * @returns {string} The formatted report.
 */
function formatSolverWorkflowGovernance(report) {
  const { status, score, solver, signals, recommendations } = report;

  const output = [
    '',
    'ThumbGate Solver Workflow Governance',
    '-'.repeat(38),
    `Status: ${status}`,
    `Score : ${score}/100`,
    `Solver: ${solver}`,
    `Signals: ${signals.length}`,
  ];

  if (signals.length > 0) {
    output.push('', 'Detected solver workflow risks:');
    for (const { label, values, risk } of signals) {
      output.push(` - ${label}: ${values.join(', ')}`, ` Risk: ${risk}`);
    }
  }

  output.push('', 'Recommendations:');
  for (const item of recommendations) {
    output.push(` - ${item}`);
  }

  const text = output.join('\n');
  return `${text}\n\n`;
}
439
+
256
440
  // ---------------------------------------------------------------------------
257
441
  // Internal helpers
258
442
  // ---------------------------------------------------------------------------
@@ -284,6 +468,10 @@ module.exports = {
284
468
  collectDefaultHarnessAuditInputs,
285
469
  scoreHarnessAudit,
286
470
  buildHarnessOptimizationAudit,
471
+ buildHarnessFitAudit,
472
+ formatHarnessFitAudit,
473
+ buildSolverWorkflowGovernance,
474
+ formatSolverWorkflowGovernance,
287
475
  extractCommandText,
288
476
  HARNESSES,
289
477
  DEPLOY_PATTERNS,
@@ -34,6 +34,14 @@ function normalizeOptions(options = {}) {
34
34
  embeddingFineTune: normalizeBoolean(options['embedding-finetune'] || options['embedding-fine-tune'] || options['fine-tune']),
35
35
  structuralNearMisses: normalizeBoolean(options['structural-near-misses'] || options['near-misses']),
36
36
  verifier: normalizeBoolean(options.verifier || options.reranker || options['second-stage']),
37
+ hybridRetrieval: normalizeBoolean(options['hybrid-retrieval'] || options.hybrid),
38
+ denseRetrieval: normalizeBoolean(options.dense || options['dense-retrieval'] || options.embeddings),
39
+ sparseRetrieval: normalizeBoolean(options.sparse || options['sparse-retrieval'] || options.keyword),
40
+ reranker: normalizeBoolean(options.reranker || options.rerank),
41
+ sourceGrounding: normalizeBoolean(options['source-grounding'] || options.grounding || options.citations),
42
+ aclFilter: normalizeBoolean(options['acl-filter'] || options.acl || options['access-control']),
43
+ freshnessWindowHours: toNumber(options['freshness-window-hours'] || options['freshness-hours']),
44
+ scaleCorpusDocuments: toNumber(options['scale-corpus-documents'] || options['corpus-documents'] || options.documents),
37
45
  latencyMs: toNumber(options['latency-ms'] || options.latency),
38
46
  latencyBudgetMs: toNumber(options['latency-budget-ms'] || options['latency-budget']),
39
47
  agenticPipeline: normalizeBoolean(options.agentic || options['agentic-pipeline']),
@@ -69,6 +77,8 @@ function buildSignals(options) {
69
77
  precisionTuningSignal(options, drop),
70
78
  ragCascadeSignal(options),
71
79
  verifierLatencySignal(options),
80
+ hybridScaleSignal(options),
81
+ retrievalGovernanceSignal(options),
72
82
  ].filter(Boolean);
73
83
  }
74
84
 
@@ -113,6 +123,48 @@ function verifierLatencySignal(options) {
113
123
  };
114
124
  }
115
125
 
126
/**
 * Produces the "hybrid retrieval scale wall" signal.
 *
 * The signal fires when hybrid retrieval is requested (explicitly, or by
 * enabling both dense and sparse retrieval) or when the corpus holds at least
 * 100000 documents. Its values list the enabled retrieval modes, the corpus
 * size when known, and every control that is not in place.
 *
 * @param {object} options - Normalized options; `scaleCorpusDocuments` is
 *   compared against `null` to detect an unknown corpus size.
 * @returns {object|null} The signal, or `null` when it does not apply.
 */
function hybridScaleSignal(options) {
  const {
    hybridRetrieval,
    denseRetrieval,
    sparseRetrieval,
    reranker,
    sourceGrounding,
    aclFilter,
    scaleCorpusDocuments,
  } = options;

  const corpusSizeKnown = scaleCorpusDocuments !== null;
  const hybridIntent = hybridRetrieval || (denseRetrieval && sparseRetrieval);
  const largeCorpus = corpusSizeKnown && scaleCorpusDocuments >= 100000;
  if (!hybridIntent && !largeCorpus) return null;

  // Evidence that is present comes first, followed by the missing controls.
  const values = [];
  if (hybridRetrieval) values.push('hybrid retrieval');
  if (denseRetrieval) values.push('dense retrieval');
  if (sparseRetrieval) values.push('sparse retrieval');
  if (corpusSizeKnown) values.push(`${scaleCorpusDocuments} documents`);
  if (!denseRetrieval) values.push('dense recall unmeasured');
  if (!sparseRetrieval) values.push('sparse recall unmeasured');
  if (!reranker) values.push('missing reranker');
  if (!sourceGrounding) values.push('missing source grounding');
  if (!aclFilter) values.push('missing ACL filter');

  return {
    id: 'hybrid_retrieval_scale_wall',
    label: 'Hybrid retrieval scale wall',
    values,
    risk: 'scaled RAG needs dense, sparse, reranking, grounding, and access-control evidence instead of vector-only correctness',
  };
}
150
+
151
/**
 * Produces the "retrieval governance gap" signal.
 *
 * The signal is considered only for agentic pipelines, hybrid retrieval, or
 * runs with a known corpus size, and it is skipped for non-agentic pipelines
 * that already enforce both source grounding and ACL filtering. A gap is a
 * missing source-grounding control, a missing ACL filter, or an undeclared
 * freshness window.
 *
 * @param {object} options - Normalized options; `scaleCorpusDocuments` and
 *   `freshnessWindowHours` are compared against `null` to detect "not set".
 * @returns {object|null} The signal, or `null` when there is no gap to report.
 */
function retrievalGovernanceSignal(options) {
  const {
    agenticPipeline,
    hybridRetrieval,
    scaleCorpusDocuments,
    sourceGrounding,
    aclFilter,
    freshnessWindowHours,
  } = options;

  if (!agenticPipeline && sourceGrounding && aclFilter) return null;
  if (!(agenticPipeline || hybridRetrieval || scaleCorpusDocuments !== null)) return null;

  const freshnessDeclared = freshnessWindowHours !== null;
  const gaps = [
    sourceGrounding ? null : 'source evidence not enforced',
    aclFilter ? null : 'access control not enforced',
    freshnessDeclared ? null : 'freshness window missing',
  ].filter(Boolean);

  // A declared freshness window is context, not a gap. Counting it as one made
  // this guard unreachable and flagged fully governed agentic pipelines.
  if (gaps.length === 0) return null;

  return {
    id: 'retrieval_governance_gap',
    label: 'Retrieval governance gap',
    // Keep the declared window as trailing context alongside the real gaps.
    values: freshnessDeclared ? [...gaps, `${freshnessWindowHours}h freshness window`] : gaps,
    risk: 'agentic retrieval output can leak stale or unauthorized context into downstream actions',
  };
}
167
+
116
168
  function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
117
169
  const options = normalizeOptions(rawOptions);
118
170
  const templates = listGateTemplates(templatesPath)
@@ -135,6 +187,14 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
135
187
  recallDropPercent: recallDropPercent(options),
136
188
  baselinePrecision: options.baselinePrecision,
137
189
  newPrecision: options.newPrecision,
190
+ hybridRetrieval: options.hybridRetrieval,
191
+ denseRetrieval: options.denseRetrieval,
192
+ sparseRetrieval: options.sparseRetrieval,
193
+ reranker: options.reranker,
194
+ sourceGrounding: options.sourceGrounding,
195
+ aclFilter: options.aclFilter,
196
+ freshnessWindowHours: options.freshnessWindowHours,
197
+ scaleCorpusDocuments: options.scaleCorpusDocuments,
138
198
  latencyMs: options.latencyMs,
139
199
  latencyBudgetMs: options.latencyBudgetMs,
140
200
  },
@@ -150,8 +210,10 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
150
210
  'Block embedding or threshold changes when recall drops without an approved rollback plan.',
151
211
  'Use a second-stage verifier or reranker for structural near misses such as negation and role reversal.',
152
212
  'Attach verifier latency budgets before routing the retrieval output into autonomous agent actions.',
213
+ 'Measure dense recall, sparse recall, reranked relevance, source grounding, ACL filtering, and freshness as separate production gates.',
214
+ 'Treat the retrieval layer as the agent ground truth: every autonomous action should carry source evidence and access-control proof.',
153
215
  ],
154
- exampleCommand: 'npx thumbgate rag-precision-guardrails --baseline-recall=0.86 --new-recall=0.72 --threshold-change --agentic --structural-near-misses --json',
216
+ exampleCommand: 'npx thumbgate rag-precision-guardrails --hybrid-retrieval --dense --sparse --scale-corpus-documents=1000000 --agentic --json',
155
217
  };
156
218
  }
157
219
 
@@ -36,7 +36,7 @@ const PAYWALL_MESSAGES = {
36
36
  prevention_rules: 'Free tier includes 1 prevention rule. Your agents need more protection — upgrade to Pro for unlimited rules.',
37
37
  recall: 'Recall is a Pro feature. Your past feedback is stored locally — upgrade to search and reuse it.',
38
38
  search_lessons: 'Lesson search is a Pro feature. Upgrade to find patterns in your agent\'s mistakes.',
39
- default: 'This feature requires Pro. Start a 7-day trial — card required; no charge today.',
39
+ default: 'This feature requires Pro. Start Pro — card required; billed today.',
40
40
  };
41
41
 
42
42
  function isProTier(authContext) {
@@ -11,6 +11,13 @@ function normalizeBoolean(value) {
11
11
  return /^(1|true|yes|on)$/i.test(String(value).trim());
12
12
  }
13
13
 
14
/**
 * Coerces an optional flag into a tri-state value.
 *
 * Returns `null` when the flag was not supplied (`undefined`, `null`, or the
 * empty string). Real booleans pass through; every other value is
 * stringified, trimmed, and accepted only when it reads `1`, `true`, `yes`,
 * or `on` (case-insensitive).
 *
 * @param {*} value - Raw option value.
 * @returns {boolean|null} The normalized flag, or `null` when absent.
 */
function normalizeOptionalBoolean(value) {
  const isAbsent = value == null || value === '';
  if (isAbsent) return null;
  if (typeof value === 'boolean') return value;
  const text = String(value).trim();
  return /^(1|true|yes|on)$/i.test(text);
}
20
+
14
21
  function toNumber(value) {
15
22
  if (value === undefined || value === null || value === '') return null;
16
23
  const num = Number(value);
@@ -28,6 +35,16 @@ function normalizeOptions(options = {}) {
28
35
  lowConfidenceSteps: toNumber(options['low-confidence-steps']),
29
36
  highConfidenceFailures: toNumber(options['high-confidence-failures']),
30
37
  truncationFailures: normalizeBoolean(options['truncation-failures']),
38
+ promptTokens: toNumber(options['prompt-tokens'] || options['context-tokens'] || options.context),
39
+ outputTokens: toNumber(options['output-tokens'] || options.output),
40
+ ttftMs: toNumber(options['ttft-ms'] || options['time-to-first-token-ms'] || options.ttft),
41
+ tokensPerSecond: toNumber(options['tokens-per-second'] || options.tps),
42
+ kvCache: normalizeOptionalBoolean(options['kv-cache'] ?? options.kvcache),
43
+ kvCacheHitRate: toNumber(options['kv-cache-hit-rate'] || options['kv-hit-rate']),
44
+ quantized: normalizeBoolean(options.quantized || options.quantization),
45
+ qualityDelta: toNumber(options['quality-delta'] || options['quantized-quality-delta']),
46
+ prefillBudgetMs: toNumber(options['prefill-budget-ms'] || options['ttft-budget-ms']),
47
+ decodeBudgetTps: toNumber(options['decode-budget-tps'] || options['tokens-per-second-budget']),
31
48
  };
32
49
  }
33
50
 
@@ -89,6 +106,49 @@ function buildSignals(options) {
89
106
  risk: 'failed rollouts may reflect verifier noise or truncation rather than bad reasoning',
90
107
  });
91
108
  }
109
+ if (options.promptTokens !== null || options.ttftMs !== null || options.prefillBudgetMs !== null) {
110
+ const overBudget = options.ttftMs !== null &&
111
+ options.prefillBudgetMs !== null &&
112
+ options.ttftMs > options.prefillBudgetMs;
113
+ signals.push({
114
+ id: 'prefill_decode_split',
115
+ label: 'Inference prefill/decode budget',
116
+ values: [
117
+ options.promptTokens !== null ? `${options.promptTokens} prompt tokens` : null,
118
+ options.ttftMs !== null ? `${options.ttftMs}ms TTFT` : null,
119
+ options.prefillBudgetMs !== null ? `${options.prefillBudgetMs}ms prefill budget` : null,
120
+ options.tokensPerSecond !== null ? `${options.tokensPerSecond} tokens/sec decode` : null,
121
+ options.decodeBudgetTps !== null ? `${options.decodeBudgetTps} tokens/sec decode budget` : null,
122
+ overBudget ? 'TTFT over budget' : null,
123
+ ].filter(Boolean),
124
+ risk: 'long prompts raise prefill cost while slow decode can make cheap models miss user-facing latency budgets',
125
+ });
126
+ }
127
+ if ((options.promptTokens !== null && options.promptTokens >= 32000) || options.kvCacheHitRate !== null || options.kvCache === false) {
128
+ const lowHitRate = options.kvCacheHitRate !== null && options.kvCacheHitRate < 0.8;
129
+ signals.push({
130
+ id: 'kv_cache_policy',
131
+ label: 'KV cache policy',
132
+ values: [
133
+ options.kvCache === true ? 'KV cache enabled' : null,
134
+ options.kvCache === false ? 'KV cache missing' : null,
135
+ options.kvCacheHitRate !== null ? `${options.kvCacheHitRate} hit rate` : null,
136
+ lowHitRate ? 'low cache hit rate' : null,
137
+ ].filter(Boolean),
138
+ risk: 'uncached long-context workloads can make repeated agent calls materially more expensive and slower',
139
+ });
140
+ }
141
+ if (options.quantized || options.qualityDelta !== null) {
142
+ signals.push({
143
+ id: 'quantization_rollout',
144
+ label: 'Quantized runtime rollout',
145
+ values: [
146
+ options.quantized ? 'quantized runtime' : null,
147
+ options.qualityDelta !== null ? `${options.qualityDelta} quality delta` : 'missing quality delta',
148
+ ].filter(Boolean),
149
+ risk: 'quantization can cut inference cost only if quality and latency are measured before production routing',
150
+ });
151
+ }
92
152
  return signals;
93
153
  }
94
154
 
@@ -114,6 +174,16 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
114
174
  baselineAccuracy: options.baselineAccuracy,
115
175
  compressedAccuracy: options.compressedAccuracy,
116
176
  accuracyDelta: accuracyDelta(options),
177
+ promptTokens: options.promptTokens,
178
+ outputTokens: options.outputTokens,
179
+ ttftMs: options.ttftMs,
180
+ tokensPerSecond: options.tokensPerSecond,
181
+ kvCache: options.kvCache,
182
+ kvCacheHitRate: options.kvCacheHitRate,
183
+ quantized: options.quantized,
184
+ qualityDelta: options.qualityDelta,
185
+ prefillBudgetMs: options.prefillBudgetMs,
186
+ decodeBudgetTps: options.decodeBudgetTps,
117
187
  },
118
188
  summary: {
119
189
  signalCount: signals.length,
@@ -127,8 +197,10 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
127
197
  'Inspect low-confidence steps even when the final rollout is correct.',
128
198
  'Inspect high-confidence failed rollouts for truncation or verifier noise before penalizing the trace.',
129
199
  'Route cheaper compressed reasoning only after accuracy and efficiency both clear the gate.',
200
+ 'Track TTFT, decode throughput, KV-cache hit rate, and prompt-token count separately for every agent runtime.',
201
+ 'Route quantized runtimes only after latency savings clear quality and verifier baselines.',
130
202
  ],
131
- exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --baseline-tokens=1200 --compressed-tokens=980 --baseline-accuracy=0.84 --compressed-accuracy=0.85 --verifier --json',
203
+ exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --prompt-tokens=120000 --ttft-ms=1800 --prefill-budget-ms=800 --kv-cache=false --quantized --quality-delta=-0.03 --json',
132
204
  };
133
205
  }
134
206