npm - thumbgate - Versions diffs - 1.16.20 → 1.16.21 - Mend

thumbgate 1.16.20 → 1.16.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/.claude-plugin/marketplace.json +2 -2
package/.claude-plugin/plugin.json +1 -1
package/.well-known/mcp/server-card.json +1 -1
package/README.md +3 -2
package/adapters/claude/.mcp.json +2 -2
package/adapters/mcp/server-stdio.js +1 -1
package/adapters/opencode/opencode.json +1 -1
package/bench/programbench-smoke.json +71 -0
package/bench/thumbgate-bench.json +131 -0
package/bin/cli.js +64 -1
package/package.json +16 -8
package/public/dashboard.html +1 -1
package/public/guide.html +5 -3
package/public/index.html +43 -31
package/public/lessons.html +1 -1
package/public/numbers.html +11 -11
package/public/pro.html +31 -88
package/scripts/billing.js +3 -3
package/scripts/harness-selector.js +188 -0
package/scripts/rag-precision-guardrails.js +63 -1
package/scripts/rate-limiter.js +1 -1
package/scripts/reasoning-efficiency-guardrails.js +73 -1
package/scripts/thumbgate-bench.js +707 -0
package/src/api/server.js +66 -13

package/scripts/harness-selector.js CHANGED Viewed

@@ -253,6 +253,190 @@ function buildHarnessOptimizationAudit(options = {}) {
   return scoreHarnessAudit(inputs, options);
 }
+function normalizeBoolean(value) {
+  if (value === true) return true;
+  if (value === false || value === undefined || value === null) return false;
+  return /^(1|true|yes|on)$/i.test(String(value).trim());
+}
+function normalizeOptionalBoolean(value, fallback = true) {
+  if (value === undefined || value === null || value === '') return fallback;
+  if (value === true) return true;
+  if (value === false) return false;
+  return /^(1|true|yes|on)$/i.test(String(value).trim());
+}
+function toNumber(value) {
+  if (value === undefined || value === null || value === '') return null;
+  const num = Number(value);
+  return Number.isFinite(num) ? num : null;
+}
+function buildHarnessFitAudit(options = {}) {
+  const nativeHarness = String(options['native-harness'] || options.native || 'native').trim() || 'native';
+  const genericHarness = String(options['generic-harness'] || options.generic || 'generic').trim() || 'generic';
+  const sameModelDifferentHarness = normalizeBoolean(options['same-model-different-harness'] || options['same-model'] || options.crossHarness);
+  const controls = {
+    toolSchemaParity: normalizeOptionalBoolean(options['tool-schema-parity']),
+    permissionParity: normalizeOptionalBoolean(options['permission-parity']),
+    stateIsolation: normalizeOptionalBoolean(options['state-isolation']),
+    patchLoopParity: normalizeOptionalBoolean(options['patch-loop-parity']),
+    verificationParity: normalizeOptionalBoolean(options['verification-parity']),
+  };
+  const handoffDrift = toNumber(options['handoff-drift'] || options['handoff-drift-percent']);
+  const gaps = Object.entries(controls)
+    .filter(([, value]) => value === false)
+    .map(([key]) => key);
+  let score = 100;
+  if (sameModelDifferentHarness) score -= 15;
+  score -= gaps.length * 12;
+  if (handoffDrift !== null && handoffDrift > 0) score -= Math.min(20, Math.ceil(handoffDrift));
+  const signals = [];
+  if (sameModelDifferentHarness || gaps.length > 0) {
+    signals.push({
+      id: 'model_harness_fit',
+      label: 'Same model, different harness',
+      values: [
+        `${nativeHarness} vs ${genericHarness}`,
+        sameModelDifferentHarness ? 'same model run across harnesses' : null,
+        ...gaps.map((gap) => `${gap} gap`),
+      ].filter(Boolean),
+      risk: 'model quality can change when tool schemas, permissions, state, patch loops, or verification differ by harness',
+    });
+  }
+  if (handoffDrift !== null && handoffDrift > 0) {
+    signals.push({
+      id: 'handoff_drift',
+      label: 'Cross-harness handoff drift',
+      values: [`${handoffDrift}% drift`],
+      risk: 'handoffs between generic and native harnesses can lose task state or weaken verification',
+    });
+  }
+  const normalizedScore = Math.max(0, Math.min(100, score));
+  return {
+    name: 'thumbgate-model-harness-fit-audit',
+    status: normalizedScore >= 85 ? 'portable' : normalizedScore >= 65 ? 'watch' : 'native-required',
+    score: normalizedScore,
+    nativeHarness,
+    genericHarness,
+    controls,
+    metrics: { sameModelDifferentHarness, handoffDrift },
+    signals,
+    recommendations: [
+      'Benchmark the same task, same model, and same repository in native and generic harnesses before standardizing.',
+      'Require parity proof for tool schemas, permissions, state isolation, patch application, and verification loops.',
+      'Use the native harness for production edits when parity gaps remain; reserve generic harnesses for exploration and read-only analysis.',
+    ],
+  };
+}
+function formatHarnessFitAudit(report) {
+  const lines = [
+    '',
+    'ThumbGate Model-Harness Fit Audit',
+    '-'.repeat(37),
+    `Status : ${report.status}`,
+    `Score  : ${report.score}/100`,
+    `Harness: ${report.nativeHarness} vs ${report.genericHarness}`,
+    `Signals: ${report.signals.length}`,
+  ];
+  if (report.signals.length > 0) {
+    lines.push('', 'Detected harness-fit risks:');
+    for (const signal of report.signals) {
+      lines.push(`  - ${signal.label}: ${signal.values.join(', ')}`);
+      lines.push(`    Risk: ${signal.risk}`);
+    }
+  }
+  lines.push('', 'Recommendations:');
+  for (const recommendation of report.recommendations) lines.push(`  - ${recommendation}`);
+  return `${lines.join('\n')}\n\n`;
+}
+function buildSolverWorkflowGovernance(options = {}) {
+  const solver = String(options.solver || options['solver-engine'] || 'solver').trim() || 'solver';
+  const multiAgent = normalizeBoolean(options['multi-agent'] || options.multiAgent || options.agentic);
+  const controls = {
+    objectiveDefined: normalizeOptionalBoolean(options['objective-defined']),
+    constraintsDefined: normalizeOptionalBoolean(options['constraints-defined']),
+    scenarioReplay: normalizeOptionalBoolean(options['scenario-replay']),
+    approvalGate: normalizeOptionalBoolean(options['approval-gate']),
+    rollbackPlan: normalizeOptionalBoolean(options['rollback-plan']),
+    solverProvenance: normalizeOptionalBoolean(options['solver-provenance']),
+  };
+  const dataFreshnessHours = toNumber(options['data-freshness-hours'] || options['freshness-hours']);
+  const gaps = Object.entries(controls)
+    .filter(([, value]) => value === false)
+    .map(([key]) => key);
+  let score = 100;
+  if (multiAgent) score -= 8;
+  score -= gaps.length * 13;
+  if (dataFreshnessHours !== null && dataFreshnessHours > 24) score -= 10;
+  const signals = [];
+  if (multiAgent || gaps.length > 0) {
+    signals.push({
+      id: 'solver_workflow_governance',
+      label: 'Solver-backed agent workflow',
+      values: [
+        solver,
+        multiAgent ? 'multi-agent orchestration' : null,
+        ...gaps.map((gap) => `${gap} gap`),
+      ].filter(Boolean),
+      risk: 'natural-language-to-optimization workflows need objective, constraint, replay, approval, rollback, and provenance gates',
+    });
+  }
+  if (dataFreshnessHours !== null && dataFreshnessHours > 24) {
+    signals.push({
+      id: 'solver_data_freshness',
+      label: 'Solver data freshness',
+      values: [`${dataFreshnessHours}h old`],
+      risk: 'optimization results can look mathematically valid while using stale operational data',
+    });
+  }
+  const normalizedScore = Math.max(0, Math.min(100, score));
+  return {
+    name: 'thumbgate-solver-workflow-governance',
+    status: normalizedScore >= 85 ? 'ready' : normalizedScore >= 65 ? 'approval-required' : 'blocked',
+    score: normalizedScore,
+    solver,
+    controls,
+    metrics: { multiAgent, dataFreshnessHours },
+    signals,
+    recommendations: [
+      'Capture the objective function, hard constraints, soft constraints, and data freshness before invoking the solver.',
+      'Replay at least one baseline scenario and one counterfactual before approving optimized actions.',
+      'Require human approval and rollback evidence before solver output changes supply chain, routing, scheduling, or pricing decisions.',
+    ],
+  };
+}
+function formatSolverWorkflowGovernance(report) {
+  const lines = [
+    '',
+    'ThumbGate Solver Workflow Governance',
+    '-'.repeat(38),
+    `Status: ${report.status}`,
+    `Score : ${report.score}/100`,
+    `Solver: ${report.solver}`,
+    `Signals: ${report.signals.length}`,
+  ];
+  if (report.signals.length > 0) {
+    lines.push('', 'Detected solver workflow risks:');
+    for (const signal of report.signals) {
+      lines.push(`  - ${signal.label}: ${signal.values.join(', ')}`);
+      lines.push(`    Risk: ${signal.risk}`);
+    }
+  }
+  lines.push('', 'Recommendations:');
+  for (const recommendation of report.recommendations) lines.push(`  - ${recommendation}`);
+  return `${lines.join('\n')}\n\n`;
+}
 // ---------------------------------------------------------------------------
 // Internal helpers
 // ---------------------------------------------------------------------------
@@ -284,6 +468,10 @@ module.exports = {
   collectDefaultHarnessAuditInputs,
   scoreHarnessAudit,
   buildHarnessOptimizationAudit,
+  buildHarnessFitAudit,
+  formatHarnessFitAudit,
+  buildSolverWorkflowGovernance,
+  formatSolverWorkflowGovernance,
   extractCommandText,
   HARNESSES,
   DEPLOY_PATTERNS,

package/scripts/rag-precision-guardrails.js CHANGED Viewed

@@ -34,6 +34,14 @@ function normalizeOptions(options = {}) {
     embeddingFineTune: normalizeBoolean(options['embedding-finetune'] || options['embedding-fine-tune'] || options['fine-tune']),
     structuralNearMisses: normalizeBoolean(options['structural-near-misses'] || options['near-misses']),
     verifier: normalizeBoolean(options.verifier || options.reranker || options['second-stage']),
+    hybridRetrieval: normalizeBoolean(options['hybrid-retrieval'] || options.hybrid),
+    denseRetrieval: normalizeBoolean(options.dense || options['dense-retrieval'] || options.embeddings),
+    sparseRetrieval: normalizeBoolean(options.sparse || options['sparse-retrieval'] || options.keyword),
+    reranker: normalizeBoolean(options.reranker || options.rerank),
+    sourceGrounding: normalizeBoolean(options['source-grounding'] || options.grounding || options.citations),
+    aclFilter: normalizeBoolean(options['acl-filter'] || options.acl || options['access-control']),
+    freshnessWindowHours: toNumber(options['freshness-window-hours'] || options['freshness-hours']),
+    scaleCorpusDocuments: toNumber(options['scale-corpus-documents'] || options['corpus-documents'] || options.documents),
     latencyMs: toNumber(options['latency-ms'] || options.latency),
     latencyBudgetMs: toNumber(options['latency-budget-ms'] || options['latency-budget']),
     agenticPipeline: normalizeBoolean(options.agentic || options['agentic-pipeline']),
@@ -69,6 +77,8 @@ function buildSignals(options) {
     precisionTuningSignal(options, drop),
     ragCascadeSignal(options),
     verifierLatencySignal(options),
+    hybridScaleSignal(options),
+    retrievalGovernanceSignal(options),
   ].filter(Boolean);
 }
@@ -113,6 +123,48 @@ function verifierLatencySignal(options) {
   };
 }
+function hybridScaleSignal(options) {
+  const hybridIntent = options.hybridRetrieval || (options.denseRetrieval && options.sparseRetrieval);
+  const largeCorpus = options.scaleCorpusDocuments !== null && options.scaleCorpusDocuments >= 100000;
+  if (!(hybridIntent || largeCorpus)) return null;
+  const missingControls = [
+    options.denseRetrieval ? null : 'dense recall unmeasured',
+    options.sparseRetrieval ? null : 'sparse recall unmeasured',
+    options.reranker ? null : 'missing reranker',
+    options.sourceGrounding ? null : 'missing source grounding',
+    options.aclFilter ? null : 'missing ACL filter',
+  ].filter(Boolean);
+  return {
+    id: 'hybrid_retrieval_scale_wall',
+    label: 'Hybrid retrieval scale wall',
+    values: [
+      options.hybridRetrieval ? 'hybrid retrieval' : null,
+      options.denseRetrieval ? 'dense retrieval' : null,
+      options.sparseRetrieval ? 'sparse retrieval' : null,
+      options.scaleCorpusDocuments !== null ? `${options.scaleCorpusDocuments} documents` : null,
+      ...missingControls,
+    ].filter(Boolean),
+    risk: 'scaled RAG needs dense, sparse, reranking, grounding, and access-control evidence instead of vector-only correctness',
+  };
+}
+function retrievalGovernanceSignal(options) {
+  if (!options.agenticPipeline && options.sourceGrounding && options.aclFilter) return null;
+  if (!(options.agenticPipeline || options.hybridRetrieval || options.scaleCorpusDocuments !== null)) return null;
+  const gaps = [
+    options.sourceGrounding ? null : 'source evidence not enforced',
+    options.aclFilter ? null : 'access control not enforced',
+    options.freshnessWindowHours === null ? 'freshness window missing' : `${options.freshnessWindowHours}h freshness window`,
+  ].filter(Boolean);
+  if (gaps.length === 0) return null;
+  return {
+    id: 'retrieval_governance_gap',
+    label: 'Retrieval governance gap',
+    values: gaps,
+    risk: 'agentic retrieval output can leak stale or unauthorized context into downstream actions',
+  };
+}
 function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
   const options = normalizeOptions(rawOptions);
   const templates = listGateTemplates(templatesPath)
@@ -135,6 +187,14 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
       recallDropPercent: recallDropPercent(options),
       baselinePrecision: options.baselinePrecision,
       newPrecision: options.newPrecision,
+      hybridRetrieval: options.hybridRetrieval,
+      denseRetrieval: options.denseRetrieval,
+      sparseRetrieval: options.sparseRetrieval,
+      reranker: options.reranker,
+      sourceGrounding: options.sourceGrounding,
+      aclFilter: options.aclFilter,
+      freshnessWindowHours: options.freshnessWindowHours,
+      scaleCorpusDocuments: options.scaleCorpusDocuments,
       latencyMs: options.latencyMs,
       latencyBudgetMs: options.latencyBudgetMs,
     },
@@ -150,8 +210,10 @@ function buildRagPrecisionGuardrailsPlan(rawOptions = {}, templatesPath) {
       'Block embedding or threshold changes when recall drops without an approved rollback plan.',
       'Use a second-stage verifier or reranker for structural near misses such as negation and role reversal.',
       'Attach verifier latency budgets before routing the retrieval output into autonomous agent actions.',
+      'Measure dense recall, sparse recall, reranked relevance, source grounding, ACL filtering, and freshness as separate production gates.',
+      'Treat the retrieval layer as the agent ground truth: every autonomous action should carry source evidence and access-control proof.',
     ],
-    exampleCommand: 'npx thumbgate rag-precision-guardrails --baseline-recall=0.86 --new-recall=0.72 --threshold-change --agentic --structural-near-misses --json',
+    exampleCommand: 'npx thumbgate rag-precision-guardrails --hybrid-retrieval --dense --sparse --scale-corpus-documents=1000000 --agentic --json',
   };
 }

package/scripts/rate-limiter.js CHANGED Viewed

@@ -36,7 +36,7 @@ const PAYWALL_MESSAGES = {
   prevention_rules: 'Free tier includes 1 prevention rule. Your agents need more protection — upgrade to Pro for unlimited rules.',
   recall: 'Recall is a Pro feature. Your past feedback is stored locally — upgrade to search and reuse it.',
   search_lessons: 'Lesson search is a Pro feature. Upgrade to find patterns in your agent\'s mistakes.',
-  default: 'This feature requires Pro. Start a 7-day trial — card required; no charge today.',
+  default: 'This feature requires Pro. Start Pro — card required; billed today.',
 };
 function isProTier(authContext) {

package/scripts/reasoning-efficiency-guardrails.js CHANGED Viewed

@@ -11,6 +11,13 @@ function normalizeBoolean(value) {
   return /^(1|true|yes|on)$/i.test(String(value).trim());
 }
+function normalizeOptionalBoolean(value) {
+  if (value === undefined || value === null || value === '') return null;
+  if (value === true) return true;
+  if (value === false) return false;
+  return /^(1|true|yes|on)$/i.test(String(value).trim());
+}
 function toNumber(value) {
   if (value === undefined || value === null || value === '') return null;
   const num = Number(value);
@@ -28,6 +35,16 @@ function normalizeOptions(options = {}) {
     lowConfidenceSteps: toNumber(options['low-confidence-steps']),
     highConfidenceFailures: toNumber(options['high-confidence-failures']),
     truncationFailures: normalizeBoolean(options['truncation-failures']),
+    promptTokens: toNumber(options['prompt-tokens'] || options['context-tokens'] || options.context),
+    outputTokens: toNumber(options['output-tokens'] || options.output),
+    ttftMs: toNumber(options['ttft-ms'] || options['time-to-first-token-ms'] || options.ttft),
+    tokensPerSecond: toNumber(options['tokens-per-second'] || options.tps),
+    kvCache: normalizeOptionalBoolean(options['kv-cache'] ?? options.kvcache),
+    kvCacheHitRate: toNumber(options['kv-cache-hit-rate'] || options['kv-hit-rate']),
+    quantized: normalizeBoolean(options.quantized || options.quantization),
+    qualityDelta: toNumber(options['quality-delta'] || options['quantized-quality-delta']),
+    prefillBudgetMs: toNumber(options['prefill-budget-ms'] || options['ttft-budget-ms']),
+    decodeBudgetTps: toNumber(options['decode-budget-tps'] || options['tokens-per-second-budget']),
   };
 }
@@ -89,6 +106,49 @@ function buildSignals(options) {
       risk: 'failed rollouts may reflect verifier noise or truncation rather than bad reasoning',
     });
   }
+  if (options.promptTokens !== null || options.ttftMs !== null || options.prefillBudgetMs !== null) {
+    const overBudget = options.ttftMs !== null &&
+      options.prefillBudgetMs !== null &&
+      options.ttftMs > options.prefillBudgetMs;
+    signals.push({
+      id: 'prefill_decode_split',
+      label: 'Inference prefill/decode budget',
+      values: [
+        options.promptTokens !== null ? `${options.promptTokens} prompt tokens` : null,
+        options.ttftMs !== null ? `${options.ttftMs}ms TTFT` : null,
+        options.prefillBudgetMs !== null ? `${options.prefillBudgetMs}ms prefill budget` : null,
+        options.tokensPerSecond !== null ? `${options.tokensPerSecond} tokens/sec decode` : null,
+        options.decodeBudgetTps !== null ? `${options.decodeBudgetTps} tokens/sec decode budget` : null,
+        overBudget ? 'TTFT over budget' : null,
+      ].filter(Boolean),
+      risk: 'long prompts raise prefill cost while slow decode can make cheap models miss user-facing latency budgets',
+    });
+  }
+  if ((options.promptTokens !== null && options.promptTokens >= 32000) || options.kvCacheHitRate !== null || options.kvCache === false) {
+    const lowHitRate = options.kvCacheHitRate !== null && options.kvCacheHitRate < 0.8;
+    signals.push({
+      id: 'kv_cache_policy',
+      label: 'KV cache policy',
+      values: [
+        options.kvCache === true ? 'KV cache enabled' : null,
+        options.kvCache === false ? 'KV cache missing' : null,
+        options.kvCacheHitRate !== null ? `${options.kvCacheHitRate} hit rate` : null,
+        lowHitRate ? 'low cache hit rate' : null,
+      ].filter(Boolean),
+      risk: 'uncached long-context workloads can make repeated agent calls materially more expensive and slower',
+    });
+  }
+  if (options.quantized || options.qualityDelta !== null) {
+    signals.push({
+      id: 'quantization_rollout',
+      label: 'Quantized runtime rollout',
+      values: [
+        options.quantized ? 'quantized runtime' : null,
+        options.qualityDelta !== null ? `${options.qualityDelta} quality delta` : 'missing quality delta',
+      ].filter(Boolean),
+      risk: 'quantization can cut inference cost only if quality and latency are measured before production routing',
+    });
+  }
   return signals;
 }
@@ -114,6 +174,16 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
       baselineAccuracy: options.baselineAccuracy,
       compressedAccuracy: options.compressedAccuracy,
       accuracyDelta: accuracyDelta(options),
+      promptTokens: options.promptTokens,
+      outputTokens: options.outputTokens,
+      ttftMs: options.ttftMs,
+      tokensPerSecond: options.tokensPerSecond,
+      kvCache: options.kvCache,
+      kvCacheHitRate: options.kvCacheHitRate,
+      quantized: options.quantized,
+      qualityDelta: options.qualityDelta,
+      prefillBudgetMs: options.prefillBudgetMs,
+      decodeBudgetTps: options.decodeBudgetTps,
     },
     summary: {
       signalCount: signals.length,
@@ -127,8 +197,10 @@ function buildReasoningEfficiencyGuardrailsPlan(rawOptions = {}, templatesPath)
       'Inspect low-confidence steps even when the final rollout is correct.',
       'Inspect high-confidence failed rollouts for truncation or verifier noise before penalizing the trace.',
       'Route cheaper compressed reasoning only after accuracy and efficiency both clear the gate.',
+      'Track TTFT, decode throughput, KV-cache hit rate, and prompt-token count separately for every agent runtime.',
+      'Route quantized runtimes only after latency savings clear quality and verifier baselines.',
     ],
-    exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --baseline-tokens=1200 --compressed-tokens=980 --baseline-accuracy=0.84 --compressed-accuracy=0.85 --verifier --json',
+    exampleCommand: 'npx thumbgate reasoning-efficiency-guardrails --prompt-tokens=120000 --ttft-ms=1800 --prefill-budget-ms=800 --kv-cache=false --quantized --quality-delta=-0.03 --json',
   };
 }