npm - thumbgate - Versions diffs - 1.14.1 → 1.16.0 - Mend

thumbgate 1.14.1 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (150) hide show

package/.claude-plugin/marketplace.json +6 -6
package/.claude-plugin/plugin.json +3 -3
package/.well-known/llms.txt +5 -5
package/.well-known/mcp/server-card.json +1 -1
package/README.md +60 -35
package/adapters/chatgpt/openapi.yaml +118 -2
package/adapters/claude/.mcp.json +2 -2
package/adapters/mcp/server-stdio.js +217 -84
package/adapters/opencode/opencode.json +1 -1
package/bench/prompt-eval-suite.json +5 -1
package/bin/cli.js +211 -8
package/config/enforcement.json +59 -7
package/config/evals/agent-safety-eval.json +338 -22
package/config/gates/default.json +33 -0
package/config/gates/routine.json +43 -0
package/config/github-about.json +3 -3
package/config/mcp-allowlists.json +4 -0
package/config/merge-quality-checks.json +2 -1
package/config/model-candidates.json +131 -0
package/openapi/openapi.yaml +118 -2
package/package.json +70 -51
package/public/blog.html +7 -7
package/public/codex-plugin.html +13 -7
package/public/compare.html +29 -23
package/public/dashboard.html +105 -12
package/public/guide.html +28 -28
package/public/index.html +233 -97
package/public/learn.html +87 -20
package/public/lessons.html +26 -2
package/public/numbers.html +271 -0
package/public/pro.html +89 -19
package/scripts/agent-audit-trace.js +55 -0
package/scripts/agent-memory-lifecycle.js +96 -0
package/scripts/agent-readiness-plan.js +118 -0
package/scripts/agentic-data-pipeline.js +21 -1
package/scripts/agents-sdk-sandbox-plan.js +57 -0
package/scripts/ai-org-governance.js +98 -0
package/scripts/ai-search-distribution.js +43 -0
package/scripts/artifact-agent-plan.js +81 -0
package/scripts/billing.js +27 -8
package/scripts/cli-feedback.js +2 -1
package/scripts/cli-schema.js +60 -5
package/scripts/code-mode-mcp-plan.js +71 -0
package/scripts/commercial-offer.js +1 -1
package/scripts/context-engine.js +1 -2
package/scripts/context-manager.js +4 -1
package/scripts/contextfs.js +214 -32
package/scripts/dashboard-render-spec.js +1 -1
package/scripts/dashboard.js +275 -9
package/scripts/decision-journal.js +13 -3
package/scripts/document-workflow-governance.js +62 -0
package/scripts/enterprise-agent-rollout.js +34 -0
package/scripts/experience-replay-governance.js +69 -0
package/scripts/export-hf-dataset.js +1 -1
package/scripts/feedback-loop.js +141 -9
package/scripts/feedback-to-rules.js +17 -23
package/scripts/gates-engine.js +4 -6
package/scripts/growth-campaigns.js +49 -0
package/scripts/harness-selector.js +145 -1
package/scripts/hybrid-supervisor-agent.js +64 -0
package/scripts/inference-cache-policy.js +72 -0
package/scripts/inference-economics.js +53 -0
package/scripts/internal-agent-bootstrap.js +12 -2
package/scripts/knowledge-layer-plan.js +108 -0
package/scripts/lesson-canonical.js +181 -0
package/scripts/lesson-db.js +71 -10
package/scripts/lesson-inference.js +183 -44
package/scripts/lesson-search.js +4 -1
package/scripts/lesson-synthesis.js +23 -2
package/scripts/llm-client.js +157 -26
package/scripts/mailer/resend-mailer.js +112 -1
package/scripts/mcp-transport-strategy.js +66 -0
package/scripts/memory-store-governance.js +60 -0
package/scripts/meta-agent-loop.js +7 -13
package/scripts/model-access-eligibility.js +38 -0
package/scripts/model-migration-readiness.js +55 -0
package/scripts/native-messaging-audit.js +514 -0
package/scripts/operational-integrity.js +96 -3
package/scripts/otel-declarative-config.js +56 -0
package/scripts/perplexity-client.js +1 -1
package/scripts/post-training-governance.js +34 -0
package/scripts/pr-manager.js +47 -7
package/scripts/private-core-boundary.js +72 -0
package/scripts/production-agent-readiness.js +40 -0
package/scripts/profile-router.js +16 -1
package/scripts/prompt-eval.js +564 -32
package/scripts/prompt-programs.js +93 -0
package/scripts/provider-action-normalizer.js +585 -0
package/scripts/rule-validator.js +285 -0
package/scripts/scaling-law-claims.js +60 -0
package/scripts/security-scanner.js +1 -1
package/scripts/self-distill-agent.js +7 -32
package/scripts/seo-gsd.js +400 -43
package/scripts/skill-rag-router.js +53 -0
package/scripts/spec-gate.js +1 -1
package/scripts/student-consistent-training.js +73 -0
package/scripts/synthetic-data-provenance.js +98 -0
package/scripts/task-context-result.js +81 -0
package/scripts/telemetry-analytics.js +149 -0
package/scripts/thompson-sampling.js +2 -2
package/scripts/token-savings.js +7 -6
package/scripts/token-tco.js +46 -0
package/scripts/tool-registry.js +75 -3
package/scripts/verification-loop.js +10 -1
package/scripts/verifier-scoring.js +71 -0
package/scripts/workflow-sentinel.js +284 -28
package/scripts/workspace-agent-routines.js +118 -0
package/skills/thumbgate/SKILL.md +1 -1
package/src/api/server.js +434 -120
package/.claude-plugin/README.md +0 -170
package/adapters/README.md +0 -12
package/scripts/analytics-report.js +0 -328
package/scripts/autonomous-workflow.js +0 -377
package/scripts/billing-setup.js +0 -109
package/scripts/creator-campaigns.js +0 -239
package/scripts/cross-encoder-reranker.js +0 -235
package/scripts/daemon-manager.js +0 -108
package/scripts/decision-trace.js +0 -354
package/scripts/delegation-runtime.js +0 -896
package/scripts/dispatch-brief.js +0 -159
package/scripts/distribution-surfaces.js +0 -110
package/scripts/feedback-history-distiller.js +0 -382
package/scripts/funnel-analytics.js +0 -35
package/scripts/history-distiller.js +0 -200
package/scripts/hosted-job-launcher.js +0 -256
package/scripts/intent-router.js +0 -392
package/scripts/lesson-reranker.js +0 -263
package/scripts/lesson-retrieval.js +0 -148
package/scripts/managed-lesson-agent.js +0 -183
package/scripts/operational-dashboard.js +0 -103
package/scripts/operational-summary.js +0 -129
package/scripts/operator-artifacts.js +0 -608
package/scripts/optimize-context.js +0 -17
package/scripts/org-dashboard.js +0 -206
package/scripts/partner-orchestration.js +0 -146
package/scripts/predictive-insights.js +0 -356
package/scripts/pulse.js +0 -80
package/scripts/reflector-agent.js +0 -221
package/scripts/sales-pipeline.js +0 -681
package/scripts/session-episode-store.js +0 -329
package/scripts/session-health-sensor.js +0 -242
package/scripts/session-report.js +0 -120
package/scripts/swarm-coordinator.js +0 -81
package/scripts/tool-kpi-tracker.js +0 -12
package/scripts/webhook-delivery.js +0 -62
package/scripts/workflow-sprint-intake.js +0 -475
package/skills/agent-memory/SKILL.md +0 -97
package/skills/solve-architecture-autonomy/SKILL.md +0 -17
package/skills/solve-architecture-autonomy/tool.js +0 -33
package/skills/thumbgate-feedback/SKILL.md +0 -49

package/scripts/rule-validator.js ADDED Viewed

@@ -0,0 +1,285 @@
+'use strict';
+/**
+ * scripts/rule-validator.js
+ *
+ * Pre-promotion validation harness for synthesized prevention rules.
+ *
+ * Why this exists:
+ *   Before this module, `synthesizePreventionRule` (lesson-synthesis.js) auto-
+ *   promoted any lesson that hit the occurrence threshold straight into
+ *   `synthesized-rules.jsonl` — no check that the proposed rule actually
+ *   matches the mistake pattern it was synthesized from, and no check that
+ *   it doesn't also fire on recent positive-signal events from overlapping
+ *   tags. That's the exact failure mode Autogenesis
+ *   (https://arxiv.org/abs/2604.15034) calls out: candidate improvements
+ *   must be validated through testing before integration, otherwise static
+ *   agents accumulate self-contradicting rules that degrade precision.
+ *
+ *   We already had 3 of the 4 Autogenesis phases:
+ *     - capability-gap identification (negative feedback events),
+ *     - candidate generation (synthesizePreventionRule),
+ *     - integration (append to synthesized-rules.jsonl).
+ *   The missing phase was validation. This module fills it.
+ *
+ * Validation contract:
+ *   A proposed rule is promotable iff:
+ *     1. It matches the seed lesson that triggered promotion (otherwise the
+ *        rule is tautologically broken — it wouldn't catch the mistake it
+ *        was built for).
+ *     2. Its precision on a recent-events sample clears a threshold
+ *        (default 0.8) — of the events the rule fires on, most must carry
+ *        the negative signal. A rule that blocks positive outcomes too is
+ *        a regression, not a prevention.
+ *
+ *   Recall is reported for operator visibility but does not gate
+ *   promotion — an overly specific rule is less harmful than an overly
+ *   broad one.
+ *
+ * Design notes:
+ *   - Pure functions, no IO. Caller supplies the event samples so tests
+ *     stay hermetic and the validator can run inside captureFeedback
+ *     without reaching for the filesystem.
+ *   - Token matching is deliberately simple (lowercase, punctuation strip,
+ *     length-2+ tokens, light suffix stemming, every trigger token present
+ *     in order) so the behavior is debuggable from the console. We are not
+ *     competing with NLP — we are gating a one-line trigger string against
+ *     a handful of sibling events.
+ */
+// Intentionally tiny stop list — we only drop noise that would erase the
+// trigger's discriminative tokens. If a stop-word-only rule ever matches a
+// positive event, that's a real false positive and we want to see it.
+// tokenize() checks this list against raw lowercase tokens, i.e. before
+// stemming, so entries here are written in their unstemmed form.
+const STOP = new Set([
+  'a', 'an', 'the', 'to', 'of', 'in', 'on', 'at', 'for', 'and', 'or',
+  'is', 'are', 'was', 'were', 'be', 'do', 'does', 'did',
+  'this', 'that', 'these', 'those',
+  'it', 'its', 'i', 'you', 'we', 'they',
+]);
+// Modality / negation words that `synthesizePreventionRule` commonly
+// inherits from lesson titles like "MISTAKE: never force-push". We want
+// these tokens to survive ordinary tokenize() output (they're legitimate
+// English), but we strip them from a rule's trigger before matching so
+// the rule still fires on events that describe the mistake without
+// echoing the modality. They remain meaningful in haystack positions.
+// NOTE: tokenize() returns stemmed tokens and stem('always') is 'alway',
+// so any lookup of tokenize() output in this set has to account for
+// stemming — the entries below are stored unstemmed.
+const TRIGGER_MODALITY = new Set(['never', 'always', 'ever', 'must', 'not', 'no']);
+/**
+ * Light suffix stripper used by tokenize() so that simple inflections
+ * ("pushed", "pushes", "pushing") collapse onto the same token. This is
+ * intentionally far short of a real stemmer: it applies at most one rule.
+ *
+ * Rules, tried in order (first hit wins):
+ *   - tokens of 3 characters or fewer are returned untouched;
+ *   - "ing" is removed from tokens longer than 5 characters;
+ *   - "ed" / "es" are removed from tokens longer than 4 characters;
+ *   - a single trailing "s" is removed unless the token ends in "ss".
+ *
+ * The output is not guaranteed to be a dictionary word ("goes" → "goe",
+ * "files" → "fil").
+ *
+ * @param {string} token - Lowercase token.
+ * @returns {string} The token with at most one suffix removed.
+ */
+function stem(token) {
+  const size = token.length;
+  if (size <= 3) return token;
+  // [suffix, token must be strictly longer than this to be stripped]
+  const suffixRules = [['ing', 5], ['ed', 4], ['es', 4]];
+  for (const [suffix, floor] of suffixRules) {
+    if (size > floor && token.endsWith(suffix)) {
+      return token.slice(0, -suffix.length);
+    }
+  }
+  const looksPlural = token.endsWith('s') && !token.endsWith('ss');
+  return looksPlural ? token.slice(0, -1) : token;
+}
+/**
+ * Turn free text into the normalized token list used by the matcher.
+ *
+ * The input is stringified and lowercased, every character outside
+ * [a-z0-9] and whitespace becomes a space, and the result is split on
+ * whitespace. Tokens of length 0 or 1 and tokens in STOP are discarded;
+ * the survivors are passed through stem().
+ *
+ * @param {*} text - Any value; null and undefined yield an empty list.
+ * @returns {string[]} Stemmed tokens in their original order.
+ */
+function tokenize(text) {
+  if (text === null || text === undefined) return [];
+  const words = String(text)
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, ' ')
+    .split(/\s+/);
+  const kept = [];
+  for (const word of words) {
+    // Stop words are checked on the raw token, before stemming.
+    if (word.length <= 1 || STOP.has(word)) continue;
+    kept.push(stem(word));
+  }
+  return kept;
+}
+/**
+ * Concatenate the free-text fields of an event into one searchable string.
+ *
+ * Reads title, content, whatToChange, whatWentWrong, whatWorked and
+ * context (in that order), skips falsy values and joins the rest with a
+ * single space.
+ *
+ * @param {object} event - Feedback event or lesson record.
+ * @returns {string} Joined text, or '' when `event` is not an object.
+ */
+function eventText(event) {
+  if (!event || typeof event !== 'object') return '';
+  const fields = ['title', 'content', 'whatToChange', 'whatWentWrong', 'whatWorked', 'context'];
+  const parts = [];
+  for (const field of fields) {
+    const value = event[field];
+    if (value) parts.push(value);
+  }
+  return parts.join(' ');
+}
+/**
+ * Normalize an event's `signal` field.
+ *
+ * "up"/"positive" map to 'positive' and "down"/"negative" map to
+ * 'negative' (case-insensitive). Any other truthy value is returned as
+ * its lowercased string form.
+ *
+ * @param {object} event - Feedback event.
+ * @returns {string|null} Normalized signal, or null when `event` is not
+ *   an object or its signal is missing/falsy.
+ */
+function eventSignal(event) {
+  const raw = event && typeof event === 'object' ? event.signal : null;
+  if (!raw) return null;
+  const lower = String(raw).toLowerCase();
+  switch (lower) {
+    case 'up':
+    case 'positive':
+      return 'positive';
+    case 'down':
+    case 'negative':
+      return 'negative';
+    default:
+      return lower;
+  }
+}
+/**
+ * Does `rule` fire on `event`? A rule fires when every content token of
+ * its trigger.condition appears in the event's combined text **in the
+ * same relative order** (subsequence match). An empty trigger never fires
+ * — that's a degenerate rule and we want the validator to reject it
+ * rather than silently match everything.
+ *
+ * Order matters because it's the cheapest way to distinguish
+ * "force-push to main caused incident" (trigger condition narrates the
+ * action) from "main branch healthy, no force push" (same tokens, wrong
+ * narrative). Without order we'd flag the second event as a false
+ * positive against every rule built on the same vocabulary.
+ *
+ * Modality words (TRIGGER_MODALITY) are dropped from the trigger before
+ * matching. tokenize() returns stemmed tokens, so the modality words are
+ * stemmed the same way before the comparison; comparing against the raw
+ * list would let "always" (stemmed to "alway") slip through the filter
+ * and force events to echo it.
+ *
+ * @param {object} rule - Rule record; the trigger text is read from
+ *   rule.rule.trigger.condition.
+ * @param {object} event - Event or lesson whose text fields are searched.
+ * @returns {boolean} True when the trigger's content tokens form an
+ *   ordered subsequence of the event's tokens; false for a missing or
+ *   modality-only trigger.
+ */
+function ruleMatches(rule, event) {
+  const trigger = rule && rule.rule && rule.rule.trigger && rule.rule.trigger.condition;
+  // Put the modality list through the same stemmer as the trigger tokens.
+  const modality = new Set([...TRIGGER_MODALITY].map((word) => stem(word)));
+  const tokens = tokenize(trigger).filter((t) => !modality.has(t));
+  if (tokens.length === 0) return false;
+  const haystack = tokenize(eventText(event));
+  // Greedy left-to-right scan: each trigger token must be found at or
+  // after the position following the previous match.
+  let hi = 0;
+  for (const t of tokens) {
+    while (hi < haystack.length && haystack[hi] !== t) hi += 1;
+    if (hi >= haystack.length) return false;
+    hi += 1;
+  }
+  return true;
+}
+/**
+ * Count true-positive / false-positive / false-negative / true-negative
+ * firings on a sample.
+ *
+ * Tags scope the sample: a tagged event is scored only if it shares at
+ * least one tag with `scopeTags` (when given) and with the rule's own
+ * tags (when the rule has any), on the premise that a rule about git
+ * force-push shouldn't be precision-scored against deploy-pipeline events
+ * it was never meant to see. Events that carry no tags cannot be ruled
+ * out and are always scored.
+ *
+ * An empty `scopeTags` list is treated like no scope at all. Without
+ * that, a rule with `tags: []` would exclude every tagged event and the
+ * sample would collapse to untagged events only.
+ *
+ * Entries in `events` that are not objects are skipped instead of
+ * throwing, matching how ruleMatches() and eventSignal() treat them.
+ * Events whose signal is neither positive nor negative are not counted.
+ *
+ * @param {object} rule - Rule record; `rule.tags` is optional.
+ * @param {object[]} events - Candidate events; non-arrays score as empty.
+ * @param {object} [options]
+ * @param {string[]|null} [options.scopeTags=null] - Extra tag scope.
+ * @returns {{tp: number, fp: number, fn: number, tn: number,
+ *   precision: (number|null), recall: (number|null)}} Counts plus
+ *   precision (null when the rule never fired) and recall (null when the
+ *   sample has no negative events).
+ */
+function scoreOnSample(rule, events, { scopeTags = null } = {}) {
+  const lowerTags = (tags) => (Array.isArray(tags) ? tags : [])
+    .filter(Boolean)
+    .map((t) => String(t).toLowerCase());
+  const ruleTags = new Set(lowerTags(rule && rule.tags));
+  const scopeSet = new Set(lowerTags(scopeTags));
+  // Empty scope === unscoped, mirroring the ruleTags.size guard below.
+  const scope = scopeSet.size > 0 ? scopeSet : null;
+  let tp = 0;
+  let fp = 0;
+  let fn = 0;
+  let tn = 0;
+  for (const event of Array.isArray(events) ? events : []) {
+    // Malformed entries carry no signal; skip them rather than crash.
+    if (!event || typeof event !== 'object') continue;
+    const tags = Array.isArray(event.tags)
+      ? event.tags.map((t) => String(t).toLowerCase())
+      : [];
+    // Out-of-scope events are ignored — they have nothing to say about
+    // this rule's precision.
+    if (scope && tags.length > 0 && !tags.some((t) => scope.has(t))) continue;
+    if (ruleTags.size > 0 && tags.length > 0 && !tags.some((t) => ruleTags.has(t))) continue;
+    const fires = ruleMatches(rule, event);
+    const signal = eventSignal(event);
+    if (signal === 'negative' && fires) tp += 1;
+    else if (signal === 'positive' && fires) fp += 1;
+    else if (signal === 'negative' && !fires) fn += 1;
+    else if (signal === 'positive' && !fires) tn += 1;
+  }
+  const firings = tp + fp;
+  const negatives = tp + fn;
+  return {
+    tp,
+    fp,
+    fn,
+    tn,
+    precision: firings > 0 ? tp / firings : null,
+    recall: negatives > 0 ? tp / negatives : null,
+  };
+}
+// Minimum precision a rule must reach on the scored sample to be promoted.
+const DEFAULT_PRECISION_FLOOR = 0.8;
+// Fewest scored events needed before the precision check is applied.
+const DEFAULT_MIN_SAMPLE = 3;
+/**
+ * Decide whether a synthesized rule may be promoted and explain why.
+ *
+ * Checks run in this order; the first one that settles the outcome sets
+ * `reason` and returns:
+ *   1. 'invalid_rule_shape' — `rule` or `rule.rule` is missing (reject).
+ *   2. 'rule_does_not_match_seed_lesson' — no seed lesson was supplied or
+ *      the rule does not fire on it (reject).
+ *   3. 'insufficient_sample' — fewer than `minSample` events were scored;
+ *      promoted anyway so a low-volume deployment is not starved of rules.
+ *   4. 'no_firings_in_sample' — the rule fired on nothing in the sample,
+ *      so precision is undefined; promoted because the seed check held.
+ *   5. 'precision_below_floor' — precision < `precisionFloor` (reject).
+ *   6. 'validated' — everything passed (promote).
+ *
+ * The report is returned in full on every path so the caller can store
+ * it next to the rule and a rejection stays auditable.
+ *
+ * @param {object} rule - Proposed rule record.
+ * @param {object} [options]
+ * @param {object} [options.seedLesson] - Lesson the rule was built from.
+ * @param {object[]} [options.recentEvents=[]] - Events to score against.
+ * @param {number} [options.precisionFloor=DEFAULT_PRECISION_FLOOR]
+ * @param {number} [options.minSample=DEFAULT_MIN_SAMPLE]
+ * @returns {object} Report with shouldPromote, reason, matchesSeed,
+ *   precision, recall, sampleSize and the tp/fp/fn/tn counts.
+ */
+function validateProposedRule(rule, {
+  seedLesson,
+  recentEvents = [],
+  precisionFloor = DEFAULT_PRECISION_FLOOR,
+  minSample = DEFAULT_MIN_SAMPLE,
+} = {}) {
+  const report = {
+    shouldPromote: false,
+    reason: null,
+    matchesSeed: false,
+    precision: null,
+    recall: null,
+    sampleSize: 0,
+    tp: 0,
+    fp: 0,
+    fn: 0,
+    tn: 0,
+  };
+  // Single exit helper: stamp the verdict and hand the report back.
+  const finish = (shouldPromote, reason) => {
+    report.shouldPromote = shouldPromote;
+    report.reason = reason;
+    return report;
+  };
+
+  if (!rule || !rule.rule) return finish(false, 'invalid_rule_shape');
+
+  // Seed invariant: a rule that misses the lesson it came from is broken
+  // no matter what the sample says.
+  report.matchesSeed = Boolean(seedLesson) && ruleMatches(rule, seedLesson);
+  if (!report.matchesSeed) return finish(false, 'rule_does_not_match_seed_lesson');
+
+  // Precision invariant, scored on events scoped by the rule's own tags.
+  const scored = scoreOnSample(rule, recentEvents, { scopeTags: rule.tags });
+  Object.assign(report, scored);
+  report.sampleSize = scored.tp + scored.fp + scored.fn + scored.tn;
+
+  // Too little data to prove harm: promote, but leave a marker for audit.
+  if (report.sampleSize < minSample) return finish(true, 'insufficient_sample');
+  // The rule never fired in scope, so there is no precision to judge.
+  if (report.precision === null) return finish(true, 'no_firings_in_sample');
+  if (report.precision < precisionFloor) return finish(false, 'precision_below_floor');
+  return finish(true, 'validated');
+}
+module.exports = {
+  tokenize,
+  eventText,
+  eventSignal,
+  ruleMatches,
+  scoreOnSample,
+  validateProposedRule,
+  DEFAULT_PRECISION_FLOOR,
+  DEFAULT_MIN_SAMPLE,
+};

package/scripts/scaling-law-claims.js ADDED Viewed

@@ -0,0 +1,60 @@
+#!/usr/bin/env node
+'use strict';
+/**
+ * Coerce a value to a trimmed string.
+ *
+ * @param {*} value - Any value.
+ * @returns {string} '' for null/undefined, otherwise String(value) with
+ *   surrounding whitespace removed.
+ */
+function normalizeText(value) {
+  const missing = value === undefined || value === null;
+  return missing ? '' : String(value).trim();
+}
+/**
+ * Bucket a scaling claim by the vocabulary it uses.
+ *
+ * The claim is normalized and lowercased, then tested against the
+ * categories below in order; the first matching pattern decides, so a
+ * claim that uses both vocabularies is classified as pretraining.
+ *
+ * @param {*} claim - Claim text (any value; stringified).
+ * @returns {string} 'pretraining_scaling', 'feedback_policy_scaling' or
+ *   'general_scaling' when neither pattern matches.
+ */
+function classifyScalingClaim(claim) {
+  const text = normalizeText(claim).toLowerCase();
+  const categories = [
+    ['pretraining_scaling', /\b(pretrain|pretraining|parameters|training tokens|flops|cross entropy|test loss)\b/],
+    ['feedback_policy_scaling', /\b(rl|reinforcement|feedback|dpo|kto|reward|policy|thumbs[-\s]?(up|down)|gate|prevention rule)\b/],
+  ];
+  const hit = categories.find(([, pattern]) => pattern.test(text));
+  return hit ? hit[0] : 'general_scaling';
+}
+/**
+ * Check a scaling claim against the evidence supplied with it.
+ *
+ * The claim is classified with classifyScalingClaim(); each evidence
+ * entry is then scanned for keywords showing it is a held-out eval,
+ * production data, RL compute accounting or a sampling budget. Issues
+ * are raised when the claim's wording calls for evidence that is absent.
+ *
+ * Short keywords (rl, eval, vote, prove, always, never) are anchored on
+ * word boundaries. As bare substrings they also matched "early",
+ * "retrieval", "devote", "improves" and "nevertheless", which raised
+ * issues on claims that made no such statement or credited evidence
+ * that was not an evaluation.
+ *
+ * @param {object} [input]
+ * @param {*} [input.claim] - Claim text.
+ * @param {Array} [input.evidence] - Evidence descriptions; falsy entries
+ *   are ignored and a non-array counts as no evidence.
+ * @returns {{claimType: string, decision: string, issues: string[],
+ *   requiredEvidence: string[]}} `decision` is 'allow' when no issue was
+ *   raised and 'warn' otherwise.
+ */
+function evaluateScalingClaim(input = {}) {
+  // The default parameter only covers undefined; tolerate an explicit null.
+  const source = input || {};
+  const claim = normalizeText(source.claim);
+  const claimType = classifyScalingClaim(claim);
+  const evidence = Array.isArray(source.evidence) ? source.evidence.filter(Boolean) : [];
+  const hasEvidence = (pattern) => evidence.some((entry) => pattern.test(String(entry)));
+  const heldout = hasEvidence(/held[-\s]?out|validation|\beval|ablation|backtest/i);
+  const production = hasEvidence(/production|real user|workflow run|decision journal|blocked action/i);
+  const rlCompute = hasEvidence(/sampling compute|rollout|trajectory|policy update|reward model|rl compute/i);
+  const sampling = hasEvidence(/pass@|best-of-n|majority vote|sample budget|sampling/i);
+  const isFeedbackClaim = claimType === 'feedback_policy_scaling';
+  const issues = [];
+  if (!claim) issues.push('missing_claim');
+  if (isFeedbackClaim && !heldout) {
+    issues.push('missing_heldout_feedback_eval');
+  }
+  if (isFeedbackClaim && /\brl(?:hf|aif)?\b|reinforcement|sampling/i.test(claim) && !rlCompute) {
+    issues.push('missing_rl_compute_evidence');
+  }
+  if (isFeedbackClaim && /sampling|best-of|\bvote|pass@/i.test(claim) && !sampling) {
+    issues.push('missing_sampling_budget_evidence');
+  }
+  if (claimType === 'pretraining_scaling' && evidence.length === 0) {
+    issues.push('missing_model_scaling_evidence');
+  }
+  // "\bprove" still covers prove/proves/proved/proven but not "improve".
+  if (/guarantee|\balways\b|\bnever\b|100%|\bprove/i.test(claim) && !production) {
+    issues.push('absolute_claim_without_production_evidence');
+  }
+  return {
+    claimType,
+    decision: issues.length === 0 ? 'allow' : 'warn',
+    issues,
+    requiredEvidence: isFeedbackClaim
+      ? ['held-out eval', 'ablation or backtest', 'RL/sampling compute budget when claimed', 'decision-journal production sample']
+      : ['source data', 'validation metric', 'scope limits'],
+  };
+}
+module.exports = {
+  classifyScalingClaim,
+  evaluateScalingClaim,
+};

package/scripts/security-scanner.js CHANGED Viewed

@@ -2,7 +2,7 @@
 'use strict';
 /**
- * Security Scanner — OWASP-aware static analysis for PreToolUse gates.
+ * Security Scanner — OWASP-aware static analysis for PreToolUse checks.
  *
  * Scans code being written/edited by AI agents for common vulnerability
  * patterns (injection, XSS, path traversal, etc.) and suspicious dependency

package/scripts/self-distill-agent.js CHANGED Viewed

@@ -349,39 +349,14 @@ Return JSON only, no markdown fences:
 Focus on actionable, specific lessons. Ignore trivial interactions.`;
 async function callAnthropicApi(conversationText, model) {
-  const apiKey = process.env.ANTHROPIC_API_KEY;
-  if (!apiKey) return null;
-  const body = JSON.stringify({
-    model: model || 'claude-sonnet-4-20250514',
-    max_tokens: 2048,
-    system: LLM_SYSTEM_PROMPT,
-    messages: [
-      { role: 'user', content: `Analyze this conversation window and extract lessons:\n\n${conversationText}` },
-    ],
+  const { callClaudeJson, MODELS } = require('./llm-client');
+  return callClaudeJson({
+    model: model || MODELS.SMART,
+    maxTokens: 2048,
+    systemPrompt: LLM_SYSTEM_PROMPT,
+    userPrompt: `Analyze this conversation window and extract lessons:\n\n${conversationText}`,
+    cache: true,
   });
-  try {
-    const resp = await fetch('https://api.anthropic.com/v1/messages', {
-      method: 'POST',
-      headers: {
-        'Content-Type': 'application/json',
-        'x-api-key': apiKey,
-        'anthropic-version': '2023-06-01',
-      },
-      body,
-    });
-    if (!resp.ok) return null;
-    const data = await resp.json();
-    const text = (data.content && data.content[0] && data.content[0].text) || '';
-    // Strip markdown fences if present
-    const cleaned = text.replace(/^```(?:json)?\s*/m, '').replace(/```\s*$/m, '').trim();
-    return JSON.parse(cleaned);
-  } catch {
-    return null;
-  }
 }
 async function generateLlmLessons(conversationWindow, model) {