npm - thumbgate - Versions diffs - 1.14.1 → 1.15.0 - Mend

thumbgate 1.14.1 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/.claude-plugin/marketplace.json +2 -2
package/.claude-plugin/plugin.json +1 -1
package/.well-known/mcp/server-card.json +1 -1
package/README.md +2 -1
package/adapters/claude/.mcp.json +2 -2
package/adapters/mcp/server-stdio.js +8 -1
package/adapters/opencode/opencode.json +1 -1
package/bin/cli.js +54 -0
package/config/enforcement.json +59 -7
package/config/gates/default.json +33 -0
package/config/mcp-allowlists.json +4 -0
package/config/merge-quality-checks.json +2 -1
package/package.json +17 -5
package/public/codex-plugin.html +7 -1
package/public/dashboard.html +23 -2
package/public/index.html +20 -2
package/public/learn.html +39 -0
package/public/lessons.html +25 -1
package/public/numbers.html +271 -0
package/public/pro.html +7 -1
package/scripts/cli-feedback.js +2 -1
package/scripts/cli-schema.js +43 -4
package/scripts/commercial-offer.js +1 -1
package/scripts/contextfs.js +214 -32
package/scripts/feedback-loop.js +49 -5
package/scripts/harness-selector.js +132 -0
package/scripts/lesson-canonical.js +181 -0
package/scripts/lesson-db.js +71 -10
package/scripts/lesson-synthesis.js +23 -2
package/scripts/native-messaging-audit.js +514 -0
package/scripts/pr-manager.js +47 -7
package/scripts/profile-router.js +16 -1
package/scripts/rule-validator.js +285 -0
package/scripts/seo-gsd.js +182 -2
package/scripts/tool-registry.js +12 -0
package/skills/thumbgate/SKILL.md +1 -1
package/src/api/server.js +53 -0
package/.claude-plugin/README.md +0 -170
package/adapters/README.md +0 -12
package/skills/agent-memory/SKILL.md +0 -97
package/skills/solve-architecture-autonomy/SKILL.md +0 -17
package/skills/solve-architecture-autonomy/tool.js +0 -33
package/skills/thumbgate-feedback/SKILL.md +0 -49

package/scripts/lesson-canonical.js ADDED Viewed

@@ -0,0 +1,181 @@
+'use strict';
+/**
+ * scripts/lesson-canonical.js
+ *
+ * Cross-session canonical-form hashing for lessons / memory records.
+ *
+ * Why this exists:
+ *   Before this module, deduplication of promoted lessons relied on:
+ *     1. `findDuplicateMemory()` — exact `sourceFeedbackId` match (catches
+ *        capture-retry races, misses everything else).
+ *     2. `findSimilarLesson()` in lesson-synthesis — Jaccard token overlap
+ *        with a 0.6 threshold on raw title+content (catches near-twins in
+ *        the same session, drifts with rewording).
+ *     3. `findDuplicate()` in lesson-db — exact `LOWER(TRIM(whatToChange))`
+ *        string match plus tag overlap (breaks the moment punctuation,
+ *        pronouns, or articles differ).
+ *
+ *   All three are first-pass filters. None normalize the text before
+ *   comparing it, so the same root cause promoted twice by two different
+ *   worktrees (e.g. "Never force-push to main!!" vs "never force push main")
+ *   survives as two lessons, inflates occurrence counters, and distorts
+ *   the Bayes-optimal gate's base-rate calibration.
+ *
+ *   This module provides a stable cross-session content signature by:
+ *     - Lowercasing and stripping punctuation,
+ *     - Removing a small stop-word list,
+ *     - Collapsing whitespace,
+ *     - Light plural stemming (trailing 's' where safe),
+ *     - Hashing a deterministic join of the normalized whatToChange /
+ *       content / title fields together with a sorted tag list.
+ *
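+ *   Two lessons that differ only in casing, punctuation, stop words, word
+ *   order, or trailing plurals collapse to the same hash; lessons that
+ *   differ in substance or tags do not. Rewordings that change a content
+ *   word (e.g. "Don't" vs "never") still produce different hashes.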
+ *
+ * Design notes:
+ *   - Pure functions, no IO.
+ *   - SHA-256 via node:crypto keeps the signature short and safe to log.
+ *   - `findCanonicalDuplicate` is O(N) over the memory log, which is
+ *     fine at our scale (hundreds to low thousands of entries).
+ */
+const crypto = require('node:crypto');
+// Small English stop-word list, consulted by canonicalizeText. Intentionally conservative — the goal is
+// to defeat trivial wording drift, not to paraphrase every sentence.
+const STOP_WORDS = new Set([
+  'a', 'an', 'the', 'this', 'that', 'these', 'those',
+  'is', 'are', 'was', 'were', 'be', 'been', 'being',
+  'do', 'does', 'did', 'done', 'doing',
+  'have', 'has', 'had',
+  'i', 'you', 'we', 'they', 'he', 'she', 'it',
+  'my', 'your', 'our', 'their', 'his', 'her', 'its',
+  'and', 'or', 'but', 'so', 'if', 'then', 'than', 'because',
+  'of', 'in', 'on', 'at', 'to', 'for', 'with', 'from', 'by',
+  'not', 'no', // NOTE(review): filtering these discards negation ("do not push" and "do push" canonicalize alike) — confirm intended
+]);
+/**
+ * Canonicalize a free-form string to a stable form that survives cosmetic
+ * rewrites (casing, punctuation, stop words, word order, trailing plurals).
+ *
+ * Pipeline, in order:
+ *   1. Coerce to string, lowercase, and replace every character other than
+ *      a-z, 0-9 and whitespace with a space. Apostrophes, hyphens and
+ *      underscores therefore split words ("don't" becomes "don" + "t"),
+ *      and non-ASCII letters are removed entirely.
+ *   2. Split on whitespace and discard empty tokens.
+ *   3. Drop single-character tokens and anything in STOP_WORDS.
+ *   4. Strip one trailing 's' from tokens of 4+ characters that do not end
+ *      in 'ss'. This also trims non-plurals ("status" becomes "statu"),
+ *      which is applied to both sides of any comparison.
+ *   5. Sort the tokens and join them with single spaces. Duplicate tokens
+ *      are kept, not collapsed.
+ *
+ * @param {*} input - Value to canonicalize; null and undefined yield ''.
+ * @returns {string} Sorted lowercase tokens separated by single spaces, or
+ *   '' when no token survives filtering.
+ */
+function canonicalizeText(input) {
+  if (input === null || input === undefined) return '';
+  const raw = String(input);
+  // 1. Lowercase, then blank out everything except a-z, 0-9 and whitespace.
+  const stripped = raw.toLowerCase().replace(/[^a-z0-9\s]/g, ' ');
+  // 2. Tokenize on whitespace, drop empties.
+  const tokens = stripped.split(/\s+/).filter(Boolean);
+  // 3. Drop single-character tokens and stop words.
+  const content = tokens.filter((t) => t.length > 1 && !STOP_WORDS.has(t));
+  // 4. Light singularize: drop trailing 's' from >=4-char tokens not ending
+  //    in 'ss' (e.g. "rules" → "rule", but "pass" stays "pass").
+  const stemmed = content.map((t) => {
+    if (t.length >= 4 && t.endsWith('s') && !t.endsWith('ss')) {
+      return t.slice(0, -1);
+    }
+    return t;
+  });
+  // 5. Sort to make the signature order-invariant for bag-of-words dedup.
+  //    Two lessons that discuss the same tokens in different sentence order
+  //    must collapse. This loses sequence signal but our target is dedup,
+  //    not classification. The explicit comparator addresses SonarCloud S2871.
+  //    NOTE(review): localeCompare is called without a locale argument, so the
+  //    order presumably follows the host default locale — confirm hashes match across hosts.
+  stemmed.sort((a, b) => a.localeCompare(b));
+  return stemmed.join(' ');
+}
+function normalizeTags(tags) { // Returns a de-duplicated, sorted array of trimmed, lowercased tag strings.
+  if (!Array.isArray(tags)) return []; // anything that is not an array is treated as "no tags"
+  return [...new Set(
+    tags
+      .map((t) => String(t || '').trim().toLowerCase()) // falsy entries (null, undefined, '', 0) become ''
+      .filter(Boolean), // ...and empty strings are discarded before de-duplication
+  )].sort((a, b) => a.localeCompare(b)); // NOTE(review): no locale argument — ordering presumably follows the host default locale; confirm
+}
+/**
+ * Build a stable content signature for a lesson / memory record.
+ *
+ * Pulls whichever of the following fields are truthy, in this order:
+ *   - whatToChange, whatWentWrong, whatWorked  (feedback-loop schema)
+ *   - title, content                           (memory-log.jsonl schema)
+ *   - context                                  (capture-feedback schema)
+ *
+ * All fields are joined with single spaces into one blob and canonicalized
+ * once, so a record that stores its content under `whatToChange` yields the
+ * same signature as one that surfaces the same text under `content`.
+ * Cross-schema dedup matters because feedback-loop and capture-feedback
+ * write slightly different shapes for the same underlying lesson.
+ *
+ * Because the canonical tokens are sorted but not de-duplicated, a record
+ * that repeats the same text in two fields yields a different signature
+ * from a record that holds that text only once.
+ *
+ * The normalized tag list is joined with ',' and appended after a '::'
+ * separator, so two lessons with identical text but different tags remain
+ * distinct. The record's `signal` is not part of the signature.
+ *
+ * @param {object} lesson - Lesson / memory record to sign.
+ * @returns {string} `<canonical text>::<comma-joined tags>`, or '' when
+ *   `lesson` is falsy, is not an object, or has no text tokens left after
+ *   canonicalization (this holds even when tags are present).
+ */
+function lessonCanonicalSignature(lesson) {
+  if (!lesson || typeof lesson !== 'object') return '';
+  const blob = [
+    lesson.whatToChange,
+    lesson.whatWentWrong,
+    lesson.whatWorked,
+    lesson.title,
+    lesson.content,
+    lesson.context,
+  ].filter(Boolean).join(' '); // skip missing/empty fields so they add no stray tokens
+  const textSig = canonicalizeText(blob);
+  const tagSig = normalizeTags(lesson.tags).join(',');
+  return textSig ? `${textSig}::${tagSig}` : ''; // tags alone never produce a signature
+}
+/**
+ * Short deterministic hash of a lesson's canonical signature: the first 16
+ * hex characters (64 bits) of the SHA-256 digest of the string returned by
+ * lessonCanonicalSignature. Returns null when the record carries no
+ * normalized content (all fields empty) — hashing an empty string would
+ * create a "dedup magnet" that collapses all content-free records
+ * together, which is worse than no dedup at all.
+ *
+ * @param {object} lesson - Lesson / memory record to hash.
+ * @returns {string|null} 16-character lowercase hex string, or null when
+ *   the canonical signature is empty.
+ */
+function canonicalHash(lesson) {
+  const sig = lessonCanonicalSignature(lesson);
+  if (!sig) return null; // empty signature: refuse to hash rather than emit a shared value
+  return crypto.createHash('sha256').update(sig).digest('hex').slice(0, 16);
+}
+/**
+ * Scan a list of existing lesson records for one whose canonical hash
+ * matches `lesson`. Returns the first match in array order, or null. The
+ * existing record's stored `canonicalHash` field is preferred; absent
+ * that, the hash is recomputed on the fly so this works against legacy
+ * entries.
+ *
+ * Signal filter: when `lesson.signal` is present, an entry is skipped only
+ * if it carries a signal of its own that differs (compared as lowercased
+ * strings) — a positive lesson about "force-push" must not merge with a
+ * negative lesson about the same action. Entries that have no `signal`
+ * are still eligible to match.
+ *
+ * NOTE(review): a stored `canonicalHash` is compared as-is and is never
+ * re-derived from the entry's text, so it is presumably expected to have
+ * been produced by this same canonicalization — confirm against writers.
+ *
+ * @param {Array<object>} memoryEntries - Existing records to search.
+ *   Entries that are falsy or not objects are ignored.
+ * @param {object} lesson - Incoming record to look up.
+ * @returns {object|null} The first matching entry, or null when
+ *   `memoryEntries` is not a non-empty array, `lesson` has no canonical
+ *   hash, or nothing matches.
+ */
+function findCanonicalDuplicate(memoryEntries, lesson) {
+  if (!Array.isArray(memoryEntries) || memoryEntries.length === 0) return null;
+  const hash = canonicalHash(lesson);
+  if (!hash) return null; // a non-null hash also guarantees `lesson` is an object below
+  const signalFilter = lesson.signal ? String(lesson.signal).toLowerCase() : null;
+  for (const entry of memoryEntries) {
+    if (!entry || typeof entry !== 'object') continue;
+    const entrySignal = entry.signal ? String(entry.signal).toLowerCase() : null;
+    if (signalFilter && entrySignal && entrySignal !== signalFilter) continue; // only a known, conflicting signal excludes an entry
+    const entryHash = entry.canonicalHash || canonicalHash(entry);
+    if (entryHash && entryHash === hash) {
+      return entry;
+    }
+  }
+  return null;
+}
+module.exports = {
+  canonicalizeText,
+  normalizeTags,
+  lessonCanonicalSignature,
+  canonicalHash,
+  findCanonicalDuplicate,
+  STOP_WORDS,
+};

package/scripts/lesson-db.js CHANGED Viewed

@@ -188,9 +188,11 @@ function upsertLesson(db, feedbackEvent, memoryRecord) {
   const skill = feedbackEvent.skill || null;
   const whatToChange = feedbackEvent.whatToChange || null;
-  // Rule 2: dedup — if an existing lesson has the same whatToChange and shares tags, skip
+  // Rule 2: dedup — if an existing lesson has the same whatToChange and shares tags, skip.
+  // Passes the feedback event + memoryRecord through so findDuplicate can fall back to
+  // canonical-hash matching when punctuation/wording drift breaks the exact string path.
   if (whatToChange && whatToChange.trim()) {
-    const duplicate = findDuplicate(db, whatToChange, tags);
+    const duplicate = findDuplicate(db, whatToChange, tags, { feedbackEvent, memoryRecord, signal });
     if (duplicate) {
       // Bump importance if the new one is higher priority
       const PRIORITY = { critical: 4, high: 3, medium: 2, low: 1 };
@@ -231,23 +233,82 @@ function upsertLesson(db, feedbackEvent, memoryRecord) {
 /**
  * Find an existing lesson with identical whatToChange and overlapping tags.
  * Returns the existing row or null.
+ *
+ * Two-layer match:
+ *   1. Exact case-insensitive text match on `whatToChange` + tag overlap.
+ *      This is the original behavior — fast, index-friendly, catches verbatim
+ *      re-captures of the same feedback from the same session.
+ *   2. Canonical-hash fallback (optional). When the caller passes `opts` with
+ *      `feedbackEvent`/`memoryRecord`, we compute the incoming record's cross-
+ *      session canonical hash and scan recent lessons of the same signal.
+ *      This defeats the common drift cases the exact path misses: punctuation
+ *      changes, stop-word edits, casing, and trailing plurals.
+ *
+ * The fallback is gated on `opts` so existing callers that only have
+ * `(db, whatToChange, tags)` still work unchanged.
  */
-function findDuplicate(db, whatToChange, tags) {
+function findDuplicate(db, whatToChange, tags, opts = null) {
   if (!whatToChange || !whatToChange.trim()) return null;
-  // Exact match on whatToChange text (normalized)
+  // Layer 1: exact match on whatToChange text (normalized)
   const normalized = whatToChange.trim().toLowerCase();
   const candidates = db.prepare(
     `SELECT id, importance, tags FROM lessons WHERE LOWER(TRIM(whatToChange)) = ?`,
   ).all(normalized);
-  if (candidates.length === 0) return null;
+  if (candidates.length > 0) {
+    for (const c of candidates) {
+      if (tags.length === 0) return c; // no tags to compare = text match is enough
+      const cTags = safeParseTags(c.tags);
+      if (tags.some((t) => cTags.includes(t))) return c;
+    }
+  }
-  // If any candidate shares at least one tag, it's a duplicate
-  for (const c of candidates) {
-    if (tags.length === 0) return c; // no tags to compare = text match is enough
-    const cTags = safeParseTags(c.tags);
-    if (tags.some((t) => cTags.includes(t))) return c;
+  // Layer 2: canonical-hash fallback. Only runs when the caller supplied a
+  // full record so we have title/content/whatWentWrong available — scanning
+  // just `whatToChange` would miss records promoted under a different schema.
+  if (opts && (opts.feedbackEvent || opts.memoryRecord)) {
+    try {
+      const { canonicalHash } = require('./lesson-canonical');
+      // Build a synthetic lesson record from whatever the caller passed so the
+      // canonical hasher sees the same signature findCanonicalDuplicate uses.
+      const incoming = {
+        ...(opts.memoryRecord || {}),
+        whatToChange: opts.feedbackEvent?.whatToChange || opts.memoryRecord?.whatToChange || whatToChange,
+        whatWentWrong: opts.feedbackEvent?.whatWentWrong || opts.memoryRecord?.whatWentWrong || null,
+        whatWorked: opts.feedbackEvent?.whatWorked || opts.memoryRecord?.whatWorked || null,
+        tags,
+        signal: opts.signal || opts.memoryRecord?.signal || null,
+      };
+      const incomingHash = canonicalHash(incoming);
+      if (incomingHash) {
+        // Scan lessons of the same signal. Tags differ across schemas so we
+        // canonical-match row-by-row rather than hoping for a JSON array match.
+        const signalFilter = opts.signal || null;
+        const rows = signalFilter
+          ? db.prepare(
+              `SELECT id, importance, tags, whatToChange, whatWentWrong, whatWorked
+                 FROM lessons WHERE signal = ? AND pruned = 0`,
+            ).all(signalFilter)
+          : db.prepare(
+              `SELECT id, importance, tags, whatToChange, whatWentWrong, whatWorked
+                 FROM lessons WHERE pruned = 0`,
+            ).all();
+        for (const row of rows) {
+          const rowRecord = {
+            whatToChange: row.whatToChange,
+            whatWentWrong: row.whatWentWrong,
+            whatWorked: row.whatWorked,
+            tags: safeParseTags(row.tags),
+          };
+          if (canonicalHash(rowRecord) === incomingHash) {
+            return { id: row.id, importance: row.importance, tags: row.tags };
+          }
+        }
+      }
+    } catch (_canonErr) {
+      // Canonical fallback is best-effort — never break the upsert path.
+    }
   }
   return null;

package/scripts/lesson-synthesis.js CHANGED Viewed

@@ -1,6 +1,7 @@
 'use strict';
 const fs = require('fs');
 const path = require('path');
+const { canonicalHash, findCanonicalDuplicate } = require('./lesson-canonical');
 const SIMILARITY_THRESHOLD = 0.6;
 const AUTO_PROMOTE_THRESHOLD = 3;
@@ -34,10 +35,29 @@ function appendJSONLLocal(filePath, record) {
 /**
  * Find a similar existing lesson by comparing titles and context.
- * Uses token overlap (Jaccard similarity) — fast, no embeddings needed.
+ *
+ * Two-layer dedup:
+ *   1. Canonical-hash match (cross-session). Punctuation, casing, and
+ *      stop-word drift is normalized away, so "Never force-push to main!!"
+ *      and "never force push main" collapse to the same hash. When a hash
+ *      matches, similarity is reported as 1.0 and we skip the Jaccard pass.
+ *   2. Jaccard token overlap (legacy within-session path). Catches
+ *      rewordings that survive canonicalization (new keywords, different
+ *      root verb) above the 0.6 threshold.
+ *
+ * The canonical pass runs first because it's O(N) with constant work per
+ * entry and rejects trivial duplicates before we pay the Jaccard price.
  */
 function findSimilarLesson(memoryLogPath, newRecord) {
   const existing = readJSONLLocal(memoryLogPath, { maxLines: 200 });
+  // Layer 1: canonical-hash exact match (normalization-invariant).
+  const canonicalMatch = findCanonicalDuplicate(existing, newRecord);
+  if (canonicalMatch) {
+    return { match: canonicalMatch, similarity: 1, matchType: 'canonical' };
+  }
+  // Layer 2: Jaccard token overlap (original behavior).
   const newTokens = tokenize(newRecord.title + ' ' + (newRecord.content || ''));
   let bestMatch = null;
@@ -52,7 +72,7 @@ function findSimilarLesson(memoryLogPath, newRecord) {
     }
   }
-  return bestMatch ? { match: bestMatch, similarity: bestScore } : null;
+  return bestMatch ? { match: bestMatch, similarity: bestScore, matchType: 'jaccard' } : null;
 }
 /**
@@ -191,6 +211,7 @@ module.exports = {
   jaccardSimilarity,
   tokenize,
   inferScopeFromTags,
+  canonicalHash,
   SIMILARITY_THRESHOLD,
   AUTO_PROMOTE_THRESHOLD,
 };