thumbgate 1.14.1 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.well-known/mcp/server-card.json +1 -1
  4. package/README.md +2 -1
  5. package/adapters/claude/.mcp.json +2 -2
  6. package/adapters/mcp/server-stdio.js +8 -1
  7. package/adapters/opencode/opencode.json +1 -1
  8. package/bin/cli.js +54 -0
  9. package/config/enforcement.json +59 -7
  10. package/config/gates/default.json +33 -0
  11. package/config/mcp-allowlists.json +4 -0
  12. package/config/merge-quality-checks.json +2 -1
  13. package/package.json +17 -5
  14. package/public/codex-plugin.html +7 -1
  15. package/public/dashboard.html +23 -2
  16. package/public/index.html +20 -2
  17. package/public/learn.html +39 -0
  18. package/public/lessons.html +25 -1
  19. package/public/numbers.html +271 -0
  20. package/public/pro.html +7 -1
  21. package/scripts/cli-feedback.js +2 -1
  22. package/scripts/cli-schema.js +43 -4
  23. package/scripts/commercial-offer.js +1 -1
  24. package/scripts/contextfs.js +214 -32
  25. package/scripts/feedback-loop.js +49 -5
  26. package/scripts/harness-selector.js +132 -0
  27. package/scripts/lesson-canonical.js +181 -0
  28. package/scripts/lesson-db.js +71 -10
  29. package/scripts/lesson-synthesis.js +23 -2
  30. package/scripts/native-messaging-audit.js +514 -0
  31. package/scripts/pr-manager.js +47 -7
  32. package/scripts/profile-router.js +16 -1
  33. package/scripts/rule-validator.js +285 -0
  34. package/scripts/seo-gsd.js +182 -2
  35. package/scripts/tool-registry.js +12 -0
  36. package/skills/thumbgate/SKILL.md +1 -1
  37. package/src/api/server.js +53 -0
  38. package/.claude-plugin/README.md +0 -170
  39. package/adapters/README.md +0 -12
  40. package/skills/agent-memory/SKILL.md +0 -97
  41. package/skills/solve-architecture-autonomy/SKILL.md +0 -17
  42. package/skills/solve-architecture-autonomy/tool.js +0 -33
  43. package/skills/thumbgate-feedback/SKILL.md +0 -49
@@ -0,0 +1,181 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * scripts/lesson-canonical.js
5
+ *
6
+ * Cross-session canonical-form hashing for lessons / memory records.
7
+ *
8
+ * Why this exists:
9
+ * Before this module, deduplication of promoted lessons relied on:
10
+ * 1. `findDuplicateMemory()` — exact `sourceFeedbackId` match (catches
11
+ * capture-retry races, misses everything else).
12
+ * 2. `findSimilarLesson()` in lesson-synthesis — Jaccard token overlap
13
+ * with a 0.6 threshold on raw title+content (catches near-twins in
14
+ * the same session, drifts with rewording).
15
+ * 3. `findDuplicate()` in lesson-db — exact `LOWER(TRIM(whatToChange))`
16
+ * string match plus tag overlap (breaks the moment punctuation,
17
+ * pronouns, or articles differ).
18
+ *
19
+ * All three are first-pass filters. None normalize the text before
20
+ * hashing, so the same root-cause promoted twice by two different
21
+ * worktrees (e.g. "Don't force-push main." vs "never force push main!!")
22
+ * survives as two lessons, inflates occurrences counters, and distorts
23
+ * the Bayes-optimal gate's base-rate calibration.
24
+ *
25
+ * This module provides a stable cross-session content signature by:
26
+ * - Lowercasing and stripping punctuation,
27
+ * - Removing a small stop-word list,
28
+ * - Collapsing whitespace,
29
+ * - Light plural stemming (trailing 's' where safe),
30
+ * - Hashing a deterministic join of the normalized whatToChange /
31
+ * content / title fields together with a sorted tag list.
32
+ *
33
+ * Two lessons that differ only in phrasing collapse to the same hash;
34
+ * lessons that differ in substance or tags do not.
35
+ *
36
+ * Design notes:
37
+ * - Pure functions, no IO.
38
+ * - SHA-256 via node:crypto keeps the signature short and safe to log.
39
+ * - `findCanonicalDuplicate` is O(N) over the memory log, which is
40
+ * fine at our scale (hundreds to low thousands of entries).
41
+ */
42
+
43
+ const crypto = require('node:crypto');
44
+
45
// Small English stop-word list. Intentionally conservative — the goal is
// to defeat trivial wording drift, not to paraphrase every sentence.
// Negations ('not', 'no', 'never') are dropped on purpose: polarity is
// carried by the record's `signal`, not by the text signature.
const STOP_WORDS = new Set([
  'a', 'an', 'the', 'this', 'that', 'these', 'those',
  'is', 'are', 'was', 'were', 'be', 'been', 'being',
  'do', 'does', 'did', 'done', 'doing',
  'have', 'has', 'had',
  'i', 'you', 'we', 'they', 'he', 'she', 'it',
  'my', 'your', 'our', 'their', 'his', 'her', 'its',
  'and', 'or', 'but', 'so', 'if', 'then', 'than', 'because',
  'of', 'in', 'on', 'at', 'to', 'for', 'with', 'from', 'by',
  'not', 'no', 'never',
]);

/**
 * Order two strings by UTF-16 code units.
 *
 * Unlike `String.prototype.localeCompare` without an explicit locale, the
 * result does not depend on the host's default locale or ICU data, so the
 * sorted token list (and therefore the hash built from it) is identical on
 * every machine.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} -1, 0 or 1.
 */
function compareCodeUnits(a, b) {
  if (a < b) return -1;
  if (a > b) return 1;
  return 0;
}

/**
 * Canonicalize a free-form string to a stable form that survives cosmetic
 * rewrites. Returns a single lowercase token string separated by spaces.
 *
 * Pipeline: lowercase → fold accents → expand "n't" contractions → strip
 * punctuation → drop stop words and 1-char tokens → light singularize →
 * sort tokens.
 *
 * @param {*} input Any value; null/undefined yield ''. Other values are
 *   coerced with String().
 * @returns {string} Space-joined, sorted, normalized tokens ('' when no
 *   content token survives).
 */
function canonicalizeText(input) {
  if (input === null || input === undefined) return '';

  // 1. Lowercase, then fold accented Latin letters to their base letter
  //    (NFKD splits "é" into "e" + combining mark; the mark is removed) so
  //    "café" and "cafe" agree instead of the word being cut at the accent.
  const folded = String(input)
    .toLowerCase()
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '');

  // 2. Expand "n't" contractions ("don't" → "do not") BEFORE punctuation is
  //    stripped. Otherwise the apostrophe splits the word into a stray
  //    "don" token plus a discarded "t", and "Don't X" never matches
  //    "never X" / "do not X".
  const expanded = folded.replace(/n['\u2019]t(?![\p{L}\p{N}])/gu, ' not');

  // 3. Strip punctuation: keep letters, marks and digits of any script plus
  //    whitespace. Underscores and hyphens become separators.
  const stripped = expanded.replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ');

  // 4. Tokenize on whitespace, drop empties, stop words and 1-char tokens.
  const content = stripped
    .split(/\s+/)
    .filter((t) => t.length > 1 && !STOP_WORDS.has(t));

  // 5. Light singularize: drop trailing 's' from >=4-char tokens not ending
  //    in 'ss' (e.g. "rules" → "rule", but "pass" stays "pass").
  const stemmed = content.map((t) => (
    t.length >= 4 && t.endsWith('s') && !t.endsWith('ss') ? t.slice(0, -1) : t
  ));

  // 6. Sort to make the signature order-invariant for bag-of-words dedup.
  //    This loses sequence signal, but the target is dedup, not
  //    classification. The comparator is locale-independent on purpose:
  //    a locale-sensitive sort would let two hosts hash the same lesson
  //    differently.
  stemmed.sort(compareCodeUnits);
  return stemmed.join(' ');
}
89
+
90
/**
 * Normalize a tag list for hashing: stringify, trim, lowercase, drop empty
 * or falsy entries, de-duplicate, and sort.
 *
 * The sort compares UTF-16 code units rather than using `localeCompare`,
 * whose ordering follows the host's default locale. Because the sorted
 * list is joined into the canonical signature, a locale-sensitive order
 * would make the same tag set hash differently on different machines.
 *
 * @param {*} tags Expected to be an array; anything else yields [].
 * @returns {string[]} Unique, lowercase tags in code-unit order.
 */
function normalizeTags(tags) {
  if (!Array.isArray(tags)) return [];

  const unique = new Set();
  for (const tag of tags) {
    const cleaned = String(tag || '').trim().toLowerCase();
    if (cleaned) unique.add(cleaned);
  }

  return [...unique].sort((a, b) => {
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
  });
}
98
+
99
/**
 * Produce the canonical content signature of a lesson / memory record.
 *
 * Text is gathered from whichever of these fields are truthy, in this
 * order:
 *   - whatToChange, whatWentWrong, whatWorked
 *   - title, content
 *   - context
 *
 * The gathered values are joined with single spaces and canonicalized in
 * one pass, so the same text hashes the same regardless of which of the
 * fields carried it. The normalized tag list is appended after a `::`
 * separator, which keeps records with identical text but different tags
 * distinct.
 *
 * @param {object} lesson Record to sign.
 * @returns {string} `<text>::<tags>`, or '' when `lesson` is not an object
 *   or no text survives canonicalization.
 */
function lessonCanonicalSignature(lesson) {
  if (!lesson || typeof lesson !== 'object') return '';

  const textFields = [
    'whatToChange',
    'whatWentWrong',
    'whatWorked',
    'title',
    'content',
    'context',
  ];
  const parts = [];
  for (const field of textFields) {
    const value = lesson[field];
    if (value) parts.push(value);
  }

  const textSig = canonicalizeText(parts.join(' '));
  const tagSig = normalizeTags(lesson.tags).join(',');
  if (!textSig) return '';
  return `${textSig}::${tagSig}`;
}
130
+
131
/**
 * Short deterministic hash of a lesson's canonical signature: the first
 * 16 hex characters (64 bits) of its SHA-256 digest.
 *
 * Returns null when the signature is empty. Hashing an empty signature
 * would give every content-free record the same hash and merge them all,
 * which is worse than not deduplicating them.
 *
 * @param {object} lesson Record to hash.
 * @returns {string|null} 16-character hex string, or null.
 */
function canonicalHash(lesson) {
  const signature = lessonCanonicalSignature(lesson);
  if (signature) {
    const digest = crypto.createHash('sha256').update(signature).digest('hex');
    return digest.slice(0, 16);
  }
  return null;
}
143
+
144
/**
 * Return the first record in `memoryEntries` whose canonical hash equals
 * that of `lesson`, or null when there is none.
 *
 * An entry's stored `canonicalHash` field is used when present; otherwise
 * the hash is recomputed from the entry so legacy records still match.
 *
 * Signal filter: when both `lesson.signal` and the entry's `signal` are
 * set, they must be equal (case-insensitive) for the entry to be
 * considered. Entries without a signal are not filtered out.
 *
 * @param {Array<object>} memoryEntries Existing records to scan.
 * @param {object} lesson Incoming record.
 * @returns {object|null} The matching entry, or null.
 */
function findCanonicalDuplicate(memoryEntries, lesson) {
  if (!Array.isArray(memoryEntries) || memoryEntries.length === 0) return null;

  const targetHash = canonicalHash(lesson);
  if (!targetHash) return null;

  const wantedSignal = lesson.signal ? String(lesson.signal).toLowerCase() : null;

  const match = memoryEntries.find((candidate) => {
    if (!candidate || typeof candidate !== 'object') return false;

    const candidateSignal = candidate.signal
      ? String(candidate.signal).toLowerCase()
      : null;
    const signalConflict = Boolean(wantedSignal)
      && Boolean(candidateSignal)
      && candidateSignal !== wantedSignal;
    if (signalConflict) return false;

    const candidateHash = candidate.canonicalHash || canonicalHash(candidate);
    return Boolean(candidateHash) && candidateHash === targetHash;
  });

  return match ?? null;
}
173
+
174
+ module.exports = {
175
+ canonicalizeText,
176
+ normalizeTags,
177
+ lessonCanonicalSignature,
178
+ canonicalHash,
179
+ findCanonicalDuplicate,
180
+ STOP_WORDS,
181
+ };
@@ -188,9 +188,11 @@ function upsertLesson(db, feedbackEvent, memoryRecord) {
188
188
  const skill = feedbackEvent.skill || null;
189
189
  const whatToChange = feedbackEvent.whatToChange || null;
190
190
 
191
- // Rule 2: dedup — if an existing lesson has the same whatToChange and shares tags, skip
191
+ // Rule 2: dedup — if an existing lesson has the same whatToChange and shares tags, skip.
192
+ // Passes the feedback event + memoryRecord through so findDuplicate can fall back to
193
+ // canonical-hash matching when punctuation/wording drift breaks the exact string path.
192
194
  if (whatToChange && whatToChange.trim()) {
193
- const duplicate = findDuplicate(db, whatToChange, tags);
195
+ const duplicate = findDuplicate(db, whatToChange, tags, { feedbackEvent, memoryRecord, signal });
194
196
  if (duplicate) {
195
197
  // Bump importance if the new one is higher priority
196
198
  const PRIORITY = { critical: 4, high: 3, medium: 2, low: 1 };
@@ -231,23 +233,82 @@ function upsertLesson(db, feedbackEvent, memoryRecord) {
231
233
  /**
232
234
  * Find an existing lesson with identical whatToChange and overlapping tags.
233
235
  * Returns the existing row or null.
236
+ *
237
+ * Two-layer match:
238
+ * 1. Exact case-insensitive text match on `whatToChange` + tag overlap.
239
+ * This is the original behavior — fast, index-friendly, catches verbatim
240
+ * re-captures of the same feedback from the same session.
241
+ * 2. Canonical-hash fallback (optional). When the caller passes `opts` with
242
+ * `feedbackEvent`/`memoryRecord`, we compute the incoming record's cross-
243
+ * session canonical hash and scan recent lessons of the same signal.
244
+ * This defeats the common drift cases the exact path misses: punctuation
245
+ * changes, stop-word edits, casing, and trailing plurals.
246
+ *
247
+ * The fallback is gated on `opts` so existing callers that only have
248
+ * `(db, whatToChange, tags)` still work unchanged.
234
249
  */
235
- function findDuplicate(db, whatToChange, tags) {
250
+ function findDuplicate(db, whatToChange, tags, opts = null) {
236
251
  if (!whatToChange || !whatToChange.trim()) return null;
237
252
 
238
- // Exact match on whatToChange text (normalized)
253
+ // Layer 1: exact match on whatToChange text (normalized)
239
254
  const normalized = whatToChange.trim().toLowerCase();
240
255
  const candidates = db.prepare(
241
256
  `SELECT id, importance, tags FROM lessons WHERE LOWER(TRIM(whatToChange)) = ?`,
242
257
  ).all(normalized);
243
258
 
244
- if (candidates.length === 0) return null;
259
+ if (candidates.length > 0) {
260
+ for (const c of candidates) {
261
+ if (tags.length === 0) return c; // no tags to compare = text match is enough
262
+ const cTags = safeParseTags(c.tags);
263
+ if (tags.some((t) => cTags.includes(t))) return c;
264
+ }
265
+ }
245
266
 
246
- // If any candidate shares at least one tag, it's a duplicate
247
- for (const c of candidates) {
248
- if (tags.length === 0) return c; // no tags to compare = text match is enough
249
- const cTags = safeParseTags(c.tags);
250
- if (tags.some((t) => cTags.includes(t))) return c;
267
+ // Layer 2: canonical-hash fallback. Only runs when the caller supplied a
268
+ // full record so we have title/content/whatWentWrong available — scanning
269
+ // just `whatToChange` would miss records promoted under a different schema.
270
+ if (opts && (opts.feedbackEvent || opts.memoryRecord)) {
271
+ try {
272
+ const { canonicalHash } = require('./lesson-canonical');
273
+ // Build a synthetic lesson record from whatever the caller passed so the
274
+ // canonical hasher sees the same signature findCanonicalDuplicate uses.
275
+ const incoming = {
276
+ ...(opts.memoryRecord || {}),
277
+ whatToChange: opts.feedbackEvent?.whatToChange || opts.memoryRecord?.whatToChange || whatToChange,
278
+ whatWentWrong: opts.feedbackEvent?.whatWentWrong || opts.memoryRecord?.whatWentWrong || null,
279
+ whatWorked: opts.feedbackEvent?.whatWorked || opts.memoryRecord?.whatWorked || null,
280
+ tags,
281
+ signal: opts.signal || opts.memoryRecord?.signal || null,
282
+ };
283
+ const incomingHash = canonicalHash(incoming);
284
+ if (incomingHash) {
285
+ // Scan lessons of the same signal. Tags differ across schemas so we
286
+ // canonical-match row-by-row rather than hoping for a JSON array match.
287
+ const signalFilter = opts.signal || null;
288
+ const rows = signalFilter
289
+ ? db.prepare(
290
+ `SELECT id, importance, tags, whatToChange, whatWentWrong, whatWorked
291
+ FROM lessons WHERE signal = ? AND pruned = 0`,
292
+ ).all(signalFilter)
293
+ : db.prepare(
294
+ `SELECT id, importance, tags, whatToChange, whatWentWrong, whatWorked
295
+ FROM lessons WHERE pruned = 0`,
296
+ ).all();
297
+ for (const row of rows) {
298
+ const rowRecord = {
299
+ whatToChange: row.whatToChange,
300
+ whatWentWrong: row.whatWentWrong,
301
+ whatWorked: row.whatWorked,
302
+ tags: safeParseTags(row.tags),
303
+ };
304
+ if (canonicalHash(rowRecord) === incomingHash) {
305
+ return { id: row.id, importance: row.importance, tags: row.tags };
306
+ }
307
+ }
308
+ }
309
+ } catch (_canonErr) {
310
+ // Canonical fallback is best-effort — never break the upsert path.
311
+ }
251
312
  }
252
313
 
253
314
  return null;
@@ -1,6 +1,7 @@
1
1
  'use strict';
2
2
  const fs = require('fs');
3
3
  const path = require('path');
4
+ const { canonicalHash, findCanonicalDuplicate } = require('./lesson-canonical');
4
5
 
5
6
  const SIMILARITY_THRESHOLD = 0.6;
6
7
  const AUTO_PROMOTE_THRESHOLD = 3;
@@ -34,10 +35,29 @@ function appendJSONLLocal(filePath, record) {
34
35
 
35
36
  /**
36
37
  * Find a similar existing lesson by comparing titles and context.
37
- * Uses token overlap (Jaccard similarity) — fast, no embeddings needed.
38
+ *
39
+ * Two-layer dedup:
40
+ * 1. Canonical-hash match (cross-session). Punctuation/stop-word/wording
41
+ * drift is normalized away, so "never force-push main" and "Don't
42
+ * force push main." collapse to the same hash. When a hash matches,
43
+ * similarity is reported as 1.0 and we skip the Jaccard pass.
44
+ * 2. Jaccard token overlap (legacy within-session path). Catches
45
+ * rewordings that survive canonicalization (new keywords, different
46
+ * root verb) above the 0.6 threshold.
47
+ *
48
+ * The canonical pass runs first because it's O(N) with constant work per
49
+ * entry and rejects trivial duplicates before we pay the Jaccard price.
38
50
  */
39
51
  function findSimilarLesson(memoryLogPath, newRecord) {
40
52
  const existing = readJSONLLocal(memoryLogPath, { maxLines: 200 });
53
+
54
+ // Layer 1: canonical-hash exact match (normalization-invariant).
55
+ const canonicalMatch = findCanonicalDuplicate(existing, newRecord);
56
+ if (canonicalMatch) {
57
+ return { match: canonicalMatch, similarity: 1, matchType: 'canonical' };
58
+ }
59
+
60
+ // Layer 2: Jaccard token overlap (original behavior).
41
61
  const newTokens = tokenize(newRecord.title + ' ' + (newRecord.content || ''));
42
62
 
43
63
  let bestMatch = null;
@@ -52,7 +72,7 @@ function findSimilarLesson(memoryLogPath, newRecord) {
52
72
  }
53
73
  }
54
74
 
55
- return bestMatch ? { match: bestMatch, similarity: bestScore } : null;
75
+ return bestMatch ? { match: bestMatch, similarity: bestScore, matchType: 'jaccard' } : null;
56
76
  }
57
77
 
58
78
  /**
@@ -191,6 +211,7 @@ module.exports = {
191
211
  jaccardSimilarity,
192
212
  tokenize,
193
213
  inferScopeFromTags,
214
+ canonicalHash,
194
215
  SIMILARITY_THRESHOLD,
195
216
  AUTO_PROMOTE_THRESHOLD,
196
217
  };