@yemi33/minions 0.1.1577 → 0.1.1578

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.1.1578 (2026-04-28)
4
+
5
+ ### Features
6
+ - hash-dedup, compress+normalize pass, dynamic stale-guard, rich result
7
+
3
8
  ## 0.1.1577 (2026-04-27)
4
9
 
5
10
  ### Fixes
@@ -1,5 +1,11 @@
1
1
  // render-kb.js — Knowledge base rendering functions extracted from dashboard.html
2
2
 
3
+ function _formatBytes(n) {
4
+ if (n < 1024) return n + ' B';
5
+ if (n < 1024 * 1024) return (n / 1024).toFixed(0) + ' KB';
6
+ return (n / 1024 / 1024).toFixed(1) + ' MB';
7
+ }
8
+
3
9
  const KB_CAT_LABELS = {
4
10
  architecture: 'Architecture', conventions: 'Conventions',
5
11
  'project-notes': 'Project Notes', 'build-reports': 'Build Reports',
@@ -175,7 +181,18 @@ async function kbSweep() {
175
181
  if (result && result.ok) {
176
182
  btn.textContent = 'done';
177
183
  btn.style.color = 'var(--green)';
178
- showToast('cmd-toast', 'KB sweep complete: ' + (result.summary || 'done'), true);
184
+ // Rich summary toast: show the key counts inline; full breakdown via console.log for now
185
+ var bytesSaved = (result.bytesBefore || 0) - (result.bytesAfter || 0);
186
+ var pieces = [];
187
+ if (result.entriesBefore != null) pieces.push((result.entriesBefore - (result.entriesAfter || 0)) + ' entries removed');
188
+ if (result.hashDuplicatesArchived) pieces.push(result.hashDuplicatesArchived + ' hash-dup');
189
+ if (result.llmDuplicatesArchived) pieces.push(result.llmDuplicatesArchived + ' llm-dup');
190
+ if (result.staleRemoved) pieces.push(result.staleRemoved + ' stale');
191
+ if (result.reclassified) pieces.push(result.reclassified + ' reclassified');
192
+ if (result.rewritten) pieces.push(result.rewritten + ' rewritten');
193
+ if (bytesSaved > 0) pieces.push(_formatBytes(bytesSaved) + ' saved');
194
+ var msg = pieces.length ? 'KB sweep: ' + pieces.join(' · ') : 'KB sweep: ' + (result.summary || 'done');
195
+ showToast('cmd-toast', msg, true);
179
196
  refreshKnowledgeBase();
180
197
  } else {
181
198
  btn.style.color = 'var(--red)';
package/dashboard.js CHANGED
@@ -2577,183 +2577,31 @@ const server = http.createServer(async (req, res) => {
2577
2577
  }
2578
2578
 
2579
2579
  async function handleKnowledgeSweep(req, res) {
2580
- // Auto-release stale guard after 5 min (LLM may have hung)
2581
- if (global._kbSweepInFlight && global._kbSweepStartedAt && Date.now() - global._kbSweepStartedAt > 300000) {
2582
- console.log('[kb-sweep] Auto-releasing stale guard (>5min)');
2580
+ // Auto-release stale guard: dynamic timeout based on KB size (at least 30 min, or 1s per entry when that is longer)
2581
+ const { staleGuardMs } = require('./engine/kb-sweep');
2582
+ const entryCount = (queries.getKnowledgeBaseEntries() || []).length;
2583
+ const guardMs = staleGuardMs(entryCount);
2584
+ if (global._kbSweepInFlight && global._kbSweepStartedAt && Date.now() - global._kbSweepStartedAt > guardMs) {
2585
+ console.log(`[kb-sweep] Auto-releasing stale guard (>${Math.round(guardMs / 60000)}min for ${entryCount} entries)`);
2583
2586
  global._kbSweepInFlight = false;
2584
2587
  }
2585
2588
  if (global._kbSweepInFlight) {
2586
2589
  return jsonReply(res, 200, { ok: true, alreadyRunning: true, startedAt: global._kbSweepStartedAt });
2587
2590
  }
2588
- // Generation token prevents stale finally blocks from clearing the flag for a new sweep
2589
2591
  const sweepToken = Date.now() + Math.random();
2590
2592
  global._kbSweepToken = sweepToken;
2591
2593
  global._kbSweepInFlight = true;
2592
2594
  global._kbSweepStartedAt = Date.now();
2593
2595
  const body = await readBody(req).catch(() => ({}));
2594
- // Run sweep in background — return immediately so agents/UI don't time out
2595
2596
  _runKbSweepBackground(body, sweepToken);
2596
2597
  return jsonReply(res, 202, { ok: true, started: true });
2597
2598
  }
2598
2599
 
2599
2600
  async function _runKbSweepBackground(body, sweepToken) {
2600
2601
  try {
2601
- const entries = getKnowledgeBaseEntries();
2602
- if (entries.length < 2) {
2603
- global._kbSweepLastResult = { ok: true, summary: 'nothing to sweep (< 2 entries)' };
2604
- global._kbSweepLastCompletedAt = Date.now();
2605
- return;
2606
- }
2607
-
2608
- // Build a manifest of all KB entries with their content (skip pinned — user wants to keep them)
2609
- const requestPinnedKeys = Array.isArray(body.pinnedKeys)
2610
- ? body.pinnedKeys.filter(k => typeof k === 'string' && k.startsWith('knowledge/'))
2611
- : [];
2612
- const serverPinnedKeys = shared.getPinnedItems().filter(k => k.startsWith('knowledge/'));
2613
- const pinnedKeys = new Set([...serverPinnedKeys, ...requestPinnedKeys]);
2614
- const manifest = [];
2615
- for (const e of entries) {
2616
- if (pinnedKeys.has('knowledge/' + e.cat + '/' + e.file)) continue;
2617
- const content = safeRead(path.join(MINIONS_DIR, 'knowledge', e.cat, e.file));
2618
- if (!content) continue;
2619
- manifest.push({ category: e.cat, file: e.file, title: e.title, agent: e.agent, date: e.date, content: content.slice(0, 3000) });
2620
- }
2621
- if (manifest.length < 2) {
2622
- global._kbSweepLastResult = { ok: true, summary: 'nothing to sweep (< 2 unpinned entries)' };
2623
- global._kbSweepLastCompletedAt = Date.now();
2624
- return;
2625
- }
2626
-
2627
- const { callLLM, trackEngineUsage } = require('./engine/llm');
2628
- const BATCH_SIZE = 30; // ~30 entries per batch to stay within Haiku context
2629
- const batches = [];
2630
- for (let i = 0; i < manifest.length; i += BATCH_SIZE) {
2631
- batches.push(manifest.slice(i, i + BATCH_SIZE));
2632
- }
2633
-
2634
- const plan = { duplicates: [], reclassify: [], remove: [] };
2635
- for (let b = 0; b < batches.length; b++) {
2636
- const batch = batches[b];
2637
- const offset = b * BATCH_SIZE;
2638
- const prompt = `You are a knowledge base curator. Analyze these ${batch.length} entries (batch ${b + 1}/${batches.length}, indices ${offset}-${offset + batch.length - 1}) and produce a cleanup plan.
2639
-
2640
- ## Entries
2641
-
2642
- ${batch.map((m, i) => `[${offset + i}] ${m.category}/${m.file} | ${m.title} | ${m.date} | ${m.agent || '?'} | ${(m.content || '').slice(0, 200).replace(/\n/g, ' ')}`).join('\n')}
2643
-
2644
- ## Instructions
2645
-
2646
- 1. **Find duplicates**: entries with substantially the same content (same findings, different agents/runs). List pairs by index. Prefer keeping the more recent entry.
2647
- 2. **Find misclassified**: entries in the wrong category.
2648
- 3. **Find stale/empty**: entries with no actionable content (boilerplate, bail-out notes, "no changes needed").
2649
-
2650
- Respond with ONLY valid JSON: { "duplicates": [{ "keep": N, "remove": [N], "reason": "..." }], "reclassify": [{ "index": N, "from": "cat", "to": "cat", "reason": "..." }], "remove": [{ "index": N, "reason": "..." }] }
2651
- If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;
2652
-
2653
- const result = await callLLM(prompt, 'Output only JSON.', {
2654
- timeout: 120000, label: 'kb-sweep', model: 'haiku', maxTurns: 1, direct: true
2655
- });
2656
- trackEngineUsage('kb-sweep', result.usage);
2657
-
2658
- let batchPlan;
2659
- try {
2660
- let jsonStr = (result.text || '').trim();
2661
- const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
2662
- if (fenceMatch) jsonStr = fenceMatch[1].trim();
2663
- batchPlan = JSON.parse(jsonStr);
2664
- } catch {
2665
- console.log(`[kb-sweep] batch ${b + 1}/${batches.length} returned invalid JSON, skipping`);
2666
- continue;
2667
- }
2668
- if (batchPlan.duplicates) plan.duplicates.push(...batchPlan.duplicates);
2669
- if (batchPlan.reclassify) plan.reclassify.push(...batchPlan.reclassify);
2670
- if (batchPlan.remove) plan.remove.push(...batchPlan.remove);
2671
- }
2672
-
2673
- let removed = 0, reclassified = 0, merged = 0;
2674
- const kbDir = path.join(MINIONS_DIR, 'knowledge');
2675
-
2676
- // If nothing to do, store result and return
2677
- const totalActions = (plan.remove || []).length + (plan.duplicates || []).reduce((n, d) => n + (d.remove || []).length, 0) + (plan.reclassify || []).length;
2678
- if (totalActions === 0) {
2679
- global._kbSweepLastResult = { ok: true, summary: 'KB is clean — nothing to sweep', plan };
2680
- global._kbSweepLastCompletedAt = Date.now();
2681
- return;
2682
- }
2683
-
2684
- // Archive dir for swept files (never delete, always preserve)
2685
- const kbArchiveDir = path.join(kbDir, '_swept');
2686
- if (!fs.existsSync(kbArchiveDir)) fs.mkdirSync(kbArchiveDir, { recursive: true });
2687
-
2688
- function archiveKbFile(filePath, reason) {
2689
- if (!fs.existsSync(filePath)) return;
2690
- const basename = path.basename(filePath);
2691
- const destPath = shared.uniquePath(path.join(kbArchiveDir, basename));
2692
- try {
2693
- const content = safeRead(filePath);
2694
- if (content === null) return; // don't delete if we can't read
2695
- const meta = `<!-- swept: ${new Date().toISOString()} | reason: ${reason} -->\n`;
2696
- safeWrite(destPath, meta + content);
2697
- safeUnlink(filePath);
2698
- } catch (e) { console.error('kb archive:', e.message); }
2699
- }
2700
-
2701
- // Process removals (stale/empty) — archive, not delete
2702
- for (const r of (plan.remove || [])) {
2703
- const entry = manifest[r.index];
2704
- if (!entry) continue;
2705
- const fp = path.join(kbDir, entry.category, entry.file);
2706
- archiveKbFile(fp, 'stale: ' + (r.reason || ''));
2707
- removed++;
2708
- }
2709
-
2710
- // Process duplicates — archive the duplicates, keep the primary
2711
- for (const d of (plan.duplicates || [])) {
2712
- for (const idx of (d.remove || [])) {
2713
- const entry = manifest[idx];
2714
- if (!entry) continue;
2715
- const fp = path.join(kbDir, entry.category, entry.file);
2716
- archiveKbFile(fp, 'duplicate of index ' + d.keep + ': ' + (d.reason || ''));
2717
- merged++;
2718
- }
2719
- }
2720
-
2721
- // Process reclassifications (move between categories)
2722
- for (const r of (plan.reclassify || [])) {
2723
- const entry = manifest[r.index];
2724
- if (!entry || !shared.KB_CATEGORIES.includes(r.to)) continue;
2725
- const srcPath = path.join(kbDir, entry.category, entry.file);
2726
- const destDir = path.join(kbDir, r.to);
2727
- if (!fs.existsSync(srcPath)) continue;
2728
- if (!fs.existsSync(destDir)) fs.mkdirSync(destDir, { recursive: true });
2729
- try {
2730
- const srcStats = fs.statSync(srcPath);
2731
- const content = safeRead(srcPath);
2732
- const updated = content.replace(/^(category:\s*).+$/m, `$1${r.to}`);
2733
- const destPath = path.join(destDir, entry.file);
2734
- safeWrite(destPath, updated);
2735
- fs.utimesSync(destPath, srcStats.atime, srcStats.mtime);
2736
- safeUnlink(srcPath);
2737
- reclassified++;
2738
- } catch (e) { console.error('kb reclassify:', e.message); }
2739
- }
2740
-
2741
- // Prune swept files older than 30 days
2742
- let pruned = 0;
2743
- const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
2744
- try {
2745
- for (const f of fs.readdirSync(kbArchiveDir)) {
2746
- const fp = path.join(kbArchiveDir, f);
2747
- try {
2748
- if (Date.now() - fs.statSync(fp).mtimeMs > SWEPT_RETENTION_MS) { safeUnlink(fp); pruned++; }
2749
- } catch { /* cleanup */ }
2750
- }
2751
- } catch { /* optional */ }
2752
-
2753
- const summary = `${merged} duplicates merged, ${removed} stale removed, ${reclassified} reclassified${pruned ? ', ' + pruned + ' old swept files pruned' : ''}`;
2754
- safeWrite(path.join(ENGINE_DIR, 'kb-swept.json'), JSON.stringify({ timestamp: new Date().toISOString(), summary }));
2755
- queries.invalidateKnowledgeBaseCache();
2756
- global._kbSweepLastResult = { ok: true, summary, plan };
2602
+ const { runKbSweep } = require('./engine/kb-sweep');
2603
+ const result = await runKbSweep({ pinnedKeys: body.pinnedKeys });
2604
+ global._kbSweepLastResult = result;
2757
2605
  global._kbSweepLastCompletedAt = Date.now();
2758
2606
  } catch (e) {
2759
2607
  console.error('[kb-sweep] background error:', e.message);
@@ -2762,6 +2610,7 @@ If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;
2762
2610
  } finally { if (global._kbSweepToken === sweepToken) global._kbSweepInFlight = false; }
2763
2611
  }
2764
2612
 
2613
+
2765
2614
  function handleKnowledgeSweepStatus(req, res) {
2766
2615
  return jsonReply(res, 200, {
2767
2616
  inFlight: !!global._kbSweepInFlight,
@@ -0,0 +1,383 @@
1
+ /**
2
+ * engine/kb-sweep.js — Knowledge base sweep: dedup, compress, normalize.
3
+ *
4
+ * Replaces the inline sweep that lived in dashboard.js. Three passes:
5
+ * 1. Hash-based dedup — cheap, catches cross-batch duplicates
6
+ * 2. LLM batch sweep — finds remaining dupes + reclassify + stale-remove
7
+ * 3. Compress & normalize — per-entry LLM rewrite, flagged via _swept frontmatter
8
+ *
9
+ * Returns a rich summary so the dashboard can show before/after byte counts.
10
+ */
11
+
12
+ const fs = require('fs');
13
+ const path = require('path');
14
+ const crypto = require('crypto');
15
+ const shared = require('./shared');
16
+ const queries = require('./queries');
17
+ const { safeRead, safeWrite, safeUnlink, log, ts } = shared;
18
+ const { MINIONS_DIR, ENGINE_DIR } = queries;
19
+
20
+ const KB_DIR = path.join(MINIONS_DIR, 'knowledge');
21
+ const SWEPT_DIR = path.join(KB_DIR, '_swept');
22
+ const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
23
+ const COMPRESS_THRESHOLD_BYTES = 5000;
24
+ const LLM_BATCH_SIZE = 30;
25
+ const NORMALIZE_CONCURRENCY = 5;
26
+ const SWEPT_FLAG_KEY = '_swept'; // frontmatter key — entries with this skip the rewrite pass
27
+
28
+ function _hashEntry(content) {
29
+ const normalized = String(content || '').replace(/\s+/g, ' ').trim().slice(0, 500);
30
+ return crypto.createHash('sha256').update(normalized + ':' + (content?.length || 0)).digest('hex');
31
+ }
32
+
33
+ /**
34
+ * Parse YAML-ish frontmatter at the top of a markdown file.
35
+ * Returns { fm: {key:value}, body: string }.
36
+ */
37
+ function _parseFrontmatter(content) {
38
+ const m = String(content || '').match(/^---\n([\s\S]*?)\n---\n?([\s\S]*)$/);
39
+ if (!m) return { fm: {}, body: content || '' };
40
+ const fm = {};
41
+ for (const line of m[1].split('\n')) {
42
+ const lm = line.match(/^([\w-]+):\s*(.*)$/);
43
+ if (lm) fm[lm[1]] = lm[2].trim();
44
+ }
45
+ return { fm, body: m[2].replace(/^\n+/, '') };
46
+ }
47
+
48
+ function _serializeFrontmatter(fm, body) {
49
+ const keys = Object.keys(fm);
50
+ if (keys.length === 0) return body;
51
+ const lines = keys.map(k => `${k}: ${fm[k]}`);
52
+ return `---\n${lines.join('\n')}\n---\n\n${body.replace(/^\n+/, '')}`;
53
+ }
54
+
55
+ function _archiveKbFile(filePath, reason) {
56
+ if (!fs.existsSync(filePath)) return false;
57
+ if (!fs.existsSync(SWEPT_DIR)) fs.mkdirSync(SWEPT_DIR, { recursive: true });
58
+ const destPath = shared.uniquePath(path.join(SWEPT_DIR, path.basename(filePath)));
59
+ try {
60
+ const content = safeRead(filePath);
61
+ if (content === null) return false;
62
+ safeWrite(destPath, `<!-- swept: ${new Date().toISOString()} | reason: ${reason} -->\n${content}`);
63
+ safeUnlink(filePath);
64
+ return true;
65
+ } catch (e) { log('warn', `[kb-sweep] archive ${path.basename(filePath)}: ${e.message}`); return false; }
66
+ }
67
+
68
+ function _pruneOldSwept() {
69
+ if (!fs.existsSync(SWEPT_DIR)) return 0;
70
+ let pruned = 0;
71
+ try {
72
+ for (const f of fs.readdirSync(SWEPT_DIR)) {
73
+ const fp = path.join(SWEPT_DIR, f);
74
+ try {
75
+ if (Date.now() - fs.statSync(fp).mtimeMs > SWEPT_RETENTION_MS) { safeUnlink(fp); pruned++; }
76
+ } catch { /* ignore */ }
77
+ }
78
+ } catch { /* ignore */ }
79
+ return pruned;
80
+ }
81
+
82
+ /** Group entries by content hash, keep most-recent per group. Cheap, no LLM. */
83
+ function _hashDedup(manifest, opts = {}) {
84
+ const groups = new Map(); // hash → entries[]
85
+ for (const e of manifest) {
86
+ const h = _hashEntry(e.content);
87
+ if (!groups.has(h)) groups.set(h, []);
88
+ groups.get(h).push(e);
89
+ }
90
+ let archived = 0;
91
+ const survivors = [];
92
+ for (const [, group] of groups) {
93
+ if (group.length === 1) { survivors.push(group[0]); continue; }
94
+ // Keep most recent (by date frontmatter, then mtime)
95
+ group.sort((a, b) => (b.date || '').localeCompare(a.date || '') || b.mtimeMs - a.mtimeMs);
96
+ survivors.push(group[0]);
97
+ for (const dup of group.slice(1)) {
98
+ if (opts.dryRun) { archived++; continue; }
99
+ const fp = path.join(KB_DIR, dup.category, dup.file);
100
+ if (_archiveKbFile(fp, `hash-duplicate of ${group[0].category}/${group[0].file}`)) archived++;
101
+ }
102
+ }
103
+ return { survivors, archived };
104
+ }
105
+
106
+ /** Batched LLM sweep — finds within-batch dupes, reclassifies, removes stale. */
107
+ async function _llmBatchSweep(manifest, callLLM, trackEngineUsage) {
108
+ const plan = { duplicates: [], reclassify: [], remove: [] };
109
+ const batches = [];
110
+ for (let i = 0; i < manifest.length; i += LLM_BATCH_SIZE) {
111
+ batches.push(manifest.slice(i, i + LLM_BATCH_SIZE));
112
+ }
113
+ for (let b = 0; b < batches.length; b++) {
114
+ const batch = batches[b];
115
+ const offset = b * LLM_BATCH_SIZE;
116
+ const prompt = `You are a knowledge base curator. Analyze these ${batch.length} entries (batch ${b + 1}/${batches.length}, indices ${offset}-${offset + batch.length - 1}) and produce a cleanup plan.
117
+
118
+ ## Entries
119
+
120
+ ${batch.map((m, i) => `[${offset + i}] ${m.category}/${m.file} | ${m.title} | ${m.date} | ${m.agent || '?'} | ${(m.content || '').slice(0, 200).replace(/\n/g, ' ')}`).join('\n')}
121
+
122
+ ## Instructions
123
+
124
+ 1. **Find duplicates**: entries with substantially the same content (same findings, different agents/runs). List pairs by index. Prefer keeping the more recent entry.
125
+ 2. **Find misclassified**: entries in the wrong category.
126
+ 3. **Find stale/empty**: entries with no actionable content (boilerplate, bail-out notes, "no changes needed").
127
+
128
+ Respond with ONLY valid JSON: { "duplicates": [{ "keep": N, "remove": [N], "reason": "..." }], "reclassify": [{ "index": N, "from": "cat", "to": "cat", "reason": "..." }], "remove": [{ "index": N, "reason": "..." }] }
129
+ If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;
130
+
131
+ let result;
132
+ try {
133
+ result = await callLLM(prompt, 'Output only JSON.', { timeout: 120000, label: 'kb-sweep', model: 'haiku', maxTurns: 1, direct: true });
134
+ trackEngineUsage('kb-sweep', result.usage);
135
+ } catch (e) { log('warn', `[kb-sweep] batch ${b + 1} LLM error: ${e.message}`); continue; }
136
+
137
+ let batchPlan;
138
+ try {
139
+ let jsonStr = (result.text || '').trim();
140
+ const fence = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
141
+ if (fence) jsonStr = fence[1].trim();
142
+ batchPlan = JSON.parse(jsonStr);
143
+ } catch { log('warn', `[kb-sweep] batch ${b + 1} returned invalid JSON, skipping`); continue; }
144
+ if (batchPlan.duplicates) plan.duplicates.push(...batchPlan.duplicates);
145
+ if (batchPlan.reclassify) plan.reclassify.push(...batchPlan.reclassify);
146
+ if (batchPlan.remove) plan.remove.push(...batchPlan.remove);
147
+ }
148
+ return plan;
149
+ }
150
+
151
+ /**
152
+ * Per-entry rewrite pass: compress large entries + normalize structure into
153
+ * a fixed template. Only runs on entries lacking the `_swept` frontmatter flag.
154
+ * Concurrency-limited via Promise pool.
155
+ */
156
+ async function _rewritePass(survivors, callLLM, trackEngineUsage, opts = {}) {
157
+ const REWRITE_PROMPT = (entry, body) => `You are restructuring a knowledge-base entry so future agents can scan it quickly.
158
+
159
+ Reshape the content into this exact template, preserving ALL actionable findings, file:line references, and code snippets. Compress to <=800 words by dropping boilerplate (dates, full file paths that aren't actionable, agent IDs in the body, narrative scaffolding).
160
+
161
+ Template:
162
+ ## Summary
163
+ 2-3 sentence overview.
164
+
165
+ ## Key Findings
166
+ - Bullet 1 (specific, includes file:line where relevant)
167
+ - Bullet 2
168
+
169
+ ## Action Items
170
+ - Bullet (omit section entirely if none)
171
+
172
+ ## References
173
+ - file:line citations or doc links (omit section if none)
174
+
175
+ Output ONLY the template body — no frontmatter, no markdown code fence, no preamble.
176
+
177
+ Original entry (category: ${entry.category}, agent: ${entry.agent || '?'}, date: ${entry.date}):
178
+
179
+ ${body}`;
180
+
181
+ const candidates = [];
182
+ for (const e of survivors) {
183
+ const fp = path.join(KB_DIR, e.category, e.file);
184
+ const content = safeRead(fp);
185
+ if (content == null) continue;
186
+ const { fm, body } = _parseFrontmatter(content);
187
+ // Skip already-processed unless the file was modified after the sweep flag was set
188
+ if (fm[SWEPT_FLAG_KEY]) {
189
+ try {
190
+ const mtime = fs.statSync(fp).mtimeMs;
191
+ const sweptAt = Date.parse(fm[SWEPT_FLAG_KEY]);
192
+ if (Number.isFinite(sweptAt) && mtime <= sweptAt + 1000) continue;
193
+ } catch { /* ignore — re-process */ }
194
+ }
195
+ candidates.push({ entry: e, fp, fm, body, originalSize: content.length });
196
+ }
197
+
198
+ if (candidates.length === 0) return { processed: 0, bytesBefore: 0, bytesAfter: 0 };
199
+
200
+ let processed = 0, bytesBefore = 0, bytesAfter = 0;
201
+ // Simple promise pool — NORMALIZE_CONCURRENCY at a time
202
+ let cursor = 0;
203
+ async function worker() {
204
+ while (cursor < candidates.length) {
205
+ const c = candidates[cursor++];
206
+ try {
207
+ const result = await callLLM(REWRITE_PROMPT(c.entry, c.body), 'Output ONLY the template body.', {
208
+ timeout: 120000, label: 'kb-rewrite', model: 'haiku', maxTurns: 1, direct: true,
209
+ });
210
+ trackEngineUsage('kb-sweep', result.usage);
211
+ let newBody = (result.text || '').trim();
212
+ // Strip accidental code fence
213
+ const fence = newBody.match(/^```(?:markdown|md)?\s*([\s\S]*?)```$/);
214
+ if (fence) newBody = fence[1].trim();
215
+ if (!newBody || newBody.length < 50) continue; // suspicious — skip
216
+ const newFm = { ...c.fm, [SWEPT_FLAG_KEY]: new Date().toISOString() };
217
+ const newContent = _serializeFrontmatter(newFm, newBody);
218
+ if (!opts.dryRun) safeWrite(c.fp, newContent);
219
+ bytesBefore += c.originalSize;
220
+ bytesAfter += newContent.length;
221
+ processed++;
222
+ } catch (e) { log('warn', `[kb-sweep] rewrite ${c.entry.category}/${c.entry.file}: ${e.message}`); }
223
+ }
224
+ }
225
+ const workers = Array.from({ length: NORMALIZE_CONCURRENCY }, worker);
226
+ await Promise.all(workers);
227
+ return { processed, bytesBefore, bytesAfter };
228
+ }
229
+
230
+ function _applyLlmPlan(plan, manifest, opts = {}) {
231
+ let removed = 0, merged = 0, reclassified = 0;
232
+ for (const r of (plan.remove || [])) {
233
+ const entry = manifest[r.index];
234
+ if (!entry) continue;
235
+ if (opts.dryRun) { removed++; continue; }
236
+ if (_archiveKbFile(path.join(KB_DIR, entry.category, entry.file), `stale: ${r.reason || ''}`)) removed++;
237
+ }
238
+ for (const d of (plan.duplicates || [])) {
239
+ for (const idx of (d.remove || [])) {
240
+ const entry = manifest[idx];
241
+ if (!entry) continue;
242
+ if (opts.dryRun) { merged++; continue; }
243
+ if (_archiveKbFile(path.join(KB_DIR, entry.category, entry.file), `duplicate of index ${d.keep}: ${d.reason || ''}`)) merged++;
244
+ }
245
+ }
246
+ for (const r of (plan.reclassify || [])) {
247
+ const entry = manifest[r.index];
248
+ if (!entry || !shared.KB_CATEGORIES.includes(r.to)) continue;
249
+ if (opts.dryRun) { reclassified++; continue; }
250
+ const srcPath = path.join(KB_DIR, entry.category, entry.file);
251
+ const destDir = path.join(KB_DIR, r.to);
252
+ if (!fs.existsSync(srcPath)) continue;
253
+ if (!fs.existsSync(destDir)) fs.mkdirSync(destDir, { recursive: true });
254
+ try {
255
+ const stats = fs.statSync(srcPath);
256
+ const content = safeRead(srcPath);
257
+ const updated = (content || '').replace(/^(category:\s*).+$/m, `$1${r.to}`);
258
+ const destPath = path.join(destDir, entry.file);
259
+ safeWrite(destPath, updated);
260
+ fs.utimesSync(destPath, stats.atime, stats.mtime);
261
+ safeUnlink(srcPath);
262
+ reclassified++;
263
+ } catch (e) { log('warn', `[kb-sweep] reclassify ${entry.file}: ${e.message}`); }
264
+ }
265
+ return { removed, merged, reclassified };
266
+ }
267
+
268
+ /**
269
+ * Run the full sweep. Returns a rich summary.
270
+ *
271
+ * @param {object} opts
272
+ * @param {string[]} [opts.pinnedKeys] - extra pinned keys (e.g. from request body)
273
+ * @param {boolean} [opts.dryRun] - count actions without archiving, moving, or rewriting KB entries (LLM calls still run and old _swept archive files are still pruned)
274
+ * @returns {Promise<object>} summary
275
+ */
276
+ async function runKbSweep(opts = {}) {
277
+ const { callLLM, trackEngineUsage } = require('./llm');
278
+ const summary = {
279
+ ok: true,
280
+ entriesBefore: 0,
281
+ entriesAfter: 0,
282
+ bytesBefore: 0,
283
+ bytesAfter: 0,
284
+ hashDuplicatesArchived: 0,
285
+ llmDuplicatesArchived: 0,
286
+ staleRemoved: 0,
287
+ reclassified: 0,
288
+ rewritten: 0,
289
+ rewriteBytesBefore: 0,
290
+ rewriteBytesAfter: 0,
291
+ sweptArchivePruned: 0,
292
+ durationMs: 0,
293
+ };
294
+ const t0 = Date.now();
295
+
296
+ const entries = queries.getKnowledgeBaseEntries();
297
+ if (entries.length < 2) { summary.summary = 'nothing to sweep (< 2 entries)'; summary.durationMs = Date.now() - t0; return summary; }
298
+
299
+ const requestPinned = Array.isArray(opts.pinnedKeys)
300
+ ? opts.pinnedKeys.filter(k => typeof k === 'string' && k.startsWith('knowledge/'))
301
+ : [];
302
+ const pinned = new Set([
303
+ ...shared.getPinnedItems().filter(k => k.startsWith('knowledge/')),
304
+ ...requestPinned,
305
+ ]);
306
+
307
+ // Build manifest with full content + mtime
308
+ const manifest = [];
309
+ for (const e of entries) {
310
+ if (pinned.has(`knowledge/${e.cat}/${e.file}`)) continue;
311
+ const fp = path.join(KB_DIR, e.cat, e.file);
312
+ const content = safeRead(fp);
313
+ if (!content) continue;
314
+ let mtimeMs = 0;
315
+ try { mtimeMs = fs.statSync(fp).mtimeMs; } catch { /* ignore */ }
316
+ manifest.push({ category: e.cat, file: e.file, title: e.title, agent: e.agent, date: e.date, content: content.slice(0, 3000), mtimeMs });
317
+ summary.entriesBefore++;
318
+ summary.bytesBefore += content.length;
319
+ }
320
+ if (manifest.length < 2) { summary.summary = 'nothing to sweep (< 2 unpinned entries)'; summary.durationMs = Date.now() - t0; return summary; }
321
+
322
+ // 1. Hash-based dedup (cheap, catches cross-batch duplicates)
323
+ const { survivors: afterHash, archived: hashArchived } = _hashDedup(manifest, opts);
324
+ summary.hashDuplicatesArchived = hashArchived;
325
+
326
+ // 2. LLM batch sweep — within-batch dupes + reclassify + remove stale
327
+ // Only runs against survivors, but we need indices that match the LIST sent to the LLM
328
+ const llmManifest = afterHash;
329
+ const plan = await _llmBatchSweep(llmManifest, callLLM, trackEngineUsage);
330
+ const llmActions = _applyLlmPlan(plan, llmManifest, opts);
331
+ summary.llmDuplicatesArchived = llmActions.merged;
332
+ summary.staleRemoved = llmActions.removed;
333
+ summary.reclassified = llmActions.reclassified;
334
+
335
+ // 3. Per-entry rewrite (compress + normalize)
336
+ // Filter to entries that survived hash + LLM passes (still on disk)
337
+ const stillOnDisk = afterHash.filter(e => fs.existsSync(path.join(KB_DIR, e.category, e.file)));
338
+ const rewriteResult = await _rewritePass(stillOnDisk, callLLM, trackEngineUsage, opts);
339
+ summary.rewritten = rewriteResult.processed;
340
+ summary.rewriteBytesBefore = rewriteResult.bytesBefore;
341
+ summary.rewriteBytesAfter = rewriteResult.bytesAfter;
342
+
343
+ // 4. Prune old swept files (>30 days)
344
+ summary.sweptArchivePruned = _pruneOldSwept();
345
+
346
+ // Final tallies — re-walk surviving entries for accurate bytesAfter
347
+ const finalEntries = queries.getKnowledgeBaseEntries();
348
+ for (const e of finalEntries) {
349
+ if (pinned.has(`knowledge/${e.cat}/${e.file}`)) continue;
350
+ const fp = path.join(KB_DIR, e.cat, e.file);
351
+ const content = safeRead(fp);
352
+ if (!content) continue;
353
+ summary.entriesAfter++;
354
+ summary.bytesAfter += content.length;
355
+ }
356
+
357
+ summary.durationMs = Date.now() - t0;
358
+ summary.summary = `${summary.hashDuplicatesArchived} hash-dup, ${summary.llmDuplicatesArchived} llm-dup, ${summary.staleRemoved} stale, ${summary.reclassified} reclassified, ${summary.rewritten} rewritten (${(summary.bytesBefore - summary.bytesAfter).toLocaleString()} bytes saved)`;
359
+
360
+ if (!opts.dryRun) {
361
+ try { safeWrite(path.join(ENGINE_DIR, 'kb-swept.json'), JSON.stringify({ timestamp: ts(), summary: summary.summary, detail: summary })); } catch { /* ignore */ }
362
+ try { queries.invalidateKnowledgeBaseCache(); } catch { /* ignore */ }
363
+ }
364
+ return summary;
365
+ }
366
+
367
+ /** Compute a dynamic stale-guard timeout based on KB size. */
368
+ function staleGuardMs(entryCount) {
369
+ // At least 30 minutes; becomes 1 second per entry (for the rewrite pass) once that exceeds 30 minutes
370
+ return Math.max(30 * 60 * 1000, entryCount * 1000);
371
+ }
372
+
373
+ module.exports = {
374
+ runKbSweep,
375
+ staleGuardMs,
376
+ // Exported for tests
377
+ _hashEntry,
378
+ _parseFrontmatter,
379
+ _serializeFrontmatter,
380
+ _hashDedup,
381
+ COMPRESS_THRESHOLD_BYTES,
382
+ SWEPT_FLAG_KEY,
383
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.1577",
3
+ "version": "0.1.1578",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"