@yemi33/minions 0.1.1577 → 0.1.1578
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/dashboard/js/render-kb.js +18 -1
- package/dashboard.js +10 -161
- package/engine/kb-sweep.js +383 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
// render-kb.js — Knowledge base rendering functions extracted from dashboard.html
|
|
2
2
|
|
|
3
|
+
/**
 * Format a byte count as a short human-readable string for the sweep toast.
 *
 * Values below 1024 are shown in bytes; values that round to fewer than
 * 1024 whole kilobytes are shown in KB; everything else is shown in MB with
 * one decimal place.
 *
 * @param {number} n - byte count
 * @returns {string} e.g. "512 B", "12 KB", "3.4 MB"
 */
function _formatBytes(n) {
  if (n < 1024) return n + ' B';
  // Round first, then pick the unit: 1048575 bytes rounds to 1024 KB, which
  // should be reported as "1.0 MB" rather than "1024 KB".
  const wholeKb = (n / 1024).toFixed(0);
  if (Number(wholeKb) < 1024) return wholeKb + ' KB';
  return (n / 1024 / 1024).toFixed(1) + ' MB';
}
|
|
8
|
+
|
|
3
9
|
const KB_CAT_LABELS = {
|
|
4
10
|
architecture: 'Architecture', conventions: 'Conventions',
|
|
5
11
|
'project-notes': 'Project Notes', 'build-reports': 'Build Reports',
|
|
@@ -175,7 +181,18 @@ async function kbSweep() {
|
|
|
175
181
|
if (result && result.ok) {
|
|
176
182
|
btn.textContent = 'done';
|
|
177
183
|
btn.style.color = 'var(--green)';
|
|
178
|
-
|
|
184
|
+
// Rich summary toast — show the key counts inline; full breakdown via console.log for now
|
|
185
|
+
var bytesSaved = (result.bytesBefore || 0) - (result.bytesAfter || 0);
|
|
186
|
+
var pieces = [];
|
|
187
|
+
if (result.entriesBefore != null) pieces.push((result.entriesBefore - (result.entriesAfter || 0)) + ' entries removed');
|
|
188
|
+
if (result.hashDuplicatesArchived) pieces.push(result.hashDuplicatesArchived + ' hash-dup');
|
|
189
|
+
if (result.llmDuplicatesArchived) pieces.push(result.llmDuplicatesArchived + ' llm-dup');
|
|
190
|
+
if (result.staleRemoved) pieces.push(result.staleRemoved + ' stale');
|
|
191
|
+
if (result.reclassified) pieces.push(result.reclassified + ' reclassified');
|
|
192
|
+
if (result.rewritten) pieces.push(result.rewritten + ' rewritten');
|
|
193
|
+
if (bytesSaved > 0) pieces.push(_formatBytes(bytesSaved) + ' saved');
|
|
194
|
+
var msg = pieces.length ? 'KB sweep: ' + pieces.join(' · ') : 'KB sweep: ' + (result.summary || 'done');
|
|
195
|
+
showToast('cmd-toast', msg, true);
|
|
179
196
|
refreshKnowledgeBase();
|
|
180
197
|
} else {
|
|
181
198
|
btn.style.color = 'var(--red)';
|
package/dashboard.js
CHANGED
|
@@ -2577,183 +2577,31 @@ const server = http.createServer(async (req, res) => {
|
|
|
2577
2577
|
}
|
|
2578
2578
|
|
|
2579
2579
|
async function handleKnowledgeSweep(req, res) {
|
|
2580
|
-
// Auto-release stale guard
|
|
2581
|
-
|
|
2582
|
-
|
|
2580
|
+
// Auto-release stale guard — timeout scales with KB size (30 min floor, or 1s per entry when that is larger)
|
|
2581
|
+
const { staleGuardMs } = require('./engine/kb-sweep');
|
|
2582
|
+
const entryCount = (queries.getKnowledgeBaseEntries() || []).length;
|
|
2583
|
+
const guardMs = staleGuardMs(entryCount);
|
|
2584
|
+
if (global._kbSweepInFlight && global._kbSweepStartedAt && Date.now() - global._kbSweepStartedAt > guardMs) {
|
|
2585
|
+
console.log(`[kb-sweep] Auto-releasing stale guard (>${Math.round(guardMs / 60000)}min for ${entryCount} entries)`);
|
|
2583
2586
|
global._kbSweepInFlight = false;
|
|
2584
2587
|
}
|
|
2585
2588
|
if (global._kbSweepInFlight) {
|
|
2586
2589
|
return jsonReply(res, 200, { ok: true, alreadyRunning: true, startedAt: global._kbSweepStartedAt });
|
|
2587
2590
|
}
|
|
2588
|
-
// Generation token prevents stale finally blocks from clearing the flag for a new sweep
|
|
2589
2591
|
const sweepToken = Date.now() + Math.random();
|
|
2590
2592
|
global._kbSweepToken = sweepToken;
|
|
2591
2593
|
global._kbSweepInFlight = true;
|
|
2592
2594
|
global._kbSweepStartedAt = Date.now();
|
|
2593
2595
|
const body = await readBody(req).catch(() => ({}));
|
|
2594
|
-
// Run sweep in background — return immediately so agents/UI don't time out
|
|
2595
2596
|
_runKbSweepBackground(body, sweepToken);
|
|
2596
2597
|
return jsonReply(res, 202, { ok: true, started: true });
|
|
2597
2598
|
}
|
|
2598
2599
|
|
|
2599
2600
|
async function _runKbSweepBackground(body, sweepToken) {
|
|
2600
2601
|
try {
|
|
2601
|
-
const
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
global._kbSweepLastCompletedAt = Date.now();
|
|
2605
|
-
return;
|
|
2606
|
-
}
|
|
2607
|
-
|
|
2608
|
-
// Build a manifest of all KB entries with their content (skip pinned — user wants to keep them)
|
|
2609
|
-
const requestPinnedKeys = Array.isArray(body.pinnedKeys)
|
|
2610
|
-
? body.pinnedKeys.filter(k => typeof k === 'string' && k.startsWith('knowledge/'))
|
|
2611
|
-
: [];
|
|
2612
|
-
const serverPinnedKeys = shared.getPinnedItems().filter(k => k.startsWith('knowledge/'));
|
|
2613
|
-
const pinnedKeys = new Set([...serverPinnedKeys, ...requestPinnedKeys]);
|
|
2614
|
-
const manifest = [];
|
|
2615
|
-
for (const e of entries) {
|
|
2616
|
-
if (pinnedKeys.has('knowledge/' + e.cat + '/' + e.file)) continue;
|
|
2617
|
-
const content = safeRead(path.join(MINIONS_DIR, 'knowledge', e.cat, e.file));
|
|
2618
|
-
if (!content) continue;
|
|
2619
|
-
manifest.push({ category: e.cat, file: e.file, title: e.title, agent: e.agent, date: e.date, content: content.slice(0, 3000) });
|
|
2620
|
-
}
|
|
2621
|
-
if (manifest.length < 2) {
|
|
2622
|
-
global._kbSweepLastResult = { ok: true, summary: 'nothing to sweep (< 2 unpinned entries)' };
|
|
2623
|
-
global._kbSweepLastCompletedAt = Date.now();
|
|
2624
|
-
return;
|
|
2625
|
-
}
|
|
2626
|
-
|
|
2627
|
-
const { callLLM, trackEngineUsage } = require('./engine/llm');
|
|
2628
|
-
const BATCH_SIZE = 30; // ~30 entries per batch to stay within Haiku context
|
|
2629
|
-
const batches = [];
|
|
2630
|
-
for (let i = 0; i < manifest.length; i += BATCH_SIZE) {
|
|
2631
|
-
batches.push(manifest.slice(i, i + BATCH_SIZE));
|
|
2632
|
-
}
|
|
2633
|
-
|
|
2634
|
-
const plan = { duplicates: [], reclassify: [], remove: [] };
|
|
2635
|
-
for (let b = 0; b < batches.length; b++) {
|
|
2636
|
-
const batch = batches[b];
|
|
2637
|
-
const offset = b * BATCH_SIZE;
|
|
2638
|
-
const prompt = `You are a knowledge base curator. Analyze these ${batch.length} entries (batch ${b + 1}/${batches.length}, indices ${offset}-${offset + batch.length - 1}) and produce a cleanup plan.
|
|
2639
|
-
|
|
2640
|
-
## Entries
|
|
2641
|
-
|
|
2642
|
-
${batch.map((m, i) => `[${offset + i}] ${m.category}/${m.file} | ${m.title} | ${m.date} | ${m.agent || '?'} | ${(m.content || '').slice(0, 200).replace(/\n/g, ' ')}`).join('\n')}
|
|
2643
|
-
|
|
2644
|
-
## Instructions
|
|
2645
|
-
|
|
2646
|
-
1. **Find duplicates**: entries with substantially the same content (same findings, different agents/runs). List pairs by index. Prefer keeping the more recent entry.
|
|
2647
|
-
2. **Find misclassified**: entries in the wrong category.
|
|
2648
|
-
3. **Find stale/empty**: entries with no actionable content (boilerplate, bail-out notes, "no changes needed").
|
|
2649
|
-
|
|
2650
|
-
Respond with ONLY valid JSON: { "duplicates": [{ "keep": N, "remove": [N], "reason": "..." }], "reclassify": [{ "index": N, "from": "cat", "to": "cat", "reason": "..." }], "remove": [{ "index": N, "reason": "..." }] }
|
|
2651
|
-
If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;
|
|
2652
|
-
|
|
2653
|
-
const result = await callLLM(prompt, 'Output only JSON.', {
|
|
2654
|
-
timeout: 120000, label: 'kb-sweep', model: 'haiku', maxTurns: 1, direct: true
|
|
2655
|
-
});
|
|
2656
|
-
trackEngineUsage('kb-sweep', result.usage);
|
|
2657
|
-
|
|
2658
|
-
let batchPlan;
|
|
2659
|
-
try {
|
|
2660
|
-
let jsonStr = (result.text || '').trim();
|
|
2661
|
-
const fenceMatch = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
|
|
2662
|
-
if (fenceMatch) jsonStr = fenceMatch[1].trim();
|
|
2663
|
-
batchPlan = JSON.parse(jsonStr);
|
|
2664
|
-
} catch {
|
|
2665
|
-
console.log(`[kb-sweep] batch ${b + 1}/${batches.length} returned invalid JSON, skipping`);
|
|
2666
|
-
continue;
|
|
2667
|
-
}
|
|
2668
|
-
if (batchPlan.duplicates) plan.duplicates.push(...batchPlan.duplicates);
|
|
2669
|
-
if (batchPlan.reclassify) plan.reclassify.push(...batchPlan.reclassify);
|
|
2670
|
-
if (batchPlan.remove) plan.remove.push(...batchPlan.remove);
|
|
2671
|
-
}
|
|
2672
|
-
|
|
2673
|
-
let removed = 0, reclassified = 0, merged = 0;
|
|
2674
|
-
const kbDir = path.join(MINIONS_DIR, 'knowledge');
|
|
2675
|
-
|
|
2676
|
-
// If nothing to do, store result and return
|
|
2677
|
-
const totalActions = (plan.remove || []).length + (plan.duplicates || []).reduce((n, d) => n + (d.remove || []).length, 0) + (plan.reclassify || []).length;
|
|
2678
|
-
if (totalActions === 0) {
|
|
2679
|
-
global._kbSweepLastResult = { ok: true, summary: 'KB is clean — nothing to sweep', plan };
|
|
2680
|
-
global._kbSweepLastCompletedAt = Date.now();
|
|
2681
|
-
return;
|
|
2682
|
-
}
|
|
2683
|
-
|
|
2684
|
-
// Archive dir for swept files (never delete, always preserve)
|
|
2685
|
-
const kbArchiveDir = path.join(kbDir, '_swept');
|
|
2686
|
-
if (!fs.existsSync(kbArchiveDir)) fs.mkdirSync(kbArchiveDir, { recursive: true });
|
|
2687
|
-
|
|
2688
|
-
function archiveKbFile(filePath, reason) {
|
|
2689
|
-
if (!fs.existsSync(filePath)) return;
|
|
2690
|
-
const basename = path.basename(filePath);
|
|
2691
|
-
const destPath = shared.uniquePath(path.join(kbArchiveDir, basename));
|
|
2692
|
-
try {
|
|
2693
|
-
const content = safeRead(filePath);
|
|
2694
|
-
if (content === null) return; // don't delete if we can't read
|
|
2695
|
-
const meta = `<!-- swept: ${new Date().toISOString()} | reason: ${reason} -->\n`;
|
|
2696
|
-
safeWrite(destPath, meta + content);
|
|
2697
|
-
safeUnlink(filePath);
|
|
2698
|
-
} catch (e) { console.error('kb archive:', e.message); }
|
|
2699
|
-
}
|
|
2700
|
-
|
|
2701
|
-
// Process removals (stale/empty) — archive, not delete
|
|
2702
|
-
for (const r of (plan.remove || [])) {
|
|
2703
|
-
const entry = manifest[r.index];
|
|
2704
|
-
if (!entry) continue;
|
|
2705
|
-
const fp = path.join(kbDir, entry.category, entry.file);
|
|
2706
|
-
archiveKbFile(fp, 'stale: ' + (r.reason || ''));
|
|
2707
|
-
removed++;
|
|
2708
|
-
}
|
|
2709
|
-
|
|
2710
|
-
// Process duplicates — archive the duplicates, keep the primary
|
|
2711
|
-
for (const d of (plan.duplicates || [])) {
|
|
2712
|
-
for (const idx of (d.remove || [])) {
|
|
2713
|
-
const entry = manifest[idx];
|
|
2714
|
-
if (!entry) continue;
|
|
2715
|
-
const fp = path.join(kbDir, entry.category, entry.file);
|
|
2716
|
-
archiveKbFile(fp, 'duplicate of index ' + d.keep + ': ' + (d.reason || ''));
|
|
2717
|
-
merged++;
|
|
2718
|
-
}
|
|
2719
|
-
}
|
|
2720
|
-
|
|
2721
|
-
// Process reclassifications (move between categories)
|
|
2722
|
-
for (const r of (plan.reclassify || [])) {
|
|
2723
|
-
const entry = manifest[r.index];
|
|
2724
|
-
if (!entry || !shared.KB_CATEGORIES.includes(r.to)) continue;
|
|
2725
|
-
const srcPath = path.join(kbDir, entry.category, entry.file);
|
|
2726
|
-
const destDir = path.join(kbDir, r.to);
|
|
2727
|
-
if (!fs.existsSync(srcPath)) continue;
|
|
2728
|
-
if (!fs.existsSync(destDir)) fs.mkdirSync(destDir, { recursive: true });
|
|
2729
|
-
try {
|
|
2730
|
-
const srcStats = fs.statSync(srcPath);
|
|
2731
|
-
const content = safeRead(srcPath);
|
|
2732
|
-
const updated = content.replace(/^(category:\s*).+$/m, `$1${r.to}`);
|
|
2733
|
-
const destPath = path.join(destDir, entry.file);
|
|
2734
|
-
safeWrite(destPath, updated);
|
|
2735
|
-
fs.utimesSync(destPath, srcStats.atime, srcStats.mtime);
|
|
2736
|
-
safeUnlink(srcPath);
|
|
2737
|
-
reclassified++;
|
|
2738
|
-
} catch (e) { console.error('kb reclassify:', e.message); }
|
|
2739
|
-
}
|
|
2740
|
-
|
|
2741
|
-
// Prune swept files older than 30 days
|
|
2742
|
-
let pruned = 0;
|
|
2743
|
-
const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
2744
|
-
try {
|
|
2745
|
-
for (const f of fs.readdirSync(kbArchiveDir)) {
|
|
2746
|
-
const fp = path.join(kbArchiveDir, f);
|
|
2747
|
-
try {
|
|
2748
|
-
if (Date.now() - fs.statSync(fp).mtimeMs > SWEPT_RETENTION_MS) { safeUnlink(fp); pruned++; }
|
|
2749
|
-
} catch { /* cleanup */ }
|
|
2750
|
-
}
|
|
2751
|
-
} catch { /* optional */ }
|
|
2752
|
-
|
|
2753
|
-
const summary = `${merged} duplicates merged, ${removed} stale removed, ${reclassified} reclassified${pruned ? ', ' + pruned + ' old swept files pruned' : ''}`;
|
|
2754
|
-
safeWrite(path.join(ENGINE_DIR, 'kb-swept.json'), JSON.stringify({ timestamp: new Date().toISOString(), summary }));
|
|
2755
|
-
queries.invalidateKnowledgeBaseCache();
|
|
2756
|
-
global._kbSweepLastResult = { ok: true, summary, plan };
|
|
2602
|
+
const { runKbSweep } = require('./engine/kb-sweep');
|
|
2603
|
+
const result = await runKbSweep({ pinnedKeys: body.pinnedKeys });
|
|
2604
|
+
global._kbSweepLastResult = result;
|
|
2757
2605
|
global._kbSweepLastCompletedAt = Date.now();
|
|
2758
2606
|
} catch (e) {
|
|
2759
2607
|
console.error('[kb-sweep] background error:', e.message);
|
|
@@ -2762,6 +2610,7 @@ If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;
|
|
|
2762
2610
|
} finally { if (global._kbSweepToken === sweepToken) global._kbSweepInFlight = false; }
|
|
2763
2611
|
}
|
|
2764
2612
|
|
|
2613
|
+
|
|
2765
2614
|
function handleKnowledgeSweepStatus(req, res) {
|
|
2766
2615
|
return jsonReply(res, 200, {
|
|
2767
2616
|
inFlight: !!global._kbSweepInFlight,
|
|
@@ -0,0 +1,383 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* engine/kb-sweep.js — Knowledge base sweep: dedup, compress, normalize.
|
|
3
|
+
*
|
|
4
|
+
* Replaces the inline sweep that lived in dashboard.js. Three passes:
|
|
5
|
+
* 1. Hash-based dedup — cheap, catches cross-batch duplicates
|
|
6
|
+
* 2. LLM batch sweep — finds remaining dupes + reclassify + stale-remove
|
|
7
|
+
* 3. Compress & normalize — per-entry LLM rewrite, flagged via _swept frontmatter
|
|
8
|
+
*
|
|
9
|
+
* Returns a rich summary so the dashboard can show before/after byte counts.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const fs = require('fs');
|
|
13
|
+
const path = require('path');
|
|
14
|
+
const crypto = require('crypto');
|
|
15
|
+
const shared = require('./shared');
|
|
16
|
+
const queries = require('./queries');
|
|
17
|
+
const { safeRead, safeWrite, safeUnlink, log, ts } = shared;
|
|
18
|
+
const { MINIONS_DIR, ENGINE_DIR } = queries;
|
|
19
|
+
|
|
20
|
+
const KB_DIR = path.join(MINIONS_DIR, 'knowledge');
|
|
21
|
+
const SWEPT_DIR = path.join(KB_DIR, '_swept');
|
|
22
|
+
const SWEPT_RETENTION_MS = 30 * 24 * 60 * 60 * 1000;
|
|
23
|
+
const COMPRESS_THRESHOLD_BYTES = 5000;
|
|
24
|
+
const LLM_BATCH_SIZE = 30;
|
|
25
|
+
const NORMALIZE_CONCURRENCY = 5;
|
|
26
|
+
const SWEPT_FLAG_KEY = '_swept'; // frontmatter key — entries with this skip the rewrite pass
|
|
27
|
+
|
|
28
|
+
/**
 * Compute the fingerprint used by the hash-dedup pass.
 *
 * The whole whitespace-normalized text is hashed (not just a leading prefix),
 * so two entries that merely share a header/boilerplate opening are never
 * treated as duplicates. The raw length is still mixed in, which keeps this a
 * strict refinement of the previous fingerprint: anything that hashes equal
 * here also hashed equal before.
 *
 * @param {string} content - entry text (null/undefined hash like '')
 * @returns {string} hex-encoded SHA-256 digest
 */
function _hashEntry(content) {
  const normalized = String(content || '').replace(/\s+/g, ' ').trim();
  return crypto.createHash('sha256').update(normalized + ':' + (content?.length || 0)).digest('hex');
}
|
|
32
|
+
|
|
33
|
+
/**
 * Parse YAML-ish frontmatter at the top of a markdown file.
 *
 * Only flat `key: value` lines are understood; other lines inside the
 * frontmatter fence are ignored. Both LF and CRLF line endings are accepted so
 * files saved on Windows keep their metadata (and their `_swept` flag).
 *
 * @param {string} content - full file text
 * @returns {{fm: Object<string, string>, body: string}} parsed keys and the
 *   remaining text with leading blank lines removed; when no frontmatter fence
 *   is found, `fm` is empty and `body` is the input (or '' when falsy)
 */
function _parseFrontmatter(content) {
  const m = String(content || '').match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n)?([\s\S]*)$/);
  if (!m) return { fm: {}, body: content || '' };
  const fm = {};
  for (const line of m[1].split(/\r?\n/)) {
    const lm = line.match(/^([\w-]+):\s*(.*)$/);
    if (lm) fm[lm[1]] = lm[2].trim();
  }
  return { fm, body: m[2].replace(/^(?:\r?\n)+/, '') };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Render a frontmatter map plus body back into markdown text.
 *
 * Keys are written one per line as `key: value` between `---` fences, followed
 * by a blank line and the body with its leading newlines removed. When there
 * are no keys the body is returned without a fence.
 *
 * @param {Object<string, string>} fm - frontmatter keys (null/undefined = none)
 * @param {string} body - markdown body (null/undefined = empty)
 * @returns {string} serialized file text
 */
function _serializeFrontmatter(fm, body) {
  // Coerce so a missing body or map cannot throw mid-sweep.
  const text = String(body ?? '');
  const source = fm || {};
  const keys = Object.keys(source);
  if (keys.length === 0) return text;
  const lines = keys.map(k => `${k}: ${source[k]}`);
  return `---\n${lines.join('\n')}\n---\n\n${text.replace(/^\n+/, '')}`;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Move a KB file into the `_swept` archive directory instead of deleting it.
 *
 * The archived copy is prefixed with an HTML comment recording when and why
 * the file was swept. The source file is removed only after the archive copy
 * is confirmed to exist, so a failed write can never lose the entry.
 *
 * @param {string} filePath - absolute path of the KB file to archive
 * @param {string} reason - human-readable reason recorded in the archive header
 * @returns {boolean} true when the file was archived and the source removed;
 *   false when the source is missing/unreadable or any step failed
 */
function _archiveKbFile(filePath, reason) {
  if (!fs.existsSync(filePath)) return false;
  try {
    if (!fs.existsSync(SWEPT_DIR)) fs.mkdirSync(SWEPT_DIR, { recursive: true });
    const destPath = shared.uniquePath(path.join(SWEPT_DIR, path.basename(filePath)));
    const content = safeRead(filePath);
    if (content == null) return false; // don't delete what we could not read
    // The reason can come from LLM output: keep it on one line and make sure
    // it cannot terminate the comment header early.
    const safeReason = String(reason ?? '').replace(/\s+/g, ' ').replace(/-->/g, '->').trim();
    safeWrite(destPath, `<!-- swept: ${new Date().toISOString()} | reason: ${safeReason} -->\n${content}`);
    // NOTE(review): safeWrite's failure mode is not visible here; verify the
    // copy landed before removing the only other copy.
    if (!fs.existsSync(destPath)) {
      log('warn', `[kb-sweep] archive ${path.basename(filePath)}: archive copy was not written, keeping original`);
      return false;
    }
    safeUnlink(filePath);
    return true;
  } catch (e) { log('warn', `[kb-sweep] archive ${path.basename(filePath)}: ${e.message}`); return false; }
}
|
|
67
|
+
|
|
68
|
+
/**
 * Remove archived files in the `_swept` directory whose mtime is older than
 * SWEPT_RETENTION_MS. Best-effort: directory-listing and per-file errors are
 * swallowed.
 *
 * @returns {number} how many files were handed to safeUnlink
 */
function _pruneOldSwept() {
  let prunedCount = 0;
  if (!fs.existsSync(SWEPT_DIR)) return prunedCount;

  let names;
  try {
    names = fs.readdirSync(SWEPT_DIR);
  } catch {
    return prunedCount; // unreadable directory — nothing pruned
  }

  for (const name of names) {
    const archivedPath = path.join(SWEPT_DIR, name);
    try {
      const ageMs = Date.now() - fs.statSync(archivedPath).mtimeMs;
      if (ageMs > SWEPT_RETENTION_MS) {
        safeUnlink(archivedPath);
        prunedCount++;
      }
    } catch { /* ignore */ }
  }
  return prunedCount;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Group entries by content hash and keep the most recent entry per group.
 * Cheap, no LLM involved.
 *
 * "Most recent" is decided by the `date` field (string comparison), then by
 * `mtimeMs`. Missing or non-string values are tolerated so the comparator never
 * returns NaN or throws.
 *
 * @param {Array<object>} manifest - entries with `category`, `file`, `content`
 *   and optionally `date` / `mtimeMs`
 * @param {object} [opts]
 * @param {boolean} [opts.dryRun] - count duplicates without archiving them
 * @returns {{survivors: Array<object>, archived: number}} entries that remain
 *   and the number of duplicates archived (or that would be, in dry-run)
 */
function _hashDedup(manifest, opts = {}) {
  const groups = new Map(); // hash → entries[]
  for (const e of manifest) {
    const h = _hashEntry(e.content);
    if (!groups.has(h)) groups.set(h, []);
    groups.get(h).push(e);
  }

  const newestFirst = (a, b) =>
    String(b.date || '').localeCompare(String(a.date || '')) || (b.mtimeMs || 0) - (a.mtimeMs || 0);

  let archived = 0;
  const survivors = [];
  for (const group of groups.values()) {
    if (group.length === 1) { survivors.push(group[0]); continue; }
    const [keeper, ...dups] = [...group].sort(newestFirst);
    survivors.push(keeper);
    for (const dup of dups) {
      if (opts.dryRun) { archived++; continue; }
      const fp = path.join(KB_DIR, dup.category, dup.file);
      if (_archiveKbFile(fp, `hash-duplicate of ${keeper.category}/${keeper.file}`)) {
        archived++;
      } else {
        // Archiving failed, so the entry was not removed by this pass; keep it
        // visible to the later passes instead of silently dropping it.
        survivors.push(dup);
      }
    }
  }
  return { survivors, archived };
}
|
|
105
|
+
|
|
106
|
+
/**
 * Batched LLM sweep — asks the model for within-batch duplicates,
 * misclassified entries and stale entries, and merges the answers into one plan.
 *
 * Each batch holds up to LLM_BATCH_SIZE entries and is labelled with indices
 * into `manifest`. The model's answer is validated before it is merged:
 * non-object answers and non-array fields are ignored, and any index that does
 * not belong to the batch the model was shown is dropped, so the model can
 * never act on an entry it did not see. A failed or unparsable batch is logged
 * and skipped; it does not abort the sweep.
 *
 * @param {Array<object>} manifest - entries (`category`, `file`, `title`, `date`, `agent`, `content`)
 * @param {Function} callLLM - async (prompt, system, options) => { text, usage }
 * @param {Function} trackEngineUsage - (label, usage) => void
 * @returns {Promise<{duplicates: Array, reclassify: Array, remove: Array}>} merged plan
 */
async function _llmBatchSweep(manifest, callLLM, trackEngineUsage) {
  const plan = { duplicates: [], reclassify: [], remove: [] };
  const batches = [];
  for (let i = 0; i < manifest.length; i += LLM_BATCH_SIZE) {
    batches.push(manifest.slice(i, i + LLM_BATCH_SIZE));
  }

  const asList = (v) => (Array.isArray(v) ? v : []);
  // Accept integers and numeric strings; everything else becomes -1 (invalid).
  const toIndex = (v) => {
    const n = typeof v === 'string' && v.trim() !== '' ? Number(v) : v;
    return Number.isInteger(n) ? n : -1;
  };

  for (let b = 0; b < batches.length; b++) {
    const batch = batches[b];
    const offset = b * LLM_BATCH_SIZE;
    const prompt = `You are a knowledge base curator. Analyze these ${batch.length} entries (batch ${b + 1}/${batches.length}, indices ${offset}-${offset + batch.length - 1}) and produce a cleanup plan.

## Entries

${batch.map((m, i) => `[${offset + i}] ${m.category}/${m.file} | ${m.title} | ${m.date} | ${m.agent || '?'} | ${(m.content || '').slice(0, 200).replace(/\n/g, ' ')}`).join('\n')}

## Instructions

1. **Find duplicates**: entries with substantially the same content (same findings, different agents/runs). List pairs by index. Prefer keeping the more recent entry.
2. **Find misclassified**: entries in the wrong category.
3. **Find stale/empty**: entries with no actionable content (boilerplate, bail-out notes, "no changes needed").

Respond with ONLY valid JSON: { "duplicates": [{ "keep": N, "remove": [N], "reason": "..." }], "reclassify": [{ "index": N, "from": "cat", "to": "cat", "reason": "..." }], "remove": [{ "index": N, "reason": "..." }] }
If nothing to do: { "duplicates": [], "reclassify": [], "remove": [] }`;

    let result;
    try {
      result = await callLLM(prompt, 'Output only JSON.', { timeout: 120000, label: 'kb-sweep', model: 'haiku', maxTurns: 1, direct: true });
      trackEngineUsage('kb-sweep', result.usage);
    } catch (e) { log('warn', `[kb-sweep] batch ${b + 1} LLM error: ${e.message}`); continue; }

    let batchPlan;
    try {
      let jsonStr = (result.text || '').trim();
      const fence = jsonStr.match(/```(?:json)?\s*([\s\S]*?)```/);
      if (fence) jsonStr = fence[1].trim();
      batchPlan = JSON.parse(jsonStr);
    } catch { log('warn', `[kb-sweep] batch ${b + 1} returned invalid JSON, skipping`); continue; }
    if (!batchPlan || typeof batchPlan !== 'object') {
      log('warn', `[kb-sweep] batch ${b + 1} returned a non-object plan, skipping`);
      continue;
    }

    // Only indices the model was actually shown in this batch are trusted.
    const inBatch = (n) => n >= offset && n < offset + batch.length;

    for (const d of asList(batchPlan.duplicates)) {
      if (!d || typeof d !== 'object') continue;
      const keep = toIndex(d.keep);
      if (!inBatch(keep)) continue;
      const remove = asList(d.remove).map(toIndex).filter(n => inBatch(n) && n !== keep);
      if (remove.length) plan.duplicates.push({ ...d, keep, remove });
    }
    for (const r of asList(batchPlan.reclassify)) {
      if (!r || typeof r !== 'object') continue;
      const index = toIndex(r.index);
      if (inBatch(index)) plan.reclassify.push({ ...r, index });
    }
    for (const r of asList(batchPlan.remove)) {
      if (!r || typeof r !== 'object') continue;
      const index = toIndex(r.index);
      if (inBatch(index)) plan.remove.push({ ...r, index });
    }
  }
  return plan;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Per-entry rewrite pass: asks the LLM to reshape each entry into a fixed
 * template and writes the result back with a `_swept` frontmatter timestamp.
 *
 * Entries that already carry the `_swept` flag are skipped unless the file's
 * mtime is more than one second newer than the flag (i.e. it was edited after
 * the last rewrite). Work is spread over NORMALIZE_CONCURRENCY workers that
 * share one cursor.
 *
 * An LLM answer replaces the entry on disk, so it is only accepted when it is
 * at least 50 characters long and contains the template's `## Summary`
 * heading; anything else (refusals, chatter, truncated output) is logged and
 * the entry is left untouched. Sizes are reported in UTF-8 bytes.
 *
 * @param {Array<object>} survivors - entries (`category`, `file`, `agent`, `date`) to consider
 * @param {Function} callLLM - async (prompt, system, options) => { text, usage }
 * @param {Function} trackEngineUsage - (label, usage) => void
 * @param {object} [opts]
 * @param {boolean} [opts.dryRun] - still calls the LLM and counts, but writes nothing
 * @returns {Promise<{processed: number, bytesBefore: number, bytesAfter: number}>}
 *   totals over the entries that were (or, in dry-run, would have been) rewritten
 */
async function _rewritePass(survivors, callLLM, trackEngineUsage, opts = {}) {
  const REWRITE_PROMPT = (entry, body) => `You are restructuring a knowledge-base entry so future agents can scan it quickly.

Reshape the content into this exact template, preserving ALL actionable findings, file:line references, and code snippets. Compress to <=800 words by dropping boilerplate (dates, full file paths that aren't actionable, agent IDs in the body, narrative scaffolding).

Template:
## Summary
2-3 sentence overview.

## Key Findings
- Bullet 1 (specific, includes file:line where relevant)
- Bullet 2

## Action Items
- Bullet (omit section entirely if none)

## References
- file:line citations or doc links (omit section if none)

Output ONLY the template body — no frontmatter, no markdown code fence, no preamble.

Original entry (category: ${entry.category}, agent: ${entry.agent || '?'}, date: ${entry.date}):

${body}`;

  // String.length counts UTF-16 code units; the summary fields are byte counts.
  const utf8Bytes = (text) => Buffer.byteLength(String(text), 'utf8');

  const candidates = [];
  for (const e of survivors) {
    const fp = path.join(KB_DIR, e.category, e.file);
    const content = safeRead(fp);
    if (content == null) continue;
    const { fm, body } = _parseFrontmatter(content);
    // Skip already-processed unless the file was modified after the sweep flag was set
    if (fm[SWEPT_FLAG_KEY]) {
      try {
        const mtime = fs.statSync(fp).mtimeMs;
        const sweptAt = Date.parse(fm[SWEPT_FLAG_KEY]);
        if (Number.isFinite(sweptAt) && mtime <= sweptAt + 1000) continue;
      } catch { /* ignore — re-process */ }
    }
    candidates.push({ entry: e, fp, fm, body, originalSize: utf8Bytes(content) });
  }

  if (candidates.length === 0) return { processed: 0, bytesBefore: 0, bytesAfter: 0 };

  let processed = 0, bytesBefore = 0, bytesAfter = 0;
  // Simple promise pool — NORMALIZE_CONCURRENCY at a time
  let cursor = 0;
  async function worker() {
    while (cursor < candidates.length) {
      const c = candidates[cursor++];
      try {
        const result = await callLLM(REWRITE_PROMPT(c.entry, c.body), 'Output ONLY the template body.', {
          timeout: 120000, label: 'kb-rewrite', model: 'haiku', maxTurns: 1, direct: true,
        });
        trackEngineUsage('kb-sweep', result.usage);
        let newBody = (result.text || '').trim();
        // Strip accidental code fence
        const fence = newBody.match(/^```(?:markdown|md)?\s*([\s\S]*?)```$/);
        if (fence) newBody = fence[1].trim();
        if (!newBody || newBody.length < 50) continue; // suspicious — skip
        // The rewrite overwrites the entry in place, so refuse output that
        // does not even start following the requested template.
        if (!/^##\s+Summary\b/m.test(newBody)) {
          log('warn', `[kb-sweep] rewrite ${c.entry.category}/${c.entry.file}: response does not follow the template, keeping original`);
          continue;
        }
        const newFm = { ...c.fm, [SWEPT_FLAG_KEY]: new Date().toISOString() };
        const newContent = _serializeFrontmatter(newFm, newBody);
        if (!opts.dryRun) safeWrite(c.fp, newContent);
        bytesBefore += c.originalSize;
        bytesAfter += utf8Bytes(newContent);
        processed++;
      } catch (e) { log('warn', `[kb-sweep] rewrite ${c.entry.category}/${c.entry.file}: ${e.message}`); }
    }
  }
  const workers = Array.from({ length: NORMALIZE_CONCURRENCY }, worker);
  await Promise.all(workers);
  return { processed, bytesBefore, bytesAfter };
}
|
|
229
|
+
|
|
230
|
+
/**
 * Apply an LLM cleanup plan to the files on disk.
 *
 * - `plan.remove`: archive stale entries.
 * - `plan.duplicates`: archive every listed duplicate except the keeper.
 * - `plan.reclassify`: move an entry to another category directory and update
 *   its `category:` line, preserving atime/mtime.
 *
 * The plan comes from model output, so it is treated as untrusted: indices
 * must be integers inside `manifest`, a duplicate group can never archive its
 * own keeper, and a reclassification is skipped when it would be a no-op
 * (same category — previously this deleted the file), when the source cannot
 * be read, or when the target file already exists. The source is removed only
 * after the destination copy is confirmed on disk.
 *
 * @param {{remove?: Array, duplicates?: Array, reclassify?: Array}} plan
 * @param {Array<object>} manifest - the list whose indices the plan refers to
 * @param {object} [opts]
 * @param {boolean} [opts.dryRun] - count actions without touching files
 * @returns {{removed: number, merged: number, reclassified: number}}
 */
function _applyLlmPlan(plan, manifest, opts = {}) {
  let removed = 0, merged = 0, reclassified = 0;

  const asList = (v) => (Array.isArray(v) ? v : []);
  // Accept integers and numeric strings; everything else becomes -1 (invalid).
  const toIndex = (v) => {
    const n = typeof v === 'string' && v.trim() !== '' ? Number(v) : v;
    return Number.isInteger(n) ? n : -1;
  };
  const entryAt = (v) => {
    const i = toIndex(v);
    return i >= 0 && i < manifest.length ? manifest[i] : undefined;
  };

  for (const r of asList(plan && plan.remove)) {
    const entry = entryAt(r && r.index);
    if (!entry) continue;
    if (opts.dryRun) { removed++; continue; }
    if (_archiveKbFile(path.join(KB_DIR, entry.category, entry.file), `stale: ${r.reason || ''}`)) removed++;
  }

  for (const d of asList(plan && plan.duplicates)) {
    if (!d) continue;
    const keepIdx = toIndex(d.keep);
    for (const idx of asList(d.remove)) {
      // Never archive the entry the group is supposed to keep.
      if (toIndex(idx) === keepIdx) continue;
      const entry = entryAt(idx);
      if (!entry) continue;
      if (opts.dryRun) { merged++; continue; }
      if (_archiveKbFile(path.join(KB_DIR, entry.category, entry.file), `duplicate of index ${d.keep}: ${d.reason || ''}`)) merged++;
    }
  }

  for (const r of asList(plan && plan.reclassify)) {
    const entry = entryAt(r && r.index);
    if (!entry || !shared.KB_CATEGORIES.includes(r.to)) continue;
    // Same-category "move" would write the file and then unlink that same path.
    if (r.to === entry.category) continue;
    if (opts.dryRun) { reclassified++; continue; }
    const srcPath = path.join(KB_DIR, entry.category, entry.file);
    const destDir = path.join(KB_DIR, r.to);
    const destPath = path.join(destDir, entry.file);
    if (!fs.existsSync(srcPath)) continue;
    try {
      if (fs.existsSync(destPath)) {
        log('warn', `[kb-sweep] reclassify ${entry.file}: ${r.to}/${entry.file} already exists, skipping`);
        continue;
      }
      const content = safeRead(srcPath);
      if (content == null) {
        log('warn', `[kb-sweep] reclassify ${entry.file}: could not read source, skipping`);
        continue;
      }
      if (!fs.existsSync(destDir)) fs.mkdirSync(destDir, { recursive: true });
      const stats = fs.statSync(srcPath);
      const updated = content.replace(/^(category:\s*).+$/m, (_, prefix) => prefix + r.to);
      safeWrite(destPath, updated);
      // NOTE(review): safeWrite's failure mode is not visible here; confirm
      // the copy exists before removing the source.
      if (!fs.existsSync(destPath)) {
        log('warn', `[kb-sweep] reclassify ${entry.file}: destination was not written, keeping source`);
        continue;
      }
      fs.utimesSync(destPath, stats.atime, stats.mtime);
      safeUnlink(srcPath);
      reclassified++;
    } catch (e) { log('warn', `[kb-sweep] reclassify ${entry.file}: ${e.message}`); }
  }
  return { removed, merged, reclassified };
}
|
|
267
|
+
|
|
268
|
+
/**
 * Run the full sweep. Returns a rich summary.
 *
 * Passes, in order: hash-based dedup, batched LLM sweep (duplicates,
 * reclassification, stale removal), per-entry rewrite, then pruning of old
 * archived files. Pinned entries are excluded from every pass and from the
 * before/after tallies. Sizes are UTF-8 byte counts.
 *
 * @param {object} opts
 * @param {string[]} [opts.pinnedKeys] - extra pinned keys (e.g. from request body)
 * @param {boolean} [opts.dryRun] - count actions but don't mutate files
 * @returns {Promise<object>} summary
 */
async function runKbSweep(opts = {}) {
  const { callLLM, trackEngineUsage } = require('./llm');
  const summary = {
    ok: true,
    entriesBefore: 0,
    entriesAfter: 0,
    bytesBefore: 0,
    bytesAfter: 0,
    hashDuplicatesArchived: 0,
    llmDuplicatesArchived: 0,
    staleRemoved: 0,
    reclassified: 0,
    rewritten: 0,
    rewriteBytesBefore: 0,
    rewriteBytesAfter: 0,
    sweptArchivePruned: 0,
    durationMs: 0,
  };
  const t0 = Date.now();
  const utf8Bytes = (text) => Buffer.byteLength(String(text), 'utf8');

  const entries = queries.getKnowledgeBaseEntries() || [];
  if (entries.length < 2) { summary.summary = 'nothing to sweep (< 2 entries)'; summary.durationMs = Date.now() - t0; return summary; }

  const requestPinned = Array.isArray(opts.pinnedKeys)
    ? opts.pinnedKeys.filter(k => typeof k === 'string' && k.startsWith('knowledge/'))
    : [];
  const pinned = new Set([
    ...shared.getPinnedItems().filter(k => k.startsWith('knowledge/')),
    ...requestPinned,
  ]);

  // Build manifest with full content + mtime. The content is deliberately NOT
  // truncated: the hash pass fingerprints it, and truncating would make long
  // entries that share an opening look identical. (The LLM prompt slices its
  // own short preview.)
  const manifest = [];
  for (const e of entries) {
    if (pinned.has(`knowledge/${e.cat}/${e.file}`)) continue;
    const fp = path.join(KB_DIR, e.cat, e.file);
    const content = safeRead(fp);
    if (!content) continue;
    let mtimeMs = 0;
    try { mtimeMs = fs.statSync(fp).mtimeMs; } catch { /* ignore */ }
    manifest.push({ category: e.cat, file: e.file, title: e.title, agent: e.agent, date: e.date, content, mtimeMs });
    summary.entriesBefore++;
    summary.bytesBefore += utf8Bytes(content);
  }
  if (manifest.length < 2) { summary.summary = 'nothing to sweep (< 2 unpinned entries)'; summary.durationMs = Date.now() - t0; return summary; }

  // 1. Hash-based dedup (cheap, catches cross-batch duplicates)
  const { survivors: afterHash, archived: hashArchived } = _hashDedup(manifest, opts);
  summary.hashDuplicatesArchived = hashArchived;

  // 2. LLM batch sweep — within-batch dupes + reclassify + remove stale
  // Only runs against survivors, but we need indices that match the LIST sent to the LLM
  const llmManifest = afterHash;
  const plan = await _llmBatchSweep(llmManifest, callLLM, trackEngineUsage);
  const llmActions = _applyLlmPlan(plan, llmManifest, opts);
  summary.llmDuplicatesArchived = llmActions.merged;
  summary.staleRemoved = llmActions.removed;
  summary.reclassified = llmActions.reclassified;

  // 3. Per-entry rewrite (compress + normalize)
  // Filter to entries that survived hash + LLM passes (still on disk)
  const stillOnDisk = afterHash.filter(e => fs.existsSync(path.join(KB_DIR, e.category, e.file)));
  const rewriteResult = await _rewritePass(stillOnDisk, callLLM, trackEngineUsage, opts);
  summary.rewritten = rewriteResult.processed;
  summary.rewriteBytesBefore = rewriteResult.bytesBefore;
  summary.rewriteBytesAfter = rewriteResult.bytesAfter;

  // 4. Prune old swept files (>30 days)
  summary.sweptArchivePruned = _pruneOldSwept();

  // Files were archived/moved above. Drop the cached listing BEFORE the final
  // tally, otherwise a cached list still points reclassified entries at their
  // old paths and they are miscounted as removed.
  if (!opts.dryRun) {
    try { queries.invalidateKnowledgeBaseCache(); } catch (e) { log('warn', `[kb-sweep] cache invalidation failed: ${e.message}`); }
  }

  // Final tallies — re-walk surviving entries for accurate bytesAfter
  const finalEntries = queries.getKnowledgeBaseEntries() || [];
  for (const e of finalEntries) {
    if (pinned.has(`knowledge/${e.cat}/${e.file}`)) continue;
    const fp = path.join(KB_DIR, e.cat, e.file);
    const content = safeRead(fp);
    if (!content) continue;
    summary.entriesAfter++;
    summary.bytesAfter += utf8Bytes(content);
  }

  summary.durationMs = Date.now() - t0;
  summary.summary = `${summary.hashDuplicatesArchived} hash-dup, ${summary.llmDuplicatesArchived} llm-dup, ${summary.staleRemoved} stale, ${summary.reclassified} reclassified, ${summary.rewritten} rewritten (${(summary.bytesBefore - summary.bytesAfter).toLocaleString()} bytes saved)`;

  if (!opts.dryRun) {
    try {
      safeWrite(path.join(ENGINE_DIR, 'kb-swept.json'), JSON.stringify({ timestamp: ts(), summary: summary.summary, detail: summary }));
    } catch (e) { log('warn', `[kb-sweep] could not record kb-swept.json: ${e.message}`); }
  }
  return summary;
}
|
|
366
|
+
|
|
367
|
+
/**
 * Compute a dynamic stale-guard timeout based on KB size.
 *
 * The result is the larger of 30 minutes and one second per entry. A missing,
 * negative or non-numeric count falls back to the 30-minute floor; without
 * that, NaN would propagate and the guard comparison would never fire.
 *
 * @param {number} entryCount - number of KB entries
 * @returns {number} timeout in milliseconds (always >= 1,800,000)
 */
function staleGuardMs(entryCount) {
  const count = Number(entryCount);
  const perEntryMs = Number.isFinite(count) && count > 0 ? count * 1000 : 0;
  return Math.max(30 * 60 * 1000, perEntryMs);
}
|
|
372
|
+
|
|
373
|
+
module.exports = {
|
|
374
|
+
runKbSweep,
|
|
375
|
+
staleGuardMs,
|
|
376
|
+
// Exported for tests
|
|
377
|
+
_hashEntry,
|
|
378
|
+
_parseFrontmatter,
|
|
379
|
+
_serializeFrontmatter,
|
|
380
|
+
_hashDedup,
|
|
381
|
+
COMPRESS_THRESHOLD_BYTES,
|
|
382
|
+
SWEPT_FLAG_KEY,
|
|
383
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.1577",
|
|
3
|
+
"version": "0.1.1578",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|