@gcunharodrigues/wrxn 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/wrxn.cjs +60 -2
- package/lib/convert.cjs +215 -0
- package/lib/ingest.cjs +174 -0
- package/manifest.json +10 -0
- package/package.json +2 -2
- package/payload/.claude/skills/ingest/SKILL.md +72 -0
- package/payload/.mcp.json +1 -1
- package/payload/.wrxn/raw/.gitkeep +0 -0
package/bin/wrxn.cjs
CHANGED
|
@@ -12,6 +12,8 @@ const executor = require('../lib/executor.cjs');
|
|
|
12
12
|
const onboard = require('../lib/onboard.cjs');
|
|
13
13
|
const connect = require('../lib/connect.cjs');
|
|
14
14
|
const statusline = require('../lib/statusline.cjs');
|
|
15
|
+
const { convert } = require('../lib/convert.cjs');
|
|
16
|
+
const { ingest } = require('../lib/ingest.cjs');
|
|
15
17
|
|
|
16
18
|
const PKG_ROOT = path.join(__dirname, '..');
|
|
17
19
|
|
|
@@ -54,6 +56,8 @@ function parseArgs(argv) {
|
|
|
54
56
|
args.flags.owner = argv[++i];
|
|
55
57
|
} else if (a === '--probe') {
|
|
56
58
|
args.flags.probe = argv[++i];
|
|
59
|
+
} else if (a === '--distillation') {
|
|
60
|
+
args.flags.distillation = argv[++i];
|
|
57
61
|
} else if (a === '--check-report') {
|
|
58
62
|
args.flags['check-report'] = true;
|
|
59
63
|
} else if (a.startsWith('--')) {
|
|
@@ -112,6 +116,20 @@ Usage:
|
|
|
112
116
|
resolved (or --path) statusline script, idempotently (append-only,
|
|
113
117
|
never overwrites). init NEVER touches your statusline.
|
|
114
118
|
|
|
119
|
+
wrxn convert <file> [--cpu] convert a source file to Markdown and print it. Per-format routing:
|
|
120
|
+
markitdown (html/docx/txt/pptx/xlsx) · docling (pdf, with automatic
|
|
121
|
+
CPU fallback on a GPU arch-crash) · pure-JS floor when Python is
|
|
122
|
+
absent. --cpu forces docling onto CPU from the first attempt.
|
|
123
|
+
|
|
124
|
+
wrxn ingest <file> [--distillation <result.json>] [--root <dir>]
|
|
125
|
+
distill a source into the memory wiki: convert (slice 05) → an LLM
|
|
126
|
+
(the ingest skill) produces a summary + N note pages → write them
|
|
127
|
+
to .wrxn/wiki/, each stamped derived_from the raw source, which is
|
|
128
|
+
kept under .wrxn/raw/. ADDITIVE-ONLY: an existing page is never
|
|
129
|
+
overwritten (re-runs are safe). --distillation feeds the skill's
|
|
130
|
+
result JSON (summary,notes); without it, the harness points you at
|
|
131
|
+
the ingest skill.
|
|
132
|
+
|
|
115
133
|
wrxn onboard [--root <dir>] scaffold the Day-1 operator file set under context/ from a filled
|
|
116
134
|
aios-intake.md (the deterministic half of the onboard skill;
|
|
117
135
|
workspace installs only). Idempotent.
|
|
@@ -120,7 +138,7 @@ Profiles: --project (default, the dev pipeline + intelligence + enforcement) |
|
|
|
120
138
|
--workspace (adds the operator layer: onboard/audit/level-up + intake + decisions log +
|
|
121
139
|
connections registry).`;
|
|
122
140
|
|
|
123
|
-
function main(argv) {
|
|
141
|
+
async function main(argv) {
|
|
124
142
|
const args = parseArgs(argv);
|
|
125
143
|
|
|
126
144
|
if (args.flags.version) {
|
|
@@ -294,6 +312,43 @@ function main(argv) {
|
|
|
294
312
|
return 0;
|
|
295
313
|
}
|
|
296
314
|
|
|
315
|
+
if (cmd === 'convert') {
|
|
316
|
+
const file = args._[1];
|
|
317
|
+
if (!file) { process.stderr.write('wrxn: convert requires <file>\n'); return 2; }
|
|
318
|
+
try {
|
|
319
|
+
const md = await convert(path.resolve(file), { gpu: args.flags.cpu ? false : undefined });
|
|
320
|
+
process.stdout.write(md.endsWith('\n') ? md : md + '\n');
|
|
321
|
+
return 0;
|
|
322
|
+
} catch (err) {
|
|
323
|
+
process.stderr.write(`wrxn: ${err.message}\n`);
|
|
324
|
+
return 2;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
if (cmd === 'ingest') {
|
|
329
|
+
const file = args._[1];
|
|
330
|
+
if (!file) { process.stderr.write('wrxn: ingest requires <file>\n'); return 2; }
|
|
331
|
+
const root = path.resolve(args.flags.root || process.cwd());
|
|
332
|
+
// The distillation is the LLM step (the `ingest` skill). The CLI feeds its structured result via
|
|
333
|
+
// --distillation <result.json>; without one, the harness's defaultDistill points back to the skill.
|
|
334
|
+
let distill;
|
|
335
|
+
if (args.flags.distillation) {
|
|
336
|
+
const dpath = path.resolve(args.flags.distillation);
|
|
337
|
+
distill = () => JSON.parse(fs.readFileSync(dpath, 'utf8'));
|
|
338
|
+
}
|
|
339
|
+
try {
|
|
340
|
+
const report = await ingest(path.resolve(file), { root, ...(distill ? { distill } : {}) });
|
|
341
|
+
process.stdout.write(`wrxn ingest ${report.source} → raw ${report.raw}\n`);
|
|
342
|
+
for (const p of report.written) process.stdout.write(` wrote ${p}\n`);
|
|
343
|
+
for (const p of report.skipped) process.stdout.write(` skipped ${p} (exists — additive-only, never clobbered)\n`);
|
|
344
|
+
process.stdout.write(`${report.written.length} written, ${report.skipped.length} skipped.\n`);
|
|
345
|
+
return 0;
|
|
346
|
+
} catch (err) {
|
|
347
|
+
process.stderr.write(`wrxn: ${err.message}\n`);
|
|
348
|
+
return 2;
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
|
|
297
352
|
if (cmd === 'onboard') {
|
|
298
353
|
const root = path.resolve(args.flags.root || process.cwd());
|
|
299
354
|
let report;
|
|
@@ -397,4 +452,7 @@ function main(argv) {
|
|
|
397
452
|
return 2;
|
|
398
453
|
}
|
|
399
454
|
|
|
400
|
-
|
|
455
|
+
main(process.argv.slice(2)).then(
|
|
456
|
+
(code) => process.exit(code),
|
|
457
|
+
(err) => { process.stderr.write(`wrxn: ${err && err.message ? err.message : err}\n`); process.exit(1); }
|
|
458
|
+
);
|
package/lib/convert.cjs
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Converter primitive (multiformat-distill-05) — convert(srcPath) → Markdown, per-format routing.
|
|
4
|
+
//
|
|
5
|
+
// Decision (ADR 0001 / PRD §5, empirically baked off): markitdown is the primary subprocess for the
|
|
6
|
+
// office/web matrix (html/docx/pptx/xlsx); txt is a zero-dep pass-through; PDF escalates to docling
|
|
7
|
+
// (SOTA tables + OCR), which auto-grabs the GPU and CRASHES on arch-incompat (the GTX-1070/Pascal
|
|
8
|
+
// sm_61 trap — torch cu13x ships no sm_61 kernel) → we force CPU on that crash. When Python /
|
|
9
|
+
// markitdown is absent (ENOENT) we degrade to the pure-JS floor (turndown / mammoth / unpdf / SheetJS).
|
|
10
|
+
//
|
|
11
|
+
// The spawn boundary is INJECTED, mirroring lib/connect.cjs's injectable `invoke`: convert(src,{run})
|
|
12
|
+
// takes a converter runner so routing, ENOENT-degrade, and the CPU fallback are unit-testable WITHOUT
|
|
13
|
+
// any real binary. defaultRun does the real spawnSync — that is what makes the integration check
|
|
14
|
+
// "validated by invocation". convert is async only so the pure-JS floor (mammoth/unpdf are async)
|
|
15
|
+
// can be wired in completely; the primary subprocess path is plain blocking spawnSync.
|
|
16
|
+
|
|
17
|
+
const fs = require('fs');
|
|
18
|
+
const os = require('os');
|
|
19
|
+
const path = require('path');
|
|
20
|
+
const { spawnSync } = require('child_process');
|
|
21
|
+
|
|
22
|
+
// Extension → logical format. (.htm folds into html.)
|
|
23
|
+
const FORMATS = {
|
|
24
|
+
'.html': 'html',
|
|
25
|
+
'.htm': 'html',
|
|
26
|
+
'.docx': 'docx',
|
|
27
|
+
'.txt': 'txt',
|
|
28
|
+
'.pptx': 'pptx',
|
|
29
|
+
'.xlsx': 'xlsx',
|
|
30
|
+
'.pdf': 'pdf',
|
|
31
|
+
};
|
|
32
|
+
|
|
33
|
+
// CUDA / arch-incompat crash signatures — the Pascal sm_61 trap and friends. docling auto-grabs the
|
|
34
|
+
// GPU; a torch build with no matching SM kernel dies with "no kernel image is available...".
|
|
35
|
+
const ARCH_CRASH_RE = /no kernel image|kernel image is available|sm_\d+|CUDA error|CUDA_ERROR|device-side assert|out of memory/i;
|
|
36
|
+
|
|
37
|
+
const SPAWN_OPTS = { encoding: 'utf8', timeout: 600000, maxBuffer: 256 * 1024 * 1024 };
|
|
38
|
+
|
|
39
|
+
// ── the injected boundary's real implementation ────────────────────────────────
|
|
40
|
+
|
|
41
|
+
/**
 * Run a converter subprocess and normalize its result to { ok, markdown } | { ok:false, error }.
 * error.code is 'ENOENT' (not installed → degrade), 'CRASH' (arch-incompat → CPU retry), or 'EXIT'
 * (any other failed run, including a clean exit that left no markdown behind). Other spawn-level
 * failures keep the code classifyError derives from the Node error.
 *
 * @param {string} tool - 'markitdown' or 'docling'.
 * @param {string} srcPath - source file passed to the converter as its first argument.
 * @param {{ device?: string }} [opts] - device 'cpu' forces docling onto the CPU.
 * @returns {{ ok: true, markdown: string } | { ok: false, error: object }}
 * @throws {Error} when `tool` is not a known converter.
 */
function defaultRun(tool, srcPath, { device } = {}) {
  if (tool === 'markitdown') {
    // markitdown prints the markdown on stdout.
    return normalize(spawnSync('markitdown', [srcPath], SPAWN_OPTS));
  }
  if (tool === 'docling') {
    // docling writes <basename>.md into an --output dir (no markdown on stdout); read it back.
    const outDir = fs.mkdtempSync(path.join(os.tmpdir(), 'wrxn-docling-'));
    try {
      const args = [srcPath, '--to', 'md', '--output', outDir];
      const opts = { ...SPAWN_OPTS };
      if (device === 'cpu') {
        // Belt and braces: ask docling for CPU and hide every CUDA device from the child.
        args.push('--device', 'cpu');
        opts.env = { ...process.env, CUDA_VISIBLE_DEVICES: '' };
      }
      const r = spawnSync('docling', args, opts);
      if (r.error) return { ok: false, error: classifyError(r.error) };
      if (r.status !== 0 || r.signal) {
        const stderr = r.stderr || '';
        // A signal kill or a known CUDA/arch signature is a crash worth retrying on CPU.
        const code = ARCH_CRASH_RE.test(stderr) || r.signal ? 'CRASH' : 'EXIT';
        return { ok: false, error: { code, status: r.status, signal: r.signal, message: stderr.trim() } };
      }
      try {
        return { ok: true, markdown: readDoclingOutput(outDir, srcPath) };
      } catch (err) {
        // Exit 0 but no readable markdown: keep the documented result contract instead of throwing.
        return { ok: false, error: { code: 'EXIT', status: r.status, signal: r.signal, message: err.message || String(err) } };
      }
    } finally {
      fs.rmSync(outDir, { recursive: true, force: true });
    }
  }
  throw new Error(`unknown converter tool: ${tool}`);
}
|
|
74
|
+
|
|
75
|
+
/**
 * Map a spawnSync result for a stdout-emitting converter onto the runner contract.
 * A spawn error is classified; a non-zero status or a signal becomes an 'EXIT' error carrying the
 * trimmed stderr; otherwise stdout is returned as the markdown.
 */
function normalize(r) {
  const { error, status, signal, stderr, stdout } = r;
  if (error) {
    return { ok: false, error: classifyError(error) };
  }
  const succeeded = status === 0 && !signal;
  if (succeeded) {
    return { ok: true, markdown: stdout };
  }
  const message = (stderr || '').trim();
  return { ok: false, error: { code: 'EXIT', status, signal, message } };
}
|
|
82
|
+
|
|
83
|
+
/** Reduce a spawn-level error to { code, message }; code falls back to 'ERR' when the error has none. */
function classifyError(err) {
  const code = err.code || 'ERR';
  const message = err.message || String(err);
  return { code, message };
}
|
|
86
|
+
|
|
87
|
+
/**
 * Read the markdown docling wrote into `outDir`.
 * Prefers `<source basename>.md`; otherwise takes the first `.md` entry in the directory listing
 * (naming can vary by docling version). Throws when the directory holds no markdown at all.
 */
function readDoclingOutput(outDir, srcPath) {
  const stem = path.basename(srcPath, path.extname(srcPath));
  let target = path.join(outDir, `${stem}.md`);
  if (!fs.existsSync(target)) {
    const fallback = fs.readdirSync(outDir).find((name) => name.toLowerCase().endsWith('.md'));
    if (!fallback) {
      throw new Error(`docling produced no markdown in ${outDir}`);
    }
    target = path.join(outDir, fallback);
  }
  return fs.readFileSync(target, 'utf8');
}
|
|
96
|
+
|
|
97
|
+
// ── the pure-JS floor (no-Python degrade) ───────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
 * Load an optional pure-JS floor dependency on demand.
 * @param {string} mod - module specifier to require.
 * @returns {*} the loaded module.
 * @throws {Error} with install guidance when the module cannot be loaded; the underlying load
 *   error is kept as `cause`, so a module that is installed but broken is still diagnosable.
 */
function lazy(mod) {
  try {
    return require(mod);
  } catch (err) {
    throw new Error(
      `pure-JS floor needs "${mod}" but it is not installed, and the primary converter is absent. ` +
        `Install the primary path (pip install 'markitdown[all]' / docling) or the floor (npm i ${mod}).`,
      { cause: err }
    );
  }
}
|
|
109
|
+
|
|
110
|
+
/**
 * Build a Turndown (HTML → Markdown) converter, with GFM tables when the optional plugin is present.
 * Shared by the html and docx floors so both render tables the same way.
 */
function newTurndown() {
  const Turndown = lazy('turndown');
  const td = new Turndown();
  try {
    const { gfm } = require('turndown-plugin-gfm');
    td.use(gfm);
  } catch { /* gfm tables are a nice-to-have, not required */ }
  return td;
}

/**
 * The no-Python in-process floor (research §2: turndown / mammoth / unpdf / SheetJS, plus
 * officeparser for pptx). Async because several of the underlying libraries are.
 *
 * @param {string} fmt - logical format: 'txt' | 'html' | 'docx' | 'pdf' | 'xlsx' | 'pptx'.
 * @param {string} srcPath - path of the source file to convert.
 * @returns {Promise<string>} the converted text/markdown.
 * @throws {Error} when the format has no floor, or a required floor package is not installed.
 */
async function defaultFloor(fmt, srcPath) {
  if (fmt === 'txt') return fs.readFileSync(srcPath, 'utf8');
  if (fmt === 'html') {
    return newTurndown().turndown(fs.readFileSync(srcPath, 'utf8'));
  }
  if (fmt === 'docx') {
    const mammoth = lazy('mammoth');
    // Resolve turndown before the (slower) docx parse so a missing package fails fast.
    const td = newTurndown();
    const { value: html } = await mammoth.convertToHtml({ path: srcPath });
    return td.turndown(html);
  }
  if (fmt === 'pdf') {
    const { extractText, getDocumentProxy } = lazy('unpdf');
    const buf = new Uint8Array(fs.readFileSync(srcPath));
    const pdf = await getDocumentProxy(buf);
    const { text } = await extractText(pdf, { mergePages: true });
    return text;
  }
  if (fmt === 'xlsx') {
    const XLSX = lazy('xlsx');
    const wb = XLSX.readFile(srcPath);
    // One "## <sheet name>" section per sheet, body rendered as CSV.
    return wb.SheetNames.map((n) => `## ${n}\n\n${XLSX.utils.sheet_to_csv(wb.Sheets[n])}`).join('\n\n');
  }
  if (fmt === 'pptx') {
    const officeParser = lazy('officeparser');
    return await officeParser.parseOfficeAsync(srcPath);
  }
  throw new Error(`no pure-JS floor for format "${fmt}"`);
}
|
|
146
|
+
|
|
147
|
+
// ── the primitive ───────────────────────────────────────────────────────────────
|
|
148
|
+
|
|
149
|
+
/**
 * Convert a source file to Markdown via per-format routing.
 *
 * Routing: txt is read straight through; pdf goes to the docling tier (convertPdf); every other
 * supported format goes to markitdown, degrading to the pure-JS floor when markitdown is absent.
 *
 * @param {string} srcPath
 * @param {{ run?: Function, floor?: Function, gpu?: boolean }} [opts]
 *   run   — injectable converter boundary (default: defaultRun, the real spawnSync).
 *   floor — injectable pure-JS floor (default: defaultFloor).
 *   gpu   — false forces docling onto CPU from the first attempt (skips the GPU probe/crash).
 * @returns {Promise<string>} the markdown.
 * @throws {Error} when the source is missing, is not a regular file, has an unsupported
 *   extension, or the converter fails for a reason other than being absent.
 */
async function convert(srcPath, { run = defaultRun, floor = defaultFloor, gpu } = {}) {
  // Resolve to an absolute path up front so a leading-dash filename can never be read as a CLI flag
  // by the converter subprocess — the dash-neutralization must not depend on the caller (slice-06
  // ingest calls convert() directly, not via the CLI).
  srcPath = path.resolve(srcPath);
  // Pre-check existence up front (mirrors lib/ingest.cjs's source-not-found guard) so a missing file
  // is rejected with a clean message and NEVER reaches the converter subprocess — whose Python
  // traceback (markitdown/docling) would otherwise leak to the user verbatim (multiformat-distill-08).
  if (!fs.existsSync(srcPath)) throw new Error(`wrxn convert: source not found: ${srcPath}`);
  // A directory passes existsSync; reject it here rather than surfacing a raw EISDIR later.
  if (!fs.statSync(srcPath).isFile()) {
    throw new Error(`wrxn convert: source is not a regular file: ${srcPath}`);
  }
  const ext = path.extname(srcPath).toLowerCase();
  const fmt = FORMATS[ext];
  if (!fmt) {
    throw new Error(`wrxn convert: unsupported format "${ext || '(none)'}" — supported: ${Object.keys(FORMATS).join(', ')}`);
  }

  // txt is already plain text — pass it through (zero-dep, always works).
  if (fmt === 'txt') {
    return fs.readFileSync(srcPath, 'utf8');
  }

  if (fmt === 'pdf') {
    return convertPdf(srcPath, { run, floor, gpu });
  }

  // markitdown-primary formats (html/docx/pptx/xlsx).
  const r = run('markitdown', srcPath);
  if (r.ok) return r.markdown;
  if (r.error?.code === 'ENOENT') {
    return floor(fmt, srcPath); // markitdown absent → degrade to the pure-JS floor
  }
  // An injected runner may return { ok: false } with no error object; never dereference it blindly.
  const reason = r.error?.message || r.error?.code || 'unknown error';
  throw new Error(`wrxn convert: markitdown failed on ${path.basename(srcPath)} — ${reason}`);
}
|
|
190
|
+
|
|
191
|
+
/**
 * PDF tier: docling (GPU/auto) → CPU on an arch-crash → pure-JS floor if docling is absent.
 *
 * @param {string} srcPath - absolute path of the PDF.
 * @param {{ run: Function, floor: Function, gpu?: boolean }} opts - the injected boundaries;
 *   gpu === false starts on CPU, in which case a crash is not retried.
 * @returns {Promise<string>} the markdown.
 * @throws {Error} when docling fails for a reason other than being absent.
 */
async function convertPdf(srcPath, { run, floor, gpu }) {
  const firstDevice = gpu === false ? 'cpu' : undefined; // undefined = let docling pick (GPU/auto)
  const name = path.basename(srcPath);
  // An injected runner may return { ok: false } with no error object; never dereference it blindly.
  const reasonOf = (res) => res.error?.message || res.error?.code || 'unknown error';

  const r = run('docling', srcPath, { device: firstDevice });
  if (r.ok) return r.markdown;
  if (r.error?.code === 'ENOENT') {
    return floor('pdf', srcPath); // no docling → unpdf floor
  }
  if (r.error?.code === 'CRASH' && firstDevice !== 'cpu') {
    // arch-incompat / GPU crash → force CPU (CUDA_VISIBLE_DEVICES='' + --device cpu).
    const cpu = run('docling', srcPath, { device: 'cpu' });
    if (cpu.ok) return cpu.markdown;
    if (cpu.error?.code === 'ENOENT') return floor('pdf', srcPath);
    throw new Error(`wrxn convert: docling failed on the CPU fallback for ${name} — ${reasonOf(cpu)}`);
  }
  throw new Error(`wrxn convert: docling failed on ${name} — ${reasonOf(r)}`);
}
|
|
208
|
+
|
|
209
|
+
module.exports = {
|
|
210
|
+
convert,
|
|
211
|
+
defaultRun,
|
|
212
|
+
defaultFloor,
|
|
213
|
+
FORMATS,
|
|
214
|
+
ARCH_CRASH_RE,
|
|
215
|
+
};
|
package/lib/ingest.cjs
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Distillation ingest harness (multiformat-distill-06) — the deterministic half of `wrxn ingest`.
|
|
4
|
+
//
|
|
5
|
+
// PRD decisions D/E (grill 2026-06-16) + [[karpathy-llm-wiki-pattern]] (raw → distill → wiki, Adler):
|
|
6
|
+
// a dropped source becomes a SUMMARY page + N NOTE pages in the memory wiki, each carrying a
|
|
7
|
+
// `derived_from:` link back to the raw source. Additive-only: ingest CREATES new pages and refuses
|
|
8
|
+
// to overwrite an existing one — editing existing knowledge + cross-source synthesis is the `dream`
|
|
9
|
+
// loop (out of scope here).
|
|
10
|
+
//
|
|
11
|
+
// TWO boundaries are INJECTED, mirroring lib/convert.cjs's injectable spawn, so the harness is
|
|
12
|
+
// deterministically testable WITHOUT a real binary OR a live LLM:
|
|
13
|
+
// - convert(src) → markdown slice-05 converter primitive (default: the real convert).
|
|
14
|
+
// - distill(markdown, ctx) → pages the LLM step. The `ingest` SKILL is the prompt that produces
|
|
15
|
+
// this; the harness only consumes its structured output, so the
|
|
16
|
+
// distillation QUALITY is validated by the feature QA-walk, not
|
|
17
|
+
// here. defaultDistill refuses to fabricate — it points the
|
|
18
|
+
// caller at the skill (or the CLI's --distillation feed).
|
|
19
|
+
|
|
20
|
+
const fs = require('fs');
|
|
21
|
+
const path = require('path');
|
|
22
|
+
const crypto = require('crypto');
|
|
23
|
+
const { convert: defaultConvert } = require('./convert.cjs');
|
|
24
|
+
|
|
25
|
+
const SLUG_RE = /^[a-z0-9][a-z0-9-]*$/;
|
|
26
|
+
const TIERS = ['concepts', 'decisions', 'gotchas', 'sessions'];
|
|
27
|
+
const DEFAULT_TIER = 'concepts'; // distilled source knowledge lands in the concepts tier by default.
|
|
28
|
+
const MAX_NOTES = 100; // cap so a garbage distillation can't flood the wiki.
|
|
29
|
+
// eslint-disable-next-line no-control-regex
|
|
30
|
+
const CTRL_RE = /[\x00-\x1f]/; // control chars (NL/CR/NUL/...) — illegal in a source filename.
|
|
31
|
+
|
|
32
|
+
/**
 * Collapse a value to a single-line frontmatter scalar.
 * Strips C0 controls, DEL and C1 controls (C1 includes NEL, which YAML 1.1 parsers treat as a line
 * break), then folds every whitespace run to one space and trims. null/undefined become ''; other
 * values are stringified, so 0 and false are preserved rather than erased.
 * @param {*} v
 * @returns {string}
 */
function safeScalar(v) {
  const text = v == null ? '' : String(v);
  // eslint-disable-next-line no-control-regex
  return text.replace(/[\x00-\x1f\x7f-\x9f]/g, ' ').replace(/\s+/g, ' ').trim();
}
|
|
37
|
+
|
|
38
|
+
/**
 * Default distill boundary: always throws.
 * There is no deterministic LLM to call, so rather than fabricate pages this points the caller at
 * the `ingest` skill (or the CLI's --distillation feed).
 */
function defaultDistill() {
  const message = [
    'no distillation provided. The distillation step is the `ingest` skill (an LLM reads the ',
    'converted markdown and produces a summary + notes). Run via the ingest skill, feed a result ',
    'with --distillation <result.json>, or inject a distill boundary. See .claude/skills/ingest/SKILL.md.',
  ].join('');
  throw new Error(message);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Flatten the distillation result into an ordered page list, validating the contract.
 *
 * Everything is checked before any page is written, so a malformed distillation is rejected as a
 * whole instead of failing part-way through the write loop.
 *
 * @param {{ summary: object, notes?: object[] }} result - the distillation output.
 * @returns {object[]} the summary first (role 'summary'), then the notes (role 'note'), each a
 *   shallow copy with `tier` defaulted.
 * @throws {Error} on a missing summary, too many notes, a non-object note, a non-kebab or
 *   non-string slug, a non-string body, an unknown tier, or two pages sharing tier + slug.
 */
function normalizePages(result) {
  if (!result || typeof result !== 'object') throw new Error('wrxn ingest: distillation returned no result object');
  const summary = result.summary;
  if (!summary || !summary.slug || !summary.body) {
    throw new Error('wrxn ingest: distillation must include a summary page with { slug, body }');
  }
  const notes = Array.isArray(result.notes) ? result.notes : [];
  if (notes.length > MAX_NOTES) {
    throw new Error(`wrxn ingest: distillation produced ${notes.length} notes — cap is ${MAX_NOTES}. Refusing to flood the wiki.`);
  }
  notes.forEach((note, i) => {
    if (!note || typeof note !== 'object') {
      throw new Error(`wrxn ingest: note #${i + 1} must be an object with { slug, body }`);
    }
  });
  const pages = [{ ...summary, role: 'summary' }, ...notes.map((n) => ({ ...n, role: 'note' }))];
  for (const pg of pages) {
    // SLUG_RE.test() would coerce a non-string; insist on an actual string so the filename is exact.
    if (typeof pg.slug !== 'string' || !SLUG_RE.test(pg.slug)) {
      throw new Error(`wrxn ingest: page slug must be kebab-case ([a-z0-9-]): "${pg.slug}"`);
    }
    // The renderer calls .trim() on the body — reject non-strings here, before anything is written.
    if (pg.body != null && typeof pg.body !== 'string') {
      throw new Error(`wrxn ingest: page "${pg.slug}" body must be a markdown string`);
    }
    pg.tier = pg.tier || DEFAULT_TIER;
    if (!TIERS.includes(pg.tier)) {
      throw new Error(`wrxn ingest: unknown tier "${pg.tier}" — one of ${TIERS.join(', ')}`);
    }
  }
  // Intra-run dup: the DISTILLATION itself yielded two pages targeting one path. Distinct from the
  // legit pre-existing-page skip (handled at write time via the wx/O_EXCL EEXIST path).
  const seen = new Set();
  for (const pg of pages) {
    const key = `${pg.tier}/${pg.slug}`;
    if (seen.has(key)) throw new Error(`wrxn ingest: duplicate slug in distillation: "${pg.slug}" (tier ${pg.tier})`);
    seen.add(key);
  }
  return pages;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Render one wiki page: frontmatter (with the sanitized derived_from provenance stamp) + body.
 *
 * @param {{ slug: string, tier: string, role: string, description?: string, title?: string, body?: string }} pg
 * @param {string} derivedFrom - root-relative path of the raw source this page was distilled from.
 * @returns {string} the page text, ending in a newline.
 */
function renderPage(pg, derivedFrom) {
  // The heading must stay on one line: collapse control chars/newlines, fall back to the slug.
  const heading = safeScalar(pg.title) || pg.slug;
  // Coerce before trimming so an unexpected non-string body cannot throw mid-write.
  const body = String(pg.body || '').trim();
  return [
    '---',
    `name: ${pg.slug}`,
    `description: ${safeScalar(pg.description)}`,
    `tier: ${pg.tier}`,
    `derived_from: ${safeScalar(derivedFrom)}`,
    `role: ${pg.role}`,
    'source: wrxn-ingest',
    '---',
    '',
    `# ${heading}`,
    '',
    body,
    '',
  ].join('\n');
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Ingest a source file into the memory wiki as a summary + N note pages.
|
|
101
|
+
* @param {string} srcPath
|
|
102
|
+
* @param {{ root?: string, convert?: Function, distill?: Function }} [opts]
|
|
103
|
+
* root — install root the wiki + raw zone live under (default: cwd).
|
|
104
|
+
* convert — injectable converter boundary (default: slice-05 convert, the real spawnSync path).
|
|
105
|
+
* distill — injectable distillation boundary (default: defaultDistill, which refuses to fabricate).
|
|
106
|
+
* @returns {Promise<{source:string, raw:string, written:string[], skipped:string[]}>}
|
|
107
|
+
*/
|
|
108
|
+
async function ingest(srcPath, { root, convert = defaultConvert, distill = defaultDistill } = {}) {
|
|
109
|
+
srcPath = path.resolve(srcPath);
|
|
110
|
+
if (!fs.existsSync(srcPath)) throw new Error(`wrxn ingest: source not found: ${srcPath}`);
|
|
111
|
+
root = path.resolve(root || process.cwd());
|
|
112
|
+
|
|
113
|
+
// ── fail-fast guards: everything cheap that can reject runs BEFORE convert + raw copy, so a pure
|
|
114
|
+
// error path leaves NO stray work (no spawned converter, no dropped raw file). ──
|
|
115
|
+
const base = path.basename(srcPath);
|
|
116
|
+
if (CTRL_RE.test(base)) {
|
|
117
|
+
// a newline/control char in the filename would break out of the YAML frontmatter block.
|
|
118
|
+
throw new Error(`wrxn ingest: source filename contains control characters (invalid): ${JSON.stringify(base)}`);
|
|
119
|
+
}
|
|
120
|
+
// refuse a symlinked source: copyFileSync would follow it and copy an arbitrary readable file.
|
|
121
|
+
if (fs.lstatSync(srcPath).isSymbolicLink()) {
|
|
122
|
+
throw new Error(`wrxn ingest: source is a symlink (refused): ${srcPath}`);
|
|
123
|
+
}
|
|
124
|
+
// validate the distill boundary up front — `wrxn ingest <file>` with no distillation must NOT
|
|
125
|
+
// convert + drop a raw file before defaultDistill throws.
|
|
126
|
+
if (distill === defaultDistill) defaultDistill();
|
|
127
|
+
|
|
128
|
+
// 1. convert source → markdown (slice 05).
|
|
129
|
+
const markdown = await convert(srcPath);
|
|
130
|
+
|
|
131
|
+
// 2. place/keep the raw source under .wrxn/raw/. The filename is content-hash-namespaced so two
|
|
132
|
+
// DIFFERENT sources sharing a basename never collide (provenance stays correct), while the SAME
|
|
133
|
+
// bytes always map to the SAME name → idempotent re-run skips the copy.
|
|
134
|
+
const rawDir = path.join(root, '.wrxn', 'raw');
|
|
135
|
+
fs.mkdirSync(rawDir, { recursive: true });
|
|
136
|
+
const bytes = fs.readFileSync(srcPath);
|
|
137
|
+
const hash = crypto.createHash('sha256').update(bytes).digest('hex').slice(0, 8);
|
|
138
|
+
const ext = path.extname(base);
|
|
139
|
+
const stem = base.slice(0, base.length - ext.length);
|
|
140
|
+
const rawName = `${stem}.${hash}${ext}`;
|
|
141
|
+
const rawDest = path.join(rawDir, rawName);
|
|
142
|
+
if (!fs.existsSync(rawDest)) fs.writeFileSync(rawDest, bytes);
|
|
143
|
+
const derivedFrom = path.relative(root, rawDest).split(path.sep).join('/');
|
|
144
|
+
|
|
145
|
+
// 3. distill the markdown → { summary, notes } (validated: contract, note cap, intra-run dup slug).
|
|
146
|
+
const pages = normalizePages(await distill(markdown, { srcPath, derivedFrom }));
|
|
147
|
+
|
|
148
|
+
// 4. write pages ADDITIVELY. The wx flag (O_EXCL) makes the check-and-create atomic AND refuses to
|
|
149
|
+
// follow a (dangling) symlink at the destination — EEXIST is the legit pre-existing-page skip.
|
|
150
|
+
const written = [];
|
|
151
|
+
const skipped = [];
|
|
152
|
+
for (const pg of pages) {
|
|
153
|
+
const dir = path.join(root, '.wrxn', 'wiki', pg.tier);
|
|
154
|
+
fs.mkdirSync(dir, { recursive: true });
|
|
155
|
+
const dest = path.join(dir, `${pg.slug}.md`);
|
|
156
|
+
const rel = path.relative(root, dest).split(path.sep).join('/');
|
|
157
|
+
try {
|
|
158
|
+
fs.writeFileSync(dest, renderPage(pg, derivedFrom), { flag: 'wx' });
|
|
159
|
+
written.push(rel);
|
|
160
|
+
} catch (err) {
|
|
161
|
+
if (err.code === 'EEXIST') { skipped.push(rel); continue; }
|
|
162
|
+
throw err;
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
return {
|
|
167
|
+
source: path.relative(root, srcPath).split(path.sep).join('/'),
|
|
168
|
+
raw: derivedFrom,
|
|
169
|
+
written,
|
|
170
|
+
skipped,
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
module.exports = { ingest, defaultDistill, normalizePages, DEFAULT_TIER, TIERS, MAX_NOTES };
|
package/manifest.json
CHANGED
|
@@ -148,6 +148,11 @@
|
|
|
148
148
|
"class": "managed",
|
|
149
149
|
"profile": "project"
|
|
150
150
|
},
|
|
151
|
+
{
|
|
152
|
+
"path": ".claude/skills/ingest/SKILL.md",
|
|
153
|
+
"class": "managed",
|
|
154
|
+
"profile": "project"
|
|
155
|
+
},
|
|
151
156
|
{
|
|
152
157
|
"path": ".claude/skills/memory/SKILL.md",
|
|
153
158
|
"class": "managed",
|
|
@@ -398,6 +403,11 @@
|
|
|
398
403
|
"class": "state",
|
|
399
404
|
"profile": "project"
|
|
400
405
|
},
|
|
406
|
+
{
|
|
407
|
+
"path": ".wrxn/raw/.gitkeep",
|
|
408
|
+
"class": "state",
|
|
409
|
+
"profile": "project"
|
|
410
|
+
},
|
|
401
411
|
{
|
|
402
412
|
"path": ".wrxn/wiki.cjs",
|
|
403
413
|
"class": "managed",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gcunharodrigues/wrxn",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "WRXN Kernel — installable AI operating system. Two profiles (project | workspace), pull-based updates, managed/seeded/state file classes.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"wrxn": "bin/wrxn.cjs"
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"test": "node --test"
|
|
17
17
|
},
|
|
18
18
|
"dependencies": {
|
|
19
|
-
"recon-wrxn": "6.0.0-wrxn.
|
|
19
|
+
"recon-wrxn": "6.0.0-wrxn.2"
|
|
20
20
|
},
|
|
21
21
|
"engines": {
|
|
22
22
|
"node": ">=20"
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ingest
|
|
3
|
+
description: Distill a dropped source file (PDF/DOCX/HTML/PPTX/XLSX/TXT) into curated memory-wiki pages — a summary page + N note pages, each linked back to the raw source. Use when an operator drops a file in .wrxn/raw/ (or names one) and says "ingest this", "distill this document", or "turn this source into wiki notes".
|
|
4
|
+
user-invocable: true
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# ingest — read → distill → split into notes
|
|
8
|
+
|
|
9
|
+
The distillation half of `wrxn ingest <file>`. Turns a raw source into compounding wiki knowledge,
|
|
10
|
+
following the Karpathy LLM-wiki pattern (Adler's *How to Read a Book*: read → distill → split into
|
|
11
|
+
notes). This is built as the first concrete slice of the Phase-3 `dream` ingest.
|
|
12
|
+
|
|
13
|
+
The work is **split** (the kernel executor pattern):
|
|
14
|
+
|
|
15
|
+
- **The harness is deterministic** (`lib/ingest.cjs` → `wrxn ingest`): convert the source to markdown,
|
|
16
|
+
place/keep the raw under `.wrxn/raw/`, write the pages you produce with a `derived_from:` provenance
|
|
17
|
+
stamp, and enforce the **additive-only** guard. You do NOT re-implement any of that.
|
|
18
|
+
- **You are the distillation step.** You read the converted markdown and produce the *content* — a
|
|
19
|
+
summary page + N note pages. Your job is the curation quality.
|
|
20
|
+
|
|
21
|
+
## Scope (PRD decision E)
|
|
22
|
+
|
|
23
|
+
- **Inspectional + analytical depth**: 1 source → **1 summary page + N note pages**. Divide the source
|
|
24
|
+
into its natural documents (sections / themes), one note page each.
|
|
25
|
+
- **Additive-only.** You CREATE new pages. You never edit an existing wiki page and never synthesise
|
|
26
|
+
across sources — that is the `dream` loop, out of scope. The harness refuses to overwrite, so a slug
|
|
27
|
+
that collides with an existing page is skipped (reported as skipped, never overwritten): choose fresh, source-specific slugs.
|
|
28
|
+
|
|
29
|
+
## Loop
|
|
30
|
+
|
|
31
|
+
1. **Convert.** Read the converted markdown — either run `wrxn convert <file>` and read its output, or
|
|
32
|
+
read what the harness converted. Do not parse the binary yourself.
|
|
33
|
+
2. **Read for the gist.** One pass for the whole; identify the source's structure and its key claims.
|
|
34
|
+
3. **Summary page.** One page capturing what the source IS and its main points — the inspectional read.
|
|
35
|
+
4. **Note pages.** One page per distinct theme/section — the analytical read. Each note is
|
|
36
|
+
self-contained and titled by its idea, not "Section 3".
|
|
37
|
+
5. **Emit the result** as the structured object below and hand it to the harness.
|
|
38
|
+
|
|
39
|
+
## Result contract
|
|
40
|
+
|
|
41
|
+
The harness consumes this exact shape (the CLI accepts it via `--distillation <result.json>`):
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"summary": { "slug": "paper-summary", "title": "...", "description": "one line", "body": "markdown" },
|
|
46
|
+
"notes": [
|
|
47
|
+
{ "slug": "paper-method", "title": "...", "description": "one line", "body": "markdown" },
|
|
48
|
+
{ "slug": "paper-results", "title": "...", "description": "one line", "body": "markdown" }
|
|
49
|
+
]
|
|
50
|
+
}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- `slug` — kebab-case (`[a-z0-9-]`), unique, source-specific (prefix with the source name to avoid
|
|
54
|
+
collisions). The harness rejects a non-kebab slug.
|
|
55
|
+
- `summary` is required (`{slug, body}` minimum); `notes` is an array (≥1 in practice).
|
|
56
|
+
- `tier` is optional per page (default `concepts`; may be `concepts|decisions|gotchas|sessions`).
|
|
57
|
+
- The harness adds the `derived_from: .wrxn/raw/<file>` stamp, `role`, and `source: wrxn-ingest`
|
|
58
|
+
frontmatter — you do not write frontmatter into `body`.
|
|
59
|
+
|
|
60
|
+
## Run
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# the harness does convert → raw placement → provenance stamp → additive write:
|
|
64
|
+
wrxn ingest .wrxn/raw/paper.pdf --distillation result.json
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Re-running on the same source is safe — existing pages are skipped, never clobbered.
|
|
68
|
+
|
|
69
|
+
## Source
|
|
70
|
+
|
|
71
|
+
WRXN Kernel issue multiformat-distill-06 (PRD decisions D + E). Harness: `lib/ingest.cjs`.
|
|
72
|
+
Converter: `lib/convert.cjs` (slice 05). Wiki: `.wrxn/wiki/` (see the `memory` skill).
|
package/payload/.mcp.json
CHANGED
|
File without changes
|