@gcunharodrigues/wrxn 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/wrxn.cjs CHANGED
@@ -11,7 +11,10 @@ const worktree = require('../lib/worktree.cjs');
11
11
  const executor = require('../lib/executor.cjs');
12
12
  const onboard = require('../lib/onboard.cjs');
13
13
  const connect = require('../lib/connect.cjs');
14
+ const brain = require('../lib/brain.cjs');
14
15
  const statusline = require('../lib/statusline.cjs');
16
+ const { convert } = require('../lib/convert.cjs');
17
+ const { ingest } = require('../lib/ingest.cjs');
15
18
 
16
19
  const PKG_ROOT = path.join(__dirname, '..');
17
20
 
@@ -54,6 +57,12 @@ function parseArgs(argv) {
54
57
  args.flags.owner = argv[++i];
55
58
  } else if (a === '--probe') {
56
59
  args.flags.probe = argv[++i];
60
+ } else if (a === '--distillation') {
61
+ args.flags.distillation = argv[++i];
62
+ } else if (a === '--limit') {
63
+ args.flags.limit = argv[++i];
64
+ } else if (a === '--type') {
65
+ args.flags.type = argv[++i];
57
66
  } else if (a === '--check-report') {
58
67
  args.flags['check-report'] = true;
59
68
  } else if (a.startsWith('--')) {
@@ -105,6 +114,17 @@ Usage:
105
114
  list print all registered connections (agent-readable JSON)
106
115
  get <name> print one connection by name
107
116
 
117
+ wrxn brain query "<q>" [--json] [--limit <n>] [--type <prose|code|NodeType>] [--neighbors] [--root <dir>]
118
+ ask the warm Brain (recon-wrxn's code+prose graph) from the terminal.
119
+ WHOLE-BRAIN by default. Discovers the live serve door via
120
+ .recon-wrxn/serve-endpoint.json and POSTs the query; prints ranked
121
+ hits (name · type · file:line). If the Brain is not warm (no live
122
+ serve), prints a clear error and exits non-zero — no cold load.
123
+ --json emits the structured hits · --limit asks the door for top n ·
124
+ --type post-filters (prose=Page/Section, code=the rest, or an exact
125
+ NodeType) · --neighbors expands each hit to its 1-hop graph neighbors
126
+ (callers/callees/relationships via recon_explain).
127
+
108
128
  wrxn statusline [--inject [--path <script>]]
109
129
  SYNAPSE live-window writer. With no flag: report whether a statusline
110
130
  is configured (~/.claude/settings.json) + print the marker-bounded
@@ -112,6 +132,20 @@ Usage:
112
132
  resolved (or --path) statusline script, idempotently (append-only,
113
133
  never overwrites). init NEVER touches your statusline.
114
134
 
135
+ wrxn convert <file> [--cpu] convert a source file to Markdown and print it. Per-format routing:
136
+ markitdown (html/docx/txt/pptx/xlsx) · docling (pdf, with automatic
137
+ CPU fallback on a GPU arch-crash) · pure-JS floor when Python is
138
+ absent. --cpu forces docling onto CPU from the first attempt.
139
+
140
+ wrxn ingest <file> [--distillation <result.json>] [--root <dir>]
141
+ distill a source into the memory wiki: convert (slice 05) → an LLM
142
+ (the ingest skill) produces a summary + N note pages → write them
143
+ to .wrxn/wiki/, each stamped derived_from the raw source, which is
144
+ kept under .wrxn/raw/. ADDITIVE-ONLY: an existing page is never
145
+ overwritten (re-runs are safe). --distillation feeds the skill's
146
+ result JSON (summary,notes); without it, the harness points you at
147
+ the ingest skill.
148
+
115
149
  wrxn onboard [--root <dir>] scaffold the Day-1 operator file set under context/ from a filled
116
150
  aios-intake.md (the deterministic half of the onboard skill;
117
151
  workspace installs only). Idempotent.
@@ -120,7 +154,7 @@ Profiles: --project (default, the dev pipeline + intelligence + enforcement) |
120
154
  --workspace (adds the operator layer: onboard/audit/level-up + intake + decisions log +
121
155
  connections registry).`;
122
156
 
123
- function main(argv) {
157
+ async function main(argv) {
124
158
  const args = parseArgs(argv);
125
159
 
126
160
  if (args.flags.version) {
@@ -294,6 +328,43 @@ function main(argv) {
294
328
  return 0;
295
329
  }
296
330
 
331
+ if (cmd === 'convert') {
332
+ const file = args._[1];
333
+ if (!file) { process.stderr.write('wrxn: convert requires <file>\n'); return 2; }
334
+ try {
335
+ const md = await convert(path.resolve(file), { gpu: args.flags.cpu ? false : undefined });
336
+ process.stdout.write(md.endsWith('\n') ? md : md + '\n');
337
+ return 0;
338
+ } catch (err) {
339
+ process.stderr.write(`wrxn: ${err.message}\n`);
340
+ return 2;
341
+ }
342
+ }
343
+
344
+ if (cmd === 'ingest') {
345
+ const file = args._[1];
346
+ if (!file) { process.stderr.write('wrxn: ingest requires <file>\n'); return 2; }
347
+ const root = path.resolve(args.flags.root || process.cwd());
348
+ // The distillation is the LLM step (the `ingest` skill). The CLI feeds its structured result via
349
+ // --distillation <result.json>; without one, the harness's defaultDistill points back to the skill.
350
+ let distill;
351
+ if (args.flags.distillation) {
352
+ const dpath = path.resolve(args.flags.distillation);
353
+ distill = () => JSON.parse(fs.readFileSync(dpath, 'utf8'));
354
+ }
355
+ try {
356
+ const report = await ingest(path.resolve(file), { root, ...(distill ? { distill } : {}) });
357
+ process.stdout.write(`wrxn ingest ${report.source} → raw ${report.raw}\n`);
358
+ for (const p of report.written) process.stdout.write(` wrote ${p}\n`);
359
+ for (const p of report.skipped) process.stdout.write(` skipped ${p} (exists — additive-only, never clobbered)\n`);
360
+ process.stdout.write(`${report.written.length} written, ${report.skipped.length} skipped.\n`);
361
+ return 0;
362
+ } catch (err) {
363
+ process.stderr.write(`wrxn: ${err.message}\n`);
364
+ return 2;
365
+ }
366
+ }
367
+
297
368
  if (cmd === 'onboard') {
298
369
  const root = path.resolve(args.flags.root || process.cwd());
299
370
  let report;
@@ -353,6 +424,32 @@ function main(argv) {
353
424
  }
354
425
  }
355
426
 
427
+ if (cmd === 'brain') {
428
+ const sub = args._[1];
429
+ if (sub !== 'query') {
430
+ process.stderr.write(`wrxn: unknown brain subcommand "${sub || ''}" (expected: query)\n\n${USAGE}\n`);
431
+ return 2;
432
+ }
433
+ const q = args._[2];
434
+ if (!q) { process.stderr.write('wrxn: brain query requires "<query>"\n'); return 2; }
435
+ const opts = { json: !!args.flags.json, neighbors: !!args.flags.neighbors };
436
+ if (args.flags.limit != null) {
437
+ const n = parseInt(args.flags.limit, 10);
438
+ if (!Number.isInteger(n) || n <= 0) { process.stderr.write('wrxn: --limit requires a positive integer\n'); return 2; }
439
+ opts.limit = n;
440
+ }
441
+ if (args.flags.type) opts.type = String(args.flags.type);
442
+ const root = path.resolve(args.flags.root || process.cwd());
443
+ try {
444
+ const res = await brain.query(q, opts, { root });
445
+ process.stdout.write(brain.formatHits(res.hits, opts) + '\n');
446
+ return 0;
447
+ } catch (err) {
448
+ process.stderr.write(`wrxn: ${err.message}\n`);
449
+ return 2;
450
+ }
451
+ }
452
+
356
453
  if (cmd === 'statusline') {
357
454
  const home = process.env.HOME || os.homedir();
358
455
  const detection = statusline.detectStatusLine(home);
@@ -397,4 +494,7 @@ function main(argv) {
397
494
  return 2;
398
495
  }
399
496
 
400
- process.exit(main(process.argv.slice(2)));
497
+ main(process.argv.slice(2)).then(
498
+ (code) => process.exit(code),
499
+ (err) => { process.stderr.write(`wrxn: ${err && err.message ? err.message : err}\n`); process.exit(1); }
500
+ );
package/lib/brain.cjs ADDED
@@ -0,0 +1,295 @@
1
+ 'use strict';
2
+
3
+ // WRXN brain query (recon-brain-recall-03) — interrogate the warm Brain from the terminal.
4
+ //
5
+ // The Brain is recon-wrxn's unified code+prose knowledge graph, loaded WARM inside the `recon serve`
6
+ // process Claude Code boots for a session. This command reaches it over the loopback find door that
7
+ // serve announces via a discovery file — it is WHOLE-BRAIN (code AND prose, no scope filter by
8
+ // default), the operator's on-demand counterpart to the prose-only proactive Recall hook.
9
+ //
10
+ // Endpoint-first (v1): if no warm door is discoverable, we raise a clear, actionable error and the CLI
11
+ // exits non-zero — there is NO cold one-shot load (that would pay the index + embedder cost the warm
12
+ // serve already absorbs).
13
+ //
14
+ // The query path takes an INJECTED transport + endpoint reader (deps) so its behavior is unit-testable
15
+ // with no live serve — mirrors the injected-invoker seam in lib/connect.cjs and the recall hook's
16
+ // httpTransport. lib/brain.cjs is PACKAGE code (invoked via bin/wrxn.cjs), NOT payload — no manifest
17
+ // entry, consistent with lib/connect.cjs / lib/executor.cjs / lib/onboard.cjs.
18
+ //
19
+ // The discovery contract (serve-endpoint.json {pid,port}, pid-liveness) is duplicated here from the
20
+ // payload recall-surface hook ON PURPOSE: that hook must be node-stdlib-only and self-contained (it
21
+ // ships into installs without the kernel lib), so package code cannot import it. The contract is ~20
22
+ // stable lines — duplicating it across the install boundary is the same self-containment trade the
23
+ // payload hooks make for findInstallRoot.
24
+
25
+ const fs = require('fs');
26
+ const http = require('http');
27
+ const path = require('path');
28
+
29
+ const ENDPOINT_REL = path.join('.recon-wrxn', 'serve-endpoint.json');
30
+ const FIND_PATH = '/api/tools/recon_find';
31
+ const EXPLAIN_PATH = '/api/tools/recon_explain';
32
+ const TIMEOUT_MS = 5000; // generous: an interactive CLI, not the per-prompt 150ms recall budget
33
+ const MAX_RESPONSE_BYTES = 256 * 1024; // hard cap on an accumulated door response body (anti-flood)
34
+ const PROSE_TYPES = new Set(['Page', 'Section']);
35
+ const WALK_UP_LIMIT = 12;
36
+
37
+ // ── discovery (the cross-repo warm-door contract) ────────────────────────────────────
38
+
/**
 * Report whether a single process with the given pid currently exists.
 *
 * Probes with signal 0 (existence check, nothing is delivered). ESRCH means the process is gone;
 * EPERM means it exists but belongs to another user, which still counts as alive. Mirrors the
 * cross-repo discovery contract (and the recall hook).
 *
 * Only a positive integer names one process: kill(2) treats 0 and negative values as process-group /
 * broadcast targets, so probing them "succeeds" without proving anything about a serve process.
 * Those, and anything that is not an integer, are reported as not alive.
 *
 * @param {number|string} pid a process id (a string of digits is accepted for compatibility)
 * @returns {boolean} true when the process exists
 */
function pidAlive(pid) {
  const target = typeof pid === 'string' && /^\d+$/.test(pid) ? Number(pid) : pid;
  if (!Number.isInteger(target) || target <= 0) return false;
  try {
    process.kill(target, 0);
    return true;
  } catch (e) {
    return !!e && e.code === 'EPERM';
  }
}
49
+
/**
 * Decide whether a discovery file is safe to believe.
 *
 * A file planted by another user, or one that group/world can write, could redirect the door port at
 * an exfil/injection sink, so both are refused. lstat (not stat) is used so that a symlink's OWN
 * ownership and mode are judged rather than its target's. Where process.getuid does not exist the
 * owner check is skipped, but the mode check still applies. Any stat failure means "not trusted".
 *
 * @param {string} file path to the discovery file
 * @returns {boolean} true only when the file is owned by this user (where checkable) and not
 *   group/world-writable
 */
function endpointTrusted(file) {
  let info;
  try {
    info = fs.lstatSync(file);
  } catch {
    return false;
  }
  const canCheckOwner = typeof process.getuid === 'function';
  const foreignOwner = canCheckOwner && info.uid !== process.getuid();
  const looselyWritable = (info.mode & 0o022) !== 0; // group/world write bits
  return !(foreignOwner || looselyWritable);
}
65
+
/**
 * Locate the warm Brain door by walking up from startDir.
 *
 * Stops at the first directory that carries .recon-wrxn/serve-endpoint.json (at most WALK_UP_LIMIT
 * levels) and accepts it only when the file is trusted (see endpointTrusted), parses as JSON, holds a
 * positive integer pid, holds a port in the valid TCP range 1-65535, and the pid is alive. The first
 * discovery file found is decisive: if it fails any check the Brain is reported as not warm rather
 * than continuing the walk.
 *
 * @param {string} [startDir] directory to start from (defaults to process.cwd())
 * @returns {{pid: number, port: number, root: string}|null} the door, or null when the Brain is not
 *   warm (absent, untrusted, malformed, missing/out-of-range fields, or a dead process)
 */
function discoverEndpoint(startDir) {
  const MAX_PORT = 65535; // anything above this can never be a listening TCP port
  let dir = startDir || process.cwd();
  for (let i = 0; i < WALK_UP_LIMIT; i++) {
    const file = path.join(dir, ENDPOINT_REL);
    if (fs.existsSync(file)) {
      if (!endpointTrusted(file)) return null; // foreign-owned or loose perms → not warm
      let obj;
      try {
        obj = JSON.parse(fs.readFileSync(file, 'utf8'));
      } catch {
        return null; // unreadable or malformed → not warm
      }
      const pid = Number(obj && obj.pid);
      const port = Number(obj && obj.port);
      if (!Number.isInteger(pid) || pid <= 0) return null;
      if (!Number.isInteger(port) || port <= 0 || port > MAX_PORT) return null;
      if (!pidAlive(pid)) return null; // dead process → not warm
      return { pid, port, root: dir };
    }
    const up = path.dirname(dir);
    if (up === dir) break; // reached the filesystem root
    dir = up;
  }
  return null;
}
95
+
96
+ // ── transport (injectable; default = real loopback POST) ─────────────────────────────
97
+
/**
 * Default transport: one real loopback POST to 127.0.0.1 with a hard deadline.
 *
 * Injectable so unit tests never touch the network (mirrors lib/connect.cjs's invoke seam). The
 * response body is accumulated up to MAX_RESPONSE_BYTES; going over destroys the request.
 *
 * @param {{port: number, path: string, body: *, timeoutMs?: number}} request `body` is JSON-encoded;
 *   `timeoutMs` falls back to TIMEOUT_MS
 * @returns {Promise<{statusCode: number, body: string}>} resolves with the raw UTF-8 body; rejects on
 *   a socket error, an oversized response, or a timeout
 */
function httpTransport({ port, path: reqPath, body, timeoutMs }) {
  return new Promise((resolve, reject) => {
    // Encoding happens inside the executor so a non-serializable body becomes a rejection.
    const payload = Buffer.from(JSON.stringify(body));
    const deadline = timeoutMs || TIMEOUT_MS;
    let finished = false;
    let wallTimer = null;

    // First settlement wins; later events are ignored and the wall-clock timer is released.
    function finish(settle, value) {
      if (finished) return;
      finished = true;
      if (wallTimer) clearTimeout(wallTimer);
      settle(value);
    }

    const requestOptions = {
      host: '127.0.0.1',
      port,
      path: reqPath,
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': payload.length },
    };

    const req = http.request(requestOptions, (res) => {
      const parts = [];
      let received = 0;
      res.on('data', (chunk) => {
        received += chunk.length;
        if (received > MAX_RESPONSE_BYTES) {
          req.destroy(new Error('brain door response too large'));
          return;
        }
        parts.push(chunk);
      });
      res.on('end', () => {
        finish(resolve, { statusCode: res.statusCode, body: Buffer.concat(parts).toString('utf8') });
      });
      res.on('error', (e) => finish(reject, e));
    });

    req.on('error', (e) => finish(reject, e));
    // Two guards: an idle timeout (no bytes for `deadline`) and an independent wall-clock timer. The
    // wall clock bounds a peer that dribbles bytes just often enough to keep the idle timer quiet.
    req.setTimeout(deadline, () => req.destroy(new Error('brain door timeout')));
    wallTimer = setTimeout(() => req.destroy(new Error('brain door wall-clock timeout')), deadline);
    req.write(payload);
    req.end();
  });
}
142
+
/**
 * POST one door tool and hand back its parsed JSON body.
 *
 * Every failure mode is turned into a plain Error with a readable message: a transport fault, a
 * missing response or non-200 status, and a body that is not valid JSON.
 *
 * @param {Function} transport async ({port, path, body, timeoutMs}) => {statusCode, body}
 * @param {number} port door port
 * @param {string} reqPath tool path on the door
 * @param {object} body request payload
 * @param {number} [timeoutMs] per-request deadline; falls back to TIMEOUT_MS
 * @returns {Promise<*>} whatever the response body parses to
 */
async function postTool(transport, port, reqPath, body, timeoutMs) {
  let reply;
  try {
    reply = await transport({ port, path: reqPath, body, timeoutMs: timeoutMs || TIMEOUT_MS });
  } catch (fault) {
    throw new Error(`Brain door request to ${reqPath} failed: ${fault.message}`);
  }

  const succeeded = Boolean(reply) && reply.statusCode === 200;
  if (!succeeded) {
    throw new Error(`Brain door returned HTTP ${reply ? reply.statusCode : 'no-response'} for ${reqPath}`);
  }

  let parsed;
  try {
    parsed = JSON.parse(reply.body);
  } catch {
    throw new Error(`Brain door returned a malformed (non-JSON) response for ${reqPath}`);
  }
  return parsed;
}
161
+
162
+ // ── pure helpers ─────────────────────────────────────────────────────────────────────
163
+
/**
 * True when a hit is a prose node, i.e. its `type` is one of PROSE_TYPES.
 * A missing (falsy) hit is never prose.
 */
function isProse(hit) {
  if (!hit) return false;
  return PROSE_TYPES.has(hit.type);
}
167
+
/**
 * Post-filter hits by the --type selector.
 *
 * The find request cannot carry a type ARRAY, so prose (Page + Section) is always a post-filter.
 * The selector is matched case-insensitively throughout, so `Prose` / `CODE` behave like the
 * lower-case keywords, just as `page` already matched the NodeType `Page`:
 *   - 'prose' → hits for which isProse is true
 *   - 'code'  → every other hit
 *   - anything else → hits whose `type` equals the selector
 *
 * @param {object[]} hits structured find hits
 * @param {string} [type] the selector; falsy means "no filter"
 * @returns {object[]} the same array when no selector is given, otherwise a filtered copy
 */
function filterByType(hits, type) {
  if (!type) return hits;
  const wanted = String(type).toLowerCase();
  if (wanted === 'prose') return hits.filter((h) => isProse(h));
  if (wanted === 'code') return hits.filter((h) => !isProse(h));
  return hits.filter((h) => String(h && h.type).toLowerCase() === wanted);
}
177
+
/**
 * Pull a hit's 1-hop neighbors out of the door's structured recon_explain response:
 *   { result, neighbors: NeighborHit[] }, NeighborHit = { name, type, file, line, relationship }
 *   relationship ∈ caller | callee | import | importedBy | method | implementedBy | usedBy | testRef
 *
 * Strictly 1-hop, and the shape is consumed as-is (no relationship-bucket guesswork). Each neighbor
 * is reduced to name/type/file, plus `line` when it is not null/undefined and `relationship` when it
 * is truthy. A response without an array `neighbors` (e.g. a degraded or empty explain) gives [].
 *
 * @param {*} resp the parsed recon_explain response
 * @returns {object[]} the normalized neighbors
 */
function extractNeighbors(resp) {
  const hasList = Boolean(resp) && typeof resp === 'object' && Array.isArray(resp.neighbors);
  if (!hasList) return [];
  return resp.neighbors.map((entry) => {
    const { name, type, file, line, relationship } = entry || {};
    return {
      name,
      type,
      file,
      ...(line != null ? { line } : {}),
      ...(relationship ? { relationship } : {}),
    };
  });
}
193
+
194
+ // ── formatting (pure) ────────────────────────────────────────────────────────────────
195
+
/**
 * Render one hit as `name · type · file:line`.
 * A missing name shows as `(unnamed)` and a missing type as `?`. The location segment is dropped when
 * there is no file, and `:line` is dropped when line is null/undefined (line 0 is kept).
 */
function hitLine(h) {
  const segments = [h.name || '(unnamed)', h.type || '?'];
  let where = '';
  if (h.file) {
    const lineSuffix = h.line != null ? ':' + h.line : '';
    where = `${h.file}${lineSuffix}`;
  }
  if (where) segments.push(where);
  return segments.join(' · ');
}
202
+
/**
 * Render one neighbor as an indented bullet: the hitLine text, followed by ` [relationship]` when the
 * neighbor carries a relationship.
 */
function neighborLine(n) {
  let text = ` - ${hitLine(n)}`;
  if (n.relationship) text += ` [${n.relationship}]`;
  return text;
}
207
+
/**
 * Render query results for the terminal.
 *
 * With opts.json the structured hits are re-emitted as pretty-printed JSON. Otherwise the output is a
 * text list, one hitLine per hit, or `no results` for an empty list. With opts.neighbors each hit is
 * followed by its 1-hop neighbors, or by a placeholder line when it has none.
 *
 * @param {object[]} hits structured hits; anything that is not an array is treated as empty
 * @param {{json?: boolean, neighbors?: boolean}} [opts]
 * @returns {string} the rendered text (no trailing newline)
 */
function formatHits(hits, opts = {}) {
  const list = Array.isArray(hits) ? hits : [];
  if (opts.json) return JSON.stringify(list, null, 2);
  if (list.length === 0) return 'no results';

  const out = [];
  for (let i = 0; i < list.length; i++) {
    const hit = list[i];
    out.push(hitLine(hit));
    if (!opts.neighbors) continue;
    const around = Array.isArray(hit.neighbors) ? hit.neighbors : [];
    if (around.length === 0) {
      out.push(' (no 1-hop neighbors)');
      continue;
    }
    for (const neighbor of around) out.push(neighborLine(neighbor));
  }
  return out.join('\n');
}
225
+
226
+ // ── the query (IO shell over the injected seam) ──────────────────────────────────────
227
+
// Raised by query() whenever discovery yields no usable door. discoverEndpoint returns null for an
// absent, untrusted, malformed, or dead-pid discovery file, so the message names every one of those
// causes — a group-writable file would otherwise be reported as a mystery "not warm".
const NOT_WARM =
  'Brain is not warm — no live recon serve door found (.recon-wrxn/serve-endpoint.json is absent, ' +
  'untrusted (owned by another user, or group/world-writable), malformed, or its process is dead). ' +
  'Open a Claude Code session (which boots recon serve), or run ' +
  '`recon serve` with the find door enabled, then retry.';
232
+
/**
 * Query the warm Brain. Whole-brain (code+prose) by default.
 *
 * Flow: discover the door → POST recon_find → post-filter by opts.type → (with opts.neighbors) one
 * recon_explain POST per remaining hit, run sequentially.
 *
 * @param {string} q the query string
 * @param {object} opts { json?, limit?, type?, neighbors? } — `limit` is sent to the door only when
 *   it is a positive integer; `json` is a rendering concern and is not read here
 * @param {object} deps { root?, discover?, transport?, timeoutMs? } — injected seam for tests
 * @returns {Promise<{hits: object[]}>} the hits; with opts.neighbors each carries a `neighbors` array
 * @throws a clear error when the query is empty, the Brain is not warm, or the door response is
 *   malformed, non-200, or lacks a structured `hits` array (including a JSON `null` body).
 */
async function query(q, opts = {}, deps = {}) {
  const term = String(q == null ? '' : q).trim();
  if (!term) throw new Error('wrxn brain query requires a non-empty query string');

  const startDir = deps.root || process.env.CLAUDE_PROJECT_DIR || process.cwd();
  const discover = deps.discover || discoverEndpoint;
  const transport = deps.transport || httpTransport;
  const timeoutMs = deps.timeoutMs || TIMEOUT_MS;

  const door = discover(startDir);
  if (!door) throw new Error(NOT_WARM);

  const findBody = { query: term };
  if (Number.isInteger(opts.limit) && opts.limit > 0) findBody.limit = opts.limit;

  const found = await postTool(transport, door.port, FIND_PATH, findBody, timeoutMs);
  // The body is valid JSON at this point but may still be `null` or a bare scalar — check the
  // container before reaching into it so the caller gets the shape error, not a raw TypeError.
  if (!found || typeof found !== 'object' || !Array.isArray(found.hits)) {
    throw new Error(
      'Brain door returned an unexpected response shape (no structured `hits` array) — the recon-wrxn ' +
        'serve door may predate the structured find response.'
    );
  }

  // Keep only object hits: a stray null/scalar entry cannot be expanded or rendered and would
  // otherwise crash the neighbor loop below (and the formatter after it).
  const records = found.hits.filter((h) => h !== null && typeof h === 'object');
  const hits = filterByType(records, opts.type);

  // --neighbors: 1-hop expansion per hit via recon_explain — the ONLY place 1-hop lives. A per-hit
  // explain failure degrades to empty neighbors (the find already succeeded); it never crashes.
  if (opts.neighbors) {
    for (const h of hits) {
      const explainBody = { name: h.name };
      if (h.file) explainBody.file = h.file;
      try {
        h.neighbors = extractNeighbors(await postTool(transport, door.port, EXPLAIN_PATH, explainBody, timeoutMs));
      } catch {
        h.neighbors = [];
      }
    }
  }

  return { hits };
}
282
+
283
+ module.exports = {
284
+ query,
285
+ formatHits,
286
+ discoverEndpoint,
287
+ pidAlive,
288
+ httpTransport,
289
+ filterByType,
290
+ extractNeighbors,
291
+ isProse,
292
+ FIND_PATH,
293
+ EXPLAIN_PATH,
294
+ PROSE_TYPES,
295
+ };
package/lib/convert.cjs ADDED
@@ -0,0 +1,215 @@
1
+ 'use strict';
2
+
3
+ // Converter primitive (multiformat-distill-05) — convert(srcPath) → Markdown, per-format routing.
4
+ //
5
+ // Decision (ADR 0001 / PRD §5, empirically baked off): markitdown is the primary subprocess for the
6
+ // office/web matrix (html/docx/pptx/xlsx); txt is a zero-dep pass-through; PDF escalates to docling
7
+ // (SOTA tables + OCR), which auto-grabs the GPU and CRASHES on arch-incompat (the GTX-1070/Pascal
8
+ // sm_61 trap — torch cu13x ships no sm_61 kernel) → we force CPU on that crash. When Python /
9
+ // markitdown is absent (ENOENT) we degrade to the pure-JS floor (turndown / mammoth / unpdf / SheetJS).
10
+ //
11
+ // The spawn boundary is INJECTED, mirroring lib/connect.cjs's injectable `invoke`: convert(src,{run})
12
+ // takes a converter runner so routing, ENOENT-degrade, and the CPU fallback are unit-testable WITHOUT
13
+ // any real binary. defaultRun does the real spawnSync — that is what makes the integration check
14
+ // "validated by invocation". convert is async only so the pure-JS floor (mammoth/unpdf are async)
15
+ // can be wired in completely; the primary subprocess path is plain blocking spawnSync.
16
+
17
+ const fs = require('fs');
18
+ const os = require('os');
19
+ const path = require('path');
20
+ const { spawnSync } = require('child_process');
21
+
22
+ // Extension → logical format. (.htm folds into html.)
23
+ const FORMATS = {
24
+ '.html': 'html',
25
+ '.htm': 'html',
26
+ '.docx': 'docx',
27
+ '.txt': 'txt',
28
+ '.pptx': 'pptx',
29
+ '.xlsx': 'xlsx',
30
+ '.pdf': 'pdf',
31
+ };
32
+
33
+ // CUDA / arch-incompat crash signatures — the Pascal sm_61 trap and friends. docling auto-grabs the
34
+ // GPU; a torch build with no matching SM kernel dies with "no kernel image is available...".
35
+ const ARCH_CRASH_RE = /no kernel image|kernel image is available|sm_\d+|CUDA error|CUDA_ERROR|device-side assert|out of memory/i;
36
+
37
+ const SPAWN_OPTS = { encoding: 'utf8', timeout: 600000, maxBuffer: 256 * 1024 * 1024 };
38
+
39
+ // ── the injected boundary's real implementation ────────────────────────────────
40
+
/**
 * The real converter boundary: spawn the named tool synchronously and normalize the outcome to
 * { ok: true, markdown } or { ok: false, error }.
 *
 * error.code is the spawn error's own code when the process could not be run (e.g. 'ENOENT' — not
 * installed → the caller degrades), 'CRASH' for docling when stderr matches ARCH_CRASH_RE or the
 * process died on a signal (→ the caller retries on CPU), and 'EXIT' for any other failed exit.
 *
 * @param {string} tool 'markitdown' or 'docling'
 * @param {string} srcPath the source file
 * @param {{device?: string}} [options] device 'cpu' makes docling run with `--device cpu` and an
 *   empty CUDA_VISIBLE_DEVICES
 * @returns {{ok: true, markdown: string}|{ok: false, error: object}}
 * @throws {Error} for an unknown tool, or when docling succeeds but leaves no markdown behind
 */
function defaultRun(tool, srcPath, { device } = {}) {
  switch (tool) {
    case 'markitdown':
      return normalize(spawnSync('markitdown', [srcPath], SPAWN_OPTS));

    case 'docling': {
      // docling writes <basename>.md into an --output dir (no markdown on stdout); read it back.
      const outDir = fs.mkdtempSync(path.join(os.tmpdir(), 'wrxn-docling-'));
      try {
        const forceCpu = device === 'cpu';
        const cliArgs = [srcPath, '--to', 'md', '--output', outDir];
        const spawnOpts = { ...SPAWN_OPTS };
        if (forceCpu) {
          cliArgs.push('--device', 'cpu');
          spawnOpts.env = { ...process.env, CUDA_VISIBLE_DEVICES: '' };
        }

        const proc = spawnSync('docling', cliArgs, spawnOpts);
        if (proc.error) return { ok: false, error: classifyError(proc.error) };

        const failed = proc.status !== 0 || proc.signal;
        if (failed) {
          const stderr = proc.stderr || '';
          const crashed = ARCH_CRASH_RE.test(stderr) || proc.signal;
          return {
            ok: false,
            error: {
              code: crashed ? 'CRASH' : 'EXIT',
              status: proc.status,
              signal: proc.signal,
              message: stderr.trim(),
            },
          };
        }
        return { ok: true, markdown: readDoclingOutput(outDir, srcPath) };
      } finally {
        // The scratch directory is removed on every path, including a throw from the read-back.
        fs.rmSync(outDir, { recursive: true, force: true });
      }
    }

    default:
      throw new Error(`unknown converter tool: ${tool}`);
  }
}
74
+
/**
 * Turn a spawnSync result for a stdout-producing converter into the boundary's result shape:
 * a spawn error is classified, a clean exit (status 0, no signal) yields its stdout as markdown, and
 * anything else becomes an 'EXIT' error carrying status, signal and the trimmed stderr.
 */
function normalize(r) {
  if (r.error) return { ok: false, error: classifyError(r.error) };

  const exitedCleanly = r.status === 0 && !r.signal;
  if (exitedCleanly) return { ok: true, markdown: r.stdout };

  const message = (r.stderr || '').trim();
  return { ok: false, error: { code: 'EXIT', status: r.status, signal: r.signal, message } };
}
82
+
/**
 * Reduce a spawn error to { code, message }: the error's own code (or 'ERR' when it has none) and its
 * message (or its string form when the message is empty).
 */
function classifyError(err) {
  const code = err.code || 'ERR';
  const message = err.message || String(err);
  return { code, message };
}
86
+
/**
 * Read back the markdown docling wrote into outDir.
 *
 * Prefers `<source basename>.md`; if that file is not there, falls back to the first directory entry
 * whose name ends in `.md` (case-insensitive), since output naming can vary by docling version.
 *
 * @param {string} outDir the --output directory handed to docling
 * @param {string} srcPath the converted source (used only for its basename)
 * @returns {string} the markdown text
 * @throws {Error} when the directory holds no markdown file
 */
function readDoclingOutput(outDir, srcPath) {
  const stem = path.basename(srcPath, path.extname(srcPath));
  let target = path.join(outDir, `${stem}.md`);
  if (!fs.existsSync(target)) {
    const fallback = fs.readdirSync(outDir).find((entry) => entry.toLowerCase().endsWith('.md'));
    if (!fallback) throw new Error(`docling produced no markdown in ${outDir}`);
    target = path.join(outDir, fallback);
  }
  return fs.readFileSync(target, 'utf8');
}
96
+
97
+ // ── the pure-JS floor (no-Python degrade) ───────────────────────────────────────
98
+
/**
 * Load an optional pure-JS floor dependency on first use.
 *
 * Only a genuine "module not found" is translated into the install hint. Any other failure (for
 * example a package that is installed but cannot be loaded with require(), or one that throws while
 * loading) is rethrown untouched, because telling the user to install something they already have
 * would hide the real problem. The original error is kept as `cause` on the translated one.
 *
 * @param {string} mod the package name
 * @returns {*} the loaded module
 * @throws {Error} the install hint when the module cannot be found, otherwise the original error
 */
function lazy(mod) {
  try {
    return require(mod);
  } catch (err) {
    if (!err || err.code !== 'MODULE_NOT_FOUND') throw err;
    throw new Error(
      `pure-JS floor needs "${mod}" but it is not installed, and the primary converter is absent. ` +
        `Install the primary path (pip install 'markitdown[all]' / docling) or the floor (npm i ${mod}).`,
      { cause: err }
    );
  }
}
109
+
/**
 * The no-Python, in-process floor (research §2: turndown / mammoth / unpdf / SheetJS). Async.
 *
 * Per format: txt is read as-is; html goes through turndown (with the gfm plugin when it can be
 * loaded); docx goes mammoth → HTML → turndown; pdf is text-extracted with unpdf (pages merged);
 * xlsx becomes one `## <sheet name>` section per sheet holding that sheet's CSV; pptx is handed to
 * officeparser. Each third-party package is loaded on demand through lazy().
 *
 * @param {string} fmt logical format
 * @param {string} srcPath the source file
 * @returns {Promise<string>} the converted text
 * @throws {Error} for a format with no floor, or when a needed package cannot be loaded
 */
async function defaultFloor(fmt, srcPath) {
  switch (fmt) {
    case 'txt':
      return fs.readFileSync(srcPath, 'utf8');

    case 'html': {
      const Turndown = lazy('turndown');
      const service = new Turndown();
      try {
        const { gfm } = require('turndown-plugin-gfm');
        service.use(gfm);
      } catch { /* gfm tables are a nice-to-have, not required */ }
      const html = fs.readFileSync(srcPath, 'utf8');
      return service.turndown(html);
    }

    case 'docx': {
      const mammoth = lazy('mammoth');
      const Turndown = lazy('turndown');
      const converted = await mammoth.convertToHtml({ path: srcPath });
      return new Turndown().turndown(converted.value);
    }

    case 'pdf': {
      const { extractText, getDocumentProxy } = lazy('unpdf');
      const bytes = new Uint8Array(fs.readFileSync(srcPath));
      const doc = await getDocumentProxy(bytes);
      const extracted = await extractText(doc, { mergePages: true });
      return extracted.text;
    }

    case 'xlsx': {
      const XLSX = lazy('xlsx');
      const workbook = XLSX.readFile(srcPath);
      const sections = workbook.SheetNames.map((sheetName) => {
        const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[sheetName]);
        return `## ${sheetName}\n\n${csv}`;
      });
      return sections.join('\n\n');
    }

    case 'pptx': {
      const officeParser = lazy('officeparser');
      return await officeParser.parseOfficeAsync(srcPath);
    }

    default:
      throw new Error(`no pure-JS floor for format "${fmt}"`);
  }
}
146
+
147
+ // ── the primitive ───────────────────────────────────────────────────────────────
148
+
/**
 * Convert a source file to Markdown via per-format routing.
 *
 * Routing: txt is returned as-is; pdf is delegated to convertPdf; html/docx/pptx/xlsx go to
 * markitdown through `run`, degrading to `floor` only when markitdown is absent (ENOENT).
 *
 * @param {string} srcPath
 * @param {{ run?: Function, floor?: Function, gpu?: boolean }} [opts]
 *   run   — injectable converter boundary (default: defaultRun, the real spawnSync).
 *   floor — injectable pure-JS floor (default: defaultFloor).
 *   gpu   — false forces docling onto CPU from the first attempt (skips the GPU probe/crash).
 * @returns {Promise<string>} the markdown.
 * @throws {Error} when the source is missing, the extension is unsupported, or the converter fails.
 */
async function convert(srcPath, { run = defaultRun, floor = defaultFloor, gpu } = {}) {
  // Resolve to an absolute path up front so a leading-dash filename can never be read as a CLI flag
  // by the converter subprocess — the dash-neutralization must not depend on the caller (slice-06
  // ingest calls convert() directly, not via the CLI).
  const absPath = path.resolve(srcPath);
  // Pre-check existence up front (mirrors lib/ingest.cjs's source-not-found guard) so a missing file
  // is rejected with a clean message and NEVER reaches the converter subprocess — whose Python
  // traceback (markitdown/docling) would otherwise leak to the user verbatim (multiformat-distill-08).
  if (!fs.existsSync(absPath)) throw new Error(`wrxn convert: source not found: ${absPath}`);

  const ext = path.extname(absPath).toLowerCase();
  const fmt = FORMATS[ext];
  if (!fmt) {
    throw new Error(`wrxn convert: unsupported format "${ext || '(none)'}" — supported: ${Object.keys(FORMATS).join(', ')}`);
  }

  // txt is already plain text — pass it through (zero-dep, always works).
  if (fmt === 'txt') {
    return fs.readFileSync(absPath, 'utf8');
  }

  if (fmt === 'pdf') {
    return convertPdf(absPath, { run, floor, gpu });
  }

  // markitdown-primary formats (html/docx/pptx/xlsx).
  const r = run('markitdown', absPath);
  if (r.ok) return r.markdown;
  // A failed result is expected to carry `error`, but an injected runner may omit it; read it
  // defensively so the caller still gets the "markitdown failed" message instead of a TypeError.
  const failure = r.error || {};
  if (failure.code === 'ENOENT') {
    return floor(fmt, absPath); // markitdown absent → degrade to the pure-JS floor
  }
  throw new Error(
    `wrxn convert: markitdown failed on ${path.basename(absPath)} — ${failure.message || failure.code || 'unknown error'}`
  );
}
190
+
/**
 * PDF tier: docling (GPU/auto) → CPU on an arch-crash → pure-JS floor if docling is absent.
 *
 * The CPU retry happens only when the first attempt was NOT already on CPU and failed with code
 * 'CRASH'; any other failure is reported as-is. ENOENT at either attempt degrades to the floor.
 *
 * @param {string} srcPath the PDF to convert
 * @param {{ run: Function, floor: Function, gpu?: boolean }} opts gpu === false starts on CPU
 * @returns {Promise<string>} the markdown
 * @throws {Error} when docling fails and no fallback applies
 */
async function convertPdf(srcPath, { run, floor, gpu }) {
  const firstDevice = gpu === false ? 'cpu' : undefined; // undefined = let docling pick (GPU/auto)
  const first = run('docling', srcPath, { device: firstDevice });
  if (first.ok) return first.markdown;

  // A failed result is expected to carry `error`, but an injected runner may omit it; read it
  // defensively so the caller still gets a "docling failed" message instead of a TypeError.
  const firstErr = first.error || {};
  if (firstErr.code === 'ENOENT') {
    return floor('pdf', srcPath); // no docling → unpdf floor
  }

  const fileName = path.basename(srcPath);
  if (firstErr.code === 'CRASH' && firstDevice !== 'cpu') {
    // arch-incompat / GPU crash → force CPU (CUDA_VISIBLE_DEVICES='' + --device cpu).
    const cpu = run('docling', srcPath, { device: 'cpu' });
    if (cpu.ok) return cpu.markdown;
    const cpuErr = cpu.error || {};
    if (cpuErr.code === 'ENOENT') return floor('pdf', srcPath);
    throw new Error(
      `wrxn convert: docling failed on the CPU fallback for ${fileName} — ${cpuErr.message || cpuErr.code || 'unknown error'}`
    );
  }
  throw new Error(`wrxn convert: docling failed on ${fileName} — ${firstErr.message || firstErr.code || 'unknown error'}`);
}
208
+
209
+ module.exports = {
210
+ convert,
211
+ defaultRun,
212
+ defaultFloor,
213
+ FORMATS,
214
+ ARCH_CRASH_RE,
215
+ };