rewritable 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ // Consumer-side static self-description for `self-description/1` — the answer to
2
+ // "what is this rewritable, and what can be done with it?" computed from the
3
+ // file BYTES, without executing the container's JS.
4
+ // Contract + reference: docs/specs/rwa-self-description-spec.md,
5
+ // tools/self-description.mjs (computeSelfDescription / validateSelfDescription).
6
+ //
7
+ // This is a PUBLISH-SAFE MIRROR of the reference's static computer. The CLI is a
8
+ // standalone npm package and cannot reach repo-root tools/ at runtime, so the
9
+ // kind→provider table, the substrate baseline, the title/blocks extraction, and
10
+ // the assembled object are duplicated here — the same pattern as
11
+ // cli/src/apply-edits.mjs mirroring the seed. The mirror is pinned to the single
12
+ // source by tests/identity.test.mjs (KIND_PROVIDERS / SUBSTRATE_BASELINE deep-equal
13
+ // the reference; the full assembled object deep-equals computeSelfDescription in
14
+ // doc.test.mjs). Drift fails loudly. KEEP IN STEP with tools/self-description.mjs.
15
+
16
+ import { tagHasFrozenAttr } from './apply-edits.mjs';
17
+ import { parseSkillZone } from './skill-manifest.mjs';
18
+
19
+ export const SCHEMA_TAG = 'self-description/1';
20
+ // Mirror of tools/self-description.mjs AFFORDANCE_KINDS / PROVENANCES — used by the
21
+ // declared-projection conformance gate (declaredIsConforming). Keep in step.
22
+ export const AFFORDANCE_KINDS = ['view', 'edit-surface', 'tool', 'compute', 'hook'];
23
+ export const PROVENANCES = ['first-party', 'installed'];
24
+
25
+ // kind -> registered provider bundle (spec §4). Each provider is {kind,name,label};
26
+ // `provenance:'first-party'` is added per emit (bootstrap-resident providers).
27
+ // The presentation entry mirrors the seed presentationProvider {name:'presentation',
28
+ // label:'Present'} (seeds/rewritable.html:3542-3543) so static == live by construction.
29
+ // ONLY kinds the runtime FIRST-PARTY-provides — custom kinds (datatable, …) are
30
+ // consumer-built via provide()/the declaration, so their honest static answer is
31
+ // [] (declared > static supplies the real affordances when a declaration exists).
32
+ export const KIND_PROVIDERS = {
33
+ document: [],
34
+ presentation: [{ kind: 'view', name: 'presentation', label: 'Present' }],
35
+ workflow: [],
36
+ // skill-host: no first-party affordances; installed skills (provenance:'installed')
37
+ // come from parseSkillZone (§8), not this table. Explicit [] mirrors the oracle.
38
+ 'skill-host': [],
39
+ };
40
+
41
+ // Substrate-universal ops — the SAME for every container regardless of kind. The
42
+ // "what can be done with me" data that is NOT an affordance (affordances stay
43
+ // kernel-pure: a base document is []). `history` is undo-only — there is no redo
44
+ // (re-write-able-spec Invariant 7).
45
+ export const SUBSTRATE_BASELINE = Object.freeze({
46
+ edit: ['lens'],
47
+ tools: ['apply_dsl_plan', 'apply_edits', 'replace_document'],
48
+ export: ['html', 'print'],
49
+ history: ['undo'],
50
+ });
51
+
52
/**
 * Human-readable title of a document: the visible text of its first <h1>.
 *
 * Nested markup inside the heading is dropped and runs of whitespace collapse
 * to single spaces. Kept in step with the reference `staticTitle` so both
 * sides report the same title.
 *
 * @param {string} doc — the LF-canonical editable body
 * @returns {string|null} the title, or null when there is no <h1> or it is empty
 */
export function extractTitle(doc) {
  const source = doc || '';
  const heading = source.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (heading === null) {
    return null;
  }
  const withoutTags = heading[1].replace(/<[^>]*>/g, '');
  const collapsed = withoutTags.replace(/\s+/g, ' ').trim();
  return collapsed === '' ? null : collapsed;
}
64
+
65
/**
 * Number of `data-rwa-id` occurrences in the body — a coarse signal of how
 * structured (block-addressable) the document is.
 *
 * @param {string} doc — the LF-canonical editable body
 * @returns {number} occurrence count (0 for an empty or missing body)
 */
export function countBlocks(doc) {
  const hits = (doc || '').match(/\bdata-rwa-id\b/g);
  return hits === null ? 0 : hits.length;
}
73
+
74
/**
 * Assemble the STATIC self-description projection from a container's already-
 * extracted facts (so the caller parses the file once).
 *
 * Affordances are the first-party providers registered for `kind` (each tagged
 * `provenance: 'first-party'`) followed by whatever `parseSkillZone(doc)` yields
 * for the document body.
 *
 * @param {{doc:string, uuid:string|null, kind:string, frozenZones:string[]}} facts
 * @returns {object} a `source:'static'` self-description/1 object (spec §2)
 */
export function buildSelfDescription({ doc, uuid, kind, frozenZones }) {
  // `kind` comes from file bytes. Only OWN keys of the table count, so a kind
  // such as "constructor", "toString" or "__proto__" resolves to "no providers"
  // instead of an inherited Object.prototype member (which crashed on `.map`).
  const registered = Object.prototype.hasOwnProperty.call(KIND_PROVIDERS, kind)
    ? KIND_PROVIDERS[kind]
    : [];
  const affordances = [
    ...registered.map((p) => ({ ...p, provenance: 'first-party' })),
    ...parseSkillZone(doc),
  ];

  // Copy the inner arrays too: a shallow spread would hand callers the module
  // constant's own arrays, letting one caller's mutation leak into every later
  // self-description.
  const baseline = Object.fromEntries(
    Object.entries(SUBSTRATE_BASELINE).map(([op, names]) => [op, [...names]]),
  );

  return {
    rwa: SCHEMA_TAG,
    source: 'static',
    uuid,
    kind,
    title: extractTitle(doc),
    blocks: countBlocks(doc),
    affordances,
    frozenZones,
    baseline,
  };
}
101
+
102
+ // ── The `declared` projection (v1.1, spec §3.1) ───────────────────────────────
103
+ // A custom-affordance file (a datatable the kind table can only GUESS for) may
104
+ // carry its own answer: an inert `<script id="rwa-affordances">` block with a
105
+ // `source:"declared"` self-description. The reader prefers it (declared > static)
106
+ // only if it is TRUSTWORTHY — edit-unreachable so the lens/agent can't have
107
+ // drifted it. Mirror of tools/self-description.mjs DECL_RE / parseDeclaration /
108
+ // declarationFacts (publish-safe; the CLI can't reach repo-root tools/ at runtime).
109
+ // The oracle takes only fileText and extractInlineDoc's it; the CLI passes the
110
+ // already-extracted `doc` (== extractInlineDoc(fileText)) so the two agree.
111
+ // KEEP IN STEP with tools/self-description.mjs.
112
const DECL_RE = /<script\b[^>]*\bid=["']rwa-affordances["'][^>]*>([\s\S]*?)<\/script\s*>/i;

// Decide WHERE to look for the declaration. A body declaration lives inside
// INLINE_DOC, so it shows up in `doc`; a chrome declaration (outside INLINE_DOC)
// is only visible in the raw file text. The returned flag tells the reader which
// haystack was chosen so it can judge edit-reachability.
function declarationLocus(fileText, doc) {
  const bodyCarriesDeclaration = Boolean(doc) && DECL_RE.test(doc);
  return bodyCarriesDeclaration
    ? { hay: doc, inEditableBody: true }
    : { hay: fileText, inEditableBody: false };
}
121
+
122
/**
 * Pull the embedded #rwa-affordances declaration out of the file, if present.
 *
 * - no declaration block: every field is null
 * - block present but not JSON: `declaration` is null, `raw` holds the block
 *   text and `error` describes the parse failure
 * - block present and parseable: `declaration` is the parsed value
 *
 * @param {string} fileText — the raw file text
 * @param {string} doc — the already-extracted editable body
 * @returns {{ declaration: object|null, raw: string|null, error: string|null }}
 */
export function parseDeclaration(fileText, doc) {
  const { hay } = declarationLocus(fileText, doc);
  const found = hay.match(DECL_RE);
  if (found === null) {
    return { declaration: null, raw: null, error: null };
  }

  const raw = found[1];
  let declaration;
  try {
    declaration = JSON.parse(raw);
  } catch (e) {
    return { declaration: null, raw, error: 'invalid JSON: ' + (e && e.message) };
  }
  return { declaration, raw, error: null };
}
135
+
136
/**
 * Edit-reachability facts for the declaration (spec §3.1). The reader treats a
 * declaration as trustworthy iff it is NOT in the editable body (chrome) OR its
 * opening tag carries the data-rwa-frozen attribute. `frozenZones` is not
 * consulted here.
 *
 * @param {string} fileText — the raw file text
 * @param {string} doc — the already-extracted editable body
 * @returns {{ found: boolean, inEditableBody: boolean, frozenAttr: boolean }}
 */
export function declarationFacts(fileText, doc) {
  const locus = declarationLocus(fileText, doc);
  const found = locus.hay.match(DECL_RE);
  if (found === null) {
    return { found: false, inEditableBody: false, frozenAttr: false };
  }

  // Only the opening <script ...> tag is inspected. tagHasFrozenAttr decides
  // whether data-rwa-frozen is a real attribute name there (not a value mention
  // or a longer name), so the CLI does not over-trust a declaration.
  const wholeElement = found[0];
  const openTag = wholeElement.slice(0, wholeElement.indexOf('>') + 1);
  return {
    found: true,
    inEditableBody: locus.inEditableBody,
    frozenAttr: tagHasFrozenAttr(openTag),
  };
}
153
+
154
+ export const SOURCES = ['static', 'live', 'declared'];
155
+ const isStrArray = (v) => Array.isArray(v) && v.every((x) => typeof x === 'string');
156
+
157
/**
 * Check a self-description/1 object against the §2/§3.1 schema.
 *
 * Publish-safe mirror of the reference validator: it lets the reader guarantee
 * it never emits a non-conforming declared answer without importing repo-root
 * tools/ at runtime. KEEP IN STEP with tools/self-description.mjs — the error
 * strings and their order are part of the mirrored behaviour.
 *
 * @param {*} obj — candidate self-description
 * @returns {{ valid: boolean, errors: string[] }} `valid` is true iff `errors` is empty
 */
export function validateSelfDescription(obj) {
  const isRecord = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
  const isNullableString = (v) => v === null || typeof v === 'string';

  if (!isRecord(obj)) {
    return { valid: false, errors: ['not an object'] };
  }

  const errors = [];
  const declared = obj.source === 'declared';

  // Discriminator and projection source.
  if (obj.rwa !== SCHEMA_TAG) {
    errors.push(`rwa must be "${SCHEMA_TAG}"`);
  }
  if (!SOURCES.includes(obj.source)) {
    errors.push(`source must be one of ${SOURCES.join(' | ')}`);
  }

  // uuid is a container fact the reader fills in, so a declaration may omit it;
  // every other projection must carry it.
  if (declared) {
    if ('uuid' in obj && !isNullableString(obj.uuid)) {
      errors.push('uuid, if present, must be a string or null');
    }
  } else if (!('uuid' in obj) || !isNullableString(obj.uuid)) {
    errors.push('uuid must be a string or null');
  }

  if (typeof obj.kind !== 'string' || obj.kind.length === 0) {
    errors.push('kind must be a non-empty string');
  }
  if ('title' in obj && !isNullableString(obj.title)) {
    errors.push('title must be a string or null');
  }
  if ('blocks' in obj && (typeof obj.blocks !== 'number' || !Number.isFinite(obj.blocks))) {
    errors.push('blocks must be a number');
  }

  if (Array.isArray(obj.affordances)) {
    obj.affordances.forEach((entry, i) => {
      const at = `affordances[${i}]`;
      if (!isRecord(entry)) {
        errors.push(`${at} must be an object`);
        return;
      }
      if (!AFFORDANCE_KINDS.includes(entry.kind)) errors.push(`${at}.kind unknown`);
      if (typeof entry.name !== 'string' || !entry.name) errors.push(`${at}.name must be a non-empty string`);
      if ('label' in entry && typeof entry.label !== 'string') errors.push(`${at}.label must be a string`);
      if (!PROVENANCES.includes(entry.provenance)) errors.push(`${at}.provenance must be first-party | installed`);
      for (const field of ['surface', 'target', 'output']) {
        if (field in entry && typeof entry[field] !== 'string') errors.push(`${at}.${field} must be a string`);
      }
      if ('inputs' in entry && !isStrArray(entry.inputs)) errors.push(`${at}.inputs must be an array of strings`);
      if ('verified' in entry && typeof entry.verified !== 'boolean') errors.push(`${at}.verified must be a boolean`);
    });
  } else {
    errors.push('affordances must be an array');
  }

  if ('data' in obj && !isNullableString(obj.data)) {
    errors.push('data must be a string or null');
  }

  // frozenZones: optional only in a declaration (the reader fills it in).
  const zonesAcceptable = 'frozenZones' in obj ? isStrArray(obj.frozenZones) : declared;
  if (!zonesAcceptable) {
    errors.push('frozenZones must be an array of strings');
  }

  if ('baseline' in obj) {
    const { baseline } = obj;
    if (!isRecord(baseline)) {
      errors.push('baseline must be an object');
    } else {
      for (const op of ['edit', 'tools', 'export', 'history', 'view']) {
        if (op in baseline && !isStrArray(baseline[op])) {
          errors.push(`baseline.${op}, if present, must be an array of strings`);
        }
      }
      if (Array.isArray(baseline.history) && baseline.history.includes('redo')) {
        errors.push('baseline.history must not claim "redo"');
      }
    }
  }

  if (obj.source === 'static' && 'activeView' in obj) {
    errors.push('static projection must omit activeView');
  }
  if ('activeView' in obj && !isNullableString(obj.activeView)) {
    errors.push('activeView must be a string or null');
  }

  return { valid: errors.length === 0, errors };
}
216
+
217
/**
 * The reader's one answer (spec §3.1 precedence: declared > static).
 *
 * If the file carries a TRUSTWORTHY declaration (chrome, or opening tag marked
 * data-rwa-frozen) that VALIDATES once the reader has filled in the container
 * facts, it is returned as the `source:'declared'` answer. In every other case
 * — no declaration, untrusted, unparsable, non-conforming — the static
 * kind-derived projection is returned instead. A malformed declaration must
 * never throw out of this function; it must fall back to static.
 *
 * @param {{fileText:string, doc:string, uuid:string|null, kind:string, frozenZones:string[]}} facts
 * @returns {object} a self-description/1 object (`source:'declared'` or `'static'`)
 */
export function resolveSelfDescription({ fileText, doc, uuid, kind, frozenZones }) {
  const facts = declarationFacts(fileText, doc);
  const trustworthy = facts.found && (!facts.inEditableBody || facts.frozenAttr);

  if (trustworthy) {
    const { declaration } = parseDeclaration(fileText, doc);
    if (declaration && typeof declaration === 'object' && !Array.isArray(declaration)) {
      const declAff = Array.isArray(declaration.affordances) ? declaration.affordances : [];

      // Declared providers win a (kind,name) collision with an installed skill.
      // Entries that are null/undefined are skipped when building the key set:
      // reading `.kind` on them would throw BEFORE validation gets the chance to
      // reject the declaration. They stay in the candidate, so the validator
      // still sees them and the answer falls back to static.
      const keyOf = (a) => a.kind + '\0' + a.name;
      const seen = new Set(
        declAff.filter((a) => a !== null && a !== undefined).map(keyOf),
      );
      const installed = parseSkillZone(doc).filter((s) => !seen.has(keyOf(s)));

      // Fill ONLY container facts (uuid / frozenZones / blocks, taken from the
      // bytes and authoritative over any author claim). rwa/source are NOT
      // forced: they are the author's claim and must already be correct, or the
      // declaration is non-conforming and must not be "repaired" into a trusted
      // answer.
      const candidate = {
        ...declaration,
        affordances: [...declAff, ...installed],
        uuid,
        frozenZones,
        blocks: countBlocks(doc),
      };
      if (candidate.source === 'declared' && validateSelfDescription(candidate).valid) {
        return candidate;
      }
    }
  }

  return buildSelfDescription({ doc, uuid, kind, frozenZones });
}
@@ -0,0 +1,360 @@
1
+ import { spawn } from 'node:child_process';
2
+ import { readFile } from 'node:fs/promises';
3
+ import path from 'node:path';
4
+ import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
5
+
6
+ // PDF / docx → HTML by spawning the `claude` CLI in print mode.
7
+ //
8
+ // PDFs are processed in PARALLEL: split into page ranges, each chunk
9
+ // handed to its own `claude -p` subprocess concurrently, then merged.
10
+ // Long papers go from sequential N×t to roughly t×ceil(chunks/concurrency).
11
+ //
12
+ // Why: the user's machine has Anthropic's official `pdf` and `docx` skills
13
+ // installed under ~/.claude/skills/. Those skills have rich Python tooling
14
+ // (pypdf, pdfplumber, pandoc, mammoth, LibreOffice) that the rwa CLI itself
15
+ // can't reasonably bundle. Calling `claude -p` lets the agent invoke its
16
+ // skill, run the local Python sandbox, and hand back clean semantic HTML —
17
+ // strictly better fidelity than either the local pdfjs heuristic or the
18
+ // raw-vision OpenRouter path, on documents where the skills apply.
19
+ //
20
+ // Trust model: this spawns a Claude Code subprocess that reads the input file's
21
+ // CONTENTS into an agent context (the pdf/docx skill needs Python — pypdf,
22
+ // pdfplumber, mammoth — to extract them, so the agent genuinely needs tool
23
+ // access). That makes the file attacker-controlled input: prompt-injection text
24
+ // hidden in a third-party PDF/DOCX could hijack the agent. `import` is precisely
25
+ // the command you point at files you received from someone else, so "the user
26
+ // trusts their input file" is the WRONG threat model.
27
+ //
28
+ // Therefore `--claude` is gated behind an explicit `--trust-input` consent flag
29
+ // (convertViaClaudeCli throws below if it is absent). Only when the user vouches
30
+ // for the file do we add `--permission-mode bypassPermissions`. The default
31
+ // import path (pdfjs/mammoth — parses bytes, never executes the file's content)
32
+ // remains the safe, no-flag route. Documented in HELP.
33
+
34
+ const SKILL_FOR_EXT = { pdf: 'pdf', docx: 'docx' };
35
+
36
+ const DEFAULT_CHUNK_SIZE = 5; // pages per chunk
37
+ const DEFAULT_CONCURRENCY = 4; // simultaneous claude -p subprocesses
38
+ const DEFAULT_TIMEOUT_MS = 1_200_000; // 20 minutes per chunk
39
+
40
// Build the prompt handed to `claude -p`.
//   skill     — name of the skill to invoke
//   filePath  — path of the file to extract
//   pageRange — null for a whole-document call, or { start, end, totalPages }
//               for one chunk. Any range adds a "process only these pages"
//               note; a range starting after page 1 additionally asks for a
//               content-only fragment (no <article>/<style>/.doc wrappers).
const PROMPT_TEMPLATE = (skill, filePath, pageRange) => {
  let rangeNote = '';
  let styleNote = '';

  if (pageRange) {
    const { start, end, totalPages } = pageRange;
    rangeNote = `\n\nIMPORTANT: Process ONLY pages ${start} to ${end} (inclusive) of the document. Use the pdf skill's page-range support (pypdf/pdfplumber accept page indices) to extract just that slice. Do not output content from any other pages. The full document is ${totalPages} pages; this chunk is pages ${start}-${end}.`;
    if (start > 1) {
      styleNote = `\n\nIMPORTANT (chunk ${start}-${end}): omit the leading <style> and @page rules. Output ONLY the inner content of the .doc wrapper for these pages — start your output with the actual content elements (e.g., <h2>, <p>, <table>...) and end with the last content element. Do NOT include <article>, <style>, <div class="doc">, or </article>, </div>. Just the content of pages ${start}-${end}, ready to splice into a larger document. The first chunk handled the styling; later chunks contribute content only.`;
    }
  }

  return `Use the ${skill} skill to extract the content of ${filePath} and convert it to a single <article>...</article> element that VISUALLY MATCHES the original document as closely as possible when rendered in a browser.${rangeNote}${styleNote}

The output will be embedded inside a re-writeable document container that has its own dark-theme CSS. Your <article> must include a leading scoped <style> block that defines its own visual appearance, so the container's theme does not bleed in.

Required structure (full-document or first-chunk only — see chunk note above):

<article style="all: revert;">
<style>
/* Scope every rule to .doc to avoid leaking into the container.
Use 'all: revert' or explicit resets to neutralize the container's theme. */
.doc { background: ...; color: ...; font-family: ...; padding: ...; max-width: ...; margin: 0 auto; }
.doc h1, .doc h2, .doc p, .doc table, .doc th, .doc td { ... }
/* etc. */
</style>
<div class="doc">
... actual content ...
</div>
</article>

Style requirements (match the source PDF):
- Background color (usually white #ffffff for printed documents).
- Text color (usually black #000000 or near-black).
- Font family — pick a generic match: invoices and letters use sans-serif (Helvetica, Arial, system-ui); academic/literary uses serif (Georgia, Times New Roman); monospaced text uses monospace.
- Font sizes — match the visual hierarchy (titles bigger, body smaller, footnotes smallest).
- Text alignment — left, right, center, or justify, matching each block in the source.
- Right-aligned blocks (sender addresses, dates) MUST remain right-aligned via CSS.
- Padding/margins around sections that mirror the PDF's vertical density. Crucially, do NOT inflate vertical spacing — if the source fits on N pages, your output should fit on N pages when printed at the source paper size. Prefer tight margins (~0.5em-1em between blocks) over generous ones; a single-page invoice should remain a single-page invoice.
- Tables — borders, cell padding, header weight, alternating rows or shading where the PDF has them.
- Bold and italic where used, via <strong>/<em> (preferred) or font-weight/font-style in the scoped CSS.

Print-fit requirements (REQUIRED for documents that match a paper size):
- Include an @media print rule inside the scoped <style> block that:
* Removes any max-width constraint (so the doc fills the page width).
* Sets margin:0 / padding:0 on .doc so the printer's @page margin (default 0.5in) is the only outer margin.
* Optionally tightens block spacing further if the source page density is dense.
* Uses page-break-inside:avoid on tables, headers, and footer blocks so they don't split awkwardly across pages.
- Add an @page rule with size matching the source (default A4 if uncertain): @page { size: A4; margin: 0.5in; }

Content requirements:
- Use semantic tags: <h1>-<h6>, <p>, <ul>/<ol>/<li>, <table>/<thead>/<tbody>/<tr>/<td>/<th>, <strong>/<em>, <a href="...">.
- Preserve text exactly. Do not summarize, paraphrase, or reword.
- Reconstruct multi-column layouts as the source has them: side-by-side blocks via CSS flex/grid in your scoped styles, or as table cells if that fits better.
- No <img> tags. No <script>. No external resources (no @import, no <link>, no Google Fonts URLs — only system or generic font families).
- No id attributes. Class names should be scoped under .doc to avoid collisions with the container.
- Do not include <html>, <head>, <body>, or <!doctype>.

Print ONLY the final HTML as your last response. No preamble, no markdown fences, no commentary.`;
};
96
+
97
/**
 * Convert a PDF or docx to HTML by delegating to `claude -p`.
 *
 * docx is converted with a single call. A PDF is split into page ranges of
 * `chunkSize` pages, each range is converted by its own subprocess (at most
 * `concurrency` at a time), and the per-chunk outputs are merged into one
 * <article>.
 *
 * @param {string} filePath Absolute path to the file to import
 * @param {string} ext Extension without dot ("pdf" or "docx")
 * @param {object} [opts]
 * @param {boolean} [opts.trustInput] Explicit consent; REQUIRED, the call throws without it
 * @param {AbortSignal} [opts.signal]
 * @param {number} [opts.timeoutMs] Wall-clock cap PER CHUNK (default 20min)
 * @param {number} [opts.chunkSize] Pages per chunk for PDFs (default 5)
 * @param {number} [opts.concurrency] Max simultaneous subprocesses (default 4)
 * @returns {Promise<{ html: string, warnings: string[] }>}
 * @throws {Error} with `exitCode = 2` for an unsupported extension, missing
 *   consent, an empty PDF, or output that contains no <article>
 */
export async function convertViaClaudeCli(filePath, ext, opts = {}) {
  // Coerce a tuning option to a whole number >= 1, else use the default.
  // Guards the chunk loop below: a negative step never terminates, and a
  // string step ("5") would concatenate instead of add.
  const positiveIntOr = (value, fallback) => {
    const n = Number(value);
    return Number.isFinite(n) && n >= 1 ? Math.floor(n) : fallback;
  };

  const skill = SKILL_FOR_EXT[ext];
  if (!skill) {
    const e = new Error(`--claude only supports .pdf and .docx (got .${ext})`);
    e.exitCode = 2;
    throw e;
  }

  // Consent gate (SECURITY). Refuse to point an autonomous agent at the file
  // unless the user explicitly vouched for it. Must run BEFORE any file read or
  // subprocess spawn, so an unconsented file is never touched by the agent.
  if (!opts.trustInput) {
    const e = new Error(
      `refusing to run an autonomous agent on ${filePath} without consent.\n` +
      ` --claude extraction reads the file's contents into a Claude Code agent, so a\n` +
      ` malicious file could hijack it (prompt-injection -> code execution).\n` +
      ` Re-run with --claude --trust-input only if you trust this file's source.\n` +
      ` (The default import, without --claude, parses the file safely and never executes its contents.)`
    );
    e.exitCode = 2;
    throw e;
  }

  // docx isn't naturally page-chunkable (no fixed page boundaries inside the
  // XML). Single call.
  if (ext !== 'pdf') {
    const stdout = await runClaude(filePath, PROMPT_TEMPLATE(skill, filePath, null), opts);
    const html = extractArticle(stdout);
    if (!html) {
      const preview = stdout.trim().slice(0, 400);
      const e = new Error(
        `claude: output did not contain an <article> element. Output preview:\n${preview}`
      );
      e.exitCode = 2;
      throw e;
    }
    return {
      html,
      warnings: [`claude: imported via \`claude -p\` (${skill} skill)`],
    };
  }

  const totalPages = await getPdfPageCount(filePath);
  // A PDF with no pages yields no chunks; fail with a clear message instead of
  // letting the merger crash on an empty result list.
  if (!(totalPages >= 1)) {
    const e = new Error('claude: PDF has no pages to import');
    e.exitCode = 2;
    throw e;
  }

  const chunkSize = positiveIntOr(opts.chunkSize, DEFAULT_CHUNK_SIZE);
  const concurrency = positiveIntOr(opts.concurrency, DEFAULT_CONCURRENCY);

  // Page ranges are 1-based and inclusive; the last one may be short.
  const ranges = [];
  for (let start = 1; start <= totalPages; start += chunkSize) {
    const end = Math.min(start + chunkSize - 1, totalPages);
    ranges.push({ start, end, totalPages });
  }

  console.error(
    `note: claude: ${totalPages}-page PDF → ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} of ≤${chunkSize} pages, ${Math.min(concurrency, ranges.length)} parallel`
  );

  // Each entry is the raw stdout of one subprocess, in page order.
  const htmlChunks = await runWithConcurrency(ranges, concurrency, async (range, idx) => {
    console.error(`note: claude: chunk ${idx + 1}/${ranges.length} (pages ${range.start}-${range.end}) starting…`);
    const prompt = PROMPT_TEMPLATE(skill, filePath, range);
    const html = await runClaude(filePath, prompt, opts);
    console.error(`note: claude: chunk ${idx + 1}/${ranges.length} done`);
    return html;
  });

  const merged = mergeChunks(htmlChunks);
  return {
    html: merged,
    warnings: [
      `claude: imported ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} via parallel \`claude -p\` (${skill} skill)`,
    ],
  };
}
179
+
180
// Run a single `claude -p` invocation and resolve with its complete stdout
// (either a full <article> or a content-only fragment, depending on the
// prompt's chunk hint — the caller/merger extracts what it needs).
//
//   filePath  — input file; its directory is granted to the agent via --add-dir
//   prompt    — the full prompt text, passed as the final CLI argument
//   signal    — optional AbortSignal forwarded to spawn()
//   timeoutMs — wall-clock cap; the subprocess is SIGKILLed when it elapses
//
// Rejects with an Error carrying `exitCode = 2` on spawn failure, abort,
// timeout, or a non-zero exit status.
function runClaude(filePath, prompt, { signal, timeoutMs = DEFAULT_TIMEOUT_MS } = {}) {
  const args = [
    '-p',
    '--output-format', 'text',
    '--add-dir', path.dirname(filePath),
    '--permission-mode', 'bypassPermissions',
    prompt,
  ];

  const failure = (message) => {
    const e = new Error(message);
    e.exitCode = 2;
    return e;
  };

  return new Promise((resolve, reject) => {
    let proc;
    try {
      proc = spawn('claude', args, { stdio: ['ignore', 'pipe', 'pipe'], signal });
    } catch (err) {
      return reject(failure(`claude: failed to spawn (${err && err.message ? err.message : String(err)}). Is the claude CLI installed?`));
    }

    // Decode through the stream's own decoder. Calling toString('utf8') on each
    // Buffer separately corrupts any multi-byte character that happens to be
    // split across two pipe chunks (common in non-ASCII documents).
    let stdout = '';
    let stderr = '';
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');
    proc.stdout.on('data', (text) => { stdout += text; });
    proc.stderr.on('data', (text) => { stderr += text; });

    const timer = setTimeout(() => {
      proc.kill('SIGKILL');
      reject(failure(`claude: timed out after ${Math.round(timeoutMs / 1000)}s`));
    }, timeoutMs);

    proc.on('error', (err) => {
      clearTimeout(timer);
      // A missing binary is reported here (asynchronously), not by a
      // synchronous throw from spawn(), so the install hint belongs here too.
      const hint = err.code === 'ENOENT' ? '. Is the claude CLI installed?' : '';
      reject(failure(`claude: spawn error (${err.code || err.message})${hint}`));
    });

    proc.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) {
        // Keep the message short: last five stderr lines, capped at 800 chars.
        const tail = stderr.trim().split('\n').slice(-5).join('\n').slice(0, 800);
        return reject(failure(`claude -p exited ${code}${tail ? '\n' + tail : ''}`));
      }
      resolve(stdout);
    });
  });
}
236
+
237
// Bounded-concurrency parallel runner.
//
//   items       — inputs, dispatched in input order
//   concurrency — maximum number of `fn` calls in flight; anything that is not
//                 a number >= 1 is treated as 1 (never as "no workers", which
//                 would silently return an array of holes)
//   fn          — async (item, index) => result
//
// Resolves with results in INPUT order regardless of completion order. If any
// call rejects, the returned promise rejects with that error and no further
// items are dispatched (calls already in flight are left to finish).
async function runWithConcurrency(items, concurrency, fn) {
  const results = new Array(items.length);
  const requested = Math.floor(Number(concurrency));
  const poolSize = Math.min(requested >= 1 ? requested : 1, items.length);

  let nextIdx = 0;
  let failed = false;

  const worker = async () => {
    while (!failed) {
      const myIdx = nextIdx++;
      if (myIdx >= items.length) return;
      try {
        results[myIdx] = await fn(items[myIdx], myIdx);
      } catch (err) {
        // Tell the sibling workers to stop picking up new work: its result
        // would be discarded anyway, and each task may be expensive.
        failed = true;
        throw err;
      }
    }
  };

  await Promise.all(Array.from({ length: poolSize }, () => worker()));
  return results;
}
254
+
255
// Read the PDF at `filePath` and return its page count as reported by pdfjs.
// A failure to open the document is rethrown as an Error with exitCode = 2;
// a failure while releasing the document afterwards is ignored.
async function getPdfPageCount(filePath) {
  const fileBytes = await readFile(filePath);
  // View over the same memory as the Buffer (no copy).
  const bytes = new Uint8Array(fileBytes.buffer, fileBytes.byteOffset, fileBytes.byteLength);

  let pdf;
  try {
    const loadingTask = pdfjs.getDocument({ data: bytes, isEvalSupported: false });
    pdf = await loadingTask.promise;
  } catch (err) {
    const reason = err && err.message ? err.message : String(err);
    const e = new Error(`claude: failed to read PDF page count (${reason})`);
    e.exitCode = 2;
    throw e;
  }

  const { numPages } = pdf;
  await pdf.destroy().catch(() => {});
  return numPages;
}
270
+
271
// Merge per-chunk stdout into a single <article>.
//
// The first chunk must contain a full <article> (with its <style>/@page); it
// supplies the shell. Every later chunk is reduced to bare content by
// stripChunkWrappers — whether it was emitted content-only as asked or as
// another full <article> — and spliced in just before the shell's closing
// tags. Chunks that reduce to nothing are skipped.
//
// Throws an Error with exitCode = 2 when the first (or only) chunk has no
// <article> element.
function mergeChunks(stdouts) {
  const [head, ...rest] = stdouts;
  const isSingle = stdouts.length === 1;

  const first = extractArticle(head);
  if (!first) {
    const subject = isSingle ? 'output' : 'first chunk output';
    const preview = head.trim().slice(0, 400);
    const e = new Error(
      `claude: ${subject} did not contain an <article> element. Output preview:\n${preview}`
    );
    e.exitCode = 2;
    throw e;
  }
  if (isSingle) {
    return first;
  }

  // Where to splice: prefer the end of the .doc wrapper (</div></article>),
  // else just before </article>. Having neither should not happen, since
  // extractArticle returns text ending in </article>; in that case append
  // after the whole thing.
  const closings = [
    { pattern: /<\/div>\s*<\/article>\s*$/i, suffix: '</div></article>' },
    { pattern: /<\/article>\s*$/i, suffix: '</article>' },
  ];
  const closing = closings.find(({ pattern }) => pattern.test(first));
  const prefix = closing ? first.replace(closing.pattern, '') : first;
  const suffix = closing ? closing.suffix : '';

  const additional = rest.map(stripChunkWrappers).filter(Boolean);
  return prefix + additional.map((content) => '\n' + content).join('') + suffix;
}
326
+
327
// Reduce one chunk's stdout to bare content, ready to splice into the first
// chunk's shell. Applied in order:
//   1. keep only the inside of <article>...</article>, if present
//   2. drop every <style>...</style> (only the first chunk's styles are kept)
//   3. keep only the inside of <div class="doc">...</div>, if present
//   4. drop a leading/trailing markdown fence
function stripChunkWrappers(stdout) {
  // Replace `text` by the first capture group of `pattern` when it matches.
  const unwrap = (text, pattern) => {
    const hit = text.match(pattern);
    return hit ? hit[1] : text;
  };

  const insideArticle = unwrap(stdout.trim(), /<article(?:\s[^>]*)?>([\s\S]*)<\/article>/i);
  const withoutStyles = insideArticle.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '');
  const insideDoc = unwrap(withoutStyles, /<div[^>]*class\s*=\s*["']doc["'][^>]*>([\s\S]*)<\/div>/i);
  const unfenced = insideDoc
    .replace(/^```(?:html)?\s*/i, '')
    .replace(/\s*```\s*$/i, '');

  return unfenced.trim();
}
350
+
351
// Extract the outermost <article>...</article> from the agent's stdout, which
// may also contain commentary, tool-use traces, or markdown fences. The span
// runs from the FIRST opening tag to the LAST closing tag found after it.
// Returns the trimmed article text, or null when no such span exists.
//
// Both tags are matched case-insensitively. The closing tag used to be found
// with a case-sensitive lastIndexOf, so `<ARTICLE>…</ARTICLE>` was rejected
// even though the opening-tag search and the merge step accept either case.
function extractArticle(text) {
  const opening = /<article(?:\s[^>]*)?>/i.exec(text);
  if (opening === null) return null;

  // Scan forward from the opening tag and remember where the last close ends.
  const closingRe = /<\/article>/gi;
  closingRe.lastIndex = opening.index;
  let end = -1;
  for (let hit = closingRe.exec(text); hit !== null; hit = closingRe.exec(text)) {
    end = hit.index + hit[0].length;
  }
  if (end < 0) return null;

  return text.slice(opening.index, end).trim();
}