npm - rewritable - Versions diffs - 0.1.0 → 0.5.0 - Mend

rewritable 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/README.md +263 -5
package/bin/rwa.mjs +1033 -6
package/package.json +7 -4
package/seeds/rewritable.html +6989 -156
package/src/agent-loop.mjs +155 -0
package/src/apply-edits.mjs +664 -0
package/src/atomic-write.mjs +38 -0
package/src/backend.mjs +43 -0
package/src/clone-extract.mjs +249 -0
package/src/clone.mjs +161 -0
package/src/commands.mjs +207 -11
package/src/create.mjs +256 -0
package/src/doc.mjs +69 -0
package/src/dsl-compiler.mjs +357 -0
package/src/edit.mjs +300 -0
package/src/fetch-page.mjs +346 -0
package/src/host.mjs +126 -0
package/src/identity.mjs +257 -0
package/src/import-claude.mjs +360 -0
package/src/import-vision.mjs +156 -0
package/src/import.mjs +357 -8
package/src/ls.mjs +105 -0
package/src/publish-site.mjs +85 -0
package/src/publish.mjs +98 -0
package/src/seed-extract.mjs +40 -0
package/src/seed.mjs +1399 -6
package/src/self-contained.mjs +115 -0
package/src/skill-manifest.mjs +227 -0
package/src/skin.mjs +350 -0
package/src/skins.mjs +274 -0
package/src/template.mjs +109 -0

package/src/identity.mjs ADDED Viewed

@@ -0,0 +1,257 @@
+// Consumer-side static self-description for `self-description/1` — the answer to
+// "what is this rewritable, and what can be done with it?" computed from the
+// file BYTES, without executing the container's JS.
+// Contract + reference: docs/specs/rwa-self-description-spec.md,
+// tools/self-description.mjs (computeSelfDescription / validateSelfDescription).
+//
+// This is a PUBLISH-SAFE MIRROR of the reference's static computer. The CLI is a
+// standalone npm package and cannot reach repo-root tools/ at runtime, so the
+// kind→provider table, the substrate baseline, the title/blocks extraction, and
+// the assembled object are duplicated here — the same pattern as
+// cli/src/apply-edits.mjs mirroring the seed. The mirror is pinned to the single
+// source by tests/identity.test.mjs (KIND_PROVIDERS / SUBSTRATE_BASELINE deep-equal
+// the reference; the full assembled object deep-equals computeSelfDescription in
+// doc.test.mjs). Drift fails loudly. KEEP IN STEP with tools/self-description.mjs.
+import { tagHasFrozenAttr } from './apply-edits.mjs';
+import { parseSkillZone } from './skill-manifest.mjs';
// Schema discriminator: emitted as `rwa` on every self-description object and
// required by validateSelfDescription.
export const SCHEMA_TAG = 'self-description/1';
// Mirrors of tools/self-description.mjs AFFORDANCE_KINDS / PROVENANCES, consumed by
// the declared-projection conformance gate (declaredIsConforming). Keep in step
// with the reference.
export const AFFORDANCE_KINDS = [
  'view',
  'edit-surface',
  'tool',
  'compute',
  'hook',
];
export const PROVENANCES = [
  'first-party',
  'installed',
];
// Container kind -> the provider bundle registered for it (spec §4).
//
// * Every provider is {kind, name, label}; `provenance: 'first-party'` is attached
//   at emit time because these are the bootstrap-resident providers.
// * The presentation entry mirrors the seed's presentationProvider
//   {name:'presentation', label:'Present'} (seeds/rewritable.html:3542-3543), so
//   the static answer equals the live one by construction.
// * Only kinds the runtime provides FIRST-PARTY are listed. Custom kinds
//   (datatable, …) are consumer-built via provide()/the declaration, so their
//   honest static answer is [] — when a declaration exists, declared > static
//   supplies the real affordances.
// * skill-host has no first-party affordances; installed skills
//   (provenance:'installed') come from parseSkillZone (§8), not from this table.
//   The explicit [] mirrors the oracle.
export const KIND_PROVIDERS = {
  'document': [],
  'presentation': [
    { kind: 'view', name: 'presentation', label: 'Present' },
  ],
  'workflow': [],
  'skill-host': [],
};
// Operations the substrate offers for EVERY container, whatever its kind. This is
// the "what can be done with me" data that is deliberately NOT modelled as an
// affordance (affordances stay kernel-pure: a base document has []).
// `history` lists undo only — there is no redo (re-write-able-spec Invariant 7).
// Note: Object.freeze is shallow, so only the top-level keys are locked.
export const SUBSTRATE_BASELINE = Object.freeze({
  edit: [
    'lens',
  ],
  tools: [
    'apply_dsl_plan',
    'apply_edits',
    'replace_document',
  ],
  export: [
    'html',
    'print',
  ],
  history: [
    'undo',
  ],
});
/**
 * Human-readable title of a document: the text content of its first <h1>.
 *
 * Nested tags inside the heading are dropped and every whitespace run collapses
 * to a single space. Mirrors tools/self-description.mjs `staticTitle` exactly so
 * the two titles agree — keep the regexes in step with the reference.
 *
 * @param {string} doc — the LF-canonical editable body (falsy is treated as '')
 * @returns {string|null} the trimmed title, or null when there is no <h1> or it
 *   contains no text
 */
export function extractTitle(doc) {
  const body = doc || '';
  const heading = body.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (heading === null) {
    return null;
  }
  const withoutTags = heading[1].replace(/<[^>]*>/g, '');
  const flattened = withoutTags.replace(/\s+/g, ' ').trim();
  return flattened === '' ? null : flattened;
}
/**
 * Number of `data-rwa-id` occurrences in the body — a coarse "how structured is
 * this document" signal based on its addressable blocks.
 *
 * @param {string} doc — the LF-canonical editable body (falsy is treated as '')
 * @returns {number} how many word-boundary-delimited `data-rwa-id` tokens appear
 */
export function countBlocks(doc) {
  const body = doc || '';
  const hits = body.match(/\bdata-rwa-id\b/g);
  return hits === null ? 0 : hits.length;
}
/**
 * Assemble the STATIC self-description projection from a container's already-
 * extracted facts (so inspectDoc parses the file once). Equivalent to the
 * reference `computeSelfDescription(fileText)`, minus the file parsing.
 *
 * Affordances are the first-party providers registered for `kind` followed by the
 * INSTALLED skills read from the frozen #rwa-skills zone (§8) via parseSkillZone.
 *
 * `kind` comes from file bytes, so it is looked up as an OWN property only: an
 * inherited key such as `constructor` or `__proto__` would otherwise resolve to a
 * non-array and crash on `.map`. Unknown kinds yield no first-party affordances.
 *
 * @param {{doc:string, uuid:string|null, kind:string, frozenZones:string[]}} facts
 * @returns {object} a `source:'static'` self-description/1 object (spec §2)
 */
export function buildSelfDescription({ doc, uuid, kind, frozenZones }) {
  const firstParty = Object.prototype.hasOwnProperty.call(KIND_PROVIDERS, kind)
    ? KIND_PROVIDERS[kind]
    : [];
  const affordances = [
    ...firstParty.map((p) => ({ ...p, provenance: 'first-party' })),
    ...parseSkillZone(doc),
  ];
  // Copy each op list as well as the outer object: SUBSTRATE_BASELINE is only
  // shallowly frozen, so handing out its arrays would let a caller's mutation
  // leak into every later description.
  const baseline = Object.fromEntries(
    Object.entries(SUBSTRATE_BASELINE).map(([op, values]) => [op, [...values]]),
  );
  return {
    rwa: SCHEMA_TAG,
    source: 'static',
    uuid,
    kind,
    title: extractTitle(doc),
    blocks: countBlocks(doc),
    affordances,
    frozenZones,
    baseline,
  };
}
+// ── The `declared` projection (v1.1, spec §3.1) ───────────────────────────────
+// A custom-affordance file (a datatable the kind table can only GUESS for) may
+// carry its own answer: an inert `<script id="rwa-affordances">` block with a
+// `source:"declared"` self-description. The reader prefers it (declared > static)
+// only if it is TRUSTWORTHY — edit-unreachable so the lens/agent can't have
+// drifted it. Mirror of tools/self-description.mjs DECL_RE / parseDeclaration /
+// declarationFacts (publish-safe; the CLI can't reach repo-root tools/ at runtime).
+// The oracle takes only fileText and extractInlineDoc's it; the CLI passes the
+// already-extracted `doc` (== extractInlineDoc(fileText)) so the two agree.
+// KEEP IN STEP with tools/self-description.mjs.
const DECL_RE = /<script\b[^>]*\bid=["']rwa-affordances["'][^>]*>([\s\S]*?)<\/script\s*>/i;
// Decide WHERE to look for the declaration, and report whether that place is
// edit-reachable. A body declaration lives inside INLINE_DOC (its </script> is
// escaped in the raw bytes), so it is only visible in `doc`; a chrome declaration
// (immutable, outside INLINE_DOC) is visible in the raw file text. When `doc`
// does not contain one, the raw file text is returned as the haystack — it may
// or may not contain a declaration; callers still have to match DECL_RE.
function declarationLocus(fileText, doc) {
  const inEditableBody = Boolean(doc) && DECL_RE.test(doc);
  return inEditableBody
    ? { hay: doc, inEditableBody }
    : { hay: fileText, inEditableBody };
}
/**
 * Extract and JSON-parse the embedded #rwa-affordances declaration, if any.
 *
 * The editable body is searched first and the raw file text otherwise (see
 * declarationLocus). Three outcomes:
 *   - no declaration element: every field is null;
 *   - element found, JSON parses: `declaration` is the parsed value, `raw` its text;
 *   - element found, JSON invalid: `declaration` is null, `raw` is the text and
 *     `error` is 'invalid JSON: <parser message>'.
 *
 * @param {string} fileText — raw file text
 * @param {string} doc — the already-extracted editable body
 * @returns {{ declaration: object|null, raw: string|null, error: string|null }}
 */
export function parseDeclaration(fileText, doc) {
  const { hay } = declarationLocus(fileText, doc);
  const found = hay.match(DECL_RE);
  if (found === null) {
    return { declaration: null, raw: null, error: null };
  }
  const raw = found[1];
  try {
    const declaration = JSON.parse(raw);
    return { declaration, raw, error: null };
  } catch (e) {
    return { declaration: null, raw, error: 'invalid JSON: ' + (e && e.message) };
  }
}
/**
 * Edit-reachability facts for the declaration (spec §3.1). A declaration is
 * trustworthy iff `!inEditableBody` (it sits in chrome) OR `frozenAttr` (it
 * carries data-rwa-frozen — enforced by the lens, and by the CLI as of
 * attribute-form enforcement). `frozenZones` is NOT consulted (marker-form only,
 * SD-04).
 *
 * @param {string} fileText — raw file text
 * @param {string} doc — the already-extracted editable body
 * @returns {{ found: boolean, inEditableBody: boolean, frozenAttr: boolean }}
 */
export function declarationFacts(fileText, doc) {
  const locus = declarationLocus(fileText, doc);
  const hit = locus.hay.match(DECL_RE);
  if (hit === null) {
    return { found: false, inEditableBody: false, frozenAttr: false };
  }
  // Isolate the opening <script ...> tag: everything up to and including the
  // first '>' of the matched element.
  const [element] = hit;
  const openTag = element.slice(0, element.indexOf('>') + 1);
  // DOM-accurate check: data-rwa-frozen must be a real attribute NAME (not a
  // mention inside a value, nor a longer name), matching the seed's actual
  // enforcement — otherwise the CLI would over-trust a declaration the lens can
  // still drift (euler #112).
  const frozenAttr = tagHasFrozenAttr(openTag);
  return { found: true, inEditableBody: locus.inEditableBody, frozenAttr };
}
// The `source` values validateSelfDescription accepts.
export const SOURCES = [
  'static',
  'live',
  'declared',
];
// True only for an array whose every element is a string (an empty array passes).
const isStrArray = (v) => {
  if (!Array.isArray(v)) return false;
  return v.every((item) => typeof item === 'string');
};
/**
 * Validate a self-description/1 object against the §2/§3.1 schema.
 *
 * This is a publish-safe MIRROR of tools/self-description.mjs
 * validateSelfDescription: it lets the reader guarantee it never emits a
 * non-conforming declared answer without importing repo-root tools/ at runtime.
 * Pinned to the oracle by test (identity.test.mjs). KEEP IN STEP with
 * tools/self-description.mjs — including the order and wording of the errors.
 *
 * A non-object (null, primitive, or array) short-circuits with the single error
 * 'not an object'; otherwise every violated rule contributes one message, in the
 * order the rules are checked below.
 *
 * @param {*} obj — the candidate self-description
 * @returns {{ valid: boolean, errors: string[] }}
 */
export function validateSelfDescription(obj) {
  const isRecord = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
  const isNullOrString = (v) => v === null || typeof v === 'string';

  if (!isRecord(obj)) {
    return { valid: false, errors: ['not an object'] };
  }

  const errors = [];
  const declared = obj.source === 'declared';

  // Discriminator and projection source.
  if (obj.rwa !== SCHEMA_TAG) errors.push('rwa must be "' + SCHEMA_TAG + '"');
  if (!SOURCES.includes(obj.source)) errors.push('source must be one of ' + SOURCES.join(' | '));

  // uuid/frozenZones are container facts the reader fills, so they are optional
  // in a declaration and mandatory everywhere else.
  const hasUuid = 'uuid' in obj;
  if (declared) {
    if (hasUuid && !isNullOrString(obj.uuid)) errors.push('uuid, if present, must be a string or null');
  } else if (!hasUuid || !isNullOrString(obj.uuid)) {
    errors.push('uuid must be a string or null');
  }

  if (typeof obj.kind !== 'string' || obj.kind === '') errors.push('kind must be a non-empty string');
  if ('title' in obj && !isNullOrString(obj.title)) errors.push('title must be a string or null');
  // Number.isFinite is false for every non-number, so it covers the type check too.
  if ('blocks' in obj && !Number.isFinite(obj.blocks)) errors.push('blocks must be a number');

  // Per-entry affordance rules; a non-object entry reports once and is skipped.
  const checkAffordance = (a, i) => {
    const at = 'affordances[' + i + ']';
    if (!isRecord(a)) {
      errors.push(at + ' must be an object');
      return;
    }
    if (!AFFORDANCE_KINDS.includes(a.kind)) errors.push(at + '.kind unknown');
    if (typeof a.name !== 'string' || a.name === '') errors.push(at + '.name must be a non-empty string');
    if ('label' in a && typeof a.label !== 'string') errors.push(at + '.label must be a string');
    if (!PROVENANCES.includes(a.provenance)) errors.push(at + '.provenance must be first-party | installed');
    for (const field of ['surface', 'target', 'output']) {
      if (field in a && typeof a[field] !== 'string') errors.push(at + '.' + field + ' must be a string');
    }
    if ('inputs' in a && !isStrArray(a.inputs)) errors.push(at + '.inputs must be an array of strings');
    if ('verified' in a && typeof a.verified !== 'boolean') errors.push(at + '.verified must be a boolean');
  };
  if (Array.isArray(obj.affordances)) {
    obj.affordances.forEach(checkAffordance);
  } else {
    errors.push('affordances must be an array');
  }

  if ('data' in obj && !isNullOrString(obj.data)) errors.push('data must be a string or null');

  // frozenZones: validated when present; its absence is tolerated only for a
  // declaration.
  const zonesOk = 'frozenZones' in obj ? isStrArray(obj.frozenZones) : declared;
  if (!zonesOk) errors.push('frozenZones must be an array of strings');

  if ('baseline' in obj) {
    const { baseline } = obj;
    if (!isRecord(baseline)) {
      errors.push('baseline must be an object');
    } else {
      ['edit', 'tools', 'export', 'history', 'view'].forEach((op) => {
        if (op in baseline && !isStrArray(baseline[op])) {
          errors.push('baseline.' + op + ', if present, must be an array of strings');
        }
      });
      // History is undo-only; a description may never advertise redo.
      if (Array.isArray(baseline.history) && baseline.history.includes('redo')) {
        errors.push('baseline.history must not claim "redo"');
      }
    }
  }

  const hasActiveView = 'activeView' in obj;
  if (obj.source === 'static' && hasActiveView) errors.push('static projection must omit activeView');
  if (hasActiveView && !isNullOrString(obj.activeView)) errors.push('activeView must be a string or null');

  return { valid: errors.length === 0, errors };
}
/**
 * The reader's one answer (spec §3.1 precedence: declared > static). If the file
 * carries a TRUSTWORTHY (edit-unreachable) declaration that — once the reader
 * fills container facts (uuid/frozenZones/blocks from the bytes, authoritative
 * over any author claim) — VALIDATES, emit it as `source:'declared'`. Otherwise
 * emit the static kind-derived projection. Validating the assembled object before
 * trusting it guarantees the reader never emits a non-conforming answer (a subtly
 * malformed trustworthy declaration safely falls back to static). No `live`
 * registry on the static path, so there is no declared>live>static middle tier.
 *
 * Malformed affordance entries (null, primitives) are tolerated while building
 * the collision set, so they reach the validator and trigger the static fallback
 * instead of throwing a TypeError.
 *
 * @param {{fileText:string, doc:string, uuid:string|null, kind:string, frozenZones:string[]}} facts
 * @returns {object} a self-description/1 object (`source:'declared'` or `'static'`)
 */
export function resolveSelfDescription({ fileText, doc, uuid, kind, frozenZones }) {
  const f = declarationFacts(fileText, doc);
  // Trust gate: only a chrome declaration or a data-rwa-frozen one is considered.
  if (f.found && (!f.inEditableBody || f.frozenAttr)) {
    const { declaration } = parseDeclaration(fileText, doc);
    if (declaration && typeof declaration === 'object' && !Array.isArray(declaration)) {
      // Fill ONLY container facts (uuid/frozenZones/blocks from the bytes —
      // authoritative over any author claim). Do NOT force rwa/source: the
      // discriminator and source are the author's claim and must already be
      // correct, or the declaration is non-conforming and we must not "repair"
      // it into a trusted answer (e.g. a `schema`-not-`rwa` pre-aligned block).
      // Union installed skills (parseSkillZone) into the declared affordances —
      // the static path does, so dropping them here made declared≠live (SD-04).
      // Declared providers win a (kind,name) collision; mirrors the seed's
      // runtimeDescribe registry→declared→installed precedence.
      const declAff = Array.isArray(declaration.affordances) ? declaration.affordances : [];
      const keyOf = (a) => a.kind + '\0' + a.name;
      // Only well-formed entries can collide. Skipping the rest here keeps a
      // `null` entry from crashing the reader; it stays in the candidate and is
      // rejected by validateSelfDescription below.
      const seen = new Set(
        declAff.filter((a) => a !== null && typeof a === 'object').map(keyOf),
      );
      const installed = parseSkillZone(doc).filter((s) => !seen.has(keyOf(s)));
      const candidate = {
        ...declaration,
        affordances: [...declAff, ...installed],
        uuid,
        frozenZones,
        blocks: countBlocks(doc),
      };
      if (candidate.source === 'declared' && validateSelfDescription(candidate).valid) return candidate;
    }
  }
  return buildSelfDescription({ doc, uuid, kind, frozenZones });
}

package/src/import-claude.mjs ADDED Viewed

@@ -0,0 +1,360 @@
+import { spawn } from 'node:child_process';
+import { readFile } from 'node:fs/promises';
+import path from 'node:path';
+import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
+// PDF / docx → HTML by spawning the `claude` CLI in print mode.
+//
+// PDFs are processed in PARALLEL: split into page ranges, each chunk
+// handed to its own `claude -p` subprocess concurrently, then merged.
+// Long papers go from sequential N×t to roughly t×ceil(chunks/concurrency).
+//
+// Why: the user's machine has Anthropic's official `pdf` and `docx` skills
+// installed under ~/.claude/skills/. Those skills have rich Python tooling
+// (pypdf, pdfplumber, pandoc, mammoth, LibreOffice) that the rwa CLI itself
+// can't reasonably bundle. Calling `claude -p` lets the agent invoke its
+// skill, run the local Python sandbox, and hand back clean semantic HTML —
+// strictly better fidelity than either the local pdfjs heuristic or the
+// raw-vision OpenRouter path, on documents where the skills apply.
+//
+// Trust model: this spawns a Claude Code subprocess that reads the input file's
+// CONTENTS into an agent context (the pdf/docx skill needs Python — pypdf,
+// pdfplumber, mammoth — to extract them, so the agent genuinely needs tool
+// access). That makes the file attacker-controlled input: prompt-injection text
+// hidden in a third-party PDF/DOCX could hijack the agent. `import` is precisely
+// the command you point at files you received from someone else, so "the user
+// trusts their input file" is the WRONG threat model.
+//
+// Therefore `--claude` is gated behind an explicit `--trust-input` consent flag
+// (convertViaClaudeCli throws below if it is absent). Only when the user vouches
+// for the file do we add `--permission-mode bypassPermissions`. The default
+// import path (pdfjs/mammoth — parses bytes, never executes the file's content)
+// remains the safe, no-flag route. Documented in HELP.
// File extension (without the dot) -> name of the Claude skill asked to extract it.
const SKILL_FOR_EXT = {
  pdf: 'pdf',
  docx: 'docx',
};
// Pages handed to each `claude -p` subprocess when a PDF is split.
const DEFAULT_CHUNK_SIZE = 5;
// Upper bound on simultaneously running `claude -p` subprocesses.
const DEFAULT_CONCURRENCY = 4;
// Wall-clock cap per chunk: 20 minutes, in milliseconds.
const DEFAULT_TIMEOUT_MS = 20 * 60 * 1000;
// Build the prompt sent to `claude -p`.
//   skill     — skill name to invoke ('pdf' or 'docx')
//   filePath  — path of the file to extract, interpolated verbatim
//   pageRange — null for a whole-document call, or {start, end, totalPages}
//               (1-based, inclusive) for one chunk of a PDF
// A page range adds a "process ONLY these pages" note; a range that does not
// start at page 1 additionally asks for content-only output (no <article>,
// <style> or .doc wrapper) so it can be spliced into the first chunk's shell.
const PROMPT_TEMPLATE = (skill, filePath, pageRange) => {
  let rangeNote = '';
  let styleNote = '';
  if (pageRange) {
    const { start, end, totalPages } = pageRange;
    rangeNote = `\n\nIMPORTANT: Process ONLY pages ${start} to ${end} (inclusive) of the document. Use the pdf skill's page-range support (pypdf/pdfplumber accept page indices) to extract just that slice. Do not output content from any other pages. The full document is ${totalPages} pages; this chunk is pages ${start}-${end}.`;
    if (start > 1) {
      styleNote = `\n\nIMPORTANT (chunk ${start}-${end}): omit the leading <style> and @page rules. Output ONLY the inner content of the .doc wrapper for these pages — start your output with the actual content elements (e.g., <h2>, <p>, <table>...) and end with the last content element. Do NOT include <article>, <style>, <div class="doc">, or </article>, </div>. Just the content of pages ${start}-${end}, ready to splice into a larger document. The first chunk handled the styling; later chunks contribute content only.`;
    }
  }
  return `Use the ${skill} skill to extract the content of ${filePath} and convert it to a single <article>...</article> element that VISUALLY MATCHES the original document as closely as possible when rendered in a browser.${rangeNote}${styleNote}
The output will be embedded inside a re-writeable document container that has its own dark-theme CSS. Your <article> must include a leading scoped <style> block that defines its own visual appearance, so the container's theme does not bleed in.
Required structure (full-document or first-chunk only — see chunk note above):
<article style="all: revert;">
  <style>
    /* Scope every rule to .doc to avoid leaking into the container.
       Use 'all: revert' or explicit resets to neutralize the container's theme. */
    .doc { background: ...; color: ...; font-family: ...; padding: ...; max-width: ...; margin: 0 auto; }
    .doc h1, .doc h2, .doc p, .doc table, .doc th, .doc td { ... }
    /* etc. */
  </style>
  <div class="doc">
    ... actual content ...
  </div>
</article>
Style requirements (match the source PDF):
- Background color (usually white #ffffff for printed documents).
- Text color (usually black #000000 or near-black).
- Font family — pick a generic match: invoices and letters use sans-serif (Helvetica, Arial, system-ui); academic/literary uses serif (Georgia, Times New Roman); monospaced text uses monospace.
- Font sizes — match the visual hierarchy (titles bigger, body smaller, footnotes smallest).
- Text alignment — left, right, center, or justify, matching each block in the source.
- Right-aligned blocks (sender addresses, dates) MUST remain right-aligned via CSS.
- Padding/margins around sections that mirror the PDF's vertical density. Crucially, do NOT inflate vertical spacing — if the source fits on N pages, your output should fit on N pages when printed at the source paper size. Prefer tight margins (~0.5em-1em between blocks) over generous ones; a single-page invoice should remain a single-page invoice.
- Tables — borders, cell padding, header weight, alternating rows or shading where the PDF has them.
- Bold and italic where used, via <strong>/<em> (preferred) or font-weight/font-style in the scoped CSS.
Print-fit requirements (REQUIRED for documents that match a paper size):
- Include an @media print rule inside the scoped <style> block that:
  * Removes any max-width constraint (so the doc fills the page width).
  * Sets margin:0 / padding:0 on .doc so the printer's @page margin (default 0.5in) is the only outer margin.
  * Optionally tightens block spacing further if the source page density is dense.
  * Uses page-break-inside:avoid on tables, headers, and footer blocks so they don't split awkwardly across pages.
- Add an @page rule with size matching the source (default A4 if uncertain): @page { size: A4; margin: 0.5in; }
Content requirements:
- Use semantic tags: <h1>-<h6>, <p>, <ul>/<ol>/<li>, <table>/<thead>/<tbody>/<tr>/<td>/<th>, <strong>/<em>, <a href="...">.
- Preserve text exactly. Do not summarize, paraphrase, or reword.
- Reconstruct multi-column layouts as the source has them: side-by-side blocks via CSS flex/grid in your scoped styles, or as table cells if that fits better.
- No <img> tags. No <script>. No external resources (no @import, no <link>, no Google Fonts URLs — only system or generic font families).
- No id attributes. Class names should be scoped under .doc to avoid collisions with the container.
- Do not include <html>, <head>, <body>, or <!doctype>.
Print ONLY the final HTML as your last response. No preamble, no markdown fences, no commentary.`;
};
/**
 * Convert a PDF or DOCX to HTML by delegating to the `claude` CLI in print mode.
 *
 * DOCX files are converted in a single call. PDFs are split into page ranges of
 * `chunkSize` pages, converted by up to `concurrency` parallel subprocesses, and
 * merged back into one <article>.
 *
 * @param {string} filePath  Absolute path to the file to import
 * @param {string} ext       Extension without dot ("pdf" or "docx")
 * @param {object} [opts]
 * @param {boolean} [opts.trustInput]  Explicit consent to run an agent on the file; required
 * @param {AbortSignal} [opts.signal]
 * @param {number} [opts.timeoutMs]    Wall-clock cap PER CHUNK (default 20min)
 * @param {number} [opts.chunkSize]    Pages per chunk for PDFs (default 5); positive integer
 * @param {number} [opts.concurrency]  Max simultaneous subprocesses (default 4); positive integer
 * @returns {Promise<{ html: string, warnings: string[] }>}
 * @throws {Error} with `exitCode = 2` for an unsupported extension, missing
 *   consent, an invalid chunkSize/concurrency, or output without an <article>
 */
export async function convertViaClaudeCli(filePath, ext, opts = {}) {
  // Own-property lookup: an inherited key such as 'constructor' must not be
  // mistaken for a supported extension.
  const skill = Object.prototype.hasOwnProperty.call(SKILL_FOR_EXT, ext) ? SKILL_FOR_EXT[ext] : undefined;
  if (!skill) {
    const e = new Error(`--claude only supports .pdf and .docx (got .${ext})`);
    e.exitCode = 2;
    throw e;
  }
  // Consent gate (SECURITY). Refuse to point an autonomous agent at the file
  // unless the user explicitly vouched for it. Must run BEFORE any file read or
  // subprocess spawn, so an unconsented file is never touched by the agent.
  if (!opts.trustInput) {
    const e = new Error(
      `refusing to run an autonomous agent on ${filePath} without consent.\n` +
      `  --claude extraction reads the file's contents into a Claude Code agent, so a\n` +
      `  malicious file could hijack it (prompt-injection -> code execution).\n` +
      `  Re-run with --claude --trust-input only if you trust this file's source.\n` +
      `  (The default import, without --claude, parses the file safely and never executes its contents.)`
    );
    e.exitCode = 2;
    throw e;
  }
  // docx isn't naturally page-chunkable (no fixed page boundaries inside the
  // XML). Single call.
  if (ext !== 'pdf') {
    const stdout = await runClaude(filePath, PROMPT_TEMPLATE(skill, filePath, null), opts);
    const html = extractArticle(stdout);
    if (!html) {
      const preview = stdout.trim().slice(0, 400);
      const e = new Error(
        `claude: output did not contain an <article> element. Output preview:\n${preview}`
      );
      e.exitCode = 2;
      throw e;
    }
    return {
      html,
      warnings: [`claude: imported via \`claude -p\` (${skill} skill)`],
    };
  }
  // Validate before any work: a negative chunk size would make the range loop
  // below run forever, and a string value (e.g. straight from argv) would turn
  // `start += chunkSize` into string concatenation.
  const chunkSize = positiveIntOption(opts.chunkSize, DEFAULT_CHUNK_SIZE, 'chunkSize');
  const concurrency = positiveIntOption(opts.concurrency, DEFAULT_CONCURRENCY, 'concurrency');
  const totalPages = await getPdfPageCount(filePath);
  // 1-based, inclusive page ranges; the last one is clipped to the page count.
  const ranges = [];
  for (let start = 1; start <= totalPages; start += chunkSize) {
    const end = Math.min(start + chunkSize - 1, totalPages);
    ranges.push({ start, end, totalPages });
  }
  console.error(
    `note: claude: ${totalPages}-page PDF → ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} of ≤${chunkSize} pages, ${Math.min(concurrency, ranges.length)} parallel`
  );
  const htmlChunks = await runWithConcurrency(ranges, concurrency, async (range, idx) => {
    console.error(`note: claude: chunk ${idx + 1}/${ranges.length} (pages ${range.start}-${range.end}) starting…`);
    const prompt = PROMPT_TEMPLATE(skill, filePath, range);
    const html = await runClaude(filePath, prompt, opts);
    console.error(`note: claude: chunk ${idx + 1}/${ranges.length} done`);
    return html;
  });
  const merged = mergeChunks(htmlChunks);
  return {
    html: merged,
    warnings: [
      `claude: imported ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} via parallel \`claude -p\` (${skill} skill)`,
    ],
  };
}
// Normalize a numeric option: a falsy value (undefined, null, 0, '') selects the
// fallback as before; anything else must coerce to an integer >= 1 or the call is
// rejected with exitCode 2.
function positiveIntOption(value, fallback, label) {
  if (!value) return fallback;
  const n = Number(value);
  if (!Number.isInteger(n) || n < 1) {
    const e = new Error(`claude: ${label} must be a positive integer (got ${String(value)})`);
    e.exitCode = 2;
    throw e;
  }
  return n;
}
// Run a single `claude -p` invocation and resolve with its raw stdout. The output
// may be a full <article>...</article> (first chunk / single call) or just inner
// content (later chunks); callers extract whichever shape they expect.
//
//   filePath  — input file; its directory is granted to the agent via --add-dir
//   prompt    — full prompt text, passed as the final CLI argument
//   signal    — optional AbortSignal forwarded to spawn
//   timeoutMs — wall-clock cap; the subprocess is SIGKILLed when it elapses
//
// Rejects with an Error carrying `exitCode = 2` when the process cannot be
// spawned, times out, or exits non-zero (the message then includes up to the last
// five stderr lines, capped at 800 characters).
function runClaude(filePath, prompt, { signal, timeoutMs = DEFAULT_TIMEOUT_MS } = {}) {
  const args = [
    '-p',
    '--output-format', 'text',
    '--add-dir', path.dirname(filePath),
    '--permission-mode', 'bypassPermissions',
    prompt,
  ];
  return new Promise((resolve, reject) => {
    let proc;
    try {
      proc = spawn('claude', args, { stdio: ['ignore', 'pipe', 'pipe'], signal });
    } catch (err) {
      const e = new Error(`claude: failed to spawn (${err && err.message ? err.message : String(err)}). Is the claude CLI installed?`);
      e.exitCode = 2;
      return reject(e);
    }
    let stdout = '';
    let stderr = '';
    // Decode at the stream level. Converting each Buffer chunk on its own
    // corrupts any multi-byte UTF-8 character that straddles a chunk boundary;
    // setEncoding keeps the partial bytes until the rest of the character arrives.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');
    proc.stdout.on('data', (chunk) => { stdout += chunk; });
    proc.stderr.on('data', (chunk) => { stderr += chunk; });
    const timer = setTimeout(() => {
      proc.kill('SIGKILL');
      const e = new Error(`claude: timed out after ${Math.round(timeoutMs / 1000)}s`);
      e.exitCode = 2;
      reject(e);
    }, timeoutMs);
    proc.on('error', err => {
      clearTimeout(timer);
      const e = new Error(`claude: spawn error (${err.code || err.message})`);
      e.exitCode = 2;
      reject(e);
    });
    // After a timeout this still fires for the killed process; the promise is
    // already rejected by then, so the second reject is a no-op.
    proc.on('close', code => {
      clearTimeout(timer);
      if (code !== 0) {
        const tail = stderr.trim().split('\n').slice(-5).join('\n').slice(0, 800);
        const e = new Error(`claude -p exited ${code}${tail ? '\n' + tail : ''}`);
        e.exitCode = 2;
        return reject(e);
      }
      resolve(stdout);
    });
  });
}
// Bounded-concurrency parallel runner.
//
//   items       — inputs, started in input order
//   concurrency — maximum number of `fn` calls in flight; a value below 1 (or
//                 NaN) is treated as 1 so the items are still processed
//   fn          — async (item, index) => result
//
// Resolves with results[] in INPUT order, regardless of completion order.
// Fails fast: once any call rejects, no further items are started (calls already
// in flight are left to settle) and the returned promise rejects with that error.
async function runWithConcurrency(items, concurrency, fn) {
  const results = new Array(items.length);
  if (items.length === 0) return results;
  const limit = concurrency >= 1 ? Math.min(Math.floor(concurrency), items.length) : 1;
  let nextIdx = 0;
  let failed = false;
  const worker = async () => {
    // Re-checked before every pick so a failure elsewhere stops this worker from
    // launching more (potentially long-running) work.
    while (!failed) {
      const myIdx = nextIdx++;
      if (myIdx >= items.length) return;
      try {
        results[myIdx] = await fn(items[myIdx], myIdx);
      } catch (err) {
        failed = true;
        throw err;
      }
    }
  };
  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
// Read the PDF at `filePath` and return its page count, using pdfjs with eval
// support disabled. A pdfjs load failure is rethrown as an Error with
// `exitCode = 2`; a failure to read the file itself propagates unchanged.
async function getPdfPageCount(filePath) {
  const fileBytes = await readFile(filePath);
  // View over the same memory as the Buffer (no copy), in the form pdfjs accepts.
  const data = new Uint8Array(fileBytes.buffer, fileBytes.byteOffset, fileBytes.byteLength);
  let pdf;
  try {
    const loadingTask = pdfjs.getDocument({ data, isEvalSupported: false });
    pdf = await loadingTask.promise;
  } catch (err) {
    const reason = err && err.message ? err.message : String(err);
    const e = new Error(`claude: failed to read PDF page count (${reason})`);
    e.exitCode = 2;
    throw e;
  }
  try {
    return pdf.numPages;
  } finally {
    // Best-effort cleanup: a rejected destroy() must not mask the page count.
    await pdf.destroy().catch(() => {});
  }
}
// Combine the per-chunk stdout strings into one <article>.
//
// The first chunk must contain a full <article> (with its leading <style>/@page
// and, normally, a <div class="doc"> wrapper); it becomes the shell. Every later
// chunk is reduced to bare content by stripChunkWrappers — which also copes with
// a model that ignored the chunk hint and emitted a full <article>+<style> — and
// is spliced in just before the shell's closing tags, one chunk per line. Chunks
// that reduce to an empty string are skipped.
//
// Throws an Error with `exitCode = 2` when the first chunk has no <article>.
function mergeChunks(stdouts) {
  const onlyChunk = stdouts.length === 1;
  const shell = extractArticle(stdouts[0]);
  if (!shell) {
    const subject = onlyChunk ? 'output' : 'first chunk output';
    const preview = stdouts[0].trim().slice(0, 400);
    const e = new Error(
      `claude: ${subject} did not contain an <article> element. Output preview:\n${preview}`
    );
    e.exitCode = 2;
    throw e;
  }
  if (onlyChunk) return shell;

  // Where does the shell end? Prefer the .doc wrapper's </div></article>, then a
  // bare </article>. Neither matching shouldn't happen — extractArticle
  // guarantees a trailing </article> — so that case just appends defensively.
  const closers = [
    { pattern: /<\/div>\s*<\/article>\s*$/i, suffix: '</div></article>' },
    { pattern: /<\/article>\s*$/i, suffix: '</article>' },
  ];
  const closer = closers.find(({ pattern }) => pattern.test(shell));
  const prefix = closer ? shell.replace(closer.pattern, '') : shell;
  const suffix = closer ? closer.suffix : '';

  const spliced = stdouts
    .slice(1)
    .map(stripChunkWrappers)
    .filter(Boolean)
    .map((content) => '\n' + content)
    .join('');
  return prefix + spliced + suffix;
}
// Reduce one chunk's stdout to bare content. Applied in this order:
//   1. if an <article>...</article> is present, keep only what is inside it;
//   2. drop every <style>...</style> (only the first chunk's styles are kept);
//   3. if a <div class="doc">...</div> wrapper is present, keep only its inside;
//   4. remove a leading/trailing markdown fence (some models add them despite
//      the prompt).
// A chunk that was already content-only comes back trimmed and otherwise intact.
function stripChunkWrappers(stdout) {
  const ARTICLE = /<article(?:\s[^>]*)?>([\s\S]*)<\/article>/i;
  const STYLE = /<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi;
  const DOC_DIV = /<div[^>]*class\s*=\s*["']doc["'][^>]*>([\s\S]*)<\/div>/i;
  // Inner capture of `pattern` when it matches, else the text unchanged.
  const unwrap = (text, pattern) => {
    const hit = text.match(pattern);
    return hit ? hit[1] : text;
  };

  const insideArticle = unwrap(stdout.trim(), ARTICLE);
  const unstyled = insideArticle.replace(STYLE, '');
  const insideDoc = unwrap(unstyled, DOC_DIV);
  const unfenced = insideDoc
    .replace(/^```(?:html)?\s*/i, '')
    .replace(/\s*```\s*$/i, '');
  return unfenced.trim();
}
// Extract the outermost <article>...</article> from an agent's stdout, which may
// also contain commentary, tool-use traces, or markdown fences: the span runs
// from the first opening <article ...> tag to the LAST closing </article>.
// Both tags are matched case-insensitively, so <ARTICLE>...</ARTICLE> is accepted
// (this also agrees with the case-insensitive closers used when merging chunks).
// Returns the trimmed element, or null when `text` is not a string, has no
// opening tag, or has no closing tag after the opening one.
function extractArticle(text) {
  if (typeof text !== 'string') return null;
  const start = text.search(/<article(?:\s[^>]*)?>/i);
  if (start < 0) return null;
  // Locate the last closing tag with a regex rather than lower-casing the text:
  // case conversion can change a string's length and would shift the indices.
  let closeAt = -1;
  let closeLen = 0;
  for (const m of text.matchAll(/<\/article>/gi)) {
    closeAt = m.index;
    closeLen = m[0].length;
  }
  if (closeAt < 0 || closeAt < start) return null;
  return text.slice(start, closeAt + closeLen).trim();
}