rewritable 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/commands.mjs CHANGED
@@ -1,10 +1,13 @@
1
1
  import fs from 'node:fs/promises';
2
2
  import path from 'node:path';
3
- import { fileURLToPath } from 'node:url';
3
+ import { fileURLToPath, pathToFileURL } from 'node:url';
4
+ import { spawn } from 'node:child_process';
4
5
  import crypto from 'node:crypto';
5
6
 
6
7
  import { loadSeed, applySeedSubs, replaceInlineDoc } from './seed.mjs';
7
8
  import { convert } from './import.mjs';
9
+ import { convertPdfViaVision } from './import-vision.mjs';
10
+ import { convertViaClaudeCli } from './import-claude.mjs';
8
11
 
9
12
  const here = path.dirname(fileURLToPath(import.meta.url));
10
13
  const packageRoot = path.dirname(here);
@@ -53,7 +56,81 @@ function rel(p) {
53
56
  return r || p;
54
57
  }
55
58
 
56
- export async function newCmd({ outPath, force }) {
59
/**
 * Resolve a single configuration variable.
 *
 * The live process environment wins; otherwise the `.env` file in the current
 * working directory is scanned line by line and the first assignment to
 * `name` is used. Supported syntax is deliberately small: `KEY=value`,
 * optional whitespace around the pieces, an optional leading `export`, and
 * one pair of matching single or double quotes around the value. There is no
 * interpolation and no multi-line value support; lines that do not look like
 * an assignment (blank lines, comments) are ignored.
 *
 * @param {string} name Variable name to look up.
 * @returns {Promise<string|null>} The value, or null when it is unset, empty,
 *   or the `.env` file cannot be read.
 */
async function readEnvKey(name) {
  const fromProcess = process.env[name];
  if (fromProcess) return fromProcess;

  let contents;
  try {
    contents = await fs.readFile(path.join(process.cwd(), '.env'), 'utf8');
  } catch (_) {
    // Best-effort lookup: an unreadable or missing .env just means "not set".
    return null;
  }

  const assignment = /^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*?)\s*$/;
  for (const row of contents.split('\n')) {
    const parsed = row.match(assignment);
    if (parsed === null || parsed[1] !== name) continue;

    const raw = parsed[2];
    const doubleQuoted = raw.startsWith('"') && raw.endsWith('"');
    const singleQuoted = raw.startsWith("'") && raw.endsWith("'");
    const value = doubleQuoted || singleQuoted ? raw.slice(1, -1) : raw;
    // The first assignment decides, even when its value is empty.
    return value || null;
  }
  return null;
}
79
+
80
/**
 * Pass a backend name through only if it is one of the known backends.
 *
 * Unknown input yields null instead of an exception: pre-fill is best-effort,
 * and an unrecognised value simply means no backend is pre-filled.
 *
 * @param {*} v Candidate backend name.
 * @returns {string|null} `v` itself when recognised, otherwise null.
 */
function validBackend(v) {
  switch (v) {
    case 'openrouter':
    case 'ollama':
    case 'lmstudio':
    case 'bridge':
      return v;
    default:
      return null;
  }
}
86
+
87
/**
 * Gather the URL-parameter pre-fills (`key`, `backend`, `model`) from the
 * environment or `./.env`.
 *
 * Each value is read via `readEnvKey`; the backend is additionally filtered
 * through `validBackend`. Anything missing or invalid is left out of the
 * result, so the returned object only ever carries usable values.
 *
 * @returns {Promise<{key?: string, backend?: string, model?: string}>}
 */
async function collectPrefill() {
  const key = await readEnvKey('OPENROUTER_API_KEY');
  const backend = validBackend(await readEnvKey('RWA_BACKEND'));
  const model = await readEnvKey('RWA_MODEL');

  const candidates = [
    ['key', key],
    ['backend', backend],
    ['model', model],
  ];
  return Object.fromEntries(candidates.filter(([, value]) => value));
}
100
+
101
/**
 * Open `target` with the platform's default handler, without waiting for it.
 *
 * When any pre-fill value is present the file is opened through a `file://`
 * URL carrying `key` / `backend` / `model` as query parameters; with no
 * pre-fill the bare path is passed unchanged.
 *
 * @param {string} target Path of the file to open.
 * @param {{key?: string, backend?: string, model?: string}} [prefill]
 *   Values to pass as URL parameters; falsy entries are ignored.
 * @returns {void} Launch failures are reported on stderr, never thrown.
 */
function openFile(target, prefill) {
  const { key, backend, model } = prefill || {};

  let arg = target;
  if (key || backend || model) {
    const url = pathToFileURL(target);
    if (key) url.searchParams.set('key', key);
    if (backend) url.searchParams.set('backend', backend);
    if (model) url.searchParams.set('model', model);
    arg = url.toString();
  }

  const options = { detached: true, stdio: 'ignore' };
  let cmd;
  let args;
  if (process.platform === 'darwin') {
    cmd = 'open';
    args = [arg];
  } else if (process.platform === 'win32') {
    // cmd.exe treats an unquoted `&` as a command separator, so a URL with
    // more than one query parameter would be cut at the first `&`. Node only
    // auto-quotes arguments containing whitespace or quotes, so quote the
    // target ourselves and pass the arguments through verbatim. The empty
    // `""` is the window title that `start` expects before a quoted target.
    cmd = 'cmd';
    args = ['/c', 'start', '""', `"${arg}"`];
    options.windowsVerbatimArguments = true;
  } else {
    cmd = 'xdg-open';
    args = [arg];
  }

  const child = spawn(cmd, args, options);
  child.on('error', (err) => {
    console.error(`note: could not open file (${err.code || err.message})`);
  });
  // Let the CLI exit without waiting for the viewer.
  child.unref();
}
132
+
133
+ export async function newCmd({ outPath, force, open }) {
57
134
  const out = path.resolve(outPath || './rewritable.html');
58
135
  await ensureWritable(out, force);
59
136
  const seed = await loadSeed(SEED_CANDIDATES);
@@ -66,9 +143,21 @@ export async function newCmd({ outPath, force }) {
66
143
  });
67
144
  await fs.writeFile(out, result, 'utf8');
68
145
  console.log(`wrote ${rel(out)}`);
146
+ if (open) {
147
+ const prefill = await collectPrefill();
148
+ if (prefill.key) console.error('note: passing OPENROUTER_API_KEY via ?key= URL parameter');
149
+ if (prefill.backend) console.error(`note: passing RWA_BACKEND=${prefill.backend} via ?backend= URL parameter`);
150
+ if (prefill.model) console.error(`note: passing RWA_MODEL=${prefill.model} via ?model= URL parameter`);
151
+ openFile(out, prefill);
152
+ }
69
153
  }
70
154
 
71
- export async function importCmd({ inputPath, outPath, force }) {
155
+ export async function importCmd({ inputPath, outPath, force, open, vision, claude, model, timeoutSec }) {
156
+ if (vision && claude) {
157
+ const e = new Error('--vision and --claude are mutually exclusive');
158
+ e.exitCode = 2;
159
+ throw e;
160
+ }
72
161
  const input = path.resolve(inputPath);
73
162
  const inputDir = path.dirname(input);
74
163
  const inputBasename = path.basename(input, path.extname(input));
@@ -76,8 +165,28 @@ export async function importCmd({ inputPath, outPath, force }) {
76
165
  await ensureWritable(out, force);
77
166
 
78
167
  const ext = path.extname(input).toLowerCase().replace(/^\./, '');
79
- const contents = await fs.readFile(input, 'utf8');
80
- const { html, warnings } = await convert(ext, contents);
168
+ let html, warnings;
169
+ if (vision) {
170
+ if (ext !== 'pdf') {
171
+ const e = new Error(`--vision is currently only supported for .pdf (got .${ext})`);
172
+ e.exitCode = 2;
173
+ throw e;
174
+ }
175
+ console.error('note: vision: posting to openrouter…');
176
+ // Buffer for HTTP base64 encoding.
177
+ const contents = await fs.readFile(input);
178
+ ({ html, warnings } = await convertPdfViaVision(contents, { model }));
179
+ } else if (claude) {
180
+ console.error(`note: claude: spawning \`claude -p\`…`);
181
+ // Pass the path; the skill reads the file itself via its own tools.
182
+ const claudeOpts = timeoutSec ? { timeoutMs: timeoutSec * 1000 } : {};
183
+ ({ html, warnings } = await convertViaClaudeCli(input, ext, claudeOpts));
184
+ } else {
185
+ // Buffer (not utf8 string) — docx and pdf are binary, and text formats
186
+ // decode internally inside convert().
187
+ const contents = await fs.readFile(input);
188
+ ({ html, warnings } = await convert(ext, contents));
189
+ }
81
190
  for (const w of warnings) console.error(`note: ${w}`);
82
191
 
83
192
  const seed = await loadSeed(SEED_CANDIDATES);
@@ -97,4 +206,11 @@ export async function importCmd({ inputPath, outPath, force }) {
97
206
  const result = replaceInlineDoc(subbed, html);
98
207
  await fs.writeFile(out, result, 'utf8');
99
208
  console.log(`wrote ${rel(out)}`);
209
+ if (open) {
210
+ const prefill = await collectPrefill();
211
+ if (prefill.key) console.error('note: passing OPENROUTER_API_KEY via ?key= URL parameter');
212
+ if (prefill.backend) console.error(`note: passing RWA_BACKEND=${prefill.backend} via ?backend= URL parameter`);
213
+ if (prefill.model) console.error(`note: passing RWA_MODEL=${prefill.model} via ?model= URL parameter`);
214
+ openFile(out, prefill);
215
+ }
100
216
  }
@@ -0,0 +1,336 @@
1
+ import { spawn } from 'node:child_process';
2
+ import { readFile } from 'node:fs/promises';
3
+ import path from 'node:path';
4
+ import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
5
+
6
+ // PDF / docx → HTML by spawning the `claude` CLI in print mode.
7
+ //
8
+ // PDFs are processed in PARALLEL: split into page ranges, each chunk
9
+ // handed to its own `claude -p` subprocess concurrently, then merged.
10
+ // Long papers go from sequential N×t to roughly t×ceil(chunks/concurrency).
11
+ //
12
+ // Why: the user's machine has Anthropic's official `pdf` and `docx` skills
13
+ // installed under ~/.claude/skills/. Those skills have rich Python tooling
14
+ // (pypdf, pdfplumber, pandoc, mammoth, LibreOffice) that the rwa CLI itself
15
+ // can't reasonably bundle. Calling `claude -p` lets the agent invoke its
16
+ // skill, run the local Python sandbox, and hand back clean semantic HTML —
17
+ // strictly better fidelity than either the local pdfjs heuristic or the
18
+ // raw-vision OpenRouter path, on documents where the skills apply.
19
+ //
20
+ // Trust model: this spawns a Claude Code subprocess with
21
+ // `--permission-mode bypassPermissions`, which lets the agent run shell
22
+ // commands and write files without prompting. The user already trusts
23
+ // their input file (they're importing it). Document this in HELP.
24
+
25
+ const SKILL_FOR_EXT = { pdf: 'pdf', docx: 'docx' };
26
+
27
+ const DEFAULT_CHUNK_SIZE = 5; // pages per chunk
28
+ const DEFAULT_CONCURRENCY = 4; // simultaneous claude -p subprocesses
29
+ const DEFAULT_TIMEOUT_MS = 1_200_000; // 20 minutes per chunk
30
+
31
// Builds the prompt text for one `claude -p` call.
//   skill     - name of the skill the prompt asks the agent to use
//   filePath  - path of the document, interpolated into the prompt as-is
//   pageRange - falsy for a whole-document prompt, or an object with
//               `start`, `end` and `totalPages` for a page-range prompt
// Returns the full prompt string. A page-range instruction is added whenever
// `pageRange` is given.
+ const PROMPT_TEMPLATE = (skill, filePath, pageRange) => {
32
+ const rangeNote = pageRange
33
+ ? `\n\nIMPORTANT: Process ONLY pages ${pageRange.start} to ${pageRange.end} (inclusive) of the document. Use the pdf skill's page-range support (pypdf/pdfplumber accept page indices) to extract just that slice. Do not output content from any other pages. The full document is ${pageRange.totalPages} pages; this chunk is pages ${pageRange.start}-${pageRange.end}.`
34
+ : '';
35
// Only ranges that do not start at page 1 get the "content only, no
// <article>/<style> wrapper" instruction; a range starting at page 1 keeps
// the full required structure described further down in the prompt.
+ const styleNote = pageRange && pageRange.start > 1
36
+ ? `\n\nIMPORTANT (chunk ${pageRange.start}-${pageRange.end}): omit the leading <style> and @page rules. Output ONLY the inner content of the .doc wrapper for these pages — start your output with the actual content elements (e.g., <h2>, <p>, <table>...) and end with the last content element. Do NOT include <article>, <style>, <div class="doc">, or </article>, </div>. Just the content of pages ${pageRange.start}-${pageRange.end}, ready to splice into a larger document. The first chunk handled the styling; later chunks contribute content only.`
37
+ : '';
38
+
39
+ return `Use the ${skill} skill to extract the content of ${filePath} and convert it to a single <article>...</article> element that VISUALLY MATCHES the original document as closely as possible when rendered in a browser.${rangeNote}${styleNote}
40
+
41
+ The output will be embedded inside a re-writeable document container that has its own dark-theme CSS. Your <article> must include a leading scoped <style> block that defines its own visual appearance, so the container's theme does not bleed in.
42
+
43
+ Required structure (full-document or first-chunk only — see chunk note above):
44
+
45
+ <article style="all: revert;">
46
+ <style>
47
+ /* Scope every rule to .doc to avoid leaking into the container.
48
+ Use 'all: revert' or explicit resets to neutralize the container's theme. */
49
+ .doc { background: ...; color: ...; font-family: ...; padding: ...; max-width: ...; margin: 0 auto; }
50
+ .doc h1, .doc h2, .doc p, .doc table, .doc th, .doc td { ... }
51
+ /* etc. */
52
+ </style>
53
+ <div class="doc">
54
+ ... actual content ...
55
+ </div>
56
+ </article>
57
+
58
+ Style requirements (match the source PDF):
59
+ - Background color (usually white #ffffff for printed documents).
60
+ - Text color (usually black #000000 or near-black).
61
+ - Font family — pick a generic match: invoices and letters use sans-serif (Helvetica, Arial, system-ui); academic/literary uses serif (Georgia, Times New Roman); monospaced text uses monospace.
62
+ - Font sizes — match the visual hierarchy (titles bigger, body smaller, footnotes smallest).
63
+ - Text alignment — left, right, center, or justify, matching each block in the source.
64
+ - Right-aligned blocks (sender addresses, dates) MUST remain right-aligned via CSS.
65
+ - Padding/margins around sections that mirror the PDF's vertical density. Crucially, do NOT inflate vertical spacing — if the source fits on N pages, your output should fit on N pages when printed at the source paper size. Prefer tight margins (~0.5em-1em between blocks) over generous ones; a single-page invoice should remain a single-page invoice.
66
+ - Tables — borders, cell padding, header weight, alternating rows or shading where the PDF has them.
67
+ - Bold and italic where used, via <strong>/<em> (preferred) or font-weight/font-style in the scoped CSS.
68
+
69
+ Print-fit requirements (REQUIRED for documents that match a paper size):
70
+ - Include an @media print rule inside the scoped <style> block that:
71
+ * Removes any max-width constraint (so the doc fills the page width).
72
+ * Sets margin:0 / padding:0 on .doc so the printer's @page margin (default 0.5in) is the only outer margin.
73
+ * Optionally tightens block spacing further if the source page density is dense.
74
+ * Uses page-break-inside:avoid on tables, headers, and footer blocks so they don't split awkwardly across pages.
75
+ - Add an @page rule with size matching the source (default A4 if uncertain): @page { size: A4; margin: 0.5in; }
76
+
77
+ Content requirements:
78
+ - Use semantic tags: <h1>-<h6>, <p>, <ul>/<ol>/<li>, <table>/<thead>/<tbody>/<tr>/<td>/<th>, <strong>/<em>, <a href="...">.
79
+ - Preserve text exactly. Do not summarize, paraphrase, or reword.
80
+ - Reconstruct multi-column layouts as the source has them: side-by-side blocks via CSS flex/grid in your scoped styles, or as table cells if that fits better.
81
+ - No <img> tags. No <script>. No external resources (no @import, no <link>, no Google Fonts URLs — only system or generic font families).
82
+ - No id attributes. Class names should be scoped under .doc to avoid collisions with the container.
83
+ - Do not include <html>, <head>, <body>, or <!doctype>.
84
+
85
+ Print ONLY the final HTML as your last response. No preamble, no markdown fences, no commentary.`;
86
+ };
87
+
88
+ /**
89
+ * @param {string} filePath Absolute path to the file to import
90
+ * @param {string} ext Extension without dot ("pdf" or "docx")
91
+ * @param {object} [opts]
92
+ * @param {AbortSignal} [opts.signal]
93
+ * @param {number} [opts.timeoutMs] Wall-clock cap PER CHUNK (default 20min)
94
+ * @param {number} [opts.chunkSize] Pages per chunk for PDFs (default 5)
95
+ * @param {number} [opts.concurrency] Max simultaneous subprocesses (default 4)
96
+ * @returns {Promise<{ html: string, warnings: string[] }>}
97
+ */
98
+ export async function convertViaClaudeCli(filePath, ext, opts = {}) {
99
+ const skill = SKILL_FOR_EXT[ext];
100
+ if (!skill) {
101
+ const e = new Error(`--claude only supports .pdf and .docx (got .${ext})`);
102
+ e.exitCode = 2;
103
+ throw e;
104
+ }
105
+
106
+ // docx isn't naturally page-chunkable (no fixed page boundaries inside the
107
+ // XML). Single call.
108
+ if (ext !== 'pdf') {
109
+ const stdout = await runClaude(filePath, PROMPT_TEMPLATE(skill, filePath, null), opts);
110
+ const html = extractArticle(stdout);
111
+ if (!html) {
112
+ const preview = stdout.trim().slice(0, 400);
113
+ const e = new Error(
114
+ `claude: output did not contain an <article> element. Output preview:\n${preview}`
115
+ );
116
+ e.exitCode = 2;
117
+ throw e;
118
+ }
119
+ return {
120
+ html,
121
+ warnings: [`claude: imported via \`claude -p\` (${skill} skill)`],
122
+ };
123
+ }
124
+
125
+ const totalPages = await getPdfPageCount(filePath);
126
+ const chunkSize = opts.chunkSize || DEFAULT_CHUNK_SIZE;
127
+ const concurrency = opts.concurrency || DEFAULT_CONCURRENCY;
128
+
129
+ const ranges = [];
130
+ for (let start = 1; start <= totalPages; start += chunkSize) {
131
+ const end = Math.min(start + chunkSize - 1, totalPages);
132
+ ranges.push({ start, end, totalPages });
133
+ }
134
+
135
+ console.error(
136
+ `note: claude: ${totalPages}-page PDF → ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} of ≤${chunkSize} pages, ${Math.min(concurrency, ranges.length)} parallel`
137
+ );
138
+
139
+ const htmlChunks = await runWithConcurrency(ranges, concurrency, async (range, idx) => {
140
+ console.error(`note: claude: chunk ${idx + 1}/${ranges.length} (pages ${range.start}-${range.end}) starting…`);
141
+ const prompt = PROMPT_TEMPLATE(skill, filePath, range);
142
+ const html = await runClaude(filePath, prompt, opts);
143
+ console.error(`note: claude: chunk ${idx + 1}/${ranges.length} done`);
144
+ return html;
145
+ });
146
+
147
+ const merged = mergeChunks(htmlChunks);
148
+ return {
149
+ html: merged,
150
+ warnings: [
151
+ `claude: imported ${ranges.length} chunk${ranges.length === 1 ? '' : 's'} via parallel \`claude -p\` (${skill} skill)`,
152
+ ],
153
+ };
154
+ }
155
+
156
+ // Run a single `claude -p` invocation. Returns the extracted HTML for the
157
+ // chunk (either a full <article> or content-only fragment depending on the
158
+ // prompt's chunk hint).
159
+ function runClaude(filePath, prompt, { signal, timeoutMs = DEFAULT_TIMEOUT_MS } = {}) {
160
+ const args = [
161
+ '-p',
162
+ '--output-format', 'text',
163
+ '--add-dir', path.dirname(filePath),
164
+ '--permission-mode', 'bypassPermissions',
165
+ prompt,
166
+ ];
167
+
168
+ return new Promise((resolve, reject) => {
169
+ let proc;
170
+ try {
171
+ proc = spawn('claude', args, { stdio: ['ignore', 'pipe', 'pipe'], signal });
172
+ } catch (err) {
173
+ const e = new Error(`claude: failed to spawn (${err && err.message ? err.message : String(err)}). Is the claude CLI installed?`);
174
+ e.exitCode = 2;
175
+ return reject(e);
176
+ }
177
+
178
+ let stdout = '';
179
+ let stderr = '';
180
+ proc.stdout.on('data', d => { stdout += d.toString('utf8'); });
181
+ proc.stderr.on('data', d => { stderr += d.toString('utf8'); });
182
+
183
+ const timer = setTimeout(() => {
184
+ proc.kill('SIGKILL');
185
+ const e = new Error(`claude: timed out after ${Math.round(timeoutMs / 1000)}s`);
186
+ e.exitCode = 2;
187
+ reject(e);
188
+ }, timeoutMs);
189
+
190
+ proc.on('error', err => {
191
+ clearTimeout(timer);
192
+ const e = new Error(`claude: spawn error (${err.code || err.message})`);
193
+ e.exitCode = 2;
194
+ reject(e);
195
+ });
196
+
197
+ proc.on('close', code => {
198
+ clearTimeout(timer);
199
+ if (code !== 0) {
200
+ const tail = stderr.trim().split('\n').slice(-5).join('\n').slice(0, 800);
201
+ const e = new Error(`claude -p exited ${code}${tail ? '\n' + tail : ''}`);
202
+ e.exitCode = 2;
203
+ return reject(e);
204
+ }
205
+ // Output may be a full <article>...</article> (first chunk / single call)
206
+ // or just inner content (later chunks). Hand the full stdout to the
207
+ // merger; it knows how to extract either shape.
208
+ resolve(stdout);
209
+ });
210
+ });
211
+ }
212
+
213
+ // Bounded-concurrency parallel runner. Items are processed in input order
214
+ // up to `concurrency` at a time. Order of `results[]` matches input order,
215
+ // regardless of completion order.
216
+ async function runWithConcurrency(items, concurrency, fn) {
217
+ const results = new Array(items.length);
218
+ let nextIdx = 0;
219
+ const worker = async () => {
220
+ while (true) {
221
+ const myIdx = nextIdx++;
222
+ if (myIdx >= items.length) break;
223
+ results[myIdx] = await fn(items[myIdx], myIdx);
224
+ }
225
+ };
226
+ const workers = Array.from({ length: Math.min(concurrency, items.length) }, worker);
227
+ await Promise.all(workers);
228
+ return results;
229
+ }
230
+
231
/**
 * Read a PDF from disk and report how many pages pdfjs sees in it.
 *
 * @param {string} filePath Path of the PDF file.
 * @returns {Promise<number>} The document's `numPages`.
 * @throws {Error} with `exitCode = 2` when pdfjs cannot load the document.
 *   Errors from reading the file itself propagate unchanged.
 */
async function getPdfPageCount(filePath) {
  const fileBytes = await readFile(filePath);
  // View over the same memory as the Buffer, respecting its offset and length.
  const view = new Uint8Array(fileBytes.buffer, fileBytes.byteOffset, fileBytes.byteLength);

  let pdf;
  try {
    const loadingTask = pdfjs.getDocument({ data: view, isEvalSupported: false });
    pdf = await loadingTask.promise;
  } catch (err) {
    const reason = err && err.message ? err.message : String(err);
    const failure = new Error(`claude: failed to read PDF page count (${reason})`);
    failure.exitCode = 2;
    throw failure;
  }

  const { numPages } = pdf;
  // Cleanup is best-effort; a failed destroy must not mask the page count.
  await pdf.destroy().catch(() => {});
  return numPages;
}
246
+
247
+ // Merge per-chunk HTML output into a single <article>. The first chunk's
248
+ // output is treated as a full <article> with leading <style>/@page; later
249
+ // chunks are content-only fragments (per their prompt). We:
250
+ // 1. Extract the first chunk's full <article ...>...<style>...</style>...<div class="doc"> shell
251
+ // 2. Append each later chunk's content fragments inside that .doc
252
+ // 3. Close with </div></article>
253
+ //
254
+ // If a later chunk DID emit a full <article>+<style> (the model ignored the
255
+ // chunk hint), strip its <article>/<style>/<div class="doc"> wrappers and
256
+ // keep only its inner content.
257
+ function mergeChunks(stdouts) {
258
+ if (stdouts.length === 1) {
259
+ const html = extractArticle(stdouts[0]);
260
+ if (!html) {
261
+ const preview = stdouts[0].trim().slice(0, 400);
262
+ const e = new Error(
263
+ `claude: output did not contain an <article> element. Output preview:\n${preview}`
264
+ );
265
+ e.exitCode = 2;
266
+ throw e;
267
+ }
268
+ return html;
269
+ }
270
+
271
+ const first = extractArticle(stdouts[0]);
272
+ if (!first) {
273
+ const preview = stdouts[0].trim().slice(0, 400);
274
+ const e = new Error(
275
+ `claude: first chunk output did not contain an <article> element. Output preview:\n${preview}`
276
+ );
277
+ e.exitCode = 2;
278
+ throw e;
279
+ }
280
+
281
+ // Find the .doc wrapper closing in the first chunk, so we can splice
282
+ // additional content before it. Prefer </div></article>; fall back to just
283
+ // </article> if no .doc wrapper exists.
284
+ const closingDocArticle = /<\/div>\s*<\/article>\s*$/i;
285
+ const closingArticleOnly = /<\/article>\s*$/i;
286
+ let prefix, suffix;
287
+ if (closingDocArticle.test(first)) {
288
+ prefix = first.replace(closingDocArticle, '');
289
+ suffix = '</div></article>';
290
+ } else if (closingArticleOnly.test(first)) {
291
+ prefix = first.replace(closingArticleOnly, '');
292
+ suffix = '</article>';
293
+ } else {
294
+ // Shouldn't happen — extractArticle guarantees </article>. Defensive.
295
+ prefix = first;
296
+ suffix = '';
297
+ }
298
+
299
+ const additional = stdouts.slice(1).map(stripChunkWrappers).filter(Boolean);
300
+ return [prefix, ...additional.map(c => '\n' + c), suffix].join('');
301
+ }
302
+
303
+ // Pull content out of a chunk's stdout. If the chunk emitted a full
304
+ // <article>+<style>+<div class="doc">...</div></article> (because the model
305
+ // ignored the "content-only" hint), strip those wrappers and the <style>.
306
+ // Otherwise return the cleaned stdout (already content-only).
307
+ function stripChunkWrappers(stdout) {
308
+ let body = stdout.trim();
309
+
310
+ // If wrapped in <article>...</article>, take only the inside.
311
+ const articleMatch = body.match(/<article(?:\s[^>]*)?>([\s\S]*)<\/article>/i);
312
+ if (articleMatch) body = articleMatch[1];
313
+
314
+ // Strip any <style>...</style> (we keep only the first chunk's styles).
315
+ body = body.replace(/<style(?:\s[^>]*)?>[\s\S]*?<\/style>/gi, '');
316
+
317
+ // Strip <div class="doc">...</div> wrapper if present.
318
+ const docMatch = body.match(/<div[^>]*class\s*=\s*["']doc["'][^>]*>([\s\S]*)<\/div>/i);
319
+ if (docMatch) body = docMatch[1];
320
+
321
+ // Strip stray markdown fences (some models add them despite the prompt).
322
+ body = body.replace(/^```(?:html)?\s*/i, '').replace(/\s*```\s*$/i, '');
323
+
324
+ return body.trim();
325
+ }
326
+
327
+ // Extract the outermost <article>...</article>. The agent's stdout might
328
+ // include thinking commentary, tool-use traces, or markdown fences in
329
+ // addition to the HTML; pull out only the article element.
330
+ function extractArticle(text) {
331
+ const start = text.search(/<article(?:\s[^>]*)?>/i);
332
+ if (start < 0) return null;
333
+ const end = text.lastIndexOf('</article>');
334
+ if (end < 0 || end < start) return null;
335
+ return text.slice(start, end + '</article>'.length).trim();
336
+ }
@@ -0,0 +1,156 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+
4
+ // PDF → HTML via OpenRouter chat completions.
5
+ //
6
+ // Why this exists: pdfjs's text extraction produces flat-paragraph output
7
+ // that loses tables, multi-column layouts, and any text whose font has a
8
+ // broken toUnicode CMap (e.g. "Ü" decoded as "UY"). Sending the raw PDF to
9
+ // a vision-capable model bypasses both — the model reads the rendered
10
+ // content and reconstructs semantic HTML.
11
+ //
12
+ // Trade-off: ~$0.01-$0.05 per page in API costs, network round-trip
13
+ // latency. Opt-in via `rwa import file.pdf --vision`.
14
+ //
15
+ // Wire format: OpenRouter's PDF input docs say content type is "file" with
16
+ // `file_data: "data:application/pdf;base64,..."`. For Anthropic models OR
17
+ // passes this through as a native PDF document block; for others (Gemini,
18
+ // GPT-4o), it's routed through OR's file-parser plugin (engine "native"
19
+ // uses the model's own multimodal capability).
20
+
21
+ const OPENROUTER_URL = 'https://openrouter.ai/api/v1/chat/completions';
22
+
23
+ const SYSTEM_PROMPT = `You are converting a PDF document into clean, semantic HTML for embedding in a single-file rewritable document container.
24
+
25
+ Output requirements:
26
+ - A single <article> element containing all document content.
27
+ - Use semantic HTML: <h1>-<h6> for headings, <p> for paragraphs, <ul>/<ol>/<li> for lists, <table><thead><tbody><tr><td>/<th> for tables, <strong>/<em> for emphasis, <a href="..."> for links.
28
+ - Do NOT output <html>, <head>, <body>, <!doctype>, any preamble, or any explanation before or after the HTML.
29
+ - Do NOT wrap output in markdown code fences (no \`\`\`html).
30
+ - Preserve text content exactly — do not summarize, paraphrase, translate, or reword.
31
+ - Reconstruct multi-column layouts and tables faithfully. Table headers go in <thead>, body rows in <tbody>.
32
+ - Omit <img> entirely; this container is text-focused. If an image carries information, describe it briefly in a <p>.
33
+ - No <script>, <style>, class, or id attributes. Plain semantic HTML only.
34
+
35
+ Output ONLY the <article>...</article> element.`;
36
+
37
+ const USER_PROMPT = 'Convert this PDF document to a single <article> element of clean semantic HTML, following the rules in the system prompt.';
38
+
39
+ /**
40
+ * @param {Buffer|Uint8Array} bytes PDF content
41
+ * @param {object} [opts]
42
+ * @param {string} [opts.apiKey] OpenRouter API key. If omitted, read from
43
+ * process.env.OPENROUTER_API_KEY, then ./.env
44
+ * @param {string} [opts.model] OpenRouter model id; default reuses
45
+ * the rwa container's default
46
+ * @param {AbortSignal} [opts.signal]
47
+ * @returns {Promise<{ html: string, warnings: string[] }>}
48
+ */
49
+ export async function convertPdfViaVision(bytes, { apiKey, model, signal } = {}) {
50
+ apiKey = apiKey || process.env.OPENROUTER_API_KEY || await readDotEnvKey('OPENROUTER_API_KEY');
51
+ if (!apiKey) {
52
+ const e = new Error('vision: OPENROUTER_API_KEY is required (set in env or ./.env)');
53
+ e.exitCode = 2;
54
+ throw e;
55
+ }
56
+ const buf = Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes);
57
+ const dataUri = `data:application/pdf;base64,${buf.toString('base64')}`;
58
+
59
+ const body = {
60
+ model: model || 'google/gemini-3-flash-preview',
61
+ messages: [
62
+ { role: 'system', content: SYSTEM_PROMPT },
63
+ {
64
+ role: 'user',
65
+ content: [
66
+ { type: 'text', text: USER_PROMPT },
67
+ { type: 'file', file: { filename: 'document.pdf', file_data: dataUri } },
68
+ ],
69
+ },
70
+ ],
71
+ // Generous output budget — long PDFs can produce a lot of HTML.
72
+ // OpenRouter will clamp to model's actual max if smaller.
73
+ max_tokens: 16384,
74
+ // Deterministic output — we want the same HTML for the same input.
75
+ temperature: 0,
76
+ };
77
+
78
+ const res = await fetch(OPENROUTER_URL, {
79
+ method: 'POST',
80
+ headers: {
81
+ 'Authorization': `Bearer ${apiKey}`,
82
+ 'Content-Type': 'application/json',
83
+ // Recommended by OpenRouter for tracking, helps with rate-limit accounting.
84
+ 'HTTP-Referer': 'https://github.com/martintreiber/rewritable',
85
+ 'X-Title': 'rwa CLI',
86
+ },
87
+ body: JSON.stringify(body),
88
+ signal,
89
+ });
90
+
91
+ if (!res.ok) {
92
+ const text = await res.text().catch(() => '');
93
+ const e = new Error(`vision: openrouter ${res.status}${text ? ': ' + text.slice(0, 500) : ''}`);
94
+ e.exitCode = 2;
95
+ throw e;
96
+ }
97
+
98
+ const json = await res.json();
99
+ const content = json?.choices?.[0]?.message?.content;
100
+ if (typeof content !== 'string' || !content.trim()) {
101
+ const e = new Error('vision: openrouter returned empty content');
102
+ e.exitCode = 2;
103
+ throw e;
104
+ }
105
+
106
+ const html = extractArticle(content);
107
+ if (!html) {
108
+ const e = new Error(
109
+ `vision: model output did not contain an <article> element. Output preview:\n${content.slice(0, 300)}`
110
+ );
111
+ e.exitCode = 2;
112
+ throw e;
113
+ }
114
+
115
+ const warnings = [];
116
+ // Surface usage so the user sees what each import cost.
117
+ const usage = json?.usage;
118
+ if (usage) {
119
+ const tokens = `${usage.prompt_tokens || 0} in / ${usage.completion_tokens || 0} out`;
120
+ warnings.push(`vision: ${body.model} (${tokens} tokens)`);
121
+ }
122
+ return { html, warnings };
123
+ }
124
+
125
+ // Minimal .env reader for the OPENROUTER_API_KEY fallback path. Handles
126
+ // KEY=value with optional surrounding whitespace, optional matched quotes,
127
+ // optional `export` prefix. No interpolation, no multi-line values. Returns
128
+ // null if the file or key is missing.
129
+ async function readDotEnvKey(name) {
130
+ let text;
131
+ try {
132
+ text = await fs.readFile(path.join(process.cwd(), '.env'), 'utf8');
133
+ } catch (_) { return null; }
134
+ for (const line of text.split('\n')) {
135
+ const m = line.match(/^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*?)\s*$/);
136
+ if (!m || m[1] !== name) continue;
137
+ let v = m[2];
138
+ if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) {
139
+ v = v.slice(1, -1);
140
+ }
141
+ return v || null;
142
+ }
143
+ return null;
144
+ }
145
+
146
+ // Extract the outermost <article>...</article>. Models often wrap output in
147
+ // ```html fences or add a "Here is the HTML:" preamble despite the system
148
+ // prompt; pull out only the article element to be robust to that.
149
+ function extractArticle(text) {
150
+ // Find the first <article (allow attributes) and the LAST </article>.
151
+ const start = text.search(/<article(?:\s[^>]*)?>/i);
152
+ if (start < 0) return null;
153
+ const end = text.lastIndexOf('</article>');
154
+ if (end < 0 || end < start) return null;
155
+ return text.slice(start, end + '</article>'.length).trim();
156
+ }