rewritable 0.15.0 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -75,10 +75,12 @@ Embeds the input file's content as the document's initial state. Supported forma
75
75
  - `.csv` — parsed via [`papaparse`](https://www.papaparse.com/) (RFC 4180; handles quoted commas, embedded newlines, escaped quotes, BOM). First row becomes `<thead>`, remaining rows `<tbody>`; every cell is HTML-escaped. Parse warnings print to stderr but don't abort the import.
76
76
  - `.txt` — paragraph-split on blank lines, HTML chars escaped
77
77
  - `.docx` — converted via [`mammoth`](https://github.com/mwilliamson/mammoth.js) to semantic HTML; `href`/`src` URLs are scheme-sanitized (same allow-list as `.md`).
78
- - `.pdf` — reconstructed with **maximum geometry fidelity**: each page is rebuilt as positioned, real (still editable) text at its original coordinates, with the document's rules and boxes drawn from the PDF's own vector operators — so an invoice, form, or statement *looks like the original* while staying a rewritable you can edit with `⌘K`. Bold/italic are recovered from the embedded font names; near-perfect, not pixel-exact (system substitute fonts, black text). A scanned/image-only PDF (no text layer) exits `2` — OCR is not supported. For a model-based alternative, see `--vision` / `--claude`.
78
+ - `.pdf` — reconstructed with **maximum geometry fidelity**: each page is rebuilt as positioned, real (still editable) text at its original coordinates, with the document's rules and boxes drawn from the PDF's own vector operators — so an invoice, form, or statement *looks like the original* while staying a rewritable you can edit with `⌘K`. Bold/italic are recovered from the embedded font names; near-perfect, not pixel-exact (system substitute fonts, black text). It also **exports back to PDF at the source page size, edge-to-edge** (`⌘P` / Save as PDF) — true scale, no double margin. A scanned/image-only PDF (no text layer) exits `2` — OCR is not supported. For a model-based alternative, see `--vision` / `--claude`.
79
79
 
80
80
  Output defaults to `<input-basename>.html` in the input's directory. Conversion is deterministic and offline — no API key, no network.
81
81
 
82
+ **Import fidelity loop (PDF).** After a PDF import, `rwa` runs an offline **structural fidelity check** (text-coverage + extraction-quality). If it's low *and* a model is reachable (`RWA_OPENROUTER_KEY` or `OPENROUTER_API_KEY` set), it **auto-escalates** to `--vision` and keeps the `--vision` result (falling back to the offline import, with a note, if the escalation fails) — announced on stderr. **Offline-first is preserved:** with no key, a low-fidelity import stays offline, succeeds, and just warns (suggesting `--vision`/`--claude`). Controls: `--no-escalate` (disable the loop), `--target-fidelity <0..1>` (set the bar, default 0.85). *(The deterministic geometry import is unchanged; the loop only measures and, when authorized, escalates. The full picture-level visual judge is a browser-side follow-up.)*
83
+
82
84
  ### `rwa clone <url> [path]`
83
85
 
84
86
  Clone a public webpage into a self-contained rewritable: fetch the page, extract its main article and title, and bake the content into a fresh container. First-class for **WordPress / ikangai posts** — a blog post becomes an editable, shareable single-file `.html` you can rewrite with `⌘K`.
package/bin/rwa.mjs CHANGED
@@ -1217,6 +1217,16 @@ function detectProductKind(fileText) {
1217
1217
  const vision = rest.includes('--vision');
1218
1218
  const claude = rest.includes('--claude');
1219
1219
  const trustInput = rest.includes('--trust-input');
1220
+ // Import fidelity loop (PDF): measure the deterministic import + auto-escalate to --vision when
1221
+ // low AND a model is reachable. --no-escalate opts out; --target-fidelity <0..1> sets the bar.
1222
+ const escalate = !rest.includes('--no-escalate');
1223
+ const tfIdx = rest.indexOf('--target-fidelity');
1224
+ const targetFidelity = tfIdx >= 0 ? Number(rest[tfIdx + 1]) : undefined;
1225
+ if (tfIdx >= 0 && (!Number.isFinite(targetFidelity) || targetFidelity < 0 || targetFidelity > 1)) {
1226
+ console.error('rwa import: --target-fidelity must be a number between 0 and 1');
1227
+ process.exitCode = 2;
1228
+ return;
1229
+ }
1220
1230
  // --model and --timeout take a value: find the index, then take the next arg.
1221
1231
  const modelIdx = rest.indexOf('--model');
1222
1232
  const model = modelIdx >= 0 ? rest[modelIdx + 1] : undefined;
@@ -1249,7 +1259,7 @@ function detectProductKind(fileText) {
1249
1259
  process.exitCode = 2;
1250
1260
  return;
1251
1261
  }
1252
- const positional = rest.filter((a, i) => !a.startsWith('-') && rest[i - 1] !== '--model' && rest[i - 1] !== '--timeout' && rest[i - 1] !== '--kind' && rest[i - 1] !== '--skin');
1262
+ const positional = rest.filter((a, i) => !a.startsWith('-') && rest[i - 1] !== '--model' && rest[i - 1] !== '--timeout' && rest[i - 1] !== '--kind' && rest[i - 1] !== '--skin' && rest[i - 1] !== '--target-fidelity');
1253
1263
  if (verb === 'new') {
1254
1264
  // `rwa new --kind <starter>` selects a built-in starter. Otherwise a bare-word
1255
1265
  // first positional is a TEMPLATE name (clone a data-rwa-template-labeled file
@@ -1268,7 +1278,7 @@ function detectProductKind(fileText) {
1268
1278
  process.exitCode = 2;
1269
1279
  return;
1270
1280
  }
1271
- await importCmd({ inputPath: positional[0], outPath: positional[1], force, open, vision, claude, trustInput, model, timeoutSec });
1281
+ await importCmd({ inputPath: positional[0], outPath: positional[1], force, open, vision, claude, trustInput, model, timeoutSec, escalate, targetFidelity });
1272
1282
  } else {
1273
1283
  console.error(`rwa: unknown verb "${verb}". Try --help.`);
1274
1284
  process.exitCode = 2;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "rewritable",
3
- "version": "0.15.0",
3
+ "version": "0.16.1",
4
4
  "description": "CLI for re-writeable: emit and import single-file rwa documents.",
5
5
  "type": "module",
6
6
  "bin": {
package/src/commands.mjs CHANGED
@@ -228,7 +228,7 @@ export async function newCmd({ outPath, force, open, kind, templateName, skin })
228
228
 
229
229
  export { KNOWN_KINDS };
230
230
 
231
- export async function importCmd({ inputPath, outPath, force, open, vision, claude, trustInput, model, timeoutSec }) {
231
+ export async function importCmd({ inputPath, outPath, force, open, vision, claude, trustInput, model, timeoutSec, escalate, targetFidelity }) {
232
232
  if (vision && claude) {
233
233
  const e = new Error('--vision and --claude are mutually exclusive');
234
234
  e.exitCode = 2;
@@ -265,7 +265,31 @@ export async function importCmd({ inputPath, outPath, force, open, vision, claud
265
265
  // Buffer (not utf8 string) — docx and pdf are binary, and text formats
266
266
  // decode internally inside convert().
267
267
  const contents = await fs.readFile(input);
268
- ({ html, warnings } = await convert(ext, contents));
268
+ const conv = await convert(ext, contents);
269
+ ({ html, warnings } = conv);
270
+ // Import fidelity loop (PDF) — measure the deterministic import; on a low structural score,
271
+ // auto-escalate to --vision, but ONLY when a model is reachable (offline-first: a keyless
272
+ // import stays offline and warns). `--no-escalate` opts out. Design: docs/plans/2026-06-30-import-fidelity-loop-design.md
273
+ if (ext === 'pdf' && conv.fidelityInput && escalate !== false) {
274
+ const { measureAndEscalate } = await import('./import-fidelity.mjs');
275
+ // Resolve the OpenRouter key ONCE so the reachability probe and the actual escalation target
276
+ // agree (both honor RWA_OPENROUTER_KEY, the project-preferred var, AND OPENROUTER_API_KEY).
277
+ // convertPdfViaVision otherwise only reads OPENROUTER_API_KEY, so a probe that said "reachable"
278
+ // on RWA_OPENROUTER_KEY alone would escalate into a guaranteed "key required" throw.
279
+ const orKey = process.env.RWA_OPENROUTER_KEY || process.env.OPENROUTER_API_KEY;
280
+ const r = await measureAndEscalate(
281
+ { structuralInput: conv.fidelityInput, importResult: conv },
282
+ {
283
+ threshold: targetFidelity,
284
+ escalate: escalate !== false,
285
+ modelReachable: () => !!orKey,
286
+ visionImport: async () => { console.error('note: import fidelity low — escalating to --vision (openrouter)…'); return convertPdfViaVision(contents, { model, apiKey: orKey }); },
287
+ },
288
+ );
289
+ if (r.note) console.error('note: ' + r.note);
290
+ html = r.result.html;
291
+ warnings = r.result.warnings || warnings;
292
+ }
269
293
  }
270
294
  for (const w of warnings) console.error(`note: ${w}`);
271
295
 
@@ -0,0 +1,131 @@
1
+ // Import fidelity loop — increment 1 (docs/plans/2026-06-30-import-fidelity-loop-design.md).
2
+ // An OFFLINE structural check on a PDF import + auto-escalate UP the ladder (default → --vision)
3
+ // when fidelity is low — gated on offline-first: escalation fires only when a model is reachable;
4
+ // a keyless low-fidelity import stays offline and warns, never touching the network.
5
+ //
6
+ // Increment 1 measures two false-positive-free signals:
7
+ // - coverage: fraction of source word-tokens present in the imported text (transform fidelity —
8
+ // catches dropped/mangled content; ~1 for a faithful geometry import).
9
+ // - garble: 1 − share of replacement (U+FFFD) / control chars in the source (extraction
10
+ // quality — a PDF with broken font encoding extracts to garbage; the geometry import
11
+ // is then unfaithful and should escalate to --vision, which reads the rendered glyphs).
12
+ // The graphics/visual signal ("this page is a chart/scan the text import can't reproduce") needs a
13
+ // renderer; it is the browser-side VISUAL JUDGE, a later increment. `density` is returned for
14
+ // callers/future use but deliberately NOT scored here (char-count alone false-positives on short
15
+ // docs) and is not yet surfaced in CLI output.
16
+ //
17
+ // Sensitivity, stated honestly: this is a FLOOR, biased to never over-escalate a good import.
18
+ // coverage uses substring `includes`, so reordered/duplicated text scores ~1.0; garble only counts
19
+ // U+FFFD + control chars, so wrong-encoding glyphs that aren't U+FFFD score ~1.0. The offline
20
+ // trigger therefore reliably fires only on DROPPED text or replacement-char garble — visual/glyph
21
+ // faithfulness is the VLM judge's job, not this metric's.
22
+
23
+ const DEFAULT_THRESHOLD = 0.85;
24
+ const MIN_CHARS_PER_PAGE = 200; // informational density floor only (not part of the score in inc. 1)
25
+
26
+ const norm = (s) => String(s == null ? '' : s).replace(/\s+/g, ' ').trim().toLowerCase();
27
+ const stripTags = (h) => String(h == null ? '' : h).replace(/<[^>]*>/g, ' ');
28
+
29
+ // Count replacement (U+FFFD) + control chars (excluding tab/LF/CR) without embedding literal
30
+ // control bytes in source — a char-code scan, so the file stays clean ASCII.
31
+ function badChars(src) {
32
+ let bad = 0;
33
+ for (let i = 0; i < src.length; i++) {
34
+ const c = src.charCodeAt(i);
35
+ if (c === 0xFFFD || (c < 0x20 && c !== 9 && c !== 10 && c !== 13)) bad++;
36
+ }
37
+ return bad;
38
+ }
39
+
40
+ /** Pure offline structural score in [0,1] = min(coverage, garble). Returns the components + reasons. */
41
+ export function structuralScore({ sourceText, pages } = {}, importedHtml) {
42
+ const src = String(sourceText == null ? '' : sourceText);
43
+ const importText = norm(stripTags(importedHtml));
44
+
45
+ // coverage — unique source word-tokens (len>1) present in the imported text
46
+ const tokens = [...new Set(norm(src).split(' ').filter(t => t.length > 1))];
47
+ let coverage = 1;
48
+ if (tokens.length) {
49
+ let hit = 0;
50
+ for (const t of tokens) if (importText.includes(t)) hit++;
51
+ coverage = hit / tokens.length;
52
+ }
53
+
54
+ // garble — replacement + control char share of the source
55
+ const garble = src.length ? Math.max(0, 1 - badChars(src) / src.length) : 1;
56
+
57
+ // density — reported only (graphics-heaviness detection is the deferred visual judge)
58
+ const pageN = Math.max(1, pages | 0);
59
+ const density = Math.min(1, src.replace(/\s+/g, '').length / (pageN * MIN_CHARS_PER_PAGE));
60
+
61
+ const score = Math.min(coverage, garble);
62
+ const reasons = [];
63
+ if (coverage < 0.9) reasons.push('low-coverage');
64
+ if (garble < 0.9) reasons.push('garbled-text');
65
+ return { score, coverage, garble, density, reasons };
66
+ }
67
+
68
+ /**
69
+ * Per-page structural fidelity for a multipage import. `perPage`: [{ sourceText, html }], where
70
+ * imported page i aligns 1:1 with source page i (the geometry import preserves page order). Returns
71
+ * { pages:[{page, score, coverage, garble, density, reasons}], overall (mean; 1 when empty), worst (the lowest page; null when empty) }.
72
+ * The per-page strip is what the browser visual judge surfaces so you jump to the bad pages.
73
+ */
74
+ export function structuralScoreByPage(perPage = []) {
75
+ const pages = perPage.map((p, i) => ({ page: i + 1, ...structuralScore({ sourceText: p.sourceText, pages: 1 }, p.html) }));
76
+ const overall = pages.length ? pages.reduce((a, p) => a + p.score, 0) / pages.length : 1;
77
+ const worst = pages.length ? pages.reduce((w, p) => (p.score < w.score ? p : w)) : null;
78
+ return { pages, overall, worst };
79
+ }
80
+
81
+ // The escalate-trigger fidelity: the WORST page when per-page data is present (so one bad page in an
82
+ // otherwise-good doc still escalates — averaging would hide it), else the whole-doc score.
83
+ function measureStructural(structuralInput, importHtml) {
84
+ if (structuralInput && Array.isArray(structuralInput.perPage) && structuralInput.perPage.length) {
85
+ const bp = structuralScoreByPage(structuralInput.perPage);
86
+ return { score: bp.worst.score, coverage: bp.worst.coverage, garble: bp.worst.garble, reasons: bp.worst.reasons, overall: bp.overall, worst: bp.worst, pages: bp.pages };
87
+ }
88
+ return structuralScore(structuralInput, importHtml);
89
+ }
90
+
91
+ /**
92
+ * Measure the geometry import; if its score is below `threshold` AND escalation is enabled AND a
93
+ * model is reachable, re-import via the injected `visionImport` and keep the higher-rung result.
94
+ * Offline-first: with no reachable model, never call the network — keep the deterministic import and
95
+ * surface a warning `note`. A failed escalation falls back to the deterministic import (loud).
96
+ *
97
+ * deps: { threshold=0.85, escalate=true, modelReachable():boolean, visionImport():Promise<importResult> }
98
+ */
99
+ export async function measureAndEscalate({ structuralInput, importResult }, deps = {}) {
100
+ const threshold = deps.threshold == null ? DEFAULT_THRESHOLD : deps.threshold;
101
+ const escalate = deps.escalate !== false; // default on
102
+ const fidelity = measureStructural(structuralInput, importResult.html);
103
+
104
+ if (fidelity.score >= threshold || !escalate) {
105
+ return { result: importResult, fidelity, escalated: false };
106
+ }
107
+
108
+ const reachable = typeof deps.modelReachable === 'function' ? deps.modelReachable() : false;
109
+ if (!reachable) {
110
+ return {
111
+ result: importResult, fidelity, escalated: false,
112
+ note: 'low import fidelity (' + fidelity.score.toFixed(2) + (fidelity.reasons.length ? ': ' + fidelity.reasons.join(', ') : '') +
113
+ ') — set RWA_OPENROUTER_KEY or use --vision/--claude for a higher-fidelity import',
114
+ };
115
+ }
116
+
117
+ try {
118
+ const vResult = await deps.visionImport();
119
+ // Escalation succeeded: keep the higher rung. We do NOT re-score with the same text-only metric
120
+ // — its blind spot (graphics/garbled glyphs) is exactly why we escalated; the model addresses it.
121
+ return {
122
+ result: vResult, escalated: true, baselineFidelity: fidelity,
123
+ note: 'import fidelity ' + fidelity.score.toFixed(2) + ' — escalated to --vision',
124
+ };
125
+ } catch (e) {
126
+ return {
127
+ result: importResult, fidelity, escalated: false,
128
+ note: 'low import fidelity (' + fidelity.score.toFixed(2) + ') — escalation to --vision failed: ' + ((e && e.message) || e),
129
+ };
130
+ }
131
+ }
package/src/import.mjs CHANGED
@@ -264,13 +264,18 @@ async function convertPdf(bytes) {
264
264
  throw e;
265
265
  }
266
266
  const pages = [];
267
+ const perPage = [];
267
268
  let totalText = 0;
269
+ let sourceText = '';
268
270
  for (let p = 1; p <= doc.numPages; p++) {
269
271
  const page = await doc.getPage(p);
270
272
  const rendered = await renderPdfPage(page, pdfjs.Util, pdfjs.OPS);
271
273
  pages.push(rendered.html);
272
274
  totalText += rendered.textCount;
275
+ sourceText += (rendered.text || '') + '\n';
276
+ perPage.push({ sourceText: rendered.text || '', html: rendered.html }); // for per-page fidelity
273
277
  }
278
+ const pageCount = doc.numPages;
274
279
  await doc.destroy().catch(() => {});
275
280
 
276
281
  if (totalText === 0) {
@@ -278,9 +283,20 @@ async function convertPdf(bytes) {
278
283
  e.exitCode = 2;
279
284
  throw e;
280
285
  }
286
+ // Print at true page size, edge to edge. Two coupled corrections, both print-only
287
+ // (screen rendering is byte-unchanged): (1) the page box is sized in PDF points
288
+ // rendered as CSS px (72→96 dpi), so it draws at 75% of physical size — zoom
289
+ // (96/72, held a hair under 1.33333 so the page never spills to a blank second
290
+ // sheet) scales it back to true size; (2) a per-document @page matching the source
291
+ // page's own size + margin:0 overrides the seed's @page{margin:18mm}, so a page that
292
+ // already carries its own margins isn't double-framed. Points-based @page size keeps
293
+ // this correct for any paper (A4, Letter, …), not just A4.
294
+ const dims = /width:([\d.]+)px;height:([\d.]+)px/.exec(pages[0] || '');
295
+ const printPageRule = dims ? `\n<style>@page{size:${dims[1]}pt ${dims[2]}pt;margin:0}</style>` : '';
281
296
  return {
282
- html: `<article class="rwa-pdf">\n${PDF_PAGE_STYLE}\n<div class="rwa-pdf-doc">\n${pages.join('\n')}\n</div>\n</article>`,
297
+ html: `<article class="rwa-pdf">\n${PDF_PAGE_STYLE}${printPageRule}\n<div class="rwa-pdf-doc">\n${pages.join('\n')}\n</div>\n</article>`,
283
298
  warnings: ['pdf: imported as a geometry-faithful reconstruction (positioned text + rules) — text stays editable but is absolutely positioned'],
299
+ fidelityInput: { sourceText, pages: pageCount, perPage }, // for the import fidelity loop (import-fidelity.mjs); perPage drives per-page/worst-page scoring
284
300
  };
285
301
  }
286
302
 
@@ -309,7 +325,7 @@ const PDF_PAGE_STYLE = `<style>
309
325
  .rwa-pdf-page{position:relative;flex:none;background:#fff;box-shadow:0 1px 5px rgba(0,0,0,.18);overflow:hidden;}
310
326
  .rwa-pdf-t{position:absolute;white-space:pre;line-height:1;color:#000;transform-origin:0 0;}
311
327
  .rwa-pdf-g{position:absolute;}
312
- @media print{.rwa-pdf{background:none}.rwa-pdf-doc{gap:0;padding:0;overflow:visible}.rwa-pdf-page{box-shadow:none}}
328
+ @media print{.rwa-pdf{background:none}.rwa-pdf-doc{gap:0;padding:0;overflow:visible}.rwa-pdf-page{box-shadow:none;zoom:1.3333}}
313
329
  </style>`;
314
330
 
315
331
  function escapePdfText(s) {
@@ -489,5 +505,7 @@ async function renderPdfPage(page, Util, OPS) {
489
505
  }
490
506
 
491
507
  const html = `<div class="rwa-pdf-page" style="width:${pdfNum(vp.width)}px;height:${pdfNum(vp.height)}px">\n${parts.join('\n')}\n</div>`;
492
- return { html, textCount };
508
+ // text: the raw pdf.js-extracted source text (for the import fidelity check — coverage + garble).
509
+ const text = tc.items.map(i => i.str || '').join(' ');
510
+ return { html, textCount, text };
493
511
  }