rewritable 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/bin/rwa.mjs +12 -2
- package/package.json +1 -1
- package/src/commands.mjs +26 -2
- package/src/import-fidelity.mjs +131 -0
- package/src/import.mjs +9 -1
package/README.md
CHANGED
|
@@ -79,6 +79,8 @@ Embeds the input file's content as the document's initial state. Supported forma
|
|
|
79
79
|
|
|
80
80
|
Output defaults to `<input-basename>.html` in the input's directory. Conversion is deterministic and offline — no API key, no network.
|
|
81
81
|
|
|
82
|
+
**Import fidelity loop (PDF).** After a PDF import, `rwa` runs an offline **structural fidelity check** (text-coverage + extraction-quality). If the score is below the target *and* a model is reachable (`RWA_OPENROUTER_KEY` or `OPENROUTER_API_KEY` set), it **auto-escalates** to `--vision` and keeps the higher-fidelity result — announced on stderr. **Offline-first is preserved:** with no key, a low-fidelity import stays offline, succeeds, and just warns (suggesting `--vision`/`--claude`). Controls: `--no-escalate` (disable the loop), `--target-fidelity <0..1>` (set the bar, default 0.85). *(The deterministic geometry import is unchanged; the loop only measures and, when authorized, escalates. The full picture-level visual judge is a browser-side follow-up.)*
|
|
83
|
+
|
|
82
84
|
### `rwa clone <url> [path]`
|
|
83
85
|
|
|
84
86
|
Clone a public webpage into a self-contained rewritable: fetch the page, extract its main article and title, and bake the content into a fresh container. First-class for **WordPress / ikangai posts** — a blog post becomes an editable, shareable single-file `.html` you can rewrite with `⌘K`.
|
package/bin/rwa.mjs
CHANGED
|
@@ -1217,6 +1217,16 @@ function detectProductKind(fileText) {
|
|
|
1217
1217
|
const vision = rest.includes('--vision');
|
|
1218
1218
|
const claude = rest.includes('--claude');
|
|
1219
1219
|
const trustInput = rest.includes('--trust-input');
|
|
1220
|
+
// Import fidelity loop (PDF): measure the deterministic import + auto-escalate to --vision when
|
|
1221
|
+
// low AND a model is reachable. --no-escalate opts out; --target-fidelity <0..1> sets the bar.
|
|
1222
|
+
const escalate = !rest.includes('--no-escalate');
|
|
1223
|
+
const tfIdx = rest.indexOf('--target-fidelity');
|
|
1224
|
+
const targetFidelity = tfIdx >= 0 ? Number(rest[tfIdx + 1]) : undefined;
|
|
1225
|
+
if (tfIdx >= 0 && (!Number.isFinite(targetFidelity) || targetFidelity < 0 || targetFidelity > 1)) {
|
|
1226
|
+
console.error('rwa import: --target-fidelity must be a number between 0 and 1');
|
|
1227
|
+
process.exitCode = 2;
|
|
1228
|
+
return;
|
|
1229
|
+
}
|
|
1220
1230
|
// --model and --timeout take a value: find the index, then take the next arg.
|
|
1221
1231
|
const modelIdx = rest.indexOf('--model');
|
|
1222
1232
|
const model = modelIdx >= 0 ? rest[modelIdx + 1] : undefined;
|
|
@@ -1249,7 +1259,7 @@ function detectProductKind(fileText) {
|
|
|
1249
1259
|
process.exitCode = 2;
|
|
1250
1260
|
return;
|
|
1251
1261
|
}
|
|
1252
|
-
const positional = rest.filter((a, i) => !a.startsWith('-') && rest[i - 1] !== '--model' && rest[i - 1] !== '--timeout' && rest[i - 1] !== '--kind' && rest[i - 1] !== '--skin');
|
|
1262
|
+
const positional = rest.filter((a, i) => !a.startsWith('-') && rest[i - 1] !== '--model' && rest[i - 1] !== '--timeout' && rest[i - 1] !== '--kind' && rest[i - 1] !== '--skin' && rest[i - 1] !== '--target-fidelity');
|
|
1253
1263
|
if (verb === 'new') {
|
|
1254
1264
|
// `rwa new --kind <starter>` selects a built-in starter. Otherwise a bare-word
|
|
1255
1265
|
// first positional is a TEMPLATE name (clone a data-rwa-template-labeled file
|
|
@@ -1268,7 +1278,7 @@ function detectProductKind(fileText) {
|
|
|
1268
1278
|
process.exitCode = 2;
|
|
1269
1279
|
return;
|
|
1270
1280
|
}
|
|
1271
|
-
await importCmd({ inputPath: positional[0], outPath: positional[1], force, open, vision, claude, trustInput, model, timeoutSec });
|
|
1281
|
+
await importCmd({ inputPath: positional[0], outPath: positional[1], force, open, vision, claude, trustInput, model, timeoutSec, escalate, targetFidelity });
|
|
1272
1282
|
} else {
|
|
1273
1283
|
console.error(`rwa: unknown verb "${verb}". Try --help.`);
|
|
1274
1284
|
process.exitCode = 2;
|
package/package.json
CHANGED
package/src/commands.mjs
CHANGED
|
@@ -228,7 +228,7 @@ export async function newCmd({ outPath, force, open, kind, templateName, skin })
|
|
|
228
228
|
|
|
229
229
|
export { KNOWN_KINDS };
|
|
230
230
|
|
|
231
|
-
export async function importCmd({ inputPath, outPath, force, open, vision, claude, trustInput, model, timeoutSec }) {
|
|
231
|
+
export async function importCmd({ inputPath, outPath, force, open, vision, claude, trustInput, model, timeoutSec, escalate, targetFidelity }) {
|
|
232
232
|
if (vision && claude) {
|
|
233
233
|
const e = new Error('--vision and --claude are mutually exclusive');
|
|
234
234
|
e.exitCode = 2;
|
|
@@ -265,7 +265,31 @@ export async function importCmd({ inputPath, outPath, force, open, vision, claud
|
|
|
265
265
|
// Buffer (not utf8 string) — docx and pdf are binary, and text formats
|
|
266
266
|
// decode internally inside convert().
|
|
267
267
|
const contents = await fs.readFile(input);
|
|
268
|
-
|
|
268
|
+
const conv = await convert(ext, contents);
|
|
269
|
+
({ html, warnings } = conv);
|
|
270
|
+
// Import fidelity loop (PDF) — measure the deterministic import; on a low structural score,
|
|
271
|
+
// auto-escalate to --vision, but ONLY when a model is reachable (offline-first: a keyless
|
|
272
|
+
// import stays offline and warns). `--no-escalate` opts out. Design: docs/plans/2026-06-30-…
|
|
273
|
+
if (ext === 'pdf' && conv.fidelityInput && escalate !== false) {
|
|
274
|
+
const { measureAndEscalate } = await import('./import-fidelity.mjs');
|
|
275
|
+
// Resolve the OpenRouter key ONCE so the reachability probe and the actual escalation target
|
|
276
|
+
// agree (both honor RWA_OPENROUTER_KEY, the project-preferred var, AND OPENROUTER_API_KEY).
|
|
277
|
+
// convertPdfViaVision otherwise only reads OPENROUTER_API_KEY, so a probe that said "reachable"
|
|
278
|
+
// on RWA_OPENROUTER_KEY alone would escalate into a guaranteed "key required" throw.
|
|
279
|
+
const orKey = process.env.RWA_OPENROUTER_KEY || process.env.OPENROUTER_API_KEY;
|
|
280
|
+
const r = await measureAndEscalate(
|
|
281
|
+
{ structuralInput: conv.fidelityInput, importResult: conv },
|
|
282
|
+
{
|
|
283
|
+
threshold: targetFidelity,
|
|
284
|
+
escalate: escalate !== false,
|
|
285
|
+
modelReachable: () => !!orKey,
|
|
286
|
+
visionImport: async () => { console.error('note: import fidelity low — escalating to --vision (openrouter)…'); return convertPdfViaVision(contents, { model, apiKey: orKey }); },
|
|
287
|
+
},
|
|
288
|
+
);
|
|
289
|
+
if (r.note) console.error('note: ' + r.note);
|
|
290
|
+
html = r.result.html;
|
|
291
|
+
warnings = r.result.warnings || warnings;
|
|
292
|
+
}
|
|
269
293
|
}
|
|
270
294
|
for (const w of warnings) console.error(`note: ${w}`);
|
|
271
295
|
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// Import fidelity loop — increment 1 (docs/plans/2026-06-30-import-fidelity-loop-design.md).
|
|
2
|
+
// An OFFLINE structural check on a PDF import + auto-escalate UP the ladder (default → --vision)
|
|
3
|
+
// when fidelity is low — gated on offline-first: escalation fires only when a model is reachable;
|
|
4
|
+
// a keyless low-fidelity import stays offline and warns, never touching the network.
|
|
5
|
+
//
|
|
6
|
+
// Increment 1 measures two false-positive-free signals:
|
|
7
|
+
// - coverage: fraction of source word-tokens present in the imported text (transform fidelity —
|
|
8
|
+
// catches dropped/mangled content; ~1 for a faithful geometry import).
|
|
9
|
+
// - garble: 1 − share of replacement (U+FFFD) / control chars in the source (extraction
|
|
10
|
+
// quality — a PDF with broken font encoding extracts to garbage; the geometry import
|
|
11
|
+
// is then unfaithful and should escalate to --vision, which reads the rendered glyphs).
|
|
12
|
+
// The graphics/visual signal ("this page is a chart/scan the text import can't reproduce") needs a
|
|
13
|
+
// renderer; it is the browser-side VISUAL JUDGE, a later increment. `density` is returned for
|
|
14
|
+
// callers/future use but deliberately NOT scored here (char-count alone false-positives on short
|
|
15
|
+
// docs) and is not yet surfaced in CLI output.
|
|
16
|
+
//
|
|
17
|
+
// Sensitivity, stated honestly: this is a FLOOR, biased to never over-escalate a good import.
|
|
18
|
+
// coverage uses substring `includes`, so reordered/duplicated text scores ~1.0; garble only counts
|
|
19
|
+
// U+FFFD + control chars, so wrong-encoding glyphs that aren't U+FFFD score ~1.0. The offline
|
|
20
|
+
// trigger therefore reliably fires only on DROPPED text or replacement-char garble — visual/glyph
|
|
21
|
+
// faithfulness is the VLM judge's job, not this metric's.
|
|
22
|
+
|
|
23
|
+
const DEFAULT_THRESHOLD = 0.85;
const MIN_CHARS_PER_PAGE = 200; // informational density floor only (not part of the score in inc. 1)

// Named entities decoded before comparing imported text with source text. Well-formed HTML has to
// escape at least `&` and `<`, so without decoding, a faithful import of "R&D" reads as dropped text.
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/** Collapse whitespace runs, trim, and lower-case; null/undefined become the empty string. */
const norm = (s) => String(s == null ? '' : s).replace(/\s+/g, ' ').trim().toLowerCase();

/**
 * Reduce HTML to its visible text. <style>/<script> elements are removed together with their
 * bodies first (the whole-document HTML may embed a stylesheet, and CSS words must not count as
 * covered source tokens); every remaining tag is replaced by a space.
 */
const stripTags = (h) => String(h == null ? '' : h)
  .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ')
  .replace(/<[^>]*>/g, ' ');

/**
 * Decode numeric (&#39; / &#x27;) and the basic named character references in one pass.
 * Unknown or out-of-range references are left untouched. Run AFTER stripTags so a decoded `<`
 * is never mistaken for markup.
 */
const decodeEntities = (s) => s.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, (match, body) => {
  if (body[0] === '#') {
    const isHex = body[1] === 'x' || body[1] === 'X';
    const code = isHex ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    return code >= 0 && code <= 0x10FFFF ? String.fromCodePoint(code) : match;
  }
  const decoded = NAMED_ENTITIES.get(body.toLowerCase());
  return decoded === undefined ? match : decoded;
});

// Count replacement (U+FFFD) + control chars (excluding tab/LF/CR) without embedding literal
// control bytes in source — a char-code scan, so the file stays clean ASCII.
function badChars(src) {
  let bad = 0;
  for (let i = 0; i < src.length; i++) {
    const c = src.charCodeAt(i);
    if (c === 0xFFFD || (c < 0x20 && c !== 9 && c !== 10 && c !== 13)) bad++;
  }
  return bad;
}

/**
 * Pure offline structural score in [0,1] = min(coverage, garble).
 *
 * @param {{ sourceText?: string, pages?: number }} [input] extracted source text and page count
 * @param {string} [importedHtml] HTML produced by the import
 * @returns {{ score: number, coverage: number, garble: number, density: number, reasons: string[] }}
 *   coverage: share of unique source tokens (length > 1) found as substrings of the imported text;
 *   garble:   1 − share of U+FFFD/control chars in the source;
 *   density:  non-whitespace chars per page relative to MIN_CHARS_PER_PAGE (reported, not scored);
 *   reasons:  'low-coverage' / 'garbled-text' when the respective component is below 0.9.
 */
export function structuralScore({ sourceText, pages } = {}, importedHtml) {
  const src = String(sourceText == null ? '' : sourceText);
  // Tags out first, then entities back to characters, so the comparison is text-vs-text.
  const importText = norm(decodeEntities(stripTags(importedHtml)));

  // coverage — unique source word-tokens (len>1) present in the imported text
  const tokens = [...new Set(norm(src).split(' ').filter((t) => t.length > 1))];
  let coverage = 1; // an empty source has nothing that could have been dropped
  if (tokens.length) {
    const hits = tokens.filter((t) => importText.includes(t)).length;
    coverage = hits / tokens.length;
  }

  // garble — replacement + control char share of the source
  const garble = src.length ? Math.max(0, 1 - badChars(src) / src.length) : 1;

  // density — reported only (graphics-heaviness detection is the deferred visual judge)
  const pageN = Math.max(1, pages | 0);
  const density = Math.min(1, src.replace(/\s+/g, '').length / (pageN * MIN_CHARS_PER_PAGE));

  const score = Math.min(coverage, garble);
  const reasons = [];
  if (coverage < 0.9) reasons.push('low-coverage');
  if (garble < 0.9) reasons.push('garbled-text');
  return { score, coverage, garble, density, reasons };
}

/**
 * Per-page structural fidelity for a multipage import. `perPage`: [{ sourceText, html }], where
 * imported page i aligns 1:1 with source page i. A missing (null/undefined) entry is scored like
 * an empty page instead of throwing.
 *
 * @param {Array<{ sourceText?: string, html?: string }>} [perPage]
 * @returns {{ pages: object[], overall: number, worst: object|null }} pages carry
 *   { page (1-based), score, coverage, garble, density, reasons }; overall is the mean score
 *   (1 when there are no pages); worst is the lowest-scoring page (first one on ties), or null.
 */
export function structuralScoreByPage(perPage = []) {
  const pages = perPage.map((p, i) => {
    const entry = p == null ? {} : p;
    return { page: i + 1, ...structuralScore({ sourceText: entry.sourceText, pages: 1 }, entry.html) };
  });
  if (!pages.length) return { pages, overall: 1, worst: null };
  const overall = pages.reduce((sum, p) => sum + p.score, 0) / pages.length;
  const worst = pages.reduce((w, p) => (p.score < w.score ? p : w), pages[0]);
  return { pages, overall, worst };
}

// The escalate-trigger fidelity: the WORST page when per-page data is present (so one bad page in an
// otherwise-good doc still escalates — averaging would hide it), else the whole-doc score.
// A null/undefined structuralInput is scored as an empty source rather than throwing.
function measureStructural(structuralInput, importHtml) {
  if (structuralInput && Array.isArray(structuralInput.perPage) && structuralInput.perPage.length) {
    const bp = structuralScoreByPage(structuralInput.perPage);
    const { score, coverage, garble, density, reasons } = bp.worst;
    return { score, coverage, garble, density, reasons, overall: bp.overall, worst: bp.worst, pages: bp.pages };
  }
  return structuralScore(structuralInput == null ? undefined : structuralInput, importHtml);
}

/**
 * Measure the geometry import; if its score is below `threshold` AND escalation is enabled AND a
 * model is reachable, re-import via the injected `visionImport` and keep the higher-rung result.
 * Offline-first: with no reachable model, never call `visionImport` — keep the deterministic import
 * and surface a warning `note`. A failed escalation (a throw, or a result without usable `html`)
 * falls back to the deterministic import and reports the failure in `note`.
 *
 * deps: { threshold=0.85, escalate=true, modelReachable():boolean|Promise<boolean>,
 *         visionImport():Promise<importResult> }
 * A threshold that is not a finite number falls back to the default. A `modelReachable` that
 * throws or rejects counts as "not reachable".
 *
 * @returns {Promise<object>} not escalated: { result: importResult, fidelity, escalated: false, note? };
 *   escalated: { result: <vision result>, escalated: true, baselineFidelity, note }.
 */
export async function measureAndEscalate({ structuralInput, importResult }, deps = {}) {
  const requested = deps.threshold == null ? DEFAULT_THRESHOLD : Number(deps.threshold);
  // NaN would make `score >= threshold` always false and force an escalation on every import.
  const threshold = Number.isFinite(requested) ? requested : DEFAULT_THRESHOLD;
  const escalate = deps.escalate !== false; // default on
  const fidelity = measureStructural(structuralInput, importResult == null ? undefined : importResult.html);

  if (fidelity.score >= threshold || !escalate) {
    return { result: importResult, fidelity, escalated: false };
  }

  const scoreText = fidelity.score.toFixed(2);

  // Await the probe so an async implementation is honored (a bare Promise is always truthy).
  let reachable = false;
  if (typeof deps.modelReachable === 'function') {
    try {
      reachable = Boolean(await deps.modelReachable());
    } catch {
      // Offline-first: a probe that fails is treated as "no model"; the note below still warns.
      reachable = false;
    }
  }
  if (!reachable) {
    const why = fidelity.reasons.length ? ': ' + fidelity.reasons.join(', ') : '';
    return {
      result: importResult, fidelity, escalated: false,
      note: 'low import fidelity (' + scoreText + why +
        ') — set RWA_OPENROUTER_KEY or use --vision/--claude for a higher-fidelity import',
    };
  }

  try {
    const vResult = await deps.visionImport();
    // Callers read `result.html` unconditionally, so a result without it is a failed escalation.
    if (!vResult || typeof vResult.html !== 'string' || vResult.html.trim() === '') {
      throw new Error('vision import returned no html');
    }
    // Escalation succeeded: keep the higher rung. We do NOT re-score with the same text-only metric
    // — its blind spot (graphics/garbled glyphs) is exactly why we escalated; the model addresses it.
    return {
      result: vResult, escalated: true, baselineFidelity: fidelity,
      note: 'import fidelity ' + scoreText + ' — escalated to --vision',
    };
  } catch (e) {
    return {
      result: importResult, fidelity, escalated: false,
      note: 'low import fidelity (' + scoreText + ') — escalation to --vision failed: ' + ((e && e.message) || e),
    };
  }
}
|
package/src/import.mjs
CHANGED
|
@@ -264,13 +264,18 @@ async function convertPdf(bytes) {
|
|
|
264
264
|
throw e;
|
|
265
265
|
}
|
|
266
266
|
const pages = [];
|
|
267
|
+
const perPage = [];
|
|
267
268
|
let totalText = 0;
|
|
269
|
+
let sourceText = '';
|
|
268
270
|
for (let p = 1; p <= doc.numPages; p++) {
|
|
269
271
|
const page = await doc.getPage(p);
|
|
270
272
|
const rendered = await renderPdfPage(page, pdfjs.Util, pdfjs.OPS);
|
|
271
273
|
pages.push(rendered.html);
|
|
272
274
|
totalText += rendered.textCount;
|
|
275
|
+
sourceText += (rendered.text || '') + '\n';
|
|
276
|
+
perPage.push({ sourceText: rendered.text || '', html: rendered.html }); // for per-page fidelity
|
|
273
277
|
}
|
|
278
|
+
const pageCount = doc.numPages;
|
|
274
279
|
await doc.destroy().catch(() => {});
|
|
275
280
|
|
|
276
281
|
if (totalText === 0) {
|
|
@@ -281,6 +286,7 @@ async function convertPdf(bytes) {
|
|
|
281
286
|
return {
|
|
282
287
|
html: `<article class="rwa-pdf">\n${PDF_PAGE_STYLE}\n<div class="rwa-pdf-doc">\n${pages.join('\n')}\n</div>\n</article>`,
|
|
283
288
|
warnings: ['pdf: imported as a geometry-faithful reconstruction (positioned text + rules) — text stays editable but is absolutely positioned'],
|
|
289
|
+
fidelityInput: { sourceText, pages: pageCount, perPage }, // for the import fidelity loop (import-fidelity.mjs); perPage drives per-page/worst-page scoring
|
|
284
290
|
};
|
|
285
291
|
}
|
|
286
292
|
|
|
@@ -489,5 +495,7 @@ async function renderPdfPage(page, Util, OPS) {
|
|
|
489
495
|
}
|
|
490
496
|
|
|
491
497
|
const html = `<div class="rwa-pdf-page" style="width:${pdfNum(vp.width)}px;height:${pdfNum(vp.height)}px">\n${parts.join('\n')}\n</div>`;
|
|
492
|
-
|
|
498
|
+
// text: the raw pdf.js-extracted source text (for the import fidelity check — coverage + garble).
|
|
499
|
+
const text = tc.items.map(i => i.str || '').join(' ');
|
|
500
|
+
return { html, textCount, text };
|
|
493
501
|
}
|