rewritable 0.1.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +263 -5
- package/bin/rwa.mjs +1033 -6
- package/package.json +7 -4
- package/seeds/rewritable.html +6989 -156
- package/src/agent-loop.mjs +155 -0
- package/src/apply-edits.mjs +664 -0
- package/src/atomic-write.mjs +38 -0
- package/src/backend.mjs +43 -0
- package/src/clone-extract.mjs +249 -0
- package/src/clone.mjs +161 -0
- package/src/commands.mjs +207 -11
- package/src/create.mjs +256 -0
- package/src/doc.mjs +69 -0
- package/src/dsl-compiler.mjs +357 -0
- package/src/edit.mjs +300 -0
- package/src/fetch-page.mjs +346 -0
- package/src/host.mjs +126 -0
- package/src/identity.mjs +257 -0
- package/src/import-claude.mjs +360 -0
- package/src/import-vision.mjs +156 -0
- package/src/import.mjs +357 -8
- package/src/ls.mjs +105 -0
- package/src/publish-site.mjs +85 -0
- package/src/publish.mjs +98 -0
- package/src/seed-extract.mjs +40 -0
- package/src/seed.mjs +1399 -6
- package/src/self-contained.mjs +115 -0
- package/src/skill-manifest.mjs +227 -0
- package/src/skin.mjs +350 -0
- package/src/skins.mjs +274 -0
- package/src/template.mjs +109 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
// Crash-safe atomic file write for `rwa edit`. Write the new bytes to a sibling
|
|
2
|
+
// temp file, fsync (datasync) its contents, then rename(2) over the target —
|
|
3
|
+
// atomic on the same filesystem, so a reader sees either the old file or the new
|
|
4
|
+
// one, never a half-written stream. (datasync, not sync: a power loss between
|
|
5
|
+
// rename and the kernel flushing dirty pages could otherwise land a renamed file
|
|
6
|
+
// with stale/zero bytes; we don't depend on the temp's metadata being durable.)
|
|
7
|
+
//
|
|
8
|
+
// On ANY failure the temp file is removed. Previously a writeFile/datasync
|
|
9
|
+
// failure left a `.rwa-tmp-<pid>` behind — only the rename-failure path cleaned
|
|
10
|
+
// up. `deps` is injectable so the cleanup-on-failure is unit-testable without an
|
|
11
|
+
// actual disk failure.
|
|
12
|
+
|
|
13
|
+
import { open, rename, unlink } from 'node:fs/promises';
|
|
14
|
+
|
|
15
|
+
/**
 * Atomically replace `filePath` with `content`.
 *
 * The text is written to a sibling temp file (`<filePath>.rwa-tmp-<pid>`),
 * datasync'ed, closed, and then renamed over the destination. If any step
 * throws, a still-open handle is closed, the temp file is unlinked (both best
 * effort) and the ORIGINAL error is rethrown.
 *
 * @param {string} filePath — destination path (overwritten atomically)
 * @param {string} content — text to write, encoded as UTF-8
 * @param {{open:Function, rename:Function, unlink:Function}} [deps] — injectable fs ops
 * @returns {Promise<void>}
 */
export async function atomicWrite(filePath, content, deps = { open, rename, unlink }) {
  const tempPath = `${filePath}.rwa-tmp-${process.pid}`;
  let openHandle = null;

  // Stage the bytes in the temp file and make them durable before the rename.
  const stageTemp = async () => {
    openHandle = await deps.open(tempPath, 'w');
    await openHandle.writeFile(content, 'utf8');
    await openHandle.datasync();
    await openHandle.close();
    openHandle = null;
  };

  // Best-effort cleanup: cleanup errors are swallowed so the caller sees the
  // failure that actually caused the write to abort.
  const discardTemp = async () => {
    if (openHandle) {
      try {
        await openHandle.close();
      } catch {
        /* already failing */
      }
    }
    await deps.unlink(tempPath).catch(() => {});
  };

  try {
    await stageTemp();
    await deps.rename(tempPath, filePath);
  } catch (failure) {
    await discardTemp();
    throw failure;
  }
}
|
package/src/backend.mjs
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// Backend auth resolution for `rwa edit`. Extracted from bin/rwa.mjs so the
|
|
2
|
+
// precedence is unit-testable (the bin entrypoint runs on import and can't be
|
|
3
|
+
// imported cleanly).
|
|
4
|
+
//
|
|
5
|
+
// Only the openrouter backend needs a key — ollama and lmstudio run locally
|
|
6
|
+
// without auth. The key resolves in order: an explicit --api-key flag, then the
|
|
7
|
+
// project-specific RWA_OPENROUTER_KEY (env conventions match the docker-compose
|
|
8
|
+
// deploy in service/), then the CONVENTIONAL OPENROUTER_API_KEY that agents and
|
|
9
|
+
// users normally have exported. Empty strings count as absent.
|
|
10
|
+
|
|
11
|
+
/**
 * Pick the API key to use for a backend.
 *
 * Precedence: a non-empty --api-key flag (for ANY backend), then — for
 * openrouter only — RWA_OPENROUTER_KEY, then OPENROUTER_API_KEY. Empty strings
 * are treated as unset. Every other backend name resolves to undefined.
 *
 * @param {string} backendName — 'openrouter' | 'ollama' | 'lmstudio'
 * @param {string|undefined} flagValue — the --api-key flag value, if any
 * @param {Record<string,string|undefined>} [env] — environment (injectable for tests)
 * @returns {string|undefined} the key, or undefined when none applies
 */
export function resolveApiKey(backendName, flagValue, env = process.env) {
  // An explicit flag always wins, whatever the backend.
  if (flagValue) {
    return flagValue;
  }
  // Only openrouter reads a key from the environment.
  if (backendName !== 'openrouter') {
    return undefined;
  }
  const envCandidates = [env.RWA_OPENROUTER_KEY, env.OPENROUTER_API_KEY];
  const firstSet = envCandidates.find((value) => Boolean(value));
  return firstSet || undefined;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Default OpenAI-compatible base URL for a backend.
 *
 * openrouter always maps to its fixed public endpoint. ollama and lmstudio
 * default to localhost but honor a non-empty RWA_OLLAMA_URL / RWA_LMSTUDIO_URL
 * override. Any other name yields undefined.
 *
 * @param {string} name — 'openrouter' | 'ollama' | 'lmstudio'
 * @param {Record<string,string|undefined>} [env] — environment (injectable for tests)
 * @returns {string|undefined}
 */
export function envBaseUrl(name, env = process.env) {
  if (name === 'openrouter') {
    return 'https://openrouter.ai/api/v1';
  }
  if (name === 'ollama') {
    return env.RWA_OLLAMA_URL || 'http://localhost:11434/v1';
  }
  if (name === 'lmstudio') {
    return env.RWA_LMSTUDIO_URL || 'http://localhost:1234/v1';
  }
  return undefined;
}
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
// clone-extract: locate the main article + title in a fetched webpage's HTML.
|
|
2
|
+
//
|
|
3
|
+
// This is the EXTRACTOR stage of `rwa clone <url>`: fetch (elsewhere) →
|
|
4
|
+
// extract (here) → sanitize (elsewhere) → bootstrap (elsewhere). It only
|
|
5
|
+
// LOCATES content; it does NOT strip scripts/attributes — a later sanitize
|
|
6
|
+
// task owns that. Pure and dependency-free (built-in JS only) so it can be
|
|
7
|
+
// mirrored to the browser /import path the way the rest of the CLI is.
|
|
8
|
+
|
|
9
|
+
// Decode the entities that show up in titles. Titles come from og:title /
// <title> / <h1>, which are entity-encoded in source HTML. WordPress wptexturize
// emits numeric smart-punctuation (&#8217; etc.), so numeric references (decimal
// and hex) are decoded generically, plus a few common named ones.
//
// Everything is decoded in ONE left-to-right pass, so the output of one
// replacement is never re-scanned: a double-encoded "&amp;#8217;" becomes the
// literal text "&#8217;" rather than being unescaped into a live entity.
// Unknown names and numeric references beyond U+10FFFF are left untouched
// (String.fromCodePoint would throw a RangeError on the latter).
const NAMED_ENTITIES = new Map([
  ['nbsp', ' '],
  ['hellip', '…'],
  ['mdash', '—'],
  ['ndash', '–'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['amp', '&'],
]);

function decodeEntities(s) {
  return s.replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g,
    (whole, hex, dec, name) => {
      if (name !== undefined) {
        return NAMED_ENTITIES.get(name) ?? whole;
      }
      const codePoint = hex !== undefined
        ? Number.parseInt(hex, 16)
        : Number.parseInt(dec, 10);
      // Out-of-range references stay as written instead of crashing the clone.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
}
|
|
27
|
+
|
|
28
|
+
// Remove every "<...>" run from `s`, keeping only the text between tags.
function stripTags(s) {
  const tagPattern = /<[^>]*>/g;
  return s.split(tagPattern).join('');
}
|
|
31
|
+
|
|
32
|
+
// Title precedence: og:title → <title> (minus a " | Site" / " – Site" /
// " — Site" suffix) → first <h1> → 'Untitled'. og:title is the cleanest
// signal on the WordPress/OpenGraph pages we target; the <title> suffix
// strip removes the site-name tail that most CMSes append. Each source is
// only used when it yields non-empty text, so an empty og:title falls through
// to <title> instead of producing a blank title.
function extractTitle(html) {
  const og = ogTitleContent(html);
  if (og != null) {
    const fromOg = decodeEntities(og.trim());
    if (fromOg) return fromOg;
  }

  const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (title) {
    const raw = decodeEntities(stripTags(title[1]).trim());
    // Drop a trailing " | Site" / " – Site" / " — Site" separator + tail.
    const cut = raw.replace(/\s*[|–—]\s*[^|–—]*$/, '').trim();
    if (cut) return cut;
    if (raw) return raw;
  }

  const h1 = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (h1) {
    const t = decodeEntities(stripTags(h1[1]).trim());
    if (t) return t;
  }

  return 'Untitled';
}

// Raw (still entity-encoded) content of the og:title <meta>, or null if absent.
// The value is captured up to ITS OWN closing quote, so content="It's here"
// is not cut short at the apostrophe. Both attribute orders are supported;
// property-before-content is tried first.
function ogTitleContent(html) {
  const propertyFirst = html.match(
    /<meta\b[^>]*\bproperty\s*=\s*(["'])og:title\1[^>]*\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
  );
  if (propertyFirst) return propertyFirst[2] ?? propertyFirst[3];

  const contentFirst = html.match(
    /<meta\b[^>]*\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*\bproperty\s*=\s*(["'])og:title\3/i,
  );
  if (contentFirst) return contentFirst[1] ?? contentFirst[2];

  return null;
}
|
|
61
|
+
|
|
62
|
+
// Find the first opening tag whose class attribute contains the class token
// `cls`. Returns { index, tag, openEnd } — index of the '<', the lower-cased
// tag name, and the index just past the tag's closing '>' — or null when no
// element carries that class.
function findClassOpen(html, cls) {
  // `cls` is matched literally: escape regex metacharacters so a token such as
  // "a.b" cannot turn into a wildcard pattern.
  const token = cls.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Bound the class token on the class-list separators (start/space/quote)
  // rather than \b, so "entry-content" does NOT match inside
  // "entry-content-wrapper" or "entry-content-meta" — \b sees a boundary at
  // the hyphen and would false-match the prefix. The attribute name itself
  // must follow whitespace so "data-class=" / "ng-class=" are not mistaken
  // for "class=".
  const re = new RegExp(
    `<([a-z][a-z0-9]*)\\b[^>]*\\sclass\\s*=\\s*(["'])(?:[^"']*[\\s])?${token}(?=[\\s]|\\2)[^"']*\\2[^>]*>`,
    'i',
  );
  const m = re.exec(html);
  if (!m) return null;
  // Compute the opening-tag end quote-aware: the regex's trailing [^>]*> stops
  // at the first '>', which may be inside a later attribute value (e.g.
  // title="a>b"). tagEnd skips quoted regions so openEnd lands on the real '>'.
  const gt = tagEnd(html, m.index);
  return {
    index: m.index,
    tag: m[1].toLowerCase(),
    openEnd: gt === -1 ? m.index + m[0].length : gt + 1,
  };
}
|
|
81
|
+
|
|
82
|
+
// Find the index of the '>' that ends a tag starting at/after `from`, skipping
|
|
83
|
+
// over quoted attribute regions so a '>' inside an attribute value (e.g.
|
|
84
|
+
// <div title="a>b">) is not mistaken for the tag end. Returns -1 if none.
|
|
85
|
+
function tagEnd(html, from) {
|
|
86
|
+
let i = from, q = null;
|
|
87
|
+
while (i < html.length) {
|
|
88
|
+
const ch = html[i];
|
|
89
|
+
if (q) { if (ch === q) q = null; }
|
|
90
|
+
else if (ch === '"' || ch === "'") q = ch;
|
|
91
|
+
else if (ch === '>') return i;
|
|
92
|
+
i++;
|
|
93
|
+
}
|
|
94
|
+
return -1;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// THE CRUX — balanced extraction, parser-free.
//
// A naive non-greedy /(<div>)(.*?)(<\/div>)/ truncates at the FIRST nested
// </div>, which on a real WordPress entry-content (nested wrapper/widget
// divs) cuts the body off after the first paragraph. So instead we walk
// forward from the opening tag, tracking the open/close depth of `tagName`,
// and stop at the MATCHING close.
//
// The walk is driven by ONE regex that finds the next "event" in document
// order: an HTML comment, a same-name close, a same-name open, or a raw-text
// element (<script>/<style>). Comments and raw-text elements are skipped
// whole wherever they occur, so a commented-out "</div>" or a "</div>" inside
// a script string never perturbs the depth count. Same-name tags written in
// the self-closing form (<tag .../>) do not add depth.
//
// @param html       full document text
// @param tagName    element name to balance (interpolated into a RegExp, so
//                   it must be a plain tag name)
// @param startIndex index of the '<' of the opening tag
// @param openEnd    optional index just past the opening tag's '>' (as
//                   returned by findClassOpen); when omitted the tag end is
//                   located with the quote-aware tagEnd scan
//
// Returns the INNER HTML of the container (between the opening tag and its
// matching close), or null if no balanced close is found (including when an
// unterminated comment, raw-text element or tag is hit first).
function balancedInner(html, tagName, startIndex, openEnd) {
  const openTagEnd = openEnd != null ? openEnd - 1 : tagEnd(html, startIndex);
  if (openTagEnd === -1) return null;
  const innerStart = openTagEnd + 1;

  // Alternation order matters when two alternatives can match at the same
  // index (tagName === 'script'): the same-name open wins over raw-text.
  const eventRe = new RegExp(
    `<!--|(</${tagName}\\s*>)|(<${tagName}\\b)|<(script|style)\\b`,
    'gi',
  );
  eventRe.lastIndex = innerStart;

  let depth = 1;
  let ev;
  while ((ev = eventRe.exec(html)) !== null) {
    const [, closeTag, openTag, rawTextTag] = ev;

    if (closeTag !== undefined) {
      depth -= 1;
      if (depth === 0) return html.slice(innerStart, ev.index);
    } else if (openTag !== undefined) {
      // Nested open of the same tag. Quote-aware tag end so a '>' inside an
      // attribute value doesn't end the tag early.
      const gt = tagEnd(html, ev.index);
      if (gt === -1) return null; // tag never closes — nothing can balance
      if (html[gt - 1] !== '/') depth += 1;
      eventRe.lastIndex = gt + 1;
    } else if (rawTextTag !== undefined) {
      // Raw-text elements don't nest: jump past the first matching close.
      const rawClose = new RegExp(`</${rawTextTag}\\s*>`, 'gi');
      rawClose.lastIndex = eventRe.lastIndex;
      const end = rawClose.exec(html);
      if (end === null) return null;
      eventRe.lastIndex = end.index + end[0].length;
    } else {
      // HTML comment: resume scanning after its terminator.
      const end = html.indexOf('-->', ev.index + 4);
      if (end === -1) return null;
      eventRe.lastIndex = end + 3;
    }
  }
  return null;
}
|
|
170
|
+
|
|
171
|
+
// Generic density fallback: from <body>, scan top-level block elements and
|
|
172
|
+
// pick the subtree with the highest text-length ÷ tag-count ratio. Chrome
|
|
173
|
+
// (nav/header/footer/aside + class-matched menus/sidebars/comments/etc.) is
|
|
174
|
+
// excluded so the dense content block wins even when the page has no
|
|
175
|
+
// recognised content container. This is a heuristic, not a parser — good
|
|
176
|
+
// enough to find "the main blob of prose" on an unknown layout.
|
|
177
|
+
// Tag names that are page chrome by definition.
const CHROME_TAGS = /^(nav|header|footer|aside)$/i;
// Class tokens that mark chrome. Matched on word boundaries (a hyphen counts
// as a boundary, so "main-nav" and "comments-area" hit) and with an optional
// plural "s", so "comments" / "sidebars" / "menus" are excluded as well.
const CHROME_CLASS = /\b(nav|menu|sidebar|footer|header|comment|share|related)s?\b/i;

// True when an element should be excluded from the density comparison:
// either its tag is a chrome tag, or its class attribute (read from the raw
// attribute text `attrs` of the opening tag) contains a chrome class token.
function isChrome(tag, attrs) {
  if (CHROME_TAGS.test(tag)) return true;
  const cls = attrs.match(/\bclass\s*=\s*["']([^"']*)["']/i);
  if (cls && CHROME_CLASS.test(cls[1])) return true;
  return false;
}
|
|
186
|
+
|
|
187
|
+
// Text density of an HTML fragment: length of its whitespace-collapsed,
// tag-stripped text divided by the number of opening tags it contains
// (counted as 1 when there are none, so the ratio is always defined).
function density(inner) {
  const visibleText = inner
    .replace(/<[^>]*>/g, '')
    .replace(/\s+/g, ' ')
    .trim();
  const openingTags = inner.match(/<[a-z][^>]*>/gi);
  const tagCount = openingTags === null ? 1 : openingTags.length;
  return visibleText.length / tagCount;
}
|
|
192
|
+
|
|
193
|
+
// Density fallback: choose the densest non-chrome element among the
// top-level elements of <body> (or of the whole input when there is no
// <body>). Returns that element's inner HTML, or the whole scope when no
// candidate qualifies.
function genericFallback(html) {
  // Restrict to <body> when present. Both tags are located with
  // case-insensitive regexes on the ORIGINAL string: lowercasing a copy can
  // change its length (e.g. "İ"), which would misalign the returned index.
  let scope = html;
  const bodyOpen = /<body\b[^>]*>/i.exec(html);
  if (bodyOpen) {
    const rest = html.slice(bodyOpen.index + bodyOpen[0].length);
    const bodyClose = rest.search(/<\/body\s*>/i);
    scope = bodyClose === -1 ? rest : rest.slice(0, bodyClose);
  }

  // Elements whose content is never prose. Without this an inline <script>
  // (long text, zero tags) would outscore every real content block.
  const nonContentTag = /^(script|style|noscript|template)$/;

  // Walk top-level elements in scope. After each balanced element the scan
  // resumes past its close, so only siblings are compared, not nested
  // descendants — that keeps the density comparison apples-to-apples.
  const blockRe = /<([a-z][a-z0-9]*)\b([^>]*)>/gi;
  let best = null;
  let cursor = 0;
  let m;
  while ((m = blockRe.exec(scope)) !== null) {
    if (m.index < cursor) continue; // inside a subtree we already consumed
    const tag = m[1].toLowerCase();
    const attrs = m[2];
    const inner = balancedInner(scope, tag, m.index);
    // Void or unbalanced element: skip it, but still consider what follows.
    if (inner == null) continue;

    // The matching close starts right after the inner HTML. Measure the
    // opening tag with the same quote-aware scan balancedInner used, so a
    // '>' inside an attribute value cannot shift the offset.
    const openGt = tagEnd(scope, m.index);
    const closeRe = new RegExp(`</${tag}\\s*>`, 'gi');
    closeRe.lastIndex = openGt + 1 + inner.length;
    const cm = closeRe.exec(scope);
    cursor = cm ? cm.index + cm[0].length : scope.length;
    blockRe.lastIndex = cursor; // jump straight over the descendants

    if (nonContentTag.test(tag) || isChrome(tag, attrs)) continue;
    const score = density(inner);
    if (!best || score > best.score) best = { score, inner };
  }
  return best ? best.inner : scope;
}
|
|
228
|
+
|
|
229
|
+
/**
 * Locate the main article body and title in a page's HTML.
 *
 * Strategies are tried in order and the first that yields a balanced
 * container wins:
 *   1. the element whose class list contains "entry-content" (WordPress);
 *   2. the first <article> element;
 *   3. the density heuristic over the <body>'s top-level elements.
 *
 * @param {string} html — full page source
 * @returns {{title: string, html: string}} extracted title and inner HTML
 */
export function extractArticle(html) {
  const title = extractTitle(html);

  // Each strategy returns the container's inner HTML, or null to pass.
  const strategies = [
    () => {
      const entry = findClassOpen(html, 'entry-content');
      return entry ? balancedInner(html, entry.tag, entry.index, entry.openEnd) : null;
    },
    () => {
      const articleAt = html.search(/<article\b[^>]*>/i);
      return articleAt === -1 ? null : balancedInner(html, 'article', articleAt);
    },
  ];

  for (const attempt of strategies) {
    const inner = attempt();
    if (inner != null) return { title, html: inner };
  }

  return { title, html: genericFallback(html) };
}
|
package/src/clone.mjs
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
// `rwa clone <url>` bootstrap wiring. Turns a fetched web page into a saved
|
|
2
|
+
// rewritable by reusing the EXACT import pipeline: extract the article body,
|
|
3
|
+
// run it through the same sanitiser the import path uses, then drop it into a
|
|
4
|
+
// fresh seed via applySeedSubs + replaceInlineDoc.
|
|
5
|
+
//
|
|
6
|
+
// Why prepend an <h1>: inspectDoc derives the document title from the body's
|
|
7
|
+
// first <h1>, not the <title> tag. A WordPress post's <h1> lives outside
|
|
8
|
+
// .entry-content, so the extracted content carries no heading — without this
|
|
9
|
+
// the cloned doc would render as "Untitled". The extracted page title is the
|
|
10
|
+
// honest source, so we prepend it.
|
|
11
|
+
|
|
12
|
+
import { access } from 'node:fs/promises';
|
|
13
|
+
import { basename } from 'node:path';
|
|
14
|
+
|
|
15
|
+
import { extractArticle } from './clone-extract.mjs';
|
|
16
|
+
import { sanitizeImportedHtml } from './import.mjs';
|
|
17
|
+
import { loadSeed, applySeedSubs, replaceInlineDoc } from './seed.mjs';
|
|
18
|
+
import { SEED_CANDIDATES } from './commands.mjs';
|
|
19
|
+
import { atomicWrite } from './atomic-write.mjs';
|
|
20
|
+
import { fetchPage, fetchImageDataUri, CloneError } from './fetch-page.mjs';
|
|
21
|
+
|
|
22
|
+
// Escape text for safe insertion into HTML element content or a
// double-quoted attribute value. The input is coerced with String(), and
// & < > " are replaced by their entity forms in a single pass (so an
// already-produced "&amp;" is never re-escaped within the same call).
function escapeHtml(s) {
  const entityFor = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;' };
  return String(s).replace(/[&<>"]/g, (ch) => entityFor[ch]);
}
|
|
29
|
+
|
|
30
|
+
// `rwa clone --localize-images`: make a clone self-contained by inlining
// each remote <img src> as a data: URI, so the saved file needs no network.
// Every image goes through fetchImageDataUri. GRACEFUL by design: any fetch
// error or an exhausted total budget leaves that <img> at its remote URL and
// records a warning — one bad image never fails the clone.
// Caps: per-image 2 MB, total 8 MB. A src that is absolute http(s), or
// starts with "/", "./" or "../", is resolved against the page URL; other
// forms (bare relative paths, data: URIs) are left untouched.
const LOCALIZE_PER_IMAGE = 2 * 1024 * 1024;
const LOCALIZE_TOTAL = 8 * 1024 * 1024;
// The attribute name must follow whitespace so "data-src=" (lazy-loading
// markup) is never mistaken for "src=".
const IMG_SRC_RE = /(<img\b[^>]*?\ssrc\s*=\s*)(["'])(https?:\/\/[^"']+|\/[^"']*|\.\.?\/[^"']*)\2/gi;

// Attribute values are entity-encoded in HTML source: a query separator is
// written "&amp;" (WordPress emits "&#038;"). Decode those back to "&" so
// the URL that gets fetched is the one a browser would request.
function decodeSrcAmpersands(src) {
  return src.replace(/&(?:amp|#0*38|#[xX]0*26);/g, '&');
}

/**
 * Inline the remote images of an HTML fragment as data: URIs.
 *
 * @param {string} html — fragment to rewrite
 * @param {string} sourceUrl — page URL, used as the base for relative srcs
 * @param {{deps?: object, perImage?: number, totalCap?: number}} [opts] —
 *   `deps` is passed through to fetchImageDataUri; `perImage` / `totalCap`
 *   override the byte caps (falsy values fall back to the defaults)
 * @returns {Promise<{html: string, inlined: number, warnings: string[]}>}
 *   rewritten fragment, number of distinct images inlined, and one warning
 *   per distinct src that was left remote
 */
export async function localizeImages(html, sourceUrl, opts = {}) {
  const deps = opts.deps || {};
  const perImage = opts.perImage || LOCALIZE_PER_IMAGE;
  const totalCap = opts.totalCap || LOCALIZE_TOTAL;
  const warnings = [];
  let inlined = 0;
  let totalBytes = 0;

  // Collect the srcs first (async work can't run inside .replace). The regex
  // is a shared /g instance, so reset its cursor before scanning.
  const sources = [];
  IMG_SRC_RE.lastIndex = 0;
  let m;
  while ((m = IMG_SRC_RE.exec(html)) !== null) {
    sources.push(m[3]);
  }

  // Resolve + fetch each unique src once (a page often repeats an image).
  const resolved = new Map(); // src as written -> data URI (or null = leave remote)
  for (const src of sources) {
    if (resolved.has(src)) continue;
    let abs;
    try {
      abs = new URL(decodeSrcAmpersands(src), sourceUrl).href;
    } catch {
      resolved.set(src, null);
      warnings.push(`skipped ${src} (unresolvable URL)`);
      continue;
    }
    if (totalBytes >= totalCap) {
      resolved.set(src, null);
      warnings.push(`skipped ${src} (container image budget reached)`);
      continue;
    }
    try {
      const remaining = Math.min(perImage, totalCap - totalBytes);
      const dataUri = await fetchImageDataUri(abs, { maxBytes: remaining, deps });
      // base64 is ~4/3 of the raw bytes; count the raw size toward the budget.
      totalBytes += Math.floor((dataUri.length - dataUri.indexOf(',') - 1) * 3 / 4);
      resolved.set(src, dataUri);
      inlined++;
    } catch (err) {
      resolved.set(src, null);
      warnings.push(`skipped ${src} (${err && err.subcode ? err.subcode : (err && err.message) || 'fetch failed'})`);
    }
  }

  const out = html.replace(IMG_SRC_RE, (whole, pre, quote, src) => {
    const dataUri = resolved.get(src);
    return dataUri ? pre + quote + dataUri + quote : whole;
  });
  return { html: out, inlined, warnings };
}
|
|
85
|
+
|
|
86
|
+
/**
 * Build a rewritable document from already-fetched page HTML and write it to
 * `outPath`.
 *
 * Pipeline: extract the article → sanitize it → optionally inline images →
 * wrap it in <article> with a prepended <h1> and a provenance footer →
 * substitute into a fresh seed → atomic write.
 *
 * @param {string} html — fetched page source
 * @param {string} outPath — destination file
 * @param {string} sourceUrl — URL the page came from (shown in the footer)
 * @param {{localizeImages?: boolean, deps?: object}} [opts]
 * @returns {Promise<{outPath: string, title: string}>}
 */
export async function cloneFromHtml(html, outPath, sourceUrl, opts = {}) {
  const article = extractArticle(html);
  const { title } = article;

  // sanitizeImportedHtml returns { html, warnings }; only the cleaned body is used.
  let cleanBody = sanitizeImportedHtml(article.html).html;

  // --localize-images runs AFTER the sanitizer and swaps the surviving
  // remote image URLs for data URIs.
  if (opts.localizeImages) {
    const localized = await localizeImages(cleanBody, sourceUrl, { deps: opts.deps });
    cleanBody = localized.html;
    if (localized.inlined) {
      console.error(`note: inlined ${localized.inlined} image(s)`);
    }
    localized.warnings.forEach((warning) => {
      console.error('note: ' + warning);
    });
  }

  // Defence-in-depth for direct callers: only http/https URLs become a live
  // <a href>; any other scheme (javascript:, data:, file:, …) is rendered as
  // escaped text with no link.
  const shownUrl = escapeHtml(sourceUrl);
  const isWebUrl = /^https?:\/\//i.test(String(sourceUrl));
  const provenance = isWebUrl ? `<a href="${shownUrl}">${shownUrl}</a>` : shownUrl;

  const body = [
    '<article>',
    `<h1>${escapeHtml(title)}</h1>`,
    cleanBody,
    `<footer><p><small>Cloned from ${provenance}</small></p></footer>`,
    '</article>',
  ].join('\n');

  // Seed-level substitutions run on the pristine seed FIRST, then the body is
  // injected — so a placeholder can't false-match inside imported content.
  const seed = await loadSeed(SEED_CANDIDATES);
  const seedWithMeta = applySeedSubs(seed, {
    uuid: crypto.randomUUID(),
    title,
    fileMeta: basename(outPath),
  });
  await atomicWrite(outPath, replaceInlineDoc(seedWithMeta, body));
  return { outPath, title };
}
|
|
127
|
+
|
|
128
|
+
// Build a filename slug from the last non-empty path segment of `url`:
// drop a trailing ".ext", turn every run of characters outside [a-z0-9-]
// (case-insensitive) into a single "-", and trim leading/trailing dashes.
// Falls back to 'clone' when the URL doesn't parse or nothing usable remains.
function slugFromUrl(url) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    return 'clone';
  }

  const lastSegment = pathname.split('/').filter(Boolean).pop() ?? '';
  const withoutExtension = lastSegment.replace(/\.[a-z0-9]+$/i, '');
  const dashed = withoutExtension.replace(/[^a-z0-9-]+/gi, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed === '' ? 'clone' : trimmed;
}
|
|
141
|
+
|
|
142
|
+
/**
 * `rwa clone <url>` entry point: fetch the page, choose the output path
 * (explicit `outPath`, else `./<slug>.html` derived from the URL), refuse to
 * overwrite an existing file unless `force` is set, then build and write the
 * clone and print the path written.
 *
 * @param {{url: string, outPath?: string, force?: boolean, localizeImages?: boolean}} args
 * @returns {Promise<void>}
 * @throws {CloneError} 'exists' when the destination is present and !force
 */
export async function cloneCmd({ url, outPath, force, localizeImages: localize }) {
  const html = await fetchPage(url);
  const resolvedOut = outPath || `./${slugFromUrl(url)}.html`;

  if (!force) {
    // access() resolving means the path is reachable; any rejection is
    // treated as "does not exist".
    const alreadyThere = await access(resolvedOut).then(
      () => true,
      () => false,
    );
    if (alreadyThere) {
      throw new CloneError(2, 'exists', {
        path: resolvedOut,
        message: `destination exists: ${resolvedOut} (use --force to overwrite)`,
      });
    }
  }

  await cloneFromHtml(html, resolvedOut, url, { localizeImages: localize });
  console.log('wrote ' + resolvedOut);
}
|