rewritable 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ // Crash-safe atomic file write for `rwa edit`. Write the new bytes to a sibling
2
+ // temp file, fsync (datasync) its contents, then rename(2) over the target —
3
+ // atomic on the same filesystem, so a reader sees either the old file or the new
4
+ // one, never a half-written stream. (datasync, not sync: a power loss between
5
+ // rename and the kernel flushing dirty pages could otherwise land a renamed file
6
+ // with stale/zero bytes; we don't depend on the temp's metadata being durable.)
7
+ //
8
+ // On ANY failure the temp file is removed. Previously a writeFile/datasync
9
+ // failure left a `.rwa-tmp-<pid>` behind — only the rename-failure path cleaned
10
+ // up. `deps` is injectable so the cleanup-on-failure is unit-testable without an
11
+ // actual disk failure.
12
+
13
+ import { open, rename, unlink } from 'node:fs/promises';
14
+
15
/**
 * Atomically replace `filePath` with `content`.
 *
 * The bytes are staged in a sibling `<filePath>.rwa-tmp-<pid>` file, flushed
 * with datasync, the handle is closed, and the staging file is then renamed
 * over the destination. If any step throws, a still-open handle is closed and
 * the staging file is unlinked (both best-effort), after which the ORIGINAL
 * error is rethrown.
 *
 * @param {string} filePath — destination path (overwritten atomically)
 * @param {string} content — text to write, encoded as UTF-8
 * @param {{open:Function, rename:Function, unlink:Function}} [deps] — injectable fs ops
 * @returns {Promise<void>}
 */
export async function atomicWrite(filePath, content, deps = { open, rename, unlink }) {
  const stagingPath = `${filePath}.rwa-tmp-${process.pid}`;
  // Non-null only while a handle is open and still needs closing.
  let liveHandle = null;
  try {
    liveHandle = await deps.open(stagingPath, 'w');
    await liveHandle.writeFile(content, 'utf8');
    await liveHandle.datasync();
    await liveHandle.close();
    // Cleared only after a successful close so the failure path never
    // double-closes a handle that is already gone.
    liveHandle = null;
    await deps.rename(stagingPath, filePath);
  } catch (failure) {
    // Cleanup problems are deliberately ignored: the caller needs the error
    // that caused the write to fail, not a secondary close/unlink error.
    if (liveHandle) {
      try {
        await liveHandle.close();
      } catch {
        /* already failing */
      }
    }
    await deps.unlink(stagingPath).catch(() => {});
    throw failure;
  }
}
@@ -0,0 +1,43 @@
1
+ // Backend auth resolution for `rwa edit`. Extracted from bin/rwa.mjs so the
2
+ // precedence is unit-testable (the bin entrypoint runs on import and can't be
3
+ // imported cleanly).
4
+ //
5
+ // Only the openrouter backend needs a key — ollama and lmstudio run locally
6
+ // without auth. The key resolves in order: an explicit --api-key flag, then the
7
+ // project-specific RWA_OPENROUTER_KEY (env conventions match the docker-compose
8
+ // deploy in service/), then the CONVENTIONAL OPENROUTER_API_KEY that agents and
9
+ // users normally have exported. Empty strings count as absent.
10
+
11
/**
 * Resolve the API key for a backend.
 *
 * Precedence: a non-empty `flagValue` always wins (for any backend). Otherwise
 * only the 'openrouter' backend consults the environment — RWA_OPENROUTER_KEY
 * first, then OPENROUTER_API_KEY. Empty strings are treated as absent.
 *
 * @param {string} backendName — 'openrouter' | 'ollama' | 'lmstudio'
 * @param {string|undefined} flagValue — the --api-key flag value, if any
 * @param {Record<string,string|undefined>} [env] — environment (injectable for tests)
 * @returns {string|undefined} the key, or undefined when none applies
 */
export function resolveApiKey(backendName, flagValue, env = process.env) {
  if (flagValue) {
    return flagValue;
  }
  if (backendName !== 'openrouter') {
    return undefined;
  }
  // First truthy candidate in precedence order; `find` yields undefined when
  // every candidate is missing or empty.
  const candidates = [env.RWA_OPENROUTER_KEY, env.OPENROUTER_API_KEY];
  return candidates.find(Boolean);
}
25
+
26
/**
 * Default OpenAI-compatible base URL for a backend.
 *
 * 'openrouter' always maps to its fixed public endpoint. 'ollama' and
 * 'lmstudio' use RWA_OLLAMA_URL / RWA_LMSTUDIO_URL when set to a non-empty
 * value and otherwise fall back to their localhost defaults. Any other name
 * yields undefined.
 *
 * @param {string} name — 'openrouter' | 'ollama' | 'lmstudio'
 * @param {Record<string,string|undefined>} [env] — environment (injectable for tests)
 * @returns {string|undefined}
 */
export function envBaseUrl(name, env = process.env) {
  if (name === 'openrouter') {
    return 'https://openrouter.ai/api/v1';
  }
  if (name === 'ollama') {
    return env.RWA_OLLAMA_URL || 'http://localhost:11434/v1';
  }
  if (name === 'lmstudio') {
    return env.RWA_LMSTUDIO_URL || 'http://localhost:1234/v1';
  }
  return undefined;
}
@@ -0,0 +1,249 @@
1
+ // clone-extract: locate the main article + title in a fetched webpage's HTML.
2
+ //
3
+ // This is the EXTRACTOR stage of `rwa clone <url>`: fetch (elsewhere) →
4
+ // extract (here) → sanitize (elsewhere) → bootstrap (elsewhere). It only
5
+ // LOCATES content; it does NOT strip scripts/attributes — a later sanitize
6
+ // task owns that. Pure and dependency-free (built-in JS only) so it can be
7
+ // mirrored to the browser /import path the way the rest of the CLI is.
8
+
9
// Decode the entities that show up in titles. Titles come from og:title /
// <title> / <h1>, which are entity-encoded in source HTML. WordPress wptexturize
// emits numeric smart-punctuation (&#8217; etc.), so we decode numeric entities
// generically plus a few common named ones. &amp; is decoded LAST so a
// double-encoded "&amp;#8217;" doesn't get unescaped into a live entity.
//
// Numeric references outside the Unicode range (> U+10FFFF, or so long they
// parse to Infinity) are left in the output verbatim: String.fromCodePoint
// throws a RangeError on them, and a malformed entity in a fetched page must
// not be able to abort the whole extraction.
//
// @param {string} s — entity-encoded text
// @returns {string} the text with numeric and the listed named entities decoded
function decodeEntities(s) {
  const MAX_CODE_POINT = 0x10ffff;
  // Decode one numeric reference, or hand back the raw "&#…;" text unchanged
  // when the value is not a valid code point.
  const numeric = (raw, value) => (
    Number.isInteger(value) && value >= 0 && value <= MAX_CODE_POINT
      ? String.fromCodePoint(value)
      : raw
  );
  return s
    .replace(/&#x([0-9a-fA-F]+);/g, (raw, h) => numeric(raw, parseInt(h, 16)))
    .replace(/&#(\d+);/g, (raw, d) => numeric(raw, parseInt(d, 10)))
    .replace(/&nbsp;/g, ' ')
    .replace(/&hellip;/g, '…')
    .replace(/&mdash;/g, '—')
    .replace(/&ndash;/g, '–')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&amp;/g, '&');
}
27
+
28
// Remove every "<…>" run from `s`, keeping only the text between tags.
// Purely lexical: anything from a '<' to the next '>' is dropped.
function stripTags(s) {
  const TAG = /<[^>]*>/g;
  return s.replace(TAG, () => '');
}
31
+
32
// Title precedence: og:title → <title> (minus a " | Site" / " – Site" /
// " — Site" suffix) → first <h1> → 'Untitled'. og:title is the cleanest
// signal on the WordPress/OpenGraph pages we target; the <title> suffix
// strip removes the site-name tail that most CMSes append.
//
// The og:title `content` value is captured up to the SAME quote character that
// opened it, so content="Don't Panic" is not cut off at the apostrophe. The
// capture excludes its own quote character, which keeps a match from running
// past the attribute (and therefore past the tag). An og:title that is empty
// after trimming is ignored so the <title>/<h1> fallbacks still get a chance.
//
// @param {string} html — full page HTML
// @returns {string} best-effort page title, never empty ('Untitled' as last resort)
function extractTitle(html) {
  // Either attribute order is accepted: property-then-content or the reverse.
  // Group 1 is a double-quoted value, group 2 a single-quoted one.
  const og = html.match(
    /<meta[^>]*\bproperty\s*=\s*["']og:title["'][^>]*\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
  ) || html.match(
    /<meta[^>]*\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*\bproperty\s*=\s*["']og:title["']/i,
  );
  if (og) {
    const value = og[1] !== undefined ? og[1] : og[2];
    const fromOg = decodeEntities(value.trim());
    if (fromOg) return fromOg;
  }

  const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (title) {
    const raw = decodeEntities(stripTags(title[1]).trim());
    // Drop a trailing " | Site" / " – Site" / " — Site" separator + tail.
    const cut = raw.replace(/\s*[|–—]\s*[^|–—]*$/, '').trim();
    if (cut) return cut;
    // The whole title was "separator + tail" (e.g. "| Site"): keep it as-is.
    if (raw) return raw;
  }

  const h1 = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
  if (h1) {
    const t = decodeEntities(stripTags(h1[1]).trim());
    if (t) return t;
  }

  return 'Untitled';
}
61
+
62
+ // Find the index of the opening tag for the element whose class attribute
63
+ // contains `cls`. Returns the index of the '<' or -1.
64
+ function findClassOpen(html, cls) {
65
+ // Bound the class token on the class-list separators (start/space/quote)
66
+ // rather than \b, so "entry-content" does NOT match inside
67
+ // "entry-content-wrapper" or "entry-content-meta" — a hyphen is a \w
68
+ // boundary-free char, so \b would false-match the prefix.
69
+ const re = new RegExp(
70
+ `<([a-z][a-z0-9]*)\\b[^>]*\\bclass\\s*=\\s*(["'])(?:[^"']*[\\s])?${cls}(?=[\\s]|\\2)[^"']*\\2[^>]*>`,
71
+ 'i',
72
+ );
73
+ const m = re.exec(html);
74
+ if (!m) return null;
75
+ // Compute the opening-tag end quote-aware: the regex's trailing [^>]*> stops
76
+ // at the first '>', which may be inside a later attribute value (e.g.
77
+ // title="a>b"). tagEnd skips quoted regions so openEnd lands on the real '>'.
78
+ const gt = tagEnd(html, m.index);
79
+ return { index: m.index, tag: m[1].toLowerCase(), openEnd: gt === -1 ? m.index + m[0].length : gt + 1 };
80
+ }
81
+
82
// Scan forward from `from` for the '>' that terminates a tag, ignoring any '>'
// that sits inside a single- or double-quoted attribute value (so
// <div title="a>b"> ends at the final '>', not the one inside the quotes).
//
// @param {string} html — markup to scan
// @param {number} from — offset to start scanning at
// @returns {number} offset of the terminating '>', or -1 when there is none
function tagEnd(html, from) {
  // Empty while outside a quoted value; otherwise the quote char that opened it.
  let activeQuote = '';
  for (let idx = from; idx < html.length; idx += 1) {
    const current = html[idx];
    if (activeQuote !== '') {
      // Inside a quoted value: only the matching quote is significant.
      if (current === activeQuote) activeQuote = '';
      continue;
    }
    if (current === '"' || current === "'") {
      activeQuote = current;
      continue;
    }
    if (current === '>') return idx;
  }
  return -1;
}
96
+
97
// THE CRUX — balanced extraction, parser-free.
//
// A naive non-greedy /(<div>)(.*?)(<\/div>)/ truncates at the FIRST nested
// </div>, which on a real WordPress entry-content (panel-grid wrappers,
// widget divs, tinymce divs) cuts the body off after the first paragraph. So
// instead we walk forward from the opening tag tracking the open/close depth
// of `tagName` and stop at the MATCHING close.
//
// Three kinds of region are skipped so their text is never read as markup:
//   - HTML comments (`<!-- </div> -->`, `<!-- .entry-content -->`), wherever
//     they occur — not only when the scan cursor happens to land on them;
//   - <script>/<style> raw-text content;
//   - quoted attribute values inside nested same-name opening tags.
// Self-closing same-name tags (<tag …/>) do not increase depth.
//
// @param {string} html — markup containing the element
// @param {string} tagName — element name to balance (e.g. 'div', 'article')
// @param {number} startIndex — offset of the opening tag's '<'
// @param {number} [openEnd] — offset just past the opening tag's '>', when
//   the caller already knows it; otherwise it is found with tagEnd
// @returns {string|null} the INNER HTML between the opening tag and its
//   matching close, or null if no balanced close is found
function balancedInner(html, tagName, startIndex, openEnd) {
  // Locate end of the opening tag at startIndex. Use the caller-supplied
  // openEnd (already quote-aware) when available; else scan quote-aware so a
  // '>' inside an attribute value doesn't end the tag.
  const openTagEnd = openEnd != null ? openEnd - 1 : tagEnd(html, startIndex);
  if (openTagEnd === -1) return null;
  const innerStart = openTagEnd + 1;

  const openRe = new RegExp(`<${tagName}\\b`, 'gi');
  const closeRe = new RegExp(`</${tagName}\\s*>`, 'gi');
  // Raw-text elements: their content is CDATA-like (never markup), so a literal
  // "</div>" or "<div" inside a <script>/<style> must not perturb the scan.
  const rawTextRe = /<(script|style)\b/gi;

  let depth = 1;
  let pos = innerStart;
  while (pos < html.length) {
    openRe.lastIndex = pos;
    closeRe.lastIndex = pos;
    rawTextRe.lastIndex = pos;
    const o = openRe.exec(html);
    const c = closeRe.exec(html);
    const r = rawTextRe.exec(html);
    if (!c) break; // unbalanced — no matching close

    // Offset of the earliest upcoming open / close / raw-text event.
    const nextEvent = Math.min(c.index, o ? o.index : Infinity, r ? r.index : Infinity);

    // If a comment starts before that event, the event may be INSIDE the
    // comment (e.g. a commented-out </div>). Jump past the comment and
    // re-evaluate from there. An unterminated comment swallows the rest of
    // the document, so no balanced close exists.
    const commentStart = html.indexOf('<!--', pos);
    if (commentStart !== -1 && commentStart < nextEvent) {
      const commentEnd = html.indexOf('-->', commentStart + 4);
      if (commentEnd === -1) break;
      pos = commentEnd + 3;
      continue;
    }

    // If a raw-text element opens before the next same-name open/close event,
    // fast-forward past its matching </script>/</style> (raw-text elements
    // don't nest), so its text — which may contain "</div>" or "<div" — is
    // never interpreted as markup.
    if (r && r.index < c.index && (!o || r.index < o.index)) {
      const close = new RegExp(`</${r[1]}\\s*>`, 'i');
      const after = r.index + r[0].length;
      const m = close.exec(html.slice(after));
      pos = m ? after + m.index + m[0].length : html.length;
      continue;
    }

    if (o && o.index < c.index) {
      // Next event is a nested open of the same tag. The self-closing form
      // (<tag .../>) adds no depth. Quote-aware tag end so a '>' inside an
      // attribute value doesn't end the tag early.
      const gt = tagEnd(html, o.index);
      const selfClosing = gt !== -1 && html[gt - 1] === '/';
      if (!selfClosing) depth++;
      pos = (gt === -1 ? html.length : gt + 1);
    } else {
      depth--;
      pos = c.index + c[0].length;
      if (depth === 0) return html.slice(innerStart, c.index);
    }
  }
  return null;
}
170
+
171
// Page "chrome" detection for the generic density fallback. An element counts
// as chrome — navigation, banners, footers, sidebars and the like, as opposed
// to article prose — when either its tag name is one of nav/header/footer/aside
// or its class attribute contains one of the listed words as a whole word.
// The fallback skips chrome so the dense content block wins even on a page
// with no recognised content container. This is a heuristic, not a parser.
const CHROME_TAGS = /^(nav|header|footer|aside)$/i;
const CHROME_CLASS = /\b(nav|menu|sidebar|footer|header|comment|share|related)\b/i;

// @param {string} tag — element name
// @param {string} attrs — raw attribute text of the element's opening tag
// @returns {boolean} true when the element looks like page chrome
function isChrome(tag, attrs) {
  const chromeByTag = CHROME_TAGS.test(tag);
  if (chromeByTag) {
    return true;
  }
  // Only the first class attribute (single- or double-quoted) is inspected.
  const classAttr = attrs.match(/\bclass\s*=\s*["']([^"']*)["']/i);
  if (classAttr === null) {
    return false;
  }
  return CHROME_CLASS.test(classAttr[1]);
}
186
+
187
// Text density of an HTML fragment: whitespace-collapsed text length divided
// by the number of opening tags in it. A fragment with no opening tags is
// treated as having one, so the ratio is always finite.
//
// @param {string} inner — HTML fragment
// @returns {number} characters of text per opening tag
function density(inner) {
  const collapsedText = stripTags(inner).replace(/\s+/g, ' ').trim();
  const openingTags = inner.match(/<[a-z][^>]*>/gi);
  let tagCount = 1;
  if (openingTags !== null && openingTags.length !== 0) {
    tagCount = openingTags.length;
  }
  return collapsedText.length / tagCount;
}
192
+
193
// Density fallback: pick the top-level element (inside <body> when present)
// with the highest density() score, skipping page chrome and non-content
// elements. Returns that element's inner HTML, or the whole scope when no
// element qualifies.
//
// @param {string} html — full page HTML
// @returns {string} inner HTML of the best candidate, or the body/page scope
function genericFallback(html) {
  // Elements whose content is never article prose. A large inline <script>
  // has lots of "text" and almost no tags, so without this exclusion it would
  // win the density contest.
  const nonContent = /^(?:script|style|noscript|template)$/;

  // Restrict to <body> when present. The close tag is located with a
  // case-insensitive regex on the ORIGINAL string: searching a lower-cased
  // copy is unsafe because lower-casing can change string length (e.g. 'İ'
  // becomes two code units), which shifts every index after it.
  let scope = html;
  const bodyOpen = /<body\b[^>]*>/i.exec(html);
  if (bodyOpen) {
    const start = bodyOpen.index + bodyOpen[0].length;
    const bodyCloseRe = /<\/body\s*>/gi;
    bodyCloseRe.lastIndex = start;
    const bodyClose = bodyCloseRe.exec(html);
    scope = html.slice(start, bodyClose ? bodyClose.index : html.length);
  }

  // Walk top-level block elements in scope. After each balanced element the
  // scan resumes at its end, so only siblings are compared — never nested
  // descendants — which keeps the density comparison apples-to-apples.
  const blockRe = /<([a-z][a-z0-9]*)\b[^>]*>/gi;
  let best = null;
  let m;
  while ((m = blockRe.exec(scope))) {
    const tag = m[1].toLowerCase();
    // Quote-aware end of the opening tag, so the offsets used here agree with
    // the ones balancedInner uses even when an attribute value contains '>'.
    const gt = tagEnd(scope, m.index);
    if (gt === -1) continue;
    const inner = balancedInner(scope, tag, m.index, gt + 1);
    // No balanced close (void element, broken markup): not a candidate; keep
    // scanning right after this tag.
    if (inner == null) continue;

    // Resume after this whole element so its descendants are skipped.
    const closeRe = new RegExp(`</${tag}\\s*>`, 'gi');
    closeRe.lastIndex = gt + 1 + inner.length;
    const cm = closeRe.exec(scope);
    blockRe.lastIndex = cm ? cm.index + cm[0].length : scope.length;

    if (nonContent.test(tag)) continue;
    const attrs = scope.slice(m.index + 1 + m[1].length, gt);
    if (isChrome(tag, attrs)) continue;
    const score = density(inner);
    if (!best || score > best.score) best = { score, inner };
  }
  return best ? best.inner : scope;
}
228
+
229
/**
 * Locate the page title and the main article markup in `html`.
 *
 * Content is chosen by the first strategy that yields a balanced element:
 *   1. the element whose class list contains "entry-content" (WordPress);
 *   2. the first <article> element;
 *   3. the density heuristic in genericFallback.
 *
 * @param {string} html — full page HTML
 * @returns {{title:string, html:string}} extracted title and inner HTML
 */
export function extractArticle(html) {
  const title = extractTitle(html);

  // Strategy 1 — WordPress-style ".entry-content" container.
  const entry = findClassOpen(html, 'entry-content');
  const entryInner = entry
    ? balancedInner(html, entry.tag, entry.index, entry.openEnd)
    : null;
  if (entryInner != null) {
    return { title, html: entryInner };
  }

  // Strategy 2 — generic semantic markup: first <article>…</article>.
  const articleAt = html.search(/<article\b[^>]*>/i);
  const articleInner = articleAt === -1
    ? null
    : balancedInner(html, 'article', articleAt);
  if (articleInner != null) {
    return { title, html: articleInner };
  }

  // Strategy 3 — density heuristic over <body> top-level blocks.
  return { title, html: genericFallback(html) };
}
package/src/clone.mjs ADDED
@@ -0,0 +1,161 @@
1
+ // `rwa clone <url>` bootstrap wiring. Turns a fetched web page into a saved
2
+ // rewritable by reusing the EXACT import pipeline: extract the article body,
3
+ // run it through the same sanitiser the import path uses, then drop it into a
4
+ // fresh seed via applySeedSubs + replaceInlineDoc.
5
+ //
6
+ // Why prepend an <h1>: inspectDoc derives the document title from the body's
7
+ // first <h1>, not the <title> tag. A WordPress post's <h1> lives outside
8
+ // .entry-content, so the extracted content carries no heading — without this
9
+ // the cloned doc would render as "Untitled". The extracted page title is the
10
+ // honest source, so we prepend it.
11
+
12
+ import { access } from 'node:fs/promises';
13
+ import { basename } from 'node:path';
14
+
15
+ import { extractArticle } from './clone-extract.mjs';
16
+ import { sanitizeImportedHtml } from './import.mjs';
17
+ import { loadSeed, applySeedSubs, replaceInlineDoc } from './seed.mjs';
18
+ import { SEED_CANDIDATES } from './commands.mjs';
19
+ import { atomicWrite } from './atomic-write.mjs';
20
+ import { fetchPage, fetchImageDataUri, CloneError } from './fetch-page.mjs';
21
+
22
// Escape the four characters that are significant in HTML text and in
// double-quoted attribute values: & < > ". The input is coerced with String()
// first, so non-string values are accepted. Single quotes are left untouched.
function escapeHtml(s) {
  const replacements = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;' };
  // Single pass over the input, so an '&' produced by one replacement is never
  // re-escaped by another.
  return String(s).replace(/[&<>"]/g, (ch) => replacements[ch]);
}
29
+
30
// `rwa clone --localize-images`: make a clone self-contained by replacing each
// remote <img src> with a data: URI fetched through fetchImageDataUri, so the
// saved file needs no network. GRACEFUL by design: an unresolvable URL, a
// failed fetch, or an exhausted total budget leaves that <img> at its original
// src and records a warning — one bad image never fails the clone.
// Default caps: 2 MB per image, 8 MB in total. Relative src values (starting
// with "/", "./" or "../") are resolved against the page URL.
const LOCALIZE_PER_IMAGE = 2 * 1024 * 1024;
const LOCALIZE_TOTAL = 8 * 1024 * 1024;
const IMG_SRC_RE = /(<img\b[^>]*?\bsrc\s*=\s*)(["'])(https?:\/\/[^"']+|\/[^"']*|\.\.?\/[^"']*)\2/gi;

/**
 * Inline the images referenced by `html` as data: URIs.
 *
 * @param {string} html — sanitized article markup
 * @param {string} sourceUrl — page URL, used as the base for relative src values
 * @param {{deps?:object, perImage?:number, totalCap?:number}} [opts]
 *   `deps` is forwarded to fetchImageDataUri; `perImage` / `totalCap` override
 *   the default byte caps (a falsy value selects the default)
 * @returns {Promise<{html:string, inlined:number, warnings:string[]}>}
 */
export async function localizeImages(html, sourceUrl, opts = {}) {
  const deps = opts.deps || {};
  const perImage = opts.perImage || LOCALIZE_PER_IMAGE;
  const totalCap = opts.totalCap || LOCALIZE_TOTAL;
  const warnings = [];
  let inlined = 0;
  let totalBytes = 0;

  // Pass 1 — gather the distinct src values in document order. Async work
  // can't run inside String.replace, and a page often repeats an image, so
  // each src is fetched at most once.
  const distinctSources = new Set();
  IMG_SRC_RE.lastIndex = 0;
  for (let hit = IMG_SRC_RE.exec(html); hit !== null; hit = IMG_SRC_RE.exec(html)) {
    distinctSources.add(hit[3]);
  }

  // Pass 2 — fetch sequentially so the running byte total is accurate when
  // the next image's allowance is computed. Only successes are stored; a src
  // missing from the map stays untouched in the output.
  const dataUriBySrc = new Map();
  for (const src of distinctSources) {
    let absolute;
    try {
      absolute = new URL(src, sourceUrl).href;
    } catch {
      warnings.push(`skipped ${src} (unresolvable URL)`);
      continue;
    }
    if (totalBytes >= totalCap) {
      warnings.push(`skipped ${src} (container image budget reached)`);
      continue;
    }
    try {
      const allowance = Math.min(perImage, totalCap - totalBytes);
      const dataUri = await fetchImageDataUri(absolute, { maxBytes: allowance, deps });
      // base64 is ~4/3 of the raw bytes; count the raw size toward the budget.
      totalBytes += Math.floor((dataUri.length - dataUri.indexOf(',') - 1) * 3 / 4);
      dataUriBySrc.set(src, dataUri);
      inlined++;
    } catch (err) {
      warnings.push(`skipped ${src} (${err && err.subcode ? err.subcode : (err && err.message) || 'fetch failed'})`);
    }
  }

  // Pass 3 — rewrite every occurrence whose src was fetched successfully.
  const out = html.replace(IMG_SRC_RE, (whole, pre, quote, src) => {
    const dataUri = dataUriBySrc.get(src);
    if (!dataUri) return whole;
    return pre + quote + dataUri + quote;
  });
  return { html: out, inlined, warnings };
}
85
+
86
/**
 * Build a rewritable container from fetched page HTML and write it to `outPath`.
 *
 * Steps: extract title + article → sanitize → (optionally) inline images →
 * wrap in <article> with a leading <h1> and a provenance footer → apply the
 * seed-level substitutions → inject the body → atomic write.
 *
 * @param {string} html — fetched page HTML
 * @param {string} outPath — destination file
 * @param {string} sourceUrl — URL the page was fetched from (shown in the footer)
 * @param {{localizeImages?:boolean, deps?:object}} [opts]
 * @returns {Promise<{outPath:string, title:string}>}
 */
export async function cloneFromHtml(html, outPath, sourceUrl, opts = {}) {
  const extracted = extractArticle(html);
  const title = extracted.title;
  // sanitizeImportedHtml returns { html, warnings }; only the cleaned body is used.
  let clean = sanitizeImportedHtml(extracted.html).html;

  // --localize-images: swap remote <img src> for data: URIs. Runs AFTER the
  // sanitizer, on the markup that survived it.
  if (opts.localizeImages) {
    const localized = await localizeImages(clean, sourceUrl, { deps: opts.deps });
    clean = localized.html;
    if (localized.inlined) console.error(`note: inlined ${localized.inlined} image(s)`);
    for (const warning of localized.warnings) console.error('note: ' + warning);
  }

  // Defence-in-depth for direct callers of this exported function: only an
  // http/https URL becomes a live <a href>. Anything else (javascript:, data:,
  // file:, …) is rendered as escaped plain text, so a hostile scheme can never
  // produce a clickable link in the cloned document.
  const shownUrl = escapeHtml(sourceUrl);
  const isWebUrl = /^https?:\/\//i.test(String(sourceUrl));
  const provenance = isWebUrl ? `<a href="${shownUrl}">${shownUrl}</a>` : shownUrl;

  const body = [
    '<article>',
    `<h1>${escapeHtml(title)}</h1>`,
    clean,
    `<footer><p><small>Cloned from ${provenance}</small></p></footer>`,
    '</article>',
  ].join('\n');

  // Seed-level substitutions run on the pristine seed FIRST and the body is
  // injected afterwards, so a substitution can't false-match inside the
  // imported content.
  const seed = await loadSeed(SEED_CANDIDATES);
  const prepared = applySeedSubs(seed, {
    uuid: crypto.randomUUID(),
    title,
    fileMeta: basename(outPath),
  });
  await atomicWrite(outPath, replaceInlineDoc(prepared, body));
  return { outPath, title };
}
127
+
128
// Derive a filename slug from the last non-empty path segment of `url`:
// a trailing ".ext" is dropped, every run of characters outside [A-Za-z0-9-]
// becomes a single '-', and leading/trailing dashes are trimmed. Falls back to
// 'clone' when the URL cannot be parsed or nothing usable remains.
function slugFromUrl(url) {
  const FALLBACK = 'clone';
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch {
    return FALLBACK;
  }
  const parts = pathname.split('/').filter((part) => part !== '');
  const tail = parts.length > 0 ? parts[parts.length - 1] : '';
  const withoutExtension = tail.replace(/\.[a-z0-9]+$/i, '');
  const dashed = withoutExtension.replace(/[^a-z0-9-]+/gi, '-');
  const trimmed = dashed.replace(/^-+|-+$/g, '');
  return trimmed === '' ? FALLBACK : trimmed;
}
141
+
142
/**
 * `rwa clone <url>` entry point: fetch the page, pick the output path, refuse
 * to overwrite an existing file unless `force` is set, then build and write
 * the clone and report the path on stdout.
 *
 * @param {{url:string, outPath?:string, force?:boolean, localizeImages?:boolean}} args
 *   `outPath` defaults to `./<slug-from-url>.html`
 * @returns {Promise<void>}
 * @throws {CloneError} code 2 / 'exists' when the destination exists and
 *   `force` is not set
 */
export async function cloneCmd({ url, outPath, force, localizeImages: localize }) {
  // The page is fetched before the destination check, as the first step.
  const html = await fetchPage(url);
  const resolvedOut = outPath || `./${slugFromUrl(url)}.html`;

  if (!force) {
    // access() resolves when the path is reachable and rejects otherwise; any
    // rejection is treated as "does not exist".
    const exists = await access(resolvedOut).then(() => true, () => false);
    if (exists) {
      throw new CloneError(2, 'exists', {
        path: resolvedOut,
        message: `destination exists: ${resolvedOut} (use --force to overwrite)`,
      });
    }
  }

  await cloneFromHtml(html, resolvedOut, url, { localizeImages: localize });
  console.log('wrote ' + resolvedOut);
}
package/src/commands.mjs CHANGED
@@ -4,7 +4,9 @@ import { fileURLToPath, pathToFileURL } from 'node:url';
4
4
  import { spawn } from 'node:child_process';
5
5
  import crypto from 'node:crypto';
6
6
 
7
- import { loadSeed, applySeedSubs, replaceInlineDoc } from './seed.mjs';
7
+ import { loadSeed, applySeedSubs, replaceInlineDoc, extractInlineDoc, kindOverrides, KNOWN_KINDS } from './seed.mjs';
8
+ import { skinByName } from './skins.mjs';
9
+ import { resolveBareWord } from './template.mjs';
8
10
  import { convert } from './import.mjs';
9
11
  import { convertPdfViaVision } from './import-vision.mjs';
10
12
  import { convertViaClaudeCli } from './import-claude.mjs';
@@ -13,8 +15,10 @@ const here = path.dirname(fileURLToPath(import.meta.url));
13
15
  const packageRoot = path.dirname(here);
14
16
 
15
17
  // Look in the in-package copy first (published case), fall back to the
16
- // repo-canonical seed (dev case where cli/ sits next to seeds/).
17
- const SEED_CANDIDATES = [
18
+ // repo-canonical seed (dev case where cli/ sits next to seeds/). Exported so
19
+ // the `rwa edit` instruction path can extract SYSTEM_PROMPTS/TOOL_SCHEMAS
20
+ // from the same seed `rwa new`/`rwa import` use — single source of truth.
21
+ export const SEED_CANDIDATES = [
18
22
  path.join(packageRoot, 'seeds', 'rewritable.html'),
19
23
  path.join(packageRoot, '..', 'seeds', 'rewritable.html'),
20
24
  ];
@@ -130,19 +134,89 @@ function openFile(target, prefill) {
130
134
  child.unref();
131
135
  }
132
136
 
133
- export async function newCmd({ outPath, force, open }) {
134
- const out = path.resolve(outPath || './rewritable.html');
137
// Open a freshly written container, passing the values gathered by
// collectPrefill (key / backend / model) on to openFile. Each value that is
// present is announced on stderr first, so the user can see what is being
// put into the URL. Exported so `rwa create` can honor --open without
// duplicating the openFile + collectPrefill sequence.
//
// @param {string} out — path of the container to open
// @returns {Promise<void>}
export async function openWithPrefill(out) {
  const prefill = await collectPrefill();
  // [value, notice] pairs, in the order the notices are printed.
  const notices = [
    [prefill.key, 'note: passing OPENROUTER_API_KEY via ?key= URL parameter'],
    [prefill.backend, `note: passing RWA_BACKEND=${prefill.backend} via ?backend= URL parameter`],
    [prefill.model, `note: passing RWA_MODEL=${prefill.model} via ?model= URL parameter`],
  ];
  for (const [value, notice] of notices) {
    if (value) console.error(notice);
  }
  openFile(out, prefill);
}
148
+
149
+ export async function newCmd({ outPath, force, open, kind, templateName, skin }) {
150
+ // Two body sources funnel through one seed-subs path. Default: a built-in
151
+ // starter (kindOverrides). `templateName` set: clone a data-rwa-template-labeled
152
+ // file from cwd — pristine seed + the template's INLINE_DOC (label stripped),
153
+ // fresh UUID. A cloned instance is a document with the template's body (the
154
+ // template `kind` is a discovery label, not a PRODUCT_KIND).
155
+ let out, bodyOverride, fromMsg = '';
156
+ let resolvedKind = kind || 'document';
157
+ if (templateName) {
158
+ // Template-first, kind-fallback (design 2026-05-31 §3.2), via the ONE resolver
159
+ // shared with `rwa create`: a bare word is first a cwd template label to clone;
160
+ // on a miss, if it names a built-in kind, emit that kind; otherwise error naming
161
+ // both misses. A user's labeled file thus overrides the built-in starter, and
162
+ // `rwa new presentation` makes the deck.
163
+ const frame = await resolveBareWord(templateName, process.cwd());
164
+ const dated = `./${templateName}-${new Date().toISOString().slice(0, 10)}.html`;
165
+ if (frame && frame.source === 'template') {
166
+ if (frame.ambiguous) console.error(`note: multiple "${templateName}" templates in ./; using ${rel(frame.templatePath)} (most recent)`);
167
+ out = path.resolve(outPath || dated);
168
+ bodyOverride = frame.body; // already label-stripped by the resolver
169
+ resolvedKind = 'document';
170
+ fromMsg = ` (from template ${rel(frame.templatePath)})`;
171
+ } else if (frame && frame.source === 'kind') {
172
+ resolvedKind = frame.kind;
173
+ out = path.resolve(outPath || dated);
174
+ // bodyOverride stays unset → kindOverrides(resolvedKind) supplies the body.
175
+ } else {
176
+ const e = new Error(`no rwa file in ./ is labeled "${templateName}", and "${templateName}" is not a known kind (${KNOWN_KINDS.join(', ')}). Add data-rwa-template="${templateName}" to a doc's root element to make it a template, or use a known kind.`);
177
+ e.exitCode = 2;
178
+ throw e;
179
+ }
180
+ } else {
181
+ out = path.resolve(outPath || './rewritable.html');
182
+ }
135
183
  await ensureWritable(out, force);
136
184
  const seed = await loadSeed(SEED_CANDIDATES);
137
185
  const fileMeta = path.basename(out);
138
186
  const title = titleFromBasename(path.basename(out, path.extname(out)));
139
- const result = applySeedSubs(seed, {
187
+ // R9-minimal: kind defaults to 'document' (current behavior — no overrides
188
+ // applied, byte-identical to pre-flag emit). For other kinds, kindOverrides
189
+ // supplies the INLINE_DOC body and lens placeholder; SYSTEM_PROMPT is
190
+ // intentionally left alone (audit R1).
191
+ const overrides = kindOverrides(resolvedKind);
192
+ let result = applySeedSubs(seed, {
140
193
  uuid: crypto.randomUUID(),
141
194
  title,
142
195
  fileMeta,
196
+ lensPlaceholder: overrides.lensPlaceholder,
197
+ palPlaceholder: overrides.palPlaceholder,
198
+ productHeader: overrides.productHeader,
199
+ productKind: resolvedKind, // audit R1
200
+ lensClickToAnchor: overrides.lensClickToAnchor, // audit R3 scoped
143
201
  });
202
+ let body = bodyOverride != null ? bodyOverride : overrides.body;
203
+ // --skin: prepend the preset's <style data-rwa-skin> block as the leading child
204
+ // of INLINE_DOC. Skin is orthogonal to kind (a skinned document/presentation),
205
+ // and the inject runs AFTER applySeedSubs (the `rwa import` ordering lesson) so
206
+ // the skin CSS can't false-match a substitution regex. Deterministic, offline,
207
+ // model-free — the L1 restyle is a later phase. skinByName throws exit-2 on an
208
+ // unknown name (caught by the bin's outer handler).
209
+ if (skin) {
210
+ const { theme } = skinByName(skin);
211
+ const base = body != null ? body : extractInlineDoc(result);
212
+ body = theme + '\n' + base;
213
+ }
214
+ if (body != null) result = replaceInlineDoc(result, body);
144
215
  await fs.writeFile(out, result, 'utf8');
145
- console.log(`wrote ${rel(out)}`);
216
+ // Annotate with the resolved kind (covers both `--kind presentation` and the
217
+ // bare-word `rwa new presentation` fallback); a template clone reports its source.
218
+ const kindMsg = resolvedKind !== 'document' ? ` (kind: ${resolvedKind})` : '';
219
+ console.log(`wrote ${rel(out)}${fromMsg || kindMsg}`);
146
220
  if (open) {
147
221
  const prefill = await collectPrefill();
148
222
  if (prefill.key) console.error('note: passing OPENROUTER_API_KEY via ?key= URL parameter');
@@ -152,7 +226,9 @@ export async function newCmd({ outPath, force, open }) {
152
226
  }
153
227
  }
154
228
 
155
- export async function importCmd({ inputPath, outPath, force, open, vision, claude, model, timeoutSec }) {
229
+ export { KNOWN_KINDS };
230
+
231
+ export async function importCmd({ inputPath, outPath, force, open, vision, claude, trustInput, model, timeoutSec }) {
156
232
  if (vision && claude) {
157
233
  const e = new Error('--vision and --claude are mutually exclusive');
158
234
  e.exitCode = 2;
@@ -177,9 +253,13 @@ export async function importCmd({ inputPath, outPath, force, open, vision, claud
177
253
  const contents = await fs.readFile(input);
178
254
  ({ html, warnings } = await convertPdfViaVision(contents, { model }));
179
255
  } else if (claude) {
180
- console.error(`note: claude: spawning \`claude -p\`…`);
256
+ if (trustInput) {
257
+ console.error(`note: claude: --trust-input set — running the agent with bypassPermissions on ${path.basename(input)}. Only safe for files you trust.`);
258
+ }
181
259
  // Pass the path; the skill reads the file itself via its own tools.
182
- const claudeOpts = timeoutSec ? { timeoutMs: timeoutSec * 1000 } : {};
260
+ // trustInput gates the bypassPermissions agent (see import-claude.mjs); the
261
+ // consent gate there throws with exitCode 2 when it is absent.
262
+ const claudeOpts = { trustInput, ...(timeoutSec ? { timeoutMs: timeoutSec * 1000 } : {}) };
183
263
  ({ html, warnings } = await convertViaClaudeCli(input, ext, claudeOpts));
184
264
  } else {
185
265
  // Buffer (not utf8 string) — docx and pdf are binary, and text formats