rewritable 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -5
- package/bin/rwa.mjs +1000 -9
- package/package.json +2 -2
- package/seeds/rewritable.html +4065 -207
- package/src/agent-loop.mjs +155 -0
- package/src/apply-edits.mjs +664 -0
- package/src/atomic-write.mjs +38 -0
- package/src/backend.mjs +43 -0
- package/src/clone-extract.mjs +249 -0
- package/src/clone.mjs +161 -0
- package/src/commands.mjs +90 -10
- package/src/create.mjs +256 -0
- package/src/doc.mjs +69 -0
- package/src/dsl-compiler.mjs +357 -0
- package/src/edit.mjs +300 -0
- package/src/fetch-page.mjs +346 -0
- package/src/host.mjs +126 -0
- package/src/identity.mjs +257 -0
- package/src/import-claude.mjs +28 -4
- package/src/import-vision.mjs +1 -1
- package/src/import.mjs +76 -10
- package/src/ls.mjs +105 -0
- package/src/publish-site.mjs +85 -0
- package/src/publish.mjs +98 -0
- package/src/seed-extract.mjs +40 -0
- package/src/seed.mjs +1387 -5
- package/src/self-contained.mjs +115 -0
- package/src/skill-manifest.mjs +227 -0
- package/src/skin.mjs +350 -0
- package/src/skins.mjs +274 -0
- package/src/template.mjs +109 -0
package/src/edit.mjs
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
// Plan-path entry for `rwa edit`. Composes the three foundation modules
|
|
2
|
+
// (dsl-compiler, apply-edits, seed splice helpers) into a single function
|
|
3
|
+
// that takes a target .html and a tool-envelope, applies the edit
|
|
4
|
+
// deterministically, and atomically writes the file back.
|
|
5
|
+
//
|
|
6
|
+
// Error surface (load-bearing — Task 5's --json output keys on these):
|
|
7
|
+
// exitCode 2 / subcode: 'not_found', 'read_error', 'not_a_rewritable'
|
|
8
|
+
// exitCode 3 / subcode: 'not_an_object', 'unknown_shape',
|
|
9
|
+
// 'ambiguous_envelope', 'missing_version',
|
|
10
|
+
// 'version_mismatch', 'missing_reason',
|
|
11
|
+
// 'malformed_envelope', 'frozen_zone_violation',
|
|
12
|
+
// plus DslCompileError.code or RwaEditError.code
|
|
13
|
+
// from the underlying modules.
|
|
14
|
+
|
|
15
|
+
import { readFile } from 'node:fs/promises';
|
|
16
|
+
import { atomicWrite } from './atomic-write.mjs';
|
|
17
|
+
import {
|
|
18
|
+
applyEdits, RwaEditError, dataRwaFrozenSnapshot, FAILURE_HINTS,
|
|
19
|
+
virtualizeImages, expandImages, assertNoNewAssetTokens, mapEnvelopeImages, MAX_DOC_EXPANDED,
|
|
20
|
+
extractFrozenZones3, lockedRangesIn, markerZoneRangesIn,
|
|
21
|
+
} from './apply-edits.mjs';
|
|
22
|
+
import { compileDslPlan } from './dsl-compiler.mjs';
|
|
23
|
+
import { extractInlineDoc, replaceInlineDoc } from './seed.mjs';
|
|
24
|
+
|
|
25
|
+
/**
 * Error type for the `rwa edit` plan path.
 *
 * Carries the process exit code, a machine-readable subcode (also used as the
 * Error message) and a details bag for `--json` consumers.
 *
 * Self-documenting failures: when FAILURE_HINTS has an entry for the subcode
 * and the caller did not supply its own `details.hint`, a one-line recovery
 * hint is attached. Mirrors the seed's failureToToolResult. Subcodes without a
 * hint are left untouched.
 */
export class CliError extends Error {
  /**
   * @param {number} exitCode — process exit code for this failure class
   * @param {string} subcode — machine-readable failure code (also the message)
   * @param {object} [details] — extra context; shallow-copied, never mutated
   */
  constructor(exitCode, subcode, details = {}) {
    super(subcode);
    this.exitCode = exitCode;
    this.subcode = subcode;
    // Shallow copy: attaching the hint below must not write into an object the
    // caller still owns (and may reuse for another error).
    this.details = { ...details };
    // Own-property lookup only: a subcode such as 'constructor' or 'toString'
    // must not pick up an inherited Object.prototype member as its "hint".
    const hint = Object.hasOwn(FAILURE_HINTS, subcode) ? FAILURE_HINTS[subcode] : undefined;
    if (hint && this.details.hint == null) this.details.hint = hint;
  }
}
|
|
39
|
+
|
|
40
|
+
// validateEnvelope (defined further below): inspects the envelope's discriminator set and asserts version invariants.
|
|
41
|
+
// Returns the canonical tool name on success.
|
|
42
|
+
// Guard for the wholesale-replacement paths (replace_document and the DSL
// escape op): the counterpart of the protections applyEdits enforces on the
// find/replace path. Throws CliError(3, …) on the first violation, returns
// nothing when the new doc is acceptable. Checks run in this order:
//   1. class-lock coverage in the CURRENT doc (class_lock_uncovered)
//   2. marker-form frozen zones: no unterminated marker in the new doc, no
//      duplicate names in either doc, every old zone present with identical
//      inner content, no newly added zone (frozen_zone_violation)
//   3. attribute-form data-rwa-frozen snapshot equality (frozen_zone_violation)
//   4. reserved id="rwa-doc-mount" must not appear in the new doc (reserved_id_used)
//   5. with <meta name="rwa-id-strict"> in the current doc, no existing
//      data-rwa-id may disappear (rwa_id_stripped)
function assertFrozenPreserved(currentDoc, newDoc) {
  const zoneViolation = (details) => new CliError(3, 'frozen_zone_violation', details);

  // 1. A bare .rwa-locked block cannot be trusted to survive a full rewrite, so
  //    every lock range in the current doc must lie inside a marker-form zone
  //    (the zone wraps or equals the lock). This is a precondition on the
  //    current doc — the new doc is not consulted.
  const lockSpans = lockedRangesIn(currentDoc);
  if (lockSpans.length > 0) {
    const zoneSpans = markerZoneRangesIn(currentDoc);
    const isCovered = ([ls, le]) => zoneSpans.some(([ms, me]) => ms <= ls && le <= me);
    const uncovered = lockSpans.find((span) => !isCovered(span));
    if (uncovered) {
      const [ls, le] = uncovered;
      throw new CliError(3, 'class_lock_uncovered', { lockRange: [ls, le] });
    }
  }

  // 2. Marker-form zones. One scan per doc feeds all four sub-checks. The CLI
  //    reports frozen_zone_violation here (its replace-path convention).
  const zonesBefore = extractFrozenZones3(currentDoc);
  const zonesAfter = extractFrozenZones3(newDoc);
  const firstFlagged = (zones, kind) => zones.find((zone) => zone.error === kind);

  const unterminated = firstFlagged(zonesAfter, 'unterminated');
  if (unterminated) {
    throw zoneViolation({
      zone: unterminated.name,
      reason: 'replace_document must not leave an unterminated frozen-zone marker',
    });
  }

  // New doc is inspected first, then the current doc.
  const duplicated = firstFlagged(zonesAfter, 'duplicate') || firstFlagged(zonesBefore, 'duplicate');
  if (duplicated) {
    throw zoneViolation({
      zone: duplicated.name,
      reason: 'duplicate frozen-zone name (a tampered shadow copy could hide behind a last-wins match)',
    });
  }

  const indexByName = (zones) => new Map(zones.map(({ name, inner }) => [name, inner]));
  const innerBefore = indexByName(zonesBefore);
  const innerAfter = indexByName(zonesAfter);

  // Every pre-existing zone must still exist with identical inner content
  // (the name is the key; inner content is what gets compared).
  for (const [name, inner] of innerBefore) {
    const intact = innerAfter.has(name) && innerAfter.get(name) === inner;
    if (!intact) {
      throw zoneViolation({
        zone: name,
        reason: 'replace_document must preserve frozen zones byte-identically',
      });
    }
  }

  // No zone may be introduced that was not already there.
  for (const name of innerAfter.keys()) {
    if (innerBefore.has(name)) continue;
    throw zoneViolation({
      zone: name,
      reason: 'replace_document must not add a new frozen zone',
    });
  }

  // 3. Attribute-form frozen elements: the two snapshots must match entry by entry.
  const snapBefore = dataRwaFrozenSnapshot(currentDoc);
  const snapAfter = dataRwaFrozenSnapshot(newDoc);
  const snapshotsEqual =
    snapBefore.length === snapAfter.length && snapBefore.every((entry, i) => entry === snapAfter[i]);
  if (!snapshotsEqual) {
    throw zoneViolation({
      form: 'attribute',
      reason: 'replace_document must preserve data-rwa-frozen elements byte-identically',
    });
  }

  // 4. Reserved HTML id — parser-free textual check on the new doc.
  const reservedId = /\bid\s*=\s*["']?rwa-doc-mount(?=["'\s/>]|$)/i;
  if (reservedId.test(newDoc)) {
    throw new CliError(3, 'reserved_id_used', { id: 'rwa-doc-mount' });
  }

  // 5. Opt-in strict ids: only enforced when the CURRENT doc declares the meta tag.
  const strictIds = /<meta\s+name\s*=\s*["']?rwa-id-strict\b/i.test(currentDoc);
  if (strictIds) {
    const collectIds = (text) => {
      const found = new Set();
      for (const m of text.matchAll(/\sdata-rwa-id\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
        found.add(m[1] ?? m[2]);
      }
      return found;
    };
    const idsAfter = collectIds(newDoc);
    for (const id of collectIds(currentDoc)) {
      if (!idsAfter.has(id)) throw new CliError(3, 'rwa_id_stripped', { id });
    }
  }
}
|
|
137
|
+
|
|
138
|
+
// Lone-surrogate guard (mirror of the seed's isWellFormed check). Returns false
// only for a string containing an unpaired UTF-16 surrogate; non-strings pass
// (their type is validated elsewhere).
//
// Uses String.prototype.isWellFormed where the runtime provides it (Node 20+).
// On older runtimes the guard used to pass everything; it now falls back to a
// unicode-mode regex. In `u` mode a valid surrogate pair is read as a single
// code point above U+FFFF, so the class below can only match an UNPAIRED
// surrogate.
const isWellFormedStr = (s) => {
  if (typeof s !== 'string') return true;
  if (typeof s.isWellFormed === 'function') return s.isWellFormed();
  return !/[\uD800-\uDFFF]/u.test(s);
};
|
|
141
|
+
|
|
142
|
+
// Shape + version gate for a tool envelope. Exactly one discriminator key
// ('edits', 'ops' or 'doc') must be present, `version` must be a non-empty
// string matching that discriminator, and a 'doc' envelope additionally needs
// a string doc, a non-empty string reason, and no lone surrogates in either.
// Returns the canonical tool name: 'apply_edits', 'apply_dsl_plan' or
// 'replace_document'. Throws CliError(3, <subcode>) on any failure.
function validateEnvelope(env) {
  const isObject = env !== null && typeof env === 'object' && !Array.isArray(env);
  if (!isObject) {
    throw new CliError(3, 'not_an_object');
  }

  // `in` (not a value check) decides presence, so an explicit `doc: undefined`
  // still counts as a 'doc' envelope and is rejected by the type check below.
  const present = ['edits', 'ops', 'doc'].filter((key) => key in env);
  if (present.length === 0) throw new CliError(3, 'unknown_shape');
  if (present.length > 1) throw new CliError(3, 'ambiguous_envelope');
  const [kind] = present;

  const { version } = env;
  if (typeof version !== 'string' || version.length === 0) {
    throw new CliError(3, 'missing_version');
  }
  // Only the DSL envelope has its own version string.
  const expected = kind === 'ops' ? 'rwa-edit-dsl/1' : 'rwa-edit/1';
  if (version !== expected) {
    throw new CliError(3, 'version_mismatch', { expected, got: version });
  }

  if (kind !== 'doc') {
    return kind === 'edits' ? 'apply_edits' : 'apply_dsl_plan';
  }

  // replace_document extras. A non-string doc would otherwise be spliced into
  // the file as an empty body; `malformed_envelope` matches the bootstrap's
  // replaceDocument shape-check.
  if (typeof env.doc !== 'string') {
    throw new CliError(3, 'malformed_envelope', { reason: 'doc must be a string' });
  }
  const reasonOk = typeof env.reason === 'string' && env.reason.length !== 0;
  if (!reasonOk) {
    throw new CliError(3, 'missing_reason');
  }
  // An unpaired UTF-16 surrogate in doc/reason corrupts the durable file on encode.
  const wellFormed = isWellFormedStr(env.doc) && isWellFormedStr(env.reason);
  if (!wellFormed) {
    throw new CliError(3, 'malformed_envelope', { reason: 'lone_surrogate' });
  }
  return 'replace_document';
}
|
|
182
|
+
|
|
183
|
+
/**
 * Apply a tool envelope to a rewritable .html file on disk.
 *
 * Pipeline: read the file → extract the inline doc → validate the envelope →
 * compute the new doc for the envelope's shape → expand/guard image tokens →
 * splice the doc back into the file text and write it atomically.
 *
 * @param {string} filePath — absolute or relative path to the target .html
 * @param {object} envelope — apply_edits / apply_dsl_plan / replace_document envelope
 * @param {object} [opts]
 * @param {boolean} [opts.virtualImages] — the envelope is already in rwa-asset
 *   token form (rwa-edit-spec.md §19): the stored doc is virtualized so token
 *   anchors match, the edit is applied on that form, and tokens are expanded
 *   back before the write.
 * @param {boolean} [opts.virtualizeEnvelope] — the envelope carries expanded
 *   data: it is mapped into the asset map derived from the stored doc before
 *   the edit is applied on the token form.
 *   With neither option set the raw doc is used and the result is checked for
 *   asset tokens that were not present before.
 * @returns {Promise<{exitCode: 0}>}
 * @throws {CliError} exitCode 2 for file problems (not_found, read_error,
 *   not_a_rewritable); exitCode 3 for envelope, compile, apply, frozen-zone,
 *   image-token and size failures
 */
export async function applyPlan(filePath, envelope, opts = {}) {
  // Run applyEdits and translate its RwaEditError into the CLI error surface;
  // any other exception propagates untouched.
  const runEdits = (doc, edits) => {
    try {
      return applyEdits(doc, edits);
    } catch (err) {
      if (err instanceof RwaEditError) {
        throw new CliError(3, err.code, { editIndex: err.editIndex, ...err.context });
      }
      throw err;
    }
  };

  // 1. Read the file first, so file errors surface before plan errors.
  let fileText;
  try {
    fileText = await readFile(filePath, 'utf8');
  } catch (err) {
    const missing = err && err.code === 'ENOENT';
    if (missing) throw new CliError(2, 'not_found', { path: filePath });
    // Any other read failure (EACCES, EISDIR, …) gets its own subcode so the
    // user is not told the file is missing when it is not.
    throw new CliError(2, 'read_error', {
      path: filePath,
      errno: err && err.code,
      message: err && err.message,
    });
  }

  // 2. Pull out the inline doc body; a target without one is not a rewritable.
  let currentDoc;
  try {
    currentDoc = extractInlineDoc(fileText);
  } catch {
    throw new CliError(2, 'not_a_rewritable', { path: filePath });
  }

  // 3. Shape + version validation runs on the envelope as supplied.
  const shape = validateEnvelope(envelope);

  // images-v1 (rwa-edit-spec.md §19): in either virtual mode the stored doc is
  // tokenized, so every guard below compares virtual against virtual. In
  // virtualizeEnvelope mode the incoming envelope is tokenized into the SAME
  // asset map, registering any new image bytes it carries.
  const wantsVirtual = opts.virtualImages || opts.virtualizeEnvelope;
  const vimg = wantsVirtual ? virtualizeImages(currentDoc) : null;
  const plan = opts.virtualizeEnvelope ? mapEnvelopeImages(envelope, vimg.assets) : envelope;
  const workDoc = vimg ? vimg.doc : currentDoc;

  // 4. Compute the new doc for this envelope shape.
  let newDoc;
  switch (shape) {
    case 'replace_document': {
      newDoc = plan.doc;
      assertFrozenPreserved(workDoc, newDoc);
      break;
    }
    case 'apply_dsl_plan': {
      let compiled;
      try {
        compiled = compileDslPlan(plan, workDoc);
      } catch (err) {
        // `op` is passed through so --json consumers can point at the failing step.
        throw new CliError(3, err.code || 'dsl_compile_error', { message: err.message, op: err.op });
      }
      if (compiled.tool === 'replace_document') {
        // The DSL escape op must not bypass frozen zones either.
        newDoc = compiled.envelope.doc;
        assertFrozenPreserved(workDoc, newDoc);
      } else {
        newDoc = runEdits(workDoc, compiled.envelope.edits);
      }
      break;
    }
    default: {
      newDoc = runEdits(workDoc, plan.edits);
    }
  }

  // images-v1: virtual paths expand tokens back to real bytes (a failure here
  // happens before anything is written); raw paths instead reject a result
  // that introduces an asset token the current doc did not have.
  try {
    if (vimg) {
      newDoc = expandImages(newDoc, vimg.assets, vimg.orphans);
    } else {
      assertNoNewAssetTokens(currentDoc, newDoc);
    }
  } catch (err) {
    if (err instanceof RwaEditError) throw new CliError(3, err.code, { ...err.context });
    throw err;
  }

  // Expanded-size guard (virtual paths only): earlier caps measured the token
  // form, so the real, expanded doc is capped here.
  const tooLarge = vimg && newDoc.length > MAX_DOC_EXPANDED;
  if (tooLarge) {
    throw new CliError(3, 'target_size_exceeded', { expanded: true, length: newDoc.length, cap: MAX_DOC_EXPANDED });
  }

  // 5. Splice the new doc into the file text and write it back atomically
  //    (see ./atomic-write.mjs).
  await atomicWrite(filePath, replaceInlineDoc(fileText, newDoc));
  return { exitCode: 0 };
}
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
// SSRF-safe page fetcher for `rwa clone <url>`. The fetch layer only — the
|
|
2
|
+
// article extractor and the bootstrap wiring are separate modules.
|
|
3
|
+
//
|
|
4
|
+
// A user (or an agent) can pass any URL, so without guards `rwa clone
|
|
5
|
+
// http://169.254.169.254/…` or `http://127.0.0.1:…` could reach cloud-metadata
|
|
6
|
+
// endpoints or internal services. Defence is in three layers:
|
|
7
|
+
// 1. scheme allowlist (http/https only) — assertFetchableUrl
|
|
8
|
+
// 2. IP-literal classification (block private/etc.) — assertPublicIp
|
|
9
|
+
// 3. DNS-rebinding defence: resolve the hostname and re-classify EVERY
|
|
10
|
+
// resolved address; manual per-hop redirect re-validation (no
|
|
11
|
+
// redirect:'follow' — that would bypass the per-hop checks).
|
|
12
|
+
//
|
|
13
|
+
// Error surface (all exitCode 2 so the CLI maps them to the file/fetch class):
|
|
14
|
+
// subcode: 'bad_scheme', 'blocked_host', 'too_many_redirects', 'http_error',
|
|
15
|
+
// 'not_html', 'too_large', 'fetch_failed'.
|
|
16
|
+
//
|
|
17
|
+
// Mirrors the rigor of the seed bridge SSRF block (redirect:'error' +
|
|
18
|
+
// private-range rejection). Only node: built-ins + global fetch.
|
|
19
|
+
|
|
20
|
+
import { isIP } from 'node:net';
|
|
21
|
+
import { lookup } from 'node:dns/promises';
|
|
22
|
+
|
|
23
|
+
/**
 * Error raised by the clone/fetch layer. The subcode doubles as the Error
 * message; `exitCode` and `details` are carried along for the CLI.
 */
export class CloneError extends Error {
  /**
   * @param {number} exitCode — process exit code for this failure
   * @param {string} subcode — machine-readable failure code
   * @param {object} [details] — extra context, stored as given
   */
  constructor(exitCode, subcode, details = {}) {
    super(subcode);
    Object.assign(this, { exitCode, subcode, details });
  }
}
|
|
31
|
+
|
|
32
|
+
// --- IP classification ------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
// Turn a dotted-quad IPv4 literal into its four numeric octets. Anything that
// net.isIP does not classify as IPv4 yields null; the per-octet range check is
// kept as a second line of defence.
function parseV4(host) {
  if (isIP(host) !== 4) return null;
  const octets = [];
  for (const piece of host.split('.')) {
    const value = Number(piece);
    const valid = Number.isInteger(value) && value >= 0 && value <= 255;
    if (!valid) return null;
    octets.push(value);
  }
  return octets.length === 4 ? octets : null;
}
|
|
41
|
+
|
|
42
|
+
// Classify an IPv4 address given as an octet array. Returns the name of the
// blocked category ('unspecified', 'private', 'loopback', 'link-local',
// 'reserved') when the address falls in a range we refuse to fetch, or null
// for an address considered public. A category (not a boolean) is returned so
// the error message can name the reason. Only the first three octets are
// needed; a shorter array such as [255, 255] still classifies via the first.
function v4Category([a, b, c]) {
  if (a === 0) return 'unspecified';                          // 0.0.0.0/8
  if (a === 10) return 'private';                             // 10/8
  if (a === 127) return 'loopback';                           // 127/8
  if (a === 169 && b === 254) return 'link-local';            // 169.254/16 (incl. metadata)
  if (a === 172 && b >= 16 && b <= 31) return 'private';      // 172.16/12
  if (a === 192 && b === 168) return 'private';               // 192.168/16
  if (a === 100 && b >= 64 && b <= 127) return 'reserved';    // 100.64/10 CGNAT
  // 192.0.0.0/24 — IETF Protocol Assignments (RFC 6890): not globally
  // reachable as a block, so it belongs with the other special-purpose ranges.
  if (a === 192 && b === 0 && c === 0) return 'reserved';     // 192.0.0/24 IETF protocol assignments
  if (a === 192 && b === 0 && c === 2) return 'reserved';     // 192.0.2/24 TEST-NET-1
  if (a === 198 && (b === 18 || b === 19)) return 'reserved'; // 198.18/15 benchmarking
  if (a === 198 && b === 51 && c === 100) return 'reserved';  // 198.51.100/24 TEST-NET-2
  if (a === 203 && b === 0 && c === 113) return 'reserved';   // 203.0.113/24 TEST-NET-3
  if (a === 192 && b === 88 && c === 99) return 'reserved';   // 192.88.99/24 6to4 anycast
  if (a >= 224) return 'reserved';                            // 224/4 multicast + 240/4 reserved
  return null;
}
|
|
60
|
+
|
|
61
|
+
// Expand an IPv6 literal into its full 16 bytes, dep-free: handles `::`
// compression and an embedded dotted-quad tail (::ffff:a.b.c.d / ::a.b.c.d).
// Returns a 16-element byte array, or null if the input is not a v6 literal or
// cannot be expanded. Working on bytes (not string regexes) makes the dotted
// and hex spellings of one address — e.g. ::ffff:127.0.0.1 and ::ffff:7f00:1 —
// classify identically.
//
// A zone id suffix (`%eth0`, `%eth0.100`) is stripped before parsing: isIP can
// accept such literals, and the zone text may itself contain '.' or ':' which
// would otherwise be mistaken for an IPv4 tail or a group separator.
function expandV6(host) {
  if (isIP(host) !== 6) return null;
  let s = host.toLowerCase();
  const zoneAt = s.indexOf('%');
  if (zoneAt !== -1) s = s.slice(0, zoneAt);
  // Split out an embedded IPv4 tail (last group with dots) into two hex groups.
  const lastColon = s.lastIndexOf(':');
  const tail = s.slice(lastColon + 1);
  let v4Bytes = null;
  if (tail.includes('.')) {
    const quad = parseV4(tail);
    if (!quad) return null;
    v4Bytes = quad;
    s = s.slice(0, lastColon + 1); // keep trailing ':' so the group count stays right
  }
  // Split around the `::` compression point (at most one). The guards below are
  // belt-and-suspenders — isIP() already rejected malformed literals — but the
  // structure is re-checked here for defence-in-depth.
  const halves = s.split('::');
  if (halves.length > 2) return null;
  const splitGroups = (part) => (part === '' ? [] : part.split(':').filter((g) => g !== ''));
  const head = splitGroups(halves[0]);
  const tailGroups = halves.length === 2 ? splitGroups(halves[1]) : [];
  // Each remaining group is one 16-bit hex word; the v4 tail (if any) is 2 words.
  const v4Words = v4Bytes ? 2 : 0;
  const words = [];
  for (const g of head) words.push(Number.parseInt(g, 16));
  if (halves.length === 2) {
    const fill = 8 - head.length - tailGroups.length - v4Words;
    if (fill < 0) return null;
    for (let i = 0; i < fill; i++) words.push(0);
  }
  for (const g of tailGroups) words.push(Number.parseInt(g, 16));
  if (words.length !== 8 - v4Words) return null;
  // A group that did not parse as a 16-bit hex word means the literal is not
  // what we think it is — refuse rather than emit garbage bytes.
  if (words.some((w) => !Number.isInteger(w) || w < 0 || w > 0xffff)) return null;
  const bytes = [];
  for (const w of words) bytes.push((w >> 8) & 0xff, w & 0xff);
  if (v4Bytes) bytes.push(...v4Bytes);
  if (bytes.length !== 16) return null;
  return bytes;
}
|
|
104
|
+
|
|
105
|
+
// Classify an IPv6 literal. Returns:
//   • a category string ('loopback', 'unspecified', 'private', 'link-local',
//     'reserved') when the address must be blocked;
//   • 'mapped:<a.b.c.d>' when the address embeds an IPv4 address (IPv4-mapped,
//     IPv4-compatible, NAT64 64:ff9b::/96, 6to4 2002::/16) that the caller must
//     re-check through the v4 category logic;
//   • null for a public v6 address, or when `host` is not a v6 literal at all.
// A literal that isIP recognises as v6 but that cannot be expanded to bytes is
// reported as 'reserved': an address we cannot classify must not be treated as
// public (fail closed).
function v6Category(host) {
  const b = expandV6(host);
  if (!b) return isIP(host) === 6 ? 'reserved' : null;
  const allZeroThrough = (n) => b.slice(0, n).every((x) => x === 0);
  // IPv4-mapped ::ffff:a.b.c.d — first 10 bytes zero, bytes 11-12 = 0xff,0xff.
  if (allZeroThrough(10) && b[10] === 0xff && b[11] === 0xff) {
    return `mapped:${b[12]}.${b[13]}.${b[14]}.${b[15]}`;
  }
  // ::1 loopback / :: unspecified (must come before the v4-compatible check).
  if (allZeroThrough(15) && b[15] === 1) return 'loopback';
  if (b.every((x) => x === 0)) return 'unspecified';
  // Deprecated IPv4-compatible ::a.b.c.d — first 12 bytes zero, low 32 bits a
  // real v4. Re-check the embedded v4 the same way as the mapped form.
  if (allZeroThrough(12)) {
    return `mapped:${b[12]}.${b[13]}.${b[14]}.${b[15]}`;
  }
  // ff00::/8 — IPv6 multicast (mirrors the v4 224/4 block).
  if (b[0] === 0xff) return 'reserved';
  // fc00::/7 — Unique Local Addresses (fc.. and fd..).
  if ((b[0] & 0xfe) === 0xfc) return 'private';
  // fe80::/10 — link-local.
  if (b[0] === 0xfe && (b[1] & 0xc0) === 0x80) return 'link-local';
  // 2001:db8::/32 — documentation range (RFC 3849), never routable.
  if (b[0] === 0x20 && b[1] === 0x01 && b[2] === 0x0d && b[3] === 0xb8) return 'reserved';
  // NAT64 64:ff9b::/96 — bytes 0-1 = 00 64, 2-3 = ff 9b, bytes 4-11 zero; the
  // embedded v4 in bytes 12-15 is reachable through a NAT64 gateway. Re-check it.
  if (b[0] === 0x00 && b[1] === 0x64 && b[2] === 0xff && b[3] === 0x9b &&
      b.slice(4, 12).every((x) => x === 0)) {
    return `mapped:${b[12]}.${b[13]}.${b[14]}.${b[15]}`;
  }
  // 6to4 2002::/16 — bytes 0-1 = 20 02, the embedded v4 is bytes 2-5; reachable
  // through a 6to4 relay. Re-check the embedded v4.
  if (b[0] === 0x20 && b[1] === 0x02) {
    return `mapped:${b[2]}.${b[3]}.${b[4]}.${b[5]}`;
  }
  return null;
}
|
|
145
|
+
|
|
146
|
+
// Gate for one IP literal (v4 or v6), shared by the sync URL check and the
// async DNS check. Throws CloneError(2, 'blocked_host') when the address — or
// the IPv4 address embedded in a v6 literal — falls in a blocked category.
// Returns undefined for a public address and for input that is not an IP
// literal at all (the caller decides what to do with hostnames).
//   ip   — the literal to classify
//   host — name reported in the error details; defaults to the literal itself
function assertPublicIp(ip, host = ip) {
  const blocked = (category, message) =>
    new CloneError(2, 'blocked_host', { host, ip, category, message });

  switch (isIP(ip)) {
    case 4: {
      const category = v4Category(parseV4(ip));
      if (category) throw blocked(category, `blocked ${category} address ${ip}`);
      return;
    }
    case 6: {
      const category = v6Category(ip);
      if (!category) return; // public v6
      const marker = 'mapped:';
      if (!category.startsWith(marker)) {
        throw blocked(category, `blocked ${category} address ${ip}`);
      }
      const embedded = category.slice(marker.length);
      // [255,255] sentinel: should the embedded quad fail to re-parse, force a
      // blocking category (255 ⇒ a>=224 'reserved') instead of failing open.
      const embeddedCategory = v4Category(parseV4(embedded) || [255, 255]);
      if (embeddedCategory) {
        throw blocked(embeddedCategory, `blocked ${embeddedCategory} address ${embedded} (IPv4-mapped IPv6)`);
      }
      return; // v6 wrapping a public v4
    }
    default:
      return; // not an IP literal
  }
}
|
|
174
|
+
|
|
175
|
+
// --- URL gate (sync) --------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
// URL hostnames carry IPv6 literals in square brackets ("[::1]"); return the
// hostname without them. Hostnames that are not bracketed on both ends are
// returned unchanged.
function bareHost(hostname) {
  const bracketed = hostname.startsWith('[') && hostname.endsWith(']');
  if (!bracketed) return hostname;
  return hostname.slice(1, -1);
}
|
|
183
|
+
|
|
184
|
+
/**
 * Synchronous pre-flight for a URL: scheme allowlist plus classification of
 * loopback names and IP-literal hosts. DNS resolution is asynchronous and is
 * not performed here.
 *
 * Loopback names: `localhost`, its fully-qualified spelling `localhost.`, and
 * any name under the `.localhost` special-use domain (RFC 6761) are rejected
 * up front, so callers that use only this synchronous gate do not let them
 * through.
 *
 * @param {string} url — the URL to check
 * @returns {URL} the parsed URL on success
 * @throws {CloneError} exitCode 2 / 'bad_scheme' for an unparseable URL or a
 *   scheme other than http/https; exitCode 2 / 'blocked_host' for a loopback
 *   name or a non-public IP literal (the latter raised by assertPublicIp)
 */
export function assertFetchableUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new CloneError(2, 'bad_scheme', { url, message: 'unparseable URL (no valid scheme)' });
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    throw new CloneError(2, 'bad_scheme', { url, protocol: parsed.protocol,
      message: `unsupported scheme ${parsed.protocol} — only http/https allowed` });
  }
  const host = bareHost(parsed.hostname);
  // Compare without the optional root-label dot so "localhost." is caught too.
  const name = host.toLowerCase().replace(/\.$/, '');
  if (name === 'localhost' || name.endsWith('.localhost')) {
    throw new CloneError(2, 'blocked_host', { host, category: 'loopback',
      message: `blocked loopback host ${host}` });
  }
  if (isIP(host)) assertPublicIp(host);
  return parsed;
}
|
|
205
|
+
|
|
206
|
+
// --- fetch (async) ----------------------------------------------------------
|
|
207
|
+
|
|
208
|
+
// DNS gate for a hostname: resolve it and run every returned address through
// assertPublicIp, so a public-looking name that resolves to a private address
// is blocked. IP literals are skipped (the sync URL gate has already
// classified them). `lookupImpl` is injectable and is called as
// lookupImpl(host, { all: true }).
// Throws CloneError(2, 'fetch_failed') when the lookup fails or returns no
// addresses; a blocked address propagates whatever assertPublicIp throws.
async function assertHostResolvesPublic(host, lookupImpl = lookup) {
  if (isIP(host)) return;

  const lookupFailure = (message) => new CloneError(2, 'fetch_failed', { host, message });

  let resolved;
  try {
    resolved = await lookupImpl(host, { all: true });
  } catch (err) {
    throw lookupFailure(`DNS lookup failed: ${err.message}`);
  }
  if (!resolved.length) {
    throw lookupFailure('DNS lookup returned no addresses');
  }
  // Every address counts — one blocked entry rejects the whole host.
  for (const entry of resolved) {
    assertPublicIp(entry.address, host);
  }
}
|
|
223
|
+
|
|
224
|
+
// Shared SSRF-guarded fetch core for fetchPage (HTML) and fetchImageDataUri
// (images). Validates the URL and every redirect hop (assertFetchableUrl +
// assertHostResolvesPublic per hop, never redirect:'follow'), streams the body
// with a hard byte cap, and returns the raw bytes, the bare media type and the
// final URL. Content-type policy is the CALLER's job (this core is
// media-agnostic) — that and the `accept` header are the only places the two
// fetchers differ, so the image path cannot drift from the HTML path.
//
// Options:
//   maxBytes     - hard cap on body size (checked against content-length AND
//                  against the bytes actually streamed)
//   timeoutMs    - per-request timeout, applied to each hop via AbortSignal
//   maxRedirects - number of redirect hops that may be followed
//   accept       - value sent as the `accept` request header
//   deps         - optional injection seam: { lookup, fetchImpl }
//
// Returns { bytes: Uint8Array, mime: string, url: string }.
// Throws CloneError(2, subcode) with subcode 'fetch_failed',
// 'too_many_redirects', 'http_error' or 'too_large', plus whatever
// assertFetchableUrl / assertHostResolvesPublic throw for a rejected hop.
//
// NOTE(review): the host is validated through lookupImpl, but fetchImpl is
// handed the URL and presumably resolves the name again on its own — confirm
// whether that leaves a residual rebinding window between check and connect.
async function fetchValidatedBytes(url, { maxBytes, timeoutMs, maxRedirects, accept, deps = {} }) {
  const lookupImpl = deps.lookup || lookup;
  const fetchImpl = deps.fetchImpl || fetch;

  // Best-effort release of a response we are not going to read. An unread body
  // keeps the underlying connection checked out until GC, so every abandoned
  // response (redirect hop, HTTP error, oversized) is cancelled explicitly.
  // Failures are ignored on purpose: cleanup must never replace the real error.
  const discardBody = async (res) => {
    try {
      await res.body?.cancel?.();
    } catch {
      // nothing useful to do — the caller is already on an exit path
    }
  };

  let current = assertFetchableUrl(url);
  await assertHostResolvesPublic(bareHost(current.hostname), lookupImpl);

  let response;
  for (let hop = 0; ; hop++) {
    try {
      response = await fetchImpl(current.href, {
        redirect: 'manual',
        signal: AbortSignal.timeout(timeoutMs),
        headers: {
          'user-agent': 'rwa-clone/1.0 (+https://rewritable.ikangai.com)',
          'accept': accept,
        },
      });
    } catch (err) {
      throw new CloneError(2, 'fetch_failed', { url: current.href, message: err.message });
    }

    // 3xx with a Location → manual per-hop revalidation (NEVER redirect:'follow').
    if (response.status >= 300 && response.status < 400 && response.headers.get('location')) {
      // The redirect body is never used; free it before the next hop.
      await discardBody(response);
      if (hop >= maxRedirects) {
        throw new CloneError(2, 'too_many_redirects', { url: current.href, hops: hop + 1 });
      }
      let next;
      try {
        // Relative Location values are resolved against the current URL.
        next = new URL(response.headers.get('location'), current.href);
      } catch {
        throw new CloneError(2, 'fetch_failed', { url: current.href, message: 'malformed redirect Location' });
      }
      current = assertFetchableUrl(next.href);
      await assertHostResolvesPublic(bareHost(current.hostname), lookupImpl);
      continue;
    }
    break;
  }

  if (!response.ok) {
    await discardBody(response);
    throw new CloneError(2, 'http_error', { url: current.href, status: response.status });
  }

  const contentType = response.headers.get('content-type') || '';
  // Match the media type only — an unanchored substring test would wrongly pass
  // e.g. `image/svg+xml; charset=text/html` (a parameter that mentions text/html).
  const mime = contentType.split(';')[0].trim().toLowerCase();

  // content-length is advisory; we still cap the streamed bytes below.
  // (A missing header yields Number(null) === 0, which never exceeds the cap.)
  const declared = Number(response.headers.get('content-length'));
  if (Number.isFinite(declared) && declared > maxBytes) {
    await discardBody(response);
    throw new CloneError(2, 'too_large', { url: current.href, contentLength: declared, maxBytes });
  }

  // No stream available (e.g. an injected fetchImpl): buffer, then enforce the cap.
  if (!response.body) {
    const buf = await response.arrayBuffer();
    if (buf.byteLength > maxBytes) {
      throw new CloneError(2, 'too_large', { url: current.href, maxBytes });
    }
    return { bytes: new Uint8Array(buf), mime, url: current.href };
  }

  // Stream and cap — a lying or absent content-length cannot exhaust memory.
  const reader = response.body.getReader();
  const chunks = [];
  let total = 0;
  try {
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      total += value.byteLength;
      if (total > maxBytes) {
        // Stop the download, but do not let a failing cancel() turn the
        // 'too_large' verdict into a generic 'fetch_failed'.
        try {
          await reader.cancel();
        } catch {
          // ignored: the size violation is the error worth reporting
        }
        throw new CloneError(2, 'too_large', { url: current.href, maxBytes });
      }
      chunks.push(value);
    }
  } catch (err) {
    if (err instanceof CloneError) throw err;
    throw new CloneError(2, 'fetch_failed', { url: current.href, message: err.message });
  }

  // Concatenate the chunks into one contiguous buffer.
  const out = new Uint8Array(total);
  let off = 0;
  for (const c of chunks) { out.set(c, off); off += c.byteLength; }
  return { bytes: out, mime, url: current.href };
}
|
|
319
|
+
|
|
320
|
+
// Fetch an HTML document through the shared validated-fetch core and return
// it decoded as UTF-8 text.
//
// Options: maxBytes (body cap), timeoutMs, maxRedirects, and `deps` — a
// testing-only injection seam forwarded untouched to the core, so real callers
// keep the plain (url, options) signature.
//
// Throws CloneError(2, 'not_html') when the response media type is neither
// text/html nor application/xhtml+xml; errors from the core propagate as-is.
export async function fetchPage(url, { maxBytes = 3_000_000, timeoutMs = 15000, maxRedirects = 5, deps = {} } = {}) {
  const fetched = await fetchValidatedBytes(url, {
    maxBytes,
    timeoutMs,
    maxRedirects,
    accept: 'text/html,application/xhtml+xml',
    deps,
  });

  const isHtml = fetched.mime === 'text/html' || fetched.mime === 'application/xhtml+xml';
  if (!isHtml) {
    throw new CloneError(2, 'not_html', { url: fetched.url, contentType: fetched.mime });
  }

  const decoder = new TextDecoder('utf-8');
  return decoder.decode(fetched.bytes);
}
|
|
331
|
+
|
|
332
|
+
// Image localization (rwa clone --localize-images).
//
// Downloads ONE image URL via the same SSRF-guarded core used for HTML and
// hands it back as a `data:image/<type>;base64,…` URI; any failure surfaces as
// a CloneError. Only image/* types on the allow-list below are accepted —
// raster formats plus svg+xml (`<img src>` renders SVG in no-script image
// mode, the same allowance import.mjs makes). The CLI has no canvas, so the
// bytes are inlined RAW (no recompression) and bounded only by maxBytes.
const IMG_MIME_RE = /^image\/(png|jpeg|gif|webp|avif|svg\+xml|bmp|x-icon|vnd\.microsoft\.icon)$/;

// Options: maxBytes (body cap), timeoutMs, maxRedirects, and `deps`, which is
// forwarded untouched to the fetch core.
// Throws CloneError(2, 'not_image') when the media type is not on the allow-list.
export async function fetchImageDataUri(url, { maxBytes = 2_000_000, timeoutMs = 15000, maxRedirects = 5, deps = {} } = {}) {
  const fetched = await fetchValidatedBytes(url, {
    maxBytes,
    timeoutMs,
    maxRedirects,
    accept: 'image/*',
    deps,
  });

  const allowed = IMG_MIME_RE.test(fetched.mime);
  if (!allowed) {
    throw new CloneError(2, 'not_image', { url: fetched.url, contentType: fetched.mime });
  }

  const payload = Buffer.from(fetched.bytes).toString('base64');
  return `data:${fetched.mime};base64,${payload}`;
}
|