rewritable 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,664 @@
1
+ // rwa-edit/1 apply-edits — hand-mirrored from seeds/rewritable.html's
2
+ // applyEdits pipeline (search `async function applyEdits`). Read alongside rwa-edit-spec.md §5
3
+ // (apply_edits semantics) and §7 (frozen zones).
4
+ //
5
+ // Differences from the seed, called out so future maintainers don't expect
6
+ // strict parity:
7
+ // 1. Seed collapses reserved-marker hits and zone-crossing hits both into
8
+ // `frozen_zone_violation`. CLI splits them: `reserved_substring` for
9
+ // a find/replace that *contains* a marker substring, and
10
+ // `frozen_zone_violation` for an edit whose find-range overlaps a
11
+ // marker-form frozen zone. The plan (Task 4/5 dispatch) is keyed on
12
+ // these distinct codes.
13
+ // 2. Seed enforces `data-rwa-frozen` attribute-form zones via a DOMParser
14
+ // snapshot of [data-rwa-frozen] elements. The CLI mirrors that guard
15
+ // parser-free (offline-first, no jsdom): `dataRwaFrozenSnapshot` captures
16
+ // each frozen element as `tag\0outerHTML` (sorted), and applyEdits rejects
17
+ // a batch that changes the set (`frozen_zone_violation`, `form:'attribute'`)
18
+ // — covering BOTH marker-form and attribute-form now. Reserved-substring
19
+ // detection still blocks edits that mention `data-rwa-frozen` literally.
20
+ // The seed's DOMParser handles edge cases (a `>` inside a quoted attribute
21
+ // value) that the CLI's pragmatic regex matcher does not; the before/after
22
+ // snapshot is relative, so a consistent mis-parse of an UNCHANGED element
23
+ // still compares equal. KEEP IN STEP with the seed (search
24
+ // `function dataRwaFrozenSnapshot`).
25
+ // 3. Seed's structural-shape check uses DOMParser + executable-script-
26
+ // type filtering + top-level-tag-types set. CLI v1 uses regex counting
27
+ // of <script>/<style> tags — enough to catch the realistic accidental-
28
+ // damage signal (a model emitting an inline <script> in a content
29
+ // edit) without pulling in a parser.
30
+ //
31
+ // ## Other known v1 scope-downs vs seed
32
+ //
33
+ // The seed (search `async function applyEdits` in seeds/rewritable.html) enforces additional
34
+ // invariants the CLI does NOT enforce in v1. Tracked in cli/TODO.md for v2:
35
+ //
36
+ // - MAX_REPLACE = 8KB per-edit cap — now enforced: applyEdits throws 'replace_too_large'
37
+ // - MAX_DOC = 1MB whole-doc cap (seed throws 'target_size_exceeded')
38
+ // - isWellFormed lone-surrogate guard on find/replace/doc — find/replace now enforced in applyEdits
39
+ // - canonLF normalization of find/replace before matching — now enforced:
40
+ // applyEdits LF-canonicalizes the doc and every find/replace, so
41
+ // CRLF-containing anchors match in the CLI as they do in the browser
42
+ // - Class-lock violation check on apply_edits (class_lock_violation — an edit
43
+ // find-range crossing a .rwa-locked subtree) — now enforced in applyEdits via
44
+ // lockedRangesIn. The replace_document coverage check (class_lock_uncovered) IS
45
+ // also enforced — see edit.mjs assertFrozenPreserved + the exported lockedRangesIn/markerZoneRangesIn.
46
+ // - Reserved-id violation (reserved_id_used) — including data-rwa-id injection
47
+ // - HTML parse-validity post-apply (parse_error_post_apply)
48
+
49
// Error type thrown by the edit pipeline. The failure code doubles as the
// Error message; `editIndex` is the position of the offending edit in the batch
// (null when the failure is not tied to a single edit) and `context` carries
// code-specific details for the caller.
export class RwaEditError extends Error {
  constructor(code, editIndex = null, context = {}) {
    super(code);
    // Assigned in a fixed order so the own-property layout stays stable.
    Object.assign(this, { code, editIndex, context });
  }
}
57
+
58
+ // Size caps — mirror of the seed's RWA_EDIT (search `MAX_REPLACE:` in
59
+ // seeds/rewritable.html). MAX_REPLACE is the per-edit `replace` cap;
60
+ // MAX_DOC is the whole-document cap after the batch applies. With images-v1
61
+ // these are measured on the VIRTUAL (rwa-asset token) form when the caller
62
+ // virtualizes — a text budget, never a pixel budget (rwa-edit-spec.md §19).
63
+ const MAX_REPLACE = 8 * 1024;
64
+ const MAX_DOC = 1024 * 1024;
65
+ // Real-bytes whole-document cap for the image paths, where MAX_DOC measures the
66
+ // VIRTUAL (token) form. Mirrors the GUI's container budget (RWA_IMG.FILE_STOP);
67
+ // authoritative server-side on the hosted /modify path (rwa-edit-spec.md §19).
68
+ export const MAX_DOC_EXPANDED = 10 * 1024 * 1024;
69
+
70
+ // LF canonicalization — mirror of the seed's canonLF. The seed normalizes the
71
+ // doc AND every find/replace to LF before matching, so a CRLF document or a
72
+ // CRLF-containing anchor behaves identically in the CLI and the browser.
73
+ // Without this a CRLF doc + LF anchor (or vice versa) spuriously misses.
74
// Normalize every line ending to LF. null/undefined become ''; any other
// non-string is stringified first. A single pass handles both CRLF pairs and
// lone CRs (`\r\n?` consumes the LF when one follows the CR).
const canonLF = (s) => {
  if (s == null) return '';
  return String(s).replace(/\r\n?/g, '\n');
};
75
+
76
+ // UTF-16 well-formedness — a lone surrogate in find/replace becomes U+FFFD on
77
+ // UTF-8 encode (the durable file write) and silently corrupts byte-equality.
78
+ // Mirror of the seed's isWellFormed guard. String.prototype.isWellFormed is
79
+ // Node 20+; treat its absence as "no check available."
80
// True unless `s` is a string that the runtime can prove contains a lone
// surrogate. Non-strings, and runtimes without String.prototype.isWellFormed,
// pass through as "well formed" because no check is available.
const isWellFormed = (s) => {
  if (typeof s !== 'string') return true;
  if (typeof s.isWellFormed !== 'function') return true;
  return s.isWellFormed();
};
81
+
82
+ // Plain-English, code-keyed recovery hints. Self-documenting failures: an agent
83
+ // (or `rwa edit --json` consumer) gets one actionable line, not just a code.
84
+ // A static lookup — never a model call (Rule 5). Keep in sync with the seed's
85
+ // FAILURE_HINTS (failureToToolResult). No angle brackets / reserved markers in
86
+ // the strings, so they stay safe to embed in the seed bootstrap and survive the
87
+ // CLI tree's reserved-marker scan.
88
+ export const FAILURE_HINTS = {
89
+ find_not_found: 'find must match the document byte-for-byte (whitespace and case included). If a closest match is shown, copy it exactly; otherwise pick a shorter, distinctive anchor.',
90
+ find_not_unique: 'find appears more than once. Extend it with neighbouring text until it is unique; the hints list shows where.',
91
+ frozen_zone_violation: 'This region is an author-protected frozen zone. Anchor on a different region — frozen zones change only by editing the file outside the runtime.',
92
+ reserved_substring: 'find or replace contains a reserved rwa marker. Anchor on ordinary document text instead.',
93
+ structural_shape_changed: 'The edit would change the document script/style tag count. Keep edits content-only, or use a structural plan.',
94
+ replace_too_large: 'replace exceeds the per-edit size cap. Split the change into smaller anchored edits.',
95
+ empty_find: 'find must be a non-empty string — provide the exact text to anchor on.',
96
+ parse_error_post_apply: 'The result was not well-formed HTML — check that the tags in replace are balanced.',
97
+ unknown_asset_reference: 'src uses an rwa-asset: token that does not exist in this document. Copy tokens verbatim from existing <img> tags; never invent or edit them.',
98
+ };
99
+
100
+ // ─── Image-asset virtualization (images-v1) ─────────────────────────
101
+ // Hand-mirror of the seed block beside containsReservedMarker in
102
+ // seeds/rewritable.html (rwaAssetHash8/registerImageAsset/virtualizeImages/
103
+ // virtualizeWithMap/expandImages/assertNoNewAssetTokens). Normative contract:
104
+ // rwa-edit-spec.md §19. KEEP IN STEP with the seed.
105
+ //
106
+ // The model never sees image bytes: `rwa edit <instruction>` builds its prompt
107
+ // from the VIRTUAL doc (data:image src → rwa-asset:<hash8> token) and the
108
+ // apply expands tokens back before the file write. Hash-keyed (FNV-1a — token
109
+ // identity/dedupe, not integrity), so tokens are stable across moves and the
110
+ // map can be re-derived deterministically from the same doc bytes.
111
// 32-bit FNV-1a over the UTF-16 code units of `s`, rendered as lowercase hex
// padded to at least 8 digits. Used for token identity/dedupe, not integrity.
export function rwaAssetHash8(s) {
  const FNV_OFFSET = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let hash = FNV_OFFSET;
  let pos = 0;
  while (pos < s.length) {
    hash ^= s.charCodeAt(pos);
    // Math.imul keeps the multiply in 32-bit space; >>> 0 makes it unsigned.
    hash = Math.imul(hash, FNV_PRIME) >>> 0;
    pos += 1;
  }
  return hash.toString(16).padStart(8, '0');
}
116
+ const RWA_ASSET_SRC_RE = /(\bsrc\s*=\s*)(["'])(data:image\/[^"']*)\2/g;
117
+ const RWA_ASSET_TOKEN_RE = /(\bsrc\s*=\s*)(["'])(rwa-asset:[0-9a-f]{8,})\2/g;
118
// Register `uri` in the `assets` map (token -> uri) and return its token.
// The first candidate is the hash of the URI itself; if that token is already
// bound to a DIFFERENT URI, the URI is re-salted with '\0' + attempt number
// and hashed again until a free (or same-URI) token is found.
export function registerImageAsset(assets, uri) {
  for (let attempt = 1; ; attempt++) {
    const salted = attempt === 1 ? uri : uri + '\0' + attempt;
    const token = 'rwa-asset:' + rwaAssetHash8(salted);
    const collides = assets.has(token) && assets.get(token) !== uri;
    if (collides) continue;
    assets.set(token, uri);
    return token;
  }
}
126
// Replace every quoted `src=data:image/...` URI in `doc` with an rwa-asset
// token, registering each URI in `assets` (a fresh Map when none is supplied).
// Returns { doc, assets, orphans } where `orphans` is the set of rwa-asset
// tokens that were already present in the raw input.
export function virtualizeImages(doc, assets) {
  const registry = assets || new Map();

  // Tokens already in the raw doc map to nothing; collect them so expansion
  // can pass them through instead of throwing.
  const orphans = new Set();
  RWA_ASSET_TOKEN_RE.lastIndex = 0; // shared /g regex — always rewind first
  for (let hit = RWA_ASSET_TOKEN_RE.exec(doc); hit !== null; hit = RWA_ASSET_TOKEN_RE.exec(doc)) {
    orphans.add(hit[3]);
  }

  const tokenize = (_whole, prefix, quote, uri) => prefix + quote + registerImageAsset(registry, uri) + quote;
  return { doc: doc.replace(RWA_ASSET_SRC_RE, tokenize), assets: registry, orphans };
}
138
+ // URI→token substitution for ANY string (a doc slice virtualizes to the
139
+ // corresponding vdoc slice as long as it doesn't cut a URI in half).
140
// URI -> token substitution over an arbitrary string, using an existing
// `assets` map (token -> uri). Returns `s` unchanged when it is empty or no
// assets are registered.
//
// Substitution runs longest-URI-first: this is a raw substring replace, so if
// one registered URI is a prefix of another, replacing the shorter one first
// would rewrite the head of the longer URI and leave `token + leftover bytes`
// instead of the longer URI's own token. The sort is stable, so URIs of equal
// length keep map insertion order.
export function virtualizeWithMap(s, assets) {
  if (!s || !assets || assets.size === 0) return s;
  const longestFirst = [...assets].sort((a, b) => b[1].length - a[1].length);
  let out = s;
  for (const [token, uri] of longestFirst) {
    if (!uri) continue; // an empty separator would splice the token between every character
    out = out.split(uri).join(token);
  }
  return out;
}
146
// Inverse of virtualization: swap every quoted `src=rwa-asset:<hash>` token in
// `vdoc` back to the URI registered in `assets`. A token with no registered
// URI is left untouched when it is listed in `orphans`; otherwise the call
// throws RwaEditError('unknown_asset_reference') carrying the token.
export function expandImages(vdoc, assets, orphans) {
  const restore = (whole, prefix, quote, token) => {
    const uri = assets ? assets.get(token) : null;
    if (uri != null) return prefix + quote + uri + quote;
    if (orphans && orphans.has(token)) return whole;
    throw new RwaEditError('unknown_asset_reference', null, { token });
  };
  return vdoc.replace(RWA_ASSET_TOKEN_RE, restore);
}
156
+ // Tokenize the data:image URIs inside an EXPANDED envelope's find/replace (and
157
+ // the replace_document `doc`), registering each into the shared `assets` map so
158
+ // expansion can resolve them afterward. Used by the hosted /modify path
159
+ // (rwa-edit-spec.md §19, opts.virtualizeEnvelope): the client relays an expanded
160
+ // envelope, the server tokenizes it against a map seeded from the stored doc so
161
+ // the apply runs on the token form (caps = text budget) and new image bytes ride
162
+ // in via the envelope's own URIs. Returns a NEW envelope; the input is untouched.
163
// Tokenize the data:image URIs carried by an envelope. For an `edits` array
// each edit's find/replace is virtualized; otherwise a string `doc` is
// virtualized. Anything else is returned as-is. The input envelope is never
// mutated — changed envelopes are shallow copies.
export function mapEnvelopeImages(envelope, assets) {
  // Missing/empty fields are virtualized as '' so the result is always a string.
  const tokenize = (text) => virtualizeImages(text || '', assets).doc;

  if (Array.isArray(envelope.edits)) {
    const edits = envelope.edits.map((edit) => ({
      ...edit,
      find: tokenize(edit.find),
      replace: tokenize(edit.replace),
    }));
    return { ...envelope, edits };
  }

  if (typeof envelope.doc === 'string') {
    return { ...envelope, doc: tokenize(envelope.doc) };
  }

  return envelope;
}
173
+ // No-assets writers must not introduce a NEW rwa-asset token — a token with no
174
+ // bytes behind it is a permanently broken image; committing one silently is the
175
+ // failure mode Rule 12 forbids. Tokens already in the current doc stay legal.
176
// Throw RwaEditError('unknown_asset_reference') for the first rwa-asset token
// in `work` that does not already occur in `currentDoc`. Returns nothing when
// every token in `work` is already known.
export function assertNoNewAssetTokens(currentDoc, work) {
  const known = new Set();
  RWA_ASSET_TOKEN_RE.lastIndex = 0; // shared /g regex — rewind before each scan
  for (let hit = RWA_ASSET_TOKEN_RE.exec(currentDoc); hit !== null; hit = RWA_ASSET_TOKEN_RE.exec(currentDoc)) {
    known.add(hit[3]);
  }

  RWA_ASSET_TOKEN_RE.lastIndex = 0;
  for (let hit = RWA_ASSET_TOKEN_RE.exec(work); hit !== null; hit = RWA_ASSET_TOKEN_RE.exec(work)) {
    const token = hit[3];
    if (known.has(token)) continue;
    throw new RwaEditError('unknown_asset_reference', null, { token });
  }
}
186
+
187
+ // Source of truth: seeds/rewritable.html RWA_EDIT.RESERVED (line ~1608).
188
+ // The string-concat trick on the comment/attribute markers prevents this
189
+ // source file itself from tripping reserved-marker scans run over the CLI
190
+ // tree.
191
// Reserved substrings. The comment-form entries are assembled by string
// concatenation so this source file never contains them verbatim.
const RESERVED_MARKERS = [
  'rwa:frozen:begin',
  'rwa:frozen:end',
  '<' + '!-- rwa:',
  '/*' + ' rwa:',
  '//' + ' rwa:',
  'data-rwa-frozen',
];

// True when `s` contains any reserved marker; false for empty/missing input.
export function containsReservedMarker(s) {
  if (!s) return false;
  return RESERVED_MARKERS.some((marker) => s.includes(marker));
}
205
+
206
// Count non-overlapping occurrences of `needle` in `haystack`, scanning left
// to right. An empty/missing needle counts as zero.
function countOccurrences(haystack, needle) {
  if (!needle) return 0;
  let total = 0;
  for (
    let at = haystack.indexOf(needle);
    at !== -1;
    at = haystack.indexOf(needle, at + needle.length) // resume past this hit
  ) {
    total += 1;
  }
  return total;
}
212
+
213
+ // Surrounding-context snippets for find_not_unique — mirrors the seed's
214
+ // nearbySnippets so `rwa edit --json` (and the agent loop) can disambiguate.
215
+ // Source of truth: seeds/rewritable.html nearbySnippets (~line 1783).
216
// Collect up to `max` non-overlapping occurrences of `needle` in `haystack`,
// each as { pos, before, after } where before/after are at most `ctx`
// characters of surrounding text (clamped to the string bounds).
function nearbySnippets(haystack, needle, max = 3, ctx = 40) {
  const snippets = [];
  let from = 0;
  for (;;) {
    const pos = haystack.indexOf(needle, from);
    if (pos === -1 || snippets.length >= max) break;
    const tail = pos + needle.length;
    snippets.push({
      pos,
      before: haystack.slice(Math.max(0, pos - ctx), pos),
      after: haystack.slice(tail, Math.min(haystack.length, tail + ctx)),
    });
    from = tail;
  }
  return snippets;
}
226
+
227
+ // Deterministic near-miss finder for find_not_found. Given a `find` that does
228
+ // NOT appear verbatim in `doc`, return a context fragment {closest, match}
229
+ // describing the closest actual text so an agent (or human) can self-correct
230
+ // the anchor in one retry — no model call (Rule 5). Returns {} when nothing
231
+ // useful is found. Cold path (failure only), so an O(n) projection is fine.
232
+ // Source of truth: seeds/rewritable.html findClosestAnchor — keep in sync.
233
// Near-miss finder for a `find` that does not occur verbatim in `doc`.
// Returns {} when nothing useful is found; otherwise { closest, match } (plus
// truncated:true when `closest` had to be elided), where `match` is:
//   'whitespace' — equal once whitespace runs are collapsed to one space
//   'case'       — equal once additionally case-folded
//   'partial'    — only a prefix of the needle (12+ chars) occurs; `closest`
//                  is that prefix plus up to 40 chars of what follows in doc
function findClosestAnchor(doc, find) {
  if (!doc || !find) return {};
  const needleNorm = find.replace(/[ \t\n\r\f]+/g, ' ').trim();
  if (!needleNorm) return {};

  // Length-preserving case fold, applied per UTF-16 unit: a unit whose
  // lowercase form is not exactly one unit is kept as-is. The doc projection
  // AND the needle must use this same rule — a whole-string toLowerCase() can
  // change the length (U+0130 lowercases to two units), which would put the
  // needle out of step with lowNorm and the offset map.
  const foldUnit = (c) => {
    const lc = c.toLowerCase();
    return lc.length === 1 ? lc : c;
  };

  // Whitespace-collapsed projection of `doc` with an offset map back to the
  // original (map[k] = source index of norm[k]; a whitespace run collapses to
  // one space mapped to its first char). lowNorm mirrors norm unit-for-unit.
  let norm = '', lowNorm = '';
  const map = [];
  let inWs = false;
  for (let i = 0; i < doc.length; i++) {
    const c = doc[i];
    if (c === ' ' || c === '\t' || c === '\n' || c === '\r' || c === '\f') {
      if (!inWs) { norm += ' '; lowNorm += ' '; map.push(i); inWs = true; }
    } else {
      norm += c;
      lowNorm += foldUnit(c);
      map.push(i);
      inWs = false;
    }
  }

  // Cap the payload. An elided result only LOCATES the region — it is not
  // byte-for-byte re-appliable — so it is flagged truncated:true.
  const MAX = 300;
  const mk = (raw, match) => raw.length <= MAX
    ? { closest: raw, match }
    : { closest: raw.slice(0, MAX - 18) + ' …[' + (raw.length - MAX) + ' more]… ', match, truncated: true };
  // Original-text slice covering norm[k .. k+normLen-1]; the needle is trimmed,
  // so both ends land on non-whitespace characters.
  const span = (k, normLen) => doc.slice(map[k], map[k + normLen - 1] + 1);

  // Pass 1 — whitespace-only mismatch (verbatim normalized match).
  let k = norm.indexOf(needleNorm);
  if (k !== -1) return mk(span(k, needleNorm.length), 'whitespace');

  // Pass 2 — case (± whitespace) mismatch. The folded needle has exactly
  // needleNorm.length units, so the span length below is correct.
  let needleLow = '';
  for (let i = 0; i < needleNorm.length; i++) needleLow += foldUnit(needleNorm[i]);
  k = lowNorm.indexOf(needleLow);
  if (k !== -1) return mk(span(k, needleNorm.length), 'case');

  // Pass 3 — partial: longest matching prefix of the needle (floor 12 chars).
  // "Prefix of length L occurs" is monotonic in L, so binary-search it.
  const FLOOR = 12;
  if (needleNorm.length >= FLOOR) {
    let lo = FLOOR, hi = needleNorm.length, best = -1, bestK = -1;
    while (lo <= hi) {
      const mid = (lo + hi) >> 1;
      const j = norm.indexOf(needleNorm.slice(0, mid));
      if (j !== -1) { best = mid; bestK = j; lo = mid + 1; } else { hi = mid - 1; }
    }
    if (best !== -1) {
      const start = map[bestK];
      const matchEnd = map[bestK + best - 1] + 1;
      const ctxEnd = Math.min(doc.length, matchEnd + 40); // show where it diverges
      return mk(doc.slice(start, ctxEnd), 'partial');
    }
  }

  return {};
}
296
+
297
+ // Extract marker-form frozen zones. Returns array of
298
+ // `{ start, end, name }` covering the entire span from the opening
299
+ // `<!-- rwa:frozen:begin <name> -->` to the closing
300
+ // `<!-- rwa:frozen:end <name> -->` (inclusive of both markers).
301
+ //
302
+ // Scoped to the HTML-comment form. Source-of-truth seed also handles
303
+ // `/* rwa:frozen:* */` and `// rwa:frozen:*` (script/JS-comment forms);
304
+ // for the CLI v1 those are deferred — they were a niche need on the seed
305
+ // side and the substrate is the doc the CLI edits, not the bootstrap.
306
// Locate HTML-comment-form frozen zones. Each result is { start, end, name }
// where start is the index of the begin marker and end is the index just past
// the matching end marker. A begin marker with no later end marker of the same
// name is skipped silently.
export function findFrozenZones(doc) {
  const found = [];
  const beginRe = /<!--\s*rwa:frozen:begin\s+([A-Za-z0-9_-]+)\s*-->/g;
  for (let begin = beginRe.exec(doc); begin !== null; begin = beginRe.exec(doc)) {
    const name = begin[1];
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const endRe = new RegExp('<!--\\s*rwa:frozen:end\\s+' + escapedName + '\\s*-->', 'g');
    // Only look for the end marker after the begin marker's own text.
    endRe.lastIndex = begin.index + begin[0].length;
    const end = endRe.exec(doc);
    if (end === null) continue;
    found.push({ start: begin.index, end: end.index + end[0].length, name });
  }
  return found;
}
324
+
325
+ // Regex-escape a dynamic literal (zone name) before embedding it in a RegExp.
326
+ // Mirror of the seed's escapeRegex. Zone names are [A-Za-z0-9_-]+ today so this
327
+ // is belt-and-suspenders, but keeping it shared means the three fence-form
328
+ // builders below stay byte-aligned with the seed and with each other.
329
// Backslash-escape every RegExp metacharacter in `s` so it can be embedded in
// a pattern as a literal.
function escapeRegex(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (meta) => '\\' + meta);
}
330
+
331
+ // Full 3-fence-form frozen-zone scan — faithful mirror of the seed's
332
+ // extractFrozenZones (seeds/rewritable.html, search `function extractFrozenZones`).
333
+ // Returns one entry per begin-marker: { name, inner } for a terminated zone, or
334
+ // { name, error: 'unterminated' | 'duplicate' }. This is the canonical scan the
335
+ // replace_document guard uses for byte-preservation, add-rejection, unterminated
336
+ // AND duplicate detection — across <!-- -->, /* */ and // fence forms — so the
337
+ // escape hatch can't silently drop, mint, half-open, or shadow-duplicate a zone
338
+ // in any fence form. (findFrozenZones above stays comment-form-only on purpose:
339
+ // it is the REPORTING source for `rwa doc`/`ls` frozenZones, where SD-04 pins it
340
+ // to the seed's reporting projection. This scan is the ENFORCEMENT source.)
341
+ // KEEP IN STEP with the seed.
342
// Scan `doc` for frozen-zone begin markers in all three fence forms
// (<!-- -->, /* */ and //). One entry is produced per begin marker, in
// document order:
//   { name, inner }                — terminated; `inner` is the text between fences
//   { name, error: 'unterminated' } — no matching end marker follows
//   { name, error: 'duplicate' }    — terminated, but the name was already used
// Returns [] for empty/missing input.
export function extractFrozenZones3(doc) {
  const zones = [];
  if (!doc) return zones;
  const namesSeen = new Set();
  const beginRe = /(<!--|\/\*|\/\/)\s*rwa:frozen:begin\s+([A-Za-z0-9_-]+)\s*(-->|\*\/|(?=\r?\n|$))/g;

  for (let begin = beginRe.exec(doc); begin !== null; begin = beginRe.exec(doc)) {
    const [, opener, name] = begin;
    let innerStart = begin.index + begin[0].length;
    if (opener === '//') {
      // Line-comment form: the zone body starts on the line after the marker.
      const newlineAt = doc.indexOf('\n', innerStart);
      innerStart = newlineAt === -1 ? doc.length : newlineAt + 1;
    }

    // The end marker must use the same fence form as the begin marker.
    const escapedName = escapeRegex(name);
    let endSource;
    switch (opener) {
      case '<!--':
        endSource = '<!--\\s*rwa:frozen:end\\s+' + escapedName + '\\s*-->';
        break;
      case '/*':
        endSource = '\\/\\*\\s*rwa:frozen:end\\s+' + escapedName + '\\s*\\*\\/';
        break;
      default:
        endSource = '\\/\\/\\s*rwa:frozen:end\\s+' + escapedName + '(?=\\r?\\n|$)';
    }
    const endRe = new RegExp(endSource, 'g');
    endRe.lastIndex = innerStart;
    const end = endRe.exec(doc);

    // Unterminated is reported before duplicate, and neither claims the name.
    if (end === null) {
      zones.push({ name, error: 'unterminated' });
    } else if (namesSeen.has(name)) {
      zones.push({ name, error: 'duplicate' });
    } else {
      namesSeen.add(name);
      zones.push({ name, inner: doc.slice(innerStart, end.index) });
    }
  }
  return zones;
}
370
+
371
+ // Detect an unterminated marker-form frozen zone (a begin marker with no
372
+ // matching end), across all three fence forms. Thin projection of
373
+ // extractFrozenZones3 so the standalone check and the full guard can never
374
+ // disagree. Returns the offending zone name, or null. KEEP IN STEP with the seed.
375
// Name of the first frozen zone (any fence form) whose begin marker has no
// matching end marker, or null when every zone is terminated.
export function unterminatedFrozenMarker(doc) {
  for (const zone of extractFrozenZones3(doc)) {
    if (zone.error === 'unterminated') return zone.name;
  }
  return null;
}
379
+
380
// Return the first zone whose [start, end) range overlaps the first occurrence
// of `find` in `doc`, or null when `find` is absent or overlaps no zone.
// Merely touching a zone boundary (edit ends where the zone starts, or starts
// where it ends) does not count as an overlap.
function editCrossesFrozenZone(doc, find, zones) {
  const editStart = doc.indexOf(find);
  if (editStart === -1) return null;
  const editEnd = editStart + find.length;
  const hit = zones.find((zone) => editStart < zone.end && editEnd > zone.start);
  return hit === undefined ? null : hit;
}
392
+
393
+ // Void HTML elements have no closing tag, so the depth-matcher below must not
394
+ // scan to EOF looking for a close that never comes.
395
+ const VOID_ELEMENTS = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img',
396
+ 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']);
397
+
398
+ // Index just past the matching `</tag>` for an element opened at `from`,
399
+ // tracking nested same-tag depth so a naive "next close" can't stop early.
400
+ // -1 if unterminated. Mirror of the seed's findCloseTagEnd: EVERY non-close
401
+ // open of `tag` increments depth — including a self-closing `<tag/>`, because
402
+ // for the non-void container tags this is called with (void tags are guarded
403
+ // before the call), HTML ignores the trailing slash and treats it as an open.
404
+ // (A prior CLI deviation exempted `<tag/>`, diverging from the seed on
405
+ // malformed self-closing same-tag nesting — removed for parity.)
406
// Index just past the `</tag>` that closes an element whose opening tag ended
// at `from`, or -1 when no such close exists. Every later open of the same tag
// (case-insensitive, including `<tag/>`) raises the nesting depth, so an inner
// same-tag close cannot end the scan early.
function matchingCloseEnd(doc, tag, from) {
  const escapedTag = tag.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const scanner = new RegExp('<(/?)' + escapedTag + '\\b[^>]*>', 'gi');
  scanner.lastIndex = from;
  let openCount = 1; // the element we are closing is already open
  for (let hit = scanner.exec(doc); hit !== null; hit = scanner.exec(doc)) {
    if (hit[1] !== '/') {
      openCount += 1;
      continue;
    }
    openCount -= 1;
    if (openCount === 0) return hit.index + hit[0].length;
  }
  return -1;
}
416
+
417
+ // True iff `openTag` carries data-rwa-frozen as an actual attribute NAME — not
418
+ // inside a quoted value (class="data-rwa-frozen") and not a prefix of a longer
419
+ // name (data-rwa-frozen-note). Mirror of the seed's tagHasFrozenAttr
420
+ // (seeds/rewritable.html:2112) so the CLI's byte-range frozen detection agrees
421
+ // with the real DOM enforcement (querySelectorAll('[data-rwa-frozen]')) — the
422
+ // cheap /\bdata-rwa-frozen\b/ pre-filter's value/longer-name matches no longer
423
+ // false-positive. KEEP IN STEP with the seed.
424
// True iff `openTag` (a single opening tag, `<name ...>` or `<name ... />`)
// carries data-rwa-frozen as an actual attribute NAME — not inside a quoted
// value (class="data-rwa-frozen") and not as a prefix of a longer name
// (data-rwa-frozen-note). Returns false for anything that is not shaped like
// one opening tag.
//
// The tag-name pattern accepts hyphens so custom elements
// (`<my-el data-rwa-frozen>`) are recognized. dataRwaFrozenSnapshot's opener
// regex already captures hyphenated names; rejecting them here silently
// dropped such elements from the frozen snapshot.
export function tagHasFrozenAttr(openTag) {
  const am = /^<[a-zA-Z][a-zA-Z0-9-]*((?:\s[^>]*)?)\/?>$/.exec(openTag);
  if (!am) return false;
  // Walk attributes one at a time: a name, optionally followed by a
  // double-quoted, single-quoted, or unquoted value. Consuming the value as
  // part of the match keeps marker text inside values from being read as a name.
  const attrRe = /([^\s=/>]+)(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+))?/g;
  let a;
  while ((a = attrRe.exec(am[1])) !== null) {
    if (a[1] === 'data-rwa-frozen') return true;
  }
  return false;
}
434
+
435
+ // Parser-free mirror of the seed's dataRwaFrozenSnapshot (seeds/rewritable.html, search
436
+ // `function dataRwaFrozenSnapshot`): each data-rwa-frozen element captured as `tagName\0outerHTML`, sorted.
437
+ // applyEdits compares this before/after to reject ANY change (inner text,
438
+ // attributes, add/remove) to an attribute-form frozen element — position-
439
+ // independent (sorted; outerHTML self-contained), batch-level like the seed.
440
+ //
441
+ // The seed uses DOMParser; the CLI stays parser-free (offline-first, no jsdom),
442
+ // so this is a pragmatic regex + tag-depth matcher. Edge cases a real parser
443
+ // handles (a literal `>` inside a quoted attribute value, a tag name inside a
444
+ // comment/string) are out of v1 scope — but because the check is a RELATIVE
445
+ // before/after snapshot, a consistent mis-parse of an UNCHANGED frozen element
446
+ // still compares equal, and the conservative failure direction (false-positive
447
+ // rejection) is the safe one for a frozen-zone guard. KEEP IN STEP with the seed.
448
// Capture every element carrying a real data-rwa-frozen attribute as the
// string `tag\0outerHTML` (tag lowercased) and return the entries sorted.
// Void elements and `/>`-terminated tags are captured as their opening tag
// alone; an element with no matching close is captured through end of doc.
export function dataRwaFrozenSnapshot(doc) {
  const entries = [];
  const openRe = /<([a-zA-Z][A-Za-z0-9-]*)\b[^>]*\bdata-rwa-frozen\b[^>]*>/g;
  for (let hit = openRe.exec(doc); hit !== null; hit = openRe.exec(doc)) {
    const openTag = hit[0];
    // The cheap regex also fires on values and longer attribute names.
    if (!tagHasFrozenAttr(openTag)) continue;

    const tag = hit[1].toLowerCase();
    const selfContained = VOID_ELEMENTS.has(tag) || /\/>\s*$/.test(openTag);
    let outer;
    if (selfContained) {
      outer = openTag; // no inner content, no close tag
    } else {
      const closeEnd = matchingCloseEnd(doc, tag, hit.index + openTag.length);
      outer = closeEnd === -1 ? doc.slice(hit.index) : doc.slice(hit.index, closeEnd);
    }
    entries.push(tag + '\0' + outer);
  }
  return entries.sort();
}
465
+
466
// Element-wise strict equality of two snapshot arrays (same length, same
// value at every index).
function snapshotsEqual(a, b) {
  if (a.length !== b.length) return false;
  let at = 0;
  while (at < a.length) {
    if (a[at] !== b[at]) return false;
    at += 1;
  }
  return true;
}
471
+
472
+ // Parser-free port of the seed's lockedRangesIn (seeds/rewritable.html, search `function lockedRangesIn`):
473
+ // the [start, end] byte range of each .rwa-locked element's whole subtree.
474
+ // Used by replace_document's class-lock coverage check. matchingCloseEnd is the
475
+ // CLI's equivalent of the seed's findCloseTagEnd (depth-tracked same-tag close).
476
+ // KEEP IN STEP with the seed.
477
// [start, end] range of every element whose class attribute contains the
// word `rwa-locked`, from the start of its opening tag to just past its
// matching close tag. Elements with no matching close are omitted. Returns []
// for empty/missing input.
export function lockedRangesIn(doc) {
  if (!doc) return [];
  // The class value may be double-quoted, single-quoted, or unquoted.
  const opening = /<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*\bclass\s*=\s*("([^"]*)"|'([^']*)'|([^\s"'>]+))[^>]*>/g;
  const ranges = [];
  for (let hit = opening.exec(doc); hit !== null; hit = opening.exec(doc)) {
    const classValue = hit[3] || hit[4] || hit[5] || '';
    if (!/\brwa-locked\b/.test(classValue)) continue;
    const closeEnd = matchingCloseEnd(doc, hit[1], hit.index + hit[0].length);
    if (closeEnd === -1) continue;
    ranges.push([hit.index, closeEnd]);
  }
  return ranges;
}
493
+
494
+ // Parser-free port of the seed's markerZoneRangesIn (seeds/rewritable.html, search `function markerZoneRangesIn`):
495
+ // the [start, end] byte ranges of every protected zone — marker-form frozen
496
+ // zones (all three fence forms, INCLUDING the fences) and data-rwa-frozen
497
+ // attribute-form element subtrees. Used by the class-lock coverage check to
498
+ // verify each .rwa-locked range is fully contained in a protected zone.
499
+ // Unterminated begin markers are skipped here (they carry no closed range);
500
+ // they are rejected separately by unterminatedFrozenMarker. KEEP IN STEP with
501
+ // the seed.
502
// [start, end] ranges of every protected zone in `doc`:
//   1. marker-form frozen zones in all three fence forms, fences included;
//   2. subtrees of elements carrying a real data-rwa-frozen attribute.
// Begin markers with no end marker, and frozen elements with no matching
// close tag, contribute no range. Returns [] for empty/missing input.
export function markerZoneRangesIn(doc) {
  if (!doc) return [];
  const ranges = [];

  const beginRe = /(<!--|\/\*|\/\/)\s*rwa:frozen:begin\s+([A-Za-z0-9_-]+)\s*(-->|\*\/|(?=\r?\n|$))/g;
  for (let begin = beginRe.exec(doc); begin !== null; begin = beginRe.exec(doc)) {
    const [, opener, name] = begin;
    let innerStart = begin.index + begin[0].length;
    if (opener === '//') {
      // Line-comment form: search for the end marker from the next line on.
      const newlineAt = doc.indexOf('\n', innerStart);
      innerStart = newlineAt === -1 ? doc.length : newlineAt + 1;
    }

    // The end marker must use the same fence form as the begin marker.
    const escapedName = name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    let endSource;
    switch (opener) {
      case '<!--':
        endSource = '<!--\\s*rwa:frozen:end\\s+' + escapedName + '\\s*-->';
        break;
      case '/*':
        endSource = '\\/\\*\\s*rwa:frozen:end\\s+' + escapedName + '\\s*\\*\\/';
        break;
      default:
        endSource = '\\/\\/\\s*rwa:frozen:end\\s+' + escapedName + '(?=\\r?\\n|$)';
    }
    const endRe = new RegExp(endSource, 'g');
    endRe.lastIndex = innerStart;
    const end = endRe.exec(doc);
    if (end === null) continue; // unterminated — carries no closed range
    ranges.push([begin.index, end.index + end[0].length]);
  }

  // Attribute-form zones: tagHasFrozenAttr filters out hits where the marker
  // text is only part of a value or of a longer attribute name.
  const fzAttr = /<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*\bdata-rwa-frozen\b[^>]*>/g;
  for (let hit = fzAttr.exec(doc); hit !== null; hit = fzAttr.exec(doc)) {
    if (!tagHasFrozenAttr(hit[0])) continue;
    const closeEnd = matchingCloseEnd(doc, hit[1], hit.index + hit[0].length);
    if (closeEnd === -1) continue;
    ranges.push([hit.index, closeEnd]);
  }
  return ranges;
}
536
+
537
+ // Structural-shape check (rwa-edit-spec.md §7).
538
+ // CLI v1: regex count of <script> and <style> tags. The seed additionally
539
+ // tracks top-level tag-types-set and exempts non-executable scripts
540
+ // (text/workflow-node, application/json) — both deferred for v1; the realistic
541
+ // damage signal (a model emitting an inline <script> inside a content edit)
542
+ // is fully caught by the count check.
543
// Count opening <script> and <style> tags in `doc` (case-insensitive; the tag
// name must be followed by whitespace or `>`). Returns { scripts, styles }.
function structuralShape(doc) {
  const tally = (pattern) => {
    const hits = doc.match(pattern);
    return hits === null ? 0 : hits.length;
  };
  return {
    scripts: tally(/<script[\s>]/gi),
    styles: tally(/<style[\s>]/gi),
  };
}
549
+
550
/**
 * Apply a batch of literal find/replace edits to a document string and return
 * the edited, LF-canonical result. Edits are applied in order, each against
 * the output of the previous one; the first failing guard aborts the batch.
 *
 * Per-edit guards, in evaluation order: `empty_find`, `malformed_envelope`
 * (lone surrogate), `replace_too_large`, `reserved_substring`,
 * `find_not_found`, `find_not_unique`, `frozen_zone_violation` (marker form),
 * `class_lock_violation`.
 *
 * Whole-batch guards, in evaluation order: `structural_shape_changed`,
 * `frozen_zone_corrupted`, `frozen_zone_violation` (attribute form),
 * `rwa_id_stripped` (only when the document opts in via rwa-id-strict),
 * `target_size_exceeded`.
 *
 * @param {string} doc - Document text; CRLF input is canonicalized to LF first.
 * @param {Array<{find: string, replace?: string}>} edits - Non-empty list of edits.
 * @returns {string} The document after every edit has been spliced in.
 * @throws {RwaEditError} With one of the codes listed above. Per-edit failures
 *   carry the index of the offending edit; batch-level failures carry `null`.
 */
export function applyEdits(doc, edits) {
  if (!(Array.isArray(edits) && edits.length !== 0)) {
    throw new RwaEditError('malformed_envelope', null, { reason: 'edits must be a non-empty array' });
  }

  // Everything downstream (matching, splicing, the returned document) works on
  // the LF-only form, so CRLF in the source cannot produce a spurious
  // find_not_found against an LF anchor, or the reverse. Mirrors the seed.
  const source = canonLF(doc);

  // Baselines taken before any edit; compared against the result at the end.
  const shapeBefore = structuralShape(source);
  const zonesBefore = findFrozenZones(source);
  // Attribute-form frozen zones (data-rwa-frozen) are guarded batch-level by
  // snapshot equality rather than per edit, as in the seed.
  const frozenAttrBefore = dataRwaFrozenSnapshot(source);

  let text = source;
  for (const [position, entry] of edits.entries()) {
    const edit = entry || {};
    if (!edit.find) {
      throw new RwaEditError('empty_find', position);
    }

    // Reject lone surrogates before canonicalizing or matching: a malformed
    // find/replace would corrupt the durable file on UTF-8 encode.
    if (!(isWellFormed(edit.find) && isWellFormed(edit.replace))) {
      throw new RwaEditError('malformed_envelope', position, { reason: 'lone_surrogate' });
    }

    // Per-edit text budget, measured on the replace value exactly as the
    // caller supplied it (the virtual/token form under images-v1).
    const suppliedLength = (edit.replace || '').length;
    if (suppliedLength > MAX_REPLACE) {
      throw new RwaEditError('replace_too_large', position, { length: suppliedLength, cap: MAX_REPLACE });
    }

    // Both sides go to LF so a CRLF anchor still matches the LF working copy
    // and the splice cannot reintroduce CR.
    const needle = canonLF(edit.find);
    const replacement = canonLF(edit.replace);

    // Reserved-substring rule (spec §4 rule 6). Checked ahead of the lookup so
    // a literal marker on either side fails fast.
    if (containsReservedMarker(needle) || containsReservedMarker(replacement)) {
      throw new RwaEditError('reserved_substring', position, { find: needle, replace: replacement });
    }

    // The anchor must occur exactly once in the current working copy.
    const hits = countOccurrences(text, needle);
    if (hits === 0) {
      throw new RwaEditError('find_not_found', position, { find: needle, ...findClosestAnchor(text, needle) });
    }
    if (hits > 1) {
      throw new RwaEditError('find_not_unique', position, {
        find: needle,
        count: hits,
        hints: nearbySnippets(text, needle),
      });
    }

    // Marker-form frozen zones are recomputed against the current working copy
    // because earlier splices move the zone boundaries.
    const crossed = editCrossesFrozenZone(text, needle, findFrozenZones(text));
    if (crossed) {
      throw new RwaEditError('frozen_zone_violation', position, { zone: crossed.name });
    }

    // One scan locates the unique match; it serves both the lock check and the
    // splice, since the working copy is untouched between the two.
    const start = text.indexOf(needle);
    const end = start + needle.length;

    // Class-declared locks (rwa-lens/1 spec §7): a find-range overlapping a
    // .rwa-locked source range is rejected. Ranges that merely touch (edit
    // ends where the lock begins, or begins where it ends) are allowed.
    // Recomputed per edit for the same reason as the frozen zones.
    for (const [lockStart, lockEnd] of lockedRangesIn(text)) {
      const overlaps = lockStart < end && start < lockEnd;
      if (overlaps) {
        throw new RwaEditError('class_lock_violation', position, {
          lockRange: [lockStart, lockEnd],
          editRange: [start, end],
        });
      }
    }

    // Splice by slicing: String.prototype.replace would interpret $&, $`, $'
    // and $$ in the replacement even for a literal search string, mangling
    // content such as "$$amount".
    text = text.slice(0, start) + (replacement || '') + text.slice(end);
  }

  // The batch must leave the number of <script> and <style> openers unchanged.
  const shapeAfter = structuralShape(text);
  const shapeKept = shapeBefore.scripts === shapeAfter.scripts && shapeBefore.styles === shapeAfter.styles;
  if (!shapeKept) {
    throw new RwaEditError('structural_shape_changed', null, { before: shapeBefore, after: shapeAfter });
  }

  // Marker-form zone integrity is a count comparison only. The seed also diffs
  // the zone contents; here the per-edit crossing check covers that.
  const zonesAfter = findFrozenZones(text);
  if (zonesAfter.length !== zonesBefore.length) {
    throw new RwaEditError('frozen_zone_corrupted', null, {
      before: zonesBefore.length,
      after: zonesAfter.length,
    });
  }

  // Attribute-form zone integrity: the set of data-rwa-frozen elements must be
  // identical before and after the whole batch.
  if (!snapshotsEqual(frozenAttrBefore, dataRwaFrozenSnapshot(text))) {
    throw new RwaEditError('frozen_zone_violation', null, { form: 'attribute' });
  }

  // Opt-in rwa-id-strict: when the original document declares the meta tag,
  // every quoted data-rwa-id value it had must still be present afterwards.
  if (/<meta\s+name\s*=\s*["']?rwa-id-strict\b/i.test(source)) {
    const collectIds = (html) => {
      const found = new Set();
      for (const match of html.matchAll(/\sdata-rwa-id\s*=\s*(?:"([^"]*)"|'([^']*)')/g)) {
        // Group 1 is the double-quoted value, group 2 the single-quoted one.
        found.add(match[1] ?? match[2]);
      }
      return found;
    };
    const survivingIds = collectIds(text);
    for (const id of collectIds(source)) {
      if (!survivingIds.has(id)) {
        throw new RwaEditError('rwa_id_stripped', null, { id });
      }
    }
  }

  // Whole-document text budget, measured on the final working copy (the
  // virtual/token form under images-v1, so image bytes are not counted).
  if (text.length > MAX_DOC) {
    throw new RwaEditError('target_size_exceeded', null, { length: text.length, cap: MAX_DOC });
  }

  return text;
}