@oh-my-pi/hashline 15.5.12 → 15.5.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tokenizer.ts CHANGED
@@ -1,31 +1,27 @@
1
1
  /**
2
2
  * Stateful, line-oriented classifier for hashline diff text.
3
3
  *
4
- * The {@link Tokenizer} can be fed in chunks ({@link Tokenizer.feed}/{@link
5
- * Tokenizer.end}) for streaming use, or in one shot ({@link
6
- * Tokenizer.tokenizeAll}). Each emitted token carries its 1-indexed source
7
- * line number so downstream consumers (parser, validators, error messages)
8
- * can refer back to the input precisely.
9
- *
10
4
  * Format shape:
11
5
  * ```
12
- * *** path/to/file.ts#0A3
13
- * @@ 5,7 @@
6
+ * path/to/file.ts#0A3
7
+ * replace 5..7:
14
8
  * +literal new line
15
- * &3,4
16
9
  * ```
17
- * Each `***` line opens a new file section; each `@@ A,B @@` line opens a
18
- * new hunk whose body (zero or more `+`/`&` rows) replaces the selected
19
- * range. Empty body = delete the selected range.
20
10
  */
21
-
22
11
  import {
23
12
  describeAnchorExamples,
13
+ HL_DELETE_KEYWORD,
24
14
  HL_FILE_HASH_LENGTH,
25
15
  HL_FILE_HASH_SEP,
26
16
  HL_FILE_PREFIX,
27
- HL_PAYLOAD_REPEAT,
17
+ HL_HEADER_COLON,
18
+ HL_INSERT_AFTER,
19
+ HL_INSERT_BEFORE,
20
+ HL_INSERT_HEAD,
21
+ HL_INSERT_KEYWORD,
22
+ HL_INSERT_TAIL,
28
23
  HL_PAYLOAD_REPLACE,
24
+ HL_REPLACE_KEYWORD,
29
25
  } from "./format";
30
26
  import { ABORT_MARKER, BEGIN_PATCH_MARKER, END_PATCH_MARKER } from "./messages";
31
27
  import type { Anchor, Cursor, ParsedRange } from "./types";
@@ -46,10 +42,8 @@ const CHAR_UPPER_F = 70;
46
42
  const CHAR_LOWER_A = 97;
47
43
  const CHAR_LOWER_F = 102;
48
44
  const CHAR_PAYLOAD_REPLACE = HL_PAYLOAD_REPLACE.charCodeAt(0);
49
- const CHAR_PAYLOAD_REPEAT = HL_PAYLOAD_REPEAT.charCodeAt(0);
45
+ const CHAR_COLON = HL_HEADER_COLON.charCodeAt(0);
50
46
  const FILE_PREFIX_LENGTH = HL_FILE_PREFIX.length;
51
- const BOF_ANCHOR = "BOF";
52
- const EOF_ANCHOR = "EOF";
53
47
 
54
48
  function isDigitCode(code: number): boolean {
55
49
  return code >= CHAR_ZERO && code <= CHAR_NINE;
@@ -91,14 +85,8 @@ function markerLineEquals(line: string, marker: string): boolean {
91
85
  return end === marker.length && line.startsWith(marker);
92
86
  }
93
87
 
94
- /**
95
- * Split a hashline diff into individual lines without losing the trailing
96
- * empty line that callers may rely on for explicit blank payloads. CRLF pairs
97
- * are normalized to a single line break.
98
- */
99
88
  export function splitHashlineLines(text: string): string[] {
100
89
  if (text.length === 0) return [""];
101
-
102
90
  const lines: string[] = [];
103
91
  let start = 0;
104
92
  for (let index = 0; index < text.length; index++) {
@@ -108,7 +96,6 @@ export function splitHashlineLines(text: string): string[] {
108
96
  lines.push(text.slice(start, end));
109
97
  start = index + 1;
110
98
  }
111
-
112
99
  if (start < text.length) {
113
100
  let end = text.length;
114
101
  if (end > start && text.charCodeAt(end - 1) === CHAR_CARRIAGE_RETURN) end--;
@@ -119,6 +106,7 @@ export function splitHashlineLines(text: string): string[] {
119
106
 
120
107
  export function cloneCursor(cursor: Cursor): Cursor {
121
108
  if (cursor.kind === "before_anchor") return { kind: "before_anchor", anchor: { ...cursor.anchor } };
109
+ if (cursor.kind === "after_anchor") return { kind: "after_anchor", anchor: { ...cursor.anchor } };
122
110
  return cursor;
123
111
  }
124
112
 
@@ -129,7 +117,6 @@ interface NumberScan {
129
117
 
130
118
  function scanLineNumber(line: string, index: number, end: number): NumberScan | null {
131
119
  if (index >= end || !isNonZeroDigitCode(line.charCodeAt(index))) return null;
132
-
133
120
  let lineNumber = 0;
134
121
  let nextIndex = index;
135
122
  while (nextIndex < end) {
@@ -160,36 +147,6 @@ interface RangeScan {
160
147
  nextIndex: number;
161
148
  }
162
149
 
163
- /**
164
- * Scan a numeric range for a hunk header. Canonical form is `A B` (two
165
- * numbers separated by whitespace); models also reflexively emit `A-B`,
166
- * `A..B`, and `A…B` (unicode ellipsis), so we accept any of those as the
167
- * range separator. A second number is REQUIRED — bare `A` is not a valid
168
- * hunk header in this grammar. Repeat-row bodies (`&A..B`) keep their own
169
- * parser and still accept the `&A` single-line shorthand; see
170
- * {@link tryParseRepeatPayload}.
171
- */
172
- function scanHeaderRange(line: string, index = 0, end = trimEndIndex(line)): RangeScan | null {
173
- const numberStart = skipWhitespace(line, index, end);
174
- const start = scanLineNumber(line, numberStart, end);
175
- if (start === null) return null;
176
-
177
- const afterFirst = scanRangeSeparator(line, start.nextIndex, end);
178
- if (afterFirst === null) return null;
179
- const endNumber = scanLineNumber(line, afterFirst, end);
180
- if (endNumber === null) return null;
181
- return {
182
- range: { start: { line: start.line }, end: { line: endNumber.line } },
183
- nextIndex: skipWhitespace(line, endNumber.nextIndex, end),
184
- };
185
- }
186
-
187
- /**
188
- * Consume the mandatory range separator (whitespace, `-`, `..`, or `…`)
189
- * between the two numbers of a hunk-header range. Returns the index of
190
- * the second number, or `null` when the separator is missing or no digit
191
- * follows it.
192
- */
193
150
  function scanRangeSeparator(line: string, index: number, end: number): number | null {
194
151
  let cursor = index;
195
152
  let consumedSeparator = false;
@@ -217,39 +174,105 @@ function scanRangeSeparator(line: string, index: number, end: number): number |
217
174
  return cursor;
218
175
  }
219
176
 
220
- export type BlockTarget = { kind: "range"; range: ParsedRange } | { kind: "bof" } | { kind: "eof" };
177
+ function scanHeaderRange(line: string, index = 0, end = trimEndIndex(line), allowSingle = false): RangeScan | null {
178
+ const numberStart = skipWhitespace(line, index, end);
179
+ const start = scanLineNumber(line, numberStart, end);
180
+ if (start === null) return null;
181
+ const afterFirst = scanRangeSeparator(line, start.nextIndex, end);
182
+ if (afterFirst === null) {
183
+ if (!allowSingle) return null;
184
+ return {
185
+ range: { start: { line: start.line }, end: { line: start.line } },
186
+ nextIndex: skipWhitespace(line, start.nextIndex, end),
187
+ };
188
+ }
189
+ const endNumber = scanLineNumber(line, afterFirst, end);
190
+ if (endNumber === null) return null;
191
+ return {
192
+ range: { start: { line: start.line }, end: { line: endNumber.line } },
193
+ nextIndex: skipWhitespace(line, endNumber.nextIndex, end),
194
+ };
195
+ }
196
+
197
+ export type BlockTarget =
198
+ | { kind: "replace"; range: ParsedRange }
199
+ | { kind: "delete"; range: ParsedRange }
200
+ | { kind: "insert_before"; anchor: Anchor }
201
+ | { kind: "insert_after"; anchor: Anchor }
202
+ | { kind: "bof" }
203
+ | { kind: "eof" };
221
204
 
222
205
  interface TargetScan {
223
206
  target: BlockTarget;
224
207
  nextIndex: number;
225
208
  }
226
209
 
227
- /**
228
- * Scan the anchor portion of a hunk header. Accepts `BOF`, `EOF`, or
229
- * `A B` (range). Single-number anchors are NOT accepted; callers must
230
- * spell single-line ranges as `A A`.
231
- */
210
+ function scanKeyword(line: string, index: number, end: number, keyword: string): number | null {
211
+ if (!line.startsWith(keyword, index)) return null;
212
+ const next = index + keyword.length;
213
+ if (next < end) {
214
+ const code = line.charCodeAt(next);
215
+ if (!isWhitespaceCode(code) && code !== CHAR_COLON) return null;
216
+ }
217
+ return next;
218
+ }
219
+
220
+ function consumeOptionalColon(line: string, index: number, end: number): number {
221
+ const cursor = skipWhitespace(line, index, end);
222
+ return cursor < end && line.charCodeAt(cursor) === CHAR_COLON ? skipWhitespace(line, cursor + 1, end) : cursor;
223
+ }
224
+
225
+ function scanInsertTarget(line: string, index: number, end: number): TargetScan | null {
226
+ const cursor = skipWhitespace(line, index, end);
227
+ const beforeEnd = scanKeyword(line, cursor, end, HL_INSERT_BEFORE);
228
+ if (beforeEnd !== null) {
229
+ const anchor = scanLineNumber(line, skipWhitespace(line, beforeEnd, end), end);
230
+ if (anchor === null) return null;
231
+ const nextIndex = consumeOptionalColon(line, anchor.nextIndex, end);
232
+ return { target: { kind: "insert_before", anchor: { line: anchor.line } }, nextIndex };
233
+ }
234
+ const afterEnd = scanKeyword(line, cursor, end, HL_INSERT_AFTER);
235
+ if (afterEnd !== null) {
236
+ const anchor = scanLineNumber(line, skipWhitespace(line, afterEnd, end), end);
237
+ if (anchor === null) return null;
238
+ const nextIndex = consumeOptionalColon(line, anchor.nextIndex, end);
239
+ return { target: { kind: "insert_after", anchor: { line: anchor.line } }, nextIndex };
240
+ }
241
+ const headEnd = scanKeyword(line, cursor, end, HL_INSERT_HEAD);
242
+ if (headEnd !== null) return { target: { kind: "bof" }, nextIndex: consumeOptionalColon(line, headEnd, end) };
243
+ const tailEnd = scanKeyword(line, cursor, end, HL_INSERT_TAIL);
244
+ if (tailEnd !== null) return { target: { kind: "eof" }, nextIndex: consumeOptionalColon(line, tailEnd, end) };
245
+ return null;
246
+ }
247
+
232
248
  function scanHunkAnchor(line: string, start: number, end: number): TargetScan | null {
233
249
  const cursor = skipWhitespace(line, start, end);
234
- if (line.startsWith(BOF_ANCHOR, cursor)) {
235
- return { target: { kind: "bof" }, nextIndex: skipWhitespace(line, cursor + BOF_ANCHOR.length, end) };
250
+ const replaceEnd = scanKeyword(line, cursor, end, HL_REPLACE_KEYWORD);
251
+ if (replaceEnd !== null) {
252
+ const range = scanHeaderRange(line, replaceEnd, end, true);
253
+ if (range === null) return null;
254
+ return {
255
+ target: { kind: "replace", range: range.range },
256
+ nextIndex: consumeOptionalColon(line, range.nextIndex, end),
257
+ };
236
258
  }
237
- if (line.startsWith(EOF_ANCHOR, cursor)) {
238
- return { target: { kind: "eof" }, nextIndex: skipWhitespace(line, cursor + EOF_ANCHOR.length, end) };
259
+ const deleteEnd = scanKeyword(line, cursor, end, HL_DELETE_KEYWORD);
260
+ if (deleteEnd !== null) {
261
+ const range = scanHeaderRange(line, deleteEnd, end, true);
262
+ if (range === null) return null;
263
+ const next = skipWhitespace(line, range.nextIndex, end);
264
+ if (next < end && line.charCodeAt(next) === CHAR_COLON) return null;
265
+ return { target: { kind: "delete", range: range.range }, nextIndex: next };
239
266
  }
240
- const range = scanHeaderRange(line, cursor, end);
241
- if (range === null) return null;
242
- return { target: { kind: "range", range: range.range }, nextIndex: range.nextIndex };
267
+ const insertEnd = scanKeyword(line, cursor, end, HL_INSERT_KEYWORD);
268
+ if (insertEnd !== null) return scanInsertTarget(line, insertEnd, end);
269
+ return null;
243
270
  }
244
271
 
245
272
  interface ParsedHunkHeader {
246
273
  target: BlockTarget;
247
274
  }
248
275
 
249
- /**
250
- * Parse a bare hunk-header line: `A B` (range) or the keywords
251
- * `BOF` / `EOF`. Returns `null` for lines that do not match the shape.
252
- */
253
276
  function tryParseHunkHeader(line: string): ParsedHunkHeader | null {
254
277
  const end = trimEndIndex(line);
255
278
  const start = skipWhitespace(line, 0, end);
@@ -260,46 +283,11 @@ function tryParseHunkHeader(line: string): ParsedHunkHeader | null {
260
283
  return { target: scan.target };
261
284
  }
262
285
 
263
- /**
264
- * Parse a `&A,B` repeat payload row (or `&A` shorthand for `&A,A`). Returns
265
- * `null` when the line does not match.
266
- */
267
- function tryParseRepeatPayload(line: string): ParsedRange | null {
268
- const end = trimEndIndex(line);
269
- if (line.length === 0 || line.charCodeAt(0) !== CHAR_PAYLOAD_REPEAT) return null;
270
-
271
- const start = scanLineNumber(line, 1, end);
272
- if (start === null) return null;
273
- if (start.nextIndex === end) {
274
- // `&A` shorthand → `&A,A`.
275
- return { start: { line: start.line }, end: { line: start.line } };
276
- }
277
- if (
278
- start.nextIndex + 1 >= end ||
279
- line.charCodeAt(start.nextIndex) !== CHAR_DOT ||
280
- line.charCodeAt(start.nextIndex + 1) !== CHAR_DOT
281
- )
282
- return null;
283
-
284
- const finish = scanLineNumber(line, start.nextIndex + 2, end);
285
- if (finish === null) return null;
286
- if (skipWhitespace(line, finish.nextIndex, end) !== end) return null;
287
- return { start: { line: start.line }, end: { line: finish.line } };
288
- }
289
-
290
- /**
291
- * Parse a `¶PATH[#hash]` file-header line. Returns `null` for lines that
292
- * do not start with the file prefix or that fail the strict shape.
293
- *
294
- * `*** Begin Patch` / `*** End Patch` / `*** Abort` markers are matched
295
- * earlier in {@link classifyLine}, so envelope markers never reach here.
296
- */
297
286
  function tryParseHeader(line: string): { path: string; fileHash?: string } | null {
298
287
  if (!line.startsWith(HL_FILE_PREFIX)) return null;
299
288
  const end = trimEndIndex(line);
300
289
  let index = FILE_PREFIX_LENGTH;
301
290
  if (index >= end) return null;
302
-
303
291
  const pathStart = index;
304
292
  while (index < end) {
305
293
  const code = line.charCodeAt(index);
@@ -308,7 +296,6 @@ function tryParseHeader(line: string): { path: string; fileHash?: string } | nul
308
296
  }
309
297
  if (index === pathStart) return null;
310
298
  const path = line.slice(pathStart, index);
311
-
312
299
  let fileHash: string | undefined;
313
300
  if (index < end && line.charCodeAt(index) === CHAR_HASH) {
314
301
  const hashStart = index + 1;
@@ -320,15 +307,11 @@ function tryParseHeader(line: string): { path: string; fileHash?: string } | nul
320
307
  fileHash = line.slice(hashStart, hashEnd).toUpperCase();
321
308
  index = hashEnd;
322
309
  }
323
-
324
- // Anything other than trailing whitespace disqualifies the header.
325
310
  if (skipWhitespace(line, index, end) !== end) return null;
326
-
327
311
  return fileHash !== undefined ? { path, fileHash } : { path };
328
312
  }
329
313
 
330
314
  interface TokenBase {
331
- /** 1-indexed line number in the original input stream. */
332
315
  lineNum: number;
333
316
  }
334
317
 
@@ -340,7 +323,6 @@ export type Token =
340
323
  | (TokenBase & { kind: "header"; path: string; fileHash?: string })
341
324
  | (TokenBase & { kind: "op-block"; target: BlockTarget })
342
325
  | (TokenBase & { kind: "payload-literal"; text: string })
343
- | (TokenBase & { kind: "payload-repeat"; range: ParsedRange })
344
326
  | (TokenBase & { kind: "raw"; text: string });
345
327
 
346
328
  function classifyLine(line: string, lineNum: number): Token {
@@ -348,9 +330,7 @@ function classifyLine(line: string, lineNum: number): Token {
348
330
  if (markerLineEquals(line, BEGIN_PATCH_MARKER)) return { kind: "envelope-begin", lineNum };
349
331
  if (markerLineEquals(line, END_PATCH_MARKER)) return { kind: "envelope-end", lineNum };
350
332
  if (markerLineEquals(line, ABORT_MARKER)) return { kind: "abort", lineNum };
351
-
352
333
  const firstCode = line.charCodeAt(0);
353
-
354
334
  if (line.startsWith(HL_FILE_PREFIX)) {
355
335
  const header = tryParseHeader(line);
356
336
  if (header !== null) {
@@ -359,48 +339,24 @@ function classifyLine(line: string, lineNum: number): Token {
359
339
  : { kind: "header", lineNum, path: header.path };
360
340
  }
361
341
  }
362
-
363
- // Hunk header lines are `A B` (two numbers) or the keyword `BOF` /
364
- // `EOF`. `@@`-bracketed forms are intentionally NOT accepted here —
365
- // they fall through to `raw` and the parser rejects them as
366
- // apply_patch contamination.
367
- const isHunkLead = isNonZeroDigitCode(firstCode) || line.startsWith(BOF_ANCHOR) || line.startsWith(EOF_ANCHOR);
342
+ const lead = skipWhitespace(line, 0);
343
+ const isHunkLead =
344
+ line.startsWith(HL_REPLACE_KEYWORD, lead) ||
345
+ line.startsWith(HL_DELETE_KEYWORD, lead) ||
346
+ line.startsWith(HL_INSERT_KEYWORD, lead);
368
347
  if (isHunkLead) {
369
348
  const hunk = tryParseHunkHeader(line);
370
349
  if (hunk !== null) return { kind: "op-block", lineNum, target: hunk.target };
371
350
  }
372
-
373
- if (firstCode === CHAR_PAYLOAD_REPLACE) {
374
- return { kind: "payload-literal", lineNum, text: line.slice(1) };
375
- }
376
- if (firstCode === CHAR_PAYLOAD_REPEAT) {
377
- const range = tryParseRepeatPayload(line);
378
- if (range !== null) return { kind: "payload-repeat", lineNum, range };
379
- }
380
-
351
+ if (firstCode === CHAR_PAYLOAD_REPLACE) return { kind: "payload-literal", lineNum, text: line.slice(1) };
381
352
  return { kind: "raw", lineNum, text: line };
382
353
  }
383
354
 
384
- /**
385
- * Stateful, line-oriented classifier for hashline diff text. Use the
386
- * streaming {@link feed}/{@link end} pair to ingest text in chunks (each
387
- * completed line emits exactly one token; a trailing partial line stays
388
- * buffered until the next chunk or {@link end}). Use the stateless
389
- * {@link tokenize}/predicate methods for callers that already hold whole
390
- * lines and only need classification without buffering.
391
- */
392
355
  export class Tokenizer {
393
356
  #buffer = "";
394
357
  #nextLineNum = 1;
395
358
  #closed = false;
396
359
 
397
- /**
398
- * Ingest a chunk of input text. Each newline-terminated line in the
399
- * combined buffer produces one token. A trailing partial line (no `\n`
400
- * yet, possibly ending in a lone `\r`) stays buffered until the next
401
- * `feed`/`end` call so CRLF pairs that straddle chunk boundaries are
402
- * still normalized correctly.
403
- */
404
360
  feed(chunk: string): Token[] {
405
361
  if (this.#closed) throw new Error("Tokenizer is closed; call reset() before reusing.");
406
362
  if (chunk.length === 0) return [];
@@ -408,11 +364,6 @@ export class Tokenizer {
408
364
  return this.#drainCompleteLines();
409
365
  }
410
366
 
411
- /**
412
- * Flush any buffered residual line (the last line of input when it lacks
413
- * a trailing newline) and mark the tokenizer closed. Calling `end` a
414
- * second time returns `[]`; reuse requires `reset`.
415
- */
416
367
  end(): Token[] {
417
368
  if (this.#closed) return [];
418
369
  this.#closed = true;
@@ -421,18 +372,15 @@ export class Tokenizer {
421
372
  if (buf.length === 0) return [];
422
373
  let stop = buf.length;
423
374
  if (buf.charCodeAt(stop - 1) === CHAR_CARRIAGE_RETURN) stop--;
424
- const token = classifyLine(buf.slice(0, stop), this.#nextLineNum++);
425
- return [token];
375
+ return [classifyLine(buf.slice(0, stop), this.#nextLineNum++)];
426
376
  }
427
377
 
428
- /** Discard any buffered text and reset the line counter to 1. */
429
378
  reset(): void {
430
379
  this.#buffer = "";
431
380
  this.#nextLineNum = 1;
432
381
  this.#closed = false;
433
382
  }
434
383
 
435
- /** Convenience: feed an entire text and immediately flush. */
436
384
  tokenizeAll(text: string): Token[] {
437
385
  this.reset();
438
386
  const first = this.feed(text);
@@ -440,7 +388,6 @@ export class Tokenizer {
440
388
  return last.length === 0 ? first : first.concat(last);
441
389
  }
442
390
 
443
- /** Stateless one-shot classification. Does not touch the streaming buffer. */
444
391
  tokenize(line: string, lineNum = 0): Token {
445
392
  return classifyLine(line, lineNum);
446
393
  }
package/src/types.ts CHANGED
@@ -9,14 +9,18 @@ export interface Anchor {
9
9
  line: number;
10
10
  }
11
11
 
12
- /** Where an `insert` or `repeat` edit should land relative to existing content. */
13
- export type Cursor = { kind: "bof" } | { kind: "eof" } | { kind: "before_anchor"; anchor: Anchor };
12
+ /** Where an `insert` edit should land relative to existing content. */
13
+ export type Cursor =
14
+ | { kind: "bof" }
15
+ | { kind: "eof" }
16
+ | { kind: "before_anchor"; anchor: Anchor }
17
+ | { kind: "after_anchor"; anchor: Anchor };
14
18
 
15
19
  /**
16
20
  * A single low-level edit produced by the parser and consumed by the applier.
17
- * Multi-line replacements decompose to one `insert`/`repeat` per replacement
18
- * line plus one `delete` per consumed line. Replacement payloads are tagged so
19
- * the applier can distinguish literal insertion from new content for a deleted
21
+ * Multi-line replacements decompose to one `insert` per replacement line plus
22
+ * one `delete` per consumed line. Replacement payloads are tagged so the
23
+ * applier can distinguish literal insertion from new content for a deleted
20
24
  * line.
21
25
  */
22
26
  export type Edit =
@@ -28,14 +32,6 @@ export type Edit =
28
32
  index: number;
29
33
  mode?: "replacement";
30
34
  }
31
- | {
32
- kind: "repeat";
33
- cursor: Cursor;
34
- range: ParsedRange;
35
- lineNum: number;
36
- index: number;
37
- mode?: "replacement";
38
- }
39
35
  | { kind: "delete"; anchor: Anchor; lineNum: number; index: number; oldAssertion?: string };
40
36
 
41
37
  /** Result of applying a parsed set of edits to a text body. */