pi-hashline-edit-pro 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,192 @@
1
+ /**
2
+ * Hash computation — xxHash32-based line hashing with occurrence-aware
3
+ * discriminators.
4
+ *
5
+ * This module owns the hash constants, the xxHash32 wrapper, and the
6
+ * per-line hash computation functions. Every other module that needs
7
+ * line hashes goes through `computeLineHashes` (full-file) or
8
+ * `computeLineHash` (single-line helper).
9
+ */
10
+
11
+ import * as XXH from "xxhashjs";
12
+
13
+ // ─── Constants ──────────────────────────────────────────────────────────
14
+
15
+ /**
16
+ * Hash length in characters. The original `pi-hashline-edit` uses 2 chars of
17
+ * a 16-char alphabet (8 bits / 256 buckets); this fork uses 4 chars of a
18
+ * 64-char alphabet (24 bits / 16 777 216 buckets). With HASH_LENGTH=4, the
19
+ * birthday paradox stays out of practical concern for any realistic file
20
+ * size. Bumping to 5 (30 bits) is a one-line change here; the cost is one
21
+ * more char per anchor in the `read` output. 5 is also the ceiling: 6 chars
22
+ * would need 36 bits, and the underlying xxHash32 value has only 32.
23
+ */
24
+ export const HASH_LENGTH = 4;
25
+
26
+ /**
27
+ * URL-safe base64 alphabet: A–Z, a–z, 0–9, `-`, `_`. 64 distinct chars
28
+ * giving 6 bits per hash character. No exclusions, no human-readability
29
+ * heuristics — the consumer is an LLM that tokenizes, not a human that
30
+ * squints at pixel glyphs. Note that `-` is second-to-last (only `_`
31
+ * follows it), so a character class built from the raw string would read
32
+ * `9-_` as a range; regexes must use `HASH_ALPHABET_REGEX_SAFE` instead.
33
+ */
34
+ const HASH_ALPHABET =
35
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
36
+ const HASH_ALPHABET_BITS = 6;
37
+ const HASH_ALPHABET_MASK = (1 << HASH_ALPHABET_BITS) - 1;
38
+ // `-` must be escaped when used inside a regex character class — otherwise it
39
+ // forms a range with the preceding char (`9-_` spans ASCII 57–95, which
40
+ // silently swallows the literal `-`). The `_` is always literal.
41
+ const HASH_ALPHABET_REGEX_SAFE = HASH_ALPHABET.replace(/-/g, "\\-");
42
+ const HASH_ALPHABET_RE = new RegExp(`^[${HASH_ALPHABET_REGEX_SAFE}]+$`);
43
+ export const HASH_CHARS_CLASS = `[${HASH_ALPHABET_REGEX_SAFE}]{${HASH_LENGTH}}`;
44
+
45
+ /**
46
+ * Encode the top `HASH_LENGTH * 6` bits of a 32-bit hash value as a
47
+ * `HASH_LENGTH`-char string in the URL-safe base64 alphabet.
48
+ *
49
+ * The 0.2.0/0.3.0 releases pre-computed this mapping as a `DICT` lookup
50
+ * table. At 3 chars that was 262 144 entries × 3 chars = ~1 MB of static
51
+ * memory; at 4 chars it would be 16 777 216 entries × 4 chars = ~450 MB
52
+ * and a multi-second module load. So we now compute the string inline.
53
+ * The per-line cost is one xxHash32 call plus `HASH_LENGTH` small string
54
+ * concatenations, which is still nanoseconds — this is called once per
55
+ * line in `computeLineHashes`, not on a hot path.
56
+ */
57
+ function hashToString(h: number): string {
58
+ const totalBits = HASH_LENGTH * HASH_ALPHABET_BITS;
59
+ const shift = 32 - totalBits; // low bits to discard; assumes totalBits <= 32
60
+ let n = h >>> shift; // top `totalBits` bits of h, as an unsigned value
61
+ let out = "";
62
+ for (let j = 0; j < HASH_LENGTH; j++) {
63
+ // Build left-to-right: the first iteration writes the high-order
64
+ // 6 bits, the last writes the low-order 6 bits.
65
+ out +=
66
+ HASH_ALPHABET[
67
+ (n >>> ((HASH_LENGTH - 1 - j) * HASH_ALPHABET_BITS)) &
68
+ HASH_ALPHABET_MASK
69
+ ]!;
70
+ }
71
+ return out;
72
+ }
73
+
74
+ /**
75
+ * Patterns used to detect (and reject) hashline display prefixes inside edit
76
+ * payloads. The runtime no longer strips them — the model must send literal
77
+ * file content. Matching any of these triggers `[E_INVALID_PATCH]`.
78
+ */
79
+ export const HASHLINE_PREFIX_RE = new RegExp(
80
+ `^\\s*(?:>>>|>>)?\\s*${HASH_CHARS_CLASS}:`,
81
+ ); // optional `>>>`/`>>` marker, then `HASH:`
82
+ export const HASHLINE_PREFIX_PLUS_RE = new RegExp(
83
+ `^\\+\\s*${HASH_CHARS_CLASS}:`,
84
+ ); // leading `+`, then `HASH:`
85
+ export const DIFF_MINUS_RE = /^-\s*\d+\s{4}/; // leading `-`, digits, then four whitespace chars
86
+
87
+ /**
88
+ * Bare hashline prefix: a HASH_LENGTH-char hash followed by ":" with no
89
+ * "LINE#" part (e.g. "aB3x:### heading", "Qm_7:text", "x-9Z:"). Capture
90
+ * group 1 is the hash.
91
+ *
92
+ * This is the partial-hash failure mode from issue #24: the model copies a
93
+ * hash it saw in `read` output into the line content but drops the rest
94
+ * of the rendered `HASH:content` form. The hash and its ":" (after any
95
+ * leading whitespace) are matched by this regex, then
96
+ * `assertNoBareHashPrefixLines` rejects the edit with `[E_BARE_HASH_PREFIX]`
97
+ * so the model gets actionable feedback instead of a silent correctness bug.
98
+ */
99
+ export const HASHLINE_BARE_PREFIX_RE = new RegExp(`^\\s*(${HASH_CHARS_CLASS}):`);
100
+
101
+ /** Matches any Unicode letter or number; a line with no match is symbol-only (punctuation/symbols/whitespace). */
102
+ const RE_SIGNIFICANT = /[\p{L}\p{N}]/u;
103
+
104
+ function xxh32(input: string, seed = 0): number { // xxHash32 of `input` under `seed`
105
+ return XXH.h32(seed).update(input).digest().toNumber() >>> 0; // `>>> 0` forces an unsigned 32-bit result
106
+ }
107
+
108
+ /**
109
+ * Discriminator prefixes for the occurrence-aware hash space.
110
+ *
111
+ * `S${lineNumber}` puts symbol-only lines (lone `}`, etc.) into a namespace
112
+ * keyed by line number, so the same `}` on different lines never collides.
113
+ *
114
+ * `C${occurrence}` puts content lines into a namespace keyed by the running
115
+ * occurrence count of that canonical content, so the same `import {...}` on
116
+ * different lines never collides either. This is the key behavioural change
117
+ * from the upstream 2-char hash: identical content now hashes to different
118
+ * values at different positions, so the model can target a specific
119
+ * occurrence without resorting to `offset` + a small `limit` window.
120
+ */
121
+ const SYMBOL_DISCRIMINATOR = (lineNumber: number): string => `S${lineNumber}`; // e.g. "S12"
122
+ const CONTENT_DISCRIMINATOR = (occurrence: number): string => `C${occurrence}`; // e.g. "C1" for a first occurrence
123
+
124
+ function canonicalizeLine(line: string): string { // form of a line that gets hashed
125
+ return line.replace(/\r/g, "").trimEnd(); // drop every CR, then trailing whitespace; leading indent is kept
126
+ }
127
+
128
+ function isSymbolOnly(canonical: string): boolean { // true when no letter/number is present (incl. empty lines)
129
+ return !RE_SIGNIFICANT.test(canonical);
130
+ }
131
+
132
+ /**
133
+ * Compute hashes for every line of the file.
134
+ *
135
+ * Returns an array of length `lines.length`, where index `i` is the hash of
136
+ * line `i + 1` (1-indexed). Content lines with the same canonical text are
137
+ * salted by occurrence count; symbol-only lines by 1-indexed line number.
138
+ *
139
+ * The runtime always works from a precomputed array so that all validation,
140
+ * formatting, and error-message code paths see the same hash for a given line.
141
+ * The standalone `computeLineHash(idx, line)` helper below is kept for
142
+ * single-line use (e.g. diff-preview formatting) where occurrence context
143
+ * is not available; it treats the input as a 1st-occurrence content line.
144
+ */
145
+ export function computeLineHashes(content: string): string[] {
146
+ const lines = content.split("\n"); // a trailing "\n" yields a final empty line, which is hashed too
147
+ const hashes = new Array<string>(lines.length);
148
+ const counts = new Map<string, number>(); // canonical content → occurrences seen so far
149
+ for (let i = 0; i < lines.length; i++) {
150
+ const lineNumber = i + 1;
151
+ const canonical = canonicalizeLine(lines[i]!);
152
+ let discriminator: string;
153
+ if (isSymbolOnly(canonical)) {
154
+ discriminator = SYMBOL_DISCRIMINATOR(lineNumber);
155
+ } else {
156
+ const occurrence = (counts.get(canonical) ?? 0) + 1;
157
+ counts.set(canonical, occurrence);
158
+ discriminator = CONTENT_DISCRIMINATOR(occurrence);
159
+ }
160
+ hashes[i] = hashToString(xxh32(`${discriminator}:${canonical}`));
161
+ }
162
+ return hashes;
163
+ }
164
+
165
+ /**
166
+ * Single-line hash for callers that don't have the full file context.
167
+ *
168
+ * This treats the input as a 1st-occurrence content line (or, for symbol-only
169
+ * lines, as the line at index `idx`). It is the right answer for diff-preview
170
+ * formatting and for tests that build anchors one line at a time, but it is
171
+ * NOT the same as the hash that `computeLineHashes` would produce for the
172
+ * same line in a file with duplicate content. Production validation always
173
+ * uses `computeLineHashes` + per-line lookup.
174
+ */
175
+ export function computeLineHash(idx: number, line: string): string {
176
+ const canonical = canonicalizeLine(line);
177
+ const discriminator = isSymbolOnly(canonical)
178
+ ? SYMBOL_DISCRIMINATOR(idx) // idx is used verbatim; computeLineHashes passes the 1-indexed line number here
179
+ : CONTENT_DISCRIMINATOR(1);
180
+ return hashToString(xxh32(`${discriminator}:${canonical}`));
181
+ }
182
+
183
+ /** Exported for tests and for downstream tools that want to mirror the format. */
184
+ export const HASH_FORMAT = {
185
+ length: HASH_LENGTH, // characters per hash
186
+ bitsPerChar: HASH_ALPHABET_BITS, // bits encoded by each character
187
+ alphabet: HASH_ALPHABET, // raw (unescaped) alphabet string, in encoding order
188
+ };
189
+
190
+
191
+ /** Re-export HASH_ALPHABET_RE for parse module */
192
+ export { HASH_ALPHABET_RE };
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Hashline engine — hash-anchored line editing.
3
+ *
4
+ * Forked from pi-hashline-edit (MIT, github.com/RimuruW/pi-hashline-edit),
5
+ * which was vendored & adapted from oh-my-pi (MIT, github.com/can1357/oh-my-pi).
6
+ *
7
+ * This fork preserves the strict semantics of the original (no silent
8
+ * relocation, no autocorrection heuristics, no fuzzy fallback) and uses a
9
+ * 4-character hash over a 64-character URL-safe base64 alphabet, giving
10
+ * 24 bits of entropy (16 777 216 buckets) per anchor. Birthday-paradox
11
+ * collisions become effectively zero for any realistic file size. The
12
+ * alphabet is sized for an LLM consumer, not a human reader — the model
13
+ * tokenizes, it does not squint at pixel glyphs.
14
+ *
15
+ * Anchor format: a bare hash alone (`aB3x`). The line number is no longer
16
+ * part of the wire format, and no content may follow the hash either. The
17
+ * model never has to type a line number; the runtime resolves each hash to
18
+ * a line via the file's precomputed hash array.
19
+ *
20
+ * On a hash collision (two different lines happen to have the same hash
21
+ * — extremely rare at 24 bits) the anchor is rejected with
22
+ * `[E_AMBIGUOUS_ANCHOR]`. The model is expected to disambiguate by calling
23
+ * `read` again to get fresh hashes.
24
+ */
25
+
26
+ // Re-export everything from sub-modules to preserve the public API surface.
27
+ // Consumers should import from "./hashline" (this index) and get the same
28
+ // symbols as before the split.
29
+
30
+ export {
31
+ // Hash computation
32
+ HASH_LENGTH,
33
+ HASH_FORMAT,
34
+ HASH_CHARS_CLASS,
35
+ HASHLINE_PREFIX_RE,
36
+ HASHLINE_PREFIX_PLUS_RE,
37
+ DIFF_MINUS_RE,
38
+ HASHLINE_BARE_PREFIX_RE,
39
+ computeLineHashes,
40
+ computeLineHash,
41
+ } from "./hash";
42
+
43
+ export {
44
+ // Parsing
45
+ parseHashRef,
46
+ hashlineParseText,
47
+ type Anchor,
48
+ } from "./parse";
49
+
50
+ export {
51
+ // Resolution and validation
52
+ type ResolvedAnchor,
53
+ type HashlineEdit,
54
+ type ResolvedHashlineEdit,
55
+ type HashlineToolEdit,
56
+ type NoopEdit,
57
+ resolveEditAnchors,
58
+ validateAnchorEdits,
59
+ assertNoBareHashPrefixLines,
60
+ formatMismatchError,
61
+ } from "./resolve";
62
+
63
+ export {
64
+ // Application
65
+ buildLineIndex,
66
+ applyHashlineEdits,
67
+ computeAffectedLineRange,
68
+ formatHashlineRegion,
69
+ computeChangedLineRange,
70
+ } from "./apply";
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Parsing — anchor parsing and edit content preprocessing.
3
+ *
4
+ * This module owns the wire-format parsing for hash anchors and the
5
+ * content preprocessing that rejects display prefixes in edit payloads.
6
+ */
7
+
8
+ import {
9
+ HASH_LENGTH,
10
+ HASH_ALPHABET_RE,
11
+ HASH_CHARS_CLASS,
12
+ HASHLINE_PREFIX_PLUS_RE,
13
+ DIFF_MINUS_RE,
14
+ } from "./hash";
15
+
16
+ // ─── Types ──────────────────────────────────────────────────────────────
17
+
18
+ /**
19
+ * An anchor is just a hash. The hash is the entire wire format for `pos`
20
+ * and `end` — the runtime looks it up in the file's precomputed hash array
21
+ * to find the line. No content may follow the hash.
22
+ */
23
+ export type Anchor = { hash: string }; // `hash`: HASH_LENGTH chars from the hash alphabet
24
+
25
+ // ─── Parsing ────────────────────────────────────────────────────────────
26
+
27
+ function diagnoseHashRef(ref: string): string { // builds the `[E_BAD_REF]` message for a rejected anchor
28
+ const trimmed = ref.trim();
29
+
30
+ if (!trimmed.length) {
31
+ return `[E_BAD_REF] Invalid anchor. Expected a bare ${HASH_LENGTH}-character hash (e.g. "aB3x").`;
32
+ }
33
+
34
+ // Detect the legacy "LINE#HASH" form (5#aB3x, 12#MQ, etc.) so we can
35
+ // give a clear error pointing at the new format.
36
+ if (/^\d+\s*#/.test(trimmed)) {
37
+ return `[E_BAD_REF] Invalid anchor. Use the hash alone (e.g. "aB3x") — no line numbers or trailing content.`;
38
+ }
39
+
40
+ return `[E_BAD_REF] Invalid anchor "${trimmed}". Expected a bare ${HASH_LENGTH}-character hash.`; // length follows HASH_LENGTH; the "aB3x" samples above are still 4-char literals
41
+ }
42
+
43
+ function parseAnchorRef(ref: string): Anchor {
44
+ const trimmed = ref.trim();
45
+
46
+ // Strict: the wire format is a HASH_LENGTH-character (currently 4) hash from
47
+ // the URL-safe base64 alphabet (A-Za-z0-9-_), copied verbatim from `read`
48
+ // output. The first character can be `-` (a valid alphabet char), so a hash
49
+ // like `-qkl` is taken literally. No other form is tolerated: `+`/`-`/`>>>`
50
+ // markers from diff contexts or stale-anchor retry blocks are rejected. The
51
+ // model must copy just the hash with no surrounding characters.
52
+ if (
53
+ trimmed.length === HASH_LENGTH &&
54
+ HASH_ALPHABET_RE.test(trimmed)
55
+ ) {
56
+ return { hash: trimmed };
57
+ }
58
+
59
+ throw new Error(diagnoseHashRef(ref)); // raw `ref` is passed on; diagnoseHashRef trims it again
60
+ }
61
+
62
+ /**
63
+ * Parse a hash anchor. Accepts `HASH` (e.g. `"aB3x"`) only. The
64
+ * `HASH:content` disambiguator from earlier versions is gone — the hash
65
+ * is the entire wire format for `pos` and `end`, and no content may
66
+ * follow it.
67
+ *
68
+ * Throws `[E_BAD_REF]` for malformed input.
69
+ */
70
+ export const parseHashRef = parseAnchorRef; // public alias of the module-private parser
71
+
72
+ // ─── Content preprocessing ──────────────────────────────────────────────
73
+
74
+ /**
75
+ * Reject hashline display prefixes in edit payloads. Strict semantics: the
76
+ * model must send literal file content for `lines`, not the rendered read /
77
+ * diff form. Silent stripping is no longer performed — see AGENTS.md.
78
+ *
79
+ * This covers the unambiguous `+HASH:` / diff `+/-` forms, rejectable on
80
+ * shape alone. The bare `HHHH:` variant (issue #24) is context-dependent and
81
+ * lives in `assertNoBareHashPrefixLines`.
82
+ * @throws Error tagged `[E_INVALID_PATCH]` at the first offending line.
83
+ */
84
+ function assertNoDisplayPrefixes(lines: string[]): void {
85
+ for (const line of lines) {
86
+ if (!line.length) continue; // an empty line cannot carry a prefix
87
+ if (
88
+ HASHLINE_PREFIX_PLUS_RE.test(line) ||
89
+ DIFF_MINUS_RE.test(line)
90
+ ) {
91
+ throw new Error(
92
+ `[E_INVALID_PATCH] "lines" must contain literal file content, not HASH: or diff prefixes. Offending line: ${JSON.stringify(line)}`
93
+ );
94
+ }
95
+ }
96
+ }
97
+
98
+ /**
99
+ * Parse replacement text into lines.
100
+ *
101
+ * String input drops exactly one trailing newline, then has every `\r`
102
+ * removed before splitting on `\n`. Array input is returned as-is (same
103
+ * array), so explicitly provided blank lines remain intact. Display prefixes
104
+ * are rejected by `assertNoDisplayPrefixes`, never silently stripped.
105
+ */
106
+ export function hashlineParseText(edit: string[] | string | null): string[] {
107
+ if (edit === null) return []; // null means "no replacement lines"
108
+ const lines =
109
+ typeof edit === "string"
110
+ ? (edit.endsWith("\n") ? edit.slice(0, -1) : edit)
111
+ .replace(/\r/g, "")
112
+ .split("\n")
113
+ : edit;
114
+ assertNoDisplayPrefixes(lines);
115
+ return lines;
116
+ }