npm - pi-hashline-edit-pro - Versions diffs - 0.2.0 - Mend

pi-hashline-edit-pro 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/LICENSE +21 -0
package/README.md +143 -0
package/index.ts +64 -0
package/package.json +52 -0
package/prompts/edit-snippet.md +1 -0
package/prompts/edit.md +58 -0
package/prompts/read-guidelines.md +3 -0
package/prompts/read-snippet.md +1 -0
package/prompts/read.md +28 -0
package/src/edit-diff.ts +234 -0
package/src/edit-normalize.ts +68 -0
package/src/edit-render.ts +280 -0
package/src/edit-response.ts +531 -0
package/src/edit.ts +689 -0
package/src/file-kind.ts +161 -0
package/src/fs-write.ts +105 -0
package/src/hashline/apply.ts +660 -0
package/src/hashline/hash.ts +192 -0
package/src/hashline/index.ts +70 -0
package/src/hashline/parse.ts +116 -0
package/src/hashline/resolve.ts +552 -0
package/src/path-utils.ts +13 -0
package/src/read.ts +256 -0
package/src/runtime.ts +3 -0
package/src/snapshot.ts +29 -0
package/src/utils.ts +11 -0

package/src/hashline/hash.ts ADDED Viewed

@@ -0,0 +1,192 @@
+/**
+ * Hash computation — xxHash32-based line hashing with occurrence-aware
+ * discriminators.
+ *
+ * This module owns the hash constants, the xxHash32 wrapper, and the
+ * per-line hash computation functions. Every other module that needs
+ * line hashes goes through `computeLineHashes` (full-file) or
+ * `computeLineHash` (single-line helper).
+ */
+import * as XXH from "xxhashjs";
+// ─── Constants ──────────────────────────────────────────────────────────
+/**
+ * Hash length in characters. The original `pi-hashline-edit` uses 2 chars of
+ * a 16-char alphabet (8 bits / 256 buckets); this fork uses 4 chars of a
+ * 64-char alphabet (24 bits / 16 777 216 buckets). With HASH_LENGTH=4, the
+ * birthday paradox stays out of practical concern for any realistic file
+ * size. Bumping to 5 is a one-line change here if you want to push the
+ * threshold further; the cost is one more char per anchor in the `read`
+ * output.
+ */
+export const HASH_LENGTH = 4;
+/**
+ * URL-safe base64 alphabet: A–Z, a–z, 0–9, `-`, `_`. 64 distinct chars
+ * giving 6 bits per hash character. No exclusions, no human-readability
+ * heuristics — the consumer is an LLM that tokenizes, not a human that
+ * squints at pixel glyphs. Note that `-` is the second-to-last character,
+ * not the last: interpolated raw into a regex character class
+ * (`[${HASH_ALPHABET}]`) the tail `9-_` would parse as a range, so the `-`
+ * must be escaped first. Only the trailing `_` is always literal.
+ */
+const HASH_ALPHABET =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+const HASH_ALPHABET_BITS = 6;
+const HASH_ALPHABET_MASK = (1 << HASH_ALPHABET_BITS) - 1;
+// `-` must be escaped when used inside a regex character class — otherwise it
+// forms a range with the preceding char (`9-_` spans ASCII 57–95, which
+// silently swallows the literal `-`). The `_` is always literal.
+const HASH_ALPHABET_REGEX_SAFE = HASH_ALPHABET.replace(/-/g, "\\-");
+const HASH_ALPHABET_RE = new RegExp(`^[${HASH_ALPHABET_REGEX_SAFE}]+$`);
+export const HASH_CHARS_CLASS = `[${HASH_ALPHABET_REGEX_SAFE}]{${HASH_LENGTH}}`;
+/**
+ * Encode the top `HASH_LENGTH * 6` bits of a 32-bit hash value as a
+ * `HASH_LENGTH`-char string in the URL-safe base64 alphabet.
+ *
+ * The 0.2.0/0.3.0 releases pre-computed this mapping as a `DICT` lookup
+ * table. At 3 chars that was 262 144 entries × 3 chars = ~1 MB of static
+ * memory; at 4 chars it would be 16 777 216 entries × 4 chars = ~450 MB
+ * and a multi-second module load. So we now compute the string inline.
+ * The per-line cost is one xxHash32 call plus `HASH_LENGTH` small string
+ * concatenations, which is still nanoseconds — this is called once per
+ * line in `computeLineHashes`, not on a hot path.
+ *
+ * Limit: the encoding needs `HASH_LENGTH * HASH_ALPHABET_BITS <= 32`, i.e.
+ * `HASH_LENGTH <= 5` with the 6-bit alphabet. Beyond that `32 - totalBits`
+ * is negative, and because `>>>` only uses the low 5 bits of its right
+ * operand the shift silently wraps instead of failing.
+ *
+ * @param h - Hash value; `>>>` treats it as an unsigned 32-bit integer.
+ * @returns A string of exactly `HASH_LENGTH` alphabet characters, most
+ *   significant 6-bit group first.
+ */
+function hashToString(h: number): string {
+	const totalBits = HASH_LENGTH * HASH_ALPHABET_BITS;
+	const shift = 32 - totalBits;
+	let n = h >>> shift;
+	let out = "";
+	for (let j = 0; j < HASH_LENGTH; j++) {
+		// Build left-to-right: the first iteration writes the high-order
+		// 6 bits, the last writes the low-order 6 bits.
+		out +=
+			HASH_ALPHABET[
+				(n >>> ((HASH_LENGTH - 1 - j) * HASH_ALPHABET_BITS)) &
+					HASH_ALPHABET_MASK
+			]!;
+	}
+	return out;
+}
+/**
+ * Patterns used to detect (and reject) hashline display prefixes inside edit
+ * payloads. The runtime no longer strips them — the model must send literal
+ * file content. Matching any of these triggers `[E_INVALID_PATCH]`.
+ */
+export const HASHLINE_PREFIX_RE = new RegExp(
+	`^\\s*(?:>>>|>>)?\\s*${HASH_CHARS_CLASS}:`,
+);
+export const HASHLINE_PREFIX_PLUS_RE = new RegExp(
+	`^\\+\\s*${HASH_CHARS_CLASS}:`,
+);
+export const DIFF_MINUS_RE = /^-\s*\d+\s{4}/;
+/**
+ * Bare hashline prefix: a HASH_LENGTH-char hash followed by ":" with no
+ * "LINE#" part (e.g. "aB3x:### heading", "Tq9_:text", "-qkl:"). Capture
+ * group 1 is the hash.
+ *
+ * This is the partial-hash failure mode from issue #24: the model copies a
+ * hash it saw in `read` output into the line content but drops the rest
+ * of the rendered `HASH:content` form. The first 5 characters of the line
+ * (4 alphabet chars + ":") are matched by this regex, then
+ * `assertNoBareHashPrefixLines` rejects the edit with `[E_BARE_HASH_PREFIX]`
+ * so the model gets actionable feedback instead of a silent correctness bug.
+ */
+export const HASHLINE_BARE_PREFIX_RE = new RegExp(`^\\s*(${HASH_CHARS_CLASS}):`);
+/** Matches any Unicode letter or number. A line with no match is "symbol-only" (only punctuation/symbols/whitespace, or empty). */
+const RE_SIGNIFICANT = /[\p{L}\p{N}]/u;
+function xxh32(input: string, seed = 0): number {
+	return XXH.h32(seed).update(input).digest().toNumber() >>> 0;
+}
+/**
+ * Discriminator prefixes for the occurrence-aware hash space.
+ *
+ * `S${lineNumber}` puts symbol-only lines (lone `}`, etc.) into a namespace
+ * keyed by line number, so the same `}` on different lines never collides.
+ *
+ * `C${occurrence}` puts content lines into a namespace keyed by the running
+ * occurrence count of that canonical content, so the same `import {...}` on
+ * different lines never collides either. This is the key behavioural change
+ * from the upstream 2-char hash: identical content now hashes to different
+ * values at different positions, so the model can target a specific
+ * occurrence without resorting to `offset` + a small `limit` window.
+ */
+const SYMBOL_DISCRIMINATOR = (lineNumber: number): string => `S${lineNumber}`;
+const CONTENT_DISCRIMINATOR = (occurrence: number): string => `C${occurrence}`;
+function canonicalizeLine(line: string): string {
+	return line.replace(/\r/g, "").trimEnd();
+}
+function isSymbolOnly(canonical: string): boolean {
+	return !RE_SIGNIFICANT.test(canonical);
+}
+/**
+ * Compute hashes for every line of the file.
+ *
+ * The content is split on "\n" and each line is canonicalized (every "\r"
+ * removed, trailing whitespace trimmed) before hashing, so lines that differ
+ * only in trailing whitespace or line-ending style count as the same content.
+ *
+ * Returns an array of length `lines.length`, where index `i` is the hash of
+ * line `i + 1` (1-indexed). Content lines that share the same canonical text
+ * get different hashes based on which occurrence they are (1st, 2nd, ...).
+ * Symbol-only lines (no letter or number, including empty lines) are keyed
+ * by their 1-based line number instead of an occurrence count. Because of
+ * the plain split, content ending in "\n" produces a final entry for the
+ * empty string after the last newline.
+ *
+ * The runtime always works from a precomputed array so that all validation,
+ * formatting, and error-message code paths see the same hash for a given line.
+ * The standalone `computeLineHash(idx, line)` helper below is kept for
+ * single-line use (e.g. diff-preview formatting) where occurrence context
+ * is not available; it treats the input as a 1st-occurrence content line.
+ *
+ * @param content - Full file content as a single string.
+ * @returns One `HASH_LENGTH`-char hash per line, in file order.
+ */
+export function computeLineHashes(content: string): string[] {
+	const lines = content.split("\n");
+	const hashes = new Array<string>(lines.length);
+	const counts = new Map<string, number>();
+	for (let i = 0; i < lines.length; i++) {
+		const lineNumber = i + 1;
+		const canonical = canonicalizeLine(lines[i]!);
+		let discriminator: string;
+		if (isSymbolOnly(canonical)) {
+			discriminator = SYMBOL_DISCRIMINATOR(lineNumber);
+		} else {
+			const occurrence = (counts.get(canonical) ?? 0) + 1;
+			counts.set(canonical, occurrence);
+			discriminator = CONTENT_DISCRIMINATOR(occurrence);
+		}
+		hashes[i] = hashToString(xxh32(`${discriminator}:${canonical}`));
+	}
+	return hashes;
+}
+/**
+ * Single-line hash for callers that don't have the full file context.
+ *
+ * Content lines are always hashed as a 1st occurrence. Symbol-only lines are
+ * hashed with `idx` used verbatim as the discriminator; `computeLineHashes`
+ * uses the 1-based line number in that position, so `idx` must be a 1-based
+ * line number (not a 0-based array index) for the two functions to agree.
+ *
+ * This is the right answer for diff-preview formatting and for tests that
+ * build anchors one line at a time, but it is NOT the same as the hash that
+ * `computeLineHashes` would produce for the same line in a file with
+ * duplicate content (2nd and later occurrences hash differently there).
+ * Production validation always uses `computeLineHashes` + per-line lookup.
+ *
+ * @param idx - Line number of `line`; only used when the line is symbol-only.
+ * @param line - Raw line text; canonicalized before hashing.
+ * @returns The `HASH_LENGTH`-char hash string.
+ */
+export function computeLineHash(idx: number, line: string): string {
+	const canonical = canonicalizeLine(line);
+	const discriminator = isSymbolOnly(canonical)
+		? SYMBOL_DISCRIMINATOR(idx)
+		: CONTENT_DISCRIMINATOR(1);
+	return hashToString(xxh32(`${discriminator}:${canonical}`));
+}
+/** Exported for tests and for downstream tools that want to mirror the format. */
+export const HASH_FORMAT = {
+	length: HASH_LENGTH,
+	bitsPerChar: HASH_ALPHABET_BITS,
+	alphabet: HASH_ALPHABET,
+};
+/** Re-export HASH_ALPHABET_RE for parse module */
+export { HASH_ALPHABET_RE };

package/src/hashline/index.ts ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * Hashline engine — hash-anchored line editing.
+ *
+ * Forked from pi-hashline-edit (MIT, github.com/RimuruW/pi-hashline-edit),
+ * which was vendored & adapted from oh-my-pi (MIT, github.com/can1357/oh-my-pi).
+ *
+ * This fork preserves the strict semantics of the original (no silent
+ * relocation, no autocorrection heuristics, no fuzzy fallback) and uses a
+ * 4-character hash over a 64-character URL-safe base64 alphabet, giving
+ * 24 bits of entropy (16 777 216 buckets) per anchor. Birthday-paradox
+ * collisions become effectively zero for any realistic file size. The
+ * alphabet is sized for an LLM consumer, not a human reader — the model
+ * tokenizes, it does not squint at pixel glyphs.
+ *
+ * Anchor format: a bare hash alone (`aB3x`). The line number is no longer
+ * part of the wire format, and no content may follow the hash either. The
+ * model never has to type a line number; the runtime resolves each hash to
+ * a line via the file's precomputed hash array.
+ *
+ * On a hash collision (two different lines happen to have the same hash
+ * — extremely rare at 24 bits) the anchor is rejected with
+ * `[E_AMBIGUOUS_ANCHOR]`. The model is expected to disambiguate by calling
+ * `read` again to get fresh hashes.
+ */
+// Re-export everything from sub-modules to preserve the public API surface.
+// Consumers should import from "./hashline" (this index) and get the same
+// symbols as before the split.
+export {
+	// Hash computation
+	HASH_LENGTH,
+	HASH_FORMAT,
+	HASH_CHARS_CLASS,
+	HASHLINE_PREFIX_RE,
+	HASHLINE_PREFIX_PLUS_RE,
+	DIFF_MINUS_RE,
+	HASHLINE_BARE_PREFIX_RE,
+	computeLineHashes,
+	computeLineHash,
+} from "./hash";
+export {
+	// Parsing
+	parseHashRef,
+	hashlineParseText,
+	type Anchor,
+} from "./parse";
+export {
+	// Resolution and validation
+	type ResolvedAnchor,
+	type HashlineEdit,
+	type ResolvedHashlineEdit,
+	type HashlineToolEdit,
+	type NoopEdit,
+	resolveEditAnchors,
+	validateAnchorEdits,
+	assertNoBareHashPrefixLines,
+	formatMismatchError,
+} from "./resolve";
+export {
+	// Application
+	buildLineIndex,
+	applyHashlineEdits,
+	computeAffectedLineRange,
+	formatHashlineRegion,
+	computeChangedLineRange,
+} from "./apply";

package/src/hashline/parse.ts ADDED Viewed

@@ -0,0 +1,116 @@
+/**
+ * Parsing — anchor parsing and edit content preprocessing.
+ *
+ * This module owns the wire-format parsing for hash anchors and the
+ * content preprocessing that rejects display prefixes in edit payloads.
+ */
+import {
+	HASH_LENGTH,
+	HASH_ALPHABET_RE,
+	HASH_CHARS_CLASS,
+	HASHLINE_PREFIX_PLUS_RE,
+	DIFF_MINUS_RE,
+} from "./hash";
+// ─── Types ──────────────────────────────────────────────────────────────
+/**
+ * An anchor is just a hash. The hash is the entire wire format for `pos`
+ * and `end` — the runtime looks it up in the file's precomputed hash array
+ * to find the line. No content may follow the hash.
+ */
+export type Anchor = { hash: string };
+// ─── Parsing ────────────────────────────────────────────────────────────
+/**
+ * Build the `[E_BAD_REF]` message for an anchor that failed validation.
+ *
+ * The hash length quoted in the message and the example hash are both
+ * derived from `HASH_LENGTH`, so the wording stays correct if that constant
+ * is changed. With `HASH_LENGTH = 4` the messages read "4-character" and
+ * use the example "aB3x".
+ *
+ * @param ref - The raw anchor string as received; it is trimmed before use.
+ * @returns The error message text. This function does not throw.
+ */
+function diagnoseHashRef(ref: string): string {
+	const trimmed = ref.trim();
+	// Cut a fixed sample down to the configured hash length.
+	const example = "aB3xK9".slice(0, HASH_LENGTH);
+	if (!trimmed.length) {
+		return `[E_BAD_REF] Invalid anchor. Expected a bare ${HASH_LENGTH}-character hash (e.g. "${example}").`;
+	}
+	// Detect the legacy "LINE#HASH" form (5#aB3x, 12#MQ, etc.) so we can
+	// give a clear error pointing at the new format.
+	if (/^\d+\s*#/.test(trimmed)) {
+		return `[E_BAD_REF] Invalid anchor. Use the hash alone (e.g. "${example}") — no line numbers or trailing content.`;
+	}
+	return `[E_BAD_REF] Invalid anchor "${trimmed}". Expected a bare ${HASH_LENGTH}-character hash.`;
+}
+function parseAnchorRef(ref: string): Anchor {
+	const trimmed = ref.trim();
+	// Strict: the wire format is a 4-character hash from the URL-safe base64
+	// alphabet (A-Za-z0-9-_), copied verbatim from `read` output. The first
+	// character can be `-` (a valid alphabet char), so a hash like `-qkl` is
+	// taken literally. No other form is tolerated: `+`/`-`/`>>>` markers from
+	// diff contexts or stale-anchor retry blocks are rejected. The model must
+	// copy just the 4-character hash with no surrounding characters.
+	if (
+		trimmed.length === HASH_LENGTH &&
+		HASH_ALPHABET_RE.test(trimmed)
+	) {
+		return { hash: trimmed };
+	}
+	throw new Error(diagnoseHashRef(ref));
+}
+/**
+ * Parse a hash anchor. Accepts `HASH` (e.g. `"aB3x"`) only. The
+ * `HASH:content` disambiguator from earlier versions is gone — the hash
+ * is the entire wire format for `pos` and `end`, and no content may
+ * follow it.
+ *
+ * Leading and trailing whitespace around the anchor is trimmed before
+ * validation; the trimmed text must be exactly `HASH_LENGTH` characters, all
+ * from the hash alphabet. On success the result is `{ hash }` holding the
+ * trimmed string. This export is an alias of `parseAnchorRef`.
+ *
+ * Throws an `Error` whose message starts with `[E_BAD_REF]` for malformed
+ * input.
+ */
+export const parseHashRef = parseAnchorRef;
+// ─── Content preprocessing ──────────────────────────────────────────────
+/**
+ * Reject hashline display prefixes in edit payloads. Strict semantics: the
+ * model must send literal file content for `lines`, not the rendered read /
+ * diff form. Silent stripping is no longer performed — see AGENTS.md.
+ *
+ * Two shapes are tested here, both rejectable on shape alone:
+ * `HASHLINE_PREFIX_PLUS_RE` (a leading `+`, optional whitespace, a hash,
+ * then `:`) and `DIFF_MINUS_RE` (a leading `-`, optional whitespace, digits,
+ * then four whitespace characters). The bare `HHHH:` variant (issue #24) is
+ * context-dependent and lives in `assertNoBareHashPrefixLines`.
+ *
+ * Empty lines are skipped. The check stops at the first offending line and
+ * throws an `Error` whose message starts with `[E_INVALID_PATCH]` and quotes
+ * that line as JSON.
+ */
+function assertNoDisplayPrefixes(lines: string[]): void {
+	for (const line of lines) {
+		if (!line.length) continue;
+		if (
+			HASHLINE_PREFIX_PLUS_RE.test(line) ||
+			DIFF_MINUS_RE.test(line)
+		) {
+			throw new Error(
+			`[E_INVALID_PATCH] "lines" must contain literal file content, not HASH: or diff prefixes. Offending line: ${JSON.stringify(line)}`
+			);
+		}
+	}
+}
+/**
+ * Parse replacement text into lines.
+ *
+ * - `null` yields an empty array.
+ * - String input drops exactly one trailing "\n" (if present), then removes
+ *   every "\r" character — not only those in CRLF pairs — and splits on
+ *   "\n", matching read-preview style content. An empty string therefore
+ *   yields `[""]`, a single empty line.
+ * - Array input is passed through verbatim (the same array is returned, with
+ *   no copy and no CR removal), so explicitly provided blank lines remain
+ *   intact.
+ *
+ * Display prefixes are rejected by `assertNoDisplayPrefixes`, never silently
+ * stripped.
+ *
+ * @param edit - Replacement content as one string, an array of lines, or `null`.
+ * @returns The replacement lines.
+ * @throws Error with `[E_INVALID_PATCH]` if any line carries a display prefix.
+ */
+export function hashlineParseText(edit: string[] | string | null): string[] {
+	if (edit === null) return [];
+	const lines =
+		typeof edit === "string"
+			? (edit.endsWith("\n") ? edit.slice(0, -1) : edit)
+					.replaceAll("\r", "")
+					.split("\n")
+			: edit;
+	assertNoDisplayPrefixes(lines);
+	return lines;
+}