npm - @ctxr/skill-llm-wiki - Versions diffs - 1.0.1 - Mend

@ctxr/skill-llm-wiki 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

package/CHANGELOG.md +134 -0
package/LICENSE +21 -0
package/README.md +484 -0
package/SKILL.md +252 -0
package/guide/basics/concepts.md +74 -0
package/guide/basics/index.md +45 -0
package/guide/basics/schema.md +140 -0
package/guide/cli.md +256 -0
package/guide/correctness/index.md +45 -0
package/guide/correctness/invariants.md +89 -0
package/guide/correctness/safety.md +96 -0
package/guide/history/diff.md +110 -0
package/guide/history/hidden-git.md +130 -0
package/guide/history/index.md +52 -0
package/guide/history/remote-sync.md +113 -0
package/guide/index.md +134 -0
package/guide/isolation/coexistence.md +134 -0
package/guide/isolation/index.md +44 -0
package/guide/isolation/scale.md +251 -0
package/guide/layout/in-place-mode.md +97 -0
package/guide/layout/index.md +53 -0
package/guide/layout/layout-contract.md +131 -0
package/guide/layout/layout-modes.md +115 -0
package/guide/operations/index.md +76 -0
package/guide/operations/ingest/build.md +75 -0
package/guide/operations/ingest/extend.md +61 -0
package/guide/operations/ingest/index.md +54 -0
package/guide/operations/ingest/join.md +65 -0
package/guide/operations/maintain/fix.md +66 -0
package/guide/operations/maintain/index.md +47 -0
package/guide/operations/maintain/rebuild.md +86 -0
package/guide/operations/validate.md +48 -0
package/guide/substrate/index.md +47 -0
package/guide/substrate/operators.md +96 -0
package/guide/substrate/tiered-ai.md +363 -0
package/guide/ux/index.md +44 -0
package/guide/ux/preflight.md +150 -0
package/guide/ux/user-intent.md +135 -0
package/package.json +55 -0
package/scripts/cli.mjs +893 -0
package/scripts/commands/remote.mjs +93 -0
package/scripts/commands/review.mjs +253 -0
package/scripts/commands/sync.mjs +84 -0
package/scripts/lib/chunk.mjs +421 -0
package/scripts/lib/cluster-detect.mjs +516 -0
package/scripts/lib/decision-log.mjs +343 -0
package/scripts/lib/draft.mjs +158 -0
package/scripts/lib/embeddings.mjs +366 -0
package/scripts/lib/frontmatter.mjs +497 -0
package/scripts/lib/git-commands.mjs +155 -0
package/scripts/lib/git.mjs +486 -0
package/scripts/lib/gitignore.mjs +62 -0
package/scripts/lib/history.mjs +331 -0
package/scripts/lib/indices.mjs +510 -0
package/scripts/lib/ingest.mjs +258 -0
package/scripts/lib/intent.mjs +713 -0
package/scripts/lib/interactive.mjs +99 -0
package/scripts/lib/migrate.mjs +126 -0
package/scripts/lib/nest-applier.mjs +260 -0
package/scripts/lib/operators.mjs +1365 -0
package/scripts/lib/orchestrator.mjs +718 -0
package/scripts/lib/paths.mjs +197 -0
package/scripts/lib/preflight.mjs +213 -0
package/scripts/lib/provenance.mjs +672 -0
package/scripts/lib/quality-metric.mjs +269 -0
package/scripts/lib/query-fixture.mjs +71 -0
package/scripts/lib/rollback.mjs +95 -0
package/scripts/lib/shape-check.mjs +172 -0
package/scripts/lib/similarity-cache.mjs +126 -0
package/scripts/lib/similarity.mjs +230 -0
package/scripts/lib/snapshot.mjs +54 -0
package/scripts/lib/source-frontmatter.mjs +85 -0
package/scripts/lib/tier2-protocol.mjs +470 -0
package/scripts/lib/tiered.mjs +453 -0
package/scripts/lib/validate.mjs +362 -0

package/scripts/lib/similarity-cache.mjs ADDED Viewed

@@ -0,0 +1,126 @@
+// similarity-cache.mjs — pairwise memoisation of tiered similarity
+// decisions. Keyed by the sorted pair of content hashes so (a,b) and
+// (b,a) resolve to the same entry. Invalidated implicitly when either
+// entry's hash changes — the key simply doesn't match anymore.
+//
+// Cache entries are JSON files under `<wiki>/.llmwiki/similarity-cache/`.
+// One file per pair. The filename is derived from the sorted hashes
+// with sha256 collapsing to keep the name short and filesystem-safe.
+// The payload carries the tier, similarity, decision, and the tier
+// at which the decision was resolved — tests read it back to verify
+// caching prevented redundant work.
+import { createHash } from "node:crypto";
+import {
+  existsSync,
+  mkdirSync,
+  readFileSync,
+  readdirSync,
+  renameSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { join } from "node:path";
/**
 * Resolve the directory that holds the similarity-cache entries of a wiki.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {string} `<wikiRoot>/.llmwiki/similarity-cache`.
 */
export function cacheDir(wikiRoot) {
  const segments = [".llmwiki", "similarity-cache"];
  return join(wikiRoot, ...segments);
}
/**
 * Deterministic, order-independent cache key for a pair of content hashes.
 *
 * The two hashes are sorted before being combined, so `(a, b)` and `(b, a)`
 * produce the same key. The sorted pair is joined with a NUL separator and
 * run through sha256; the hex digest is truncated to 32 characters (128 bits)
 * so the key is short and filesystem-safe.
 *
 * @param {string} hashA - Content hash of the first entry.
 * @param {string} hashB - Content hash of the second entry.
 * @returns {string} 32 lowercase hex characters.
 * @throws {Error} When either argument is not a non-empty string.
 */
export function cacheKey(hashA, hashB) {
  // Check the type as well as emptiness: a truthy non-string (number, object)
  // would otherwise be coerced during concatenation and compared with `<=`
  // under numeric rules, which can break the symmetric ordering.
  const isUsable = (hash) => typeof hash === "string" && hash.length > 0;
  if (!isUsable(hashA) || !isUsable(hashB)) {
    throw new Error("similarity-cache: both hashes must be non-empty strings");
  }
  const [first, second] = hashA <= hashB ? [hashA, hashB] : [hashB, hashA];
  return createHash("sha256")
    .update([first, second].join("\0"))
    .digest("hex")
    .slice(0, 32);
}
/**
 * Absolute-or-relative path of the JSON cache file for a hash pair.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} hashA - Content hash of the first entry.
 * @param {string} hashB - Content hash of the second entry.
 * @returns {string} `<cacheDir>/<cacheKey>.json`.
 */
export function cacheEntryPath(wikiRoot, hashA, hashB) {
  const dir = cacheDir(wikiRoot);
  const fileName = cacheKey(hashA, hashB) + ".json";
  return join(dir, fileName);
}
/**
 * Look up the cached decision for a hash pair.
 *
 * A missing file, unreadable file, invalid JSON, or a payload lacking the
 * expected `tier` (number), `similarity` (number) and `decision` (string)
 * fields are all reported as a miss, so the caller can simply recompute and
 * overwrite the entry. Parse errors never propagate.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} hashA - Content hash of the first entry.
 * @param {string} hashB - Content hash of the second entry.
 * @returns {object|null} The parsed cache payload, or `null` on a miss.
 */
export function readCached(wikiRoot, hashA, hashB) {
  const entryPath = cacheEntryPath(wikiRoot, hashA, hashB);
  if (!existsSync(entryPath)) {
    return null;
  }
  let entry;
  try {
    entry = JSON.parse(readFileSync(entryPath, "utf8"));
  } catch {
    // Corrupt or unreadable entry: treat as a miss.
    return null;
  }
  const wellFormed =
    entry !== null &&
    typeof entry === "object" &&
    typeof entry.tier === "number" &&
    typeof entry.similarity === "number" &&
    typeof entry.decision === "string";
  return wellFormed ? entry : null;
}
/**
 * Persist a decision for a hash pair, atomically (temp file + rename).
 *
 * The stored payload contains `tier`, `similarity`, `decision`,
 * `confidence_band` (or `null` when absent) and a `cached_at` ISO timestamp.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} hashA - Content hash of the first entry.
 * @param {string} hashB - Content hash of the second entry.
 * @param {object} decision - Decision object to persist.
 * @returns {void}
 * @throws {Error} When `decision` is not an object, or when the write or
 *   rename fails (the original filesystem error is rethrown).
 */
export function writeCached(wikiRoot, hashA, hashB, decision) {
  if (!decision || typeof decision !== "object") {
    throw new Error("similarity-cache: decision must be an object");
  }
  const dir = cacheDir(wikiRoot);
  mkdirSync(dir, { recursive: true });
  const path = cacheEntryPath(wikiRoot, hashA, hashB);
  const payload = JSON.stringify({
    tier: decision.tier,
    similarity: decision.similarity,
    decision: decision.decision,
    confidence_band: decision.confidence_band ?? null,
    cached_at: new Date().toISOString(),
  });
  const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
  try {
    writeFileSync(tmp, payload, "utf8");
    renameSync(tmp, path);
  } catch (err) {
    // Do not leave an orphaned temp file behind: its name does not end in
    // ".json", so the cache-clearing sweep would never pick it up.
    try {
      rmSync(tmp, { force: true });
    } catch {
      /* best-effort cleanup; the original failure is what matters */
    }
    throw err;
  }
}
/**
 * Delete every `.json` entry in the similarity cache of a wiki.
 *
 * Safe to call when the cache directory does not exist. Individual removal
 * failures are ignored (best-effort) and are not counted.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {number} Number of entries removed.
 */
export function clearCache(wikiRoot) {
  const dir = cacheDir(wikiRoot);
  if (!existsSync(dir)) {
    return 0;
  }
  const entryNames = readdirSync(dir).filter((name) => name.endsWith(".json"));
  let removed = 0;
  for (const entryName of entryNames) {
    try {
      rmSync(join(dir, entryName), { force: true });
      removed += 1;
    } catch {
      /* best-effort */
    }
  }
  return removed;
}
/**
 * Count the `.json` entries currently in the similarity cache.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @returns {number} Entry count; 0 when the cache directory does not exist.
 */
export function cacheSize(wikiRoot) {
  const dir = cacheDir(wikiRoot);
  if (!existsSync(dir)) {
    return 0;
  }
  return readdirSync(dir).filter((name) => name.endsWith(".json")).length;
}

package/scripts/lib/similarity.mjs ADDED Viewed

@@ -0,0 +1,230 @@
+// similarity.mjs — Tier 0 of the tiered AI ladder (methodology §8.5).
+//
+// Pure TF-IDF + cosine similarity over entry frontmatters. No external
+// dependencies. Deterministic. Cheap enough to run on every pairwise
+// check without concern. The ladder escalates to Tier 1 (local
+// embeddings) only when Tier 0's confidence is mid-band.
+//
+// Scope restriction: this module operates on frontmatter fields only
+// (focus + covers[] + tags + domains). Bodies are never touched — that
+// is the whole point of Phase 5's chunk iterator, and Phase 6 honours
+// it at the substrate level.
+//
+// Thresholds are tunable via `<wiki>/.llmwiki/config.yaml` in
+// future; Phase 6 ships the defaults as exported constants so tests
+// and `tiered.mjs` can reference them without drift.
// Tier 0 decision thresholds on cosine similarity: at or above the first
// value a pair is decided "same"; at or below the second it is decided
// "different"; anything in between is escalated.
export const TIER0_DECISIVE_SAME = 0.85;
export const TIER0_DECISIVE_DIFFERENT = 0.3;
// Small embedded English stopword list — intentionally narrow. A fuller
// list tends to filter too aggressively and hides real content signals in
// short covers[] strings. The covers field uses terse technical phrases;
// dropping articles and connectives is enough.
const STOPWORDS = new Set([
  "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
  "has", "have", "he", "her", "his", "i", "in", "is", "it", "its",
  "of", "on", "or", "she", "that", "the", "their", "they", "this",
  "to", "was", "we", "were", "will", "with",
]);

/**
 * Split text into comparison tokens.
 *
 * The text is normalised to NFC, lowercased, and split on every run of
 * characters that are neither Unicode letters (`\p{L}`) nor Unicode numbers
 * (`\p{N}`). Tokens shorter than two UTF-16 code units and tokens in the
 * stopword list are dropped. Numbers are kept, so suffixes like "v1" / "v2"
 * survive.
 *
 * @param {string} text - Text to tokenise.
 * @returns {string[]} Tokens in order of appearance; `[]` for empty or
 *   non-string input.
 */
export function tokenize(text) {
  if (!text || typeof text !== "string") return [];
  return (
    text
      // Compose first: in decomposed form an accent is a separate combining
      // mark, which is neither a letter nor a number, so "e" + U+0301 would be
      // split apart and tokenise differently from the precomposed "é".
      .normalize("NFC")
      .toLowerCase()
      .split(/[^\p{L}\p{N}]+/u)
      .filter((token) => token.length >= 2 && !STOPWORDS.has(token))
  );
}
/**
 * Count how often each token occurs.
 *
 * @param {Iterable<string>} tokens - Token list.
 * @returns {Map<string, number>} Term → raw occurrence count, keyed in order
 *   of first appearance.
 */
function tf(tokens) {
  const counts = new Map();
  for (const token of tokens) {
    const seenSoFar = counts.has(token) ? counts.get(token) : 0;
    counts.set(token, seenSoFar + 1);
  }
  return counts;
}
/**
 * Compute smoothed inverse-document-frequency weights for a corpus.
 *
 * Uses the scikit-learn smoothed form `log((1 + N) / (1 + df)) + 1`, where
 * `N` is the number of token lists and `df` is the number of lists that
 * contain the term. The weight is never negative: a term present in every
 * list gets exactly 1.0 and rarer terms get larger weights, e.g.
 *
 *   N=1, df=1 → 1.000      N=2, df=1 → ≈ 1.405
 *   N=3, df=3 → 1.000      N=3, df=1 → ≈ 1.693
 *
 * @param {Array<Iterable<string>>} tokenLists - One token list per document.
 * @returns {Map<string, number>} Term → idf weight.
 */
export function computeIdf(tokenLists) {
  const documentFrequency = new Map();
  for (const tokens of tokenLists) {
    // A Set collapses repeats so each document counts a term at most once.
    for (const term of new Set(tokens)) {
      documentFrequency.set(term, (documentFrequency.get(term) ?? 0) + 1);
    }
  }
  const documentCount = tokenLists.length;
  const weights = new Map();
  documentFrequency.forEach((frequency, term) => {
    weights.set(term, Math.log((1 + documentCount) / (1 + frequency)) + 1);
  });
  return weights;
}
/**
 * Precompute the comparison text, tokens and IDF weights for a pool of
 * entries.
 *
 * Callers that compare many pairs from the same pool can build this once and
 * reuse it instead of recomputing the IDF statistics for every pair.
 *
 * @param {object[]} entries - Entry frontmatter objects forming the pool.
 * @returns {{ texts: string[], tokenLists: string[][], idfMap: Map<string, number> }}
 *   Per-entry texts and token lists (same order as `entries`) plus the
 *   pool-wide idf map.
 */
export function buildComparisonModel(entries) {
  const texts = entries.map((entry) => entryText(entry));
  const tokenLists = texts.map((text) => tokenize(text));
  return {
    texts,
    tokenLists,
    idfMap: computeIdf(tokenLists),
  };
}
/**
 * Build a sparse tf-idf vector for a token list.
 *
 * Each term's raw frequency is multiplied by its idf weight. Terms that have
 * no weight in `idfMap` are omitted from the result.
 *
 * @param {Iterable<string>} tokens - Tokens of one document.
 * @param {Map<string, number>} idfMap - Term → idf weight.
 * @returns {Map<string, number>} Term → tf-idf weight.
 */
export function tfidfVector(tokens, idfMap) {
  const weighted = new Map();
  tf(tokens).forEach((frequency, term) => {
    const weight = idfMap.get(term);
    if (weight !== undefined) {
      weighted.set(term, frequency * weight);
    }
  });
  return weighted;
}
/**
 * Cosine similarity between two sparse vectors stored as Maps.
 *
 * @param {Map<string, number>} a - First term → weight vector.
 * @param {Map<string, number>} b - Second term → weight vector.
 * @returns {number} Similarity clamped to [-1, 1]; within [0, 1] when all
 *   weights are non-negative. Missing, empty or zero-norm vectors yield 0
 *   rather than NaN.
 */
export function cosine(a, b) {
  if (!a || !b) return 0;
  if (a.size === 0 || b.size === 0) return 0;
  // Only terms present in both vectors contribute to the dot product, so it
  // is enough to walk the smaller map and probe the larger one.
  const [small, large] = a.size <= b.size ? [a, b] : [b, a];
  let dot = 0;
  for (const [term, weight] of small) {
    dot += weight * (large.get(term) ?? 0);
  }
  let normA = 0;
  let normB = 0;
  for (const w of a.values()) normA += w * w;
  for (const w of b.values()) normB += w * w;
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  if (denom === 0) return 0;
  // Floating-point rounding can push the quotient a hair outside the
  // mathematical range (identical vectors may come out just above 1), so
  // clamp to keep the documented bounds.
  return Math.min(1, Math.max(-1, dot / denom));
}
/**
 * Build the comparison text for a single entry from its frontmatter.
 *
 * Only `focus`, `covers`, `tags` and `domains` are read. `focus` is included
 * twice so it carries double weight; the three list fields contribute their
 * string members (non-strings are skipped) joined by spaces.
 *
 * @param {object} data - Entry frontmatter.
 * @returns {string} Space-joined text, trimmed at both ends; `""` when `data`
 *   is not an object or none of the fields are usable.
 */
export function entryText(data) {
  if (!data || typeof data !== "object") return "";
  // Each array field yields exactly one segment (possibly empty); any other
  // value yields none.
  const segmentOf = (field) =>
    Array.isArray(field)
      ? [field.filter((item) => typeof item === "string").join(" ")]
      : [];
  const focusSegments =
    typeof data.focus === "string" ? [data.focus, data.focus] : [];
  const segments = [
    ...focusSegments,
    ...segmentOf(data.covers),
    ...segmentOf(data.tags),
    ...segmentOf(data.domains),
  ];
  return segments.join(" ").trim();
}
/**
 * Compare two entries via Tier 0 (TF-IDF + cosine over frontmatter text).
 *
 * `decision` is one of "same" / "different" / "escalate" / "undecidable".
 * "undecidable" is returned when either entry has no comparison text, or
 * when its text yields no tokens at all (for example only stopwords or
 * one-character terms). Callers must treat "undecidable" as a hard stop for
 * the pair and NOT escalate it to Tier 1/2: an empty-text pair embeds to
 * whatever the model emits for empty input, which collapses to near-1.0
 * cosine and would cause spurious MERGE decisions.
 *
 * @param {object} a - Frontmatter of the first entry.
 * @param {object} b - Frontmatter of the second entry.
 * @param {object[]|null} [corpusContext=null] - Entries providing the IDF
 *   statistics (for sibling comparison, the full set of siblings). When
 *   empty or null, the pair itself is the corpus.
 * @param {object} [options]
 * @param {number} [options.sameThreshold=TIER0_DECISIVE_SAME] - Similarity at
 *   or above which the pair is decided "same".
 * @param {number} [options.differentThreshold=TIER0_DECISIVE_DIFFERENT] -
 *   Similarity at or below which the pair is decided "different".
 * @param {object|null} [options.precomputedModel=null] - Result of
 *   `buildComparisonModel(corpusContext)`; when given, its `idfMap` is reused
 *   instead of recomputing IDF for this pair.
 * @returns {{ tier: number, similarity: number, decision: string,
 *   confidence_band: string, reason: (string|null) }}
 */
export function compareEntries(
  a,
  b,
  corpusContext = null,
  {
    sameThreshold = TIER0_DECISIVE_SAME,
    differentThreshold = TIER0_DECISIVE_DIFFERENT,
    precomputedModel = null,
  } = {},
) {
  const undecidable = (reason) => ({
    tier: 0,
    similarity: 0,
    decision: "undecidable",
    confidence_band: "insufficient-text",
    reason,
  });

  const textA = entryText(a);
  const textB = entryText(b);
  if (textA === "" || textB === "") {
    return undecidable("one or both entries had empty frontmatter text");
  }

  const tokensA = tokenize(textA);
  const tokensB = tokenize(textB);
  // Text that tokenises to nothing carries no evidence either way. Without
  // this guard the zero vector scores 0 and is reported as a decisive
  // "different".
  if (tokensA.length === 0 || tokensB.length === 0) {
    return undecidable("one or both entries had no usable tokens");
  }

  let idfMap;
  if (precomputedModel) {
    idfMap = precomputedModel.idfMap;
  } else {
    const contextList =
      corpusContext && corpusContext.length > 0
        ? corpusContext.map((e) => tokenize(entryText(e)))
        : [tokensA, tokensB];
    idfMap = computeIdf(contextList);
  }

  const similarity = cosine(
    tfidfVector(tokensA, idfMap),
    tfidfVector(tokensB, idfMap),
  );

  let decision;
  let band;
  if (similarity >= sameThreshold) {
    decision = "same";
    band = "decisive-same";
  } else if (similarity <= differentThreshold) {
    decision = "different";
    band = "decisive-different";
  } else {
    decision = "escalate";
    band = "mid-band";
  }
  return {
    tier: 0,
    similarity,
    decision,
    confidence_band: band,
    reason: null,
  };
}

package/scripts/lib/snapshot.mjs ADDED Viewed

@@ -0,0 +1,54 @@
+// snapshot.mjs — create the pre-op snapshot commit that every top-level
+// operation anchors rollback to. The snapshot captures every byte of every
+// tracked wiki file at the moment before the operation starts.
+import {
+  gitCommit,
+  gitHeadSha,
+  gitInit,
+  gitRunChecked,
+  gitTag,
+  gitWorkingTreeClean,
+} from "./git.mjs";
+import { ensureWikiGitignore } from "./gitignore.mjs";
+export { ensureWikiGitignore };
/**
 * Create the pre-operation snapshot that rollback anchors to.
 *
 * Steps, in order:
 *   1. Ensure the private repo exists (`gitInit`).
 *   2. Ensure the wiki-local .gitignore is present.
 *   3. `git add -A`.
 *   4. If the working tree is not clean, commit "pre-op <opId>"; otherwise
 *      skip the commit.
 *   5. Tag HEAD as `pre-op/<opId>`.
 *
 * Tag naming: the pre-op anchor lives under `refs/tags/pre-op/` and the final
 * tag under `refs/tags/op/`. Separate ref subdirectories avoid git's "cannot
 * create ref X: X/y exists" hierarchy collision that the earlier
 * `op/<id>/pre` + `op/<id>` scheme ran into, because git treats the slash as
 * a directory boundary. Rollback's `pre-<op-id>` shorthand resolves to
 * `pre-op/<op-id>`.
 *
 * Tag creation is meant to be loud on collision: if `pre-op/<opId>` already
 * exists pointing elsewhere, `gitTag` is expected to throw rather than
 * silently overwrite a prior rollback anchor.
 *
 * @param {string} wikiRoot - Root directory of the wiki.
 * @param {string} opId - Non-empty operation identifier.
 * @returns {{ initialized: *, tag: string, committed: boolean, sha: * }}
 *   `initialized` is taken from the `gitInit` result, `committed` tells
 *   whether a new pre-op commit was written, and `sha` is the HEAD SHA read
 *   after tagging.
 * @throws {Error} When `opId` is not a non-empty string.
 */
export function preOpSnapshot(wikiRoot, opId) {
  const opIdUsable = typeof opId === "string" && opId !== "";
  if (!opIdUsable) {
    throw new Error("preOpSnapshot requires a non-empty opId string");
  }

  const repo = gitInit(wikiRoot);
  ensureWikiGitignore(wikiRoot);
  gitRunChecked(wikiRoot, ["add", "-A"]);

  // Only write a commit when staging actually produced a difference.
  const committed = !gitWorkingTreeClean(wikiRoot);
  if (committed) {
    gitCommit(wikiRoot, `pre-op ${opId}`);
  }

  const tag = `pre-op/${opId}`;
  gitTag(wikiRoot, tag, "HEAD");
  const sha = gitHeadSha(wikiRoot);
  return { initialized: repo.initialized, tag, committed, sha };
}

package/scripts/lib/source-frontmatter.mjs ADDED Viewed

@@ -0,0 +1,85 @@
+// source-frontmatter.mjs — read frontmatter from source files via
+// `gray-matter`, the de-facto-standard YAML frontmatter library.
+//
+// Why not extend scripts/lib/frontmatter.mjs? That module owns the
+// skill's OUTPUT serialisation, where deterministic byte-identical
+// rendering is a hard requirement. The home-rolled parser only covers
+// the narrow YAML subset the skill writes. When we read an ALREADY-
+// frontmatter'd source file (e.g. a hand-authored wiki guide with
+// `activation`, `covers`, `tags`, `focus`, `shared_covers`, nested
+// sequences of maps, etc.), we want full YAML 1.2 coverage — exactly
+// what gray-matter gives us (it delegates to js-yaml).
+//
+// Pollution guard: gray-matter's parsed object can still contain keys
+// that would poison our internal object pipeline if merged naively
+// (`__proto__`, `constructor`, `prototype`). We strip those here so
+// every downstream caller can treat the returned `data` as a safe
+// plain object. This preserves the security invariant encoded in
+// `scripts/lib/frontmatter.mjs`'s POLLUTION_KEYS list and its test
+// at `tests/unit/frontmatter-pollution.test.mjs`.
+import matter from "gray-matter";
+const POLLUTION_KEYS = new Set(["__proto__", "constructor", "prototype"]);
/**
 * Parse the frontmatter of a raw source string.
 *
 * @param {string} raw - Full source file content.
 * @returns {{ data: object, body: string, hasFrontmatter: boolean, error?: string }}
 *   - `data`: the parsed frontmatter as a safe plain object (pollution keys
 *     removed recursively). Empty object when there was no fence.
 *   - `body`: the source content WITH the frontmatter block stripped. This is
 *     what the orchestrator concatenates fresh frontmatter on top of —
 *     stripping here is what fixes the double-stack bug. When nothing was
 *     parsed (no fence, or malformed YAML) it is `raw` unchanged.
 *   - `hasFrontmatter`: true when a frontmatter block was parsed. Used by the
 *     orchestrator to decide whether to apply the authored-field merge in
 *     draftLeafFrontmatter.
 *   - `error`: present only when the YAML was malformed; carries the parser's
 *     message. `data` is empty and `hasFrontmatter` is false in that case.
 * @throws {TypeError} When `raw` is not a string.
 */
export function parseSourceFrontmatter(raw) {
  if (typeof raw !== "string") {
    throw new TypeError("parseSourceFrontmatter: raw must be a string");
  }
  // A leading byte-order mark must not hide the opening fence; otherwise a
  // BOM-prefixed source would keep its frontmatter in the body and get a
  // second block stacked on top.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  // Fast path: no leading fence → no frontmatter. Avoids gray-matter having
  // to tokenise the whole file just to confirm there's nothing to parse.
  // Requiring a bare `---` line also means fences carrying a language tag
  // never reach the parser.
  if (!text.startsWith("---\n") && !text.startsWith("---\r\n")) {
    return { data: {}, body: raw, hasFrontmatter: false };
  }
  let parsed;
  try {
    parsed = matter(text, { excerpt: false });
  } catch (err) {
    // gray-matter throws on malformed YAML. Surface the underlying message
    // and let the orchestrator decide whether to fall back (it currently
    // treats malformed source frontmatter as empty).
    return {
      data: {},
      body: raw,
      hasFrontmatter: false,
      error: err?.message || String(err),
    };
  }
  const safeData = sanitise(parsed.data);
  const body = typeof parsed.content === "string" ? parsed.content : "";
  // Either signal shows that a fence was consumed: some data came back, or
  // the body is shorter than the text that was handed to the parser.
  const hasFrontmatter =
    Object.keys(safeData).length > 0 || body.length < text.length;
  return { data: safeData, body, hasFrontmatter };
}
/**
 * Recursively copy a parsed value, dropping any prototype-pollution key.
 *
 * Arrays and plain objects are walked and rebuilt. Primitives pass through
 * unchanged, and so do non-plain objects (for example the `Date` a YAML
 * timestamp can parse to): rebuilding those from their enumerable own
 * properties would silently turn them into empty objects.
 *
 * @param {*} value - Parsed frontmatter value.
 * @returns {*} A copy that contains no key listed in POLLUTION_KEYS.
 */
function sanitise(value) {
  if (value === null || typeof value !== "object") return value;
  if (Array.isArray(value)) return value.map((v) => sanitise(v));
  const proto = Object.getPrototypeOf(value);
  if (proto !== Object.prototype && proto !== null) return value;
  const out = {};
  for (const [k, v] of Object.entries(value)) {
    if (POLLUTION_KEYS.has(k)) continue;
    // defineProperty always creates an own data property and never routes
    // through an inherited setter.
    Object.defineProperty(out, k, {
      value: sanitise(v),
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }
  return out;
}