npm - @anmol-srv/sigil - Versions diffs - 0.10.3 - Mend

@anmol-srv/sigil 0.10.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/src/db/migrations/20260424120002_create-embedding-cache.cjs ADDED Viewed

@@ -0,0 +1,26 @@
+/**
+ * Persistent embedding cache — avoids re-embedding identical text.
+ *
+ * Keyed on sha256(provider + model + text). Value is the cached vector.
+ * LRU eviction when count exceeds a soft limit (applied at write time).
+ */
+exports.up = async function (knex) {
+  await knex.schema.createTable('embedding_cache', (table) => {
+    table.string('key').primary();                // sha256(provider|model|text)
+    table.string('provider').notNullable();
+    table.string('model').notNullable();
+    table.integer('hits').notNullable().defaultTo(0);
+    table.timestamp('created_at').notNullable().defaultTo(knex.fn.now());
+    table.timestamp('last_used_at').notNullable().defaultTo(knex.fn.now());
+    table.index('last_used_at');
+  });
+  // Embedding column with the same dims as everywhere else (768)
+  await knex.raw('ALTER TABLE embedding_cache ADD COLUMN embedding vector(768)');
+};
+exports.down = async function (knex) {
+  await knex.schema.dropTable('embedding_cache');
+};

package/src/db/migrations/20260429120000_halfvec-index-compression.cjs ADDED Viewed

@@ -0,0 +1,34 @@
+/**
+ * Halfvec compression for HNSW indexes (Ogham §"Halfvec compression").
+ *
+ * The embedding columns stay as vector(768) (float32), but the HNSW index
+ * casts to halfvec(768) (float16). ~50% index size reduction with negligible
+ * quality loss — the cosine distance computation has more than enough
+ * precision at fp16 for retrieval ranking.
+ *
+ * Why not change the column type? Because storing as float32 keeps room for
+ * higher-precision operations (exact distance, future re-indexing strategies)
+ * while the HNSW index only needs distance ordering, where fp16 is fine.
+ */
+const TABLES = ['chunk', 'fact', 'entity'];
+exports.up = async function (knex) {
+  for (const table of TABLES) {
+    // Drop the old plain-vector HNSW index
+    await knex.raw(`DROP INDEX IF EXISTS ${table}_embedding_idx`);
+    // Recreate with halfvec cast
+    await knex.raw(
+      `CREATE INDEX ${table}_embedding_idx ON ${table} USING hnsw ((embedding::halfvec(768)) halfvec_cosine_ops) WITH (m = 16, ef_construction = 64)`,
+    );
+  }
+};
+exports.down = async function (knex) {
+  for (const table of TABLES) {
+    await knex.raw(`DROP INDEX IF EXISTS ${table}_embedding_idx`);
+    await knex.raw(
+      `CREATE INDEX ${table}_embedding_idx ON ${table} USING hnsw (embedding vector_cosine_ops)`,
+    );
+  }
+};

package/src/db/migrations/20260429120100_create-hebbian-edge-table.cjs ADDED Viewed

@@ -0,0 +1,37 @@
+/**
+ * Hebbian co-retrieval edges between facts.
+ *
+ * When two facts are retrieved together in the same search top-K, the edge
+ * between them strengthens. Over time, the graph builds itself from search
+ * behavior — no LLM calls, no manual annotation.
+ *
+ * Lexicographic canonicalization (fact_a_id < fact_b_id) prevents the
+ * (a,b)/(b,a) duplicate problem that bites symmetric relations. Lesson
+ * borrowed from OGHAM-LEARNINGS.md.
+ */
+exports.up = async function (knex) {
+  await knex.schema.createTable('hebbian_edge', (table) => {
+    table.bigInteger('fact_a_id').notNullable().references('id').inTable('fact').onDelete('CASCADE');
+    table.bigInteger('fact_b_id').notNullable().references('id').inTable('fact').onDelete('CASCADE');
+    table.integer('strength').notNullable().defaultTo(1);
+    table.timestamp('first_seen_at').notNullable().defaultTo(knex.fn.now());
+    table.timestamp('last_seen_at').notNullable().defaultTo(knex.fn.now());
+    table.primary(['fact_a_id', 'fact_b_id']);
+  });
+  // Enforce canonical ordering at the row level — fact_a_id MUST be less than fact_b_id.
+  await knex.raw(`
+    ALTER TABLE hebbian_edge
+    ADD CONSTRAINT hebbian_edge_canonical_order
+    CHECK (fact_a_id < fact_b_id)
+  `);
+  // For walking outward from a single fact: index both columns.
+  await knex.raw(`CREATE INDEX hebbian_edge_a_idx ON hebbian_edge (fact_a_id, strength DESC)`);
+  await knex.raw(`CREATE INDEX hebbian_edge_b_idx ON hebbian_edge (fact_b_id, strength DESC)`);
+};
+exports.down = async function (knex) {
+  await knex.schema.dropTable('hebbian_edge');
+};

package/src/db/migrations/20260429120200_upgrade-embedding-dim-1024.cjs ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Upgrade embedding columns from vector(768) → vector(N), typically N >= 1024.
+ *
+ * CONDITIONAL: only runs when EMBEDDING_DIMENSIONS env is greater than 768
+ * (e.g. 1024). The default (unset or 768) is the Ollama nomic-embed-text
+ * dimension; bumping the schema to 1024 there would mismatch the embedder and
+ * break ingest.
+ *
+ * Activates when an operator opts into a 1024d-class model (Voyage 3-large,
+ * OpenAI text-embedding-3-large truncated to 1024d, bge-large-en-v1.5).
+ * They set EMBEDDING_DIMENSIONS=1024 (or higher) and re-run sigil migrate.
+ *
+ * REFUSES TO RUN if any embedding row exists — changing the column type
+ * would invalidate stored embeddings. Operators upgrading an existing DB:
+ *   1. sigil export to back up
+ *   2. sigil reset --confirm
+ *   3. set EMBEDDING_DIMENSIONS=1024 in ~/.sigil/.env
+ *   4. sigil migrate
+ *   5. re-ingest with the new embedding model
+ */
+const TABLES = ['chunk', 'fact', 'entity', 'embedding_cache'];
+const DEFAULT_DIM = 768;
+exports.up = async function (knex) {
+  const targetDim = Number(process.env.EMBEDDING_DIMENSIONS) || DEFAULT_DIM;
+  if (targetDim <= DEFAULT_DIM) {
+    // No-op for the default 768d (local nomic). Migration is recorded as
+    // applied so it doesn't keep trying on every sigil migrate.
+    return;
+  }
+  // Safety check — bail loudly if existing embeddings would be invalidated.
+  for (const table of TABLES) {
+    const { rows } = await knex.raw(`SELECT COUNT(*)::int AS c FROM ${table} WHERE embedding IS NOT NULL`);
+    const count = rows[0].c;
+    if (count > 0) {
+      throw new Error(
+        `Cannot upgrade embedding dim to ${targetDim}: ${table} has ${count} rows with existing embeddings. ` +
+        `Run 'sigil export' to back up, then 'sigil reset --confirm' to wipe, then re-migrate ` +
+        `and re-ingest with the new embedding model.`,
+      );
+    }
+  }
+  for (const table of TABLES) {
+    await knex.raw(`ALTER TABLE ${table} ALTER COLUMN embedding TYPE vector(${targetDim}) USING embedding::vector(${targetDim})`);
+    // embedding_cache doesn't have an HNSW index — it's a key-value store keyed on sha256.
+    if (table === 'embedding_cache') continue;
+    await knex.raw(`DROP INDEX IF EXISTS ${table}_embedding_idx`);
+    await knex.raw(
+      `CREATE INDEX ${table}_embedding_idx ON ${table} USING hnsw ((embedding::halfvec(${targetDim})) halfvec_cosine_ops) WITH (m = 16, ef_construction = 64)`,
+    );
+  }
+};
+exports.down = async function (knex) {
+  // The down migration always reverts to 768d — it's the lowest common
+  // denominator and matches the prior halfvec migration's index.
+  for (const table of TABLES) {
+    await knex.raw(`ALTER TABLE ${table} ALTER COLUMN embedding TYPE vector(${DEFAULT_DIM}) USING NULL`);
+    if (table === 'embedding_cache') continue;
+    await knex.raw(`DROP INDEX IF EXISTS ${table}_embedding_idx`);
+    await knex.raw(
+      `CREATE INDEX ${table}_embedding_idx ON ${table} USING hnsw ((embedding::halfvec(${DEFAULT_DIM})) halfvec_cosine_ops) WITH (m = 16, ef_construction = 64)`,
+    );
+  }
+};

package/src/db/migrations/20260504120000_scope-document-source-path-uniqueness.cjs ADDED Viewed

@@ -0,0 +1,45 @@
+/**
+ * Scope document.source_path uniqueness to (source_path, namespace).
+ *
+ * Prior schema enforced UNIQUE(source_path) globally — meaning the same file
+ * path could only exist in ONE namespace at a time. This bit eval harnesses
+ * (per-question namespaces re-using the same source path), and would bite
+ * legitimate users wanting the same doc in personal + work namespaces.
+ *
+ * The composite UNIQUE(source_path, namespace) keeps "no dupes within a
+ * namespace" guarantee but allows the same path in different namespaces.
+ */
+exports.up = async function (knex) {
+  // Count any cross-namespace would-be-duplicates the old constraint masked.
+  const dupes = await knex.raw(`
+    SELECT source_path, COUNT(DISTINCT namespace) AS namespaces
+    FROM document
+    GROUP BY source_path
+    HAVING COUNT(DISTINCT namespace) > 1
+  `);
+  if (dupes.rows && dupes.rows.length) {
+    console.warn(`[migration] ${dupes.rows.length} source_paths now allowed in multiple namespaces.`);
+  }
+  await knex.schema.alterTable('document', (table) => {
+    table.dropUnique('source_path');
+  });
+  await knex.schema.alterTable('document', (table) => {
+    table.unique(['source_path', 'namespace']);
+  });
+};
+exports.down = async function (knex) {
+  await knex.schema.alterTable('document', (table) => {
+    table.dropUnique(['source_path', 'namespace']);
+  });
+  // Recreate the old global constraint. If multiple rows share a source_path
+  // across namespaces, this DOWN will fail loudly — that's correct, the
+  // operator must consolidate first.
+  await knex.schema.alterTable('document', (table) => {
+    table.unique('source_path');
+  });
+};

package/src/db/migrations/20260508001733_add-entity-aliases.cjs ADDED Viewed

@@ -0,0 +1,42 @@
+/**
+ * Add an `aliases` text-array column to the entity table.
+ *
+ * Why: AUDM and the existing 3-stage entity resolver both fail on entity
+ * renames ("Smara is now named Sigil") because the rename's vector
+ * similarity to existing facts about Smara is too low to trigger any
+ * dedup. The structural fix is to track entity identity at the entity
+ * layer (stable UUIDs surviving renames) and let facts reference those
+ * UUIDs via fact_entity. When a rename is detected, the canonical
+ * `name` rolls forward and the old name lands in `aliases[]` so that:
+ *
+ *   1. Future ingests mentioning the old name still resolve to the
+ *      same entity row (alias-aware lookup in findByName).
+ *   2. Search-time graph traversal pulls historical facts via the
+ *      stable entity_id even though their text still mentions the
+ *      old name.
+ *
+ * Defaults to '{}' so all existing rows have a sensible empty value.
+ * Indexed via a plain GIN index on the array; aliases are stored already
+ * lowercased by the resolver, so case-insensitive lookup stays fast.
+ */
+exports.up = async function (knex) {
+  await knex.raw(`
+    ALTER TABLE entity
+    ADD COLUMN aliases TEXT[] NOT NULL DEFAULT '{}'::text[]
+  `);
+  // Aliases are stored already lowercased by the resolver (push only happens
+  // via pushAlias() which lowercases at the boundary), so a plain GIN index
+  // on the array is sufficient. PGlite rejects subqueries in expression
+  // indexes, so we can't transform at index time — pre-lowercasing is the
+  // simpler contract.
+  await knex.raw(`
+    CREATE INDEX entity_aliases_idx ON entity USING GIN (aliases)
+  `);
+};
+exports.down = async function (knex) {
+  await knex.raw(`DROP INDEX IF EXISTS entity_aliases_idx`);
+  await knex.raw(`ALTER TABLE entity DROP COLUMN IF EXISTS aliases`);
+};

package/src/db/migrations/20260512120000_create-entity-hebbian-edge.cjs ADDED Viewed

@@ -0,0 +1,42 @@
+/**
+ * Hebbian co-retrieval edges between entities.
+ *
+ * Sibling of hebbian_edge but for entities, not facts. When a search returns
+ * a top-K result set, every entity linked to those facts is considered "co-
+ * activated." Pairwise edges between those entities strengthen.
+ *
+ * Why entities (in addition to fact-level edges):
+ *   - Fact-level edges are brittle when the same idea is stored as two
+ *     different facts. Entity edges survive paraphrase + AUDM splits.
+ *   - The entity graph is already the substrate for graph_boost / related-
+ *     entity expansion. A learned weight on top sharpens that traversal.
+ *
+ * Strength is NUMERIC (not integer) because the update rule caps via
+ * LEAST(strength + eta, cap) and read-time decay multiplies by a fractional
+ * exponential factor. Lex canonicalization (entity_a_id < entity_b_id)
+ * prevents (a,b)/(b,a) dupes.
+ */
+exports.up = async function (knex) {
+  await knex.schema.createTable('entity_hebbian_edge', (table) => {
+    table.bigInteger('entity_a_id').notNullable().references('id').inTable('entity').onDelete('CASCADE');
+    table.bigInteger('entity_b_id').notNullable().references('id').inTable('entity').onDelete('CASCADE');
+    table.decimal('strength', 12, 4).notNullable().defaultTo(1);
+    table.timestamp('first_seen_at').notNullable().defaultTo(knex.fn.now());
+    table.timestamp('last_seen_at').notNullable().defaultTo(knex.fn.now());
+    table.primary(['entity_a_id', 'entity_b_id']);
+  });
+  await knex.raw(`
+    ALTER TABLE entity_hebbian_edge
+    ADD CONSTRAINT entity_hebbian_edge_canonical_order
+    CHECK (entity_a_id < entity_b_id)
+  `);
+  await knex.raw(`CREATE INDEX entity_hebbian_edge_a_idx ON entity_hebbian_edge (entity_a_id, strength DESC)`);
+  await knex.raw(`CREATE INDEX entity_hebbian_edge_b_idx ON entity_hebbian_edge (entity_b_id, strength DESC)`);
+};
+exports.down = async function (knex) {
+  await knex.schema.dropTable('entity_hebbian_edge');
+};

package/src/db/migrations/20260512120000_create-pod-tables.cjs ADDED Viewed

@@ -0,0 +1,71 @@
+/**
+ * Create the `pod` table — typed memory containers that segregate facts,
+ * documents, and entities by source or subject. Pods sit on top of the
+ * existing fact/entity/document model; they do not replace AUDM, entity
+ * dedup, or the namespace partition.
+ *
+ * Pod types ship in this branch:
+ *   - 'session'  → one per Claude Code session (external_id = session_id)
+ *   - 'person'   → one per person you have a relationship with
+ *                  (entity_id FK to the canonical entity row)
+ *
+ * Future types reserved (no auto-creation yet):
+ *   - 'project', 'connector_workspace', 'custom'
+ *
+ * Membership lives in a separate `pod_membership` junction (next migration)
+ * so the `fact` row stays read-mostly and the HNSW index does not bloat —
+ * same discipline as the 20260424 fact_lifecycle split.
+ */
+exports.up = async function (knex) {
+  await knex.schema.createTable('pod', (table) => {
+    table.increments('id').primary();
+    table.text('uid').notNullable().unique();
+    table.text('pod_type').notNullable();
+    table.text('name').notNullable();
+    table.text('namespace').notNullable();
+    table.jsonb('attrs').notNullable().defaultTo('{}');
+    table.text('status').notNullable().defaultTo('active'); // active | archived
+    // Person/project pods link to their canonical entity. Nullable for
+    // session/workspace pods.
+    table.integer('entity_id').references('id').inTable('entity');
+    // Connector-workspace pods link to their connection. Nullable for
+    // session/person pods.
+    table.integer('connection_id').references('id').inTable('connection');
+    // Stable external identifier for upsert idempotency. For session pods
+    // this is the Claude Code session_id; for workspace pods this is the
+    // platform's team/org id.
+    table.text('external_id');
+    table.timestamp('started_at');
+    table.timestamp('ended_at');
+    // Denormalised member counters, refreshed by `sigil maintain` (or
+    // incrementally by membership writes). Cheap to keep, expensive to
+    // recompute on demand.
+    table.integer('member_doc_count').notNullable().defaultTo(0);
+    table.integer('member_fact_count').notNullable().defaultTo(0);
+    table.timestamps(false, true);
+    table.index('pod_type');
+    table.index('namespace');
+    table.index(['namespace', 'pod_type', 'status']);
+  });
+  // Upsert key: (pod_type, external_id, namespace) where external_id is set.
+  // Partial unique because external_id is nullable (custom pods may have none).
+  await knex.raw(`
+    CREATE UNIQUE INDEX pod_external_id_unique
+      ON pod (pod_type, external_id, namespace)
+      WHERE external_id IS NOT NULL
+  `);
+};
+exports.down = async function (knex) {
+  await knex.raw('DROP INDEX IF EXISTS pod_external_id_unique');
+  await knex.schema.dropTableIfExists('pod');
+};

package/src/db/migrations/20260512120100_create-pod-membership.cjs ADDED Viewed

@@ -0,0 +1,50 @@
+/**
+ * Polymorphic many-to-many junction linking pods to facts, documents, and
+ * entities. A fact can legitimately belong to multiple pods (e.g., a fact
+ * about Dhaval extracted in a Claude Code session belongs to both the
+ * person pod for Dhaval and the session pod for that conversation), so a
+ * single `pod_id` column on the fact row would force a lossy "primary
+ * pod" choice.
+ *
+ * Keeping membership in a junction also preserves the fact row's
+ * read-mostly invariant (no HNSW index churn on pod attach/detach) — same
+ * discipline as fact_lifecycle (20260424120000).
+ *
+ * member_id uses bigInteger because fact.id is bigint; document.id and
+ * entity.id are int4 but fit fine in the wider column.
+ */
+exports.up = async function (knex) {
+  await knex.schema.createTable('pod_membership', (table) => {
+    table.increments('id').primary();
+    table
+      .integer('pod_id')
+      .notNullable()
+      .references('id')
+      .inTable('pod')
+      .onDelete('CASCADE');
+    // 'fact' | 'document' | 'entity'. FK not enforced because Postgres
+    // does not support polymorphic FKs; integrity is the caller's
+    // responsibility (membership.js).
+    table.text('member_type').notNullable();
+    table.bigInteger('member_id').notNullable();
+    // 'primary' (this pod owns the member) | 'contextual' (member is
+    // referenced from this pod's perspective) | 'mention' (member just
+    // mentions an entity associated with this pod). Free string for now;
+    // promote to enum once the values settle.
+    table.text('role');
+    table.timestamp('created_at').notNullable().defaultTo(knex.fn.now());
+    table.unique(['pod_id', 'member_type', 'member_id']);
+    // Reverse lookup: "what pods is this fact/document/entity in?"
+    table.index(['member_type', 'member_id']);
+  });
+};
+exports.down = async function (knex) {
+  await knex.schema.dropTableIfExists('pod_membership');
+};

package/src/db/migrations/20260512120200_add-document-source-metadata.cjs ADDED Viewed

@@ -0,0 +1,32 @@
+/**
+ * Add `source_metadata` jsonb and `connection_id` FK to `document`.
+ *
+ * Why: the ingestion pipeline accepts a `metadata` arg (from sources/file.js,
+ * sources/url.js, future connectors) but currently drops it on the floor —
+ * it reaches `parse()` for format hints and `linkDocumentEntities()` for
+ * minor signals, but never lands on the document row. That made source-
+ * instance reasoning ("this came from Slack message ts=X in team=Y")
+ * impossible.
+ *
+ * Pods need this to attach connector-sourced documents to the right
+ * workspace pod and to derive person pods from senders.
+ *
+ * Defaults to '{}' so all existing rows have a sensible empty value.
+ */
+exports.up = async function (knex) {
+  await knex.schema.alterTable('document', (table) => {
+    table.integer('connection_id').references('id').inTable('connection');
+    table.jsonb('source_metadata').notNullable().defaultTo('{}');
+  });
+  await knex.raw('CREATE INDEX document_connection_id_idx ON document (connection_id)');
+};
+exports.down = async function (knex) {
+  await knex.raw('DROP INDEX IF EXISTS document_connection_id_idx');
+  await knex.schema.alterTable('document', (table) => {
+    table.dropColumn('source_metadata');
+    table.dropColumn('connection_id');
+  });
+};

package/src/db/migrations/20260514023428_rewrite-session-pods-and-add-fact-attribution-columns.cjs ADDED Viewed

@@ -0,0 +1,86 @@
+/**
+ * 0.10.0 — Pod distinction layer foundation.
+ *
+ * Three changes:
+ *   1. Rewrite pod_type='session' rows to 'claude_session'. The pod kind
+ *      registry treats the original Claude Code session pod as one of many
+ *      kinds (alongside project, person, playbook, vital); the name needs
+ *      to reflect that. No CHECK constraint exists on pod_type — the
+ *      column is plain text — so this is just an UPDATE.
+ *
+ *   2. Add fact.importance_score INTEGER. Existing fact.importance is a
+ *      text enum (vital | high | medium | supplementary | trivial); the
+ *      hot-context decay function in 0.10.0 needs a numeric scale.
+ *      Backfill: vital=5, high=4, medium=3, supplementary=2, trivial=1.
+ *      The text column stays as the authoritative input from the LLM
+ *      extractor; the numeric is the derived score retrieval uses.
+ *
+ *   3. Add fact.superseded_at TIMESTAMP and fact.superseded_by_fact_uid
+ *      TEXT for the append-only / bi-temporal pattern (Graphiti). Existing
+ *      valid_from / valid_until already cover event-time validity; these
+ *      add transaction-time supersession (the arbiter agent that lands in
+ *      0.11.0 will populate them).
+ */
+/**
+ * Applies the three changes of this migration in order: renames session pods,
+ * adds and backfills `fact.importance_score`, and adds the supersession
+ * columns on `fact`.
+ *
+ * Steps 2 and 3 are each guarded by a `hasColumn` check, so they are skipped
+ * when their column is already present.
+ *
+ * @param {object} knex - Knex instance passed in by the migration runner.
+ * @returns {Promise<void>}
+ */
+exports.up = async function (knex) {
+  // 1. Rewrite session → claude_session
+  await knex.raw("UPDATE pod SET pod_type = 'claude_session' WHERE pod_type = 'session'");
+  // 2. Add fact.importance_score with backfill
+  const hasImportanceScore = await knex.schema.hasColumn('fact', 'importance_score');
+  if (!hasImportanceScore) {
+    // Added as a nullable column first so that existing rows are accepted.
+    await knex.schema.alterTable('fact', (table) => {
+      table.integer('importance_score');
+    });
+    // Derive the numeric score from the text `importance` column; any value
+    // not listed (including NULL) falls through to 2.
+    await knex.raw(`
+      UPDATE fact SET importance_score = CASE importance
+        WHEN 'vital'         THEN 5
+        WHEN 'high'          THEN 4
+        WHEN 'medium'        THEN 3
+        WHEN 'supplementary' THEN 2
+        WHEN 'trivial'       THEN 1
+        ELSE 2
+      END
+    `);
+    // Every row now has a value, so the column can take its default and NOT NULL.
+    await knex.schema.alterTable('fact', (table) => {
+      table.integer('importance_score').defaultTo(2).notNullable().alter();
+    });
+    await knex.schema.alterTable('fact', (table) => {
+      table.index('importance_score');
+    });
+  }
+  // 3. Add supersession columns
+  // Only superseded_at is probed; both columns are added together when it is absent.
+  const hasSupersededAt = await knex.schema.hasColumn('fact', 'superseded_at');
+  if (!hasSupersededAt) {
+    await knex.schema.alterTable('fact', (table) => {
+      table.timestamp('superseded_at');
+      table.text('superseded_by_fact_uid');
+    });
+    await knex.schema.alterTable('fact', (table) => {
+      table.index('superseded_at');
+    });
+  }
+};
+exports.down = async function (knex) {
+  // 3. Drop supersession columns
+  if (await knex.schema.hasColumn('fact', 'superseded_at')) {
+    await knex.schema.alterTable('fact', (table) => {
+      table.dropIndex('superseded_at');
+      table.dropColumn('superseded_at');
+      table.dropColumn('superseded_by_fact_uid');
+    });
+  }
+  // 2. Drop importance_score
+  if (await knex.schema.hasColumn('fact', 'importance_score')) {
+    await knex.schema.alterTable('fact', (table) => {
+      table.dropIndex('importance_score');
+      table.dropColumn('importance_score');
+    });
+  }
+  // 1. Rewrite claude_session → session
+  await knex.raw("UPDATE pod SET pod_type = 'session' WHERE pod_type = 'claude_session'");
+};