npm - sweet-search - Versions diffs - 2.5.2 → 2.5.4 - Mend

sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155) hide show

package/core/cli.js +24 -3
package/core/graph/graph-expansion.js +215 -36
package/core/graph/graph-extractor.js +196 -11
package/core/graph/graph-search.js +395 -92
package/core/graph/hcgs-generator.js +2 -1
package/core/graph/index.js +2 -0
package/core/graph/repo-map.js +28 -6
package/core/graph/structural-answer-cues.js +168 -0
package/core/graph/structural-callsite-hints.js +40 -0
package/core/graph/structural-context-format.js +40 -0
package/core/graph/structural-context.js +450 -0
package/core/graph/structural-forward-push.js +156 -0
package/core/graph/structural-header-context.js +19 -0
package/core/graph/structural-importance.js +148 -0
package/core/graph/structural-pagerank.js +197 -0
package/core/graph/summary-manager.js +13 -9
package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
package/core/incremental-indexing/application/file-watcher.mjs +197 -0
package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
package/core/incremental-indexing/application/operator-cli.mjs +554 -0
package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
package/core/incremental-indexing/application/reconciler.mjs +477 -0
package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
package/core/indexing/admission-policy.js +139 -0
package/core/indexing/artifact-builder.js +29 -12
package/core/indexing/ast-chunker.js +107 -30
package/core/indexing/dedup/exemplar-selector.js +19 -1
package/core/indexing/gitignore-filter.js +223 -0
package/core/indexing/incremental-tracker.js +99 -30
package/core/indexing/index-codebase-v21.js +6 -5
package/core/indexing/index-maintainer.mjs +698 -6
package/core/indexing/indexer-ann.js +99 -15
package/core/indexing/indexer-build.js +158 -45
package/core/indexing/indexer-empty-baseline.js +80 -0
package/core/indexing/indexer-manifest.js +66 -0
package/core/indexing/indexer-phases.js +56 -23
package/core/indexing/indexer-sparse-gram.js +54 -13
package/core/indexing/indexer-utils.js +26 -208
package/core/indexing/indexing-file-policy.js +32 -7
package/core/indexing/maintainer-launcher.mjs +137 -0
package/core/indexing/merkle-tracker.js +251 -244
package/core/indexing/model-pool.js +46 -5
package/core/infrastructure/code-graph-repository.js +758 -6
package/core/infrastructure/code-graph-visibility.js +157 -0
package/core/infrastructure/codebase-repository.js +100 -13
package/core/infrastructure/config/search.js +1 -1
package/core/infrastructure/db-utils.js +118 -0
package/core/infrastructure/dedup-hashing.js +10 -13
package/core/infrastructure/hardware-capability.js +17 -7
package/core/infrastructure/index.js +8 -2
package/core/infrastructure/language-patterns/maps.js +4 -1
package/core/infrastructure/language-patterns/registry-core.js +56 -17
package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
package/core/infrastructure/language-patterns.js +69 -0
package/core/infrastructure/model-registry.js +20 -0
package/core/infrastructure/native-inference.js +7 -12
package/core/infrastructure/native-resolver.js +52 -37
package/core/infrastructure/native-sparse-gram.js +261 -20
package/core/infrastructure/native-tokenizer.js +6 -15
package/core/infrastructure/simd-distance.js +10 -16
package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
package/core/infrastructure/structural-alias-resolver.js +122 -0
package/core/infrastructure/structural-candidate-ranker.js +34 -0
package/core/infrastructure/structural-context-repository.js +472 -0
package/core/infrastructure/structural-context-utils.js +51 -0
package/core/infrastructure/structural-graph-signals.js +121 -0
package/core/infrastructure/structural-qualified-resolution.js +15 -0
package/core/infrastructure/structural-source-definitions.js +100 -0
package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
package/core/infrastructure/tree-sitter-provider.js +811 -37
package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
package/core/query/query-router.js +55 -5
package/core/ranking/file-kind-ranking.js +2192 -15
package/core/ranking/late-interaction-index.js +87 -12
package/core/search/cli-decoration.js +290 -0
package/core/search/context-expander.js +988 -78
package/core/search/index.js +1 -0
package/core/search/output-policy.js +275 -0
package/core/search/search-anchor.js +499 -0
package/core/search/search-boost.js +93 -1
package/core/search/search-cli.js +61 -204
package/core/search/search-hybrid.js +250 -10
package/core/search/search-pattern-chunks.js +57 -8
package/core/search/search-pattern-planner.js +68 -9
package/core/search/search-pattern-prefilter.js +30 -10
package/core/search/search-pattern-ripgrep.js +40 -4
package/core/search/search-pattern-sparse-overlay.js +256 -0
package/core/search/search-pattern.js +117 -29
package/core/search/search-postprocess.js +479 -5
package/core/search/search-read-semantic.js +260 -23
package/core/search/search-read.js +82 -64
package/core/search/search-reader-pin.js +71 -0
package/core/search/search-rrf.js +279 -0
package/core/search/search-semantic.js +110 -5
package/core/search/search-server.js +130 -57
package/core/search/search-trace.js +107 -0
package/core/search/server-identity.js +93 -0
package/core/search/session-daemon-prewarm.mjs +33 -10
package/core/search/sweet-search.js +399 -7
package/core/skills/sweet-index/SKILL.md +8 -6
package/core/vector-store/binary-hnsw-index.js +194 -30
package/core/vector-store/float-vector-store.js +96 -6
package/core/vector-store/hnsw-index.js +220 -49
package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
package/eval/agent-read-workflows/bin/ss-find +15 -0
package/eval/agent-read-workflows/bin/ss-grep +12 -0
package/eval/agent-read-workflows/bin/ss-read +14 -0
package/eval/agent-read-workflows/bin/ss-search +18 -0
package/eval/agent-read-workflows/bin/ss-semantic +12 -0
package/eval/agent-read-workflows/bin/ss-trace +11 -0
package/mcp/read-tool.js +109 -0
package/mcp/server.js +55 -15
package/mcp/tool-handlers.js +14 -124
package/mcp/trace-tool.js +81 -0
package/package.json +25 -10
package/scripts/hooks/intercept-read.mjs +55 -0
package/scripts/hooks/remind-tools.mjs +40 -0
package/scripts/init.js +698 -54
package/scripts/inject-agent-instructions.js +431 -0
package/scripts/install-prompt-reminders.js +188 -0
package/scripts/install-tool-enforcement.js +220 -0
package/scripts/smoke-test.js +12 -9
package/scripts/uninstall.js +276 -18
package/scripts/write-claude-rules.js +110 -0

package/core/graph/structural-importance.js ADDED Viewed

@@ -0,0 +1,148 @@
+/**
+ * Importance scoring for structural-trace items.
+ *
+ * The trace builder computes an `importance` value per caller, callee, and
+ * impact path. The formula fuses three classes of signal:
+ *
+ *   1. Query-time directional Personalized PageRank (PPR) from the target —
+ *      dominant graph signal. Backward direction ranks callers; forward
+ *      direction ranks callees. This is what makes "fan-in" mean *important*
+ *      fan-in: a leaf utility called by many things does not get high PPR
+ *      relative to a specific target unless that target itself reaches it.
+ *
+ *   2. Static index-time PageRank (page_rank column) — a backstop that helps
+ *      degenerate subgraphs (small symbols, brand-new code with no incoming
+ *      edges yet) where Forward Push has nothing to flow.
+ *
+ *   3. Structural heuristics — relationship type, depth, exported-API status,
+ *      type kind, hint-token overlap, fan-in tiebreaker. Penalties for
+ *      test-only paths and unresolved external nodes.
+ *
+ * Score weights sum to 1.0 over the positive terms; the test/external
+ * negatives are intentionally large enough to guarantee production callers
+ * beat fixtures.
+ */
+/**
+ * Per-relationship multipliers for the 0.16-weighted relationship term in
+ * `scoreEntity`. Relationship kinds missing from this table fall back to 0.55
+ * at the lookup site.
+ */
+export const REL_WEIGHT = { calls: 1.0, uses: 0.72, implements: 0.88, extends: 0.84, overrides: 0.78 };
+/**
+ * Per-entity-kind multipliers for the 0.08-weighted type term in
+ * `scoreEntity`. Kinds missing from this table fall back to 0.5 at the lookup
+ * site.
+ */
+export const TYPE_WEIGHT = {
+  class: 0.92, struct: 0.9, trait: 0.88, interface: 0.86, enum: 0.84,
+  function: 0.84, method: 0.82, component: 0.78, type: 0.7, typeAlias: 0.68, external: 0.2,
+};
+export function isTestPath(filePath = '') {
+  return /(^|\/)(__tests__|tests?|spec|fixtures|examples?|docs?)(\/|$)|[-_.](test|spec)\.[cm]?[jt]sx?$|_test\.go$/.test(filePath);
+}
+export function isExported(entity) {
+  const name = entity?.name || '';
+  const sig = entity?.signature || '';
+  if (!name) return false;
+  if (/^\w/.test(name) && name[0] === name[0].toUpperCase()) return true;
+  return /\b(export|public|pub)\b/.test(sig) || /^[A-Z_][A-Z0-9_]+$/.test(name);
+}
+export function logNorm(value, maxValue) {
+  if (!value || !maxValue) return 0;
+  return Math.log1p(value) / Math.log1p(maxValue);
+}
+export function tokenize(text) {
+  return [...new Set(String(text || '').toLowerCase().match(/[a-z_][a-z0-9_]{2,}/g) || [])];
+}
+// A/B knobs for ranking ablation. Read once at module load.
+//   SWEET_SEARCH_TRACE_NO_PPR=1 → drop the 0.20 directional PPR contribution.
+//   SWEET_SEARCH_TRACE_NO_PR=1  → drop the 0.10 static PageRank contribution.
+// These let probe runs isolate the marginal contribution of each graph signal
+// without rebuilding the index or maintaining a second worktree. Off by default.
+// Only the exact string '1' enables a knob; any other value leaves it off.
+const NO_PPR = process.env.SWEET_SEARCH_TRACE_NO_PPR === '1';
+const NO_PR = process.env.SWEET_SEARCH_TRACE_NO_PR === '1';
+/**
+ * Expose the ablation switches captured at module load.
+ *
+ * @returns {{noPpr: boolean, noPr: boolean}} a fresh object on every call
+ */
+export function getAblationFlags() {
+  return { noPpr: NO_PPR, noPr: NO_PR };
+}
+export function hintScore(entity, hintTokens) {
+  if (!hintTokens.length) return 0;
+  const hay = `${entity.name} ${entity.type} ${entity.signature} ${entity.summary}`.toLowerCase();
+  let hits = 0;
+  for (const tok of hintTokens) if (hay.includes(tok)) hits++;
+  return hits / hintTokens.length;
+}
+/**
+ * Score a single caller/callee/impact-node entity.
+ *
+ * Weights (sum to 1.0):
+ *   0.20 proximity (1/depth) — prefer direct neighbours of target
+ *   0.16 relationship type (calls > implements > uses)
+ *   0.20 directional PPR — primary signal
+ *   0.10 static PageRank — backstop
+ *   0.10 fan-in tiebreaker — keep load-bearing leaves above no-signal nodes
+ *   0.08 entity type kind (classes/structs above untyped)
+ *   0.06 isExported (public API surface)
+ *   0.10 hint-token overlap — caller-supplied query intent
+ *
+ * Penalties (additive, can drop below positive sum):
+ *  -0.38 isTestPath — production callers must beat test fixtures
+ *  -0.25 type='external' — unresolved targets are not authoritative
+ *
+ * Fallbacks: an unknown relationship scores 0.55, an unknown entity type 0.5,
+ * and a missing or zero depth is treated as depth 1. Entities absent from the
+ * ctx maps contribute 0 for that signal.
+ *
+ * @param {object} entity - caller/callee row with relationship, depth, file
+ * @param {object} ctx - score context (fan, pageRank, pprScores, max*, hintTokens)
+ * @returns {number} importance, never below 0.01; stays at or below 1.0 as
+ *   long as ppr / pageRank / fanIn do not exceed their ctx maxima
+ */
+export function scoreEntity(entity, ctx) {
+  const fan = ctx.fan?.get?.(entity.id) || { fanIn: 0, fanOut: 0 };
+  const rel = REL_WEIGHT[entity.relationship] ?? 0.55;
+  // Depth is clamped to >= 1, so proximity never exceeds 1.
+  const proximity = 1 / Math.max(1, entity.depth || 1);
+  const ppr = ctx.pprScores?.get?.(entity.id) || 0;
+  const pageRank = ctx.pageRank?.get?.(entity.id) || 0;
+  // Ablation knobs zero out a graph signal without renormalising the others.
+  const pprTerm = NO_PPR ? 0 : 0.20 * logNorm(ppr, ctx.maxPpr || 1);
+  const prTerm = NO_PR ? 0 : 0.10 * logNorm(pageRank, ctx.maxPageRank || 1);
+  let score =
+    0.20 * proximity +
+    0.16 * rel +
+    pprTerm +
+    prTerm +
+    0.10 * logNorm(fan.fanIn, ctx.maxFanIn || 1) +
+    0.08 * (TYPE_WEIGHT[entity.type] ?? 0.5) +
+    0.06 * (isExported(entity) ? 1 : 0) +
+    0.10 * hintScore(entity, ctx.hintTokens || []);
+  // Both penalties can apply to the same entity.
+  if (isTestPath(entity.filePath)) score -= 0.38;
+  if (entity.type === 'external') score -= 0.25;
+  // Floor keeps heavily penalised entities at a small positive importance.
+  return Math.max(0.01, score);
+}
+/**
+ * Score an impact path using the bottleneck and average node importance.
+ *
+ * The caller passes in the score context that matches the path's direction:
+ * a downstream path's nodes are scored against the forward (callee) PPR run,
+ * an upstream path's against the backward (caller) PPR run. This avoids the
+ * directional bias where standard global PR over-promotes leaf utilities.
+ *
+ * @param {object} path - { direction, depth, path: [...nodes] }
+ * @param {object} ctx - directional score context
+ * @returns {number} importance in (0, 1.5]
+ */
+export function scoreImpactPath(path, ctx) {
+  const nodes = path.direction === 'downstream' ? path.path.slice(1) : path.path.slice(0, -1);
+  const scored = nodes.map(node => scoreEntity({ ...node, depth: path.depth }, ctx));
+  if (!scored.length) return 0.01;
+  const bottleneck = Math.min(...scored);
+  const avg = scored.reduce((a, b) => a + b, 0) / scored.length;
+  return Math.max(0.01, (0.55 * bottleneck + 0.45 * avg) / Math.sqrt(path.depth));
+}
+/**
+ * Compute normalization constants from a score context.
+ * Avoids divide-by-zero in logNorm when the subgraph is degenerate.
+ *
+ * @param {Iterable<number>} values
+ * @returns {number} max value, never below 1e-9
+ */
+export function safeMax(values) {
+  let m = 1e-9;
+  for (const v of values) if (Number.isFinite(v) && v > m) m = v;
+  return m;
+}

package/core/graph/structural-pagerank.js ADDED Viewed

@@ -0,0 +1,197 @@
+/**
+ * Structural PageRank — index-time edge-weighted PageRank for the entity call graph.
+ *
+ * Distinct from core/graph/repo-map.js (which runs unweighted PR on a deduped
+ * adjacency for repo-map rendering). This module:
+ *   1. Builds a weighted adjacency directly from `relationships.weight` so that
+ *      a function called five times gets five units of mass, not one.
+ *   2. Persists the result to a `page_rank REAL` column on `entities`, so the
+ *      structural-trace builder can read it as a backstop importance signal at
+ *      query time without recomputing.
+ *
+ * Domain layer: the actual SQL writes happen behind a writable database handle
+ * passed in by the index builder; no path or filesystem concerns leak in.
+ */
+// Power-iteration defaults; each can be overridden through `opts`.
+const DEFAULT_DAMPING = 0.85;
+const DEFAULT_MAX_ITERATIONS = 50;
+// Iteration stops early once the summed absolute score change drops below this.
+const DEFAULT_CONVERGENCE = 1e-6;
+// Number of page_rank UPDATEs grouped into one transaction when persisting.
+const DEFAULT_BATCH_SIZE = 500;
+// Multiplier applied to a relationship row's weight when building the
+// adjacency; types not listed here fall back to 0.5 at the lookup site.
+const RELATIONSHIP_TYPE_WEIGHTS = {
+  calls: 1.0,
+  uses: 0.6,
+  implements: 0.85,
+  extends: 0.85,
+  overrides: 0.75,
+};
+/**
+ * Run edge-weighted PageRank power iteration over a call/use graph.
+ *
+ * The iteration handles:
+ *   - dangling nodes (no out-edges) via uniform mass redistribution
+ *   - weighted edges with per-source weight normalization
+ *   - early termination on L1 convergence
+ *
+ * @param {Map<string, Map<string, number>>} weightedOutEdges - source → (target → weight)
+ * @param {Set<string>} allNodes - every entity ID, including dangling
+ * @param {object} [opts]
+ * @param {number} [opts.damping=0.85]
+ * @param {number} [opts.maxIterations=50]
+ * @param {number} [opts.convergence=1e-6]
+ * @returns {Map<string, number>} entity ID → PageRank score (sums to ~1)
+ */
+export function pageRankWeighted(weightedOutEdges, allNodes, opts = {}) {
+  const damping = opts.damping ?? DEFAULT_DAMPING;
+  const maxIter = opts.maxIterations ?? DEFAULT_MAX_ITERATIONS;
+  const convergence = opts.convergence ?? DEFAULT_CONVERGENCE;
+  const n = allNodes.size;
+  if (n === 0) return new Map();
+  const base = (1 - damping) / n;
+  const totalOutWeight = new Map();
+  const inEdges = new Map();
+  for (const node of allNodes) inEdges.set(node, []);
+  for (const [src, targets] of weightedOutEdges) {
+    if (!allNodes.has(src)) continue;
+    let sum = 0;
+    for (const [tgt, w] of targets) {
+      if (!allNodes.has(tgt) || tgt === src) continue;
+      sum += w;
+      inEdges.get(tgt).push([src, w]);
+    }
+    if (sum > 0) totalOutWeight.set(src, sum);
+  }
+  const danglingNodes = [];
+  for (const node of allNodes) {
+    if (!totalOutWeight.has(node)) danglingNodes.push(node);
+  }
+  let scores = new Map();
+  const initScore = 1 / n;
+  for (const node of allNodes) scores.set(node, initScore);
+  for (let iter = 0; iter < maxIter; iter++) {
+    let danglingMass = 0;
+    for (const node of danglingNodes) danglingMass += scores.get(node);
+    const danglingShare = damping * danglingMass / n;
+    const next = new Map();
+    let delta = 0;
+    for (const node of allNodes) {
+      let sum = 0;
+      for (const [src, w] of inEdges.get(node)) {
+        const out = totalOutWeight.get(src);
+        if (out > 0) sum += scores.get(src) * (w / out);
+      }
+      const newScore = base + damping * sum + danglingShare;
+      next.set(node, newScore);
+      delta += Math.abs(newScore - scores.get(node));
+    }
+    scores = next;
+    if (delta < convergence) break;
+  }
+  return scores;
+}
+/**
+ * Build a weighted adjacency map from the relationships table.
+ *
+ * Multiple rows for the same (source, target) collapse via summed weight:
+ *   `r.weight * RELATIONSHIP_TYPE_WEIGHTS[r.type]`
+ * so that a hot call site contributes more mass than a single import.
+ *
+ * Unresolved targets (relationships with no target_id) are dropped because
+ * PageRank is only defined over nodes that actually exist in the graph.
+ *
+ * @param {import('better-sqlite3').Database} db - writable code-graph DB
+ * @returns {{ outEdges: Map<string, Map<string, number>>, allNodes: Set<string> }}
+ */
+export function buildWeightedAdjacency(db) {
+  const allNodes = new Set();
+  for (const row of db.prepare('SELECT id FROM entities WHERE stale_since IS NULL').iterate()) {
+    allNodes.add(row.id);
+  }
+  const outEdges = new Map();
+  const stmt = db.prepare(`
+    SELECT source_id, target_id, type, COALESCE(weight, 1.0) AS weight
+    FROM relationships
+    WHERE source_id IS NOT NULL AND target_id IS NOT NULL
+  `);
+  for (const row of stmt.iterate()) {
+    if (!allNodes.has(row.source_id) || !allNodes.has(row.target_id)) continue;
+    if (row.source_id === row.target_id) continue;
+    const typeWeight = RELATIONSHIP_TYPE_WEIGHTS[row.type] ?? 0.5;
+    const w = row.weight * typeWeight;
+    if (!(w > 0)) continue;
+    let bucket = outEdges.get(row.source_id);
+    if (!bucket) {
+      bucket = new Map();
+      outEdges.set(row.source_id, bucket);
+    }
+    bucket.set(row.target_id, (bucket.get(row.target_id) || 0) + w);
+  }
+  return { outEdges, allNodes };
+}
+/**
+ * Ensure the `page_rank` REAL column exists on entities. Idempotent.
+ * Older databases predate this column, so existing indexes auto-migrate.
+ *
+ * @param {import('better-sqlite3').Database} db - writable
+ * @returns {boolean} true on success
+ */
+export function ensurePageRankColumn(db) {
+  try {
+    const columns = db.prepare('PRAGMA table_info(entities)').all();
+    const has = columns.some(col => col.name === 'page_rank');
+    if (!has) {
+      db.exec('ALTER TABLE entities ADD COLUMN page_rank REAL DEFAULT 0');
+    }
+    db.exec('CREATE INDEX IF NOT EXISTS idx_entities_page_rank ON entities(page_rank) WHERE stale_since IS NULL');
+    return true;
+  } catch (err) {
+    if (err && /duplicate column/i.test(err.message)) return true;
+    return false;
+  }
+}
+/**
+ * Compute weighted PageRank and persist it to the `page_rank` column.
+ * Index-build calls this once after relationships are fully resolved.
+ *
+ * Scores are written in batches of `opts.batchSize` rows (default
+ * DEFAULT_BATCH_SIZE), one transaction per batch. `opts` is also forwarded
+ * unchanged to `pageRankWeighted`, so damping / maxIterations / convergence
+ * can be supplied here.
+ *
+ * NOTE(review): `iterations` is always reported as -1 (or 0 for an empty
+ * graph) because `pageRankWeighted` returns only the score map, not the number
+ * of rounds it ran.
+ *
+ * @param {import('better-sqlite3').Database} db - writable code-graph DB
+ * @param {object} [opts]
+ * @returns {{ entities: number, iterations: number, ms: number, written: number }}
+ */
+export function populatePageRankColumn(db, opts = {}) {
+  const started = Date.now();
+  // The boolean result is not checked here; if the column could not be added,
+  // the UPDATE statement prepared below is where a failure would surface.
+  ensurePageRankColumn(db);
+  const { outEdges, allNodes } = buildWeightedAdjacency(db);
+  if (allNodes.size === 0) {
+    return { entities: 0, iterations: 0, ms: Date.now() - started, written: 0 };
+  }
+  const scores = pageRankWeighted(outEdges, allNodes, opts);
+  const update = db.prepare('UPDATE entities SET page_rank = ? WHERE id = ?');
+  const batchSize = opts.batchSize ?? DEFAULT_BATCH_SIZE;
+  // One transaction per batch of [id, score] pairs.
+  const tx = db.transaction((rows) => {
+    for (const [id, score] of rows) update.run(score, id);
+  });
+  let written = 0;
+  let buffer = [];
+  for (const entry of scores) {
+    buffer.push(entry);
+    if (buffer.length >= batchSize) {
+      tx(buffer);
+      written += buffer.length;
+      buffer = [];
+    }
+  }
+  // Flush the final partial batch.
+  if (buffer.length) {
+    tx(buffer);
+    written += buffer.length;
+  }
+  return { entities: allNodes.size, iterations: -1, ms: Date.now() - started, written };
+}
+// Internals exposed under a test-only name.
+export const __TEST__ = { RELATIONSHIP_TYPE_WEIGHTS, DEFAULT_DAMPING };

package/core/graph/summary-manager.js CHANGED Viewed

@@ -23,6 +23,7 @@ import { existsSync } from 'fs';
 import fs from 'fs/promises';
 import path from 'path';
 import { DB_PATHS } from '../infrastructure/config/index.js';
+import { chunkedInExec } from '../infrastructure/db-utils.js';
 // =============================================================================
 // CRASH-SAFE DISK PERSISTENCE
@@ -402,17 +403,20 @@ export async function markForRegeneration(dbPath = DB_PATHS.codeGraph, filePaths
   const Database = (await import('better-sqlite3')).default;
   const db = new Database(dbPath);
-  const placeholders = filePaths.map(() => '?').join(',');
-  const stmt = db.prepare(`
-    UPDATE entities
-    SET summary = NULL, summary_embedding = NULL
-    WHERE file_path IN (${placeholders})
-  `);
-  const result = stmt.run(...filePaths);
+  // Chunk to stay under SQLite's bound-parameter limit. Caller may pass
+  // tens of thousands of file paths on large initial mark-for-regeneration
+  // operations; an unchunked IN(?,?,...) crashes with "too many SQL
+  // variables". chunkedInExec wraps the per-batch run in a transaction.
+  const { changes } = chunkedInExec(
+    db,
+    `UPDATE entities
+        SET summary = NULL, summary_embedding = NULL
+      WHERE file_path IN (__IN_PLACEHOLDERS__)`,
+    filePaths,
+  );
   db.close();
-  return { marked: result.changes };
+  return { marked: changes };
 }
 /**

package/core/incremental-indexing/application/dirty-scan.mjs ADDED Viewed

@@ -0,0 +1,236 @@
+/**
+ * Tick-driven dirty-file producer for the default-on incremental maintainer.
+ *
+ * The reconcile tick is a *consumer*: it drains `index-maintainer-queue.jsonl`
+ * and reindexes whatever paths were enqueued. Something has to PRODUCE those
+ * entries. `sweet-search index --add <path>` does it manually, and an editor
+ * hook can do it per-edit — but with neither, an ordinary file save is never
+ * observed and the index silently goes stale (release-gate finding C1).
+ *
+ * This module is the missing autonomous producer. Once per tick (before the
+ * consume step) it diffs the working tree against the reconciler's own
+ * `merkle-state.json` baseline using a cheap stat comparison (size + mtime_ns,
+ * no hashing) and appends add / modify / delete hints to the same JSONL queue
+ * the reconciler already drains. The reconciler then updates `merkle-state.json`
+ * for the files it processes, so the next scan sees them as unchanged — the
+ * queue does not grow without bound.
+ *
+ * Admission: it uses the SAME `admission-policy` full indexing uses, so a file a
+ * fresh `sweet-search index` would skip (wrong extension, gitignored, excluded,
+ * oversized) is never newly enqueued, and a file full indexing would admit is
+ * eligible. Gitignore is evaluated in ONE batched `git check-ignore` per tick,
+ * never per file.
+ *
+ * Current-session convergence: a previously-indexed file that is deleted, or
+ * that becomes excluded / oversized / gitignored, is enqueued so the consumer
+ * retires it — incremental results then match a fresh full rebuild. (The
+ * consumer is the authority on admit-vs-retire; this producer only decides what
+ * to enqueue.)
+ *
+ * Design notes:
+ *   - Walks the whole tree each tick (pruning denied directories) so the "seen"
+ *     set is complete and unchanged-but-now-excluded files are not mistaken for
+ *     deletions; only the *enqueue* list is bounded by `maxEnqueue`.
+ *   - De-dupes against paths already in the dirty/processing queues so repeated
+ *     ticks before a slow reconcile don't pile up duplicates.
+ *   - Opt-out: `SWEET_SEARCH_RECONCILE_SCAN=0|false|off` disables just the
+ *     producer (the maintainer keeps consuming externally-enqueued hints).
+ */
+import fs from 'node:fs';
+import path from 'node:path';
+import { createAdmissionPolicy } from '../../indexing/admission-policy.js';
+// File names below are resolved relative to the maintainer state directory.
+// Queue that scan hints are appended to.
+const DIRTY_QUEUE = 'index-maintainer-queue.jsonl';
+// Read together with DIRTY_QUEUE when de-duplicating already-queued paths.
+const PROCESSING_QUEUE = 'index-maintainer-queue.processing.jsonl';
+// Baseline of known files (per-path size + mtime_ns) that the scan diffs against.
+const MERKLE_STATE = 'merkle-state.json';
+// Default upper bound on paths enqueued by a single scan.
+const DEFAULT_MAX_ENQUEUE = 5000;
+/** Is the autonomous scan producer enabled? Default-on; off-tokens disable it. */
+export function dirtyScanEnabled(env = process.env) {
+  const raw = env.SWEET_SEARCH_RECONCILE_SCAN;
+  if (raw == null || raw === '') return true;
+  const n = String(raw).trim().toLowerCase();
+  return !(n === '0' || n === 'false' || n === 'off');
+}
+function readMerkleFiles(stateDir) {
+  try {
+    const parsed = JSON.parse(fs.readFileSync(path.join(stateDir, MERKLE_STATE), 'utf8'));
+    return parsed && parsed.files && typeof parsed.files === 'object' ? parsed.files : {};
+  } catch {
+    return {};
+  }
+}
+/** Project-relative paths already queued (dirty + in-flight), forward-slashed. */
+function alreadyQueued(stateDir) {
+  const set = new Set();
+  for (const name of [DIRTY_QUEUE, PROCESSING_QUEUE]) {
+    let raw;
+    try {
+      raw = fs.readFileSync(path.join(stateDir, name), 'utf8');
+    } catch {
+      continue;
+    }
+    for (const line of raw.split('\n')) {
+      const t = line.trim();
+      if (!t) continue;
+      try {
+        const fp = JSON.parse(t).file_path;
+        if (fp) set.add(String(fp).replace(/\\/g, '/'));
+      } catch {
+        /* tolerate a malformed line */
+      }
+    }
+  }
+  return set;
+}
+/**
+ * Diff the working tree against merkle-state.json and append dirty hints.
+ *
+ * Change detection is a string comparison of each file's stat `size` and
+ * `mtimeNs` against the baseline entry's `size` and `mtime_ns`; file contents
+ * are never read. All filesystem access is synchronous; the only awaited call
+ * is the batched `policy.gitignoredSet(...)` lookup.
+ *
+ * Each enqueued path becomes one JSONL line of the form
+ * `{ file_path, timestamp, queued_at, source: 'scan' }` appended to the dirty
+ * queue in `stateDir`. `maxEnqueue` caps the total number of paths enqueued by
+ * one call (steps 3 and 4 combined); the walk itself is not capped.
+ *
+ * @param {object} opts
+ * @param {string} opts.projectRoot
+ * @param {string} opts.stateDir
+ * @param {object} [opts.admissionPolicy]   Shared admission policy (created from projectRoot if omitted).
+ * @param {(rel:string)=>boolean} [opts.isExcluded]   Extra deny predicate layered on the policy.
+ * @param {number} [opts.maxEnqueue]
+ * @param {(phase:string)=>void} [opts.onProgress]
+ * @returns {Promise<{enqueued:number, added:number, modified:number, deleted:number, retired:number, files:string[]}>}
+ *   Counts per category plus the enqueued project-relative paths; all zeros
+ *   and an empty list when nothing needed enqueueing.
+ */
+export async function scanDirtyAndEnqueue({ projectRoot, stateDir, admissionPolicy, isExcluded, maxEnqueue = DEFAULT_MAX_ENQUEUE, onProgress = null }) {
+  const policy = admissionPolicy || createAdmissionPolicy({ projectRoot });
+  const extraDeny = typeof isExcluded === 'function' ? isExcluded : null;
+  const merkle = readMerkleFiles(stateDir);
+  const queued = alreadyQueued(stateDir);
+  // Stats are read in bigint mode below, so the size limit is compared as a BigInt.
+  const maxFileSize = BigInt(policy.maxFileSize);
+  const progress = typeof onProgress === 'function'
+    ? (phase) => { onProgress(phase); }
+    : () => {};
+  let walked = 0;
+  // Never enqueue the maintainer's own state dir — its queues/manifests/db are
+  // not source files and must be skipped regardless of the policy.
+  const stateDirResolved = path.resolve(stateDir);
+  const isStateDir = (abs) => {
+    const r = path.resolve(abs);
+    return r === stateDirResolved || r.startsWith(stateDirResolved + path.sep);
+  };
+  // 1. Full walk: classify every present file; prune denied directories so we
+  //    never descend node_modules/.git/etc. `present` keeps shape-rejected
+  //    merkle files too (they must be retired).
+  const present = new Map(); // rel -> { isNew, changed, shapeOk, sizeOk }
+  const stack = [projectRoot];
+  while (stack.length) {
+    const dir = stack.pop();
+    let entries;
+    try {
+      entries = fs.readdirSync(dir, { withFileTypes: true });
+    } catch {
+      // Unreadable directory: skip it and keep walking.
+      continue;
+    }
+    for (const ent of entries) {
+      const abs = path.join(dir, ent.name);
+      if (isStateDir(abs)) continue;
+      const rel = path.relative(projectRoot, abs).replace(/\\/g, '/');
+      if (!rel) continue;
+      if (ent.isDirectory()) {
+        if (policy.isExcluded(rel) || (extraDeny && extraDeny(rel))) continue; // prune subtree
+        stack.push(abs);
+        continue;
+      }
+      // Entries that are neither directories nor regular files are ignored.
+      if (!ent.isFile()) continue;
+      walked += 1;
+      if (walked % 1000 === 0) progress('dirty-scan:walk');
+      const prev = merkle[rel];
+      const shapeOk = policy.admitsShape(rel) && !(extraDeny && extraDeny(rel));
+      if (!shapeOk) {
+        // New rejected files are dropped; previously-indexed ones are retired.
+        if (prev) present.set(rel, { isNew: false, changed: false, shapeOk: false, sizeOk: false });
+        continue;
+      }
+      let stat;
+      try {
+        stat = fs.statSync(abs, { bigint: true });
+      } catch {
+        // Stat failed: a baseline-known file is recorded as not admissible so
+        // step 3 retires it; an unknown one is simply dropped.
+        if (prev) present.set(rel, { isNew: false, changed: false, shapeOk: false, sizeOk: false });
+        continue;
+      }
+      const sizeOk = stat.size <= maxFileSize;
+      const isNew = !prev;
+      const changed = isNew
+        ? true
+        : (stat.size.toString() !== String(prev.size) || stat.mtimeNs.toString() !== String(prev.mtime_ns));
+      present.set(rel, { isNew, changed, shapeOk: true, sizeOk });
+    }
+  }
+  // 2. Gitignore: ONE batched check over admissible (shape+size OK) files. This
+  //    catches both new files dropped into a gitignored path and previously
+  //    indexed files whose `.gitignore` status changed.
+  const gitCandidates = [];
+  for (const [rel, v] of present) {
+    if (v.shapeOk && v.sizeOk) gitCandidates.push(rel);
+  }
+  progress('dirty-scan:gitignore');
+  const gitignored = await policy.gitignoredSet(gitCandidates);
+  progress('dirty-scan:decide');
+  // 3. Decide enqueues. Admitted+changed → reindex; previously-indexed but no
+  //    longer admitted → retire.
+  const toEnqueue = [];
+  let added = 0;
+  let modified = 0;
+  let deleted = 0;
+  let retired = 0;
+  // Adding to `queued` as well prevents the same path from being enqueued
+  // twice within this call.
+  const enqueue = (rel) => {
+    toEnqueue.push(rel);
+    queued.add(rel);
+  };
+  for (const [rel, v] of present) {
+    if (toEnqueue.length >= maxEnqueue) break;
+    const admitted = v.shapeOk && v.sizeOk && !gitignored.has(rel);
+    if (admitted) {
+      if (v.changed && !queued.has(rel)) {
+        enqueue(rel);
+        v.isNew ? (added += 1) : (modified += 1);
+      }
+    } else if (merkle[rel] && !queued.has(rel)) {
+      enqueue(rel);
+      retired += 1;
+    }
+  }
+  // 4. Merkle-known files not seen in the walk: deleted (gone) or living under a
+  //    directory that just became denied. Either way, retire.
+  for (const rel of Object.keys(merkle)) {
+    if (toEnqueue.length >= maxEnqueue) break;
+    if (present.has(rel) || queued.has(rel)) continue;
+    // Both branches enqueue; the existence check only selects which counter
+    // (deleted vs retired) is incremented.
+    if (!fs.existsSync(path.join(projectRoot, rel))) {
+      enqueue(rel);
+      deleted += 1;
+    } else {
+      enqueue(rel);
+      retired += 1;
+    }
+  }
+  if (toEnqueue.length === 0) {
+    return { enqueued: 0, added: 0, modified: 0, deleted: 0, retired: 0, files: [] };
+  }
+  fs.mkdirSync(stateDir, { recursive: true });
+  const now = Date.now();
+  const iso = new Date(now).toISOString();
+  // All hints from this scan share one timestamp and go out in a single append.
+  const lines = toEnqueue
+    .map((rel) => `${JSON.stringify({ file_path: rel, timestamp: now, queued_at: iso, source: 'scan' })}\n`)
+    .join('');
+  fs.appendFileSync(path.join(stateDir, DIRTY_QUEUE), lines);
+  progress('dirty-scan:queued');
+  return { enqueued: toEnqueue.length, added, modified, deleted, retired, files: toEnqueue };
+}