sweet-search 2.5.2 → 2.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,148 @@
1
+ /**
2
+ * Importance scoring for structural-trace items.
3
+ *
4
+ * The trace builder computes an `importance` value per caller, callee, and
5
+ * impact path. The formula fuses three classes of signal:
6
+ *
7
+ * 1. Query-time directional Personalized PageRank (PPR) from the target —
8
+ * dominant graph signal. Backward direction ranks callers; forward
9
+ * direction ranks callees. This is what makes "fan-in" mean *important*
10
+ * fan-in: a leaf utility called by many things does not get high PPR
11
+ * relative to a specific target unless that target itself reaches it.
12
+ *
13
+ * 2. Static index-time PageRank (page_rank column) — a backstop that helps
14
+ * degenerate subgraphs (small symbols, brand-new code with no incoming
15
+ * edges yet) where Forward Push has nothing to flow.
16
+ *
17
+ * 3. Structural heuristics — relationship type, depth, exported-API status,
18
+ * type kind, hint-token overlap, fan-in tiebreaker. Penalties for
19
+ * test-only paths and unresolved external nodes.
20
+ *
21
+ * Score weights sum to 1.0 over the positive terms; the test/external
22
+ * negatives are intentionally large enough to guarantee production callers
23
+ * beat fixtures.
24
+ */
25
+
26
/**
 * Multiplier per relationship kind, fed into the relationship term of
 * scoreEntity. Kinds that are not listed fall back to 0.55 there.
 */
export const REL_WEIGHT = {
  calls: 1.0,
  uses: 0.72,
  implements: 0.88,
  extends: 0.84,
  overrides: 0.78,
};

/**
 * Multiplier per entity kind, fed into the type-kind term of scoreEntity.
 * Kinds that are not listed fall back to 0.5 there.
 */
export const TYPE_WEIGHT = {
  class: 0.92,
  struct: 0.9,
  trait: 0.88,
  interface: 0.86,
  enum: 0.84,
  function: 0.84,
  method: 0.82,
  component: 0.78,
  type: 0.7,
  typeAlias: 0.68,
  external: 0.2,
};
31
+
32
// Three alternatives: a test/spec/fixture/example/doc directory segment
// anywhere in the path, a JS/TS file whose stem ends in -test/_test/.test (or
// the spec equivalents), or a Go `_test.go` file.
const TEST_PATH_PATTERN = /(^|\/)(__tests__|tests?|spec|fixtures|examples?|docs?)(\/|$)|[-_.](test|spec)\.[cm]?[jt]sx?$|_test\.go$/;

/**
 * Report whether a file path looks like test, fixture, example or docs code
 * rather than production code.
 *
 * @param {string} [filePath=''] - forward-slash separated path
 * @returns {boolean} true when the path matches one of the test-path patterns
 */
export function isTestPath(filePath = '') {
  return TEST_PATH_PATTERN.test(filePath);
}
35
+
36
/**
 * Heuristically decide whether an entity is part of the public API surface.
 *
 * An entity counts as exported when any of these hold:
 *   - its name starts with an ASCII capital letter (Go-style exported
 *     identifiers, class/type names);
 *   - its signature contains the word `export`, `public` or `pub`;
 *   - its name is a SCREAMING_SNAKE constant (optionally underscore-led).
 *
 * The capital-letter check is done with an explicit `[A-Z]` class. Comparing
 * the first character with its own upper-cased form is also true for `_` and
 * digits, which would mark private helpers such as `_helper` as exported.
 *
 * @param {{ name?: string, signature?: string } | null | undefined} entity
 * @returns {boolean} true when the entity looks exported; false for a missing name
 */
export function isExported(entity) {
  const name = entity?.name || '';
  const sig = entity?.signature || '';
  if (!name) return false;
  if (/^[A-Z]/.test(name)) return true;
  return /\b(export|public|pub)\b/.test(sig) || /^[A-Z_][A-Z0-9_]+$/.test(name);
}
43
+
44
/**
 * Log-scaled ratio of `value` to `maxValue`: log1p(value) / log1p(maxValue).
 *
 * @param {number} value - raw signal
 * @param {number} maxValue - normalisation ceiling for that signal
 * @returns {number} 0 when either argument is falsy (0, NaN, null, undefined),
 *   otherwise the log1p ratio
 */
export function logNorm(value, maxValue) {
  const usable = Boolean(value) && Boolean(maxValue);
  return usable ? Math.log1p(value) / Math.log1p(maxValue) : 0;
}
48
+
49
/**
 * Split text into unique lower-case identifier-like tokens.
 *
 * A token starts with a letter or underscore and is at least three characters
 * long; first-seen order is kept.
 *
 * @param {*} text - anything; falsy input is treated as the empty string
 * @returns {string[]} de-duplicated tokens
 */
export function tokenize(text) {
  const lowered = String(text || '').toLowerCase();
  const words = lowered.match(/[a-z_][a-z0-9_]{2,}/g);
  if (words === null) return [];
  return Array.from(new Set(words));
}
52
+
53
// Ranking-ablation switches, sampled once when this module is loaded:
//   SWEET_SEARCH_TRACE_NO_PPR=1  zeroes the 0.20 directional-PPR term.
//   SWEET_SEARCH_TRACE_NO_PR=1   zeroes the 0.10 static-PageRank term.
// They let a probe run measure what each graph signal contributes without
// rebuilding the index or keeping a second worktree. Both are off unless the
// variable is exactly the string '1'.
const envSwitchOn = (variable) => process.env[variable] === '1';
const NO_PPR = envSwitchOn('SWEET_SEARCH_TRACE_NO_PPR');
const NO_PR = envSwitchOn('SWEET_SEARCH_TRACE_NO_PR');

/**
 * Expose the ablation switches captured at module load.
 *
 * @returns {{ noPpr: boolean, noPr: boolean }} a fresh object on every call
 */
export function getAblationFlags() {
  const flags = { noPpr: NO_PPR, noPr: NO_PR };
  return flags;
}
64
+
65
/**
 * Fraction of hint tokens that occur (as substrings) in the entity's name,
 * type, signature or summary.
 *
 * Missing or null fields contribute nothing to the searched text. Interpolating
 * them directly would insert the words "undefined"/"null", and hint tokens such
 * as `define` or `null` would then match every entity without a summary.
 *
 * @param {{ name?: string, type?: string, signature?: string, summary?: string }} entity
 * @param {string[]} hintTokens - lower-case tokens (see tokenize)
 * @returns {number} hits / hintTokens.length, or 0 when there are no tokens
 */
export function hintScore(entity, hintTokens) {
  if (!hintTokens?.length) return 0;
  const haystack = [entity?.name, entity?.type, entity?.signature, entity?.summary]
    .map((field) => (field == null ? '' : String(field)))
    .join(' ')
    .toLowerCase();
  let hits = 0;
  for (const token of hintTokens) {
    if (haystack.includes(token)) hits++;
  }
  return hits / hintTokens.length;
}
72
+
73
/**
 * Compute the importance of one caller / callee / impact-path node.
 *
 * Positive terms (weights total 1.0):
 *   0.20  proximity, 1 / depth        direct neighbours of the target first
 *   0.16  relationship kind           REL_WEIGHT, 0.55 for unknown kinds
 *   0.20  directional PPR             log-normalised against ctx.maxPpr
 *   0.10  static PageRank             log-normalised against ctx.maxPageRank
 *   0.10  fan-in                      log-normalised against ctx.maxFanIn
 *   0.08  entity kind                 TYPE_WEIGHT, 0.5 for unknown kinds
 *   0.06  exported                    isExported(entity)
 *   0.10  hint-token overlap          hintScore against ctx.hintTokens
 *
 * Penalties, subtracted after the positive terms:
 *   0.38  when isTestPath(entity.filePath)
 *   0.25  when entity.type === 'external'
 *
 * The PPR and PageRank terms are dropped entirely when the corresponding
 * NO_PPR / NO_PR ablation switch is set.
 *
 * @param {object} entity - row with id, relationship, depth, type, filePath,
 *   plus the name/signature/summary fields read by isExported and hintScore
 * @param {object} ctx - score context: fan, pageRank and pprScores lookups
 *   (anything with a `get(id)` method), maxFanIn / maxPageRank / maxPpr, and
 *   hintTokens
 * @returns {number} importance, floored at 0.01; because the positive weights
 *   total 1.0 the value stays at or below 1.0 as long as the ctx max* values
 *   bound the per-entity values
 */
export function scoreEntity(entity, ctx) {
  const lookup = (table) => table?.get?.(entity.id);

  const fanRecord = lookup(ctx.fan) || { fanIn: 0, fanOut: 0 };
  const ppr = lookup(ctx.pprScores) || 0;
  const pageRank = lookup(ctx.pageRank) || 0;
  const depth = Math.max(1, entity.depth || 1);
  const relWeight = REL_WEIGHT[entity.relationship] ?? 0.55;
  const typeWeight = TYPE_WEIGHT[entity.type] ?? 0.5;

  // Kept in this exact order: floating-point addition is order-sensitive and
  // the running sum below must reproduce the same value bit for bit.
  const positiveTerms = [
    0.20 * (1 / depth),
    0.16 * relWeight,
    NO_PPR ? 0 : 0.20 * logNorm(ppr, ctx.maxPpr || 1),
    NO_PR ? 0 : 0.10 * logNorm(pageRank, ctx.maxPageRank || 1),
    0.10 * logNorm(fanRecord.fanIn, ctx.maxFanIn || 1),
    0.08 * typeWeight,
    0.06 * (isExported(entity) ? 1 : 0),
    0.10 * hintScore(entity, ctx.hintTokens || []),
  ];

  let score = 0;
  for (const term of positiveTerms) score += term;

  if (isTestPath(entity.filePath)) score -= 0.38;
  if (entity.type === 'external') score -= 0.25;
  return Math.max(0.01, score);
}
115
+
116
/**
 * Score an impact path from the weakest and the average importance of its
 * nodes: (0.55 * min + 0.45 * mean) / sqrt(depth).
 *
 * The node adjacent to the trace target is excluded: the first node of a
 * downstream path, the last node of any other path. Each remaining node is
 * scored with scoreEntity at the path's depth, against whichever directional
 * score context the caller supplies.
 *
 * Depth is clamped to at least 1, the same rule scoreEntity applies. Without
 * the clamp a depth of 0 divides by zero (Infinity) and a missing depth yields
 * NaN, which Math.max would pass straight through to the caller.
 *
 * @param {object} path - { direction, depth, path: [...nodes] }
 * @param {object} ctx - directional score context forwarded to scoreEntity
 * @returns {number} finite importance, floored at 0.01; 0.01 when the path has
 *   no scorable nodes or no node list at all
 */
export function scoreImpactPath(path, ctx) {
  const depth = Math.max(1, path.depth || 1);
  const hops = Array.isArray(path.path) ? path.path : [];
  const nodes = path.direction === 'downstream' ? hops.slice(1) : hops.slice(0, -1);
  if (!nodes.length) return 0.01;

  const scored = nodes.map((node) => scoreEntity({ ...node, depth }, ctx));
  const bottleneck = Math.min(...scored);
  const avg = scored.reduce((a, b) => a + b, 0) / scored.length;
  return Math.max(0.01, (0.55 * bottleneck + 0.45 * avg) / Math.sqrt(depth));
}
136
+
137
/**
 * Largest finite number in `values`, with a floor of 1e-9.
 *
 * The floor means the result can always be used as a logNorm ceiling, even
 * when the iterable is empty or holds only zeros, negatives or non-finite
 * entries.
 *
 * @param {Iterable<number>} values
 * @returns {number} the maximum finite value, never below 1e-9
 */
export function safeMax(values) {
  const FLOOR = 1e-9;
  let best = FLOOR;
  for (const candidate of values) {
    if (!Number.isFinite(candidate)) continue;
    if (candidate > best) best = candidate;
  }
  return best;
}
@@ -0,0 +1,197 @@
1
+ /**
2
+ * Structural PageRank — index-time edge-weighted PageRank for the entity call graph.
3
+ *
4
+ * Distinct from core/graph/repo-map.js (which runs unweighted PR on a deduped
5
+ * adjacency for repo-map rendering). This module:
6
+ * 1. Builds a weighted adjacency directly from `relationships.weight` so that
7
+ * a function called five times gets five units of mass, not one.
8
+ * 2. Persists the result to a `page_rank REAL` column on `entities`, so the
9
+ * structural-trace builder can read it as a backstop importance signal at
10
+ * query time without recomputing.
11
+ *
12
+ * Domain layer: the actual SQL writes happen behind a writable database handle
13
+ * passed in by the index builder; no path or filesystem concerns leak in.
14
+ */
15
// Power-iteration defaults used by pageRankWeighted when the caller's opts
// leave the corresponding field unset.
const DEFAULT_DAMPING = 0.85;
const DEFAULT_MAX_ITERATIONS = 50;
const DEFAULT_CONVERGENCE = 1e-6;

// Rows per write transaction in populatePageRankColumn.
const DEFAULT_BATCH_SIZE = 500;

// Per-kind multiplier applied to a relationship row's weight when the
// adjacency is built; kinds not listed here use 0.5.
const RELATIONSHIP_TYPE_WEIGHTS = {
  calls: 1.0,
  uses: 0.6,
  implements: 0.85,
  extends: 0.85,
  overrides: 0.75,
};
26
+
27
/**
 * Run edge-weighted PageRank power iteration over a call/use graph.
 *
 * Handling of special cases:
 *   - edges whose source or target is not in `allNodes`, and self-loops, are
 *     ignored;
 *   - edges whose weight is not a positive finite number are ignored, matching
 *     the filter buildWeightedAdjacency applies; a negative, NaN or infinite
 *     weight would otherwise produce negative or NaN scores;
 *   - nodes left with no usable out-edge are dangling: their mass is spread
 *     uniformly over all nodes each iteration;
 *   - each source's out-weights are normalised by that source's total;
 *   - iteration stops early once the L1 change drops below `convergence`.
 *
 * @param {Map<string, Map<string, number>>} weightedOutEdges - source → (target → weight)
 * @param {Set<string>} allNodes - every entity ID, including dangling ones
 * @param {object} [opts]
 * @param {number} [opts.damping=0.85]
 * @param {number} [opts.maxIterations=50]
 * @param {number} [opts.convergence=1e-6]
 * @returns {Map<string, number>} entity ID → PageRank score (sums to ~1);
 *   empty when `allNodes` is empty
 */
export function pageRankWeighted(weightedOutEdges, allNodes, opts = {}) {
  const damping = opts.damping ?? DEFAULT_DAMPING;
  const maxIter = opts.maxIterations ?? DEFAULT_MAX_ITERATIONS;
  const convergence = opts.convergence ?? DEFAULT_CONVERGENCE;
  const n = allNodes.size;
  if (n === 0) return new Map();

  // Teleport share every node receives regardless of links.
  const base = (1 - damping) / n;

  // Invert the adjacency once: the iteration pulls mass along in-edges.
  const totalOutWeight = new Map();
  const inEdges = new Map();
  for (const node of allNodes) inEdges.set(node, []);
  for (const [src, targets] of weightedOutEdges) {
    if (!allNodes.has(src)) continue;
    let sum = 0;
    for (const [tgt, w] of targets) {
      if (!allNodes.has(tgt) || tgt === src) continue;
      if (!(w > 0) || !Number.isFinite(w)) continue;
      sum += w;
      inEdges.get(tgt).push([src, w]);
    }
    if (sum > 0) totalOutWeight.set(src, sum);
  }

  const danglingNodes = [];
  for (const node of allNodes) {
    if (!totalOutWeight.has(node)) danglingNodes.push(node);
  }

  let scores = new Map();
  const initScore = 1 / n;
  for (const node of allNodes) scores.set(node, initScore);

  for (let iter = 0; iter < maxIter; iter++) {
    let danglingMass = 0;
    for (const node of danglingNodes) danglingMass += scores.get(node);
    const danglingShare = (damping * danglingMass) / n;

    const next = new Map();
    let delta = 0;
    for (const node of allNodes) {
      let inflow = 0;
      for (const [src, w] of inEdges.get(node)) {
        // Every recorded in-edge has a source with a positive total.
        inflow += scores.get(src) * (w / totalOutWeight.get(src));
      }
      const newScore = base + damping * inflow + danglingShare;
      next.set(node, newScore);
      delta += Math.abs(newScore - scores.get(node));
    }
    scores = next;
    if (delta < convergence) break;
  }
  return scores;
}
96
+
97
/**
 * Read the live entity set and the weighted out-edge map from the code graph.
 *
 * Nodes are the entities whose `stale_since` is NULL. An edge is kept only
 * when both endpoints are live nodes, it is not a self-loop, and its effective
 * weight, `row.weight * RELATIONSHIP_TYPE_WEIGHTS[row.type]` (0.5 for unknown
 * types, a NULL weight read as 1.0), is greater than zero. Several rows for
 * the same (source, target) pair are merged by summing their effective
 * weights. Rows with a NULL source or target are excluded by the query.
 *
 * @param {import('better-sqlite3').Database} db - code-graph DB handle
 * @returns {{ outEdges: Map<string, Map<string, number>>, allNodes: Set<string> }}
 */
export function buildWeightedAdjacency(db) {
  const liveRows = db.prepare('SELECT id FROM entities WHERE stale_since IS NULL').iterate();
  const allNodes = new Set();
  for (const { id } of liveRows) allNodes.add(id);

  const edgeRows = db.prepare(`
    SELECT source_id, target_id, type, COALESCE(weight, 1.0) AS weight
    FROM relationships
    WHERE source_id IS NOT NULL AND target_id IS NOT NULL
  `).iterate();

  const outEdges = new Map();
  for (const { source_id: src, target_id: tgt, type, weight } of edgeRows) {
    const bothLive = allNodes.has(src) && allNodes.has(tgt);
    if (!bothLive || src === tgt) continue;

    const effective = weight * (RELATIONSHIP_TYPE_WEIGHTS[type] ?? 0.5);
    if (!(effective > 0)) continue;

    if (!outEdges.has(src)) outEdges.set(src, new Map());
    const bucket = outEdges.get(src);
    bucket.set(tgt, (bucket.get(tgt) || 0) + effective);
  }
  return { outEdges, allNodes };
}
136
+
137
/**
 * Make sure `entities.page_rank` (REAL, default 0) and its partial index exist.
 * Safe to call repeatedly.
 *
 * A "duplicate column" failure from the ALTER is handled right where it
 * happens, for example when another writer added the column after the PRAGMA
 * check. Handling it there lets the index creation below still run; treating
 * it as success in an outer handler would return true without ever creating
 * the index.
 *
 * @param {import('better-sqlite3').Database} db - writable
 * @returns {boolean} true when column and index are in place, false when any
 *   other error occurred (the error is not rethrown)
 */
export function ensurePageRankColumn(db) {
  try {
    const columns = db.prepare('PRAGMA table_info(entities)').all();
    const hasColumn = columns.some((col) => col.name === 'page_rank');
    if (!hasColumn) {
      try {
        db.exec('ALTER TABLE entities ADD COLUMN page_rank REAL DEFAULT 0');
      } catch (err) {
        if (!/duplicate column/i.test(err?.message ?? '')) throw err;
      }
    }
    db.exec('CREATE INDEX IF NOT EXISTS idx_entities_page_rank ON entities(page_rank) WHERE stale_since IS NULL');
    return true;
  } catch {
    // Best-effort migration: report failure through the return value.
    return false;
  }
}
158
+
159
/**
 * Compute weighted PageRank for the live entities and store each score in
 * `entities.page_rank`.
 *
 * Steps: ensure the column exists, build the weighted adjacency, run
 * pageRankWeighted with the given opts, then write the scores in batches of
 * `opts.batchSize` (default DEFAULT_BATCH_SIZE), one transaction per batch.
 *
 * @param {import('better-sqlite3').Database} db - writable code-graph DB
 * @param {object} [opts] - forwarded to pageRankWeighted; `batchSize` is read here
 * @returns {{ entities: number, iterations: number, ms: number, written: number }}
 *   `iterations` is a placeholder: 0 when there are no entities, otherwise -1
 *   (the iteration count is not reported by pageRankWeighted)
 */
export function populatePageRankColumn(db, opts = {}) {
  const started = Date.now();
  const elapsed = () => Date.now() - started;

  ensurePageRankColumn(db);
  const { outEdges, allNodes } = buildWeightedAdjacency(db);
  if (allNodes.size === 0) {
    return { entities: 0, iterations: 0, ms: elapsed(), written: 0 };
  }

  const scores = pageRankWeighted(outEdges, allNodes, opts);
  const update = db.prepare('UPDATE entities SET page_rank = ? WHERE id = ?');
  const batchSize = opts.batchSize ?? DEFAULT_BATCH_SIZE;
  const writeBatch = db.transaction((rows) => {
    rows.forEach(([id, score]) => update.run(score, id));
  });

  let written = 0;
  let pending = [];
  const flush = () => {
    writeBatch(pending);
    written += pending.length;
    pending = [];
  };
  for (const entry of scores) {
    pending.push(entry);
    if (pending.length >= batchSize) flush();
  }
  if (pending.length) flush();

  return { entities: allNodes.size, iterations: -1, ms: elapsed(), written };
}

// Internals exposed for unit tests only.
export const __TEST__ = { RELATIONSHIP_TYPE_WEIGHTS, DEFAULT_DAMPING };
@@ -23,6 +23,7 @@ import { existsSync } from 'fs';
23
23
  import fs from 'fs/promises';
24
24
  import path from 'path';
25
25
  import { DB_PATHS } from '../infrastructure/config/index.js';
26
+ import { chunkedInExec } from '../infrastructure/db-utils.js';
26
27
 
27
28
  // =============================================================================
28
29
  // CRASH-SAFE DISK PERSISTENCE
@@ -402,17 +403,20 @@ export async function markForRegeneration(dbPath = DB_PATHS.codeGraph, filePaths
402
403
  const Database = (await import('better-sqlite3')).default;
403
404
  const db = new Database(dbPath);
404
405
 
405
- const placeholders = filePaths.map(() => '?').join(',');
406
- const stmt = db.prepare(`
407
- UPDATE entities
408
- SET summary = NULL, summary_embedding = NULL
409
- WHERE file_path IN (${placeholders})
410
- `);
411
-
412
- const result = stmt.run(...filePaths);
406
+ // Chunk to stay under SQLite's bound-parameter limit. Caller may pass
407
+ // tens of thousands of file paths on large initial mark-for-regeneration
408
+ // operations; an unchunked IN(?,?,...) crashes with "too many SQL
409
+ // variables". chunkedInExec wraps the per-batch run in a transaction.
410
+ const { changes } = chunkedInExec(
411
+ db,
412
+ `UPDATE entities
413
+ SET summary = NULL, summary_embedding = NULL
414
+ WHERE file_path IN (__IN_PLACEHOLDERS__)`,
415
+ filePaths,
416
+ );
413
417
  db.close();
414
418
 
415
- return { marked: result.changes };
419
+ return { marked: changes };
416
420
  }
417
421
 
418
422
  /**
@@ -0,0 +1,236 @@
1
+ /**
2
+ * Tick-driven dirty-file producer for the default-on incremental maintainer.
3
+ *
4
+ * The reconcile tick is a *consumer*: it drains `index-maintainer-queue.jsonl`
5
+ * and reindexes whatever paths were enqueued. Something has to PRODUCE those
6
+ * entries. `sweet-search index --add <path>` does it manually, and an editor
7
+ * hook can do it per-edit — but with neither, an ordinary file save is never
8
+ * observed and the index silently goes stale (release-gate finding C1).
9
+ *
10
+ * This module is the missing autonomous producer. Once per tick (before the
11
+ * consume step) it diffs the working tree against the reconciler's own
12
+ * `merkle-state.json` baseline using a cheap stat comparison (size + mtime_ns,
13
+ * no hashing) and appends add / modify / delete hints to the same JSONL queue
14
+ * the reconciler already drains. The reconciler then updates `merkle-state.json`
15
+ * for the files it processes, so the next scan sees them as unchanged — the
16
+ * queue does not grow without bound.
17
+ *
18
+ * Admission: it uses the SAME `admission-policy` full indexing uses, so a file a
19
+ * fresh `sweet-search index` would skip (wrong extension, gitignored, excluded,
20
+ * oversized) is never newly enqueued, and a file full indexing would admit is
21
+ * eligible. Gitignore is evaluated in ONE batched `git check-ignore` per tick,
22
+ * never per file.
23
+ *
24
+ * Current-session convergence: a previously-indexed file that is deleted, or
25
+ * that becomes excluded / oversized / gitignored, is enqueued so the consumer
26
+ * retires it — incremental results then match a fresh full rebuild. (The
27
+ * consumer is the authority on admit-vs-retire; this producer only decides what
28
+ * to enqueue.)
29
+ *
30
+ * Design notes:
31
+ * - Walks the whole tree each tick (pruning denied directories) so the "seen"
32
+ * set is complete and unchanged-but-now-excluded files are not mistaken for
33
+ * deletions; only the *enqueue* list is bounded by `maxEnqueue`.
34
+ * - De-dupes against paths already in the dirty/processing queues so repeated
35
+ * ticks before a slow reconcile don't pile up duplicates.
36
+ * - Opt-out: `SWEET_SEARCH_RECONCILE_SCAN=0|false|off` disables just the
37
+ * producer (the maintainer keeps consuming externally-enqueued hints).
38
+ */
39
+
40
+ import fs from 'node:fs';
41
+ import path from 'node:path';
42
+
43
+ import { createAdmissionPolicy } from '../../indexing/admission-policy.js';
44
+
45
+ const DIRTY_QUEUE = 'index-maintainer-queue.jsonl';
46
+ const PROCESSING_QUEUE = 'index-maintainer-queue.processing.jsonl';
47
+ const MERKLE_STATE = 'merkle-state.json';
48
+ const DEFAULT_MAX_ENQUEUE = 5000;
49
+
50
/**
 * Reports whether the autonomous dirty-scan producer should run.
 *
 * The switch is the `SWEET_SEARCH_RECONCILE_SCAN` variable of `env`. An unset
 * or empty value leaves the producer on; only the tokens `0`, `false` and
 * `off` (compared after trimming and lower-casing) turn it off. Any other
 * value keeps it enabled.
 *
 * @param {object} [env] Environment map to read; defaults to `process.env`.
 * @returns {boolean} `true` when the scan producer is enabled.
 */
export function dirtyScanEnabled(env = process.env) {
  const setting = env.SWEET_SEARCH_RECONCILE_SCAN;
  if (setting == null || setting === '') {
    return true;
  }
  switch (String(setting).trim().toLowerCase()) {
    case '0':
    case 'false':
    case 'off':
      return false;
    default:
      return true;
  }
}
57
+
58
/**
 * Load the per-file baseline stored under the `files` key of
 * `merkle-state.json` in `stateDir`.
 *
 * Best-effort by design: a missing, unreadable or unparsable state file, or a
 * `files` value that is not a plain keyed object, yields an empty baseline
 * instead of throwing.
 *
 * @param {string} stateDir Directory that holds `merkle-state.json`.
 * @returns {object} Map of project-relative path -> recorded entry, or `{}`.
 */
function readMerkleFiles(stateDir) {
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(path.join(stateDir, MERKLE_STATE), 'utf8'));
  } catch {
    return {};
  }
  const files = parsed && parsed.files;
  // `typeof [] === 'object'` as well: reject arrays so their index keys
  // ("0", "1", ...) are never mistaken for baseline file paths.
  if (!files || typeof files !== 'object' || Array.isArray(files)) {
    return {};
  }
  return files;
}
66
+
67
/**
 * Collects the paths that are already waiting in the dirty queue or in the
 * in-flight processing queue under `stateDir`.
 *
 * Both files are read as JSONL. A queue file that cannot be read contributes
 * nothing, blank lines are skipped, and a line that fails to parse (or whose
 * `file_path` cannot be turned into a string) is ignored. Backslashes in each
 * `file_path` are rewritten to forward slashes.
 *
 * @param {string} stateDir Directory that holds the queue files.
 * @returns {Set<string>} Forward-slashed `file_path` values found in either queue.
 */
function alreadyQueued(stateDir) {
  const seen = new Set();

  const linesOf = (fileName) => {
    try {
      return fs.readFileSync(path.join(stateDir, fileName), 'utf8').split('\n');
    } catch {
      return [];
    }
  };

  // Returns the normalised path for one JSONL record, or null when the record
  // is malformed or carries no usable `file_path`.
  const normalisedPath = (jsonText) => {
    try {
      const filePath = JSON.parse(jsonText).file_path;
      return filePath ? String(filePath).replace(/\\/g, '/') : null;
    } catch {
      /* tolerate a malformed line */
      return null;
    }
  };

  [DIRTY_QUEUE, PROCESSING_QUEUE].forEach((fileName) => {
    linesOf(fileName).forEach((rawLine) => {
      const text = rawLine.trim();
      if (text === '') return;
      const rel = normalisedPath(text);
      if (rel !== null) seen.add(rel);
    });
  });

  return seen;
}
90
+
91
/**
 * Diff the working tree against merkle-state.json and append dirty hints.
 *
 * Walks `projectRoot` (pruning denied directories and the state dir itself),
 * compares each admissible file's size and mtime_ns with the recorded
 * baseline, and appends one JSONL record per changed / new / no-longer-admitted
 * / missing path to the dirty queue in `stateDir`. Paths already present in
 * the dirty or processing queue are not enqueued again. Nothing is written
 * when there is nothing to enqueue.
 *
 * @param {object} opts
 * @param {string} opts.projectRoot Root of the tree to scan.
 * @param {string} opts.stateDir Directory holding merkle-state.json and the queues.
 * @param {object} [opts.admissionPolicy] Shared admission policy (created from projectRoot if omitted).
 *   Must provide `maxFileSize`, `isExcluded(rel)`, `admitsShape(rel)` and `gitignoredSet(rels)`.
 * @param {(rel:string)=>boolean} [opts.isExcluded] Extra deny predicate layered on the policy.
 * @param {number} [opts.maxEnqueue] Upper bound on paths enqueued per call.
 * @param {(phase:string)=>void} [opts.onProgress] Called with a phase label as the scan advances.
 * @returns {Promise<{enqueued:number, added:number, modified:number, deleted:number, retired:number, files:string[]}>}
 *   Counts per category plus the enqueued project-relative paths.
 */
export async function scanDirtyAndEnqueue({ projectRoot, stateDir, admissionPolicy, isExcluded, maxEnqueue = DEFAULT_MAX_ENQUEUE, onProgress = null }) {
  const policy = admissionPolicy || createAdmissionPolicy({ projectRoot });
  const extraDeny = typeof isExcluded === 'function' ? isExcluded : null;
  const merkle = readMerkleFiles(stateDir);
  // Baseline lookups must only see OWN keys: `merkle` is a plain parsed object,
  // so a bare `merkle[rel]` for a file named `constructor`, `toString`, ... would
  // resolve to an inherited Object.prototype member and make a never-indexed
  // file look previously indexed.
  const baselineOf = (rel) => (Object.prototype.hasOwnProperty.call(merkle, rel) ? merkle[rel] : undefined);
  const queued = alreadyQueued(stateDir);
  const maxFileSize = BigInt(policy.maxFileSize);
  const progress = typeof onProgress === 'function'
    ? (phase) => { onProgress(phase); }
    : () => {};
  let walked = 0;

  // Never enqueue the maintainer's own state dir — its queues/manifests/db are
  // not source files and must be skipped regardless of the policy.
  const stateDirResolved = path.resolve(stateDir);
  const isStateDir = (abs) => {
    const r = path.resolve(abs);
    return r === stateDirResolved || r.startsWith(stateDirResolved + path.sep);
  };

  // 1. Full walk: classify every present file; prune denied directories so we
  //    never descend node_modules/.git/etc. `present` keeps shape-rejected
  //    merkle files too (they must be retired).
  const present = new Map(); // rel -> { isNew, changed, shapeOk, sizeOk }
  const stack = [projectRoot];
  while (stack.length) {
    const dir = stack.pop();
    let entries;
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch {
      // Unreadable directory: skip it; its merkle-known files fall to step 4.
      continue;
    }
    for (const ent of entries) {
      const abs = path.join(dir, ent.name);
      if (isStateDir(abs)) continue;
      const rel = path.relative(projectRoot, abs).replace(/\\/g, '/');
      if (!rel) continue;
      if (ent.isDirectory()) {
        if (policy.isExcluded(rel) || (extraDeny && extraDeny(rel))) continue; // prune subtree
        stack.push(abs);
        continue;
      }
      if (!ent.isFile()) continue;
      walked += 1;
      if (walked % 1000 === 0) progress('dirty-scan:walk');
      const prev = baselineOf(rel);
      const shapeOk = policy.admitsShape(rel) && !(extraDeny && extraDeny(rel));
      if (!shapeOk) {
        // New rejected files are dropped; previously-indexed ones are retired.
        if (prev) present.set(rel, { isNew: false, changed: false, shapeOk: false, sizeOk: false });
        continue;
      }
      let stat;
      try {
        stat = fs.statSync(abs, { bigint: true });
      } catch {
        // Could not stat: treat like a non-admitted file so a baseline entry is retired.
        if (prev) present.set(rel, { isNew: false, changed: false, shapeOk: false, sizeOk: false });
        continue;
      }
      const sizeOk = stat.size <= maxFileSize;
      const isNew = !prev;
      // Cheap change detection: string-compare size and mtime_ns with the baseline.
      const changed = isNew
        ? true
        : (stat.size.toString() !== String(prev.size) || stat.mtimeNs.toString() !== String(prev.mtime_ns));
      present.set(rel, { isNew, changed, shapeOk: true, sizeOk });
    }
  }

  // 2. Gitignore: ONE batched check over admissible (shape+size OK) files. This
  //    catches both new files dropped into a gitignored path and previously
  //    indexed files whose `.gitignore` status changed.
  const gitCandidates = [];
  for (const [rel, v] of present) {
    if (v.shapeOk && v.sizeOk) gitCandidates.push(rel);
  }
  progress('dirty-scan:gitignore');
  const gitignored = await policy.gitignoredSet(gitCandidates);
  progress('dirty-scan:decide');

  // 3. Decide enqueues. Admitted+changed → reindex; previously-indexed but no
  //    longer admitted → retire.
  const toEnqueue = [];
  let added = 0;
  let modified = 0;
  let deleted = 0;
  let retired = 0;
  const enqueue = (rel) => {
    toEnqueue.push(rel);
    queued.add(rel);
  };

  for (const [rel, v] of present) {
    if (toEnqueue.length >= maxEnqueue) break;
    const admitted = v.shapeOk && v.sizeOk && !gitignored.has(rel);
    if (admitted) {
      if (v.changed && !queued.has(rel)) {
        enqueue(rel);
        if (v.isNew) {
          added += 1;
        } else {
          modified += 1;
        }
      }
    } else if (baselineOf(rel) && !queued.has(rel)) {
      enqueue(rel);
      retired += 1;
    }
  }

  // 4. Merkle-known files not seen in the walk: deleted (gone) or living under a
  //    directory that just became denied. Either way, retire.
  for (const rel of Object.keys(merkle)) {
    if (toEnqueue.length >= maxEnqueue) break;
    if (present.has(rel) || queued.has(rel)) continue;
    const gone = !fs.existsSync(path.join(projectRoot, rel));
    enqueue(rel);
    if (gone) {
      deleted += 1;
    } else {
      retired += 1;
    }
  }

  if (toEnqueue.length === 0) {
    return { enqueued: 0, added: 0, modified: 0, deleted: 0, retired: 0, files: [] };
  }

  fs.mkdirSync(stateDir, { recursive: true });
  const now = Date.now();
  const iso = new Date(now).toISOString();
  const lines = toEnqueue
    .map((rel) => `${JSON.stringify({ file_path: rel, timestamp: now, queued_at: iso, source: 'scan' })}\n`)
    .join('');
  fs.appendFileSync(path.join(stateDir, DIRTY_QUEUE), lines);
  progress('dirty-scan:queued');

  return { enqueued: toEnqueue.length, added, modified, deleted, retired, files: toEnqueue };
}