sweet-search 2.5.2 → 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -0,0 +1,133 @@
1
+ /**
2
+ * Retired-vector physical GC for `codebase.db`.
3
+ *
4
+ * `applyDiff` (vector-delta-writer) only *tombstones* superseded rows by
5
+ * setting `epoch_retired`; nothing ever deletes them, so `codebase.db` grows
6
+ * without bound under a long-lived daemon. This module physically removes
7
+ * retired rows once no reader can still observe them.
8
+ *
9
+ * Safety (strict visibility, plan § 8.1.1):
10
+ * A row is visible to a reader pinned at manifest epoch E iff
11
+ * epoch_written <= E AND (epoch_retired IS NULL OR epoch_retired > E).
12
+ * So a row with `epoch_retired = R` is invisible to every reader whose
13
+ * pinned epoch E satisfies E >= R. The smallest epoch any live reader
14
+ * pins is `minLiveEpoch` (from reader heartbeats); the repository always
15
+ * re-syncs to the *latest* manifest, so a reader's query epoch is never
16
+ * below its heartbeat epoch. Therefore deleting rows with
17
+ * `epoch_retired <= frontier`, where `frontier = minLiveEpoch ??
18
+ * currentManifestEpoch`, can never remove a row any reader still sees:
19
+ * - readers present → frontier = minLiveEpoch <= every reader's epoch.
20
+ * - no readers → frontier = currentManifestEpoch; any future
21
+ * reader reads a manifest at epoch >= currentManifestEpoch (monotonic),
22
+ * so deleted rows (retired <= currentManifestEpoch <= future E) are
23
+ * already invisible to it.
24
+ *
25
+ * The float/binary HNSW indices keep their own vector copies and are rebuilt
26
+ * from live rows (`epoch_retired IS NULL`); they never read retired rows, so
27
+ * deleting retired DB rows cannot desync them.
28
+ */
29
+
30
+ import fs from 'node:fs';
31
+ import path from 'node:path';
32
+ import Database from 'better-sqlite3';
33
+
34
export const DEFAULT_GC_BATCH = 2000;
export const DEFAULT_GC_MAX_ROWS = 100_000;

/**
 * Physically remove `vectors` rows whose `epoch_retired` is set and is at or
 * below `frontier`, working in bounded batches.
 *
 * The DELETE targets rowids picked by a `SELECT … LIMIT ?` subquery, so it
 * does not depend on SQLite being built with
 * `SQLITE_ENABLE_UPDATE_DELETE_LIMIT`.
 *
 * The loop ends when a batch deletes fewer rows than requested (drained) or
 * when `maxRows` rows have been deleted in total (`hitCap`). Note that
 * `hitCap` is also reported when the cap is reached by a batch that happened
 * to delete the last eligible rows — the function does not probe for leftovers.
 *
 * @param {import('better-sqlite3').Database} db open read-write connection
 * @param {number} frontier inclusive prune boundary (must be an integer)
 * @param {{batchSize?:number, maxRows?:number}} [opts] non-positive or
 *   non-integer values fall back to the module defaults
 * @returns {{deleted:number, batches:number, hitCap:boolean}}
 * @throws {Error} when `frontier` is not an integer
 */
export function pruneRetiredVectors(db, frontier, opts = {}) {
  if (!Number.isInteger(frontier)) {
    throw new Error(`pruneRetiredVectors: frontier must be an integer, got ${frontier}`);
  }

  const positiveIntOr = (value, fallback) => (Number.isInteger(value) && value > 0 ? value : fallback);
  const batchSize = positiveIntOr(opts.batchSize, DEFAULT_GC_BATCH);
  const maxRows = positiveIntOr(opts.maxRows, DEFAULT_GC_MAX_ROWS);

  const deleteBatch = db.prepare(`
    DELETE FROM vectors
    WHERE rowid IN (
      SELECT rowid FROM vectors
      WHERE epoch_retired IS NOT NULL AND epoch_retired <= ?
      LIMIT ?
    )
  `);

  let deleted = 0;
  let batches = 0;
  let hitCap = false;
  let drained = false;
  while (!drained) {
    const budget = maxRows - deleted;
    if (budget <= 0) {
      hitCap = true;
      break;
    }
    const take = Math.min(batchSize, budget);
    const removed = deleteBatch.run(frontier, take).changes ?? 0;
    deleted += removed;
    batches += 1;
    // A short batch means no eligible rows are left.
    drained = removed < take;
  }
  return { deleted, batches, hitCap };
}
79
+
80
/**
 * Run retired-vector GC against `<stateDir>/codebase.db` (or `deps.dbPath`).
 *
 * Steps, in order:
 *   1. Return `{ skipped: 'no-vector-db' }` when the DB file does not exist.
 *   2. Require the `minLiveEpoch` and `readManifest` callbacks. Although the
 *      `deps` parameter itself is optional in the signature, both callbacks
 *      are mandatory once a DB file is present.
 *   3. Pick the prune frontier: the integer returned by `minLiveEpoch`, else
 *      the integer `epoch` of the manifest; with neither, return
 *      `{ skipped: 'no-frontier' }`.
 *   4. Open the DB, bail out with `{ skipped: 'no-epoch-column' }` when
 *      `vectors` has no `epoch_retired` column, otherwise prune in bounded
 *      batches and, if anything was deleted, issue a best-effort PASSIVE WAL
 *      checkpoint.
 *
 * @param {string} stateDir
 * @param {{
 *   dbPath?:string, batchSize?:number, maxRows?:number,
 *   minLiveEpoch?:(dir:string)=>(number|null),
 *   readManifest?:(dir:string)=>(object|null),
 * }} [deps]
 * @returns {{deleted:number, frontier:number, hadReaders:boolean, batches:number, hitCap:boolean}|{skipped:string}}
 * @throws {Error} when the DB exists but either callback is missing
 */
export function runVectorGc(stateDir, deps = {}) {
  const dbPath = deps.dbPath || path.join(stateDir, 'codebase.db');
  if (!fs.existsSync(dbPath)) return { skipped: 'no-vector-db' };

  const { minLiveEpoch, readManifest } = deps;
  if (typeof minLiveEpoch !== 'function' || typeof readManifest !== 'function') {
    throw new Error('runVectorGc: minLiveEpoch and readManifest deps are required');
  }

  // Live readers bound the frontier; otherwise fall back to the manifest epoch.
  const liveEpoch = minLiveEpoch(stateDir);
  const hadReaders = Number.isInteger(liveEpoch);
  let frontier = liveEpoch;
  if (!hadReaders) {
    const manifestEpoch = readManifest(stateDir)?.epoch;
    if (!Number.isInteger(manifestEpoch)) return { skipped: 'no-frontier' };
    frontier = manifestEpoch;
  }

  const db = new Database(dbPath);
  try {
    const columnNames = db.prepare('PRAGMA table_info(vectors)').all().map((col) => col.name);
    if (!columnNames.includes('epoch_retired')) return { skipped: 'no-epoch-column' };

    db.pragma('journal_mode = WAL');
    db.pragma('synchronous = NORMAL');

    const result = pruneRetiredVectors(db, frontier, deps);
    if (result.deleted > 0) {
      try {
        db.pragma('wal_checkpoint(PASSIVE)');
      } catch {
        /* best-effort */
      }
    }
    return { ...result, frontier, hadReaders };
  } finally {
    db.close();
  }
}
@@ -0,0 +1,155 @@
1
+ /**
2
+ * Cross-worktree DB stamping.
3
+ *
4
+ * Plan § 8.5 / § 14.2.4. Sweet-search's `.sweet-search/` directory holds
5
+ * the index for one project; when a user has multiple worktrees pointing
6
+ * at the same repo, they either get their own state dir each (the
7
+ * default) or share one. Sharing is allowed only when:
8
+ *
9
+ * - `project_root` matches the recorded stamp, AND
10
+ * - `git common-dir` matches the recorded stamp.
11
+ *
12
+ * A mismatched stamp aborts daemon startup with a clear remediation
13
+ * message instead of silently mixing index histories.
14
+ *
15
+ * Stamp file: `.sweet-search/worktree-stamp.json`:
16
+ *
17
+ * {
18
+ * "projectRoot": "/Users/x/repo",
19
+ * "gitCommonDir": "/Users/x/repo/.git",
20
+ * "stampedAt": "2026-05-16T00:00:00.000Z"
21
+ * }
22
+ *
23
+ * The stamp is informational and additive — the lockfile (`lockfile.mjs`)
24
+ * is what enforces single-writer semantics; the stamp catches the
25
+ * "two worktrees writing to the same state dir" footgun before it
26
+ * corrupts the index.
27
+ */
28
+
29
+ import { spawnSync } from 'node:child_process';
30
+ import fs from 'node:fs';
31
+ import path from 'node:path';
32
+
33
export const STAMP_FILENAME = 'worktree-stamp.json';

/**
 * Location of the stamp file inside a state directory.
 *
 * @param {string} stateDir
 * @returns {string} `stateDir` joined with `STAMP_FILENAME`
 */
function stampPath(stateDir) {
  const segments = [stateDir, STAMP_FILENAME];
  return path.join(...segments);
}
38
+
39
/**
 * Ask git for the common git directory of `projectRoot`
 * (`git rev-parse --git-common-dir`, run with `cwd: projectRoot`).
 *
 * A relative answer from git is resolved against `projectRoot`; an absolute
 * answer is returned untouched. Any failure — non-zero exit status, empty
 * output, or an exception while spawning/resolving — yields `null`.
 *
 * @param {string} projectRoot
 * @returns {string|null}
 */
export function gitCommonDir(projectRoot) {
  try {
    const { status, stdout } = spawnSync('git', ['rev-parse', '--git-common-dir'], {
      cwd: projectRoot,
      encoding: 'utf-8',
    });
    if (status !== 0) return null;

    const reported = (stdout || '').trim();
    if (!reported) return null;

    if (path.isAbsolute(reported)) return reported;
    return path.resolve(projectRoot, reported);
  } catch {
    return null;
  }
}
60
+
61
/**
 * Load the worktree stamp from `stateDir`.
 *
 * Returns `null` when the stamp file does not exist, cannot be read, or does
 * not contain valid JSON. The parsed value is returned as-is; its shape is
 * not validated here.
 *
 * @param {string} stateDir
 * @returns {{projectRoot:string, gitCommonDir:string|null, stampedAt:string}|null}
 */
export function readStamp(stateDir) {
  const file = stampPath(stateDir);
  let parsed = null;
  if (fs.existsSync(file)) {
    try {
      const raw = fs.readFileSync(file, 'utf-8');
      parsed = JSON.parse(raw);
    } catch {
      // Unreadable or corrupt stamp is treated the same as a missing one.
      parsed = null;
    }
  }
  return parsed;
}
76
+
77
/**
 * Mint a stamp for the current project and persist it in `stateDir`
 * (creating the directory if needed). Intended for the first daemon start
 * after a freshly initialised `.sweet-search/` dir.
 *
 * @param {string} stateDir
 * @param {string} projectRoot
 * @returns {{projectRoot:string, gitCommonDir:string|null, stampedAt:string}}
 *   the stamp object that was written
 */
export function writeStamp(stateDir, projectRoot) {
  fs.mkdirSync(stateDir, { recursive: true });

  const resolvedRoot = path.resolve(projectRoot);
  const commonDir = gitCommonDir(projectRoot);
  const stampedAt = new Date().toISOString();
  const stamp = { projectRoot: resolvedRoot, gitCommonDir: commonDir, stampedAt };

  const serialized = JSON.stringify(stamp, null, 2);
  fs.writeFileSync(stampPath(stateDir), serialized);
  return stamp;
}
94
+
95
/**
 * Compare the recorded stamp with the current project.
 *
 * Outcomes:
 *   - no readable stamp            → `{ ok: true, reason: 'absent' }`
 *   - resolved project roots differ → `{ ok: false, reason: 'projectRoot-mismatch', expected, actual }`
 *   - both git common dirs are known and differ
 *                                  → `{ ok: false, reason: 'gitCommonDir-mismatch', expected, actual }`
 *   - otherwise                    → `{ ok: true, reason: 'match' }`
 *
 * A git common dir that is missing on either side is not treated as a
 * mismatch.
 *
 * Caller should:
 *   - on absent stamp → call `writeStamp` to mint one;
 *   - on mismatch     → log ERROR with the remediation message and exit.
 *
 * @param {string} stateDir
 * @param {string} projectRoot
 * @returns {{ok:boolean, reason?:string, expected?:object, actual?:object}}
 */
export function verifyStamp(stateDir, projectRoot) {
  const stamp = readStamp(stateDir);
  if (!stamp) return { ok: true, reason: 'absent' };

  const actualProject = path.resolve(projectRoot);
  const actualGit = gitCommonDir(projectRoot);

  const mismatch = (reason) => ({
    ok: false,
    reason,
    expected: { projectRoot: stamp.projectRoot, gitCommonDir: stamp.gitCommonDir },
    actual: { projectRoot: actualProject, gitCommonDir: actualGit },
  });

  if (stamp.projectRoot !== actualProject) {
    return mismatch('projectRoot-mismatch');
  }

  const bothGitDirsKnown = Boolean(stamp.gitCommonDir) && Boolean(actualGit);
  if (bothGitDirsKnown && stamp.gitCommonDir !== actualGit) {
    return mismatch('gitCommonDir-mismatch');
  }

  return { ok: true, reason: 'match' };
}
131
+
132
/**
 * Build the multi-line, operator-facing message for a stamp mismatch:
 * a headline with the reason, the expected and actual values, a blank line,
 * and the remediation hint. A null/undefined git common dir prints as
 * `(none)`.
 *
 * @param {{reason:string, expected:object, actual:object}} mismatch
 * @returns {string}
 */
export function formatStampMismatch(mismatch) {
  const { expected, actual } = mismatch;

  const expectedLines = [
    ` expected projectRoot: ${expected.projectRoot}`,
    ` expected gitCommonDir: ${expected.gitCommonDir ?? '(none)'}`,
  ];
  const actualLines = [
    ` actual projectRoot: ${actual.projectRoot}`,
    ` actual gitCommonDir: ${actual.gitCommonDir ?? '(none)'}`,
  ];
  const headline = `[sweet-search] worktree-stamp mismatch (${mismatch.reason}):`;
  const remediation = [
    'Remediation: either use a per-worktree .sweet-search/ directory (default)',
    'or reset the shared index with `npm run index` from this worktree.',
  ];

  return [headline, ...expectedLines, ...actualLines, '', ...remediation].join('\n');
}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * WSL2 detection + watcher default policy.
3
+ *
4
+ * Plan § 34.6 / § 37.3.3. The earlier blanket "WSL2 → polling-only" rule
5
+ * over-penalised users on native Linux paths inside WSL2; an earlier
6
+ * draft proposed parsing `df -T` or `/proc/mounts` to detect the actual
7
+ * filesystem type, but the parser was brittle across Linux distros, WSL
8
+ * versions, and mount-point naming.
9
+ *
10
+ * The pragmatic compromise:
11
+ * 1. Detect WSL2 via `/proc/version` containing `microsoft` or `WSL`.
12
+ * 2. If detected, recommend `SWEET_SEARCH_WATCH=0` **as default** (not
13
+ * forced).
14
+ * 3. If the user explicitly set `SWEET_SEARCH_WATCH=1`, respect it.
15
+ * 4. If the watcher fails at startup (inotify error on a 9p mount),
16
+ * fall back to polling with a clear remediation message rather
17
+ * than crashing.
18
+ *
19
+ * This module is read-only — it inspects the host environment and
20
+ * returns a recommendation. The caller (Phase 4 file watcher setup)
21
+ * acts on the recommendation.
22
+ */
23
+
24
+ import fs from 'node:fs';
25
+
26
// Memoised result of `detectInner()`; `null` until the first lookup.
let cached = null;

/**
 * Probe the host once (no caching) and report what it looks like.
 *
 * - `kubernetes`: `KUBERNETES_SERVICE_HOST` is set to a non-empty value.
 * - `docker`: `/.dockerenv` exists, or `/proc/1/cgroup` mentions
 *   `docker`, `kubepods` or `containerd` (case-insensitive).
 * - `wsl2`: `/proc/version` contains `microsoft` or `wsl` (case-insensitive).
 * - `container`: `docker || kubernetes`.
 * - `platform`: `process.platform`.
 *
 * Filesystem errors are swallowed on purpose: an unreadable probe file just
 * leaves the corresponding flag at `false`.
 *
 * @returns {{wsl2:boolean, docker:boolean, kubernetes:boolean, container:boolean, platform:string}}
 */
function detectInner() {
  // Containers also typically have inotify problems with bind mounts.
  const kubernetes = Boolean(process.env.KUBERNETES_SERVICE_HOST);

  let docker = false;
  try {
    docker = fs.existsSync('/.dockerenv');
    if (!docker && fs.existsSync('/proc/1/cgroup')) {
      const cgroup = fs.readFileSync('/proc/1/cgroup', 'utf-8');
      docker = /docker|kubepods|containerd/i.test(cgroup);
    }
  } catch {
    // ignore
  }

  let wsl2 = false;
  try {
    if (fs.existsSync('/proc/version')) {
      const banner = fs.readFileSync('/proc/version', 'utf-8').toLowerCase();
      wsl2 = banner.includes('microsoft') || banner.includes('wsl');
    }
  } catch {
    // ignore
  }

  const container = docker || kubernetes;
  return { wsl2, docker, kubernetes, container, platform: process.platform };
}

/**
 * Cached view of the host environment. The probe runs on the first call;
 * later calls return the very same object.
 *
 * @returns {{wsl2:boolean, docker:boolean, kubernetes:boolean, container:boolean, platform:string}}
 */
export function detectEnvironment() {
  if (cached !== null) return cached;
  cached = detectInner();
  return cached;
}
66
+
67
/**
 * Decide the watcher's default-enabled state for this host.
 *
 *   - Explicit env var wins. `SWEET_SEARCH_WATCH` is matched after trimming
 *     surrounding whitespace and ignoring case, so `1`/`true`/`on` enable and
 *     `0`/`false`/`off` disable regardless of how the value was typed
 *     (`TRUE`, `On`, ` 1 `, …). Any other value is ignored and host
 *     detection decides.
 *   - WSL2 → polling default with override available.
 *   - Container → polling default.
 *   - Windows native → polling default (Phase 4 ships node:fs.watch; the
 *     Rust notify binding via `ReadDirectoryChangesW` is Phase 6).
 *   - Otherwise → watcher enabled.
 *
 * @param {NodeJS.ProcessEnv} [env]
 * @returns {{watcherEnabled:boolean, reason:string}}
 */
export function watcherDefault(env = process.env) {
  const raw = env.SWEET_SEARCH_WATCH;
  // Normalise so an explicit override is not silently dropped over casing
  // or stray whitespace (common with shell exports and .env files).
  const explicit = typeof raw === 'string' ? raw.trim().toLowerCase() : raw;

  if (explicit === '1' || explicit === 'true' || explicit === 'on') {
    return { watcherEnabled: true, reason: 'env-override-on' };
  }
  if (explicit === '0' || explicit === 'false' || explicit === 'off') {
    return { watcherEnabled: false, reason: 'env-override-off' };
  }

  const detect = detectEnvironment();
  if (detect.wsl2) {
    return {
      watcherEnabled: false,
      reason: 'wsl2-polling-default (set SWEET_SEARCH_WATCH=1 if your project is on native ext4 e.g. /home/user/project)',
    };
  }
  if (detect.container) {
    return { watcherEnabled: false, reason: 'container-polling-default' };
  }
  if (detect.platform === 'win32') {
    return { watcherEnabled: false, reason: 'win32-polling-default (Phase 6 Rust notify binding pending)' };
  }
  return { watcherEnabled: true, reason: 'native-os-watcher' };
}
103
+
104
/**
 * Render the one-line watcher startup notice destined for stderr
 * (Plan § 34.6 step 2 specifies the user-facing copy).
 *
 * @param {{watcherEnabled:boolean, reason:string}} decision
 * @returns {string} e.g. `[sweet-search] file watcher: on (native-os-watcher)`
 */
export function formatWatcherNotice(decision) {
  const state = decision.watcherEnabled ? 'on' : 'off';
  const { reason } = decision;
  return `[sweet-search] file watcher: ${state} (${reason})`;
}
114
+
115
// Test-only hooks: the uncached probe, plus a way to clear the memoised result.
export const __testing = {
  detectInner,
  resetCache() {
    cached = null;
  },
};
@@ -0,0 +1,139 @@
1
+ /**
2
+ * Shared file-admission policy for full + incremental indexing.
3
+ *
4
+ * Full indexing (`discoverFiles` in indexer-utils.js) and incremental indexing
5
+ * (`dirty-scan` producer + `production-reconciler` consumer) MUST admit exactly
6
+ * the same files: a file full indexing would skip must never be newly indexed by
7
+ * incremental, and a file full indexing would admit must be eligible for
8
+ * incremental. This module is the single definition of that decision so the two
9
+ * paths cannot drift.
10
+ *
11
+ * A file is admitted iff ALL of:
12
+ * 1. include allowlist — matches a project `include` glob (minimatch)
13
+ * 2. NOT excluded — `buildPathFilter` deny-list (default deny dirs/exts +
14
+ * project `exclude` globs + `.sweet-search-ignore`)
15
+ * 3. NOT oversized — size ≤ project `maxFileSize`
16
+ * 4. NOT gitignored — `git check-ignore` alignment (agentic paths exempt),
17
+ * only when the worktree is a git repo
18
+ *
19
+ * The exclude/deny component is delegated to `buildPathFilter` (incremental
20
+ * infra) so its rules — and its tests — stay the single source for "deny", and
21
+ * gitignore is delegated to `gitignore-filter` so it matches full indexing.
22
+ * This module only adds the include allowlist + size + the wiring.
23
+ *
24
+ * Shape checks (include + deny) are synchronous and I/O-free so producers can
25
+ * prune cheaply during a tree walk. Size is a single `stat`. Gitignore is async
26
+ * and batched (one `git check-ignore` per call) — never per-file.
27
+ */
28
+
29
+ import path from 'node:path';
30
+ import { statSync, existsSync } from 'node:fs';
31
+ import { Minimatch } from 'minimatch';
32
+
33
+ import { loadProjectConfig } from '../infrastructure/config/index.js';
34
+ import { buildPathFilter } from '../incremental-indexing/infrastructure/path-filter.mjs';
35
+ import { getGitIgnoredPathSet, isGitignoreAllowlistedAgenticPath, toPosixPath } from './gitignore-filter.js';
36
+
37
// minimatch options for include globs: match dotfiles, stay case-sensitive.
const MM_OPTS = { dot: true, nocase: false };
// Size ceiling (1 MiB) used when the project config gives no numeric `maxFileSize`.
const DEFAULT_MAX_FILE_SIZE = 1024 * 1024;

/**
 * Canonicalise a project-relative path for matching: coerce to string
 * (falsy input becomes ''), turn backslashes into forward slashes, and drop
 * a single leading `./`.
 *
 * @param {*} rel
 * @returns {string}
 */
function normalizeRel(rel) {
  const text = String(rel || '');
  const forwardSlashed = text.replace(/\\/g, '/');
  return forwardSlashed.replace(/^\.\//, '');
}
43
+
44
/**
 * Build an admission policy bound to a project root.
 *
 * The returned object exposes the resolved settings (`projectRoot`,
 * `includeGlobs`, `excludeGlobs`, `maxFileSize`, `respectGitignore`,
 * `hasGit`) plus the individual gates:
 *   - `matchesInclude(rel)` — include allowlist only.
 *   - `isExcluded(rel)`     — deny-list only (delegates to `buildPathFilter`).
 *   - `admitsShape(rel)`    — include AND not denied; synchronous.
 *   - `isOversizedAbs(abs)` — single `stat`; a stat failure counts as oversized.
 *   - `gitignoredSet(rels)` / `applyGitignore(rels)` — async, one batched
 *     gitignore lookup per call.
 *
 * NOTE(review): `excludeGlobs` is read from the config and exposed, but it is
 * not passed to `buildPathFilter` here — presumably the path filter loads
 * project excludes itself; verify against `path-filter.mjs`.
 *
 * @param {object}  [opts]
 * @param {string}  [opts.projectRoot]
 * @param {object}  [opts.config]              Pre-loaded loadProjectConfig() result.
 * @param {boolean} [opts.allowSweetSearchDir] Lift the `.sweet-search` deny (daemon self-paths).
 */
export function createAdmissionPolicy({ projectRoot = process.cwd(), config, allowSweetSearchDir = false } = {}) {
  const cfg = config || loadProjectConfig(projectRoot);
  const globList = (value) => (Array.isArray(value) ? value : []);

  const includeGlobs = globList(cfg.include);
  const excludeGlobs = globList(cfg.exclude);
  const includeMatchers = includeGlobs.map((glob) => new Minimatch(glob, MM_OPTS));
  const isDenied = buildPathFilter({ projectRoot, allowSweetSearchDir });
  const maxFileSize = typeof cfg.maxFileSize === 'number' ? cfg.maxFileSize : DEFAULT_MAX_FILE_SIZE;
  const respectGitignore = cfg.respectGitignore !== false;
  // Presence of `.git` (directory or file) at the project root gates gitignore handling.
  const hasGit = existsSync(path.join(projectRoot, '.git'));

  /** Include allowlist only (matches a project include glob). */
  const matchesInclude = (rel) => {
    const normalized = normalizeRel(rel);
    return normalized ? includeMatchers.some((matcher) => matcher.match(normalized)) : false;
  };

  /** Deny-list only (true ⇒ excluded). Mirrors buildPathFilter; used for directory pruning. */
  const isExcluded = (rel) => isDenied(normalizeRel(rel));

  /** Synchronous shape gate: include allowlist AND not excluded. No I/O. */
  const admitsShape = (rel) => {
    const normalized = normalizeRel(rel);
    if (!normalized) return false;
    if (!matchesInclude(normalized)) return false;
    return !isDenied(normalized);
  };

  /**
   * True if the file at `absPath` exceeds maxFileSize. A stat error ⇒ true
   * (treat as inadmissible, matching full indexing which drops un-statable files).
   */
  const isOversizedAbs = (absPath) => {
    let size;
    try {
      size = statSync(absPath).size;
    } catch {
      return true;
    }
    return size > maxFileSize;
  };

  /**
   * Batched gitignore: returns the subset of `rels` that git would ignore
   * (posix-normalised). Empty when gitignore is disabled, the worktree is not a
   * git repo, or git is unavailable — matching full indexing's fallback to
   * "admit everything" rather than dropping files on a git failure.
   */
  const gitignoredSet = async (rels, { silent = true } = {}) => {
    if (!(respectGitignore && hasGit)) return new Set();

    // Empty paths and allowlisted agentic paths are never sent to git.
    const candidates = Array.from(rels, (rel) => normalizeRel(rel))
      .filter((normalized) => normalized && !isGitignoreAllowlistedAgenticPath(normalized));
    if (candidates.length === 0) return new Set();

    const ignored = await getGitIgnoredPathSet(candidates, { projectRoot, silent });
    if (!ignored) return new Set();
    return new Set(Array.from(ignored, (ignoredPath) => toPosixPath(ignoredPath)));
  };

  /**
   * Convenience for batch discovery: drop gitignored paths from `rels`.
   * Equivalent to full indexing's applyGitignoreAlignment over an already
   * shape+size filtered list.
   */
  const applyGitignore = async (rels, { silent = true } = {}) => {
    const list = [...rels];
    const ignored = await gitignoredSet(list, { silent });
    if (ignored.size === 0) return { files: list, gitignored: 0 };

    const isKept = (rel) => !ignored.has(toPosixPath(normalizeRel(rel)));
    const files = list.filter(isKept);
    return { files, gitignored: list.length - files.length };
  };

  return {
    projectRoot,
    includeGlobs,
    excludeGlobs,
    maxFileSize,
    respectGitignore,
    hasGit,
    matchesInclude,
    isExcluded,
    admitsShape,
    isOversizedAbs,
    gitignoredSet,
    applyGitignore,
  };
}
@@ -56,6 +56,20 @@ import { BinaryHNSWIndex } from '../vector-store/binary-hnsw-index.js';
56
56
  import { truncateForHNSW, fisherYatesShuffle, normalizedFloatToInt8, floatToBinary } from '../infrastructure/quantization.js';
57
57
  import { FloatVectorStore, getFloatStorePath } from '../vector-store/float-vector-store.js';
58
58
 
59
/**
 * Checks whether the `vectors` table exposes a column named `column`.
 *
 * Runs `PRAGMA table_info(vectors)` through `db.prepare(...).all()` and looks
 * for a row whose `name` equals `column`. Any error raised while probing is
 * swallowed and reported as "column absent".
 *
 * @param {object} db - Handle exposing `prepare(sql).all()`.
 * @param {string} column - Column name to look for.
 * @returns {boolean} true when a matching column row is found.
 */
function hasVectorColumn(db, column) {
  try {
    const columns = db.prepare('PRAGMA table_info(vectors)').all();
    for (const { name } of columns) {
      if (name === column) return true;
    }
    return false;
  } catch (_err) {
    // Best-effort probe: treat any failure as "no such column".
    return false;
  }
}
66
+
67
/**
 * Builds the SQL predicate used to restrict queries on `vectors`.
 *
 * When `hasVectorColumn` reports an `epoch_retired` column, the predicate is
 * `epoch_retired IS NULL`, prefixed with `alias.` if an alias is given.
 * Otherwise the predicate is the always-true `1=1`.
 *
 * @param {object} db - Handle passed to `hasVectorColumn`.
 * @param {string} [alias=''] - Optional table alias for the column reference.
 * @returns {string} A fragment suitable for use after `WHERE`.
 */
function liveVectorSql(db, alias = '') {
  if (hasVectorColumn(db, 'epoch_retired')) {
    const qualifier = alias ? `${alias}.` : '';
    return `${qualifier}epoch_retired IS NULL`;
  }
  return '1=1';
}
72
+
59
73
  // =============================================================================
60
74
  // THRESHOLD CHECKING FUNCTIONS
61
75
  // =============================================================================
@@ -387,7 +401,8 @@ async function buildHnswIndexFromDb(db, options = {}) {
387
401
  } = options;
388
402
 
389
403
  const dimension = options.dimension || Math.ceil(floatDimension / 8);
390
- const totalVectors = db.prepare('SELECT COUNT(*) as c FROM vectors').get().c;
404
+ const vectorWhere = liveVectorSql(db);
405
+ const totalVectors = db.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorWhere}`).get().c;
391
406
 
392
407
  const index = new BinaryHNSWIndex({
393
408
  dimension,
@@ -408,19 +423,20 @@ async function buildHnswIndexFromDb(db, options = {}) {
408
423
  db.exec('CREATE TEMP TABLE IF NOT EXISTS artifact_order (pos INTEGER PRIMARY KEY, vector_rowid INTEGER)');
409
424
  db.exec('DELETE FROM artifact_order');
410
425
 
411
- let indices = Array.from({ length: totalVectors }, (_, i) => i + 1); // 1-based rowids
426
+ const rowidRows = db.prepare(`SELECT rowid FROM vectors WHERE ${vectorWhere} ORDER BY rowid`).all();
427
+ let indices = rowidRows.map((row) => row.rowid);
412
428
 
413
429
  if (insertionOrder === 'shuffle') {
414
430
  fisherYatesShuffle(indices);
415
431
  } else if (insertionOrder === 'diversity') {
416
- const filePaths = db.prepare('SELECT metadata FROM vectors ORDER BY rowid').all().map(r => {
417
- try { return JSON.parse(r.metadata)?.file || '_unknown'; } catch (_e) { return '_unknown'; }
418
- });
432
+ const rows = db.prepare(`SELECT rowid, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`).all();
419
433
  const buckets = new Map();
420
- for (let i = 0; i < filePaths.length; i++) {
421
- const dir = filePaths[i].replace(/\/[^/]+$/, '') || '_unknown';
434
+ for (const row of rows) {
435
+ let filePath = '_unknown';
436
+ try { filePath = JSON.parse(row.metadata)?.file || '_unknown'; } catch (_e) {}
437
+ const dir = filePath.replace(/\/[^/]+$/, '') || '_unknown';
422
438
  if (!buckets.has(dir)) buckets.set(dir, []);
423
- buckets.get(dir).push(i + 1);
439
+ buckets.get(dir).push(row.rowid);
424
440
  }
425
441
  const dirs = [...buckets.keys()];
426
442
  fisherYatesShuffle(dirs);
@@ -448,7 +464,7 @@ async function buildHnswIndexFromDb(db, options = {}) {
448
464
  ORDER BY o.pos
449
465
  `);
450
466
  } else {
451
- stmt = db.prepare('SELECT id, embedding, metadata FROM vectors ORDER BY rowid');
467
+ stmt = db.prepare(`SELECT id, embedding, metadata FROM vectors WHERE ${vectorWhere} ORDER BY rowid`);
452
468
  }
453
469
 
454
470
  const startTime = performance.now();
@@ -578,11 +594,12 @@ export async function saveArtifacts(hnswIndex) {
578
594
  */
579
595
  /** Build and save a FloatVectorStore by streaming from SQLite cursor. */
580
596
  async function buildAndSaveFloatStoreFromDb(db, floatDimension, floatStorePath) {
581
- const totalVectors = db.prepare('SELECT COUNT(*) as c FROM vectors').get().c;
597
+ const vectorWhere = liveVectorSql(db);
598
+ const totalVectors = db.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${vectorWhere}`).get().c;
582
599
  console.log(`Building float vector store (${totalVectors} vectors, ${floatDimension}d)...`);
583
600
  const floatStore = new FloatVectorStore();
584
601
  const floatEntries = [];
585
- const stmt = db.prepare('SELECT id, embedding FROM vectors ORDER BY rowid');
602
+ const stmt = db.prepare(`SELECT id, embedding FROM vectors WHERE ${vectorWhere} ORDER BY rowid`);
586
603
  for (const row of stmt.iterate()) {
587
604
  const embedding = new Float32Array(row.embedding.buffer, row.embedding.byteOffset, row.embedding.length / 4);
588
605
  floatEntries.push({
@@ -619,7 +636,7 @@ export async function buildFromCodebaseDb(codebaseDbPath = DB_PATHS.codebase, op
619
636
  const db = new Database(codebaseDbPath, insertionOrder === 'sequential' ? { readonly: true } : {});
620
637
  applyReadPragmas(db);
621
638
 
622
- const totalVectors = db.prepare('SELECT COUNT(*) as c FROM vectors').get().c;
639
+ const totalVectors = db.prepare(`SELECT COUNT(*) as c FROM vectors WHERE ${liveVectorSql(db)}`).get().c;
623
640
 
624
641
  console.log(`Found ${totalVectors} vectors`);
625
642