sweet-search 2.5.2 → 2.5.3

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (155):
  1. package/core/cli.js +24 -3
  2. package/core/graph/graph-expansion.js +215 -36
  3. package/core/graph/graph-extractor.js +196 -11
  4. package/core/graph/graph-search.js +395 -92
  5. package/core/graph/hcgs-generator.js +2 -1
  6. package/core/graph/index.js +2 -0
  7. package/core/graph/repo-map.js +28 -6
  8. package/core/graph/structural-answer-cues.js +168 -0
  9. package/core/graph/structural-callsite-hints.js +40 -0
  10. package/core/graph/structural-context-format.js +40 -0
  11. package/core/graph/structural-context.js +450 -0
  12. package/core/graph/structural-forward-push.js +156 -0
  13. package/core/graph/structural-header-context.js +19 -0
  14. package/core/graph/structural-importance.js +148 -0
  15. package/core/graph/structural-pagerank.js +197 -0
  16. package/core/graph/summary-manager.js +13 -9
  17. package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
  18. package/core/incremental-indexing/application/file-watcher.mjs +197 -0
  19. package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
  20. package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
  21. package/core/incremental-indexing/application/operator-cli.mjs +554 -0
  22. package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
  23. package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
  24. package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
  25. package/core/incremental-indexing/application/reconciler.mjs +477 -0
  26. package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
  27. package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
  28. package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
  29. package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
  30. package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
  31. package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
  32. package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
  33. package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
  34. package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
  35. package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
  36. package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
  37. package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
  38. package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
  39. package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
  40. package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
  41. package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
  42. package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
  43. package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
  44. package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
  45. package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
  46. package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
  47. package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
  48. package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
  49. package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
  50. package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
  51. package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
  52. package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
  53. package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
  54. package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
  55. package/core/indexing/admission-policy.js +139 -0
  56. package/core/indexing/artifact-builder.js +29 -12
  57. package/core/indexing/ast-chunker.js +107 -30
  58. package/core/indexing/dedup/exemplar-selector.js +19 -1
  59. package/core/indexing/gitignore-filter.js +223 -0
  60. package/core/indexing/incremental-tracker.js +99 -30
  61. package/core/indexing/index-codebase-v21.js +6 -5
  62. package/core/indexing/index-maintainer.mjs +698 -6
  63. package/core/indexing/indexer-ann.js +99 -15
  64. package/core/indexing/indexer-build.js +158 -45
  65. package/core/indexing/indexer-empty-baseline.js +80 -0
  66. package/core/indexing/indexer-manifest.js +66 -0
  67. package/core/indexing/indexer-phases.js +56 -23
  68. package/core/indexing/indexer-sparse-gram.js +54 -13
  69. package/core/indexing/indexer-utils.js +26 -208
  70. package/core/indexing/indexing-file-policy.js +32 -7
  71. package/core/indexing/maintainer-launcher.mjs +137 -0
  72. package/core/indexing/merkle-tracker.js +251 -244
  73. package/core/indexing/model-pool.js +46 -5
  74. package/core/infrastructure/code-graph-repository.js +758 -6
  75. package/core/infrastructure/code-graph-visibility.js +157 -0
  76. package/core/infrastructure/codebase-repository.js +100 -13
  77. package/core/infrastructure/config/search.js +1 -1
  78. package/core/infrastructure/db-utils.js +118 -0
  79. package/core/infrastructure/dedup-hashing.js +10 -13
  80. package/core/infrastructure/hardware-capability.js +17 -7
  81. package/core/infrastructure/index.js +8 -2
  82. package/core/infrastructure/language-patterns/maps.js +4 -1
  83. package/core/infrastructure/language-patterns/registry-core.js +56 -17
  84. package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
  85. package/core/infrastructure/language-patterns.js +69 -0
  86. package/core/infrastructure/model-registry.js +20 -0
  87. package/core/infrastructure/native-inference.js +7 -12
  88. package/core/infrastructure/native-resolver.js +52 -37
  89. package/core/infrastructure/native-sparse-gram.js +261 -20
  90. package/core/infrastructure/native-tokenizer.js +6 -15
  91. package/core/infrastructure/simd-distance.js +10 -16
  92. package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
  93. package/core/infrastructure/structural-alias-resolver.js +122 -0
  94. package/core/infrastructure/structural-candidate-ranker.js +34 -0
  95. package/core/infrastructure/structural-context-repository.js +472 -0
  96. package/core/infrastructure/structural-context-utils.js +51 -0
  97. package/core/infrastructure/structural-graph-signals.js +121 -0
  98. package/core/infrastructure/structural-qualified-resolution.js +15 -0
  99. package/core/infrastructure/structural-source-definitions.js +100 -0
  100. package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
  101. package/core/infrastructure/tree-sitter-provider.js +811 -37
  102. package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
  103. package/core/query/query-router.js +55 -5
  104. package/core/ranking/file-kind-ranking.js +2192 -15
  105. package/core/ranking/late-interaction-index.js +87 -12
  106. package/core/search/cli-decoration.js +290 -0
  107. package/core/search/context-expander.js +988 -78
  108. package/core/search/index.js +1 -0
  109. package/core/search/output-policy.js +275 -0
  110. package/core/search/search-anchor.js +499 -0
  111. package/core/search/search-boost.js +93 -1
  112. package/core/search/search-cli.js +61 -204
  113. package/core/search/search-hybrid.js +250 -10
  114. package/core/search/search-pattern-chunks.js +57 -8
  115. package/core/search/search-pattern-planner.js +68 -9
  116. package/core/search/search-pattern-prefilter.js +30 -10
  117. package/core/search/search-pattern-ripgrep.js +40 -4
  118. package/core/search/search-pattern-sparse-overlay.js +256 -0
  119. package/core/search/search-pattern.js +117 -29
  120. package/core/search/search-postprocess.js +479 -5
  121. package/core/search/search-read-semantic.js +260 -23
  122. package/core/search/search-read.js +82 -64
  123. package/core/search/search-reader-pin.js +71 -0
  124. package/core/search/search-rrf.js +279 -0
  125. package/core/search/search-semantic.js +110 -5
  126. package/core/search/search-server.js +130 -57
  127. package/core/search/search-trace.js +107 -0
  128. package/core/search/server-identity.js +93 -0
  129. package/core/search/session-daemon-prewarm.mjs +33 -10
  130. package/core/search/sweet-search.js +399 -7
  131. package/core/skills/sweet-index/SKILL.md +8 -6
  132. package/core/vector-store/binary-hnsw-index.js +194 -30
  133. package/core/vector-store/float-vector-store.js +96 -6
  134. package/core/vector-store/hnsw-index.js +220 -49
  135. package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
  136. package/eval/agent-read-workflows/bin/ss-find +15 -0
  137. package/eval/agent-read-workflows/bin/ss-grep +12 -0
  138. package/eval/agent-read-workflows/bin/ss-read +14 -0
  139. package/eval/agent-read-workflows/bin/ss-search +18 -0
  140. package/eval/agent-read-workflows/bin/ss-semantic +12 -0
  141. package/eval/agent-read-workflows/bin/ss-trace +11 -0
  142. package/mcp/read-tool.js +109 -0
  143. package/mcp/server.js +55 -15
  144. package/mcp/tool-handlers.js +14 -124
  145. package/mcp/trace-tool.js +81 -0
  146. package/package.json +25 -10
  147. package/scripts/hooks/intercept-read.mjs +55 -0
  148. package/scripts/hooks/remind-tools.mjs +40 -0
  149. package/scripts/init.js +698 -54
  150. package/scripts/inject-agent-instructions.js +431 -0
  151. package/scripts/install-prompt-reminders.js +188 -0
  152. package/scripts/install-tool-enforcement.js +220 -0
  153. package/scripts/smoke-test.js +12 -9
  154. package/scripts/uninstall.js +276 -18
  155. package/scripts/write-claude-rules.js +110 -0
@@ -12,7 +12,7 @@ import { createHash } from 'crypto';
12
12
  import path from 'path';
13
13
  import fs from 'fs/promises';
14
14
  import { detectProjectBoundary } from '../infrastructure/project-detector.js';
15
- import { getLanguageByPath } from '../infrastructure/language-patterns.js';
15
+ import { getLanguageByPath, resolveLanguage } from '../infrastructure/language-patterns.js';
16
16
  import { DocumentChunker } from './document-chunker.js';
17
17
 
18
18
  const MAX_CHUNK_SIZE = 2000;
@@ -151,12 +151,25 @@ function buildEmbeddingText({ variant: variantOverride, content, relativePath, l
151
151
  const pathLine = relativePath ? `# ${relativePath}` : null;
152
152
  const parentLine = hierarchyInfo?.parentSymbol
153
153
  ? `# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}` : null;
154
- const symbolLine = (symbol && symbol !== 'unknown')
154
+ // Ruby method chunks keep metadata anchors, but omit method-name header
155
+ // text so top-level Ruby snippets stay aligned with the pre-method-boundary
156
+ // embedding surface.
157
+ const isRubyMethodChunk = language === 'ruby'
158
+ && (chunkType === 'method' || chunkType === 'singleton_method');
159
+ const symbolLine = (symbol && symbol !== 'unknown' && !isRubyMethodChunk)
155
160
  ? `# ${chunkType}: ${symbol}` : null;
156
161
  const langLine = (language && language !== 'text')
157
162
  ? `# Language: ${language}` : null;
158
163
  const signatureLine = hierarchyInfo?.signature
159
164
  ? `# Signature: ${hierarchyInfo.signature}` : null;
165
+ // Multi-symbol header: when the cAST sibling-merge collapses several
166
+ // top-level boundaries into one chunk (small Rust files with adjacent
167
+ // free-standing fns, e.g. packaging.rs:is_package + detect_package_root),
168
+ // the bi-encoder otherwise only sees the FIRST boundary's name. Adding
169
+ // an `# Additional:` line surfaces sibling symbol names to the encoder
170
+ // without changing chunk count or chunk text.
171
+ const additionalLine = (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0)
172
+ ? `# Additional: ${hierarchyInfo.additionalSymbols.join(', ')}` : null;
160
173
 
161
174
  switch (variant) {
162
175
  case 'no_path':
@@ -223,6 +236,7 @@ function buildEmbeddingText({ variant: variantOverride, content, relativePath, l
223
236
  if (pathLine) parts.push(pathLine);
224
237
  if (parentLine) parts.push(parentLine);
225
238
  if (symbolLine) parts.push(symbolLine);
239
+ if (additionalLine) parts.push(additionalLine);
226
240
  if (langLine) parts.push(langLine);
227
241
  break;
228
242
  }
@@ -279,9 +293,17 @@ function buildLiText({ content, relativePath, language, chunkType, symbol, hiera
279
293
  if (hierarchyInfo?.parentSymbol) {
280
294
  lines.push(`# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}`);
281
295
  }
282
- if (symbol && symbol !== 'unknown') {
296
+ // Mirror the Ruby method header carve-out used by embedding_text.
297
+ const isRubyMethodChunk = language === 'ruby'
298
+ && (chunkType === 'method' || chunkType === 'singleton_method');
299
+ if (symbol && symbol !== 'unknown' && !isRubyMethodChunk) {
283
300
  lines.push(`# ${chunkType}: ${symbol}`);
284
301
  }
302
+ // Mirror embedding_text: surface sibling symbol names so the LI MaxSim
303
+ // stage's input includes the same context as the bi-encoder embedding.
304
+ if (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0) {
305
+ lines.push(`# Additional: ${hierarchyInfo.additionalSymbols.join(', ')}`);
306
+ }
285
307
  if (language && language !== 'text') {
286
308
  lines.push(`# Language: ${language}`);
287
309
  }
@@ -321,7 +343,11 @@ export class ASTChunker {
321
343
  return this._docChunker.parseFile(filePath, content);
322
344
  }
323
345
 
324
- const langInfo = getLanguageByPath(filePath);
346
+ // resolveLanguage handles per-file disambiguation of ambiguous extensions
347
+ // (today: `.h` → c-vs-cpp) using a content scan for C++-only tokens.
348
+ // Header-only C++ libraries (highway, Eigen, …) are routed to cpp so the
349
+ // chunker uses tree-sitter-cpp instead of tree-sitter-c.
350
+ const langInfo = resolveLanguage(filePath, content);
325
351
  if (!langInfo || !langInfo.chunker) {
326
352
  return this.parseGenericFile(filePath, content);
327
353
  }
@@ -361,20 +387,27 @@ export class ASTChunker {
361
387
  const tsChunks = await provider.parseFileToChunks(content, langInfo.id);
362
388
  if (!tsChunks || tsChunks.length === 0) return null;
363
389
 
364
- return tsChunks.map(chunk =>
365
- this.buildChunk(
366
- chunk.text, filePath, langInfo.id, chunk.type, chunk.name,
367
- chunk.startLine, chunk.endLine,
368
- {
369
- chunkId: chunk.chunkId,
370
- parentChunkId: chunk.parentChunkId,
371
- parentSymbol: chunk.parentSymbol,
372
- parentType: chunk.parentType,
373
- signature: chunk.signature || null,
374
- }
375
- )
376
- );
377
- }
390
+ return tsChunks.map(chunk => {
391
+ const isTopLevelRubyMethod = langInfo.id === 'ruby'
392
+ && (chunk.type === 'method' || chunk.type === 'singleton_method')
393
+ && !chunk.parentSymbol;
394
+
395
+ return this.buildChunk(
396
+ chunk.text, filePath, langInfo.id,
397
+ isTopLevelRubyMethod ? 'code' : chunk.type,
398
+ isTopLevelRubyMethod ? null : chunk.name,
399
+ chunk.startLine, chunk.endLine,
400
+ {
401
+ chunkId: chunk.chunkId,
402
+ parentChunkId: chunk.parentChunkId,
403
+ parentSymbol: chunk.parentSymbol,
404
+ parentType: chunk.parentType,
405
+ signature: chunk.signature || null,
406
+ additionalSymbols: chunk.additionalSymbols || null,
407
+ }
408
+ );
409
+ });
410
+ }
378
411
 
379
412
  parseBraceBasedFile(filePath, content, language, patterns, comment, multiLine) {
380
413
  const chunks = [];
@@ -388,12 +421,30 @@ export class ASTChunker {
388
421
 
389
422
  for (let i = 0; i < lines.length; i++) {
390
423
  const line = lines[i];
424
+ // Capture the comment state BEFORE _stripNonCode mutates it so we
425
+ // can tell whether this line entered the iteration inside a block
426
+ // comment (e.g. mid-Javadoc). Boundary detection must be skipped
427
+ // for such lines — otherwise `_matchBoundary` happily matches
428
+ // `public class MyClass {` inside a Java `/** ... <pre> ... */`
429
+ // example and emits a phantom class chunk. Verified on gson
430
+ // SerializedName.java where the regex fallback path emits a
431
+ // 25-82 chunk attributed to "class MyClass" sourced from the
432
+ // Javadoc body. See the matching extractJava() block-comment
433
+ // skip in core/graph/graph-extractor.js for the symmetric fix.
434
+ const inBlockCommentAtStart = stripState.inBlockComment;
391
435
  const stripped = this._stripNonCode(line, stripState, comment, hasTemplateInterpolation);
392
436
 
393
437
  braceDepth += (stripped.match(/{/g) || []).length;
394
438
  braceDepth -= (stripped.match(/}/g) || []).length;
395
439
 
396
- const { name: matched, type: matchType, joinedLines } = this._matchBoundary(line, patterns, language, lines, i, multiLine);
440
+ // When the line is entirely inside a block comment (entered as
441
+ // such AND still inside on exit), there's nothing executable to
442
+ // match — skip boundary detection entirely. The stripped output
443
+ // is already empty/whitespace so brace-depth tracking is a no-op.
444
+ const lineFullyInComment = inBlockCommentAtStart && stripState.inBlockComment;
445
+ const { name: matched, type: matchType, joinedLines } = lineFullyInComment
446
+ ? { name: null, type: null, joinedLines: 0 }
447
+ : this._matchBoundary(stripped, patterns, language, lines, i, multiLine);
397
448
 
398
449
  if ((matched && currentChunk) || (braceDepth === 0 && currentChunk)) {
399
450
  const chunkContent = lines.slice(chunkStart, i + 1).join('\n');
@@ -805,13 +856,20 @@ export class ASTChunker {
805
856
  for (let i = 0; i < lines.length; i++) {
806
857
  const line = lines[i];
807
858
  const lineSize = line.length + 1; // +1 for newline
859
+ const inBlockCommentAtStart = stripState.inBlockComment;
808
860
  const stripped = this._stripNonCode(line, stripState, comment, hasTemplateInterpolation);
809
861
 
810
862
  braceDepth += (stripped.match(/{/g) || []).length;
811
863
  braceDepth -= (stripped.match(/}/g) || []).length;
812
864
 
813
- // Check if this line is a sub-boundary (a new construct starting at depth >= 1)
814
- const { name: matched, type: matchType } = this._matchBoundary(line, patterns, language, lines, i, false);
865
+ // Check if this line is a sub-boundary (a new construct starting at depth >= 1).
866
+ // Use the comment-stripped line so Javadoc `<pre>public class Foo {...}</pre>`
867
+ // examples don't get matched as real sub-boundaries (parallels
868
+ // parseBraceBasedFile fix above).
869
+ const lineFullyInComment = inBlockCommentAtStart && stripState.inBlockComment;
870
+ const { name: matched, type: matchType } = lineFullyInComment
871
+ ? { name: null, type: null }
872
+ : this._matchBoundary(stripped, patterns, language, lines, i, false);
815
873
  const isSubBoundary = matched && i > segStart;
816
874
 
817
875
  // Split condition: at a sub-boundary, or accumulated segment exceeds max
@@ -938,6 +996,11 @@ export class ASTChunker {
938
996
  metadata.parent_symbol = hierarchyInfo.parentSymbol;
939
997
  metadata.parent_type = hierarchyInfo.parentType;
940
998
  }
999
+ // Carry sibling-symbol context into metadata so enrichEmbeddingText()
1000
+ // can rebuild the multi-symbol header during post-chunk enrichment.
1001
+ if (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0) {
1002
+ metadata.additional_symbols = hierarchyInfo.additionalSymbols;
1003
+ }
941
1004
 
942
1005
  return {
943
1006
  text: content.trim(),
@@ -967,15 +1030,29 @@ export class ASTChunker {
967
1030
  const parts = [];
968
1031
  parts.push(`# ${chunk.metadata.path}`);
969
1032
 
970
- if (scopeChain && scopeChain.length > 0) {
971
- parts.push(`# Scope: ${scopeChain.join(' > ')}`);
972
- } else if (chunk.metadata.parent_symbol) {
973
- // Preserve cAST parent context when no scope chain from code graph
974
- parts.push(`# Parent: ${chunk.metadata.parent_type} ${chunk.metadata.parent_symbol}`);
975
- }
976
-
977
- if (chunk.metadata.symbol && chunk.metadata.symbol !== 'unknown') {
978
- parts.push(`# Defines: ${chunk.metadata.chunk_type} ${chunk.metadata.symbol}`);
1033
+ const isRubyMethodChunk = chunk.metadata.language === 'ruby'
1034
+ && (chunk.metadata.chunk_type === 'method'
1035
+ || chunk.metadata.chunk_type === 'singleton_method');
1036
+ const hasOnlySelfScope = scopeChain
1037
+ && scopeChain.length === 1
1038
+ && scopeChain[0] === chunk.metadata.symbol
1039
+ && !chunk.metadata.parent_symbol;
1040
+
1041
+ if (scopeChain && scopeChain.length > 0 && !(isRubyMethodChunk && hasOnlySelfScope)) {
1042
+ parts.push(`# Scope: ${scopeChain.join(' > ')}`);
1043
+ } else if (chunk.metadata.parent_symbol) {
1044
+ // Preserve cAST parent context when no scope chain from code graph
1045
+ parts.push(`# Parent: ${chunk.metadata.parent_type} ${chunk.metadata.parent_symbol}`);
1046
+ }
1047
+ // Keep Ruby method metadata, but avoid injecting the method name into
1048
+ // the production embedding/LI text. Top-level Ruby method chunks also
1049
+ // skip self-only scope above.
1050
+ if (chunk.metadata.symbol && chunk.metadata.symbol !== 'unknown' && !isRubyMethodChunk) {
1051
+ parts.push(`# Defines: ${chunk.metadata.chunk_type} ${chunk.metadata.symbol}`);
1052
+ }
1053
+
1054
+ if (chunk.metadata.additional_symbols && chunk.metadata.additional_symbols.length > 0) {
1055
+ parts.push(`# Additional: ${chunk.metadata.additional_symbols.join(', ')}`);
979
1056
  }
980
1057
 
981
1058
  if (imports && imports.length > 0) {
@@ -16,7 +16,25 @@ function lengthOf(chunk) {
16
16
  }
17
17
 
18
18
  function pathOf(chunk) {
19
- return chunk.file || chunk.metadata?.path || '';
19
+ return firstSafeRelativePath(
20
+ chunk.metadata?.relative_path,
21
+ chunk.metadata?.path,
22
+ chunk.metadata?.file_path,
23
+ chunk.file,
24
+ chunk.metadata?.file,
25
+ ) || '';
26
+ }
27
+
28
+ function firstSafeRelativePath(...candidates) {
29
+ for (const candidate of candidates) {
30
+ if (typeof candidate !== 'string') continue;
31
+ const normalized = candidate.replace(/\\/g, '/').replace(/^\.\//, '').replace(/\/+/g, '/');
32
+ if (!normalized || normalized === '.' || normalized.startsWith('/')) continue;
33
+ if (/^[A-Za-z]:\//.test(normalized)) continue;
34
+ if (normalized === '..' || normalized.startsWith('../') || normalized.includes('/../')) continue;
35
+ return normalized;
36
+ }
37
+ return null;
20
38
  }
21
39
 
22
40
  function hashOf(chunk) {
@@ -0,0 +1,223 @@
1
+ /**
2
+ * Gitignore alignment for file admission.
3
+ *
4
+ * Extracted from indexer-utils.js so both full indexing (`discoverFiles`) and
5
+ * the shared admission policy (`admission-policy.js`, used by incremental
6
+ * indexing) run the *same* `.gitignore` logic. The only behavioural change vs
7
+ * the original is that the project root is a parameter instead of the global
8
+ * `PROJECT_ROOT` constant, so the incremental maintainer can align gitignore
9
+ * against the worktree it actually reconciles. Full indexing keeps passing
10
+ * `PROJECT_ROOT` (the default), so its behaviour is unchanged.
11
+ */
12
+
13
+ import fs from 'fs/promises';
14
+ import { existsSync } from 'fs';
15
+ import { spawn } from 'child_process';
16
+ import path from 'path';
17
+
18
+ import { PROJECT_ROOT, AGENTIC_GITIGNORE_ALLOWLIST } from '../infrastructure/config/index.js';
19
+
20
+ export function toPosixPath(filePath) {
21
+ return filePath.replace(/\\/g, '/');
22
+ }
23
+
24
+ function logError(message) {
25
+ console.error(`[indexer] ${message}`);
26
+ }
27
+
28
+ /**
29
+ * Agentic tooling paths stay indexable even when listed in `.gitignore`
30
+ * (local AI workflow files). Mirrors AGENTIC_GITIGNORE_ALLOWLIST.
31
+ */
32
+ export function isGitignoreAllowlistedAgenticPath(relativePath) {
33
+ const normalized = toPosixPath(relativePath).replace(/^\.\//, '');
34
+ const basename = path.posix.basename(normalized);
35
+
36
+ if (AGENTIC_GITIGNORE_ALLOWLIST.files.includes(basename)) {
37
+ return true;
38
+ }
39
+
40
+ if (AGENTIC_GITIGNORE_ALLOWLIST.filePrefixes.some(prefix => basename.startsWith(prefix))) {
41
+ return true;
42
+ }
43
+
44
+ return AGENTIC_GITIGNORE_ALLOWLIST.directories.some(dirPrefix =>
45
+ normalized.startsWith(dirPrefix) || normalized.includes(`/${dirPrefix}`)
46
+ );
47
+ }
48
+
49
+ /**
50
+ * Run `git check-ignore` on a single batch of paths.
51
+ * Returns a list of ignored paths, or null on fatal error.
52
+ */
53
+ function checkIgnoreBatch(batch, projectRoot, reportError) {
54
+ return new Promise((resolve) => {
55
+ const ignoredChunks = [];
56
+ let settled = false;
57
+
58
+ const git = spawn('git', ['check-ignore', '-z', '--stdin'], { cwd: projectRoot });
59
+
60
+ git.stdout.on('data', chunk => ignoredChunks.push(chunk));
61
+ git.stderr.on('data', () => {}); // Suppress — batched caller handles partial failures
62
+
63
+ git.on('error', (err) => {
64
+ if (settled) return;
65
+ settled = true;
66
+ reportError(`WARN: Unable to run git check-ignore (${err.message})`);
67
+ resolve(null);
68
+ });
69
+
70
+ git.on('close', (code) => {
71
+ if (settled) return;
72
+ settled = true;
73
+
74
+ // code 0 = some ignored, code 1 = none ignored, both valid.
75
+ // code 128 = fatal (e.g. path beyond symlink) — still use partial stdout.
76
+ if (code !== 0 && code !== 1 && ignoredChunks.length === 0) {
77
+ resolve(null);
78
+ return;
79
+ }
80
+
81
+ const ignored = Buffer.concat(ignoredChunks)
82
+ .toString('utf8')
83
+ .split('\0')
84
+ .filter(Boolean)
85
+ .map(toPosixPath);
86
+
87
+ resolve(ignored);
88
+ });
89
+
90
+ const stdinPayload = `${batch.map(toPosixPath).join('\0')}\0`;
91
+ git.stdin.on('error', () => {}); // Suppress EPIPE if git exits early
92
+ git.stdin.end(stdinPayload);
93
+ });
94
+ }
95
+
96
+ const CHECK_IGNORE_BATCH_SIZE = 5000;
97
+
98
+ /**
99
+ * Find directory components that are symlinks, so we can filter out paths
100
+ * that traverse them (git check-ignore fatals on "beyond a symbolic link").
101
+ */
102
+ async function findSymlinkDirs(paths, projectRoot) {
103
+ const checked = new Map();
104
+ const symlinkPrefixes = [];
105
+
106
+ for (const p of paths) {
107
+ const parts = p.split('/');
108
+ let dir = '';
109
+ for (let i = 0; i < parts.length - 1; i++) {
110
+ dir = dir ? `${dir}/${parts[i]}` : parts[i];
111
+ if (checked.has(dir)) continue;
112
+ try {
113
+ const stat = await fs.lstat(path.join(projectRoot, dir));
114
+ const isLink = stat.isSymbolicLink();
115
+ checked.set(dir, isLink);
116
+ if (isLink) symlinkPrefixes.push(dir + '/');
117
+ } catch {
118
+ checked.set(dir, false);
119
+ }
120
+ }
121
+ }
122
+
123
+ return symlinkPrefixes;
124
+ }
125
+
126
+ export async function getGitIgnoredPathSet(paths, options = {}) {
127
+ const projectRoot = options.projectRoot || PROJECT_ROOT;
128
+ const silent = options.silent ?? false;
129
+ const reportError = silent ? () => {} : logError;
130
+
131
+ if (paths.length === 0) {
132
+ return new Set();
133
+ }
134
+
135
+ const ignored = new Set();
136
+
137
+ // Pre-filter paths that traverse symlinks — git check-ignore fatals on these.
138
+ // Files beyond symlinks are also checked: if the symlink dir itself is ignored,
139
+ // all files under it are treated as ignored too.
140
+ const symlinkPrefixes = await findSymlinkDirs(paths, projectRoot);
141
+ let safePaths = paths;
142
+ if (symlinkPrefixes.length > 0) {
143
+ // Check if the symlink directories themselves are ignored
144
+ const symlinkDirs = symlinkPrefixes.map(p => p.slice(0, -1)); // remove trailing /
145
+ const symlinkIgnored = await checkIgnoreBatch(symlinkDirs, projectRoot, reportError);
146
+ const ignoredSymlinks = new Set(symlinkIgnored || []);
147
+
148
+ safePaths = [];
149
+ for (const p of paths) {
150
+ const matchedPrefix = symlinkPrefixes.find(prefix => p.startsWith(prefix));
151
+ if (matchedPrefix) {
152
+ // Path traverses a symlink — check if symlink dir is gitignored
153
+ const dir = matchedPrefix.slice(0, -1);
154
+ if (ignoredSymlinks.has(dir)) {
155
+ ignored.add(toPosixPath(p)); // inherit parent's ignored status
156
+ }
157
+ // Either way, skip git check-ignore (would fatal)
158
+ } else {
159
+ safePaths.push(p);
160
+ }
161
+ }
162
+ }
163
+
164
+ let failedBatches = 0;
165
+
166
+ for (let i = 0; i < safePaths.length; i += CHECK_IGNORE_BATCH_SIZE) {
167
+ const batch = safePaths.slice(i, i + CHECK_IGNORE_BATCH_SIZE);
168
+ const result = await checkIgnoreBatch(batch, projectRoot, reportError);
169
+ if (result) {
170
+ for (const p of result) ignored.add(p);
171
+ } else {
172
+ failedBatches++;
173
+ }
174
+ }
175
+
176
+ const totalBatches = Math.ceil(safePaths.length / CHECK_IGNORE_BATCH_SIZE);
177
+ if (failedBatches === totalBatches && totalBatches > 0) {
178
+ reportError('WARN: git check-ignore failed on all batches — gitignore filtering disabled');
179
+ return null;
180
+ }
181
+
182
+ return ignored;
183
+ }
184
+
185
+ export async function applyGitignoreAlignment(files, respectGitignore, options = {}) {
186
+ const projectRoot = options.projectRoot || PROJECT_ROOT;
187
+ if (!respectGitignore || !existsSync(path.join(projectRoot, '.git'))) {
188
+ return { files, gitignored: 0 };
189
+ }
190
+
191
+ const bypassGitignore = new Set();
192
+ const candidates = [];
193
+ for (const file of files) {
194
+ if (isGitignoreAllowlistedAgenticPath(file)) {
195
+ bypassGitignore.add(file);
196
+ } else {
197
+ candidates.push(file);
198
+ }
199
+ }
200
+
201
+ const ignoredSet = await getGitIgnoredPathSet(candidates, { projectRoot, silent: options.silent });
202
+ if (!ignoredSet) {
203
+ return { files, gitignored: 0 };
204
+ }
205
+
206
+ const kept = [];
207
+ let gitignored = 0;
208
+ for (const file of files) {
209
+ if (bypassGitignore.has(file)) {
210
+ kept.push(file);
211
+ continue;
212
+ }
213
+
214
+ const normalized = toPosixPath(file);
215
+ if (ignoredSet.has(normalized)) {
216
+ gitignored++;
217
+ continue;
218
+ }
219
+ kept.push(file);
220
+ }
221
+
222
+ return { files: kept, gitignored };
223
+ }