sweet-search 2.5.2 → 2.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/core/cli.js +24 -3
- package/core/graph/graph-expansion.js +215 -36
- package/core/graph/graph-extractor.js +196 -11
- package/core/graph/graph-search.js +395 -92
- package/core/graph/hcgs-generator.js +2 -1
- package/core/graph/index.js +2 -0
- package/core/graph/repo-map.js +28 -6
- package/core/graph/structural-answer-cues.js +168 -0
- package/core/graph/structural-callsite-hints.js +40 -0
- package/core/graph/structural-context-format.js +40 -0
- package/core/graph/structural-context.js +450 -0
- package/core/graph/structural-forward-push.js +156 -0
- package/core/graph/structural-header-context.js +19 -0
- package/core/graph/structural-importance.js +148 -0
- package/core/graph/structural-pagerank.js +197 -0
- package/core/graph/summary-manager.js +13 -9
- package/core/incremental-indexing/application/dirty-scan.mjs +236 -0
- package/core/incremental-indexing/application/file-watcher.mjs +197 -0
- package/core/incremental-indexing/application/maintenance-handlers.mjs +519 -0
- package/core/incremental-indexing/application/maintenance-worker.mjs +380 -0
- package/core/incremental-indexing/application/operator-cli.mjs +554 -0
- package/core/incremental-indexing/application/production-li-delta.mjs +192 -0
- package/core/incremental-indexing/application/production-reconciler-helpers.mjs +107 -0
- package/core/incremental-indexing/application/production-reconciler.mjs +583 -0
- package/core/incremental-indexing/application/reconciler.mjs +477 -0
- package/core/incremental-indexing/application/tombstone-injector.mjs +148 -0
- package/core/incremental-indexing/domain/chunk-identity.mjs +260 -0
- package/core/incremental-indexing/domain/encoder-deps.mjs +193 -0
- package/core/incremental-indexing/domain/encoder-input.mjs +225 -0
- package/core/incremental-indexing/domain/interval-autotune.mjs +255 -0
- package/core/incremental-indexing/domain/reconcile-counters.mjs +149 -0
- package/core/incremental-indexing/domain/watermark-scheduler.mjs +239 -0
- package/core/incremental-indexing/infrastructure/artifact-temp-sweep.mjs +163 -0
- package/core/incremental-indexing/infrastructure/baseline-readiness.mjs +121 -0
- package/core/incremental-indexing/infrastructure/dirty-set.mjs +233 -0
- package/core/incremental-indexing/infrastructure/graph-gc.mjs +314 -0
- package/core/incremental-indexing/infrastructure/hashing.mjs +298 -0
- package/core/incremental-indexing/infrastructure/hcgs-invalidation.mjs +182 -0
- package/core/incremental-indexing/infrastructure/li-segment-merge.mjs +278 -0
- package/core/incremental-indexing/infrastructure/li-segment-state.mjs +173 -0
- package/core/incremental-indexing/infrastructure/lockfile.mjs +119 -0
- package/core/incremental-indexing/infrastructure/maintenance-state-reader.mjs +283 -0
- package/core/incremental-indexing/infrastructure/manifest.mjs +194 -0
- package/core/incremental-indexing/infrastructure/path-filter.mjs +190 -0
- package/core/incremental-indexing/infrastructure/reader-heartbeat.mjs +201 -0
- package/core/incremental-indexing/infrastructure/schema-migrations.mjs +257 -0
- package/core/incremental-indexing/infrastructure/sparse-gram-delta.mjs +335 -0
- package/core/incremental-indexing/infrastructure/sqlite-fts5.mjs +176 -0
- package/core/incremental-indexing/infrastructure/staleness-display.mjs +105 -0
- package/core/incremental-indexing/infrastructure/tombstone-bitmap.mjs +234 -0
- package/core/incremental-indexing/infrastructure/vector-delta-writer.mjs +359 -0
- package/core/incremental-indexing/infrastructure/vector-gc.mjs +133 -0
- package/core/incremental-indexing/infrastructure/worktree-stamp.mjs +155 -0
- package/core/incremental-indexing/infrastructure/wsl2-detect.mjs +115 -0
- package/core/indexing/admission-policy.js +139 -0
- package/core/indexing/artifact-builder.js +29 -12
- package/core/indexing/ast-chunker.js +107 -30
- package/core/indexing/dedup/exemplar-selector.js +19 -1
- package/core/indexing/gitignore-filter.js +223 -0
- package/core/indexing/incremental-tracker.js +99 -30
- package/core/indexing/index-codebase-v21.js +6 -5
- package/core/indexing/index-maintainer.mjs +698 -6
- package/core/indexing/indexer-ann.js +99 -15
- package/core/indexing/indexer-build.js +158 -45
- package/core/indexing/indexer-empty-baseline.js +80 -0
- package/core/indexing/indexer-manifest.js +66 -0
- package/core/indexing/indexer-phases.js +56 -23
- package/core/indexing/indexer-sparse-gram.js +54 -13
- package/core/indexing/indexer-utils.js +26 -208
- package/core/indexing/indexing-file-policy.js +32 -7
- package/core/indexing/maintainer-launcher.mjs +137 -0
- package/core/indexing/merkle-tracker.js +251 -244
- package/core/indexing/model-pool.js +46 -5
- package/core/infrastructure/code-graph-repository.js +758 -6
- package/core/infrastructure/code-graph-visibility.js +157 -0
- package/core/infrastructure/codebase-repository.js +100 -13
- package/core/infrastructure/config/search.js +1 -1
- package/core/infrastructure/db-utils.js +118 -0
- package/core/infrastructure/dedup-hashing.js +10 -13
- package/core/infrastructure/hardware-capability.js +17 -7
- package/core/infrastructure/index.js +8 -2
- package/core/infrastructure/language-patterns/maps.js +4 -1
- package/core/infrastructure/language-patterns/registry-core.js +56 -17
- package/core/infrastructure/language-patterns/registry-object-oriented.js +12 -5
- package/core/infrastructure/language-patterns.js +69 -0
- package/core/infrastructure/model-registry.js +20 -0
- package/core/infrastructure/native-inference.js +7 -12
- package/core/infrastructure/native-resolver.js +52 -37
- package/core/infrastructure/native-sparse-gram.js +261 -20
- package/core/infrastructure/native-tokenizer.js +6 -15
- package/core/infrastructure/simd-distance.js +10 -16
- package/core/infrastructure/sparse-gram-delta-reader.js +76 -0
- package/core/infrastructure/structural-alias-resolver.js +122 -0
- package/core/infrastructure/structural-candidate-ranker.js +34 -0
- package/core/infrastructure/structural-context-repository.js +472 -0
- package/core/infrastructure/structural-context-utils.js +51 -0
- package/core/infrastructure/structural-graph-signals.js +121 -0
- package/core/infrastructure/structural-qualified-resolution.js +15 -0
- package/core/infrastructure/structural-source-definitions.js +100 -0
- package/core/infrastructure/tombstone-bitmap-reader.js +139 -0
- package/core/infrastructure/tree-sitter-provider.js +811 -37
- package/core/prompt-optimization/data/p7-final/sweet-search-system-prompt.md +50 -0
- package/core/query/query-router.js +55 -5
- package/core/ranking/file-kind-ranking.js +2192 -15
- package/core/ranking/late-interaction-index.js +87 -12
- package/core/search/cli-decoration.js +290 -0
- package/core/search/context-expander.js +988 -78
- package/core/search/index.js +1 -0
- package/core/search/output-policy.js +275 -0
- package/core/search/search-anchor.js +499 -0
- package/core/search/search-boost.js +93 -1
- package/core/search/search-cli.js +61 -204
- package/core/search/search-hybrid.js +250 -10
- package/core/search/search-pattern-chunks.js +57 -8
- package/core/search/search-pattern-planner.js +68 -9
- package/core/search/search-pattern-prefilter.js +30 -10
- package/core/search/search-pattern-ripgrep.js +40 -4
- package/core/search/search-pattern-sparse-overlay.js +256 -0
- package/core/search/search-pattern.js +117 -29
- package/core/search/search-postprocess.js +479 -5
- package/core/search/search-read-semantic.js +260 -23
- package/core/search/search-read.js +82 -64
- package/core/search/search-reader-pin.js +71 -0
- package/core/search/search-rrf.js +279 -0
- package/core/search/search-semantic.js +110 -5
- package/core/search/search-server.js +130 -57
- package/core/search/search-trace.js +107 -0
- package/core/search/server-identity.js +93 -0
- package/core/search/session-daemon-prewarm.mjs +33 -10
- package/core/search/sweet-search.js +399 -7
- package/core/skills/sweet-index/SKILL.md +8 -6
- package/core/vector-store/binary-hnsw-index.js +194 -30
- package/core/vector-store/float-vector-store.js +96 -6
- package/core/vector-store/hnsw-index.js +220 -49
- package/eval/agent-read-workflows/bin/_ss-helpers.mjs +471 -0
- package/eval/agent-read-workflows/bin/ss-find +15 -0
- package/eval/agent-read-workflows/bin/ss-grep +12 -0
- package/eval/agent-read-workflows/bin/ss-read +14 -0
- package/eval/agent-read-workflows/bin/ss-search +18 -0
- package/eval/agent-read-workflows/bin/ss-semantic +12 -0
- package/eval/agent-read-workflows/bin/ss-trace +11 -0
- package/mcp/read-tool.js +109 -0
- package/mcp/server.js +55 -15
- package/mcp/tool-handlers.js +14 -124
- package/mcp/trace-tool.js +81 -0
- package/package.json +25 -10
- package/scripts/hooks/intercept-read.mjs +55 -0
- package/scripts/hooks/remind-tools.mjs +40 -0
- package/scripts/init.js +698 -54
- package/scripts/inject-agent-instructions.js +431 -0
- package/scripts/install-prompt-reminders.js +188 -0
- package/scripts/install-tool-enforcement.js +220 -0
- package/scripts/smoke-test.js +12 -9
- package/scripts/uninstall.js +276 -18
- package/scripts/write-claude-rules.js +110 -0
|
@@ -12,7 +12,7 @@ import { createHash } from 'crypto';
|
|
|
12
12
|
import path from 'path';
|
|
13
13
|
import fs from 'fs/promises';
|
|
14
14
|
import { detectProjectBoundary } from '../infrastructure/project-detector.js';
|
|
15
|
-
import { getLanguageByPath } from '../infrastructure/language-patterns.js';
|
|
15
|
+
import { getLanguageByPath, resolveLanguage } from '../infrastructure/language-patterns.js';
|
|
16
16
|
import { DocumentChunker } from './document-chunker.js';
|
|
17
17
|
|
|
18
18
|
const MAX_CHUNK_SIZE = 2000;
|
|
@@ -151,12 +151,25 @@ function buildEmbeddingText({ variant: variantOverride, content, relativePath, l
|
|
|
151
151
|
const pathLine = relativePath ? `# ${relativePath}` : null;
|
|
152
152
|
const parentLine = hierarchyInfo?.parentSymbol
|
|
153
153
|
? `# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}` : null;
|
|
154
|
-
|
|
154
|
+
// Ruby method chunks keep metadata anchors, but omit method-name header
|
|
155
|
+
// text so top-level Ruby snippets stay aligned with the pre-method-boundary
|
|
156
|
+
// embedding surface.
|
|
157
|
+
const isRubyMethodChunk = language === 'ruby'
|
|
158
|
+
&& (chunkType === 'method' || chunkType === 'singleton_method');
|
|
159
|
+
const symbolLine = (symbol && symbol !== 'unknown' && !isRubyMethodChunk)
|
|
155
160
|
? `# ${chunkType}: ${symbol}` : null;
|
|
156
161
|
const langLine = (language && language !== 'text')
|
|
157
162
|
? `# Language: ${language}` : null;
|
|
158
163
|
const signatureLine = hierarchyInfo?.signature
|
|
159
164
|
? `# Signature: ${hierarchyInfo.signature}` : null;
|
|
165
|
+
// Multi-symbol header: when the cAST sibling-merge collapses several
|
|
166
|
+
// top-level boundaries into one chunk (small Rust files with adjacent
|
|
167
|
+
// free-standing fns, e.g. packaging.rs:is_package + detect_package_root),
|
|
168
|
+
// the bi-encoder otherwise only sees the FIRST boundary's name. Adding
|
|
169
|
+
// an `# Additional:` line surfaces sibling symbol names to the encoder
|
|
170
|
+
// without changing chunk count or chunk text.
|
|
171
|
+
const additionalLine = (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0)
|
|
172
|
+
? `# Additional: ${hierarchyInfo.additionalSymbols.join(', ')}` : null;
|
|
160
173
|
|
|
161
174
|
switch (variant) {
|
|
162
175
|
case 'no_path':
|
|
@@ -223,6 +236,7 @@ function buildEmbeddingText({ variant: variantOverride, content, relativePath, l
|
|
|
223
236
|
if (pathLine) parts.push(pathLine);
|
|
224
237
|
if (parentLine) parts.push(parentLine);
|
|
225
238
|
if (symbolLine) parts.push(symbolLine);
|
|
239
|
+
if (additionalLine) parts.push(additionalLine);
|
|
226
240
|
if (langLine) parts.push(langLine);
|
|
227
241
|
break;
|
|
228
242
|
}
|
|
@@ -279,9 +293,17 @@ function buildLiText({ content, relativePath, language, chunkType, symbol, hiera
|
|
|
279
293
|
if (hierarchyInfo?.parentSymbol) {
|
|
280
294
|
lines.push(`# Parent: ${hierarchyInfo.parentType} ${hierarchyInfo.parentSymbol}`);
|
|
281
295
|
}
|
|
282
|
-
|
|
296
|
+
// Mirror the Ruby method header carve-out used by embedding_text.
|
|
297
|
+
const isRubyMethodChunk = language === 'ruby'
|
|
298
|
+
&& (chunkType === 'method' || chunkType === 'singleton_method');
|
|
299
|
+
if (symbol && symbol !== 'unknown' && !isRubyMethodChunk) {
|
|
283
300
|
lines.push(`# ${chunkType}: ${symbol}`);
|
|
284
301
|
}
|
|
302
|
+
// Mirror embedding_text: surface sibling symbol names so the LI MaxSim
|
|
303
|
+
// stage's input includes the same context as the bi-encoder embedding.
|
|
304
|
+
if (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0) {
|
|
305
|
+
lines.push(`# Additional: ${hierarchyInfo.additionalSymbols.join(', ')}`);
|
|
306
|
+
}
|
|
285
307
|
if (language && language !== 'text') {
|
|
286
308
|
lines.push(`# Language: ${language}`);
|
|
287
309
|
}
|
|
@@ -321,7 +343,11 @@ export class ASTChunker {
|
|
|
321
343
|
return this._docChunker.parseFile(filePath, content);
|
|
322
344
|
}
|
|
323
345
|
|
|
324
|
-
|
|
346
|
+
// resolveLanguage handles per-file disambiguation of ambiguous extensions
|
|
347
|
+
// (today: `.h` → c-vs-cpp) using a content scan for C++-only tokens.
|
|
348
|
+
// Header-only C++ libraries (highway, Eigen, …) are routed to cpp so the
|
|
349
|
+
// chunker uses tree-sitter-cpp instead of tree-sitter-c.
|
|
350
|
+
const langInfo = resolveLanguage(filePath, content);
|
|
325
351
|
if (!langInfo || !langInfo.chunker) {
|
|
326
352
|
return this.parseGenericFile(filePath, content);
|
|
327
353
|
}
|
|
@@ -361,20 +387,27 @@ export class ASTChunker {
|
|
|
361
387
|
const tsChunks = await provider.parseFileToChunks(content, langInfo.id);
|
|
362
388
|
if (!tsChunks || tsChunks.length === 0) return null;
|
|
363
389
|
|
|
364
|
-
return tsChunks.map(chunk =>
|
|
365
|
-
|
|
366
|
-
chunk.
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
390
|
+
return tsChunks.map(chunk => {
|
|
391
|
+
const isTopLevelRubyMethod = langInfo.id === 'ruby'
|
|
392
|
+
&& (chunk.type === 'method' || chunk.type === 'singleton_method')
|
|
393
|
+
&& !chunk.parentSymbol;
|
|
394
|
+
|
|
395
|
+
return this.buildChunk(
|
|
396
|
+
chunk.text, filePath, langInfo.id,
|
|
397
|
+
isTopLevelRubyMethod ? 'code' : chunk.type,
|
|
398
|
+
isTopLevelRubyMethod ? null : chunk.name,
|
|
399
|
+
chunk.startLine, chunk.endLine,
|
|
400
|
+
{
|
|
401
|
+
chunkId: chunk.chunkId,
|
|
402
|
+
parentChunkId: chunk.parentChunkId,
|
|
403
|
+
parentSymbol: chunk.parentSymbol,
|
|
404
|
+
parentType: chunk.parentType,
|
|
405
|
+
signature: chunk.signature || null,
|
|
406
|
+
additionalSymbols: chunk.additionalSymbols || null,
|
|
407
|
+
}
|
|
408
|
+
);
|
|
409
|
+
});
|
|
410
|
+
}
|
|
378
411
|
|
|
379
412
|
parseBraceBasedFile(filePath, content, language, patterns, comment, multiLine) {
|
|
380
413
|
const chunks = [];
|
|
@@ -388,12 +421,30 @@ export class ASTChunker {
|
|
|
388
421
|
|
|
389
422
|
for (let i = 0; i < lines.length; i++) {
|
|
390
423
|
const line = lines[i];
|
|
424
|
+
// Capture the comment state BEFORE _stripNonCode mutates it so we
|
|
425
|
+
// can tell whether this line entered the iteration inside a block
|
|
426
|
+
// comment (e.g. mid-Javadoc). Boundary detection must be skipped
|
|
427
|
+
// for such lines — otherwise `_matchBoundary` happily matches
|
|
428
|
+
// `public class MyClass {` inside a Java `/** ... <pre> ... */`
|
|
429
|
+
// example and emits a phantom class chunk. Verified on gson
|
|
430
|
+
// SerializedName.java where the regex fallback path emits a
|
|
431
|
+
// 25-82 chunk attributed to "class MyClass" sourced from the
|
|
432
|
+
// Javadoc body. See the matching extractJava() block-comment
|
|
433
|
+
// skip in core/graph/graph-extractor.js for the symmetric fix.
|
|
434
|
+
const inBlockCommentAtStart = stripState.inBlockComment;
|
|
391
435
|
const stripped = this._stripNonCode(line, stripState, comment, hasTemplateInterpolation);
|
|
392
436
|
|
|
393
437
|
braceDepth += (stripped.match(/{/g) || []).length;
|
|
394
438
|
braceDepth -= (stripped.match(/}/g) || []).length;
|
|
395
439
|
|
|
396
|
-
|
|
440
|
+
// When the line is entirely inside a block comment (entered as
|
|
441
|
+
// such AND still inside on exit), there's nothing executable to
|
|
442
|
+
// match — skip boundary detection entirely. The stripped output
|
|
443
|
+
// is already empty/whitespace so brace-depth tracking is a no-op.
|
|
444
|
+
const lineFullyInComment = inBlockCommentAtStart && stripState.inBlockComment;
|
|
445
|
+
const { name: matched, type: matchType, joinedLines } = lineFullyInComment
|
|
446
|
+
? { name: null, type: null, joinedLines: 0 }
|
|
447
|
+
: this._matchBoundary(stripped, patterns, language, lines, i, multiLine);
|
|
397
448
|
|
|
398
449
|
if ((matched && currentChunk) || (braceDepth === 0 && currentChunk)) {
|
|
399
450
|
const chunkContent = lines.slice(chunkStart, i + 1).join('\n');
|
|
@@ -805,13 +856,20 @@ export class ASTChunker {
|
|
|
805
856
|
for (let i = 0; i < lines.length; i++) {
|
|
806
857
|
const line = lines[i];
|
|
807
858
|
const lineSize = line.length + 1; // +1 for newline
|
|
859
|
+
const inBlockCommentAtStart = stripState.inBlockComment;
|
|
808
860
|
const stripped = this._stripNonCode(line, stripState, comment, hasTemplateInterpolation);
|
|
809
861
|
|
|
810
862
|
braceDepth += (stripped.match(/{/g) || []).length;
|
|
811
863
|
braceDepth -= (stripped.match(/}/g) || []).length;
|
|
812
864
|
|
|
813
|
-
// Check if this line is a sub-boundary (a new construct starting at depth >= 1)
|
|
814
|
-
|
|
865
|
+
// Check if this line is a sub-boundary (a new construct starting at depth >= 1).
|
|
866
|
+
// Use the comment-stripped line so Javadoc `<pre>public class Foo {...}</pre>`
|
|
867
|
+
// examples don't get matched as real sub-boundaries (parallels
|
|
868
|
+
// parseBraceBasedFile fix above).
|
|
869
|
+
const lineFullyInComment = inBlockCommentAtStart && stripState.inBlockComment;
|
|
870
|
+
const { name: matched, type: matchType } = lineFullyInComment
|
|
871
|
+
? { name: null, type: null }
|
|
872
|
+
: this._matchBoundary(stripped, patterns, language, lines, i, false);
|
|
815
873
|
const isSubBoundary = matched && i > segStart;
|
|
816
874
|
|
|
817
875
|
// Split condition: at a sub-boundary, or accumulated segment exceeds max
|
|
@@ -938,6 +996,11 @@ export class ASTChunker {
|
|
|
938
996
|
metadata.parent_symbol = hierarchyInfo.parentSymbol;
|
|
939
997
|
metadata.parent_type = hierarchyInfo.parentType;
|
|
940
998
|
}
|
|
999
|
+
// Carry sibling-symbol context into metadata so enrichEmbeddingText()
|
|
1000
|
+
// can rebuild the multi-symbol header during post-chunk enrichment.
|
|
1001
|
+
if (hierarchyInfo?.additionalSymbols && hierarchyInfo.additionalSymbols.length > 0) {
|
|
1002
|
+
metadata.additional_symbols = hierarchyInfo.additionalSymbols;
|
|
1003
|
+
}
|
|
941
1004
|
|
|
942
1005
|
return {
|
|
943
1006
|
text: content.trim(),
|
|
@@ -967,15 +1030,29 @@ export class ASTChunker {
|
|
|
967
1030
|
const parts = [];
|
|
968
1031
|
parts.push(`# ${chunk.metadata.path}`);
|
|
969
1032
|
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
1033
|
+
const isRubyMethodChunk = chunk.metadata.language === 'ruby'
|
|
1034
|
+
&& (chunk.metadata.chunk_type === 'method'
|
|
1035
|
+
|| chunk.metadata.chunk_type === 'singleton_method');
|
|
1036
|
+
const hasOnlySelfScope = scopeChain
|
|
1037
|
+
&& scopeChain.length === 1
|
|
1038
|
+
&& scopeChain[0] === chunk.metadata.symbol
|
|
1039
|
+
&& !chunk.metadata.parent_symbol;
|
|
1040
|
+
|
|
1041
|
+
if (scopeChain && scopeChain.length > 0 && !(isRubyMethodChunk && hasOnlySelfScope)) {
|
|
1042
|
+
parts.push(`# Scope: ${scopeChain.join(' > ')}`);
|
|
1043
|
+
} else if (chunk.metadata.parent_symbol) {
|
|
1044
|
+
// Preserve cAST parent context when no scope chain from code graph
|
|
1045
|
+
parts.push(`# Parent: ${chunk.metadata.parent_type} ${chunk.metadata.parent_symbol}`);
|
|
1046
|
+
}
|
|
1047
|
+
// Keep Ruby method metadata, but avoid injecting the method name into
|
|
1048
|
+
// the production embedding/LI text. Top-level Ruby method chunks also
|
|
1049
|
+
// skip self-only scope above.
|
|
1050
|
+
if (chunk.metadata.symbol && chunk.metadata.symbol !== 'unknown' && !isRubyMethodChunk) {
|
|
1051
|
+
parts.push(`# Defines: ${chunk.metadata.chunk_type} ${chunk.metadata.symbol}`);
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
if (chunk.metadata.additional_symbols && chunk.metadata.additional_symbols.length > 0) {
|
|
1055
|
+
parts.push(`# Additional: ${chunk.metadata.additional_symbols.join(', ')}`);
|
|
979
1056
|
}
|
|
980
1057
|
|
|
981
1058
|
if (imports && imports.length > 0) {
|
|
@@ -16,7 +16,25 @@ function lengthOf(chunk) {
|
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
/**
 * Pick the path string recorded for a chunk.
 *
 * Candidates are tried in a fixed order: `metadata.relative_path`,
 * `metadata.path`, `metadata.file_path`, `chunk.file`, `metadata.file`.
 * The first one accepted by `firstSafeRelativePath` wins.
 *
 * @param {object} chunk - Chunk record; `metadata` may be absent.
 * @returns {string} The normalized path, or '' when no candidate is accepted.
 */
function pathOf(chunk) {
  const meta = chunk.metadata;
  const candidates = [
    meta?.relative_path,
    meta?.path,
    meta?.file_path,
    chunk.file,
    meta?.file,
  ];
  const safePath = firstSafeRelativePath(...candidates);
  return safePath || '';
}
|
|
27
|
+
|
|
28
|
+
/**
 * Return the first candidate that normalizes to a project-relative path.
 *
 * Normalization converts backslashes to '/', strips one leading './' and
 * collapses runs of '/'. A candidate is rejected when it is not a string,
 * normalizes to '' or '.', is absolute (leading '/' or a drive prefix such
 * as 'C:/'), or contains a parent-directory step ('..', leading '../', or an
 * embedded '/../').
 *
 * @param {...*} candidates - Values to try, in priority order.
 * @returns {string|null} The normalized path, or null if none qualifies.
 */
function firstSafeRelativePath(...candidates) {
  const isRejected = (p) => {
    if (!p || p === '.') return true;
    if (p.startsWith('/')) return true;
    if (/^[A-Za-z]:\//.test(p)) return true;
    return p === '..' || p.startsWith('../') || p.includes('/../');
  };

  for (const raw of candidates) {
    if (typeof raw === 'string') {
      const cleaned = raw
        .replace(/\\/g, '/')
        .replace(/^\.\//, '')
        .replace(/\/+/g, '/');
      if (!isRejected(cleaned)) {
        return cleaned;
      }
    }
  }
  return null;
}
|
|
21
39
|
|
|
22
40
|
function hashOf(chunk) {
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gitignore alignment for file admission.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from indexer-utils.js so both full indexing (`discoverFiles`) and
|
|
5
|
+
* the shared admission policy (`admission-policy.js`, used by incremental
|
|
6
|
+
* indexing) run the *same* `.gitignore` logic. The only behavioural change vs
|
|
7
|
+
* the original is that the project root is a parameter instead of the global
|
|
8
|
+
* `PROJECT_ROOT` constant, so the incremental maintainer can align gitignore
|
|
9
|
+
* against the worktree it actually reconciles. Full indexing keeps passing
|
|
10
|
+
* `PROJECT_ROOT` (the default), so its behaviour is unchanged.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fs from 'fs/promises';
|
|
14
|
+
import { existsSync } from 'fs';
|
|
15
|
+
import { spawn } from 'child_process';
|
|
16
|
+
import path from 'path';
|
|
17
|
+
|
|
18
|
+
import { PROJECT_ROOT, AGENTIC_GITIGNORE_ALLOWLIST } from '../infrastructure/config/index.js';
|
|
19
|
+
|
|
20
|
+
/**
 * Convert every backslash in a path string to a forward slash.
 *
 * @param {string} filePath - Path that may use Windows separators.
 * @returns {string} The same path with '/' as the only separator.
 */
export function toPosixPath(filePath) {
  const segments = filePath.split('\\');
  return segments.join('/');
}
|
|
23
|
+
|
|
24
|
+
/**
 * Write a message to stderr with the `[indexer]` prefix.
 *
 * @param {string} message - Text to report.
 */
function logError(message) {
  const line = `[indexer] ${message}`;
  console.error(line);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Decide whether a path is exempt from `.gitignore` filtering because it
 * matches AGENTIC_GITIGNORE_ALLOWLIST (agentic tooling / local AI workflow
 * files that should stay indexable even when gitignored).
 *
 * A path matches when its basename is listed in `files`, its basename starts
 * with one of `filePrefixes`, or one of `directories` appears at the start of
 * the path or directly after a '/'.
 *
 * @param {string} relativePath - Path to test; backslashes and a leading
 *   './' are normalized away before matching.
 * @returns {boolean} True when the path is allowlisted.
 */
export function isGitignoreAllowlistedAgenticPath(relativePath) {
  const posixPath = toPosixPath(relativePath).replace(/^\.\//, '');
  const fileName = path.posix.basename(posixPath);
  const { files, filePrefixes, directories } = AGENTIC_GITIGNORE_ALLOWLIST;

  const nameIsAllowlisted = files.includes(fileName)
    || filePrefixes.some((prefix) => fileName.startsWith(prefix));
  if (nameIsAllowlisted) {
    return true;
  }

  for (const dirPrefix of directories) {
    if (posixPath.startsWith(dirPrefix) || posixPath.includes(`/${dirPrefix}`)) {
      return true;
    }
  }
  return false;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Run `git check-ignore -z --stdin` for one batch of paths.
 *
 * @param {string[]} batch - Paths to test; sent NUL-separated on stdin.
 * @param {string} projectRoot - Working directory for the git process.
 * @param {(message: string) => void} reportError - Called when git cannot
 *   be spawned.
 * @returns {Promise<string[]|null>} Ignored paths (POSIX separators), or
 *   null when git could not run or exited abnormally without any output.
 *   The promise never rejects because of the child process events handled
 *   here.
 */
function checkIgnoreBatch(batch, projectRoot, reportError) {
  return new Promise((resolve) => {
    const stdoutParts = [];
    let done = false;

    // 'error' and 'close' may both fire; only the first one may resolve.
    const claim = () => {
      if (done) return false;
      done = true;
      return true;
    };

    const child = spawn('git', ['check-ignore', '-z', '--stdin'], { cwd: projectRoot });

    child.stdout.on('data', (part) => {
      stdoutParts.push(part);
    });
    // stderr is drained and discarded; the batched caller copes with
    // partial failures.
    child.stderr.on('data', () => {});

    child.on('error', (err) => {
      if (!claim()) return;
      reportError(`WARN: Unable to run git check-ignore (${err.message})`);
      resolve(null);
    });

    child.on('close', (code) => {
      if (!claim()) return;

      // Exit 0 (some ignored) and 1 (none ignored) are both normal. Any
      // other code (e.g. 128, fatal) is only a failure when git produced
      // no output at all; otherwise the partial stdout is still used.
      const normalExit = code === 0 || code === 1;
      if (!normalExit && stdoutParts.length === 0) {
        resolve(null);
        return;
      }

      const output = Buffer.concat(stdoutParts).toString('utf8');
      const ignoredPaths = [];
      for (const entry of output.split('\0')) {
        if (entry) ignoredPaths.push(toPosixPath(entry));
      }
      resolve(ignoredPaths);
    });

    const posixBatch = batch.map((entry) => toPosixPath(entry));
    const stdinPayload = `${posixBatch.join('\0')}\0`;
    // Ignore stdin errors such as EPIPE when git exits before reading.
    child.stdin.on('error', () => {});
    child.stdin.end(stdinPayload);
  });
}
|
|
95
|
+
|
|
96
|
+
const CHECK_IGNORE_BATCH_SIZE = 5000;
|
|
97
|
+
|
|
98
|
+
/**
 * Collect the directory components of the given paths that are symbolic
 * links. Callers use the result to keep such paths away from
 * `git check-ignore`, which fatals on paths "beyond a symbolic link".
 *
 * Each '/'-separated ancestor directory is lstat'ed at most once, relative
 * to `projectRoot`. Directories that cannot be stat'ed are treated as
 * non-symlinks.
 *
 * @param {string[]} paths - '/'-separated paths relative to projectRoot.
 * @param {string} projectRoot - Directory the paths are resolved against.
 * @returns {Promise<string[]>} Symlinked directory prefixes, each with a
 *   trailing '/', in first-seen order.
 */
async function findSymlinkDirs(paths, projectRoot) {
  const visited = new Set();
  const prefixes = [];

  for (const entry of paths) {
    // Every segment except the last is an ancestor directory.
    const dirSegments = entry.split('/').slice(0, -1);
    let current = '';

    for (const segment of dirSegments) {
      current = current ? `${current}/${segment}` : segment;
      if (visited.has(current)) continue;
      visited.add(current);

      let isLink = false;
      try {
        const info = await fs.lstat(path.join(projectRoot, current));
        isLink = info.isSymbolicLink();
      } catch {
        // Missing or unreadable directory: not a symlink for our purposes.
        isLink = false;
      }

      if (isLink) prefixes.push(`${current}/`);
    }
  }

  return prefixes;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Determine which of the given paths are ignored by git.
 *
 * Paths that traverse a symlinked directory are never passed to
 * `git check-ignore` (it fatals on them). Such a path is reported as
 * ignored only when the symlinked directory itself is ignored. The
 * remaining paths are checked in batches of CHECK_IGNORE_BATCH_SIZE.
 *
 * @param {string[]} paths - Paths relative to the project root.
 * @param {object} [options]
 * @param {string} [options.projectRoot] - Root to run git in; falls back to
 *   PROJECT_ROOT.
 * @param {boolean} [options.silent] - When true, warnings are not logged.
 * @returns {Promise<Set<string>|null>} Set of ignored paths, or null when
 *   every batch failed (gitignore filtering unavailable).
 */
export async function getGitIgnoredPathSet(paths, options = {}) {
  const projectRoot = options.projectRoot || PROJECT_ROOT;
  const reportError = (options.silent ?? false) ? () => {} : logError;

  const ignored = new Set();
  if (paths.length === 0) {
    return ignored;
  }

  const symlinkPrefixes = await findSymlinkDirs(paths, projectRoot);
  let safePaths = paths;

  if (symlinkPrefixes.length > 0) {
    // Ask git about the symlinked directories themselves (prefix minus the
    // trailing '/'); a failed check is treated as "none ignored".
    const symlinkDirs = symlinkPrefixes.map((prefix) => prefix.slice(0, -1));
    const symlinkIgnored = await checkIgnoreBatch(symlinkDirs, projectRoot, reportError);
    const ignoredSymlinks = new Set(symlinkIgnored || []);

    safePaths = [];
    for (const candidate of paths) {
      const viaSymlink = symlinkPrefixes.find((prefix) => candidate.startsWith(prefix));
      if (!viaSymlink) {
        safePaths.push(candidate);
        continue;
      }
      // Beyond a symlink: inherit the symlink directory's ignored status
      // and never hand the path to git check-ignore.
      if (ignoredSymlinks.has(viaSymlink.slice(0, -1))) {
        ignored.add(toPosixPath(candidate));
      }
    }
  }

  const totalBatches = Math.ceil(safePaths.length / CHECK_IGNORE_BATCH_SIZE);
  let failedBatches = 0;

  for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
    const start = batchIndex * CHECK_IGNORE_BATCH_SIZE;
    const batch = safePaths.slice(start, start + CHECK_IGNORE_BATCH_SIZE);
    const result = await checkIgnoreBatch(batch, projectRoot, reportError);
    if (!result) {
      failedBatches++;
      continue;
    }
    for (const ignoredPath of result) {
      ignored.add(ignoredPath);
    }
  }

  if (totalBatches > 0 && failedBatches === totalBatches) {
    reportError('WARN: git check-ignore failed on all batches — gitignore filtering disabled');
    return null;
  }

  return ignored;
}
|
|
184
|
+
|
|
185
|
+
/**
 * Drop gitignored files from a candidate list.
 *
 * Nothing is filtered when `respectGitignore` is falsy, when the project
 * root has no `.git` entry, or when the git ignore check fails entirely.
 * Files matched by `isGitignoreAllowlistedAgenticPath` are always kept and
 * are never sent to git.
 *
 * @param {Iterable<string>} files - Candidate paths relative to the root.
 * @param {boolean} respectGitignore - Whether to apply `.gitignore` at all.
 * @param {object} [options]
 * @param {string} [options.projectRoot] - Root to check; falls back to
 *   PROJECT_ROOT.
 * @param {boolean} [options.silent] - Forwarded to getGitIgnoredPathSet.
 * @returns {Promise<{files: Iterable<string>, gitignored: number}>} The kept
 *   files (the original `files` value when nothing was filtered) and the
 *   number of files dropped.
 */
export async function applyGitignoreAlignment(files, respectGitignore, options = {}) {
  const projectRoot = options.projectRoot || PROJECT_ROOT;
  const unfiltered = { files, gitignored: 0 };

  if (!respectGitignore) {
    return unfiltered;
  }
  if (!existsSync(path.join(projectRoot, '.git'))) {
    return unfiltered;
  }

  // Split into allowlisted files (bypass git) and files git must judge.
  const allowlisted = new Set();
  const toCheck = [];
  for (const file of files) {
    if (!isGitignoreAllowlistedAgenticPath(file)) {
      toCheck.push(file);
    } else {
      allowlisted.add(file);
    }
  }

  const ignoredSet = await getGitIgnoredPathSet(toCheck, { projectRoot, silent: options.silent });
  if (!ignoredSet) {
    return unfiltered;
  }

  const kept = [];
  let gitignored = 0;
  for (const file of files) {
    const keep = allowlisted.has(file) || !ignoredSet.has(toPosixPath(file));
    if (keep) {
      kept.push(file);
    } else {
      gitignored++;
    }
  }

  return { files: kept, gitignored };
}
|