xindex 1.0.17 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.mcp.json +0 -4
- package/.xindex.json +2 -1
- package/apps/indexApp.ts +1 -1
- package/apps/mcpApp.ts +18 -4
- package/apps/run.mcp.ts +12 -6
- package/apps/run.search.ts +2 -2
- package/apps/searchApp.ts +18 -6
- package/apps/watchApp.ts +1 -1
- package/components/buildComponents.ts +2 -0
- package/components/config/loadConfig.ts +6 -0
- package/components/config/xindexConfig.ts +2 -0
- package/components/index/contentIndexDriver.ts +3 -2
- package/components/keywords/cleanUpKeywords.ts +23 -3
- package/components/locate/locateInFile.ts +61 -57
- package/components/locate/windowsOf.ts +3 -1
- package/features/searchIndex.ts +71 -9
- package/package.json +2 -2
- package/packages/fun/src/array-finder.ts +1 -1
- package/packages/fun/src/array-index.ts +1 -1
- package/packages/fun/src/asyncRequest.ts +1 -1
- package/packages/fun/src/concurrency.ts +5 -5
- package/packages/fun/src/counter.ts +1 -1
- package/packages/fun/src/flatten.ts +12 -6
- package/packages/fun/src/hash128.ts +2 -2
- package/packages/fun/src/hash256.ts +2 -2
- package/packages/fun/src/hub.ts +1 -1
- package/packages/fun/src/interval.ts +1 -1
- package/packages/fun/src/mailbox.ts +1 -1
- package/packages/fun/src/match-left-and-right-arrays.ts +1 -1
- package/packages/fun/src/memos.ts +1 -1
- package/packages/fun/src/pubsub.ts +2 -2
- package/packages/fun/src/tick.ts +1 -1
- package/packages/fun/src/time-behavior.ts +1 -1
- package/packages/fun/src/timedFallback.ts +1 -1
- package/packages/fun/src/value.ts +1 -1
- package/packages/fun/src/waitForCounter.ts +2 -2
- package/packages/streamx/src/batch.ts +2 -2
- package/packages/streamx/src/batchTimed.ts +5 -5
- package/packages/streamx/src/buffer.ts +4 -4
- package/packages/streamx/src/concatenate.ts +1 -1
- package/packages/streamx/src/filter.ts +2 -2
- package/packages/streamx/src/flat.ts +1 -1
- package/packages/streamx/src/flatMap.ts +3 -3
- package/packages/streamx/src/from.ts +1 -1
- package/packages/streamx/src/interval.ts +3 -3
- package/packages/streamx/src/loop.ts +2 -2
- package/packages/streamx/src/map.ts +2 -2
- package/packages/streamx/src/merge.ts +4 -4
- package/packages/streamx/src/nodeReadable.ts +1 -1
- package/packages/streamx/src/nodeTransform.ts +2 -2
- package/packages/streamx/src/nodeWritable.ts +3 -3
- package/packages/streamx/src/objectReader.ts +2 -2
- package/packages/streamx/src/reader.ts +1 -1
- package/packages/streamx/src/reduce.ts +2 -2
- package/packages/streamx/src/scale.ts +7 -7
- package/packages/streamx/src/scaleSync.ts +5 -5
- package/packages/streamx/src/sequence.ts +1 -1
- package/packages/streamx/src/tap.ts +3 -3
- package/packages/streamx/src/toArray.ts +1 -1
- package/packages/streamx/src/writer.ts +4 -4
- package/tsconfig.json +1 -1
- package/components/index/documentContentIndexDriver.ts +0 -127
package/.mcp.json
CHANGED
package/.xindex.json
CHANGED
package/apps/indexApp.ts
CHANGED
|
@@ -6,7 +6,7 @@ import {run} from "../packages/streamx/src/index.js";
|
|
|
6
6
|
import {IWalkFiles} from "../components/walkFiles.js";
|
|
7
7
|
import {IIndexContent} from "../features/indexContent.js";
|
|
8
8
|
import {ILogger} from "../components/logger.js";
|
|
9
|
-
import {INDEXING_BATCH_SIZE} from "../components/config/INDEXING_BATCH_SIZE";
|
|
9
|
+
import {INDEXING_BATCH_SIZE} from "../components/config/INDEXING_BATCH_SIZE.js";
|
|
10
10
|
import {SafeIndexBatch} from "../components/io/safeIndexBatch.js";
|
|
11
11
|
|
|
12
12
|
export type IIndexApp = (inputs: string[]) => Promise<void>;
|
package/apps/mcpApp.ts
CHANGED
|
@@ -50,14 +50,28 @@ export function McpApp({
|
|
|
50
50
|
inputSchema: z.object({
|
|
51
51
|
query: z.string()
|
|
52
52
|
.describe("Natural language search query"),
|
|
53
|
-
limit: z.number().int().min(1).max(50).default(
|
|
54
|
-
.describe(
|
|
53
|
+
limit: z.number().int().min(1).max(50).default(config.searchDefaultLimit)
|
|
54
|
+
.describe(`Max results to return default ${config.searchDefaultLimit}, max 50)`),
|
|
55
|
+
windowLines: z.number().int().min(1).default(config.maxLines).optional()
|
|
56
|
+
.describe("Optional line-window size for per-file snippet locating"),
|
|
57
|
+
includePaths: z.array(z.string()).optional()
|
|
58
|
+
.describe("Optional gitignore-style include patterns (e.g. 'src/**', '**/*.ts', '!src/vendor/**')"),
|
|
59
|
+
excludePaths: z.array(z.string()).optional()
|
|
60
|
+
.describe("Optional gitignore-style exclude patterns (e.g. 'node_modules/**', '**/*.test.ts')"),
|
|
61
|
+
scoreThreshold: z.number().min(0).max(1).optional()
|
|
62
|
+
.describe("Optional minimum semantic score threshold (0..1)"),
|
|
55
63
|
}),
|
|
56
64
|
annotations: {readOnlyHint: true},
|
|
57
|
-
}, async ({query, limit}) => {
|
|
65
|
+
}, async ({query, limit, scoreThreshold, windowLines, includePaths, excludePaths}) => {
|
|
58
66
|
try {
|
|
59
67
|
const format = FormatSearchResults();
|
|
60
|
-
const results = await search(query,
|
|
68
|
+
const results = await search(query, {
|
|
69
|
+
limit,
|
|
70
|
+
scoreThreshold,
|
|
71
|
+
windowLines,
|
|
72
|
+
includePaths,
|
|
73
|
+
excludePaths,
|
|
74
|
+
});
|
|
61
75
|
const text = await format(query, results);
|
|
62
76
|
return {content: [{type: "text" as const, text}]};
|
|
63
77
|
} catch (e) {
|
package/apps/run.mcp.ts
CHANGED
|
@@ -29,7 +29,7 @@ const indexApp = async (inputs: string[]) => {
|
|
|
29
29
|
await rawIndexApp(inputs);
|
|
30
30
|
await flush();
|
|
31
31
|
};
|
|
32
|
-
const search = SearchApp({searchContentIndex});
|
|
32
|
+
const search = SearchApp({searchIndex: searchContentIndex, searchDefaultLimit: config.searchDefaultLimit});
|
|
33
33
|
|
|
34
34
|
const appId = AppId();
|
|
35
35
|
const watcherLock = WatcherLock({
|
|
@@ -45,16 +45,22 @@ const watch = watchDisabled ? undefined : {
|
|
|
45
45
|
watcherLock,
|
|
46
46
|
};
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
let shuttingDown = false;
|
|
49
|
+
const shutdown = async (reason: string) => {
|
|
50
|
+
if (shuttingDown) return;
|
|
51
|
+
shuttingDown = true;
|
|
52
|
+
log(`shutdown (${reason}) — stopping heartbeat...`);
|
|
50
53
|
watcherLock.stopHeartbeat();
|
|
51
54
|
log(`releasing lock...`);
|
|
52
55
|
await watcherLock.release();
|
|
53
|
-
log(`waiting 7s for another watcher to take over...`);
|
|
54
|
-
await new Promise(r => setTimeout(r, 7000));
|
|
55
56
|
log(`exiting`);
|
|
56
57
|
process.exit(0);
|
|
57
|
-
}
|
|
58
|
+
};
|
|
59
|
+
process.on("SIGINT", () => shutdown("SIGINT"));
|
|
60
|
+
process.on("SIGTERM", () => shutdown("SIGTERM"));
|
|
61
|
+
process.on("SIGHUP", () => shutdown("SIGHUP"));
|
|
62
|
+
process.stdin.on("close", () => shutdown("stdin-close"));
|
|
63
|
+
process.stdin.on("end", () => shutdown("stdin-end"));
|
|
58
64
|
|
|
59
65
|
log(`[${appId}] started`);
|
|
60
66
|
const mcpApp = McpApp({search, indexApp, getIndexStats, resetIndex, log, watch, config});
|
package/apps/run.search.ts
CHANGED
|
@@ -4,8 +4,8 @@ import {SearchApp} from "./searchApp.js";
|
|
|
4
4
|
import {FormatSearchResults} from "../components/index/formatSearchResults.js";
|
|
5
5
|
|
|
6
6
|
const log = BufferedLoggerToStdOut();
|
|
7
|
-
const {searchContentIndex} = await BuildComponents({log});
|
|
8
|
-
const search = SearchApp({searchContentIndex});
|
|
7
|
+
const {searchContentIndex, config} = await BuildComponents({log});
|
|
8
|
+
const search = SearchApp({searchIndex: searchContentIndex, searchDefaultLimit: config.searchDefaultLimit});
|
|
9
9
|
|
|
10
10
|
const query = process.argv.slice(2).join(" ");
|
|
11
11
|
if (!query) {
|
package/apps/searchApp.ts
CHANGED
|
@@ -1,11 +1,23 @@
|
|
|
1
|
-
import {ISearchIndex,
|
|
1
|
+
import {IIndexRecord, ISearchIndex, ISearchIndexOptions} from "../features/searchIndex.js";
|
|
2
2
|
|
|
3
|
-
export type ISearchApp = (query: string,
|
|
3
|
+
export type ISearchApp = (query: string, options?: Partial<ISearchIndexOptions>) => Promise<IIndexRecord[]>;
|
|
4
4
|
|
|
5
|
-
export function SearchApp({
|
|
6
|
-
|
|
5
|
+
export function SearchApp({searchIndex, searchDefaultLimit = 7}: {
|
|
6
|
+
searchIndex: ISearchIndex;
|
|
7
|
+
searchDefaultLimit?: number;
|
|
7
8
|
}): ISearchApp {
|
|
8
|
-
return async function search(query,
|
|
9
|
-
|
|
9
|
+
return async function search(query, options = {}) {
|
|
10
|
+
const normalizedOptions: ISearchIndexOptions = {
|
|
11
|
+
limit: options.limit ?? searchDefaultLimit,
|
|
12
|
+
prefetchMultiplier: options.prefetchMultiplier,
|
|
13
|
+
scoreThreshold: options.scoreThreshold,
|
|
14
|
+
locateInFiles: options.locateInFiles,
|
|
15
|
+
windowLines: options.windowLines,
|
|
16
|
+
useKeywordsSearch: options.useKeywordsSearch,
|
|
17
|
+
includePaths: options.includePaths ?? [],
|
|
18
|
+
excludePaths: options.excludePaths ?? [],
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
return searchIndex({query, options: normalizedOptions});
|
|
10
22
|
}
|
|
11
23
|
}
|
package/apps/watchApp.ts
CHANGED
|
@@ -10,7 +10,7 @@ import {ILogger} from "../components/logger.js";
|
|
|
10
10
|
import {IWatcherLock} from "../components/index/watcherLock.js";
|
|
11
11
|
import {WatchFileEventsApp} from "./watchFileEventsApp.js";
|
|
12
12
|
import {IIndexContent} from "../features/indexContent.js";
|
|
13
|
-
import {INDEXING_BATCH_SIZE} from "../components/config/INDEXING_BATCH_SIZE";
|
|
13
|
+
import {INDEXING_BATCH_SIZE} from "../components/config/INDEXING_BATCH_SIZE.js";
|
|
14
14
|
import {SafeIndexBatch} from "../components/io/safeIndexBatch.js";
|
|
15
15
|
|
|
16
16
|
export type IWatchApp = {
|
|
@@ -23,6 +23,7 @@ export async function BuildComponents({log, watchCoalesceMs = 0, indexingCoalesc
|
|
|
23
23
|
extractKeywords,
|
|
24
24
|
cleanUpKeywords,
|
|
25
25
|
windowLines: config.maxLines,
|
|
26
|
+
embedConcurrency: config.searchConcurrency,
|
|
26
27
|
maxFileBytes: config.maxFileBytes,
|
|
27
28
|
});
|
|
28
29
|
|
|
@@ -38,6 +39,7 @@ export async function BuildComponents({log, watchCoalesceMs = 0, indexingCoalesc
|
|
|
38
39
|
cleanUpKeywords,
|
|
39
40
|
locateInFile,
|
|
40
41
|
scoreThreshold: SCORE_THRESHOLD,
|
|
42
|
+
searchConcurrency: config.searchConcurrency,
|
|
41
43
|
log,
|
|
42
44
|
indexingWatchCoalesceMs: watchCoalesceMs,
|
|
43
45
|
indexingCoalesceMs,
|
|
@@ -3,6 +3,8 @@ import {IXindexConfig} from "./xindexConfig.js";
|
|
|
3
3
|
import {ILogger} from "../logger.js";
|
|
4
4
|
|
|
5
5
|
const DEFAULT_MAX_LINES = 30;
|
|
6
|
+
const DEFAULT_SEARCH_DEFAULT_LIMIT = 7;
|
|
7
|
+
const DEFAULT_SEARCH_CONCURRENCY = 4;
|
|
6
8
|
const DEFAULT_MAX_FILE_BYTES = 50_000;
|
|
7
9
|
const DEFAULT_FOLLOW_SYMLINKS = false;
|
|
8
10
|
const DEFAULT_IGNORE_FILES = ['.xindex', 'node_modules'];
|
|
@@ -11,6 +13,8 @@ const DEFAULTS: IXindexConfig = {
|
|
|
11
13
|
ignoreKeywords: [],
|
|
12
14
|
ignoreFiles: DEFAULT_IGNORE_FILES,
|
|
13
15
|
maxLines: DEFAULT_MAX_LINES,
|
|
16
|
+
searchDefaultLimit: DEFAULT_SEARCH_DEFAULT_LIMIT,
|
|
17
|
+
searchConcurrency: DEFAULT_SEARCH_CONCURRENCY,
|
|
14
18
|
maxFileBytes: DEFAULT_MAX_FILE_BYTES,
|
|
15
19
|
followSymlinks: DEFAULT_FOLLOW_SYMLINKS,
|
|
16
20
|
};
|
|
@@ -45,6 +49,8 @@ export function LoadConfig({configPath, log}: { configPath: string, log: ILogger
|
|
|
45
49
|
ignoreKeywords: toStrings(parsed.ignoreKeywords),
|
|
46
50
|
ignoreFiles: toStrings(parsed.ignoreFiles),
|
|
47
51
|
maxLines: toNum(parsed.maxLines, DEFAULT_MAX_LINES),
|
|
52
|
+
searchDefaultLimit: Math.max(1, toNum(parsed.searchDefaultLimit, DEFAULT_SEARCH_DEFAULT_LIMIT)),
|
|
53
|
+
searchConcurrency: Math.max(1, toNum(parsed.searchConcurrency, DEFAULT_SEARCH_CONCURRENCY)),
|
|
48
54
|
maxFileBytes: toNum(parsed.maxFileBytes, DEFAULT_MAX_FILE_BYTES),
|
|
49
55
|
followSymlinks: typeof parsed.followSymlinks === "boolean" ? parsed.followSymlinks : DEFAULT_FOLLOW_SYMLINKS,
|
|
50
56
|
};
|
|
@@ -23,7 +23,7 @@ export type IContentIndexDriver = Readonly<{
|
|
|
23
23
|
}>;
|
|
24
24
|
|
|
25
25
|
export async function ContentIndexDriver({
|
|
26
|
-
path, embed, extractKeywords, cleanUpKeywords, locateInFile, scoreThreshold, log,
|
|
26
|
+
path, embed, extractKeywords, cleanUpKeywords, locateInFile, scoreThreshold, searchConcurrency, log,
|
|
27
27
|
indexingWatchCoalesceMs = 0, indexingCoalesceMs = 0,
|
|
28
28
|
}: {
|
|
29
29
|
path: string,
|
|
@@ -32,6 +32,7 @@ export async function ContentIndexDriver({
|
|
|
32
32
|
cleanUpKeywords: ICleanUpKeywords,
|
|
33
33
|
locateInFile: ILocateInFile,
|
|
34
34
|
scoreThreshold: number,
|
|
35
|
+
searchConcurrency: number,
|
|
35
36
|
log: ILogger,
|
|
36
37
|
indexingWatchCoalesceMs?: number,
|
|
37
38
|
indexingCoalesceMs?: number,
|
|
@@ -62,7 +63,7 @@ export async function ContentIndexDriver({
|
|
|
62
63
|
indexContentWatch: IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi: watchCoalesce, log}),
|
|
63
64
|
indexContentBatch: IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi: batchCoalesce, log}),
|
|
64
65
|
removeContent: RemoveContent({indexApi: drainingApi}),
|
|
65
|
-
searchContentIndex: SearchIndex({extractKeywords, cleanUpKeywords, embed, index, locateInFile, scoreThreshold}),
|
|
66
|
+
searchContentIndex: SearchIndex({extractKeywords, cleanUpKeywords, embed, index, locateInFile, scoreThreshold, searchConcurrency}),
|
|
66
67
|
resetIndex: ResetIndex({indexApi: drainingApi}),
|
|
67
68
|
flush: flushAll,
|
|
68
69
|
};
|
|
@@ -13,7 +13,16 @@ export function CleanUpKeywords({maxNgrams, minLength, ignoreKeywords = []}: {
|
|
|
13
13
|
const ignoreSet = new Set(ignoreKeywords.map(normalize));
|
|
14
14
|
|
|
15
15
|
return function cleanUpKeywords(keywords) {
|
|
16
|
-
const
|
|
16
|
+
const joined = keywords.join(" ");
|
|
17
|
+
const expanded = joined + " " + splitIdentifiers(joined);
|
|
18
|
+
const tokens = expanded.replace(/[\W_]+/gm, " ").trim().split(/\s+/);
|
|
19
|
+
const seenTokens = new Set<string>();
|
|
20
|
+
const cleaned = tokens.filter(t => {
|
|
21
|
+
const k = t.toLowerCase();
|
|
22
|
+
if (!k || seenTokens.has(k)) return false;
|
|
23
|
+
seenTokens.add(k);
|
|
24
|
+
return true;
|
|
25
|
+
}).join(" ");
|
|
17
26
|
|
|
18
27
|
const extracted: string[] = keyword_extractor.extract(cleaned, {
|
|
19
28
|
language: "english",
|
|
@@ -25,14 +34,25 @@ export function CleanUpKeywords({maxNgrams, minLength, ignoreKeywords = []}: {
|
|
|
25
34
|
|
|
26
35
|
const seen = new Set<string>();
|
|
27
36
|
|
|
28
|
-
|
|
37
|
+
const output = extracted
|
|
29
38
|
.map(normalize)
|
|
30
39
|
.filter((kw: string) => {
|
|
31
|
-
if (kw.length <= minLength || !/[a-z]/i.test(kw)) return false;
|
|
40
|
+
// if (kw.length <= minLength || !/[a-z]/i.test(kw)) return false;
|
|
41
|
+
if (kw.length <= minLength) return false;
|
|
32
42
|
if (ignoreSet.has(kw)) return false;
|
|
33
43
|
if (seen.has(kw)) return false;
|
|
34
44
|
seen.add(kw);
|
|
35
45
|
return true;
|
|
36
46
|
});
|
|
47
|
+
|
|
48
|
+
// console.log(`CleanUpKeywords: input=${JSON.stringify(keywords, null, 2)} output=${JSON.stringify(output, null, 2)}`);
|
|
49
|
+
return output;
|
|
37
50
|
}
|
|
38
51
|
}
|
|
52
|
+
|
|
53
|
+
function splitIdentifiers(text: string): string {
|
|
54
|
+
return text
|
|
55
|
+
.replace(/[_\-.]+/g, " ")
|
|
56
|
+
.replace(/([a-z0-9])([A-Z])/g, "$1 $2")
|
|
57
|
+
.replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2");
|
|
58
|
+
}
|
|
@@ -6,20 +6,20 @@ import {IIndexRecord} from "../../features/searchIndex.js";
|
|
|
6
6
|
import {IInMemoryIndex, InMemoryIndex} from "./inMemoryIndex.js";
|
|
7
7
|
import {IWindow, windowsOf} from "./windowsOf.js";
|
|
8
8
|
import {Bm25, IBm25Doc, tokenizeForBm25} from "./bm25.js";
|
|
9
|
+
import {DEFAULT_LOCATE_BATCH_SIZE} from "../config/DEFAULT_LOCATE_BATCH_SIZE.js";
|
|
9
10
|
import {from} from "../../packages/streamx/src/from.js";
|
|
10
11
|
import {filter} from "../../packages/streamx/src/filter.js";
|
|
11
12
|
import {map} from "../../packages/streamx/src/map.js";
|
|
12
13
|
import {flatMap} from "../../packages/streamx/src/flatMap.js";
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
15
|
-
import {DEFAULT_LOCATE_BATCH_SIZE} from "../config/DEFAULT_LOCATE_BATCH_SIZE";
|
|
16
|
-
import {scaleSync} from "../../packages/streamx/src/scaleSync";
|
|
14
|
+
import {scaleSync} from "../../packages/streamx/src/scaleSync.js";
|
|
15
|
+
import {toArray} from "../../packages/streamx/src/toArray.js";
|
|
17
16
|
|
|
18
17
|
export type ILocateInFile = (
|
|
19
18
|
query: string,
|
|
20
19
|
queryVector: number[],
|
|
21
20
|
candidates: IIndexRecord[],
|
|
22
21
|
limit: number,
|
|
22
|
+
windowLinesOverride?: number,
|
|
23
23
|
) => Promise<IIndexRecord[]>;
|
|
24
24
|
|
|
25
25
|
export function LocateInFile({
|
|
@@ -41,67 +41,70 @@ export function LocateInFile({
|
|
|
41
41
|
type IWindowWithKeywords = IWindow & { keywords: string };
|
|
42
42
|
type IWindowWithVector = IWindowWithKeywords & { vector: number[] };
|
|
43
43
|
|
|
44
|
-
return async function locateInFile(query, queryVector, candidates, limit) {
|
|
44
|
+
return async function locateInFile(query, queryVector, candidates, limit, windowLinesOverride) {
|
|
45
|
+
const effectiveWindowLines = windowLinesOverride ?? windowLines;
|
|
46
|
+
const effectiveConcurrency = Math.max(1, embedConcurrency);
|
|
45
47
|
const memIndex: IInMemoryIndex = InMemoryIndex({dimensions: queryVector.length});
|
|
46
48
|
|
|
47
49
|
try {
|
|
48
|
-
const ids =
|
|
50
|
+
const ids: string[] = await toArray(
|
|
51
|
+
from<IIndexRecord>(candidates)
|
|
52
|
+
.pipe(scaleSync<IIndexRecord, string | null>(effectiveConcurrency, async (c) => {
|
|
53
|
+
try {
|
|
54
|
+
const s = await stat(c.id);
|
|
55
|
+
if (!s.isFile()) return null;
|
|
56
|
+
if (s.size > maxFileBytes) return null;
|
|
57
|
+
return c.id;
|
|
58
|
+
} catch {
|
|
59
|
+
return null;
|
|
60
|
+
}
|
|
61
|
+
}))
|
|
62
|
+
.pipe(filter((id: string | null) => id !== null))
|
|
63
|
+
.pipe(map<string | null, string>((id) => id as string)),
|
|
64
|
+
);
|
|
49
65
|
const bm25Docs: IBm25Doc[] = [];
|
|
50
66
|
const metaById = new Map<string, { fileId: string; startLine: number; endLine: number; snippet: string }>();
|
|
51
67
|
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
const
|
|
67
|
-
return {
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
return windowsOf({text: r!.text, id: r!.id, windowLines});
|
|
75
|
-
}));
|
|
68
|
+
const withVectors: IWindowWithVector[] = await toArray(
|
|
69
|
+
from<string>(ids)
|
|
70
|
+
.pipe(scaleSync<string, {id: string, text: string} | null>(effectiveConcurrency, async (id) => {
|
|
71
|
+
try {
|
|
72
|
+
const text = await readFile(id, "utf8");
|
|
73
|
+
return {id, text};
|
|
74
|
+
} catch {
|
|
75
|
+
return null;
|
|
76
|
+
}
|
|
77
|
+
}))
|
|
78
|
+
.pipe(filter((r: {id: string, text: string} | null) => r !== null))
|
|
79
|
+
.pipe(flatMap((r: {id: string, text: string} | null): IWindow[] =>
|
|
80
|
+
windowsOf({text: r!.text, id: r!.id, windowLines: effectiveWindowLines})))
|
|
81
|
+
.pipe(map<IWindow, IWindowWithKeywords>((w: IWindow) => {
|
|
82
|
+
const kw = cleanUpKeywords(extractKeywords(w.snippet)).join(", ");
|
|
83
|
+
return {...w, keywords: kw || w.snippet.slice(0, 200)};
|
|
84
|
+
}))
|
|
85
|
+
.pipe(scaleSync<IWindowWithKeywords, IWindowWithVector>(effectiveConcurrency, async (w) => {
|
|
86
|
+
const vector = await embed(w.keywords);
|
|
87
|
+
return {...w, vector};
|
|
88
|
+
})),
|
|
89
|
+
);
|
|
76
90
|
|
|
77
|
-
const
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
);
|
|
93
|
-
bm25Docs.push({id, tokens: tokenizeForBm25(w.snippet)});
|
|
94
|
-
metaById.set(id, {
|
|
95
|
-
fileId: w.fileId,
|
|
96
|
-
startLine: w.startLine,
|
|
97
|
-
endLine: w.endLine,
|
|
98
|
-
snippet: w.snippet
|
|
99
|
-
});
|
|
100
|
-
}));
|
|
101
|
-
|
|
102
|
-
await run(withVectors);
|
|
91
|
+
for (const w of withVectors) {
|
|
92
|
+
const id = `${w.fileId}:${w.startLine}:${w.endLine}`;
|
|
93
|
+
await memIndex.upsertItem(
|
|
94
|
+
id,
|
|
95
|
+
w.vector,
|
|
96
|
+
{fileId: w.fileId, startLine: w.startLine, endLine: w.endLine, snippet: w.snippet},
|
|
97
|
+
);
|
|
98
|
+
bm25Docs.push({id, tokens: tokenizeForBm25(w.snippet)});
|
|
99
|
+
metaById.set(id, {
|
|
100
|
+
fileId: w.fileId,
|
|
101
|
+
startLine: w.startLine,
|
|
102
|
+
endLine: w.endLine,
|
|
103
|
+
snippet: w.snippet,
|
|
104
|
+
});
|
|
105
|
+
}
|
|
103
106
|
|
|
104
|
-
const poolSize = Math.
|
|
107
|
+
const poolSize = Math.min(limit * 10, 100);
|
|
105
108
|
const vecHits = await memIndex.query(queryVector, query, poolSize);
|
|
106
109
|
|
|
107
110
|
const bm25 = Bm25({docs: bm25Docs});
|
|
@@ -151,3 +154,4 @@ export function LocateInFile({
|
|
|
151
154
|
}
|
|
152
155
|
};
|
|
153
156
|
}
|
|
157
|
+
|
|
@@ -12,7 +12,9 @@ export function windowsOf({text, id, windowLines}: {
|
|
|
12
12
|
}): IWindow[] {
|
|
13
13
|
const lines = text.split("\n");
|
|
14
14
|
const windows: IWindow[] = [];
|
|
15
|
-
const
|
|
15
|
+
const a12 = Math.max(1, Math.floor(windowLines / 2));
|
|
16
|
+
// const a13 = Math.max(1, Math.floor(windowLines / 3));
|
|
17
|
+
const scales = [a12, windowLines];
|
|
16
18
|
for (const size of scales) {
|
|
17
19
|
const step = Math.max(1, Math.floor(size / 2));
|
|
18
20
|
for (let i = 0; i < lines.length; i += step) {
|
package/features/searchIndex.ts
CHANGED
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import {LocalIndex} from "vectra";
|
|
2
|
+
import ignore from "ignore";
|
|
2
3
|
import {IEmbed} from "../components/llm/embed.js";
|
|
3
4
|
import {IExtractKeywords} from "../components/keywords/extractKeywords.js";
|
|
4
5
|
import {ICleanUpKeywords} from "../components/keywords/cleanUpKeywords.js";
|
|
5
6
|
import {ILocateInFile} from "../components/locate/locateInFile.js";
|
|
7
|
+
import {Concurrency} from "../packages/fun/src/concurrency.js";
|
|
6
8
|
|
|
7
9
|
export type IIndexRecord = {
|
|
8
10
|
score: number;
|
|
@@ -15,25 +17,66 @@ export type IIndexRecord = {
|
|
|
15
17
|
endLine?: number;
|
|
16
18
|
};
|
|
17
19
|
|
|
18
|
-
export type
|
|
20
|
+
export type ISearchIndexOptions = {
|
|
21
|
+
limit: number;
|
|
22
|
+
prefetchMultiplier?: number;
|
|
23
|
+
scoreThreshold?: number;
|
|
24
|
+
locateInFiles?: boolean;
|
|
25
|
+
windowLines?: number;
|
|
26
|
+
useKeywordsSearch?: boolean;
|
|
27
|
+
includePaths?: string[];
|
|
28
|
+
excludePaths?: string[];
|
|
29
|
+
};
|
|
30
|
+
|
|
31
|
+
export type ISearchIndexQuery = Readonly<{
|
|
32
|
+
query: string;
|
|
33
|
+
options: ISearchIndexOptions;
|
|
34
|
+
}>;
|
|
35
|
+
export type ISearchIndex = (query: ISearchIndexQuery) => Promise<IIndexRecord[]>;
|
|
19
36
|
|
|
20
|
-
export function SearchIndex({
|
|
37
|
+
export function SearchIndex({
|
|
38
|
+
extractKeywords,
|
|
39
|
+
cleanUpKeywords,
|
|
40
|
+
embed,
|
|
41
|
+
index,
|
|
42
|
+
locateInFile,
|
|
43
|
+
searchConcurrency = 3,
|
|
44
|
+
scoreThreshold = 0.05
|
|
45
|
+
}: {
|
|
21
46
|
extractKeywords: IExtractKeywords,
|
|
22
47
|
cleanUpKeywords: ICleanUpKeywords,
|
|
23
48
|
embed: IEmbed,
|
|
24
49
|
index: LocalIndex,
|
|
25
50
|
locateInFile: ILocateInFile,
|
|
26
51
|
scoreThreshold: number
|
|
52
|
+
searchConcurrency?: number
|
|
27
53
|
}): ISearchIndex {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
54
|
+
const worker: ISearchIndex = async function searchContentIndex({
|
|
55
|
+
query, options: {
|
|
56
|
+
limit,
|
|
57
|
+
prefetchMultiplier = 3,
|
|
58
|
+
scoreThreshold: threshold = scoreThreshold,
|
|
59
|
+
locateInFiles = true,
|
|
60
|
+
windowLines,
|
|
61
|
+
useKeywordsSearch = false,
|
|
62
|
+
includePaths = [],
|
|
63
|
+
excludePaths = [],
|
|
64
|
+
}
|
|
65
|
+
}: ISearchIndexQuery) {
|
|
66
|
+
const keywords = useKeywordsSearch ? cleanUpKeywords(extractKeywords(query)) : [];
|
|
67
|
+
const searchText = useKeywordsSearch && keywords.length > 0 ? keywords.join(", ") : query;
|
|
31
68
|
const vector = await embed(searchText);
|
|
69
|
+
const includeMatcher = createGitignoreMatcher(includePaths);
|
|
70
|
+
const excludeMatcher = createGitignoreMatcher(excludePaths);
|
|
32
71
|
|
|
33
|
-
const
|
|
72
|
+
const expectedMultiplier = limit * prefetchMultiplier;
|
|
73
|
+
const pathPatternBoost = Math.max(1, includePaths.length + excludePaths.length);
|
|
74
|
+
const results = await index.queryItems(vector, searchText, expectedMultiplier + pathPatternBoost);
|
|
34
75
|
|
|
35
76
|
const candidates = results
|
|
36
|
-
.filter(r => r.score >=
|
|
77
|
+
.filter(r => r.score >= threshold)
|
|
78
|
+
.filter(r => includePaths.length === 0 || includeMatcher(r.item.id))
|
|
79
|
+
.filter(r => excludePaths.length === 0 || !excludeMatcher(r.item.id))
|
|
37
80
|
.sort((a, b) => b.score - a.score)
|
|
38
81
|
.map(r => ({
|
|
39
82
|
score: r.score,
|
|
@@ -41,10 +84,29 @@ export function SearchIndex({extractKeywords, cleanUpKeywords, embed, index, loc
|
|
|
41
84
|
keywords: typeof r.item.metadata?.keywords === "string" ? r.item.metadata.keywords : "",
|
|
42
85
|
}));
|
|
43
86
|
|
|
44
|
-
if (!locateInFile || candidates.length === 0) {
|
|
87
|
+
if (!locateInFile || !locateInFiles || candidates.length === 0) {
|
|
45
88
|
return candidates;
|
|
46
89
|
}
|
|
47
90
|
|
|
48
|
-
return locateInFile(query, vector, candidates, limit);
|
|
91
|
+
return locateInFile(query, vector, candidates, limit, windowLines);
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
const control = Concurrency(searchConcurrency, worker);
|
|
95
|
+
|
|
96
|
+
return async (input) => {
|
|
97
|
+
const resolve = await control(input);
|
|
98
|
+
return await resolve();
|
|
99
|
+
};
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
function createGitignoreMatcher(patterns: string[]): (path: string) => boolean {
|
|
103
|
+
const ig = ignore();
|
|
104
|
+
for (const pattern of patterns) {
|
|
105
|
+
ig.add(pattern);
|
|
49
106
|
}
|
|
107
|
+
return (path: string) => ig.ignores(normalizePath(path));
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
function normalizePath(path: string): string {
|
|
111
|
+
return path.replaceAll("\\", "/").replace(/^\.\/+/, "");
|
|
50
112
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xindex",
|
|
3
|
-
"version": "1.0.17",
|
|
3
|
+
"version": "1.0.19",
|
|
4
4
|
"description": "Local semantic code search — index codebase, search by meaning or keywords",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "xindex.ts",
|
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
"mcp": "tsx apps/run.mcp.ts",
|
|
22
22
|
"watch": "tsx apps/run.watch.ts",
|
|
23
23
|
"test.functional": "bash test/functional.sh",
|
|
24
|
-
"test.compilation": "
|
|
24
|
+
"test.compilation": "tsc --ignoreConfig --noEmit --target ES2022 --module ESNext --moduleResolution bundler --esModuleInterop --skipLibCheck --strict false $(git ls-files '*.ts') && tsc -p tsconfig.json --noEmit",
|
|
25
25
|
"test.npx": "docker run --rm -it -w /tmp node:22 bash -c 'npm i -g xindex && xindex-index tsx-0 && xindex-search streamx map | grep \"await mapper\" && which xindex | grep bin/xindex' ",
|
|
26
26
|
"console": "docker run --rm -it -v \"$PWD:/app\" -w /app node:22 bash"
|
|
27
27
|
},
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { Counter, ICounter } from './counter';
|
|
2
|
-
import { Defer, IDefer } from './defer';
|
|
3
|
-
import { IValue, Value } from './value';
|
|
4
|
-
import { waitForCounter, waitForZeroCounter } from './waitForCounter';
|
|
1
|
+
import { Counter, ICounter } from './counter.js';
|
|
2
|
+
import { Defer, IDefer } from './defer.js';
|
|
3
|
+
import { IValue, Value } from './value.js';
|
|
4
|
+
import { waitForCounter, waitForZeroCounter } from './waitForCounter.js';
|
|
5
5
|
|
|
6
6
|
export type IStopKeepConcurrency = () => Promise<void>;
|
|
7
7
|
|
|
@@ -168,7 +168,7 @@ export function KeyedConcurrency<Input, Output = any>(
|
|
|
168
168
|
const keyedControl = Concurrency<Input>(workerConcurrency, worker);
|
|
169
169
|
registry[key] = keyedControl;
|
|
170
170
|
|
|
171
|
-
const unsubscribe = keyedControl.quantity.subscribe(value => {
|
|
171
|
+
const unsubscribe = keyedControl.quantity.subscribe((value: number) => {
|
|
172
172
|
if (value <= 0) {
|
|
173
173
|
unsubscribe();
|
|
174
174
|
|