xindex 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.tmp/watch.log +14 -0
- package/.xindex.json +91 -19
- package/README.md +5 -1
- package/apps/indexApp.ts +5 -5
- package/apps/run.index.ts +5 -3
- package/apps/run.mcp.ts +18 -8
- package/apps/run.watch.ts +10 -4
- package/apps/watchApp.ts +5 -5
- package/componets/appId.ts +1 -1
- package/componets/buildComponents.ts +15 -5
- package/componets/config/DEFAULT_LOCATE_BATCH_SIZE.ts +1 -1
- package/componets/config/INDEXING_BATCH_SIZE.ts +1 -1
- package/componets/config/INDEXING_COALESCE_MAX_ITEMS.ts +1 -0
- package/componets/config/INDEXING_COALESCE_MS.ts +1 -0
- package/componets/config/WATCH_COALESCE_MS.ts +1 -0
- package/componets/config/WATCH_FLUSH_MS.ts +1 -1
- package/componets/config/loadConfig.ts +12 -2
- package/componets/config/xindexConfig.ts +1 -0
- package/componets/index/coalesceIndexApi.ts +74 -0
- package/componets/index/contentIndexDriver.ts +36 -8
- package/componets/index/documentContentIndexDriver.ts +2 -1
- package/componets/index/handleFileEvent.ts +14 -34
- package/componets/index/indexApi.ts +51 -22
- package/componets/io/safeIndexBatch.ts +30 -0
- package/componets/io/safeReadFile.ts +36 -0
- package/componets/llm/embed.ts +11 -5
- package/componets/locate/locateInFile.ts +11 -6
- package/componets/logger.ts +15 -12
- package/componets/walkFiles.ts +34 -3
- package/features/indexContent.ts +32 -7
- package/package.json +7 -1
package/.tmp/watch.log
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
npm warn Unknown env config "version-git-tag". This will stop working in the next major version of npm.
|
|
2
|
+
npm warn Unknown env config "argv". This will stop working in the next major version of npm.
|
|
3
|
+
npm warn Unknown env config "version-commit-hooks". This will stop working in the next major version of npm.
|
|
4
|
+
npm warn Unknown env config "version-git-message". This will stop working in the next major version of npm.
|
|
5
|
+
npm warn Unknown env config "version-tag-prefix". This will stop working in the next major version of npm.
|
|
6
|
+
[xindex-1776991372027-99a3bfc69e73f] started, indexing and watching: .
|
|
7
|
+
indexing: notes.md
|
|
8
|
+
indexing: util.ts
|
|
9
|
+
initial index complete, acquiring watcher lock...
|
|
10
|
+
done. indexed: {"indexedAmount":0}
|
|
11
|
+
watching for changes: .
|
|
12
|
+
index: wombat.md
|
|
13
|
+
remove: functional
|
|
14
|
+
index end failed: Error saving index: Error: ENOENT: no such file or directory, open '.xindex/semantic/index.json'
|
package/.xindex.json
CHANGED
|
@@ -1,23 +1,95 @@
|
|
|
1
1
|
{
|
|
2
2
|
"ignoreKeywords": [
|
|
3
|
-
"import",
|
|
4
|
-
"
|
|
5
|
-
"
|
|
6
|
-
"
|
|
7
|
-
"
|
|
8
|
-
"
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
"
|
|
3
|
+
"import",
|
|
4
|
+
"export",
|
|
5
|
+
"const",
|
|
6
|
+
"function",
|
|
7
|
+
"return",
|
|
8
|
+
"async",
|
|
9
|
+
"await",
|
|
10
|
+
"type",
|
|
11
|
+
"string",
|
|
12
|
+
"number",
|
|
13
|
+
"from",
|
|
14
|
+
"default",
|
|
15
|
+
"let",
|
|
16
|
+
"var",
|
|
17
|
+
"if",
|
|
18
|
+
"else",
|
|
19
|
+
"switch",
|
|
20
|
+
"case",
|
|
21
|
+
"break",
|
|
22
|
+
"continue",
|
|
23
|
+
"for",
|
|
24
|
+
"of",
|
|
25
|
+
"in",
|
|
26
|
+
"try",
|
|
27
|
+
"catch",
|
|
28
|
+
"finally",
|
|
29
|
+
"throw",
|
|
30
|
+
"new",
|
|
31
|
+
"delete",
|
|
32
|
+
"as",
|
|
33
|
+
"is",
|
|
34
|
+
"typeof",
|
|
35
|
+
"instanceof",
|
|
36
|
+
"void",
|
|
37
|
+
"this",
|
|
38
|
+
"super",
|
|
39
|
+
"interface",
|
|
40
|
+
"class",
|
|
41
|
+
"extends",
|
|
42
|
+
"implements",
|
|
43
|
+
"public",
|
|
44
|
+
"private",
|
|
45
|
+
"protected",
|
|
46
|
+
"static",
|
|
47
|
+
"abstract",
|
|
48
|
+
"readonly",
|
|
49
|
+
"declare",
|
|
50
|
+
"namespace",
|
|
51
|
+
"module",
|
|
52
|
+
"enum",
|
|
53
|
+
"boolean",
|
|
54
|
+
"any",
|
|
55
|
+
"unknown",
|
|
56
|
+
"never",
|
|
57
|
+
"undefined",
|
|
58
|
+
"null",
|
|
59
|
+
"object",
|
|
60
|
+
"record",
|
|
61
|
+
"array",
|
|
62
|
+
"readonlyarray",
|
|
63
|
+
"promise",
|
|
64
|
+
"true",
|
|
65
|
+
"false",
|
|
66
|
+
"src",
|
|
67
|
+
"packages",
|
|
68
|
+
"fun",
|
|
69
|
+
"componets",
|
|
70
|
+
"streamx",
|
|
71
|
+
"apps",
|
|
72
|
+
"run",
|
|
73
|
+
"rnd",
|
|
74
|
+
"tsx",
|
|
75
|
+
"utf8",
|
|
76
|
+
"length",
|
|
77
|
+
"map",
|
|
78
|
+
"slice",
|
|
79
|
+
"push",
|
|
80
|
+
"join",
|
|
81
|
+
"resolve",
|
|
82
|
+
"stringify",
|
|
83
|
+
"json",
|
|
84
|
+
"settimeout",
|
|
85
|
+
"path",
|
|
86
|
+
"readfile"
|
|
20
87
|
],
|
|
21
|
-
"ignoreFiles": [
|
|
22
|
-
|
|
88
|
+
"ignoreFiles": [
|
|
89
|
+
".xindex",
|
|
90
|
+
"media"
|
|
91
|
+
],
|
|
92
|
+
"maxLines": 30,
|
|
93
|
+
"maxFileBytes": 100000,
|
|
94
|
+
"followSymlinks": false
|
|
23
95
|
}
|
package/README.md
CHANGED
|
@@ -6,6 +6,8 @@ Local semantic code search for your codebase — plus an MCP server so Claude Co
|
|
|
6
6
|
|
|
7
7
|
## Install
|
|
8
8
|
|
|
9
|
+
Requires **Node ≥22**.
|
|
10
|
+
|
|
9
11
|
```bash
|
|
10
12
|
npm i -g xindex
|
|
11
13
|
```
|
|
@@ -47,6 +49,7 @@ Open the project in Claude Code — it picks up the xindex MCP server and can ca
|
|
|
47
49
|
- **Watch mode** — keeps the index warm while you code
|
|
48
50
|
- **Gitignore-aware** — respects `.gitignore` + custom ignore rules
|
|
49
51
|
- **Zero config** — works with defaults; `.xindex.json` is optional
|
|
52
|
+
- **Tolerant** — tolerates unreadable files, oversize files, empty files, binary files, and symlinks; each is skipped with a log line so the run always finishes
|
|
50
53
|
|
|
51
54
|
## Claude Code skills (`@xi`)
|
|
52
55
|
|
|
@@ -171,7 +174,8 @@ Project-root file. All fields optional; unknown keys ignored; missing/empty →
|
|
|
171
174
|
- **`ignoreKeywords`** — `string[]`, default `[]`. Tokens stripped before embedding — add project slang/boilerplate polluting results. Entries ≤1 char warn.
|
|
172
175
|
- **`ignoreFiles`** — `string[]`, default `[]`. Extra globs excluded during walk/watch, on top of `.gitignore` — add vendored/generated folders.
|
|
173
176
|
- **`maxLines`** — `number`, default `30`. Lines per chunk — tune if chunks feel over/under-sized.
|
|
174
|
-
- **`maxFileBytes`** — `number`, default `
|
|
177
|
+
- **`maxFileBytes`** — `number`, default `50000`. Skip files over this (50 KB) — raise to index larger generated files.
|
|
178
|
+
- **`followSymlinks`** — `boolean`, default `false`. When `false`, symbolic links encountered during walk/watch are skipped with a log line. Set `true` to follow them (cycles are broken via `realpath` dedup).
|
|
175
179
|
|
|
176
180
|
```json
|
|
177
181
|
{
|
package/apps/indexApp.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import {readFile} from "fs/promises";
|
|
2
1
|
import {from} from "../packages/streamx/src/from.js";
|
|
3
2
|
import {batch} from "../packages/streamx/src/batch.js";
|
|
4
3
|
import {map} from "../packages/streamx/src/map.js";
|
|
@@ -8,23 +7,24 @@ import {IWalkFiles} from "../componets/walkFiles.js";
|
|
|
8
7
|
import {IIndexContent} from "../features/indexContent.js";
|
|
9
8
|
import {ILogger} from "../componets/logger.js";
|
|
10
9
|
import {INDEXING_BATCH_SIZE} from "../componets/config/INDEXING_BATCH_SIZE";
|
|
10
|
+
import {SafeIndexBatch} from "../componets/io/safeIndexBatch.js";
|
|
11
11
|
|
|
12
12
|
export type IIndexApp = (inputs: string[]) => Promise<void>;
|
|
13
13
|
|
|
14
|
-
export function IndexApp({walkFiles, indexContent, log}: {
|
|
14
|
+
export function IndexApp({walkFiles, indexContent, log, maxFileBytes}: {
|
|
15
15
|
walkFiles: IWalkFiles,
|
|
16
16
|
indexContent: IIndexContent,
|
|
17
17
|
log: ILogger,
|
|
18
|
+
maxFileBytes: number,
|
|
18
19
|
}): IIndexApp {
|
|
20
|
+
const safeIndexBatch = SafeIndexBatch({indexContent, log, maxFileBytes});
|
|
19
21
|
return async function indexApp(inputs) {
|
|
20
22
|
await run(
|
|
21
23
|
from(walkFiles(inputs))
|
|
22
24
|
.pipe(tap(id => log(`indexing: ${id}`)))
|
|
23
25
|
.pipe(batch(INDEXING_BATCH_SIZE))
|
|
24
26
|
.pipe(map<string[], string[]>(async (ids) => {
|
|
25
|
-
|
|
26
|
-
const items = ids.map((id, i) => ({id, content: `${texts[i]}. ${id}`}));
|
|
27
|
-
await indexContent(items);
|
|
27
|
+
await safeIndexBatch(ids);
|
|
28
28
|
return ids;
|
|
29
29
|
}))
|
|
30
30
|
);
|
package/apps/run.index.ts
CHANGED
|
@@ -3,17 +3,19 @@ import {BufferedLoggerToStdOut} from "../componets/logger.js";
|
|
|
3
3
|
import {WalkFiles} from "../componets/walkFiles.js";
|
|
4
4
|
import {IndexApp} from "./indexApp.js";
|
|
5
5
|
import {AppId} from "../componets/appId.js";
|
|
6
|
+
import {INDEXING_COALESCE_MS} from "../componets/config/INDEXING_COALESCE_MS.js";
|
|
6
7
|
|
|
7
8
|
const appId = AppId();
|
|
8
9
|
const cwd = process.cwd();
|
|
9
10
|
const log = BufferedLoggerToStdOut();
|
|
10
|
-
const {
|
|
11
|
-
const walkFiles = WalkFiles({cwd, log, ignoreFiles: config.ignoreFiles});
|
|
12
|
-
const indexApp = IndexApp({walkFiles, indexContent, log});
|
|
11
|
+
const {indexContentBatch, getIndexStats, config, flush} = await BuildComponents({log, indexingCoalesceMs: INDEXING_COALESCE_MS});
|
|
12
|
+
const walkFiles = WalkFiles({cwd, log, ignoreFiles: config.ignoreFiles, followSymlinks: config.followSymlinks});
|
|
13
|
+
const indexApp = IndexApp({walkFiles, indexContent: indexContentBatch, log, maxFileBytes: config.maxFileBytes});
|
|
13
14
|
|
|
14
15
|
const inputs = process.argv.slice(2);
|
|
15
16
|
if (!inputs.length) inputs.push(".");
|
|
16
17
|
|
|
17
18
|
log(`[${appId}] started, indexing: ${inputs.join(", ")}`);
|
|
18
19
|
await indexApp(inputs);
|
|
20
|
+
await flush();
|
|
19
21
|
log(`[${appId}] done:`, await getIndexStats());
|
package/apps/run.mcp.ts
CHANGED
|
@@ -9,6 +9,8 @@ import {McpApp} from "./mcpApp.js";
|
|
|
9
9
|
import {SearchApp} from "./searchApp.js";
|
|
10
10
|
import {join} from "path";
|
|
11
11
|
import {AppId} from "../componets/appId.js";
|
|
12
|
+
import {INDEXING_COALESCE_MS} from "../componets/config/INDEXING_COALESCE_MS.js";
|
|
13
|
+
import {WATCH_COALESCE_MS} from "../componets/config/WATCH_COALESCE_MS.js";
|
|
12
14
|
|
|
13
15
|
const args = process.argv.slice(2);
|
|
14
16
|
const watchDirArg = args.find(a => a.startsWith("--watch-dir="));
|
|
@@ -16,9 +18,17 @@ const watchDisabled = args.includes("--watch-disabled");
|
|
|
16
18
|
|
|
17
19
|
const cwd = process.cwd();
|
|
18
20
|
const log = BufferedLoggerToStdErr();
|
|
19
|
-
const {
|
|
20
|
-
|
|
21
|
-
|
|
21
|
+
const {indexContentWatch, indexContentBatch, removeContent, getIndexStats, searchContentIndex, resetIndex, config, flush} = await BuildComponents({
|
|
22
|
+
log,
|
|
23
|
+
watchCoalesceMs: WATCH_COALESCE_MS,
|
|
24
|
+
indexingCoalesceMs: INDEXING_COALESCE_MS,
|
|
25
|
+
});
|
|
26
|
+
const walkFiles = WalkFiles({cwd, log, ignoreFiles: config.ignoreFiles, followSymlinks: config.followSymlinks});
|
|
27
|
+
const rawIndexApp = IndexApp({walkFiles, indexContent: indexContentBatch, log, maxFileBytes: config.maxFileBytes});
|
|
28
|
+
const indexApp = async (inputs: string[]) => {
|
|
29
|
+
await rawIndexApp(inputs);
|
|
30
|
+
await flush();
|
|
31
|
+
};
|
|
22
32
|
const search = SearchApp({searchContentIndex});
|
|
23
33
|
|
|
24
34
|
const appId = AppId();
|
|
@@ -31,18 +41,18 @@ const watcherLock = WatcherLock({
|
|
|
31
41
|
const watch = watchDisabled ? undefined : {
|
|
32
42
|
watchFiles: WatchFiles({cwd, log, ignoreFiles: config.ignoreFiles}),
|
|
33
43
|
watchDir: watchDirArg ? watchDirArg.split("=")[1] : ".",
|
|
34
|
-
handleFileEvents: HandleFileEvents({indexContent, removeContent, log}),
|
|
44
|
+
handleFileEvents: HandleFileEvents({indexContent: indexContentWatch, removeContent, log, maxFileBytes: config.maxFileBytes}),
|
|
35
45
|
watcherLock,
|
|
36
46
|
};
|
|
37
47
|
|
|
38
48
|
process.on("SIGINT", async () => {
|
|
39
|
-
log(`
|
|
49
|
+
log(`shutting down — stopping heartbeat...`);
|
|
40
50
|
watcherLock.stopHeartbeat();
|
|
41
|
-
log(`
|
|
51
|
+
log(`releasing lock...`);
|
|
42
52
|
await watcherLock.release();
|
|
43
|
-
log(`
|
|
53
|
+
log(`waiting 7s for another watcher to take over...`);
|
|
44
54
|
await new Promise(r => setTimeout(r, 7000));
|
|
45
|
-
log(`
|
|
55
|
+
log(`exiting`);
|
|
46
56
|
process.exit(0);
|
|
47
57
|
});
|
|
48
58
|
|
package/apps/run.watch.ts
CHANGED
|
@@ -7,13 +7,19 @@ import {WatcherLock} from "../componets/index/watcherLock.js";
|
|
|
7
7
|
import {WatchApp} from "./watchApp.js";
|
|
8
8
|
import {AppId} from "../componets/appId.js";
|
|
9
9
|
import {join} from "path";
|
|
10
|
+
import {WATCH_COALESCE_MS} from "../componets/config/WATCH_COALESCE_MS.js";
|
|
11
|
+
import {INDEXING_COALESCE_MS} from "../componets/config/INDEXING_COALESCE_MS.js";
|
|
10
12
|
|
|
11
13
|
const cwd = process.cwd();
|
|
12
14
|
const log = BufferedLoggerToStdOut();
|
|
13
|
-
const {
|
|
14
|
-
|
|
15
|
+
const {indexContentWatch, indexContentBatch, removeContent, getIndexStats, config} = await BuildComponents({
|
|
16
|
+
log,
|
|
17
|
+
watchCoalesceMs: WATCH_COALESCE_MS,
|
|
18
|
+
indexingCoalesceMs: INDEXING_COALESCE_MS,
|
|
19
|
+
});
|
|
20
|
+
const walkFiles = WalkFiles({cwd, log, ignoreFiles: config.ignoreFiles, followSymlinks: config.followSymlinks});
|
|
15
21
|
const watchFiles = WatchFiles({cwd, log, ignoreFiles: config.ignoreFiles});
|
|
16
|
-
const handleFileEvents = HandleFileEvents({indexContent, removeContent, log});
|
|
22
|
+
const handleFileEvents = HandleFileEvents({indexContent: indexContentWatch, removeContent, log, maxFileBytes: config.maxFileBytes});
|
|
17
23
|
|
|
18
24
|
const appId = AppId();
|
|
19
25
|
const watcherLock = WatcherLock({
|
|
@@ -22,7 +28,7 @@ const watcherLock = WatcherLock({
|
|
|
22
28
|
log,
|
|
23
29
|
});
|
|
24
30
|
|
|
25
|
-
const app = WatchApp({walkFiles, watchFiles, handleFileEvents, indexContent, log, watcherLock});
|
|
31
|
+
const app = WatchApp({walkFiles, watchFiles, handleFileEvents, indexContent: indexContentBatch, log, watcherLock, maxFileBytes: config.maxFileBytes});
|
|
26
32
|
|
|
27
33
|
const inputs = process.argv.slice(2);
|
|
28
34
|
if (!inputs.length) inputs.push(".");
|
package/apps/watchApp.ts
CHANGED
|
@@ -3,7 +3,6 @@ import {batch} from "../packages/streamx/src/batch.js";
|
|
|
3
3
|
import {map} from "../packages/streamx/src/map.js";
|
|
4
4
|
import {tap} from "../packages/streamx/src/tap.js";
|
|
5
5
|
import {run} from "../packages/streamx/src/index.js";
|
|
6
|
-
import {readFile} from "fs/promises";
|
|
7
6
|
import {IWalkFiles} from "../componets/walkFiles.js";
|
|
8
7
|
import {IWatchFiles} from "../componets/watchFiles.js";
|
|
9
8
|
import {IHandleFileEvents} from "../componets/index/handleFileEvent.js";
|
|
@@ -12,20 +11,23 @@ import {IWatcherLock} from "../componets/index/watcherLock.js";
|
|
|
12
11
|
import {WatchFileEventsApp} from "./watchFileEventsApp.js";
|
|
13
12
|
import {IIndexContent} from "../features/indexContent.js";
|
|
14
13
|
import {INDEXING_BATCH_SIZE} from "../componets/config/INDEXING_BATCH_SIZE";
|
|
14
|
+
import {SafeIndexBatch} from "../componets/io/safeIndexBatch.js";
|
|
15
15
|
|
|
16
16
|
export type IWatchApp = {
|
|
17
17
|
run: (inputs: string[]) => Promise<void>;
|
|
18
18
|
stop: () => void;
|
|
19
19
|
};
|
|
20
20
|
|
|
21
|
-
export function WatchApp({walkFiles, watchFiles, handleFileEvents, indexContent, log, watcherLock}: {
|
|
21
|
+
export function WatchApp({walkFiles, watchFiles, handleFileEvents, indexContent, log, watcherLock, maxFileBytes}: {
|
|
22
22
|
walkFiles: IWalkFiles,
|
|
23
23
|
watchFiles: IWatchFiles,
|
|
24
24
|
handleFileEvents: IHandleFileEvents,
|
|
25
25
|
indexContent: IIndexContent,
|
|
26
26
|
log: ILogger,
|
|
27
27
|
watcherLock: IWatcherLock,
|
|
28
|
+
maxFileBytes: number,
|
|
28
29
|
}): IWatchApp {
|
|
30
|
+
const safeIndexBatch = SafeIndexBatch({indexContent, log, maxFileBytes});
|
|
29
31
|
return {
|
|
30
32
|
async run(inputs) {
|
|
31
33
|
// Phase 1: initial index — walk all files
|
|
@@ -34,9 +36,7 @@ export function WatchApp({walkFiles, watchFiles, handleFileEvents, indexContent,
|
|
|
34
36
|
.pipe(tap(id => log(`indexing: ${id}`)))
|
|
35
37
|
.pipe(batch(INDEXING_BATCH_SIZE))
|
|
36
38
|
.pipe(map<string[], string[]>(async (ids) => {
|
|
37
|
-
|
|
38
|
-
const items = ids.map((id, i) => ({id, content: `${texts[i]}. ${id}`}));
|
|
39
|
-
await indexContent(items);
|
|
39
|
+
await safeIndexBatch(ids);
|
|
40
40
|
return ids;
|
|
41
41
|
}))
|
|
42
42
|
);
|
package/componets/appId.ts
CHANGED
|
@@ -6,7 +6,11 @@ import {LoadConfig} from "./config/loadConfig.js";
|
|
|
6
6
|
import {ILogger} from "./logger.js";
|
|
7
7
|
import {LocateInFile} from "./locate/locateInFile.js";
|
|
8
8
|
|
|
9
|
-
export async function BuildComponents({log
|
|
9
|
+
export async function BuildComponents({log, watchCoalesceMs = 0, indexingCoalesceMs = 0}: {
|
|
10
|
+
log: ILogger,
|
|
11
|
+
watchCoalesceMs?: number,
|
|
12
|
+
indexingCoalesceMs?: number,
|
|
13
|
+
}) {
|
|
10
14
|
const loadConfig = LoadConfig({configPath: ".xindex.json", log});
|
|
11
15
|
const config = await loadConfig();
|
|
12
16
|
|
|
@@ -26,18 +30,24 @@ export async function BuildComponents({log}: { log: ILogger }) {
|
|
|
26
30
|
|
|
27
31
|
const SCORE_THRESHOLD = 0.01;
|
|
28
32
|
|
|
29
|
-
const {
|
|
33
|
+
const {indexContentWatch, indexContentBatch, removeContent, getIndexStats, searchContentIndex, resetIndex, flush}
|
|
30
34
|
= await ContentIndexDriver({
|
|
31
35
|
path: DEFAULT_INDEX_PATH,
|
|
32
36
|
embed,
|
|
33
37
|
extractKeywords,
|
|
34
38
|
cleanUpKeywords,
|
|
35
39
|
locateInFile,
|
|
36
|
-
scoreThreshold: SCORE_THRESHOLD
|
|
40
|
+
scoreThreshold: SCORE_THRESHOLD,
|
|
41
|
+
log,
|
|
42
|
+
indexingWatchCoalesceMs: watchCoalesceMs,
|
|
43
|
+
indexingCoalesceMs,
|
|
37
44
|
});
|
|
38
45
|
|
|
39
46
|
return {
|
|
40
|
-
extractKeywords, cleanUpKeywords,
|
|
41
|
-
|
|
47
|
+
extractKeywords, cleanUpKeywords,
|
|
48
|
+
indexContentWatch, indexContentBatch,
|
|
49
|
+
removeContent, getIndexStats,
|
|
50
|
+
searchContentIndex, resetIndex,
|
|
51
|
+
locateInFile, config, flush,
|
|
42
52
|
};
|
|
43
53
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export const DEFAULT_LOCATE_BATCH_SIZE =
|
|
1
|
+
export const DEFAULT_LOCATE_BATCH_SIZE = 8;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export const INDEXING_BATCH_SIZE =
|
|
1
|
+
export const INDEXING_BATCH_SIZE = 16;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const INDEXING_COALESCE_MAX_ITEMS = 1000;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const INDEXING_COALESCE_MS = 1000;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const WATCH_COALESCE_MS = 100;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export const WATCH_FLUSH_MS =
|
|
1
|
+
export const WATCH_FLUSH_MS = 300;
|
|
@@ -3,13 +3,16 @@ import {IXindexConfig} from "./xindexConfig.js";
|
|
|
3
3
|
import {ILogger} from "../logger.js";
|
|
4
4
|
|
|
5
5
|
const DEFAULT_MAX_LINES = 30;
|
|
6
|
-
const DEFAULT_MAX_FILE_BYTES =
|
|
6
|
+
const DEFAULT_MAX_FILE_BYTES = 50_000;
|
|
7
|
+
const DEFAULT_FOLLOW_SYMLINKS = false;
|
|
8
|
+
const DEFAULT_IGNORE_FILES = ['.xindex', 'node_modules'];
|
|
7
9
|
|
|
8
10
|
const DEFAULTS: IXindexConfig = {
|
|
9
11
|
ignoreKeywords: [],
|
|
10
|
-
ignoreFiles:
|
|
12
|
+
ignoreFiles: DEFAULT_IGNORE_FILES,
|
|
11
13
|
maxLines: DEFAULT_MAX_LINES,
|
|
12
14
|
maxFileBytes: DEFAULT_MAX_FILE_BYTES,
|
|
15
|
+
followSymlinks: DEFAULT_FOLLOW_SYMLINKS,
|
|
13
16
|
};
|
|
14
17
|
|
|
15
18
|
export type ILoadConfig = () => Promise<IXindexConfig>;
|
|
@@ -41,11 +44,18 @@ export function LoadConfig({configPath, log}: { configPath: string, log: ILogger
|
|
|
41
44
|
ignoreFiles: toStrings(parsed.ignoreFiles),
|
|
42
45
|
maxLines: toNum(parsed.maxLines, DEFAULT_MAX_LINES),
|
|
43
46
|
maxFileBytes: toNum(parsed.maxFileBytes, DEFAULT_MAX_FILE_BYTES),
|
|
47
|
+
followSymlinks: typeof parsed.followSymlinks === "boolean" ? parsed.followSymlinks : DEFAULT_FOLLOW_SYMLINKS,
|
|
44
48
|
};
|
|
45
49
|
|
|
46
50
|
for (const kw of config.ignoreKeywords) {
|
|
47
51
|
if (kw.length <= 1) log(`warning: ignoreKeywords entry "${kw}" is <=1 char`);
|
|
48
52
|
}
|
|
53
|
+
if (config.maxFileBytes < 1024) {
|
|
54
|
+
log(`warning: maxFileBytes (${config.maxFileBytes}) < 1024 — likely a typo`);
|
|
55
|
+
}
|
|
56
|
+
if (parsed.followSymlinks !== undefined && typeof parsed.followSymlinks !== "boolean") {
|
|
57
|
+
log(`warning: followSymlinks must be boolean — using default (${DEFAULT_FOLLOW_SYMLINKS})`);
|
|
58
|
+
}
|
|
49
59
|
|
|
50
60
|
return config;
|
|
51
61
|
};
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import {IIndexApi, IndexCommandType} from "./indexApi.js";
|
|
2
|
+
import {ILogger} from "../logger.js";
|
|
3
|
+
import {INDEXING_COALESCE_MAX_ITEMS} from "../config/INDEXING_COALESCE_MAX_ITEMS.js";
|
|
4
|
+
import {Defer, IDefer} from "../../packages/fun/src/defer.js";
|
|
5
|
+
|
|
6
|
+
type Item = {id: string, vector: number[], keywords: string};
|
|
7
|
+
|
|
8
|
+
export function CoalesceIndexApi({inner, windowMs, maxItems = INDEXING_COALESCE_MAX_ITEMS, log}: {
|
|
9
|
+
inner: IIndexApi,
|
|
10
|
+
windowMs: number,
|
|
11
|
+
maxItems?: number,
|
|
12
|
+
log: ILogger,
|
|
13
|
+
}): IIndexApi {
|
|
14
|
+
if (windowMs <= 0) return inner;
|
|
15
|
+
|
|
16
|
+
let pending: Item[] = [];
|
|
17
|
+
let delivery: IDefer | null = null;
|
|
18
|
+
let timer: ReturnType<typeof setTimeout> | null = null;
|
|
19
|
+
|
|
20
|
+
async function deliver() {
|
|
21
|
+
if (delivery) { await delivery.promise; return; }
|
|
22
|
+
if (timer) { clearTimeout(timer); timer = null; }
|
|
23
|
+
if (pending.length === 0) return;
|
|
24
|
+
|
|
25
|
+
const d = Defer();
|
|
26
|
+
delivery = d;
|
|
27
|
+
const items = pending;
|
|
28
|
+
pending = [];
|
|
29
|
+
try {
|
|
30
|
+
await inner({type: IndexCommandType.index, items});
|
|
31
|
+
} catch (e: any) {
|
|
32
|
+
log(`coalesced flush failed (${items.length} items): ${e?.message ?? e}`);
|
|
33
|
+
} finally {
|
|
34
|
+
delivery = null;
|
|
35
|
+
d.resolve();
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
if (pending.length >= maxItems) {
|
|
39
|
+
await deliver();
|
|
40
|
+
} else if (pending.length > 0 && !timer) {
|
|
41
|
+
timer = setTimeout(deliver, windowMs);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
const api = (async function coalescedIndexApi(msg) {
|
|
46
|
+
switch (msg.type) {
|
|
47
|
+
case IndexCommandType.index: {
|
|
48
|
+
// if (delivery) await delivery.promise;
|
|
49
|
+
pending.push(...msg.items);
|
|
50
|
+
|
|
51
|
+
if (!timer) {
|
|
52
|
+
timer = setTimeout(deliver, windowMs);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
if (pending.length >= maxItems) {
|
|
56
|
+
await deliver();
|
|
57
|
+
}
|
|
58
|
+
return;
|
|
59
|
+
}
|
|
60
|
+
case IndexCommandType.delete:
|
|
61
|
+
case IndexCommandType.reset: {
|
|
62
|
+
while (pending.length > 0 || delivery) await deliver();
|
|
63
|
+
return inner(msg);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}) as IIndexApi;
|
|
67
|
+
|
|
68
|
+
api.flush = async () => {
|
|
69
|
+
while (pending.length > 0 || delivery) await deliver();
|
|
70
|
+
await inner.flush();
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
return api;
|
|
74
|
+
}
|
|
@@ -7,35 +7,63 @@ import {SearchIndex, ISearchIndex} from "../../features/searchIndex.js";
|
|
|
7
7
|
import {RemoveContent, IRemoveContent} from "../../features/removeContent.js";
|
|
8
8
|
import {ResetIndex, IResetIndex} from "../../features/resetIndex.js";
|
|
9
9
|
import {VectraIndex} from "./vectraIndex.js";
|
|
10
|
-
import {IndexApi} from "./indexApi.js";
|
|
10
|
+
import {IndexApi, IIndexApi} from "./indexApi.js";
|
|
11
|
+
import {CoalesceIndexApi} from "./coalesceIndexApi.js";
|
|
11
12
|
import {ILocateInFile} from "../locate/locateInFile.js";
|
|
13
|
+
import {ILogger} from "../logger.js";
|
|
12
14
|
|
|
13
15
|
export type IContentIndexDriver = Readonly<{
|
|
14
16
|
getIndexStats: IGetIndexStats,
|
|
15
|
-
|
|
17
|
+
indexContentWatch: IIndexContent,
|
|
18
|
+
indexContentBatch: IIndexContent,
|
|
16
19
|
removeContent: IRemoveContent,
|
|
17
20
|
searchContentIndex: ISearchIndex,
|
|
18
21
|
resetIndex: IResetIndex,
|
|
19
22
|
flush: () => Promise<void>,
|
|
20
23
|
}>;
|
|
21
24
|
|
|
22
|
-
export async function ContentIndexDriver({
|
|
25
|
+
export async function ContentIndexDriver({
|
|
26
|
+
path, embed, extractKeywords, cleanUpKeywords, locateInFile, scoreThreshold, log,
|
|
27
|
+
indexingWatchCoalesceMs = 0, indexingCoalesceMs = 0,
|
|
28
|
+
}: {
|
|
23
29
|
path: string,
|
|
24
30
|
embed: IEmbed,
|
|
25
31
|
extractKeywords: IExtractKeywords,
|
|
26
32
|
cleanUpKeywords: ICleanUpKeywords,
|
|
27
33
|
locateInFile: ILocateInFile,
|
|
28
34
|
scoreThreshold: number,
|
|
35
|
+
log: ILogger,
|
|
36
|
+
indexingWatchCoalesceMs?: number,
|
|
37
|
+
indexingCoalesceMs?: number,
|
|
29
38
|
}): Promise<IContentIndexDriver> {
|
|
30
39
|
const index = await VectraIndex(path + "/semantic");
|
|
31
|
-
const
|
|
40
|
+
const rawIndexApi = IndexApi({index, log});
|
|
41
|
+
const watchCoalesce = CoalesceIndexApi({inner: rawIndexApi, windowMs: indexingWatchCoalesceMs, log});
|
|
42
|
+
const batchCoalesce = CoalesceIndexApi({inner: rawIndexApi, windowMs: indexingCoalesceMs, log});
|
|
43
|
+
|
|
44
|
+
const flushAll = async () => {
|
|
45
|
+
await Promise.all([watchCoalesce.flush(), batchCoalesce.flush()]);
|
|
46
|
+
await rawIndexApi.flush();
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
const drainThen = (inner: IIndexApi): IIndexApi => {
|
|
50
|
+
const api = (async (msg) => {
|
|
51
|
+
await flushAll();
|
|
52
|
+
return inner(msg);
|
|
53
|
+
}) as IIndexApi;
|
|
54
|
+
api.flush = flushAll;
|
|
55
|
+
return api;
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
const drainingApi = drainThen(rawIndexApi);
|
|
32
59
|
|
|
33
60
|
return {
|
|
34
61
|
getIndexStats: GetIndexStats({index}),
|
|
35
|
-
|
|
36
|
-
|
|
62
|
+
indexContentWatch: IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi: watchCoalesce, log}),
|
|
63
|
+
indexContentBatch: IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi: batchCoalesce, log}),
|
|
64
|
+
removeContent: RemoveContent({indexApi: drainingApi}),
|
|
37
65
|
searchContentIndex: SearchIndex({extractKeywords, cleanUpKeywords, embed, index, locateInFile, scoreThreshold}),
|
|
38
|
-
resetIndex: ResetIndex({indexApi}),
|
|
39
|
-
flush:
|
|
66
|
+
resetIndex: ResetIndex({indexApi: drainingApi}),
|
|
67
|
+
flush: flushAll,
|
|
40
68
|
};
|
|
41
69
|
}
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
import {readFile} from "fs/promises";
|
|
2
1
|
import {IIndexContent} from "../../features/indexContent.js";
|
|
3
2
|
import {IRemoveContent} from "../../features/removeContent.js";
|
|
4
3
|
import {ILogger} from "../logger.js";
|
|
5
4
|
import {FileEventType, IFileEvent} from "../watchFiles.js";
|
|
5
|
+
import {SafeIndexBatch} from "../io/safeIndexBatch.js";
|
|
6
6
|
|
|
7
7
|
export type IHandleFileEvent = (event: IFileEvent) => Promise<void>;
|
|
8
8
|
export type IHandleFileEvents = (events: IFileEvent[]) => Promise<void>;
|
|
9
9
|
|
|
10
|
-
export function HandleFileEvent({indexContent, removeContent, log}: {
|
|
10
|
+
export function HandleFileEvent({indexContent, removeContent, log, maxFileBytes}: {
|
|
11
11
|
indexContent: IIndexContent,
|
|
12
12
|
removeContent: IRemoveContent,
|
|
13
13
|
log: ILogger,
|
|
14
|
+
maxFileBytes: number,
|
|
14
15
|
}): IHandleFileEvent {
|
|
16
|
+
const safeIndexBatch = SafeIndexBatch({indexContent, log, maxFileBytes});
|
|
15
17
|
return async function handleFileEvent(event) {
|
|
16
18
|
if (event.type === FileEventType.index) {
|
|
17
|
-
|
|
18
|
-
await indexContent([{id: event.path, content: `${text}. ${event.path}`}]);
|
|
19
|
-
log(`index: ${event.path}`);
|
|
19
|
+
await safeIndexBatch([event.path]);
|
|
20
20
|
} else {
|
|
21
21
|
try { await removeContent([event.path]); } catch (e) { log(`remove failed: ${event.path} — ${(e as any)?.message ?? e}`); }
|
|
22
22
|
log(`remove: ${event.path}`);
|
|
@@ -24,42 +24,22 @@ export function HandleFileEvent({indexContent, removeContent, log}: {
|
|
|
24
24
|
};
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
-
export function HandleFileEvents({indexContent, removeContent, log}: {
|
|
27
|
+
export function HandleFileEvents({indexContent, removeContent, log, maxFileBytes}: {
|
|
28
28
|
indexContent: IIndexContent,
|
|
29
29
|
removeContent: IRemoveContent,
|
|
30
30
|
log: ILogger,
|
|
31
|
+
maxFileBytes: number,
|
|
31
32
|
}): IHandleFileEvents {
|
|
33
|
+
const safeIndexBatch = SafeIndexBatch({indexContent, log, maxFileBytes});
|
|
32
34
|
return async function handleFileEvents(events) {
|
|
33
|
-
const
|
|
34
|
-
const
|
|
35
|
+
const indexPaths = events.filter(e => e.type === FileEventType.index).map(e => e.path);
|
|
36
|
+
const removePaths = events.filter(e => e.type === FileEventType.remove).map(e => e.path);
|
|
35
37
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
log(`index: ${event.path}`);
|
|
39
|
-
try {
|
|
40
|
-
const text = await readFile(event.path, "utf8");
|
|
41
|
-
return {id: event.path, content: `${text}. ${event.path}`};
|
|
42
|
-
} catch (e) {
|
|
43
|
-
log(`index failed: ${event.path} — ${(e as any)?.message ?? e}`);
|
|
44
|
-
return undefined;
|
|
45
|
-
}
|
|
46
|
-
}))).filter((item): item is { id: string, content: string } => !!item);
|
|
47
|
-
|
|
48
|
-
if (indexItems.length > 0) {
|
|
49
|
-
try {
|
|
50
|
-
await indexContent(indexItems);
|
|
51
|
-
} catch (e) {
|
|
52
|
-
log(`index batch failed: ${(e as any)?.message ?? e}`);
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
if (removeEvents.length > 0) {
|
|
58
|
-
const removePaths = removeEvents.map((event) => {
|
|
59
|
-
log(`remove: ${event.path}`);
|
|
60
|
-
return event.path;
|
|
61
|
-
});
|
|
38
|
+
for (const p of indexPaths) log(`index: ${p}`);
|
|
39
|
+
if (indexPaths.length > 0) await safeIndexBatch(indexPaths);
|
|
62
40
|
|
|
41
|
+
if (removePaths.length > 0) {
|
|
42
|
+
for (const p of removePaths) log(`remove: ${p}`);
|
|
63
43
|
try {
|
|
64
44
|
await removeContent(removePaths);
|
|
65
45
|
} catch (e) {
|
|
@@ -2,6 +2,7 @@ import {LocalIndex} from "vectra";
|
|
|
2
2
|
import {IType} from "../IType.js";
|
|
3
3
|
import {ISerial, Serial} from "../../packages/fun/src/serial.js";
|
|
4
4
|
import {caseNever} from "../../packages/fun/src/case-never.js";
|
|
5
|
+
import {ILogger} from "../logger.js";
|
|
5
6
|
|
|
6
7
|
export enum IndexCommandType {
|
|
7
8
|
index = 'index',
|
|
@@ -16,17 +17,29 @@ export type IIndexCommand =
|
|
|
16
17
|
|
|
17
18
|
export type IIndexApi = ISerial<IIndexCommand, void>;
|
|
18
19
|
|
|
19
|
-
export function IndexApi({index}: { index: LocalIndex }): IIndexApi {
|
|
20
|
+
export function IndexApi({index, log}: { index: LocalIndex, log: ILogger }): IIndexApi {
|
|
20
21
|
return Serial<IIndexCommand, void>(async msg => {
|
|
21
22
|
switch (msg.type) {
|
|
22
23
|
case IndexCommandType.delete: {
|
|
23
|
-
await index.beginUpdate();
|
|
24
24
|
try {
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
await index.beginUpdate();
|
|
26
|
+
try {
|
|
27
|
+
for (const id of msg.ids) {
|
|
28
|
+
try {
|
|
29
|
+
await index.deleteItem(id);
|
|
30
|
+
} catch (e: any) {
|
|
31
|
+
log(`delete failed for ${id}: ${e?.message ?? e}`);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
} finally {
|
|
35
|
+
try {
|
|
36
|
+
await index.endUpdate();
|
|
37
|
+
} catch (e: any) {
|
|
38
|
+
log(`index end failed: ${e?.message ?? e}`);
|
|
39
|
+
}
|
|
27
40
|
}
|
|
28
|
-
}
|
|
29
|
-
|
|
41
|
+
} catch (e: any) {
|
|
42
|
+
log(`index begin failed: ${e?.message ?? e}`);
|
|
30
43
|
}
|
|
31
44
|
break;
|
|
32
45
|
}
|
|
@@ -35,35 +48,51 @@ export function IndexApi({index}: { index: LocalIndex }): IIndexApi {
|
|
|
35
48
|
await index.batchInsertItems(msg.items.map(item => ({
|
|
36
49
|
id: item.id,
|
|
37
50
|
vector: item.vector,
|
|
38
|
-
metadata: {
|
|
51
|
+
metadata: {},
|
|
39
52
|
})));
|
|
40
|
-
} catch (error) {
|
|
41
|
-
const errorMessage = String(
|
|
53
|
+
} catch (error: any) {
|
|
54
|
+
const errorMessage = String(error?.message ?? error).toLowerCase();
|
|
42
55
|
const isDuplicateIdError =
|
|
43
56
|
errorMessage.includes("already exists")
|
|
44
57
|
|| errorMessage.includes("duplicate");
|
|
45
58
|
if (!isDuplicateIdError) {
|
|
46
|
-
|
|
59
|
+
log(`index write failed (${msg.items.length} items): ${error?.message ?? error}`);
|
|
60
|
+
break;
|
|
47
61
|
}
|
|
48
|
-
|
|
49
|
-
await index.beginUpdate();
|
|
50
62
|
try {
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
63
|
+
await index.beginUpdate();
|
|
64
|
+
try {
|
|
65
|
+
for (const item of msg.items) {
|
|
66
|
+
try {
|
|
67
|
+
await index.upsertItem({
|
|
68
|
+
id: item.id,
|
|
69
|
+
vector: item.vector,
|
|
70
|
+
metadata: {},
|
|
71
|
+
});
|
|
72
|
+
} catch (e: any) {
|
|
73
|
+
log(`index write failed for ${item.id}: ${e?.message ?? e}`);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
} finally {
|
|
77
|
+
try {
|
|
78
|
+
await index.endUpdate();
|
|
79
|
+
} catch (e: any) {
|
|
80
|
+
log(`index end failed: ${e?.message ?? e}`);
|
|
81
|
+
}
|
|
57
82
|
}
|
|
58
|
-
}
|
|
59
|
-
|
|
83
|
+
} catch (e: any) {
|
|
84
|
+
log(`index begin failed: ${e?.message ?? e}`);
|
|
60
85
|
}
|
|
61
86
|
}
|
|
62
87
|
break;
|
|
63
88
|
}
|
|
64
89
|
case IndexCommandType.reset: {
|
|
65
|
-
|
|
66
|
-
|
|
90
|
+
try {
|
|
91
|
+
await index.deleteIndex();
|
|
92
|
+
await index.createIndex();
|
|
93
|
+
} catch (e: any) {
|
|
94
|
+
log(`reset failed: ${e?.message ?? e}`);
|
|
95
|
+
}
|
|
67
96
|
break;
|
|
68
97
|
}
|
|
69
98
|
default:
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import {safeReadFile, SafeReadResultType} from "./safeReadFile.js";
|
|
2
|
+
import {IIndexContent} from "../../features/indexContent.js";
|
|
3
|
+
import {ILogger} from "../logger.js";
|
|
4
|
+
|
|
5
|
+
export type ISafeIndexBatch = (ids: string[]) => Promise<void>;
|
|
6
|
+
|
|
7
|
+
export function SafeIndexBatch({indexContent, log, maxFileBytes}: {
|
|
8
|
+
indexContent: IIndexContent,
|
|
9
|
+
log: ILogger,
|
|
10
|
+
maxFileBytes: number,
|
|
11
|
+
}): ISafeIndexBatch {
|
|
12
|
+
return async function safeIndexBatch(ids) {
|
|
13
|
+
const results = await Promise.all(ids.map(id => safeReadFile(id, maxFileBytes)));
|
|
14
|
+
const ok: {id: string, content: string}[] = [];
|
|
15
|
+
for (const r of results) {
|
|
16
|
+
if (r.type === SafeReadResultType.ok) {
|
|
17
|
+
ok.push({id: r.id, content: `${r.content}. ${r.id}`});
|
|
18
|
+
} else {
|
|
19
|
+
log(`skip ${r.id}: ${r.reason}`);
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
if (ok.length > 0) {
|
|
23
|
+
try {
|
|
24
|
+
await indexContent(ok);
|
|
25
|
+
} catch (e: any) {
|
|
26
|
+
log(`index batch failed: ${e?.message ?? e}`);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
};
|
|
30
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import {stat, readFile} from "fs/promises";
|
|
2
|
+
import {IType} from "../IType.js";
|
|
3
|
+
|
|
4
|
+
export enum SafeReadResultType {
|
|
5
|
+
ok = "ok",
|
|
6
|
+
skip = "skip",
|
|
7
|
+
error = "error",
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
export enum SafeReadSkipReason {
|
|
11
|
+
tooLarge = "tooLarge",
|
|
12
|
+
empty = "empty",
|
|
13
|
+
binary = "binary",
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
export type ISafeReadResult =
|
|
17
|
+
| IType<{ type: SafeReadResultType.ok, id: string, content: string }>
|
|
18
|
+
| IType<{ type: SafeReadResultType.skip, id: string, reason: SafeReadSkipReason }>
|
|
19
|
+
| IType<{ type: SafeReadResultType.error, id: string, reason: string }>;
|
|
20
|
+
|
|
21
|
+
export async function safeReadFile(id: string, maxBytes: number): Promise<ISafeReadResult> {
|
|
22
|
+
try {
|
|
23
|
+
const s = await stat(id);
|
|
24
|
+
if (s.size > maxBytes) return {type: SafeReadResultType.skip, id, reason: SafeReadSkipReason.tooLarge};
|
|
25
|
+
const buf = await readFile(id);
|
|
26
|
+
const probe = buf.subarray(0, Math.min(10, buf.length));
|
|
27
|
+
for (let i = 0; i < probe.length; i++) {
|
|
28
|
+
if (probe[i] === 0) return {type: SafeReadResultType.skip, id, reason: SafeReadSkipReason.binary};
|
|
29
|
+
}
|
|
30
|
+
const content = buf.toString("utf8");
|
|
31
|
+
if (!content.trim()) return {type: SafeReadResultType.skip, id, reason: SafeReadSkipReason.empty};
|
|
32
|
+
return {type: SafeReadResultType.ok, id, content};
|
|
33
|
+
} catch (e: any) {
|
|
34
|
+
return {type: SafeReadResultType.error, id, reason: e?.code ?? e?.message ?? String(e)};
|
|
35
|
+
}
|
|
36
|
+
}
|
package/componets/llm/embed.ts
CHANGED
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
import {pipeline} from "@huggingface/transformers";
|
|
2
2
|
|
|
3
|
-
const embedder = await pipeline(
|
|
4
|
-
"feature-extraction",
|
|
5
|
-
"sentence-transformers/all-MiniLM-L6-v2"
|
|
6
|
-
);
|
|
3
|
+
const MODEL = "sentence-transformers/all-MiniLM-L6-v2";
|
|
7
4
|
|
|
8
5
|
export type IEmbed = (text: string) => Promise<number[]>;
|
|
9
6
|
|
|
10
7
|
type Pooling = "none" | "mean" | "cls" | "first_token" | "eos" | "last_token";
|
|
11
8
|
|
|
12
9
|
export function Embed({pooling, normalize}: {pooling: Pooling, normalize: boolean}): IEmbed {
|
|
10
|
+
let embedderPromise: Promise<any> | null = null;
|
|
11
|
+
|
|
12
|
+
function getEmbedder() {
|
|
13
|
+
if (!embedderPromise) {
|
|
14
|
+
embedderPromise = pipeline("feature-extraction", MODEL);
|
|
15
|
+
}
|
|
16
|
+
return embedderPromise;
|
|
17
|
+
}
|
|
18
|
+
|
|
13
19
|
return async function embed(text) {
|
|
20
|
+
const embedder = await getEmbedder();
|
|
14
21
|
const result = await embedder(text, {pooling, normalize});
|
|
15
22
|
return Array.from(result.data as Float32Array);
|
|
16
23
|
}
|
|
17
24
|
}
|
|
18
|
-
|
|
@@ -11,9 +11,9 @@ import {filter} from "../../packages/streamx/src/filter.js";
|
|
|
11
11
|
import {map} from "../../packages/streamx/src/map.js";
|
|
12
12
|
import {flatMap} from "../../packages/streamx/src/flatMap.js";
|
|
13
13
|
import {tap} from "../../packages/streamx/src/tap.js";
|
|
14
|
-
import {scaleSync} from "../../packages/streamx/src/scaleSync.js";
|
|
15
14
|
import {run} from "../../packages/streamx/src/index.js";
|
|
16
15
|
import {DEFAULT_LOCATE_BATCH_SIZE} from "../config/DEFAULT_LOCATE_BATCH_SIZE";
|
|
16
|
+
import {scaleSync} from "../../packages/streamx/src/scaleSync";
|
|
17
17
|
|
|
18
18
|
export type ILocateInFile = (
|
|
19
19
|
query: string,
|
|
@@ -61,12 +61,17 @@ export function LocateInFile({
|
|
|
61
61
|
return false;
|
|
62
62
|
}
|
|
63
63
|
}))
|
|
64
|
-
.pipe(scaleSync(SCALE_FILE_READS, async (id: string) => {
|
|
65
|
-
|
|
66
|
-
|
|
64
|
+
.pipe(scaleSync(SCALE_FILE_READS, async (id: string): Promise<{ id: string, text: string } | null> => {
|
|
65
|
+
try {
|
|
66
|
+
const text = await readFile(id, "utf8");
|
|
67
|
+
return {id, text};
|
|
68
|
+
} catch {
|
|
69
|
+
return null;
|
|
70
|
+
}
|
|
67
71
|
}))
|
|
68
|
-
.pipe(
|
|
69
|
-
|
|
72
|
+
.pipe(filter(async (r: { id: string, text: string } | null) => r !== null))
|
|
73
|
+
.pipe(flatMap((r: { id: string, text: string } | null): IWindow[] => {
|
|
74
|
+
return windowsOf({text: r!.text, id: r!.id, windowLines});
|
|
70
75
|
}));
|
|
71
76
|
|
|
72
77
|
const withVectors = from<IWindow>(windows)
|
package/componets/logger.ts
CHANGED
|
@@ -1,34 +1,37 @@
|
|
|
1
1
|
export type ILogger = (...args: any[]) => void;
|
|
2
2
|
|
|
3
|
-
export function Logger(fn: (...args: any[]) => void): ILogger {
|
|
4
|
-
return function log(...args) {
|
|
5
|
-
fn(...args);
|
|
6
|
-
}
|
|
7
|
-
}
|
|
8
|
-
|
|
9
3
|
const DEFAULT_FLUSH_MS = 100;
|
|
4
|
+
const DEFAULT_FLUSH_LINES = 15;
|
|
10
5
|
|
|
11
|
-
export function BufferedLoggerToStdOut(flushMs = DEFAULT_FLUSH_MS): ILogger {
|
|
12
|
-
return BufferedLogger(s => process.stdout.write(s), flushMs);
|
|
6
|
+
export function BufferedLoggerToStdOut(flushMs = DEFAULT_FLUSH_MS, flushLines = DEFAULT_FLUSH_LINES): ILogger {
|
|
7
|
+
return BufferedLogger(s => process.stdout.write(s), flushMs, flushLines);
|
|
13
8
|
}
|
|
14
9
|
|
|
15
|
-
export function BufferedLoggerToStdErr(flushMs = DEFAULT_FLUSH_MS): ILogger {
|
|
16
|
-
return BufferedLogger(s => process.stderr.write(s), flushMs);
|
|
10
|
+
export function BufferedLoggerToStdErr(flushMs = DEFAULT_FLUSH_MS, flushLines = DEFAULT_FLUSH_LINES): ILogger {
|
|
11
|
+
return BufferedLogger(s => process.stderr.write(s), flushMs, flushLines);
|
|
17
12
|
}
|
|
18
13
|
|
|
19
|
-
export function BufferedLogger(write: (text: string) => void, flushMs = DEFAULT_FLUSH_MS): ILogger {
|
|
14
|
+
export function BufferedLogger(
|
|
15
|
+
write: (text: string) => void,
|
|
16
|
+
flushMs = DEFAULT_FLUSH_MS,
|
|
17
|
+
flushLines = DEFAULT_FLUSH_LINES,
|
|
18
|
+
): ILogger {
|
|
20
19
|
let buffer: string[] = [];
|
|
21
20
|
let timer: ReturnType<typeof setTimeout> | null = null;
|
|
22
21
|
|
|
23
22
|
function flush() {
|
|
23
|
+
if (timer) { clearTimeout(timer); timer = null; }
|
|
24
24
|
if (buffer.length === 0) return;
|
|
25
25
|
write(buffer.join("\n") + "\n");
|
|
26
26
|
buffer = [];
|
|
27
|
-
timer = null;
|
|
28
27
|
}
|
|
29
28
|
|
|
30
29
|
return function log(...args) {
|
|
31
30
|
buffer.push(args.map(a => typeof a === "object" ? JSON.stringify(a) : String(a)).join(" "));
|
|
31
|
+
if (buffer.length >= flushLines) {
|
|
32
|
+
flush();
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
32
35
|
if (!timer) timer = setTimeout(flush, flushMs);
|
|
33
36
|
}
|
|
34
37
|
}
|
package/componets/walkFiles.ts
CHANGED
|
@@ -1,11 +1,16 @@
|
|
|
1
|
-
import {readdir, stat, readFile} from "fs/promises";
|
|
1
|
+
import {readdir, stat, readFile, realpath} from "fs/promises";
|
|
2
2
|
import {join, relative} from "path";
|
|
3
3
|
import ignore from "ignore";
|
|
4
4
|
import {ILogger} from "./logger.js";
|
|
5
5
|
|
|
6
6
|
export type IWalkFiles = (inputs: string[]) => AsyncIterable<string>;
|
|
7
7
|
|
|
8
|
-
export function WalkFiles({cwd, log, ignoreFiles = []}: {cwd: string, log: ILogger, ignoreFiles?: string[]}): IWalkFiles {
|
|
8
|
+
export function WalkFiles({cwd, log, ignoreFiles = [], followSymlinks = false}: {
|
|
9
|
+
cwd: string,
|
|
10
|
+
log: ILogger,
|
|
11
|
+
ignoreFiles?: string[],
|
|
12
|
+
followSymlinks?: boolean,
|
|
13
|
+
}): IWalkFiles {
|
|
9
14
|
|
|
10
15
|
async function tryReadGitignore(dir: string): Promise<string> {
|
|
11
16
|
try {
|
|
@@ -15,6 +20,8 @@ export function WalkFiles({cwd, log, ignoreFiles = []}: {cwd: string, log: ILogg
|
|
|
15
20
|
}
|
|
16
21
|
}
|
|
17
22
|
|
|
23
|
+
const visited = new Set<string>();
|
|
24
|
+
|
|
18
25
|
async function* walk(dir: string, parentRules: string[]): AsyncIterable<string> {
|
|
19
26
|
const localGitignore = await tryReadGitignore(dir);
|
|
20
27
|
const rules = localGitignore ? [...parentRules, localGitignore] : parentRules;
|
|
@@ -27,11 +34,35 @@ export function WalkFiles({cwd, log, ignoreFiles = []}: {cwd: string, log: ILogg
|
|
|
27
34
|
const abs = join(dir, entry.name);
|
|
28
35
|
const rel = relative(cwd, abs);
|
|
29
36
|
|
|
37
|
+
if (entry.isSymbolicLink()) {
|
|
38
|
+
if (!followSymlinks) {
|
|
39
|
+
log(`skip symlink ${rel}`);
|
|
40
|
+
continue;
|
|
41
|
+
}
|
|
42
|
+
try {
|
|
43
|
+
const real = await realpath(abs);
|
|
44
|
+
if (visited.has(real)) continue;
|
|
45
|
+
visited.add(real);
|
|
46
|
+
const s = await stat(real);
|
|
47
|
+
if (s.isDirectory()) {
|
|
48
|
+
if (entry.name.startsWith(".")) continue;
|
|
49
|
+
if (ig.ignores(rel + "/")) continue;
|
|
50
|
+
yield* walk(abs, rules);
|
|
51
|
+
} else if (s.isFile()) {
|
|
52
|
+
if (ig.ignores(rel)) continue;
|
|
53
|
+
yield rel;
|
|
54
|
+
}
|
|
55
|
+
} catch (e: any) {
|
|
56
|
+
log(`skip symlink ${rel}: ${e?.code ?? e?.message ?? e}`);
|
|
57
|
+
}
|
|
58
|
+
continue;
|
|
59
|
+
}
|
|
60
|
+
|
|
30
61
|
if (entry.isDirectory()) {
|
|
31
62
|
if (entry.name.startsWith(".")) continue;
|
|
32
63
|
if (ig.ignores(rel + "/")) continue;
|
|
33
64
|
yield* walk(abs, rules);
|
|
34
|
-
} else {
|
|
65
|
+
} else if (entry.isFile()) {
|
|
35
66
|
if (ig.ignores(rel)) continue;
|
|
36
67
|
yield rel;
|
|
37
68
|
}
|
package/features/indexContent.ts
CHANGED
|
@@ -2,22 +2,47 @@ import {IndexCommandType, IIndexApi} from "../componets/index/indexApi.js";
|
|
|
2
2
|
import {IExtractKeywords} from "../componets/keywords/extractKeywords.js";
|
|
3
3
|
import {ICleanUpKeywords} from "../componets/keywords/cleanUpKeywords.js";
|
|
4
4
|
import {IEmbed} from "../componets/llm/embed.js";
|
|
5
|
+
import {ILogger} from "../componets/logger.js";
|
|
5
6
|
|
|
6
7
|
export type IIndexContent = (items: Array<{id: string, content: string}>) => Promise<void>;
|
|
7
8
|
|
|
8
|
-
|
|
9
|
+
const SLOW_WARN_MS = 3000;
|
|
10
|
+
|
|
11
|
+
export function IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi, log}: {
|
|
9
12
|
extractKeywords: IExtractKeywords,
|
|
10
13
|
cleanUpKeywords: ICleanUpKeywords,
|
|
11
14
|
embed: IEmbed,
|
|
12
15
|
indexApi: IIndexApi,
|
|
16
|
+
log: ILogger,
|
|
13
17
|
}): IIndexContent {
|
|
14
18
|
return async function indexContent(items) {
|
|
15
|
-
const indexItems = await Promise.all(items.map(async (item) => {
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
const indexItems = (await Promise.all(items.map(async (item) => {
|
|
20
|
+
const warn = setTimeout(
|
|
21
|
+
() => log(`slow: still embedding ${item.id} after ${SLOW_WARN_MS}ms`),
|
|
22
|
+
SLOW_WARN_MS,
|
|
23
|
+
);
|
|
24
|
+
try {
|
|
25
|
+
const keywords = cleanUpKeywords(extractKeywords(item.content)).join(", ");
|
|
26
|
+
const vector = await embed(keywords);
|
|
27
|
+
return {id: item.id, vector, keywords};
|
|
28
|
+
} catch (e: any) {
|
|
29
|
+
log(`skip ${item.id}: keywords/embed failed — ${e?.code ?? e?.message ?? e}`);
|
|
30
|
+
return null;
|
|
31
|
+
} finally {
|
|
32
|
+
clearTimeout(warn);
|
|
33
|
+
}
|
|
34
|
+
}))).filter((x): x is NonNullable<typeof x> => !!x);
|
|
20
35
|
|
|
21
|
-
|
|
36
|
+
if (indexItems.length > 0) {
|
|
37
|
+
const warn = setTimeout(
|
|
38
|
+
() => log(`slow: still writing index after ${SLOW_WARN_MS}ms — ${indexItems.map(i => i.id).join(", ")}`),
|
|
39
|
+
SLOW_WARN_MS,
|
|
40
|
+
);
|
|
41
|
+
try {
|
|
42
|
+
await indexApi({type: IndexCommandType.index, items: indexItems});
|
|
43
|
+
} finally {
|
|
44
|
+
clearTimeout(warn);
|
|
45
|
+
}
|
|
46
|
+
}
|
|
22
47
|
}
|
|
23
48
|
}
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xindex",
|
|
3
|
-
"version": "1.0.5",
|
|
3
|
+
"version": "1.0.7",
|
|
4
4
|
"description": "Local semantic code search — index codebase, search by meaning or keywords",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "xindex.ts",
|
|
7
7
|
"bin": {
|
|
8
|
+
"xindex": "bin/xindex-mcp",
|
|
8
9
|
"xindex-index": "bin/xindex-index",
|
|
9
10
|
"xindex-search": "bin/xindex-search",
|
|
10
11
|
"xindex-mcp": "bin/xindex-mcp",
|
|
@@ -15,11 +16,16 @@
|
|
|
15
16
|
"index": "tsx apps/run.index.ts",
|
|
16
17
|
"search": "tsx apps/run.search.ts",
|
|
17
18
|
"reset": "tsx apps/run.reset.ts",
|
|
19
|
+
"test": "npm run test.functional && npm run test.compilation",
|
|
18
20
|
"mcp": "tsx apps/run.mcp.ts",
|
|
19
21
|
"watch": "tsx apps/run.watch.ts",
|
|
22
|
+
"test.functional": "bash test/functional.sh",
|
|
20
23
|
"test.compilation": "npx -y tsc --ignoreConfig --noEmit --target ES2022 --module ESNext --moduleResolution bundler --esModuleInterop --skipLibCheck --strict false $(git ls-files '*.ts')"
|
|
21
24
|
},
|
|
22
25
|
"private": false,
|
|
26
|
+
"engines": {
|
|
27
|
+
"node": ">=22"
|
|
28
|
+
},
|
|
23
29
|
"keywords": [
|
|
24
30
|
"semantic-search",
|
|
25
31
|
"code-search",
|