xindex 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/.xindex.json +2 -1
  2. package/CLAUDE.md +1 -0
  3. package/README.md +33 -26
  4. package/apps/indexApp.ts +9 -8
  5. package/apps/mcpApp.ts +6 -6
  6. package/apps/run.index.ts +2 -2
  7. package/apps/run.mcp.ts +6 -4
  8. package/apps/run.search.ts +1 -1
  9. package/apps/run.watch.ts +3 -3
  10. package/apps/searchApp.ts +4 -2
  11. package/apps/watchApp.ts +16 -8
  12. package/apps/watchFileEventsApp.ts +14 -4
  13. package/componets/buildComponents.ts +25 -9
  14. package/componets/config/DEFAULT_LOCATE_BATCH_SIZE.ts +1 -0
  15. package/componets/config/INDEXING_BATCH_SIZE.ts +1 -0
  16. package/componets/config/WATCH_FLUSH_MS.ts +1 -0
  17. package/componets/config/loadConfig.ts +10 -1
  18. package/componets/config/xindexConfig.ts +2 -0
  19. package/componets/ignore/loadIgnoreChain.ts +40 -0
  20. package/componets/index/contentIndexDriver.ts +7 -5
  21. package/componets/index/documentContentIndexDriver.ts +126 -0
  22. package/componets/index/documentIndex.ts +26 -0
  23. package/componets/index/formatSearchResults.ts +16 -2
  24. package/componets/index/handleFileEvent.ts +48 -3
  25. package/componets/index/indexApi.ts +39 -11
  26. package/componets/locate/bm25.ts +50 -0
  27. package/componets/locate/inMemoryIndex.ts +48 -0
  28. package/componets/locate/locateInFile.ts +148 -0
  29. package/componets/locate/windowsOf.ts +29 -0
  30. package/componets/watchFiles.ts +5 -16
  31. package/features/indexContent.ts +12 -5
  32. package/features/removeContent.ts +3 -3
  33. package/features/searchIndex.ts +22 -5
  34. package/package.json +15 -2
  35. package/packages/streamx/src/batchTimed.ts +1 -1
  36. package/packages/streamx/src/buffer.ts +1 -1
  37. package/packages/streamx/src/defer.ts +55 -0
  38. package/packages/streamx/src/interval.ts +1 -1
  39. package/packages/streamx/src/merge.ts +1 -1
  40. package/packages/streamx/src/nodeWritable.ts +1 -1
  41. package/packages/streamx/src/scale.ts +2 -2
  42. package/packages/streamx/src/writer.ts +1 -1
  43. package/.ai/research/.gitkeep +0 -0
  44. package/.ai/task/.gitkeep +0 -0
  45. package/.claude/settings.local.json +0 -73
  46. package/.claude/skills/make-hof/SKILL.md +0 -8
  47. package/.claude/skills/make-hof/playbook.md +0 -38
  48. package/.cursor/mcp.json +0 -8
  49. package/media/MEDIUM.md +0 -139
  50. package/media/SOCIAL.md +0 -102
@@ -0,0 +1,126 @@
1
+ import {rm} from "node:fs/promises";
2
+ import {extname} from "node:path";
3
+ import type {IContentIndexDriver} from "./contentIndexDriver.js";
4
+ import type {IGetIndexStats} from "./getIndexStats.js";
5
+ import type {IIndexContent} from "../../features/indexContent.js";
6
+ import type {IRemoveContent} from "../../features/removeContent.js";
7
+ import type {IResetIndex} from "../../features/resetIndex.js";
8
+ import type {IIndexRecord, ISearchIndex} from "../../features/searchIndex.js";
9
+ import {DocumentIndex} from "./documentIndex.js";
10
+
11
/**
 * Builds a content-index driver on top of a document index stored under `${path}/docs`.
 *
 * @param path  Root folder of the index; documents live in its `docs` sub-folder.
 * @param model Optional embedding model name, forwarded unchanged to `DocumentIndex`.
 * @returns An object exposing `getIndexStats`, `indexContent`, `removeContent`,
 *          `searchContentIndex`, `resetIndex` and `flush`.
 */
export async function DocumentContentIndexDriver({path, model}: {
    path: string,
    model?: string,
}): Promise<IContentIndexDriver> {
    const docsPath = `${path}/docs`;
    // `let`: resetIndex re-binds this to a fresh instance.
    let index = await DocumentIndex({path: docsPath, model});

    /** Upserts each item one at a time, in input order; the doc type is derived from the id's extension. */
    const indexContent: IIndexContent = async items => {
        for (const item of items) {
            await index.upsertDocument(item.id, item.content, docTypeForPath(item.id), {});
        }
    };

    /** Deletes each id one at a time, in input order. */
    const removeContent: IRemoveContent = async ids => {
        for (const id of ids) {
            await index.deleteDocument(id);
        }
    };

    /**
     * Queries the document index and flattens the result into one record per matching chunk,
     * carrying the chunk's character range, text and 1-based line range.
     */
    const searchContentIndex: ISearchIndex = async (query, limit) => {
        const results = await index.queryDocuments(query, {
            maxDocuments: limit,
            maxChunks: 5,
            isBm25: true,
        });

        const out: IIndexRecord[] = [];

        for (const r of results) {
            // Load the document text once per result; every chunk of the document reuses it.
            const text = await r.loadText();
            const lineStarts = computeLineStarts(text);

            for (const c of r.chunks) {
                const {startPos, endPos} = c.item.metadata;
                /*
                 * vectra chunk positions are inclusive: `endPos` is the offset of the chunk's last
                 * character (the library itself renders chunks with `endPos + 1`). The slice must
                 * therefore end one past `endPos`, otherwise the final character is lost.
                 */
                const snippet = text.slice(startPos, endPos + 1);
                const startLine = lineOf(lineStarts, startPos);
                const endLine = lineOf(lineStarts, endPos);

                out.push({
                    id: r.uri,
                    score: c.score,
                    startPos,
                    endPos,
                    snippet,
                    startLine,
                    endLine,
                });
            }
        }

        return out;
    };

    const resetIndex: IResetIndex = async () => {
        /** Clears persisted catalog + chunks on disk; recreates empty index metadata. */
        await rm(docsPath, {recursive: true, force: true});

        /*
         * Re-bind `index`: after `rm`, the prior LocalDocumentIndex instance may retain stale
         * in-memory caches; `DocumentIndex` constructs a fresh instance against the recreated folder.
         */
        index = await DocumentIndex({path: docsPath, model});
    };

    /** Reports the catalog's document count, or 0 when the catalog does not provide one. */
    const getIndexStats: IGetIndexStats = async () => {
        const stats = await index.getCatalogStats();

        return {indexedAmount: stats.documents ?? 0};
    };

    const flush = async () => {
        /* LocalDocumentIndex persists updates without an explicit flush. */
    };

    return {
        getIndexStats,
        indexContent,
        removeContent,
        searchContentIndex,
        resetIndex,
        flush,
    };
}
94
+
95
/** Document types paired with the (lower-case, dot-prefixed) extensions that map to them. */
const DOC_TYPE_GROUPS: Array<[docType: string, extensions: string[]]> = [
    ["ts", [".ts", ".mts", ".cts"]],
    ["tsx", [".tsx"]],
    ["js", [".js", ".mjs", ".cjs"]],
    ["jsx", [".jsx"]],
    ["python", [".py"]],
    ["go", [".go"]],
    ["java", [".java"]],
    ["csharp", [".cs"]],
    ["cpp", [".cpp", ".cc", ".cxx", ".hpp", ".h"]],
    ["markdown", [".md"]],
];

/** Flat extension → document type lookup derived from the groups above. */
const DOC_TYPE_BY_EXT: Record<string, string> = Object.fromEntries(
    DOC_TYPE_GROUPS.flatMap(([docType, extensions]) =>
        extensions.map((extension): [string, string] => [extension, docType]),
    ),
);

/**
 * Resolves the document type for a file path from its extension (case-insensitive).
 *
 * @param path File path or name.
 * @returns The mapped document type, or `"text"` when the extension is unknown or absent.
 */
function docTypeForPath(path: string): string {
    const extension = extname(path).toLowerCase();
    const docType = DOC_TYPE_BY_EXT[extension];
    return docType ?? "text";
}
109
+
110
/**
 * Collects the character offset at which every line of `text` begins.
 *
 * @param text Text whose lines are separated by `"\n"`.
 * @returns Ascending offsets; the first entry is always 0, followed by the offset just past
 *          each line feed.
 */
function computeLineStarts(text: string): number[] {
    const offsets: number[] = [0];
    let newlineAt = text.indexOf("\n");
    while (newlineAt !== -1) {
        offsets.push(newlineAt + 1);
        newlineAt = text.indexOf("\n", newlineAt + 1);
    }
    return offsets;
}
117
+
118
/**
 * Maps a character offset to its 1-based line number.
 *
 * @param lineStarts Ascending line-start offsets (as produced by `computeLineStarts`).
 * @param pos        Character offset to look up.
 * @returns The number of line starts that are `<= pos`, but never less than 1.
 */
function lineOf(lineStarts: number[], pos: number): number {
    // Upper-bound binary search: `low` ends as the count of starts that are <= pos.
    let low = 0;
    let high = lineStarts.length;
    while (low < high) {
        const middle = (low + high) >>> 1;
        if (lineStarts[middle] <= pos) {
            low = middle + 1;
        } else {
            high = middle;
        }
    }
    return Math.max(low, 1);
}
@@ -0,0 +1,26 @@
1
+ import { LocalDocumentIndex, TransformersEmbeddings } from "vectra";
2
+
3
+ export type IDocumentIndex = LocalDocumentIndex;
4
+
5
/**
 * Opens the document index stored at `path`, creating it first when it does not exist yet.
 *
 * @param path  Folder that holds the index.
 * @param model Embedding model name passed to `TransformersEmbeddings.create`.
 * @param dtype Model data type passed to `TransformersEmbeddings.create`.
 * @returns A `LocalDocumentIndex` whose index has been created.
 */
export async function DocumentIndex({
    path,
    model = "Xenova/all-MiniLM-L6-v2",
    dtype = "q8",
}: {
    path: string;
    model?: string;
    dtype?: "fp32" | "fp16" | "q8" | "q4";
}): Promise<LocalDocumentIndex> {
    const documentIndex = new LocalDocumentIndex({
        folderPath: path,
        // Evaluated (and awaited) before the constructor runs.
        embeddings: await TransformersEmbeddings.create({ model, dtype }),
        chunkingConfig: { chunkSize: 512, chunkOverlap: 0 },
    });

    const alreadyCreated = await documentIndex.isIndexCreated();
    if (alreadyCreated) {
        return documentIndex;
    }

    await documentIndex.createIndex();
    return documentIndex;
}
@@ -9,8 +9,22 @@ export function FormatSearchResults(): IFormatSearchResults {
9
9
  const lines: string[] = [];
10
10
  for (let i = 0; i < results.length; i++) {
11
11
  const r = results[i];
12
- const kw = r.keywords ? `${r.keywords}` : "";
13
- lines.push(`${i + 1}. ${r.id}${kw}`);
12
+ const scoreStr = `[${r.score.toFixed(2)}]`;
13
+ if (r.snippet !== undefined && r.snippet !== "") {
14
+ const header =
15
+ typeof r.startLine === "number"
16
+ ? `${i + 1}. ${r.id}:L${r.startLine}-L${r.endLine ?? r.startLine} ${scoreStr}`
17
+ : `${i + 1}. ${r.id} ${scoreStr}`;
18
+ const snippetBlock = r.snippet
19
+ .split("\n")
20
+ .map((line) => ` ${line}`)
21
+ .join("\n");
22
+ lines.push(`${header}\n\n${snippetBlock}`);
23
+ } else if (r.keywords) {
24
+ lines.push(`${i + 1}. ${r.id} ${scoreStr} — ${r.keywords}`);
25
+ } else {
26
+ lines.push(`${i + 1}. ${r.id} ${scoreStr}`);
27
+ }
14
28
  }
15
29
 
16
30
  return `\n# Search: "${query}" — ${results.length} result(s)\n\n${lines.join(";\n\n")}\n`;
@@ -5,6 +5,7 @@ import {ILogger} from "../logger.js";
5
5
  import {FileEventType, IFileEvent} from "../watchFiles.js";
6
6
 
7
7
  export type IHandleFileEvent = (event: IFileEvent) => Promise<void>;
8
+ export type IHandleFileEvents = (events: IFileEvent[]) => Promise<void>;
8
9
 
9
10
  export function HandleFileEvent({indexContent, removeContent, log}: {
10
11
  indexContent: IIndexContent,
@@ -13,13 +14,57 @@ export function HandleFileEvent({indexContent, removeContent, log}: {
13
14
  }): IHandleFileEvent {
14
15
  return async function handleFileEvent(event) {
15
16
  if (event.type === FileEventType.index) {
16
- try { await removeContent(event.path); } catch (e) { log(`remove failed: ${event.path} — ${(e as any)?.message ?? e}`); }
17
17
  const text = await readFile(event.path, "utf8");
18
- await indexContent(event.path, `${text}. ${event.path}`);
18
+ await indexContent([{id: event.path, content: `${text}. ${event.path}`}]);
19
19
  log(`index: ${event.path}`);
20
20
  } else {
21
- try { await removeContent(event.path); } catch (e) { log(`remove failed: ${event.path} — ${(e as any)?.message ?? e}`); }
21
+ try { await removeContent([event.path]); } catch (e) { log(`remove failed: ${event.path} — ${(e as any)?.message ?? e}`); }
22
22
  log(`remove: ${event.path}`);
23
23
  }
24
24
  };
25
25
  }
26
+
27
/**
 * Creates a batch handler for file events: files with an `index` event are read and passed to
 * `indexContent` in one call, paths with a `remove` event are passed to `removeContent` in one call.
 *
 * Failures are logged and never thrown: a file that cannot be read is skipped, and a failing
 * batch call is reported through `log`.
 *
 * @param indexContent  Receives `{id, content}` items for every readable file to index.
 * @param removeContent Receives the paths to remove.
 * @param log           Receives progress and failure messages.
 */
export function HandleFileEvents({indexContent, removeContent, log}: {
    indexContent: IIndexContent,
    removeContent: IRemoveContent,
    log: ILogger,
}): IHandleFileEvents {
    /** Picks the `message` of a caught value when it has one, otherwise the value itself. */
    const describe = (e: unknown): unknown =>
        (e as { message?: unknown } | null | undefined)?.message ?? e;

    return async function handleFileEvents(events) {
        /*
         * Index events are processed before remove events, so the relative order inside a batch
         * is lost. Keep only the LAST event per path: otherwise `[remove a, index a]` would index
         * `a` and then immediately remove it again.
         */
        const latestByPath = new Map<string, IFileEvent>();
        for (const event of events) {
            latestByPath.set(event.path, event);
        }
        const effectiveEvents = [...latestByPath.values()];

        const indexEvents = effectiveEvents.filter((event) => event.type === FileEventType.index);
        const removeEvents = effectiveEvents.filter((event) => event.type === FileEventType.remove);

        if (indexEvents.length > 0) {
            // Read all files concurrently; unreadable files resolve to `undefined` and are dropped.
            const readResults = await Promise.all(indexEvents.map(async (event) => {
                log(`index: ${event.path}`);
                try {
                    const text = await readFile(event.path, "utf8");
                    return {id: event.path, content: `${text}. ${event.path}`};
                } catch (e) {
                    log(`index failed: ${event.path} — ${describe(e)}`);
                    return undefined;
                }
            }));
            const indexItems = readResults.filter(
                (item): item is { id: string, content: string } => item !== undefined,
            );

            if (indexItems.length > 0) {
                try {
                    await indexContent(indexItems);
                } catch (e) {
                    log(`index batch failed: ${describe(e)}`);
                }
            }
        }

        if (removeEvents.length > 0) {
            const removePaths = removeEvents.map((event) => {
                log(`remove: ${event.path}`);
                return event.path;
            });

            try {
                await removeContent(removePaths);
            } catch (e) {
                log(`remove batch failed: ${describe(e)}`);
            }
        }
    };
}
@@ -2,7 +2,6 @@ import {LocalIndex} from "vectra";
2
2
  import {IType} from "../IType.js";
3
3
  import {ISerial, Serial} from "../../packages/fun/src/serial.js";
4
4
  import {caseNever} from "../../packages/fun/src/case-never.js";
5
- import {IEmbed} from "../llm/embed.js";
6
5
 
7
6
  export enum IndexCommandType {
8
7
  index = 'index',
@@ -11,26 +10,55 @@ export enum IndexCommandType {
11
10
  }
12
11
 
13
12
  export type IIndexCommand =
14
- | IType<{ type: IndexCommandType.index, id: string, content: string, keywords: string }>
15
- | IType<{ type: IndexCommandType.delete, id: string }>
13
+ | IType<{ type: IndexCommandType.index, items: Array<{id: string, vector: number[], keywords: string}> }>
14
+ | IType<{ type: IndexCommandType.delete, ids: string[] }>
16
15
  | IType<{ type: IndexCommandType.reset }>;
17
16
 
18
17
  export type IIndexApi = ISerial<IIndexCommand, void>;
19
18
 
20
- export function IndexApi({index, embed}: { index: LocalIndex, embed: IEmbed }): IIndexApi {
19
+ export function IndexApi({index}: { index: LocalIndex }): IIndexApi {
21
20
  return Serial<IIndexCommand, void>(async msg => {
22
21
  switch (msg.type) {
23
22
  case IndexCommandType.delete: {
24
- await index.deleteItem(msg.id);
23
+ await index.beginUpdate();
24
+ try {
25
+ for (const id of msg.ids) {
26
+ await index.deleteItem(id);
27
+ }
28
+ } finally {
29
+ await index.endUpdate();
30
+ }
25
31
  break;
26
32
  }
27
33
  case IndexCommandType.index: {
28
- const vector = await embed(msg.content);
29
- await index.upsertItem({
30
- id: msg.id,
31
- vector,
32
- metadata: {id: msg.id, keywords: msg.keywords},
33
- });
34
+ try {
35
+ await index.batchInsertItems(msg.items.map(item => ({
36
+ id: item.id,
37
+ vector: item.vector,
38
+ metadata: {id: item.id, keywords: item.keywords},
39
+ })));
40
+ } catch (error) {
41
+ const errorMessage = String((error as Error)?.message ?? error).toLowerCase();
42
+ const isDuplicateIdError =
43
+ errorMessage.includes("already exists")
44
+ || errorMessage.includes("duplicate");
45
+ if (!isDuplicateIdError) {
46
+ throw error;
47
+ }
48
+
49
+ await index.beginUpdate();
50
+ try {
51
+ for (const item of msg.items) {
52
+ await index.upsertItem({
53
+ id: item.id,
54
+ vector: item.vector,
55
+ metadata: {id: item.id, keywords: item.keywords},
56
+ });
57
+ }
58
+ } finally {
59
+ await index.endUpdate();
60
+ }
61
+ }
34
62
  break;
35
63
  }
36
64
  case IndexCommandType.reset: {
@@ -0,0 +1,50 @@
1
/** A document prepared for BM25 scoring: an identifier plus its token list. */
export type IBm25Doc = { id: string; tokens: string[] };

/** BM25 scorer over a fixed document set. */
export type IBm25 = {
    score(queryTokens: string[]): Array<{ id: string; score: number }>;
};

/**
 * Builds a BM25 scorer for `docs`.
 *
 * @param docs Documents to score against.
 * @param k1   Term-frequency saturation parameter.
 * @param b    Length-normalisation parameter.
 * @returns A scorer whose `score` returns the documents with a positive score, best first.
 */
export function Bm25({docs, k1 = 1.5, b = 0.75}: {
    docs: IBm25Doc[];
    k1?: number;
    b?: number;
}): IBm25 {
    const docCount = docs.length;
    const documentFrequency = new Map<string, number>();
    const termCountsPerDoc: Array<Map<string, number>> = [];
    let tokenTotal = 0;

    docs.forEach(doc => {
        const termCounts = new Map<string, number>();
        doc.tokens.forEach(token => {
            termCounts.set(token, (termCounts.get(token) ?? 0) + 1);
        });
        termCountsPerDoc.push(termCounts);
        tokenTotal += doc.tokens.length;
        // Each distinct term of the document contributes once to its document frequency.
        termCounts.forEach((_count, token) => {
            documentFrequency.set(token, (documentFrequency.get(token) ?? 0) + 1);
        });
    });
    const averageLength = tokenTotal / Math.max(1, docCount);

    function score(queryTokens: string[]): Array<{ id: string; score: number }> {
        const hits: Array<{ id: string; score: number }> = [];
        // Repeated query terms count once.
        const terms = Array.from(new Set(queryTokens));

        for (let docIndex = 0; docIndex < docs.length; docIndex++) {
            const termCounts = termCountsPerDoc[docIndex];
            const docLength = docs[docIndex].tokens.length;
            let total = 0;

            for (const term of terms) {
                const frequency = termCounts.get(term);
                if (frequency) {
                    const containing = documentFrequency.get(term)!;
                    const idf = Math.log(((docCount - containing + 0.5) / (containing + 0.5)) + 1);
                    total += idf * (frequency * (k1 + 1)) / (frequency + k1 * (1 - b + b * docLength / averageLength));
                }
            }

            if (total > 0) {
                hits.push({id: docs[docIndex].id, score: total});
            }
        }

        return hits.sort((left, right) => right.score - left.score);
    }

    return {score};
}
47
+
48
/**
 * Splits text into lower-cased tokens for BM25 scoring.
 *
 * Tokens are maximal runs of Unicode letters, combining marks and digits; everything else
 * (whitespace, punctuation, `_`, ...) separates tokens. Tokens shorter than two UTF-16 code
 * units are dropped. For pure-ASCII input this yields the same tokens as an ASCII `[a-z0-9]`
 * split, while non-ASCII words (e.g. "café", "größe") are kept instead of being discarded.
 *
 * @param text Text to tokenize.
 * @returns Tokens in order of appearance.
 */
export function tokenizeForBm25(text: string): string[] {
    return text
        .toLowerCase()
        .split(/[^\p{L}\p{M}\p{N}]+/u)
        .filter(t => t.length >= 2);
}
@@ -0,0 +1,48 @@
1
+ import {LocalIndex, VirtualFileStorage} from "vectra";
2
+
3
+ export type ISnippetMeta = {
4
+ fileId: string;
5
+ startLine: number;
6
+ endLine: number;
7
+ snippet: string;
8
+ };
9
+
10
+ export type IInMemoryIndex = {
11
+ upsertItem(id: string, vector: number[], meta: ISnippetMeta): Promise<void>;
12
+ query(vector: number[], text: string, limit: number): Promise<Array<{score: number; meta: ISnippetMeta}>>;
13
+ dispose(): Promise<void>;
14
+ };
15
+
16
/**
 * Creates a vector index backed by a `VirtualFileStorage` instead of a folder on disk.
 * The underlying index is created lazily, on the first `upsertItem` or `query` call.
 *
 * @param _opts Accepted but not read by this implementation.
 * @returns Handle with `upsertItem`, `query` and a no-op `dispose`.
 */
export function InMemoryIndex(_opts: {dimensions: number} = {dimensions: 0}): IInMemoryIndex {
    const index = new LocalIndex("mem://idx", undefined, new VirtualFileStorage());
    let isReady = false;

    /** Creates the index on first use; later calls return immediately. */
    const ensureCreated = async (): Promise<void> => {
        if (!isReady) {
            const exists = await index.isIndexCreated();
            if (!exists) {
                await index.createIndex();
            }
            isReady = true;
        }
    };

    const upsertItem: IInMemoryIndex["upsertItem"] = async (id, vector, meta) => {
        await ensureCreated();
        const metadata = meta as unknown as Record<string, any>;
        await index.upsertItem({id, vector, metadata});
    };

    const query: IInMemoryIndex["query"] = async (vector, text, limit) => {
        await ensureCreated();
        const hits = await index.queryItems(vector, text, limit);
        return hits.map(hit => {
            const meta = hit.item.metadata as unknown as ISnippetMeta;
            return {score: hit.score, meta};
        });
    };

    const dispose: IInMemoryIndex["dispose"] = async () => {
        // No-op: VirtualFileStorage is GC'd with the instance
    };

    return {upsertItem, query, dispose};
}
@@ -0,0 +1,148 @@
1
+ import {readFile, stat} from "fs/promises";
2
+ import {IEmbed} from "../llm/embed.js";
3
+ import {IExtractKeywords} from "../keywords/extractKeywords.js";
4
+ import {ICleanUpKeywords} from "../keywords/cleanUpKeywords.js";
5
+ import {IIndexRecord} from "../../features/searchIndex.js";
6
+ import {IInMemoryIndex, InMemoryIndex} from "./inMemoryIndex.js";
7
+ import {IWindow, windowsOf} from "./windowsOf.js";
8
+ import {Bm25, IBm25Doc, tokenizeForBm25} from "./bm25.js";
9
+ import {from} from "../../packages/streamx/src/from.js";
10
+ import {filter} from "../../packages/streamx/src/filter.js";
11
+ import {map} from "../../packages/streamx/src/map.js";
12
+ import {flatMap} from "../../packages/streamx/src/flatMap.js";
13
+ import {tap} from "../../packages/streamx/src/tap.js";
14
+ import {scaleSync} from "../../packages/streamx/src/scaleSync.js";
15
+ import {run} from "../../packages/streamx/src/index.js";
16
+ import {DEFAULT_LOCATE_BATCH_SIZE} from "../config/DEFAULT_LOCATE_BATCH_SIZE";
17
+
18
+ export type ILocateInFile = (
19
+ query: string,
20
+ queryVector: number[],
21
+ candidates: IIndexRecord[],
22
+ limit: number,
23
+ ) => Promise<IIndexRecord[]>;
24
+
25
/**
 * Creates a function that narrows file-level search candidates down to the best-matching
 * line windows inside those files.
 *
 * For every candidate file that exists, is a regular file and is at most `maxFileBytes`
 * large, the file is split into line windows (`windowsOf`). Each window is embedded (via its
 * cleaned-up keywords, or the first 200 characters of the snippet when no keywords remain)
 * and stored in a throw-away in-memory vector index; its tokens also go into a BM25 corpus.
 * A window's final score is the larger of its max-normalised vector score and
 * max-normalised BM25 score. Windows overlapping an already selected window of the same file
 * are skipped.
 *
 * @param embed              Turns text into an embedding vector.
 * @param extractKeywords    Extracts keywords from a window snippet.
 * @param cleanUpKeywords    Post-processes the extracted keywords.
 * @param windowLines        Window size in lines, forwarded to `windowsOf`.
 * @param maxSnippetsPerFile Accepted for compatibility; not read by this implementation.
 * @param maxFileBytes       Files larger than this are skipped.
 * @param embedConcurrency   Passed to `scaleSync` for both the file-read and the embed stage.
 * @returns Function resolving to at most `limit` records (`[]` when `limit <= 0`).
 */
export function LocateInFile({
    embed,
    extractKeywords,
    cleanUpKeywords,
    windowLines = 15,
    maxFileBytes = 10_000_000,
    embedConcurrency = DEFAULT_LOCATE_BATCH_SIZE,
}: {
    embed: IEmbed;
    extractKeywords: IExtractKeywords;
    cleanUpKeywords: ICleanUpKeywords;
    windowLines?: number;
    maxSnippetsPerFile?: number;
    maxFileBytes?: number;
    embedConcurrency?: number;
}): ILocateInFile {
    type IWindowWithKeywords = IWindow & { keywords: string };
    type IWindowWithVector = IWindowWithKeywords & { vector: number[] };

    return async function locateInFile(query, queryVector, candidates, limit) {
        // A non-positive limit can never produce results. Bail out before any file I/O or
        // embedding; without this the selection loop below would still return one hit.
        if (limit <= 0) return [];

        const memIndex: IInMemoryIndex = InMemoryIndex({dimensions: queryVector.length});

        try {
            const ids = candidates.map(c => c.id);
            const bm25Docs: IBm25Doc[] = [];
            const metaById = new Map<string, { fileId: string; startLine: number; endLine: number; snippet: string }>();

            const SCALE_FILE_READS = embedConcurrency;
            const windows = from<string>(ids)
                // Keep only existing regular files within the size budget; stat errors mean "skip".
                .pipe(filter(async (id: string) => {
                    try {
                        const s = await stat(id);
                        if (!s.isFile()) return false;
                        if (s.size > maxFileBytes) return false;
                        return true;
                    } catch {
                        return false;
                    }
                }))
                .pipe(scaleSync(SCALE_FILE_READS, async (id: string) => {
                    const text = await readFile(id, "utf8");
                    return {id, text};
                }))
                .pipe(flatMap(({id, text}: { id: string, text: string }): IWindow[] => {
                    return windowsOf({text, id, windowLines});
                }));

            const withVectors = from<IWindow>(windows)
                .pipe(map<IWindow, IWindowWithKeywords>(w => {
                    const kw = cleanUpKeywords(extractKeywords(w.snippet)).join(", ");
                    // Fall back to the head of the snippet when no keywords survive clean-up.
                    return {...w, keywords: kw || w.snippet.slice(0, 200)};
                }))
                .pipe(scaleSync<IWindowWithKeywords, IWindowWithVector>(embedConcurrency, async (w) => {
                    const vector = await embed(w.keywords);
                    return {...w, vector};
                }))
                .pipe(tap(async (w: IWindowWithVector) => {
                    const id = `${w.fileId}:${w.startLine}:${w.endLine}`;
                    // The same window can arrive more than once (repeated candidate ids or
                    // identical windows). Register it only once so the BM25 corpus holds no
                    // duplicate documents, which would inflate document frequencies.
                    if (metaById.has(id)) return;
                    await memIndex.upsertItem(
                        id,
                        w.vector,
                        {fileId: w.fileId, startLine: w.startLine, endLine: w.endLine, snippet: w.snippet},
                    );
                    bm25Docs.push({id, tokens: tokenizeForBm25(w.snippet)});
                    metaById.set(id, {
                        fileId: w.fileId,
                        startLine: w.startLine,
                        endLine: w.endLine,
                        snippet: w.snippet
                    });
                }));

            await run(withVectors);

            // Over-fetch from both rankers so overlap removal still leaves enough hits.
            const poolSize = Math.max(limit * 8, 40);
            const vecHits = await memIndex.query(queryVector, query, poolSize);

            const bm25 = Bm25({docs: bm25Docs});
            const bm25Hits = bm25.score(tokenizeForBm25(query)).slice(0, poolSize);

            const cosById = new Map<string, number>();
            vecHits.forEach(h => {
                const id = `${h.meta.fileId}:${h.meta.startLine}:${h.meta.endLine}`;
                cosById.set(id, h.score);
            });
            const bmById = new Map<string, number>();
            bm25Hits.forEach(h => bmById.set(h.id, h.score));

            // The 1e-9 floor keeps the divisions below well-defined when a ranker has no hits.
            const cosMax = Math.max(...vecHits.map(h => h.score), 1e-9);
            const bmMax = Math.max(...bm25Hits.map(h => h.score), 1e-9);

            const allIds = new Set<string>([...cosById.keys(), ...bmById.keys()]);
            const ranked = [...allIds]
                .map(id => {
                    const cosNorm = (cosById.get(id) ?? 0) / cosMax;
                    const bmNorm = (bmById.get(id) ?? 0) / bmMax;
                    return {id, score: Math.max(cosNorm, bmNorm), meta: metaById.get(id)!};
                })
                .filter(r => r.meta)
                .sort((a, b) => b.score - a.score);

            // Greedy selection: best first, skipping windows that overlap an already kept
            // window of the same file.
            const kept: typeof ranked = [];
            for (const h of ranked) {
                const overlaps = kept.some(k =>
                    k.meta.fileId === h.meta.fileId &&
                    h.meta.startLine <= k.meta.endLine &&
                    h.meta.endLine >= k.meta.startLine
                );
                if (!overlaps) kept.push(h);
                if (kept.length >= limit) break;
            }

            return kept.map(h => ({
                score: h.score,
                id: h.meta.fileId,
                startLine: h.meta.startLine,
                endLine: h.meta.endLine,
                snippet: h.meta.snippet,
            }));
        } finally {
            await memIndex.dispose();
        }
    };
}
@@ -0,0 +1,29 @@
1
/** A contiguous range of lines from one file, with 1-based inclusive line numbers. */
export type IWindow = {
    fileId: string;
    startLine: number;
    endLine: number;
    snippet: string;
};

/**
 * Splits `text` into overlapping line windows at two scales: half of `windowLines` and
 * `windowLines` itself. Within a scale, consecutive windows start half a window apart.
 *
 * Each line range is emitted at most once: when both scales produce the same range (short
 * files, or equal scale sizes) only the first occurrence is kept. `windowLines` is floored and
 * clamped to at least 1, so line ranges are always whole numbers with `startLine <= endLine`.
 *
 * @param text        File content; lines are separated by `"\n"`.
 * @param id          Identifier stored as `fileId` on every window.
 * @param windowLines Size of the larger window, in lines.
 * @returns Windows ordered by scale (smaller first), then by start line.
 */
export function windowsOf({text, id, windowLines}: {
    text: string;
    id: string;
    windowLines: number;
}): IWindow[] {
    const lines = text.split("\n");
    const windows: IWindow[] = [];

    // Guard against zero, negative, fractional or non-finite sizes.
    const fullSize = Number.isFinite(windowLines) ? Math.max(1, Math.floor(windowLines)) : 1;
    const halfSize = Math.max(1, Math.floor(fullSize / 2));
    // A Set collapses the two scales into one when they are equal (fullSize === 1).
    const scales = [...new Set([halfSize, fullSize])];

    const seenRanges = new Set<string>();
    for (const size of scales) {
        const step = Math.max(1, Math.floor(size / 2));
        for (let i = 0; i < lines.length; i += step) {
            const startLine = i + 1;
            const endLine = Math.min(i + size, lines.length);

            const rangeKey = `${startLine}:${endLine}`;
            if (seenRanges.has(rangeKey)) continue;
            seenRanges.add(rangeKey);

            windows.push({
                fileId: id,
                startLine,
                endLine,
                snippet: lines.slice(i, i + size).join("\n"),
            });
        }
    }
    return windows;
}
@@ -1,8 +1,8 @@
1
- import {readFile, stat, watch} from "fs/promises";
1
+ import {stat, watch} from "fs/promises";
2
2
  import {join, relative} from "path";
3
- import ignore from "ignore";
4
3
  import {ILogger} from "./logger.js";
5
4
  import {IType} from "./IType.js";
5
+ import {loadIgnoreChain} from "./ignore/loadIgnoreChain.js";
6
6
 
7
7
  export enum FileEventType {
8
8
  index = 'index',
@@ -19,18 +19,6 @@ export type IWatchFiles = (inputs: string[]) => IWatchFilesResult;
19
19
 
20
20
  export function WatchFiles({cwd, log, ignoreFiles = []}: { cwd: string, log: ILogger, ignoreFiles?: string[] }): IWatchFiles {
21
21
 
22
- async function loadGitignore(dir: string): Promise<ReturnType<typeof ignore>> {
23
- const ig = ignore();
24
- ig.add(".*");
25
- try {
26
- const content = await readFile(join(dir, ".gitignore"), "utf8");
27
- ig.add(content);
28
- } catch {
29
- }
30
- for (const pattern of ignoreFiles) ig.add(pattern);
31
- return ig;
32
- }
33
-
34
22
  return function watchFiles(inputs) {
35
23
  const pending = new Map<string, IFileEvent>();
36
24
  let notify: (() => void) | null = null;
@@ -38,7 +26,6 @@ export function WatchFiles({cwd, log, ignoreFiles = []}: { cwd: string, log: ILo
38
26
  const abortControllers: AbortController[] = [];
39
27
 
40
28
  async function startWatching(dir: string) {
41
- const ig = await loadGitignore(dir);
42
29
  const ac = new AbortController();
43
30
  abortControllers.push(ac);
44
31
 
@@ -48,9 +35,11 @@ export function WatchFiles({cwd, log, ignoreFiles = []}: { cwd: string, log: ILo
48
35
  if (!event.filename) continue;
49
36
 
50
37
  const rel = relative(cwd, join(dir, event.filename));
51
- if (ig.ignores(rel)) continue;
52
38
  if (rel.endsWith("~")) continue;
53
39
 
40
+ const ig = await loadIgnoreChain(cwd, rel, ignoreFiles);
41
+ if (ig.ignores(rel)) continue;
42
+
54
43
  try {
55
44
  await stat(join(dir, event.filename));
56
45
  pending.set(rel, {type: FileEventType.index, path: rel});
@@ -1,16 +1,23 @@
1
1
  import {IndexCommandType, IIndexApi} from "../componets/index/indexApi.js";
2
2
  import {IExtractKeywords} from "../componets/keywords/extractKeywords.js";
3
3
  import {ICleanUpKeywords} from "../componets/keywords/cleanUpKeywords.js";
4
+ import {IEmbed} from "../componets/llm/embed.js";
4
5
 
5
- export type IIndexContent = (id: string, content: string) => Promise<void>;
6
+ export type IIndexContent = (items: Array<{id: string, content: string}>) => Promise<void>;
6
7
 
7
- export function IndexContent({extractKeywords, cleanUpKeywords, indexApi}: {
8
+ export function IndexContent({extractKeywords, cleanUpKeywords, embed, indexApi}: {
8
9
  extractKeywords: IExtractKeywords,
9
10
  cleanUpKeywords: ICleanUpKeywords,
11
+ embed: IEmbed,
10
12
  indexApi: IIndexApi,
11
13
  }): IIndexContent {
12
- return async function indexContent(id, content) {
13
- const keywords = cleanUpKeywords(extractKeywords(content)).join(", ");
14
- await indexApi({type: IndexCommandType.index, id, content: keywords, keywords});
14
+ return async function indexContent(items) {
15
+ const indexItems = await Promise.all(items.map(async (item) => {
16
+ const keywords = cleanUpKeywords(extractKeywords(item.content)).join(", ");
17
+ const vector = await embed(keywords);
18
+ return {id: item.id, vector, keywords};
19
+ }));
20
+
21
+ await indexApi({type: IndexCommandType.index, items: indexItems});
15
22
  }
16
23
  }