@winci/local-rag 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.claude-plugin/plugin.json +24 -0
  2. package/.mcp.json +11 -0
  3. package/LICENSE +21 -0
  4. package/README.md +567 -0
  5. package/hooks/hooks.json +25 -0
  6. package/hooks/scripts/reindex-file.sh +19 -0
  7. package/hooks/scripts/session-start.sh +11 -0
  8. package/package.json +52 -0
  9. package/skills/local-rag/SKILL.md +42 -0
  10. package/src/cli/commands/analytics.ts +58 -0
  11. package/src/cli/commands/benchmark.ts +30 -0
  12. package/src/cli/commands/checkpoint.ts +85 -0
  13. package/src/cli/commands/conversation.ts +102 -0
  14. package/src/cli/commands/demo.ts +119 -0
  15. package/src/cli/commands/eval.ts +31 -0
  16. package/src/cli/commands/index-cmd.ts +26 -0
  17. package/src/cli/commands/init.ts +35 -0
  18. package/src/cli/commands/map.ts +21 -0
  19. package/src/cli/commands/remove.ts +15 -0
  20. package/src/cli/commands/search-cmd.ts +59 -0
  21. package/src/cli/commands/serve.ts +5 -0
  22. package/src/cli/commands/status.ts +13 -0
  23. package/src/cli/index.ts +117 -0
  24. package/src/cli/progress.ts +21 -0
  25. package/src/cli/setup.ts +192 -0
  26. package/src/config/index.ts +101 -0
  27. package/src/conversation/indexer.ts +147 -0
  28. package/src/conversation/parser.ts +323 -0
  29. package/src/db/analytics.ts +116 -0
  30. package/src/db/annotations.ts +161 -0
  31. package/src/db/checkpoints.ts +166 -0
  32. package/src/db/conversation.ts +241 -0
  33. package/src/db/files.ts +146 -0
  34. package/src/db/graph.ts +250 -0
  35. package/src/db/index.ts +468 -0
  36. package/src/db/search.ts +244 -0
  37. package/src/db/types.ts +85 -0
  38. package/src/embeddings/embed.ts +73 -0
  39. package/src/graph/resolver.ts +305 -0
  40. package/src/indexing/chunker.ts +523 -0
  41. package/src/indexing/indexer.ts +263 -0
  42. package/src/indexing/parse.ts +99 -0
  43. package/src/indexing/watcher.ts +84 -0
  44. package/src/main.ts +8 -0
  45. package/src/search/benchmark.ts +139 -0
  46. package/src/search/eval.ts +171 -0
  47. package/src/search/hybrid.ts +194 -0
  48. package/src/search/reranker.ts +99 -0
  49. package/src/search/usages.ts +27 -0
  50. package/src/server/index.ts +126 -0
  51. package/src/tools/analytics-tools.ts +58 -0
  52. package/src/tools/annotation-tools.ts +89 -0
  53. package/src/tools/checkpoint-tools.ts +147 -0
  54. package/src/tools/conversation-tools.ts +86 -0
  55. package/src/tools/git-tools.ts +103 -0
  56. package/src/tools/graph-tools.ts +163 -0
  57. package/src/tools/index-tools.ts +91 -0
  58. package/src/tools/index.ts +33 -0
  59. package/src/tools/search.ts +238 -0
  60. package/src/types.ts +9 -0
  61. package/src/utils/log.ts +39 -0
@@ -0,0 +1,523 @@
1
+ import { chunk as astChunk } from "code-chunk";
2
+ import { log } from "../utils/log";
3
+
4
/** A single import captured from an AST-chunked code file. */
export interface ChunkImport {
  /** Imported binding name. */
  name: string;
  /** Module specifier the name is imported from. */
  source: string;
}

/** An exported/declared entity captured from an AST-chunked code file. */
export interface ChunkExport {
  /** Entity name. */
  name: string;
  /** Entity kind as reported by code-chunk (e.g. "function", "class"). */
  type: string;
}

/** One unit of text produced by chunkText, ready for embedding. */
export interface Chunk {
  /** The chunk's raw text. */
  text: string;
  /** Zero-based position of the chunk within its file. */
  index: number;
  /** 1-based first line in the original file; unset when the chunk text is not a verbatim substring of the source. */
  startLine?: number;
  /** 1-based last line in the original file; unset whenever startLine is unset. */
  endLine?: number;
  /** Imports seen in this chunk (populated on the AST-chunked path only). */
  imports?: ChunkImport[];
  /** Exported/declared entities in this chunk (AST-chunked path only). */
  exports?: ChunkExport[];
}
22
+
23
// Default sliding-window parameters for size-based splitting.
const DEFAULT_CHUNK_SIZE = 512; // in characters
const DEFAULT_CHUNK_OVERLAP = 50; // characters shared between adjacent size-split chunks

// Extensions that code-chunk supports via tree-sitter
const AST_SUPPORTED = new Set([
  ".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java",
]);

// Code-like extensions handled by blank-line heuristic splitting.
// The ".jenkinsfile"/".gemfile"/... entries are virtual extensions
// assigned to basename-detected files (see KNOWN_EXTENSIONS below).
const HEURISTIC_CODE = new Set([
  ".c", ".cpp", ".h", ".hpp", ".rb", ".swift",
  ".sh", ".bash", ".zsh", ".fish",
  ".tf", ".proto", ".graphql", ".gql",
  ".mod", ".xml",
  ".jenkinsfile", ".vagrantfile", ".gemfile", ".rakefile", ".brewfile", ".procfile",
]);
39
+
40
/**
 * Every extension (real or virtual) that chunkText knows how to handle.
 * Files with extensions outside this set are skipped by the indexer so
 * binaries and other unrecognised formats never enter the DB.
 *
 * Keep in sync with AST_SUPPORTED, HEURISTIC_CODE and the extension
 * dispatch inside _chunkText.
 */
export const KNOWN_EXTENSIONS = new Set([
  // Markdown
  ".md", ".mdx", ".markdown",
  // Plain text
  ".txt",
  // AST-aware code
  ".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java",
  // Heuristic code (blank-line blocks)
  ".c", ".cpp", ".h", ".hpp", ".rb", ".swift",
  ".sh", ".bash", ".zsh", ".fish",
  ".tf", ".proto", ".graphql", ".gql",
  ".mod",
  ".xml",
  // Virtual extensions for basename-detected files
  ".makefile", ".dockerfile", ".jenkinsfile",
  ".vagrantfile", ".gemfile", ".rakefile", ".brewfile", ".procfile",
  // Structured data
  ".yaml", ".yml", ".json", ".toml",
  // Query / schema languages
  ".sql",
  // API collections
  ".bru",
  // Stylesheets
  ".css", ".scss", ".less",
]);
70
+
71
+ /**
72
+ * Split text into chunks. Strategy depends on content type:
73
+ * - Code (supported languages): AST-aware chunking via tree-sitter
74
+ * - Markdown: split on headings first, then by size
75
+ * - Code (unsupported): split on blank-line-separated blocks, then by size
76
+ * - Other: split on paragraphs, then by size
77
+ */
78
+ export async function chunkText(
79
+ text: string,
80
+ extension: string,
81
+ chunkSize = DEFAULT_CHUNK_SIZE,
82
+ chunkOverlap = DEFAULT_CHUNK_OVERLAP,
83
+ filePath?: string
84
+ ): Promise<Chunk[]> {
85
+ const chunks = await _chunkText(text, extension, chunkSize, chunkOverlap, filePath);
86
+ assignLineNumbers(chunks, text);
87
+ return chunks;
88
+ }
89
+
90
/**
 * Strategy dispatcher behind chunkText: picks a splitter based on the
 * extension, then enforces the size budget on each resulting section.
 * Line numbers are assigned afterwards by chunkText.
 */
async function _chunkText(
  text: string,
  extension: string,
  chunkSize = DEFAULT_CHUNK_SIZE,
  chunkOverlap = DEFAULT_CHUNK_OVERLAP,
  filePath?: string
): Promise<Chunk[]> {
  // Try AST-aware chunking for supported code files (even small ones, for import/export extraction)
  if (AST_SUPPORTED.has(extension)) {
    try {
      // code-chunk picks its grammar from the file name; synthesise one when absent.
      const astChunks = await astChunk(filePath || `file${extension}`, text, {
        maxChunkSize: chunkSize,
      });
      if (astChunks.length > 0) {
        return astChunks.map((c, i) => ({
          text: c.text,
          index: i,
          imports: c.context.imports.map((im) => ({ name: im.name, source: im.source })),
          // Keep only declaration-like entities as "exports".
          exports: c.context.entities
            .filter((e) => e.type === "export" || e.type === "function" || e.type === "class" || e.type === "interface" || e.type === "type" || e.type === "enum")
            .map((e) => ({ name: e.name, type: e.type })),
        }));
      }
    } catch (err) {
      // Fall through to the heuristic paths below on any parser failure.
      log.debug(`AST chunking failed for ${filePath || extension}, using heuristic: ${err instanceof Error ? err.message : err}`, "chunker");
    }
  }

  // Small files (and AST fallbacks that fit the budget) become one chunk.
  if (text.length <= chunkSize) {
    return [{ text, index: 0 }];
  }

  const isMarkdown = [".md", ".mdx", ".markdown"].includes(extension);
  // Code-like files: split on blank-line-separated blocks as a heuristic.
  // Includes AST-supported languages plus shell, HCL, proto, GraphQL, etc.
  const isCode = AST_SUPPORTED.has(extension) || HEURISTIC_CODE.has(extension);

  let sections: string[];

  if (isMarkdown) {
    sections = splitMarkdown(text);
  } else if (extension === ".makefile") {
    sections = splitMakefile(text);
  } else if (extension === ".dockerfile") {
    sections = splitDockerfile(text);
  } else if (extension === ".yaml" || extension === ".yml") {
    sections = splitYAML(text);
  } else if (extension === ".json") {
    sections = splitJSON(text);
  } else if (extension === ".toml") {
    sections = splitTOML(text);
  } else if (extension === ".bru") {
    sections = splitBru(text);
  } else if (extension === ".sql") {
    sections = splitSQL(text);
  } else if (extension === ".css" || extension === ".scss" || extension === ".less") {
    sections = splitCSS(text);
  } else if (isCode) {
    sections = splitCode(text);
  } else {
    sections = splitParagraphs(text);
  }

  // Further split any section that exceeds chunkSize
  const chunks: Chunk[] = [];
  let index = 0;

  for (const section of sections) {
    if (section.length <= chunkSize) {
      chunks.push({ text: section, index: index++ });
    } else {
      const subChunks = splitBySize(section, chunkSize, chunkOverlap);
      for (const sub of subChunks) {
        chunks.push({ text: sub, index: index++ });
      }
    }
  }

  return chunks;
}
170
+
171
+ /**
172
+ * Assign startLine/endLine to each chunk by locating the chunk text in the
173
+ * original file source. Uses indexOf with a forward cursor so overlapping or
174
+ * repeated text still resolves in order. Chunks whose text is not a verbatim
175
+ * substring (e.g. JSON-reformatted chunks) are left without line numbers.
176
+ */
177
+ function assignLineNumbers(chunks: Chunk[], fullText: string): void {
178
+ const lineOffsets = [0];
179
+ for (let i = 0; i < fullText.length; i++) {
180
+ if (fullText[i] === "\n") lineOffsets.push(i + 1);
181
+ }
182
+
183
+ function offsetToLine(offset: number): number {
184
+ let lo = 0, hi = lineOffsets.length - 1;
185
+ while (lo < hi) {
186
+ const mid = (lo + hi + 1) >> 1;
187
+ if (lineOffsets[mid] <= offset) lo = mid;
188
+ else hi = mid - 1;
189
+ }
190
+ return lo + 1; // 1-based
191
+ }
192
+
193
+ let cursor = 0;
194
+ for (const chunk of chunks) {
195
+ const idx = fullText.indexOf(chunk.text, cursor);
196
+ if (idx >= 0) {
197
+ chunk.startLine = offsetToLine(idx);
198
+ chunk.endLine = offsetToLine(idx + Math.max(chunk.text.length - 1, 0));
199
+ cursor = idx + chunk.text.length;
200
+ }
201
+ }
202
+ }
203
+
204
+ function splitMarkdown(text: string): string[] {
205
+ // Split on heading boundaries (## or ###)
206
+ const parts = text.split(/(?=^#{1,3}\s)/m);
207
+ return parts.filter((p) => p.trim().length > 0);
208
+ }
209
+
210
+ function splitDockerfile(text: string): string[] {
211
+ // Each FROM instruction starts a new build stage — use that as the primary
212
+ // boundary. Within a single-stage file this produces one section, which the
213
+ // size-based fallback will further split if needed.
214
+ const lines = text.split("\n");
215
+ const sections: string[] = [];
216
+ let current: string[] = [];
217
+
218
+ for (const line of lines) {
219
+ if (/^FROM\s+/i.test(line) && current.length > 0) {
220
+ const section = current.join("\n").trim();
221
+ if (section) sections.push(section);
222
+ current = [line];
223
+ } else {
224
+ current.push(line);
225
+ }
226
+ }
227
+
228
+ if (current.length > 0) {
229
+ const section = current.join("\n").trim();
230
+ if (section) sections.push(section);
231
+ }
232
+
233
+ return mergeTinyParts(sections.length > 0 ? sections : [text], 100);
234
+ }
235
+
236
+ function splitBru(text: string): string[] {
237
+ // Each top-level block in the Bru Markup Language starts at column 0 with
238
+ // `keyword {` (keyword may contain colons/hyphens, e.g. `body:json`, `vars:pre-request`).
239
+ const lines = text.split("\n");
240
+ const sections: string[] = [];
241
+ let current: string[] = [];
242
+
243
+ for (const line of lines) {
244
+ if (/^[a-zA-Z][a-zA-Z0-9:_-]*\s*\{/.test(line) && current.length > 0) {
245
+ const section = current.join("\n").trim();
246
+ if (section) sections.push(section);
247
+ current = [line];
248
+ } else {
249
+ current.push(line);
250
+ }
251
+ }
252
+
253
+ if (current.length > 0) {
254
+ const section = current.join("\n").trim();
255
+ if (section) sections.push(section);
256
+ }
257
+
258
+ return mergeTinyParts(sections, 100);
259
+ }
260
+
261
+ function splitTOML(text: string): string[] {
262
+ // Split on [section] and [[array-of-tables]] headers.
263
+ const lines = text.split("\n");
264
+ const sections: string[] = [];
265
+ let current: string[] = [];
266
+
267
+ for (const line of lines) {
268
+ if (/^\s*\[\[?[\w.]/.test(line) && current.length > 0) {
269
+ const section = current.join("\n").trim();
270
+ if (section) sections.push(section);
271
+ current = [line];
272
+ } else {
273
+ current.push(line);
274
+ }
275
+ }
276
+
277
+ if (current.length > 0) {
278
+ const section = current.join("\n").trim();
279
+ if (section) sections.push(section);
280
+ }
281
+
282
+ return mergeTinyParts(sections, 100);
283
+ }
284
+
285
+ function splitSQL(text: string): string[] {
286
+ // Split on semicolons that terminate statements. Preserves the semicolon
287
+ // so each chunk reads as a complete statement.
288
+ const statements = text
289
+ .split(/(?<=;)\s*\n/)
290
+ .map((s) => s.trim())
291
+ .filter((s) => s.length > 0);
292
+
293
+ return mergeTinyParts(statements, 100);
294
+ }
295
+
296
+ function splitMakefile(text: string): string[] {
297
+ // Each Makefile target (and its recipe) becomes its own chunk.
298
+ // A target line starts at column 0, is not a comment or blank, and has
299
+ // a colon that is NOT part of := or ::= (variable assignment operators).
300
+ const lines = text.split("\n");
301
+ const sections: string[] = [];
302
+ let current: string[] = [];
303
+
304
+ for (const line of lines) {
305
+ const isTarget =
306
+ line.length > 0 &&
307
+ !line.startsWith("\t") &&
308
+ !line.startsWith(" ") &&
309
+ !line.startsWith("#") &&
310
+ /^[A-Za-z0-9_./%$()-][^=\n]*:(?!=)/.test(line);
311
+
312
+ if (isTarget && current.length > 0) {
313
+ const section = current.join("\n").trim();
314
+ if (section) sections.push(section);
315
+ current = [line];
316
+ } else {
317
+ current.push(line);
318
+ }
319
+ }
320
+
321
+ if (current.length > 0) {
322
+ const section = current.join("\n").trim();
323
+ if (section) sections.push(section);
324
+ }
325
+
326
+ return mergeTinyParts(sections, 100);
327
+ }
328
+
329
+ function splitYAML(text: string): string[] {
330
+ // Split on top-level YAML keys (lines at column 0 matching `key:`).
331
+ // For OpenAPI files (detected by a top-level `paths:` key), further split
332
+ // the paths section on individual path entries (e.g. ` /users:`).
333
+ const lines = text.split("\n");
334
+ const topSections: string[] = [];
335
+ let current: string[] = [];
336
+
337
+ for (const line of lines) {
338
+ const isTopKey =
339
+ !line.startsWith(" ") &&
340
+ !line.startsWith("\t") &&
341
+ !line.startsWith("#") &&
342
+ /^[a-zA-Z_$][a-zA-Z0-9_$-]*\s*:/.test(line);
343
+
344
+ if (isTopKey && current.length > 0) {
345
+ const section = current.join("\n").trim();
346
+ if (section) topSections.push(section);
347
+ current = [line];
348
+ } else {
349
+ current.push(line);
350
+ }
351
+ }
352
+ if (current.length > 0) {
353
+ const section = current.join("\n").trim();
354
+ if (section) topSections.push(section);
355
+ }
356
+
357
+ // OpenAPI: further split the `paths:` section on individual path entries
358
+ const result: string[] = [];
359
+ for (const section of topSections) {
360
+ if (/^paths\s*:/.test(section)) {
361
+ result.push(...splitOpenAPIPathsYAML(section));
362
+ } else {
363
+ result.push(section);
364
+ }
365
+ }
366
+
367
+ return mergeTinyParts(result, 100);
368
+ }
369
+
370
+ function splitOpenAPIPathsYAML(pathsSection: string): string[] {
371
+ // Each ` /path:` line starts a new chunk (2-space indent + leading slash).
372
+ const lines = pathsSection.split("\n");
373
+ const chunks: string[] = [];
374
+ let current: string[] = [lines[0]]; // "paths:" header line
375
+
376
+ for (let i = 1; i < lines.length; i++) {
377
+ const line = lines[i];
378
+ if (/^ \//.test(line) && current.length > 1) {
379
+ const section = current.join("\n").trim();
380
+ if (section && section !== "paths:") chunks.push(section);
381
+ current = ["paths:", line];
382
+ } else {
383
+ current.push(line);
384
+ }
385
+ }
386
+
387
+ if (current.length > 1) {
388
+ const section = current.join("\n").trim();
389
+ if (section && section !== "paths:") chunks.push(section);
390
+ }
391
+
392
+ return chunks.length > 0 ? chunks : [pathsSection];
393
+ }
394
+
395
+ // Above this size, skip JSON.parse to avoid OOM / long GC pauses.
396
+ // 500k-line files (~10-20MB) are fine; this guards against 100MB+ files.
397
+ const JSON_PARSE_LIMIT = 50 * 1024 * 1024;
398
+
399
+ function splitJSON(text: string): string[] {
400
+ if (text.length > JSON_PARSE_LIMIT) {
401
+ log.warn(
402
+ `JSON file too large for structural parsing (${(text.length / 1024 / 1024).toFixed(1)}MB), using line-based splitting`,
403
+ "chunker"
404
+ );
405
+ return splitParagraphs(text);
406
+ }
407
+
408
+ try {
409
+ const obj = JSON.parse(text);
410
+
411
+ if (typeof obj !== "object" || obj === null) {
412
+ return [text];
413
+ }
414
+
415
+ if (Array.isArray(obj)) {
416
+ // Chunk each array item individually
417
+ const items = obj.map(
418
+ (item, i) => `[${i}]: ${JSON.stringify(item, null, 2)}`
419
+ );
420
+ return mergeTinyParts(items, 100);
421
+ }
422
+
423
+ // Object: one chunk per top-level key.
424
+ // For OpenAPI, further split `paths` into individual path chunks.
425
+ const result: string[] = [];
426
+ for (const [key, value] of Object.entries(obj)) {
427
+ if (key === "paths" && typeof value === "object" && value !== null) {
428
+ for (const [path, ops] of Object.entries(value)) {
429
+ result.push(`paths["${path}"]: ${JSON.stringify(ops, null, 2)}`);
430
+ }
431
+ } else {
432
+ result.push(`"${key}": ${JSON.stringify(value, null, 2)}`);
433
+ }
434
+ }
435
+
436
+ return mergeTinyParts(result, 100);
437
+ } catch {
438
+ // Not valid JSON — fall back to paragraph splitting
439
+ return splitParagraphs(text);
440
+ }
441
+ }
442
+
443
+ function splitCode(text: string): string[] {
444
+ // Split on double newlines (function/class boundaries)
445
+ const parts = text.split(/\n\n+/);
446
+ return mergeTinyParts(parts, 100);
447
+ }
448
+
449
+ function splitParagraphs(text: string): string[] {
450
+ const parts = text.split(/\n\n+/);
451
+ return mergeTinyParts(parts, 100);
452
+ }
453
+
454
+ function splitCSS(text: string): string[] {
455
+ // Split on top-level brace blocks. Each rule (.foo {}), @media block,
456
+ // @keyframes, etc. ends when brace depth returns to 0.
457
+ const chunks: string[] = [];
458
+ let current: string[] = [];
459
+ let depth = 0;
460
+
461
+ for (const line of text.split("\n")) {
462
+ current.push(line);
463
+ for (const ch of line) {
464
+ if (ch === "{") depth++;
465
+ else if (ch === "}") depth--;
466
+ }
467
+ if (depth === 0 && current.some((l) => l.trim())) {
468
+ const block = current.join("\n").trim();
469
+ if (block) chunks.push(block);
470
+ current = [];
471
+ }
472
+ }
473
+
474
+ if (current.length > 0) {
475
+ const remaining = current.join("\n").trim();
476
+ if (remaining) chunks.push(remaining);
477
+ }
478
+
479
+ return mergeTinyParts(chunks, 100);
480
+ }
481
+
482
+ /**
483
+ * Merge consecutive tiny parts (< minSize chars) to avoid
484
+ * creating embeddings for near-empty chunks.
485
+ */
486
+ function mergeTinyParts(parts: string[], minSize: number): string[] {
487
+ const merged: string[] = [];
488
+ let buffer = "";
489
+
490
+ for (const part of parts) {
491
+ const trimmed = part.trim();
492
+ if (!trimmed) continue;
493
+
494
+ if (buffer.length + trimmed.length < minSize) {
495
+ buffer += (buffer ? "\n\n" : "") + trimmed;
496
+ } else {
497
+ if (buffer) merged.push(buffer);
498
+ buffer = trimmed;
499
+ }
500
+ }
501
+
502
+ if (buffer) merged.push(buffer);
503
+ return merged;
504
+ }
505
+
506
+ function splitBySize(
507
+ text: string,
508
+ chunkSize: number,
509
+ overlap: number
510
+ ): string[] {
511
+ const chunks: string[] = [];
512
+ let start = 0;
513
+
514
+ while (start < text.length) {
515
+ const end = Math.min(start + chunkSize, text.length);
516
+ chunks.push(text.slice(start, end));
517
+
518
+ if (end >= text.length) break;
519
+ start = end - overlap;
520
+ }
521
+
522
+ return chunks;
523
+ }