@winci/local-rag 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +24 -0
- package/.mcp.json +11 -0
- package/LICENSE +21 -0
- package/README.md +567 -0
- package/hooks/hooks.json +25 -0
- package/hooks/scripts/reindex-file.sh +19 -0
- package/hooks/scripts/session-start.sh +11 -0
- package/package.json +52 -0
- package/skills/local-rag/SKILL.md +42 -0
- package/src/cli/commands/analytics.ts +58 -0
- package/src/cli/commands/benchmark.ts +30 -0
- package/src/cli/commands/checkpoint.ts +85 -0
- package/src/cli/commands/conversation.ts +102 -0
- package/src/cli/commands/demo.ts +119 -0
- package/src/cli/commands/eval.ts +31 -0
- package/src/cli/commands/index-cmd.ts +26 -0
- package/src/cli/commands/init.ts +35 -0
- package/src/cli/commands/map.ts +21 -0
- package/src/cli/commands/remove.ts +15 -0
- package/src/cli/commands/search-cmd.ts +59 -0
- package/src/cli/commands/serve.ts +5 -0
- package/src/cli/commands/status.ts +13 -0
- package/src/cli/index.ts +117 -0
- package/src/cli/progress.ts +21 -0
- package/src/cli/setup.ts +192 -0
- package/src/config/index.ts +101 -0
- package/src/conversation/indexer.ts +147 -0
- package/src/conversation/parser.ts +323 -0
- package/src/db/analytics.ts +116 -0
- package/src/db/annotations.ts +161 -0
- package/src/db/checkpoints.ts +166 -0
- package/src/db/conversation.ts +241 -0
- package/src/db/files.ts +146 -0
- package/src/db/graph.ts +250 -0
- package/src/db/index.ts +468 -0
- package/src/db/search.ts +244 -0
- package/src/db/types.ts +85 -0
- package/src/embeddings/embed.ts +73 -0
- package/src/graph/resolver.ts +305 -0
- package/src/indexing/chunker.ts +523 -0
- package/src/indexing/indexer.ts +263 -0
- package/src/indexing/parse.ts +99 -0
- package/src/indexing/watcher.ts +84 -0
- package/src/main.ts +8 -0
- package/src/search/benchmark.ts +139 -0
- package/src/search/eval.ts +171 -0
- package/src/search/hybrid.ts +194 -0
- package/src/search/reranker.ts +99 -0
- package/src/search/usages.ts +27 -0
- package/src/server/index.ts +126 -0
- package/src/tools/analytics-tools.ts +58 -0
- package/src/tools/annotation-tools.ts +89 -0
- package/src/tools/checkpoint-tools.ts +147 -0
- package/src/tools/conversation-tools.ts +86 -0
- package/src/tools/git-tools.ts +103 -0
- package/src/tools/graph-tools.ts +163 -0
- package/src/tools/index-tools.ts +91 -0
- package/src/tools/index.ts +33 -0
- package/src/tools/search.ts +238 -0
- package/src/types.ts +9 -0
- package/src/utils/log.ts +39 -0
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
import { chunk as astChunk } from "code-chunk";
|
|
2
|
+
import { log } from "../utils/log";
|
|
3
|
+
|
|
4
|
+
/** A single named import recorded for a chunk (AST-aware chunking path only). */
export interface ChunkImport {
  /** Imported symbol name, as reported by the AST chunker. */
  name: string;
  /** Module specifier the symbol is imported from. */
  source: string;
}
|
|
8
|
+
|
|
9
|
+
/**
 * A symbol a chunk makes available. Populated from AST entities of type
 * "export", "function", "class", "interface", "type", or "enum".
 */
export interface ChunkExport {
  /** Declared symbol name. */
  name: string;
  /** Entity kind as reported by the AST chunker (e.g. "function", "class"). */
  type: string;
}
|
|
13
|
+
|
|
14
|
+
/** One unit of indexed text, with optional source location and code metadata. */
export interface Chunk {
  /** The chunk's raw text. */
  text: string;
  /** Zero-based position of the chunk within its file. */
  index: number;
  /** 1-based first source line; unset when the text is not a verbatim substring of the file. */
  startLine?: number;
  /** 1-based last source line; unset when the text is not a verbatim substring of the file. */
  endLine?: number;
  /** Imports seen in this chunk (set only by AST-aware chunking). */
  imports?: ChunkImport[];
  /** Symbols declared/exported by this chunk (set only by AST-aware chunking). */
  exports?: ChunkExport[];
}
|
|
22
|
+
|
|
23
|
+
// Default target chunk size and overlap, both measured in characters
// (string length, not tokens).
const DEFAULT_CHUNK_SIZE = 512; // in characters
const DEFAULT_CHUNK_OVERLAP = 50;
|
|
25
|
+
|
|
26
|
+
// Extensions that code-chunk supports via tree-sitter. These get AST-aware
// chunking (including import/export extraction) before any heuristic split.
const AST_SUPPORTED = new Set([
  ".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java",
]);
|
|
30
|
+
|
|
31
|
+
// Code-like extensions handled by blank-line heuristic splitting (splitCode):
// no tree-sitter grammar is wired up for these.
const HEURISTIC_CODE = new Set([
  ".c", ".cpp", ".h", ".hpp", ".rb", ".swift",
  ".sh", ".bash", ".zsh", ".fish",
  ".tf", ".proto", ".graphql", ".gql",
  ".mod", ".xml",
  ".jenkinsfile", ".vagrantfile", ".gemfile", ".rakefile", ".brewfile", ".procfile",
]);
|
|
39
|
+
|
|
40
|
+
/**
 * Every extension (real or virtual) that chunkText knows how to handle.
 * Files with extensions outside this set are skipped by the indexer so
 * binaries and other unrecognised formats never enter the DB.
 *
 * NOTE(review): "virtual" extensions (e.g. ".makefile", ".dockerfile") are
 * presumably synthesized from well-known basenames by the indexer — confirm
 * against the indexer's basename detection.
 */
export const KNOWN_EXTENSIONS = new Set([
  // Markdown
  ".md", ".mdx", ".markdown",
  // Plain text
  ".txt",
  // AST-aware code (see AST_SUPPORTED)
  ".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java",
  // Heuristic code (blank-line blocks; see HEURISTIC_CODE)
  ".c", ".cpp", ".h", ".hpp", ".rb", ".swift",
  ".sh", ".bash", ".zsh", ".fish",
  ".tf", ".proto", ".graphql", ".gql",
  ".mod",
  ".xml",
  // Virtual extensions for basename-detected files
  ".makefile", ".dockerfile", ".jenkinsfile",
  ".vagrantfile", ".gemfile", ".rakefile", ".brewfile", ".procfile",
  // Structured data
  ".yaml", ".yml", ".json", ".toml",
  // Query / schema languages
  ".sql",
  // API collections
  ".bru",
  // Stylesheets
  ".css", ".scss", ".less",
]);
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* Split text into chunks. Strategy depends on content type:
|
|
73
|
+
* - Code (supported languages): AST-aware chunking via tree-sitter
|
|
74
|
+
* - Markdown: split on headings first, then by size
|
|
75
|
+
* - Code (unsupported): split on blank-line-separated blocks, then by size
|
|
76
|
+
* - Other: split on paragraphs, then by size
|
|
77
|
+
*/
|
|
78
|
+
export async function chunkText(
|
|
79
|
+
text: string,
|
|
80
|
+
extension: string,
|
|
81
|
+
chunkSize = DEFAULT_CHUNK_SIZE,
|
|
82
|
+
chunkOverlap = DEFAULT_CHUNK_OVERLAP,
|
|
83
|
+
filePath?: string
|
|
84
|
+
): Promise<Chunk[]> {
|
|
85
|
+
const chunks = await _chunkText(text, extension, chunkSize, chunkOverlap, filePath);
|
|
86
|
+
assignLineNumbers(chunks, text);
|
|
87
|
+
return chunks;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
async function _chunkText(
|
|
91
|
+
text: string,
|
|
92
|
+
extension: string,
|
|
93
|
+
chunkSize = DEFAULT_CHUNK_SIZE,
|
|
94
|
+
chunkOverlap = DEFAULT_CHUNK_OVERLAP,
|
|
95
|
+
filePath?: string
|
|
96
|
+
): Promise<Chunk[]> {
|
|
97
|
+
// Try AST-aware chunking for supported code files (even small ones, for import/export extraction)
|
|
98
|
+
if (AST_SUPPORTED.has(extension)) {
|
|
99
|
+
try {
|
|
100
|
+
const astChunks = await astChunk(filePath || `file${extension}`, text, {
|
|
101
|
+
maxChunkSize: chunkSize,
|
|
102
|
+
});
|
|
103
|
+
if (astChunks.length > 0) {
|
|
104
|
+
return astChunks.map((c, i) => ({
|
|
105
|
+
text: c.text,
|
|
106
|
+
index: i,
|
|
107
|
+
imports: c.context.imports.map((im) => ({ name: im.name, source: im.source })),
|
|
108
|
+
exports: c.context.entities
|
|
109
|
+
.filter((e) => e.type === "export" || e.type === "function" || e.type === "class" || e.type === "interface" || e.type === "type" || e.type === "enum")
|
|
110
|
+
.map((e) => ({ name: e.name, type: e.type })),
|
|
111
|
+
}));
|
|
112
|
+
}
|
|
113
|
+
} catch (err) {
|
|
114
|
+
log.debug(`AST chunking failed for ${filePath || extension}, using heuristic: ${err instanceof Error ? err.message : err}`, "chunker");
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
if (text.length <= chunkSize) {
|
|
119
|
+
return [{ text, index: 0 }];
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
const isMarkdown = [".md", ".mdx", ".markdown"].includes(extension);
|
|
123
|
+
// Code-like files: split on blank-line-separated blocks as a heuristic.
|
|
124
|
+
// Includes AST-supported languages plus shell, HCL, proto, GraphQL, etc.
|
|
125
|
+
const isCode = AST_SUPPORTED.has(extension) || HEURISTIC_CODE.has(extension);
|
|
126
|
+
|
|
127
|
+
let sections: string[];
|
|
128
|
+
|
|
129
|
+
if (isMarkdown) {
|
|
130
|
+
sections = splitMarkdown(text);
|
|
131
|
+
} else if (extension === ".makefile") {
|
|
132
|
+
sections = splitMakefile(text);
|
|
133
|
+
} else if (extension === ".dockerfile") {
|
|
134
|
+
sections = splitDockerfile(text);
|
|
135
|
+
} else if (extension === ".yaml" || extension === ".yml") {
|
|
136
|
+
sections = splitYAML(text);
|
|
137
|
+
} else if (extension === ".json") {
|
|
138
|
+
sections = splitJSON(text);
|
|
139
|
+
} else if (extension === ".toml") {
|
|
140
|
+
sections = splitTOML(text);
|
|
141
|
+
} else if (extension === ".bru") {
|
|
142
|
+
sections = splitBru(text);
|
|
143
|
+
} else if (extension === ".sql") {
|
|
144
|
+
sections = splitSQL(text);
|
|
145
|
+
} else if (extension === ".css" || extension === ".scss" || extension === ".less") {
|
|
146
|
+
sections = splitCSS(text);
|
|
147
|
+
} else if (isCode) {
|
|
148
|
+
sections = splitCode(text);
|
|
149
|
+
} else {
|
|
150
|
+
sections = splitParagraphs(text);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// Further split any section that exceeds chunkSize
|
|
154
|
+
const chunks: Chunk[] = [];
|
|
155
|
+
let index = 0;
|
|
156
|
+
|
|
157
|
+
for (const section of sections) {
|
|
158
|
+
if (section.length <= chunkSize) {
|
|
159
|
+
chunks.push({ text: section, index: index++ });
|
|
160
|
+
} else {
|
|
161
|
+
const subChunks = splitBySize(section, chunkSize, chunkOverlap);
|
|
162
|
+
for (const sub of subChunks) {
|
|
163
|
+
chunks.push({ text: sub, index: index++ });
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
return chunks;
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Assign startLine/endLine to each chunk by locating the chunk text in the
|
|
173
|
+
* original file source. Uses indexOf with a forward cursor so overlapping or
|
|
174
|
+
* repeated text still resolves in order. Chunks whose text is not a verbatim
|
|
175
|
+
* substring (e.g. JSON-reformatted chunks) are left without line numbers.
|
|
176
|
+
*/
|
|
177
|
+
function assignLineNumbers(chunks: Chunk[], fullText: string): void {
|
|
178
|
+
const lineOffsets = [0];
|
|
179
|
+
for (let i = 0; i < fullText.length; i++) {
|
|
180
|
+
if (fullText[i] === "\n") lineOffsets.push(i + 1);
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
function offsetToLine(offset: number): number {
|
|
184
|
+
let lo = 0, hi = lineOffsets.length - 1;
|
|
185
|
+
while (lo < hi) {
|
|
186
|
+
const mid = (lo + hi + 1) >> 1;
|
|
187
|
+
if (lineOffsets[mid] <= offset) lo = mid;
|
|
188
|
+
else hi = mid - 1;
|
|
189
|
+
}
|
|
190
|
+
return lo + 1; // 1-based
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
let cursor = 0;
|
|
194
|
+
for (const chunk of chunks) {
|
|
195
|
+
const idx = fullText.indexOf(chunk.text, cursor);
|
|
196
|
+
if (idx >= 0) {
|
|
197
|
+
chunk.startLine = offsetToLine(idx);
|
|
198
|
+
chunk.endLine = offsetToLine(idx + Math.max(chunk.text.length - 1, 0));
|
|
199
|
+
cursor = idx + chunk.text.length;
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
function splitMarkdown(text: string): string[] {
|
|
205
|
+
// Split on heading boundaries (## or ###)
|
|
206
|
+
const parts = text.split(/(?=^#{1,3}\s)/m);
|
|
207
|
+
return parts.filter((p) => p.trim().length > 0);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
function splitDockerfile(text: string): string[] {
|
|
211
|
+
// Each FROM instruction starts a new build stage — use that as the primary
|
|
212
|
+
// boundary. Within a single-stage file this produces one section, which the
|
|
213
|
+
// size-based fallback will further split if needed.
|
|
214
|
+
const lines = text.split("\n");
|
|
215
|
+
const sections: string[] = [];
|
|
216
|
+
let current: string[] = [];
|
|
217
|
+
|
|
218
|
+
for (const line of lines) {
|
|
219
|
+
if (/^FROM\s+/i.test(line) && current.length > 0) {
|
|
220
|
+
const section = current.join("\n").trim();
|
|
221
|
+
if (section) sections.push(section);
|
|
222
|
+
current = [line];
|
|
223
|
+
} else {
|
|
224
|
+
current.push(line);
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
if (current.length > 0) {
|
|
229
|
+
const section = current.join("\n").trim();
|
|
230
|
+
if (section) sections.push(section);
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
return mergeTinyParts(sections.length > 0 ? sections : [text], 100);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
function splitBru(text: string): string[] {
|
|
237
|
+
// Each top-level block in the Bru Markup Language starts at column 0 with
|
|
238
|
+
// `keyword {` (keyword may contain colons/hyphens, e.g. `body:json`, `vars:pre-request`).
|
|
239
|
+
const lines = text.split("\n");
|
|
240
|
+
const sections: string[] = [];
|
|
241
|
+
let current: string[] = [];
|
|
242
|
+
|
|
243
|
+
for (const line of lines) {
|
|
244
|
+
if (/^[a-zA-Z][a-zA-Z0-9:_-]*\s*\{/.test(line) && current.length > 0) {
|
|
245
|
+
const section = current.join("\n").trim();
|
|
246
|
+
if (section) sections.push(section);
|
|
247
|
+
current = [line];
|
|
248
|
+
} else {
|
|
249
|
+
current.push(line);
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
if (current.length > 0) {
|
|
254
|
+
const section = current.join("\n").trim();
|
|
255
|
+
if (section) sections.push(section);
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
return mergeTinyParts(sections, 100);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
function splitTOML(text: string): string[] {
|
|
262
|
+
// Split on [section] and [[array-of-tables]] headers.
|
|
263
|
+
const lines = text.split("\n");
|
|
264
|
+
const sections: string[] = [];
|
|
265
|
+
let current: string[] = [];
|
|
266
|
+
|
|
267
|
+
for (const line of lines) {
|
|
268
|
+
if (/^\s*\[\[?[\w.]/.test(line) && current.length > 0) {
|
|
269
|
+
const section = current.join("\n").trim();
|
|
270
|
+
if (section) sections.push(section);
|
|
271
|
+
current = [line];
|
|
272
|
+
} else {
|
|
273
|
+
current.push(line);
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
if (current.length > 0) {
|
|
278
|
+
const section = current.join("\n").trim();
|
|
279
|
+
if (section) sections.push(section);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
return mergeTinyParts(sections, 100);
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
function splitSQL(text: string): string[] {
|
|
286
|
+
// Split on semicolons that terminate statements. Preserves the semicolon
|
|
287
|
+
// so each chunk reads as a complete statement.
|
|
288
|
+
const statements = text
|
|
289
|
+
.split(/(?<=;)\s*\n/)
|
|
290
|
+
.map((s) => s.trim())
|
|
291
|
+
.filter((s) => s.length > 0);
|
|
292
|
+
|
|
293
|
+
return mergeTinyParts(statements, 100);
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
function splitMakefile(text: string): string[] {
|
|
297
|
+
// Each Makefile target (and its recipe) becomes its own chunk.
|
|
298
|
+
// A target line starts at column 0, is not a comment or blank, and has
|
|
299
|
+
// a colon that is NOT part of := or ::= (variable assignment operators).
|
|
300
|
+
const lines = text.split("\n");
|
|
301
|
+
const sections: string[] = [];
|
|
302
|
+
let current: string[] = [];
|
|
303
|
+
|
|
304
|
+
for (const line of lines) {
|
|
305
|
+
const isTarget =
|
|
306
|
+
line.length > 0 &&
|
|
307
|
+
!line.startsWith("\t") &&
|
|
308
|
+
!line.startsWith(" ") &&
|
|
309
|
+
!line.startsWith("#") &&
|
|
310
|
+
/^[A-Za-z0-9_./%$()-][^=\n]*:(?!=)/.test(line);
|
|
311
|
+
|
|
312
|
+
if (isTarget && current.length > 0) {
|
|
313
|
+
const section = current.join("\n").trim();
|
|
314
|
+
if (section) sections.push(section);
|
|
315
|
+
current = [line];
|
|
316
|
+
} else {
|
|
317
|
+
current.push(line);
|
|
318
|
+
}
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
if (current.length > 0) {
|
|
322
|
+
const section = current.join("\n").trim();
|
|
323
|
+
if (section) sections.push(section);
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
return mergeTinyParts(sections, 100);
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
function splitYAML(text: string): string[] {
|
|
330
|
+
// Split on top-level YAML keys (lines at column 0 matching `key:`).
|
|
331
|
+
// For OpenAPI files (detected by a top-level `paths:` key), further split
|
|
332
|
+
// the paths section on individual path entries (e.g. ` /users:`).
|
|
333
|
+
const lines = text.split("\n");
|
|
334
|
+
const topSections: string[] = [];
|
|
335
|
+
let current: string[] = [];
|
|
336
|
+
|
|
337
|
+
for (const line of lines) {
|
|
338
|
+
const isTopKey =
|
|
339
|
+
!line.startsWith(" ") &&
|
|
340
|
+
!line.startsWith("\t") &&
|
|
341
|
+
!line.startsWith("#") &&
|
|
342
|
+
/^[a-zA-Z_$][a-zA-Z0-9_$-]*\s*:/.test(line);
|
|
343
|
+
|
|
344
|
+
if (isTopKey && current.length > 0) {
|
|
345
|
+
const section = current.join("\n").trim();
|
|
346
|
+
if (section) topSections.push(section);
|
|
347
|
+
current = [line];
|
|
348
|
+
} else {
|
|
349
|
+
current.push(line);
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
if (current.length > 0) {
|
|
353
|
+
const section = current.join("\n").trim();
|
|
354
|
+
if (section) topSections.push(section);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// OpenAPI: further split the `paths:` section on individual path entries
|
|
358
|
+
const result: string[] = [];
|
|
359
|
+
for (const section of topSections) {
|
|
360
|
+
if (/^paths\s*:/.test(section)) {
|
|
361
|
+
result.push(...splitOpenAPIPathsYAML(section));
|
|
362
|
+
} else {
|
|
363
|
+
result.push(section);
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
return mergeTinyParts(result, 100);
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
function splitOpenAPIPathsYAML(pathsSection: string): string[] {
|
|
371
|
+
// Each ` /path:` line starts a new chunk (2-space indent + leading slash).
|
|
372
|
+
const lines = pathsSection.split("\n");
|
|
373
|
+
const chunks: string[] = [];
|
|
374
|
+
let current: string[] = [lines[0]]; // "paths:" header line
|
|
375
|
+
|
|
376
|
+
for (let i = 1; i < lines.length; i++) {
|
|
377
|
+
const line = lines[i];
|
|
378
|
+
if (/^ \//.test(line) && current.length > 1) {
|
|
379
|
+
const section = current.join("\n").trim();
|
|
380
|
+
if (section && section !== "paths:") chunks.push(section);
|
|
381
|
+
current = ["paths:", line];
|
|
382
|
+
} else {
|
|
383
|
+
current.push(line);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
if (current.length > 1) {
|
|
388
|
+
const section = current.join("\n").trim();
|
|
389
|
+
if (section && section !== "paths:") chunks.push(section);
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
return chunks.length > 0 ? chunks : [pathsSection];
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
// Above this size, skip JSON.parse to avoid OOM / long GC pauses.
// Compared against `text.length` (characters, not bytes — roughly 50MB of
// ASCII). 500k-line files (~10-20MB) are fine; this guards against 100MB+ files.
const JSON_PARSE_LIMIT = 50 * 1024 * 1024;
|
|
398
|
+
|
|
399
|
+
function splitJSON(text: string): string[] {
|
|
400
|
+
if (text.length > JSON_PARSE_LIMIT) {
|
|
401
|
+
log.warn(
|
|
402
|
+
`JSON file too large for structural parsing (${(text.length / 1024 / 1024).toFixed(1)}MB), using line-based splitting`,
|
|
403
|
+
"chunker"
|
|
404
|
+
);
|
|
405
|
+
return splitParagraphs(text);
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
try {
|
|
409
|
+
const obj = JSON.parse(text);
|
|
410
|
+
|
|
411
|
+
if (typeof obj !== "object" || obj === null) {
|
|
412
|
+
return [text];
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
if (Array.isArray(obj)) {
|
|
416
|
+
// Chunk each array item individually
|
|
417
|
+
const items = obj.map(
|
|
418
|
+
(item, i) => `[${i}]: ${JSON.stringify(item, null, 2)}`
|
|
419
|
+
);
|
|
420
|
+
return mergeTinyParts(items, 100);
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
// Object: one chunk per top-level key.
|
|
424
|
+
// For OpenAPI, further split `paths` into individual path chunks.
|
|
425
|
+
const result: string[] = [];
|
|
426
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
427
|
+
if (key === "paths" && typeof value === "object" && value !== null) {
|
|
428
|
+
for (const [path, ops] of Object.entries(value)) {
|
|
429
|
+
result.push(`paths["${path}"]: ${JSON.stringify(ops, null, 2)}`);
|
|
430
|
+
}
|
|
431
|
+
} else {
|
|
432
|
+
result.push(`"${key}": ${JSON.stringify(value, null, 2)}`);
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
return mergeTinyParts(result, 100);
|
|
437
|
+
} catch {
|
|
438
|
+
// Not valid JSON — fall back to paragraph splitting
|
|
439
|
+
return splitParagraphs(text);
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
function splitCode(text: string): string[] {
|
|
444
|
+
// Split on double newlines (function/class boundaries)
|
|
445
|
+
const parts = text.split(/\n\n+/);
|
|
446
|
+
return mergeTinyParts(parts, 100);
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
function splitParagraphs(text: string): string[] {
|
|
450
|
+
const parts = text.split(/\n\n+/);
|
|
451
|
+
return mergeTinyParts(parts, 100);
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
function splitCSS(text: string): string[] {
|
|
455
|
+
// Split on top-level brace blocks. Each rule (.foo {}), @media block,
|
|
456
|
+
// @keyframes, etc. ends when brace depth returns to 0.
|
|
457
|
+
const chunks: string[] = [];
|
|
458
|
+
let current: string[] = [];
|
|
459
|
+
let depth = 0;
|
|
460
|
+
|
|
461
|
+
for (const line of text.split("\n")) {
|
|
462
|
+
current.push(line);
|
|
463
|
+
for (const ch of line) {
|
|
464
|
+
if (ch === "{") depth++;
|
|
465
|
+
else if (ch === "}") depth--;
|
|
466
|
+
}
|
|
467
|
+
if (depth === 0 && current.some((l) => l.trim())) {
|
|
468
|
+
const block = current.join("\n").trim();
|
|
469
|
+
if (block) chunks.push(block);
|
|
470
|
+
current = [];
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
|
|
474
|
+
if (current.length > 0) {
|
|
475
|
+
const remaining = current.join("\n").trim();
|
|
476
|
+
if (remaining) chunks.push(remaining);
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
return mergeTinyParts(chunks, 100);
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
/**
|
|
483
|
+
* Merge consecutive tiny parts (< minSize chars) to avoid
|
|
484
|
+
* creating embeddings for near-empty chunks.
|
|
485
|
+
*/
|
|
486
|
+
function mergeTinyParts(parts: string[], minSize: number): string[] {
|
|
487
|
+
const merged: string[] = [];
|
|
488
|
+
let buffer = "";
|
|
489
|
+
|
|
490
|
+
for (const part of parts) {
|
|
491
|
+
const trimmed = part.trim();
|
|
492
|
+
if (!trimmed) continue;
|
|
493
|
+
|
|
494
|
+
if (buffer.length + trimmed.length < minSize) {
|
|
495
|
+
buffer += (buffer ? "\n\n" : "") + trimmed;
|
|
496
|
+
} else {
|
|
497
|
+
if (buffer) merged.push(buffer);
|
|
498
|
+
buffer = trimmed;
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
if (buffer) merged.push(buffer);
|
|
503
|
+
return merged;
|
|
504
|
+
}
|
|
505
|
+
|
|
506
|
+
function splitBySize(
|
|
507
|
+
text: string,
|
|
508
|
+
chunkSize: number,
|
|
509
|
+
overlap: number
|
|
510
|
+
): string[] {
|
|
511
|
+
const chunks: string[] = [];
|
|
512
|
+
let start = 0;
|
|
513
|
+
|
|
514
|
+
while (start < text.length) {
|
|
515
|
+
const end = Math.min(start + chunkSize, text.length);
|
|
516
|
+
chunks.push(text.slice(start, end));
|
|
517
|
+
|
|
518
|
+
if (end >= text.length) break;
|
|
519
|
+
start = end - overlap;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
return chunks;
|
|
523
|
+
}
|