@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +424 -8
- package/file-indexer.ts +21 -1
- package/package.json +12 -2
- package/tools/codeindex.ts +135 -16
- package/tools/search.ts +46 -11
- package/vectorizer/bm25-index.ts +155 -0
- package/vectorizer/chunkers/chunker-factory.ts +98 -0
- package/vectorizer/chunkers/code-chunker.ts +325 -0
- package/vectorizer/chunkers/markdown-chunker.ts +177 -0
- package/vectorizer/content-cleaner.ts +136 -0
- package/vectorizer/hybrid-search.ts +97 -0
- package/vectorizer/index.js +395 -16
- package/vectorizer/metadata-extractor.ts +125 -0
- package/vectorizer/query-cache.ts +126 -0
- package/vectorizer/search-metrics.ts +155 -0
- package/vectorizer.yaml +81 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code Chunker — splits source code by functions, classes, and exports.
|
|
3
|
+
*
|
|
4
|
+
* Uses regex-based parsing (no AST dependency) to detect function/class
|
|
5
|
+
* boundaries. Falls back to line-based splitting for unstructured code.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** Tunable limits and switches consumed by `chunkCode`. */
export interface CodeChunkConfig {
  /**
   * Gaps between detected blocks whose trimmed text is shorter than this many
   * characters are merged into the preceding chunk.
   */
  min_chunk_size: number
  /** Chunks longer than this many characters are split further. */
  max_chunk_size: number
  /** When false, block detection is skipped and content is split by lines only. */
  split_by_functions: boolean
  /** NOTE(review): not read by any chunker code visible in this file — confirm intended use. */
  include_function_signature: boolean
}
|
|
14
|
+
|
|
15
|
+
/** Configuration used by `chunkCode` when the caller does not supply one. */
export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
  // Sizes are measured in characters.
  min_chunk_size: 300,
  max_chunk_size: 1500,
  // Structure-aware splitting is on by default.
  split_by_functions: true,
  include_function_signature: true,
}
|
|
21
|
+
|
|
22
|
+
/** One piece of source text produced by the code chunker. */
export interface CodeChunk {
  /** The chunk's source text. */
  content: string
  /** Name of the function or method the chunk was cut from, when one was detected. */
  function_name?: string
  /** Name of the class the chunk belongs to, when one was detected. */
  class_name?: string
}
|
|
27
|
+
|
|
28
|
+
// ── Block detection ─────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
/** A detected definition, expressed as an inclusive range of line indices. */
interface CodeBlock {
  /** Kind of definition that was matched. */
  type: "function" | "class" | "method" | "other"
  /** Identifier captured from the declaration line. */
  name: string
  /** Enclosing class name; set for blocks of type "method". */
  className?: string
  /** Zero-based index of the declaration line. */
  startLine: number
  /** Zero-based index of the block's last line (inclusive). */
  endLine: number
}
|
|
37
|
+
|
|
38
|
+
/**
 * Detect top-level function/class blocks (and the methods inside classes)
 * using line-oriented regular expressions; block ends come from `findBlockEnd`.
 *
 * Patterns target JS/TS, Go, Rust, Python and Java/C-family declarations.
 * This is a heuristic, not a parser: declarations inside comments or strings
 * can still be matched.
 *
 * @param lines Source file split into lines.
 * @returns Blocks in source order. Each class block is followed by its method
 *          blocks; `startLine`/`endLine` are inclusive zero-based indices.
 */
function detectBlocks(lines: string[]): CodeBlock[] {
  const blocks: CodeBlock[] = []

  // Statements shaped like `name(...) {` that open control flow, not a
  // definition. Without this filter `if (x) {` is recorded as a method "if".
  const controlKeywords = new Set([
    "if",
    "for",
    "while",
    "switch",
    "catch",
    "with",
    "foreach",
    "synchronized",
    "using",
    "lock",
    "function",
    "return",
  ])

  // Patterns for function declarations; capture group 1 is the name.
  const fnPatterns: RegExp[] = [
    // JS/TS: function name(, async function name(, export function
    /(?:export\s+)?(?:async\s+)?function\s+(\w+)/,
    // Arrow: const name = (…) => or const name = async (
    /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>/,
    // Method-like: name( or async name( on an indented line
    /^\s+(?:async\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*\w[^{]*)?\s*\{/,
    // Go: func Name(
    /^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(/,
    // Rust: fn name(
    /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/,
    // Python def
    /^\s*(?:async\s+)?def\s+(\w+)\s*\(/,
  ]

  // Patterns for class-like declarations; capture group 1 is the name.
  const classPatterns: RegExp[] = [
    // JS/TS/Java/C#: class Name
    /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/,
    // Rust: struct/enum/impl
    /(?:pub\s+)?(?:struct|enum|impl)\s+(\w+)/,
    // Python class
    /^class\s+(\w+)/,
  ]

  // Brace-style method header inside a class body (hoisted out of the loop).
  const methodPattern =
    /^\s+(?:(?:public|private|protected|static|async|override)\s+)*(\w+)\s*\([^)]*\)\s*(?::\s*[^{]*)?\s*\{/

  /** First pattern match for `line`, optionally rejecting control-flow keywords. */
  const firstMatch = (
    line: string,
    patterns: RegExp[],
    rejectControlFlow: boolean,
  ): RegExpMatchArray | null => {
    for (const pattern of patterns) {
      const match = line.match(pattern)
      if (match && !(rejectControlFlow && controlKeywords.has(match[1]))) {
        return match
      }
    }
    return null
  }

  let i = 0
  while (i < lines.length) {
    const line = lines[i]

    const classMatch = firstMatch(line, classPatterns, false)
    if (classMatch) {
      const name = classMatch[1]
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "class", name, startLine: i, endLine })

      // Collect methods declared inside the class body.
      for (let j = i + 1; j < endLine; j++) {
        const methodMatch = lines[j].match(methodPattern)
        if (!methodMatch || controlKeywords.has(methodMatch[1])) continue

        // A method cannot outlive its class; clamp in case brace counting
        // disagrees between the two scans.
        const mEnd = Math.min(findBlockEnd(lines, j), endLine)
        blocks.push({
          type: "method",
          name: methodMatch[1],
          className: name,
          startLine: j,
          endLine: mEnd,
        })
        j = mEnd // skip the method body
      }

      i = endLine + 1
      continue
    }

    const fnMatch = firstMatch(line, fnPatterns, true)
    if (fnMatch) {
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "function", name: fnMatch[1], startLine: i, endLine })
      i = endLine + 1
      continue
    }

    i++
  }

  return blocks
}
|
|
131
|
+
|
|
132
|
+
/**
 * Find the last line of the block whose declaration starts at `startLine`.
 *
 * - Declarations ending in `:` with no `{` are treated as indent-based
 *   (Python) and delegated to `findPythonBlockEnd`.
 * - Otherwise braces are counted line by line; the block ends on the line
 *   where the count returns to zero after the first `{`. Braces inside string
 *   literals or comments are counted too, so the result is a heuristic.
 * - If the declaration ends before any brace opens (a line ending in `;`, or a
 *   blank line), the block is considered brace-less and ends there. This keeps
 *   one-line arrow functions and signatures from swallowing the next
 *   definition's body.
 *
 * @param lines Source file split into lines.
 * @param startLine Zero-based index of the declaration line.
 * @param fallbackSpan How many lines past `startLine` to assume when no end
 *                     can be determined.
 * @returns Zero-based index of the block's last line (inclusive).
 */
function findBlockEnd(lines: string[], startLine: number, fallbackSpan = 50): number {
  const firstLine = lines[startLine]
  if (firstLine === undefined) {
    // Out-of-range start: clamp instead of throwing on `undefined.match`.
    return Math.max(0, Math.min(startLine, lines.length - 1))
  }

  // For Python-style (indent-based), use indent detection
  const isPythonStyle = /:\s*$/.test(firstLine) && !firstLine.includes("{")
  if (isPythonStyle) {
    return findPythonBlockEnd(lines, startLine)
  }

  let braceCount = 0
  let started = false
  let lastContentLine = startLine

  for (let i = startLine; i < lines.length; i++) {
    const line = lines[i]
    for (const ch of line) {
      if (ch === "{") {
        braceCount++
        started = true
      } else if (ch === "}") {
        braceCount--
      }
    }

    if (started) {
      if (braceCount <= 0) return i
      continue
    }

    // No brace has opened yet: check whether the declaration already ended.
    const trimmed = line.trim()
    if (trimmed === "") return lastContentLine
    if (trimmed.endsWith(";")) return i
    lastContentLine = i
  }

  return Math.min(startLine + fallbackSpan, lines.length - 1)
}
|
|
158
|
+
|
|
159
|
+
/**
 * Find the last line of an indent-based (Python) block.
 *
 * The block covers every following line indented deeper than the declaration
 * line. Blank lines inside the block are skipped over, but blank lines that
 * trail the block are not counted as part of it, so the returned index always
 * points at the declaration line or a non-blank body line.
 *
 * @param lines Source file split into lines.
 * @param startLine Zero-based index of the `def`/`class` line.
 * @returns Zero-based index of the block's last non-blank line (inclusive).
 */
function findPythonBlockEnd(lines: string[], startLine: number): number {
  const indentOf = (line: string): number => line.match(/^(\s*)/)?.[1].length ?? 0

  const baseIndent = indentOf(lines[startLine] ?? "")
  let lastBodyLine = startLine

  for (let i = startLine + 1; i < lines.length; i++) {
    const line = lines[i]
    if (line.trim() === "") continue // blank lines never terminate a block
    if (indentOf(line) <= baseIndent) break // dedent: block is over
    lastBodyLine = i
  }

  return lastBodyLine
}
|
|
174
|
+
|
|
175
|
+
// ── Fallback: line-based splitting ──────────────────────────────────────────
|
|
176
|
+
|
|
177
|
+
/**
 * Fallback splitter: packs consecutive lines into chunks whose joined text
 * (lines joined with "\n") is at most `maxChars` characters long.
 *
 * Lines are never broken apart, so a single line longer than `maxChars`
 * becomes a chunk of its own that exceeds the limit.
 *
 * @param lines Lines to pack, in order.
 * @param maxChars Maximum length of a chunk's joined text.
 * @returns Chunks carrying only `content`; empty when `lines` is empty.
 */
function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
  const chunks: CodeChunk[] = []
  let current: string[] = []
  // Always equals current.join("\n").length, with no phantom trailing newline.
  let currentLen = 0

  for (const line of lines) {
    const lenWithLine = current.length === 0 ? line.length : currentLen + 1 + line.length

    if (current.length > 0 && lenWithLine > maxChars) {
      chunks.push({ content: current.join("\n") })
      current = [line]
      currentLen = line.length
    } else {
      current.push(line)
      currentLen = lenWithLine
    }
  }

  if (current.length > 0) {
    chunks.push({ content: current.join("\n") })
  }

  return chunks
}
|
|
198
|
+
|
|
199
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
200
|
+
|
|
201
|
+
/**
 * Chunk source code by functions/classes.
 *
 * Pipeline:
 * 1. Detect blocks with `detectBlocks`; if splitting by functions is disabled
 *    or nothing is detected, fall back to `splitByLines`.
 * 2. Emit each top-level function/class as a chunk. Code between blocks
 *    ("gaps") becomes its own chunk, or is appended to the previous chunk when
 *    shorter than `config.min_chunk_size`.
 * 3. A class larger than `config.max_chunk_size` is emitted as class header /
 *    member gaps, one chunk per detected method, and the class tail.
 * 4. Any chunk still larger than `config.max_chunk_size` is split by lines,
 *    keeping its `function_name` / `class_name`.
 *
 * @param content Full source text.
 * @param config Chunking options; defaults to `DEFAULT_CODE_CONFIG`.
 * @returns Non-empty chunks in source order.
 */
export function chunkCode(
  content: string,
  config: CodeChunkConfig = DEFAULT_CODE_CONFIG,
): CodeChunk[] {
  const lines = content.split("\n")

  if (!config.split_by_functions) {
    return splitByLines(lines, config.max_chunk_size)
  }

  const blocks = detectBlocks(lines)
  if (blocks.length === 0) {
    // No recognizable blocks — fallback
    return splitByLines(lines, config.max_chunk_size)
  }

  const chunks: CodeChunk[] = []
  let lastEnd = -1 // last line index already emitted at top level

  for (const block of blocks) {
    // Methods are emitted through their enclosing class below.
    if (block.type === "method") continue

    // Gap before this block
    if (block.startLine > lastEnd + 1) {
      const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
      if (gapContent.length > 0) {
        const previous = chunks[chunks.length - 1]
        if (gapContent.length < config.min_chunk_size && previous !== undefined) {
          // Merge small gap with previous chunk
          previous.content += "\n\n" + gapContent
        } else {
          chunks.push({ content: gapContent })
        }
      }
    }

    const blockContent = lines.slice(block.startLine, block.endLine + 1).join("\n")
    const isOversizedClass =
      block.type === "class" && blockContent.length > config.max_chunk_size

    if (!isOversizedClass) {
      chunks.push({
        content: blockContent,
        function_name: block.type === "function" ? block.name : undefined,
        class_name: block.type === "class" ? block.name : block.className,
      })
    } else {
      // Split class into methods. Match on line range as well as name so a
      // second class-like block with the same name cannot contribute methods.
      const methods = blocks.filter(
        (b) =>
          b.type === "method" &&
          b.className === block.name &&
          b.startLine > block.startLine &&
          b.startLine <= block.endLine,
      )

      if (methods.length > 0) {
        // Next class line not yet emitted. Starting AT the declaration line
        // keeps the `class Foo {` header (and leading fields) in the output.
        let cursor = block.startLine

        for (const method of methods) {
          // Class preamble / gap before method
          if (method.startLine > cursor) {
            const gap = lines.slice(cursor, method.startLine).join("\n").trim()
            if (gap) {
              chunks.push({ content: gap, class_name: block.name })
            }
          }

          const methodEnd = Math.min(method.endLine, block.endLine)
          chunks.push({
            content: lines.slice(method.startLine, methodEnd + 1).join("\n"),
            function_name: method.name,
            class_name: block.name,
          })
          cursor = methodEnd + 1
        }

        // Class tail
        if (cursor <= block.endLine) {
          const tail = lines.slice(cursor, block.endLine + 1).join("\n").trim()
          if (tail) {
            chunks.push({ content: tail, class_name: block.name })
          }
        }
      } else {
        // No methods found — split by lines
        const subChunks = splitByLines(
          lines.slice(block.startLine, block.endLine + 1),
          config.max_chunk_size,
        )
        for (const sc of subChunks) {
          sc.class_name = block.name
          chunks.push(sc)
        }
      }
    }

    lastEnd = block.endLine
  }

  // Trailing code after last block
  if (lastEnd < lines.length - 1) {
    const trailing = lines.slice(lastEnd + 1).join("\n").trim()
    if (trailing.length > 0) {
      chunks.push({ content: trailing })
    }
  }

  // Final: split any chunk still too large
  const result: CodeChunk[] = []
  for (const chunk of chunks) {
    if (chunk.content.length > config.max_chunk_size) {
      const parts = splitByLines(chunk.content.split("\n"), config.max_chunk_size)
      for (const p of parts) {
        result.push({ ...chunk, content: p.content })
      }
    } else {
      result.push(chunk)
    }
  }

  return result.filter((c) => c.content.trim().length > 0)
}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown Chunker — splits Markdown by heading structure.
|
|
3
|
+
*
|
|
4
|
+
* Preserves heading hierarchy ("API > Auth > JWT") in metadata,
|
|
5
|
+
* merges small sections, and splits oversized ones.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** Options controlling how `chunkMarkdown` cuts a document. */
export interface MarkdownChunkConfig {
  /** Sections shorter than this many characters are merged into the previous chunk. */
  min_chunk_size: number
  /** Chunks longer than this many characters are split further. */
  max_chunk_size: number
  /** When false, the whole document is returned as a single chunk. */
  split_by_headings: boolean
  /** When true, `heading_context` holds the full ancestor path; otherwise only the section's own heading. */
  preserve_heading_hierarchy: boolean
}
|
|
14
|
+
|
|
15
|
+
/** Configuration used by `chunkMarkdown` when the caller does not supply one. */
export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
  // Sizes are measured in characters.
  min_chunk_size: 200,
  max_chunk_size: 2000,
  // Heading-aware splitting with full hierarchy paths is the default.
  split_by_headings: true,
  preserve_heading_hierarchy: true,
}
|
|
21
|
+
|
|
22
|
+
/** One piece of Markdown produced by the Markdown chunker. */
export interface MarkdownChunk {
  /** The chunk's Markdown text. */
  content: string
  /** Heading path of the chunk, e.g. "H1 > H2 > H3"; empty when there is none. */
  heading_context: string
}
|
|
26
|
+
|
|
27
|
+
// ── Internal types ──────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
/** A heading together with the text that follows it, up to the next heading. */
interface Section {
  /** Heading depth: 1-6 for headings, 0 for the preamble before the first heading. */
  level: number
  /** Heading text without the leading `#` markers; empty for the preamble. */
  heading: string
  /** Text below the heading; each source line is followed by "\n". */
  body: string
}
|
|
34
|
+
|
|
35
|
+
// ── Parsing ─────────────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
/**
 * Parse Markdown into sections keyed by ATX heading (`#` … `######`).
 *
 * - Text before the first heading becomes a level-0 section with an empty
 *   heading.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so shell/Python comments in examples stay in the section body.
 * - Both LF and CRLF line endings are accepted; bodies are normalised to LF.
 *
 * @param content Raw Markdown text.
 * @returns Sections in document order; sections with neither heading nor
 *          non-blank body are omitted.
 */
function parseSections(content: string): Section[] {
  const sections: Section[] = []
  let currentSection: Section = { level: 0, heading: "", body: "" }
  // Marker run (e.g. "```") of the code fence currently open, if any.
  let openFence: string | null = null

  const pushCurrent = (): void => {
    if (currentSection.body.trim() || currentSection.heading) {
      sections.push(currentSection)
    }
  }

  for (const line of content.split(/\r?\n/)) {
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/)
    if (fenceMatch) {
      const marker = fenceMatch[1]
      if (openFence === null) {
        openFence = marker
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        fenceMatch[2].trim() === ""
      ) {
        // A closing fence uses the same character, is at least as long as the
        // opener, and carries no info string.
        openFence = null
      }
      currentSection.body += line + "\n"
      continue
    }

    const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null
    if (headingMatch) {
      // Push previous section
      pushCurrent()
      currentSection = {
        level: headingMatch[1].length,
        heading: headingMatch[2].trim(),
        body: "",
      }
    } else {
      currentSection.body += line + "\n"
    }
  }

  // Push last section
  pushCurrent()

  return sections
}
|
|
67
|
+
|
|
68
|
+
/**
 * Render a heading stack as a breadcrumb-style path.
 *
 * @param stack Ancestor headings, outermost first.
 * @returns Heading texts joined with " > "; empty string for an empty stack.
 */
function buildHeadingContext(stack: { level: number; heading: string }[]): string {
  const titles: string[] = []
  for (const entry of stack) {
    titles.push(entry.heading)
  }
  return titles.join(" > ")
}
|
|
72
|
+
|
|
73
|
+
// ── Splitting oversized sections ────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
/**
 * Split text into pieces of at most `maxSize` characters, breaking only at
 * line boundaries.
 *
 * Text that already fits is returned unchanged as a single piece. Lines are
 * never broken apart, so a single line longer than `maxSize` becomes a piece
 * of its own that exceeds the limit.
 *
 * @param text Text to split.
 * @param maxSize Maximum length of each piece (lines joined with "\n").
 * @returns Pieces in order.
 */
function splitLargeText(text: string, maxSize: number): string[] {
  if (text.length <= maxSize) return [text]

  const chunks: string[] = []
  let current: string[] = []
  // Always equals current.join("\n").length, with no phantom trailing newline.
  let currentLen = 0

  for (const line of text.split("\n")) {
    const lenWithLine = current.length === 0 ? line.length : currentLen + 1 + line.length

    if (current.length > 0 && lenWithLine > maxSize) {
      chunks.push(current.join("\n"))
      current = [line]
      currentLen = line.length
    } else {
      current.push(line)
      currentLen = lenWithLine
    }
  }

  if (current.length > 0) {
    chunks.push(current.join("\n"))
  }

  return chunks
}
|
|
99
|
+
|
|
100
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
101
|
+
|
|
102
|
+
/**
 * Chunk Markdown content into semantic sections.
 *
 * Three passes: build one chunk per heading section (tracking the heading
 * path), fold sections shorter than `min_chunk_size` into the chunk before
 * them, then split anything longer than `max_chunk_size` at line boundaries.
 *
 * @param content Raw Markdown text.
 * @param config Chunking options; defaults to `DEFAULT_MD_CONFIG`.
 * @returns Non-empty chunks in document order.
 */
export function chunkMarkdown(
  content: string,
  config: MarkdownChunkConfig = DEFAULT_MD_CONFIG,
): MarkdownChunk[] {
  // Heading-based splitting disabled: return the document as one chunk
  // (the caller can apply a fixed-size chunker).
  if (!config.split_by_headings) {
    return [{ content, heading_context: "" }]
  }

  // Pass 1 — one chunk per section, with its heading path.
  const ancestry: { level: number; heading: string }[] = []
  const perSection: MarkdownChunk[] = []

  for (const { level, heading, body } of parseSections(content)) {
    if (level > 0) {
      // Drop headings at the same or a deeper level, then descend.
      let keep = ancestry.length
      while (keep > 0 && ancestry[keep - 1].level >= level) {
        keep--
      }
      ancestry.length = keep
      ancestry.push({ level, heading })
    }

    let context: string
    if (config.preserve_heading_hierarchy) {
      context = buildHeadingContext(ancestry)
    } else {
      context = heading
    }

    const text = heading ? `${"#".repeat(level)} ${heading}\n${body}` : body
    perSection.push({ content: text.trim(), heading_context: context })
  }

  // Pass 2 — merge small sections into the chunk before them.
  const merged: MarkdownChunk[] = []
  for (const piece of perSection) {
    const target = merged[merged.length - 1]
    const isSmall = piece.content.length < config.min_chunk_size

    if (target === undefined || !isSmall) {
      merged.push({ ...piece })
      continue
    }

    target.content += "\n\n" + piece.content
    // The merged chunk takes the context of the section folded in last.
    if (piece.heading_context) {
      target.heading_context = piece.heading_context
    }
  }

  // Pass 3 — split oversized chunks; every part keeps the chunk's context.
  const output: MarkdownChunk[] = []
  for (const piece of merged) {
    if (piece.content.length > config.max_chunk_size) {
      for (const part of splitLargeText(piece.content, config.max_chunk_size)) {
        output.push({ content: part, heading_context: piece.heading_context })
      }
    } else {
      output.push(piece)
    }
  }

  // Drop chunks that are empty or whitespace-only.
  return output.filter((piece) => piece.content.trim().length > 0)
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content Cleaner — removes noise from file content before chunking.
|
|
3
|
+
*
|
|
4
|
+
* Strips TOC blocks, breadcrumbs, repeated headers, auto-generated markers,
|
|
5
|
+
* and optionally imports/comments so the embedding model sees only signal.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** Switches selecting which kinds of noise `cleanContent` removes. */
export interface CleaningConfig {
  /** Docs only: remove table-of-contents blocks (heading + link list, and `<!-- TOC -->` regions). */
  remove_toc: boolean
  /** Docs only: remove the leading `---` … `---` front-matter block. */
  remove_frontmatter_metadata: boolean
  /** Code only: remove import/require statements. */
  remove_imports: boolean
  /** Code only: remove comments. */
  remove_comments: boolean
}
|
|
14
|
+
|
|
15
|
+
/** Configuration used by `cleanContent` when the caller does not supply one. */
export const DEFAULT_CLEANING_CONFIG: CleaningConfig = {
  // Only TOC removal is enabled by default; the other removals are opt-in.
  remove_toc: true,
  remove_frontmatter_metadata: false,
  remove_imports: false,
  remove_comments: false,
}
|
|
21
|
+
|
|
22
|
+
// ── Markdown noise ──────────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
/**
 * Remove a YAML front-matter block (`---` … `---`) from the very start of a
 * Markdown document.
 *
 * - Only a block that begins on the first line is removed.
 * - Opening and closing fences must each be a whole line (`---` plus optional
 *   trailing spaces/tabs), so a line like `---- x` does not end the block.
 * - LF and CRLF line endings and an empty block (`---` directly followed by
 *   `---`) are handled.
 *
 * The content between the fences is not validated as YAML.
 *
 * @param text Markdown text.
 * @returns The text without its leading front-matter block, or unchanged if
 *          it has none.
 */
function stripFrontmatter(text: string): string {
  return text.replace(/^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/, "")
}
|
|
28
|
+
|
|
29
|
+
/**
 * Remove inline table-of-contents blocks.
 *
 * A block is a level 1-3 heading titled "Table of Contents", "Contents" or
 * "TOC" (case-insensitive), followed by one or more `-`/`*` list items that
 * link to in-page anchors, for example:
 *
 *   ## Table of Contents
 *   - [Section](#section)
 *
 * @param text Markdown text.
 * @returns The text with every such block removed.
 */
function stripToc(text: string): string {
  // Heading line, then a run of bullet items of the form `[label](#anchor)`.
  const tocBlock =
    /^#{1,3}\s*(Table of Contents|Contents|TOC)\s*\n([\t ]*[-*]\s*\[.*?\]\(#.*?\)\s*\n?)+/gim
  const withoutToc = text.replace(tocBlock, "")
  return withoutToc
}
|
|
43
|
+
|
|
44
|
+
/**
 * Remove regions delimited by HTML comment markers `<!-- TOC -->` and
 * `<!-- /TOC -->` (case-insensitive), including the markers themselves and
 * one newline directly after the closing marker.
 *
 * @param text Markdown text.
 * @returns The text with every marked region removed.
 */
function stripHtmlTocMarkers(text: string): string {
  const markedRegion = /<!--\s*TOC\s*-->[\s\S]*?<!--\s*\/TOC\s*-->\n?/gi
  const withoutRegions = text.replace(markedRegion, "")
  return withoutRegions
}
|
|
48
|
+
|
|
49
|
+
/**
 * Remove breadcrumb lines such as `Home > Docs > API`.
 *
 * A breadcrumb is a single line made of at least three word groups separated
 * by `>`, followed by one or two line breaks (which are removed with it).
 * Matching is confined to one line, so ordinary text followed by Markdown
 * blockquote lines (`> quote`) is never mistaken for a breadcrumb.
 *
 * @param text Markdown text.
 * @returns The text with every breadcrumb line removed.
 */
function stripBreadcrumbs(text: string): string {
  // Segment: words separated by spaces/tabs. Separators: `>` with optional
  // horizontal whitespace. No newline may occur inside the breadcrumb itself.
  const breadcrumbLine =
    /^[ \t]*\w+(?:[ \t]+\w+)*(?:[ \t]*>[ \t]*\w+(?:[ \t]+\w+)*){2,}[ \t]*(?:\r?\n){1,2}/gm
  return text.replace(breadcrumbLine, "")
}
|
|
54
|
+
|
|
55
|
+
/**
 * Remove auto-generated-code banners.
 *
 * Matches a comment opener (`//` or `/*`) followed by a banner phrase such as
 * "AUTO-GENERATED", "DO NOT EDIT", "DO NOT MODIFY", "GENERATED BY" or
 * "This file is/was (auto-)generated" (case-insensitive), and deletes from
 * the opener to the end of that line. Later lines of a multi-line block
 * comment are left in place.
 *
 * @param text Source text.
 * @returns The text with banner comments removed.
 */
function stripAutoGenMarkers(text: string): string {
  const banner =
    /\/[/*]\s*(?:AUTO[- ]?GENERATED|DO NOT (?:EDIT|MODIFY)|GENERATED BY|This file (?:is|was) (?:auto-?)?generated)[^\n]*/gi
  const withoutBanners = text.replace(banner, "")
  return withoutBanners
}
|
|
62
|
+
|
|
63
|
+
// ── Code noise ──────────────────────────────────────────────────────────────
|
|
64
|
+
|
|
65
|
+
/**
 * Remove import/require statements (JS/TS, Python, Go).
 *
 * Multi-line forms are handled before single-line ones so a generic
 * `import …` rule cannot delete just the first line of a grouped import and
 * leave its body dangling. The JS/TS `import … from '…'` rule refuses to
 * cross quotes, semicolons, `=` or parentheses, so it cannot run from a
 * side-effect import (`import './x'`) through unrelated code to a later
 * `export … from '…'` line.
 *
 * Removed statements leave their line breaks behind.
 *
 * @param text Source text.
 * @returns The text with import statements removed.
 */
function stripImports(text: string): string {
  let result = text

  // Go grouped imports: import ( "fmt"; alias "pkg" )
  result = result.replace(
    /^import\s*\(\s*\n(?:[\t ]*(?:[\w.]+[\t ]+)?"[^"]*"\s*\n?)*\s*\)/gm,
    "",
  )
  // Python parenthesised imports: from x import (a, b)
  result = result.replace(/^from\s+\S+\s+import\s*\([^)]*\)[ \t]*$/gm, "")

  // JS/TS: import <clause> from '<module>' (the clause may span lines)
  result = result.replace(/^import\s[^'";=()]*?from\s+['"][^'"]+['"];?\s*$/gm, "")
  // JS/TS side-effect import and Go single import: import "<module>"
  result = result.replace(/^import\s+['"][^'"]+['"];?\s*$/gm, "")
  // CommonJS: const x = require(...)
  result = result.replace(/^(?:const|let|var)\s+.*?=\s*require\s*\(.*?\);?\s*$/gm, "")

  // Python (and any remaining single-line `import …` such as aliased Go imports)
  result = result.replace(/^(?:from\s+\S+\s+)?import\s+.+$/gm, "")

  return result
}
|
|
79
|
+
|
|
80
|
+
/**
 * Remove comments from source text.
 *
 * - Block comments are removed wherever they occur.
 * - `//` comments are removed only when they occupy a whole line, so inline
 *   URLs such as `http://…` survive.
 * - `#` comments are removed only when they occupy a whole line AND the `#`
 *   is followed by whitespace, another `#`, or the end of the line. This keeps
 *   shebangs (`#!/bin/sh`), JS/TS private members (`#count = 0`), Rust
 *   attributes (`#[derive(...)]`) and C preprocessor lines (`#include`)
 *   intact, at the cost of leaving unspaced comments such as `#todo` in place.
 *
 * String literals are not parsed, so comment-like text inside them is
 * removed as well.
 *
 * @param text Source text.
 * @returns The text with comments removed; line breaks are left behind.
 */
function stripComments(text: string): string {
  // Block comments
  let result = text.replace(/\/\*[\s\S]*?\*\//g, "")
  // Single-line // comments (only full-line, not inline URLs etc.)
  result = result.replace(/^\s*\/\/[^\n]*$/gm, "")
  // Python/Ruby/shell # comments (full line only, `#` followed by space/#/EOL)
  result = result.replace(/^\s*#(?=[ \t#]|$)[^\n]*$/gm, "")
  return result
}
|
|
90
|
+
|
|
91
|
+
// ── Shared ──────────────────────────────────────────────────────────────────
|
|
92
|
+
|
|
93
|
+
/**
 * Collapse every run of three or more consecutive newline characters into
 * exactly two, i.e. at most one empty line remains between pieces of text.
 * Lines containing only spaces or tabs are not treated as empty.
 *
 * @param text Text to normalise.
 * @returns The text with long newline runs shortened.
 */
function collapseBlankLines(text: string): string {
  const pieces = text.split(/\n{3,}/)
  return pieces.join("\n\n")
}
|
|
97
|
+
|
|
98
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
/**
 * Clean file content according to the supplied config.
 *
 * - "docs": optionally strips front matter and TOC blocks, and always strips
 *   breadcrumb lines.
 * - "code": always strips auto-generated banners, and optionally strips
 *   imports and comments.
 * - "config": no type-specific cleaning.
 *
 * In every case long runs of newlines are collapsed and the result is trimmed.
 *
 * @param content  Raw file content
 * @param fileType 'docs' | 'code' | 'config'
 * @param config   Cleaning options; defaults to `DEFAULT_CLEANING_CONFIG`
 * @returns The cleaned content.
 */
export function cleanContent(
  content: string,
  fileType: "docs" | "code" | "config",
  config: CleaningConfig = DEFAULT_CLEANING_CONFIG,
): string {
  let cleaned = content

  switch (fileType) {
    case "docs":
      if (config.remove_frontmatter_metadata) {
        cleaned = stripFrontmatter(cleaned)
      }
      if (config.remove_toc) {
        cleaned = stripHtmlTocMarkers(stripToc(cleaned))
      }
      cleaned = stripBreadcrumbs(cleaned)
      break

    case "code":
      cleaned = stripAutoGenMarkers(cleaned)
      if (config.remove_imports) {
        cleaned = stripImports(cleaned)
      }
      if (config.remove_comments) {
        cleaned = stripComments(cleaned)
      }
      break

    default:
      // Other file types get only the shared whitespace normalisation below.
      break
  }

  return collapseBlankLines(cleaned).trim()
}
|