@comfanion/usethis_search 0.1.4 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ /**
2
+ * Code Chunker — splits source code by functions, classes, and exports.
3
+ *
4
+ * Uses regex-based parsing (no AST dependency) to detect function/class
5
+ * boundaries. Falls back to line-based splitting for unstructured code.
6
+ */
7
+
8
+ export interface CodeChunkConfig { // options accepted by chunkCode
9
+ min_chunk_size: number // gaps between blocks shorter than this (chars) are merged into the previous chunk
10
+ max_chunk_size: number // classes/chunks longer than this (chars) are split further
11
+ split_by_functions: boolean // false → chunkCode only does line-based splitting
12
+ include_function_signature: boolean // NOTE(review): not read by any code visible in this file
13
+ }
14
+
15
+ export const DEFAULT_CODE_CONFIG: CodeChunkConfig = { // used by chunkCode when no config is passed
16
+ min_chunk_size: 300, // compared against string .length
17
+ max_chunk_size: 1500, // compared against string .length
18
+ split_by_functions: true,
19
+ include_function_signature: true,
20
+ }
21
+
22
+ export interface CodeChunk { // one output unit of chunkCode
23
+ content: string // chunk text, lines joined with "\n"
24
+ function_name?: string // set for function blocks and for methods of a split class
25
+ class_name?: string // set for class blocks and for every piece of a split class
26
+ }
27
+
28
+ // ── Block detection ─────────────────────────────────────────────────────────
29
+
30
+ interface CodeBlock { // a detected declaration and the line range it spans
31
+ type: "function" | "class" | "method" | "other" // detectBlocks only produces "function", "class" and "method"
32
+ name: string // identifier captured by the matching declaration regex
33
+ className?: string // enclosing class name; set only for "method" blocks
34
+ startLine: number // 0-based index into the lines array (inclusive)
35
+ endLine: number // 0-based index into the lines array (inclusive)
36
+ }
37
+
38
/**
 * Detect top-level function/class blocks via brace-counting.
 * Works for JS/TS/Go/Rust/Java/C-family languages.
 *
 * Each line is tested against a list of declaration regexes (class-like
 * patterns first, then function-like patterns). On a hit, `findBlockEnd`
 * determines where the block ends and scanning resumes after it. For every
 * class, the lines strictly inside it are additionally scanned for methods,
 * which are emitted as separate "method" blocks carrying `className`.
 *
 * Lines that are comments are never treated as declarations, and control-flow
 * statements such as `if (...) {` are not mistaken for functions or methods.
 *
 * @param lines Source file split into lines.
 * @returns Blocks in source order; a class block is immediately followed by
 *          its method blocks.
 */
function detectBlocks(lines: string[]): CodeBlock[] {
  const blocks: CodeBlock[] = []

  // Patterns for function declarations; capture group 1 is the name.
  const fnPatterns = [
    // JS/TS: function name(, async function name(, export function
    /(?:export\s+)?(?:async\s+)?function\s+(\w+)/,
    // Arrow: const name = (…) =>, const name = arg =>, optional return type
    /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+)\s*(?::[^=]*)?=>/,
    // Method-like: indented name( or async name( followed by {
    /^\s+(?:async\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*\w[^{]*)?\s*\{/,
    // Go: func Name( / func (recv) Name(
    /^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(/,
    // Rust: fn name(
    /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/,
    // Python def
    /^\s*(?:async\s+)?def\s+(\w+)\s*\(/,
  ]

  // Patterns for class-like declarations; capture group 1 is the name.
  const classPatterns = [
    // JS/TS/Java/C#: class Name
    /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/,
    // Rust: struct/enum/impl
    /(?:pub\s+)?(?:struct|enum|impl)\s+(\w+)/,
    // Python class
    /^class\s+(\w+)/,
  ]

  // Method header inside a class body, with optional modifiers.
  const methodPattern =
    /^\s+(?:(?:public|private|protected|static|async|override)\s+)*(\w+)\s*\([^)]*\)\s*(?::\s*[^{]*)?\s*\{/

  // `keyword (...) {` has the same shape as a method header; these names are
  // statements (or anonymous function expressions), never declarations.
  const nonDeclarationNames = new Set(["if", "for", "while", "switch", "function"])

  // Comment lines (`//`, `/*`, ` * …`, `# …`) only talk about code. Rust
  // attributes (`#[…]`, `#![…]`) are excluded because a declaration may follow
  // on the same line.
  const isCommentLine = (line: string): boolean =>
    /^\s*(?:\/\/|\/\*|\*|#(?![\[!]))/.test(line)

  // First pattern that matches with a usable name, or null.
  const firstDeclaration = (line: string, patterns: RegExp[]): RegExpMatchArray | null => {
    for (const pattern of patterns) {
      const match = line.match(pattern)
      if (match && !nonDeclarationNames.has(match[1])) return match
    }
    return null
  }

  let i = 0
  while (i < lines.length) {
    const line = lines[i]

    if (isCommentLine(line)) {
      i++
      continue
    }

    const classMatch = firstDeclaration(line, classPatterns)
    if (classMatch) {
      const name = classMatch[1]
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "class", name, startLine: i, endLine })

      // Look for methods strictly inside the class body.
      for (let j = i + 1; j < endLine; j++) {
        const methodLine = lines[j]
        if (isCommentLine(methodLine)) continue

        const methodMatch = methodLine.match(methodPattern)
        if (!methodMatch || nonDeclarationNames.has(methodMatch[1])) continue

        const mEnd = findBlockEnd(lines, j)
        blocks.push({
          type: "method",
          name: methodMatch[1],
          className: name,
          startLine: j,
          endLine: mEnd,
        })
        // Skip the method body so nested code is not scanned again.
        j = mEnd
      }

      i = endLine + 1
      continue
    }

    const fnMatch = firstDeclaration(line, fnPatterns)
    if (fnMatch) {
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "function", name: fnMatch[1], startLine: i, endLine })
      i = endLine + 1
      continue
    }

    i++
  }

  return blocks
}
131
+
132
/**
 * Locate the last line of the block that begins at `startLine`.
 *
 * A header that ends with `:` and contains no `{` is treated as
 * indentation-delimited and delegated to `findPythonBlockEnd`. Otherwise
 * braces are counted character by character (string and comment contents are
 * not excluded) and the block ends on the first line where at least one `{`
 * has been seen and the running depth is back to zero or below.
 *
 * @returns Index of the closing line; if no balanced block is found, at most
 *          50 lines past `startLine`, clamped to the last line.
 */
function findBlockEnd(lines: string[], startLine: number): number {
  const header = lines[startLine]

  // Colon-terminated header without a brace → indentation-based block.
  if (/:\s*$/.test(header) && !header.includes("{")) {
    return findPythonBlockEnd(lines, startLine)
  }

  let depth = 0
  let sawOpeningBrace = false

  for (let idx = startLine; idx < lines.length; idx += 1) {
    for (const ch of lines[idx]) {
      if (ch === "{") {
        depth += 1
        sawOpeningBrace = true
      } else if (ch === "}") {
        depth -= 1
      }
    }

    if (sawOpeningBrace && depth <= 0) return idx
  }

  // Never balanced: cap the block instead of swallowing the rest of the file.
  return Math.min(startLine + 50, lines.length - 1)
}
158
+
159
/**
 * Find end of indent-based block (Python).
 *
 * The block consists of the header at `startLine` plus every following line
 * indented deeper than the header. Blank lines inside the body do not end the
 * block, but blank lines after the last body line are not part of it, so the
 * returned index always points at real content.
 *
 * @returns Index of the last non-blank body line, or `startLine` when the
 *          header has no indented body.
 */
function findPythonBlockEnd(lines: string[], startLine: number): number {
  // Number of leading whitespace characters.
  const indentOf = (line: string): number => line.length - line.trimStart().length

  const baseIndent = indentOf(lines[startLine])
  let lastBodyLine = startLine

  for (let i = startLine + 1; i < lines.length; i++) {
    const line = lines[i]
    if (line.trim() === "") continue
    // First non-blank line at or left of the header closes the block.
    if (indentOf(line) <= baseIndent) break
    lastBodyLine = i
  }

  return lastBodyLine
}
174
+
175
+ // ── Fallback: line-based splitting ──────────────────────────────────────────
176
+
177
/**
 * Greedily pack consecutive lines into chunks.
 *
 * Each line is charged its length plus one (for the joining newline). A chunk
 * is closed as soon as adding the next line would push the running total past
 * `maxChars`. A single line longer than `maxChars` still becomes its own
 * chunk; lines are never cut.
 */
function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
  const out: CodeChunk[] = []
  let buffer: string[] = []
  let size = 0

  // Emit the buffered lines as one chunk (no-op when nothing is buffered).
  const flush = (): void => {
    if (buffer.length === 0) return
    out.push({ content: buffer.join("\n") })
    buffer = []
    size = 0
  }

  lines.forEach((line) => {
    const cost = line.length + 1
    if (size + cost > maxChars) flush()
    buffer.push(line)
    size += cost
  })

  flush()
  return out
}
198
+
199
+ // ── Public API ──────────────────────────────────────────────────────────────
200
+
201
/**
 * Chunk source code by functions/classes.
 *
 * Pipeline:
 *  1. With `split_by_functions` off, or when no block is detected, fall back
 *     to plain line-based splitting.
 *  2. Walk the detected top-level blocks in order. Code between blocks ("gaps")
 *     becomes its own chunk when it is at least `min_chunk_size` characters,
 *     otherwise it is appended to the previous chunk (or emitted as-is when
 *     there is no previous chunk).
 *  3. A class longer than `max_chunk_size` is emitted method by method; the
 *     class header and any members between methods are emitted as separate
 *     chunks tagged with `class_name`, so no line of the class is lost.
 *  4. Any chunk still longer than `max_chunk_size` is re-split by lines,
 *     keeping its metadata.
 *
 * `include_function_signature` is not consulted here.
 *
 * @param content Raw source text.
 * @param config  Chunking options; defaults to `DEFAULT_CODE_CONFIG`.
 * @returns Non-empty chunks in source order.
 */
export function chunkCode(
  content: string,
  config: CodeChunkConfig = DEFAULT_CODE_CONFIG,
): CodeChunk[] {
  const lines = content.split("\n")

  if (!config.split_by_functions) {
    return splitByLines(lines, config.max_chunk_size)
  }

  const blocks = detectBlocks(lines)

  if (blocks.length === 0) {
    // No recognizable blocks — fallback
    return splitByLines(lines, config.max_chunk_size)
  }

  const chunks: CodeChunk[] = []

  // Index of the last line already covered by a top-level block.
  let lastEnd = -1

  for (const block of blocks) {
    // Methods are handled through their enclosing class block.
    if (block.type === "method") continue

    // Gap before this block
    if (block.startLine > lastEnd + 1) {
      const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
      if (gapContent.length >= config.min_chunk_size) {
        chunks.push({ content: gapContent })
      } else if (gapContent.length > 0 && chunks.length > 0) {
        // Merge small gap with previous chunk
        chunks[chunks.length - 1].content += "\n\n" + gapContent
      } else if (gapContent.length > 0) {
        chunks.push({ content: gapContent })
      }
    }

    const blockContent = lines.slice(block.startLine, block.endLine + 1).join("\n")

    if (blockContent.length > config.max_chunk_size && block.type === "class") {
      // Only methods that really lie inside this class: the name alone is not
      // unique when a file declares two classes with the same name.
      const methods = blocks.filter(
        (b) =>
          b.type === "method" &&
          b.className === block.name &&
          b.startLine > block.startLine &&
          b.endLine <= block.endLine,
      )

      if (methods.length > 0) {
        // Start one line *before* the class so the header line is part of the
        // first gap instead of being dropped.
        let classLastEnd = block.startLine - 1

        for (const method of methods) {
          // Class header / members between methods
          if (method.startLine > classLastEnd + 1) {
            const gap = lines.slice(classLastEnd + 1, method.startLine).join("\n").trim()
            if (gap) {
              chunks.push({ content: gap, class_name: block.name })
            }
          }

          chunks.push({
            content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
            function_name: method.name,
            class_name: block.name,
          })
          classLastEnd = method.endLine
        }

        // Class tail (members after the last method and the closing brace)
        if (classLastEnd < block.endLine) {
          const tail = lines.slice(classLastEnd + 1, block.endLine + 1).join("\n").trim()
          if (tail) {
            chunks.push({ content: tail, class_name: block.name })
          }
        }
      } else {
        // No methods found — split by lines
        const subChunks = splitByLines(
          lines.slice(block.startLine, block.endLine + 1),
          config.max_chunk_size,
        )
        for (const sc of subChunks) {
          sc.class_name = block.name
          chunks.push(sc)
        }
      }
    } else {
      chunks.push({
        content: blockContent,
        function_name: block.type === "function" ? block.name : undefined,
        class_name: block.type === "class" ? block.name : block.className,
      })
    }

    lastEnd = block.endLine
  }

  // Trailing code after last block
  if (lastEnd < lines.length - 1) {
    const trailing = lines.slice(lastEnd + 1).join("\n").trim()
    if (trailing.length > 0) {
      chunks.push({ content: trailing })
    }
  }

  // Final: split any chunk still too large, keeping its metadata on each part
  const result: CodeChunk[] = []
  for (const chunk of chunks) {
    if (chunk.content.length > config.max_chunk_size) {
      const parts = splitByLines(chunk.content.split("\n"), config.max_chunk_size)
      for (const p of parts) {
        result.push({ ...chunk, content: p.content })
      }
    } else {
      result.push(chunk)
    }
  }

  return result.filter((c) => c.content.trim().length > 0)
}
@@ -0,0 +1,177 @@
1
+ /**
2
+ * Markdown Chunker — splits Markdown by heading structure.
3
+ *
4
+ * Preserves heading hierarchy ("API > Auth > JWT") in metadata,
5
+ * merges small sections, and splits oversized ones.
6
+ */
7
+
8
+ export interface MarkdownChunkConfig { // options accepted by chunkMarkdown
9
+ min_chunk_size: number // merge sections smaller than this (chars)
10
+ max_chunk_size: number // split sections larger than this (chars)
11
+ split_by_headings: boolean // false → chunkMarkdown returns the whole input as one chunk
12
+ preserve_heading_hierarchy: boolean // true → heading_context is the ancestor path, false → only the section's own heading
13
+ }
14
+
15
+ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = { // used by chunkMarkdown when no config is passed
16
+ min_chunk_size: 200, // compared against string .length
17
+ max_chunk_size: 2000, // compared against string .length
18
+ split_by_headings: true,
19
+ preserve_heading_hierarchy: true,
20
+ }
21
+
22
+ export interface MarkdownChunk { // one output unit of chunkMarkdown
23
+ content: string // section text including its heading line
24
+ heading_context: string // "H1 > H2 > H3"
25
+ }
26
+
27
+ // ── Internal types ──────────────────────────────────────────────────────────
28
+
29
+ interface Section { // a heading and the text up to the next heading
30
+ level: number // 1-6 for headings, 0 for preamble
31
+ heading: string // heading text without the leading #'s; "" for the preamble
32
+ body: string // following lines, each terminated with "\n"
33
+ }
34
+
35
+ // ── Parsing ─────────────────────────────────────────────────────────────────
36
+
37
/**
 * Parse Markdown into sections keyed by heading.
 *
 * Every ATX heading (`#` … `######` followed by whitespace and text) starts a
 * new section; text before the first heading forms a level-0 preamble.
 * Lines inside fenced code blocks (``` or ~~~) are always body text, so shell
 * comments such as `# install deps` do not split a section. Both LF and CRLF
 * line endings are accepted; bodies are stored with LF endings.
 *
 * @returns Sections in document order. Sections with neither a heading nor
 *          non-whitespace body are omitted.
 */
function parseSections(content: string): Section[] {
  const sections: Section[] = []
  let currentSection: Section = { level: 0, heading: "", body: "" }

  // Marker (e.g. "```" or "~~~~") of the fence we are inside, or null.
  let openFence: string | null = null

  // Push the section being built unless it is completely empty.
  const pushCurrent = (): void => {
    if (currentSection.body.trim() || currentSection.heading) {
      sections.push(currentSection)
    }
  }

  for (const line of content.split(/\r?\n/)) {
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/)
    if (fenceMatch) {
      const [, marker, rest] = fenceMatch
      if (openFence === null) {
        // A backtick fence whose info string contains a backtick is inline
        // code, not a fence opener.
        if (!(marker[0] === "`" && rest.includes("`"))) {
          openFence = marker
        }
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ""
      ) {
        // Closing fence: same character, at least as long, nothing after it.
        openFence = null
      }
      currentSection.body += line + "\n"
      continue
    }

    const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null
    if (headingMatch) {
      pushCurrent()
      currentSection = {
        level: headingMatch[1].length,
        heading: headingMatch[2].trim(),
        body: "",
      }
    } else {
      currentSection.body += line + "\n"
    }
  }

  // Push last section
  pushCurrent()

  return sections
}
67
+
68
/**
 * Render a heading stack (outermost first) as a breadcrumb-style path,
 * e.g. `API > Auth > JWT`. An empty stack yields an empty string.
 */
function buildHeadingContext(stack: { level: number; heading: string }[]): string {
  const titles: string[] = []
  for (const entry of stack) {
    titles.push(entry.heading)
  }
  return titles.join(" > ")
}
72
+
73
+ // ── Splitting oversized sections ────────────────────────────────────────────
74
+
75
/**
 * Break `text` into pieces by whole lines.
 *
 * Text that already fits in `maxSize` is returned unchanged as a single
 * piece. Otherwise lines are packed greedily: each line is charged its length
 * plus one for the newline, and a piece is closed when the next line would
 * push the running total past `maxSize`. A single over-long line is kept
 * intact as its own piece.
 */
function splitLargeText(text: string, maxSize: number): string[] {
  if (text.length <= maxSize) return [text]

  const pieces: string[] = []
  let pending: string[] = []
  let pendingSize = 0

  for (const line of text.split("\n")) {
    const cost = line.length + 1
    const overflows = pendingSize + cost > maxSize

    if (overflows && pending.length > 0) {
      pieces.push(pending.join("\n"))
      pending = []
      pendingSize = 0
    }

    pending.push(line)
    pendingSize += cost
  }

  if (pending.length > 0) {
    pieces.push(pending.join("\n"))
  }

  return pieces
}
99
+
100
+ // ── Public API ──────────────────────────────────────────────────────────────
101
+
102
/**
 * Chunk Markdown content into semantic sections.
 *
 * Steps:
 *  1. With `split_by_headings` off, the whole input is returned as a single
 *     chunk with an empty heading context.
 *  2. Each parsed section becomes a raw chunk (heading line + body, trimmed).
 *     Its `heading_context` is the path of enclosing headings when
 *     `preserve_heading_hierarchy` is set, otherwise just its own heading.
 *  3. A raw chunk shorter than `min_chunk_size` is appended to the chunk
 *     before it; the merged chunk takes over the appended chunk's heading
 *     context when that is non-empty. The first chunk is never merged away.
 *  4. Chunks longer than `max_chunk_size` are split by lines; every part
 *     inherits the heading context.
 *
 * @param content Markdown source.
 * @param config  Chunking options; defaults to `DEFAULT_MD_CONFIG`.
 * @returns Non-empty chunks in document order.
 */
export function chunkMarkdown(
  content: string,
  config: MarkdownChunkConfig = DEFAULT_MD_CONFIG,
): MarkdownChunk[] {
  // Fallback: single chunk (caller can use fixed chunker)
  if (!config.split_by_headings) {
    return [{ content, heading_context: "" }]
  }

  // Headings currently "open" above the section being visited, outermost first.
  const ancestry: { level: number; heading: string }[] = []

  const perSection: MarkdownChunk[] = parseSections(content).map((section) => {
    if (section.level > 0) {
      // Close headings at the same or a deeper level before opening this one.
      while (ancestry.length > 0 && ancestry[ancestry.length - 1].level >= section.level) {
        ancestry.pop()
      }
      ancestry.push({ level: section.level, heading: section.heading })
    }

    const heading_context = config.preserve_heading_hierarchy
      ? buildHeadingContext(ancestry)
      : section.heading

    let text = section.body
    if (section.heading) {
      text = `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
    }

    return { content: text.trim(), heading_context }
  })

  // Fold undersized sections into their predecessor.
  const merged: MarkdownChunk[] = []
  for (const piece of perSection) {
    const isSmall = piece.content.length < config.min_chunk_size
    if (merged.length === 0 || !isSmall) {
      merged.push({ ...piece })
      continue
    }

    const previous = merged[merged.length - 1]
    previous.content += "\n\n" + piece.content
    if (piece.heading_context) {
      previous.heading_context = piece.heading_context
    }
  }

  // Break up anything that is still too large.
  const sized: MarkdownChunk[] = []
  for (const chunk of merged) {
    if (chunk.content.length <= config.max_chunk_size) {
      sized.push(chunk)
      continue
    }
    for (const part of splitLargeText(chunk.content, config.max_chunk_size)) {
      sized.push({ content: part, heading_context: chunk.heading_context })
    }
  }

  // Filter empties
  return sized.filter((c) => c.content.trim().length > 0)
}
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Content Cleaner — removes noise from file content before chunking.
3
+ *
4
+ * Strips TOC blocks, breadcrumbs, repeated headers, auto-generated markers,
5
+ * and optionally imports/comments so the embedding model sees only signal.
6
+ */
7
+
8
+ export interface CleaningConfig { // switches read by cleanContent
9
+ remove_toc: boolean // docs only: strip heading-style and <!-- TOC --> tables of contents
10
+ remove_frontmatter_metadata: boolean // docs only: strip the leading ---…--- block
11
+ remove_imports: boolean // code only: strip import/require statements
12
+ remove_comments: boolean // code only: strip block and full-line comments
13
+ }
14
+
15
+ export const DEFAULT_CLEANING_CONFIG: CleaningConfig = { // used by cleanContent when no config is passed
16
+ remove_toc: true, // the only removal enabled by default
17
+ remove_frontmatter_metadata: false,
18
+ remove_imports: false,
19
+ remove_comments: false,
20
+ }
21
+
22
+ // ── Markdown noise ──────────────────────────────────────────────────────────
23
+
24
/**
 * Remove YAML front-matter (---…---) from Markdown.
 *
 * Only a block that starts on the very first line is removed (an optional
 * byte-order mark before it is removed as well). The opening and closing
 * `---` must each sit on a line of their own; trailing spaces/tabs and both
 * LF and CRLF line endings are accepted, and the block may be empty.
 * Text without leading front-matter is returned unchanged.
 */
function stripFrontmatter(text: string): string {
  return text.replace(/^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/, "")
}
28
+
29
/**
 * Remove inline TOC blocks.
 * Matches patterns like:
 *   ## Table of Contents
 *   - [Section](#section)
 *   …blank line
 *
 * The heading (level 1-3, case-insensitive "Table of Contents", "Contents"
 * or "TOC") must be followed by at least one list item whose text is a link
 * to an in-page anchor. Items may use `-`, `*` or `+` bullets or ordered
 * markers (`1.` / `1)`), and may be indented for nesting. The heading, the
 * items and the whitespace that follows them are removed.
 */
function stripToc(text: string): string {
  return text.replace(
    /^#{1,3}\s*(Table of Contents|Contents|TOC)\s*\n([\t ]*(?:[-*+]|\d+[.)])\s*\[.*?\]\(#.*?\)\s*\n?)+/gim,
    "",
  )
}
43
+
44
/**
 * Drop everything between an opening `<!-- TOC -->` comment and the nearest
 * closing `<!-- /TOC -->` comment (markers included, case-insensitive), plus
 * one newline directly after the closing marker.
 */
function stripHtmlTocMarkers(text: string): string {
  const tocRegion = /<!--\s*TOC\s*-->[\s\S]*?<!--\s*\/TOC\s*-->\n?/gi
  const withoutToc = text.replace(tocRegion, "")
  return withoutToc
}
48
+
49
/**
 * Remove breadcrumb lines, e.g. `Home > Docs > API`.
 *
 * A breadcrumb is a single line made of at least three segments separated by
 * `>`, where each segment consists of word characters and spaces only. The
 * line is removed together with its line break and at most one following
 * blank line. Matching never crosses a line boundary, so ordinary paragraphs
 * followed by `>` blockquotes are left untouched.
 */
function stripBreadcrumbs(text: string): string {
  // segment  = word(s) separated by spaces/tabs
  // crumb    = segment ( ">" segment ){2,} on one line
  return text.replace(
    /^[ \t]*\w+(?:[ \t]+\w+)*(?:[ \t]*>[ \t]*\w+(?:[ \t]+\w+)*){2,}[ \t]*(?:\r?\n){1,2}/gm,
    "",
  )
}
54
+
55
/**
 * Strip "generated file" banners from code.
 *
 * Starting at a `//` or `/*` comment opener that is followed by one of the
 * known phrases (AUTO-GENERATED, DO NOT EDIT/MODIFY, GENERATED BY,
 * "This file is/was generated", case-insensitive), everything up to the end
 * of that line is removed. Later lines of a multi-line comment are kept.
 */
function stripAutoGenMarkers(text: string): string {
  const banner =
    /\/[/*]\s*(?:AUTO[- ]?GENERATED|DO NOT (?:EDIT|MODIFY)|GENERATED BY|This file (?:is|was) (?:auto-?)?generated)[^\n]*/gi
  return text.replace(banner, "")
}
62
+
63
+ // ── Code noise ──────────────────────────────────────────────────────────────
64
+
65
/**
 * Remove import/require statements (JS/TS/Python/Go).
 *
 * Multi-line forms are handled before single-line ones, because the generic
 * single-line rule would otherwise remove only their first line and leave the
 * rest behind:
 *  - Go grouped imports `import ( … )`, including aliased entries and
 *    `//` comments inside the group;
 *  - Python parenthesised imports `from x import ( … )`;
 *  - JS/TS `import … from '…'`, possibly spanning lines. The import clause
 *    may not contain quotes or `;`, so the match cannot run across a
 *    side-effect import into unrelated code.
 * Then single-line forms: side-effect imports, `const x = require(…)`,
 * Python `import x` / `from x import y`, and Go `import "x"`.
 *
 * Removed statements leave their line breaks in place.
 */
function stripImports(text: string): string {
  let result = text

  // Go: grouped import block
  result = result.replace(
    /^import\s*\([ \t]*\r?\n(?:[ \t]*(?:(?:[\w.]+[ \t]+)?"[^"\n]*"[ \t]*)?(?:\/\/[^\n]*)?\r?\n)*[ \t]*\)/gm,
    "",
  )
  // Python: parenthesised from-import
  result = result.replace(/^from[ \t]+\S+[ \t]+import[ \t]*\([^)]*\)[ \t]*$/gm, "")
  // JS/TS: import <clause> from '<module>'
  result = result.replace(/^import\s[^'";]*?from\s+['"][^'"]+['"];?\s*$/gm, "")
  // JS/TS: side-effect import
  result = result.replace(/^import\s+['"][^'"]+['"];?\s*$/gm, "")
  // require
  result = result.replace(/^(?:const|let|var)\s+.*?=\s*require\s*\(.*?\);?\s*$/gm, "")
  // Python single-line (also catches any other line starting with `import `)
  result = result.replace(/^(?:from\s+\S+\s+)?import\s+.+$/gm, "")
  // Go single import
  result = result.replace(/^import\s+"[^"]*"\s*$/gm, "")

  return result
}
79
+
80
/**
 * Remove block comments and full-line comments.
 *
 *  - Block comments are removed wherever they occur. This is purely textual,
 *    so the comment delimiters inside a string literal are removed as well.
 *  - `//` comments are removed only when they occupy the whole line, so
 *    inline URLs etc. are left alone.
 *  - `#` comments (Python/Ruby/shell) are removed only when they occupy the
 *    whole line AND the `#` is followed by a space, a tab, another `#`, or
 *    the end of the line. That keeps shebangs (`#!`), preprocessor lines
 *    (`#include`), Rust attributes (`#[…]`) and JS private members
 *    (`#count = 0`). A comment written without a space (`#note`) is kept.
 *
 * Removed comments leave their line breaks in place.
 */
function stripComments(text: string): string {
  // Block comments
  let result = text.replace(/\/\*[\s\S]*?\*\//g, "")
  // Single-line // comments (only full-line, not inline URLs etc.)
  result = result.replace(/^\s*\/\/[^\n]*$/gm, "")
  // Full-line # comments
  result = result.replace(/^[ \t]*#(?:[ \t#][^\n]*)?$/gm, "")
  return result
}
90
+
91
+ // ── Shared ──────────────────────────────────────────────────────────────────
92
+
93
/**
 * Squeeze every run of three or more newline characters down to exactly two,
 * i.e. leave at most one empty line between pieces of content.
 */
function collapseBlankLines(text: string): string {
  const newlineRun = /\n{3,}/g
  return text.replace(newlineRun, "\n\n")
}
97
+
98
+ // ── Public API ──────────────────────────────────────────────────────────────
99
+
100
/**
 * Strip noise from a file's content before it is chunked.
 *
 * What is removed depends on the file type:
 *  - "docs": front-matter (if `remove_frontmatter_metadata`), TOC blocks in
 *    heading and HTML-marker form (if `remove_toc`), and breadcrumb lines
 *    (always).
 *  - "code": auto-generated banners (always), imports (if `remove_imports`),
 *    comments (if `remove_comments`).
 *  - "config": nothing type-specific.
 * For every type, runs of blank lines are collapsed and the result trimmed.
 *
 * @param content  Raw file content
 * @param fileType 'docs' | 'code' | 'config'
 * @param config   Cleaning options; defaults to `DEFAULT_CLEANING_CONFIG`
 * @returns The cleaned text.
 */
export function cleanContent(
  content: string,
  fileType: "docs" | "code" | "config",
  config: CleaningConfig = DEFAULT_CLEANING_CONFIG,
): string {
  let cleaned = content

  switch (fileType) {
    case "docs":
      if (config.remove_frontmatter_metadata) {
        cleaned = stripFrontmatter(cleaned)
      }
      if (config.remove_toc) {
        cleaned = stripHtmlTocMarkers(stripToc(cleaned))
      }
      cleaned = stripBreadcrumbs(cleaned)
      break

    case "code":
      cleaned = stripAutoGenMarkers(cleaned)
      if (config.remove_imports) {
        cleaned = stripImports(cleaned)
      }
      if (config.remove_comments) {
        cleaned = stripComments(cleaned)
      }
      break

    default:
      // "config": only the shared whitespace normalisation below applies.
      break
  }

  return collapseBlankLines(cleaned).trim()
}