@comfanion/usethis_search 0.1.5 → 0.2.0-dev.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ /**
2
+ * BM25 Index — keyword-based search using Okapi BM25 scoring.
3
+ *
4
+ * Builds an inverted index from chunk content and scores queries
5
+ * against it. Designed to complement vector similarity search.
6
+ */
7
+
8
+ // ── BM25 parameters ────────────────────────────────────────────────────────
9
+
10
/** Okapi BM25 `k1`: term-frequency saturation (how fast repeated hits stop adding score). */
const K1 = 1.2
/** Okapi BM25 `b`: strength of the document-length normalization. */
const B = 0.75
12
+
13
+ // ── Types ───────────────────────────────────────────────────────────────────
14
+
15
/** Per-document statistics stored by the index. */
interface DocEntry {
  /** Document identifier (its position in the array passed to `build`). */
  id: number
  /** Number of occurrences of each term in the document. */
  termFreqs: Map<string, number>
  /** Total number of tokens in the document. */
  length: number
}

/** One scored hit returned from a search. */
export interface BM25Result {
  /** Identifier of the matching document. */
  id: number
  /** BM25 score of the document for the query. */
  score: number
}
25
+
26
+ // ── Tokenizer ───────────────────────────────────────────────────────────────
27
+
28
/** Common English function words that are dropped during tokenization. */
const STOP_WORDS = new Set([
  "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "do", "does", "did", "will", "would", "could",
  "should", "may", "might", "shall", "can", "need", "must",
  "and", "or", "but", "not", "no", "nor",
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
  "into", "about", "between", "through", "during", "before", "after",
  "this", "that", "these", "those", "it", "its",
  "i", "you", "he", "she", "we", "they", "me", "him", "her", "us", "them",
  "my", "your", "his", "our", "their",
  "what", "which", "who", "whom", "where", "when", "how", "why",
  "if", "then", "else", "so", "than", "too", "very",
])

/**
 * Tokenize text into lowercase terms, filtering stop words and short tokens.
 *
 * A term is a run of letters, combining marks, digits, `_` and `-`; every
 * other character acts as a separator. Letters and digits are matched with
 * Unicode property escapes, so non-ASCII words ("café", "привет") stay
 * intact instead of being cut at every non-ASCII character.
 *
 * @param text Raw text to tokenize.
 * @returns Terms in order of appearance (duplicates kept), each longer than
 *   one character and not in `STOP_WORDS`.
 */
export function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    // \p{M} keeps combining marks attached to their base letter.
    .replace(/[^\p{L}\p{M}\p{N}_-]/gu, " ")
    .split(/\s+/)
    .filter((term) => term.length > 1 && !STOP_WORDS.has(term))
}
52
+
53
+ // ── BM25 Index class ────────────────────────────────────────────────────────
54
+
55
/**
 * In-memory inverted index scored with Okapi BM25.
 *
 * Documents are identified by their position in the array given to
 * {@link BM25Index.build}; every call to `build` replaces the previous index.
 */
export class BM25Index {
  private docs: DocEntry[] = []
  private invertedIndex: Map<string, Set<number>> = new Map()
  private avgDocLength = 0
  private docCount = 0

  /**
   * Build index from a list of text documents.
   * Each document is identified by its array index.
   *
   * @param documents Raw document texts; any previously indexed data is discarded.
   */
  build(documents: string[]): void {
    this.docs = []
    this.invertedIndex = new Map()

    let totalLength = 0

    for (let id = 0; id < documents.length; id++) {
      const tokens = tokenize(documents[id])
      const termFreqs = new Map<string, number>()

      for (const token of tokens) {
        termFreqs.set(token, (termFreqs.get(token) ?? 0) + 1)

        let postings = this.invertedIndex.get(token)
        if (postings === undefined) {
          postings = new Set<number>()
          this.invertedIndex.set(token, postings)
        }
        postings.add(id)
      }

      this.docs.push({ id, termFreqs, length: tokens.length })
      totalLength += tokens.length
    }

    this.docCount = documents.length
    this.avgDocLength = this.docCount > 0 ? totalLength / this.docCount : 0
  }

  /**
   * Score a query against indexed documents.
   *
   * A term that appears several times in the query contributes once per
   * occurrence.
   *
   * @param query Free-text query; tokenized the same way as the documents.
   * @param limit Maximum number of results. Zero, negative or NaN yields an
   *   empty array (a negative value is no longer passed on to `slice`, where
   *   it silently dropped the last results instead).
   * @returns Matching documents sorted by descending score.
   */
  search(query: string, limit: number = 50): BM25Result[] {
    if (!(limit > 0)) return []

    const queryTerms = tokenize(query)
    if (queryTerms.length === 0) return []

    // IDF depends only on the term, so compute it once per distinct term
    // while collecting every document that contains at least one query term.
    const idfByTerm = new Map<string, number>()
    const candidateIds = new Set<number>()
    for (const term of queryTerms) {
      if (idfByTerm.has(term)) continue
      const postings = this.invertedIndex.get(term)
      if (postings === undefined) continue

      const df = postings.size
      idfByTerm.set(term, Math.log((this.docCount - df + 0.5) / (df + 0.5) + 1))
      for (const id of postings) candidateIds.add(id)
    }

    if (candidateIds.size === 0) return []

    const results: BM25Result[] = []

    for (const docId of candidateIds) {
      const doc = this.docs[docId]
      if (doc === undefined) continue

      // Length normalization depends only on the document, not on the term.
      const lengthNorm = K1 * (1 - B + B * (doc.length / this.avgDocLength))
      let score = 0

      for (const term of queryTerms) {
        const tf = doc.termFreqs.get(term) ?? 0
        if (tf === 0) continue

        const idf = idfByTerm.get(term) ?? 0
        const tfNorm = (tf * (K1 + 1)) / (tf + lengthNorm)

        score += idf * tfNorm
      }

      if (score > 0) {
        results.push({ id: docId, score })
      }
    }

    results.sort((a, b) => b.score - a.score)
    return results.slice(0, limit)
  }

  /** Number of indexed documents. */
  get size(): number {
    return this.docCount
  }

  /** Number of unique terms. */
  get vocabularySize(): number {
    return this.invertedIndex.size
  }

  /** Release all memory held by the index. */
  clear(): void {
    this.docs = []
    this.invertedIndex = new Map()
    this.avgDocLength = 0
    this.docCount = 0
  }
}
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Chunker Factory — selects the appropriate chunker based on file type.
3
+ *
4
+ * Routes: markdown → markdown-chunker, code → code-chunker, else → fixed.
5
+ */
6
+
7
+ import { chunkMarkdown, type MarkdownChunkConfig, DEFAULT_MD_CONFIG } from "./markdown-chunker"
8
+ import { chunkCode, type CodeChunkConfig, DEFAULT_CODE_CONFIG } from "./code-chunker"
9
+ import type { FileType } from "../metadata-extractor"
10
+
11
+ // ── Types ───────────────────────────────────────────────────────────────────
12
+
13
/** Name of the chunking approach requested by the caller. */
export type ChunkingStrategy = "fixed" | "semantic" | "hybrid"

/** Settings for the chunker factory and for each chunker it can dispatch to. */
export interface ChunkingConfig {
  /** Requested strategy. */
  strategy: ChunkingStrategy
  /** Options forwarded to the markdown chunker. */
  markdown: MarkdownChunkConfig
  /** Options forwarded to the code chunker. */
  code: CodeChunkConfig
  /** Options for the fixed-size chunker. */
  fixed: { max_chars: number }
}

/** Configuration used when the caller does not supply one. */
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  strategy: "semantic",
  markdown: DEFAULT_MD_CONFIG,
  code: DEFAULT_CODE_CONFIG,
  fixed: { max_chars: 1500 },
}

/** Unified chunk output from any chunker. */
export interface UnifiedChunk {
  /** Text of the chunk. */
  content: string
  /** Heading context reported by the markdown chunker, when available. */
  heading_context?: string
  /** Function name reported by the code chunker, when available. */
  function_name?: string
  /** Class name reported by the code chunker, when available. */
  class_name?: string
}
36
+
37
+ // ── Fixed chunker (legacy) ──────────────────────────────────────────────────
38
+
39
/**
 * Split text into chunks of roughly `maxChars` characters on line boundaries.
 *
 * Lines are accumulated until adding the next one (plus its newline) would
 * exceed `maxChars`. Two cases are handled explicitly:
 *  - a single line longer than `maxChars` is cut into pieces of at most
 *    `maxChars` characters instead of being emitted as one oversized chunk;
 *  - chunks consisting only of whitespace (including the one produced by
 *    empty input) are dropped.
 *
 * @param content Text to split.
 * @param maxChars Target upper bound for a chunk's length. Hard-splitting of
 *   long lines is only done when this is a finite number >= 1.
 * @returns Chunks in source order; empty for empty or whitespace-only input.
 */
function chunkFixed(content: string, maxChars: number): UnifiedChunk[] {
  const chunks: UnifiedChunk[] = []
  let current: string[] = []
  let currentLen = 0

  /** Emit the accumulated lines as one chunk (unless blank) and reset. */
  const flush = (): void => {
    if (current.length === 0) return
    const text = current.join("\n")
    if (text.trim().length > 0) chunks.push({ content: text })
    current = []
    currentLen = 0
  }

  const canHardSplit = Number.isFinite(maxChars) && maxChars >= 1
  const step = canHardSplit ? Math.floor(maxChars) : 0

  for (const line of content.split("\n")) {
    if (canHardSplit && line.length > step) {
      // The line alone exceeds the budget: close the running chunk and cut
      // the line into fixed-size pieces.
      flush()
      let start = 0
      while (start < line.length) {
        let end = Math.min(start + step, line.length)
        // Do not cut between the two halves of a surrogate pair.
        const lastCode = line.charCodeAt(end - 1)
        if (end < line.length && end - start > 1 && lastCode >= 0xd800 && lastCode <= 0xdbff) {
          end -= 1
        }
        const piece = line.slice(start, end)
        if (piece.trim().length > 0) chunks.push({ content: piece })
        start = end
      }
      continue
    }

    if (currentLen + line.length + 1 > maxChars && current.length > 0) {
      flush()
    }
    current.push(line)
    currentLen += line.length + 1 // +1 accounts for the joining newline
  }

  flush()
  return chunks
}
61
+
62
+ // ── Public API ──────────────────────────────────────────────────────────────
63
+
64
/**
 * Chunk content using the appropriate strategy for the given file type.
 *
 * Routing:
 *  - strategy `"fixed"` always uses the fixed-size chunker;
 *  - otherwise, `"docs"` files or `"markdown"` language go to the markdown chunker;
 *  - `"code"` files go to the code chunker;
 *  - anything else falls back to the fixed-size chunker.
 *
 * @param content Text of the file.
 * @param fileType File category used for routing.
 * @param language Language name; `"markdown"` forces the markdown chunker.
 * @param config Chunking settings; defaults to `DEFAULT_CHUNKING_CONFIG`.
 * @returns Chunks carrying only the fields of `UnifiedChunk` relevant to the chosen chunker.
 */
export function chunkContent(
  content: string,
  fileType: FileType,
  language: string,
  config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG,
): UnifiedChunk[] {
  const viaFixed = (): UnifiedChunk[] => chunkFixed(content, config.fixed.max_chars)

  if (config.strategy === "fixed") return viaFixed()

  const isMarkdown = fileType === "docs" || language === "markdown"
  if (isMarkdown) {
    return chunkMarkdown(content, config.markdown).map(
      ({ content: body, heading_context }) => ({ content: body, heading_context }),
    )
  }

  // Config files or unknown types have no dedicated chunker.
  if (fileType !== "code") return viaFixed()

  return chunkCode(content, config.code).map(
    ({ content: body, function_name, class_name }) => ({ content: body, function_name, class_name }),
  )
}
@@ -0,0 +1,325 @@
1
+ /**
2
+ * Code Chunker — splits source code by functions, classes, and exports.
3
+ *
4
+ * Uses regex-based parsing (no AST dependency) to detect function/class
5
+ * boundaries. Falls back to line-based splitting for unstructured code.
6
+ */
7
+
8
/** Options controlling how source code is chunked. */
export interface CodeChunkConfig {
  /** Gaps between detected blocks shorter than this are merged into the preceding chunk. */
  min_chunk_size: number
  /** Chunks longer than this many characters are split further by lines. */
  max_chunk_size: number
  /** When false, block detection is skipped and the code is split purely by lines. */
  split_by_functions: boolean
  /** NOTE(review): not read by any code visible in this module — confirm intended use. */
  include_function_signature: boolean
}

/** Code chunking options used when the caller does not supply any. */
export const DEFAULT_CODE_CONFIG: CodeChunkConfig = {
  min_chunk_size: 300,
  max_chunk_size: 1500,
  split_by_functions: true,
  include_function_signature: true,
}

/** A piece of source code plus the enclosing names, when known. */
export interface CodeChunk {
  /** Text of the chunk. */
  content: string
  /** Name of the function or method the chunk was cut from. */
  function_name?: string
  /** Name of the class the chunk belongs to. */
  class_name?: string
}

// ── Block detection ─────────────────────────────────────────────────────────

/** A detected declaration and the (0-based, inclusive) line range it spans. */
interface CodeBlock {
  /** Kind of declaration. */
  type: "function" | "class" | "method" | "other"
  /** Declared identifier. */
  name: string
  /** Owning class; set for methods. */
  className?: string
  /** Index of the declaration line. */
  startLine: number
  /** Index of the last line of the block. */
  endLine: number
}
37
+
38
/**
 * Detect top-level function/class blocks via brace-counting.
 * Works for JS/TS/Go/Rust/Java/C-family languages.
 *
 * This is a line-oriented regex heuristic, not a parser:
 *  - lines that are only a comment are ignored, so prose such as
 *    "this class handles ..." is not mistaken for a declaration;
 *  - `if (...) {`, `for (...) {` and other control-flow headers are never
 *    reported as functions or methods;
 *  - lines inside a detected function are skipped; inside a class only
 *    method declarations are looked for.
 *
 * @param lines Source file split into lines.
 * @returns Blocks in source order; each class is followed by its methods.
 */
function detectBlocks(lines: string[]): CodeBlock[] {
  const blocks: CodeBlock[] = []

  // Keywords that can precede "(...) {" without declaring a callable.
  const controlKeywords = new Set([
    "if", "for", "while", "switch", "catch", "with",
    "function", "return", "foreach", "synchronized",
  ])

  /** True when the line holds nothing but a comment (or part of one). */
  const isCommentOnly = (line: string): boolean => {
    const text = line.trim()
    if (text.startsWith("//") || text.startsWith("*")) return true
    // "/* ... */ code" still carries code, so only skip pure comment lines.
    if (text.startsWith("/*")) return !text.includes("*/") || text.endsWith("*/")
    return text === "#" || /^#\s/.test(text)
  }

  // Patterns for function/class declarations
  const fnPatterns = [
    // JS/TS: function name(, async function name(, export function
    /(?:export\s+)?(?:async\s+)?function\s+(\w+)/,
    // Arrow: const name = (…) => or const name = async (
    /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[^=])\s*=>/,
    // Method inside class: name( or async name(
    /^\s+(?:async\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*\w[^{]*)?\s*\{/,
    // Go: func Name(
    /^func\s+(?:\([^)]*\)\s+)?(\w+)\s*\(/,
    // Rust: fn name(
    /(?:pub\s+)?(?:async\s+)?fn\s+(\w+)/,
    // Python def
    /^\s*(?:async\s+)?def\s+(\w+)\s*\(/,
  ]

  const classPatterns = [
    // JS/TS/Java/C#: class Name
    /(?:export\s+)?(?:abstract\s+)?class\s+(\w+)/,
    // Rust: struct/enum/impl
    /(?:pub\s+)?(?:struct|enum|impl)\s+(\w+)/,
    // Python class
    /^class\s+(\w+)/,
  ]

  // Method declaration inside a class body (with optional modifiers).
  const methodPattern =
    /^\s+(?:(?:public|private|protected|static|async|override)\s+)*(\w+)\s*\([^)]*\)\s*(?::\s*[^{]*)?\s*\{/

  /** Name captured by the first pattern that matches and is not a control keyword. */
  const firstName = (line: string, patterns: RegExp[]): string | undefined => {
    for (const pattern of patterns) {
      const match = line.match(pattern)
      if (match && !controlKeywords.has(match[1])) return match[1]
    }
    return undefined
  }

  let i = 0

  while (i < lines.length) {
    const line = lines[i]

    if (isCommentOnly(line)) {
      i++
      continue
    }

    // Check for class
    const className = firstName(line, classPatterns)
    if (className !== undefined) {
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "class", name: className, startLine: i, endLine })

      // Look for methods inside (constructors are reported like any other method)
      for (let j = i + 1; j < endLine; j++) {
        const methodMatch = lines[j].match(methodPattern)
        if (!methodMatch || controlKeywords.has(methodMatch[1])) continue

        const mEnd = findBlockEnd(lines, j)
        blocks.push({
          type: "method",
          name: methodMatch[1],
          className,
          startLine: j,
          endLine: mEnd,
        })
        j = mEnd // resume after the method body
      }

      i = endLine + 1
      continue
    }

    // Check for standalone function
    const fnName = firstName(line, fnPatterns)
    if (fnName !== undefined) {
      const endLine = findBlockEnd(lines, i)
      blocks.push({ type: "function", name: fnName, startLine: i, endLine })
      i = endLine + 1
      continue
    }

    i++
  }

  return blocks
}
131
+
132
/**
 * Find end of brace-delimited block starting at `startLine`.
 *
 * A header line that ends with `:` and contains no `{` is treated as
 * indentation-delimited and handed to `findPythonBlockEnd`. Otherwise braces
 * are counted character by character (string and comment contents are not
 * excluded) and the first line on which the count returns to zero or below
 * after an opening brace is the end.
 *
 * @param lines Source file split into lines.
 * @param startLine Index of the declaration line.
 * @returns Index of the block's last line; if no balanced block is found,
 *   `startLine + 50` clamped to the last line of the file.
 */
function findBlockEnd(lines: string[], startLine: number): number {
  const header = lines[startLine]

  // For Python-style (indent-based), use indent detection
  if (/:\s*$/.test(header) && !header.includes("{")) {
    return findPythonBlockEnd(lines, startLine)
  }

  let depth = 0
  let opened = false

  for (let index = startLine; index < lines.length; index++) {
    for (const ch of lines[index]) {
      if (ch === "{") {
        depth += 1
        opened = true
      } else if (ch === "}") {
        depth -= 1
      }
    }
    if (opened && depth <= 0) return index
  }

  // Never balanced: assume a block of at most 50 lines.
  return Math.min(startLine + 50, lines.length - 1)
}
158
+
159
/**
 * Find end of indent-based block (Python).
 *
 * The block runs until the first non-blank line whose leading whitespace is
 * no longer than that of the header line; blank lines never end a block.
 *
 * @param lines Source file split into lines.
 * @param startLine Index of the header line (`def ...:` / `class ...:`).
 * @returns Index of the line just before the terminating line, or the last
 *   line of the file when no such line exists.
 */
function findPythonBlockEnd(lines: string[], startLine: number): number {
  const indentWidth = (text: string): number => /^\s*/.exec(text)?.[0].length ?? 0

  const baseIndent = indentWidth(lines[startLine])
  let cursor = startLine + 1

  while (cursor < lines.length) {
    const candidate = lines[cursor]
    const isBlank = candidate.trim() === ""
    if (!isBlank && indentWidth(candidate) <= baseIndent) {
      return cursor - 1
    }
    cursor += 1
  }

  return lines.length - 1
}
174
+
175
+ // ── Fallback: line-based splitting ──────────────────────────────────────────
176
+
177
/**
 * Split lines into chunks of roughly `maxChars` characters on line boundaries.
 *
 * Lines are accumulated until adding the next one (plus its newline) would
 * exceed `maxChars`. Two cases are handled explicitly:
 *  - a single line longer than `maxChars` (for example minified code) is cut
 *    into pieces of at most `maxChars` characters instead of being emitted
 *    as one oversized chunk;
 *  - chunks consisting only of whitespace are dropped.
 *
 * @param lines Lines to group, without trailing newlines.
 * @param maxChars Target upper bound for a chunk's length. Hard-splitting of
 *   long lines is only done when this is a finite number >= 1.
 * @returns Chunks in source order, carrying only `content`.
 */
function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
  const chunks: CodeChunk[] = []
  let current: string[] = []
  let currentLen = 0

  /** Emit the accumulated lines as one chunk (unless blank) and reset. */
  const flush = (): void => {
    if (current.length === 0) return
    const text = current.join("\n")
    if (text.trim().length > 0) chunks.push({ content: text })
    current = []
    currentLen = 0
  }

  const canHardSplit = Number.isFinite(maxChars) && maxChars >= 1
  const step = canHardSplit ? Math.floor(maxChars) : 0

  for (const line of lines) {
    if (canHardSplit && line.length > step) {
      // The line alone exceeds the budget: close the running chunk and cut
      // the line into fixed-size pieces.
      flush()
      let start = 0
      while (start < line.length) {
        let end = Math.min(start + step, line.length)
        // Do not cut between the two halves of a surrogate pair.
        const lastCode = line.charCodeAt(end - 1)
        if (end < line.length && end - start > 1 && lastCode >= 0xd800 && lastCode <= 0xdbff) {
          end -= 1
        }
        const piece = line.slice(start, end)
        if (piece.trim().length > 0) chunks.push({ content: piece })
        start = end
      }
      continue
    }

    if (currentLen + line.length + 1 > maxChars && current.length > 0) {
      flush()
    }
    current.push(line)
    currentLen += line.length + 1 // +1 accounts for the joining newline
  }

  flush()
  return chunks
}
198
+
199
+ // ── Public API ──────────────────────────────────────────────────────────────
200
+
201
/**
 * Split one class into a chunk per method, plus chunks for the code around
 * the methods (declaration line and fields, gaps between methods, tail).
 *
 * The scan starts at the class declaration line itself so that the
 * `class X extends Y {` header is part of the first preamble chunk rather
 * than being left out of every chunk.
 */
function splitClassByMethods(
  lines: string[],
  classBlock: CodeBlock,
  methods: CodeBlock[],
): CodeChunk[] {
  const pieces: CodeChunk[] = []
  let cursor = classBlock.startLine - 1 // last line already emitted

  for (const method of methods) {
    // Class preamble / gap before method
    if (method.startLine > cursor + 1) {
      const gap = lines.slice(cursor + 1, method.startLine).join("\n").trim()
      if (gap) {
        pieces.push({ content: gap, class_name: classBlock.name })
      }
    }

    pieces.push({
      content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
      function_name: method.name,
      class_name: classBlock.name,
    })
    cursor = method.endLine
  }

  // Class tail
  if (cursor < classBlock.endLine) {
    const tail = lines.slice(cursor + 1, classBlock.endLine + 1).join("\n").trim()
    if (tail) {
      pieces.push({ content: tail, class_name: classBlock.name })
    }
  }

  return pieces
}

/**
 * Chunk source code by functions/classes.
 *
 * Steps:
 *  1. Without `split_by_functions`, or when no blocks are detected, the code
 *     is split purely by lines.
 *  2. Each detected function/class becomes a chunk. Code between blocks
 *     becomes its own chunk when it reaches `min_chunk_size`; shorter gaps
 *     are appended to the preceding chunk (or stand alone if there is none).
 *  3. A class larger than `max_chunk_size` is split into its methods; if it
 *     has none it is split by lines.
 *  4. Any chunk still larger than `max_chunk_size` is split by lines, keeping
 *     its function/class names. Whitespace-only chunks are removed.
 *
 * @param content Source code of one file.
 * @param config Chunking options; defaults to `DEFAULT_CODE_CONFIG`.
 * @returns Chunks in source order.
 */
export function chunkCode(
  content: string,
  config: CodeChunkConfig = DEFAULT_CODE_CONFIG,
): CodeChunk[] {
  const lines = content.split("\n")

  if (!config.split_by_functions) {
    return splitByLines(lines, config.max_chunk_size)
  }

  const blocks = detectBlocks(lines)

  if (blocks.length === 0) {
    // No recognizable blocks — fallback
    return splitByLines(lines, config.max_chunk_size)
  }

  const chunks: CodeChunk[] = []

  // Collect "gaps" (code between blocks) and blocks themselves
  let lastEnd = -1

  for (const block of blocks) {
    // Methods are covered by their class block; they are only used when the
    // class has to be split.
    if (block.type === "method") continue

    // Gap before this block
    if (block.startLine > lastEnd + 1) {
      const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
      const previous = chunks[chunks.length - 1]
      if (gapContent.length >= config.min_chunk_size) {
        chunks.push({ content: gapContent })
      } else if (gapContent.length > 0) {
        if (previous !== undefined) {
          // Merge small gap with previous chunk
          previous.content += "\n\n" + gapContent
        } else {
          chunks.push({ content: gapContent })
        }
      }
    }

    const blockContent = lines.slice(block.startLine, block.endLine + 1).join("\n")

    if (blockContent.length > config.max_chunk_size && block.type === "class") {
      // Match by name AND position so that two classes sharing a name in one
      // file do not pick up each other's methods.
      const methods = blocks.filter(
        (b) =>
          b.type === "method" &&
          b.className === block.name &&
          b.startLine > block.startLine &&
          b.startLine <= block.endLine,
      )

      if (methods.length > 0) {
        chunks.push(...splitClassByMethods(lines, block, methods))
      } else {
        // No methods found — split by lines
        const subChunks = splitByLines(
          lines.slice(block.startLine, block.endLine + 1),
          config.max_chunk_size,
        )
        for (const sc of subChunks) {
          sc.class_name = block.name
          chunks.push(sc)
        }
      }
    } else {
      chunks.push({
        content: blockContent,
        function_name: block.type === "function" ? block.name : undefined,
        class_name: block.type === "class" ? block.name : block.className,
      })
    }

    lastEnd = block.endLine
  }

  // Trailing code after last block
  if (lastEnd < lines.length - 1) {
    const trailing = lines.slice(lastEnd + 1).join("\n").trim()
    if (trailing.length > 0) {
      chunks.push({ content: trailing })
    }
  }

  // Final: split any chunk still too large
  const result: CodeChunk[] = []
  for (const chunk of chunks) {
    if (chunk.content.length > config.max_chunk_size) {
      const parts = splitByLines(chunk.content.split("\n"), config.max_chunk_size)
      for (const part of parts) {
        result.push({ ...chunk, content: part.content })
      }
    } else {
      result.push(chunk)
    }
  }

  return result.filter((c) => c.content.trim().length > 0)
}