@comfanion/usethis_search 3.0.0-dev.9 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
1
+ /**
2
+ * ChunkStore — SQLite-based persistent chunk storage.
3
+ * Populated by Phase 1 (no vectors needed). Provides BM25 search
4
+ * and metadata queries immediately, before embedding is complete.
5
+ *
6
+ * Uses bun:sqlite with WAL mode for concurrent read access.
7
+ */
8
+
9
+ import { Database } from "bun:sqlite"
10
+
11
+ export interface StoredChunk {
12
+ chunk_id: string
13
+ file: string
14
+ chunk_index: number
15
+ content: string
16
+ file_type: string
17
+ language: string
18
+ last_modified: string
19
+ file_size: number
20
+ heading_context: string
21
+ function_name: string
22
+ class_name: string
23
+ tags: string
24
+ start_line: number
25
+ end_line: number
26
+ archived: boolean
27
+ vectorized: boolean
28
+ }
29
+
30
+ export class ChunkStore {
31
+ private db: Database | null = null
32
+
33
+ // Prepared statements
34
+ private _stmtInsert: any = null
35
+ private _stmtByFile: any = null
36
+ private _stmtDeleteByFile: any = null
37
+ private _stmtAll: any = null
38
+ private _stmtByChunkId: any = null
39
+ private _stmtMarkVectorized: any = null
40
+ private _stmtHasVectors: any = null
41
+ private _stmtCount: any = null
42
+ private _stmtSearch: any = null
43
+
44
+ constructor(private dbPath: string) {}
45
+
46
+ async init(): Promise<this> {
47
+ const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
48
+ this.db = new Database(fullPath)
49
+
50
+ this.db.exec("PRAGMA journal_mode = WAL")
51
+ this.db.exec("PRAGMA synchronous = NORMAL")
52
+ this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
53
+
54
+ this.db.exec(`
55
+ CREATE TABLE IF NOT EXISTS chunks (
56
+ chunk_id TEXT PRIMARY KEY,
57
+ file TEXT NOT NULL,
58
+ chunk_index INTEGER NOT NULL DEFAULT 0,
59
+ content TEXT NOT NULL,
60
+ file_type TEXT NOT NULL DEFAULT '',
61
+ language TEXT NOT NULL DEFAULT '',
62
+ last_modified TEXT NOT NULL DEFAULT '',
63
+ file_size INTEGER NOT NULL DEFAULT 0,
64
+ heading_context TEXT NOT NULL DEFAULT '',
65
+ function_name TEXT NOT NULL DEFAULT '',
66
+ class_name TEXT NOT NULL DEFAULT '',
67
+ tags TEXT NOT NULL DEFAULT '',
68
+ start_line INTEGER NOT NULL DEFAULT -1,
69
+ end_line INTEGER NOT NULL DEFAULT -1,
70
+ archived INTEGER NOT NULL DEFAULT 0,
71
+ vectorized INTEGER NOT NULL DEFAULT 0
72
+ )
73
+ `)
74
+
75
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
76
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
77
+ this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
78
+
79
+ // Prepare statements
80
+ this._stmtInsert = this.db.prepare(`
81
+ INSERT OR REPLACE INTO chunks
82
+ (chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
83
+ heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
84
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
85
+ `)
86
+ this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
87
+ this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
88
+ this._stmtAll = this.db.prepare("SELECT * FROM chunks")
89
+ this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
90
+ this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
91
+ this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
92
+ this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
93
+
94
+ return this
95
+ }
96
+
97
+ /**
98
+ * Store chunks from Phase 1 (batch, in transaction).
99
+ */
100
+ storeChunks(rows: Array<{
101
+ chunk_id: string, file: string, chunk_index: number, content: string,
102
+ file_type: string, language: string, last_modified: string, file_size: number,
103
+ heading_context: string, function_name: string, class_name: string, tags: string,
104
+ start_line: number, end_line: number, archived: boolean
105
+ }>): void {
106
+ if (!this.db) throw new Error("ChunkStore not initialized")
107
+
108
+ const insertMany = this.db.transaction((items: typeof rows) => {
109
+ for (const r of items) {
110
+ this._stmtInsert.run(
111
+ r.chunk_id, r.file, r.chunk_index, r.content,
112
+ r.file_type, r.language, r.last_modified, r.file_size,
113
+ r.heading_context, r.function_name, r.class_name, r.tags,
114
+ r.start_line, r.end_line, r.archived ? 1 : 0
115
+ )
116
+ }
117
+ })
118
+ insertMany(rows)
119
+ }
120
+
121
+ /**
122
+ * Delete all chunks for a file (before re-indexing).
123
+ */
124
+ deleteByFile(filePath: string): void {
125
+ if (!this.db) throw new Error("ChunkStore not initialized")
126
+ this._stmtDeleteByFile.run(filePath)
127
+ }
128
+
129
+ /**
130
+ * Mark all chunks for a file as vectorized (Phase 2 complete).
131
+ */
132
+ markVectorized(filePath: string): void {
133
+ if (!this.db) throw new Error("ChunkStore not initialized")
134
+ this._stmtMarkVectorized.run(filePath)
135
+ }
136
+
137
+ /**
138
+ * Check whether any chunks still lack vectors (vectorized = 0).
139
+ */
140
+ hasUnvectorizedChunks(): boolean {
141
+ if (!this.db) return false
142
+ const row = this._stmtHasVectors.get() as { cnt: number }
143
+ return row.cnt > 0
144
+ }
145
+
146
+ /**
147
+ * Get all chunks (for BM25 index building).
148
+ */
149
+ getAllChunks(): StoredChunk[] {
150
+ if (!this.db) return []
151
+ return this._stmtAll.all().map((r: any) => this.toChunk(r))
152
+ }
153
+
154
+ /**
155
+ * Get chunks for a specific file.
156
+ */
157
+ getChunksByFile(filePath: string): StoredChunk[] {
158
+ if (!this.db) return []
159
+ return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
160
+ }
161
+
162
+ /**
163
+ * Get a single chunk by ID.
164
+ */
165
+ getChunkById(chunkId: string): StoredChunk | null {
166
+ if (!this.db) return null
167
+ const row = this._stmtByChunkId.get(chunkId)
168
+ return row ? this.toChunk(row) : null
169
+ }
170
+
171
+ /**
172
+ * Total chunk count.
173
+ */
174
+ count(): number {
175
+ if (!this.db) return 0
176
+ const row = this._stmtCount.get() as { cnt: number }
177
+ return row.cnt
178
+ }
179
+
180
+ close(): void {
181
+ if (this.db) {
182
+ this.db.close()
183
+ this.db = null
184
+ }
185
+ }
186
+
187
+ private toChunk(row: any): StoredChunk {
188
+ return {
189
+ chunk_id: row.chunk_id,
190
+ file: row.file,
191
+ chunk_index: row.chunk_index,
192
+ content: row.content,
193
+ file_type: row.file_type,
194
+ language: row.language,
195
+ last_modified: row.last_modified,
196
+ file_size: row.file_size,
197
+ heading_context: row.heading_context,
198
+ function_name: row.function_name,
199
+ class_name: row.class_name,
200
+ tags: row.tags,
201
+ start_line: row.start_line,
202
+ end_line: row.end_line,
203
+ archived: !!row.archived,
204
+ vectorized: !!row.vectorized,
205
+ }
206
+ }
207
+ }
@@ -23,6 +23,8 @@ export interface CodeChunk {
23
23
  content: string
24
24
  function_name?: string
25
25
  class_name?: string
26
+ start_line?: number
27
+ end_line?: number
26
28
  }
27
29
 
28
30
  // ── Block detection ─────────────────────────────────────────────────────────
@@ -172,31 +174,74 @@ function findPythonBlockEnd(lines: string[], startLine: number): number {
172
174
  return lines.length - 1
173
175
  }
174
176
 
175
- // ── Fallback: line-based splitting ──────────────────────────────────────────
177
+ // ── Fallback: line-based splitting ──────────────────────────────────────────
178
+
179
+ function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
180
+ const chunks: CodeChunk[] = []
181
+ let current: string[] = []
182
+ let currentLen = 0
183
+ let startLine = 0
184
+
185
+ for (let i = 0; i < lines.length; i++) {
186
+ const line = lines[i]
187
+ if (currentLen + line.length + 1 > maxChars && current.length > 0) {
188
+ chunks.push({ content: current.join("\n"), start_line: startLine, end_line: i - 1 })
189
+ current = []
190
+ currentLen = 0
191
+ startLine = i
192
+ }
193
+ current.push(line)
194
+ currentLen += line.length + 1
195
+ }
176
196
 
177
- function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
178
- const chunks: CodeChunk[] = []
179
- let current: string[] = []
180
- let currentLen = 0
181
-
182
- for (const line of lines) {
183
- if (currentLen + line.length + 1 > maxChars && current.length > 0) {
184
- chunks.push({ content: current.join("\n") })
185
- current = []
186
- currentLen = 0
197
+ if (current.length > 0) {
198
+ chunks.push({ content: current.join("\n"), start_line: startLine, end_line: lines.length - 1 })
187
199
  }
188
- current.push(line)
189
- currentLen += line.length + 1
190
- }
191
200
 
192
- if (current.length > 0) {
193
- chunks.push({ content: current.join("\n") })
201
+ return chunks
194
202
  }
195
203
 
196
- return chunks
197
- }
204
+ // ── Split large chunks preserving line numbers ────────────────────────────
205
+
206
+ function splitChunkByLines(chunk: CodeChunk, maxChars: number): CodeChunk[] {
207
+ const lines = chunk.content.split("\n")
208
+ const baseLine = chunk.start_line || 0
209
+
210
+ const parts: CodeChunk[] = []
211
+ let current: string[] = []
212
+ let currentLen = 0
213
+ let startLine = baseLine
214
+
215
+ for (let i = 0; i < lines.length; i++) {
216
+ const line = lines[i]
217
+ if (currentLen + line.length + 1 > maxChars && current.length > 0) {
218
+ parts.push({
219
+ ...chunk,
220
+ content: current.join("\n"),
221
+ start_line: startLine,
222
+ end_line: baseLine + i - 1,
223
+ })
224
+ current = []
225
+ currentLen = 0
226
+ startLine = baseLine + i
227
+ }
228
+ current.push(line)
229
+ currentLen += line.length + 1
230
+ }
231
+
232
+ if (current.length > 0) {
233
+ parts.push({
234
+ ...chunk,
235
+ content: current.join("\n"),
236
+ start_line: startLine,
237
+ end_line: baseLine + lines.length - 1,
238
+ })
239
+ }
240
+
241
+ return parts
242
+ }
198
243
 
199
- // ── Public API ──────────────────────────────────────────────────────────────
244
+ // ── Public API ──────────────────────────────────────────────────────────────
200
245
 
201
246
  /**
202
247
  * Chunk source code by functions/classes.
@@ -231,12 +276,13 @@ export function chunkCode(
231
276
  if (block.startLine > lastEnd + 1) {
232
277
  const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
233
278
  if (gapContent.length >= config.min_chunk_size) {
234
- chunks.push({ content: gapContent })
279
+ chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
235
280
  } else if (gapContent.length > 0 && chunks.length > 0) {
236
281
  // Merge small gap with previous chunk
237
282
  chunks[chunks.length - 1].content += "\n\n" + gapContent
283
+ chunks[chunks.length - 1].end_line = block.startLine - 1
238
284
  } else if (gapContent.length > 0) {
239
- chunks.push({ content: gapContent })
285
+ chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
240
286
  }
241
287
  }
242
288
 
@@ -259,6 +305,8 @@ export function chunkCode(
259
305
  chunks.push({
260
306
  content: gap,
261
307
  class_name: block.name,
308
+ start_line: classLastEnd + 1,
309
+ end_line: method.startLine - 1,
262
310
  })
263
311
  }
264
312
  }
@@ -267,6 +315,8 @@ export function chunkCode(
267
315
  content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
268
316
  function_name: method.name,
269
317
  class_name: block.name,
318
+ start_line: method.startLine,
319
+ end_line: method.endLine,
270
320
  })
271
321
  classLastEnd = method.endLine
272
322
  }
@@ -275,7 +325,7 @@ export function chunkCode(
275
325
  if (classLastEnd < block.endLine) {
276
326
  const tail = lines.slice(classLastEnd + 1, block.endLine + 1).join("\n").trim()
277
327
  if (tail) {
278
- chunks.push({ content: tail, class_name: block.name })
328
+ chunks.push({ content: tail, class_name: block.name, start_line: classLastEnd + 1, end_line: block.endLine })
279
329
  }
280
330
  }
281
331
  } else {
@@ -312,9 +362,9 @@ export function chunkCode(
312
362
  const result: CodeChunk[] = []
313
363
  for (const chunk of chunks) {
314
364
  if (chunk.content.length > config.max_chunk_size) {
315
- const parts = splitByLines(chunk.content.split("\n"), config.max_chunk_size)
365
+ const parts = splitChunkByLines(chunk, config.max_chunk_size)
316
366
  for (const p of parts) {
317
- result.push({ ...chunk, content: p.content })
367
+ result.push(p)
318
368
  }
319
369
  } else {
320
370
  result.push(chunk)
@@ -22,14 +22,18 @@ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
22
22
  export interface MarkdownChunk {
23
23
  content: string
24
24
  heading_context: string // "H1 > H2 > H3"
25
+ start_line?: number
26
+ end_line?: number
25
27
  }
26
28
 
27
29
  // ── Internal types ──────────────────────────────────────────────────────────
28
30
 
29
31
  interface Section {
30
- level: number // 1-6 for headings, 0 for preamble
32
+ level: number // 1-6 for headings, 0 for preamble
31
33
  heading: string
32
34
  body: string
35
+ start_line: number
36
+ end_line: number
33
37
  }
34
38
 
35
39
  // ── Parsing ─────────────────────────────────────────────────────────────────
@@ -38,19 +42,23 @@ interface Section {
38
42
  function parseSections(content: string): Section[] {
39
43
  const lines = content.split("\n")
40
44
  const sections: Section[] = []
41
- let currentSection: Section = { level: 0, heading: "", body: "" }
45
+ let currentSection: Section = { level: 0, heading: "", body: "", start_line: 0, end_line: 0 }
42
46
 
43
- for (const line of lines) {
47
+ for (let i = 0; i < lines.length; i++) {
48
+ const line = lines[i]
44
49
  const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
45
50
  if (headingMatch) {
46
51
  // Push previous section
47
52
  if (currentSection.body.trim() || currentSection.heading) {
53
+ currentSection.end_line = i - 1
48
54
  sections.push(currentSection)
49
55
  }
50
56
  currentSection = {
51
57
  level: headingMatch[1].length,
52
58
  heading: headingMatch[2].trim(),
53
59
  body: "",
60
+ start_line: i,
61
+ end_line: 0,
54
62
  }
55
63
  } else {
56
64
  currentSection.body += line + "\n"
@@ -59,6 +67,7 @@ function parseSections(content: string): Section[] {
59
67
 
60
68
  // Push last section
61
69
  if (currentSection.body.trim() || currentSection.heading) {
70
+ currentSection.end_line = lines.length - 1
62
71
  sections.push(currentSection)
63
72
  }
64
73
 
@@ -97,6 +106,45 @@ function splitLargeText(text: string, maxSize: number): string[] {
97
106
  return chunks
98
107
  }
99
108
 
109
+ function splitLargeTextWithLines(text: string, maxSize: number, startLine: number): Array<{ content: string; start_line: number; end_line: number }> {
110
+ if (text.length <= maxSize) {
111
+ const lines = text.split("\n")
112
+ return [{ content: text, start_line: startLine, end_line: startLine + lines.length - 1 }]
113
+ }
114
+
115
+ const chunks: Array<{ content: string; start_line: number; end_line: number }> = []
116
+ const lines = text.split("\n")
117
+ let current: string[] = []
118
+ let currentLen = 0
119
+ let chunkStartLine = startLine
120
+
121
+ for (let i = 0; i < lines.length; i++) {
122
+ const line = lines[i]
123
+ if (currentLen + line.length + 1 > maxSize && current.length > 0) {
124
+ chunks.push({
125
+ content: current.join("\n"),
126
+ start_line: chunkStartLine,
127
+ end_line: startLine + i - 1,
128
+ })
129
+ current = []
130
+ currentLen = 0
131
+ chunkStartLine = startLine + i
132
+ }
133
+ current.push(line)
134
+ currentLen += line.length + 1
135
+ }
136
+
137
+ if (current.length > 0) {
138
+ chunks.push({
139
+ content: current.join("\n"),
140
+ start_line: chunkStartLine,
141
+ end_line: startLine + lines.length - 1,
142
+ })
143
+ }
144
+
145
+ return chunks
146
+ }
147
+
100
148
  // ── Public API ──────────────────────────────────────────────────────────────
101
149
 
102
150
  /**
@@ -138,7 +186,12 @@ export function chunkMarkdown(
138
186
  ? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
139
187
  : section.body
140
188
 
141
- rawChunks.push({ content: sectionText.trim(), heading_context: headingContext })
189
+ rawChunks.push({
190
+ content: sectionText.trim(),
191
+ heading_context: headingContext,
192
+ start_line: section.start_line,
193
+ end_line: section.end_line,
194
+ })
142
195
  }
143
196
 
144
197
  // Merge small sections with previous
@@ -150,7 +203,11 @@ export function chunkMarkdown(
150
203
  ) {
151
204
  const prev = merged[merged.length - 1]
152
205
  prev.content += "\n\n" + chunk.content
153
- // Keep the deepest heading context
206
+ // Merge end_line
207
+ if (chunk.end_line !== undefined) {
208
+ prev.end_line = chunk.end_line
209
+ }
210
+ // Keep deepest heading context
154
211
  if (chunk.heading_context) {
155
212
  prev.heading_context = chunk.heading_context
156
213
  }
@@ -163,9 +220,14 @@ export function chunkMarkdown(
163
220
  const result: MarkdownChunk[] = []
164
221
  for (const chunk of merged) {
165
222
  if (chunk.content.length > config.max_chunk_size) {
166
- const parts = splitLargeText(chunk.content, config.max_chunk_size)
223
+ const parts = splitLargeTextWithLines(chunk.content, config.max_chunk_size, chunk.start_line || 0)
167
224
  for (const part of parts) {
168
- result.push({ content: part, heading_context: chunk.heading_context })
225
+ result.push({
226
+ content: part.content,
227
+ heading_context: chunk.heading_context,
228
+ start_line: part.start_line,
229
+ end_line: part.end_line,
230
+ })
169
231
  }
170
232
  } else {
171
233
  result.push(chunk)