@comfanion/usethis_search 3.0.0-dev.9 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.ts +263 -0
- package/file-indexer.ts +1 -1
- package/index.ts +0 -2
- package/package.json +12 -5
- package/tools/codeindex.ts +2 -2
- package/tools/search.ts +254 -66
- package/vectorizer/analyzers/lsp-analyzer.ts +7 -7
- package/vectorizer/analyzers/regex-analyzer.ts +358 -61
- package/vectorizer/chunk-store.ts +207 -0
- package/vectorizer/chunkers/code-chunker.ts +74 -24
- package/vectorizer/chunkers/markdown-chunker.ts +69 -7
- package/vectorizer/graph-builder.ts +207 -15
- package/vectorizer/graph-db.ts +161 -164
- package/vectorizer/hybrid-search.ts +1 -1
- package/vectorizer/{index.js → index.ts} +796 -160
- package/vectorizer.yaml +20 -2
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ChunkStore — SQLite-based persistent chunk storage.
|
|
3
|
+
* Populated by Phase 1 (no vectors needed). Provides BM25 search
|
|
4
|
+
* and metadata queries immediately, before embedding is complete.
|
|
5
|
+
*
|
|
6
|
+
* Uses bun:sqlite with WAL mode for concurrent read access.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { Database } from "bun:sqlite"
|
|
10
|
+
|
|
11
|
+
export interface StoredChunk {
  /** Stable unique identifier; primary key of the `chunks` table. */
  chunk_id: string
  /** Path of the source file this chunk was extracted from. */
  file: string
  /** Position of the chunk within its file (0-based). */
  chunk_index: number
  /** Raw chunk text. */
  content: string
  /** Coarse file category (stored as-is; '' when unknown). */
  file_type: string
  /** Programming language of the source file ('' when unknown). */
  language: string
  // NOTE(review): stored as TEXT; format (ISO date vs. epoch string) not
  // visible here — confirm against the indexer that writes it.
  last_modified: string
  /** Size of the source file in bytes. */
  file_size: number
  /** For markdown chunks: heading trail, e.g. "H1 > H2 > H3" ('' otherwise). */
  heading_context: string
  /** Enclosing function name for code chunks ('' when not applicable). */
  function_name: string
  /** Enclosing class name for code chunks ('' when not applicable). */
  class_name: string
  // NOTE(review): stored as a single TEXT column; presumably a serialized
  // list — verify the delimiter with the writer before parsing.
  tags: string
  /** First source line of the chunk (-1 when unknown). */
  start_line: number
  /** Last source line of the chunk (-1 when unknown). */
  end_line: number
  /** True when the source file is archived (stored as INTEGER 0/1). */
  archived: boolean
  /** True once Phase 2 has embedded this chunk (stored as INTEGER 0/1). */
  vectorized: boolean
}
|
|
29
|
+
|
|
30
|
+
export class ChunkStore {
|
|
31
|
+
private db: Database | null = null
|
|
32
|
+
|
|
33
|
+
// Prepared statements
|
|
34
|
+
private _stmtInsert: any = null
|
|
35
|
+
private _stmtByFile: any = null
|
|
36
|
+
private _stmtDeleteByFile: any = null
|
|
37
|
+
private _stmtAll: any = null
|
|
38
|
+
private _stmtByChunkId: any = null
|
|
39
|
+
private _stmtMarkVectorized: any = null
|
|
40
|
+
private _stmtHasVectors: any = null
|
|
41
|
+
private _stmtCount: any = null
|
|
42
|
+
private _stmtSearch: any = null
|
|
43
|
+
|
|
44
|
+
constructor(private dbPath: string) {}
|
|
45
|
+
|
|
46
|
+
async init(): Promise<this> {
|
|
47
|
+
const fullPath = this.dbPath.endsWith(".db") ? this.dbPath : this.dbPath + ".db"
|
|
48
|
+
this.db = new Database(fullPath)
|
|
49
|
+
|
|
50
|
+
this.db.exec("PRAGMA journal_mode = WAL")
|
|
51
|
+
this.db.exec("PRAGMA synchronous = NORMAL")
|
|
52
|
+
this.db.exec("PRAGMA cache_size = -4000") // 4MB cache
|
|
53
|
+
|
|
54
|
+
this.db.exec(`
|
|
55
|
+
CREATE TABLE IF NOT EXISTS chunks (
|
|
56
|
+
chunk_id TEXT PRIMARY KEY,
|
|
57
|
+
file TEXT NOT NULL,
|
|
58
|
+
chunk_index INTEGER NOT NULL DEFAULT 0,
|
|
59
|
+
content TEXT NOT NULL,
|
|
60
|
+
file_type TEXT NOT NULL DEFAULT '',
|
|
61
|
+
language TEXT NOT NULL DEFAULT '',
|
|
62
|
+
last_modified TEXT NOT NULL DEFAULT '',
|
|
63
|
+
file_size INTEGER NOT NULL DEFAULT 0,
|
|
64
|
+
heading_context TEXT NOT NULL DEFAULT '',
|
|
65
|
+
function_name TEXT NOT NULL DEFAULT '',
|
|
66
|
+
class_name TEXT NOT NULL DEFAULT '',
|
|
67
|
+
tags TEXT NOT NULL DEFAULT '',
|
|
68
|
+
start_line INTEGER NOT NULL DEFAULT -1,
|
|
69
|
+
end_line INTEGER NOT NULL DEFAULT -1,
|
|
70
|
+
archived INTEGER NOT NULL DEFAULT 0,
|
|
71
|
+
vectorized INTEGER NOT NULL DEFAULT 0
|
|
72
|
+
)
|
|
73
|
+
`)
|
|
74
|
+
|
|
75
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)")
|
|
76
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_vectorized ON chunks(vectorized)")
|
|
77
|
+
this.db.exec("CREATE INDEX IF NOT EXISTS idx_chunks_language ON chunks(language)")
|
|
78
|
+
|
|
79
|
+
// Prepare statements
|
|
80
|
+
this._stmtInsert = this.db.prepare(`
|
|
81
|
+
INSERT OR REPLACE INTO chunks
|
|
82
|
+
(chunk_id, file, chunk_index, content, file_type, language, last_modified, file_size,
|
|
83
|
+
heading_context, function_name, class_name, tags, start_line, end_line, archived, vectorized)
|
|
84
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 0)
|
|
85
|
+
`)
|
|
86
|
+
this._stmtByFile = this.db.prepare("SELECT * FROM chunks WHERE file = ?")
|
|
87
|
+
this._stmtDeleteByFile = this.db.prepare("DELETE FROM chunks WHERE file = ?")
|
|
88
|
+
this._stmtAll = this.db.prepare("SELECT * FROM chunks")
|
|
89
|
+
this._stmtByChunkId = this.db.prepare("SELECT * FROM chunks WHERE chunk_id = ?")
|
|
90
|
+
this._stmtMarkVectorized = this.db.prepare("UPDATE chunks SET vectorized = 1 WHERE file = ?")
|
|
91
|
+
this._stmtHasVectors = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks WHERE vectorized = 0")
|
|
92
|
+
this._stmtCount = this.db.prepare("SELECT COUNT(*) as cnt FROM chunks")
|
|
93
|
+
|
|
94
|
+
return this
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Store chunks from Phase 1 (batch, in transaction).
|
|
99
|
+
*/
|
|
100
|
+
storeChunks(rows: Array<{
|
|
101
|
+
chunk_id: string, file: string, chunk_index: number, content: string,
|
|
102
|
+
file_type: string, language: string, last_modified: string, file_size: number,
|
|
103
|
+
heading_context: string, function_name: string, class_name: string, tags: string,
|
|
104
|
+
start_line: number, end_line: number, archived: boolean
|
|
105
|
+
}>): void {
|
|
106
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
107
|
+
|
|
108
|
+
const insertMany = this.db.transaction((items: typeof rows) => {
|
|
109
|
+
for (const r of items) {
|
|
110
|
+
this._stmtInsert.run(
|
|
111
|
+
r.chunk_id, r.file, r.chunk_index, r.content,
|
|
112
|
+
r.file_type, r.language, r.last_modified, r.file_size,
|
|
113
|
+
r.heading_context, r.function_name, r.class_name, r.tags,
|
|
114
|
+
r.start_line, r.end_line, r.archived ? 1 : 0
|
|
115
|
+
)
|
|
116
|
+
}
|
|
117
|
+
})
|
|
118
|
+
insertMany(rows)
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Delete all chunks for a file (before re-indexing).
|
|
123
|
+
*/
|
|
124
|
+
deleteByFile(filePath: string): void {
|
|
125
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
126
|
+
this._stmtDeleteByFile.run(filePath)
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Mark all chunks for a file as vectorized (Phase 2 complete).
|
|
131
|
+
*/
|
|
132
|
+
markVectorized(filePath: string): void {
|
|
133
|
+
if (!this.db) throw new Error("ChunkStore not initialized")
|
|
134
|
+
this._stmtMarkVectorized.run(filePath)
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Check if all chunks have vectors.
|
|
139
|
+
*/
|
|
140
|
+
hasUnvectorizedChunks(): boolean {
|
|
141
|
+
if (!this.db) return false
|
|
142
|
+
const row = this._stmtHasVectors.get() as { cnt: number }
|
|
143
|
+
return row.cnt > 0
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Get all chunks (for BM25 index building).
|
|
148
|
+
*/
|
|
149
|
+
getAllChunks(): StoredChunk[] {
|
|
150
|
+
if (!this.db) return []
|
|
151
|
+
return this._stmtAll.all().map((r: any) => this.toChunk(r))
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Get chunks for a specific file.
|
|
156
|
+
*/
|
|
157
|
+
getChunksByFile(filePath: string): StoredChunk[] {
|
|
158
|
+
if (!this.db) return []
|
|
159
|
+
return this._stmtByFile.all(filePath).map((r: any) => this.toChunk(r))
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Get a single chunk by ID.
|
|
164
|
+
*/
|
|
165
|
+
getChunkById(chunkId: string): StoredChunk | null {
|
|
166
|
+
if (!this.db) return null
|
|
167
|
+
const row = this._stmtByChunkId.get(chunkId)
|
|
168
|
+
return row ? this.toChunk(row) : null
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Total chunk count.
|
|
173
|
+
*/
|
|
174
|
+
count(): number {
|
|
175
|
+
if (!this.db) return 0
|
|
176
|
+
const row = this._stmtCount.get() as { cnt: number }
|
|
177
|
+
return row.cnt
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
close(): void {
|
|
181
|
+
if (this.db) {
|
|
182
|
+
this.db.close()
|
|
183
|
+
this.db = null
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
private toChunk(row: any): StoredChunk {
|
|
188
|
+
return {
|
|
189
|
+
chunk_id: row.chunk_id,
|
|
190
|
+
file: row.file,
|
|
191
|
+
chunk_index: row.chunk_index,
|
|
192
|
+
content: row.content,
|
|
193
|
+
file_type: row.file_type,
|
|
194
|
+
language: row.language,
|
|
195
|
+
last_modified: row.last_modified,
|
|
196
|
+
file_size: row.file_size,
|
|
197
|
+
heading_context: row.heading_context,
|
|
198
|
+
function_name: row.function_name,
|
|
199
|
+
class_name: row.class_name,
|
|
200
|
+
tags: row.tags,
|
|
201
|
+
start_line: row.start_line,
|
|
202
|
+
end_line: row.end_line,
|
|
203
|
+
archived: !!row.archived,
|
|
204
|
+
vectorized: !!row.vectorized,
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
}
|
|
@@ -23,6 +23,8 @@ export interface CodeChunk {
|
|
|
23
23
|
content: string
|
|
24
24
|
function_name?: string
|
|
25
25
|
class_name?: string
|
|
26
|
+
start_line?: number
|
|
27
|
+
end_line?: number
|
|
26
28
|
}
|
|
27
29
|
|
|
28
30
|
// ── Block detection ─────────────────────────────────────────────────────────
|
|
@@ -172,31 +174,74 @@ function findPythonBlockEnd(lines: string[], startLine: number): number {
|
|
|
172
174
|
return lines.length - 1
|
|
173
175
|
}
|
|
174
176
|
|
|
175
|
-
// ── Fallback: line-based splitting ──────────────────────────────────────────
|
|
177
|
+
// ── Fallback: line-based splitting ──────────────────────────────────────────
|
|
178
|
+
|
|
179
|
+
function splitByLines(lines: string[], maxChars: number): CodeChunk[] {
|
|
180
|
+
const chunks: CodeChunk[] = []
|
|
181
|
+
let current: string[] = []
|
|
182
|
+
let currentLen = 0
|
|
183
|
+
let startLine = 0
|
|
184
|
+
|
|
185
|
+
for (let i = 0; i < lines.length; i++) {
|
|
186
|
+
const line = lines[i]
|
|
187
|
+
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
188
|
+
chunks.push({ content: current.join("\n"), start_line: startLine, end_line: i - 1 })
|
|
189
|
+
current = []
|
|
190
|
+
currentLen = 0
|
|
191
|
+
startLine = i
|
|
192
|
+
}
|
|
193
|
+
current.push(line)
|
|
194
|
+
currentLen += line.length + 1
|
|
195
|
+
}
|
|
176
196
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
let current: string[] = []
|
|
180
|
-
let currentLen = 0
|
|
181
|
-
|
|
182
|
-
for (const line of lines) {
|
|
183
|
-
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
184
|
-
chunks.push({ content: current.join("\n") })
|
|
185
|
-
current = []
|
|
186
|
-
currentLen = 0
|
|
197
|
+
if (current.length > 0) {
|
|
198
|
+
chunks.push({ content: current.join("\n"), start_line: startLine, end_line: lines.length - 1 })
|
|
187
199
|
}
|
|
188
|
-
current.push(line)
|
|
189
|
-
currentLen += line.length + 1
|
|
190
|
-
}
|
|
191
200
|
|
|
192
|
-
|
|
193
|
-
chunks.push({ content: current.join("\n") })
|
|
201
|
+
return chunks
|
|
194
202
|
}
|
|
195
203
|
|
|
196
|
-
|
|
197
|
-
|
|
204
|
+
// ── Split large chunks preserving line numbers ────────────────────────────
|
|
205
|
+
|
|
206
|
+
function splitChunkByLines(chunk: CodeChunk, maxChars: number): CodeChunk[] {
|
|
207
|
+
const lines = chunk.content.split("\n")
|
|
208
|
+
const baseLine = chunk.start_line || 0
|
|
209
|
+
|
|
210
|
+
const parts: CodeChunk[] = []
|
|
211
|
+
let current: string[] = []
|
|
212
|
+
let currentLen = 0
|
|
213
|
+
let startLine = baseLine
|
|
214
|
+
|
|
215
|
+
for (let i = 0; i < lines.length; i++) {
|
|
216
|
+
const line = lines[i]
|
|
217
|
+
if (currentLen + line.length + 1 > maxChars && current.length > 0) {
|
|
218
|
+
parts.push({
|
|
219
|
+
...chunk,
|
|
220
|
+
content: current.join("\n"),
|
|
221
|
+
start_line: startLine,
|
|
222
|
+
end_line: baseLine + i - 1,
|
|
223
|
+
})
|
|
224
|
+
current = []
|
|
225
|
+
currentLen = 0
|
|
226
|
+
startLine = baseLine + i
|
|
227
|
+
}
|
|
228
|
+
current.push(line)
|
|
229
|
+
currentLen += line.length + 1
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
if (current.length > 0) {
|
|
233
|
+
parts.push({
|
|
234
|
+
...chunk,
|
|
235
|
+
content: current.join("\n"),
|
|
236
|
+
start_line: startLine,
|
|
237
|
+
end_line: baseLine + lines.length - 1,
|
|
238
|
+
})
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
return parts
|
|
242
|
+
}
|
|
198
243
|
|
|
199
|
-
// ── Public API ──────────────────────────────────────────────────────────────
|
|
244
|
+
// ── Public API ──────────────────────────────────────────────────────────────
|
|
200
245
|
|
|
201
246
|
/**
|
|
202
247
|
* Chunk source code by functions/classes.
|
|
@@ -231,12 +276,13 @@ export function chunkCode(
|
|
|
231
276
|
if (block.startLine > lastEnd + 1) {
|
|
232
277
|
const gapContent = lines.slice(lastEnd + 1, block.startLine).join("\n").trim()
|
|
233
278
|
if (gapContent.length >= config.min_chunk_size) {
|
|
234
|
-
chunks.push({ content: gapContent })
|
|
279
|
+
chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
|
|
235
280
|
} else if (gapContent.length > 0 && chunks.length > 0) {
|
|
236
281
|
// Merge small gap with previous chunk
|
|
237
282
|
chunks[chunks.length - 1].content += "\n\n" + gapContent
|
|
283
|
+
chunks[chunks.length - 1].end_line = block.startLine - 1
|
|
238
284
|
} else if (gapContent.length > 0) {
|
|
239
|
-
chunks.push({ content: gapContent })
|
|
285
|
+
chunks.push({ content: gapContent, start_line: lastEnd + 1, end_line: block.startLine - 1 })
|
|
240
286
|
}
|
|
241
287
|
}
|
|
242
288
|
|
|
@@ -259,6 +305,8 @@ export function chunkCode(
|
|
|
259
305
|
chunks.push({
|
|
260
306
|
content: gap,
|
|
261
307
|
class_name: block.name,
|
|
308
|
+
start_line: classLastEnd + 1,
|
|
309
|
+
end_line: method.startLine - 1,
|
|
262
310
|
})
|
|
263
311
|
}
|
|
264
312
|
}
|
|
@@ -267,6 +315,8 @@ export function chunkCode(
|
|
|
267
315
|
content: lines.slice(method.startLine, method.endLine + 1).join("\n"),
|
|
268
316
|
function_name: method.name,
|
|
269
317
|
class_name: block.name,
|
|
318
|
+
start_line: method.startLine,
|
|
319
|
+
end_line: method.endLine,
|
|
270
320
|
})
|
|
271
321
|
classLastEnd = method.endLine
|
|
272
322
|
}
|
|
@@ -275,7 +325,7 @@ export function chunkCode(
|
|
|
275
325
|
if (classLastEnd < block.endLine) {
|
|
276
326
|
const tail = lines.slice(classLastEnd + 1, block.endLine + 1).join("\n").trim()
|
|
277
327
|
if (tail) {
|
|
278
|
-
chunks.push({ content: tail, class_name: block.name })
|
|
328
|
+
chunks.push({ content: tail, class_name: block.name, start_line: classLastEnd + 1, end_line: block.endLine })
|
|
279
329
|
}
|
|
280
330
|
}
|
|
281
331
|
} else {
|
|
@@ -312,9 +362,9 @@ export function chunkCode(
|
|
|
312
362
|
const result: CodeChunk[] = []
|
|
313
363
|
for (const chunk of chunks) {
|
|
314
364
|
if (chunk.content.length > config.max_chunk_size) {
|
|
315
|
-
const parts =
|
|
365
|
+
const parts = splitChunkByLines(chunk, config.max_chunk_size)
|
|
316
366
|
for (const p of parts) {
|
|
317
|
-
result.push(
|
|
367
|
+
result.push(p)
|
|
318
368
|
}
|
|
319
369
|
} else {
|
|
320
370
|
result.push(chunk)
|
|
@@ -22,14 +22,18 @@ export const DEFAULT_MD_CONFIG: MarkdownChunkConfig = {
|
|
|
22
22
|
export interface MarkdownChunk {
|
|
23
23
|
content: string
|
|
24
24
|
heading_context: string // "H1 > H2 > H3"
|
|
25
|
+
start_line?: number
|
|
26
|
+
end_line?: number
|
|
25
27
|
}
|
|
26
28
|
|
|
27
29
|
// ── Internal types ──────────────────────────────────────────────────────────
|
|
28
30
|
|
|
29
31
|
interface Section {
|
|
30
|
-
level: number //
|
|
32
|
+
level: number //1-6 for headings, 0 for preamble
|
|
31
33
|
heading: string
|
|
32
34
|
body: string
|
|
35
|
+
start_line: number
|
|
36
|
+
end_line: number
|
|
33
37
|
}
|
|
34
38
|
|
|
35
39
|
// ── Parsing ─────────────────────────────────────────────────────────────────
|
|
@@ -38,19 +42,23 @@ interface Section {
|
|
|
38
42
|
function parseSections(content: string): Section[] {
|
|
39
43
|
const lines = content.split("\n")
|
|
40
44
|
const sections: Section[] = []
|
|
41
|
-
let currentSection: Section = { level: 0, heading: "", body: "" }
|
|
45
|
+
let currentSection: Section = { level: 0, heading: "", body: "", start_line: 0, end_line: 0 }
|
|
42
46
|
|
|
43
|
-
for (
|
|
47
|
+
for (let i = 0; i < lines.length; i++) {
|
|
48
|
+
const line = lines[i]
|
|
44
49
|
const headingMatch = line.match(/^(#{1,6})\s+(.+)$/)
|
|
45
50
|
if (headingMatch) {
|
|
46
51
|
// Push previous section
|
|
47
52
|
if (currentSection.body.trim() || currentSection.heading) {
|
|
53
|
+
currentSection.end_line = i - 1
|
|
48
54
|
sections.push(currentSection)
|
|
49
55
|
}
|
|
50
56
|
currentSection = {
|
|
51
57
|
level: headingMatch[1].length,
|
|
52
58
|
heading: headingMatch[2].trim(),
|
|
53
59
|
body: "",
|
|
60
|
+
start_line: i,
|
|
61
|
+
end_line: 0,
|
|
54
62
|
}
|
|
55
63
|
} else {
|
|
56
64
|
currentSection.body += line + "\n"
|
|
@@ -59,6 +67,7 @@ function parseSections(content: string): Section[] {
|
|
|
59
67
|
|
|
60
68
|
// Push last section
|
|
61
69
|
if (currentSection.body.trim() || currentSection.heading) {
|
|
70
|
+
currentSection.end_line = lines.length - 1
|
|
62
71
|
sections.push(currentSection)
|
|
63
72
|
}
|
|
64
73
|
|
|
@@ -97,6 +106,45 @@ function splitLargeText(text: string, maxSize: number): string[] {
|
|
|
97
106
|
return chunks
|
|
98
107
|
}
|
|
99
108
|
|
|
109
|
+
function splitLargeTextWithLines(text: string, maxSize: number, startLine: number): Array<{ content: string; start_line: number; end_line: number }> {
|
|
110
|
+
if (text.length <= maxSize) {
|
|
111
|
+
const lines = text.split("\n")
|
|
112
|
+
return [{ content: text, start_line: startLine, end_line: startLine + lines.length - 1 }]
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
const chunks: Array<{ content: string; start_line: number; end_line: number }> = []
|
|
116
|
+
const lines = text.split("\n")
|
|
117
|
+
let current: string[] = []
|
|
118
|
+
let currentLen = 0
|
|
119
|
+
let chunkStartLine = startLine
|
|
120
|
+
|
|
121
|
+
for (let i = 0; i < lines.length; i++) {
|
|
122
|
+
const line = lines[i]
|
|
123
|
+
if (currentLen + line.length + 1 > maxSize && current.length > 0) {
|
|
124
|
+
chunks.push({
|
|
125
|
+
content: current.join("\n"),
|
|
126
|
+
start_line: chunkStartLine,
|
|
127
|
+
end_line: startLine + i - 1,
|
|
128
|
+
})
|
|
129
|
+
current = []
|
|
130
|
+
currentLen = 0
|
|
131
|
+
chunkStartLine = startLine + i
|
|
132
|
+
}
|
|
133
|
+
current.push(line)
|
|
134
|
+
currentLen += line.length + 1
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
if (current.length > 0) {
|
|
138
|
+
chunks.push({
|
|
139
|
+
content: current.join("\n"),
|
|
140
|
+
start_line: chunkStartLine,
|
|
141
|
+
end_line: startLine + lines.length - 1,
|
|
142
|
+
})
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
return chunks
|
|
146
|
+
}
|
|
147
|
+
|
|
100
148
|
// ── Public API ──────────────────────────────────────────────────────────────
|
|
101
149
|
|
|
102
150
|
/**
|
|
@@ -138,7 +186,12 @@ export function chunkMarkdown(
|
|
|
138
186
|
? `${"#".repeat(section.level)} ${section.heading}\n${section.body}`
|
|
139
187
|
: section.body
|
|
140
188
|
|
|
141
|
-
rawChunks.push({
|
|
189
|
+
rawChunks.push({
|
|
190
|
+
content: sectionText.trim(),
|
|
191
|
+
heading_context: headingContext,
|
|
192
|
+
start_line: section.start_line,
|
|
193
|
+
end_line: section.end_line,
|
|
194
|
+
})
|
|
142
195
|
}
|
|
143
196
|
|
|
144
197
|
// Merge small sections with previous
|
|
@@ -150,7 +203,11 @@ export function chunkMarkdown(
|
|
|
150
203
|
) {
|
|
151
204
|
const prev = merged[merged.length - 1]
|
|
152
205
|
prev.content += "\n\n" + chunk.content
|
|
153
|
-
//
|
|
206
|
+
// Merge end_line
|
|
207
|
+
if (chunk.end_line !== undefined) {
|
|
208
|
+
prev.end_line = chunk.end_line
|
|
209
|
+
}
|
|
210
|
+
// Keep deepest heading context
|
|
154
211
|
if (chunk.heading_context) {
|
|
155
212
|
prev.heading_context = chunk.heading_context
|
|
156
213
|
}
|
|
@@ -163,9 +220,14 @@ export function chunkMarkdown(
|
|
|
163
220
|
const result: MarkdownChunk[] = []
|
|
164
221
|
for (const chunk of merged) {
|
|
165
222
|
if (chunk.content.length > config.max_chunk_size) {
|
|
166
|
-
const parts =
|
|
223
|
+
const parts = splitLargeTextWithLines(chunk.content, config.max_chunk_size, chunk.start_line || 0)
|
|
167
224
|
for (const part of parts) {
|
|
168
|
-
result.push({
|
|
225
|
+
result.push({
|
|
226
|
+
content: part.content,
|
|
227
|
+
heading_context: chunk.heading_context,
|
|
228
|
+
start_line: part.start_line,
|
|
229
|
+
end_line: part.end_line,
|
|
230
|
+
})
|
|
169
231
|
}
|
|
170
232
|
} else {
|
|
171
233
|
result.push(chunk)
|