spine-framework-cortex 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/functions/custom_case_analysis.ts +507 -0
- package/functions/custom_cortex-chunks.ts +52 -0
- package/functions/custom_cortex-handler.ts +35 -0
- package/functions/custom_kb-chunker-test.ts +364 -0
- package/functions/custom_kb-chunker.ts +576 -0
- package/functions/custom_kb-embeddings.ts +472 -0
- package/functions/custom_kb-ingestion.ts +447 -0
- package/functions/custom_tag_management.ts +314 -0
- package/manifest.json +1 -0
- package/package.json +1 -1
- package/pages/courses/CoursesPage.tsx +1 -1
- package/pages/kb/KBEditorPage.tsx +1 -1
- package/pages/support/RedactionReview.tsx +1 -1
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
/**
 * Adaptive Article Chunker for KB Embeddings
 *
 * Strategy 5 (Recursive/Hierarchical) as the framework:
 * - Strategy 1 (Heading-Based) for structured docs
 * - Strategy 3 (Heading + Size Guards) for oversized sections
 * - Strategy 4 (Paragraph Grouping) for unstructured prose
 * - Strategy 2 (Fixed Window) deliberately excluded
 *
 * Hard rules:
 * - Never split a code block or table
 * - Never split mid-paragraph
 * - Merge tiny chunks (<100 tokens) into neighbor
 * - Prefix every chunk with context
 */
|
|
16
|
+
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Types
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
/**
 * One embeddable piece of an article.
 */
export interface Chunk {
  /** The text content to embed */
  content: string
  /** Section path for heading-based chunks, e.g. "Fields > data_type reference"; null when the chunk has no section */
  sectionPath: string | null
  /** 0-based index within the article */
  chunkIndex: number
  /** Total chunks for this article */
  chunkTotal: number
}
|
|
31
|
+
|
|
32
|
+
/**
 * Tuning knobs for the chunker. Only `articleTitle` is required; the
 * token limits fall back to the defaults noted on each field.
 */
export interface ChunkerOptions {
  /** Article title — used as prefix context for every chunk */
  articleTitle: string
  /** Max tokens per chunk before sub-splitting. Default 800. */
  maxTokens?: number
  /** Min tokens per chunk before merging with neighbor. Default 100. */
  minTokens?: number
  /** Token count below which the entire article is a single chunk. Default 600. */
  singleChunkThreshold?: number
}
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Token estimation
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
/**
 * Rough token estimate: ~4 chars per token for English text.
 * Good enough for chunking decisions — actual tokenization happens at OpenAI.
 *
 * @param text - Text to measure.
 * @returns The character count divided by 4, rounded up (0 for an empty string).
 */
export function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4)
}
|
|
54
|
+
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// HTML → plain text
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
|
|
59
|
+
/**
 * Strip HTML tags to plain text, preserving structure via newlines.
 * Headings, paragraphs, divs get newlines. Inline tags are stripped.
 *
 * Steps, in order:
 *  1. Normalize `\r\n` / `\r` to `\n`.
 *  2. Turn closing block tags (`p`, `div`, `li`, `tr`, `blockquote`, `h1`-`h6`)
 *     and `<br>` into newlines.
 *  3. Remove every remaining tag.
 *  4. Decode a small set of common entities.
 *  5. Collapse runs of 3+ newlines to a blank line and trim the result.
 *
 * @param html - HTML (or HTML-ish) markup.
 * @returns The text content with tags removed.
 */
export function htmlToPlainText(html: string): string {
  return (
    html
      // Normalize line breaks
      .replace(/\r\n?/g, '\n')
      // Block-level elements → newlines
      .replace(/<\/(p|div|li|tr|blockquote)>/gi, '\n')
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/(h[1-6])>/gi, '\n')
      // Strip remaining tags (before decoding, so an escaped "&lt;b&gt;" survives as literal text)
      .replace(/<[^>]+>/g, '')
      // Decode common entities
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&(#39|#x27|apos);/g, "'")
      .replace(/&nbsp;/g, ' ')
      // `&amp;` goes last: decoding it first would turn "&amp;lt;" into "&lt;" and then into "<"
      .replace(/&amp;/g, '&')
      // Collapse excessive newlines
      .replace(/\n{3,}/g, '\n\n')
      .trim()
  )
}
|
|
84
|
+
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Detect content format
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
/** Source formats the chunker distinguishes between. */
type ContentFormat = 'markdown' | 'html'

/**
 * Guess whether article content is HTML or markdown.
 *
 * @param content - Raw article content.
 * @returns `'html'` if the content contains an opening block-level tag
 *          (`p`, `div`, `h1`-`h6`, `ul`, `ol`, `table`, `pre`, matched
 *          case-insensitively); otherwise `'markdown'`.
 */
function detectFormat(content: string): ContentFormat {
  // `\b` keeps longer tag names such as <param> from matching the `p` alternative
  const hasHtmlBlockTag = /<(p|div|h[1-6]|ul|ol|table|pre)\b/i.test(content)
  return hasHtmlBlockTag ? 'html' : 'markdown'
}
|
|
96
|
+
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Heading extraction
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
/**
 * A heading plus the text that follows it, as produced by the section parsers.
 */
interface Section {
  /** Heading level: 1-6 for explicit headings, 0 for preamble */
  level: number
  /** Heading text (empty for preamble) */
  heading: string
  /** Body content below this heading (not including sub-sections) */
  body: string
}
|
|
109
|
+
|
|
110
|
+
/**
 * Parse markdown into a flat list of sections by heading.
 * Each section includes its heading text and the body up to the next heading.
 *
 * Text before the first heading becomes a preamble section (level 0, empty
 * heading). Lines inside a fenced code block (``` … ```) are never treated as
 * headings, so `# comment` lines in shell/Python snippets do not split the
 * block. A fence with no closing partner is ignored for this purpose, so a
 * stray ``` cannot swallow every heading after it.
 *
 * @param content - Markdown text.
 * @param splitLevel - Only split on headings at this level or shallower.
 *   Deeper headings (e.g. ### when splitLevel=2) are kept as body content.
 *   Default 6 = split on all headings.
 * @returns Sections in document order; bodies are trimmed.
 */
function parseMarkdownSections(content: string, splitLevel: number = 6): Section[] {
  const headingPattern = /^(#{1,6})\s+(.+)$/
  // Same fence test the paragraph splitter uses: a trimmed line starting with three backticks
  const isFence = (line: string): boolean => line.trim().startsWith('```')

  const lines = content.split('\n')

  // Only fences that have a closing partner open a code block
  const fenceTotal = lines.filter(isFence).length
  const pairedFences = fenceTotal - (fenceTotal % 2)

  const sections: Section[] = []
  let currentLevel = 0
  let currentHeading = ''
  let bodyLines: string[] = []
  let fencesSeen = 0
  let inCodeBlock = false

  // Emit the section collected so far (skipped when nothing has been collected yet)
  const flush = (): void => {
    if (bodyLines.length > 0 || currentHeading) {
      sections.push({
        level: currentLevel,
        heading: currentHeading,
        body: bodyLines.join('\n').trim()
      })
    }
  }

  for (const line of lines) {
    if (isFence(line)) {
      if (fencesSeen < pairedFences) inCodeBlock = !inCodeBlock
      fencesSeen++
      bodyLines.push(line)
      continue
    }

    const headingMatch = inCodeBlock ? null : headingPattern.exec(line)
    if (headingMatch !== null && headingMatch[1].length <= splitLevel) {
      flush()
      currentLevel = headingMatch[1].length
      currentHeading = headingMatch[2]
      bodyLines = []
    } else {
      bodyLines.push(line)
    }
  }

  flush()
  return sections
}
|
|
155
|
+
|
|
156
|
+
/**
 * Parse HTML into sections by heading tags (h1-h6).
 *
 * Each `<hN>…</hN>` element is first rewritten as a markdown-style heading
 * line (`N` hashes + text), the remaining markup is converted to plain text,
 * and the result is handed to the markdown section parser.
 *
 * @param html - HTML article content.
 * @param splitLevel - Passed through to `parseMarkdownSections`; only headings
 *   at this level or shallower start a new section. Default 6.
 * @returns Sections in document order.
 */
function parseHtmlSections(html: string, splitLevel: number = 6): Section[] {
  // `[\s\S]*?` (rather than `.*?`) so headings whose markup spans several lines are still found
  const withMarkdownHeadings = html.replace(
    /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_match: string, level: string, inner: string): string => {
      // Strip any HTML inside the heading and fold it onto a single line,
      // because a markdown heading cannot contain a line break
      const title = inner.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim()
      // An empty heading would only leave a stray "## " line in the body text
      if (!title) return '\n'
      return `\n${'#'.repeat(Number(level))} ${title}\n`
    }
  )

  // Convert the rest to plain text, then reuse the markdown parser
  return parseMarkdownSections(htmlToPlainText(withMarkdownHeadings), splitLevel)
}
|
|
172
|
+
|
|
173
|
+
// ---------------------------------------------------------------------------
|
|
174
|
+
// Atomic block detection
|
|
175
|
+
// ---------------------------------------------------------------------------
|
|
176
|
+
|
|
177
|
+
/**
 * Check if a line is a fenced code block delimiter.
 *
 * @param line - A single line of text.
 * @returns true when the line, ignoring surrounding whitespace, begins with
 *          three backticks. Opening and closing fences look the same here;
 *          callers track which one they are at.
 */
function isCodeFenceStart(line: string): boolean {
  return line.trim().startsWith('```')
}
|
|
183
|
+
|
|
184
|
+
/**
 * Check if a line looks like a markdown table row.
 *
 * @param line - A single line of text.
 * @returns true when the line, ignoring surrounding whitespace, starts with `|`.
 */
function isTableRow(line: string): boolean {
  return line.trim().startsWith('|')
}
|
|
190
|
+
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Paragraph-based splitting (Strategy 4)
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
/**
 * Split text into paragraphs, keeping code blocks and tables as atomic units.
 *
 * Boundaries:
 *  - a blank line ends the current paragraph;
 *  - a fenced code block (from its opening fence through its closing fence)
 *    is always one paragraph, blank lines included;
 *  - a run of consecutive table rows is one paragraph, separated from any
 *    text directly above or below it.
 *
 * @param text - Markdown / plain text.
 * @returns Non-empty paragraph strings in document order.
 */
function splitIntoParagraphs(text: string): string[] {
  const paragraphs: string[] = []
  let current: string[] = []
  let inCodeBlock = false

  // Emit the buffered lines as one trimmed paragraph (if they hold any text), then reset the buffer
  const flush = (): void => {
    if (current.some(l => l.trim())) {
      paragraphs.push(current.join('\n').trim())
    }
    current = []
  }

  // True while the buffer currently ends in a table row
  const endsWithTableRow = (): boolean => {
    const last = current[current.length - 1]
    return last !== undefined && isTableRow(last)
  }

  for (const line of text.split('\n')) {
    // Track code fence boundaries
    if (isCodeFenceStart(line)) {
      if (inCodeBlock) {
        // End of code block — include closing fence, emit as atomic unit.
        // Deliberately not trimmed, so the block is kept exactly as written.
        current.push(line)
        paragraphs.push(current.join('\n'))
        current = []
        inCodeBlock = false
      } else {
        // Start of code block — emit what we have, start atomic unit
        flush()
        current = [line]
        inCodeBlock = true
      }
      continue
    }

    if (inCodeBlock) {
      current.push(line)
      continue
    }

    // Table rows are grouped together as atomic
    if (isTableRow(line)) {
      // If previous content wasn't a table, emit it first
      if (current.length > 0 && !endsWithTableRow()) flush()
      current.push(line)
      continue
    }

    // We were in a table and hit a non-table line — the table is complete
    if (endsWithTableRow()) flush()

    // Empty line = paragraph boundary
    if (line.trim() === '') {
      flush()
      continue
    }

    current.push(line)
  }

  // Remaining text, or an unclosed code block emitted as-is
  flush()

  return paragraphs.filter(p => p.length > 0)
}
|
|
274
|
+
|
|
275
|
+
/**
 * Group paragraphs into chunks of approximately maxTokens.
 * Never splits a paragraph — it's the atomic unit.
 *
 * Paragraphs are packed greedily in order; a new chunk is started whenever
 * adding the next paragraph would push the running estimate past `maxTokens`.
 * Paragraphs within a chunk are joined by a blank line.
 *
 * @param paragraphs - Paragraphs in document order.
 * @param maxTokens - Target upper bound per chunk, in estimated tokens.
 * @returns Chunk texts in document order. A paragraph that is larger than
 *          `maxTokens` on its own is still emitted whole.
 */
function groupParagraphs(paragraphs: string[], maxTokens: number): string[] {
  const chunks: string[] = []
  let group: string[] = []
  let groupTokens = 0

  // Emit the paragraphs gathered so far as one chunk and start a new group
  const emitGroup = (): void => {
    if (group.length === 0) return
    chunks.push(group.join('\n\n'))
    group = []
    groupTokens = 0
  }

  for (const para of paragraphs) {
    const paraTokens = estimateTokens(para)

    // A single paragraph exceeding max goes out as its own chunk (atomic — don't break it)
    if (group.length === 0 && paraTokens > maxTokens) {
      chunks.push(para)
      continue
    }

    // Adding this paragraph would exceed the limit — close the current group first
    if (group.length > 0 && groupTokens + paraTokens > maxTokens) {
      emitGroup()
    }

    group.push(para)
    groupTokens += paraTokens
  }

  emitGroup()
  return chunks
}
|
|
310
|
+
|
|
311
|
+
// ---------------------------------------------------------------------------
|
|
312
|
+
// Section-based splitting (Strategy 1 + 3)
|
|
313
|
+
// ---------------------------------------------------------------------------
|
|
314
|
+
|
|
315
|
+
/**
 * Build section path from nested heading context.
 * E.g. ["Fields", "data_type reference"] → "Fields > data_type reference".
 *
 * @param headingStack - Heading texts ordered from outermost to innermost.
 * @returns The headings joined with " > "; empty entries are skipped, and an
 *          empty stack yields an empty string.
 */
function buildSectionPath(headingStack: string[]): string {
  return headingStack.filter(Boolean).join(' > ')
}
|
|
322
|
+
|
|
323
|
+
/**
 * Recursively split sections that are too large.
 * Strategy 3: sub-split on sub-headings if available, else paragraph split (Strategy 4).
 *
 * If the body contains a heading exactly one level below `currentLevel`, the
 * body is cut at every heading deeper than `currentLevel`; each piece keeps
 * its heading line, and any piece still over `maxTokens` is split again one
 * level down. Otherwise the body is split into paragraphs and regrouped.
 *
 * Heading-looking lines inside a fenced code block are not split points, so a
 * code block is never cut in half here. A fence without a closing partner is
 * ignored for that purpose.
 *
 * @param body - Section text (heading line included).
 * @param currentLevel - Heading level of the section being split.
 * @param maxTokens - Target upper bound per chunk, in estimated tokens.
 * @returns Chunk texts in document order.
 */
function splitOversizedSection(
  body: string,
  currentLevel: number,
  maxTokens: number
): string[] {
  const nextLevel = currentLevel + 1

  // Markdown has no headings below level 6, and `#{7,6}` is not even a valid
  // quantifier — past that depth go straight to paragraph grouping.
  if (nextLevel <= 6) {
    const subHeadingPattern = new RegExp(`^${'#'.repeat(nextLevel)}\\s+`, 'm')

    if (subHeadingPattern.test(body)) {
      // Has sub-headings — split on them (recurse Strategy 1)
      const subHeadingLine = new RegExp(`^(#{${nextLevel},6})\\s+(.+)$`)
      const lines = body.split('\n')

      // Only fences that have a closing partner open a code block
      const fenceTotal = lines.filter(l => isCodeFenceStart(l)).length
      const pairedFences = fenceTotal - (fenceTotal % 2)

      const subChunks: string[] = []
      let currentChunk: string[] = []
      let fencesSeen = 0
      let inCodeBlock = false

      // Emit the lines gathered so far, going one level deeper if they are still too big
      const flush = (): void => {
        const chunkText = currentChunk.join('\n').trim()
        if (!chunkText) return
        if (estimateTokens(chunkText) > maxTokens) {
          subChunks.push(...splitOversizedSection(chunkText, nextLevel, maxTokens))
        } else {
          subChunks.push(chunkText)
        }
      }

      for (const line of lines) {
        if (isCodeFenceStart(line)) {
          if (fencesSeen < pairedFences) inCodeBlock = !inCodeBlock
          fencesSeen++
        }

        const startsSubSection = !inCodeBlock && subHeadingLine.test(line)
        if (startsSubSection && currentChunk.length > 0) {
          flush()
          currentChunk = [line]
        } else {
          currentChunk.push(line)
        }
      }
      flush()

      return subChunks
    }
  }

  // No sub-headings — fall back to paragraph grouping (Strategy 4)
  return groupParagraphs(splitIntoParagraphs(body), maxTokens)
}
|
|
380
|
+
|
|
381
|
+
// ---------------------------------------------------------------------------
|
|
382
|
+
// Merge tiny chunks
|
|
383
|
+
// ---------------------------------------------------------------------------
|
|
384
|
+
|
|
385
|
+
/**
 * Merge chunks that are below the minimum token threshold into their neighbor.
 *
 * A tiny chunk is appended to the chunk before it; only when there is no
 * earlier chunk yet is it prepended to the chunk after it. Merged texts are
 * joined with a blank line.
 *
 * @param chunks - Chunk texts in document order. Not modified.
 * @param minTokens - Chunks estimated below this many tokens get merged.
 * @returns A new array of chunk texts.
 */
function mergeTinyChunks(chunks: string[], minTokens: number): string[] {
  // Work on a copy: forward merges rewrite the next entry, and the caller's array must stay untouched
  const pending = [...chunks]
  if (pending.length <= 1) return pending

  const result: string[] = []

  for (let i = 0; i < pending.length; i++) {
    const text = pending[i]
    const isTiny = estimateTokens(text) < minTokens

    if (isTiny && result.length > 0) {
      // Merge with previous
      result[result.length - 1] += '\n\n' + text
    } else if (isTiny && i < pending.length - 1) {
      // Nothing emitted yet — merge with next
      pending[i + 1] = text + '\n\n' + pending[i + 1]
    } else {
      result.push(text)
    }
  }

  return result
}
|
|
410
|
+
|
|
411
|
+
// ---------------------------------------------------------------------------
|
|
412
|
+
// Main chunker
|
|
413
|
+
// ---------------------------------------------------------------------------
|
|
414
|
+
|
|
415
|
+
/**
 * Chunk an article for embedding using the adaptive recursive strategy.
 *
 * Decision tree:
 * 1. ≤ singleChunkThreshold tokens → single chunk, no splitting
 * 2. Has headings → heading-based split (Strategy 1)
 *    - Oversized sections → sub-split (Strategy 3 → recurse or Strategy 4)
 * 3. No headings → paragraph grouping (Strategy 4)
 * 4. Merge tiny chunks
 * 5. Prefix every chunk with context
 *
 * @param content - Article body, markdown or HTML (detected automatically).
 * @param options - Article title plus optional token limits.
 * @returns Chunks in document order. Every chunk's `content` starts with a
 *          context prefix built from the article title. Empty or
 *          whitespace-only content yields one chunk containing just the title.
 */
export function chunkArticle(content: string, options: ChunkerOptions): Chunk[] {
  const {
    articleTitle,
    maxTokens = 800,
    minTokens = 100,
    singleChunkThreshold = 600,
  } = options

  if (!content || content.trim().length === 0) {
    return [{
      content: articleTitle,
      sectionPath: null,
      chunkIndex: 0,
      chunkTotal: 1,
    }]
  }

  // Primary split at ## level — ### and deeper stay as body content.
  // Sub-splitting (Strategy 3) will split on ### when a section is too large.
  const primarySplitLevel = 2

  // Detect format and normalize to plain text for token counting / splitting
  const isHtml = detectFormat(content) === 'html'
  const sections: Section[] = isHtml
    ? parseHtmlSections(content, primarySplitLevel)
    : parseMarkdownSections(content, primarySplitLevel)
  const plainContent = isHtml ? htmlToPlainText(content) : content

  // ── Step 1: Small article → single chunk ──────────────────────────
  if (estimateTokens(plainContent) <= singleChunkThreshold) {
    return [{
      content: `${articleTitle}\n\n${plainContent}`,
      sectionPath: null,
      chunkIndex: 0,
      chunkTotal: 1,
    }]
  }

  // ── Step 2/3: Check for headings ──────────────────────────────────
  // h1 is treated as preamble/title — only ## and deeper count as section headings
  const hasHeadings = sections.some(s => s.level >= 2)

  const rawChunks: { text: string; sectionPath: string | null }[] = []

  if (hasHeadings) {
    // Strategy 1: heading-based split
    // Track headings by level for proper nesting: level → heading text
    const headingByLevel = new Map<number, string>()

    for (const section of sections) {
      if ((section.level === 0 && !section.heading) || section.level === 1) {
        // Preamble or h1 title — treat as intro context, not a section to chunk
        if (section.body.trim()) {
          rawChunks.push({ text: section.body, sectionPath: null })
        }
        continue
      }

      // Clear this level and all deeper levels, then set current heading
      for (const lvl of [...headingByLevel.keys()]) {
        if (lvl >= section.level) headingByLevel.delete(lvl)
      }
      headingByLevel.set(section.level, section.heading)

      // Build path from sorted levels: ## Parent > ### Child
      const sectionPath = [...headingByLevel.entries()]
        .sort(([a], [b]) => a - b)
        .map(([, heading]) => heading)
        .join(' > ')

      const fullSection = section.heading + '\n' + section.body

      if (estimateTokens(fullSection) > maxTokens) {
        // Strategy 3: sub-split oversized section; number the parts when there are several
        const subChunks = splitOversizedSection(fullSection, section.level, maxTokens)
        subChunks.forEach((text, i) => {
          rawChunks.push({
            text,
            sectionPath: subChunks.length > 1 ? `${sectionPath} (${i + 1}/${subChunks.length})` : sectionPath,
          })
        })
      } else {
        rawChunks.push({ text: fullSection, sectionPath })
      }
    }
  } else {
    // Strategy 4: paragraph grouping for unstructured content
    for (const text of groupParagraphs(splitIntoParagraphs(plainContent), maxTokens)) {
      rawChunks.push({ text, sectionPath: null })
    }
  }

  // ── Step 4: Merge tiny chunks ─────────────────────────────────────
  // We need to merge while preserving sectionPath, so we work at the rawChunks level
  const mergedChunks: { text: string; sectionPath: string | null }[] = []

  for (let i = 0; i < rawChunks.length; i++) {
    const candidate = rawChunks[i]
    const isTiny = estimateTokens(candidate.text) < minTokens

    if (isTiny && mergedChunks.length > 0) {
      // Merge with previous chunk; the previous section path (the primary one) is kept
      mergedChunks[mergedChunks.length - 1].text += '\n\n' + candidate.text
    } else if (isTiny && i < rawChunks.length - 1) {
      // Nothing emitted yet — merge with next chunk, which keeps its own section path
      const next = rawChunks[i + 1]
      next.text = candidate.text + '\n\n' + next.text
    } else {
      mergedChunks.push({ ...candidate })
    }
  }

  // ── Step 5: Prefix and build final chunks ─────────────────────────
  const total = mergedChunks.length

  return mergedChunks.map((chunk, index) => {
    let prefix: string
    if (chunk.sectionPath) {
      prefix = `${articleTitle} > ${chunk.sectionPath}`
    } else if (total > 1) {
      prefix = `${articleTitle} (chunk ${index + 1} of ${total})`
    } else {
      prefix = articleTitle
    }

    return {
      content: `${prefix}\n\n${chunk.text}`,
      sectionPath: chunk.sectionPath,
      chunkIndex: index,
      chunkTotal: total,
    }
  })
}
|