spine-framework-cortex 0.2.20 → 0.2.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,576 @@
1
+ /**
2
+ * Adaptive Article Chunker for KB Embeddings
3
+ *
4
+ * Strategy 5 (Recursive/Hierarchical) as the framework:
5
+ * - Strategy 1 (Heading-Based) for structured docs
6
+ * - Strategy 3 (Heading + Size Guards) for oversized sections
7
+ * - Strategy 4 (Paragraph Grouping) for unstructured prose
8
+ * - Strategy 2 (Fixed Window) deliberately excluded
9
+ *
10
+ * Hard rules:
11
+ * - Never split a code block or table
12
+ * - Never split mid-paragraph
13
+ * - Merge tiny chunks (<100 tokens) into neighbor
14
+ * - Prefix every chunk with context
15
+ */
16
+
17
+ // ---------------------------------------------------------------------------
18
+ // Types
19
+ // ---------------------------------------------------------------------------
20
+
21
/**
 * One embeddable piece of an article, as produced by the chunker.
 */
export interface Chunk {
  /** Text that is sent for embedding. */
  content: string
  /**
   * Heading trail for chunks that came from a heading-based split,
   * e.g. "Fields > data_type reference"; `null` when no heading applies.
   */
  sectionPath: string | null
  /** Position of this chunk inside its article, counted from 0. */
  chunkIndex: number
  /** Number of chunks the article was split into. */
  chunkTotal: number
}
31
+
32
/**
 * Tuning knobs accepted by the article chunker.
 */
export interface ChunkerOptions {
  /** Title of the article; every chunk is prefixed with it for context. */
  articleTitle: string
  /** Upper token bound for a chunk before it is sub-split. Defaults to 800. */
  maxTokens?: number
  /** Lower token bound; smaller chunks are merged into a neighbor. Defaults to 100. */
  minTokens?: number
  /** Articles at or below this token count are emitted as one chunk. Defaults to 600. */
  singleChunkThreshold?: number
}
42
+
43
+ // ---------------------------------------------------------------------------
44
+ // Token estimation
45
+ // ---------------------------------------------------------------------------
46
+
47
/**
 * Approximate how many tokens a piece of text will consume.
 *
 * Uses the common "about four characters per token" heuristic for English
 * text. The result only drives chunking decisions; exact tokenization is
 * left to the embedding API.
 *
 * @param text - Text to measure.
 * @returns The character count divided by four, rounded up.
 */
export function estimateTokens(text: string): number {
  const charsPerToken = 4
  const approximate = text.length / charsPerToken
  return Math.ceil(approximate)
}
54
+
55
+ // ---------------------------------------------------------------------------
56
+ // HTML → plain text
57
+ // ---------------------------------------------------------------------------
58
+
59
/**
 * Strip HTML tags to plain text, preserving structure via newlines.
 *
 * Closing block-level tags (p, div, li, tr, blockquote, h1-h6) and `<br>`
 * become newlines; every other tag is removed. A small set of common
 * entities is then decoded and runs of three or more newlines are collapsed
 * to a single blank line.
 *
 * @param html - HTML markup (or already-plain text).
 * @returns Trimmed plain text.
 */
export function htmlToPlainText(html: string): string {
  // Entities this function knows how to decode.
  const entityText: Record<string, string> = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&nbsp;': ' ',
  }

  return html
    // Normalize line breaks
    .replace(/\r\n?/g, '\n')
    // Block-level elements → newlines
    .replace(/<\/(p|div|li|tr|blockquote)>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/(h[1-6])>/gi, '\n')
    // Strip remaining tags
    .replace(/<[^>]+>/g, '')
    // Decode entities in ONE pass. Decoding `&amp;` in a separate earlier
    // pass would turn an escaped entity such as `&amp;lt;` into `&lt;` and
    // then wrongly into `<` (double decoding).
    .replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, entity => entityText[entity] ?? entity)
    // Collapse excessive newlines
    .replace(/\n{3,}/g, '\n\n')
    .trim()
}
84
+
85
+ // ---------------------------------------------------------------------------
86
+ // Detect content format
87
+ // ---------------------------------------------------------------------------
88
+
89
/** Source formats the chunker distinguishes between. */
type ContentFormat = 'markdown' | 'html'

/**
 * Guess whether article content is HTML or markdown.
 *
 * The content is classified as HTML as soon as it contains the opening of
 * one of a few block-level tags (p, div, h1-h6, ul, ol, table, pre);
 * anything else is treated as markdown.
 */
function detectFormat(content: string): ContentFormat {
  const htmlBlockTag = /<(p|div|h[1-6]|ul|ol|table|pre)\b/i
  return htmlBlockTag.test(content) ? 'html' : 'markdown'
}
96
+
97
+ // ---------------------------------------------------------------------------
98
+ // Heading extraction
99
+ // ---------------------------------------------------------------------------
100
+
101
/** A heading together with the text that directly follows it. */
interface Section {
  /** Heading level: 1-6 for explicit headings, 0 for preamble */
  level: number
  /** Heading text (empty for preamble) */
  heading: string
  /** Body content below this heading (not including sub-sections) */
  body: string
}

/**
 * Parse markdown into a flat list of sections by heading.
 * Each section carries its heading text and the body up to the next
 * heading that qualifies for splitting.
 *
 * Lines inside fenced code blocks (delimited by lines starting with three
 * backticks) are never treated as headings, so a `# comment` in a shell
 * snippet cannot cut a code block in half. Both LF and CRLF line endings
 * are accepted.
 *
 * @param content - Markdown text.
 * @param splitLevel - Only split on headings at this level or shallower.
 *   Deeper headings (e.g. ### when splitLevel=2) are kept as body content.
 *   Default 6 = split on all headings.
 * @returns Sections in document order; text before the first heading is
 *   returned as a level-0 section with an empty heading.
 */
function parseMarkdownSections(content: string, splitLevel: number = 6): Section[] {
  const sections: Section[] = []
  let currentLevel = 0
  let currentHeading = ''
  let bodyLines: string[] = []
  let insideFence = false

  // Emit the section collected so far, unless nothing was collected at all.
  const flushSection = (): void => {
    if (bodyLines.length > 0 || currentHeading) {
      sections.push({
        level: currentLevel,
        heading: currentHeading,
        body: bodyLines.join('\n').trim(),
      })
    }
  }

  // Split on any line-ending style: a trailing "\r" would otherwise stop the
  // heading pattern below from matching.
  for (const line of content.split(/\r\n|\r|\n/)) {
    // A fence line toggles code-block state and always belongs to the body.
    if (/^```/.test(line.trim())) {
      insideFence = !insideFence
      bodyLines.push(line)
      continue
    }

    const headingMatch = insideFence ? null : line.match(/^(#{1,6})\s+(.+)$/)
    const hashes = headingMatch?.[1] ?? ''

    if (headingMatch && hashes.length <= splitLevel) {
      flushSection()
      currentLevel = hashes.length
      currentHeading = headingMatch[2] ?? ''
      bodyLines = []
    } else {
      bodyLines.push(line)
    }
  }

  flushSection()
  return sections
}
155
+
156
/**
 * Parse HTML into sections by its h1-h6 tags.
 *
 * Each heading element is first rewritten as a markdown-style heading line
 * (`## Title`), the remaining markup is converted to plain text, and the
 * result is handed to the markdown section parser.
 *
 * @param html - HTML markup of the article.
 * @param splitLevel - Forwarded to the markdown parser: only headings at
 *   this level or shallower start a new section. Default 6.
 * @returns Sections in document order.
 */
function parseHtmlSections(html: string, splitLevel: number = 6): Section[] {
  // `[\s\S]*?` instead of `.*?`: heading elements whose text is wrapped over
  // several lines must match too, otherwise the heading is silently lost.
  const marked = html.replace(
    /<h([1-6])[^>]*>([\s\S]*?)<\/h\1>/gi,
    (_match: string, level: string, inner: string) => {
      // Strip nested markup and fold whitespace so the heading fits one line.
      const cleanText = inner.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim()
      // An empty heading would only leave a stray "## " line in the body.
      if (!cleanText) return '\n'
      const hashes = '#'.repeat(Number.parseInt(level, 10))
      return `\n${hashes} ${cleanText}\n`
    }
  )
  // Convert the rest to plain text, then reuse the markdown parser.
  return parseMarkdownSections(htmlToPlainText(marked), splitLevel)
}
172
+
173
+ // ---------------------------------------------------------------------------
174
+ // Atomic block detection
175
+ // ---------------------------------------------------------------------------
176
+
177
/**
 * Tell whether a line is a fenced-code delimiter, i.e. starts with three
 * backticks once surrounding whitespace is ignored. Opening and closing
 * fences both match.
 */
function isCodeFenceStart(line: string): boolean {
  const trimmed = line.trim()
  return /^```/.test(trimmed)
}
183
+
184
/**
 * Tell whether a line looks like a markdown table row: its first
 * non-whitespace character is a pipe (`|`).
 */
function isTableRow(line: string): boolean {
  const trimmed = line.trim()
  return /^\|/.test(trimmed)
}
190
+
191
+ // ---------------------------------------------------------------------------
192
+ // Paragraph-based splitting (Strategy 4)
193
+ // ---------------------------------------------------------------------------
194
+
195
/**
 * Break text into paragraph units.
 *
 * Blank lines separate ordinary paragraphs. A fenced code block (from its
 * opening fence through its closing fence) and a run of consecutive table
 * rows are each kept together as a single unit, whatever blank lines a code
 * block contains.
 *
 * @param text - Plain or markdown text.
 * @returns Non-empty paragraph strings in document order.
 */
function splitIntoParagraphs(text: string): string[] {
  const units: string[] = []
  let pending: string[] = []
  let insideFence = false

  const hasText = (): boolean => pending.some(entry => entry.trim())
  const endsWithTableRow = (): boolean =>
    pending.length > 0 && isTableRow(pending[pending.length - 1])
  const emitTrimmed = (): void => {
    units.push(pending.join('\n').trim())
  }

  for (const line of text.split('\n')) {
    if (isCodeFenceStart(line)) {
      if (insideFence) {
        // Closing fence: the whole block goes out untouched as one unit.
        pending.push(line)
        units.push(pending.join('\n'))
        pending = []
      } else {
        // Opening fence: whatever came before is its own unit.
        if (pending.length > 0 && hasText()) emitTrimmed()
        pending = [line]
      }
      insideFence = !insideFence
      continue
    }

    // Everything between fences belongs to the code block, blank lines too.
    if (insideFence) {
      pending.push(line)
      continue
    }

    if (isTableRow(line)) {
      // First row of a table: close the preceding non-table content.
      if (pending.length > 0 && !endsWithTableRow()) {
        if (hasText()) emitTrimmed()
        pending = []
      }
      pending.push(line)
      continue
    }

    // A non-table line right after table rows ends the table.
    if (endsWithTableRow()) {
      emitTrimmed()
      pending = []
    }

    // A blank line ends the current paragraph.
    if (line.trim() === '') {
      if (pending.length > 0 && hasText()) {
        emitTrimmed()
        pending = []
      }
      continue
    }

    pending.push(line)
  }

  // Leftovers: an unclosed code block is emitted as-is, other content only
  // when it holds some text.
  const emitRest = insideFence ? pending.length > 0 : pending.length > 0 && hasText()
  if (emitRest) emitTrimmed()

  return units.filter(unit => unit.length > 0)
}
274
+
275
/**
 * Pack paragraphs into chunks of roughly `maxTokens` estimated tokens.
 *
 * Paragraphs are atomic: one that is larger than `maxTokens` on its own
 * still ends up intact in a chunk. Paragraphs inside a chunk are joined
 * with a blank line.
 *
 * @param paragraphs - Paragraph units in document order.
 * @param maxTokens - Target upper bound (estimated tokens) per chunk.
 * @returns Chunk strings in document order.
 */
function groupParagraphs(paragraphs: string[], maxTokens: number): string[] {
  const groups: string[] = []
  let bucket: string[] = []
  let bucketTokens = 0

  for (const paragraph of paragraphs) {
    const size = estimateTokens(paragraph)

    if (bucket.length === 0) {
      if (size > maxTokens) {
        // Oversized and nothing pending: emit it alone right away.
        groups.push(paragraph)
      } else {
        bucket.push(paragraph)
        bucketTokens += size
      }
      continue
    }

    if (bucketTokens + size > maxTokens) {
      // No room left: close the bucket and start a new one with this paragraph.
      groups.push(bucket.join('\n\n'))
      bucket = [paragraph]
      bucketTokens = size
    } else {
      bucket.push(paragraph)
      bucketTokens += size
    }
  }

  if (bucket.length > 0) {
    groups.push(bucket.join('\n\n'))
  }

  return groups
}
310
+
311
+ // ---------------------------------------------------------------------------
312
+ // Section-based splitting (Strategy 1 + 3)
313
+ // ---------------------------------------------------------------------------
314
+
315
/**
 * Join a stack of nested heading texts into a readable section path.
 * Empty entries are skipped and the rest are separated by " > ",
 * e.g. ["Fields", "data_type reference"] → "Fields > data_type reference".
 */
function buildSectionPath(headingStack: string[]): string {
  const nonEmpty = headingStack.filter(heading => Boolean(heading))
  return nonEmpty.join(' > ')
}
322
+
323
/**
 * Recursively split a section that is too large.
 *
 * Strategy 3: when the body has a heading exactly one level below
 * `currentLevel`, split in front of every deeper heading and recurse into
 * pieces that are still over `maxTokens`. Otherwise fall back to paragraph
 * grouping (Strategy 4).
 *
 * Heading-looking lines inside fenced code blocks are ignored, so a code
 * block is never cut apart here.
 *
 * @param body - Section text, including its own heading line if any.
 * @param currentLevel - Heading level of the section being split.
 * @param maxTokens - Target upper bound (estimated tokens) per chunk.
 * @returns Chunk texts in document order.
 */
function splitOversizedSection(
  body: string,
  currentLevel: number,
  maxTokens: number
): string[] {
  const subLevel = currentLevel + 1

  // Markdown has no headings deeper than level 6. Skipping the attempt also
  // avoids building an invalid `#{7,6}` quantifier, which would throw.
  if (subLevel <= 6) {
    // Built once per call rather than once per line.
    const directSubHeading = new RegExp(`^#{${subLevel}}(?:\\s|$)`)
    const anySubHeading = new RegExp(`^(#{${subLevel},6})\\s+(.+)$`)

    const pieces: string[] = []
    let piece: string[] = []
    let insideFence = false
    let hasDirectSubHeading = false

    for (const line of body.split('\n')) {
      if (isCodeFenceStart(line)) {
        insideFence = !insideFence
      } else if (!insideFence) {
        if (directSubHeading.test(line)) hasDirectSubHeading = true
        // A sub-heading starts a new piece, unless nothing precedes it.
        if (anySubHeading.test(line) && piece.length > 0) {
          pieces.push(piece.join('\n').trim())
          piece = []
        }
      }
      piece.push(line)
    }
    if (piece.length > 0) {
      pieces.push(piece.join('\n').trim())
    }

    if (hasDirectSubHeading) {
      const subChunks: string[] = []
      for (const text of pieces) {
        if (!text) continue
        if (estimateTokens(text) > maxTokens) {
          // Still too big — go one heading level deeper
          subChunks.push(...splitOversizedSection(text, subLevel, maxTokens))
        } else {
          subChunks.push(text)
        }
      }
      return subChunks
    }
  }

  // No usable sub-headings — fall back to paragraph grouping (Strategy 4)
  return groupParagraphs(splitIntoParagraphs(body), maxTokens)
}
380
+
381
+ // ---------------------------------------------------------------------------
382
+ // Merge tiny chunks
383
+ // ---------------------------------------------------------------------------
384
+
385
/**
 * Merge chunks that are below the minimum token threshold into a neighbor.
 *
 * A tiny chunk is appended to the previously emitted chunk when there is
 * one. A tiny chunk at the very start is prepended to the chunk that
 * follows it instead. Merged texts are separated by a blank line.
 *
 * The input array is not modified.
 *
 * @param chunks - Chunk texts in document order.
 * @param minTokens - Chunks with fewer estimated tokens than this are merged.
 * @returns The merged chunk list (the input itself when it has at most one
 *   element).
 */
function mergeTinyChunks(chunks: string[], minTokens: number): string[] {
  if (chunks.length <= 1) return chunks

  const result: string[] = []
  // Leading tiny text waiting to be prepended to the next chunk. Held in a
  // local variable so the caller's array is never written to.
  let carried: string | null = null

  chunks.forEach((chunk, index) => {
    const text: string = carried === null ? chunk : carried + '\n\n' + chunk
    carried = null

    const isTiny = estimateTokens(text) < minTokens

    if (isTiny && result.length > 0) {
      // Merge with previous
      result[result.length - 1] += '\n\n' + text
    } else if (isTiny && index < chunks.length - 1) {
      // Nothing emitted yet — merge with next
      carried = text
    } else {
      result.push(text)
    }
  })

  return result
}
410
+
411
+ // ---------------------------------------------------------------------------
412
+ // Main chunker
413
+ // ---------------------------------------------------------------------------
414
+
415
/**
 * Chunk an article for embedding using the adaptive recursive strategy.
 *
 * Decision tree:
 * 1. ≤ singleChunkThreshold tokens → single chunk, no splitting
 * 2. Has headings → heading-based split (Strategy 1)
 *    - Oversized sections → sub-split (Strategy 3 → recurse or Strategy 4)
 * 3. No headings → paragraph grouping (Strategy 4)
 * 4. Merge tiny chunks
 * 5. Prefix every chunk with context
 *
 * @param content - Article body, markdown or HTML (the format is detected).
 * @param options - Article title plus optional size thresholds.
 * @returns Chunks in document order. Content that is empty, or empty once
 *   HTML markup is stripped, yields one chunk holding only the title.
 */
export function chunkArticle(content: string, options: ChunkerOptions): Chunk[] {
  const {
    articleTitle,
    maxTokens = 800,
    minTokens = 100,
    singleChunkThreshold = 600,
  } = options

  type RawChunk = { text: string; sectionPath: string | null }

  // Result used whenever there is no body text to embed.
  const titleOnly = (): Chunk[] => [{
    content: articleTitle,
    sectionPath: null,
    chunkIndex: 0,
    chunkTotal: 1,
  }]

  if (!content || content.trim().length === 0) {
    return titleOnly()
  }

  // Primary split at ## level — ### and deeper stay as body content.
  // Sub-splitting (Strategy 3) will split on ### when a section is too large.
  const primarySplitLevel = 2

  // Detect format and normalize to plain text for token counting / splitting
  const isHtml = detectFormat(content) === 'html'
  const sections = isHtml
    ? parseHtmlSections(content, primarySplitLevel)
    : parseMarkdownSections(content, primarySplitLevel)
  const plainContent = isHtml ? htmlToPlainText(content) : content

  // Markup-only HTML strips down to nothing; treat it like empty content
  // instead of emitting the title followed by dangling blank lines.
  if (plainContent.trim().length === 0) {
    return titleOnly()
  }

  // ── Step 1: Small article → single chunk ──────────────────────────
  if (estimateTokens(plainContent) <= singleChunkThreshold) {
    return [{
      content: `${articleTitle}\n\n${plainContent}`,
      sectionPath: null,
      chunkIndex: 0,
      chunkTotal: 1,
    }]
  }

  // ── Step 2/3: Check for headings ──────────────────────────────────
  // h1 is treated as preamble/title — only ## and deeper count as section headings
  const hasHeadings = sections.some(section => section.level >= 2)
  const rawChunks: RawChunk[] = []

  if (hasHeadings) {
    // Strategy 1: heading-based split
    // Track headings by level for proper nesting: level → heading text
    const headingByLevel = new Map<number, string>()

    for (const section of sections) {
      const isIntro = (section.level === 0 && !section.heading) || section.level === 1
      if (isIntro) {
        // Preamble or h1 title — kept as an intro chunk without a section path
        if (section.body.trim()) {
          rawChunks.push({ text: section.body, sectionPath: null })
        }
        continue
      }

      // Clear this level and all deeper levels, then set current heading
      for (const level of [...headingByLevel.keys()]) {
        if (level >= section.level) headingByLevel.delete(level)
      }
      headingByLevel.set(section.level, section.heading)

      // Build path from sorted levels: ## Parent > ### Child
      const sortedLevels = [...headingByLevel.keys()].sort((a, b) => a - b)
      const sectionPath = buildSectionPath(
        sortedLevels.map(level => headingByLevel.get(level) ?? '')
      )
      const fullSection = section.heading + '\n' + section.body

      if (estimateTokens(fullSection) > maxTokens) {
        // Strategy 3: sub-split oversized section
        const subChunks = splitOversizedSection(fullSection, section.level, maxTokens)
        subChunks.forEach((text, i) => {
          rawChunks.push({
            text,
            // Number the parts only when the section really was split
            sectionPath: subChunks.length > 1
              ? `${sectionPath} (${i + 1}/${subChunks.length})`
              : sectionPath,
          })
        })
      } else {
        rawChunks.push({ text: fullSection, sectionPath })
      }
    }
  } else {
    // Strategy 4: paragraph grouping for unstructured content
    const grouped = groupParagraphs(splitIntoParagraphs(plainContent), maxTokens)
    for (const text of grouped) {
      rawChunks.push({ text, sectionPath: null })
    }
  }

  // ── Step 4: Merge tiny chunks ─────────────────────────────────────
  // Merging happens on raw chunks so each result keeps a section path.
  const mergedChunks: RawChunk[] = []
  // Leading tiny text waiting to be prepended to the next raw chunk.
  let carried: string | null = null

  rawChunks.forEach((raw, i) => {
    const text: string = carried === null ? raw.text : carried + '\n\n' + raw.text
    carried = null

    const isTiny = estimateTokens(text) < minTokens
    const previous = mergedChunks[mergedChunks.length - 1]

    if (isTiny && previous !== undefined) {
      // Merge with previous chunk; it keeps its own (primary) section path
      previous.text += '\n\n' + text
    } else if (isTiny && i < rawChunks.length - 1) {
      // Nothing emitted yet — merge with next chunk, which keeps its section path
      carried = text
    } else {
      mergedChunks.push({ text, sectionPath: raw.sectionPath })
    }
  })

  // ── Step 5: Prefix and build final chunks ─────────────────────────
  const total = mergedChunks.length

  return mergedChunks.map((chunk, index) => {
    let prefix: string
    if (chunk.sectionPath) {
      prefix = `${articleTitle} > ${chunk.sectionPath}`
    } else if (total > 1) {
      prefix = `${articleTitle} (chunk ${index + 1} of ${total})`
    } else {
      prefix = articleTitle
    }

    return {
      content: `${prefix}\n\n${chunk.text}`,
      sectionPath: chunk.sectionPath,
      chunkIndex: index,
      chunkTotal: total,
    }
  })
}