@twelvehart/supermemory-runtime 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. package/.env.example +57 -0
  2. package/README.md +374 -0
  3. package/dist/index.js +189 -0
  4. package/dist/mcp/index.js +1132 -0
  5. package/docker-compose.prod.yml +91 -0
  6. package/docker-compose.yml +358 -0
  7. package/drizzle/0000_dapper_the_professor.sql +159 -0
  8. package/drizzle/0001_api_keys.sql +51 -0
  9. package/drizzle/meta/0000_snapshot.json +1532 -0
  10. package/drizzle/meta/_journal.json +13 -0
  11. package/drizzle.config.ts +20 -0
  12. package/package.json +114 -0
  13. package/scripts/add-extraction-job.ts +122 -0
  14. package/scripts/benchmark-pgvector.ts +122 -0
  15. package/scripts/bootstrap.sh +209 -0
  16. package/scripts/check-runtime-pack.ts +111 -0
  17. package/scripts/claude-mcp-config.ts +336 -0
  18. package/scripts/docker-entrypoint.sh +183 -0
  19. package/scripts/doctor.ts +377 -0
  20. package/scripts/init-db.sql +33 -0
  21. package/scripts/install.sh +1110 -0
  22. package/scripts/mcp-setup.ts +271 -0
  23. package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
  24. package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
  25. package/scripts/migrations/003_create_hnsw_index.sql +94 -0
  26. package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
  27. package/scripts/migrations/005_create_chunks_table.sql +95 -0
  28. package/scripts/migrations/006_create_processing_queue.sql +45 -0
  29. package/scripts/migrations/generate_test_data.sql +42 -0
  30. package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
  31. package/scripts/migrations/run_migrations.sh +286 -0
  32. package/scripts/migrations/test_hnsw_index.sql +255 -0
  33. package/scripts/pre-commit-secrets +282 -0
  34. package/scripts/run-extraction-worker.ts +46 -0
  35. package/scripts/run-phase1-tests.sh +291 -0
  36. package/scripts/setup.ts +222 -0
  37. package/scripts/smoke-install.sh +12 -0
  38. package/scripts/test-health-endpoint.sh +328 -0
  39. package/src/api/index.ts +2 -0
  40. package/src/api/middleware/auth.ts +80 -0
  41. package/src/api/middleware/csrf.ts +308 -0
  42. package/src/api/middleware/errorHandler.ts +166 -0
  43. package/src/api/middleware/rateLimit.ts +360 -0
  44. package/src/api/middleware/validation.ts +514 -0
  45. package/src/api/routes/documents.ts +286 -0
  46. package/src/api/routes/profiles.ts +237 -0
  47. package/src/api/routes/search.ts +71 -0
  48. package/src/api/stores/index.ts +58 -0
  49. package/src/config/bootstrap-env.ts +3 -0
  50. package/src/config/env.ts +71 -0
  51. package/src/config/feature-flags.ts +25 -0
  52. package/src/config/index.ts +140 -0
  53. package/src/config/secrets.config.ts +291 -0
  54. package/src/db/client.ts +92 -0
  55. package/src/db/index.ts +73 -0
  56. package/src/db/postgres.ts +72 -0
  57. package/src/db/schema/chunks.schema.ts +31 -0
  58. package/src/db/schema/containers.schema.ts +46 -0
  59. package/src/db/schema/documents.schema.ts +49 -0
  60. package/src/db/schema/embeddings.schema.ts +32 -0
  61. package/src/db/schema/index.ts +11 -0
  62. package/src/db/schema/memories.schema.ts +72 -0
  63. package/src/db/schema/profiles.schema.ts +34 -0
  64. package/src/db/schema/queue.schema.ts +59 -0
  65. package/src/db/schema/relationships.schema.ts +42 -0
  66. package/src/db/schema.ts +223 -0
  67. package/src/db/worker-connection.ts +47 -0
  68. package/src/index.ts +235 -0
  69. package/src/mcp/CLAUDE.md +1 -0
  70. package/src/mcp/index.ts +1380 -0
  71. package/src/mcp/legacyState.ts +22 -0
  72. package/src/mcp/rateLimit.ts +358 -0
  73. package/src/mcp/resources.ts +309 -0
  74. package/src/mcp/results.ts +104 -0
  75. package/src/mcp/tools.ts +401 -0
  76. package/src/queues/config.ts +119 -0
  77. package/src/queues/index.ts +289 -0
  78. package/src/sdk/client.ts +225 -0
  79. package/src/sdk/errors.ts +266 -0
  80. package/src/sdk/http.ts +560 -0
  81. package/src/sdk/index.ts +244 -0
  82. package/src/sdk/resources/base.ts +65 -0
  83. package/src/sdk/resources/connections.ts +204 -0
  84. package/src/sdk/resources/documents.ts +163 -0
  85. package/src/sdk/resources/index.ts +10 -0
  86. package/src/sdk/resources/memories.ts +150 -0
  87. package/src/sdk/resources/search.ts +60 -0
  88. package/src/sdk/resources/settings.ts +36 -0
  89. package/src/sdk/types.ts +674 -0
  90. package/src/services/chunking/index.ts +451 -0
  91. package/src/services/chunking.service.ts +650 -0
  92. package/src/services/csrf.service.ts +252 -0
  93. package/src/services/documents.repository.ts +219 -0
  94. package/src/services/documents.service.ts +191 -0
  95. package/src/services/embedding.service.ts +404 -0
  96. package/src/services/extraction.service.ts +300 -0
  97. package/src/services/extractors/code.extractor.ts +451 -0
  98. package/src/services/extractors/index.ts +9 -0
  99. package/src/services/extractors/markdown.extractor.ts +461 -0
  100. package/src/services/extractors/pdf.extractor.ts +315 -0
  101. package/src/services/extractors/text.extractor.ts +118 -0
  102. package/src/services/extractors/url.extractor.ts +243 -0
  103. package/src/services/index.ts +235 -0
  104. package/src/services/ingestion.service.ts +177 -0
  105. package/src/services/llm/anthropic.ts +400 -0
  106. package/src/services/llm/base.ts +460 -0
  107. package/src/services/llm/contradiction-detector.service.ts +526 -0
  108. package/src/services/llm/heuristics.ts +148 -0
  109. package/src/services/llm/index.ts +309 -0
  110. package/src/services/llm/memory-classifier.service.ts +383 -0
  111. package/src/services/llm/memory-extension-detector.service.ts +523 -0
  112. package/src/services/llm/mock.ts +470 -0
  113. package/src/services/llm/openai.ts +398 -0
  114. package/src/services/llm/prompts.ts +438 -0
  115. package/src/services/llm/types.ts +373 -0
  116. package/src/services/memory.repository.ts +1769 -0
  117. package/src/services/memory.service.ts +1338 -0
  118. package/src/services/memory.types.ts +234 -0
  119. package/src/services/persistence/index.ts +295 -0
  120. package/src/services/pipeline.service.ts +509 -0
  121. package/src/services/profile.repository.ts +436 -0
  122. package/src/services/profile.service.ts +560 -0
  123. package/src/services/profile.types.ts +270 -0
  124. package/src/services/relationships/detector.ts +1128 -0
  125. package/src/services/relationships/index.ts +268 -0
  126. package/src/services/relationships/memory-integration.ts +459 -0
  127. package/src/services/relationships/strategies.ts +132 -0
  128. package/src/services/relationships/types.ts +370 -0
  129. package/src/services/search.service.ts +761 -0
  130. package/src/services/search.types.ts +220 -0
  131. package/src/services/secrets.service.ts +384 -0
  132. package/src/services/vectorstore/base.ts +327 -0
  133. package/src/services/vectorstore/index.ts +444 -0
  134. package/src/services/vectorstore/memory.ts +286 -0
  135. package/src/services/vectorstore/migration.ts +295 -0
  136. package/src/services/vectorstore/mock.ts +403 -0
  137. package/src/services/vectorstore/pgvector.ts +695 -0
  138. package/src/services/vectorstore/types.ts +247 -0
  139. package/src/startup.ts +389 -0
  140. package/src/types/api.types.ts +193 -0
  141. package/src/types/document.types.ts +103 -0
  142. package/src/types/index.ts +241 -0
  143. package/src/types/profile.base.ts +133 -0
  144. package/src/utils/errors.ts +447 -0
  145. package/src/utils/id.ts +15 -0
  146. package/src/utils/index.ts +101 -0
  147. package/src/utils/logger.ts +313 -0
  148. package/src/utils/sanitization.ts +501 -0
  149. package/src/utils/secret-validation.ts +273 -0
  150. package/src/utils/synonyms.ts +188 -0
  151. package/src/utils/validation.ts +581 -0
  152. package/src/workers/chunking.worker.ts +242 -0
  153. package/src/workers/embedding.worker.ts +358 -0
  154. package/src/workers/extraction.worker.ts +346 -0
  155. package/src/workers/indexing.worker.ts +505 -0
  156. package/tsconfig.json +38 -0
@@ -0,0 +1,650 @@
1
+ /**
2
+ * Smart chunking service - splits content into meaningful chunks
3
+ */
4
+
5
+ import { v4 as uuidv4 } from 'uuid'
6
+ import {
7
+ Chunk,
8
+ ChunkType,
9
+ ChunkPosition,
10
+ ChunkMetadata,
11
+ ChunkingOptions,
12
+ ContentType,
13
+ } from '../types/document.types.js'
14
+ import { MarkdownExtractor, MarkdownSection } from './extractors/markdown.extractor.js'
15
+ import { CodeExtractor, CodeBlock } from './extractors/code.extractor.js'
16
+
17
// Fallback chunking parameters applied when callers omit options.
// Sizes are measured in characters (see maxChunkSize/minChunkSize usage below).
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  maxChunkSize: 1500,
  minChunkSize: 100,
  overlap: 100,
  preserveStructure: true,
}
23
+
24
/**
 * Smart chunking service: splits document content into retrieval-sized chunks
 * using a strategy picked per content type (markdown headings, code structure,
 * or plain-text paragraphs).
 */
export class ChunkingService {
  // Parses markdown into a section tree for heading-based chunking
  private readonly markdownExtractor: MarkdownExtractor
  // Detects language and parses source into logical blocks for AST-based chunking
  private readonly codeExtractor: CodeExtractor

  constructor() {
    this.markdownExtractor = new MarkdownExtractor()
    this.codeExtractor = new CodeExtractor()
  }
32
+
33
+ /**
34
+ * Chunk content based on type
35
+ */
36
+ chunk(documentId: string, content: string, contentType: ContentType, options?: ChunkingOptions): Chunk[] {
37
+ const opts = { ...DEFAULT_OPTIONS, ...options }
38
+
39
+ switch (contentType) {
40
+ case 'markdown':
41
+ return this.chunkByHeadings(documentId, content, opts)
42
+ case 'code':
43
+ return this.chunkByAST(documentId, content, opts)
44
+ default:
45
+ return this.chunkBySemanticSections(documentId, content, opts)
46
+ }
47
+ }
48
+
49
+ /**
50
+ * Chunk by semantic sections (paragraphs, logical breaks)
51
+ */
52
+ chunkBySemanticSections(documentId: string, text: string, options?: ChunkingOptions): Chunk[] {
53
+ const opts = { ...DEFAULT_OPTIONS, ...options }
54
+ const chunks: Chunk[] = []
55
+
56
+ // Split into paragraphs first
57
+ const paragraphs = text
58
+ .split(/\n\n+/)
59
+ .map((p) => p.trim())
60
+ .filter((p) => p.length > 0)
61
+
62
+ let currentContent = ''
63
+ let currentStart = 0
64
+ let chunkIndex = 0
65
+
66
+ for (let i = 0; i < paragraphs.length; i++) {
67
+ const paragraph = paragraphs[i] ?? ''
68
+ const testContent = currentContent ? `${currentContent}\n\n${paragraph}` : paragraph
69
+
70
+ if (testContent.length > opts.maxChunkSize && currentContent.length > 0) {
71
+ // Current chunk is full, save it
72
+ chunks.push(
73
+ this.createChunk(
74
+ documentId,
75
+ currentContent,
76
+ 'paragraph',
77
+ {
78
+ index: chunkIndex,
79
+ start: currentStart,
80
+ end: currentStart + currentContent.length,
81
+ },
82
+ {}
83
+ )
84
+ )
85
+
86
+ chunkIndex++
87
+
88
+ // Handle overlap by including end of previous chunk
89
+ if (opts.overlap > 0 && currentContent.length > opts.overlap) {
90
+ const overlapText = currentContent.slice(-opts.overlap)
91
+ const previousContentLength = currentContent.length
92
+ currentContent = `${overlapText}\n\n${paragraph}`
93
+ currentStart = currentStart + previousContentLength - opts.overlap
94
+ } else {
95
+ currentContent = paragraph
96
+ currentStart = this.findPosition(text, paragraph, currentStart)
97
+ }
98
+ } else {
99
+ currentContent = testContent
100
+ if (i === 0) {
101
+ currentStart = 0
102
+ }
103
+ }
104
+ }
105
+
106
+ // Save remaining content
107
+ if (currentContent.length >= opts.minChunkSize) {
108
+ chunks.push(
109
+ this.createChunk(
110
+ documentId,
111
+ currentContent,
112
+ 'paragraph',
113
+ {
114
+ index: chunkIndex,
115
+ start: currentStart,
116
+ end: currentStart + currentContent.length,
117
+ },
118
+ {}
119
+ )
120
+ )
121
+ } else if (chunks.length > 0 && currentContent.length > 0) {
122
+ // Merge with previous chunk if too small
123
+ const lastChunk = chunks[chunks.length - 1]
124
+ if (lastChunk) {
125
+ lastChunk.content += `\n\n${currentContent}`
126
+ lastChunk.position.end += currentContent.length + 2
127
+ lastChunk.metadata.charCount = lastChunk.content.length
128
+ lastChunk.metadata.wordCount = lastChunk.content.split(/\s+/).length
129
+ }
130
+ }
131
+
132
+ return chunks
133
+ }
134
+
135
+ /**
136
+ * Chunk markdown by headings
137
+ */
138
+ chunkByHeadings(documentId: string, markdown: string, options?: ChunkingOptions): Chunk[] {
139
+ const opts = { ...DEFAULT_OPTIONS, ...options }
140
+ const sections = this.markdownExtractor.parseSections(markdown)
141
+ const flatSections = this.markdownExtractor.flattenSections(sections)
142
+ const chunks: Chunk[] = []
143
+
144
+ for (const section of flatSections) {
145
+ const fullContent = section.heading
146
+ ? `${'#'.repeat(section.level)} ${section.heading}\n\n${section.content}`
147
+ : section.content
148
+
149
+ if (fullContent.length <= opts.maxChunkSize) {
150
+ chunks.push(
151
+ this.createChunk(
152
+ documentId,
153
+ fullContent,
154
+ section.level > 0 ? 'heading' : 'section',
155
+ {
156
+ index: chunks.length,
157
+ start: section.startLine,
158
+ end: section.endLine,
159
+ lineStart: section.startLine,
160
+ lineEnd: section.endLine,
161
+ },
162
+ {
163
+ headingLevel: section.level,
164
+ headingText: section.heading,
165
+ }
166
+ )
167
+ )
168
+ } else {
169
+ // Section too large, split by paragraphs with heading context
170
+ const sectionChunks = this.splitLargeSection(documentId, section, opts, chunks.length)
171
+ chunks.push(...sectionChunks)
172
+ }
173
+ }
174
+
175
+ return chunks
176
+ }
177
+
178
+ /**
179
+ * Split a large section into smaller chunks
180
+ */
181
+ private splitLargeSection(
182
+ documentId: string,
183
+ section: MarkdownSection,
184
+ options: Required<ChunkingOptions>,
185
+ startIndex: number
186
+ ): Chunk[] {
187
+ const chunks: Chunk[] = []
188
+ const headingPrefix = section.heading ? `${'#'.repeat(section.level)} ${section.heading}\n\n` : ''
189
+
190
+ const paragraphs = section.content
191
+ .split(/\n\n+/)
192
+ .map((p) => p.trim())
193
+ .filter((p) => p.length > 0)
194
+
195
+ let currentContent = headingPrefix
196
+ let chunkIndex = startIndex
197
+
198
+ for (const paragraph of paragraphs) {
199
+ const testContent = currentContent + paragraph + '\n\n'
200
+
201
+ if (testContent.length > options.maxChunkSize) {
202
+ if (currentContent.length > headingPrefix.length) {
203
+ chunks.push(
204
+ this.createChunk(
205
+ documentId,
206
+ currentContent.trim(),
207
+ 'section',
208
+ {
209
+ index: chunkIndex,
210
+ start: section.startLine,
211
+ end: section.endLine,
212
+ lineStart: section.startLine,
213
+ lineEnd: section.endLine,
214
+ },
215
+ {
216
+ headingLevel: section.level,
217
+ headingText: section.heading,
218
+ }
219
+ )
220
+ )
221
+ chunkIndex++
222
+ currentContent = headingPrefix
223
+ }
224
+
225
+ // If single paragraph is too large, split it
226
+ if (paragraph.length > options.maxChunkSize) {
227
+ const subChunks = this.splitLargeParagraph(documentId, paragraph, options, chunkIndex, section)
228
+ chunks.push(...subChunks)
229
+ chunkIndex += subChunks.length
230
+ continue
231
+ }
232
+ }
233
+
234
+ currentContent += paragraph + '\n\n'
235
+ }
236
+
237
+ // Save remaining content
238
+ if (currentContent.length > headingPrefix.length) {
239
+ chunks.push(
240
+ this.createChunk(
241
+ documentId,
242
+ currentContent.trim(),
243
+ 'section',
244
+ {
245
+ index: chunkIndex,
246
+ start: section.startLine,
247
+ end: section.endLine,
248
+ },
249
+ {
250
+ headingLevel: section.level,
251
+ headingText: section.heading,
252
+ }
253
+ )
254
+ )
255
+ }
256
+
257
+ return chunks
258
+ }
259
+
260
+ /**
261
+ * Split a large paragraph into sentence-based chunks
262
+ */
263
+ private splitLargeParagraph(
264
+ documentId: string,
265
+ paragraph: string,
266
+ options: Required<ChunkingOptions>,
267
+ startIndex: number,
268
+ section?: MarkdownSection
269
+ ): Chunk[] {
270
+ const chunks: Chunk[] = []
271
+ const sentences = this.splitIntoSentences(paragraph)
272
+
273
+ let currentContent = ''
274
+ let chunkIndex = startIndex
275
+
276
+ for (const sentence of sentences) {
277
+ const testContent = currentContent + sentence + ' '
278
+
279
+ if (testContent.length > options.maxChunkSize && currentContent.length > 0) {
280
+ chunks.push(
281
+ this.createChunk(
282
+ documentId,
283
+ currentContent.trim(),
284
+ 'paragraph',
285
+ {
286
+ index: chunkIndex,
287
+ start: 0,
288
+ end: currentContent.length,
289
+ },
290
+ section
291
+ ? {
292
+ headingLevel: section.level,
293
+ headingText: section.heading,
294
+ }
295
+ : {}
296
+ )
297
+ )
298
+ chunkIndex++
299
+
300
+ // Add overlap
301
+ if (options.overlap > 0) {
302
+ const words = currentContent.split(' ')
303
+ const overlapWords = Math.floor(options.overlap / 6) // Approx 6 chars per word
304
+ currentContent = words.slice(-overlapWords).join(' ') + ' ' + sentence + ' '
305
+ } else {
306
+ currentContent = sentence + ' '
307
+ }
308
+ } else {
309
+ currentContent = testContent
310
+ }
311
+ }
312
+
313
+ if (currentContent.trim().length > 0) {
314
+ chunks.push(
315
+ this.createChunk(
316
+ documentId,
317
+ currentContent.trim(),
318
+ 'paragraph',
319
+ {
320
+ index: chunkIndex,
321
+ start: 0,
322
+ end: currentContent.length,
323
+ },
324
+ {}
325
+ )
326
+ )
327
+ }
328
+
329
+ return chunks
330
+ }
331
+
332
+ /**
333
+ * Chunk code by AST structure
334
+ */
335
+ chunkByAST(documentId: string, code: string, options?: ChunkingOptions, language?: string): Chunk[] {
336
+ const opts = { ...DEFAULT_OPTIONS, ...options }
337
+ const detectedLanguage = language ?? this.codeExtractor.detectLanguage(code)
338
+ const codeBlocks = this.codeExtractor.parseCodeBlocks(code, detectedLanguage)
339
+ const chunks: Chunk[] = []
340
+
341
+ // If no blocks detected, fall back to line-based chunking
342
+ if (codeBlocks.length === 0) {
343
+ return this.chunkByLines(documentId, code, opts, detectedLanguage)
344
+ }
345
+
346
+ // Group related blocks (imports, then definitions)
347
+ const imports = codeBlocks.filter((b) => b.type === 'import')
348
+ const definitions = codeBlocks.filter((b) => b.type !== 'import')
349
+
350
+ // Create import chunk if there are imports
351
+ if (imports.length > 0) {
352
+ const firstImport = imports[0]
353
+ const lastImport = imports[imports.length - 1]
354
+ const importContent = imports.map((i) => i.content).join('\n')
355
+ if (importContent.length <= opts.maxChunkSize && firstImport && lastImport) {
356
+ chunks.push(
357
+ this.createChunk(
358
+ documentId,
359
+ importContent,
360
+ 'code_block',
361
+ {
362
+ index: 0,
363
+ start: firstImport.startLine,
364
+ end: lastImport.endLine,
365
+ lineStart: firstImport.startLine,
366
+ lineEnd: lastImport.endLine,
367
+ },
368
+ {
369
+ language: detectedLanguage,
370
+ }
371
+ )
372
+ )
373
+ }
374
+ }
375
+
376
+ // Create chunks for each code block
377
+ for (const block of definitions) {
378
+ const blockContent = block.docstring ? `${block.docstring}\n${block.content}` : block.content
379
+
380
+ if (blockContent.length <= opts.maxChunkSize) {
381
+ chunks.push(this.createCodeBlockChunk(documentId, block, chunks.length))
382
+ } else {
383
+ // Large function/class - split by methods or logical sections
384
+ const subChunks = this.splitLargeCodeBlock(documentId, block, opts, chunks.length)
385
+ chunks.push(...subChunks)
386
+ }
387
+ }
388
+
389
+ return chunks
390
+ }
391
+
392
+ /**
393
+ * Create a chunk from a code block
394
+ */
395
+ private createCodeBlockChunk(documentId: string, block: CodeBlock, index: number): Chunk {
396
+ const content = block.docstring ? `${block.docstring}\n${block.content}` : block.content
397
+
398
+ const chunkType: ChunkType =
399
+ block.type === 'class'
400
+ ? 'class'
401
+ : block.type === 'function' || block.type === 'method'
402
+ ? 'function'
403
+ : 'code_block'
404
+
405
+ return this.createChunk(
406
+ documentId,
407
+ content,
408
+ chunkType,
409
+ {
410
+ index,
411
+ start: block.startLine,
412
+ end: block.endLine,
413
+ lineStart: block.startLine,
414
+ lineEnd: block.endLine,
415
+ },
416
+ {
417
+ language: block.language,
418
+ functionName: block.type === 'function' || block.type === 'method' ? block.name : undefined,
419
+ className: block.type === 'class' ? block.name : block.parent,
420
+ }
421
+ )
422
+ }
423
+
424
+ /**
425
+ * Split large code block into smaller chunks
426
+ */
427
+ private splitLargeCodeBlock(
428
+ documentId: string,
429
+ block: CodeBlock,
430
+ options: Required<ChunkingOptions>,
431
+ startIndex: number
432
+ ): Chunk[] {
433
+ const chunks: Chunk[] = []
434
+ const lines = block.content.split('\n')
435
+ let currentContent = ''
436
+ let currentStartLine = block.startLine
437
+ let chunkIndex = startIndex
438
+
439
+ for (let i = 0; i < lines.length; i++) {
440
+ const line = lines[i] ?? ''
441
+ const testContent = currentContent + line + '\n'
442
+
443
+ if (testContent.length > options.maxChunkSize && currentContent.length > 0) {
444
+ chunks.push(
445
+ this.createChunk(
446
+ documentId,
447
+ currentContent.trim(),
448
+ 'code_block',
449
+ {
450
+ index: chunkIndex,
451
+ start: currentStartLine,
452
+ end: block.startLine + i - 1,
453
+ lineStart: currentStartLine,
454
+ lineEnd: block.startLine + i - 1,
455
+ },
456
+ {
457
+ language: block.language,
458
+ className: block.type === 'class' ? block.name : block.parent,
459
+ functionName: block.type === 'function' || block.type === 'method' ? block.name : undefined,
460
+ }
461
+ )
462
+ )
463
+ chunkIndex++
464
+ currentContent = line + '\n'
465
+ currentStartLine = block.startLine + i
466
+ } else {
467
+ currentContent = testContent
468
+ }
469
+ }
470
+
471
+ if (currentContent.trim().length > 0) {
472
+ chunks.push(
473
+ this.createChunk(
474
+ documentId,
475
+ currentContent.trim(),
476
+ 'code_block',
477
+ {
478
+ index: chunkIndex,
479
+ start: currentStartLine,
480
+ end: block.endLine,
481
+ lineStart: currentStartLine,
482
+ lineEnd: block.endLine,
483
+ },
484
+ {
485
+ language: block.language,
486
+ className: block.type === 'class' ? block.name : block.parent,
487
+ }
488
+ )
489
+ )
490
+ }
491
+
492
+ return chunks
493
+ }
494
+
495
+ /**
496
+ * Fallback: chunk by lines
497
+ */
498
+ private chunkByLines(
499
+ documentId: string,
500
+ code: string,
501
+ options: Required<ChunkingOptions>,
502
+ language: string
503
+ ): Chunk[] {
504
+ const chunks: Chunk[] = []
505
+ const lines = code.split('\n')
506
+ let currentContent = ''
507
+ let currentStartLine = 1
508
+ let chunkIndex = 0
509
+
510
+ for (let i = 0; i < lines.length; i++) {
511
+ const line = lines[i] ?? ''
512
+ const testContent = currentContent + line + '\n'
513
+
514
+ if (testContent.length > options.maxChunkSize && currentContent.length > 0) {
515
+ chunks.push(
516
+ this.createChunk(
517
+ documentId,
518
+ currentContent.trim(),
519
+ 'code_block',
520
+ {
521
+ index: chunkIndex,
522
+ start: currentStartLine,
523
+ end: i,
524
+ lineStart: currentStartLine,
525
+ lineEnd: i,
526
+ },
527
+ { language }
528
+ )
529
+ )
530
+ chunkIndex++
531
+ currentContent = line + '\n'
532
+ currentStartLine = i + 1
533
+ } else {
534
+ currentContent = testContent
535
+ }
536
+ }
537
+
538
+ if (currentContent.trim().length > 0) {
539
+ chunks.push(
540
+ this.createChunk(
541
+ documentId,
542
+ currentContent.trim(),
543
+ 'code_block',
544
+ {
545
+ index: chunkIndex,
546
+ start: currentStartLine,
547
+ end: lines.length,
548
+ lineStart: currentStartLine,
549
+ lineEnd: lines.length,
550
+ },
551
+ { language }
552
+ )
553
+ )
554
+ }
555
+
556
+ return chunks
557
+ }
558
+
559
+ /**
560
+ * Create a chunk object
561
+ */
562
+ private createChunk(
563
+ documentId: string,
564
+ content: string,
565
+ type: ChunkType,
566
+ position: ChunkPosition,
567
+ metadata: Partial<ChunkMetadata>
568
+ ): Chunk {
569
+ const words = content.split(/\s+/).filter((w) => w.length > 0)
570
+
571
+ return {
572
+ id: uuidv4(),
573
+ documentId,
574
+ content,
575
+ type,
576
+ position,
577
+ metadata: {
578
+ ...metadata,
579
+ wordCount: words.length,
580
+ charCount: content.length,
581
+ },
582
+ }
583
+ }
584
+
585
+ /**
586
+ * Split text into sentences
587
+ */
588
+ private splitIntoSentences(text: string): string[] {
589
+ const sentenceEnders = /([.!?]+)\s+/g
590
+ const sentences: string[] = []
591
+ let lastIndex = 0
592
+ let match: RegExpExecArray | null
593
+
594
+ while ((match = sentenceEnders.exec(text)) !== null) {
595
+ const matchGroup = match[1] ?? ''
596
+ const sentence = text.slice(lastIndex, match.index + matchGroup.length)
597
+ if (sentence.trim().length > 0) {
598
+ sentences.push(sentence.trim())
599
+ }
600
+ lastIndex = match.index + match[0].length
601
+ }
602
+
603
+ const remaining = text.slice(lastIndex).trim()
604
+ if (remaining.length > 0) {
605
+ sentences.push(remaining)
606
+ }
607
+
608
+ return sentences
609
+ }
610
+
611
+ /**
612
+ * Find position of text in content
613
+ */
614
+ private findPosition(fullText: string, searchText: string, startFrom: number): number {
615
+ const pos = fullText.indexOf(searchText, startFrom)
616
+ return pos >= 0 ? pos : startFrom
617
+ }
618
+
619
+ /**
620
+ * Merge small chunks together
621
+ */
622
+ mergeSmallChunks(chunks: Chunk[], minSize: number = 100): Chunk[] {
623
+ const merged: Chunk[] = []
624
+
625
+ for (const chunk of chunks) {
626
+ if (merged.length === 0) {
627
+ merged.push(chunk)
628
+ continue
629
+ }
630
+
631
+ const lastChunk = merged[merged.length - 1]
632
+ if (!lastChunk) {
633
+ merged.push(chunk)
634
+ continue
635
+ }
636
+
637
+ if (lastChunk.content.length < minSize || chunk.content.length < minSize) {
638
+ // Merge with previous
639
+ lastChunk.content += '\n\n' + chunk.content
640
+ lastChunk.position.end = chunk.position.end
641
+ lastChunk.metadata.charCount = lastChunk.content.length
642
+ lastChunk.metadata.wordCount = lastChunk.content.split(/\s+/).length
643
+ } else {
644
+ merged.push(chunk)
645
+ }
646
+ }
647
+
648
+ return merged
649
+ }
650
+ }