@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,650 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart chunking service - splits content into meaningful chunks
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { v4 as uuidv4 } from 'uuid'
|
|
6
|
+
import {
|
|
7
|
+
Chunk,
|
|
8
|
+
ChunkType,
|
|
9
|
+
ChunkPosition,
|
|
10
|
+
ChunkMetadata,
|
|
11
|
+
ChunkingOptions,
|
|
12
|
+
ContentType,
|
|
13
|
+
} from '../types/document.types.js'
|
|
14
|
+
import { MarkdownExtractor, MarkdownSection } from './extractors/markdown.extractor.js'
|
|
15
|
+
import { CodeExtractor, CodeBlock } from './extractors/code.extractor.js'
|
|
16
|
+
|
|
17
|
+
/**
 * Fallback chunking parameters, applied to every field the caller leaves unset.
 */
const DEFAULT_OPTIONS: Required<ChunkingOptions> = {
  /** Upper bound, in characters, that the chunkers try to keep each chunk under. */
  maxChunkSize: 1500,
  /** Trailing content shorter than this is merged into the previous chunk. */
  minChunkSize: 100,
  /** Number of characters carried over from the end of one chunk into the next. */
  overlap: 100,
  /** Structure-preservation flag; defaults to enabled. */
  preserveStructure: true,
}
|
|
23
|
+
|
|
24
|
+
/**
 * Splits document content into chunks.
 *
 * Three strategies are available, selected by content type in {@link ChunkingService.chunk}:
 * - markdown: one chunk per heading section, oversized sections split by paragraph/sentence
 * - code: one chunk per parsed code block, oversized blocks split on line boundaries
 * - anything else: paragraphs packed greedily up to `maxChunkSize`
 *
 * `maxChunkSize` is a target, not a hard limit: a single paragraph, sentence or
 * line that is longer than the limit on its own is emitted as one oversized chunk.
 */
export class ChunkingService {
  private readonly markdownExtractor: MarkdownExtractor
  private readonly codeExtractor: CodeExtractor

  constructor() {
    this.markdownExtractor = new MarkdownExtractor()
    this.codeExtractor = new CodeExtractor()
  }

  /**
   * Chunk content using the strategy that matches its content type.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param content - Raw document text.
   * @param contentType - `'markdown'` and `'code'` get dedicated strategies; every
   *   other value falls back to paragraph-based chunking.
   * @param options - Optional overrides for the default chunking parameters.
   * @returns The chunks in document order.
   */
  chunk(documentId: string, content: string, contentType: ContentType, options?: ChunkingOptions): Chunk[] {
    const opts = this.resolveOptions(options)

    switch (contentType) {
      case 'markdown':
        return this.chunkByHeadings(documentId, content, opts)
      case 'code':
        return this.chunkByAST(documentId, content, opts)
      default:
        return this.chunkBySemanticSections(documentId, content, opts)
    }
  }

  /**
   * Chunk plain text by packing blank-line-separated paragraphs together.
   *
   * Paragraphs are trimmed and re-joined with a blank line. When the next
   * paragraph would push the chunk past `maxChunkSize`, the chunk is emitted and
   * the next one starts with the last `overlap` characters of the previous chunk.
   * A trailing remainder shorter than `minChunkSize` is merged into the previous
   * chunk; if there is no previous chunk it is emitted on its own so that short
   * documents are never dropped.
   *
   * Positions are character offsets into `text`. They are exact for the start of
   * a chunk that begins on a paragraph and approximate otherwise, because
   * whitespace between paragraphs is normalised in the chunk content.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param text - Raw text to split.
   * @param options - Optional overrides for the default chunking parameters.
   * @returns The chunks in document order; empty when `text` has no non-blank content.
   */
  chunkBySemanticSections(documentId: string, text: string, options?: ChunkingOptions): Chunk[] {
    const opts = this.resolveOptions(options)
    const chunks: Chunk[] = []

    const paragraphs = text
      .split(/\n\n+/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0)

    let currentContent = ''
    let currentStart = 0
    // Length of the "<overlap>\n\n" prefix at the start of currentContent, if any.
    let overlapPrefixLength = 0

    const emitCurrent = (): void => {
      chunks.push(
        this.createChunk(
          documentId,
          currentContent,
          'paragraph',
          {
            index: chunks.length,
            start: currentStart,
            end: currentStart + currentContent.length,
          },
          {}
        )
      )
    }

    for (const paragraph of paragraphs) {
      if (currentContent.length === 0) {
        // First paragraph: locate it so leading whitespace in `text` is accounted for.
        currentContent = paragraph
        currentStart = this.findPosition(text, paragraph, 0)
        continue
      }

      const candidate = `${currentContent}\n\n${paragraph}`

      if (candidate.length > opts.maxChunkSize) {
        emitCurrent()

        if (opts.overlap > 0 && currentContent.length > opts.overlap) {
          const overlapText = currentContent.slice(-opts.overlap)
          currentStart = currentStart + currentContent.length - opts.overlap
          currentContent = `${overlapText}\n\n${paragraph}`
          overlapPrefixLength = overlapText.length + 2
        } else {
          // Search from the end of the emitted chunk so an identical paragraph
          // earlier in the text is not matched by mistake.
          currentStart = this.findPosition(text, paragraph, currentStart + currentContent.length)
          currentContent = paragraph
          overlapPrefixLength = 0
        }
      } else {
        currentContent = candidate
      }
    }

    if (currentContent.length === 0) {
      return chunks
    }

    if (currentContent.length >= opts.minChunkSize || chunks.length === 0) {
      emitCurrent()
      return chunks
    }

    // Remainder is too small to stand alone: fold it into the previous chunk.
    const lastChunk = chunks[chunks.length - 1]
    if (lastChunk) {
      // The overlap prefix is already the tail of lastChunk; do not append it twice.
      const tail = currentContent.slice(overlapPrefixLength)
      lastChunk.content += `\n\n${tail}`
      lastChunk.position.end += tail.length + 2
      lastChunk.metadata.charCount = lastChunk.content.length
      lastChunk.metadata.wordCount = this.countWords(lastChunk.content)
    }

    return chunks
  }

  /**
   * Chunk markdown into one chunk per heading section.
   *
   * Each chunk's content is the re-rendered heading line followed by the section
   * body. Sections longer than `maxChunkSize` are split further, with the heading
   * repeated on each paragraph-level piece. Positions use the section's line
   * range as reported by the markdown extractor.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param markdown - Markdown source.
   * @param options - Optional overrides for the default chunking parameters.
   * @returns The chunks in section order.
   */
  chunkByHeadings(documentId: string, markdown: string, options?: ChunkingOptions): Chunk[] {
    const opts = this.resolveOptions(options)
    const sections = this.markdownExtractor.parseSections(markdown)
    const flatSections = this.markdownExtractor.flattenSections(sections)
    const chunks: Chunk[] = []

    for (const section of flatSections) {
      const fullContent = this.headingPrefix(section) + section.content

      if (fullContent.length > opts.maxChunkSize) {
        // Section too large, split by paragraphs with heading context
        chunks.push(...this.splitLargeSection(documentId, section, opts, chunks.length))
        continue
      }

      chunks.push(
        this.createChunk(
          documentId,
          fullContent,
          section.level > 0 ? 'heading' : 'section',
          {
            index: chunks.length,
            start: section.startLine,
            end: section.endLine,
            lineStart: section.startLine,
            lineEnd: section.endLine,
          },
          {
            headingLevel: section.level,
            headingText: section.heading,
          }
        )
      )
    }

    return chunks
  }

  /**
   * Split an oversized markdown section into paragraph-packed chunks.
   *
   * Every chunk starts with the section's heading line. All chunks report the
   * whole section's line range, since paragraph line numbers are not tracked.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param section - The section to split.
   * @param options - Fully resolved chunking parameters.
   * @param startIndex - Index assigned to the first produced chunk.
   * @returns The chunks for this section, indexed consecutively from `startIndex`.
   */
  private splitLargeSection(
    documentId: string,
    section: MarkdownSection,
    options: Required<ChunkingOptions>,
    startIndex: number
  ): Chunk[] {
    const chunks: Chunk[] = []
    const headingPrefix = this.headingPrefix(section)

    const paragraphs = section.content
      .split(/\n\n+/)
      .map((p) => p.trim())
      .filter((p) => p.length > 0)

    let currentContent = headingPrefix
    let chunkIndex = startIndex

    // Emits the accumulated paragraphs (if any) and resets to the bare heading.
    const emitCurrent = (): void => {
      if (currentContent.length <= headingPrefix.length) {
        return
      }
      chunks.push(
        this.createChunk(
          documentId,
          currentContent.trim(),
          'section',
          {
            index: chunkIndex,
            start: section.startLine,
            end: section.endLine,
            lineStart: section.startLine,
            lineEnd: section.endLine,
          },
          {
            headingLevel: section.level,
            headingText: section.heading,
          }
        )
      )
      chunkIndex++
      currentContent = headingPrefix
    }

    for (const paragraph of paragraphs) {
      const candidate = currentContent + paragraph + '\n\n'

      if (candidate.length > options.maxChunkSize) {
        emitCurrent()

        // If single paragraph is too large, split it by sentences
        if (paragraph.length > options.maxChunkSize) {
          const subChunks = this.splitLargeParagraph(documentId, paragraph, options, chunkIndex, section)
          chunks.push(...subChunks)
          chunkIndex += subChunks.length
          continue
        }
      }

      currentContent += paragraph + '\n\n'
    }

    emitCurrent()

    return chunks
  }

  /**
   * Split an oversized paragraph into sentence-packed chunks.
   *
   * After each emitted chunk, roughly `overlap / 6` trailing words (assuming
   * about six characters per word) are carried into the next chunk. Positions
   * are relative to the chunk itself (`start` is always 0).
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param paragraph - The paragraph text to split.
   * @param options - Fully resolved chunking parameters.
   * @param startIndex - Index assigned to the first produced chunk.
   * @param section - Owning markdown section; when given, its heading is recorded
   *   in every produced chunk's metadata.
   * @returns The chunks for this paragraph, indexed consecutively from `startIndex`.
   */
  private splitLargeParagraph(
    documentId: string,
    paragraph: string,
    options: Required<ChunkingOptions>,
    startIndex: number,
    section?: MarkdownSection
  ): Chunk[] {
    const chunks: Chunk[] = []
    const sentences = this.splitIntoSentences(paragraph)
    const headingMetadata: Partial<ChunkMetadata> = section
      ? { headingLevel: section.level, headingText: section.heading }
      : {}
    const overlapWordCount = Math.floor(options.overlap / 6) // Approx 6 chars per word

    let currentContent = ''
    let chunkIndex = startIndex

    const emitCurrent = (): void => {
      const content = currentContent.trim()
      chunks.push(
        this.createChunk(
          documentId,
          content,
          'paragraph',
          {
            index: chunkIndex,
            start: 0,
            end: content.length,
          },
          { ...headingMetadata }
        )
      )
      chunkIndex++
    }

    for (const sentence of sentences) {
      const candidate = currentContent + sentence + ' '

      if (candidate.length > options.maxChunkSize && currentContent.length > 0) {
        emitCurrent()

        // Guard the zero case explicitly: slice(-0) is slice(0) and would carry
        // the entire previous chunk forward.
        const overlapText =
          overlapWordCount > 0 ? currentContent.trim().split(/\s+/).slice(-overlapWordCount).join(' ') : ''
        currentContent = overlapText.length > 0 ? `${overlapText} ${sentence} ` : `${sentence} `
      } else {
        currentContent = candidate
      }
    }

    if (currentContent.trim().length > 0) {
      emitCurrent()
    }

    return chunks
  }

  /**
   * Chunk source code along the blocks reported by the code extractor.
   *
   * Import blocks are combined into a single leading chunk when they fit within
   * `maxChunkSize`; otherwise each import block is chunked individually so that
   * no import is lost. Every other block becomes one chunk, or is split on line
   * boundaries when it is too large. When the extractor finds no blocks at all
   * the code is chunked purely by lines.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param code - Source code text.
   * @param options - Optional overrides for the default chunking parameters.
   * @param language - Language of `code`; detected by the code extractor when omitted.
   * @returns The chunks, imports first, then the remaining blocks in extractor order.
   */
  chunkByAST(documentId: string, code: string, options?: ChunkingOptions, language?: string): Chunk[] {
    const opts = this.resolveOptions(options)
    const detectedLanguage = language ?? this.codeExtractor.detectLanguage(code)
    const codeBlocks = this.codeExtractor.parseCodeBlocks(code, detectedLanguage)

    // If no blocks detected, fall back to line-based chunking
    if (codeBlocks.length === 0) {
      return this.chunkByLines(documentId, code, opts, detectedLanguage)
    }

    const chunks: Chunk[] = []
    const imports = codeBlocks.filter((b) => b.type === 'import')
    const definitions = codeBlocks.filter((b) => b.type !== 'import')
    let pendingBlocks = definitions

    const firstImport = imports[0]
    const lastImport = imports[imports.length - 1]
    if (firstImport && lastImport) {
      const importContent = imports.map((b) => b.content).join('\n')

      if (importContent.length <= opts.maxChunkSize) {
        chunks.push(
          this.createChunk(
            documentId,
            importContent,
            'code_block',
            {
              index: 0,
              start: firstImport.startLine,
              end: lastImport.endLine,
              lineStart: firstImport.startLine,
              lineEnd: lastImport.endLine,
            },
            {
              language: detectedLanguage,
            }
          )
        )
      } else {
        // Combined imports are too large for one chunk: handle them one by one
        // like any other block instead of dropping them.
        pendingBlocks = [...imports, ...definitions]
      }
    }

    for (const block of pendingBlocks) {
      const blockContent = block.docstring ? `${block.docstring}\n${block.content}` : block.content

      if (blockContent.length <= opts.maxChunkSize) {
        chunks.push(this.createCodeBlockChunk(documentId, block, chunks.length))
      } else {
        // Large function/class - split on line boundaries
        chunks.push(...this.splitLargeCodeBlock(documentId, block, opts, chunks.length))
      }
    }

    return chunks
  }

  /**
   * Build a single chunk from a code block, prefixing its docstring when present.
   *
   * @param documentId - Identifier stored on the chunk.
   * @param block - The parsed code block.
   * @param index - Index assigned to the chunk.
   * @returns A chunk typed `'class'`, `'function'` (for functions and methods) or
   *   `'code_block'` (for everything else).
   */
  private createCodeBlockChunk(documentId: string, block: CodeBlock, index: number): Chunk {
    const content = block.docstring ? `${block.docstring}\n${block.content}` : block.content

    let chunkType: ChunkType = 'code_block'
    if (block.type === 'class') {
      chunkType = 'class'
    } else if (block.type === 'function' || block.type === 'method') {
      chunkType = 'function'
    }

    return this.createChunk(
      documentId,
      content,
      chunkType,
      {
        index,
        start: block.startLine,
        end: block.endLine,
        lineStart: block.startLine,
        lineEnd: block.endLine,
      },
      this.codeBlockMetadata(block)
    )
  }

  /**
   * Split an oversized code block into line-packed `'code_block'` chunks.
   *
   * The block's docstring, when present, is kept at the top of the first chunk,
   * matching what is produced for blocks that fit in a single chunk.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param block - The parsed code block to split.
   * @param options - Fully resolved chunking parameters.
   * @param startIndex - Index assigned to the first produced chunk.
   * @returns The chunks for this block, indexed consecutively from `startIndex`.
   */
  private splitLargeCodeBlock(
    documentId: string,
    block: CodeBlock,
    options: Required<ChunkingOptions>,
    startIndex: number
  ): Chunk[] {
    return this.packLines(documentId, block.content.split('\n'), {
      firstLine: block.startLine,
      lastLine: block.endLine,
      maxChunkSize: options.maxChunkSize,
      startIndex,
      metadata: this.codeBlockMetadata(block),
      leadingContent: block.docstring ? `${block.docstring}\n` : '',
    })
  }

  /**
   * Fallback: chunk code purely by lines when no code blocks were detected.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param code - Source code text.
   * @param options - Fully resolved chunking parameters.
   * @param language - Language recorded in every chunk's metadata.
   * @returns The chunks, indexed from 0, with 1-based line ranges.
   */
  private chunkByLines(
    documentId: string,
    code: string,
    options: Required<ChunkingOptions>,
    language: string
  ): Chunk[] {
    const lines = code.split('\n')

    return this.packLines(documentId, lines, {
      firstLine: 1,
      lastLine: lines.length,
      maxChunkSize: options.maxChunkSize,
      startIndex: 0,
      metadata: { language },
    })
  }

  /**
   * Pack consecutive lines into `'code_block'` chunks of at most `maxChunkSize`
   * characters (a single longer line still becomes its own chunk).
   *
   * Chunk content is trimmed; buffers that contain only whitespace are skipped
   * rather than emitted as empty chunks.
   *
   * @param documentId - Identifier stored on every produced chunk.
   * @param lines - The lines to pack, in order.
   * @param run - `firstLine` is the line number of `lines[0]`; `lastLine` is the
   *   line number reported as the end of the final chunk; `leadingContent` is
   *   text placed ahead of the first line (it does not advance line numbers).
   * @returns The chunks, indexed consecutively from `run.startIndex`.
   */
  private packLines(
    documentId: string,
    lines: string[],
    run: {
      firstLine: number
      lastLine: number
      maxChunkSize: number
      startIndex: number
      metadata: Partial<ChunkMetadata>
      leadingContent?: string
    }
  ): Chunk[] {
    const chunks: Chunk[] = []
    let currentContent = run.leadingContent ?? ''
    let currentStartLine = run.firstLine
    let chunkIndex = run.startIndex

    const emitCurrent = (endLine: number): void => {
      const content = currentContent.trim()
      if (content.length === 0) {
        return
      }
      chunks.push(
        this.createChunk(
          documentId,
          content,
          'code_block',
          {
            index: chunkIndex,
            start: currentStartLine,
            end: endLine,
            lineStart: currentStartLine,
            lineEnd: endLine,
          },
          { ...run.metadata }
        )
      )
      chunkIndex++
    }

    lines.forEach((line, offset) => {
      const candidate = currentContent + line + '\n'

      if (candidate.length > run.maxChunkSize && currentContent.length > 0) {
        // The chunk ends on the previous line. Clamp for the case where only
        // leading content (no actual line yet) has been buffered.
        emitCurrent(Math.max(currentStartLine, run.firstLine + offset - 1))
        currentContent = line + '\n'
        currentStartLine = run.firstLine + offset
      } else {
        currentContent = candidate
      }
    })

    emitCurrent(run.lastLine)

    return chunks
  }

  /**
   * Assemble a chunk object with a fresh UUID and computed size metadata.
   *
   * @param documentId - Identifier of the owning document.
   * @param content - Chunk text.
   * @param type - Chunk classification.
   * @param position - Index and offsets of the chunk.
   * @param metadata - Extra metadata; `wordCount` and `charCount` are always
   *   overwritten with values computed from `content`.
   * @returns The new chunk.
   */
  private createChunk(
    documentId: string,
    content: string,
    type: ChunkType,
    position: ChunkPosition,
    metadata: Partial<ChunkMetadata>
  ): Chunk {
    return {
      id: uuidv4(),
      documentId,
      content,
      type,
      position,
      metadata: {
        ...metadata,
        wordCount: this.countWords(content),
        charCount: content.length,
      },
    }
  }

  /**
   * Split text into sentences.
   *
   * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace;
   * the terminator stays with its sentence. Trailing text without a terminator
   * becomes the last sentence.
   *
   * @param text - Text to split.
   * @returns Trimmed, non-empty sentences in order.
   */
  private splitIntoSentences(text: string): string[] {
    const sentenceEnders = /([.!?]+)\s+/g
    const sentences: string[] = []
    let lastIndex = 0
    let match: RegExpExecArray | null

    while ((match = sentenceEnders.exec(text)) !== null) {
      const terminator = match[1] ?? ''
      const sentence = text.slice(lastIndex, match.index + terminator.length).trim()
      if (sentence.length > 0) {
        sentences.push(sentence)
      }
      lastIndex = match.index + match[0].length
    }

    const remaining = text.slice(lastIndex).trim()
    if (remaining.length > 0) {
      sentences.push(remaining)
    }

    return sentences
  }

  /**
   * Locate `searchText` in `fullText`, starting at `startFrom`.
   *
   * @returns The offset of the first match at or after `startFrom`, or
   *   `startFrom` itself when there is no match.
   */
  private findPosition(fullText: string, searchText: string, startFrom: number): number {
    const pos = fullText.indexOf(searchText, startFrom)
    return pos >= 0 ? pos : startFrom
  }

  /**
   * Merge neighbouring chunks whenever either of the pair is shorter than `minSize`.
   *
   * A merged chunk keeps the id, type and start of the earlier chunk and takes
   * the end (and `lineEnd`, when set) of the later one. The input array and its
   * chunk objects are left untouched; the result contains copies.
   *
   * @param chunks - Chunks in document order.
   * @param minSize - Minimum content length, in characters, for a chunk to stand alone.
   * @returns A new array of merged chunks.
   */
  mergeSmallChunks(chunks: Chunk[], minSize: number = 100): Chunk[] {
    const merged: Chunk[] = []

    for (const chunk of chunks) {
      const previous = merged[merged.length - 1]

      if (!previous || (previous.content.length >= minSize && chunk.content.length >= minSize)) {
        // Copy so that later merges never mutate the caller's objects.
        merged.push({
          ...chunk,
          position: { ...chunk.position },
          metadata: { ...chunk.metadata },
        })
        continue
      }

      previous.content += '\n\n' + chunk.content
      previous.position.end = chunk.position.end
      if (chunk.position.lineEnd !== undefined) {
        previous.position.lineEnd = chunk.position.lineEnd
      }
      previous.metadata.charCount = previous.content.length
      previous.metadata.wordCount = this.countWords(previous.content)
    }

    return merged
  }

  /**
   * Fill in defaults for every option that is missing or explicitly `undefined`.
   */
  private resolveOptions(options?: ChunkingOptions): Required<ChunkingOptions> {
    return {
      maxChunkSize: options?.maxChunkSize ?? DEFAULT_OPTIONS.maxChunkSize,
      minChunkSize: options?.minChunkSize ?? DEFAULT_OPTIONS.minChunkSize,
      overlap: options?.overlap ?? DEFAULT_OPTIONS.overlap,
      preserveStructure: options?.preserveStructure ?? DEFAULT_OPTIONS.preserveStructure,
    }
  }

  /**
   * Render a section's heading line followed by a blank line, or `''` when the
   * section has no heading.
   */
  private headingPrefix(section: MarkdownSection): string {
    return section.heading ? `${'#'.repeat(section.level)} ${section.heading}\n\n` : ''
  }

  /**
   * Metadata shared by every chunk derived from a code block.
   */
  private codeBlockMetadata(block: CodeBlock): Partial<ChunkMetadata> {
    return {
      language: block.language,
      functionName: block.type === 'function' || block.type === 'method' ? block.name : undefined,
      className: block.type === 'class' ? block.name : block.parent,
    }
  }

  /**
   * Count whitespace-separated, non-empty tokens.
   */
  private countWords(content: string): number {
    return content.split(/\s+/).filter((w) => w.length > 0).length
  }
}
|