@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Chunking Service
|
|
3
|
+
*
|
|
4
|
+
* Splits content into manageable chunks for embedding and indexing.
|
|
5
|
+
* Supports multiple content types with specialized chunking strategies.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Provenance and positioning metadata attached to every chunk.
 */
export interface ChunkMetadata {
  // Zero-based ordinal of this chunk within the parent document.
  position: number
  // Identifier of the document the chunk was extracted from.
  parentDocumentId: string
  // Which chunking strategy family produced the chunk.
  contentType: 'markdown' | 'code' | 'text'
  language?: string // For code chunks (heuristically detected, may be 'unknown')
  heading?: string // For markdown chunks (heading of the owning section)
  // Character offsets into the source content.
  // NOTE(review): the semantic chunker tracks these approximately when
  // overlap is used — confirm before relying on them for exact slicing.
  startOffset: number
  endOffset: number
}
|
|
17
|
+
|
|
18
|
+
/**
 * A single unit of content produced by the chunker, ready for
 * embedding and indexing.
 */
export interface Chunk {
  // The chunk text itself.
  content: string
  // Provenance and position information (see ChunkMetadata).
  metadata: ChunkMetadata
  // Estimated token count (~4 characters per token heuristic).
  tokenCount: number
}
|
|
23
|
+
|
|
24
|
+
/**
 * Tuning options accepted by chunkContent.
 */
export interface ChunkingOptions {
  chunkSize?: number // Default: 512 tokens (~2048 characters)
  overlap?: number // Default: 50 tokens
  // Force a specific strategy; when omitted the type is auto-detected
  // from the content via detectContentType.
  contentType?: 'markdown' | 'code' | 'text'
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Content type detection based on content analysis
|
|
32
|
+
*/
|
|
33
|
+
export function detectContentType(content: string): 'markdown' | 'code' | 'text' {
|
|
34
|
+
// Markdown indicators
|
|
35
|
+
const markdownPatterns = [
|
|
36
|
+
/^#{1,6}\s+/m, // Headers
|
|
37
|
+
/\[.*?\]\(.*?\)/, // Links
|
|
38
|
+
/```[\s\S]*?```/, // Code blocks
|
|
39
|
+
/^\*\s+/m, // Unordered lists
|
|
40
|
+
/^\d+\.\s+/m, // Ordered lists
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
const markdownScore = markdownPatterns.filter((pattern) => pattern.test(content)).length
|
|
44
|
+
|
|
45
|
+
// Code indicators
|
|
46
|
+
const codePatterns = [
|
|
47
|
+
/^(import|export|from|require)\s+/m,
|
|
48
|
+
/^(function|const|let|var|class|interface|type)\s+/m,
|
|
49
|
+
/[{};()]/g,
|
|
50
|
+
/^(public|private|protected|async|await)\s+/m,
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
const codeScore = codePatterns.filter((pattern) => pattern.test(content)).length
|
|
54
|
+
|
|
55
|
+
// Determine content type
|
|
56
|
+
if (markdownScore >= 2) return 'markdown'
|
|
57
|
+
if (codeScore >= 2) return 'code'
|
|
58
|
+
return 'text'
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/**
|
|
62
|
+
* Estimate token count (rough approximation: 1 token ≈ 4 characters)
|
|
63
|
+
*/
|
|
64
|
+
function estimateTokens(text: string): number {
|
|
65
|
+
return Math.ceil(text.length / 4)
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/**
 * Semantic chunking: split by paragraphs and sections.
 *
 * Greedily packs consecutive paragraphs (separated by blank lines) into
 * chunks of at most `chunkSize` estimated tokens, carrying the last
 * `overlap` words of each full chunk into the next one. Paragraphs that
 * are individually larger than `chunkSize` are split word-by-word.
 *
 * NOTE(review): offsets are approximate. `split(/\n\n+/)` discards the
 * separators and `currentOffset` never adds their length back, and when
 * overlap text is prepended the same characters are counted in two
 * consecutive chunks — confirm before using startOffset/endOffset for
 * exact slicing of the original content.
 */
function chunkSemantic(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
  const chunks: Chunk[] = []
  const paragraphs = content.split(/\n\n+/)

  // If no paragraph breaks exist and content is large, use fixed chunking directly
  if (paragraphs.length === 1 && estimateTokens(content) > chunkSize) {
    return chunkFixed(content, parentDocumentId, chunkSize, overlap)
  }

  // Accumulator for the chunk being built, its starting offset, and the
  // next chunk ordinal to assign.
  let currentChunk = ''
  let currentOffset = 0
  let position = 0

  for (let i = 0; i < paragraphs.length; i++) {
    const paragraph = paragraphs[i]
    if (!paragraph) continue // skip empty splits

    // If a single paragraph is too large, split it by words
    if (estimateTokens(paragraph) > chunkSize) {
      // First, save current chunk if exists
      if (currentChunk) {
        chunks.push({
          content: currentChunk,
          metadata: {
            position,
            parentDocumentId,
            contentType: 'text',
            startOffset: currentOffset,
            endOffset: currentOffset + currentChunk.length,
          },
          tokenCount: estimateTokens(currentChunk),
        })
        position++
        currentOffset += currentChunk.length
        currentChunk = ''
      }

      // Split large paragraph by words
      const words = paragraph.split(/\s+/)
      let wordChunk = ''
      let wordOffset = currentOffset

      for (const word of words) {
        // Tentatively append the next word; flush when it would overflow.
        const testChunk = wordChunk ? `${wordChunk} ${word}` : word
        if (estimateTokens(testChunk) <= chunkSize) {
          wordChunk = testChunk
        } else {
          if (wordChunk) {
            chunks.push({
              content: wordChunk,
              metadata: {
                position,
                parentDocumentId,
                contentType: 'text',
                startOffset: wordOffset,
                endOffset: wordOffset + wordChunk.length,
              },
              tokenCount: estimateTokens(wordChunk),
            })
            position++
            wordOffset += wordChunk.length + 1 // +1 for space
          }
          // The word that overflowed starts the next word-chunk.
          wordChunk = word
        }
      }

      // Flush the trailing word-chunk, if any.
      if (wordChunk) {
        chunks.push({
          content: wordChunk,
          metadata: {
            position,
            parentDocumentId,
            contentType: 'text',
            startOffset: wordOffset,
            endOffset: wordOffset + wordChunk.length,
          },
          tokenCount: estimateTokens(wordChunk),
        })
        position++
      }

      currentOffset += paragraph.length
      continue
    }

    // Paragraph fits on its own: try to append it to the current chunk.
    const combined = currentChunk ? `${currentChunk}\n\n${paragraph}` : paragraph

    if (estimateTokens(combined) <= chunkSize) {
      currentChunk = combined
    } else {
      // Save current chunk
      const tokenCount = estimateTokens(currentChunk)
      chunks.push({
        content: currentChunk,
        metadata: {
          position,
          parentDocumentId,
          contentType: 'text',
          startOffset: currentOffset,
          endOffset: currentOffset + currentChunk.length,
        },
        tokenCount,
      })

      position++
      currentOffset += currentChunk.length

      // Start new chunk with overlap
      const overlapText = currentChunk.split(/\s+/).slice(-overlap).join(' ')
      currentChunk = overlapText ? `${overlapText}\n\n${paragraph}` : paragraph
    }
  }

  // Add final chunk
  if (currentChunk) {
    chunks.push({
      content: currentChunk,
      metadata: {
        position,
        parentDocumentId,
        contentType: 'text',
        startOffset: currentOffset,
        endOffset: currentOffset + currentChunk.length,
      },
      tokenCount: estimateTokens(currentChunk),
    })
  }

  return chunks
}
|
|
201
|
+
|
|
202
|
+
/**
|
|
203
|
+
* Markdown chunking: split by heading hierarchy
|
|
204
|
+
*/
|
|
205
|
+
function chunkMarkdown(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
|
|
206
|
+
const chunks: Chunk[] = []
|
|
207
|
+
const sections: Array<{ heading: string; content: string; level: number }> = []
|
|
208
|
+
|
|
209
|
+
// Split by headers
|
|
210
|
+
const lines = content.split('\n')
|
|
211
|
+
let currentSection = { heading: '', content: '', level: 0 }
|
|
212
|
+
|
|
213
|
+
for (const line of lines) {
|
|
214
|
+
const headerMatch = line.match(/^(#{1,6})\s+(.+)$/)
|
|
215
|
+
|
|
216
|
+
if (headerMatch && headerMatch[1] && headerMatch[2]) {
|
|
217
|
+
if (currentSection.content) {
|
|
218
|
+
sections.push({ ...currentSection })
|
|
219
|
+
}
|
|
220
|
+
currentSection = {
|
|
221
|
+
heading: headerMatch[2],
|
|
222
|
+
content: line + '\n',
|
|
223
|
+
level: headerMatch[1].length,
|
|
224
|
+
}
|
|
225
|
+
} else {
|
|
226
|
+
currentSection.content += line + '\n'
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
if (currentSection.content) {
|
|
231
|
+
sections.push(currentSection)
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// Convert sections to chunks
|
|
235
|
+
let position = 0
|
|
236
|
+
let currentOffset = 0
|
|
237
|
+
|
|
238
|
+
for (const section of sections) {
|
|
239
|
+
const tokenCount = estimateTokens(section.content)
|
|
240
|
+
|
|
241
|
+
if (tokenCount <= chunkSize) {
|
|
242
|
+
chunks.push({
|
|
243
|
+
content: section.content.trim(),
|
|
244
|
+
metadata: {
|
|
245
|
+
position,
|
|
246
|
+
parentDocumentId,
|
|
247
|
+
contentType: 'markdown',
|
|
248
|
+
heading: section.heading,
|
|
249
|
+
startOffset: currentOffset,
|
|
250
|
+
endOffset: currentOffset + section.content.length,
|
|
251
|
+
},
|
|
252
|
+
tokenCount,
|
|
253
|
+
})
|
|
254
|
+
position++
|
|
255
|
+
} else {
|
|
256
|
+
// Section too large, split further with semantic chunking
|
|
257
|
+
const subChunks = chunkSemantic(section.content, parentDocumentId, chunkSize, overlap)
|
|
258
|
+
for (const chunk of subChunks) {
|
|
259
|
+
chunks.push({
|
|
260
|
+
...chunk,
|
|
261
|
+
metadata: {
|
|
262
|
+
...chunk.metadata,
|
|
263
|
+
contentType: 'markdown',
|
|
264
|
+
heading: section.heading,
|
|
265
|
+
position,
|
|
266
|
+
},
|
|
267
|
+
})
|
|
268
|
+
position++
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
currentOffset += section.content.length
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
return chunks
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* Code chunking: AST-aware with scope preservation
|
|
280
|
+
*/
|
|
281
|
+
function chunkCode(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
|
|
282
|
+
const chunks: Chunk[] = []
|
|
283
|
+
|
|
284
|
+
// Detect language
|
|
285
|
+
let language = 'unknown'
|
|
286
|
+
if (content.includes('function') || content.includes('const')) language = 'javascript'
|
|
287
|
+
if (content.includes('def ') || content.includes('import ')) language = 'python'
|
|
288
|
+
if (content.includes('func ') || content.includes('package ')) language = 'go'
|
|
289
|
+
|
|
290
|
+
// Split by function/class boundaries
|
|
291
|
+
const codeBlocks: string[] = []
|
|
292
|
+
const functionPattern = /^(function|const|let|var|class|def|func|export|public|private)\s+/gm
|
|
293
|
+
const matches = [...content.matchAll(functionPattern)]
|
|
294
|
+
|
|
295
|
+
if (matches.length === 0) {
|
|
296
|
+
// No clear function boundaries, use semantic chunking
|
|
297
|
+
return chunkSemantic(content, parentDocumentId, chunkSize, overlap)
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
let lastIndex = 0
|
|
301
|
+
for (let i = 0; i < matches.length; i++) {
|
|
302
|
+
const match = matches[i]
|
|
303
|
+
if (!match) continue
|
|
304
|
+
const startIndex = match.index || 0
|
|
305
|
+
|
|
306
|
+
if (i > 0) {
|
|
307
|
+
codeBlocks.push(content.substring(lastIndex, startIndex))
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
lastIndex = startIndex
|
|
311
|
+
}
|
|
312
|
+
codeBlocks.push(content.substring(lastIndex))
|
|
313
|
+
|
|
314
|
+
// Convert blocks to chunks
|
|
315
|
+
let position = 0
|
|
316
|
+
let currentOffset = 0
|
|
317
|
+
|
|
318
|
+
for (const block of codeBlocks) {
|
|
319
|
+
if (!block.trim()) continue
|
|
320
|
+
|
|
321
|
+
const tokenCount = estimateTokens(block)
|
|
322
|
+
|
|
323
|
+
if (tokenCount <= chunkSize) {
|
|
324
|
+
chunks.push({
|
|
325
|
+
content: block.trim(),
|
|
326
|
+
metadata: {
|
|
327
|
+
position,
|
|
328
|
+
parentDocumentId,
|
|
329
|
+
contentType: 'code',
|
|
330
|
+
language,
|
|
331
|
+
startOffset: currentOffset,
|
|
332
|
+
endOffset: currentOffset + block.length,
|
|
333
|
+
},
|
|
334
|
+
tokenCount,
|
|
335
|
+
})
|
|
336
|
+
position++
|
|
337
|
+
} else {
|
|
338
|
+
// Block too large, split by lines
|
|
339
|
+
const lines = block.split('\n')
|
|
340
|
+
let currentChunk = ''
|
|
341
|
+
let chunkStart = currentOffset
|
|
342
|
+
|
|
343
|
+
for (const line of lines) {
|
|
344
|
+
const combined = currentChunk ? `${currentChunk}\n${line}` : line
|
|
345
|
+
|
|
346
|
+
if (estimateTokens(combined) <= chunkSize) {
|
|
347
|
+
currentChunk = combined
|
|
348
|
+
} else {
|
|
349
|
+
if (currentChunk) {
|
|
350
|
+
chunks.push({
|
|
351
|
+
content: currentChunk,
|
|
352
|
+
metadata: {
|
|
353
|
+
position,
|
|
354
|
+
parentDocumentId,
|
|
355
|
+
contentType: 'code',
|
|
356
|
+
language,
|
|
357
|
+
startOffset: chunkStart,
|
|
358
|
+
endOffset: chunkStart + currentChunk.length,
|
|
359
|
+
},
|
|
360
|
+
tokenCount: estimateTokens(currentChunk),
|
|
361
|
+
})
|
|
362
|
+
position++
|
|
363
|
+
chunkStart += currentChunk.length
|
|
364
|
+
}
|
|
365
|
+
currentChunk = line
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
if (currentChunk) {
|
|
370
|
+
chunks.push({
|
|
371
|
+
content: currentChunk,
|
|
372
|
+
metadata: {
|
|
373
|
+
position,
|
|
374
|
+
parentDocumentId,
|
|
375
|
+
contentType: 'code',
|
|
376
|
+
language,
|
|
377
|
+
startOffset: chunkStart,
|
|
378
|
+
endOffset: chunkStart + currentChunk.length,
|
|
379
|
+
},
|
|
380
|
+
tokenCount: estimateTokens(currentChunk),
|
|
381
|
+
})
|
|
382
|
+
position++
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
currentOffset += block.length
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
return chunks
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/**
|
|
393
|
+
* Fixed-size chunking with overlap (fallback)
|
|
394
|
+
*/
|
|
395
|
+
function chunkFixed(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
|
|
396
|
+
const chunks: Chunk[] = []
|
|
397
|
+
const charSize = chunkSize * 4 // ~4 chars per token
|
|
398
|
+
const overlapSize = overlap * 4
|
|
399
|
+
|
|
400
|
+
let position = 0
|
|
401
|
+
let offset = 0
|
|
402
|
+
|
|
403
|
+
while (offset < content.length) {
|
|
404
|
+
const end = Math.min(offset + charSize, content.length)
|
|
405
|
+
const chunkText = content.substring(offset, end)
|
|
406
|
+
|
|
407
|
+
chunks.push({
|
|
408
|
+
content: chunkText,
|
|
409
|
+
metadata: {
|
|
410
|
+
position,
|
|
411
|
+
parentDocumentId,
|
|
412
|
+
contentType: 'text',
|
|
413
|
+
startOffset: offset,
|
|
414
|
+
endOffset: end,
|
|
415
|
+
},
|
|
416
|
+
tokenCount: estimateTokens(chunkText),
|
|
417
|
+
})
|
|
418
|
+
|
|
419
|
+
position++
|
|
420
|
+
|
|
421
|
+
// Break if we've reached the end to avoid infinite loop
|
|
422
|
+
if (end >= content.length) {
|
|
423
|
+
break
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
// Move forward with overlap, ensuring we always advance
|
|
427
|
+
const nextOffset = end - overlapSize
|
|
428
|
+
offset = Math.max(nextOffset, offset + 1)
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
return chunks
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
/**
|
|
435
|
+
* Main chunking function with strategy selection
|
|
436
|
+
*/
|
|
437
|
+
export function chunkContent(content: string, parentDocumentId: string, options: ChunkingOptions = {}): Chunk[] {
|
|
438
|
+
const { chunkSize = 512, overlap = 50, contentType = detectContentType(content) } = options
|
|
439
|
+
|
|
440
|
+
// Select strategy based on content type
|
|
441
|
+
switch (contentType) {
|
|
442
|
+
case 'markdown':
|
|
443
|
+
return chunkMarkdown(content, parentDocumentId, chunkSize, overlap)
|
|
444
|
+
case 'code':
|
|
445
|
+
return chunkCode(content, parentDocumentId, chunkSize, overlap)
|
|
446
|
+
case 'text':
|
|
447
|
+
return chunkSemantic(content, parentDocumentId, chunkSize, overlap)
|
|
448
|
+
default:
|
|
449
|
+
return chunkFixed(content, parentDocumentId, chunkSize, overlap)
|
|
450
|
+
}
|
|
451
|
+
}
|