@twelvehart/supermemory-runtime 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. package/.env.example +57 -0
  2. package/README.md +374 -0
  3. package/dist/index.js +189 -0
  4. package/dist/mcp/index.js +1132 -0
  5. package/docker-compose.prod.yml +91 -0
  6. package/docker-compose.yml +358 -0
  7. package/drizzle/0000_dapper_the_professor.sql +159 -0
  8. package/drizzle/0001_api_keys.sql +51 -0
  9. package/drizzle/meta/0000_snapshot.json +1532 -0
  10. package/drizzle/meta/_journal.json +13 -0
  11. package/drizzle.config.ts +20 -0
  12. package/package.json +114 -0
  13. package/scripts/add-extraction-job.ts +122 -0
  14. package/scripts/benchmark-pgvector.ts +122 -0
  15. package/scripts/bootstrap.sh +209 -0
  16. package/scripts/check-runtime-pack.ts +111 -0
  17. package/scripts/claude-mcp-config.ts +336 -0
  18. package/scripts/docker-entrypoint.sh +183 -0
  19. package/scripts/doctor.ts +377 -0
  20. package/scripts/init-db.sql +33 -0
  21. package/scripts/install.sh +1110 -0
  22. package/scripts/mcp-setup.ts +271 -0
  23. package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
  24. package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
  25. package/scripts/migrations/003_create_hnsw_index.sql +94 -0
  26. package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
  27. package/scripts/migrations/005_create_chunks_table.sql +95 -0
  28. package/scripts/migrations/006_create_processing_queue.sql +45 -0
  29. package/scripts/migrations/generate_test_data.sql +42 -0
  30. package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
  31. package/scripts/migrations/run_migrations.sh +286 -0
  32. package/scripts/migrations/test_hnsw_index.sql +255 -0
  33. package/scripts/pre-commit-secrets +282 -0
  34. package/scripts/run-extraction-worker.ts +46 -0
  35. package/scripts/run-phase1-tests.sh +291 -0
  36. package/scripts/setup.ts +222 -0
  37. package/scripts/smoke-install.sh +12 -0
  38. package/scripts/test-health-endpoint.sh +328 -0
  39. package/src/api/index.ts +2 -0
  40. package/src/api/middleware/auth.ts +80 -0
  41. package/src/api/middleware/csrf.ts +308 -0
  42. package/src/api/middleware/errorHandler.ts +166 -0
  43. package/src/api/middleware/rateLimit.ts +360 -0
  44. package/src/api/middleware/validation.ts +514 -0
  45. package/src/api/routes/documents.ts +286 -0
  46. package/src/api/routes/profiles.ts +237 -0
  47. package/src/api/routes/search.ts +71 -0
  48. package/src/api/stores/index.ts +58 -0
  49. package/src/config/bootstrap-env.ts +3 -0
  50. package/src/config/env.ts +71 -0
  51. package/src/config/feature-flags.ts +25 -0
  52. package/src/config/index.ts +140 -0
  53. package/src/config/secrets.config.ts +291 -0
  54. package/src/db/client.ts +92 -0
  55. package/src/db/index.ts +73 -0
  56. package/src/db/postgres.ts +72 -0
  57. package/src/db/schema/chunks.schema.ts +31 -0
  58. package/src/db/schema/containers.schema.ts +46 -0
  59. package/src/db/schema/documents.schema.ts +49 -0
  60. package/src/db/schema/embeddings.schema.ts +32 -0
  61. package/src/db/schema/index.ts +11 -0
  62. package/src/db/schema/memories.schema.ts +72 -0
  63. package/src/db/schema/profiles.schema.ts +34 -0
  64. package/src/db/schema/queue.schema.ts +59 -0
  65. package/src/db/schema/relationships.schema.ts +42 -0
  66. package/src/db/schema.ts +223 -0
  67. package/src/db/worker-connection.ts +47 -0
  68. package/src/index.ts +235 -0
  69. package/src/mcp/CLAUDE.md +1 -0
  70. package/src/mcp/index.ts +1380 -0
  71. package/src/mcp/legacyState.ts +22 -0
  72. package/src/mcp/rateLimit.ts +358 -0
  73. package/src/mcp/resources.ts +309 -0
  74. package/src/mcp/results.ts +104 -0
  75. package/src/mcp/tools.ts +401 -0
  76. package/src/queues/config.ts +119 -0
  77. package/src/queues/index.ts +289 -0
  78. package/src/sdk/client.ts +225 -0
  79. package/src/sdk/errors.ts +266 -0
  80. package/src/sdk/http.ts +560 -0
  81. package/src/sdk/index.ts +244 -0
  82. package/src/sdk/resources/base.ts +65 -0
  83. package/src/sdk/resources/connections.ts +204 -0
  84. package/src/sdk/resources/documents.ts +163 -0
  85. package/src/sdk/resources/index.ts +10 -0
  86. package/src/sdk/resources/memories.ts +150 -0
  87. package/src/sdk/resources/search.ts +60 -0
  88. package/src/sdk/resources/settings.ts +36 -0
  89. package/src/sdk/types.ts +674 -0
  90. package/src/services/chunking/index.ts +451 -0
  91. package/src/services/chunking.service.ts +650 -0
  92. package/src/services/csrf.service.ts +252 -0
  93. package/src/services/documents.repository.ts +219 -0
  94. package/src/services/documents.service.ts +191 -0
  95. package/src/services/embedding.service.ts +404 -0
  96. package/src/services/extraction.service.ts +300 -0
  97. package/src/services/extractors/code.extractor.ts +451 -0
  98. package/src/services/extractors/index.ts +9 -0
  99. package/src/services/extractors/markdown.extractor.ts +461 -0
  100. package/src/services/extractors/pdf.extractor.ts +315 -0
  101. package/src/services/extractors/text.extractor.ts +118 -0
  102. package/src/services/extractors/url.extractor.ts +243 -0
  103. package/src/services/index.ts +235 -0
  104. package/src/services/ingestion.service.ts +177 -0
  105. package/src/services/llm/anthropic.ts +400 -0
  106. package/src/services/llm/base.ts +460 -0
  107. package/src/services/llm/contradiction-detector.service.ts +526 -0
  108. package/src/services/llm/heuristics.ts +148 -0
  109. package/src/services/llm/index.ts +309 -0
  110. package/src/services/llm/memory-classifier.service.ts +383 -0
  111. package/src/services/llm/memory-extension-detector.service.ts +523 -0
  112. package/src/services/llm/mock.ts +470 -0
  113. package/src/services/llm/openai.ts +398 -0
  114. package/src/services/llm/prompts.ts +438 -0
  115. package/src/services/llm/types.ts +373 -0
  116. package/src/services/memory.repository.ts +1769 -0
  117. package/src/services/memory.service.ts +1338 -0
  118. package/src/services/memory.types.ts +234 -0
  119. package/src/services/persistence/index.ts +295 -0
  120. package/src/services/pipeline.service.ts +509 -0
  121. package/src/services/profile.repository.ts +436 -0
  122. package/src/services/profile.service.ts +560 -0
  123. package/src/services/profile.types.ts +270 -0
  124. package/src/services/relationships/detector.ts +1128 -0
  125. package/src/services/relationships/index.ts +268 -0
  126. package/src/services/relationships/memory-integration.ts +459 -0
  127. package/src/services/relationships/strategies.ts +132 -0
  128. package/src/services/relationships/types.ts +370 -0
  129. package/src/services/search.service.ts +761 -0
  130. package/src/services/search.types.ts +220 -0
  131. package/src/services/secrets.service.ts +384 -0
  132. package/src/services/vectorstore/base.ts +327 -0
  133. package/src/services/vectorstore/index.ts +444 -0
  134. package/src/services/vectorstore/memory.ts +286 -0
  135. package/src/services/vectorstore/migration.ts +295 -0
  136. package/src/services/vectorstore/mock.ts +403 -0
  137. package/src/services/vectorstore/pgvector.ts +695 -0
  138. package/src/services/vectorstore/types.ts +247 -0
  139. package/src/startup.ts +389 -0
  140. package/src/types/api.types.ts +193 -0
  141. package/src/types/document.types.ts +103 -0
  142. package/src/types/index.ts +241 -0
  143. package/src/types/profile.base.ts +133 -0
  144. package/src/utils/errors.ts +447 -0
  145. package/src/utils/id.ts +15 -0
  146. package/src/utils/index.ts +101 -0
  147. package/src/utils/logger.ts +313 -0
  148. package/src/utils/sanitization.ts +501 -0
  149. package/src/utils/secret-validation.ts +273 -0
  150. package/src/utils/synonyms.ts +188 -0
  151. package/src/utils/validation.ts +581 -0
  152. package/src/workers/chunking.worker.ts +242 -0
  153. package/src/workers/embedding.worker.ts +358 -0
  154. package/src/workers/extraction.worker.ts +346 -0
  155. package/src/workers/indexing.worker.ts +505 -0
  156. package/tsconfig.json +38 -0
@@ -0,0 +1,451 @@
1
+ /**
2
+ * Chunking Service
3
+ *
4
+ * Splits content into manageable chunks for embedding and indexing.
5
+ * Supports multiple content types with specialized chunking strategies.
6
+ */
7
+
8
+ export interface ChunkMetadata {
9
+ position: number
10
+ parentDocumentId: string
11
+ contentType: 'markdown' | 'code' | 'text'
12
+ language?: string // For code chunks
13
+ heading?: string // For markdown chunks
14
+ startOffset: number
15
+ endOffset: number
16
+ }
17
+
18
+ export interface Chunk {
19
+ content: string
20
+ metadata: ChunkMetadata
21
+ tokenCount: number
22
+ }
23
+
24
+ export interface ChunkingOptions {
25
+ chunkSize?: number // Default: 512 tokens (~2048 characters)
26
+ overlap?: number // Default: 50 tokens
27
+ contentType?: 'markdown' | 'code' | 'text'
28
+ }
29
+
30
+ /**
31
+ * Content type detection based on content analysis
32
+ */
33
+ export function detectContentType(content: string): 'markdown' | 'code' | 'text' {
34
+ // Markdown indicators
35
+ const markdownPatterns = [
36
+ /^#{1,6}\s+/m, // Headers
37
+ /\[.*?\]\(.*?\)/, // Links
38
+ /```[\s\S]*?```/, // Code blocks
39
+ /^\*\s+/m, // Unordered lists
40
+ /^\d+\.\s+/m, // Ordered lists
41
+ ]
42
+
43
+ const markdownScore = markdownPatterns.filter((pattern) => pattern.test(content)).length
44
+
45
+ // Code indicators
46
+ const codePatterns = [
47
+ /^(import|export|from|require)\s+/m,
48
+ /^(function|const|let|var|class|interface|type)\s+/m,
49
+ /[{};()]/g,
50
+ /^(public|private|protected|async|await)\s+/m,
51
+ ]
52
+
53
+ const codeScore = codePatterns.filter((pattern) => pattern.test(content)).length
54
+
55
+ // Determine content type
56
+ if (markdownScore >= 2) return 'markdown'
57
+ if (codeScore >= 2) return 'code'
58
+ return 'text'
59
+ }
60
+
61
+ /**
62
+ * Estimate token count (rough approximation: 1 token ≈ 4 characters)
63
+ */
64
+ function estimateTokens(text: string): number {
65
+ return Math.ceil(text.length / 4)
66
+ }
67
+
/**
 * Semantic chunking: split by paragraphs and sections.
 *
 * Greedily packs consecutive paragraphs (split on blank lines) into chunks
 * of at most `chunkSize` estimated tokens. A paragraph that alone exceeds
 * the budget is split on whitespace word boundaries. Consecutive chunks
 * share a trailing-word overlap for context continuity.
 *
 * NOTE(review): start/end offsets are approximate — the `\n\n+` separators
 * removed by the split are never added back into `currentOffset`, so offsets
 * drift from true positions in `content` after the first paragraph break.
 * NOTE(review): `overlap` is applied here as a WORD count (slice of the word
 * array), not a token count — confirm that is the intended unit.
 */
function chunkSemantic(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
  const chunks: Chunk[] = []
  const paragraphs = content.split(/\n\n+/)

  // If no paragraph breaks exist and content is large, use fixed chunking directly
  if (paragraphs.length === 1 && estimateTokens(content) > chunkSize) {
    return chunkFixed(content, parentDocumentId, chunkSize, overlap)
  }

  let currentChunk = '' // paragraphs accumulated for the chunk being built
  let currentOffset = 0 // approximate offset of currentChunk's start
  let position = 0 // ordinal of the next chunk to emit

  for (let i = 0; i < paragraphs.length; i++) {
    const paragraph = paragraphs[i]
    if (!paragraph) continue // skip empty fragments (e.g. leading blank lines)

    // If a single paragraph is too large, split it by words
    if (estimateTokens(paragraph) > chunkSize) {
      // First, save current chunk if exists
      if (currentChunk) {
        chunks.push({
          content: currentChunk,
          metadata: {
            position,
            parentDocumentId,
            contentType: 'text',
            startOffset: currentOffset,
            endOffset: currentOffset + currentChunk.length,
          },
          tokenCount: estimateTokens(currentChunk),
        })
        position++
        currentOffset += currentChunk.length
        currentChunk = ''
      }

      // Split large paragraph by words
      const words = paragraph.split(/\s+/)
      let wordChunk = ''
      let wordOffset = currentOffset

      for (const word of words) {
        // Try to append the next word; flush the chunk when it would overflow.
        const testChunk = wordChunk ? `${wordChunk} ${word}` : word
        if (estimateTokens(testChunk) <= chunkSize) {
          wordChunk = testChunk
        } else {
          if (wordChunk) {
            chunks.push({
              content: wordChunk,
              metadata: {
                position,
                parentDocumentId,
                contentType: 'text',
                startOffset: wordOffset,
                endOffset: wordOffset + wordChunk.length,
              },
              tokenCount: estimateTokens(wordChunk),
            })
            position++
            wordOffset += wordChunk.length + 1 // +1 for space
          }
          wordChunk = word
        }
      }

      // Flush whatever words remain after the loop.
      if (wordChunk) {
        chunks.push({
          content: wordChunk,
          metadata: {
            position,
            parentDocumentId,
            contentType: 'text',
            startOffset: wordOffset,
            endOffset: wordOffset + wordChunk.length,
          },
          tokenCount: estimateTokens(wordChunk),
        })
        position++
      }

      currentOffset += paragraph.length
      continue
    }

    // Normal path: try to merge this paragraph into the chunk being built.
    const combined = currentChunk ? `${currentChunk}\n\n${paragraph}` : paragraph

    if (estimateTokens(combined) <= chunkSize) {
      currentChunk = combined
    } else {
      // Save current chunk
      const tokenCount = estimateTokens(currentChunk)
      chunks.push({
        content: currentChunk,
        metadata: {
          position,
          parentDocumentId,
          contentType: 'text',
          startOffset: currentOffset,
          endOffset: currentOffset + currentChunk.length,
        },
        tokenCount,
      })

      position++
      currentOffset += currentChunk.length

      // Start new chunk with overlap: carry the last `overlap` words of the
      // previous chunk into the next one.
      const overlapText = currentChunk.split(/\s+/).slice(-overlap).join(' ')
      currentChunk = overlapText ? `${overlapText}\n\n${paragraph}` : paragraph
    }
  }

  // Add final chunk
  if (currentChunk) {
    chunks.push({
      content: currentChunk,
      metadata: {
        position,
        parentDocumentId,
        contentType: 'text',
        startOffset: currentOffset,
        endOffset: currentOffset + currentChunk.length,
      },
      tokenCount: estimateTokens(currentChunk),
    })
  }

  return chunks
}
201
+
202
+ /**
203
+ * Markdown chunking: split by heading hierarchy
204
+ */
205
+ function chunkMarkdown(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
206
+ const chunks: Chunk[] = []
207
+ const sections: Array<{ heading: string; content: string; level: number }> = []
208
+
209
+ // Split by headers
210
+ const lines = content.split('\n')
211
+ let currentSection = { heading: '', content: '', level: 0 }
212
+
213
+ for (const line of lines) {
214
+ const headerMatch = line.match(/^(#{1,6})\s+(.+)$/)
215
+
216
+ if (headerMatch && headerMatch[1] && headerMatch[2]) {
217
+ if (currentSection.content) {
218
+ sections.push({ ...currentSection })
219
+ }
220
+ currentSection = {
221
+ heading: headerMatch[2],
222
+ content: line + '\n',
223
+ level: headerMatch[1].length,
224
+ }
225
+ } else {
226
+ currentSection.content += line + '\n'
227
+ }
228
+ }
229
+
230
+ if (currentSection.content) {
231
+ sections.push(currentSection)
232
+ }
233
+
234
+ // Convert sections to chunks
235
+ let position = 0
236
+ let currentOffset = 0
237
+
238
+ for (const section of sections) {
239
+ const tokenCount = estimateTokens(section.content)
240
+
241
+ if (tokenCount <= chunkSize) {
242
+ chunks.push({
243
+ content: section.content.trim(),
244
+ metadata: {
245
+ position,
246
+ parentDocumentId,
247
+ contentType: 'markdown',
248
+ heading: section.heading,
249
+ startOffset: currentOffset,
250
+ endOffset: currentOffset + section.content.length,
251
+ },
252
+ tokenCount,
253
+ })
254
+ position++
255
+ } else {
256
+ // Section too large, split further with semantic chunking
257
+ const subChunks = chunkSemantic(section.content, parentDocumentId, chunkSize, overlap)
258
+ for (const chunk of subChunks) {
259
+ chunks.push({
260
+ ...chunk,
261
+ metadata: {
262
+ ...chunk.metadata,
263
+ contentType: 'markdown',
264
+ heading: section.heading,
265
+ position,
266
+ },
267
+ })
268
+ position++
269
+ }
270
+ }
271
+
272
+ currentOffset += section.content.length
273
+ }
274
+
275
+ return chunks
276
+ }
277
+
278
+ /**
279
+ * Code chunking: AST-aware with scope preservation
280
+ */
281
+ function chunkCode(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
282
+ const chunks: Chunk[] = []
283
+
284
+ // Detect language
285
+ let language = 'unknown'
286
+ if (content.includes('function') || content.includes('const')) language = 'javascript'
287
+ if (content.includes('def ') || content.includes('import ')) language = 'python'
288
+ if (content.includes('func ') || content.includes('package ')) language = 'go'
289
+
290
+ // Split by function/class boundaries
291
+ const codeBlocks: string[] = []
292
+ const functionPattern = /^(function|const|let|var|class|def|func|export|public|private)\s+/gm
293
+ const matches = [...content.matchAll(functionPattern)]
294
+
295
+ if (matches.length === 0) {
296
+ // No clear function boundaries, use semantic chunking
297
+ return chunkSemantic(content, parentDocumentId, chunkSize, overlap)
298
+ }
299
+
300
+ let lastIndex = 0
301
+ for (let i = 0; i < matches.length; i++) {
302
+ const match = matches[i]
303
+ if (!match) continue
304
+ const startIndex = match.index || 0
305
+
306
+ if (i > 0) {
307
+ codeBlocks.push(content.substring(lastIndex, startIndex))
308
+ }
309
+
310
+ lastIndex = startIndex
311
+ }
312
+ codeBlocks.push(content.substring(lastIndex))
313
+
314
+ // Convert blocks to chunks
315
+ let position = 0
316
+ let currentOffset = 0
317
+
318
+ for (const block of codeBlocks) {
319
+ if (!block.trim()) continue
320
+
321
+ const tokenCount = estimateTokens(block)
322
+
323
+ if (tokenCount <= chunkSize) {
324
+ chunks.push({
325
+ content: block.trim(),
326
+ metadata: {
327
+ position,
328
+ parentDocumentId,
329
+ contentType: 'code',
330
+ language,
331
+ startOffset: currentOffset,
332
+ endOffset: currentOffset + block.length,
333
+ },
334
+ tokenCount,
335
+ })
336
+ position++
337
+ } else {
338
+ // Block too large, split by lines
339
+ const lines = block.split('\n')
340
+ let currentChunk = ''
341
+ let chunkStart = currentOffset
342
+
343
+ for (const line of lines) {
344
+ const combined = currentChunk ? `${currentChunk}\n${line}` : line
345
+
346
+ if (estimateTokens(combined) <= chunkSize) {
347
+ currentChunk = combined
348
+ } else {
349
+ if (currentChunk) {
350
+ chunks.push({
351
+ content: currentChunk,
352
+ metadata: {
353
+ position,
354
+ parentDocumentId,
355
+ contentType: 'code',
356
+ language,
357
+ startOffset: chunkStart,
358
+ endOffset: chunkStart + currentChunk.length,
359
+ },
360
+ tokenCount: estimateTokens(currentChunk),
361
+ })
362
+ position++
363
+ chunkStart += currentChunk.length
364
+ }
365
+ currentChunk = line
366
+ }
367
+ }
368
+
369
+ if (currentChunk) {
370
+ chunks.push({
371
+ content: currentChunk,
372
+ metadata: {
373
+ position,
374
+ parentDocumentId,
375
+ contentType: 'code',
376
+ language,
377
+ startOffset: chunkStart,
378
+ endOffset: chunkStart + currentChunk.length,
379
+ },
380
+ tokenCount: estimateTokens(currentChunk),
381
+ })
382
+ position++
383
+ }
384
+ }
385
+
386
+ currentOffset += block.length
387
+ }
388
+
389
+ return chunks
390
+ }
391
+
392
+ /**
393
+ * Fixed-size chunking with overlap (fallback)
394
+ */
395
+ function chunkFixed(content: string, parentDocumentId: string, chunkSize: number, overlap: number): Chunk[] {
396
+ const chunks: Chunk[] = []
397
+ const charSize = chunkSize * 4 // ~4 chars per token
398
+ const overlapSize = overlap * 4
399
+
400
+ let position = 0
401
+ let offset = 0
402
+
403
+ while (offset < content.length) {
404
+ const end = Math.min(offset + charSize, content.length)
405
+ const chunkText = content.substring(offset, end)
406
+
407
+ chunks.push({
408
+ content: chunkText,
409
+ metadata: {
410
+ position,
411
+ parentDocumentId,
412
+ contentType: 'text',
413
+ startOffset: offset,
414
+ endOffset: end,
415
+ },
416
+ tokenCount: estimateTokens(chunkText),
417
+ })
418
+
419
+ position++
420
+
421
+ // Break if we've reached the end to avoid infinite loop
422
+ if (end >= content.length) {
423
+ break
424
+ }
425
+
426
+ // Move forward with overlap, ensuring we always advance
427
+ const nextOffset = end - overlapSize
428
+ offset = Math.max(nextOffset, offset + 1)
429
+ }
430
+
431
+ return chunks
432
+ }
433
+
434
+ /**
435
+ * Main chunking function with strategy selection
436
+ */
437
+ export function chunkContent(content: string, parentDocumentId: string, options: ChunkingOptions = {}): Chunk[] {
438
+ const { chunkSize = 512, overlap = 50, contentType = detectContentType(content) } = options
439
+
440
+ // Select strategy based on content type
441
+ switch (contentType) {
442
+ case 'markdown':
443
+ return chunkMarkdown(content, parentDocumentId, chunkSize, overlap)
444
+ case 'code':
445
+ return chunkCode(content, parentDocumentId, chunkSize, overlap)
446
+ case 'text':
447
+ return chunkSemantic(content, parentDocumentId, chunkSize, overlap)
448
+ default:
449
+ return chunkFixed(content, parentDocumentId, chunkSize, overlap)
450
+ }
451
+ }