@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Code extractor - AST-aware extraction and chunking for source code
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
|
|
6
|
+
|
|
7
|
+
/**
 * One logical unit found in a source file by the code extractor
 * (an import line, a class, a function, and so on).
 */
export interface CodeBlock {
  /** Kind of construct this block represents. */
  type:
    | 'function'
    | 'class'
    | 'method'
    | 'interface'
    | 'type'
    | 'import'
    | 'export'
    | 'comment'
    | 'other'
  /** Identifier captured from the declaration (for imports, the captured module specifier). */
  name: string
  /** Source text of the block, lines joined with `\n`. */
  content: string
  /** 1-based number of the first line of the block. */
  startLine: number
  /** 1-based number of the last line of the block (inclusive). */
  endLine: number
  /** Language label the block was parsed as. */
  language: string
  /** Name of the enclosing class; only set for methods. */
  parent?: string
  /** Documentation comment or docstring associated with the block, when one was found. */
  docstring?: string
}
|
|
17
|
+
|
|
18
|
+
/**
 * Regular expressions used to recognise the constructs of one language.
 * Patterns that capture a name expose it as capture group 1.
 */
interface LanguagePattern {
  /** File extensions (with leading dot) associated with the language. */
  extensions: string[]
  /** Matches the header line of a free-standing function. */
  functionPattern: RegExp
  /** Matches the header line of a class (or the language's equivalent). */
  classPattern: RegExp
  /** Matches the header line of a method, when the language has them. */
  methodPattern?: RegExp
  /** Matches the header line of an interface declaration, when supported. */
  interfacePattern?: RegExp
  /** Matches the header line of a type alias, when supported. */
  typePattern?: RegExp
  /** Matches an import statement. */
  importPattern: RegExp
  /** Matches a comment. */
  commentPattern: RegExp
  /** Matches a documentation comment / docstring, when the language has one. */
  docstringPattern?: RegExp
}
|
|
29
|
+
|
|
30
|
+
/** Lexical state carried from line to line while counting delimiters. */
interface DelimiterScanState {
  /** Net count of `{` minus `}` seen so far. */
  braces: number
  /** Net count of `(` / `[` minus `)` / `]` seen so far. */
  brackets: number
  /** Whether at least one `{` has been seen. */
  sawBrace: boolean
  /** Currently inside a block comment that has not been closed. */
  inBlockComment: boolean
  /** Currently inside a backtick-delimited string that has not been closed. */
  inTemplate: boolean
}

/** Extent of a definition; `endLine` and `bodyStart` are 0-based indices into the line array. */
interface BlockExtent {
  content: string
  endLine: number
  bodyStart: number
}

/** Classification of a single definition header line. */
interface DefinitionMatch {
  type: CodeBlock['type']
  name: string
  parent?: string
}

/**
 * Heuristic, regex-driven extractor for source code.
 *
 * It recognises imports, classes, interfaces, type aliases, methods and
 * functions for TypeScript, JavaScript, Python and Go, and reports them as
 * {@link CodeBlock}s with 1-based, inclusive line ranges. No real parser is
 * involved: definitions are found line by line and their extent is derived
 * from brace matching (or indentation for Python).
 */
export class CodeExtractor implements ExtractorInterface {
  /** Patterns whose presence suggests that a text is source code (see `canHandle`). */
  private static readonly CODE_INDICATORS: readonly RegExp[] = [
    /^import\s+/m, // JS/TS/Python/Go
    /^export\s+/m, // JS/TS
    /^(?:const|let|var)\s+\w+\s*=/m, // JS/TS
    /^function\s+\w+/m, // JS/TS
    /^class\s+\w+/m, // JS/TS/Python
    /^def\s+\w+/m, // Python
    /^func\s+\w+/m, // Go
    /^package\s+\w+/m, // Go
    /:\s*(?:string|number|boolean)/m, // TS
    /^\s+self\./m, // Python
    /:\s*=\s*/, // Go
    /^(?:async\s+)?function\s+/m, // JS/TS
    /=>\s*\{/, // JS/TS arrow functions
  ]

  /** Statement keywords that look like `name(...) {` but are not method names. */
  private static readonly CONTROL_KEYWORDS: ReadonlySet<string> = new Set([
    'if',
    'for',
    'while',
    'switch',
    'catch',
    'with',
    'function',
    'return',
    'do',
    'else',
    'try',
  ])

  /** `const name = (...) =>` / `const name = (...): ReturnType =>`, optionally exported or async. */
  private static readonly ARROW_FUNCTION =
    /^(?:export\s+)?(?:const|let)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*(?::\s*[^=]+?)?\s*=>/

  /** Captures the receiver type of a Go method header such as `func (r *Repo) Name()`. */
  private static readonly GO_RECEIVER = /^func\s+\(\s*(?:\w+\s+)?\*?\s*(\w+)/

  /**
   * Markers of test code. The call-style markers must not be preceded by an
   * identifier character or a dot, so `split(`, `emit(` or `regex.test(` do
   * not count as `it(` / `test(`.
   */
  private static readonly TEST_MARKERS: readonly RegExp[] = [
    /(?:^|[^\w.$])(?:describe|it|test)(?:\.\w+)*\s*\(/m,
    /\bdef\s+test_/,
    /#\[test\]/,
    /^func\s+Test\w*\s*\(\s*\w+\s+\*testing\.T\b/m,
    /(?:^|[^\w.$])(?:assert|expect)\s*\(/m,
  ]

  /** MIME type reported for each supported language. */
  private static readonly MIME_TYPES: Readonly<Record<string, string>> = {
    typescript: 'text/typescript',
    javascript: 'text/javascript',
    python: 'text/x-python',
    go: 'text/x-go',
  }

  /**
   * Core language patterns - supports TypeScript, JavaScript, Python, and Go
   * Other languages can be added as needed based on usage patterns
   */
  private readonly languages: Record<string, LanguagePattern> = {
    typescript: {
      extensions: ['.ts', '.tsx'],
      functionPattern: /^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)/m,
      classPattern: /^(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+(\w+)/m,
      // Accepts any run of modifiers and a return type that may contain spaces.
      methodPattern:
        /^\s+(?:(?:public|private|protected|static|async|override|abstract|readonly)\s+)*(?:get\s+|set\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*.+?)?\s*\{/m,
      interfacePattern: /^(?:export\s+)?interface\s+(\w+)/m,
      typePattern: /^(?:export\s+)?type\s+(\w+)/m,
      importPattern: /^import\s+.*from\s+['"](.+)['"]/m,
      commentPattern: /\/\/.*$|\/\*[\s\S]*?\*\//,
      docstringPattern: /\/\*\*[\s\S]*?\*\//,
    },
    javascript: {
      extensions: ['.js', '.jsx', '.mjs', '.cjs'],
      functionPattern: /^(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s+(\w+)/m,
      classPattern: /^(?:export\s+)?(?:default\s+)?class\s+(\w+)/m,
      methodPattern: /^\s+(?:(?:static|async)\s+)*(?:get\s+|set\s+)?(\w+)\s*\([^)]*\)\s*\{/m,
      importPattern: /^(?:import\s+.*from\s+['"](.+)['"]|require\(['"](.+)['"]\))/m,
      commentPattern: /\/\/.*$|\/\*[\s\S]*?\*\//,
      docstringPattern: /\/\*\*[\s\S]*?\*\//,
    },
    python: {
      extensions: ['.py', '.pyw'],
      functionPattern: /^(?:async\s+)?def\s+(\w+)/m,
      classPattern: /^class\s+(\w+)/m,
      methodPattern: /^\s+(?:async\s+)?def\s+(\w+)/m,
      importPattern: /^(?:from\s+(\S+)\s+)?import\s+/m,
      commentPattern: /#.*$/,
      docstringPattern: /"""[\s\S]*?"""|'''[\s\S]*?'''/,
    },
    go: {
      extensions: ['.go'],
      functionPattern: /^func\s+(\w+)/m,
      classPattern: /^type\s+(\w+)\s+struct/m,
      methodPattern: /^func\s+\([^)]+\)\s+(\w+)/m,
      interfacePattern: /^type\s+(\w+)\s+interface/m,
      importPattern: /^import\s+(?:\(\s*)?["']([^"']+)["']/m,
      commentPattern: /\/\/.*$|\/\*[\s\S]*?\*\//,
    },
  }

  /**
   * Check if content appears to be source code.
   *
   * Each matching indicator pattern scores one point, and roughly balanced
   * braces score one more; two points are enough.
   *
   * @param content - Text to inspect.
   * @returns `true` when the text looks like source code; `false` for
   *   non-string or empty input.
   */
  canHandle(content: string): boolean {
    if (typeof content !== 'string' || content.length === 0) {
      return false
    }

    let score = CodeExtractor.CODE_INDICATORS.filter((indicator) => indicator.test(content)).length

    // Code usually has (nearly) balanced braces; allow a 10% mismatch.
    const openBraces = (content.match(/\{/g) ?? []).length
    const closeBraces = (content.match(/\}/g) ?? []).length
    if (openBraces > 0 && Math.abs(openBraces - closeBraces) < openBraces * 0.1) {
      score++
    }

    return score >= 2
  }

  /**
   * Extract code content together with structural metadata.
   *
   * @param content - Source text.
   * @param options - Optional settings; a non-empty string `language` skips
   *   language detection. Any other value is ignored.
   * @returns The unchanged content plus metadata, including a summary
   *   (type, name, line range) of every detected block.
   */
  async extract(content: string, options?: Record<string, unknown>): Promise<ExtractionResult> {
    const requested = options?.language
    const language =
      typeof requested === 'string' && requested.length > 0 ? requested : this.detectLanguage(content)
    const blocks = this.parseCodeBlocks(content, language)
    const metadata = this.extractMetadata(content, blocks, language)

    return {
      content,
      contentType: 'code' as ContentType,
      metadata: {
        ...metadata,
        codeBlocks: blocks.map((b) => ({
          type: b.type,
          name: b.name,
          startLine: b.startLine,
          endLine: b.endLine,
        })),
      },
      rawContent: content,
    }
  }

  /**
   * Detect programming language from content - supports TypeScript, JavaScript, Python, and Go.
   *
   * @param content - Source text.
   * @returns `'typescript'`, `'python'`, `'go'`, or `'javascript'` as the fallback.
   */
  detectLanguage(content: string): string {
    // TypeScript indicators (check first since it's a superset of JavaScript)
    if (
      /:\s*(?:string|number|boolean|void|any|unknown|never)\b/.test(content) ||
      /\binterface\s+\w+/.test(content) ||
      /<\w+>.*>/.test(content) ||
      // Word boundaries keep prose such as "has number" from matching.
      /\bas\s+(?:string|number|boolean)\b/.test(content)
    ) {
      return 'typescript'
    }

    // Python indicators
    if (/^def\s+\w+.*:\s*$/m.test(content) || /^class\s+\w+.*:\s*$/m.test(content) || /^\s+self\./m.test(content)) {
      return 'python'
    }

    // Go indicators (the receiver check is multiline so it works on any line)
    if (/^package\s+\w+/m.test(content) || /^func\s+\([^)]+\)/m.test(content) || /:=/.test(content)) {
      return 'go'
    }

    // Default to JavaScript
    return 'javascript'
  }

  /**
   * Parse code into logical blocks.
   *
   * Top-level definitions are reported in source order. A class is followed
   * by the methods found directly in its body (each with `parent` set to the
   * class name). Go receiver methods are reported with the receiver type as
   * `parent`.
   *
   * @param content - Source text.
   * @param language - Key of the pattern set to use; unknown keys fall back
   *   to the JavaScript patterns while keeping the given label.
   * @returns Detected blocks with 1-based, inclusive line ranges.
   */
  parseCodeBlocks(content: string, language: string): CodeBlock[] {
    const blocks: CodeBlock[] = []
    const pattern = this.languages[language] ?? this.languages['javascript']
    if (!pattern) {
      return blocks
    }

    const lines = content.split('\n')
    let index = 0

    while (index < lines.length) {
      const line = lines[index] ?? ''

      const importMatch = line.match(pattern.importPattern)
      if (importMatch) {
        blocks.push({
          type: 'import',
          name: importMatch[1] ?? importMatch[2] ?? 'import',
          content: line,
          startLine: index + 1,
          endLine: index + 1,
          language,
        })
        index++
        continue
      }

      const definition = this.matchDefinition(line, pattern)
      if (!definition) {
        index++
        continue
      }

      const extent = this.extractBlock(lines, index, language)
      const docstring = this.findDocstring(lines, index, extent, language, pattern)
      blocks.push(this.buildBlock(definition, extent.content, index, extent.endLine, language, docstring))

      if (definition.type === 'class') {
        blocks.push(...this.extractMethods(lines, index, extent, language, pattern, definition.name))
      }

      // Everything inside the definition has been accounted for.
      index = extent.endLine + 1
    }

    return blocks
  }

  /**
   * Classify a line as the header of a definition.
   *
   * Precedence: class, interface, type alias, non-indented method (Go
   * receiver methods), function, arrow function.
   */
  private matchDefinition(line: string, pattern: LanguagePattern): DefinitionMatch | undefined {
    const capture = (re: RegExp | undefined): string | undefined => (re ? re.exec(line)?.[1] : undefined)

    const className = capture(pattern.classPattern)
    if (className) {
      return { type: 'class', name: className }
    }

    const interfaceName = capture(pattern.interfacePattern)
    if (interfaceName) {
      return { type: 'interface', name: interfaceName }
    }

    const typeName = capture(pattern.typePattern)
    if (typeName) {
      return { type: 'type', name: typeName }
    }

    // Indented methods are only meaningful inside a class body and are
    // collected by extractMethods; here only column-0 methods can match.
    if (!/^\s/.test(line)) {
      const methodName = capture(pattern.methodPattern)
      if (methodName) {
        const receiver = CodeExtractor.GO_RECEIVER.exec(line)?.[1]
        return receiver ? { type: 'method', name: methodName, parent: receiver } : { type: 'method', name: methodName }
      }
    }

    const functionName = capture(pattern.functionPattern)
    if (functionName) {
      return { type: 'function', name: functionName }
    }

    const arrowName = CodeExtractor.ARROW_FUNCTION.exec(line)?.[1]
    if (arrowName) {
      return { type: 'function', name: arrowName }
    }

    return undefined
  }

  /** Assemble a CodeBlock from 0-based line indices. */
  private buildBlock(
    definition: DefinitionMatch,
    content: string,
    startIndex: number,
    endIndex: number,
    language: string,
    docstring: string | undefined
  ): CodeBlock {
    const block: CodeBlock = {
      type: definition.type,
      name: definition.name,
      content,
      startLine: startIndex + 1,
      endLine: endIndex + 1,
      language,
      docstring,
    }
    if (definition.parent !== undefined) {
      block.parent = definition.parent
    }
    return block
  }

  /** Collect the methods declared directly in a class body. */
  private extractMethods(
    lines: string[],
    classStart: number,
    classExtent: BlockExtent,
    language: string,
    pattern: LanguagePattern,
    className: string
  ): CodeBlock[] {
    if (!pattern.methodPattern) {
      return []
    }
    return language === 'python'
      ? this.extractPythonMethods(lines, classExtent, language, pattern, pattern.methodPattern, className)
      : this.extractBracedMethods(lines, classStart, classExtent.endLine, language, pattern, pattern.methodPattern, className)
  }

  /** Methods of a brace-delimited class: header lines found at brace depth 1. */
  private extractBracedMethods(
    lines: string[],
    classStart: number,
    classEnd: number,
    language: string,
    pattern: LanguagePattern,
    methodPattern: RegExp,
    className: string
  ): CodeBlock[] {
    const methods: CodeBlock[] = []
    const state = this.createScanState()
    let j = classStart

    while (j <= classEnd) {
      const line = lines[j] ?? ''

      // Depth 1 means "directly inside the class braces", which excludes
      // lines nested in property initialisers.
      if (j > classStart && state.braces === 1 && !state.inBlockComment && !state.inTemplate) {
        const name = methodPattern.exec(line)?.[1]
        if (name && !CodeExtractor.CONTROL_KEYWORDS.has(name)) {
          const extent = this.extractBlock(lines, j, language)
          const end = Math.min(extent.endLine, classEnd)
          const content = end === extent.endLine ? extent.content : lines.slice(j, end + 1).join('\n')
          const docstring = this.findDocstring(lines, j, extent, language, pattern)
          methods.push(this.buildBlock({ type: 'method', name, parent: className }, content, j, end, language, docstring))
          // A complete method is brace-balanced, so the depth is unchanged.
          j = end + 1
          continue
        }
      }

      this.scanLine(line, state)
      j++
    }

    return methods
  }

  /** Methods of a Python class: `def` lines at the indentation of the class members. */
  private extractPythonMethods(
    lines: string[],
    classExtent: BlockExtent,
    language: string,
    pattern: LanguagePattern,
    methodPattern: RegExp,
    className: string
  ): CodeBlock[] {
    const methods: CodeBlock[] = []
    let memberIndent: number | undefined
    let j = classExtent.bodyStart

    while (j <= classExtent.endLine) {
      const line = lines[j] ?? ''
      if (line.trim() === '') {
        j++
        continue
      }

      const indent = this.indentWidth(line)
      if (memberIndent === undefined) {
        // The first statement of the body fixes the member indentation, so
        // methods of nested classes (indented deeper) are not attributed here.
        memberIndent = indent
      }

      const name = indent === memberIndent ? methodPattern.exec(line)?.[1] : undefined
      if (name) {
        const extent = this.extractPythonBlock(lines, j)
        const end = Math.min(extent.endLine, classExtent.endLine)
        const content = end === extent.endLine ? extent.content : lines.slice(j, end + 1).join('\n')
        const docstring = this.findDocstring(lines, j, extent, language, pattern)
        methods.push(this.buildBlock({ type: 'method', name, parent: className }, content, j, end, language, docstring))
        j = end + 1
        continue
      }

      j++
    }

    return methods
  }

  /**
   * Locate the documentation of a definition: the docstring that opens the
   * body for Python, the doc comment ending on the preceding line otherwise.
   */
  private findDocstring(
    lines: string[],
    defIndex: number,
    extent: BlockExtent,
    language: string,
    pattern: LanguagePattern
  ): string | undefined {
    const docPattern = pattern.docstringPattern
    if (!docPattern) {
      return undefined
    }
    return language === 'python'
      ? this.findPythonDocstring(lines, extent.bodyStart, extent.endLine, docPattern)
      : this.findLeadingDocComment(lines, defIndex, docPattern)
  }

  /** Doc comment whose closing delimiter sits on the line directly above `defIndex`. */
  private findLeadingDocComment(lines: string[], defIndex: number, docPattern: RegExp): string | undefined {
    const previous = (lines[defIndex - 1] ?? '').trim()
    if (!previous.endsWith('*/')) {
      return undefined
    }

    for (let k = defIndex - 1; k >= 0; k--) {
      const trimmed = (lines[k] ?? '').trim()
      // Reaching the end of an earlier comment means the opener was not found.
      if (k < defIndex - 1 && trimmed.includes('*/')) {
        return undefined
      }
      if (trimmed.startsWith('/*')) {
        const text = lines.slice(k, defIndex).join('\n').trim()
        const match = docPattern.exec(text)
        return match && match.index === 0 && match[0].length === text.length ? match[0] : undefined
      }
    }

    return undefined
  }

  /** Docstring that is the first statement of a Python body, if any. */
  private findPythonDocstring(
    lines: string[],
    bodyStart: number,
    endLine: number,
    docPattern: RegExp
  ): string | undefined {
    for (let i = bodyStart; i <= endLine; i++) {
      const trimmed = (lines[i] ?? '').trim()
      if (trimmed === '') {
        continue
      }
      if (!/^[rRuUbB]{0,2}(?:"""|''')/.test(trimmed)) {
        return undefined
      }
      const match = docPattern.exec(lines.slice(i, endLine + 1).join('\n'))
      return match ? match[0] : undefined
    }
    return undefined
  }

  /**
   * Extract a complete code block starting at `startIndex`.
   *
   * Python blocks are delimited by indentation. For every other language the
   * block ends on the line where the braces opened by the definition are
   * balanced again. A declaration that opens no brace (a one-line type alias,
   * an expression-bodied arrow function) ends with its own statement instead
   * of running on into the following code.
   *
   * @returns The block text and the 0-based index of its last line.
   */
  private extractBlock(lines: string[], startIndex: number, language: string): BlockExtent {
    if (language === 'python') {
      return this.extractPythonBlock(lines, startIndex)
    }

    // Brace-based languages
    const state = this.createScanState()
    let endIndex = startIndex

    for (let i = startIndex; i < lines.length; i++) {
      const line = lines[i] ?? ''
      this.scanLine(line, state)
      endIndex = i

      if (state.sawBrace) {
        if (state.braces <= 0) {
          break
        }
      } else if (this.endsBracelessStatement(line, lines[i + 1], state)) {
        break
      }
    }

    const content = lines.slice(startIndex, endIndex + 1).join('\n')
    return { content, endLine: endIndex, bodyStart: startIndex + 1 }
  }

  /**
   * Decide whether a declaration that has not opened a brace ends on `line`.
   * It continues while parentheses/brackets, a comment or a template string
   * are still open, or when the next line is indented or starts with `{`,
   * `|` or `&` (brace on its own line, multi-line union/intersection).
   */
  private endsBracelessStatement(line: string, nextLine: string | undefined, state: DelimiterScanState): boolean {
    if (state.brackets > 0 || state.inBlockComment || state.inTemplate) {
      return false
    }
    if (line.trimEnd().endsWith(';')) {
      return true
    }
    if (nextLine === undefined || nextLine.trim() === '') {
      return true
    }
    return !/^(?:\s|[{|&])/.test(nextLine)
  }

  /**
   * Extract Python block (indentation-based).
   *
   * The header may span several lines while its parentheses are open. The
   * body then extends over every following line indented deeper than the
   * header. Blank lines belong to the block only when more of the body
   * follows them.
   */
  private extractPythonBlock(lines: string[], startIndex: number): BlockExtent {
    const baseIndent = this.indentWidth(lines[startIndex] ?? '')

    // Find the end of the (possibly multi-line) header. If the delimiters
    // never balance, treat the first line as the whole header.
    const header = this.createScanState()
    let headerEnd = startIndex
    for (let i = startIndex; i < lines.length; i++) {
      this.scanLine(lines[i] ?? '', header, true)
      if (header.braces + header.brackets <= 0) {
        headerEnd = i
        break
      }
    }

    let endIndex = headerEnd
    for (let i = headerEnd + 1; i < lines.length; i++) {
      const line = lines[i] ?? ''
      if (line.trim() === '') {
        continue
      }
      // Block ends when we return to same or less indentation
      if (this.indentWidth(line) <= baseIndent) {
        break
      }
      endIndex = i
    }

    const content = lines.slice(startIndex, endIndex + 1).join('\n')
    return { content, endLine: endIndex, bodyStart: headerEnd + 1 }
  }

  /** Number of leading whitespace characters of a line. */
  private indentWidth(line: string): number {
    return line.length - line.trimStart().length
  }

  /** Fresh delimiter-scanning state. */
  private createScanState(): DelimiterScanState {
    return { braces: 0, brackets: 0, sawBrace: false, inBlockComment: false, inTemplate: false }
  }

  /**
   * Update `state` with the delimiters found on one line.
   *
   * Delimiters inside quoted strings, backtick strings, comments and
   * backslash escapes (regex literals) are ignored. This is a lightweight
   * scanner, not a tokenizer: regex literals are not recognised as such.
   *
   * @param hashComments - Use `#` line comments (Python) instead of `//` and
   *   block comments; backtick strings are then not recognised either.
   */
  private scanLine(line: string, state: DelimiterScanState, hashComments = false): void {
    // Single- and double-quoted strings cannot span lines, so this is local.
    let quote: string | undefined

    for (let k = 0; k < line.length; k++) {
      const ch = line[k]
      const next = line[k + 1]

      if (state.inBlockComment) {
        if (ch === '*' && next === '/') {
          state.inBlockComment = false
          k++
        }
        continue
      }

      if (state.inTemplate) {
        if (ch === '\\') {
          k++
        } else if (ch === '`') {
          state.inTemplate = false
        }
        continue
      }

      if (quote !== undefined) {
        if (ch === '\\') {
          k++
        } else if (ch === quote) {
          quote = undefined
        }
        continue
      }

      if (ch === '\\') {
        // Escaped character outside a string, e.g. `\{` in a regex literal.
        k++
        continue
      }

      if (hashComments) {
        if (ch === '#') {
          return
        }
      } else if (ch === '/') {
        if (next === '/') {
          return
        }
        if (next === '*') {
          state.inBlockComment = true
          k++
          continue
        }
      }

      switch (ch) {
        case '"':
        case "'":
          quote = ch
          break
        case '`':
          if (!hashComments) {
            state.inTemplate = true
          }
          break
        case '{':
          state.braces++
          state.sawBrace = true
          break
        case '}':
          state.braces--
          break
        case '(':
        case '[':
          state.brackets++
          break
        case ')':
        case ']':
          state.brackets--
          break
        default:
          break
      }
    }
  }

  /**
   * Extract metadata from code content.
   *
   * @returns Size figures (words, characters, lines), per-kind block counts,
   *   whether the content looks like a test file and whether any block
   *   carries a docstring.
   */
  private extractMetadata(content: string, blocks: CodeBlock[], language: string): ExtractionResult['metadata'] {
    const countOfType = (type: string): number => blocks.filter((b) => b.type === type).length

    return {
      source: 'code',
      language,
      mimeType: this.getMimeType(language),
      wordCount: content.split(/\s+/).filter((w) => w.length > 0).length,
      charCount: content.length,
      lineCount: content.split('\n').length,
      functionCount: countOfType('function'),
      classCount: countOfType('class'),
      interfaceCount: countOfType('interface'),
      importCount: countOfType('import'),
      isTestFile: CodeExtractor.TEST_MARKERS.some((marker) => marker.test(content)),
      hasDocstrings: blocks.some((b) => Boolean(b.docstring)),
    }
  }

  /**
   * Get MIME type for language - supports TypeScript, JavaScript, Python, and Go.
   * Unknown languages map to `text/plain`.
   */
  private getMimeType(language: string): string {
    return CodeExtractor.MIME_TYPES[language] ?? 'text/plain'
  }

  /**
   * Get supported languages
   */
  getSupportedLanguages(): string[] {
    return Object.keys(this.languages)
  }
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extractors barrel export
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export { TextExtractor } from './text.extractor.js'
|
|
6
|
+
export { UrlExtractor } from './url.extractor.js'
|
|
7
|
+
export { PdfExtractor } from './pdf.extractor.js'
|
|
8
|
+
export { MarkdownExtractor, type MarkdownSection } from './markdown.extractor.js'
|
|
9
|
+
export { CodeExtractor, type CodeBlock } from './code.extractor.js'
|