@twelvehart/supermemory-runtime 1.0.0-next.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +57 -0
- package/README.md +374 -0
- package/dist/index.js +189 -0
- package/dist/mcp/index.js +1132 -0
- package/docker-compose.prod.yml +91 -0
- package/docker-compose.yml +358 -0
- package/drizzle/0000_dapper_the_professor.sql +159 -0
- package/drizzle/0001_api_keys.sql +51 -0
- package/drizzle/meta/0000_snapshot.json +1532 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +20 -0
- package/package.json +114 -0
- package/scripts/add-extraction-job.ts +122 -0
- package/scripts/benchmark-pgvector.ts +122 -0
- package/scripts/bootstrap.sh +209 -0
- package/scripts/check-runtime-pack.ts +111 -0
- package/scripts/claude-mcp-config.ts +336 -0
- package/scripts/docker-entrypoint.sh +183 -0
- package/scripts/doctor.ts +377 -0
- package/scripts/init-db.sql +33 -0
- package/scripts/install.sh +1110 -0
- package/scripts/mcp-setup.ts +271 -0
- package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
- package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
- package/scripts/migrations/003_create_hnsw_index.sql +94 -0
- package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
- package/scripts/migrations/005_create_chunks_table.sql +95 -0
- package/scripts/migrations/006_create_processing_queue.sql +45 -0
- package/scripts/migrations/generate_test_data.sql +42 -0
- package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
- package/scripts/migrations/run_migrations.sh +286 -0
- package/scripts/migrations/test_hnsw_index.sql +255 -0
- package/scripts/pre-commit-secrets +282 -0
- package/scripts/run-extraction-worker.ts +46 -0
- package/scripts/run-phase1-tests.sh +291 -0
- package/scripts/setup.ts +222 -0
- package/scripts/smoke-install.sh +12 -0
- package/scripts/test-health-endpoint.sh +328 -0
- package/src/api/index.ts +2 -0
- package/src/api/middleware/auth.ts +80 -0
- package/src/api/middleware/csrf.ts +308 -0
- package/src/api/middleware/errorHandler.ts +166 -0
- package/src/api/middleware/rateLimit.ts +360 -0
- package/src/api/middleware/validation.ts +514 -0
- package/src/api/routes/documents.ts +286 -0
- package/src/api/routes/profiles.ts +237 -0
- package/src/api/routes/search.ts +71 -0
- package/src/api/stores/index.ts +58 -0
- package/src/config/bootstrap-env.ts +3 -0
- package/src/config/env.ts +71 -0
- package/src/config/feature-flags.ts +25 -0
- package/src/config/index.ts +140 -0
- package/src/config/secrets.config.ts +291 -0
- package/src/db/client.ts +92 -0
- package/src/db/index.ts +73 -0
- package/src/db/postgres.ts +72 -0
- package/src/db/schema/chunks.schema.ts +31 -0
- package/src/db/schema/containers.schema.ts +46 -0
- package/src/db/schema/documents.schema.ts +49 -0
- package/src/db/schema/embeddings.schema.ts +32 -0
- package/src/db/schema/index.ts +11 -0
- package/src/db/schema/memories.schema.ts +72 -0
- package/src/db/schema/profiles.schema.ts +34 -0
- package/src/db/schema/queue.schema.ts +59 -0
- package/src/db/schema/relationships.schema.ts +42 -0
- package/src/db/schema.ts +223 -0
- package/src/db/worker-connection.ts +47 -0
- package/src/index.ts +235 -0
- package/src/mcp/CLAUDE.md +1 -0
- package/src/mcp/index.ts +1380 -0
- package/src/mcp/legacyState.ts +22 -0
- package/src/mcp/rateLimit.ts +358 -0
- package/src/mcp/resources.ts +309 -0
- package/src/mcp/results.ts +104 -0
- package/src/mcp/tools.ts +401 -0
- package/src/queues/config.ts +119 -0
- package/src/queues/index.ts +289 -0
- package/src/sdk/client.ts +225 -0
- package/src/sdk/errors.ts +266 -0
- package/src/sdk/http.ts +560 -0
- package/src/sdk/index.ts +244 -0
- package/src/sdk/resources/base.ts +65 -0
- package/src/sdk/resources/connections.ts +204 -0
- package/src/sdk/resources/documents.ts +163 -0
- package/src/sdk/resources/index.ts +10 -0
- package/src/sdk/resources/memories.ts +150 -0
- package/src/sdk/resources/search.ts +60 -0
- package/src/sdk/resources/settings.ts +36 -0
- package/src/sdk/types.ts +674 -0
- package/src/services/chunking/index.ts +451 -0
- package/src/services/chunking.service.ts +650 -0
- package/src/services/csrf.service.ts +252 -0
- package/src/services/documents.repository.ts +219 -0
- package/src/services/documents.service.ts +191 -0
- package/src/services/embedding.service.ts +404 -0
- package/src/services/extraction.service.ts +300 -0
- package/src/services/extractors/code.extractor.ts +451 -0
- package/src/services/extractors/index.ts +9 -0
- package/src/services/extractors/markdown.extractor.ts +461 -0
- package/src/services/extractors/pdf.extractor.ts +315 -0
- package/src/services/extractors/text.extractor.ts +118 -0
- package/src/services/extractors/url.extractor.ts +243 -0
- package/src/services/index.ts +235 -0
- package/src/services/ingestion.service.ts +177 -0
- package/src/services/llm/anthropic.ts +400 -0
- package/src/services/llm/base.ts +460 -0
- package/src/services/llm/contradiction-detector.service.ts +526 -0
- package/src/services/llm/heuristics.ts +148 -0
- package/src/services/llm/index.ts +309 -0
- package/src/services/llm/memory-classifier.service.ts +383 -0
- package/src/services/llm/memory-extension-detector.service.ts +523 -0
- package/src/services/llm/mock.ts +470 -0
- package/src/services/llm/openai.ts +398 -0
- package/src/services/llm/prompts.ts +438 -0
- package/src/services/llm/types.ts +373 -0
- package/src/services/memory.repository.ts +1769 -0
- package/src/services/memory.service.ts +1338 -0
- package/src/services/memory.types.ts +234 -0
- package/src/services/persistence/index.ts +295 -0
- package/src/services/pipeline.service.ts +509 -0
- package/src/services/profile.repository.ts +436 -0
- package/src/services/profile.service.ts +560 -0
- package/src/services/profile.types.ts +270 -0
- package/src/services/relationships/detector.ts +1128 -0
- package/src/services/relationships/index.ts +268 -0
- package/src/services/relationships/memory-integration.ts +459 -0
- package/src/services/relationships/strategies.ts +132 -0
- package/src/services/relationships/types.ts +370 -0
- package/src/services/search.service.ts +761 -0
- package/src/services/search.types.ts +220 -0
- package/src/services/secrets.service.ts +384 -0
- package/src/services/vectorstore/base.ts +327 -0
- package/src/services/vectorstore/index.ts +444 -0
- package/src/services/vectorstore/memory.ts +286 -0
- package/src/services/vectorstore/migration.ts +295 -0
- package/src/services/vectorstore/mock.ts +403 -0
- package/src/services/vectorstore/pgvector.ts +695 -0
- package/src/services/vectorstore/types.ts +247 -0
- package/src/startup.ts +389 -0
- package/src/types/api.types.ts +193 -0
- package/src/types/document.types.ts +103 -0
- package/src/types/index.ts +241 -0
- package/src/types/profile.base.ts +133 -0
- package/src/utils/errors.ts +447 -0
- package/src/utils/id.ts +15 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +313 -0
- package/src/utils/sanitization.ts +501 -0
- package/src/utils/secret-validation.ts +273 -0
- package/src/utils/synonyms.ts +188 -0
- package/src/utils/validation.ts +581 -0
- package/src/workers/chunking.worker.ts +242 -0
- package/src/workers/embedding.worker.ts +358 -0
- package/src/workers/extraction.worker.ts +346 -0
- package/src/workers/indexing.worker.ts +505 -0
- package/tsconfig.json +38 -0
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF extractor - extracts text content from PDF files
|
|
3
|
+
* Uses pdf-parse library for extraction
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
|
|
7
|
+
import { DependencyError } from '../../utils/errors.js'
|
|
8
|
+
|
|
9
|
+
// pdf-parse types (library doesn't have proper types)
|
|
10
|
+
interface PdfData {
|
|
11
|
+
numpages: number
|
|
12
|
+
numrender: number
|
|
13
|
+
info: {
|
|
14
|
+
Title?: string
|
|
15
|
+
Author?: string
|
|
16
|
+
Subject?: string
|
|
17
|
+
Keywords?: string
|
|
18
|
+
Creator?: string
|
|
19
|
+
Producer?: string
|
|
20
|
+
CreationDate?: string
|
|
21
|
+
ModDate?: string
|
|
22
|
+
}
|
|
23
|
+
metadata: unknown
|
|
24
|
+
text: string
|
|
25
|
+
version: string
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
// Type for text content item from pdf.js
|
|
29
|
+
interface TextItem {
|
|
30
|
+
str: string
|
|
31
|
+
dir?: string
|
|
32
|
+
width?: number
|
|
33
|
+
height?: number
|
|
34
|
+
transform?: number[]
|
|
35
|
+
fontName?: string
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// Type for text content from getTextContent()
|
|
39
|
+
interface TextContent {
|
|
40
|
+
items: TextItem[]
|
|
41
|
+
styles?: Record<string, unknown>
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Type for page data from pdf-parse pagerender callback
|
|
45
|
+
interface PageData {
|
|
46
|
+
getTextContent(): Promise<TextContent>
|
|
47
|
+
pageNumber?: number
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
interface PdfParseOptions {
|
|
51
|
+
pagerender?: (pageData: unknown) => string
|
|
52
|
+
max?: number
|
|
53
|
+
version?: string
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
type PdfParseFunction = (buffer: Buffer, options?: PdfParseOptions) => Promise<PdfData>
|
|
57
|
+
|
|
58
|
+
// Dynamic import for pdf-parse
|
|
59
|
+
let pdfParse: PdfParseFunction | null = null
|
|
60
|
+
|
|
61
|
+
async function loadPdfParse(): Promise<PdfParseFunction> {
|
|
62
|
+
if (!pdfParse) {
|
|
63
|
+
try {
|
|
64
|
+
const module = await import('pdf-parse')
|
|
65
|
+
pdfParse = module.default as PdfParseFunction
|
|
66
|
+
} catch {
|
|
67
|
+
throw new DependencyError('pdf-parse', 'npm install pdf-parse')
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
return pdfParse
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
export class PdfExtractor implements ExtractorInterface {
|
|
74
|
+
/**
|
|
75
|
+
* Check if content is a PDF buffer or base64 encoded PDF
|
|
76
|
+
*/
|
|
77
|
+
canHandle(content: string | Buffer): boolean {
|
|
78
|
+
if (Buffer.isBuffer(content)) {
|
|
79
|
+
// Check PDF magic bytes: %PDF
|
|
80
|
+
return content.slice(0, 4).toString() === '%PDF'
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
if (typeof content === 'string') {
|
|
84
|
+
// Check if it's base64 encoded PDF
|
|
85
|
+
if (content.startsWith('data:application/pdf;base64,')) {
|
|
86
|
+
return true
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Check if it starts with PDF magic bytes
|
|
90
|
+
if (content.startsWith('%PDF')) {
|
|
91
|
+
return true
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// Try to detect base64 PDF without data URI prefix
|
|
95
|
+
try {
|
|
96
|
+
const decoded = Buffer.from(content.slice(0, 100), 'base64').toString()
|
|
97
|
+
return decoded.startsWith('%PDF')
|
|
98
|
+
} catch {
|
|
99
|
+
return false
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
return false
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Extract text content from PDF
|
|
108
|
+
*/
|
|
109
|
+
async extract(content: string | Buffer, options?: Record<string, unknown>): Promise<ExtractionResult> {
|
|
110
|
+
const parse = await loadPdfParse()
|
|
111
|
+
|
|
112
|
+
const buffer = this.toBuffer(content)
|
|
113
|
+
const pdfData = await parse(buffer, {
|
|
114
|
+
max: options?.maxPages as number | undefined,
|
|
115
|
+
})
|
|
116
|
+
|
|
117
|
+
const cleanedText = this.cleanPdfText(pdfData.text)
|
|
118
|
+
const metadata = this.extractMetadata(pdfData, cleanedText)
|
|
119
|
+
|
|
120
|
+
return {
|
|
121
|
+
content: cleanedText,
|
|
122
|
+
contentType: 'pdf' as ContentType,
|
|
123
|
+
metadata,
|
|
124
|
+
rawContent: pdfData.text,
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Convert input to Buffer
|
|
130
|
+
*/
|
|
131
|
+
private toBuffer(content: string | Buffer): Buffer {
|
|
132
|
+
if (Buffer.isBuffer(content)) {
|
|
133
|
+
return content
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// Handle data URI
|
|
137
|
+
if (content.startsWith('data:application/pdf;base64,')) {
|
|
138
|
+
const base64Data = content.replace('data:application/pdf;base64,', '')
|
|
139
|
+
return Buffer.from(base64Data, 'base64')
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Try base64 decode
|
|
143
|
+
try {
|
|
144
|
+
const buffer = Buffer.from(content, 'base64')
|
|
145
|
+
if (buffer.slice(0, 4).toString() === '%PDF') {
|
|
146
|
+
return buffer
|
|
147
|
+
}
|
|
148
|
+
} catch {
|
|
149
|
+
// Not base64
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// Assume raw PDF string
|
|
153
|
+
return Buffer.from(content, 'binary')
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Clean extracted PDF text
|
|
158
|
+
*/
|
|
159
|
+
private cleanPdfText(text: string): string {
|
|
160
|
+
return (
|
|
161
|
+
text
|
|
162
|
+
// Fix common PDF extraction artifacts
|
|
163
|
+
.replace(/\f/g, '\n\n') // Form feeds to paragraph breaks
|
|
164
|
+
.replace(/\r\n/g, '\n')
|
|
165
|
+
.replace(/\r/g, '\n')
|
|
166
|
+
// Remove excessive whitespace
|
|
167
|
+
.replace(/[ \t]+/g, ' ')
|
|
168
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
169
|
+
// Fix hyphenation at line breaks
|
|
170
|
+
.replace(/(\w)-\n(\w)/g, '$1$2')
|
|
171
|
+
// Remove page numbers (common patterns)
|
|
172
|
+
.replace(/^\s*\d+\s*$/gm, '')
|
|
173
|
+
.replace(/\n\s*Page\s+\d+\s*\n/gi, '\n')
|
|
174
|
+
// Trim lines
|
|
175
|
+
.split('\n')
|
|
176
|
+
.map((line) => line.trim())
|
|
177
|
+
.join('\n')
|
|
178
|
+
.trim()
|
|
179
|
+
)
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Extract metadata from PDF data
|
|
184
|
+
*/
|
|
185
|
+
private extractMetadata(pdfData: PdfData, cleanedText: string): ExtractionResult['metadata'] {
|
|
186
|
+
const words = cleanedText.split(/\s+/).filter((w) => w.length > 0)
|
|
187
|
+
const info = pdfData.info ?? {}
|
|
188
|
+
|
|
189
|
+
// Parse creation date if available
|
|
190
|
+
let createdAt: string | undefined
|
|
191
|
+
if (info.CreationDate) {
|
|
192
|
+
createdAt = this.parsePdfDate(info.CreationDate)
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Parse keywords into tags
|
|
196
|
+
let tags: string[] | undefined
|
|
197
|
+
if (info.Keywords) {
|
|
198
|
+
tags = info.Keywords.split(/[,;]/)
|
|
199
|
+
.map((k) => k.trim())
|
|
200
|
+
.filter((k) => k.length > 0)
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
return {
|
|
204
|
+
title: info.Title,
|
|
205
|
+
author: info.Author,
|
|
206
|
+
description: info.Subject,
|
|
207
|
+
tags,
|
|
208
|
+
source: 'pdf',
|
|
209
|
+
mimeType: 'application/pdf',
|
|
210
|
+
wordCount: words.length,
|
|
211
|
+
charCount: cleanedText.length,
|
|
212
|
+
pageCount: pdfData.numpages,
|
|
213
|
+
pdfVersion: pdfData.version,
|
|
214
|
+
creator: info.Creator,
|
|
215
|
+
producer: info.Producer,
|
|
216
|
+
createdAt,
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Parse PDF date format (D:YYYYMMDDHHmmSS)
|
|
222
|
+
*/
|
|
223
|
+
private parsePdfDate(dateStr: string): string | undefined {
|
|
224
|
+
try {
|
|
225
|
+
// Remove D: prefix if present
|
|
226
|
+
const clean = dateStr.replace(/^D:/, '')
|
|
227
|
+
|
|
228
|
+
// Extract date components
|
|
229
|
+
const year = clean.slice(0, 4)
|
|
230
|
+
const month = clean.slice(4, 6) || '01'
|
|
231
|
+
const day = clean.slice(6, 8) || '01'
|
|
232
|
+
const hour = clean.slice(8, 10) || '00'
|
|
233
|
+
const minute = clean.slice(10, 12) || '00'
|
|
234
|
+
const second = clean.slice(12, 14) || '00'
|
|
235
|
+
|
|
236
|
+
const date = new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}Z`)
|
|
237
|
+
|
|
238
|
+
return date.toISOString()
|
|
239
|
+
} catch {
|
|
240
|
+
return undefined
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Extract text from specific pages using pageData.getTextContent()
|
|
246
|
+
*/
|
|
247
|
+
async extractPages(content: string | Buffer, startPage: number, endPage?: number): Promise<string[]> {
|
|
248
|
+
const parse = await loadPdfParse()
|
|
249
|
+
|
|
250
|
+
const buffer = this.toBuffer(content)
|
|
251
|
+
const pages: string[] = []
|
|
252
|
+
let currentPage = 0
|
|
253
|
+
|
|
254
|
+
// Custom page render function that extracts actual text content
|
|
255
|
+
const pageRender = async (pageData: PageData): Promise<string> => {
|
|
256
|
+
currentPage++
|
|
257
|
+
|
|
258
|
+
// Skip pages outside the requested range
|
|
259
|
+
if (currentPage < startPage || (endPage && currentPage > endPage)) {
|
|
260
|
+
return ''
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
try {
|
|
264
|
+
// Use getTextContent to extract actual text from the page
|
|
265
|
+
const textContent = await pageData.getTextContent()
|
|
266
|
+
|
|
267
|
+
if (!textContent || !textContent.items) {
|
|
268
|
+
pages.push('')
|
|
269
|
+
return ''
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
// Combine all text items into a single string
|
|
273
|
+
const pageText = textContent.items
|
|
274
|
+
.map((item: TextItem) => {
|
|
275
|
+
// Handle text items - they have a 'str' property
|
|
276
|
+
if ('str' in item && typeof item.str === 'string') {
|
|
277
|
+
return item.str
|
|
278
|
+
}
|
|
279
|
+
return ''
|
|
280
|
+
})
|
|
281
|
+
.join('')
|
|
282
|
+
.trim()
|
|
283
|
+
|
|
284
|
+
pages.push(this.cleanPdfText(pageText))
|
|
285
|
+
return pageText
|
|
286
|
+
} catch (error) {
|
|
287
|
+
// If getTextContent fails, add empty string for this page
|
|
288
|
+
console.warn(`Failed to extract text from page ${currentPage}:`, error)
|
|
289
|
+
pages.push('')
|
|
290
|
+
return ''
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
await parse(buffer, {
|
|
295
|
+
pagerender: pageRender as unknown as (pageData: unknown) => string,
|
|
296
|
+
})
|
|
297
|
+
|
|
298
|
+
return pages
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
/**
|
|
302
|
+
* Extract text from all pages with page boundaries preserved
|
|
303
|
+
*/
|
|
304
|
+
async extractAllPages(content: string | Buffer): Promise<string[]> {
|
|
305
|
+
const parse = await loadPdfParse()
|
|
306
|
+
const buffer = this.toBuffer(content)
|
|
307
|
+
|
|
308
|
+
// First pass to get page count
|
|
309
|
+
const pdfData = await parse(buffer)
|
|
310
|
+
const totalPages = pdfData.numpages
|
|
311
|
+
|
|
312
|
+
// Extract each page
|
|
313
|
+
return this.extractPages(content, 1, totalPages)
|
|
314
|
+
}
|
|
315
|
+
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plain text extractor - handles raw text content
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
|
|
6
|
+
|
|
7
|
+
export class TextExtractor implements ExtractorInterface {
|
|
8
|
+
/**
|
|
9
|
+
* Check if this extractor can handle the content
|
|
10
|
+
*/
|
|
11
|
+
canHandle(content: string): boolean {
|
|
12
|
+
// Text extractor is the fallback - it can handle anything
|
|
13
|
+
return typeof content === 'string' && content.length > 0
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Extract text content with basic cleaning and metadata
|
|
18
|
+
*/
|
|
19
|
+
async extract(content: string, options?: Record<string, unknown>): Promise<ExtractionResult> {
|
|
20
|
+
const cleanedContent = this.cleanText(content)
|
|
21
|
+
const metadata = this.extractMetadata(cleanedContent, options)
|
|
22
|
+
|
|
23
|
+
return {
|
|
24
|
+
content: cleanedContent,
|
|
25
|
+
contentType: 'text' as ContentType,
|
|
26
|
+
metadata,
|
|
27
|
+
rawContent: content,
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Clean and normalize text content
|
|
33
|
+
*/
|
|
34
|
+
private cleanText(text: string): string {
|
|
35
|
+
return (
|
|
36
|
+
text
|
|
37
|
+
// Normalize line endings
|
|
38
|
+
.replace(/\r\n/g, '\n')
|
|
39
|
+
.replace(/\r/g, '\n')
|
|
40
|
+
// Remove excessive whitespace while preserving paragraph breaks
|
|
41
|
+
.replace(/[ \t]+/g, ' ')
|
|
42
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
43
|
+
// Trim each line
|
|
44
|
+
.split('\n')
|
|
45
|
+
.map((line) => line.trim())
|
|
46
|
+
.join('\n')
|
|
47
|
+
// Final trim
|
|
48
|
+
.trim()
|
|
49
|
+
)
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Extract metadata from text content
|
|
54
|
+
*/
|
|
55
|
+
private extractMetadata(content: string, options?: Record<string, unknown>): ExtractionResult['metadata'] {
|
|
56
|
+
const words = content.split(/\s+/).filter((w) => w.length > 0)
|
|
57
|
+
const lines = content.split('\n')
|
|
58
|
+
|
|
59
|
+
// Try to extract title from first line if it looks like a title
|
|
60
|
+
let title: string | undefined
|
|
61
|
+
if (lines.length > 0) {
|
|
62
|
+
const firstLine = lines[0]?.trim() ?? ''
|
|
63
|
+
// Use first line as title if it's non-empty and reasonably short
|
|
64
|
+
if (firstLine.length > 0 && firstLine.length < 200) {
|
|
65
|
+
title = firstLine
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const metadataExtra = (options?.metadata as Record<string, unknown>) ?? {}
|
|
70
|
+
|
|
71
|
+
return {
|
|
72
|
+
title,
|
|
73
|
+
wordCount: words.length,
|
|
74
|
+
charCount: content.length,
|
|
75
|
+
lineCount: lines.length,
|
|
76
|
+
source: 'text',
|
|
77
|
+
...metadataExtra,
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Split text into sentences for more granular processing
|
|
83
|
+
*/
|
|
84
|
+
splitIntoSentences(text: string): string[] {
|
|
85
|
+
// Simple sentence splitting - handles common cases
|
|
86
|
+
const sentenceEnders = /([.!?]+)\s+/g
|
|
87
|
+
const sentences: string[] = []
|
|
88
|
+
let lastIndex = 0
|
|
89
|
+
let match: RegExpExecArray | null
|
|
90
|
+
|
|
91
|
+
while ((match = sentenceEnders.exec(text)) !== null) {
|
|
92
|
+
const matchGroup = match[1] ?? ''
|
|
93
|
+
const sentence = text.slice(lastIndex, match.index + matchGroup.length).trim()
|
|
94
|
+
if (sentence.length > 0) {
|
|
95
|
+
sentences.push(sentence)
|
|
96
|
+
}
|
|
97
|
+
lastIndex = match.index + match[0].length
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// Add remaining text as last sentence
|
|
101
|
+
const remaining = text.slice(lastIndex).trim()
|
|
102
|
+
if (remaining.length > 0) {
|
|
103
|
+
sentences.push(remaining)
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
return sentences
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Split text into paragraphs
|
|
111
|
+
*/
|
|
112
|
+
splitIntoParagraphs(text: string): string[] {
|
|
113
|
+
return text
|
|
114
|
+
.split(/\n\n+/)
|
|
115
|
+
.map((p) => p.trim())
|
|
116
|
+
.filter((p) => p.length > 0)
|
|
117
|
+
}
|
|
118
|
+
}
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL extractor - fetches and cleans web page content
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
|
|
6
|
+
import { ExternalServiceError } from '../../utils/errors.js'
|
|
7
|
+
|
|
8
|
+
// Options controlling how a URL is fetched
interface FetchOptions {
  // Request timeout in milliseconds (falls back to the extractor's default)
  timeout?: number
  // Overrides the default User-Agent header
  userAgent?: string
  // When false, redirects are not followed (fetch redirect mode 'manual')
  followRedirects?: boolean
}
|
|
13
|
+
|
|
14
|
+
export class UrlExtractor implements ExtractorInterface {
|
|
15
|
+
private readonly defaultTimeout = 30000
|
|
16
|
+
private readonly defaultUserAgent = 'Mozilla/5.0 (compatible; SupermemoryBot/1.0)'
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Check if content is a valid URL
|
|
20
|
+
*/
|
|
21
|
+
canHandle(content: string): boolean {
|
|
22
|
+
try {
|
|
23
|
+
const trimmed = content.trim()
|
|
24
|
+
const url = new URL(trimmed)
|
|
25
|
+
return url.protocol === 'http:' || url.protocol === 'https:'
|
|
26
|
+
} catch {
|
|
27
|
+
return false
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Fetch URL and extract clean content
|
|
33
|
+
*/
|
|
34
|
+
async extract(url: string, options?: FetchOptions & Record<string, unknown>): Promise<ExtractionResult> {
|
|
35
|
+
const trimmedUrl = url.trim()
|
|
36
|
+
const html = await this.fetchUrl(trimmedUrl, options)
|
|
37
|
+
const { content, metadata } = this.parseHtml(html, trimmedUrl)
|
|
38
|
+
|
|
39
|
+
return {
|
|
40
|
+
content,
|
|
41
|
+
contentType: 'url' as ContentType,
|
|
42
|
+
metadata: {
|
|
43
|
+
...metadata,
|
|
44
|
+
sourceUrl: trimmedUrl,
|
|
45
|
+
source: 'web',
|
|
46
|
+
},
|
|
47
|
+
rawContent: html,
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Fetch URL content
|
|
53
|
+
*/
|
|
54
|
+
private async fetchUrl(url: string, options?: FetchOptions): Promise<string> {
|
|
55
|
+
const timeout = options?.timeout ?? this.defaultTimeout
|
|
56
|
+
const controller = new AbortController()
|
|
57
|
+
const timeoutId = setTimeout(() => controller.abort(), timeout)
|
|
58
|
+
|
|
59
|
+
try {
|
|
60
|
+
const response = await fetch(url, {
|
|
61
|
+
headers: {
|
|
62
|
+
'User-Agent': options?.userAgent ?? this.defaultUserAgent,
|
|
63
|
+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
64
|
+
'Accept-Language': 'en-US,en;q=0.5',
|
|
65
|
+
},
|
|
66
|
+
redirect: options?.followRedirects !== false ? 'follow' : 'manual',
|
|
67
|
+
signal: controller.signal,
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
if (!response.ok) {
|
|
71
|
+
throw new ExternalServiceError('HTTP', `HTTP ${response.status}: ${response.statusText}`, response.status, {
|
|
72
|
+
url,
|
|
73
|
+
})
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
return await response.text()
|
|
77
|
+
} finally {
|
|
78
|
+
clearTimeout(timeoutId)
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Parse HTML and extract clean text content
|
|
84
|
+
*/
|
|
85
|
+
private parseHtml(html: string, url: string): { content: string; metadata: ExtractionResult['metadata'] } {
|
|
86
|
+
// Extract title
|
|
87
|
+
const titleMatch = html.match(/<title[^>]*>([^<]*)<\/title>/i)
|
|
88
|
+
const title = titleMatch?.[1] ? this.decodeHtmlEntities(titleMatch[1].trim()) : undefined
|
|
89
|
+
|
|
90
|
+
// Extract meta description (handle both attribute orders)
|
|
91
|
+
const descMatch =
|
|
92
|
+
html.match(/<meta[^>]*name=["']description["'][^>]*content=["']([^"']*)["']/i) ??
|
|
93
|
+
html.match(/<meta[^>]*content=["']([^"']*)["'][^>]*name=["']description["']/i)
|
|
94
|
+
const description = descMatch?.[1] ? this.decodeHtmlEntities(descMatch[1].trim()) : undefined
|
|
95
|
+
|
|
96
|
+
// Extract author (handle both attribute orders)
|
|
97
|
+
const authorMatch =
|
|
98
|
+
html.match(/<meta[^>]*name=["']author["'][^>]*content=["']([^"']*)["']/i) ??
|
|
99
|
+
html.match(/<meta[^>]*content=["']([^"']*)["'][^>]*name=["']author["']/i)
|
|
100
|
+
const author = authorMatch?.[1] ? this.decodeHtmlEntities(authorMatch[1].trim()) : undefined
|
|
101
|
+
|
|
102
|
+
// Extract og:tags for additional metadata
|
|
103
|
+
const ogTags = this.extractOpenGraphTags(html)
|
|
104
|
+
|
|
105
|
+
// Clean HTML to get text content
|
|
106
|
+
const content = this.htmlToText(html)
|
|
107
|
+
const words = content.split(/\s+/).filter((w) => w.length > 0)
|
|
108
|
+
|
|
109
|
+
let domain: string | undefined
|
|
110
|
+
try {
|
|
111
|
+
domain = new URL(url).hostname
|
|
112
|
+
} catch {
|
|
113
|
+
// URL parsing failed, leave domain undefined
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
return {
|
|
117
|
+
content,
|
|
118
|
+
metadata: {
|
|
119
|
+
title: title ?? ogTags['title'],
|
|
120
|
+
description: description ?? ogTags['description'],
|
|
121
|
+
author,
|
|
122
|
+
wordCount: words.length,
|
|
123
|
+
charCount: content.length,
|
|
124
|
+
mimeType: 'text/html',
|
|
125
|
+
ogImage: ogTags['image'],
|
|
126
|
+
ogType: ogTags['type'],
|
|
127
|
+
domain,
|
|
128
|
+
},
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
/**
|
|
133
|
+
* Extract OpenGraph meta tags
|
|
134
|
+
*/
|
|
135
|
+
private extractOpenGraphTags(html: string): Record<string, string | undefined> {
|
|
136
|
+
const tags: Record<string, string | undefined> = {}
|
|
137
|
+
const ogPattern = /<meta[^>]*property=["']og:([^"']*)["'][^>]*content=["']([^"']*)["']/gi
|
|
138
|
+
let match: RegExpExecArray | null
|
|
139
|
+
|
|
140
|
+
while ((match = ogPattern.exec(html)) !== null) {
|
|
141
|
+
const key = match[1]
|
|
142
|
+
const value = match[2]
|
|
143
|
+
if (key && value) {
|
|
144
|
+
tags[key] = this.decodeHtmlEntities(value)
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
return tags
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Convert HTML to clean text
|
|
153
|
+
*/
|
|
154
|
+
private htmlToText(html: string): string {
|
|
155
|
+
let text = html
|
|
156
|
+
|
|
157
|
+
// Remove script and style content
|
|
158
|
+
text = text.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
|
|
159
|
+
text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
|
|
160
|
+
text = text.replace(/<noscript[^>]*>[\s\S]*?<\/noscript>/gi, '')
|
|
161
|
+
|
|
162
|
+
// Remove comments
|
|
163
|
+
text = text.replace(/<!--[\s\S]*?-->/g, '')
|
|
164
|
+
|
|
165
|
+
// Remove header, footer, nav, aside (common non-content areas)
|
|
166
|
+
text = text.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
|
|
167
|
+
text = text.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
|
|
168
|
+
text = text.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
|
|
169
|
+
text = text.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
|
|
170
|
+
|
|
171
|
+
// Convert block elements to newlines
|
|
172
|
+
text = text.replace(/<\/(p|div|h[1-6]|li|tr|br|hr)[^>]*>/gi, '\n')
|
|
173
|
+
text = text.replace(/<(br|hr)[^>]*\/?>/gi, '\n')
|
|
174
|
+
|
|
175
|
+
// Remove all remaining HTML tags
|
|
176
|
+
text = text.replace(/<[^>]+>/g, ' ')
|
|
177
|
+
|
|
178
|
+
// Decode HTML entities
|
|
179
|
+
text = this.decodeHtmlEntities(text)
|
|
180
|
+
|
|
181
|
+
// Clean up whitespace
|
|
182
|
+
text = text
|
|
183
|
+
.replace(/[ \t]+/g, ' ')
|
|
184
|
+
.replace(/\n[ \t]+/g, '\n')
|
|
185
|
+
.replace(/[ \t]+\n/g, '\n')
|
|
186
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
187
|
+
.trim()
|
|
188
|
+
|
|
189
|
+
return text
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/**
|
|
193
|
+
* Decode common HTML entities
|
|
194
|
+
*/
|
|
195
|
+
private decodeHtmlEntities(text: string): string {
|
|
196
|
+
const entities: Record<string, string> = {
|
|
197
|
+
'&': '&',
|
|
198
|
+
'<': '<',
|
|
199
|
+
'>': '>',
|
|
200
|
+
'"': '"',
|
|
201
|
+
''': "'",
|
|
202
|
+
''': "'",
|
|
203
|
+
' ': ' ',
|
|
204
|
+
'—': '--',
|
|
205
|
+
'–': '-',
|
|
206
|
+
'…': '...',
|
|
207
|
+
'©': '(c)',
|
|
208
|
+
'®': '(R)',
|
|
209
|
+
'™': '(TM)',
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
let result = text
|
|
213
|
+
for (const [entity, char] of Object.entries(entities)) {
|
|
214
|
+
result = result.replace(new RegExp(entity, 'gi'), char)
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// Handle numeric entities
|
|
218
|
+
result = result.replace(/&#(\d+);/g, (_, code: string) => String.fromCharCode(parseInt(code, 10)))
|
|
219
|
+
result = result.replace(/&#x([a-fA-F0-9]+);/g, (_, code: string) => String.fromCharCode(parseInt(code, 16)))
|
|
220
|
+
|
|
221
|
+
return result
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/**
|
|
225
|
+
* Check if URL is accessible
|
|
226
|
+
*/
|
|
227
|
+
async isAccessible(url: string): Promise<boolean> {
|
|
228
|
+
try {
|
|
229
|
+
const controller = new AbortController()
|
|
230
|
+
const timeoutId = setTimeout(() => controller.abort(), 5000)
|
|
231
|
+
|
|
232
|
+
const response = await fetch(url, {
|
|
233
|
+
method: 'HEAD',
|
|
234
|
+
signal: controller.signal,
|
|
235
|
+
})
|
|
236
|
+
|
|
237
|
+
clearTimeout(timeoutId)
|
|
238
|
+
return response.ok
|
|
239
|
+
} catch {
|
|
240
|
+
return false
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
}
|