@twelvehart/supermemory-runtime 1.0.0-next.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. package/.env.example +57 -0
  2. package/README.md +374 -0
  3. package/dist/index.js +189 -0
  4. package/dist/mcp/index.js +1132 -0
  5. package/docker-compose.prod.yml +91 -0
  6. package/docker-compose.yml +358 -0
  7. package/drizzle/0000_dapper_the_professor.sql +159 -0
  8. package/drizzle/0001_api_keys.sql +51 -0
  9. package/drizzle/meta/0000_snapshot.json +1532 -0
  10. package/drizzle/meta/_journal.json +13 -0
  11. package/drizzle.config.ts +20 -0
  12. package/package.json +114 -0
  13. package/scripts/add-extraction-job.ts +122 -0
  14. package/scripts/benchmark-pgvector.ts +122 -0
  15. package/scripts/bootstrap.sh +209 -0
  16. package/scripts/check-runtime-pack.ts +111 -0
  17. package/scripts/claude-mcp-config.ts +336 -0
  18. package/scripts/docker-entrypoint.sh +183 -0
  19. package/scripts/doctor.ts +377 -0
  20. package/scripts/init-db.sql +33 -0
  21. package/scripts/install.sh +1110 -0
  22. package/scripts/mcp-setup.ts +271 -0
  23. package/scripts/migrations/001_create_pgvector_extension.sql +31 -0
  24. package/scripts/migrations/002_create_memory_embeddings_table.sql +75 -0
  25. package/scripts/migrations/003_create_hnsw_index.sql +94 -0
  26. package/scripts/migrations/004_create_memory_embeddings_standalone.sql +70 -0
  27. package/scripts/migrations/005_create_chunks_table.sql +95 -0
  28. package/scripts/migrations/006_create_processing_queue.sql +45 -0
  29. package/scripts/migrations/generate_test_data.sql +42 -0
  30. package/scripts/migrations/phase1_comprehensive_test.sql +204 -0
  31. package/scripts/migrations/run_migrations.sh +286 -0
  32. package/scripts/migrations/test_hnsw_index.sql +255 -0
  33. package/scripts/pre-commit-secrets +282 -0
  34. package/scripts/run-extraction-worker.ts +46 -0
  35. package/scripts/run-phase1-tests.sh +291 -0
  36. package/scripts/setup.ts +222 -0
  37. package/scripts/smoke-install.sh +12 -0
  38. package/scripts/test-health-endpoint.sh +328 -0
  39. package/src/api/index.ts +2 -0
  40. package/src/api/middleware/auth.ts +80 -0
  41. package/src/api/middleware/csrf.ts +308 -0
  42. package/src/api/middleware/errorHandler.ts +166 -0
  43. package/src/api/middleware/rateLimit.ts +360 -0
  44. package/src/api/middleware/validation.ts +514 -0
  45. package/src/api/routes/documents.ts +286 -0
  46. package/src/api/routes/profiles.ts +237 -0
  47. package/src/api/routes/search.ts +71 -0
  48. package/src/api/stores/index.ts +58 -0
  49. package/src/config/bootstrap-env.ts +3 -0
  50. package/src/config/env.ts +71 -0
  51. package/src/config/feature-flags.ts +25 -0
  52. package/src/config/index.ts +140 -0
  53. package/src/config/secrets.config.ts +291 -0
  54. package/src/db/client.ts +92 -0
  55. package/src/db/index.ts +73 -0
  56. package/src/db/postgres.ts +72 -0
  57. package/src/db/schema/chunks.schema.ts +31 -0
  58. package/src/db/schema/containers.schema.ts +46 -0
  59. package/src/db/schema/documents.schema.ts +49 -0
  60. package/src/db/schema/embeddings.schema.ts +32 -0
  61. package/src/db/schema/index.ts +11 -0
  62. package/src/db/schema/memories.schema.ts +72 -0
  63. package/src/db/schema/profiles.schema.ts +34 -0
  64. package/src/db/schema/queue.schema.ts +59 -0
  65. package/src/db/schema/relationships.schema.ts +42 -0
  66. package/src/db/schema.ts +223 -0
  67. package/src/db/worker-connection.ts +47 -0
  68. package/src/index.ts +235 -0
  69. package/src/mcp/CLAUDE.md +1 -0
  70. package/src/mcp/index.ts +1380 -0
  71. package/src/mcp/legacyState.ts +22 -0
  72. package/src/mcp/rateLimit.ts +358 -0
  73. package/src/mcp/resources.ts +309 -0
  74. package/src/mcp/results.ts +104 -0
  75. package/src/mcp/tools.ts +401 -0
  76. package/src/queues/config.ts +119 -0
  77. package/src/queues/index.ts +289 -0
  78. package/src/sdk/client.ts +225 -0
  79. package/src/sdk/errors.ts +266 -0
  80. package/src/sdk/http.ts +560 -0
  81. package/src/sdk/index.ts +244 -0
  82. package/src/sdk/resources/base.ts +65 -0
  83. package/src/sdk/resources/connections.ts +204 -0
  84. package/src/sdk/resources/documents.ts +163 -0
  85. package/src/sdk/resources/index.ts +10 -0
  86. package/src/sdk/resources/memories.ts +150 -0
  87. package/src/sdk/resources/search.ts +60 -0
  88. package/src/sdk/resources/settings.ts +36 -0
  89. package/src/sdk/types.ts +674 -0
  90. package/src/services/chunking/index.ts +451 -0
  91. package/src/services/chunking.service.ts +650 -0
  92. package/src/services/csrf.service.ts +252 -0
  93. package/src/services/documents.repository.ts +219 -0
  94. package/src/services/documents.service.ts +191 -0
  95. package/src/services/embedding.service.ts +404 -0
  96. package/src/services/extraction.service.ts +300 -0
  97. package/src/services/extractors/code.extractor.ts +451 -0
  98. package/src/services/extractors/index.ts +9 -0
  99. package/src/services/extractors/markdown.extractor.ts +461 -0
  100. package/src/services/extractors/pdf.extractor.ts +315 -0
  101. package/src/services/extractors/text.extractor.ts +118 -0
  102. package/src/services/extractors/url.extractor.ts +243 -0
  103. package/src/services/index.ts +235 -0
  104. package/src/services/ingestion.service.ts +177 -0
  105. package/src/services/llm/anthropic.ts +400 -0
  106. package/src/services/llm/base.ts +460 -0
  107. package/src/services/llm/contradiction-detector.service.ts +526 -0
  108. package/src/services/llm/heuristics.ts +148 -0
  109. package/src/services/llm/index.ts +309 -0
  110. package/src/services/llm/memory-classifier.service.ts +383 -0
  111. package/src/services/llm/memory-extension-detector.service.ts +523 -0
  112. package/src/services/llm/mock.ts +470 -0
  113. package/src/services/llm/openai.ts +398 -0
  114. package/src/services/llm/prompts.ts +438 -0
  115. package/src/services/llm/types.ts +373 -0
  116. package/src/services/memory.repository.ts +1769 -0
  117. package/src/services/memory.service.ts +1338 -0
  118. package/src/services/memory.types.ts +234 -0
  119. package/src/services/persistence/index.ts +295 -0
  120. package/src/services/pipeline.service.ts +509 -0
  121. package/src/services/profile.repository.ts +436 -0
  122. package/src/services/profile.service.ts +560 -0
  123. package/src/services/profile.types.ts +270 -0
  124. package/src/services/relationships/detector.ts +1128 -0
  125. package/src/services/relationships/index.ts +268 -0
  126. package/src/services/relationships/memory-integration.ts +459 -0
  127. package/src/services/relationships/strategies.ts +132 -0
  128. package/src/services/relationships/types.ts +370 -0
  129. package/src/services/search.service.ts +761 -0
  130. package/src/services/search.types.ts +220 -0
  131. package/src/services/secrets.service.ts +384 -0
  132. package/src/services/vectorstore/base.ts +327 -0
  133. package/src/services/vectorstore/index.ts +444 -0
  134. package/src/services/vectorstore/memory.ts +286 -0
  135. package/src/services/vectorstore/migration.ts +295 -0
  136. package/src/services/vectorstore/mock.ts +403 -0
  137. package/src/services/vectorstore/pgvector.ts +695 -0
  138. package/src/services/vectorstore/types.ts +247 -0
  139. package/src/startup.ts +389 -0
  140. package/src/types/api.types.ts +193 -0
  141. package/src/types/document.types.ts +103 -0
  142. package/src/types/index.ts +241 -0
  143. package/src/types/profile.base.ts +133 -0
  144. package/src/utils/errors.ts +447 -0
  145. package/src/utils/id.ts +15 -0
  146. package/src/utils/index.ts +101 -0
  147. package/src/utils/logger.ts +313 -0
  148. package/src/utils/sanitization.ts +501 -0
  149. package/src/utils/secret-validation.ts +273 -0
  150. package/src/utils/synonyms.ts +188 -0
  151. package/src/utils/validation.ts +581 -0
  152. package/src/workers/chunking.worker.ts +242 -0
  153. package/src/workers/embedding.worker.ts +358 -0
  154. package/src/workers/extraction.worker.ts +346 -0
  155. package/src/workers/indexing.worker.ts +505 -0
  156. package/tsconfig.json +38 -0
@@ -0,0 +1,315 @@
1
+ /**
2
+ * PDF extractor - extracts text content from PDF files
3
+ * Uses pdf-parse library for extraction
4
+ */
5
+
6
+ import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
7
+ import { DependencyError } from '../../utils/errors.js'
8
+
9
+ // pdf-parse types (library doesn't have proper types)
10
+ interface PdfData {
11
+ numpages: number
12
+ numrender: number
13
+ info: {
14
+ Title?: string
15
+ Author?: string
16
+ Subject?: string
17
+ Keywords?: string
18
+ Creator?: string
19
+ Producer?: string
20
+ CreationDate?: string
21
+ ModDate?: string
22
+ }
23
+ metadata: unknown
24
+ text: string
25
+ version: string
26
+ }
27
+
28
+ // Type for text content item from pdf.js
29
+ interface TextItem {
30
+ str: string
31
+ dir?: string
32
+ width?: number
33
+ height?: number
34
+ transform?: number[]
35
+ fontName?: string
36
+ }
37
+
38
+ // Type for text content from getTextContent()
39
+ interface TextContent {
40
+ items: TextItem[]
41
+ styles?: Record<string, unknown>
42
+ }
43
+
44
+ // Type for page data from pdf-parse pagerender callback
45
+ interface PageData {
46
+ getTextContent(): Promise<TextContent>
47
+ pageNumber?: number
48
+ }
49
+
50
+ interface PdfParseOptions {
51
+ pagerender?: (pageData: unknown) => string
52
+ max?: number
53
+ version?: string
54
+ }
55
+
56
+ type PdfParseFunction = (buffer: Buffer, options?: PdfParseOptions) => Promise<PdfData>
57
+
58
+ // Dynamic import for pdf-parse
59
+ let pdfParse: PdfParseFunction | null = null
60
+
61
+ async function loadPdfParse(): Promise<PdfParseFunction> {
62
+ if (!pdfParse) {
63
+ try {
64
+ const module = await import('pdf-parse')
65
+ pdfParse = module.default as PdfParseFunction
66
+ } catch {
67
+ throw new DependencyError('pdf-parse', 'npm install pdf-parse')
68
+ }
69
+ }
70
+ return pdfParse
71
+ }
72
+
73
+ export class PdfExtractor implements ExtractorInterface {
74
+ /**
75
+ * Check if content is a PDF buffer or base64 encoded PDF
76
+ */
77
+ canHandle(content: string | Buffer): boolean {
78
+ if (Buffer.isBuffer(content)) {
79
+ // Check PDF magic bytes: %PDF
80
+ return content.slice(0, 4).toString() === '%PDF'
81
+ }
82
+
83
+ if (typeof content === 'string') {
84
+ // Check if it's base64 encoded PDF
85
+ if (content.startsWith('data:application/pdf;base64,')) {
86
+ return true
87
+ }
88
+
89
+ // Check if it starts with PDF magic bytes
90
+ if (content.startsWith('%PDF')) {
91
+ return true
92
+ }
93
+
94
+ // Try to detect base64 PDF without data URI prefix
95
+ try {
96
+ const decoded = Buffer.from(content.slice(0, 100), 'base64').toString()
97
+ return decoded.startsWith('%PDF')
98
+ } catch {
99
+ return false
100
+ }
101
+ }
102
+
103
+ return false
104
+ }
105
+
106
+ /**
107
+ * Extract text content from PDF
108
+ */
109
+ async extract(content: string | Buffer, options?: Record<string, unknown>): Promise<ExtractionResult> {
110
+ const parse = await loadPdfParse()
111
+
112
+ const buffer = this.toBuffer(content)
113
+ const pdfData = await parse(buffer, {
114
+ max: options?.maxPages as number | undefined,
115
+ })
116
+
117
+ const cleanedText = this.cleanPdfText(pdfData.text)
118
+ const metadata = this.extractMetadata(pdfData, cleanedText)
119
+
120
+ return {
121
+ content: cleanedText,
122
+ contentType: 'pdf' as ContentType,
123
+ metadata,
124
+ rawContent: pdfData.text,
125
+ }
126
+ }
127
+
128
+ /**
129
+ * Convert input to Buffer
130
+ */
131
+ private toBuffer(content: string | Buffer): Buffer {
132
+ if (Buffer.isBuffer(content)) {
133
+ return content
134
+ }
135
+
136
+ // Handle data URI
137
+ if (content.startsWith('data:application/pdf;base64,')) {
138
+ const base64Data = content.replace('data:application/pdf;base64,', '')
139
+ return Buffer.from(base64Data, 'base64')
140
+ }
141
+
142
+ // Try base64 decode
143
+ try {
144
+ const buffer = Buffer.from(content, 'base64')
145
+ if (buffer.slice(0, 4).toString() === '%PDF') {
146
+ return buffer
147
+ }
148
+ } catch {
149
+ // Not base64
150
+ }
151
+
152
+ // Assume raw PDF string
153
+ return Buffer.from(content, 'binary')
154
+ }
155
+
156
+ /**
157
+ * Clean extracted PDF text
158
+ */
159
+ private cleanPdfText(text: string): string {
160
+ return (
161
+ text
162
+ // Fix common PDF extraction artifacts
163
+ .replace(/\f/g, '\n\n') // Form feeds to paragraph breaks
164
+ .replace(/\r\n/g, '\n')
165
+ .replace(/\r/g, '\n')
166
+ // Remove excessive whitespace
167
+ .replace(/[ \t]+/g, ' ')
168
+ .replace(/\n{3,}/g, '\n\n')
169
+ // Fix hyphenation at line breaks
170
+ .replace(/(\w)-\n(\w)/g, '$1$2')
171
+ // Remove page numbers (common patterns)
172
+ .replace(/^\s*\d+\s*$/gm, '')
173
+ .replace(/\n\s*Page\s+\d+\s*\n/gi, '\n')
174
+ // Trim lines
175
+ .split('\n')
176
+ .map((line) => line.trim())
177
+ .join('\n')
178
+ .trim()
179
+ )
180
+ }
181
+
182
+ /**
183
+ * Extract metadata from PDF data
184
+ */
185
+ private extractMetadata(pdfData: PdfData, cleanedText: string): ExtractionResult['metadata'] {
186
+ const words = cleanedText.split(/\s+/).filter((w) => w.length > 0)
187
+ const info = pdfData.info ?? {}
188
+
189
+ // Parse creation date if available
190
+ let createdAt: string | undefined
191
+ if (info.CreationDate) {
192
+ createdAt = this.parsePdfDate(info.CreationDate)
193
+ }
194
+
195
+ // Parse keywords into tags
196
+ let tags: string[] | undefined
197
+ if (info.Keywords) {
198
+ tags = info.Keywords.split(/[,;]/)
199
+ .map((k) => k.trim())
200
+ .filter((k) => k.length > 0)
201
+ }
202
+
203
+ return {
204
+ title: info.Title,
205
+ author: info.Author,
206
+ description: info.Subject,
207
+ tags,
208
+ source: 'pdf',
209
+ mimeType: 'application/pdf',
210
+ wordCount: words.length,
211
+ charCount: cleanedText.length,
212
+ pageCount: pdfData.numpages,
213
+ pdfVersion: pdfData.version,
214
+ creator: info.Creator,
215
+ producer: info.Producer,
216
+ createdAt,
217
+ }
218
+ }
219
+
220
+ /**
221
+ * Parse PDF date format (D:YYYYMMDDHHmmSS)
222
+ */
223
+ private parsePdfDate(dateStr: string): string | undefined {
224
+ try {
225
+ // Remove D: prefix if present
226
+ const clean = dateStr.replace(/^D:/, '')
227
+
228
+ // Extract date components
229
+ const year = clean.slice(0, 4)
230
+ const month = clean.slice(4, 6) || '01'
231
+ const day = clean.slice(6, 8) || '01'
232
+ const hour = clean.slice(8, 10) || '00'
233
+ const minute = clean.slice(10, 12) || '00'
234
+ const second = clean.slice(12, 14) || '00'
235
+
236
+ const date = new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}Z`)
237
+
238
+ return date.toISOString()
239
+ } catch {
240
+ return undefined
241
+ }
242
+ }
243
+
244
+ /**
245
+ * Extract text from specific pages using pageData.getTextContent()
246
+ */
247
+ async extractPages(content: string | Buffer, startPage: number, endPage?: number): Promise<string[]> {
248
+ const parse = await loadPdfParse()
249
+
250
+ const buffer = this.toBuffer(content)
251
+ const pages: string[] = []
252
+ let currentPage = 0
253
+
254
+ // Custom page render function that extracts actual text content
255
+ const pageRender = async (pageData: PageData): Promise<string> => {
256
+ currentPage++
257
+
258
+ // Skip pages outside the requested range
259
+ if (currentPage < startPage || (endPage && currentPage > endPage)) {
260
+ return ''
261
+ }
262
+
263
+ try {
264
+ // Use getTextContent to extract actual text from the page
265
+ const textContent = await pageData.getTextContent()
266
+
267
+ if (!textContent || !textContent.items) {
268
+ pages.push('')
269
+ return ''
270
+ }
271
+
272
+ // Combine all text items into a single string
273
+ const pageText = textContent.items
274
+ .map((item: TextItem) => {
275
+ // Handle text items - they have a 'str' property
276
+ if ('str' in item && typeof item.str === 'string') {
277
+ return item.str
278
+ }
279
+ return ''
280
+ })
281
+ .join('')
282
+ .trim()
283
+
284
+ pages.push(this.cleanPdfText(pageText))
285
+ return pageText
286
+ } catch (error) {
287
+ // If getTextContent fails, add empty string for this page
288
+ console.warn(`Failed to extract text from page ${currentPage}:`, error)
289
+ pages.push('')
290
+ return ''
291
+ }
292
+ }
293
+
294
+ await parse(buffer, {
295
+ pagerender: pageRender as unknown as (pageData: unknown) => string,
296
+ })
297
+
298
+ return pages
299
+ }
300
+
301
+ /**
302
+ * Extract text from all pages with page boundaries preserved
303
+ */
304
+ async extractAllPages(content: string | Buffer): Promise<string[]> {
305
+ const parse = await loadPdfParse()
306
+ const buffer = this.toBuffer(content)
307
+
308
+ // First pass to get page count
309
+ const pdfData = await parse(buffer)
310
+ const totalPages = pdfData.numpages
311
+
312
+ // Extract each page
313
+ return this.extractPages(content, 1, totalPages)
314
+ }
315
+ }
@@ -0,0 +1,118 @@
1
+ /**
2
+ * Plain text extractor - handles raw text content
3
+ */
4
+
5
+ import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
6
+
7
+ export class TextExtractor implements ExtractorInterface {
8
+ /**
9
+ * Check if this extractor can handle the content
10
+ */
11
+ canHandle(content: string): boolean {
12
+ // Text extractor is the fallback - it can handle anything
13
+ return typeof content === 'string' && content.length > 0
14
+ }
15
+
16
+ /**
17
+ * Extract text content with basic cleaning and metadata
18
+ */
19
+ async extract(content: string, options?: Record<string, unknown>): Promise<ExtractionResult> {
20
+ const cleanedContent = this.cleanText(content)
21
+ const metadata = this.extractMetadata(cleanedContent, options)
22
+
23
+ return {
24
+ content: cleanedContent,
25
+ contentType: 'text' as ContentType,
26
+ metadata,
27
+ rawContent: content,
28
+ }
29
+ }
30
+
31
+ /**
32
+ * Clean and normalize text content
33
+ */
34
+ private cleanText(text: string): string {
35
+ return (
36
+ text
37
+ // Normalize line endings
38
+ .replace(/\r\n/g, '\n')
39
+ .replace(/\r/g, '\n')
40
+ // Remove excessive whitespace while preserving paragraph breaks
41
+ .replace(/[ \t]+/g, ' ')
42
+ .replace(/\n{3,}/g, '\n\n')
43
+ // Trim each line
44
+ .split('\n')
45
+ .map((line) => line.trim())
46
+ .join('\n')
47
+ // Final trim
48
+ .trim()
49
+ )
50
+ }
51
+
52
+ /**
53
+ * Extract metadata from text content
54
+ */
55
+ private extractMetadata(content: string, options?: Record<string, unknown>): ExtractionResult['metadata'] {
56
+ const words = content.split(/\s+/).filter((w) => w.length > 0)
57
+ const lines = content.split('\n')
58
+
59
+ // Try to extract title from first line if it looks like a title
60
+ let title: string | undefined
61
+ if (lines.length > 0) {
62
+ const firstLine = lines[0]?.trim() ?? ''
63
+ // Use first line as title if it's non-empty and reasonably short
64
+ if (firstLine.length > 0 && firstLine.length < 200) {
65
+ title = firstLine
66
+ }
67
+ }
68
+
69
+ const metadataExtra = (options?.metadata as Record<string, unknown>) ?? {}
70
+
71
+ return {
72
+ title,
73
+ wordCount: words.length,
74
+ charCount: content.length,
75
+ lineCount: lines.length,
76
+ source: 'text',
77
+ ...metadataExtra,
78
+ }
79
+ }
80
+
81
+ /**
82
+ * Split text into sentences for more granular processing
83
+ */
84
+ splitIntoSentences(text: string): string[] {
85
+ // Simple sentence splitting - handles common cases
86
+ const sentenceEnders = /([.!?]+)\s+/g
87
+ const sentences: string[] = []
88
+ let lastIndex = 0
89
+ let match: RegExpExecArray | null
90
+
91
+ while ((match = sentenceEnders.exec(text)) !== null) {
92
+ const matchGroup = match[1] ?? ''
93
+ const sentence = text.slice(lastIndex, match.index + matchGroup.length).trim()
94
+ if (sentence.length > 0) {
95
+ sentences.push(sentence)
96
+ }
97
+ lastIndex = match.index + match[0].length
98
+ }
99
+
100
+ // Add remaining text as last sentence
101
+ const remaining = text.slice(lastIndex).trim()
102
+ if (remaining.length > 0) {
103
+ sentences.push(remaining)
104
+ }
105
+
106
+ return sentences
107
+ }
108
+
109
+ /**
110
+ * Split text into paragraphs
111
+ */
112
+ splitIntoParagraphs(text: string): string[] {
113
+ return text
114
+ .split(/\n\n+/)
115
+ .map((p) => p.trim())
116
+ .filter((p) => p.length > 0)
117
+ }
118
+ }
@@ -0,0 +1,243 @@
1
+ /**
2
+ * URL extractor - fetches and cleans web page content
3
+ */
4
+
5
+ import { ExtractionResult, ExtractorInterface, ContentType } from '../../types/document.types.js'
6
+ import { ExternalServiceError } from '../../utils/errors.js'
7
+
8
+ interface FetchOptions {
9
+ timeout?: number
10
+ userAgent?: string
11
+ followRedirects?: boolean
12
+ }
13
+
14
+ export class UrlExtractor implements ExtractorInterface {
15
+ private readonly defaultTimeout = 30000
16
+ private readonly defaultUserAgent = 'Mozilla/5.0 (compatible; SupermemoryBot/1.0)'
17
+
18
+ /**
19
+ * Check if content is a valid URL
20
+ */
21
+ canHandle(content: string): boolean {
22
+ try {
23
+ const trimmed = content.trim()
24
+ const url = new URL(trimmed)
25
+ return url.protocol === 'http:' || url.protocol === 'https:'
26
+ } catch {
27
+ return false
28
+ }
29
+ }
30
+
31
+ /**
32
+ * Fetch URL and extract clean content
33
+ */
34
+ async extract(url: string, options?: FetchOptions & Record<string, unknown>): Promise<ExtractionResult> {
35
+ const trimmedUrl = url.trim()
36
+ const html = await this.fetchUrl(trimmedUrl, options)
37
+ const { content, metadata } = this.parseHtml(html, trimmedUrl)
38
+
39
+ return {
40
+ content,
41
+ contentType: 'url' as ContentType,
42
+ metadata: {
43
+ ...metadata,
44
+ sourceUrl: trimmedUrl,
45
+ source: 'web',
46
+ },
47
+ rawContent: html,
48
+ }
49
+ }
50
+
51
+ /**
52
+ * Fetch URL content
53
+ */
54
+ private async fetchUrl(url: string, options?: FetchOptions): Promise<string> {
55
+ const timeout = options?.timeout ?? this.defaultTimeout
56
+ const controller = new AbortController()
57
+ const timeoutId = setTimeout(() => controller.abort(), timeout)
58
+
59
+ try {
60
+ const response = await fetch(url, {
61
+ headers: {
62
+ 'User-Agent': options?.userAgent ?? this.defaultUserAgent,
63
+ Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
64
+ 'Accept-Language': 'en-US,en;q=0.5',
65
+ },
66
+ redirect: options?.followRedirects !== false ? 'follow' : 'manual',
67
+ signal: controller.signal,
68
+ })
69
+
70
+ if (!response.ok) {
71
+ throw new ExternalServiceError('HTTP', `HTTP ${response.status}: ${response.statusText}`, response.status, {
72
+ url,
73
+ })
74
+ }
75
+
76
+ return await response.text()
77
+ } finally {
78
+ clearTimeout(timeoutId)
79
+ }
80
+ }
81
+
82
+ /**
83
+ * Parse HTML and extract clean text content
84
+ */
85
+ private parseHtml(html: string, url: string): { content: string; metadata: ExtractionResult['metadata'] } {
86
+ // Extract title
87
+ const titleMatch = html.match(/<title[^>]*>([^<]*)<\/title>/i)
88
+ const title = titleMatch?.[1] ? this.decodeHtmlEntities(titleMatch[1].trim()) : undefined
89
+
90
+ // Extract meta description (handle both attribute orders)
91
+ const descMatch =
92
+ html.match(/<meta[^>]*name=["']description["'][^>]*content=["']([^"']*)["']/i) ??
93
+ html.match(/<meta[^>]*content=["']([^"']*)["'][^>]*name=["']description["']/i)
94
+ const description = descMatch?.[1] ? this.decodeHtmlEntities(descMatch[1].trim()) : undefined
95
+
96
+ // Extract author (handle both attribute orders)
97
+ const authorMatch =
98
+ html.match(/<meta[^>]*name=["']author["'][^>]*content=["']([^"']*)["']/i) ??
99
+ html.match(/<meta[^>]*content=["']([^"']*)["'][^>]*name=["']author["']/i)
100
+ const author = authorMatch?.[1] ? this.decodeHtmlEntities(authorMatch[1].trim()) : undefined
101
+
102
+ // Extract og:tags for additional metadata
103
+ const ogTags = this.extractOpenGraphTags(html)
104
+
105
+ // Clean HTML to get text content
106
+ const content = this.htmlToText(html)
107
+ const words = content.split(/\s+/).filter((w) => w.length > 0)
108
+
109
+ let domain: string | undefined
110
+ try {
111
+ domain = new URL(url).hostname
112
+ } catch {
113
+ // URL parsing failed, leave domain undefined
114
+ }
115
+
116
+ return {
117
+ content,
118
+ metadata: {
119
+ title: title ?? ogTags['title'],
120
+ description: description ?? ogTags['description'],
121
+ author,
122
+ wordCount: words.length,
123
+ charCount: content.length,
124
+ mimeType: 'text/html',
125
+ ogImage: ogTags['image'],
126
+ ogType: ogTags['type'],
127
+ domain,
128
+ },
129
+ }
130
+ }
131
+
132
+ /**
133
+ * Extract OpenGraph meta tags
134
+ */
135
+ private extractOpenGraphTags(html: string): Record<string, string | undefined> {
136
+ const tags: Record<string, string | undefined> = {}
137
+ const ogPattern = /<meta[^>]*property=["']og:([^"']*)["'][^>]*content=["']([^"']*)["']/gi
138
+ let match: RegExpExecArray | null
139
+
140
+ while ((match = ogPattern.exec(html)) !== null) {
141
+ const key = match[1]
142
+ const value = match[2]
143
+ if (key && value) {
144
+ tags[key] = this.decodeHtmlEntities(value)
145
+ }
146
+ }
147
+
148
+ return tags
149
+ }
150
+
151
+ /**
152
+ * Convert HTML to clean text
153
+ */
154
+ private htmlToText(html: string): string {
155
+ let text = html
156
+
157
+ // Remove script and style content
158
+ text = text.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
159
+ text = text.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
160
+ text = text.replace(/<noscript[^>]*>[\s\S]*?<\/noscript>/gi, '')
161
+
162
+ // Remove comments
163
+ text = text.replace(/<!--[\s\S]*?-->/g, '')
164
+
165
+ // Remove header, footer, nav, aside (common non-content areas)
166
+ text = text.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
167
+ text = text.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
168
+ text = text.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
169
+ text = text.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
170
+
171
+ // Convert block elements to newlines
172
+ text = text.replace(/<\/(p|div|h[1-6]|li|tr|br|hr)[^>]*>/gi, '\n')
173
+ text = text.replace(/<(br|hr)[^>]*\/?>/gi, '\n')
174
+
175
+ // Remove all remaining HTML tags
176
+ text = text.replace(/<[^>]+>/g, ' ')
177
+
178
+ // Decode HTML entities
179
+ text = this.decodeHtmlEntities(text)
180
+
181
+ // Clean up whitespace
182
+ text = text
183
+ .replace(/[ \t]+/g, ' ')
184
+ .replace(/\n[ \t]+/g, '\n')
185
+ .replace(/[ \t]+\n/g, '\n')
186
+ .replace(/\n{3,}/g, '\n\n')
187
+ .trim()
188
+
189
+ return text
190
+ }
191
+
192
+ /**
193
+ * Decode common HTML entities
194
+ */
195
+ private decodeHtmlEntities(text: string): string {
196
+ const entities: Record<string, string> = {
197
+ '&amp;': '&',
198
+ '&lt;': '<',
199
+ '&gt;': '>',
200
+ '&quot;': '"',
201
+ '&#39;': "'",
202
+ '&apos;': "'",
203
+ '&nbsp;': ' ',
204
+ '&mdash;': '--',
205
+ '&ndash;': '-',
206
+ '&hellip;': '...',
207
+ '&copy;': '(c)',
208
+ '&reg;': '(R)',
209
+ '&trade;': '(TM)',
210
+ }
211
+
212
+ let result = text
213
+ for (const [entity, char] of Object.entries(entities)) {
214
+ result = result.replace(new RegExp(entity, 'gi'), char)
215
+ }
216
+
217
+ // Handle numeric entities
218
+ result = result.replace(/&#(\d+);/g, (_, code: string) => String.fromCharCode(parseInt(code, 10)))
219
+ result = result.replace(/&#x([a-fA-F0-9]+);/g, (_, code: string) => String.fromCharCode(parseInt(code, 16)))
220
+
221
+ return result
222
+ }
223
+
224
+ /**
225
+ * Check if URL is accessible
226
+ */
227
+ async isAccessible(url: string): Promise<boolean> {
228
+ try {
229
+ const controller = new AbortController()
230
+ const timeoutId = setTimeout(() => controller.abort(), 5000)
231
+
232
+ const response = await fetch(url, {
233
+ method: 'HEAD',
234
+ signal: controller.signal,
235
+ })
236
+
237
+ clearTimeout(timeoutId)
238
+ return response.ok
239
+ } catch {
240
+ return false
241
+ }
242
+ }
243
+ }