arcfetch 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
1
+ import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
2
+ import { writeFile } from 'node:fs/promises';
3
+ import { join, basename } from 'node:path';
4
+ import type { FetchiConfig } from '../config/schema.js';
5
+
6
/** Metadata for one cached reference file, as assembled by listCached from the file's frontmatter. */
+ export interface CachedReference {
7
+ refId: string; // Value of the `id` frontmatter key.
8
+ title: string; // Value of the `title` frontmatter key, with surrounding quotes removed.
9
+ url: string; // Value of the `source_url` frontmatter key.
10
+ filepath: string; // Temp directory joined with the file name.
11
+ fetchedDate: string; // Value of the `fetched_date` frontmatter key.
12
+ size: number; // `content.length` of the file text, i.e. string length rather than byte count.
13
+ query?: string; // Value of the `query` frontmatter key; undefined when the key is absent.
14
+ }
15
+
16
/** Outcome of saveToTemp. */
+ export interface SaveResult {
17
+ refId: string; // ID assigned to the saved file; empty string when saving failed.
18
+ filepath: string; // Path of the written file; empty string when saving failed.
19
+ error?: string; // Message of the caught error; only set when saving failed.
20
+ }
21
+
22
/** Outcome of listCached. */
+ export interface ListResult {
23
+ references: CachedReference[]; // Parsed references; empty when the temp directory is missing or listing failed.
24
+ error?: string; // Message of the caught error; only set when listing threw.
25
+ }
26
+
27
/** Outcome of promoteReference. */
+ export interface PromoteResult {
28
+ success: boolean; // True when the file was written to the docs directory and removed from temp.
29
+ fromPath: string; // Original path in the temp directory; empty string on failure.
30
+ toPath: string; // Destination path in the docs directory; empty string on failure.
31
+ error?: string; // Failure reason (reference not found, or the caught error's message).
32
+ }
33
+
34
/** Outcome of deleteCached. */
+ export interface DeleteResult {
35
+ success: boolean; // True when the cached file was unlinked.
36
+ filepath: string; // Path of the removed file; empty string on failure.
37
+ error?: string; // Failure reason (reference not found, or the caught error's message).
38
+ }
39
+
40
/**
 * Get the next available reference ID.
 *
 * Scans `dir` for entries whose names start with `REF-<digits>` and returns the
 * ID one past the highest number found, zero-padded to at least three digits.
 *
 * @param dir - Directory that holds the cached reference files.
 * @returns `REF-001` when `dir` does not exist, cannot be read, or contains no
 *   matching entries; otherwise `REF-<highest + 1>`.
 */
export function getNextRefId(dir: string): string {
  if (!existsSync(dir)) {
    return 'REF-001';
  }

  try {
    let maxId = 0;

    for (const file of readdirSync(dir)) {
      // Anchored at the start of the name so that only real reference files
      // (the same `REF-` prefix listCached filters on) influence the counter;
      // a stray "backup-REF-050.md" must not bump the next ID.
      const match = /^REF-(\d+)/.exec(file);
      if (!match) continue;

      const id = Number.parseInt(match[1] ?? '', 10);
      if (id > maxId) maxId = id;
    }

    return `REF-${String(maxId + 1).padStart(3, '0')}`;
  } catch {
    // Unreadable directory: fall back to the first ID, as for a missing one.
    return 'REF-001';
  }
}
65
+
66
/**
 * Generate a filename-safe slug from a title.
 *
 * Lower-cases the title, collapses every run of characters outside `a-z0-9`
 * into a single hyphen, and limits the result to 60 characters with no
 * leading or trailing hyphen.
 *
 * @param title - Arbitrary title text.
 * @returns The slug; an empty string when the title has no ASCII letters or digits.
 */
function slugify(title: string): string {
  return title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-/, '')
    .slice(0, 60)
    // Strip the trailing hyphen only after truncating: cutting at 60
    // characters can itself land right after a separator.
    .replace(/-$/, '');
}
76
+
77
/**
 * Save fetched content to the temp directory as a Markdown file with frontmatter.
 *
 * The file is named `<refId>-<slug>.md`, where the ID comes from getNextRefId
 * and the slug from the title. The frontmatter records id, title, source_url,
 * fetched_date (today, `YYYY-MM-DD` in UTC), type, status and, when given, query.
 *
 * @param config - Configuration; only `config.paths.tempDir` is read.
 * @param title - Title of the fetched page.
 * @param url - URL the content was fetched from.
 * @param content - Markdown body written after the frontmatter.
 * @param query - Optional search query that led to this fetch.
 * @returns The assigned ID and file path, or empty strings plus `error` on failure.
 */
export async function saveToTemp(
  config: FetchiConfig,
  title: string,
  url: string,
  content: string,
  query?: string
): Promise<SaveResult> {
  try {
    const tempDir = config.paths.tempDir;
    mkdirSync(tempDir, { recursive: true });

    const refId = getNextRefId(tempDir);
    const filepath = join(tempDir, `${refId}-${slugify(title)}.md`);

    // The frontmatter is read back line by line, so a value containing a line
    // break would truncate the field and could inject extra keys or an early
    // `---` terminator. Collapse all whitespace runs to a single space.
    const oneLine = (value: string): string => value.replace(/\s+/g, ' ').trim();
    const quoted = (value: string): string => `"${oneLine(value).replace(/"/g, '\\"')}"`;

    const today = new Date().toISOString().split('T')[0];
    const frontmatter = [
      '---',
      `id: ${refId}`,
      `title: ${quoted(title)}`,
      `source_url: ${oneLine(url)}`,
      `fetched_date: ${today}`,
      'type: web',
      'status: temporary',
      ...(query ? [`query: ${quoted(query)}`] : []),
      '---',
    ];

    // 'wx' makes the write fail instead of silently replacing a file that
    // appeared between choosing the ID and writing (e.g. a concurrent save).
    await writeFile(filepath, `${frontmatter.join('\n')}\n\n${content}`, {
      encoding: 'utf-8',
      flag: 'wx',
    });

    return { refId, filepath };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { refId: '', filepath: '', error: message };
  }
}
119
+
120
/**
 * List all cached references found in the temp directory.
 *
 * Reads every `REF-*.md` file, parses its leading frontmatter block and
 * returns one entry per file, newest ID first. Files without a frontmatter
 * block are skipped.
 *
 * @param config - Configuration; only `config.paths.tempDir` is read.
 * @returns The parsed references, or an empty list plus `error` if reading failed.
 */
export function listCached(config: FetchiConfig): ListResult {
  try {
    const tempDir = config.paths.tempDir;

    if (!existsSync(tempDir)) {
      return { references: [] };
    }

    const files = readdirSync(tempDir).filter((f) => f.startsWith('REF-') && f.endsWith('.md'));
    const references: CachedReference[] = [];

    for (const file of files) {
      const filepath = join(tempDir, file);
      const content = readFileSync(filepath, 'utf-8');

      // Accept CRLF as well as LF so files touched by Windows editors still list.
      const frontmatterMatch = content.match(/^---\r?\n([\s\S]*?)\r?\n---/);
      if (!frontmatterMatch) continue;

      const frontmatter = frontmatterMatch[1] ?? '';

      const readField = (key: string): string => {
        const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
        if (!match) return '';

        const raw = (match[1] ?? '').trim();
        const unquoted = raw.replace(/^["']|["']$/g, '').trim();
        // saveToTemp writes embedded double quotes as \" inside a
        // double-quoted value; undo that so the text round-trips.
        return raw.startsWith('"') ? unquoted.replace(/\\"/g, '"') : unquoted;
      };

      references.push({
        refId: readField('id'),
        title: readField('title'),
        url: readField('source_url'),
        filepath,
        fetchedDate: readField('fetched_date'),
        size: content.length,
        query: readField('query') || undefined,
      });
    }

    // Numeric collation keeps REF-1000 ahead of REF-999; a plain string
    // comparison would order them the other way round.
    references.sort((a, b) => b.refId.localeCompare(a.refId, undefined, { numeric: true }));

    return { references };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { references: [], error: message };
  }
}
168
+
169
/**
 * Look up a single cached reference by its ID.
 *
 * @param config - Configuration forwarded to listCached.
 * @param refId - Exact ID to match against each reference's `refId`.
 * @returns The first matching reference, or `null` when none matches.
 */
export function findCached(config: FetchiConfig, refId: string): CachedReference | null {
  for (const candidate of listCached(config).references) {
    if (candidate.refId === refId) {
      return candidate;
    }
  }
  return null;
}
176
+
177
/**
 * Promote a reference from the temp directory to the docs directory.
 *
 * Copies the cached file to `config.paths.docsDir` under the same file name,
 * rewriting its `status: temporary` frontmatter line to `status: permanent`,
 * and then removes the temp copy.
 *
 * @param config - Configuration; `paths.tempDir` and `paths.docsDir` are read.
 * @param refId - ID of the cached reference to promote.
 * @returns Source and destination paths on success; on failure `success` is
 *   false, both paths are empty and `error` explains why.
 */
export function promoteReference(config: FetchiConfig, refId: string): PromoteResult {
  try {
    const cached = findCached(config, refId);

    if (!cached) {
      return {
        success: false,
        fromPath: '',
        toPath: '',
        error: `Reference ${refId} not found in ${config.paths.tempDir}`,
      };
    }

    const docsDir = config.paths.docsDir;
    mkdirSync(docsDir, { recursive: true });

    const fromPath = cached.filepath;
    const toPath = join(docsDir, basename(fromPath));

    const promoted = readFileSync(fromPath, 'utf-8').replace(
      /^status:\s*temporary$/m,
      'status: permanent'
    );
    writeFileSync(toPath, promoted, 'utf-8');

    // If docsDir and tempDir are configured as the same directory, the file
    // was just rewritten in place; unlinking it would delete the promoted
    // document. Both paths come from join(), so they are normalized strings.
    if (toPath !== fromPath) {
      unlinkSync(fromPath);
    }

    return {
      success: true,
      fromPath,
      toPath,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      fromPath: '',
      toPath: '',
      error: message,
    };
  }
}
223
+
224
/**
 * Remove a cached reference file from the temp directory.
 *
 * @param config - Configuration used to locate the cached reference.
 * @param refId - ID of the reference to delete.
 * @returns The removed file's path on success; otherwise `success: false`,
 *   an empty path and an `error` message.
 */
export function deleteCached(config: FetchiConfig, refId: string): DeleteResult {
  const failure = (error: string): DeleteResult => ({
    success: false,
    filepath: '',
    error,
  });

  try {
    const target = findCached(config, refId);
    if (target === null) {
      return failure(`Reference ${refId} not found in ${config.paths.tempDir}`);
    }

    unlinkSync(target.filepath);
    return { success: true, filepath: target.filepath };
  } catch (err) {
    return failure(err instanceof Error ? err.message : String(err));
  }
}
254
+
255
/**
 * Report the cache root (kept for backwards compatibility).
 *
 * @returns The current working directory of the process.
 */
export function findCacheRoot(): string {
  const workingDirectory = process.cwd();
  return workingDirectory;
}
@@ -0,0 +1,87 @@
1
+ import TurndownService from 'turndown';
2
+ import { gfm } from 'turndown-plugin-gfm';
3
+ import { Readability } from '@mozilla/readability';
4
+ import { parseHTML } from 'linkedom';
5
+ import { cleanMarkdownComplete } from '../utils/markdown-cleaner.js';
6
+
7
/** Result of processHtmlToMarkdown: either the extracted article fields or an `error`. */
+ export interface ExtractionResult {
8
+ markdown?: string; // Generated header followed by the cleaned Markdown body; set on success.
9
+ title?: string; // Article title reported by Readability, if any.
10
+ byline?: string; // Article byline reported by Readability, if any.
11
+ excerpt?: string; // Article excerpt reported by Readability, if any.
12
+ siteName?: string; // Site name reported by Readability, if any.
13
+ error?: string; // Failure message; set when no article was extracted or an exception was caught.
14
+ }
15
+
16
/** Shared HTML-to-Markdown converter used by processHtmlToMarkdown. */
+ const turndown = new TurndownService({
17
+ headingStyle: 'atx',
18
+ codeBlockStyle: 'fenced',
19
+ bulletListMarker: '-',
20
+ emDelimiter: '*',
21
+ strongDelimiter: '**',
22
+ hr: '---',
23
+ });
24
+
25
+ turndown.use(gfm); // Register the plugin imported from turndown-plugin-gfm.
26
+
27
+ turndown.addRule('removeComments', { // Custom rule: matched nodes are dropped from the output.
28
+ filter: (node) => (node as unknown as { nodeType: number }).nodeType === 8, // Matches nodes whose nodeType is 8 (the DOM code for comment nodes).
29
+ replacement: () => '', // Matched nodes contribute no Markdown.
30
+ });
31
+
32
/**
 * Extract the main article from an HTML document and convert it to Markdown.
 *
 * Parses the HTML, runs Readability over the document, converts the extracted
 * article HTML with the shared Turndown instance, cleans the result, and
 * prepends a header block (title, byline, source, summary, URL) separated
 * from the body by a horizontal rule.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page; passed to the HTML parser and echoed in the header.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The Markdown and article metadata, or an object carrying only
 *   `error` when no article could be extracted or an exception was thrown.
 */
export async function processHtmlToMarkdown(
  html: string,
  url: string,
  verbose = false
): Promise<ExtractionResult> {
  try {
    if (verbose) {
      console.error(`📝 Processing HTML (${html.length} chars)`);
    }

    const { document } = parseHTML(html, { url });

    const reader = new Readability(document, {
      debug: false,
      maxElemsToParse: 0,
      nbTopCandidates: 5,
      charThreshold: 500,
      keepClasses: false,
    });

    const article = reader.parse();

    if (!article) {
      return {
        error: 'Could not extract article content. Page may not contain article-like content.',
      };
    }

    const content = article.content ?? '';

    if (verbose) {
      console.error(`📝 Extracted: "${article.title}" (${content.length} chars)`);
    }

    const body = cleanMarkdownComplete(turndown.turndown(content));

    // Each metadata line is optional. In particular the title may be missing,
    // and interpolating it unconditionally would emit a literal "# null"
    // heading at the top of the document.
    const headerParts: string[] = [];
    if (article.title) headerParts.push(`# ${article.title}`);
    if (article.byline) headerParts.push(`**By:** ${article.byline}`);
    if (article.siteName) headerParts.push(`**Source:** ${article.siteName}`);
    if (article.excerpt) headerParts.push(`**Summary:** ${article.excerpt}`);
    headerParts.push(`**URL:** ${url}`, '---');
    const header = `${headerParts.join('\n\n')}\n\n`;

    return {
      markdown: header + body,
      title: article.title ?? undefined,
      byline: article.byline ?? undefined,
      excerpt: article.excerpt ?? undefined,
      siteName: article.siteName ?? undefined,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { error: message };
  }
}
@@ -0,0 +1,4 @@
1
+ export * from './pipeline.js';
2
+ export * from './extractor.js';
3
+ export * from './cache.js';
4
+ export * from './playwright/index.js';
@@ -0,0 +1,181 @@
1
+ import type { FetchiConfig } from '../config/schema.js';
2
+ import { validateMarkdown, type ValidationResult } from '../utils/markdown-validator.js';
3
+ import { fetchWithBrowser, closeBrowser } from './playwright/manager.js';
4
+ import { processHtmlToMarkdown } from './extractor.js';
5
+
6
/** Result returned by fetchUrl (and by the internal Playwright fallback). */
+ export interface FetchResult {
7
+ success: boolean; // True when usable Markdown was produced.
8
+ markdown?: string; // Extracted Markdown; set on success.
9
+ title?: string; // Article title from extraction, if any.
10
+ byline?: string; // Article byline from extraction, if any.
11
+ excerpt?: string; // Article excerpt from extraction, if any.
12
+ siteName?: string; // Site name from extraction, if any.
13
+ quality?: ValidationResult; // Output of validateMarkdown for the returned (or rejected) Markdown.
14
+ error?: string; // Failure description; set when success is false.
15
+ suggestion?: string; // Hint for the user; set when Playwright output still scored below the minimum.
16
+ usedPlaywright?: boolean; // True when the result came from the Playwright path.
17
+ playwrightReason?: string; // Reason code passed to the Playwright fallback (e.g. 'network_error', 'quality_marginal').
18
+ }
19
+
20
/** Result of the plain HTTP fetch performed by simpleFetch. */
+ interface SimpleFetchResult {
21
+ html: string; // Response body text; empty string when the fetch failed.
22
+ error?: string; // "HTTP <status>: <statusText>" for non-OK responses, or the caught error's message.
23
+ }
24
+
25
/**
 * Fetch a page with a plain HTTP GET, without JavaScript rendering.
 *
 * Sends browser-like headers, follows redirects and gives up after 30 seconds.
 * Failures are never thrown; they are reported through the `error` field so
 * the caller can fall back to another strategy.
 *
 * @param url - Address to fetch.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The response body as `html`, or an empty `html` plus `error` for
 *   non-OK statuses, network errors, invalid URLs and timeouts.
 */
async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchResult> {
  try {
    if (verbose) {
      console.error(`📡 Simple fetch: ${url}`);
    }

    const response = await fetch(url, {
      redirect: 'follow',
      // Without a deadline an unresponsive server would stall the whole
      // pipeline and the Playwright fallback would never get a chance to run.
      signal: AbortSignal.timeout(30000),
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
    });

    if (!response.ok) {
      return { html: '', error: `HTTP ${response.status}: ${response.statusText}` };
    }

    const html = await response.text();

    if (verbose) {
      console.error(`📡 Simple fetch: Got ${html.length} chars`);
    }

    return { html };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { html: '', error: message };
  }
}
56
+
57
/**
 * Fetch a page through the Playwright-backed browser and convert it to Markdown.
 *
 * Every result produced here, success or failure, is tagged with
 * `usedPlaywright: true` and the `reason` that triggered the fallback, so
 * callers can always tell which path produced it.
 *
 * @param url - Address to fetch.
 * @param config - Configuration; `playwright` and `quality.minScore` are read.
 * @param reason - Short code describing why the browser path was chosen.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns A successful result when the rendered page scores at least
 *   `config.quality.minScore`; otherwise a failure describing what went wrong.
 */
async function tryPlaywright(
  url: string,
  config: FetchiConfig,
  reason: string,
  verbose = false
): Promise<FetchResult> {
  if (verbose) {
    console.error(`🎭 Trying Playwright (reason: ${reason})`);
  }

  const provenance = { usedPlaywright: true, playwrightReason: reason };

  const browserResult = await fetchWithBrowser(url, config.playwright, verbose);

  if (browserResult.error) {
    return {
      success: false,
      error: `Playwright fetch failed: ${browserResult.error}`,
      ...provenance,
    };
  }

  const extracted = await processHtmlToMarkdown(browserResult.html, url, verbose);

  // Guard on the markdown as well as the error so the code below never has to
  // assert that an optional field is present.
  if (extracted.error || extracted.markdown === undefined) {
    return {
      success: false,
      error: extracted.error ?? 'Extraction returned no markdown',
      ...provenance,
    };
  }

  const quality = validateMarkdown(extracted.markdown);

  if (quality.score < config.quality.minScore) {
    return {
      success: false,
      error: `Content quality too low (${quality.score}/100) even with JavaScript rendering`,
      quality,
      suggestion: 'This page may be a login wall, forum, or complex web app not suitable for article extraction',
      ...provenance,
    };
  }

  return {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
    ...provenance,
  };
}
110
+
111
/**
 * Fetch a URL and convert it to Markdown, escalating to Playwright when needed.
 *
 * Strategy:
 * 1. Plain HTTP fetch. On failure, go straight to Playwright.
 * 2. Extract the article. On failure, go to Playwright.
 * 3. Score the Markdown:
 *    - at or above `quality.jsRetryThreshold`: return it;
 *    - at or above `quality.minScore`: also try Playwright and return whichever
 *      result is better, keeping the plain result unless Playwright succeeded
 *      with a strictly higher score;
 *    - otherwise: return whatever Playwright produces.
 *
 * @param url - Address to fetch.
 * @param config - Configuration supplying the quality thresholds and Playwright settings.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The fetch outcome.
 */
export async function fetchUrl(
  url: string,
  config: FetchiConfig,
  verbose = false
): Promise<FetchResult> {
  const log = (message: string): void => {
    if (verbose) {
      console.error(message);
    }
  };

  const plain = await simpleFetch(url, verbose);
  if (plain.error) {
    log(`📡 Simple fetch failed: ${plain.error}`);
    return tryPlaywright(url, config, 'network_error', verbose);
  }

  const extracted = await processHtmlToMarkdown(plain.html, url, verbose);
  if (extracted.error) {
    log(`📝 Extraction failed: ${extracted.error}`);
    return tryPlaywright(url, config, 'extraction_failed', verbose);
  }

  const quality = validateMarkdown(extracted.markdown!);
  log(`📊 Quality score: ${quality.score}/100`);

  // Result built from the plain fetch; shared by both branches that accept it.
  const plainSuccess: FetchResult = {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
  };

  if (quality.score >= config.quality.jsRetryThreshold) {
    return plainSuccess;
  }

  if (quality.score < config.quality.minScore) {
    log(`📊 Quality too low (${quality.score}), trying Playwright...`);
    return tryPlaywright(url, config, 'quality_too_low', verbose);
  }

  log(`📊 Quality marginal (${quality.score}), trying Playwright...`);
  const rendered = await tryPlaywright(url, config, 'quality_marginal', verbose);
  const renderedIsBetter = rendered.success && rendered.quality!.score > quality.score;

  return renderedIsBetter ? rendered : plainSuccess;
}
180
+
181
+ export { closeBrowser };
@@ -0,0 +1,138 @@
1
import { exec, execSync } from 'node:child_process';
import { createServer } from 'node:net';
import { promisify } from 'node:util';
import { chromium, type Browser } from 'playwright';
import type { PlaywrightConfig } from '../../config/schema.js';
import type { BrowserManager } from './types.js';
6
+
7
// Promise-returning wrapper around child_process.exec, used for the docker CLI calls in this module.
+ const execAsync = promisify(exec);
8
+
9
+ let containerId: string | null = null; // Trimmed stdout of `docker run -d`; null while no container is tracked.
10
+ let browserInstance: Browser | null = null; // Connected browser reused across getBrowser() calls; null until connected.
11
+
12
/**
 * Check whether the Docker CLI can reach a daemon.
 *
 * @returns True when `docker info` completes within five seconds; false when
 *   the command fails or times out.
 */
export async function isDockerAvailable(): Promise<boolean> {
  let reachable = false;

  try {
    await execAsync('docker info', { timeout: 5000 });
    reachable = true;
  } catch {
    // Any failure (missing binary, stopped daemon, timeout) means "not available".
  }

  return reachable;
}
20
+
21
/**
 * Make sure a Docker image is present locally, pulling it if necessary.
 *
 * @param image - Image reference, e.g. `registry:5000/team/image:tag` or
 *   `name@sha256:<digest>`.
 * @throws Error when `image` contains characters that cannot appear in an
 *   image reference, or when `docker pull` fails.
 */
export async function pullDockerImage(image: string): Promise<void> {
  // The reference is interpolated into a shell command line below, so reject
  // anything outside the characters a reference can contain (letters, digits
  // and . _ - / : @). This keeps spaces and shell metacharacters out.
  if (!/^[A-Za-z0-9][A-Za-z0-9._\/:@-]*$/.test(image)) {
    throw new Error(`Invalid Docker image reference: ${image}`);
  }

  try {
    // Check if image exists locally first
    await execAsync(`docker image inspect ${image}`, { timeout: 10000 });
  } catch {
    // Image doesn't exist, pull it
    console.error(`Pulling Docker image: ${image}...`);
    await execAsync(`docker pull ${image}`, { timeout: 300000 }); // 5 min timeout for pull
  }
}
31
+
32
/**
 * BrowserManager that runs a Playwright server inside a Docker container and
 * connects to it over WebSocket.
 *
 * The container ID and the connected browser live in module-level state, so
 * they are shared by every instance of this class.
 */
export class DockerBrowserManager implements BrowserManager {
  private config: PlaywrightConfig;
  private wsEndpoint: string | null = null;

  constructor(config: PlaywrightConfig) {
    this.config = config;
  }

  /**
   * Return the shared browser, starting the container on first use.
   *
   * @returns The connected browser.
   * @throws Error when the image cannot be pulled, no port in 3001-3099 is
   *   free, the container fails to start, or the server never becomes ready.
   *   A container that was started before the failure is stopped again.
   */
  async getBrowser(): Promise<Browser> {
    if (browserInstance) {
      return browserInstance;
    }

    // Pull image if needed
    await pullDockerImage(this.config.dockerImage);

    // Find available port
    const port = await this.findAvailablePort();

    // Start container with Playwright server
    const command = [
      'docker run -d --rm',
      `-p ${port}:3000`,
      `--name arcfetch-playwright-${port}`,
      this.config.dockerImage,
      'npx -y playwright run-server --port 3000',
    ].join(' ');
    const { stdout } = await execAsync(command);

    containerId = stdout.trim();
    const endpoint = `ws://localhost:${port}`;
    this.wsEndpoint = endpoint;

    let browser: Browser;
    try {
      // Wait for server to be ready, then connect to the browser
      await this.waitForServer(port);
      browser = await chromium.connect(endpoint);
    } catch (error) {
      // The container is already running at this point; stop it so a failed
      // startup does not leave it (and its published port) behind.
      await this.stopContainer();
      this.wsEndpoint = null;
      throw error;
    }

    browserInstance = browser;
    return browser;
  }

  /**
   * Close the shared browser and stop the container.
   *
   * The container is stopped even if closing the browser throws; that error
   * is still propagated to the caller.
   */
  async closeBrowser(): Promise<void> {
    const browser = browserInstance;
    browserInstance = null;

    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await this.stopContainer();
    }
  }

  isDocker(): boolean {
    return true;
  }

  /** Stop the tracked container, if any; failures are ignored as best effort. */
  private async stopContainer(): Promise<void> {
    if (!containerId) return;

    try {
      await execAsync(`docker stop ${containerId}`, { timeout: 10000 });
    } catch {
      // Container may have already stopped
    }
    containerId = null;
  }

  /**
   * Find a free TCP port in the range 3001-3099.
   *
   * @throws Error when every port in the range is in use.
   */
  private async findAvailablePort(): Promise<number> {
    for (let port = 3001; port < 3100; port++) {
      if (await this.isPortFree(port)) {
        return port;
      }
    }
    throw new Error('No available ports found');
  }

  /**
   * Probe a port by briefly listening on it with node:net. This works under
   * both Node and Bun, whereas relying on the `Bun` global would throw under
   * Node and make every port look occupied.
   */
  private isPortFree(port: number): Promise<boolean> {
    return new Promise((resolve) => {
      const probe = createServer();
      probe.once('error', () => resolve(false));
      probe.listen(port, () => {
        probe.close(() => resolve(true));
      });
    });
  }

  /**
   * Poll the server once per second until it answers.
   *
   * NOTE(review): readiness is judged by an OK response from `/json` — confirm
   * that the Playwright run-server actually serves that path.
   *
   * @throws Error after `maxAttempts` unsuccessful polls.
   */
  private async waitForServer(port: number, maxAttempts = 30): Promise<void> {
    for (let i = 0; i < maxAttempts; i++) {
      try {
        const response = await fetch(`http://localhost:${port}/json`);
        if (response.ok) return;
      } catch {
        // Server not ready yet
      }
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
    throw new Error('Playwright server failed to start');
  }
}
121
+
122
// Cleanup on process exit.
// 'exit' listeners must do their work synchronously: the process terminates
// as soon as they return, so an awaited `docker stop` is not guaranteed to
// run. Use the blocking variant instead.
process.on('exit', () => {
  if (!containerId) return;

  try {
    execSync(`docker stop ${containerId}`, { stdio: 'ignore', timeout: 10000 });
  } catch {
    // Best effort: the container may already be gone.
  }
  containerId = null;
});

process.on('SIGINT', () => {
  // process.exit() fires the 'exit' listener above, which stops the container.
  process.exit();
});
@@ -0,0 +1,3 @@
1
+ export * from './types.js';
2
+ export * from './manager.js';
3
+ export { isDockerAvailable } from './docker.js';
@@ -0,0 +1,38 @@
1
+ import { chromium } from 'playwright-extra';
2
+ import type { Browser } from 'playwright';
3
+ import stealth from 'puppeteer-extra-plugin-stealth';
4
+ import type { PlaywrightConfig } from '../../config/schema.js';
5
+ import type { BrowserManager } from './types.js';
6
+
7
// Register the stealth plugin on the shared chromium launcher before any launch.
chromium.use(stealth());

// Browser shared by every LocalBrowserManager instance; null until launched.
let browserInstance: Browser | null = null;

/**
 * BrowserManager that launches a headless Chromium on the local machine and
 * reuses it across calls.
 */
export class LocalBrowserManager implements BrowserManager {
  private config: PlaywrightConfig;

  constructor(config: PlaywrightConfig) {
    this.config = config;
  }

  /**
   * Return the shared browser, launching it when none is usable.
   *
   * @returns A connected browser.
   */
  async getBrowser(): Promise<Browser> {
    // A browser that crashed or was closed elsewhere stays referenced here;
    // drop it so the next call launches a fresh one instead of handing out a
    // dead handle.
    if (browserInstance && !browserInstance.isConnected()) {
      browserInstance = null;
    }

    if (!browserInstance) {
      browserInstance = await chromium.launch({
        headless: true,
        timeout: this.config.timeout,
      });
    }
    return browserInstance;
  }

  /**
   * Close the shared browser, if one is open.
   *
   * The cached reference is cleared before closing, so a failing `close()`
   * cannot leave a stale browser behind for later calls.
   */
  async closeBrowser(): Promise<void> {
    const browser = browserInstance;
    if (!browser) return;

    browserInstance = null;
    await browser.close();
  }

  isDocker(): boolean {
    return false;
  }
}