arcfetch 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +215 -0
- package/cli.ts +455 -0
- package/index.ts +331 -0
- package/package.json +70 -0
- package/src/config/defaults.ts +22 -0
- package/src/config/index.ts +3 -0
- package/src/config/loader.ts +130 -0
- package/src/config/schema.ts +36 -0
- package/src/core/cache.ts +260 -0
- package/src/core/extractor.ts +87 -0
- package/src/core/index.ts +4 -0
- package/src/core/pipeline.ts +181 -0
- package/src/core/playwright/docker.ts +138 -0
- package/src/core/playwright/index.ts +3 -0
- package/src/core/playwright/local.ts +38 -0
- package/src/core/playwright/manager.ts +89 -0
- package/src/core/playwright/types.ts +12 -0
- package/src/types/turndown-plugin-gfm.d.ts +8 -0
- package/src/utils/markdown-cleaner.ts +79 -0
- package/src/utils/markdown-validator.ts +136 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { writeFile } from 'node:fs/promises';
|
|
3
|
+
import { join, basename } from 'node:path';
|
|
4
|
+
import type { FetchiConfig } from '../config/schema.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Metadata for one markdown reference found in the temp directory.
 *
 * `listCached` builds these records from each file's frontmatter.
 */
export interface CachedReference {
  /** Value of the frontmatter `id` field (e.g. `REF-001`). */
  refId: string;
  /** Value of the frontmatter `title` field. */
  title: string;
  /** Value of the frontmatter `source_url` field. */
  url: string;
  /** Path of the markdown file (temp directory joined with the file name). */
  filepath: string;
  /** Value of the frontmatter `fetched_date` field. */
  fetchedDate: string;
  /** Length of the file content in characters (`String.length`, not bytes). */
  size: number;
  /** Value of the frontmatter `query` field, when present. */
  query?: string;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Outcome of `saveToTemp`.
 *
 * On failure `refId` and `filepath` are empty strings and `error` is set.
 */
export interface SaveResult {
  /** Reference ID assigned to the saved file. */
  refId: string;
  /** Path of the written markdown file. */
  filepath: string;
  /** Failure message; absent on success. */
  error?: string;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Outcome of `listCached`.
 *
 * When listing fails, `references` is empty and `error` is set.
 */
export interface ListResult {
  /** References found in the temp directory. */
  references: CachedReference[];
  /** Failure message; absent on success. */
  error?: string;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of `promoteReference`.
 *
 * On failure both paths are empty strings and `error` is set.
 */
export interface PromoteResult {
  /** Whether the reference was moved to the docs directory. */
  success: boolean;
  /** Original path of the file in the temp directory. */
  fromPath: string;
  /** Path of the file in the docs directory. */
  toPath: string;
  /** Failure message; absent on success. */
  error?: string;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Outcome of `deleteCached`.
 *
 * On failure `filepath` is an empty string and `error` is set.
 */
export interface DeleteResult {
  /** Whether the file was removed. */
  success: boolean;
  /** Path of the removed file. */
  filepath: string;
  /** Failure message; absent on success. */
  error?: string;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Compute the next free reference ID for a directory.
 *
 * Scans `dir` for entries whose names start with `REF-<digits>` and returns
 * the highest number found plus one, zero-padded to at least three digits
 * (`REF-001`, `REF-042`, `REF-1000`, ...). Only names that *start* with the
 * prefix are counted, which matches the `REF-` prefix filter used when
 * listing cached references; unrelated files that merely contain `REF-<n>`
 * somewhere in their name no longer inflate the counter.
 *
 * @param dir - Directory to scan.
 * @returns The next ID, or `REF-001` when the directory does not exist,
 *   contains no matching entries, or cannot be read.
 */
export function getNextRefId(dir: string): string {
  if (!existsSync(dir)) {
    return 'REF-001';
  }

  let entries: string[];
  try {
    entries = readdirSync(dir);
  } catch {
    // Best effort: an unreadable directory is treated like an empty one.
    return 'REF-001';
  }

  let maxId = 0;
  for (const entry of entries) {
    const digits = /^REF-(\d+)/.exec(entry)?.[1];
    if (digits === undefined) continue;

    const id = Number.parseInt(digits, 10);
    if (id > maxId) maxId = id;
  }

  return `REF-${String(maxId + 1).padStart(3, '0')}`;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Build a file-name-safe slug from a title.
 *
 * The title is lower-cased, every run of characters outside `a-z0-9` becomes
 * a single hyphen, and the result is limited to 60 characters. Hyphens are
 * stripped from both ends; the trailing strip happens *after* truncation so
 * a cut that lands on a word boundary cannot leave a dangling `-`.
 *
 * @param title - Arbitrary title text.
 * @returns The slug; an empty string when the title has no `a-z0-9` characters.
 */
function slugify(title: string): string {
  return title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+/, '')
    .slice(0, 60)
    .replace(/-+$/, '');
}
|
|
76
|
+
|
|
77
|
+
/**
 * Save fetched content as a markdown file with frontmatter in the temp directory.
 *
 * The temp directory is created if needed, the next reference ID is taken
 * from `getNextRefId`, and the file is named `<refId>-<slug>.md`.
 *
 * The frontmatter is line-oriented (one `key: value` per line), so line
 * breaks inside `title`, `query` or `url` are collapsed to a single space
 * before writing; a raw newline would otherwise split the value and could
 * even terminate the frontmatter early.
 *
 * @param config - Configuration; only `config.paths.tempDir` is read.
 * @param title - Title stored in the frontmatter and used for the file-name slug.
 * @param url - Source URL stored as `source_url`.
 * @param content - Markdown body written after the frontmatter.
 * @param query - Optional search query stored as `query`.
 * @returns The assigned ID and file path, or empty strings plus `error` on failure.
 */
export async function saveToTemp(
  config: FetchiConfig,
  title: string,
  url: string,
  content: string,
  query?: string
): Promise<SaveResult> {
  try {
    const tempDir = config.paths.tempDir;
    mkdirSync(tempDir, { recursive: true });

    const refId = getNextRefId(tempDir);
    const filepath = join(tempDir, `${refId}-${slugify(title)}.md`);

    // Keep every frontmatter value on one physical line.
    const singleLine = (value: string): string => value.replace(/[\r\n]+/g, ' ');
    const quoted = (value: string): string => `"${singleLine(value).replace(/"/g, '\\"')}"`;

    const today = new Date().toISOString().split('T')[0];
    const frontmatter = [
      '---',
      `id: ${refId}`,
      `title: ${quoted(title)}`,
      `source_url: ${singleLine(url)}`,
      `fetched_date: ${today}`,
      'type: web',
      'status: temporary',
      ...(query ? [`query: ${quoted(query)}`] : []),
      '---',
    ];

    await writeFile(filepath, `${frontmatter.join('\n')}\n\n${content}`, 'utf-8');

    return { refId, filepath };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { refId: '', filepath: '', error: message };
  }
}
|
|
119
|
+
|
|
120
|
+
/**
 * List every cached reference in the temp directory.
 *
 * Only files named `REF-*.md` are considered. Each file's leading
 * `---`-delimited frontmatter is parsed line by line; files without
 * frontmatter are skipped. Double-quoted values have their `\"` escapes
 * (as written by `saveToTemp`) turned back into plain quotes.
 *
 * Results are ordered newest first by the *numeric* part of the ID, so
 * `REF-1000` sorts ahead of `REF-999`.
 *
 * @param config - Configuration; only `config.paths.tempDir` is read.
 * @returns The references, or an empty list plus `error` if reading fails.
 *   A missing temp directory yields an empty list without an error.
 */
export function listCached(config: FetchiConfig): ListResult {
  try {
    const tempDir = config.paths.tempDir;

    if (!existsSync(tempDir)) {
      return { references: [] };
    }

    // Read one `key: value` line from a frontmatter block ('' when absent).
    const readField = (frontmatter: string, key: string): string => {
      const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
      if (!match) return '';

      const raw = (match[1] ?? '').trim();
      if (raw.length >= 2 && raw.startsWith('"') && raw.endsWith('"')) {
        // Undo the quote escaping applied when the file was written.
        return raw.slice(1, -1).replace(/\\"/g, '"').trim();
      }
      return raw.replace(/^["']|["']$/g, '').trim();
    };

    // Numeric part of an ID such as `REF-042`; -1 when there is none.
    const refNumber = (refId: string): number => {
      const digits = /(\d+)/.exec(refId)?.[1];
      return digits === undefined ? -1 : Number.parseInt(digits, 10);
    };

    const files = readdirSync(tempDir).filter(f => f.endsWith('.md') && f.startsWith('REF-'));
    const references: CachedReference[] = [];

    for (const file of files) {
      const filepath = join(tempDir, file);
      const content = readFileSync(filepath, 'utf-8');

      const frontmatter = content.match(/^---\n([\s\S]*?)\n---/)?.[1];
      if (frontmatter === undefined) continue;

      references.push({
        refId: readField(frontmatter, 'id'),
        title: readField(frontmatter, 'title'),
        url: readField(frontmatter, 'source_url'),
        filepath,
        fetchedDate: readField(frontmatter, 'fetched_date'),
        size: content.length,
        query: readField(frontmatter, 'query') || undefined,
      });
    }

    // Newest (highest number) first; fall back to the text for equal numbers.
    references.sort(
      (a, b) => refNumber(b.refId) - refNumber(a.refId) || b.refId.localeCompare(a.refId)
    );

    return { references };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { references: [], error: message };
  }
}
|
|
168
|
+
|
|
169
|
+
/**
 * Look up a single cached reference by its ID.
 *
 * @param config - Configuration passed through to `listCached`.
 * @param refId - Exact reference ID to match (e.g. `REF-001`).
 * @returns The matching reference, or `null` when none matches. A listing
 *   error is not reported separately; it simply yields `null`.
 */
export function findCached(config: FetchiConfig, refId: string): CachedReference | null {
  for (const reference of listCached(config).references) {
    if (reference.refId === refId) {
      return reference;
    }
  }
  return null;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Promote a reference from the temp directory to the docs directory.
 *
 * The file keeps its name, its frontmatter `status: temporary` line becomes
 * `status: permanent`, and the temp copy is removed once the new file has
 * been written.
 *
 * An existing destination file is never overwritten. Temp IDs restart once
 * earlier references have been promoted, so a later fetch can produce the
 * same file name; overwriting would silently destroy the earlier document.
 * The same check also protects the case where the temp and docs directories
 * are the same path, where write-then-unlink would delete the file.
 *
 * @param config - Configuration; `paths.tempDir` and `paths.docsDir` are read.
 * @param refId - ID of the reference to promote.
 * @returns Source and destination paths on success; otherwise empty paths
 *   and an `error` message.
 */
export function promoteReference(config: FetchiConfig, refId: string): PromoteResult {
  const failure = (error: string): PromoteResult => ({
    success: false,
    fromPath: '',
    toPath: '',
    error,
  });

  try {
    const cached = findCached(config, refId);
    if (!cached) {
      return failure(`Reference ${refId} not found in ${config.paths.tempDir}`);
    }

    const docsDir = config.paths.docsDir;
    mkdirSync(docsDir, { recursive: true });

    const toPath = join(docsDir, basename(cached.filepath));
    if (existsSync(toPath)) {
      return failure(`Cannot promote ${refId}: ${toPath} already exists`);
    }

    const content = readFileSync(cached.filepath, 'utf-8').replace(
      /^status:\s*temporary$/m,
      'status: permanent'
    );

    // 'wx' fails instead of overwriting if the file appears after the check above.
    writeFileSync(toPath, content, { encoding: 'utf-8', flag: 'wx' });
    unlinkSync(cached.filepath);

    return {
      success: true,
      fromPath: cached.filepath,
      toPath,
    };
  } catch (error) {
    return failure(error instanceof Error ? error.message : String(error));
  }
}
|
|
223
|
+
|
|
224
|
+
/**
 * Remove a cached reference file from the temp directory.
 *
 * @param config - Configuration; `paths.tempDir` is read for lookup and messages.
 * @param refId - ID of the reference to delete.
 * @returns The removed file's path on success; otherwise an empty path and
 *   an `error` message (unknown ID or file-system failure).
 */
export function deleteCached(config: FetchiConfig, refId: string): DeleteResult {
  try {
    const entry = findCached(config, refId);

    if (entry === null) {
      return {
        success: false,
        filepath: '',
        error: `Reference ${refId} not found in ${config.paths.tempDir}`,
      };
    }

    const { filepath } = entry;
    unlinkSync(filepath);

    return { success: true, filepath };
  } catch (error) {
    return {
      success: false,
      filepath: '',
      error: error instanceof Error ? error.message : String(error),
    };
  }
}
|
|
254
|
+
|
|
255
|
+
/**
 * Return the cache root, kept for backwards compatibility.
 *
 * @returns The current working directory of the process.
 */
export function findCacheRoot(): string {
  const workingDirectory = process.cwd();
  return workingDirectory;
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
3
|
+
import { Readability } from '@mozilla/readability';
|
|
4
|
+
import { parseHTML } from 'linkedom';
|
|
5
|
+
import { cleanMarkdownComplete } from '../utils/markdown-cleaner.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Outcome of `processHtmlToMarkdown`.
 *
 * On failure only `error` is set; on success `markdown` holds the generated
 * header followed by the cleaned article body.
 */
export interface ExtractionResult {
  /** Header plus converted article markdown. */
  markdown?: string;
  /** Article title reported by the reader. */
  title?: string;
  /** Author line reported by the reader. */
  byline?: string;
  /** Short summary reported by the reader. */
  excerpt?: string;
  /** Site name reported by the reader. */
  siteName?: string;
  /** Failure message; absent on success. */
  error?: string;
}
|
|
15
|
+
|
|
16
|
+
// Shared HTML-to-markdown converter: ATX headings, fenced code blocks,
// `-` bullets, `*` emphasis, `**` strong and `---` rules.
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
  emDelimiter: '*',
  strongDelimiter: '**',
  hr: '---',
});

// GitHub-flavoured markdown extensions from turndown-plugin-gfm.
turndown.use(gfm);

// Drop nodes whose nodeType is 8 (the DOM comment node type) from the output.
turndown.addRule('removeComments', {
  filter: (node) => {
    const { nodeType } = node as unknown as { nodeType: number };
    return nodeType === 8;
  },
  replacement: () => '',
});
|
|
31
|
+
|
|
32
|
+
/**
 * Extract the main article from an HTML document and convert it to markdown.
 *
 * The HTML is parsed with linkedom, reduced to its article content with
 * Readability, converted with the shared turndown instance and cleaned with
 * `cleanMarkdownComplete`. A metadata header (title, byline, source,
 * summary, URL) is prepended; each line is emitted only when its value is
 * present, so a page without a title no longer produces a `# null` heading.
 *
 * @param html - Raw HTML of the page.
 * @param url - Page URL; passed to the parser and written into the header.
 * @param verbose - When true, progress is logged to stderr.
 * @returns The markdown and metadata, or `{ error }` when no article could
 *   be extracted or any step throws.
 */
export async function processHtmlToMarkdown(
  html: string,
  url: string,
  verbose = false
): Promise<ExtractionResult> {
  try {
    if (verbose) {
      console.error(`📝 Processing HTML (${html.length} chars)`);
    }

    const { document } = parseHTML(html, { url });

    const reader = new Readability(document, {
      debug: false,
      maxElemsToParse: 0,
      nbTopCandidates: 5,
      charThreshold: 500,
      keepClasses: false,
    });

    const article = reader.parse();

    if (!article) {
      return {
        error: 'Could not extract article content. Page may not contain article-like content.',
      };
    }

    const content = article.content ?? '';

    if (verbose) {
      console.error(`📝 Extracted: "${article.title}" (${content.length} chars)`);
    }

    const body = cleanMarkdownComplete(turndown.turndown(content));

    // Only emit header lines for metadata that actually exists.
    let header = '';
    if (article.title) header += `# ${article.title}\n\n`;
    if (article.byline) header += `**By:** ${article.byline}\n\n`;
    if (article.siteName) header += `**Source:** ${article.siteName}\n\n`;
    if (article.excerpt) header += `**Summary:** ${article.excerpt}\n\n`;
    header += `**URL:** ${url}\n\n---\n\n`;

    return {
      markdown: header + body,
      title: article.title ?? undefined,
      byline: article.byline ?? undefined,
      excerpt: article.excerpt ?? undefined,
      siteName: article.siteName ?? undefined,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { error: message };
  }
}
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import type { FetchiConfig } from '../config/schema.js';
|
|
2
|
+
import { validateMarkdown, type ValidationResult } from '../utils/markdown-validator.js';
|
|
3
|
+
import { fetchWithBrowser, closeBrowser } from './playwright/manager.js';
|
|
4
|
+
import { processHtmlToMarkdown } from './extractor.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Outcome of the fetch pipeline (`fetchUrl`).
 */
export interface FetchResult {
  /** Whether usable markdown was produced. */
  success: boolean;
  /** Extracted markdown, on success. */
  markdown?: string;
  /** Article title, when extraction reported one. */
  title?: string;
  /** Author line, when extraction reported one. */
  byline?: string;
  /** Summary, when extraction reported one. */
  excerpt?: string;
  /** Site name, when extraction reported one. */
  siteName?: string;
  /** Result of `validateMarkdown` for the returned content. */
  quality?: ValidationResult;
  /** Failure message. */
  error?: string;
  /** Optional hint explaining why extraction may have failed. */
  suggestion?: string;
  /** True when the result came from the browser-based fetch. */
  usedPlaywright?: boolean;
  /** Why the browser was used (e.g. `network_error`, `quality_marginal`). */
  playwrightReason?: string;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Outcome of the plain HTTP fetch (`simpleFetch`).
 *
 * On failure `html` is an empty string and `error` is set.
 */
interface SimpleFetchResult {
  /** Response body text. */
  html: string;
  /** Failure message; absent on success. */
  error?: string;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Fetch a page with a plain HTTP GET (no JavaScript execution).
 *
 * Redirects are followed and browser-like request headers are sent. The
 * whole request, including reading the body, is bounded by `timeoutMs`, so
 * an unresponsive host yields an error result instead of hanging the
 * pipeline.
 *
 * @param url - URL to fetch.
 * @param verbose - When true, progress is logged to stderr.
 * @param timeoutMs - Maximum time for the request and body read, in milliseconds.
 * @returns The body text, or an empty `html` plus `error` on a non-2xx
 *   status, timeout, or network failure. Never rejects.
 */
async function simpleFetch(
  url: string,
  verbose = false,
  timeoutMs = 30000
): Promise<SimpleFetchResult> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    if (verbose) {
      console.error(`📡 Simple fetch: ${url}`);
    }

    const response = await fetch(url, {
      redirect: 'follow',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
    });

    if (!response.ok) {
      return { html: '', error: `HTTP ${response.status}: ${response.statusText}` };
    }

    const html = await response.text();

    if (verbose) {
      console.error(`📡 Simple fetch: Got ${html.length} chars`);
    }

    return { html };
  } catch (error) {
    if (controller.signal.aborted) {
      return { html: '', error: `Request timed out after ${timeoutMs}ms` };
    }
    const message = error instanceof Error ? error.message : String(error);
    return { html: '', error: message };
  } finally {
    // Always release the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * Fetch a page through the browser manager and run it through extraction
 * and quality validation.
 *
 * Every result produced here, success or failure, is tagged with
 * `usedPlaywright: true` and the `reason` that triggered the browser path,
 * so callers can tell a browser-path failure from a plain-fetch failure.
 *
 * @param url - URL to fetch.
 * @param config - Configuration; `playwright` and `quality.minScore` are read.
 * @param reason - Why the browser path was chosen; echoed as `playwrightReason`.
 * @param verbose - When true, progress is logged to stderr.
 * @returns A successful result when the extracted content scores at least
 *   `config.quality.minScore`; otherwise a failure describing what went wrong.
 */
async function tryPlaywright(
  url: string,
  config: FetchiConfig,
  reason: string,
  verbose = false
): Promise<FetchResult> {
  if (verbose) {
    console.error(`🎭 Trying Playwright (reason: ${reason})`);
  }

  // Metadata attached to every result of this function.
  const attempt = { usedPlaywright: true, playwrightReason: reason };

  const browserResult = await fetchWithBrowser(url, config.playwright, verbose);

  if (browserResult.error) {
    return {
      success: false,
      error: `Playwright fetch failed: ${browserResult.error}`,
      ...attempt,
    };
  }

  const extracted = await processHtmlToMarkdown(browserResult.html, url, verbose);

  // Guard explicitly instead of asserting that markdown is present.
  if (extracted.error || extracted.markdown === undefined) {
    return {
      success: false,
      error: extracted.error ?? 'Extraction produced no markdown',
      ...attempt,
    };
  }

  const quality = validateMarkdown(extracted.markdown);

  if (quality.score < config.quality.minScore) {
    return {
      success: false,
      error: `Content quality too low (${quality.score}/100) even with JavaScript rendering`,
      quality,
      suggestion: 'This page may be a login wall, forum, or complex web app not suitable for article extraction',
      ...attempt,
    };
  }

  return {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
    ...attempt,
  };
}
|
|
110
|
+
|
|
111
|
+
/**
 * Fetch a URL and convert it to markdown, escalating to a browser when needed.
 *
 * Strategy:
 * 1. Plain HTTP fetch; on failure go straight to the browser path.
 * 2. Extract markdown; on failure go to the browser path.
 * 3. Score the markdown:
 *    - at or above `quality.jsRetryThreshold`: return it as is;
 *    - at or above `quality.minScore`: also try the browser and keep
 *      whichever result scores strictly higher (the plain result wins ties
 *      and browser failures);
 *    - below `quality.minScore`: return whatever the browser path yields.
 *
 * @param url - URL to fetch.
 * @param config - Configuration; the `quality` thresholds are read here.
 * @param verbose - When true, progress is logged to stderr.
 */
export async function fetchUrl(
  url: string,
  config: FetchiConfig,
  verbose = false
): Promise<FetchResult> {
  const log = (message: string): void => {
    if (verbose) console.error(message);
  };

  const simpleResult = await simpleFetch(url, verbose);
  if (simpleResult.error) {
    log(`📡 Simple fetch failed: ${simpleResult.error}`);
    return tryPlaywright(url, config, 'network_error', verbose);
  }

  const extracted = await processHtmlToMarkdown(simpleResult.html, url, verbose);
  if (extracted.error) {
    log(`📝 Extraction failed: ${extracted.error}`);
    return tryPlaywright(url, config, 'extraction_failed', verbose);
  }

  const quality = validateMarkdown(extracted.markdown!);
  log(`📊 Quality score: ${quality.score}/100`);

  // Result built from the plain fetch; returned by both non-escalating paths.
  const plainResult: FetchResult = {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
  };

  if (quality.score >= config.quality.jsRetryThreshold) {
    return plainResult;
  }

  if (quality.score >= config.quality.minScore) {
    log(`📊 Quality marginal (${quality.score}), trying Playwright...`);

    const playwrightResult = await tryPlaywright(url, config, 'quality_marginal', verbose);
    const browserIsBetter =
      playwrightResult.success && playwrightResult.quality!.score > quality.score;

    return browserIsBetter ? playwrightResult : plainResult;
  }

  log(`📊 Quality too low (${quality.score}), trying Playwright...`);
  return tryPlaywright(url, config, 'quality_too_low', verbose);
}
|
|
180
|
+
|
|
181
|
+
export { closeBrowser };
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { chromium, type Browser } from 'playwright';
import { exec, execSync } from 'node:child_process';
import { promisify } from 'node:util';
import type { PlaywrightConfig } from '../../config/schema.js';
import type { BrowserManager } from './types.js';
|
|
6
|
+
|
|
7
|
+
/** Promise-returning wrapper around `child_process.exec`. */
const execAsync = promisify(exec);

// Module-level state shared by DockerBrowserManager and the process
// signal/exit handlers in this file.

/** ID of the running Playwright container, or null when none was started. */
let containerId: string | null = null;

/** Browser connected to the container, or null when not connected. */
let browserInstance: Browser | null = null;
|
|
11
|
+
|
|
12
|
+
/**
 * Check whether Docker can be used by running `docker info`.
 *
 * @returns `true` when the command succeeds within five seconds, `false`
 *   on any failure (missing binary, daemon not reachable, timeout).
 */
export async function isDockerAvailable(): Promise<boolean> {
  return execAsync('docker info', { timeout: 5000 }).then(
    () => true,
    () => false
  );
}
|
|
20
|
+
|
|
21
|
+
/**
 * Make sure a Docker image is available locally, pulling it when it is not.
 *
 * `docker image inspect` is used as the presence check; any failure of that
 * command is treated as "image missing" and triggers a pull with a
 * five-minute timeout. A failing pull rejects the returned promise.
 *
 * NOTE(review): `image` is interpolated into a shell command; presumably it
 * always comes from trusted configuration - confirm against callers.
 *
 * @param image - Image reference.
 */
export async function pullDockerImage(image: string): Promise<void> {
  const isPresent = await execAsync(`docker image inspect ${image}`, { timeout: 10000 }).then(
    () => true,
    () => false
  );
  if (isPresent) return;

  console.error(`Pulling Docker image: ${image}...`);
  await execAsync(`docker pull ${image}`, { timeout: 300000 }); // 5 min timeout for pull
}
|
|
31
|
+
|
|
32
|
+
/**
 * Browser manager that runs a Playwright server inside a Docker container
 * and connects to it over WebSocket.
 *
 * The container ID and the connected browser are stored in module-level
 * variables, so they are shared by every instance of this class.
 */
export class DockerBrowserManager implements BrowserManager {
  private config: PlaywrightConfig;
  private wsEndpoint: string | null = null;

  constructor(config: PlaywrightConfig) {
    this.config = config;
  }

  /**
   * Return the shared browser, starting the container on first use.
   *
   * If the server never becomes ready or the connection fails, the
   * container that was just started is stopped before the error is
   * rethrown, so a failed start does not leave a container running and
   * holding its port.
   *
   * @throws When the image cannot be pulled, no port is free, the container
   *   cannot be started, or the Playwright server cannot be reached.
   */
  async getBrowser(): Promise<Browser> {
    if (browserInstance) {
      return browserInstance;
    }

    // Pull image if needed
    await pullDockerImage(this.config.dockerImage);

    // Find available port
    const port = await this.findAvailablePort();

    // Start container with Playwright server
    const command = [
      'docker run -d --rm',
      `-p ${port}:3000`,
      `--name arcfetch-playwright-${port}`,
      this.config.dockerImage,
      'npx -y playwright run-server --port 3000',
    ].join(' ');
    const { stdout } = await execAsync(command);

    containerId = stdout.trim();
    const endpoint = `ws://localhost:${port}`;
    this.wsEndpoint = endpoint;

    try {
      // Wait for server to be ready, then connect to it
      await this.waitForServer(port);
      const browser = await chromium.connect(endpoint);
      browserInstance = browser;
      return browser;
    } catch (error) {
      // Do not leak the container when startup fails.
      await this.closeBrowser();
      this.wsEndpoint = null;
      throw error;
    }
  }

  /**
   * Close the browser (if connected) and stop the container (if started).
   * A failing `docker stop` is ignored.
   */
  async closeBrowser(): Promise<void> {
    if (browserInstance) {
      await browserInstance.close();
      browserInstance = null;
    }

    if (containerId) {
      try {
        await execAsync(`docker stop ${containerId}`, { timeout: 10000 });
      } catch {
        // Container may have already stopped
      }
      containerId = null;
    }
  }

  /** Always `true`: this manager is Docker-backed. */
  isDocker(): boolean {
    return true;
  }

  /**
   * Probe ports 3001-3099 and return the first one that can be bound.
   *
   * The probe uses `Bun.serve`. The port is released again before it is
   * handed to Docker, so another process could still claim it in between.
   *
   * @throws When none of the probed ports can be bound.
   */
  private async findAvailablePort(): Promise<number> {
    // Start from 3001 and find an available port
    for (let port = 3001; port < 3100; port++) {
      try {
        const server = Bun.serve({
          port,
          fetch() { return new Response(''); }
        });
        server.stop();
        return port;
      } catch {
        // Binding failed (typically: port already in use); try the next one.
      }
    }
    throw new Error('No available ports found');
  }

  /**
   * Poll the server's `/json` endpoint once per second until it answers
   * with a 2xx status.
   *
   * @param port - Host port mapped to the container.
   * @param maxAttempts - Number of polls before giving up.
   * @throws When the server has not answered after `maxAttempts` polls.
   */
  private async waitForServer(port: number, maxAttempts = 30): Promise<void> {
    for (let i = 0; i < maxAttempts; i++) {
      try {
        const response = await fetch(`http://localhost:${port}/json`);
        if (response.ok) return;
      } catch {
        // Server not ready yet
      }
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
    throw new Error('Playwright server failed to start');
  }
}
|
|
121
|
+
|
|
122
|
+
// Cleanup on process exit.
//
// 'exit' listeners must be synchronous: the process terminates as soon as
// they return, so asynchronous work started here would never complete.
// The container is therefore stopped with a blocking call.
process.on('exit', () => {
  if (!containerId) return;

  try {
    execSync(`docker stop ${containerId}`, { stdio: 'ignore', timeout: 15000 });
  } catch {
    // Best effort: the container may already be gone.
  }
  containerId = null;
});

// On Ctrl-C, stop the container and then exit. Asynchronous work is fine
// here because the process keeps running until process.exit() is called.
process.on('SIGINT', async () => {
  if (containerId) {
    try {
      await execAsync(`docker stop ${containerId}`);
      // Cleared so the 'exit' listener does not stop it a second time.
      containerId = null;
    } catch {
      // Left set: the 'exit' listener makes one more synchronous attempt.
    }
  }
  process.exit();
});
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { chromium } from 'playwright-extra';
|
|
2
|
+
import type { Browser } from 'playwright';
|
|
3
|
+
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
4
|
+
import type { PlaywrightConfig } from '../../config/schema.js';
|
|
5
|
+
import type { BrowserManager } from './types.js';
|
|
6
|
+
|
|
7
|
+
// Register the stealth plugin on the playwright-extra chromium launcher.
chromium.use(stealth());

/** Locally launched browser shared by every LocalBrowserManager, or null. */
let browserInstance: Browser | null = null;
|
|
10
|
+
|
|
11
|
+
/**
 * Browser manager that launches a headless Chromium on the local machine.
 *
 * The launched browser is kept in a module-level variable and reused by
 * every instance of this class until `closeBrowser` is called.
 */
export class LocalBrowserManager implements BrowserManager {
  constructor(private config: PlaywrightConfig) {}

  /**
   * Return the shared browser, launching it on first use with
   * `config.timeout` as the launch timeout.
   */
  async getBrowser(): Promise<Browser> {
    if (browserInstance) {
      return browserInstance;
    }

    browserInstance = await chromium.launch({
      headless: true,
      timeout: this.config.timeout,
    });
    return browserInstance;
  }

  /** Close the shared browser if one is running. */
  async closeBrowser(): Promise<void> {
    if (!browserInstance) {
      return;
    }

    await browserInstance.close();
    browserInstance = null;
  }

  /** Always `false`: this manager does not use Docker. */
  isDocker(): boolean {
    return false;
  }
}
|