arcfetch 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,260 @@
1
+ import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
2
+ import { writeFile } from 'node:fs/promises';
3
+ import { join, basename } from 'node:path';
4
+ import type { FetchiConfig } from '../config/schema.js';
5
+
6
/**
 * Metadata for a single cached reference file, as produced by `listCached`
 * from the file's frontmatter.
 */
export interface CachedReference {
  /** Value of the `id` frontmatter key (e.g. `REF-001`). */
  refId: string;
  /** Value of the `title` frontmatter key. */
  title: string;
  /** Value of the `source_url` frontmatter key. */
  url: string;
  /** Path of the cached file (cache directory joined with the file name). */
  filepath: string;
  /** Value of the `fetched_date` frontmatter key. */
  fetchedDate: string;
  /** Length of the file's content in characters, frontmatter included. */
  size: number;
  /** Value of the `query` frontmatter key, when the file has one. */
  query?: string;
}
15
+
16
/**
 * Outcome of `saveToTemp`.
 *
 * On failure `refId` and `filepath` are empty strings and `error` carries
 * the failure message.
 */
export interface SaveResult {
  /** ID assigned to the saved reference. */
  refId: string;
  /** Path of the file that was written. */
  filepath: string;
  /** Failure message; absent on success. */
  error?: string;
}
21
+
22
/**
 * Outcome of `listCached`.
 *
 * When listing fails, `references` is empty and `error` carries the message.
 */
export interface ListResult {
  /** Cached references that were found. */
  references: CachedReference[];
  /** Failure message; absent on success. */
  error?: string;
}
26
+
27
/**
 * Outcome of `promoteReference`.
 */
export interface PromoteResult {
  /** Whether the reference was moved to the docs directory. */
  success: boolean;
  /** Path the reference was read from. */
  fromPath: string;
  /** Path the reference was written to. */
  toPath: string;
  /** Failure message; absent on success. */
  error?: string;
}
33
+
34
/**
 * Outcome of `deleteCached`.
 */
export interface DeleteResult {
  /** Whether the cached file was removed. */
  success: boolean;
  /** Path of the removed file; empty string on failure. */
  filepath: string;
  /** Failure message; absent on success. */
  error?: string;
}
39
+
40
/**
 * Compute the next free reference ID for a cache directory.
 *
 * Scans `dir` for entries whose names start with `REF-<digits>` and returns
 * one past the highest number found, zero-padded to at least three digits
 * (`REF-001`, `REF-042`, `REF-1000`). Names that merely contain `REF-<n>`
 * somewhere in the middle are ignored, so unrelated files cannot inflate
 * the counter.
 *
 * @param dir - Directory to scan.
 * @returns The next ID, or `REF-001` when the directory is missing, holds no
 *   matching entries, or cannot be read.
 */
export function getNextRefId(dir: string): string {
  if (!existsSync(dir)) {
    return 'REF-001';
  }

  try {
    let maxId = 0;

    for (const file of readdirSync(dir)) {
      // Anchored: only names that begin with the REF- prefix count.
      const match = /^REF-(\d+)/.exec(file);
      if (!match) continue;

      const id = Number.parseInt(match[1] ?? '', 10);
      if (id > maxId) maxId = id;
    }

    return `REF-${String(maxId + 1).padStart(3, '0')}`;
  } catch {
    // Best-effort: an unreadable directory is treated like an empty one.
    return 'REF-001';
  }
}
65
+
66
/**
 * Turn a title into a filename-safe slug.
 *
 * Lowercases the title, replaces every run of characters outside `a-z0-9`
 * with a single hyphen, trims hyphens from both ends and caps the result at
 * 60 characters. A hyphen exposed at the end by the truncation is removed
 * as well, so the slug never ends in `-`.
 *
 * @param title - Arbitrary title text.
 * @returns The slug; an empty string when the title has no ASCII letters or digits.
 */
function slugify(title: string): string {
  return title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .slice(0, 60)
    // Cutting at 60 characters can land right after a separator.
    .replace(/-$/, '');
}
76
+
77
/**
 * Save fetched content as a new temporary reference file.
 *
 * Creates the temp directory if needed, allocates the next `REF-nnn` ID and
 * writes `<refId>-<slug>.md` containing a frontmatter header (`id`, `title`,
 * `source_url`, `fetched_date`, `type`, `status`, optional `query`) followed
 * by `content`.
 *
 * @param config - Configuration; `config.paths.tempDir` is the target directory.
 * @param title - Title of the reference; also used to derive the file name.
 * @param url - Source URL recorded in the frontmatter.
 * @param content - Body written after the frontmatter.
 * @param query - Optional search query recorded in the frontmatter.
 * @returns The assigned ID and file path, or empty strings plus `error` on failure.
 */
export async function saveToTemp(
  config: FetchiConfig,
  title: string,
  url: string,
  content: string,
  query?: string
): Promise<SaveResult> {
  try {
    const tempDir = config.paths.tempDir;
    mkdirSync(tempDir, { recursive: true });

    const refId = getNextRefId(tempDir);
    // A title without ASCII letters/digits slugs to '', which would give "REF-001-.md".
    const slug = slugify(title) || 'untitled';
    const filepath = join(tempDir, `${refId}-${slug}.md`);

    // Frontmatter values are read back line by line, so a line break inside a
    // value would truncate it and corrupt the header. Flatten breaks to spaces.
    const quote = (value: string): string =>
      `"${value.replace(/[\r\n]+/g, ' ').replace(/"/g, '\\"')}"`;

    const today = new Date().toISOString().slice(0, 10);
    const frontmatter = [
      '---',
      `id: ${refId}`,
      `title: ${quote(title)}`,
      `source_url: ${url}`,
      `fetched_date: ${today}`,
      'type: web',
      'status: temporary',
      ...(query ? [`query: ${quote(query)}`] : []),
      '---',
    ];

    await writeFile(filepath, `${frontmatter.join('\n')}\n\n${content}`, 'utf-8');

    return { refId, filepath };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { refId: '', filepath: '', error: message };
  }
}
119
+
120
/**
 * Read one `key: value` entry from a frontmatter block.
 *
 * Surrounding quotes are stripped. For double-quoted values the `\"` escape
 * is turned back into a plain `"`.
 */
function readFrontmatterField(frontmatter: string, key: string): string {
  const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
  if (!match) return '';

  const raw = (match[1] ?? '').trim();
  const unquoted = raw.replace(/^["']|["']$/g, '').trim();
  return raw.startsWith('"') ? unquoted.replace(/\\"/g, '"') : unquoted;
}

/**
 * List all cached references in the temp directory.
 *
 * Considers files named `REF-*.md`, parses the frontmatter of each and skips
 * files without a frontmatter block. Results are sorted by ID, newest
 * (highest number) first; the comparison is numeric so `REF-1000` sorts
 * above `REF-999`.
 *
 * @param config - Configuration; `config.paths.tempDir` is the directory scanned.
 * @returns The references found; an empty list when the directory does not
 *   exist, or an empty list plus `error` when reading fails.
 */
export function listCached(config: FetchiConfig): ListResult {
  try {
    const tempDir = config.paths.tempDir;

    if (!existsSync(tempDir)) {
      return { references: [] };
    }

    const references: CachedReference[] = [];

    for (const file of readdirSync(tempDir)) {
      if (!file.startsWith('REF-') || !file.endsWith('.md')) continue;

      const filepath = join(tempDir, file);
      const content = readFileSync(filepath, 'utf-8');

      const frontmatter = content.match(/^---\n([\s\S]*?)\n---/)?.[1];
      if (frontmatter === undefined) continue;

      references.push({
        refId: readFrontmatterField(frontmatter, 'id'),
        title: readFrontmatterField(frontmatter, 'title'),
        url: readFrontmatterField(frontmatter, 'source_url'),
        filepath,
        fetchedDate: readFrontmatterField(frontmatter, 'fetched_date'),
        size: content.length,
        query: readFrontmatterField(frontmatter, 'query') || undefined,
      });
    }

    // Numeric collation: a plain string compare would rank "REF-999" above "REF-1000".
    references.sort((a, b) => b.refId.localeCompare(a.refId, undefined, { numeric: true }));

    return { references };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { references: [], error: message };
  }
}
168
+
169
/**
 * Look up a single cached reference by its ID.
 *
 * @param config - Configuration passed through to `listCached`.
 * @param refId - Exact ID to match (e.g. `REF-003`).
 * @returns The matching reference, or `null` when none has that ID.
 */
export function findCached(config: FetchiConfig, refId: string): CachedReference | null {
  for (const candidate of listCached(config).references) {
    if (candidate.refId === refId) {
      return candidate;
    }
  }
  return null;
}
176
+
177
/**
 * Promote a reference from the temp directory to the docs directory.
 *
 * Copies the file under the same name into `config.paths.docsDir`, rewriting
 * the `status: temporary` frontmatter line to `status: permanent`, then
 * removes the temp copy.
 *
 * The promotion is refused when the destination file already exists. Temp
 * IDs are allocated from the temp directory alone, so a new reference can
 * collide with a previously promoted one; overwriting would silently lose
 * that document. The same guard covers `docsDir` pointing at the temp
 * directory, where write-then-unlink would delete the file.
 *
 * @param config - Configuration providing `paths.tempDir` and `paths.docsDir`.
 * @param refId - ID of the reference to promote.
 * @returns Source and destination paths on success; `success: false` plus
 *   `error` when the reference is missing, the destination exists, or I/O fails.
 */
export function promoteReference(config: FetchiConfig, refId: string): PromoteResult {
  try {
    const cached = findCached(config, refId);

    if (!cached) {
      return {
        success: false,
        fromPath: '',
        toPath: '',
        error: `Reference ${refId} not found in ${config.paths.tempDir}`,
      };
    }

    const docsDir = config.paths.docsDir;
    const toPath = join(docsDir, basename(cached.filepath));

    if (existsSync(toPath)) {
      return {
        success: false,
        fromPath: cached.filepath,
        toPath,
        error: `Cannot promote ${refId}: ${toPath} already exists`,
      };
    }

    mkdirSync(docsDir, { recursive: true });

    const content = readFileSync(cached.filepath, 'utf-8').replace(
      /^status:\s*temporary$/m,
      'status: permanent'
    );

    // 'wx' fails instead of overwriting if the file appeared after the check above.
    writeFileSync(toPath, content, { encoding: 'utf-8', flag: 'wx' });

    unlinkSync(cached.filepath);

    return {
      success: true,
      fromPath: cached.filepath,
      toPath,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      success: false,
      fromPath: '',
      toPath: '',
      error: message,
    };
  }
}
223
+
224
/**
 * Remove a cached reference file from the temp directory.
 *
 * @param config - Configuration used to locate the cache.
 * @param refId - ID of the reference to delete.
 * @returns The removed file's path on success; `success: false` with an
 *   `error` message when the reference is unknown or removal fails.
 */
export function deleteCached(config: FetchiConfig, refId: string): DeleteResult {
  const failure = (error: string): DeleteResult => ({
    success: false,
    filepath: '',
    error,
  });

  try {
    const target = findCached(config, refId);
    if (target === null) {
      return failure(`Reference ${refId} not found in ${config.paths.tempDir}`);
    }

    unlinkSync(target.filepath);
    return { success: true, filepath: target.filepath };
  } catch (error) {
    return failure(error instanceof Error ? error.message : String(error));
  }
}
254
+
255
/**
 * Return the cache root directory.
 *
 * Kept for backwards compatibility; this is the process's current working
 * directory.
 */
export function findCacheRoot(): string {
  const workingDirectory = process.cwd();
  return workingDirectory;
}
@@ -0,0 +1,87 @@
1
+ import TurndownService from 'turndown';
2
+ import { gfm } from 'turndown-plugin-gfm';
3
+ import { Readability } from '@mozilla/readability';
4
+ import { parseHTML } from 'linkedom';
5
+ import { cleanMarkdownComplete } from '../utils/markdown-cleaner.js';
6
+
7
/**
 * Outcome of `processHtmlToMarkdown`.
 *
 * On failure only `error` is set; on success `markdown` is set and the
 * remaining fields carry whatever article metadata was available.
 */
export interface ExtractionResult {
  /** Generated Markdown document, including the metadata header. */
  markdown?: string;
  /** Article title, when one was extracted. */
  title?: string;
  /** Article byline, when one was extracted. */
  byline?: string;
  /** Article excerpt, when one was extracted. */
  excerpt?: string;
  /** Site name, when one was extracted. */
  siteName?: string;
  /** Failure message; absent on success. */
  error?: string;
}
15
+
16
// Shared HTML-to-Markdown converter: ATX headings, fenced code blocks,
// "-" bullets, "*" emphasis, "**" strong and "---" horizontal rules.
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
  emDelimiter: '*',
  strongDelimiter: '**',
  hr: '---',
});

// GitHub-flavoured Markdown plugin.
turndown.use(gfm);

// DOM nodeType value for comment nodes.
const COMMENT_NODE_TYPE = 8;

// Emit nothing for HTML comment nodes.
turndown.addRule('removeComments', {
  filter(node) {
    return (node as unknown as { nodeType: number }).nodeType === COMMENT_NODE_TYPE;
  },
  replacement() {
    return '';
  },
});
31
+
32
/**
 * Extract the main article from an HTML page and convert it to Markdown.
 *
 * Parses the HTML, runs Readability to isolate the article, converts the
 * article HTML with the shared Turndown instance, cleans the result, and
 * prepends a header holding the title, byline, source, summary (each only
 * when available) and the URL.
 *
 * @param html - Raw HTML of the page.
 * @param url - URL of the page; passed to the HTML parser and printed in the header.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The Markdown and article metadata, or `{ error }` when no article
 *   could be extracted or processing threw.
 */
export async function processHtmlToMarkdown(
  html: string,
  url: string,
  verbose = false
): Promise<ExtractionResult> {
  try {
    if (verbose) {
      console.error(`📝 Processing HTML (${html.length} chars)`);
    }

    const { document } = parseHTML(html, { url });

    const reader = new Readability(document, {
      debug: false,
      maxElemsToParse: 0,
      nbTopCandidates: 5,
      charThreshold: 500,
      keepClasses: false,
    });

    const article = reader.parse();

    if (!article) {
      return {
        error: 'Could not extract article content. Page may not contain article-like content.',
      };
    }

    const content = article.content ?? '';

    if (verbose) {
      console.error(`📝 Extracted: "${article.title ?? '(untitled)'}" (${content.length} chars)`);
    }

    const body = cleanMarkdownComplete(turndown.turndown(content));

    // Every header entry is optional except the URL. The title in particular
    // may be missing, and interpolating it blindly would emit "# null".
    const headerParts: string[] = [];
    if (article.title) headerParts.push(`# ${article.title}`);
    if (article.byline) headerParts.push(`**By:** ${article.byline}`);
    if (article.siteName) headerParts.push(`**Source:** ${article.siteName}`);
    if (article.excerpt) headerParts.push(`**Summary:** ${article.excerpt}`);
    headerParts.push(`**URL:** ${url}`, '---');
    const header = `${headerParts.join('\n\n')}\n\n`;

    return {
      markdown: header + body,
      title: article.title ?? undefined,
      byline: article.byline ?? undefined,
      excerpt: article.excerpt ?? undefined,
      siteName: article.siteName ?? undefined,
    };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { error: message };
  }
}
@@ -0,0 +1,4 @@
1
+ export * from './pipeline.js';
2
+ export * from './extractor.js';
3
+ export * from './cache.js';
4
+ export * from './playwright/index.js';
@@ -0,0 +1,189 @@
1
+ import type { FetchiConfig } from '../config/schema.js';
2
+ import { validateMarkdown, type ValidationResult } from '../utils/markdown-validator.js';
3
+ import { fetchWithBrowser, closeBrowser } from './playwright/manager.js';
4
+ import { processHtmlToMarkdown } from './extractor.js';
5
+
6
/**
 * Outcome of fetching a URL and converting it to Markdown.
 */
export interface FetchResult {
  /** Whether usable Markdown was produced. */
  success: boolean;
  /** Extracted Markdown document. */
  markdown?: string;
  /** Article title, when extracted. */
  title?: string;
  /** Article byline, when extracted. */
  byline?: string;
  /** Article excerpt, when extracted. */
  excerpt?: string;
  /** Site name, when extracted. */
  siteName?: string;
  /** Quality validation of the Markdown, when validation ran. */
  quality?: ValidationResult;
  /** Failure message; absent on success. */
  error?: string;
  /** Hint about why the page could not be extracted. */
  suggestion?: string;
  /** True when the result came from the Playwright path; may be absent otherwise. */
  usedPlaywright?: boolean;
  /** Why Playwright was used, e.g. `forced`, `network_error`, `quality_too_low`. */
  playwrightReason?: string;
}
19
+
20
/** Outcome of a plain HTTP fetch: the response body, or an error message. */
interface SimpleFetchResult {
  /** Response body; empty string on failure. */
  html: string;
  /** Failure message; absent on success. */
  error?: string;
}

/**
 * Fetch a URL with a plain HTTP request using browser-like headers.
 *
 * Redirects are followed. Non-2xx responses and network failures are
 * reported through `error` rather than thrown. The request, including the
 * body download, is aborted after `timeoutMs` so an unresponsive server
 * cannot stall the caller indefinitely.
 *
 * @param url - URL to fetch.
 * @param verbose - When true, progress messages are written to stderr.
 * @param timeoutMs - Time allowed for the request before it is aborted.
 * @returns The response body, or an empty body plus `error`.
 */
async function simpleFetch(
  url: string,
  verbose = false,
  timeoutMs = 30000
): Promise<SimpleFetchResult> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    if (verbose) {
      console.error(`📡 Simple fetch: ${url}`);
    }

    const response = await fetch(url, {
      redirect: 'follow',
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
    });

    if (!response.ok) {
      return { html: '', error: `HTTP ${response.status}: ${response.statusText}` };
    }

    const html = await response.text();

    if (verbose) {
      console.error(`📡 Simple fetch: Got ${html.length} chars`);
    }

    return { html };
  } catch (error) {
    if (controller.signal.aborted) {
      return { html: '', error: `Request timed out after ${timeoutMs}ms` };
    }
    const message = error instanceof Error ? error.message : String(error);
    return { html: '', error: message };
  } finally {
    // Always release the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}
56
+
57
/**
 * Fetch a URL through the headless browser and extract Markdown from it.
 *
 * Every result produced here, success or failure, is tagged with
 * `usedPlaywright: true` and the `reason` that triggered the attempt, so
 * callers can always tell which path produced it.
 *
 * @param url - URL to fetch.
 * @param config - Configuration; uses `config.playwright` and `config.quality.minScore`.
 * @param reason - Why the browser path is being used (e.g. `forced`).
 * @param verbose - When true, progress messages are written to stderr.
 * @returns A successful result when extraction yields Markdown scoring at
 *   least `minScore`; otherwise a failure describing which step failed.
 */
async function tryPlaywright(
  url: string,
  config: FetchiConfig,
  reason: string,
  verbose = false
): Promise<FetchResult> {
  if (verbose) {
    console.error(`🎭 Trying Playwright (reason: ${reason})`);
  }

  const fail = (error: string, extra: Partial<FetchResult> = {}): FetchResult => ({
    success: false,
    error,
    ...extra,
    usedPlaywright: true,
    playwrightReason: reason,
  });

  let html: string;
  try {
    const browserResult = await fetchWithBrowser(url, config.playwright, verbose);
    if (browserResult.error) {
      return fail(`Playwright fetch failed: ${browserResult.error}`);
    }
    html = browserResult.html;
  } catch (error) {
    // Report a thrown browser error as a failed result rather than an exception.
    const message = error instanceof Error ? error.message : String(error);
    return fail(`Playwright fetch failed: ${message}`);
  }

  const extracted = await processHtmlToMarkdown(html, url, verbose);

  if (extracted.error || extracted.markdown === undefined) {
    return fail(extracted.error || 'Extraction produced no markdown');
  }

  const quality = validateMarkdown(extracted.markdown);

  if (quality.score < config.quality.minScore) {
    return fail(
      `Content quality too low (${quality.score}/100) even with JavaScript rendering`,
      {
        quality,
        suggestion: 'This page may be a login wall, forum, or complex web app not suitable for article extraction',
      }
    );
  }

  return {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
    usedPlaywright: true,
    playwrightReason: reason,
  };
}
110
+
111
/**
 * Fetch a URL and convert it to Markdown, escalating to Playwright as needed.
 *
 * A plain HTTP fetch is tried first. The browser path is used instead when
 * it is forced, when the plain fetch or the extraction fails, or when the
 * extracted content scores below `config.quality.minScore`. For scores
 * between `minScore` and `jsRetryThreshold` the browser is tried as well,
 * and its result is used only if it succeeds with a strictly higher score.
 *
 * @param url - URL to fetch.
 * @param config - Configuration providing the quality thresholds and Playwright settings.
 * @param verbose - When true, progress messages are written to stderr.
 * @param forcePlaywright - Skip the plain fetch and go straight to the browser.
 * @returns The fetch result from whichever path was chosen.
 */
export async function fetchUrl(
  url: string,
  config: FetchiConfig,
  verbose = false,
  forcePlaywright = false
): Promise<FetchResult> {
  const log = (message: string): void => {
    if (verbose) {
      console.error(message);
    }
  };

  if (forcePlaywright) {
    log(`⚡ Force Playwright mode enabled`);
    return tryPlaywright(url, config, 'forced', verbose);
  }

  const fetched = await simpleFetch(url, verbose);
  if (fetched.error) {
    log(`📡 Simple fetch failed: ${fetched.error}`);
    return tryPlaywright(url, config, 'network_error', verbose);
  }

  const extracted = await processHtmlToMarkdown(fetched.html, url, verbose);
  if (extracted.error) {
    log(`📝 Extraction failed: ${extracted.error}`);
    return tryPlaywright(url, config, 'extraction_failed', verbose);
  }

  const quality = validateMarkdown(extracted.markdown!);
  log(`📊 Quality score: ${quality.score}/100`);

  // Result of the plain-fetch path, returned whenever Playwright is not preferred.
  const staticResult: FetchResult = {
    success: true,
    markdown: extracted.markdown,
    title: extracted.title,
    byline: extracted.byline,
    excerpt: extracted.excerpt,
    siteName: extracted.siteName,
    quality,
  };

  if (quality.score >= config.quality.jsRetryThreshold) {
    return staticResult;
  }

  if (quality.score >= config.quality.minScore) {
    log(`📊 Quality marginal (${quality.score}), trying Playwright...`);

    const rendered = await tryPlaywright(url, config, 'quality_marginal', verbose);
    const renderedIsBetter = rendered.success && rendered.quality!.score > quality.score;

    return renderedIsBetter ? rendered : staticResult;
  }

  log(`📊 Quality too low (${quality.score}), trying Playwright...`);
  return tryPlaywright(url, config, 'quality_too_low', verbose);
}
188
+
189
+ export { closeBrowser };
@@ -0,0 +1,2 @@
1
+ export * from './types.js';
2
+ export * from './manager.js';
@@ -0,0 +1,38 @@
1
+ import { chromium } from 'playwright-extra';
2
+ import type { Browser } from 'playwright';
3
+ import stealth from 'puppeteer-extra-plugin-stealth';
4
+ import type { PlaywrightConfig } from '../../config/schema.js';
5
+ import type { BrowserManager } from './types.js';
6
+
7
+ chromium.use(stealth());
8
+
9
// Single browser shared by every LocalBrowserManager in this module.
let browserInstance: Browser | null = null;

/**
 * Browser manager that launches a local headless Chromium and reuses it
 * across calls.
 */
export class LocalBrowserManager implements BrowserManager {
  /**
   * @param config - Playwright settings; `timeout` is used as the launch timeout.
   */
  constructor(private readonly config: PlaywrightConfig) {}

  /**
   * Return the shared browser, launching it on first use.
   *
   * A cached browser that is no longer connected (closed or crashed) is
   * discarded and replaced, so later calls do not keep failing on a dead
   * handle.
   */
  async getBrowser(): Promise<Browser> {
    if (browserInstance && !browserInstance.isConnected()) {
      browserInstance = null;
    }

    if (!browserInstance) {
      browserInstance = await chromium.launch({
        headless: true,
        timeout: this.config.timeout,
      });
    }
    return browserInstance;
  }

  /**
   * Close the shared browser if one is open.
   *
   * The cached handle is cleared before closing, so a failing `close()`
   * cannot leave a stale browser behind for the next `getBrowser()` call.
   */
  async closeBrowser(): Promise<void> {
    const browser = browserInstance;
    if (!browser) return;

    browserInstance = null;
    await browser.close();
  }

  /** Always `false` for this manager. */
  isDocker(): boolean {
    return false;
  }
}
@@ -0,0 +1,61 @@
1
+ import type { PlaywrightConfig } from '../../config/schema.js';
2
+ import type { BrowserManager, FetchWithBrowserResult } from './types.js';
3
+ import { LocalBrowserManager } from './local.js';
4
+
5
// Manager created on first use and reused until closeBrowser() resets it.
let currentManager: BrowserManager | null = null;

/**
 * Return the active browser manager, creating it on first use.
 *
 * Only local mode is supported. Once a manager exists, the `config`
 * argument of later calls is not consulted.
 *
 * @param config - Playwright settings handed to a newly created manager.
 * @returns The shared manager instance.
 */
export async function getBrowserManager(config: PlaywrightConfig): Promise<BrowserManager> {
  if (currentManager === null) {
    currentManager = new LocalBrowserManager(config);
  }
  return currentManager;
}
16
+
17
/**
 * Load a URL in a headless browser and return the rendered HTML.
 *
 * Each call uses its own browser context, which is closed whether or not
 * navigation succeeds. Every failure, including a failure to obtain the
 * browser, is reported through `error` rather than thrown.
 *
 * @param url - URL to navigate to.
 * @param config - Playwright settings; `waitStrategy` and `timeout` are used for navigation.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The page HTML, or an empty string plus `error`.
 */
export async function fetchWithBrowser(
  url: string,
  config: PlaywrightConfig,
  verbose = false
): Promise<FetchWithBrowserResult> {
  try {
    const manager = await getBrowserManager(config);
    const browser = await manager.getBrowser();

    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    try {
      const page = await context.newPage();

      if (verbose) {
        console.error(`🎭 Playwright: Navigating to ${url}`);
      }

      await page.goto(url, {
        waitUntil: config.waitStrategy,
        timeout: config.timeout,
      });

      const html = await page.content();

      if (verbose) {
        console.error(`🎭 Playwright: Got ${html.length} chars of HTML`);
      }

      return { html };
    } finally {
      // Closing the context also closes its pages. A failure to close must not
      // replace the result (or the original error) of the navigation itself.
      await context.close().catch(() => undefined);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { html: '', error: message };
  }
}
55
+
56
/**
 * Close the browser held by the active manager and forget that manager.
 * Does nothing when no manager has been created.
 */
export async function closeBrowser(): Promise<void> {
  if (currentManager === null) {
    return;
  }

  await currentManager.closeBrowser();
  currentManager = null;
}
@@ -0,0 +1,12 @@
1
+ import type { Browser } from 'playwright';
2
+
3
/**
 * Contract for an object that owns the lifecycle of a Playwright browser.
 */
export interface BrowserManager {
  /** Resolve to a browser ready for use. */
  getBrowser(): Promise<Browser>;
  /** Close the managed browser. */
  closeBrowser(): Promise<void>;
  /** Report whether this manager runs the browser in Docker. */
  isDocker(): boolean;
}
8
+
9
/**
 * Outcome of loading a page in the browser.
 */
export interface FetchWithBrowserResult {
  /** Rendered page HTML; empty string on failure. */
  html: string;
  /** Failure message; absent on success. */
  error?: string;
}
@@ -0,0 +1,8 @@
1
/**
 * Type declarations for the `turndown-plugin-gfm` package.
 *
 * Each export is a Turndown plugin: a function that receives the service
 * instance, suitable for passing to `TurndownService.use`.
 */
declare module 'turndown-plugin-gfm' {
  import TurndownService from 'turndown';

  /** The `gfm` plugin. */
  export function gfm(turndownService: TurndownService): void;

  /** The `strikethrough` plugin. */
  export function strikethrough(turndownService: TurndownService): void;

  /** The `tables` plugin. */
  export function tables(turndownService: TurndownService): void;

  /** The `taskListItems` plugin. */
  export function taskListItems(turndownService: TurndownService): void;
}