arcfetch 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ import type { PlaywrightConfig } from '../../config/schema.js';
2
+ import type { BrowserManager, FetchWithBrowserResult } from './types.js';
3
+ import { LocalBrowserManager } from './local.js';
4
+ import { DockerBrowserManager, isDockerAvailable } from './docker.js';
5
+
6
/**
 * Module-level cache for the active browser manager.
 * Set by `getBrowserManager()` on first use and reset to `null` by `closeBrowser()`.
 */
let currentManager: BrowserManager | null = null;
7
+
8
/**
 * Returns the shared browser manager, creating it on first use.
 *
 * Once a manager exists it is returned as-is on every later call; the `config`
 * passed to those later calls is not consulted.
 *
 * @param config - Playwright settings; `config.mode` selects the backend
 *   (`'local'`, `'docker'` or `'auto'`).
 * @returns The cached or newly created manager.
 * @throws Error if `mode` is `'docker'` and Docker is not available, or if
 *   `mode` is none of the supported values.
 */
export async function getBrowserManager(config: PlaywrightConfig): Promise<BrowserManager> {
  if (currentManager !== null) {
    return currentManager;
  }

  const { mode } = config;

  switch (mode) {
    case 'local':
      currentManager = new LocalBrowserManager(config);
      return currentManager;

    case 'docker':
      if (!(await isDockerAvailable())) {
        throw new Error('Docker mode requested but Docker is not available');
      }
      currentManager = new DockerBrowserManager(config);
      return currentManager;

    case 'auto': {
      // Prefer Docker when it can be reached, otherwise fall back to a local browser.
      const useDocker = await isDockerAvailable();
      console.error(useDocker ? 'Using Docker for Playwright' : 'Docker not available, using local Playwright');
      currentManager = useDocker ? new DockerBrowserManager(config) : new LocalBrowserManager(config);
      return currentManager;
    }

    default:
      throw new Error(`Unknown Playwright mode: ${mode}`);
  }
}
44
+
45
/**
 * Loads `url` in a fresh browser context and returns the page's HTML.
 *
 * Failures while creating the context, navigating, or reading the page are not
 * thrown: they are reported as `{ html: '', error }`. Errors raised by
 * `getBrowserManager()` or `manager.getBrowser()` occur before the `try` block
 * and still reject the returned promise.
 *
 * The context is closed on every path, including failed navigations, so a
 * timeout does not leave an open context (and its page) on the shared browser.
 *
 * @param url - Address to navigate to.
 * @param config - Playwright settings; `waitStrategy` and `timeout` are passed to `page.goto`.
 * @param verbose - When true, progress messages are written to stderr.
 * @returns The HTML on success, or an empty `html` plus an `error` message.
 */
export async function fetchWithBrowser(
  url: string,
  config: PlaywrightConfig,
  verbose = false
): Promise<FetchWithBrowserResult> {
  const manager = await getBrowserManager(config);
  const browser = await manager.getBrowser();

  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });

    let html: string;
    try {
      const page = await context.newPage();

      if (verbose) {
        console.error(`🎭 Playwright: Navigating to ${url} (${manager.isDocker() ? 'Docker' : 'local'})`);
      }

      await page.goto(url, {
        waitUntil: config.waitStrategy,
        timeout: config.timeout,
      });

      html = await page.content();

      if (verbose) {
        console.error(`🎭 Playwright: Got ${html.length} chars of HTML`);
      }
    } catch (error) {
      // Release the context before reporting the failure. A secondary error
      // from close() is dropped so the caller sees the original cause.
      try {
        await context.close();
      } catch {
        // intentionally ignored: the navigation error is the one worth reporting
      }
      throw error;
    }

    // Closing the context also closes the pages that belong to it.
    await context.close();

    return { html };
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { html: '', error: message };
  }
}
83
+
84
/**
 * Closes the cached browser manager, if there is one, and clears the cache so
 * the next `getBrowserManager()` call creates a new manager.
 * Does nothing when no manager has been created.
 */
export async function closeBrowser(): Promise<void> {
  if (currentManager === null) {
    return;
  }

  await currentManager.closeBrowser();
  currentManager = null;
}
@@ -0,0 +1,12 @@
1
+ import type { Browser } from 'playwright';
2
+
3
/**
 * Contract shared by the browser backends (local and Docker implementations
 * are selected in `getBrowserManager`).
 */
export interface BrowserManager {
  /** Resolves to the Playwright `Browser` used to open contexts and pages. */
  getBrowser(): Promise<Browser>;
  /** Shuts the manager's browser down. */
  closeBrowser(): Promise<void>;
  /** True for a Docker-backed manager; used to label verbose log output. */
  isDocker(): boolean;
}
8
+
9
/** Outcome of `fetchWithBrowser`. */
export interface FetchWithBrowserResult {
  /** HTML of the loaded page; the empty string when the fetch failed. */
  html: string;
  /** Failure message; only present when the fetch failed. */
  error?: string;
}
@@ -0,0 +1,8 @@
1
// Ambient typings for 'turndown-plugin-gfm'. Each export is declared as a
// function that receives a TurndownService instance and returns nothing.
declare module 'turndown-plugin-gfm' {
  import TurndownService from 'turndown';

  export function gfm(turndownService: TurndownService): void;
  export function strikethrough(turndownService: TurndownService): void;
  export function tables(turndownService: TurndownService): void;
  export function taskListItems(turndownService: TurndownService): void;
}
@@ -0,0 +1,79 @@
1
+ /**
2
+ * Markdown Cleaning Utilities
3
+ *
4
+ * Post-processing functions to clean and optimize markdown for LLM context efficiency
5
+ */
6
+
7
/**
 * Matches the placeholders that stand in for protected code while the prose
 * rules run: U+E000..U+E001 wrap a fenced-block index, U+E002..U+E003 wrap an
 * inline-span index. These are private-use code points, so they are not
 * whitespace and none of the cleaning rules touch them.
 */
const CODE_PLACEHOLDER = /\uE000(\d+)\uE001|\uE002(\d+)\uE003/g;

/** Returns the fence marker that `line` opens, or null if it is not an opening fence. */
function fenceMarker(line: string): string | null {
  // A backtick fence line may not contain further backticks; "```x```" is inline code.
  if (/^```[^`]*$/.test(line)) {
    return '```';
  }
  return line.startsWith('~~~') ? '~~~' : null;
}

/**
 * Runs `transform` on the markdown with all code hidden from it.
 *
 * Fenced blocks (``` or ~~~ at the start of a line, up to the next line that
 * starts with the same marker, or to the end of input when unterminated) and
 * single-backtick inline spans are swapped for placeholders, `transform` is
 * applied to what remains, and the code is put back byte-for-byte.
 *
 * @param markdown - Text to process; expected to use `\n` line endings.
 * @param transform - Rewrite applied to the prose (placeholders included).
 * @param mapFenced - Optional rewrite applied to each fenced block on restore.
 */
function transformOutsideCode(
  markdown: string,
  transform: (prose: string) => string,
  mapFenced: (block: string) => string = (block) => block,
): string {
  const fencedBlocks: string[] = [];
  const inlineSpans: string[] = [];
  const maskedLines: string[] = [];

  let openMarker: string | null = null;
  let openBlock: string[] = [];

  for (const line of markdown.split('\n')) {
    if (openMarker === null) {
      const marker = fenceMarker(line);
      if (marker === null) {
        maskedLines.push(
          line.replace(/`[^`\n]+`/g, (span) => `\uE002${inlineSpans.push(span) - 1}\uE003`),
        );
      } else {
        openMarker = marker;
        openBlock = [line];
      }
      continue;
    }

    openBlock.push(line);
    if (line.startsWith(openMarker)) {
      maskedLines.push(`\uE000${fencedBlocks.push(openBlock.join('\n')) - 1}\uE001`);
      openMarker = null;
    }
  }

  // An unterminated fence runs to the end of the document.
  if (openMarker !== null) {
    maskedLines.push(`\uE000${fencedBlocks.push(openBlock.join('\n')) - 1}\uE001`);
  }

  // Restore through a callback so `$` sequences inside code are inserted literally.
  return transform(maskedLines.join('\n')).replace(
    CODE_PLACEHOLDER,
    (placeholder: string, fencedIndex?: string, inlineIndex?: string): string => {
      if (fencedIndex !== undefined) {
        const block = fencedBlocks[Number(fencedIndex)];
        return block === undefined ? placeholder : mapFenced(block);
      }
      return inlineSpans[Number(inlineIndex)] ?? placeholder;
    },
  );
}

/**
 * Rewrites a complete `~~~` fenced block to use ``` fences.
 * The block is left alone unless both fence lines can be converted, or if it
 * contains a line starting with ``` (which would end the new fence early).
 */
function tildeFenceToBackticks(block: string): string {
  if (!block.startsWith('~~~') || /^```/m.test(block)) {
    return block;
  }
  return block.replace(/^~~~(\w*)\n((?:[\s\S]*\n)?)~~~$/, '```$1\n$2```');
}

/**
 * First cleaning pass: normalises line endings and whitespace and puts blank
 * lines around headings, list starts and fenced code blocks.
 *
 * Code (fenced blocks and inline spans) is passed through unchanged.
 * Spaces are only removed *inside* a padded `** text **` / `__ text __` pair;
 * spaces outside emphasis markers are kept so neighbouring words stay apart.
 *
 * @param markdown - Raw markdown.
 * @returns Cleaned markdown; blank or whitespace-only input is returned as-is.
 */
export function cleanMarkdown(markdown: string): string {
  if (!markdown.trim()) {
    return markdown;
  }

  return transformOutsideCode(markdown.replace(/\r\n/g, '\n'), (prose) =>
    prose
      // Strip trailing whitespace first so whitespace-only lines count as blank.
      .replace(/[ \t]+$/gm, '')
      .replace(/\n{3,}/g, '\n\n')
      .trim()
      // Blank line before and after headings.
      .replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2')
      .replace(/(#{1,6} .+)\n([^#\n])/g, '$1\n\n$2')
      // Blank line before list items.
      .replace(/([^\n])\n([-*+] |\d+\. )/g, '$1\n\n$2')
      // "** bold **" -> "**bold**"; the pair must be delimited by whitespace
      // (or trailing punctuation) so a closing marker is never paired with
      // the next opening one.
      .replace(/(?<!\S)(\*\*|__)[ \t]+(\S(?:[^\n]*?\S)?)[ \t]+\1(?![^\s.,;:!?)\]}'"])/g, '$1$2$1')
      // Blank line before and after fenced code blocks (seen here as placeholders).
      .replace(/([^\n])\n(\uE000\d+\uE001)/g, '$1\n\n$2')
      .replace(/(\uE000\d+\uE001)\n([^\n])/g, '$1\n\n$2')
      .replace(/<!--[\s\S]*?-->/g, ' ')
      .replace(/ {2,}/g, ' ')
      // Comment removal can leave whitespace-only lines behind; tidy up again.
      .replace(/[ \t]+$/gm, '')
      .replace(/\n{3,}/g, '\n\n'),
  );
}

/**
 * Second cleaning pass: drops empty links, leftover HTML tags, bold markers
 * (`**`, `__`) and zero-width characters, and maps typographic quotes and
 * dashes to ASCII. Runs of spaces are collapsed to one.
 *
 * Code is passed through unchanged. Autolinks (`<https://…>`, `<mailto:…>`,
 * `<user@host>`) and a `<` that does not start a tag (as in `a < b`) are kept.
 *
 * @param markdown - Markdown, typically the output of `cleanMarkdown`.
 * @returns The cleaned markdown.
 */
export function advancedClean(markdown: string): string {
  return transformOutsideCode(markdown, (prose) =>
    prose
      .replace(/\[([^\]]+)\]\(\)/g, '$1')
      .replace(
        /<(?!(?:[A-Za-z][A-Za-z0-9+.-]*:\/\/|mailto:)[^\s<>]*>|[^\s<>@]+@[^\s<>]+>)[A-Za-z\/!?][^>]*>/g,
        '',
      )
      .replace(/\*\*\*\*/g, '')
      .replace(/(?<!\*)\*\*(?!\*)/g, '')
      .replace(/__/g, '')
      .replace(/[\u200B-\u200D\uFEFF]/g, '')
      .replace(/[\u201C\u201D]/g, '"')
      .replace(/[\u2018\u2019]/g, "'")
      .replace(/[\u2013\u2014]/g, '-')
      .replace(/ {2,}/g, ' '),
  );
}

/**
 * Final pass: uses `-` for bullet markers, `*` for `_emphasis_`, ``` for `~~~`
 * fences, collapses extra blank lines and ends the text with one newline.
 *
 * Underscores inside words (`snake_case_name`) are not treated as emphasis,
 * and the contents of code blocks and inline code are not modified.
 *
 * @param markdown - Markdown to finalise.
 * @returns The finalised markdown; blank or whitespace-only input is returned as-is.
 */
export function finalCleanup(markdown: string): string {
  if (!markdown.trim()) {
    return markdown;
  }

  const body = transformOutsideCode(
    markdown,
    (prose) =>
      prose
        .replace(/^(\s*)[*+] /gm, '$1- ')
        .replace(/(?<![A-Za-z0-9_])_(?=\S)([^_\n]*[^_\s])_(?![A-Za-z0-9_])/g, '*$1*')
        .replace(/\n{3,}/g, '\n\n'),
    tildeFenceToBackticks,
  );

  return `${body.trim()}\n`;
}

/**
 * Runs the full pipeline: `cleanMarkdown`, then `advancedClean`, then `finalCleanup`.
 *
 * @param markdown - Raw markdown.
 * @returns The fully cleaned markdown; blank or whitespace-only input is returned as-is.
 */
export function cleanMarkdownComplete(markdown: string): string {
  if (!markdown.trim()) {
    return markdown;
  }

  return finalCleanup(advancedClean(cleanMarkdown(markdown)));
}
@@ -0,0 +1,136 @@
1
+ /**
2
+ * Markdown Quality Validator
3
+ *
4
+ * Validates that extracted markdown is clean and usable.
5
+ * Returns quality score and issues.
6
+ */
7
+
8
/** Outcome of `validateMarkdown`. */
export interface ValidationResult {
  /** True when the score is at least 60. */
  isValid: boolean;
  /** Quality score, 0-100. */
  score: number; // 0-100
  /** Problems serious enough to be reported as issues. */
  issues: string[];
  /** Less severe observations. */
  warnings: string[];
}
14
+
15
/**
 * Scores extracted markdown from 100 downwards.
 *
 * Points are deducted for leftover HTML (tag count, unconverted table tags,
 * share of characters inside tags, script/style tags), for very little text
 * content, and for many runs of five or more newlines. Each deduction adds a
 * message to `issues` or `warnings`.
 *
 * @param markdown - Markdown to assess.
 * @returns The score clamped at 0, `isValid` (score of at least 60) and the collected messages.
 */
export function validateMarkdown(markdown: string): ValidationResult {
  const issues: string[] = [];
  const warnings: string[] = [];
  let score = 100;

  const occurrences = (pattern: RegExp): string[] => markdown.match(pattern) ?? [];
  const penalize = (points: number, bucket: string[], message: string): void => {
    score -= points;
    bucket.push(message);
  };

  // Leftover HTML tags indicate a poor conversion.
  const htmlTagCount = occurrences(/<[^>]+>/g).length;
  if (htmlTagCount > 100) {
    penalize(40, issues, `${htmlTagCount} leftover HTML tags found (likely forum/discussion thread)`);
  } else if (htmlTagCount > 50) {
    penalize(20, warnings, `${htmlTagCount} HTML tags present`);
  } else if (htmlTagCount > 10) {
    penalize(5, warnings, `${htmlTagCount} minor HTML tags present`);
  }

  // Table rows/cells that were not converted.
  const tableTagCount = occurrences(/<t[rd][\s>]/gi).length;
  if (tableTagCount > 50) {
    penalize(30, issues, `${tableTagCount} unconverted table tags (complex layout not suitable)`);
  }

  // Share of the text that sits inside angle brackets.
  // For empty input this is 0 / 0 = NaN, which fails both comparisons below.
  const htmlCharCount = occurrences(/<[^>]*>/g).reduce((total, tag) => total + tag.length, 0);
  const htmlRatio = htmlCharCount / markdown.length;
  const htmlPercent = (htmlRatio * 100).toFixed(1);
  if (htmlRatio > 0.3) {
    penalize(25, issues, `${htmlPercent}% of content is HTML tags`);
  } else if (htmlRatio > 0.15) {
    penalize(10, warnings, `${htmlPercent}% HTML tag ratio`);
  }

  // Script and style tags should never survive extraction.
  const scriptCount = occurrences(/<script/gi).length;
  if (scriptCount > 0) {
    penalize(15, warnings, `${scriptCount} script tags present`);
  }

  const styleCount = occurrences(/<style/gi).length;
  if (styleCount > 0) {
    penalize(10, warnings, `${styleCount} style tags present`);
  }

  // Length of what remains once tags and markdown punctuation are removed.
  const contentLength = markdown
    .replace(/<[^>]*>/g, '')
    .replace(/[#*\-_`[\]()]/g, '')
    .trim().length;

  if (contentLength === 0) {
    score = 0;
    issues.push("Blank content - no text extracted");
  } else if (contentLength < 50) {
    penalize(50, issues, `Extremely short content (${contentLength} chars) - likely extraction failure`);
  } else if (contentLength < 200 && (htmlTagCount > 50 || tableTagCount > 20)) {
    penalize(
      30,
      issues,
      `Only ${contentLength} chars of actual content with excessive HTML (extraction likely failed)`,
    );
  } else if (contentLength < 300) {
    penalize(15, warnings, `Short content (${contentLength} chars) - may not be a full article`);
  }

  // Many long newline runs point at poor formatting.
  const newlineRuns = occurrences(/\n{5,}/g).length;
  if (newlineRuns > 10) {
    penalize(5, warnings, `${newlineRuns} sections with excessive newlines`);
  }

  return {
    isValid: score >= 60, // Below 60 is unusable
    score: Math.max(0, score),
    issues,
    warnings,
  };
}
104
+
105
/**
 * Renders a validation result as a short markdown report.
 *
 * The report starts with the score and a rating (90+ Excellent, 75+ Good,
 * 60+ Acceptable, otherwise Poor), followed by an "Issues" and/or a
 * "Warnings" bullet list when those arrays are non-empty. Every section is
 * separated from the previous one by a blank line, including the case where
 * there are warnings but no issues.
 *
 * @param result - Result to render.
 * @returns The report text.
 */
export function formatValidationReport(result: ValidationResult): string {
  const { score, issues, warnings } = result;
  const toBullets = (items: string[]): string => items.map((item) => `- ${item}\n`).join('');

  let rating: string;
  if (score >= 90) {
    rating = " ✅ Excellent";
  } else if (score >= 75) {
    rating = " ✅ Good";
  } else if (score >= 60) {
    rating = " ⚠️ Acceptable";
  } else {
    rating = " ❌ Poor";
  }

  let report = `**Quality Score**: ${score}/100${rating}`;

  if (issues.length > 0) {
    report += `\n\n**Issues**:\n${toBullets(issues)}`;
  }

  if (warnings.length > 0) {
    // The issues list already ends with a newline; the score line does not.
    const separator = issues.length > 0 ? "\n" : "\n\n";
    report += `${separator}**Warnings**:\n${toBullets(warnings)}`;
  }

  return report;
}