arcfetch 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +212 -0
- package/cli.ts +461 -0
- package/index.ts +332 -0
- package/package.json +96 -0
- package/src/config/defaults.ts +20 -0
- package/src/config/index.ts +3 -0
- package/src/config/loader.ts +118 -0
- package/src/config/schema.ts +34 -0
- package/src/core/cache.ts +260 -0
- package/src/core/extractor.ts +87 -0
- package/src/core/index.ts +4 -0
- package/src/core/pipeline.ts +189 -0
- package/src/core/playwright/index.ts +2 -0
- package/src/core/playwright/local.ts +38 -0
- package/src/core/playwright/manager.ts +61 -0
- package/src/core/playwright/types.ts +12 -0
- package/src/types/turndown-plugin-gfm.d.ts +8 -0
- package/src/utils/markdown-cleaner.ts +79 -0
- package/src/utils/markdown-validator.ts +136 -0
- package/src/utils/version.ts +12 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown Cleaning Utilities
|
|
3
|
+
*
|
|
4
|
+
* Post-processing functions to clean and optimize markdown for LLM context efficiency
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
/**
 * Normalises whitespace and block spacing of converted markdown.
 *
 * Fenced code blocks (``` or ~~~) are copied through verbatim; everything
 * else is treated as prose and cleaned:
 * - HTML comments are replaced by a single space,
 * - trailing whitespace is stripped and runs of blank lines collapse to one,
 * - headings get a blank line before and after,
 * - a list gets a blank line before its first item (items stay adjacent),
 * - padding inside emphasis pairs is removed (`** x **` -> `**x**`),
 * - runs of spaces collapse to one, except leading indentation.
 *
 * Prose and code blocks are re-joined with exactly one blank line between
 * them and the result is trimmed.
 *
 * Known limit: a spaced pair of single markers on one line (e.g. `2 * 3 * 4`)
 * is indistinguishable from padded emphasis and is tightened to `2 *3* 4`.
 *
 * @param markdown - Markdown text to clean.
 * @returns The cleaned text; blank/whitespace-only input is returned unchanged.
 */
export function cleanMarkdown(markdown: string): string {
  if (!markdown.trim()) {
    return markdown;
  }

  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const headingPattern = /^#{1,6} /;
  const topLevelItemPattern = /^(?:[-*+] |\d+\. )/;
  const anyItemPattern = /^[ \t]*(?:[-*+]|\d+\.) /;
  // A marker, lazily-matched content, then the same marker. The lookarounds
  // keep a marker from being cut out of a longer run (`***`) or an escape.
  const emphasisSpan = /(?<![*_\\])(\*\*|__|\*|_)(?![*_])([^\n]*?)(?<![*_\\])\1(?![*_])/g;

  // Cleans one prose line while keeping its indentation and list bullet.
  const tidyLine = (raw: string): string => {
    const line = raw.replace(/[ \t]+$/, '');
    const indent = /^[ \t]*/.exec(line)?.[0] ?? '';
    let body = line.slice(indent.length);
    let bullet = '';
    // A leading "* " / "- " / "+ " is a list bullet, never an emphasis marker.
    const bulletMatch = /^([-*+]) +(?=\S)/.exec(body);
    if (bulletMatch) {
      bullet = `${bulletMatch[1]} `;
      body = body.slice(bulletMatch[0].length);
    }
    const tidied = body
      .split(/(`+[^`\n]+`+)/) // odd indices are inline code spans
      .map((part, index) => {
        if (index % 2 === 1) {
          return part;
        }
        const tightened = part.replace(
          emphasisSpan,
          (span: string, marker: string, inner: string) => {
            const trimmed = inner.trim();
            // Nothing between the markers (e.g. "* *"): leave untouched.
            return trimmed === '' ? span : `${marker}${trimmed}${marker}`;
          },
        );
        return tightened.replace(/ {2,}/g, ' ');
      })
      .join('');
    return indent + bullet + tidied;
  };

  // Cleans a run of prose lines and applies the blank-line rules.
  const cleanProse = (lines: string[]): string => {
    const out: string[] = [];
    const source = lines.join('\n').replace(/<!--[\s\S]*?-->/g, ' ').split('\n');
    for (const raw of source) {
      const line = tidyLine(raw);
      const previous = out[out.length - 1] ?? '';
      if (line === '') {
        // Keep at most one blank line, and none at the very start.
        if (previous !== '') {
          out.push('');
        }
        continue;
      }
      const startsList = topLevelItemPattern.test(line) && !anyItemPattern.test(previous);
      const needsGap =
        previous !== '' &&
        (headingPattern.test(line) || headingPattern.test(previous) || startsList);
      if (needsGap) {
        out.push('');
      }
      out.push(line);
    }
    while (out.length > 0 && out[out.length - 1] === '') {
      out.pop();
    }
    return out.join('\n');
  };

  const blocks: string[] = [];
  let pending: string[] = [];
  let openFence: string | null = null;

  const flushProse = (): void => {
    const prose = cleanProse(pending);
    if (prose !== '') {
      blocks.push(prose);
    }
    pending = [];
  };

  for (const line of markdown.replace(/\r\n/g, '\n').split('\n')) {
    const marker = fencePattern.exec(line)?.[1];
    if (openFence === null) {
      if (marker !== undefined) {
        flushProse();
        openFence = marker;
      }
      pending.push(line);
    } else {
      pending.push(line);
      // A closing fence uses the same character, is at least as long as the
      // opening one, and carries no info string.
      if (marker !== undefined && marker.startsWith(openFence) && line.trim() === marker) {
        blocks.push(pending.join('\n'));
        pending = [];
        openFence = null;
      }
    }
  }

  if (openFence === null) {
    flushProse();
  } else {
    // Unterminated fence: keep the remainder verbatim rather than guess.
    blocks.push(pending.join('\n'));
  }

  return blocks.join('\n\n').trim();
}
|
|
30
|
+
|
|
31
|
+
/**
 * Second cleaning pass: removes conversion leftovers from markdown prose.
 *
 * Applied to the whole text:
 * - zero-width characters (U+200B–U+200D, U+FEFF) are removed,
 * - curly quotes become straight quotes, en/em dashes become `-`.
 *
 * Applied only outside fenced code blocks and inline code spans:
 * - links with an empty target (`[text]()`) are reduced to their text,
 * - HTML tags are removed (only real tags: `<name ...>`, `</name>`,
 *   `<name/>`; comparisons such as `a < b > c` and autolinks such as
 *   `<https://example.com>` are left alone),
 * - `****` and `**` markers are removed,
 * - `__` markers are removed unless inside a word or part of a longer
 *   underscore run,
 * - images with empty alt text are removed,
 * - runs of spaces collapse to one, except leading indentation.
 *
 * @param markdown - Markdown text to clean.
 * @returns The cleaned text.
 */
export function advancedClean(markdown: string): string {
  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const htmlTag = /<\/?[A-Za-z][A-Za-z0-9-]*(?:\s[^<>]*)?\/?>/g;

  // Typographic normalisation cannot damage code, so it runs on everything.
  const normalized = markdown
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[\u2013\u2014]/g, '-');

  // Cleans a run of prose lines, skipping inline code spans.
  const scrubProse = (chunk: string): string =>
    chunk
      .split(/(`+[^`\n]+`+)/) // odd indices are inline code spans
      .map((part, index) => {
        if (index % 2 === 1) {
          return part;
        }
        const scrubbed = part
          .replace(/\[([^\]]+)\]\(\)/g, '$1')
          .replace(htmlTag, '')
          .replace(/\*\*\*\*/g, '')
          .replace(/(?<!\*)\*\*(?!\*)/g, '')
          .replace(/(?<![A-Za-z0-9_])__(?!_)|(?<!_)__(?![A-Za-z0-9_])/g, '')
          .replace(/!\[\]\(([^)]+)\)/g, '')
          // Spaces right after a newline are indentation and are kept.
          .replace(/([^\n ]) {2,}/g, '$1 ');
        // A part that follows a code span starts mid-line, so its leading
        // spaces are ordinary spacing rather than indentation.
        return index > 0 ? scrubbed.replace(/^ {2,}/, ' ') : scrubbed;
      })
      .join('');

  const pieces: string[] = [];
  let prose: string[] = [];
  let openFence: string | null = null;

  const flushProse = (): void => {
    if (prose.length > 0) {
      pieces.push(scrubProse(prose.join('\n')));
      prose = [];
    }
  };

  for (const line of normalized.split('\n')) {
    const marker = fencePattern.exec(line)?.[1];
    if (openFence !== null) {
      pieces.push(line);
      if (marker !== undefined && marker.startsWith(openFence) && line.trim() === marker) {
        openFence = null;
      }
    } else if (marker !== undefined) {
      flushProse();
      openFence = marker;
      pieces.push(line);
    } else {
      prose.push(line);
    }
  }
  flushProse();

  return pieces.join('\n');
}
|
|
51
|
+
|
|
52
|
+
/**
 * Final normalisation pass that makes markdown syntax uniform.
 *
 * Outside fenced code blocks:
 * - `*` and `+` list bullets become `-` (thematic breaks such as `* * *`
 *   are left alone),
 * - `_text_` emphasis becomes `*text*`; only underscores that sit on word
 *   boundaries and on a single line are converted, so snake_case names and
 *   inline code spans are untouched,
 * - runs of blank lines collapse to a single blank line.
 *
 * Fenced code is copied through unchanged, except that a complete `~~~`
 * fence pair is rewritten to backticks when the enclosed code contains no
 * triple-backtick run that would end the block early.
 *
 * @param markdown - Markdown text to normalise.
 * @returns The trimmed text followed by exactly one newline; blank or
 *   whitespace-only input is returned unchanged.
 */
export function finalCleanup(markdown: string): string {
  if (!markdown.trim()) {
    return markdown;
  }

  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const thematicBreak = /^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*$/;
  // `\w` includes `_`, so the lookarounds also reject `__strong__` markers.
  const underscoreEmphasis = /(?<![\w\\])_(?=\S)([^_\n]+?)(?<=\S)_(?!\w)/g;

  const normaliseLine = (line: string): string => {
    const rebulleted = thematicBreak.test(line)
      ? line
      : line.replace(/^([ \t]*)[*+] /, '$1- ');
    return rebulleted
      .split(/(`+[^`\n]+`+)/) // odd indices are inline code spans
      .map((part, index) =>
        index % 2 === 1 ? part : part.replace(underscoreEmphasis, '*$1*'),
      )
      .join('');
  };

  const toBackticks = (fenceLine: string, marker: string): string =>
    fenceLine.replace(marker, '`'.repeat(marker.length));

  const out: string[] = [];
  let openFence: string | null = null;
  let fenceStart = 0;

  for (const line of markdown.split('\n')) {
    const marker = fencePattern.exec(line)?.[1];

    if (openFence === null) {
      if (marker !== undefined) {
        openFence = marker;
        fenceStart = out.length;
        out.push(line);
      } else if (line === '' && out.length > 0 && out[out.length - 1] === '') {
        // Second blank line in a row: drop it.
      } else {
        out.push(normaliseLine(line));
      }
      continue;
    }

    out.push(line);
    const closes =
      marker !== undefined && marker.startsWith(openFence) && line.trim() === marker;
    if (closes) {
      if (openFence.startsWith('~')) {
        const inner = out.slice(fenceStart + 1, -1);
        if (!inner.some((codeLine) => codeLine.includes('```'))) {
          out[fenceStart] = toBackticks(out[fenceStart] ?? '', openFence);
          out[out.length - 1] = toBackticks(line, marker ?? openFence);
        }
      }
      openFence = null;
    }
  }

  return `${out.join('\n').trim()}\n`;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Runs the full cleaning pipeline: `cleanMarkdown`, then `advancedClean`,
 * then `finalCleanup`, each stage receiving the previous stage's output.
 *
 * @param markdown - Markdown text to clean.
 * @returns The output of the last stage; blank or whitespace-only input is
 *   returned unchanged without running any stage.
 */
export function cleanMarkdownComplete(markdown: string): string {
  const isBlank = markdown.trim().length === 0;
  if (isBlank) {
    return markdown;
  }

  const stages = [cleanMarkdown, advancedClean, finalCleanup];
  return stages.reduce((text, stage) => stage(text), markdown);
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown Quality Validator
|
|
3
|
+
*
|
|
4
|
+
* Validates that extracted markdown is clean and usable.
|
|
5
|
+
* Returns quality score and issues.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** Outcome of a markdown quality check. */
export interface ValidationResult {
  /** Whether the markdown is considered usable. */
  isValid: boolean;
  /** Quality score on a 0-100 scale. */
  score: number;
  /** Problems found, one message per problem. */
  issues: string[];
  /** Warnings found, one message per warning. */
  warnings: string[];
}
|
|
14
|
+
|
|
15
|
+
/**
 * Scores how clean a piece of extracted markdown is.
 *
 * Starts at 100 and subtracts penalties for leftover HTML (tag count, share
 * of characters inside tags, unconverted `<tr>`/`<td>`, `<script>`,
 * `<style>`), for very short content, and for many runs of five or more
 * newlines. Blank content forces the score to 0.
 *
 * Only real markup counts as HTML: element tags, closing tags, comments and
 * `<!...>` declarations. Text that merely contains angle brackets, such as
 * `a < b and c > d` or an autolink like `<https://example.com>`, is treated
 * as content.
 *
 * @param markdown - Markdown text to assess.
 * @returns The clamped score (0-100), the collected issues and warnings, and
 *   `isValid`, which is true when the score is at least 60.
 */
export function validateMarkdown(markdown: string): ValidationResult {
  const issues: string[] = [];
  const warnings: string[] = [];
  let score = 100;

  // One scan feeds the tag count, the character ratio and the content length.
  const htmlPattern = /<!--[\s\S]*?-->|<[\/!]?[A-Za-z][A-Za-z0-9-]*(?:\s[^<>]*)?\/?>/g;
  const htmlTags: string[] = markdown.match(htmlPattern) ?? [];
  const htmlTagCount = htmlTags.length;

  // Check for excessive HTML tags (indicates poor conversion)
  if (htmlTagCount > 100) {
    score -= 40;
    issues.push(`${htmlTagCount} leftover HTML tags found (likely forum/discussion thread)`);
  } else if (htmlTagCount > 50) {
    score -= 20;
    warnings.push(`${htmlTagCount} HTML tags present`);
  } else if (htmlTagCount > 10) {
    score -= 5;
    warnings.push(`${htmlTagCount} minor HTML tags present`);
  }

  // Check for table structure (tr/td) not converted
  const tableTagCount = (markdown.match(/<t[rd][\s>]/gi) ?? []).length;

  if (tableTagCount > 50) {
    score -= 30;
    issues.push(`${tableTagCount} unconverted table tags (complex layout not suitable)`);
  }

  // Check markdown to HTML ratio (too much HTML means poor extraction).
  // Empty input would otherwise divide 0 by 0.
  const htmlCharCount = htmlTags.reduce((total, tag) => total + tag.length, 0);
  const htmlRatio = markdown.length > 0 ? htmlCharCount / markdown.length : 0;

  if (htmlRatio > 0.3) {
    score -= 25;
    issues.push(`${(htmlRatio * 100).toFixed(1)}% of content is HTML tags`);
  } else if (htmlRatio > 0.15) {
    score -= 10;
    warnings.push(`${(htmlRatio * 100).toFixed(1)}% HTML tag ratio`);
  }

  // Check for script tags (should never be present)
  const scriptTagCount = (markdown.match(/<script/gi) ?? []).length;
  if (scriptTagCount > 0) {
    score -= 15;
    warnings.push(`${scriptTagCount} script tags present`);
  }

  // Check for style tags
  const styleTagCount = (markdown.match(/<style/gi) ?? []).length;
  if (styleTagCount > 0) {
    score -= 10;
    warnings.push(`${styleTagCount} style tags present`);
  }

  // Check for minimal/blank content: what is left once markup and markdown
  // punctuation are removed.
  const contentLength = markdown
    .replace(htmlPattern, '')
    .replace(/[#*\-_`[\]()]/g, '')
    .trim().length;

  if (contentLength === 0) {
    score = 0;
    issues.push("Blank content - no text extracted");
  } else if (contentLength < 50) {
    score -= 50;
    issues.push(`Extremely short content (${contentLength} chars) - likely extraction failure`);
  } else if (contentLength < 200 && (htmlTagCount > 50 || tableTagCount > 20)) {
    score -= 30;
    issues.push(`Only ${contentLength} chars of actual content with excessive HTML (extraction likely failed)`);
  } else if (contentLength < 300) {
    score -= 15;
    warnings.push(`Short content (${contentLength} chars) - may not be a full article`);
  }

  // Check for excessive newlines (indicates poor formatting)
  const newlineRunCount = (markdown.match(/\n{5,}/g) ?? []).length;
  if (newlineRunCount > 10) {
    score -= 5;
    warnings.push(`${newlineRunCount} sections with excessive newlines`);
  }

  // Quality thresholds
  const isValid = score >= 60; // Below 60 is unusable

  return {
    isValid,
    score: Math.max(0, score),
    issues,
    warnings,
  };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Generate human-readable quality report
 *
 * The report is markdown: a score line with a rating label, then an
 * `**Issues**` list and a `**Warnings**` list when those arrays are
 * non-empty. Sections are always separated by one blank line, so a
 * warnings-only report no longer runs the warnings heading into the score
 * paragraph.
 *
 * @param result - Validation outcome to render.
 * @returns The report text. It ends with a newline when at least one list is
 *   present, and is the bare score line otherwise.
 */
export function formatValidationReport(result: ValidationResult): string {
  let rating: string;
  if (result.score >= 90) {
    rating = " ✅ Excellent";
  } else if (result.score >= 75) {
    rating = " ✅ Good";
  } else if (result.score >= 60) {
    rating = " ⚠️ Acceptable";
  } else {
    rating = " ❌ Poor";
  }

  const renderList = (title: string, entries: string[]): string =>
    `**${title}**:\n${entries.map((entry) => `- ${entry}`).join('\n')}`;

  const sections = [`**Quality Score**: ${result.score}/100${rating}`];
  if (result.issues.length > 0) {
    sections.push(renderList('Issues', result.issues));
  }
  if (result.warnings.length > 0) {
    sections.push(renderList('Warnings', result.warnings));
  }

  const report = sections.join('\n\n');
  // Reports that contain a list have always ended with a newline.
  return sections.length > 1 ? `${report}\n` : report;
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import fs from 'fs';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
|
|
5
|
+
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const packageJsonPath = path.resolve(__dirname, '../../package.json');

/**
 * Reads the `version` field from a package manifest.
 *
 * The parsed JSON is treated as `unknown` and narrowed, so a manifest whose
 * `version` is missing or not a string yields `'unknown'` instead of a
 * non-string value. A missing file or invalid JSON still throws.
 */
function readManifestVersion(manifestPath: string): string {
  const manifest: unknown = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
  const version =
    typeof manifest === 'object' && manifest !== null
      ? (manifest as { version?: unknown }).version
      : undefined;
  return typeof version === 'string' ? version : 'unknown';
}

// Read once when the module is loaded.
const packageVersion = readManifestVersion(packageJsonPath);

/**
 * Returns the version read from the `package.json` two directories above
 * this module.
 *
 * @returns The manifest's `version` string, or `'unknown'` when the manifest
 *   has no string `version`.
 */
export function getVersion(): string {
  return packageVersion;
}
|