arcfetch 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/cli.ts +96 -122
- package/index.ts +71 -102
- package/package.json +2 -2
- package/src/config/defaults.ts +1 -5
- package/src/config/index.ts +1 -1
- package/src/config/loader.ts +5 -9
- package/src/config/schema.ts +1 -8
- package/src/core/cache.ts +12 -13
- package/src/core/extractor.ts +3 -7
- package/src/core/fetch-links.ts +85 -0
- package/src/core/index.ts +3 -2
- package/src/core/pipeline.ts +29 -13
- package/src/core/playwright/index.ts +1 -1
- package/src/core/playwright/local.ts +15 -6
- package/src/core/playwright/manager.ts +86 -14
- package/src/utils/markdown-cleaner.ts +41 -41
- package/src/utils/markdown-validator.ts +98 -26
package/src/config/schema.ts
CHANGED
|
@@ -6,7 +6,7 @@ export const QualityConfigSchema = z.object({
|
|
|
6
6
|
});
|
|
7
7
|
|
|
8
8
|
export const PathsConfigSchema = z.object({
|
|
9
|
-
tempDir: z.string().default('.tmp'),
|
|
9
|
+
tempDir: z.string().default('.tmp/arcfetch'),
|
|
10
10
|
docsDir: z.string().default('docs/ai/references'),
|
|
11
11
|
});
|
|
12
12
|
|
|
@@ -15,20 +15,13 @@ export const PlaywrightConfigSchema = z.object({
|
|
|
15
15
|
waitStrategy: z.enum(['networkidle', 'domcontentloaded', 'load']).default('networkidle'),
|
|
16
16
|
});
|
|
17
17
|
|
|
18
|
-
export const RetryConfigSchema = z.object({
|
|
19
|
-
maxAttempts: z.number().default(2),
|
|
20
|
-
backoffMs: z.number().default(1000),
|
|
21
|
-
});
|
|
22
|
-
|
|
23
18
|
export const FetchiConfigSchema = z.object({
|
|
24
19
|
quality: QualityConfigSchema.default({}),
|
|
25
20
|
paths: PathsConfigSchema.default({}),
|
|
26
21
|
playwright: PlaywrightConfigSchema.default({}),
|
|
27
|
-
retry: RetryConfigSchema.default({}),
|
|
28
22
|
});
|
|
29
23
|
|
|
30
24
|
export type FetchiConfig = z.infer<typeof FetchiConfigSchema>;
|
|
31
25
|
export type QualityConfig = z.infer<typeof QualityConfigSchema>;
|
|
32
26
|
export type PathsConfig = z.infer<typeof PathsConfigSchema>;
|
|
33
27
|
export type PlaywrightConfig = z.infer<typeof PlaywrightConfigSchema>;
|
|
34
|
-
export type RetryConfig = z.infer<typeof RetryConfigSchema>;
|
package/src/core/cache.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { existsSync, mkdirSync, readdirSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
|
|
2
2
|
import { writeFile } from 'node:fs/promises';
|
|
3
|
-
import {
|
|
3
|
+
import { basename, join } from 'node:path';
|
|
4
4
|
import type { FetchiConfig } from '../config/schema';
|
|
5
5
|
|
|
6
6
|
export interface CachedReference {
|
|
@@ -43,7 +43,7 @@ export interface DeleteResult {
|
|
|
43
43
|
*/
|
|
44
44
|
export function findByUrl(config: FetchiConfig, url: string): CachedReference | null {
|
|
45
45
|
const { references } = listCached(config);
|
|
46
|
-
return references.find(r => r.url === url) || null;
|
|
46
|
+
return references.find((r) => r.url === url) || null;
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
/**
|
|
@@ -89,9 +89,10 @@ export async function saveToTemp(
|
|
|
89
89
|
const filepath = existing && refetch ? existing.filepath : join(tempDir, filename);
|
|
90
90
|
|
|
91
91
|
const today = new Date().toISOString().split('T')[0];
|
|
92
|
+
const sanitizedUrl = url.replace(/[\r\n]/g, '');
|
|
92
93
|
let fileContent = `---\n`;
|
|
93
94
|
fileContent += `title: "${title.replace(/"/g, '\\"')}"\n`;
|
|
94
|
-
fileContent += `source_url: ${
|
|
95
|
+
fileContent += `source_url: ${sanitizedUrl}\n`;
|
|
95
96
|
fileContent += `fetched_date: ${today}\n`;
|
|
96
97
|
fileContent += `type: web\n`;
|
|
97
98
|
fileContent += `status: temporary\n`;
|
|
@@ -121,7 +122,7 @@ export function listCached(config: FetchiConfig): ListResult {
|
|
|
121
122
|
return { references: [] };
|
|
122
123
|
}
|
|
123
124
|
|
|
124
|
-
const files = readdirSync(tempDir).filter(f => f.endsWith('.md'));
|
|
125
|
+
const files = readdirSync(tempDir).filter((f) => f.endsWith('.md'));
|
|
125
126
|
const references: CachedReference[] = [];
|
|
126
127
|
|
|
127
128
|
for (const file of files) {
|
|
@@ -135,7 +136,12 @@ export function listCached(config: FetchiConfig): ListResult {
|
|
|
135
136
|
|
|
136
137
|
const getValue = (key: string): string => {
|
|
137
138
|
const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, 'm'));
|
|
138
|
-
return match
|
|
139
|
+
return match
|
|
140
|
+
? match[1]
|
|
141
|
+
.trim()
|
|
142
|
+
.replace(/^["']|["']$/g, '')
|
|
143
|
+
.trim()
|
|
144
|
+
: '';
|
|
139
145
|
};
|
|
140
146
|
|
|
141
147
|
// Use filename (without .md) as refId
|
|
@@ -168,7 +174,7 @@ export function listCached(config: FetchiConfig): ListResult {
|
|
|
168
174
|
*/
|
|
169
175
|
export function findCached(config: FetchiConfig, refId: string): CachedReference | null {
|
|
170
176
|
const { references } = listCached(config);
|
|
171
|
-
return references.find(r => r.refId === refId) || null;
|
|
177
|
+
return references.find((r) => r.refId === refId) || null;
|
|
172
178
|
}
|
|
173
179
|
|
|
174
180
|
/**
|
|
@@ -249,13 +255,6 @@ export function deleteCached(config: FetchiConfig, refId: string): DeleteResult
|
|
|
249
255
|
}
|
|
250
256
|
}
|
|
251
257
|
|
|
252
|
-
/**
|
|
253
|
-
* Get cache root (for backwards compatibility)
|
|
254
|
-
*/
|
|
255
|
-
export function findCacheRoot(): string {
|
|
256
|
-
return process.cwd();
|
|
257
|
-
}
|
|
258
|
-
|
|
259
258
|
// ============================================================================
|
|
260
259
|
// LINK EXTRACTION
|
|
261
260
|
// ============================================================================
|
package/src/core/extractor.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import TurndownService from 'turndown';
|
|
2
|
-
import { gfm } from 'turndown-plugin-gfm';
|
|
3
1
|
import { Readability } from '@mozilla/readability';
|
|
4
2
|
import { parseHTML } from 'linkedom';
|
|
3
|
+
import TurndownService from 'turndown';
|
|
4
|
+
import { gfm } from 'turndown-plugin-gfm';
|
|
5
5
|
import { cleanMarkdownComplete } from '../utils/markdown-cleaner';
|
|
6
6
|
|
|
7
7
|
export interface ExtractionResult {
|
|
@@ -29,11 +29,7 @@ turndown.addRule('removeComments', {
|
|
|
29
29
|
replacement: () => '',
|
|
30
30
|
});
|
|
31
31
|
|
|
32
|
-
export async function processHtmlToMarkdown(
|
|
33
|
-
html: string,
|
|
34
|
-
url: string,
|
|
35
|
-
verbose = false
|
|
36
|
-
): Promise<ExtractionResult> {
|
|
32
|
+
export async function processHtmlToMarkdown(html: string, url: string, verbose = false): Promise<ExtractionResult> {
|
|
37
33
|
try {
|
|
38
34
|
if (verbose) {
|
|
39
35
|
console.error(`📝 Processing HTML (${html.length} chars)`);
|
package/src/core/fetch-links.ts
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import type { FetchiConfig } from '../config/schema';
|
|
2
|
+
import { extractLinksFromCached, saveToTemp } from './cache';
|
|
3
|
+
import { closeBrowser, fetchUrl } from './pipeline';
|
|
4
|
+
|
|
5
|
+
export interface FetchLinkResult {
|
|
6
|
+
url: string;
|
|
7
|
+
status: 'new' | 'cached' | 'failed';
|
|
8
|
+
refId?: string;
|
|
9
|
+
error?: string;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
export interface FetchLinksFromRefResult {
|
|
13
|
+
results: FetchLinkResult[];
|
|
14
|
+
summary: { new: number; cached: number; failed: number };
|
|
15
|
+
error?: string;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export async function fetchLinksFromRef(
|
|
19
|
+
config: FetchiConfig,
|
|
20
|
+
refId: string,
|
|
21
|
+
options?: { refetch?: boolean; verbose?: boolean; onProgress?: (result: FetchLinkResult) => void }
|
|
22
|
+
): Promise<FetchLinksFromRefResult> {
|
|
23
|
+
const linksResult = extractLinksFromCached(config, refId);
|
|
24
|
+
|
|
25
|
+
if (linksResult.error) {
|
|
26
|
+
return { results: [], summary: { new: 0, cached: 0, failed: 0 }, error: linksResult.error };
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
if (linksResult.count === 0) {
|
|
30
|
+
return { results: [], summary: { new: 0, cached: 0, failed: 0 } };
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const results: FetchLinkResult[] = [];
|
|
34
|
+
const concurrency = 5;
|
|
35
|
+
const urls = linksResult.links.map((l) => l.href);
|
|
36
|
+
const verbose = options?.verbose ?? false;
|
|
37
|
+
const refetch = options?.refetch ?? false;
|
|
38
|
+
|
|
39
|
+
for (let i = 0; i < urls.length; i += concurrency) {
|
|
40
|
+
const batch = urls.slice(i, i + concurrency);
|
|
41
|
+
const batchPromises = batch.map(async (url): Promise<FetchLinkResult> => {
|
|
42
|
+
try {
|
|
43
|
+
const fetchResult = await fetchUrl(url, config, verbose);
|
|
44
|
+
|
|
45
|
+
if (!fetchResult.success) {
|
|
46
|
+
return { url, status: 'failed', error: fetchResult.error };
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
const saveResult = await saveToTemp(config, fetchResult.title!, url, fetchResult.markdown!, undefined, refetch);
|
|
50
|
+
|
|
51
|
+
if (saveResult.error) {
|
|
52
|
+
return { url, status: 'failed', error: saveResult.error };
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
if (saveResult.alreadyExists) {
|
|
56
|
+
return { url, status: 'cached', refId: saveResult.refId };
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
return { url, status: 'new', refId: saveResult.refId };
|
|
60
|
+
} catch (error) {
|
|
61
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
62
|
+
return { url, status: 'failed', error: message };
|
|
63
|
+
}
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
const batchResults = await Promise.all(batchPromises);
|
|
67
|
+
results.push(...batchResults);
|
|
68
|
+
|
|
69
|
+
if (options?.onProgress) {
|
|
70
|
+
for (const r of batchResults) {
|
|
71
|
+
options.onProgress(r);
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
await closeBrowser();
|
|
77
|
+
|
|
78
|
+
const summary = {
|
|
79
|
+
new: results.filter((r) => r.status === 'new').length,
|
|
80
|
+
cached: results.filter((r) => r.status === 'cached').length,
|
|
81
|
+
failed: results.filter((r) => r.status === 'failed').length,
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
return { results, summary };
|
|
85
|
+
}
|
package/src/core/index.ts
CHANGED
package/src/core/pipeline.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import type { FetchiConfig } from '../config/schema';
|
|
2
|
-
import {
|
|
3
|
-
import { fetchWithBrowser, closeBrowser } from './playwright/manager';
|
|
2
|
+
import { type ValidationResult, validateMarkdown } from '../utils/markdown-validator';
|
|
4
3
|
import { processHtmlToMarkdown } from './extractor';
|
|
4
|
+
import { closeBrowser, fetchWithBrowser } from './playwright/manager';
|
|
5
5
|
|
|
6
6
|
export interface FetchResult {
|
|
7
7
|
success: boolean;
|
|
@@ -31,8 +31,9 @@ async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchRes
|
|
|
31
31
|
const response = await fetch(url, {
|
|
32
32
|
redirect: 'follow',
|
|
33
33
|
headers: {
|
|
34
|
-
'User-Agent':
|
|
35
|
-
|
|
34
|
+
'User-Agent':
|
|
35
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
36
|
+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
36
37
|
'Accept-Language': 'en-US,en;q=0.5',
|
|
37
38
|
},
|
|
38
39
|
});
|
|
@@ -54,12 +55,7 @@ async function simpleFetch(url: string, verbose = false): Promise<SimpleFetchRes
|
|
|
54
55
|
}
|
|
55
56
|
}
|
|
56
57
|
|
|
57
|
-
async function tryPlaywright(
|
|
58
|
-
url: string,
|
|
59
|
-
config: FetchiConfig,
|
|
60
|
-
reason: string,
|
|
61
|
-
verbose = false
|
|
62
|
-
): Promise<FetchResult> {
|
|
58
|
+
async function tryPlaywright(url: string, config: FetchiConfig, reason: string, verbose = false): Promise<FetchResult> {
|
|
63
59
|
if (verbose) {
|
|
64
60
|
console.error(`🎭 Trying Playwright (reason: ${reason})`);
|
|
65
61
|
}
|
|
@@ -82,7 +78,7 @@ async function tryPlaywright(
|
|
|
82
78
|
};
|
|
83
79
|
}
|
|
84
80
|
|
|
85
|
-
const quality = validateMarkdown(extracted.markdown
|
|
81
|
+
const quality = validateMarkdown(extracted.markdown!, { sourceHtmlLength: browserResult.html.length });
|
|
86
82
|
|
|
87
83
|
if (quality.score < config.quality.minScore) {
|
|
88
84
|
return {
|
|
@@ -114,9 +110,24 @@ export async function fetchUrl(
|
|
|
114
110
|
verbose = false,
|
|
115
111
|
forcePlaywright = false
|
|
116
112
|
): Promise<FetchResult> {
|
|
113
|
+
try {
|
|
114
|
+
const parsed = new URL(url);
|
|
115
|
+
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
|
|
116
|
+
return {
|
|
117
|
+
success: false,
|
|
118
|
+
error: `Invalid URL protocol: ${parsed.protocol} — only http and https are supported`,
|
|
119
|
+
};
|
|
120
|
+
}
|
|
121
|
+
} catch {
|
|
122
|
+
return {
|
|
123
|
+
success: false,
|
|
124
|
+
error: `Invalid URL: ${url}`,
|
|
125
|
+
};
|
|
126
|
+
}
|
|
127
|
+
|
|
117
128
|
if (forcePlaywright) {
|
|
118
129
|
if (verbose) {
|
|
119
|
-
console.error(
|
|
130
|
+
console.error('⚡ Force Playwright mode enabled');
|
|
120
131
|
}
|
|
121
132
|
return tryPlaywright(url, config, 'forced', verbose);
|
|
122
133
|
}
|
|
@@ -139,10 +150,15 @@ export async function fetchUrl(
|
|
|
139
150
|
return tryPlaywright(url, config, 'extraction_failed', verbose);
|
|
140
151
|
}
|
|
141
152
|
|
|
142
|
-
const quality = validateMarkdown(extracted.markdown
|
|
153
|
+
const quality = validateMarkdown(extracted.markdown!, { sourceHtmlLength: simpleResult.html.length });
|
|
143
154
|
|
|
144
155
|
if (verbose) {
|
|
145
156
|
console.error(`📊 Quality score: ${quality.score}/100`);
|
|
157
|
+
if (quality.issues.length > 0) {
|
|
158
|
+
for (const issue of quality.issues) {
|
|
159
|
+
console.error(` ⚠ ${issue}`);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
146
162
|
}
|
|
147
163
|
|
|
148
164
|
if (quality.score >= config.quality.jsRetryThreshold) {
|
package/src/core/playwright/local.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { chromium } from 'playwright-extra';
|
|
2
1
|
import type { Browser } from 'playwright';
|
|
2
|
+
import { chromium } from 'playwright-extra';
|
|
3
3
|
import stealth from 'puppeteer-extra-plugin-stealth';
|
|
4
4
|
import type { PlaywrightConfig } from '../../config/schema';
|
|
5
5
|
import type { BrowserManager } from './types';
|
|
@@ -10,28 +10,37 @@ let browserInstance: Browser | null = null;
|
|
|
10
10
|
|
|
11
11
|
export class LocalBrowserManager implements BrowserManager {
|
|
12
12
|
private config: PlaywrightConfig;
|
|
13
|
-
|
|
13
|
+
|
|
14
14
|
constructor(config: PlaywrightConfig) {
|
|
15
15
|
this.config = config;
|
|
16
16
|
}
|
|
17
|
-
|
|
17
|
+
|
|
18
18
|
async getBrowser(): Promise<Browser> {
|
|
19
19
|
if (!browserInstance) {
|
|
20
|
-
browserInstance = await chromium.launch({
|
|
20
|
+
browserInstance = await chromium.launch({
|
|
21
21
|
headless: true,
|
|
22
22
|
timeout: this.config.timeout,
|
|
23
|
+
args: [
|
|
24
|
+
'--disable-blink-features=AutomationControlled',
|
|
25
|
+
'--disable-features=IsolateOrigins,site-per-process',
|
|
26
|
+
'--disable-infobars',
|
|
27
|
+
'--no-first-run',
|
|
28
|
+
'--no-default-browser-check',
|
|
29
|
+
'--disable-background-networking',
|
|
30
|
+
'--disable-dev-shm-usage',
|
|
31
|
+
],
|
|
23
32
|
});
|
|
24
33
|
}
|
|
25
34
|
return browserInstance;
|
|
26
35
|
}
|
|
27
|
-
|
|
36
|
+
|
|
28
37
|
async closeBrowser(): Promise<void> {
|
|
29
38
|
if (browserInstance) {
|
|
30
39
|
await browserInstance.close();
|
|
31
40
|
browserInstance = null;
|
|
32
41
|
}
|
|
33
42
|
}
|
|
34
|
-
|
|
43
|
+
|
|
35
44
|
isDocker(): boolean {
|
|
36
45
|
return false;
|
|
37
46
|
}
|
package/src/core/playwright/manager.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { PlaywrightConfig } from '../../config/schema';
|
|
2
|
-
import type { BrowserManager, FetchWithBrowserResult } from './types';
|
|
3
2
|
import { LocalBrowserManager } from './local';
|
|
3
|
+
import type { BrowserManager, FetchWithBrowserResult } from './types';
|
|
4
4
|
|
|
5
5
|
let currentManager: BrowserManager | null = null;
|
|
6
6
|
|
|
@@ -14,6 +14,23 @@ export async function getBrowserManager(config: PlaywrightConfig): Promise<Brows
|
|
|
14
14
|
return currentManager;
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
+
/** Common desktop viewport sizes to rotate through for fingerprint diversity */
|
|
18
|
+
const VIEWPORTS = [
|
|
19
|
+
{ width: 1920, height: 1080 },
|
|
20
|
+
{ width: 1536, height: 864 },
|
|
21
|
+
{ width: 1440, height: 900 },
|
|
22
|
+
{ width: 1366, height: 768 },
|
|
23
|
+
{ width: 1280, height: 720 },
|
|
24
|
+
];
|
|
25
|
+
|
|
26
|
+
const TIMEZONES = ['America/New_York', 'America/Chicago', 'America/Denver', 'America/Los_Angeles'];
|
|
27
|
+
|
|
28
|
+
const LOCALES = ['en-US', 'en-US', 'en-US', 'en-GB'];
|
|
29
|
+
|
|
30
|
+
function pick<T>(arr: T[]): T {
|
|
31
|
+
return arr[Math.floor(Math.random() * arr.length)];
|
|
32
|
+
}
|
|
33
|
+
|
|
17
34
|
export async function fetchWithBrowser(
|
|
18
35
|
url: string,
|
|
19
36
|
config: PlaywrightConfig,
|
|
@@ -21,35 +38,90 @@ export async function fetchWithBrowser(
|
|
|
21
38
|
): Promise<FetchWithBrowserResult> {
|
|
22
39
|
const manager = await getBrowserManager(config);
|
|
23
40
|
const browser = await manager.getBrowser();
|
|
24
|
-
|
|
41
|
+
|
|
42
|
+
const viewport = pick(VIEWPORTS);
|
|
43
|
+
const locale = pick(LOCALES);
|
|
44
|
+
const timezone = pick(TIMEZONES);
|
|
45
|
+
|
|
46
|
+
const context = await browser.newContext({
|
|
47
|
+
viewport,
|
|
48
|
+
locale,
|
|
49
|
+
timezoneId: timezone,
|
|
50
|
+
userAgent:
|
|
51
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
|
|
52
|
+
// Realistic browser headers
|
|
53
|
+
extraHTTPHeaders: {
|
|
54
|
+
'Accept-Language': `${locale},en;q=0.9`,
|
|
55
|
+
'Accept-Encoding': 'gzip, deflate, br',
|
|
56
|
+
'Sec-CH-UA': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
|
|
57
|
+
'Sec-CH-UA-Mobile': '?0',
|
|
58
|
+
'Sec-CH-UA-Platform': '"macOS"',
|
|
59
|
+
'Sec-Fetch-Dest': 'document',
|
|
60
|
+
'Sec-Fetch-Mode': 'navigate',
|
|
61
|
+
'Sec-Fetch-Site': 'none',
|
|
62
|
+
'Sec-Fetch-User': '?1',
|
|
63
|
+
'Upgrade-Insecure-Requests': '1',
|
|
64
|
+
},
|
|
65
|
+
// Pretend we have granted permissions a real user would have
|
|
66
|
+
permissions: ['geolocation'],
|
|
67
|
+
deviceScaleFactor: 2,
|
|
68
|
+
isMobile: false,
|
|
69
|
+
hasTouch: false,
|
|
70
|
+
javaScriptEnabled: true,
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
const page = await context.newPage();
|
|
74
|
+
|
|
25
75
|
try {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
76
|
+
// Override navigator properties that leak headless signals
|
|
77
|
+
await page.addInitScript(`
|
|
78
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => false });
|
|
79
|
+
Object.defineProperty(navigator, 'plugins', {
|
|
80
|
+
get: () => [
|
|
81
|
+
{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer' },
|
|
82
|
+
{ name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai' },
|
|
83
|
+
{ name: 'Native Client', filename: 'internal-nacl-plugin' },
|
|
84
|
+
],
|
|
85
|
+
});
|
|
86
|
+
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
|
|
87
|
+
Object.defineProperty(navigator, 'maxTouchPoints', { get: () => 0 });
|
|
88
|
+
if (typeof Notification !== 'undefined') {
|
|
89
|
+
Object.defineProperty(Notification, 'permission', { get: () => 'default' });
|
|
90
|
+
}
|
|
91
|
+
window.chrome = window.chrome || {};
|
|
92
|
+
window.chrome.runtime = window.chrome.runtime || {};
|
|
93
|
+
`);
|
|
94
|
+
|
|
31
95
|
if (verbose) {
|
|
32
|
-
console.error(
|
|
96
|
+
console.error(
|
|
97
|
+
`🎭 Playwright: Navigating to ${url} (${viewport.width}x${viewport.height}, ${locale}, ${timezone})`
|
|
98
|
+
);
|
|
33
99
|
}
|
|
34
|
-
|
|
100
|
+
|
|
101
|
+
// Small random delay to avoid machine-like timing patterns
|
|
102
|
+
await page.waitForTimeout(200 + Math.floor(Math.random() * 300));
|
|
103
|
+
|
|
35
104
|
await page.goto(url, {
|
|
36
105
|
waitUntil: config.waitStrategy,
|
|
37
106
|
timeout: config.timeout,
|
|
38
107
|
});
|
|
39
|
-
|
|
108
|
+
|
|
109
|
+
// Wait a bit after load for lazy-loaded content / hydration
|
|
110
|
+
await page.waitForTimeout(500 + Math.floor(Math.random() * 500));
|
|
111
|
+
|
|
40
112
|
const html = await page.content();
|
|
41
|
-
|
|
113
|
+
|
|
42
114
|
if (verbose) {
|
|
43
115
|
console.error(`🎭 Playwright: Got ${html.length} chars of HTML`);
|
|
44
116
|
}
|
|
45
|
-
|
|
46
|
-
await page.close();
|
|
47
|
-
await context.close();
|
|
48
117
|
|
|
49
118
|
return { html };
|
|
50
119
|
} catch (error) {
|
|
51
120
|
const message = error instanceof Error ? error.message : String(error);
|
|
52
121
|
return { html: '', error: message };
|
|
122
|
+
} finally {
|
|
123
|
+
await page.close();
|
|
124
|
+
await context.close();
|
|
53
125
|
}
|
|
54
126
|
}
|
|
55
127
|
|
package/src/utils/markdown-cleaner.ts
CHANGED
|
@@ -9,44 +9,44 @@ export function cleanMarkdown(markdown: string): string {
|
|
|
9
9
|
return markdown;
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
-
let cleaned = markdown
|
|
12
|
+
let cleaned = markdown;
|
|
13
13
|
|
|
14
|
-
cleaned = cleaned.replace(/\r\n/g, '\n')
|
|
15
|
-
cleaned = cleaned.replace(/\n{3,}/g, '\n\n')
|
|
16
|
-
cleaned = cleaned.replace(/[ \t]+$/gm, '')
|
|
17
|
-
cleaned = cleaned.trim()
|
|
18
|
-
cleaned = cleaned.replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2')
|
|
19
|
-
cleaned = cleaned.replace(/(#{1,6} .+)\n([^#\n])/g, '$1\n\n$2')
|
|
20
|
-
cleaned = cleaned.replace(/([^\n])\n([-*+] |\d+\. )/g, '$1\n\n$2')
|
|
21
|
-
cleaned = cleaned.replace(/(\*|_) +/g, '$1')
|
|
22
|
-
cleaned = cleaned.replace(/ +(\*|_)/g, '$1')
|
|
23
|
-
cleaned = cleaned.replace(/([^\n])\n```/g, '$1\n\n```')
|
|
24
|
-
cleaned = cleaned.replace(/```\n([^`])/g, '```\n\n$1')
|
|
25
|
-
cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, ' ')
|
|
26
|
-
cleaned = cleaned.replace(/ {2,}/g, ' ')
|
|
14
|
+
cleaned = cleaned.replace(/\r\n/g, '\n');
|
|
15
|
+
cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
|
|
16
|
+
cleaned = cleaned.replace(/[ \t]+$/gm, '');
|
|
17
|
+
cleaned = cleaned.trim();
|
|
18
|
+
cleaned = cleaned.replace(/([^\n])\n(#{1,6} )/g, '$1\n\n$2');
|
|
19
|
+
cleaned = cleaned.replace(/(#{1,6} .+)\n([^#\n])/g, '$1\n\n$2');
|
|
20
|
+
cleaned = cleaned.replace(/([^\n])\n([-*+] |\d+\. )/g, '$1\n\n$2');
|
|
21
|
+
cleaned = cleaned.replace(/(\*|_) +/g, '$1');
|
|
22
|
+
cleaned = cleaned.replace(/ +(\*|_)/g, '$1');
|
|
23
|
+
cleaned = cleaned.replace(/([^\n])\n```/g, '$1\n\n```');
|
|
24
|
+
cleaned = cleaned.replace(/```\n([^`])/g, '```\n\n$1');
|
|
25
|
+
cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, ' ');
|
|
26
|
+
cleaned = cleaned.replace(/ {2,}/g, ' ');
|
|
27
27
|
|
|
28
|
-
return cleaned
|
|
28
|
+
return cleaned;
|
|
29
29
|
}
|
|
30
30
|
|
|
31
31
|
export function advancedClean(markdown: string): string {
|
|
32
|
-
let cleaned = markdown
|
|
32
|
+
let cleaned = markdown;
|
|
33
33
|
|
|
34
|
-
cleaned = cleaned.replace(/\[([^\]]+)\]\(\)/g, '$1')
|
|
35
|
-
cleaned = cleaned.replace(/<[^>]+>/g, '')
|
|
36
|
-
cleaned = cleaned.replace(/\*\*\*\*/g, '')
|
|
37
|
-
cleaned = cleaned.replace(/(?<!\*)\*\*(?!\*)/g, '')
|
|
38
|
-
cleaned = cleaned.replace(/__/g, '')
|
|
39
|
-
cleaned = cleaned.replace(/!\[\]\(([^)]+)\)/g, '')
|
|
40
|
-
cleaned = cleaned.replace(/[\u200B-\u200D\uFEFF]/g, '')
|
|
41
|
-
cleaned = cleaned.replace(/[\u201C\u201D]/g, '"')
|
|
42
|
-
cleaned = cleaned.replace(/[\u2018\u2019]/g, "'")
|
|
43
|
-
cleaned = cleaned.replace(/[\u2013\u2014]/g, '-')
|
|
34
|
+
cleaned = cleaned.replace(/\[([^\]]+)\]\(\)/g, '$1');
|
|
35
|
+
cleaned = cleaned.replace(/<[^>]+>/g, '');
|
|
36
|
+
cleaned = cleaned.replace(/\*\*\*\*/g, '');
|
|
37
|
+
cleaned = cleaned.replace(/(?<!\*)\*\*(?!\*)/g, '');
|
|
38
|
+
cleaned = cleaned.replace(/__/g, '');
|
|
39
|
+
cleaned = cleaned.replace(/!\[\]\(([^)]+)\)/g, '');
|
|
40
|
+
cleaned = cleaned.replace(/[\u200B-\u200D\uFEFF]/g, '');
|
|
41
|
+
cleaned = cleaned.replace(/[\u201C\u201D]/g, '"');
|
|
42
|
+
cleaned = cleaned.replace(/[\u2018\u2019]/g, "'");
|
|
43
|
+
cleaned = cleaned.replace(/[\u2013\u2014]/g, '-');
|
|
44
44
|
|
|
45
45
|
cleaned = cleaned.replace(/^(?!```)[^\n]*$/gm, (line) => {
|
|
46
|
-
return line.replace(/ {2,}/g, ' ')
|
|
47
|
-
})
|
|
46
|
+
return line.replace(/ {2,}/g, ' ');
|
|
47
|
+
});
|
|
48
48
|
|
|
49
|
-
return cleaned
|
|
49
|
+
return cleaned;
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
export function finalCleanup(markdown: string): string {
|
|
@@ -54,16 +54,16 @@ export function finalCleanup(markdown: string): string {
|
|
|
54
54
|
return markdown;
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
let cleaned = markdown
|
|
57
|
+
let cleaned = markdown;
|
|
58
58
|
|
|
59
|
-
cleaned = cleaned.replace(/^(\s*)[*+] /gm, '$1- ')
|
|
60
|
-
cleaned = cleaned.replace(/_([^_]+)_/g, '*$1*')
|
|
61
|
-
cleaned = cleaned.replace(/^~~~(\w*)\n/gm, '```$1\n')
|
|
62
|
-
cleaned = cleaned.replace(/^~~~$/gm, '```')
|
|
63
|
-
cleaned = cleaned.replace(/\n{3,}/g, '\n\n')
|
|
64
|
-
cleaned = `${cleaned.trim()}\n
|
|
59
|
+
cleaned = cleaned.replace(/^(\s*)[*+] /gm, '$1- ');
|
|
60
|
+
cleaned = cleaned.replace(/_([^_]+)_/g, '*$1*');
|
|
61
|
+
cleaned = cleaned.replace(/^~~~(\w*)\n/gm, '```$1\n');
|
|
62
|
+
cleaned = cleaned.replace(/^~~~$/gm, '```');
|
|
63
|
+
cleaned = cleaned.replace(/\n{3,}/g, '\n\n');
|
|
64
|
+
cleaned = `${cleaned.trim()}\n`;
|
|
65
65
|
|
|
66
|
-
return cleaned
|
|
66
|
+
return cleaned;
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
export function cleanMarkdownComplete(markdown: string): string {
|
|
@@ -71,9 +71,9 @@ export function cleanMarkdownComplete(markdown: string): string {
|
|
|
71
71
|
return markdown;
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
-
let cleaned = cleanMarkdown(markdown)
|
|
75
|
-
cleaned = advancedClean(cleaned)
|
|
76
|
-
cleaned = finalCleanup(cleaned)
|
|
74
|
+
let cleaned = cleanMarkdown(markdown);
|
|
75
|
+
cleaned = advancedClean(cleaned);
|
|
76
|
+
cleaned = finalCleanup(cleaned);
|
|
77
77
|
|
|
78
|
-
return cleaned
|
|
78
|
+
return cleaned;
|
|
79
79
|
}
|