md-fetch 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +212 -0
- package/LICENSE +21 -0
- package/README.md +449 -0
- package/README.zh-CN.md +449 -0
- package/dist/cli.d.ts +27 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +158 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +9 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +15 -0
- package/dist/constants.js.map +1 -0
- package/dist/core/browser.d.ts +23 -0
- package/dist/core/browser.d.ts.map +1 -0
- package/dist/core/browser.js +125 -0
- package/dist/core/browser.js.map +1 -0
- package/dist/core/converter.d.ts +18 -0
- package/dist/core/converter.d.ts.map +1 -0
- package/dist/core/converter.js +74 -0
- package/dist/core/converter.js.map +1 -0
- package/dist/core/extractor.d.ts +28 -0
- package/dist/core/extractor.d.ts.map +1 -0
- package/dist/core/extractor.js +151 -0
- package/dist/core/extractor.js.map +1 -0
- package/dist/core/fetcher.d.ts +24 -0
- package/dist/core/fetcher.d.ts.map +1 -0
- package/dist/core/fetcher.js +111 -0
- package/dist/core/fetcher.js.map +1 -0
- package/dist/core/processor.d.ts +22 -0
- package/dist/core/processor.d.ts.map +1 -0
- package/dist/core/processor.js +104 -0
- package/dist/core/processor.js.map +1 -0
- package/dist/core/screenshotter.d.ts +31 -0
- package/dist/core/screenshotter.d.ts.map +1 -0
- package/dist/core/screenshotter.js +222 -0
- package/dist/core/screenshotter.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +14 -0
- package/dist/index.js.map +1 -0
- package/dist/screen-cli.d.ts +26 -0
- package/dist/screen-cli.d.ts.map +1 -0
- package/dist/screen-cli.js +196 -0
- package/dist/screen-cli.js.map +1 -0
- package/dist/screen.d.ts +3 -0
- package/dist/screen.d.ts.map +1 -0
- package/dist/screen.js +14 -0
- package/dist/screen.js.map +1 -0
- package/dist/types/index.d.ts +151 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +42 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/filename-sanitizer.d.ts +38 -0
- package/dist/utils/filename-sanitizer.d.ts.map +1 -0
- package/dist/utils/filename-sanitizer.js +79 -0
- package/dist/utils/filename-sanitizer.js.map +1 -0
- package/dist/utils/frontmatter.d.ts +6 -0
- package/dist/utils/frontmatter.d.ts.map +1 -0
- package/dist/utils/frontmatter.js +65 -0
- package/dist/utils/frontmatter.js.map +1 -0
- package/package.json +56 -0
- package/skills/md-fetch/SKILL.md +133 -0
- package/skills/md-fetch/references/cli-reference.md +257 -0
- package/src/cli.ts +169 -0
- package/src/constants.ts +17 -0
- package/src/core/browser.ts +161 -0
- package/src/core/converter.ts +82 -0
- package/src/core/extractor.ts +172 -0
- package/src/core/fetcher.ts +143 -0
- package/src/core/processor.ts +124 -0
- package/src/core/screenshotter.ts +289 -0
- package/src/index.ts +15 -0
- package/src/screen-cli.ts +216 -0
- package/src/screen.ts +15 -0
- package/src/types/index.ts +227 -0
- package/src/utils/filename-sanitizer.ts +88 -0
- package/src/utils/frontmatter.ts +81 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { BrowserOptions } from '../types/index.js';
|
|
2
|
+
export declare class BrowserFetcher {
|
|
3
|
+
private browser;
|
|
4
|
+
private options;
|
|
5
|
+
constructor(options?: BrowserOptions);
|
|
6
|
+
/**
|
|
7
|
+
* Launch the browser instance; returns immediately when one is already running
|
|
8
|
+
*/
|
|
9
|
+
launch(): Promise<void>;
|
|
10
|
+
/**
|
|
11
|
+
* Open the URL in a new page (launching the browser first if needed) and resolve with the page's HTML
|
|
12
|
+
*/
|
|
13
|
+
fetchPage(url: string): Promise<string>;
|
|
14
|
+
/**
|
|
15
|
+
* Close the browser instance, if one is running
|
|
16
|
+
*/
|
|
17
|
+
close(): Promise<void>;
|
|
18
|
+
/**
|
|
19
|
+
* Find a Chrome executable: the first existing well-known install path, else CHROME_PATH / CHROMIUM_PATH
|
|
20
|
+
*/
|
|
21
|
+
private findChromePath;
|
|
22
|
+
}
|
|
23
|
+
//# sourceMappingURL=browser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../../src/core/browser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAKxD,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAAiB;gBAEpB,OAAO,GAAE,cAAmB;IAIxC;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;IAiD7B;;OAEG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IA8C7C;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAO5B;;OAEG;IACH,OAAO,CAAC,cAAc;CAgCvB"}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import puppeteer from 'puppeteer-core';
|
|
2
|
+
import { BrowserError } from '../types/index.js';
|
|
3
|
+
import { DEFAULT_TIMEOUT, DEFAULT_WAIT_UNTIL } from '../constants.js';
|
|
4
|
+
import { existsSync } from 'fs';
|
|
5
|
+
export class BrowserFetcher {
|
|
6
|
+
browser = null;
|
|
7
|
+
options;
|
|
8
|
+
constructor(options = {}) {
|
|
9
|
+
this.options = options;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Launch browser instance
|
|
13
|
+
*/
|
|
14
|
+
async launch() {
|
|
15
|
+
if (this.browser) {
|
|
16
|
+
return; // Already launched
|
|
17
|
+
}
|
|
18
|
+
const { executablePath, headless = true, timeout = DEFAULT_TIMEOUT, proxy } = this.options;
|
|
19
|
+
// 查找 Chrome 可执行文件
|
|
20
|
+
const chromePath = executablePath || this.findChromePath();
|
|
21
|
+
if (!chromePath) {
|
|
22
|
+
throw new BrowserError('', 'Chrome/Chromium not found. Please install Chrome or specify the path with --browser-path');
|
|
23
|
+
}
|
|
24
|
+
const launchOptions = {
|
|
25
|
+
executablePath: chromePath,
|
|
26
|
+
headless,
|
|
27
|
+
timeout,
|
|
28
|
+
args: [
|
|
29
|
+
'--no-sandbox',
|
|
30
|
+
'--disable-setuid-sandbox',
|
|
31
|
+
'--disable-dev-shm-usage',
|
|
32
|
+
'--disable-gpu'
|
|
33
|
+
]
|
|
34
|
+
};
|
|
35
|
+
// 添加代理参数
|
|
36
|
+
if (proxy) {
|
|
37
|
+
launchOptions.args.push(`--proxy-server=${proxy}`);
|
|
38
|
+
}
|
|
39
|
+
try {
|
|
40
|
+
this.browser = await puppeteer.launch(launchOptions);
|
|
41
|
+
}
|
|
42
|
+
catch (error) {
|
|
43
|
+
throw new BrowserError('', `Failed to launch browser: ${error.message}`);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Fetch page content using browser
|
|
48
|
+
*/
|
|
49
|
+
async fetchPage(url) {
|
|
50
|
+
if (!this.browser) {
|
|
51
|
+
await this.launch();
|
|
52
|
+
}
|
|
53
|
+
const { waitUntil = DEFAULT_WAIT_UNTIL, timeout = DEFAULT_TIMEOUT, userAgent } = this.options;
|
|
54
|
+
let page = null;
|
|
55
|
+
try {
|
|
56
|
+
page = await this.browser.newPage();
|
|
57
|
+
// 设置 User-Agent
|
|
58
|
+
if (userAgent) {
|
|
59
|
+
await page.setUserAgent(userAgent);
|
|
60
|
+
}
|
|
61
|
+
// 设置超时
|
|
62
|
+
page.setDefaultTimeout(timeout);
|
|
63
|
+
// 导航到页面
|
|
64
|
+
await page.goto(url, {
|
|
65
|
+
waitUntil,
|
|
66
|
+
timeout
|
|
67
|
+
});
|
|
68
|
+
// 获取 HTML 内容
|
|
69
|
+
const html = await page.content();
|
|
70
|
+
return html;
|
|
71
|
+
}
|
|
72
|
+
catch (error) {
|
|
73
|
+
throw new BrowserError(url, `Failed to fetch page: ${error.message}`);
|
|
74
|
+
}
|
|
75
|
+
finally {
|
|
76
|
+
if (page) {
|
|
77
|
+
await page.close();
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Close browser instance
|
|
83
|
+
*/
|
|
84
|
+
async close() {
|
|
85
|
+
if (this.browser) {
|
|
86
|
+
await this.browser.close();
|
|
87
|
+
this.browser = null;
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Find Chrome executable path
|
|
92
|
+
*/
|
|
93
|
+
findChromePath() {
|
|
94
|
+
// 常见的 Chrome 安装路径
|
|
95
|
+
const possiblePaths = [
|
|
96
|
+
// Windows
|
|
97
|
+
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
|
98
|
+
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
|
99
|
+
process.env.LOCALAPPDATA + '\\Google\\Chrome\\Application\\chrome.exe',
|
|
100
|
+
// macOS
|
|
101
|
+
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
102
|
+
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
|
103
|
+
// Linux
|
|
104
|
+
'/usr/bin/google-chrome',
|
|
105
|
+
'/usr/bin/google-chrome-stable',
|
|
106
|
+
'/usr/bin/chromium',
|
|
107
|
+
'/usr/bin/chromium-browser',
|
|
108
|
+
'/snap/bin/chromium'
|
|
109
|
+
];
|
|
110
|
+
// 尝试查找可用的路径
|
|
111
|
+
for (const path of possiblePaths) {
|
|
112
|
+
try {
|
|
113
|
+
if (existsSync(path)) {
|
|
114
|
+
return path;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
catch {
|
|
118
|
+
continue;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
// 从环境变量读取
|
|
122
|
+
return process.env.CHROME_PATH || process.env.CHROMIUM_PATH;
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
//# sourceMappingURL=browser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../../src/core/browser.ts"],"names":[],"mappings":"AAAA,OAAO,SAAmE,MAAM,gBAAgB,CAAC;AAEjG,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACtE,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAEhC,MAAM,OAAO,cAAc;IACjB,OAAO,GAAmB,IAAI,CAAC;IAC/B,OAAO,CAAiB;IAEhC,YAAY,UAA0B,EAAE;QACtC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM;QACV,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,OAAO,CAAC,mBAAmB;QAC7B,CAAC;QAED,MAAM,EACJ,cAAc,EACd,QAAQ,GAAG,IAAI,EACf,OAAO,GAAG,eAAe,EACzB,KAAK,EACN,GAAG,IAAI,CAAC,OAAO,CAAC;QAEjB,kBAAkB;QAClB,MAAM,UAAU,GAAG,cAAc,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;QAE3D,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,IAAI,YAAY,CACpB,EAAE,EACF,0FAA0F,CAC3F,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAA2B;YAC5C,cAAc,EAAE,UAAU;YAC1B,QAAQ;YACR,OAAO;YACP,IAAI,EAAE;gBACJ,cAAc;gBACd,0BAA0B;gBAC1B,yBAAyB;gBACzB,eAAe;aAChB;SACF,CAAC;QAEF,SAAS;QACT,IAAI,KAAK,EAAE,CAAC;YACV,aAAa,CAAC,IAAK,CAAC,IAAI,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACtD,CAAC;QAED,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,YAAY,CACpB,EAAE,EACF,6BAA8B,KAAe,CAAC,OAAO,EAAE,CACxD,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;QACtB,CAAC;QAED,MAAM,EACJ,SAAS,GAAG,kBAAkB,EAC9B,OAAO,GAAG,eAAe,EACzB,SAAS,EACV,GAAG,IAAI,CAAC,OAAO,CAAC;QAEjB,IAAI,IAAI,GAAgB,IAAI,CAAC;QAE7B,IAAI,CAAC;YACH,IAAI,GAAG,MAAM,IAAI,CAAC,OAAQ,CAAC,OAAO,EAAE,CAAC;YAErC,gBAAgB;YAChB,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrC,CAAC;YAED,OAAO;YACP,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAEhC,QAAQ;YACR,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS;gBACT,OAAO;aACR,CAAC,CAAC;YAEH,aAAa;YACb,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAElC,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,YAAY,CACpB,GAAG,EACH,yBAA0B,KAAe,CAAC,OAAO,EAAE,CACpD,CAAC;Q
ACJ,CAAC;gBAAS,CAAC;YACT,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,cAAc;QACpB,kBAAkB;QAClB,MAAM,aAAa,GAAG;YACpB,UAAU;YACV,4DAA4D;YAC5D,kEAAkE;YAClE,OAAO,CAAC,GAAG,CAAC,YAAY,GAAG,2CAA2C;YACtE,QAAQ;YACR,8DAA8D;YAC9D,oDAAoD;YACpD,QAAQ;YACR,wBAAwB;YACxB,+BAA+B;YAC/B,mBAAmB;YACnB,2BAA2B;YAC3B,oBAAoB;SACrB,CAAC;QAEF,YAAY;QACZ,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;YACjC,IAAI,CAAC;gBACH,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;oBACrB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS;YACX,CAAC;QACH,CAAC;QAED,UAAU;QACV,OAAO,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC;IAC9D,CAAC;CACF"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { ConversionOptions } from '../types/index.js';
|
|
2
|
+
export declare class MarkdownConverter {
|
|
3
|
+
private turndownService;
|
|
4
|
+
constructor(options?: ConversionOptions);
|
|
5
|
+
/**
|
|
6
|
+
* Convert an HTML string to cleaned-up Markdown; throws an Error if the conversion fails
|
|
7
|
+
*/
|
|
8
|
+
convert(html: string): string;
|
|
9
|
+
/**
|
|
10
|
+
* Register the custom Turndown rules for images and for PRE/CODE code blocks
|
|
11
|
+
*/
|
|
12
|
+
private setupCustomRules;
|
|
13
|
+
/**
|
|
14
|
+
* Collapse runs of 3+ newlines to one blank line, strip trailing spaces/tabs, and end the text with a single newline
|
|
15
|
+
*/
|
|
16
|
+
private cleanMarkdown;
|
|
17
|
+
}
|
|
18
|
+
//# sourceMappingURL=converter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"converter.d.ts","sourceRoot":"","sources":["../../src/core/converter.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAE3D,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,eAAe,CAAkB;gBAE7B,OAAO,GAAE,iBAAsB;IAc3C;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAS7B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAkCxB;;OAEG;IACH,OAAO,CAAC,aAAa;CAStB"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import TurndownService from 'turndown';
|
|
2
|
+
export class MarkdownConverter {
|
|
3
|
+
turndownService;
|
|
4
|
+
constructor(options = {}) {
|
|
5
|
+
this.turndownService = new TurndownService({
|
|
6
|
+
headingStyle: options.headingStyle || 'atx',
|
|
7
|
+
codeBlockStyle: options.codeBlockStyle || 'fenced',
|
|
8
|
+
bulletListMarker: options.bulletListMarker || '-',
|
|
9
|
+
hr: '---',
|
|
10
|
+
emDelimiter: '*',
|
|
11
|
+
strongDelimiter: '**'
|
|
12
|
+
});
|
|
13
|
+
// 添加自定义规则
|
|
14
|
+
this.setupCustomRules();
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Convert HTML to Markdown
|
|
18
|
+
*/
|
|
19
|
+
convert(html) {
|
|
20
|
+
try {
|
|
21
|
+
const markdown = this.turndownService.turndown(html);
|
|
22
|
+
return this.cleanMarkdown(markdown);
|
|
23
|
+
}
|
|
24
|
+
catch (error) {
|
|
25
|
+
throw new Error(`Failed to convert HTML to Markdown: ${error.message}`);
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Setup custom conversion rules
|
|
30
|
+
*/
|
|
31
|
+
setupCustomRules() {
|
|
32
|
+
// 保留图片的 alt 文本和链接
|
|
33
|
+
this.turndownService.addRule('images', {
|
|
34
|
+
filter: 'img',
|
|
35
|
+
replacement: (content, node) => {
|
|
36
|
+
const alt = node.alt || '';
|
|
37
|
+
const src = node.src || '';
|
|
38
|
+
const title = node.title || '';
|
|
39
|
+
if (!src)
|
|
40
|
+
return '';
|
|
41
|
+
return title
|
|
42
|
+
? ``
|
|
43
|
+
: ``;
|
|
44
|
+
}
|
|
45
|
+
});
|
|
46
|
+
// 处理代码块,保留语言标识
|
|
47
|
+
this.turndownService.addRule('codeBlocks', {
|
|
48
|
+
filter: (node) => {
|
|
49
|
+
return node.nodeName === 'PRE' &&
|
|
50
|
+
node.firstChild?.nodeName === 'CODE';
|
|
51
|
+
},
|
|
52
|
+
replacement: (content, node) => {
|
|
53
|
+
const codeNode = node.firstChild;
|
|
54
|
+
const className = codeNode?.className || '';
|
|
55
|
+
const language = className.match(/language-(\w+)/)?.[1] || '';
|
|
56
|
+
const code = codeNode?.textContent || '';
|
|
57
|
+
return `\n\`\`\`${language}\n${code}\n\`\`\`\n`;
|
|
58
|
+
}
|
|
59
|
+
});
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Clean up the generated Markdown
|
|
63
|
+
*/
|
|
64
|
+
cleanMarkdown(markdown) {
|
|
65
|
+
return markdown
|
|
66
|
+
// 移除多余的空行(超过 2 个连续换行)
|
|
67
|
+
.replace(/\n{3,}/g, '\n\n')
|
|
68
|
+
// 移除行尾空格
|
|
69
|
+
.replace(/[ \t]+$/gm, '')
|
|
70
|
+
// 确保文件以换行符结尾
|
|
71
|
+
.trim() + '\n';
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
//# sourceMappingURL=converter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"converter.js","sourceRoot":"","sources":["../../src/core/converter.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,OAAO,iBAAiB;IACpB,eAAe,CAAkB;IAEzC,YAAY,UAA6B,EAAE;QACzC,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC;YACzC,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,KAAK;YAC3C,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;YAClD,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,GAAG;YACjD,EAAE,EAAE,KAAK;YACT,WAAW,EAAE,GAAG;YAChB,eAAe,EAAE,IAAI;SACtB,CAAC,CAAC;QAEH,UAAU;QACV,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,IAAY;QAClB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACrD,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QACtC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,uCAAwC,KAAe,CAAC,OAAO,EAAE,CAAC,CAAC;QACrF,CAAC;IACH,CAAC;IAED;;OAEG;IACK,gBAAgB;QACtB,kBAAkB;QAClB,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,QAAQ,EAAE;YACrC,MAAM,EAAE,KAAK;YACb,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,GAAG,GAAI,IAAyB,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjD,MAAM,GAAG,GAAI,IAAyB,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjD,MAAM,KAAK,GAAI,IAAyB,CAAC,KAAK,IAAI,EAAE,CAAC;gBAErD,IAAI,CAAC,GAAG;oBAAE,OAAO,EAAE,CAAC;gBAEpB,OAAO,KAAK;oBACV,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,KAAK,KAAK,IAAI;oBAChC,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;YAC1B,CAAC;SACF,CAAC,CAAC;QAEH,eAAe;QACf,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;YACzC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBACf,OAAO,IAAI,CAAC,QAAQ,KAAK,KAAK;oBACvB,IAAI,CAAC,UAAU,EAAE,QAAQ,KAAK,MAAM,CAAC;YAC9C,CAAC;YACD,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAyB,CAAC;gBAChD,MAAM,SAAS,GAAG,QAAQ,EAAE,SAAS,IAAI,EAAE,CAAC;gBAC5C,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAE9D,MAAM,IAAI,GAAG,QAAQ,EAAE,WAAW,IAAI,EAAE,CAAC;gBACzC,OAAO,WAAW,QAAQ,KAAK,IAAI,YAAY,CAAC;YAClD,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAgB;QACpC,OAAO,QAAQ;YACb,sBAAsB;aACrB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;YAC3B,SAAS;aACR,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;YACzB,aAAa;aACZ,IAAI,EAAE,GAAG,IAAI,CAA
C;IACnB,CAAC;CACF"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { ExtractOptions, ExtractedContent, PageMetadata } from '../types/index.js';
|
|
2
|
+
export declare class ContentExtractor {
|
|
3
|
+
/**
|
|
4
|
+
* Create a jsdom VirtualConsole whose 'error' events are ignored, so CSS parsing errors are not reported
|
|
5
|
+
*/
|
|
6
|
+
private createVirtualConsole;
|
|
7
|
+
/**
|
|
8
|
+
* Extract content and metadata from HTML: by CSS selector if given, else the full body when Readability is disabled, else via Readability
|
|
9
|
+
*/
|
|
10
|
+
extract(html: string, url: string, options: ExtractOptions): ExtractedContent;
|
|
11
|
+
/**
|
|
12
|
+
* Return the innerHTML of the first element matching the CSS selector; throws when nothing matches
|
|
13
|
+
*/
|
|
14
|
+
private extractBySelector;
|
|
15
|
+
/**
|
|
16
|
+
* Extract the main content with Mozilla Readability, falling back to the full body when it finds no article
|
|
17
|
+
*/
|
|
18
|
+
private extractWithReadability;
|
|
19
|
+
/**
|
|
20
|
+
* Extract page metadata (title, description, author, dates, site name, keywords, image, lang); returns only the url if extraction fails
|
|
21
|
+
*/
|
|
22
|
+
extractMetadata(html: string, url: string): PageMetadata;
|
|
23
|
+
/**
|
|
24
|
+
* Split the content of meta[name="keywords"] on commas into trimmed, non-empty keywords; undefined when the tag is missing or empty
|
|
25
|
+
*/
|
|
26
|
+
private extractKeywords;
|
|
27
|
+
}
|
|
28
|
+
//# sourceMappingURL=extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,cAAc,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAGxF,qBAAa,gBAAgB;IAC3B;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAS5B;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,GAAG,gBAAgB;IAwC7E;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAUzB;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAsB9B;;OAEG;IACH,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,YAAY;IAyDxD;;OAEG;IACH,OAAO,CAAC,eAAe;CASxB"}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
2
|
+
import { Readability } from '@mozilla/readability';
|
|
3
|
+
import { ExtractionError } from '../types/index.js';
|
|
4
|
+
export class ContentExtractor {
|
|
5
|
+
/**
|
|
6
|
+
* Create a virtual console that suppresses CSS errors
|
|
7
|
+
*/
|
|
8
|
+
createVirtualConsole() {
|
|
9
|
+
const virtualConsole = new VirtualConsole();
|
|
10
|
+
// Suppress CSS parsing errors (they don't affect content extraction)
|
|
11
|
+
virtualConsole.on('error', () => {
|
|
12
|
+
// Ignore errors
|
|
13
|
+
});
|
|
14
|
+
return virtualConsole;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* Extract content from HTML
|
|
18
|
+
*/
|
|
19
|
+
extract(html, url, options) {
|
|
20
|
+
const { useReadability, selector } = options;
|
|
21
|
+
try {
|
|
22
|
+
const dom = new JSDOM(html, {
|
|
23
|
+
url,
|
|
24
|
+
virtualConsole: this.createVirtualConsole()
|
|
25
|
+
});
|
|
26
|
+
const document = dom.window.document;
|
|
27
|
+
// 提取元数据
|
|
28
|
+
const metadata = this.extractMetadata(html, url);
|
|
29
|
+
let content;
|
|
30
|
+
// 如果指定了选择器,使用选择器提取
|
|
31
|
+
if (selector) {
|
|
32
|
+
content = this.extractBySelector(document, selector);
|
|
33
|
+
}
|
|
34
|
+
// 如果禁用了 readability,返回整个 body
|
|
35
|
+
else if (!useReadability) {
|
|
36
|
+
content = document.body.innerHTML || '';
|
|
37
|
+
}
|
|
38
|
+
// 使用 readability 提取主要内容
|
|
39
|
+
else {
|
|
40
|
+
content = this.extractWithReadability(document, url);
|
|
41
|
+
}
|
|
42
|
+
return {
|
|
43
|
+
content,
|
|
44
|
+
metadata
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
catch (error) {
|
|
48
|
+
throw new ExtractionError(url, `Failed to extract content: ${error.message}`);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Extract content using CSS selector
|
|
53
|
+
*/
|
|
54
|
+
extractBySelector(document, selector) {
|
|
55
|
+
const element = document.querySelector(selector);
|
|
56
|
+
if (!element) {
|
|
57
|
+
throw new Error(`Selector "${selector}" not found in the document`);
|
|
58
|
+
}
|
|
59
|
+
return element.innerHTML;
|
|
60
|
+
}
|
|
61
|
+
/**
|
|
62
|
+
* Extract content using Mozilla Readability
|
|
63
|
+
*/
|
|
64
|
+
extractWithReadability(document, url) {
|
|
65
|
+
// Clone document for readability (它会修改 DOM)
|
|
66
|
+
const documentClone = document.cloneNode(true);
|
|
67
|
+
const reader = new Readability(documentClone, {
|
|
68
|
+
debug: false,
|
|
69
|
+
maxElemsToParse: 0, // 无限制
|
|
70
|
+
nbTopCandidates: 5,
|
|
71
|
+
charThreshold: 500
|
|
72
|
+
});
|
|
73
|
+
const article = reader.parse();
|
|
74
|
+
if (!article || !article.content) {
|
|
75
|
+
// 如果 readability 失败,回退到原始 body
|
|
76
|
+
console.warn(`Readability failed for ${url}, falling back to full body content`);
|
|
77
|
+
return document.body.innerHTML || '';
|
|
78
|
+
}
|
|
79
|
+
return article.content;
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Extract metadata from the page
|
|
83
|
+
*/
|
|
84
|
+
extractMetadata(html, url) {
|
|
85
|
+
try {
|
|
86
|
+
const dom = new JSDOM(html, {
|
|
87
|
+
url,
|
|
88
|
+
virtualConsole: this.createVirtualConsole()
|
|
89
|
+
});
|
|
90
|
+
const document = dom.window.document;
|
|
91
|
+
// 使用 readability 获取元数据
|
|
92
|
+
const documentClone = document.cloneNode(true);
|
|
93
|
+
const reader = new Readability(documentClone);
|
|
94
|
+
const article = reader.parse();
|
|
95
|
+
// Helper function to get meta content
|
|
96
|
+
const getMeta = (selectors) => {
|
|
97
|
+
for (const selector of selectors) {
|
|
98
|
+
const element = document.querySelector(selector);
|
|
99
|
+
const content = element?.getAttribute('content') || element?.getAttribute('value');
|
|
100
|
+
if (content)
|
|
101
|
+
return content;
|
|
102
|
+
}
|
|
103
|
+
return undefined;
|
|
104
|
+
};
|
|
105
|
+
// Extract comprehensive metadata
|
|
106
|
+
const metadata = {
|
|
107
|
+
url,
|
|
108
|
+
title: article?.title ||
|
|
109
|
+
getMeta(['meta[property="og:title"]', 'meta[name="twitter:title"]']) ||
|
|
110
|
+
document.querySelector('title')?.textContent ||
|
|
111
|
+
undefined,
|
|
112
|
+
description: article?.excerpt ||
|
|
113
|
+
getMeta(['meta[name="description"]', 'meta[property="og:description"]', 'meta[name="twitter:description"]']) ||
|
|
114
|
+
undefined,
|
|
115
|
+
author: article?.byline ||
|
|
116
|
+
getMeta(['meta[name="author"]', 'meta[property="article:author"]']) ||
|
|
117
|
+
undefined,
|
|
118
|
+
publishedTime: getMeta(['meta[property="article:published_time"]', 'meta[name="publish_date"]', 'meta[property="og:published_time"]']) ||
|
|
119
|
+
undefined,
|
|
120
|
+
modifiedTime: getMeta(['meta[property="article:modified_time"]', 'meta[property="og:updated_time"]']) ||
|
|
121
|
+
undefined,
|
|
122
|
+
siteName: article?.siteName ||
|
|
123
|
+
getMeta(['meta[property="og:site_name"]']) ||
|
|
124
|
+
undefined,
|
|
125
|
+
keywords: this.extractKeywords(document),
|
|
126
|
+
image: getMeta(['meta[property="og:image"]', 'meta[name="twitter:image"]']) ||
|
|
127
|
+
undefined,
|
|
128
|
+
lang: document.documentElement.lang ||
|
|
129
|
+
getMeta(['meta[property="og:locale"]']) ||
|
|
130
|
+
undefined
|
|
131
|
+
};
|
|
132
|
+
return metadata;
|
|
133
|
+
}
|
|
134
|
+
catch (error) {
|
|
135
|
+
return { url };
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* Extract keywords from meta tags
|
|
140
|
+
*/
|
|
141
|
+
extractKeywords(document) {
|
|
142
|
+
const keywordsContent = document.querySelector('meta[name="keywords"]')?.getAttribute('content');
|
|
143
|
+
if (!keywordsContent)
|
|
144
|
+
return undefined;
|
|
145
|
+
return keywordsContent
|
|
146
|
+
.split(',')
|
|
147
|
+
.map(k => k.trim())
|
|
148
|
+
.filter(k => k.length > 0);
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
//# sourceMappingURL=extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,MAAM,OAAO,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,MAAM,OAAO,gBAAgB;IAC3B;;OAEG;IACK,oBAAoB;QAC1B,MAAM,cAAc,GAAG,IAAI,cAAc,EAAE,CAAC;QAC5C,qEAAqE;QACrE,cAAc,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAC9B,gBAAgB;QAClB,CAAC,CAAC,CAAC;QACH,OAAO,cAAc,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,IAAY,EAAE,GAAW,EAAE,OAAuB;QACxD,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QAE7C,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;gBAC1B,GAAG;gBACH,cAAc,EAAE,IAAI,CAAC,oBAAoB,EAAE;aAC5C,CAAC,CAAC;YACH,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,QAAQ;YACR,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEjD,IAAI,OAAe,CAAC;YAEpB,mBAAmB;YACnB,IAAI,QAAQ,EAAE,CAAC;gBACb,OAAO,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;YACvD,CAAC;YACD,8BAA8B;iBACzB,IAAI,CAAC,cAAc,EAAE,CAAC;gBACzB,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;YAC1C,CAAC;YACD,wBAAwB;iBACnB,CAAC;gBACJ,OAAO,GAAG,IAAI,CAAC,sBAAsB,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;YACvD,CAAC;YAED,OAAO;gBACL,OAAO;gBACP,QAAQ;aACT,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,eAAe,CACvB,GAAG,EACH,8BAA+B,KAAe,CAAC,OAAO,EAAE,CACzD,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,QAAkB,EAAE,QAAgB;QAC5D,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QAEjD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,6BAA6B,CAAC,CAAC;QACtE,CAAC;QAED,OAAO,OAAO,CAAC,SAAS,CAAC;IAC3B,CAAC;IAED;;OAEG;IACK,sBAAsB,CAAC,QAAkB,EAAE,GAAW;QAC5D,4CAA4C;QAC5C,MAAM,aAAa,GAAG,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC;QAE3D,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,aAAa,EAAE;YAC5C,KAAK,EAAE,KAAK;YACZ,eAAe,EAAE,CAAC,EAAE,MAAM;YAC1B,eAAe,EAAE,CAAC;YAClB,aAAa,EAAE,GAAG;SACnB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,+BAA+B;YAC/B,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,qCAAqC,CA
AC,CAAC;YACjF,OAAO,QAAQ,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;QACvC,CAAC;QAED,OAAO,OAAO,CAAC,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,IAAY,EAAE,GAAW;QACvC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;gBAC1B,GAAG;gBACH,cAAc,EAAE,IAAI,CAAC,oBAAoB,EAAE;aAC5C,CAAC,CAAC;YACH,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,uBAAuB;YACvB,MAAM,aAAa,GAAG,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC;YAC3D,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,aAAa,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,sCAAsC;YACtC,MAAM,OAAO,GAAG,CAAC,SAAmB,EAAsB,EAAE;gBAC1D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;oBACjC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;oBACjD,MAAM,OAAO,GAAG,OAAO,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,OAAO,EAAE,YAAY,CAAC,OAAO,CAAC,CAAC;oBACnF,IAAI,OAAO;wBAAE,OAAO,OAAO,CAAC;gBAC9B,CAAC;gBACD,OAAO,SAAS,CAAC;YACnB,CAAC,CAAC;YAEF,iCAAiC;YACjC,MAAM,QAAQ,GAAiB;gBAC7B,GAAG;gBACH,KAAK,EAAE,OAAO,EAAE,KAAK;oBACd,OAAO,CAAC,CAAC,2BAA2B,EAAE,4BAA4B,CAAC,CAAC;oBACpE,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW;oBAC5C,SAAS;gBAChB,WAAW,EAAE,OAAO,EAAE,OAAO;oBACjB,OAAO,CAAC,CAAC,0BAA0B,EAAE,iCAAiC,EAAE,kCAAkC,CAAC,CAAC;oBAC5G,SAAS;gBACrB,MAAM,EAAE,OAAO,EAAE,MAAM;oBAChB,OAAO,CAAC,CAAC,qBAAqB,EAAE,iCAAiC,CAAC,CAAC;oBACnE,SAAS;gBAChB,aAAa,EAAE,OAAO,CAAC,CAAC,yCAAyC,EAAE,2BAA2B,EAAE,oCAAoC,CAAC,CAAC;oBACxH,SAAS;gBACvB,YAAY,EAAE,OAAO,CAAC,CAAC,wCAAwC,EAAE,kCAAkC,CAAC,CAAC;oBACxF,SAAS;gBACtB,QAAQ,EAAE,OAAO,EAAE,QAAQ;oBAClB,OAAO,CAAC,CAAC,+BAA+B,CAAC,CAAC;oBAC1C,SAAS;gBAClB,QAAQ,EAAE,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC;gBACxC,KAAK,EAAE,OAAO,CAAC,CAAC,2BAA2B,EAAE,4BAA4B,CAAC,CAAC;oBACrE,SAAS;gBACf,IAAI,EAAE,QAAQ,CAAC,eAAe,CAAC,IAAI;oBAC9B,OAAO,CAAC,CAAC,4BAA4B,CAAC,CAAC;oBACvC,SAAS;aACf,CAAC;YAEF,OAAO,QAAQ,CAAC;QAClB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,EAAE,GAAG,EAAE,CAAC;QACjB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,QAAkB;QACxC,MAAM,eAAe,GAAG,QAAQ,CAAC,aAAa,CAAC,uBAAuB,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QACjG,IAAI,CAAC,eAAe;YAAE,OAAO,SAAS,CAAC;QAEvC,OAAO,eAAe;aACnB,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAAC,
CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC/B,CAAC;CACF"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { FetchOptions } from '../types/index.js';
|
|
2
|
+
export declare class Fetcher {
|
|
3
|
+
/**
|
|
4
|
+
* Resolve the proxy URL: an explicit option wins, otherwise the NO_PROXY / HTTPS_PROXY / HTTP_PROXY environment variables are consulted
|
|
5
|
+
*/
|
|
6
|
+
private getProxyUrl;
|
|
7
|
+
/**
|
|
8
|
+
* Fetch HTML content from a URL, retrying failed attempts; rejects with a FetchError once every attempt has failed
|
|
9
|
+
*/
|
|
10
|
+
fetch(url: string, options?: FetchOptions): Promise<string>;
|
|
11
|
+
/**
|
|
12
|
+
* Fetch multiple URLs; resolves with one entry per URL holding either its html or the error that occurred
|
|
13
|
+
*/
|
|
14
|
+
fetchBatch(urls: string[], options?: FetchOptions, concurrent?: number): Promise<Array<{
|
|
15
|
+
url: string;
|
|
16
|
+
|
|
17
|
+
html?: string;
|
|
18
|
+
|
|
19
|
+
error?: Error;
|
|
20
|
+
|
|
21
|
+
}>>;
|
|
22
|
+
/**
|
|
23
|
+
* Sleep helper for retry delays
|
|
24
|
+
*/
|
|
25
|
+
private sleep;
|
|
26
|
+
}
|
|
24
|
+
//# sourceMappingURL=fetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAKtD,qBAAa,OAAO;IAClB;;OAEG;IACH,OAAO,CAAC,WAAW;IAgCnB;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,MAAM,CAAC;IAoErE;;OAEG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,YAAiB,EAC1B,UAAU,GAAE,MAAU,GACrB,OAAO,CAAC,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,KAAK,CAAA;KAAE,CAAC,CAAC;IAiBhE;;OAEG;IACH,OAAO,CAAC,KAAK;CAGd"}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import { FetchError } from '../types/index.js';
|
|
2
|
+
import { DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, RETRY_ATTEMPTS, RETRY_DELAY } from '../constants.js';
|
|
3
|
+
import { ProxyAgent } from 'undici';
|
|
4
|
+
export class Fetcher {
|
|
5
|
+
/**
|
|
6
|
+
* Get proxy URL from environment variables or options
|
|
7
|
+
*/
|
|
8
|
+
getProxyUrl(url, proxyOption) {
|
|
9
|
+
// 优先使用命令行参数中的代理
|
|
10
|
+
if (proxyOption) {
|
|
11
|
+
return proxyOption;
|
|
12
|
+
}
|
|
13
|
+
// 从环境变量读取代理
|
|
14
|
+
const urlObj = new URL(url);
|
|
15
|
+
const protocol = urlObj.protocol;
|
|
16
|
+
// 检查 NO_PROXY
|
|
17
|
+
const noProxy = process.env.NO_PROXY || process.env.no_proxy;
|
|
18
|
+
if (noProxy) {
|
|
19
|
+
const noProxyList = noProxy.split(',').map(s => s.trim());
|
|
20
|
+
if (noProxyList.some(pattern => {
|
|
21
|
+
if (pattern === '*')
|
|
22
|
+
return true;
|
|
23
|
+
if (pattern.startsWith('.'))
|
|
24
|
+
return urlObj.hostname.endsWith(pattern);
|
|
25
|
+
return urlObj.hostname === pattern;
|
|
26
|
+
})) {
|
|
27
|
+
return undefined;
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
// 根据协议选择代理
|
|
31
|
+
if (protocol === 'https:') {
|
|
32
|
+
return process.env.HTTPS_PROXY || process.env.https_proxy ||
|
|
33
|
+
process.env.HTTP_PROXY || process.env.http_proxy;
|
|
34
|
+
}
|
|
35
|
+
else {
|
|
36
|
+
return process.env.HTTP_PROXY || process.env.http_proxy;
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Fetch HTML content from a URL
|
|
41
|
+
*/
|
|
42
|
+
async fetch(url, options = {}) {
|
|
43
|
+
const { headers = {}, timeout = DEFAULT_TIMEOUT, userAgent = DEFAULT_USER_AGENT, proxy } = options;
|
|
44
|
+
// 获取代理 URL
|
|
45
|
+
const proxyUrl = this.getProxyUrl(url, proxy);
|
|
46
|
+
// 构建 fetch 选项
|
|
47
|
+
const fetchOptions = {
|
|
48
|
+
headers: {
|
|
49
|
+
'User-Agent': userAgent,
|
|
50
|
+
...headers
|
|
51
|
+
},
|
|
52
|
+
signal: AbortSignal.timeout(timeout)
|
|
53
|
+
};
|
|
54
|
+
// 如果有代理,使用 ProxyAgent
|
|
55
|
+
if (proxyUrl) {
|
|
56
|
+
fetchOptions.dispatcher = new ProxyAgent(proxyUrl);
|
|
57
|
+
}
|
|
58
|
+
let lastError = null;
|
|
59
|
+
// 重试逻辑
|
|
60
|
+
for (let attempt = 0; attempt < RETRY_ATTEMPTS; attempt++) {
|
|
61
|
+
try {
|
|
62
|
+
const response = await fetch(url, fetchOptions);
|
|
63
|
+
if (!response.ok) {
|
|
64
|
+
throw new FetchError(url, response.status, `HTTP ${response.status}: ${response.statusText}`);
|
|
65
|
+
}
|
|
66
|
+
const html = await response.text();
|
|
67
|
+
return html;
|
|
68
|
+
}
|
|
69
|
+
catch (error) {
|
|
70
|
+
lastError = error;
|
|
71
|
+
// 如果是最后一次尝试,抛出错误
|
|
72
|
+
if (attempt === RETRY_ATTEMPTS - 1) {
|
|
73
|
+
break;
|
|
74
|
+
}
|
|
75
|
+
// 指数退避:等待时间 = RETRY_DELAY * 2^attempt
|
|
76
|
+
const delay = RETRY_DELAY * Math.pow(2, attempt);
|
|
77
|
+
await this.sleep(delay);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
// 所有重试都失败了
|
|
81
|
+
if (lastError instanceof FetchError) {
|
|
82
|
+
throw lastError;
|
|
83
|
+
}
|
|
84
|
+
throw new FetchError(url, undefined, `Failed to fetch after ${RETRY_ATTEMPTS} attempts: ${lastError?.message}`);
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Fetch multiple URLs with concurrency control (for future batch processing)
|
|
88
|
+
*/
|
|
89
|
+
async fetchBatch(urls, options = {}, concurrent = 3) {
|
|
90
|
+
// 这个方法在 Phase 3 批量处理时实现
|
|
91
|
+
// 目前先返回一个简单的顺序实现
|
|
92
|
+
const results = [];
|
|
93
|
+
for (const url of urls) {
|
|
94
|
+
try {
|
|
95
|
+
const html = await this.fetch(url, options);
|
|
96
|
+
results.push({ url, html });
|
|
97
|
+
}
|
|
98
|
+
catch (error) {
|
|
99
|
+
results.push({ url, error: error });
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
return results;
|
|
103
|
+
}
|
|
104
|
+
/**
|
|
105
|
+
* Sleep helper for retry delays
|
|
106
|
+
*/
|
|
107
|
+
sleep(ms) {
|
|
108
|
+
return new Promise(resolve => setTimeout(resolve, ms));
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
//# sourceMappingURL=fetcher.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AACnG,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,MAAM,OAAO,OAAO;IAClB;;OAEG;IACK,WAAW,CAAC,GAAW,EAAE,WAAoB;QACnD,gBAAgB;QAChB,IAAI,WAAW,EAAE,CAAC;YAChB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,YAAY;QACZ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAEjC,cAAc;QACd,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC;QAC7D,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC1D,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;gBAC7B,IAAI,OAAO,KAAK,GAAG;oBAAE,OAAO,IAAI,CAAC;gBACjC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;oBAAE,OAAO,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACtE,OAAO,MAAM,CAAC,QAAQ,KAAK,OAAO,CAAC;YACrC,CAAC,CAAC,EAAE,CAAC;gBACH,OAAO,SAAS,CAAC;YACnB,CAAC;QACH,CAAC;QAED,WAAW;QACX,IAAI,QAAQ,KAAK,QAAQ,EAAE,CAAC;YAC1B,OAAO,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,OAAO,CAAC,GAAG,CAAC,WAAW;gBAClD,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QAC1D,CAAC;aAAM,CAAC;YACN,OAAO,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QAC1D,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAAwB,EAAE;QACjD,MAAM,EACJ,OAAO,GAAG,EAAE,EACZ,OAAO,GAAG,eAAe,EACzB,SAAS,GAAG,kBAAkB,EAC9B,KAAK,EACN,GAAG,OAAO,CAAC;QAEZ,WAAW;QACX,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAE9C,cAAc;QACd,MAAM,YAAY,GAA8C;YAC9D,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,GAAG,OAAO;aACX;YACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC;SACrC,CAAC;QAEF,sBAAsB;QACtB,IAAI,QAAQ,EAAE,CAAC;YACb,YAAY,CAAC,UAAU,GAAG,IAAI,UAAU,CAAC,QAAQ,CAAC,CAAC;QACrD,CAAC;QAED,IAAI,SAAS,GAAiB,IAAI,CAAC;QAEnC,OAAO;QACP,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,GAAG,cAAc,EAAE,OAAO,EAAE,EAAE,CAAC;YAC1D,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,YAA
Y,CAAC,CAAC;gBAEhD,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,UAAU,CAClB,GAAG,EACH,QAAQ,CAAC,MAAM,EACf,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAClD,CAAC;gBACJ,CAAC;gBAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACnC,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,SAAS,GAAG,KAAc,CAAC;gBAE3B,iBAAiB;gBACjB,IAAI,OAAO,KAAK,cAAc,GAAG,CAAC,EAAE,CAAC;oBACnC,MAAM;gBACR,CAAC;gBAED,sCAAsC;gBACtC,MAAM,KAAK,GAAG,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBACjD,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,WAAW;QACX,IAAI,SAAS,YAAY,UAAU,EAAE,CAAC;YACpC,MAAM,SAAS,CAAC;QAClB,CAAC;QAED,MAAM,IAAI,UAAU,CAClB,GAAG,EACH,SAAS,EACT,yBAAyB,cAAc,cAAc,SAAS,EAAE,OAAO,EAAE,CAC1E,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CACd,IAAc,EACd,UAAwB,EAAE,EAC1B,aAAqB,CAAC;QAEtB,wBAAwB;QACxB,iBAAiB;QACjB,MAAM,OAAO,GAAyD,EAAE,CAAC;QAEzE,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;gBAC5C,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;YAC9B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,KAAc,EAAE,CAAC,CAAC;YAC/C,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,EAAU;QACtB,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC;CACF"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ProcessOptions, FetchResult } from '../types/index.js';
|
|
2
|
+
export declare class ContentProcessor {
    private fetcher;
    private browserFetcher;
    private extractor;
    private converter;
    constructor();
    /**
     * Run the single-URL pipeline: fetch, then extract, then convert.
     *
     * @param url - Address of the page to process.
     * @param options - Processing options for this run.
     * @returns A promise for the resulting string
     *   (presumably the converted document; confirm against the implementation).
     */
    process(url: string, options: ProcessOptions): Promise<string>;
    /**
     * Batch mode: process a list of URLs.
     * Concurrency control is planned for Phase 3.
     *
     * @param urls - Addresses of the pages to process.
     * @param options - Processing options for the batch.
     * @returns A promise for an array of `FetchResult` entries.
     */
    processBatch(urls: string[], options: ProcessOptions): Promise<FetchResult[]>;
    /**
     * Release held resources (closes the browser if one was opened).
     *
     * @returns A promise that settles when cleanup has finished.
     */
    cleanup(): Promise<void>;
}
|
|
22
|
+
//# sourceMappingURL=processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../../src/core/processor.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAErE,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,OAAO,CAAU;IACzB,OAAO,CAAC,cAAc,CAA+B;IACrD,OAAO,CAAC,SAAS,CAAmB;IACpC,OAAO,CAAC,SAAS,CAAoB;;IAQrC;;OAEG;IACG,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC;IAiEpE;;;OAGG;IACG,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IAuBnF;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}
|