md-fetch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/AGENTS.md +212 -0
  2. package/LICENSE +21 -0
  3. package/README.md +449 -0
  4. package/README.zh-CN.md +449 -0
  5. package/dist/cli.d.ts +27 -0
  6. package/dist/cli.d.ts.map +1 -0
  7. package/dist/cli.js +158 -0
  8. package/dist/cli.js.map +1 -0
  9. package/dist/constants.d.ts +9 -0
  10. package/dist/constants.d.ts.map +1 -0
  11. package/dist/constants.js +15 -0
  12. package/dist/constants.js.map +1 -0
  13. package/dist/core/browser.d.ts +23 -0
  14. package/dist/core/browser.d.ts.map +1 -0
  15. package/dist/core/browser.js +125 -0
  16. package/dist/core/browser.js.map +1 -0
  17. package/dist/core/converter.d.ts +18 -0
  18. package/dist/core/converter.d.ts.map +1 -0
  19. package/dist/core/converter.js +74 -0
  20. package/dist/core/converter.js.map +1 -0
  21. package/dist/core/extractor.d.ts +28 -0
  22. package/dist/core/extractor.d.ts.map +1 -0
  23. package/dist/core/extractor.js +151 -0
  24. package/dist/core/extractor.js.map +1 -0
  25. package/dist/core/fetcher.d.ts +24 -0
  26. package/dist/core/fetcher.d.ts.map +1 -0
  27. package/dist/core/fetcher.js +111 -0
  28. package/dist/core/fetcher.js.map +1 -0
  29. package/dist/core/processor.d.ts +22 -0
  30. package/dist/core/processor.d.ts.map +1 -0
  31. package/dist/core/processor.js +104 -0
  32. package/dist/core/processor.js.map +1 -0
  33. package/dist/core/screenshotter.d.ts +31 -0
  34. package/dist/core/screenshotter.d.ts.map +1 -0
  35. package/dist/core/screenshotter.js +222 -0
  36. package/dist/core/screenshotter.js.map +1 -0
  37. package/dist/index.d.ts +3 -0
  38. package/dist/index.d.ts.map +1 -0
  39. package/dist/index.js +14 -0
  40. package/dist/index.js.map +1 -0
  41. package/dist/screen-cli.d.ts +26 -0
  42. package/dist/screen-cli.d.ts.map +1 -0
  43. package/dist/screen-cli.js +196 -0
  44. package/dist/screen-cli.js.map +1 -0
  45. package/dist/screen.d.ts +3 -0
  46. package/dist/screen.d.ts.map +1 -0
  47. package/dist/screen.js +14 -0
  48. package/dist/screen.js.map +1 -0
  49. package/dist/types/index.d.ts +151 -0
  50. package/dist/types/index.d.ts.map +1 -0
  51. package/dist/types/index.js +42 -0
  52. package/dist/types/index.js.map +1 -0
  53. package/dist/utils/filename-sanitizer.d.ts +38 -0
  54. package/dist/utils/filename-sanitizer.d.ts.map +1 -0
  55. package/dist/utils/filename-sanitizer.js +79 -0
  56. package/dist/utils/filename-sanitizer.js.map +1 -0
  57. package/dist/utils/frontmatter.d.ts +6 -0
  58. package/dist/utils/frontmatter.d.ts.map +1 -0
  59. package/dist/utils/frontmatter.js +65 -0
  60. package/dist/utils/frontmatter.js.map +1 -0
  61. package/package.json +56 -0
  62. package/skills/md-fetch/SKILL.md +133 -0
  63. package/skills/md-fetch/references/cli-reference.md +257 -0
  64. package/src/cli.ts +169 -0
  65. package/src/constants.ts +17 -0
  66. package/src/core/browser.ts +161 -0
  67. package/src/core/converter.ts +82 -0
  68. package/src/core/extractor.ts +172 -0
  69. package/src/core/fetcher.ts +143 -0
  70. package/src/core/processor.ts +124 -0
  71. package/src/core/screenshotter.ts +289 -0
  72. package/src/index.ts +15 -0
  73. package/src/screen-cli.ts +216 -0
  74. package/src/screen.ts +15 -0
  75. package/src/types/index.ts +227 -0
  76. package/src/utils/filename-sanitizer.ts +88 -0
  77. package/src/utils/frontmatter.ts +81 -0
  78. package/tsconfig.json +20 -0
@@ -0,0 +1,23 @@
1
+ import type { BrowserOptions } from '../types/index.js';
2
+ export declare class BrowserFetcher {
3
+ private browser;
4
+ private options;
5
+ constructor(options?: BrowserOptions);
6
+ /**
7
+ * Launch the browser instance; returns immediately when one is already running and rejects with a BrowserError when no Chrome/Chromium executable is found or the launch fails.
8
+ */
9
+ launch(): Promise<void>;
10
+ /**
11
+ * Open a new page (launching the browser first if needed), navigate to `url` and resolve with the page's HTML; the page is closed afterwards and failures are wrapped in a BrowserError.
12
+ */
13
+ fetchPage(url: string): Promise<string>;
14
+ /**
15
+ * Close the browser instance if one is open and clear the stored handle.
16
+ */
17
+ close(): Promise<void>;
18
+ /**
19
+ * Find a Chrome executable: probes a fixed list of Windows, macOS and Linux install paths, then falls back to the CHROME_PATH / CHROMIUM_PATH environment variables.
20
+ */
21
+ private findChromePath;
22
+ }
23
+ //# sourceMappingURL=browser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../../src/core/browser.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,mBAAmB,CAAC;AAKxD,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAwB;IACvC,OAAO,CAAC,OAAO,CAAiB;gBAEpB,OAAO,GAAE,cAAmB;IAIxC;;OAEG;IACG,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;IAiD7B;;OAEG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IA8C7C;;OAEG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAO5B;;OAEG;IACH,OAAO,CAAC,cAAc;CAgCvB"}
@@ -0,0 +1,125 @@
1
+ import puppeteer from 'puppeteer-core';
2
+ import { BrowserError } from '../types/index.js';
3
+ import { DEFAULT_TIMEOUT, DEFAULT_WAIT_UNTIL } from '../constants.js';
4
+ import { existsSync } from 'fs';
5
+ export class BrowserFetcher {
6
+ browser = null;
7
+ options;
8
+ constructor(options = {}) {
9
+ this.options = options;
10
+ }
11
+ /**
12
+ * Launch browser instance
13
+ */
14
+ async launch() {
15
+ if (this.browser) {
16
+ return; // Already launched
17
+ }
18
+ const { executablePath, headless = true, timeout = DEFAULT_TIMEOUT, proxy } = this.options;
19
+ // 查找 Chrome 可执行文件
20
+ const chromePath = executablePath || this.findChromePath();
21
+ if (!chromePath) {
22
+ throw new BrowserError('', 'Chrome/Chromium not found. Please install Chrome or specify the path with --browser-path');
23
+ }
24
+ const launchOptions = {
25
+ executablePath: chromePath,
26
+ headless,
27
+ timeout,
28
+ args: [
29
+ '--no-sandbox',
30
+ '--disable-setuid-sandbox',
31
+ '--disable-dev-shm-usage',
32
+ '--disable-gpu'
33
+ ]
34
+ };
35
+ // 添加代理参数
36
+ if (proxy) {
37
+ launchOptions.args.push(`--proxy-server=${proxy}`);
38
+ }
39
+ try {
40
+ this.browser = await puppeteer.launch(launchOptions);
41
+ }
42
+ catch (error) {
43
+ throw new BrowserError('', `Failed to launch browser: ${error.message}`);
44
+ }
45
+ }
46
+ /**
47
+ * Fetch page content using browser
48
+ */
49
+ async fetchPage(url) {
50
+ if (!this.browser) {
51
+ await this.launch();
52
+ }
53
+ const { waitUntil = DEFAULT_WAIT_UNTIL, timeout = DEFAULT_TIMEOUT, userAgent } = this.options;
54
+ let page = null;
55
+ try {
56
+ page = await this.browser.newPage();
57
+ // 设置 User-Agent
58
+ if (userAgent) {
59
+ await page.setUserAgent(userAgent);
60
+ }
61
+ // 设置超时
62
+ page.setDefaultTimeout(timeout);
63
+ // 导航到页面
64
+ await page.goto(url, {
65
+ waitUntil,
66
+ timeout
67
+ });
68
+ // 获取 HTML 内容
69
+ const html = await page.content();
70
+ return html;
71
+ }
72
+ catch (error) {
73
+ throw new BrowserError(url, `Failed to fetch page: ${error.message}`);
74
+ }
75
+ finally {
76
+ if (page) {
77
+ await page.close();
78
+ }
79
+ }
80
+ }
81
+ /**
82
+ * Close browser instance
83
+ */
84
+ async close() {
85
+ if (this.browser) {
86
+ await this.browser.close();
87
+ this.browser = null;
88
+ }
89
+ }
90
+ /**
91
+ * Find Chrome executable path
92
+ */
93
+ findChromePath() {
94
+ // 常见的 Chrome 安装路径
95
+ const possiblePaths = [
96
+ // Windows
97
+ 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
98
+ 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
99
+ process.env.LOCALAPPDATA + '\\Google\\Chrome\\Application\\chrome.exe',
100
+ // macOS
101
+ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
102
+ '/Applications/Chromium.app/Contents/MacOS/Chromium',
103
+ // Linux
104
+ '/usr/bin/google-chrome',
105
+ '/usr/bin/google-chrome-stable',
106
+ '/usr/bin/chromium',
107
+ '/usr/bin/chromium-browser',
108
+ '/snap/bin/chromium'
109
+ ];
110
+ // 尝试查找可用的路径
111
+ for (const path of possiblePaths) {
112
+ try {
113
+ if (existsSync(path)) {
114
+ return path;
115
+ }
116
+ }
117
+ catch {
118
+ continue;
119
+ }
120
+ }
121
+ // 从环境变量读取
122
+ return process.env.CHROME_PATH || process.env.CHROMIUM_PATH;
123
+ }
124
+ }
125
+ //# sourceMappingURL=browser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser.js","sourceRoot":"","sources":["../../src/core/browser.ts"],"names":[],"mappings":"AAAA,OAAO,SAAmE,MAAM,gBAAgB,CAAC;AAEjG,OAAO,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AACtE,OAAO,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAEhC,MAAM,OAAO,cAAc;IACjB,OAAO,GAAmB,IAAI,CAAC;IAC/B,OAAO,CAAiB;IAEhC,YAAY,UAA0B,EAAE;QACtC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,MAAM;QACV,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,OAAO,CAAC,mBAAmB;QAC7B,CAAC;QAED,MAAM,EACJ,cAAc,EACd,QAAQ,GAAG,IAAI,EACf,OAAO,GAAG,eAAe,EACzB,KAAK,EACN,GAAG,IAAI,CAAC,OAAO,CAAC;QAEjB,kBAAkB;QAClB,MAAM,UAAU,GAAG,cAAc,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;QAE3D,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,MAAM,IAAI,YAAY,CACpB,EAAE,EACF,0FAA0F,CAC3F,CAAC;QACJ,CAAC;QAED,MAAM,aAAa,GAA2B;YAC5C,cAAc,EAAE,UAAU;YAC1B,QAAQ;YACR,OAAO;YACP,IAAI,EAAE;gBACJ,cAAc;gBACd,0BAA0B;gBAC1B,yBAAyB;gBACzB,eAAe;aAChB;SACF,CAAC;QAEF,SAAS;QACT,IAAI,KAAK,EAAE,CAAC;YACV,aAAa,CAAC,IAAK,CAAC,IAAI,CAAC,kBAAkB,KAAK,EAAE,CAAC,CAAC;QACtD,CAAC;QAED,IAAI,CAAC;YACH,IAAI,CAAC,OAAO,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC,aAAa,CAAC,CAAC;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,YAAY,CACpB,EAAE,EACF,6BAA8B,KAAe,CAAC,OAAO,EAAE,CACxD,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,GAAW;QACzB,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;QACtB,CAAC;QAED,MAAM,EACJ,SAAS,GAAG,kBAAkB,EAC9B,OAAO,GAAG,eAAe,EACzB,SAAS,EACV,GAAG,IAAI,CAAC,OAAO,CAAC;QAEjB,IAAI,IAAI,GAAgB,IAAI,CAAC;QAE7B,IAAI,CAAC;YACH,IAAI,GAAG,MAAM,IAAI,CAAC,OAAQ,CAAC,OAAO,EAAE,CAAC;YAErC,gBAAgB;YAChB,IAAI,SAAS,EAAE,CAAC;gBACd,MAAM,IAAI,CAAC,YAAY,CAAC,SAAS,CAAC,CAAC;YACrC,CAAC;YAED,OAAO;YACP,IAAI,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAEhC,QAAQ;YACR,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,SAAS;gBACT,OAAO;aACR,CAAC,CAAC;YAEH,aAAa;YACb,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAElC,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,YAAY,CACpB,GAAG,EACH,yBAA0B,KAAe,CAAC,OAAO,EAAE,CACpD,CAAC
;QACJ,CAAC;gBAAS,CAAC;YACT,IAAI,IAAI,EAAE,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,cAAc;QACpB,kBAAkB;QAClB,MAAM,aAAa,GAAG;YACpB,UAAU;YACV,4DAA4D;YAC5D,kEAAkE;YAClE,OAAO,CAAC,GAAG,CAAC,YAAY,GAAG,2CAA2C;YACtE,QAAQ;YACR,8DAA8D;YAC9D,oDAAoD;YACpD,QAAQ;YACR,wBAAwB;YACxB,+BAA+B;YAC/B,mBAAmB;YACnB,2BAA2B;YAC3B,oBAAoB;SACrB,CAAC;QAEF,YAAY;QACZ,KAAK,MAAM,IAAI,IAAI,aAAa,EAAE,CAAC;YACjC,IAAI,CAAC;gBACH,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;oBACrB,OAAO,IAAI,CAAC;gBACd,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS;YACX,CAAC;QACH,CAAC;QAED,UAAU;QACV,OAAO,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC;IAC9D,CAAC;CACF"}
@@ -0,0 +1,18 @@
1
+ import type { ConversionOptions } from '../types/index.js';
2
+ export declare class MarkdownConverter {
3
+ private turndownService;
4
+ constructor(options?: ConversionOptions);
5
+ /**
6
+ * Convert HTML to Markdown with Turndown and clean up the result; throws an Error when the conversion fails.
7
+ */
8
+ convert(html: string): string;
9
+ /**
10
+ * Register the custom Turndown rules: images (alt text, source and optional title) and `<pre><code>` blocks (fenced, keeping the `language-*` class as the info string).
11
+ */
12
+ private setupCustomRules;
13
+ /**
14
+ * Clean up the generated Markdown: collapse runs of three or more newlines to two, strip trailing spaces and tabs from each line, and end the text with a single newline.
15
+ */
16
+ private cleanMarkdown;
17
+ }
18
+ //# sourceMappingURL=converter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"converter.d.ts","sourceRoot":"","sources":["../../src/core/converter.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAE3D,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,eAAe,CAAkB;gBAE7B,OAAO,GAAE,iBAAsB;IAc3C;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAS7B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAkCxB;;OAEG;IACH,OAAO,CAAC,aAAa;CAStB"}
@@ -0,0 +1,74 @@
1
+ import TurndownService from 'turndown';
2
+ export class MarkdownConverter {
3
+ turndownService;
4
+ constructor(options = {}) {
5
+ this.turndownService = new TurndownService({
6
+ headingStyle: options.headingStyle || 'atx',
7
+ codeBlockStyle: options.codeBlockStyle || 'fenced',
8
+ bulletListMarker: options.bulletListMarker || '-',
9
+ hr: '---',
10
+ emDelimiter: '*',
11
+ strongDelimiter: '**'
12
+ });
13
+ // 添加自定义规则
14
+ this.setupCustomRules();
15
+ }
16
+ /**
17
+ * Convert HTML to Markdown
18
+ */
19
+ convert(html) {
20
+ try {
21
+ const markdown = this.turndownService.turndown(html);
22
+ return this.cleanMarkdown(markdown);
23
+ }
24
+ catch (error) {
25
+ throw new Error(`Failed to convert HTML to Markdown: ${error.message}`);
26
+ }
27
+ }
28
+ /**
29
+ * Setup custom conversion rules
30
+ */
31
+ setupCustomRules() {
32
+ // 保留图片的 alt 文本和链接
33
+ this.turndownService.addRule('images', {
34
+ filter: 'img',
35
+ replacement: (content, node) => {
36
+ const alt = node.alt || '';
37
+ const src = node.src || '';
38
+ const title = node.title || '';
39
+ if (!src)
40
+ return '';
41
+ return title
42
+ ? `![${alt}](${src} "${title}")`
43
+ : `![${alt}](${src})`;
44
+ }
45
+ });
46
+ // 处理代码块,保留语言标识
47
+ this.turndownService.addRule('codeBlocks', {
48
+ filter: (node) => {
49
+ return node.nodeName === 'PRE' &&
50
+ node.firstChild?.nodeName === 'CODE';
51
+ },
52
+ replacement: (content, node) => {
53
+ const codeNode = node.firstChild;
54
+ const className = codeNode?.className || '';
55
+ const language = className.match(/language-(\w+)/)?.[1] || '';
56
+ const code = codeNode?.textContent || '';
57
+ return `\n\`\`\`${language}\n${code}\n\`\`\`\n`;
58
+ }
59
+ });
60
+ }
61
+ /**
62
+ * Clean up the generated Markdown
63
+ */
64
+ cleanMarkdown(markdown) {
65
+ return markdown
66
+ // 移除多余的空行(超过 2 个连续换行)
67
+ .replace(/\n{3,}/g, '\n\n')
68
+ // 移除行尾空格
69
+ .replace(/[ \t]+$/gm, '')
70
+ // 确保文件以换行符结尾
71
+ .trim() + '\n';
72
+ }
73
+ }
74
+ //# sourceMappingURL=converter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"converter.js","sourceRoot":"","sources":["../../src/core/converter.ts"],"names":[],"mappings":"AAAA,OAAO,eAAe,MAAM,UAAU,CAAC;AAGvC,MAAM,OAAO,iBAAiB;IACpB,eAAe,CAAkB;IAEzC,YAAY,UAA6B,EAAE;QACzC,IAAI,CAAC,eAAe,GAAG,IAAI,eAAe,CAAC;YACzC,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,KAAK;YAC3C,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,QAAQ;YAClD,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,GAAG;YACjD,EAAE,EAAE,KAAK;YACT,WAAW,EAAE,GAAG;YAChB,eAAe,EAAE,IAAI;SACtB,CAAC,CAAC;QAEH,UAAU;QACV,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAC1B,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,IAAY;QAClB,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;YACrD,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QACtC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CAAC,uCAAwC,KAAe,CAAC,OAAO,EAAE,CAAC,CAAC;QACrF,CAAC;IACH,CAAC;IAED;;OAEG;IACK,gBAAgB;QACtB,kBAAkB;QAClB,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,QAAQ,EAAE;YACrC,MAAM,EAAE,KAAK;YACb,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,GAAG,GAAI,IAAyB,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjD,MAAM,GAAG,GAAI,IAAyB,CAAC,GAAG,IAAI,EAAE,CAAC;gBACjD,MAAM,KAAK,GAAI,IAAyB,CAAC,KAAK,IAAI,EAAE,CAAC;gBAErD,IAAI,CAAC,GAAG;oBAAE,OAAO,EAAE,CAAC;gBAEpB,OAAO,KAAK;oBACV,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,KAAK,KAAK,IAAI;oBAChC,CAAC,CAAC,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;YAC1B,CAAC;SACF,CAAC,CAAC;QAEH,eAAe;QACf,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,YAAY,EAAE;YACzC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;gBACf,OAAO,IAAI,CAAC,QAAQ,KAAK,KAAK;oBACvB,IAAI,CAAC,UAAU,EAAE,QAAQ,KAAK,MAAM,CAAC;YAC9C,CAAC;YACD,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,QAAQ,GAAG,IAAI,CAAC,UAAyB,CAAC;gBAChD,MAAM,SAAS,GAAG,QAAQ,EAAE,SAAS,IAAI,EAAE,CAAC;gBAC5C,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,gBAAgB,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAE9D,MAAM,IAAI,GAAG,QAAQ,EAAE,WAAW,IAAI,EAAE,CAAC;gBACzC,OAAO,WAAW,QAAQ,KAAK,IAAI,YAAY,CAAC;YAClD,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,QAAgB;QACpC,OAAO,QAAQ;YACb,sBAAsB;aACrB,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;YAC3B,SAAS;aACR,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;YACzB,aAAa;aACZ,IAAI,EAAE,GAAG,IAAI,C
AAC;IACnB,CAAC;CACF"}
@@ -0,0 +1,28 @@
1
+ import type { ExtractOptions, ExtractedContent, PageMetadata } from '../types/index.js';
2
+ export declare class ContentExtractor {
3
+ /**
4
+ * Create a jsdom virtual console with a no-op 'error' listener, intended to suppress CSS parsing errors.
5
+ */
6
+ private createVirtualConsole;
7
+ /**
8
+ * Extract content and metadata from HTML: uses `options.selector` when given, the whole body when `useReadability` is falsy, otherwise Readability; throws an ExtractionError on failure.
9
+ */
10
+ extract(html: string, url: string, options: ExtractOptions): ExtractedContent;
11
+ /**
12
+ * Return the inner HTML of the first element matching the CSS selector; throws when nothing matches.
13
+ */
14
+ private extractBySelector;
15
+ /**
16
+ * Extract the main content using Mozilla Readability, falling back to the full body (with a console warning) when Readability yields no content.
17
+ */
18
+ private extractWithReadability;
19
+ /**
20
+ * Extract metadata (title, description, author, dates, site name, keywords, image, language) from the page; returns only `{ url }` when extraction fails.
21
+ */
22
+ extractMetadata(html: string, url: string): PageMetadata;
23
+ /**
24
+ * Extract keywords from the `meta[name="keywords"]` tag by splitting its content on commas and trimming each entry.
25
+ */
26
+ private extractKeywords;
27
+ }
28
+ //# sourceMappingURL=extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,cAAc,EAAE,gBAAgB,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAGxF,qBAAa,gBAAgB;IAC3B;;OAEG;IACH,OAAO,CAAC,oBAAoB;IAS5B;;OAEG;IACH,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,GAAG,gBAAgB;IAwC7E;;OAEG;IACH,OAAO,CAAC,iBAAiB;IAUzB;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAsB9B;;OAEG;IACH,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,YAAY;IAyDxD;;OAEG;IACH,OAAO,CAAC,eAAe;CASxB"}
@@ -0,0 +1,151 @@
1
+ import { JSDOM, VirtualConsole } from 'jsdom';
2
+ import { Readability } from '@mozilla/readability';
3
+ import { ExtractionError } from '../types/index.js';
4
+ export class ContentExtractor {
5
+ /**
6
+ * Create a virtual console that suppresses CSS errors
7
+ */
8
+ createVirtualConsole() {
9
+ const virtualConsole = new VirtualConsole();
10
+ // Suppress CSS parsing errors (they don't affect content extraction)
11
+ virtualConsole.on('error', () => {
12
+ // Ignore errors
13
+ });
14
+ return virtualConsole;
15
+ }
16
+ /**
17
+ * Extract content from HTML
18
+ */
19
+ extract(html, url, options) {
20
+ const { useReadability, selector } = options;
21
+ try {
22
+ const dom = new JSDOM(html, {
23
+ url,
24
+ virtualConsole: this.createVirtualConsole()
25
+ });
26
+ const document = dom.window.document;
27
+ // 提取元数据
28
+ const metadata = this.extractMetadata(html, url);
29
+ let content;
30
+ // 如果指定了选择器,使用选择器提取
31
+ if (selector) {
32
+ content = this.extractBySelector(document, selector);
33
+ }
34
+ // 如果禁用了 readability,返回整个 body
35
+ else if (!useReadability) {
36
+ content = document.body.innerHTML || '';
37
+ }
38
+ // 使用 readability 提取主要内容
39
+ else {
40
+ content = this.extractWithReadability(document, url);
41
+ }
42
+ return {
43
+ content,
44
+ metadata
45
+ };
46
+ }
47
+ catch (error) {
48
+ throw new ExtractionError(url, `Failed to extract content: ${error.message}`);
49
+ }
50
+ }
51
+ /**
52
+ * Extract content using CSS selector
53
+ */
54
+ extractBySelector(document, selector) {
55
+ const element = document.querySelector(selector);
56
+ if (!element) {
57
+ throw new Error(`Selector "${selector}" not found in the document`);
58
+ }
59
+ return element.innerHTML;
60
+ }
61
+ /**
62
+ * Extract content using Mozilla Readability
63
+ */
64
+ extractWithReadability(document, url) {
65
+ // Clone document for readability (它会修改 DOM)
66
+ const documentClone = document.cloneNode(true);
67
+ const reader = new Readability(documentClone, {
68
+ debug: false,
69
+ maxElemsToParse: 0, // 无限制
70
+ nbTopCandidates: 5,
71
+ charThreshold: 500
72
+ });
73
+ const article = reader.parse();
74
+ if (!article || !article.content) {
75
+ // 如果 readability 失败,回退到原始 body
76
+ console.warn(`Readability failed for ${url}, falling back to full body content`);
77
+ return document.body.innerHTML || '';
78
+ }
79
+ return article.content;
80
+ }
81
+ /**
82
+ * Extract metadata from the page
83
+ */
84
+ extractMetadata(html, url) {
85
+ try {
86
+ const dom = new JSDOM(html, {
87
+ url,
88
+ virtualConsole: this.createVirtualConsole()
89
+ });
90
+ const document = dom.window.document;
91
+ // 使用 readability 获取元数据
92
+ const documentClone = document.cloneNode(true);
93
+ const reader = new Readability(documentClone);
94
+ const article = reader.parse();
95
+ // Helper function to get meta content
96
+ const getMeta = (selectors) => {
97
+ for (const selector of selectors) {
98
+ const element = document.querySelector(selector);
99
+ const content = element?.getAttribute('content') || element?.getAttribute('value');
100
+ if (content)
101
+ return content;
102
+ }
103
+ return undefined;
104
+ };
105
+ // Extract comprehensive metadata
106
+ const metadata = {
107
+ url,
108
+ title: article?.title ||
109
+ getMeta(['meta[property="og:title"]', 'meta[name="twitter:title"]']) ||
110
+ document.querySelector('title')?.textContent ||
111
+ undefined,
112
+ description: article?.excerpt ||
113
+ getMeta(['meta[name="description"]', 'meta[property="og:description"]', 'meta[name="twitter:description"]']) ||
114
+ undefined,
115
+ author: article?.byline ||
116
+ getMeta(['meta[name="author"]', 'meta[property="article:author"]']) ||
117
+ undefined,
118
+ publishedTime: getMeta(['meta[property="article:published_time"]', 'meta[name="publish_date"]', 'meta[property="og:published_time"]']) ||
119
+ undefined,
120
+ modifiedTime: getMeta(['meta[property="article:modified_time"]', 'meta[property="og:updated_time"]']) ||
121
+ undefined,
122
+ siteName: article?.siteName ||
123
+ getMeta(['meta[property="og:site_name"]']) ||
124
+ undefined,
125
+ keywords: this.extractKeywords(document),
126
+ image: getMeta(['meta[property="og:image"]', 'meta[name="twitter:image"]']) ||
127
+ undefined,
128
+ lang: document.documentElement.lang ||
129
+ getMeta(['meta[property="og:locale"]']) ||
130
+ undefined
131
+ };
132
+ return metadata;
133
+ }
134
+ catch (error) {
135
+ return { url };
136
+ }
137
+ }
138
+ /**
139
+ * Extract keywords from meta tags
140
+ */
141
+ extractKeywords(document) {
142
+ const keywordsContent = document.querySelector('meta[name="keywords"]')?.getAttribute('content');
143
+ if (!keywordsContent)
144
+ return undefined;
145
+ return keywordsContent
146
+ .split(',')
147
+ .map(k => k.trim())
148
+ .filter(k => k.length > 0);
149
+ }
150
+ }
151
+ //# sourceMappingURL=extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,cAAc,EAAE,MAAM,OAAO,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEpD,MAAM,OAAO,gBAAgB;IAC3B;;OAEG;IACK,oBAAoB;QAC1B,MAAM,cAAc,GAAG,IAAI,cAAc,EAAE,CAAC;QAC5C,qEAAqE;QACrE,cAAc,CAAC,EAAE,CAAC,OAAO,EAAE,GAAG,EAAE;YAC9B,gBAAgB;QAClB,CAAC,CAAC,CAAC;QACH,OAAO,cAAc,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,OAAO,CAAC,IAAY,EAAE,GAAW,EAAE,OAAuB;QACxD,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;QAE7C,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;gBAC1B,GAAG;gBACH,cAAc,EAAE,IAAI,CAAC,oBAAoB,EAAE;aAC5C,CAAC,CAAC;YACH,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,QAAQ;YACR,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;YAEjD,IAAI,OAAe,CAAC;YAEpB,mBAAmB;YACnB,IAAI,QAAQ,EAAE,CAAC;gBACb,OAAO,GAAG,IAAI,CAAC,iBAAiB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;YACvD,CAAC;YACD,8BAA8B;iBACzB,IAAI,CAAC,cAAc,EAAE,CAAC;gBACzB,OAAO,GAAG,QAAQ,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;YAC1C,CAAC;YACD,wBAAwB;iBACnB,CAAC;gBACJ,OAAO,GAAG,IAAI,CAAC,sBAAsB,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;YACvD,CAAC;YAED,OAAO;gBACL,OAAO;gBACP,QAAQ;aACT,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,eAAe,CACvB,GAAG,EACH,8BAA+B,KAAe,CAAC,OAAO,EAAE,CACzD,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACK,iBAAiB,CAAC,QAAkB,EAAE,QAAgB;QAC5D,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;QAEjD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,6BAA6B,CAAC,CAAC;QACtE,CAAC;QAED,OAAO,OAAO,CAAC,SAAS,CAAC;IAC3B,CAAC;IAED;;OAEG;IACK,sBAAsB,CAAC,QAAkB,EAAE,GAAW;QAC5D,4CAA4C;QAC5C,MAAM,aAAa,GAAG,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC;QAE3D,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,aAAa,EAAE;YAC5C,KAAK,EAAE,KAAK;YACZ,eAAe,EAAE,CAAC,EAAE,MAAM;YAC1B,eAAe,EAAE,CAAC;YAClB,aAAa,EAAE,GAAG;SACnB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YACjC,+BAA+B;YAC/B,OAAO,CAAC,IAAI,CAAC,0BAA0B,GAAG,qCAAqC,
CAAC,CAAC;YACjF,OAAO,QAAQ,CAAC,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;QACvC,CAAC;QAED,OAAO,OAAO,CAAC,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,IAAY,EAAE,GAAW;QACvC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE;gBAC1B,GAAG;gBACH,cAAc,EAAE,IAAI,CAAC,oBAAoB,EAAE;aAC5C,CAAC,CAAC;YACH,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,uBAAuB;YACvB,MAAM,aAAa,GAAG,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC;YAC3D,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,aAAa,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,sCAAsC;YACtC,MAAM,OAAO,GAAG,CAAC,SAAmB,EAAsB,EAAE;gBAC1D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;oBACjC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;oBACjD,MAAM,OAAO,GAAG,OAAO,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,OAAO,EAAE,YAAY,CAAC,OAAO,CAAC,CAAC;oBACnF,IAAI,OAAO;wBAAE,OAAO,OAAO,CAAC;gBAC9B,CAAC;gBACD,OAAO,SAAS,CAAC;YACnB,CAAC,CAAC;YAEF,iCAAiC;YACjC,MAAM,QAAQ,GAAiB;gBAC7B,GAAG;gBACH,KAAK,EAAE,OAAO,EAAE,KAAK;oBACd,OAAO,CAAC,CAAC,2BAA2B,EAAE,4BAA4B,CAAC,CAAC;oBACpE,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW;oBAC5C,SAAS;gBAChB,WAAW,EAAE,OAAO,EAAE,OAAO;oBACjB,OAAO,CAAC,CAAC,0BAA0B,EAAE,iCAAiC,EAAE,kCAAkC,CAAC,CAAC;oBAC5G,SAAS;gBACrB,MAAM,EAAE,OAAO,EAAE,MAAM;oBAChB,OAAO,CAAC,CAAC,qBAAqB,EAAE,iCAAiC,CAAC,CAAC;oBACnE,SAAS;gBAChB,aAAa,EAAE,OAAO,CAAC,CAAC,yCAAyC,EAAE,2BAA2B,EAAE,oCAAoC,CAAC,CAAC;oBACxH,SAAS;gBACvB,YAAY,EAAE,OAAO,CAAC,CAAC,wCAAwC,EAAE,kCAAkC,CAAC,CAAC;oBACxF,SAAS;gBACtB,QAAQ,EAAE,OAAO,EAAE,QAAQ;oBAClB,OAAO,CAAC,CAAC,+BAA+B,CAAC,CAAC;oBAC1C,SAAS;gBAClB,QAAQ,EAAE,IAAI,CAAC,eAAe,CAAC,QAAQ,CAAC;gBACxC,KAAK,EAAE,OAAO,CAAC,CAAC,2BAA2B,EAAE,4BAA4B,CAAC,CAAC;oBACrE,SAAS;gBACf,IAAI,EAAE,QAAQ,CAAC,eAAe,CAAC,IAAI;oBAC9B,OAAO,CAAC,CAAC,4BAA4B,CAAC,CAAC;oBACvC,SAAS;aACf,CAAC;YAEF,OAAO,QAAQ,CAAC;QAClB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,EAAE,GAAG,EAAE,CAAC;QACjB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,QAAkB;QACxC,MAAM,eAAe,GAAG,QAAQ,CAAC,aAAa,CAAC,uBAAuB,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QACjG,IAAI,CAAC,eAAe;YAAE,OAAO,SAAS,CAAC;QAEvC,OAAO,eAAe;aACnB,KAAK,CAAC,GAAG,CAAC;aACV,GAAG,CAA
C,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC/B,CAAC;CACF"}
@@ -0,0 +1,24 @@
1
+ import type { FetchOptions } from '../types/index.js';
2
+ export declare class Fetcher {
3
+ /**
4
+ * Get the proxy URL for a request: an explicit proxy option wins, otherwise NO_PROXY is checked and then HTTPS_PROXY / HTTP_PROXY (upper- or lower-case) are read from the environment.
5
+ */
6
+ private getProxyUrl;
7
+ /**
8
+ * Fetch HTML content from a URL, retrying failed attempts with exponential backoff; rejects with a FetchError when every attempt fails.
9
+ */
10
+ fetch(url: string, options?: FetchOptions): Promise<string>;
11
+ /**
12
+ * Fetch multiple URLs; resolves with one entry per URL in input order carrying either `html` or the captured `error` (failures are not thrown).
13
+ */
14
+ fetchBatch(urls: string[], options?: FetchOptions, concurrent?: number): Promise<Array<{
15
+ url: string;
16
+ html?: string;
17
+ error?: Error;
18
+ }>>;
19
+ /**
20
+ * Sleep helper for retry delays: resolves after `ms` milliseconds.
21
+ */
22
+ private sleep;
23
+ }
24
+ //# sourceMappingURL=fetcher.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAKtD,qBAAa,OAAO;IAClB;;OAEG;IACH,OAAO,CAAC,WAAW;IAgCnB;;OAEG;IACG,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,OAAO,CAAC,MAAM,CAAC;IAoErE;;OAEG;IACG,UAAU,CACd,IAAI,EAAE,MAAM,EAAE,EACd,OAAO,GAAE,YAAiB,EAC1B,UAAU,GAAE,MAAU,GACrB,OAAO,CAAC,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,KAAK,CAAA;KAAE,CAAC,CAAC;IAiBhE;;OAEG;IACH,OAAO,CAAC,KAAK;CAGd"}
@@ -0,0 +1,111 @@
1
+ import { FetchError } from '../types/index.js';
2
+ import { DEFAULT_TIMEOUT, DEFAULT_USER_AGENT, RETRY_ATTEMPTS, RETRY_DELAY } from '../constants.js';
3
+ import { ProxyAgent } from 'undici';
4
+ export class Fetcher {
5
+ /**
6
+ * Get proxy URL from environment variables or options
7
+ */
8
+ getProxyUrl(url, proxyOption) {
9
+ // 优先使用命令行参数中的代理
10
+ if (proxyOption) {
11
+ return proxyOption;
12
+ }
13
+ // 从环境变量读取代理
14
+ const urlObj = new URL(url);
15
+ const protocol = urlObj.protocol;
16
+ // 检查 NO_PROXY
17
+ const noProxy = process.env.NO_PROXY || process.env.no_proxy;
18
+ if (noProxy) {
19
+ const noProxyList = noProxy.split(',').map(s => s.trim());
20
+ if (noProxyList.some(pattern => {
21
+ if (pattern === '*')
22
+ return true;
23
+ if (pattern.startsWith('.'))
24
+ return urlObj.hostname.endsWith(pattern);
25
+ return urlObj.hostname === pattern;
26
+ })) {
27
+ return undefined;
28
+ }
29
+ }
30
+ // 根据协议选择代理
31
+ if (protocol === 'https:') {
32
+ return process.env.HTTPS_PROXY || process.env.https_proxy ||
33
+ process.env.HTTP_PROXY || process.env.http_proxy;
34
+ }
35
+ else {
36
+ return process.env.HTTP_PROXY || process.env.http_proxy;
37
+ }
38
+ }
39
+ /**
40
+ * Fetch HTML content from a URL
41
+ */
42
+ async fetch(url, options = {}) {
43
+ const { headers = {}, timeout = DEFAULT_TIMEOUT, userAgent = DEFAULT_USER_AGENT, proxy } = options;
44
+ // 获取代理 URL
45
+ const proxyUrl = this.getProxyUrl(url, proxy);
46
+ // 构建 fetch 选项
47
+ const fetchOptions = {
48
+ headers: {
49
+ 'User-Agent': userAgent,
50
+ ...headers
51
+ },
52
+ signal: AbortSignal.timeout(timeout)
53
+ };
54
+ // 如果有代理,使用 ProxyAgent
55
+ if (proxyUrl) {
56
+ fetchOptions.dispatcher = new ProxyAgent(proxyUrl);
57
+ }
58
+ let lastError = null;
59
+ // 重试逻辑
60
+ for (let attempt = 0; attempt < RETRY_ATTEMPTS; attempt++) {
61
+ try {
62
+ const response = await fetch(url, fetchOptions);
63
+ if (!response.ok) {
64
+ throw new FetchError(url, response.status, `HTTP ${response.status}: ${response.statusText}`);
65
+ }
66
+ const html = await response.text();
67
+ return html;
68
+ }
69
+ catch (error) {
70
+ lastError = error;
71
+ // 如果是最后一次尝试,抛出错误
72
+ if (attempt === RETRY_ATTEMPTS - 1) {
73
+ break;
74
+ }
75
+ // 指数退避:等待时间 = RETRY_DELAY * 2^attempt
76
+ const delay = RETRY_DELAY * Math.pow(2, attempt);
77
+ await this.sleep(delay);
78
+ }
79
+ }
80
+ // 所有重试都失败了
81
+ if (lastError instanceof FetchError) {
82
+ throw lastError;
83
+ }
84
+ throw new FetchError(url, undefined, `Failed to fetch after ${RETRY_ATTEMPTS} attempts: ${lastError?.message}`);
85
+ }
86
+ /**
87
+ * Fetch multiple URLs with concurrency control (for future batch processing)
88
+ */
89
+ async fetchBatch(urls, options = {}, concurrent = 3) {
90
+ // 这个方法在 Phase 3 批量处理时实现
91
+ // 目前先返回一个简单的顺序实现
92
+ const results = [];
93
+ for (const url of urls) {
94
+ try {
95
+ const html = await this.fetch(url, options);
96
+ results.push({ url, html });
97
+ }
98
+ catch (error) {
99
+ results.push({ url, error: error });
100
+ }
101
+ }
102
+ return results;
103
+ }
104
+ /**
105
+ * Sleep helper for retry delays
106
+ */
107
+ sleep(ms) {
108
+ return new Promise(resolve => setTimeout(resolve, ms));
109
+ }
110
+ }
111
+ //# sourceMappingURL=fetcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/core/fetcher.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AACnG,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AAEpC,MAAM,OAAO,OAAO;IAClB;;OAEG;IACK,WAAW,CAAC,GAAW,EAAE,WAAoB;QACnD,gBAAgB;QAChB,IAAI,WAAW,EAAE,CAAC;YAChB,OAAO,WAAW,CAAC;QACrB,CAAC;QAED,YAAY;QACZ,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;QAEjC,cAAc;QACd,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,QAAQ,IAAI,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC;QAC7D,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC1D,IAAI,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE;gBAC7B,IAAI,OAAO,KAAK,GAAG;oBAAE,OAAO,IAAI,CAAC;gBACjC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;oBAAE,OAAO,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;gBACtE,OAAO,MAAM,CAAC,QAAQ,KAAK,OAAO,CAAC;YACrC,CAAC,CAAC,EAAE,CAAC;gBACH,OAAO,SAAS,CAAC;YACnB,CAAC;QACH,CAAC;QAED,WAAW;QACX,IAAI,QAAQ,KAAK,QAAQ,EAAE,CAAC;YAC1B,OAAO,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,OAAO,CAAC,GAAG,CAAC,WAAW;gBAClD,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QAC1D,CAAC;aAAM,CAAC;YACN,OAAO,OAAO,CAAC,GAAG,CAAC,UAAU,IAAI,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC;QAC1D,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,KAAK,CAAC,GAAW,EAAE,UAAwB,EAAE;QACjD,MAAM,EACJ,OAAO,GAAG,EAAE,EACZ,OAAO,GAAG,eAAe,EACzB,SAAS,GAAG,kBAAkB,EAC9B,KAAK,EACN,GAAG,OAAO,CAAC;QAEZ,WAAW;QACX,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAE9C,cAAc;QACd,MAAM,YAAY,GAA8C;YAC9D,OAAO,EAAE;gBACP,YAAY,EAAE,SAAS;gBACvB,GAAG,OAAO;aACX;YACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC;SACrC,CAAC;QAEF,sBAAsB;QACtB,IAAI,QAAQ,EAAE,CAAC;YACb,YAAY,CAAC,UAAU,GAAG,IAAI,UAAU,CAAC,QAAQ,CAAC,CAAC;QACrD,CAAC;QAED,IAAI,SAAS,GAAiB,IAAI,CAAC;QAEnC,OAAO;QACP,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,GAAG,cAAc,EAAE,OAAO,EAAE,EAAE,CAAC;YAC1D,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE,Y
AAY,CAAC,CAAC;gBAEhD,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;oBACjB,MAAM,IAAI,UAAU,CAClB,GAAG,EACH,QAAQ,CAAC,MAAM,EACf,QAAQ,QAAQ,CAAC,MAAM,KAAK,QAAQ,CAAC,UAAU,EAAE,CAClD,CAAC;gBACJ,CAAC;gBAED,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACnC,OAAO,IAAI,CAAC;YACd,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,SAAS,GAAG,KAAc,CAAC;gBAE3B,iBAAiB;gBACjB,IAAI,OAAO,KAAK,cAAc,GAAG,CAAC,EAAE,CAAC;oBACnC,MAAM;gBACR,CAAC;gBAED,sCAAsC;gBACtC,MAAM,KAAK,GAAG,WAAW,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBACjD,MAAM,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QAED,WAAW;QACX,IAAI,SAAS,YAAY,UAAU,EAAE,CAAC;YACpC,MAAM,SAAS,CAAC;QAClB,CAAC;QAED,MAAM,IAAI,UAAU,CAClB,GAAG,EACH,SAAS,EACT,yBAAyB,cAAc,cAAc,SAAS,EAAE,OAAO,EAAE,CAC1E,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CACd,IAAc,EACd,UAAwB,EAAE,EAC1B,aAAqB,CAAC;QAEtB,wBAAwB;QACxB,iBAAiB;QACjB,MAAM,OAAO,GAAyD,EAAE,CAAC;QAEzE,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;gBAC5C,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC;YAC9B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,KAAc,EAAE,CAAC,CAAC;YAC/C,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,EAAU;QACtB,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;IACzD,CAAC;CACF"}
@@ -0,0 +1,22 @@
1
+ import type { ProcessOptions, FetchResult } from '../types/index.js';
2
+ export declare class ContentProcessor {
3
+ private fetcher;
4
+ private browserFetcher;
5
+ private extractor;
6
+ private converter;
7
+ constructor();
8
+ /**
9
+ * Process a single URL: fetch → extract → convert. Resolves with a string (presumably the generated Markdown; the implementation is not visible here — confirm).
10
+ */
11
+ process(url: string, options: ProcessOptions): Promise<string>;
12
+ /**
13
+ * Process multiple URLs (batch mode), resolving with one FetchResult per processed URL (result shape declared in ../types — confirm).
14
+ * Will be enhanced in Phase 3 with concurrency control
15
+ */
16
+ processBatch(urls: string[], options: ProcessOptions): Promise<FetchResult[]>;
17
+ /**
18
+ * Cleanup resources (close browser if opened); call it once processing is finished.
19
+ */
20
+ cleanup(): Promise<void>;
21
+ }
22
+ //# sourceMappingURL=processor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processor.d.ts","sourceRoot":"","sources":["../../src/core/processor.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,cAAc,EAAE,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAErE,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,OAAO,CAAU;IACzB,OAAO,CAAC,cAAc,CAA+B;IACrD,OAAO,CAAC,SAAS,CAAmB;IACpC,OAAO,CAAC,SAAS,CAAoB;;IAQrC;;OAEG;IACG,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC;IAiEpE;;;OAGG;IACG,YAAY,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;IAuBnF;;OAEG;IACG,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAM/B"}