botrun-crawler-2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +126 -0
  2. package/dist/cli.d.ts +10 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +155 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/crawler/cli.d.ts +19 -0
  7. package/dist/crawler/cli.d.ts.map +1 -0
  8. package/dist/crawler/cli.js +179 -0
  9. package/dist/crawler/cli.js.map +1 -0
  10. package/dist/crawler/index.d.ts +146 -0
  11. package/dist/crawler/index.d.ts.map +1 -0
  12. package/dist/crawler/index.js +670 -0
  13. package/dist/crawler/index.js.map +1 -0
  14. package/dist/index.d.ts +17 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +20 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/lib/agent.d.ts +34 -0
  19. package/dist/lib/agent.d.ts.map +1 -0
  20. package/dist/lib/agent.js +73 -0
  21. package/dist/lib/agent.js.map +1 -0
  22. package/dist/lib/cache.d.ts +49 -0
  23. package/dist/lib/cache.d.ts.map +1 -0
  24. package/dist/lib/cache.js +141 -0
  25. package/dist/lib/cache.js.map +1 -0
  26. package/dist/lib/filename-decoder.d.ts +62 -0
  27. package/dist/lib/filename-decoder.d.ts.map +1 -0
  28. package/dist/lib/filename-decoder.js +229 -0
  29. package/dist/lib/filename-decoder.js.map +1 -0
  30. package/dist/lib/http-client.d.ts +86 -0
  31. package/dist/lib/http-client.d.ts.map +1 -0
  32. package/dist/lib/http-client.js +373 -0
  33. package/dist/lib/http-client.js.map +1 -0
  34. package/dist/lib/index.d.ts +15 -0
  35. package/dist/lib/index.d.ts.map +1 -0
  36. package/dist/lib/index.js +19 -0
  37. package/dist/lib/index.js.map +1 -0
  38. package/dist/lib/logger.d.ts +41 -0
  39. package/dist/lib/logger.d.ts.map +1 -0
  40. package/dist/lib/logger.js +122 -0
  41. package/dist/lib/logger.js.map +1 -0
  42. package/dist/lib/scene-detector.d.ts +92 -0
  43. package/dist/lib/scene-detector.d.ts.map +1 -0
  44. package/dist/lib/scene-detector.js +297 -0
  45. package/dist/lib/scene-detector.js.map +1 -0
  46. package/dist/processors/audio.d.ts +20 -0
  47. package/dist/processors/audio.d.ts.map +1 -0
  48. package/dist/processors/audio.js +110 -0
  49. package/dist/processors/audio.js.map +1 -0
  50. package/dist/processors/base.d.ts +53 -0
  51. package/dist/processors/base.d.ts.map +1 -0
  52. package/dist/processors/base.js +194 -0
  53. package/dist/processors/base.js.map +1 -0
  54. package/dist/processors/data.d.ts +48 -0
  55. package/dist/processors/data.d.ts.map +1 -0
  56. package/dist/processors/data.js +206 -0
  57. package/dist/processors/data.js.map +1 -0
  58. package/dist/processors/document.d.ts +20 -0
  59. package/dist/processors/document.d.ts.map +1 -0
  60. package/dist/processors/document.js +137 -0
  61. package/dist/processors/document.js.map +1 -0
  62. package/dist/processors/image.d.ts +20 -0
  63. package/dist/processors/image.d.ts.map +1 -0
  64. package/dist/processors/image.js +92 -0
  65. package/dist/processors/image.js.map +1 -0
  66. package/dist/processors/index.d.ts +53 -0
  67. package/dist/processors/index.d.ts.map +1 -0
  68. package/dist/processors/index.js +177 -0
  69. package/dist/processors/index.js.map +1 -0
  70. package/dist/processors/text.d.ts +44 -0
  71. package/dist/processors/text.d.ts.map +1 -0
  72. package/dist/processors/text.js +262 -0
  73. package/dist/processors/text.js.map +1 -0
  74. package/dist/processors/video.d.ts +20 -0
  75. package/dist/processors/video.d.ts.map +1 -0
  76. package/dist/processors/video.js +93 -0
  77. package/dist/processors/video.js.map +1 -0
  78. package/dist/scraper/cli.d.ts +23 -0
  79. package/dist/scraper/cli.d.ts.map +1 -0
  80. package/dist/scraper/cli.js +118 -0
  81. package/dist/scraper/cli.js.map +1 -0
  82. package/dist/scraper/index.d.ts +120 -0
  83. package/dist/scraper/index.d.ts.map +1 -0
  84. package/dist/scraper/index.js +372 -0
  85. package/dist/scraper/index.js.map +1 -0
  86. package/dist/types/index.d.ts +123 -0
  87. package/dist/types/index.d.ts.map +1 -0
  88. package/dist/types/index.js +40 -0
  89. package/dist/types/index.js.map +1 -0
  90. package/package.json +108 -0
package/README.md ADDED
@@ -0,0 +1,126 @@
1
+ # botrun-crawler
2
+
3
+ 智慧網頁爬蟲 - 支援動態下載連結偵測、MIME type 分析、政府網站相容
4
+
5
+ ## 安裝
6
+
7
+ ```bash
8
+ npm install -g botrun-crawler
9
+ ```
10
+
11
+ ## CLI 使用方法
12
+
13
+ ### 基本爬取
14
+
15
+ ```bash
16
+ # 單頁完整爬取
17
+ botrun-crawler crawl "https://example.com"
18
+
19
+ # JSON 格式輸出
20
+ botrun-crawler crawl "https://example.com" --json
21
+
22
+ # 安靜模式
23
+ botrun-crawler crawl "https://example.com" -q
24
+ ```
25
+
26
+ ### 儲存結果
27
+
28
+ ```bash
29
+ # 儲存 HTML
30
+ botrun-crawler crawl "https://example.com" --save-html page.html
31
+
32
+ # 儲存純文字(LLM 友善格式)
33
+ botrun-crawler crawl "https://example.com" --save-text page.txt
34
+
35
+ # 儲存下載連結清單
36
+ botrun-crawler crawl "https://example.com" --save-downloads urls.txt
37
+
38
+ # 全部儲存
39
+ botrun-crawler crawl "https://example.com" \
40
+ --save-html page.html \
41
+ --save-text page.txt \
42
+ --save-downloads urls.txt
43
+ ```
44
+
45
+ ### 效能選項
46
+
47
+ ```bash
48
+ # 跳過 MIME type 分析(加快速度)
49
+ botrun-crawler crawl "https://example.com" --skip-mime
50
+ ```
51
+
52
+ ## 命令總覽
53
+
54
+ | 命令 | 說明 |
55
+ |------|------|
56
+ | `crawl <url>` | 單頁完整爬取(推薦) |
57
+ | `scrape <url>` | 簡易版抓取 |
58
+ | `scrape-batch <file>` | 批次抓取多個網址 |
59
+ | `process <file>` | 處理本地檔案 |
60
+ | `help` | 顯示說明 |
61
+
62
+ ## crawl 選項
63
+
64
+ | 選項 | 說明 |
65
+ |------|------|
66
+ | `--json` | JSON 格式輸出 |
67
+ | `--save-html <file>` | 儲存原始 HTML |
68
+ | `--save-text <file>` | 儲存純文字 |
69
+ | `--save-downloads <file>` | 儲存下載連結清單 |
70
+ | `--skip-mime` | 跳過 MIME type 分析 |
71
+ | `-v, --verbose` | 詳細模式 |
72
+ | `-q, --quiet` | 安靜模式 |
73
+
74
+ ## 輸出範例
75
+
76
+ ```
77
+ ========================================
78
+ 單頁完整爬取結果
79
+ ========================================
80
+ 網址: https://www.ida.gov.tw/...
81
+ 標題: 經濟部產業發展署
82
+
83
+ 連結統計
84
+ ├─ 下載連結: 475 個
85
+ │ ├─ PDF: 120
86
+ │ ├─ Word: 85
87
+ │ └─ 動態連結: 270
88
+ ├─ 連外連結: 22 個
89
+ └─ 內部連結: 675 個
90
+ ```
91
+
92
+ ## Proxy 設定
93
+
94
+ ```bash
95
+ export HTTPS_PROXY="http://proxy:8080"
96
+ export HTTP_PROXY="http://proxy:8080"
97
+ export NO_PROXY="localhost,.internal.com"
98
+ ```
99
+
100
+ ## 程式庫使用
101
+
102
+ ```typescript
103
+ import { crawlPage } from 'botrun-crawler/crawler';
104
+
105
+ const result = await crawlPage('https://example.com');
106
+ console.log(result.links.downloads.count);
107
+ ```
108
+
109
+ ### 子模組
110
+
111
+ ```typescript
112
+ import { PageCrawler, crawlPage } from 'botrun-crawler/crawler';
113
+ import { scrapeUrl } from 'botrun-crawler/scraper';
114
+ import { fetchWithProxy, fetchMimeType } from 'botrun-crawler/lib';
115
+ ```
116
+
117
+ ## 特色
118
+
119
+ - **動態下載連結偵測**:自動識別 `download.aspx`、`getFile.do` 等動態連結
120
+ - **MIME Type 分析**:自動分類 PDF/Word/Excel 等檔案類型
121
+ - **檔名亂碼修正**:自動修正 UTF-8 編碼問題
122
+ - **政府網站相容**:針對台灣政府網站最佳化
123
+
124
+ ## License
125
+
126
+ MIT
package/dist/cli.d.ts ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * 耳(Ear)- 任意格式處理器 CLI
4
+ * 零幻覺四部曲之一:將任意格式轉換為 AI 可讀文字
5
+ *
6
+ * @version 1.0.0
7
+ * @author 永恆知己系統
8
+ */
9
+ export {};
10
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;;;;;GAMG"}
package/dist/cli.js ADDED
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * 耳(Ear)- 任意格式處理器 CLI
4
+ * 零幻覺四部曲之一:將任意格式轉換為 AI 可讀文字
5
+ *
6
+ * @version 1.0.0
7
+ * @author 永恆知己系統
8
+ */
9
+ import { Command } from "commander";
10
+ import * as path from "path";
11
+ import * as fs from "fs";
12
+ import { ProcessorRouter } from "./processors/index.js";
13
+ import { CacheService } from "./lib/cache.js";
14
+ import { LoggerService } from "./lib/logger.js";
15
+ import { AgentService } from "./lib/agent.js";
16
+ import { runScrapeCommand, runScrapeBatchCommand } from "./scraper/cli.js";
17
+ import { runCrawlCommand } from "./crawler/cli.js";
18
+ // ============================================================
19
+ // CLI 程式
20
+ // ============================================================
21
// Root commander instance; every subcommand in this file is registered on it.
const program = new Command();
program.name("ear");
program.description("耳(Ear)- 任意格式處理器\n零幻覺四部曲之一:將任意格式轉換為 AI 可讀文字");
program.version("1.0.0");
26
+ // ============================================================
27
+ // process 命令
28
+ // ============================================================
29
// `process <target>`: run the processor router over one file or over every file in a directory.
program
    .command("process <target>")
    .description("處理單一檔案或目錄中的所有檔案")
    .option("-o, --output <dir>", "指定輸出目錄")
    .option("-f, --format <format>", "強制指定輸入格式")
    .option("-m, --model <model>", "指定 AI 模型", "claude-haiku-4-5")
    .option("-j, --jobs <n>", "並行處理數量", "4")
    .option("--no-cache", "忽略快取,強制重新處理")
    .option("--dry-run", "模擬執行,不實際處理")
    .option("-v, --verbose", "顯示詳細日誌")
    .action(async (target, options) => {
        const logger = new LoggerService(options.verbose ?? false);
        const cache = new CacheService(logger);
        const agent = new AgentService(options.model ?? "claude-haiku-4-5", logger);
        const router = new ProcessorRouter(cache, logger, agent);

        // commander stores a negatable `--no-cache` flag as `options.cache`
        // (true by default, false when the flag is given). It never sets
        // `options.noCache`, so the flag has to be derived from `cache`.
        const noCache = options.cache === false;
        const dryRun = options.dryRun ?? false;

        // Fall back to the documented default when `--jobs` is not a number,
        // instead of handing NaN to the router.
        const parsedJobs = parseInt(String(options.jobs ?? 4), 10);
        const jobs = Number.isNaN(parsedJobs) ? 4 : parsedJobs;

        try {
            logger.info(`開始處理:${target}`);
            const targetPath = path.resolve(target);
            if (!fs.existsSync(targetPath)) {
                logger.error(`檔案或目錄不存在:${targetPath}`);
                process.exit(1);
            }
            const stats = fs.statSync(targetPath);
            if (stats.isDirectory()) {
                // Directory: process every file, `jobs` at a time.
                await router.processDirectory(targetPath, {
                    jobs,
                    noCache,
                    dryRun,
                    outputDir: options.output,
                });
            }
            else {
                // Single file: report the output path, or exit non-zero on failure.
                const result = await router.processFile(targetPath, {
                    noCache,
                    dryRun,
                    forceFormat: options.format,
                    outputDir: options.output,
                });
                if (result.success) {
                    logger.success(`處理完成:${result.outputPath}`);
                }
                else {
                    logger.error(`處理失敗:${result.error}`);
                    process.exit(1);
                }
            }
        }
        catch (error) {
            logger.error(`處理錯誤:${error instanceof Error ? error.message : String(error)}`);
            process.exit(1);
        }
    });
83
+ // ============================================================
84
+ // ask 命令(自然語言驅動)
85
+ // ============================================================
86
// `ask <query>`: send a natural-language query to the agent and print its answer.
program
    .command("ask <query>")
    .description("使用自然語言描述任務")
    .option("-m, --model <model>", "指定 AI 模型", "claude-haiku-4-5")
    .option("-v, --verbose", "顯示詳細日誌")
    .action(async (query, options) => {
        const verbose = options.verbose ?? false;
        const modelName = options.model ?? "claude-haiku-4-5";
        const logger = new LoggerService(verbose);
        const agent = new AgentService(modelName, logger);
        try {
            logger.info(`自然語言查詢:${query}`);
            const answer = await agent.ask(query);
            // The leading newline separates the answer from the log line above.
            console.log("\n" + answer);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            logger.error(`查詢錯誤:${reason}`);
            process.exit(1);
        }
    });
104
+ // ============================================================
105
+ // crawl 命令(單頁完整爬取)
106
+ // ============================================================
107
// `crawl <url>`: full single-page crawl; all the work is delegated to runCrawlCommand.
program
    .command("crawl <url>")
    .description("單頁完整爬取(含結構分析、下載清單、變更追蹤)")
    .option("--json", "以 JSON 格式輸出")
    .option("--save-html <file>", "儲存原始 HTML")
    .option("--save-text <file>", "儲存純文字(LLM 友善格式)")
    .option("--save-downloads <file>", "儲存下載連結清單")
    .option("--skip-mime", "跳過 MIME type 分析(預設會分析)")
    .option("-v, --verbose", "顯示詳細資訊")
    .option("-q, --quiet", "安靜模式")
    .action(async function handleCrawl(url, options) {
        await runCrawlCommand(url, options);
    });
120
+ // ============================================================
121
+ // scrape 命令(網頁抓取與分析)- 簡易版,向後相容
122
+ // ============================================================
123
// `scrape <url>`: simpler fetch-and-analyze command; delegates to runScrapeCommand.
program
    .command("scrape <url>")
    .description("抓取網頁並分析")
    .option("--json", "以 JSON 格式輸出")
    .option("--save-html <file>", "儲存原始 HTML")
    .option("--save-text <file>", "儲存純文字(LLM 友善格式)")
    .option("-v, --verbose", "顯示詳細資訊")
    .option("-q, --quiet", "安靜模式")
    .action(async function handleScrape(url, options) {
        await runScrapeCommand(url, options);
    });
134
+ // ============================================================
135
+ // scrape-batch 命令(批次抓取)
136
+ // ============================================================
137
// `scrape-batch <urlsFile> [outputDir]`: read one URL per line from a file and scrape them all.
program
    .command("scrape-batch <urlsFile> [outputDir]")
    .description("批次抓取多個網址")
    .option("-q, --quiet", "安靜模式")
    .action(async (urlsFile, outputDir = ".", options) => {
        const filePath = path.resolve(urlsFile);
        if (!fs.existsSync(filePath)) {
            console.error(`找不到檔案: ${urlsFile}`);
            process.exit(1);
        }
        const content = fs.readFileSync(filePath, "utf-8");
        // Accept both LF and CRLF files. Trim each line and drop blank ones so
        // that a trailing newline or a stray "\r" never becomes a bogus URL.
        const urls = content
            .split(/\r?\n/)
            .map((line) => line.trim())
            .filter((line) => line.length > 0);
        await runScrapeBatchCommand(urls, outputDir, options);
    });
151
+ // ============================================================
152
+ // 執行
153
+ // ============================================================
154
// Every action registered above is async. parseAsync lets commander await the
// action so a rejection (e.g. from the crawl/scrape handlers, which have no
// try/catch of their own) is reported here instead of surfacing as an
// unhandled promise rejection.
program.parseAsync().catch((error) => {
    console.error(error instanceof Error ? error.message : String(error));
    process.exit(1);
});
155
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;;;;;GAMG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAgBnD,+DAA+D;AAC/D,SAAS;AACT,+DAA+D;AAE/D,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,KAAK,CAAC;KACX,WAAW,CAAC,4CAA4C,CAAC;KACzD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,+DAA+D;AAC/D,aAAa;AACb,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,kBAAkB,CAAC;KAC3B,WAAW,CAAC,iBAAiB,CAAC;KAC9B,MAAM,CAAC,oBAAoB,EAAE,QAAQ,CAAC;KACtC,MAAM,CAAC,uBAAuB,EAAE,UAAU,CAAC;KAC3C,MAAM,CAAC,qBAAqB,EAAE,UAAU,EAAE,kBAAkB,CAAC;KAC7D,MAAM,CAAC,gBAAgB,EAAE,QAAQ,EAAE,GAAG,CAAC;KACvC,MAAM,CAAC,YAAY,EAAE,aAAa,CAAC;KACnC,MAAM,CAAC,WAAW,EAAE,YAAY,CAAC;KACjC,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,KAAK,EAAE,MAAc,EAAE,OAAuB,EAAE,EAAE;IACxD,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,OAAO,IAAI,KAAK,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,EAAE,MAAM,CAAC,CAAC;IAC5E,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IAEzD,IAAI,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,QAAQ,MAAM,EAAE,CAAC,CAAC;QAE9B,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAExC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC/B,MAAM,CAAC,KAAK,CAAC,YAAY,UAAU,EAAE,CAAC,CAAC;YACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,MAAM,KAAK,GAAG,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QAEtC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;YACxB,OAAO;YACP,MAAM,MAAM,CAAC,gBAAgB,CAAC,UAAU,EAAE;gBACxC,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;gBAC7C,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;gBAC/B,SAAS,EAAE,OAAO,
CAAC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,OAAO;YACP,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,UAAU,EAAE;gBAClD,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;gBAC/B,WAAW,EAAE,OAAO,CAAC,MAAM;gBAC3B,SAAS,EAAE,OAAO,CAAC,MAAM;aAC1B,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACnB,MAAM,CAAC,OAAO,CAAC,QAAQ,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;YAC9C,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,KAAK,CAAC,QAAQ,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,QAAQ,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,iBAAiB;AACjB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,YAAY,CAAC;KACzB,MAAM,CAAC,qBAAqB,EAAE,UAAU,EAAE,kBAAkB,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,KAAK,EAAE,KAAa,EAAE,OAA8C,EAAE,EAAE;IAC9E,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,OAAO,IAAI,KAAK,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,EAAE,MAAM,CAAC,CAAC;IAE5E,IAAI,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QAE/B,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEtC,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,MAAM,CAAC,CAAC;IAC7B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,QAAQ,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,mBAAmB;AACnB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,yBAAyB,CAAC;KACtC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC;KAC/B,MAAM,CAAC,oBAAoB,EAAE,WAAW,CAAC;KACzC,MAAM,CAAC,oBAAoB,EAAE,iBAAiB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,UAAU,CAAC;KAC7C,MAAM,CAAC,aAAa,EAAE,wBAAwB,CAAC;KAC/C,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAQ3B,EAAE,EAAE;IACH,MAAM,eAAe,CAAC,GAAG,EAAE,O
AAO,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,+BAA+B;AAC/B,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,cAAc,CAAC;KACvB,WAAW,CAAC,SAAS,CAAC;KACtB,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC;KAC/B,MAAM,CAAC,oBAAoB,EAAE,WAAW,CAAC;KACzC,MAAM,CAAC,oBAAoB,EAAE,iBAAiB,CAAC;KAC/C,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAM3B,EAAE,EAAE;IACH,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;AACvC,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,wBAAwB;AACxB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,qCAAqC,CAAC;KAC9C,WAAW,CAAC,UAAU,CAAC;KACvB,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,YAAoB,GAAG,EAAE,OAA4B,EAAE,EAAE;IACxF,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACxC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,UAAU,QAAQ,EAAE,CAAC,CAAC;QACpC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,MAAM,qBAAqB,CAAC,IAAI,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;AACxD,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,KAAK;AACL,+DAA+D;AAE/D,OAAO,CAAC,KAAK,EAAE,CAAC"}
@@ -0,0 +1,19 @@
1
+ import { CrawlResult } from "./index.js";
2
/**
 * Options accepted by the `crawl` CLI command.
 */
export interface CrawlCliOptions {
    /** Print the crawl result as JSON instead of the human-readable report. */
    json?: boolean;
    /** File path to write the raw HTML to. */
    saveHtml?: string;
    /** File path to write the plain-text content to. */
    saveText?: string;
    /** File path to write the download-link list to. */
    saveDownloads?: string;
    /** Show the detailed sections of the report. */
    verbose?: boolean;
    /** Suppress the "saved" status messages. */
    quiet?: boolean;
    /** Skip MIME type analysis. */
    skipMime?: boolean;
}
11
/**
 * Print a human-readable report for a crawl result.
 *
 * @param result - The crawl result to render.
 * @param verbose - Optional flag requesting the detailed report.
 */
export declare function printCrawlReport(result: CrawlResult, verbose?: boolean): void;
15
/**
 * Execute the `crawl` command.
 *
 * @param url - Address of the page to crawl.
 * @param options - Parsed CLI options for the command.
 * @returns A promise that settles when the command has finished.
 */
export declare function runCrawlCommand(url: string, options: CrawlCliOptions): Promise<void>;
19
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/crawler/cli.ts"],"names":[],"mappings":"AAIA,OAAO,EAAe,WAAW,EAAc,MAAM,YAAY,CAAC;AAElE,MAAM,WAAW,eAAe;IAC9B,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,WAAW,EACnB,OAAO,GAAE,OAAe,GACvB,IAAI,CA8IN;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,eAAe,GACvB,OAAO,CAAC,IAAI,CAAC,CAyCf"}
@@ -0,0 +1,179 @@
1
+ /**
2
+ * 單頁爬取 CLI 模組
3
+ */
4
+ import * as fs from "fs";
5
+ import { PageCrawler } from "./index.js";
6
/**
 * Print a human-readable crawl report to standard output.
 *
 * Sections are written in a fixed order: page header, size statistics,
 * downloadable files, external links, internal links, headings and forms.
 * Detailed listings (real MIME types, download items, heading text, form
 * fields) are printed only when `verbose` is true.
 *
 * @param result - Crawl result to render.
 * @param verbose - When true, include the detailed listings. Defaults to false.
 */
export function printCrawlReport(result, verbose = false) {
    const rule = "=".repeat(60);
    const seconds = (ms) => (ms / 1000).toFixed(2);
    // Entries of a name -> count map, largest count first.
    const byCountDesc = (counts) => Object.entries(counts).sort((left, right) => right[1] - left[1]);
    const isNonEmpty = (counts) => Object.keys(counts).length > 0;

    // ---- Header ----
    console.log(rule);
    console.log("網頁分析報告");
    console.log(rule);
    console.log(`網址: ${result.url}`);
    if (result.title) {
        console.log(`標題: ${result.title}`);
    }
    console.log(`爬取時間: ${result.crawled_at}`);
    console.log(`頁面指紋: ${result.fingerprint.slice(0, 16)}...`);

    // Page MIME type, when known.
    const pageMime = result.pageMimeType;
    if (pageMime) {
        const charsetPart = pageMime.charset ? `; charset=${pageMime.charset}` : "";
        console.log(`頁面類型: ${pageMime.mimeType}${charsetPart}`);
    }

    // Timing figures, when recorded; the MIME-analysis part is optional.
    const timing = result.benchmark;
    if (timing) {
        const mimePart = timing.mime_analysis_ms !== undefined
            ? `, MIME分析: ${seconds(timing.mime_analysis_ms)}s`
            : "";
        console.log(`執行時間: 抓取: ${seconds(timing.fetch_ms)}s, 解析: ${seconds(timing.parse_ms)}s${mimePart}, 總計: ${seconds(timing.total_ms)}s`);
    }
    console.log();

    // ---- Size statistics ----
    const size = result.size;
    console.log("【大小統計】");
    console.log(` 原始 HTML: ${size.raw_html_readable}`);
    console.log(` 純文字: ${size.text_readable}`);
    console.log(` 壓縮比: ${size.compression_ratio} 節省`);
    console.log();

    // ---- Downloadable files ----
    const downloads = result.links.downloads;
    const uniqueCount = downloads.unique_count || downloads.count;
    const originalCount = downloads.original_count || downloads.count;
    if (originalCount > uniqueCount) {
        console.log(`【可下載檔案】共 ${uniqueCount} 個(原始 ${originalCount} 個,去重 ${originalCount - uniqueCount} 個)`);
    }
    else {
        console.log(`【可下載檔案】共 ${downloads.count} 個`);
    }

    // Prefer the per-category breakdown; otherwise fall back to the by_type counts.
    if (downloads.by_file_category && isNonEmpty(downloads.by_file_category)) {
        console.log(" 按檔案類型統計:");
        for (const [category, count] of byCountDesc(downloads.by_file_category)) {
            console.log(` - ${category}: ${count} 個`);
        }
    }
    else if (isNonEmpty(downloads.by_type)) {
        console.log(" 按副檔名統計:");
        for (const [type, count] of byCountDesc(downloads.by_type)) {
            console.log(` - ${type.toUpperCase()}: ${count} 個`);
        }
    }

    // Real MIME type breakdown (verbose only).
    if (verbose && downloads.by_real_mime && isNonEmpty(downloads.by_real_mime)) {
        console.log(" 按真實 MIME type 統計:");
        for (const [mimeType, count] of byCountDesc(downloads.by_real_mime)) {
            console.log(` - ${mimeType}: ${count} 個`);
        }
    }

    // First 20 download items (verbose only), then a "... N more" line.
    if (verbose && downloads.items.length > 0) {
        console.log(" 下載清單:");
        downloads.items.slice(0, 20).forEach((item, index) => {
            console.log(` ${index + 1}. [${item.type.toUpperCase()}] ${item.text.slice(0, 50)}`);
            if (item.category) {
                console.log(` 分類: ${item.category}`);
            }
        });
        if (downloads.items.length > 20) {
            console.log(` ... 還有 ${downloads.items.length - 20} 個`);
        }
    }
    console.log();

    // ---- External links: top 10 domains, then optional MIME breakdown ----
    const external = result.links.external;
    console.log(`【連外連結】共 ${external.count} 個`);
    if (isNonEmpty(external.domains)) {
        for (const [domain, count] of byCountDesc(external.domains).slice(0, 10)) {
            console.log(` - ${domain}: ${count} 個`);
        }
    }
    if (external.by_mime_type && isNonEmpty(external.by_mime_type)) {
        console.log(" 按 MIME type 統計:");
        for (const [mimeType, count] of byCountDesc(external.by_mime_type)) {
            console.log(` - ${mimeType}: ${count} 個`);
        }
    }
    console.log();

    // ---- Internal links ----
    console.log(`【內部連結】共 ${result.links.internal.count} 個`);
    console.log();

    // ---- Headings: count always, first 10 truncated titles when verbose ----
    const headings = result.content.headings;
    if (headings.length > 0) {
        console.log(`【頁面結構】共 ${headings.length} 個標題`);
        if (verbose) {
            headings.slice(0, 10).forEach((heading) => {
                console.log(` - ${heading.slice(0, 60)}`);
            });
            if (headings.length > 10) {
                console.log(` ... 還有 ${headings.length - 10} 個`);
            }
        }
        console.log();
    }

    // ---- Forms: count always, method/action/fields when verbose ----
    const forms = result.content.forms;
    if (forms && forms.length > 0) {
        console.log(`【表單】共 ${forms.length} 個`);
        if (verbose) {
            for (const form of forms) {
                console.log(` - ${form.method} ${form.action || "(無 action)"}`);
                console.log(` 欄位: ${form.fields.join(", ")}`);
            }
        }
        console.log();
    }
}
139
/**
 * Run the `crawl` command for a single URL.
 *
 * Fetches the page (exiting with status 1 when the fetch reports failure),
 * analyzes it with or without MIME type analysis, writes any requested
 * output files, then prints the result as JSON or as the readable report.
 *
 * @param url - Address of the page to crawl.
 * @param options - Parsed CLI options (see the `crawl` command's flags).
 */
export async function runCrawlCommand(url, options) {
    const crawler = new PageCrawler(url);
    const fetched = await crawler.fetch();
    if (!fetched) {
        process.exit(1);
    }

    // MIME analysis runs by default; `--skip-mime` selects the plain analysis.
    let result;
    if (options.skipMime) {
        result = crawler.analyze();
    }
    else {
        result = await crawler.analyzeWithMimeTypes();
    }

    // Status messages go to stderr so stdout carries only the report/JSON.
    const announce = (message) => {
        if (!options.quiet) {
            console.error(message);
        }
    };

    if (options.saveHtml) {
        fs.writeFileSync(options.saveHtml, crawler.getRawHtml(), "utf-8");
        announce(`原始 HTML 已儲存: ${options.saveHtml}`);
    }
    if (options.saveText) {
        fs.writeFileSync(options.saveText, crawler.getTextContent(), "utf-8");
        announce(`純文字已儲存: ${options.saveText}`);
    }
    if (options.saveDownloads) {
        fs.writeFileSync(options.saveDownloads, crawler.getDownloadList(), "utf-8");
        announce(`下載清單已儲存: ${options.saveDownloads}`);
    }

    if (options.json) {
        console.log(JSON.stringify(result, null, 2));
        return;
    }
    printCrawlReport(result, options.verbose);
}
179
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/crawler/cli.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,WAAW,EAA2B,MAAM,YAAY,CAAC;AAYlE;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,MAAmB,EACnB,UAAmB,KAAK;IAExB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1C,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;IAE3D,oBAAoB;IACpB,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC;QACjC,IAAI,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC;QAC5B,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,aAAa,IAAI,CAAC,OAAO,EAAE,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,eAAe;IACf,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACrB,MAAM,CAAC,GAAG,MAAM,CAAC,SAAS,CAAC;QAC3B,IAAI,YAAY,GAAG,OAAO,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QACpG,IAAI,CAAC,CAAC,gBAAgB,KAAK,SAAS,EAAE,CAAC;YACrC,YAAY,IAAI,aAAa,CAAC,CAAC,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QACzE,CAAC;QACD,YAAY,IAAI,SAAS,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QAC3D,OAAO,CAAC,GAAG,CAAC,SAAS,YAAY,EAAE,CAAC,CAAC;IACvC,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAC3D,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,iBAAiB,KAAK,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,SAAS;IACT,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,SAAgB,CAAC;IAChD,MAAM,WAAW,GAAG,SAAS,CAAC,YAAY,IAAI,SAAS,CA
AC,KAAK,CAAC;IAC9D,MAAM,aAAa,GAAG,SAAS,CAAC,cAAc,IAAI,SAAS,CAAC,KAAK,CAAC;IAClE,MAAM,aAAa,GAAG,aAAa,GAAG,WAAW,CAAC;IAElD,IAAI,aAAa,EAAE,CAAC;QAClB,OAAO,CAAC,GAAG,CAAC,YAAY,WAAW,SAAS,aAAa,SAAS,aAAa,GAAG,WAAW,KAAK,CAAC,CAAC;IACtG,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,IAAI,CAAC,CAAC;IAC5D,CAAC;IAED,gBAAgB;IAChB,IAAI,SAAS,CAAC,gBAAgB,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrF,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,gBAA0C,CAAC;aAChF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;SAAM,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClE,0BAA0B;QAC1B,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACzB,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,OAAO,CAAC;aACvE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,sCAAsC;IACtC,IAAI,OAAO,IAAI,SAAS,CAAC,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxF,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,YAAsC,CAAC;aAC5E,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IACD,IAAI,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACxD,KAAK,IAAI,CAAC,GAAG,CAA
C,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,OAAO,CAAC,GAAG,CACT,OAAO,CAAC,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CACvE,CAAC;YACF,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;YAC7C,CAAC;QACH,CAAC;QACD,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC7C,OAAO,CAAC,GAAG,CACT,cAAc,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,IAAI,CAC3D,CAAC;QACJ,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;aACzD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,KAAK,KAAK,IAAI,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,IAAI,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QACjC,MAAM,cAAc,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC;aACtE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,cAAc,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,MAAM,CAAC,CAAC;QAC7D,IAAI,OAAO,EAAE,CAAC;YACZ,KAAK,MAAM
,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;gBAC3D,OAAO,CAAC,GAAG,CAAC,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;YAC7C,CAAC;YACD,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;gBACxC,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,EAAE,IAAI,CAAC,CAAC;YACnE,CAAC;QACH,CAAC;QACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5D,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC;QACtD,IAAI,OAAO,EAAE,CAAC;YACZ,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBACxC,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,IAAI,YAAY,EAAE,CAAC,CAAC;gBACjE,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;QACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,GAAW,EACX,OAAwB;IAExB,MAAM,OAAO,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC;IAErC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACtC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,mCAAmC;IACnC,MAAM,MAAM,GAAG,OAAO,CAAC,QAAQ;QAC7B,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE;QACnB,CAAC,CAAC,MAAM,OAAO,CAAC,oBAAoB,EAAE,CAAC;IAEzC,OAAO;IACP,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,gBAAgB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;QACtE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,WAAW,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;QAC1B,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,aAAa,EAAE,OAAO,CAAC,eAAe,EAAE,EAAE,OAAO,CAAC,CAAC;QAC5E,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,YAAY,OAAO,CAAC,aAAa,E
AAE,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,KAAK;IACL,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC;SAAM,CAAC;QACN,gBAAgB,CAAC,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC"}
@@ -0,0 +1,146 @@
1
+ /**
2
+ * 單頁完整爬取模組
3
+ *
4
+ * 滿足五個動機:
5
+ * 1. AI 知識庫建置 - 純文字 + metadata
6
+ * 2. 完整離線歸檔 - 下載清單 + 檔案分類
7
+ * 3. 法規變更追蹤 - 頁面指紋 + 時間戳記
8
+ * 4. 申請流程自動化 - 結構解析
9
+ * 5. 跨文件關聯分析 - 連結關係
10
+ *
11
+ * 支援 HTTPS Proxy(沙盒環境)
12
+ */
13
+ import { MimeTypeInfo } from "../lib/http-client.js";
14
+ export interface CrawlOptions {
15
+ timeout?: number;
16
+ userAgent?: string;
17
+ }
18
+ export interface DownloadLinkItem {
19
+ url: string;
20
+ text: string;
21
+ type: string;
22
+ category?: string;
23
+ mimeType?: MimeTypeInfo;
24
+ fileCategory?: string;
25
+ }
26
+ export interface ExternalLinkItem {
27
+ url: string;
28
+ text: string;
29
+ domain: string;
30
+ mimeType?: MimeTypeInfo;
31
+ }
32
+ export type { MimeTypeInfo };
33
+ export interface LinkItem {
34
+ url: string;
35
+ text: string;
36
+ }
37
+ export interface FormInfo {
38
+ action: string;
39
+ method: string;
40
+ fields: string[];
41
+ }
42
+ export interface BenchmarkInfo {
43
+ fetch_ms: number;
44
+ parse_ms: number;
45
+ mime_analysis_ms?: number;
46
+ total_ms: number;
47
+ }
48
+ export interface CrawlResult {
49
+ url: string;
50
+ title: string;
51
+ description: string;
52
+ crawled_at: string;
53
+ fingerprint: string;
54
+ pageMimeType?: MimeTypeInfo;
55
+ benchmark?: BenchmarkInfo;
56
+ size: {
57
+ raw_html_bytes: number;
58
+ raw_html_readable: string;
59
+ text_bytes: number;
60
+ text_readable: string;
61
+ compression_ratio: string;
62
+ };
63
+ links: {
64
+ downloads: {
65
+ count: number;
66
+ unique_count?: number;
67
+ original_count?: number;
68
+ by_type: Record<string, number>;
69
+ by_real_mime?: Record<string, number>;
70
+ by_file_category?: Record<string, number>;
71
+ items: DownloadLinkItem[];
72
+ };
73
+ external: {
74
+ count: number;
75
+ domains: Record<string, number>;
76
+ by_mime_type?: Record<string, number>;
77
+ items: ExternalLinkItem[];
78
+ };
79
+ internal: {
80
+ count: number;
81
+ items: LinkItem[];
82
+ };
83
+ };
84
+ content: {
85
+ headings: string[];
86
+ sections: string[];
87
+ forms?: FormInfo[];
88
+ };
89
+ }
90
+ export declare function formatSize(bytes: number): string;
91
+ /**
92
+ * 格式化為台灣時間 (UTC+8)
93
+ */
94
+ export declare function formatTaiwanTime(date: Date): string;
95
+ export declare function extractText(html: string): string;
96
+ /**
97
+ * 下載連結分析器
98
+ * 用於判斷 URL 是否為下載連結,並分類
99
+ */
100
+ export declare class DownloadAnalyzer {
101
+ analyze(url: string, linkText: string): {
102
+ isDownload: boolean;
103
+ type: string | null;
104
+ category: string | null;
105
+ };
106
+ }
107
+ export declare class PageCrawler {
108
+ private url;
109
+ private baseDomain;
110
+ private html;
111
+ private crawledAt;
112
+ private pageMimeType?;
113
+ private responseHeaders?;
114
+ private fetchStartTime;
115
+ private fetchEndTime;
116
+ constructor(url: string);
117
+ getFetchDuration(): number;
118
+ fetch(options?: CrawlOptions): Promise<boolean>;
119
+ private detectPageMimeType;
120
+ getPageMimeType(): MimeTypeInfo | undefined;
121
+ getRawHtml(): string;
122
+ getTextContent(): string;
123
+ getFingerprint(): string;
124
+ private downloadStats;
125
+ getDownloadableLinks(): DownloadLinkItem[];
126
+ getDownloadStats(): {
127
+ originalCount: number;
128
+ uniqueCount: number;
129
+ };
130
+ getExternalLinks(): ExternalLinkItem[];
131
+ getInternalLinks(): LinkItem[];
132
+ analyze(): CrawlResult;
133
+ /**
134
+ * 分析並獲取外部連結和下載連結的 MIME type(需要額外網路請求)
135
+ */
136
+ analyzeWithMimeTypes(): Promise<CrawlResult>;
137
+ /**
138
+ * 產生下載清單(每行一個 URL)
139
+ */
140
+ getDownloadList(): string;
141
+ }
142
+ /**
143
+ * 快速爬取單頁
144
+ */
145
+ export declare function crawlPage(url: string, options?: CrawlOptions): Promise<CrawlResult | null>;
146
+ //# sourceMappingURL=index.d.ts.map