botrun-crawler-2 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +126 -0
- package/dist/cli.d.ts +10 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +155 -0
- package/dist/cli.js.map +1 -0
- package/dist/crawler/cli.d.ts +19 -0
- package/dist/crawler/cli.d.ts.map +1 -0
- package/dist/crawler/cli.js +179 -0
- package/dist/crawler/cli.js.map +1 -0
- package/dist/crawler/index.d.ts +146 -0
- package/dist/crawler/index.d.ts.map +1 -0
- package/dist/crawler/index.js +670 -0
- package/dist/crawler/index.js.map +1 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agent.d.ts +34 -0
- package/dist/lib/agent.d.ts.map +1 -0
- package/dist/lib/agent.js +73 -0
- package/dist/lib/agent.js.map +1 -0
- package/dist/lib/cache.d.ts +49 -0
- package/dist/lib/cache.d.ts.map +1 -0
- package/dist/lib/cache.js +141 -0
- package/dist/lib/cache.js.map +1 -0
- package/dist/lib/filename-decoder.d.ts +62 -0
- package/dist/lib/filename-decoder.d.ts.map +1 -0
- package/dist/lib/filename-decoder.js +229 -0
- package/dist/lib/filename-decoder.js.map +1 -0
- package/dist/lib/http-client.d.ts +86 -0
- package/dist/lib/http-client.d.ts.map +1 -0
- package/dist/lib/http-client.js +373 -0
- package/dist/lib/http-client.js.map +1 -0
- package/dist/lib/index.d.ts +15 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +19 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/logger.d.ts +41 -0
- package/dist/lib/logger.d.ts.map +1 -0
- package/dist/lib/logger.js +122 -0
- package/dist/lib/logger.js.map +1 -0
- package/dist/lib/scene-detector.d.ts +92 -0
- package/dist/lib/scene-detector.d.ts.map +1 -0
- package/dist/lib/scene-detector.js +297 -0
- package/dist/lib/scene-detector.js.map +1 -0
- package/dist/processors/audio.d.ts +20 -0
- package/dist/processors/audio.d.ts.map +1 -0
- package/dist/processors/audio.js +110 -0
- package/dist/processors/audio.js.map +1 -0
- package/dist/processors/base.d.ts +53 -0
- package/dist/processors/base.d.ts.map +1 -0
- package/dist/processors/base.js +194 -0
- package/dist/processors/base.js.map +1 -0
- package/dist/processors/data.d.ts +48 -0
- package/dist/processors/data.d.ts.map +1 -0
- package/dist/processors/data.js +206 -0
- package/dist/processors/data.js.map +1 -0
- package/dist/processors/document.d.ts +20 -0
- package/dist/processors/document.d.ts.map +1 -0
- package/dist/processors/document.js +137 -0
- package/dist/processors/document.js.map +1 -0
- package/dist/processors/image.d.ts +20 -0
- package/dist/processors/image.d.ts.map +1 -0
- package/dist/processors/image.js +92 -0
- package/dist/processors/image.js.map +1 -0
- package/dist/processors/index.d.ts +53 -0
- package/dist/processors/index.d.ts.map +1 -0
- package/dist/processors/index.js +177 -0
- package/dist/processors/index.js.map +1 -0
- package/dist/processors/text.d.ts +44 -0
- package/dist/processors/text.d.ts.map +1 -0
- package/dist/processors/text.js +262 -0
- package/dist/processors/text.js.map +1 -0
- package/dist/processors/video.d.ts +20 -0
- package/dist/processors/video.d.ts.map +1 -0
- package/dist/processors/video.js +93 -0
- package/dist/processors/video.js.map +1 -0
- package/dist/scraper/cli.d.ts +23 -0
- package/dist/scraper/cli.d.ts.map +1 -0
- package/dist/scraper/cli.js +118 -0
- package/dist/scraper/cli.js.map +1 -0
- package/dist/scraper/index.d.ts +120 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +372 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/types/index.d.ts +123 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +40 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +108 -0
package/README.md
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# botrun-crawler
|
|
2
|
+
|
|
3
|
+
智慧網頁爬蟲 - 支援動態下載連結偵測、MIME type 分析、政府網站相容
|
|
4
|
+
|
|
5
|
+
## 安裝
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install -g botrun-crawler-2
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## CLI 使用方法
|
|
12
|
+
|
|
13
|
+
### 基本爬取
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# 單頁完整爬取
|
|
17
|
+
botrun-crawler crawl "https://example.com"
|
|
18
|
+
|
|
19
|
+
# JSON 格式輸出
|
|
20
|
+
botrun-crawler crawl "https://example.com" --json
|
|
21
|
+
|
|
22
|
+
# 安靜模式
|
|
23
|
+
botrun-crawler crawl "https://example.com" -q
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### 儲存結果
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# 儲存 HTML
|
|
30
|
+
botrun-crawler crawl "https://example.com" --save-html page.html
|
|
31
|
+
|
|
32
|
+
# 儲存純文字(LLM 友善格式)
|
|
33
|
+
botrun-crawler crawl "https://example.com" --save-text page.txt
|
|
34
|
+
|
|
35
|
+
# 儲存下載連結清單
|
|
36
|
+
botrun-crawler crawl "https://example.com" --save-downloads urls.txt
|
|
37
|
+
|
|
38
|
+
# 全部儲存
|
|
39
|
+
botrun-crawler crawl "https://example.com" \
|
|
40
|
+
--save-html page.html \
|
|
41
|
+
--save-text page.txt \
|
|
42
|
+
--save-downloads urls.txt
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### 效能選項
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# 跳過 MIME type 分析(加快速度)
|
|
49
|
+
botrun-crawler crawl "https://example.com" --skip-mime
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## 命令總覽
|
|
53
|
+
|
|
54
|
+
| 命令 | 說明 |
|
|
55
|
+
|------|------|
|
|
56
|
+
| `crawl <url>` | 單頁完整爬取(推薦) |
|
|
57
|
+
| `scrape <url>` | 簡易版抓取 |
|
|
58
|
+
| `scrape-batch <file>` | 批次抓取多個網址 |
|
|
59
|
+
| `process <file>` | 處理本地檔案 |
|
|
60
|
+
| `help` | 顯示說明 |
|
|
61
|
+
|
|
62
|
+
## crawl 選項
|
|
63
|
+
|
|
64
|
+
| 選項 | 說明 |
|
|
65
|
+
|------|------|
|
|
66
|
+
| `--json` | JSON 格式輸出 |
|
|
67
|
+
| `--save-html <file>` | 儲存原始 HTML |
|
|
68
|
+
| `--save-text <file>` | 儲存純文字 |
|
|
69
|
+
| `--save-downloads <file>` | 儲存下載連結清單 |
|
|
70
|
+
| `--skip-mime` | 跳過 MIME type 分析 |
|
|
71
|
+
| `-v, --verbose` | 詳細模式 |
|
|
72
|
+
| `-q, --quiet` | 安靜模式 |
|
|
73
|
+
|
|
74
|
+
## 輸出範例
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
========================================
|
|
78
|
+
單頁完整爬取結果
|
|
79
|
+
========================================
|
|
80
|
+
網址: https://www.ida.gov.tw/...
|
|
81
|
+
標題: 經濟部產業發展署
|
|
82
|
+
|
|
83
|
+
連結統計
|
|
84
|
+
├─ 下載連結: 475 個
|
|
85
|
+
│ ├─ PDF: 120
|
|
86
|
+
│ ├─ Word: 85
|
|
87
|
+
│ └─ 動態連結: 270
|
|
88
|
+
├─ 連外連結: 22 個
|
|
89
|
+
└─ 內部連結: 675 個
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Proxy 設定
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
export HTTPS_PROXY="http://proxy:8080"
|
|
96
|
+
export HTTP_PROXY="http://proxy:8080"
|
|
97
|
+
export NO_PROXY="localhost,.internal.com"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## 程式庫使用
|
|
101
|
+
|
|
102
|
+
```typescript
|
|
103
|
+
import { crawlPage } from 'botrun-crawler-2/crawler';
|
|
104
|
+
|
|
105
|
+
const result = await crawlPage('https://example.com');
|
|
106
|
+
console.log(result.links.downloads.count);
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 子模組
|
|
110
|
+
|
|
111
|
+
```typescript
|
|
112
|
+
import { PageCrawler, crawlPage } from 'botrun-crawler-2/crawler';
|
|
113
|
+
import { scrapeUrl } from 'botrun-crawler-2/scraper';
|
|
114
|
+
import { fetchWithProxy, fetchMimeType } from 'botrun-crawler-2/lib';
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## 特色
|
|
118
|
+
|
|
119
|
+
- **動態下載連結偵測**:自動識別 `download.aspx`、`getFile.do` 等動態連結
|
|
120
|
+
- **MIME Type 分析**:自動分類 PDF/Word/Excel 等檔案類型
|
|
121
|
+
- **檔名亂碼修正**:自動修正 UTF-8 編碼問題
|
|
122
|
+
- **政府網站相容**:針對台灣政府網站最佳化
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
MIT
|
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;;;;;GAMG"}
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* 耳(Ear)- 任意格式處理器 CLI
|
|
4
|
+
* 零幻覺四部曲之一:將任意格式轉換為 AI 可讀文字
|
|
5
|
+
*
|
|
6
|
+
* @version 1.0.0
|
|
7
|
+
* @author 永恆知己系統
|
|
8
|
+
*/
|
|
9
|
+
import { Command } from "commander";
|
|
10
|
+
import * as path from "path";
|
|
11
|
+
import * as fs from "fs";
|
|
12
|
+
import { ProcessorRouter } from "./processors/index.js";
|
|
13
|
+
import { CacheService } from "./lib/cache.js";
|
|
14
|
+
import { LoggerService } from "./lib/logger.js";
|
|
15
|
+
import { AgentService } from "./lib/agent.js";
|
|
16
|
+
import { runScrapeCommand, runScrapeBatchCommand } from "./scraper/cli.js";
|
|
17
|
+
import { runCrawlCommand } from "./crawler/cli.js";
|
|
18
|
+
// ============================================================
|
|
19
|
+
// CLI 程式
|
|
20
|
+
// ============================================================
|
|
21
|
+
// Root Commander program; every sub-command below is registered on it.
const program = new Command();
program.name("ear");
program.description("耳(Ear)- 任意格式處理器\n零幻覺四部曲之一:將任意格式轉換為 AI 可讀文字");
program.version("1.0.0");
|
|
26
|
+
// ============================================================
|
|
27
|
+
// process 命令
|
|
28
|
+
// ============================================================
|
|
29
|
+
// `process <target>`: run a single file, or every file in a directory,
// through the ProcessorRouter. Exits with status 1 when the target is
// missing, when single-file processing reports failure, or on any thrown error.
program
    .command("process <target>")
    .description("處理單一檔案或目錄中的所有檔案")
    .option("-o, --output <dir>", "指定輸出目錄")
    .option("-f, --format <format>", "強制指定輸入格式")
    .option("-m, --model <model>", "指定 AI 模型", "claude-haiku-4-5")
    .option("-j, --jobs <n>", "並行處理數量", "4")
    .option("--no-cache", "忽略快取,強制重新處理")
    .option("--dry-run", "模擬執行,不實際處理")
    .option("-v, --verbose", "顯示詳細日誌")
    .action(async (target, options) => {
        const logger = new LoggerService(options.verbose ?? false);
        const cache = new CacheService(logger);
        const agent = new AgentService(options.model ?? "claude-haiku-4-5", logger);
        const router = new ProcessorRouter(cache, logger, agent);
        // Commander stores a negatable `--no-cache` flag under `options.cache`
        // (true by default, false when the flag is given). `options.noCache`
        // is never populated, so reading it would silently ignore the flag.
        const noCache = options.cache === false;
        const dryRun = options.dryRun ?? false;
        try {
            logger.info(`開始處理:${target}`);
            const targetPath = path.resolve(target);
            if (!fs.existsSync(targetPath)) {
                logger.error(`檔案或目錄不存在:${targetPath}`);
                process.exit(1);
            }
            const stats = fs.statSync(targetPath);
            if (stats.isDirectory()) {
                // Directory: `--jobs` arrives as a string and is parsed as base-10.
                await router.processDirectory(targetPath, {
                    jobs: parseInt(String(options.jobs ?? 4), 10),
                    noCache,
                    dryRun,
                    outputDir: options.output,
                });
            }
            else {
                // Single file: `--format` overrides the detected input format.
                const result = await router.processFile(targetPath, {
                    noCache,
                    dryRun,
                    forceFormat: options.format,
                    outputDir: options.output,
                });
                if (result.success) {
                    logger.success(`處理完成:${result.outputPath}`);
                }
                else {
                    logger.error(`處理失敗:${result.error}`);
                    process.exit(1);
                }
            }
        }
        catch (error) {
            logger.error(`處理錯誤:${error instanceof Error ? error.message : String(error)}`);
            process.exit(1);
        }
    });
|
|
83
|
+
// ============================================================
|
|
84
|
+
// ask 命令(自然語言驅動)
|
|
85
|
+
// ============================================================
|
|
86
|
+
// `ask <query>`: forward a natural-language query to the AgentService and
// print its answer; exits with status 1 if the query throws.
program
    .command("ask <query>")
    .description("使用自然語言描述任務")
    .option("-m, --model <model>", "指定 AI 模型", "claude-haiku-4-5")
    .option("-v, --verbose", "顯示詳細日誌")
    .action(async (query, options) => {
        const verbose = options.verbose ?? false;
        const model = options.model ?? "claude-haiku-4-5";
        const logger = new LoggerService(verbose);
        const agent = new AgentService(model, logger);
        try {
            logger.info(`自然語言查詢:${query}`);
            const answer = await agent.ask(query);
            console.log("\n" + answer);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            logger.error(`查詢錯誤:${reason}`);
            process.exit(1);
        }
    });
|
|
104
|
+
// ============================================================
|
|
105
|
+
// crawl 命令(單頁完整爬取)
|
|
106
|
+
// ============================================================
|
|
107
|
+
// `crawl <url>`: full single-page crawl; all work is delegated to runCrawlCommand.
{
    const crawlCommand = program
        .command("crawl <url>")
        .description("單頁完整爬取(含結構分析、下載清單、變更追蹤)");
    // Flags are registered in this order so the generated help text is unchanged.
    const crawlFlags = [
        ["--json", "以 JSON 格式輸出"],
        ["--save-html <file>", "儲存原始 HTML"],
        ["--save-text <file>", "儲存純文字(LLM 友善格式)"],
        ["--save-downloads <file>", "儲存下載連結清單"],
        ["--skip-mime", "跳過 MIME type 分析(預設會分析)"],
        ["-v, --verbose", "顯示詳細資訊"],
        ["-q, --quiet", "安靜模式"],
    ];
    for (const [flags, summary] of crawlFlags) {
        crawlCommand.option(flags, summary);
    }
    crawlCommand.action((url, options) => runCrawlCommand(url, options));
}
|
|
120
|
+
// ============================================================
|
|
121
|
+
// scrape 命令(網頁抓取與分析)- 簡易版,向後相容
|
|
122
|
+
// ============================================================
|
|
123
|
+
// `scrape <url>`: simple fetch-and-analyse command; all work is delegated to runScrapeCommand.
{
    const scrapeCommand = program
        .command("scrape <url>")
        .description("抓取網頁並分析");
    // Flags are registered in this order so the generated help text is unchanged.
    const scrapeFlags = [
        ["--json", "以 JSON 格式輸出"],
        ["--save-html <file>", "儲存原始 HTML"],
        ["--save-text <file>", "儲存純文字(LLM 友善格式)"],
        ["-v, --verbose", "顯示詳細資訊"],
        ["-q, --quiet", "安靜模式"],
    ];
    for (const [flags, summary] of scrapeFlags) {
        scrapeCommand.option(flags, summary);
    }
    scrapeCommand.action((url, options) => runScrapeCommand(url, options));
}
|
|
134
|
+
// ============================================================
|
|
135
|
+
// scrape-batch 命令(批次抓取)
|
|
136
|
+
// ============================================================
|
|
137
|
+
// `scrape-batch <urlsFile> [outputDir]`: read one URL per line from a text
// file and hand the list to runScrapeBatchCommand. `outputDir` defaults to
// the current directory. Exits with status 1 when the URL file is missing.
program
    .command("scrape-batch <urlsFile> [outputDir]")
    .description("批次抓取多個網址")
    .option("-q, --quiet", "安靜模式")
    .action(async (urlsFile, outputDir = ".", options) => {
        const filePath = path.resolve(urlsFile);
        if (!fs.existsSync(filePath)) {
            console.error(`找不到檔案: ${urlsFile}`);
            process.exit(1);
        }
        const content = fs.readFileSync(filePath, "utf-8");
        // Accept both LF and CRLF files, strip surrounding whitespace (including
        // a leading BOM), and drop blank lines so a trailing newline does not
        // produce an empty "URL" entry.
        const urls = content
            .split(/\r?\n/)
            .map((line) => line.trim())
            .filter((line) => line.length > 0);
        await runScrapeBatchCommand(urls, outputDir, options);
    });
|
|
151
|
+
// ============================================================
|
|
152
|
+
// 執行
|
|
153
|
+
// ============================================================
|
|
154
|
+
// The action handlers registered on `program` are async, so parse with
// parseAsync(): a rejected handler is then reported here with a non-zero
// exit status instead of surfacing as an unhandled promise rejection.
program.parseAsync().catch((error) => {
    console.error(error instanceof Error ? error.message : String(error));
    process.exit(1);
});
|
|
155
|
+
//# sourceMappingURL=cli.js.map
|
package/dist/cli.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA;;;;;;GAMG;AAEH,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AACpC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,qBAAqB,EAAE,MAAM,kBAAkB,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAgBnD,+DAA+D;AAC/D,SAAS;AACT,+DAA+D;AAE/D,MAAM,OAAO,GAAG,IAAI,OAAO,EAAE,CAAC;AAE9B,OAAO;KACJ,IAAI,CAAC,KAAK,CAAC;KACX,WAAW,CAAC,4CAA4C,CAAC;KACzD,OAAO,CAAC,OAAO,CAAC,CAAC;AAEpB,+DAA+D;AAC/D,aAAa;AACb,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,kBAAkB,CAAC;KAC3B,WAAW,CAAC,iBAAiB,CAAC;KAC9B,MAAM,CAAC,oBAAoB,EAAE,QAAQ,CAAC;KACtC,MAAM,CAAC,uBAAuB,EAAE,UAAU,CAAC;KAC3C,MAAM,CAAC,qBAAqB,EAAE,UAAU,EAAE,kBAAkB,CAAC;KAC7D,MAAM,CAAC,gBAAgB,EAAE,QAAQ,EAAE,GAAG,CAAC;KACvC,MAAM,CAAC,YAAY,EAAE,aAAa,CAAC;KACnC,MAAM,CAAC,WAAW,EAAE,YAAY,CAAC;KACjC,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,KAAK,EAAE,MAAc,EAAE,OAAuB,EAAE,EAAE;IACxD,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,OAAO,IAAI,KAAK,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,MAAM,CAAC,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,EAAE,MAAM,CAAC,CAAC;IAC5E,MAAM,MAAM,GAAG,IAAI,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC;IAEzD,IAAI,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,QAAQ,MAAM,EAAE,CAAC,CAAC;QAE9B,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QAExC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC/B,MAAM,CAAC,KAAK,CAAC,YAAY,UAAU,EAAE,CAAC,CAAC;YACvC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;QAED,MAAM,KAAK,GAAG,EAAE,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC;QAEtC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;YACxB,OAAO;YACP,MAAM,MAAM,CAAC,gBAAgB,CAAC,UAAU,EAAE;gBACxC,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC;gBAC7C,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;gBAC/B,SAAS,EAAE,OAAO,CA
AC,MAAM;aAC1B,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,OAAO;YACP,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,WAAW,CAAC,UAAU,EAAE;gBAClD,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,KAAK;gBAC/B,WAAW,EAAE,OAAO,CAAC,MAAM;gBAC3B,SAAS,EAAE,OAAO,CAAC,MAAM;aAC1B,CAAC,CAAC;YAEH,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;gBACnB,MAAM,CAAC,OAAO,CAAC,QAAQ,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;YAC9C,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,KAAK,CAAC,QAAQ,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;gBACrC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAClB,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,QAAQ,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,iBAAiB;AACjB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,YAAY,CAAC;KACzB,MAAM,CAAC,qBAAqB,EAAE,UAAU,EAAE,kBAAkB,CAAC;KAC7D,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,KAAK,EAAE,KAAa,EAAE,OAA8C,EAAE,EAAE;IAC9E,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,OAAO,IAAI,KAAK,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,IAAI,YAAY,CAAC,OAAO,CAAC,KAAK,IAAI,kBAAkB,EAAE,MAAM,CAAC,CAAC;IAE5E,IAAI,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,UAAU,KAAK,EAAE,CAAC,CAAC;QAE/B,MAAM,MAAM,GAAG,MAAM,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEtC,OAAO,CAAC,GAAG,CAAC,IAAI,GAAG,MAAM,CAAC,CAAC;IAC7B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,QAAQ,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAC/E,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,mBAAmB;AACnB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,aAAa,CAAC;KACtB,WAAW,CAAC,yBAAyB,CAAC;KACtC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC;KAC/B,MAAM,CAAC,oBAAoB,EAAE,WAAW,CAAC;KACzC,MAAM,CAAC,oBAAoB,EAAE,iBAAiB,CAAC;KAC/C,MAAM,CAAC,yBAAyB,EAAE,UAAU,CAAC;KAC7C,MAAM,CAAC,aAAa,EAAE,wBAAwB,CAAC;KAC/C,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAQ3B,EAAE,EAAE;IACH,MAAM,eAAe,CAAC,GAAG,EAAE,OAA
O,CAAC,CAAC;AACtC,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,+BAA+B;AAC/B,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,cAAc,CAAC;KACvB,WAAW,CAAC,SAAS,CAAC;KACtB,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC;KAC/B,MAAM,CAAC,oBAAoB,EAAE,WAAW,CAAC;KACzC,MAAM,CAAC,oBAAoB,EAAE,iBAAiB,CAAC;KAC/C,MAAM,CAAC,eAAe,EAAE,QAAQ,CAAC;KACjC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,GAAW,EAAE,OAM3B,EAAE,EAAE;IACH,MAAM,gBAAgB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;AACvC,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,wBAAwB;AACxB,+DAA+D;AAE/D,OAAO;KACJ,OAAO,CAAC,qCAAqC,CAAC;KAC9C,WAAW,CAAC,UAAU,CAAC;KACvB,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC;KAC7B,MAAM,CAAC,KAAK,EAAE,QAAgB,EAAE,YAAoB,GAAG,EAAE,OAA4B,EAAE,EAAE;IACxF,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;IACxC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,KAAK,CAAC,UAAU,QAAQ,EAAE,CAAC,CAAC;QACpC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,MAAM,qBAAqB,CAAC,IAAI,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;AACxD,CAAC,CAAC,CAAC;AAEL,+DAA+D;AAC/D,KAAK;AACL,+DAA+D;AAE/D,OAAO,CAAC,KAAK,EAAE,CAAC"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { CrawlResult } from "./index.js";
|
|
2
|
+
/**
 * Options accepted by the `crawl` CLI command.
 */
export interface CrawlCliOptions {
    /** Print the result as JSON instead of the human-readable report. */
    json?: boolean;
    /** Include the detailed lists in the human-readable report. */
    verbose?: boolean;
    /** Suppress the "saved" confirmation messages. */
    quiet?: boolean;
    /** Skip MIME type analysis (it runs by default). */
    skipMime?: boolean;
    /** File path to write the raw HTML to. */
    saveHtml?: string;
    /** File path to write the plain-text content to. */
    saveText?: string;
    /** File path to write the download-link list to. */
    saveDownloads?: string;
}
|
|
11
|
+
/**
 * Print a human-readable crawl report to the console.
 *
 * @param result - Crawl result to report on.
 * @param verbose - When true, also print the detailed lists. Optional.
 */
export declare function printCrawlReport(result: CrawlResult, verbose?: boolean): void;
|
|
15
|
+
/**
 * Run the `crawl` command for a single URL.
 *
 * @param url - Page to crawl.
 * @param options - Parsed CLI options for the command.
 * @returns A promise that resolves once the command has finished.
 */
export declare function runCrawlCommand(url: string, options: CrawlCliOptions): Promise<void>;
|
|
19
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/crawler/cli.ts"],"names":[],"mappings":"AAIA,OAAO,EAAe,WAAW,EAAc,MAAM,YAAY,CAAC;AAElE,MAAM,WAAW,eAAe;IAC9B,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,MAAM,EAAE,WAAW,EACnB,OAAO,GAAE,OAAe,GACvB,IAAI,CA8IN;AAED;;GAEG;AACH,wBAAsB,eAAe,CACnC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,eAAe,GACvB,OAAO,CAAC,IAAI,CAAC,CAyCf"}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 單頁爬取 CLI 模組
|
|
3
|
+
*/
|
|
4
|
+
import * as fs from "fs";
|
|
5
|
+
import { PageCrawler } from "./index.js";
|
|
6
|
+
/**
|
|
7
|
+
* 輸出人類可讀報告
|
|
8
|
+
*/
|
|
9
|
+
export function printCrawlReport(result, verbose = false) {
|
|
10
|
+
console.log("=".repeat(60));
|
|
11
|
+
console.log("網頁分析報告");
|
|
12
|
+
console.log("=".repeat(60));
|
|
13
|
+
console.log(`網址: ${result.url}`);
|
|
14
|
+
if (result.title) {
|
|
15
|
+
console.log(`標題: ${result.title}`);
|
|
16
|
+
}
|
|
17
|
+
console.log(`爬取時間: ${result.crawled_at}`);
|
|
18
|
+
console.log(`頁面指紋: ${result.fingerprint.slice(0, 16)}...`);
|
|
19
|
+
// 頁面 MIME type(如果有)
|
|
20
|
+
if (result.pageMimeType) {
|
|
21
|
+
const mime = result.pageMimeType;
|
|
22
|
+
let mimeStr = mime.mimeType;
|
|
23
|
+
if (mime.charset)
|
|
24
|
+
mimeStr += `; charset=${mime.charset}`;
|
|
25
|
+
console.log(`頁面類型: ${mimeStr}`);
|
|
26
|
+
}
|
|
27
|
+
// Benchmark 計時
|
|
28
|
+
if (result.benchmark) {
|
|
29
|
+
const b = result.benchmark;
|
|
30
|
+
let benchmarkStr = `抓取: ${(b.fetch_ms / 1000).toFixed(2)}s, 解析: ${(b.parse_ms / 1000).toFixed(2)}s`;
|
|
31
|
+
if (b.mime_analysis_ms !== undefined) {
|
|
32
|
+
benchmarkStr += `, MIME分析: ${(b.mime_analysis_ms / 1000).toFixed(2)}s`;
|
|
33
|
+
}
|
|
34
|
+
benchmarkStr += `, 總計: ${(b.total_ms / 1000).toFixed(2)}s`;
|
|
35
|
+
console.log(`執行時間: ${benchmarkStr}`);
|
|
36
|
+
}
|
|
37
|
+
console.log();
|
|
38
|
+
console.log("【大小統計】");
|
|
39
|
+
console.log(` 原始 HTML: ${result.size.raw_html_readable}`);
|
|
40
|
+
console.log(` 純文字: ${result.size.text_readable}`);
|
|
41
|
+
console.log(` 壓縮比: ${result.size.compression_ratio} 節省`);
|
|
42
|
+
console.log();
|
|
43
|
+
// 下載檔案統計
|
|
44
|
+
const downloads = result.links.downloads;
|
|
45
|
+
const uniqueCount = downloads.unique_count || downloads.count;
|
|
46
|
+
const originalCount = downloads.original_count || downloads.count;
|
|
47
|
+
const hasDuplicates = originalCount > uniqueCount;
|
|
48
|
+
if (hasDuplicates) {
|
|
49
|
+
console.log(`【可下載檔案】共 ${uniqueCount} 個(原始 ${originalCount} 個,去重 ${originalCount - uniqueCount} 個)`);
|
|
50
|
+
}
|
|
51
|
+
else {
|
|
52
|
+
console.log(`【可下載檔案】共 ${result.links.downloads.count} 個`);
|
|
53
|
+
}
|
|
54
|
+
// 按檔案類別統計(人類可讀)
|
|
55
|
+
if (downloads.by_file_category && Object.keys(downloads.by_file_category).length > 0) {
|
|
56
|
+
console.log(" 按檔案類型統計:");
|
|
57
|
+
const sorted = Object.entries(downloads.by_file_category)
|
|
58
|
+
.sort((a, b) => b[1] - a[1]);
|
|
59
|
+
for (const [category, count] of sorted) {
|
|
60
|
+
console.log(` - ${category}: ${count} 個`);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
else if (Object.keys(result.links.downloads.by_type).length > 0) {
|
|
64
|
+
// 如果沒有 MIME 分析,顯示按副檔名類型統計
|
|
65
|
+
console.log(" 按副檔名統計:");
|
|
66
|
+
for (const [type, count] of Object.entries(result.links.downloads.by_type)
|
|
67
|
+
.sort((a, b) => b[1] - a[1])) {
|
|
68
|
+
console.log(` - ${type.toUpperCase()}: ${count} 個`);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
// 顯示真實 MIME type 統計(如果有且為 verbose 模式)
|
|
72
|
+
if (verbose && downloads.by_real_mime && Object.keys(downloads.by_real_mime).length > 0) {
|
|
73
|
+
console.log(" 按真實 MIME type 統計:");
|
|
74
|
+
const sorted = Object.entries(downloads.by_real_mime)
|
|
75
|
+
.sort((a, b) => b[1] - a[1]);
|
|
76
|
+
for (const [mimeType, count] of sorted) {
|
|
77
|
+
console.log(` - ${mimeType}: ${count} 個`);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
if (verbose && result.links.downloads.items.length > 0) {
|
|
81
|
+
console.log(" 下載清單:");
|
|
82
|
+
const items = result.links.downloads.items.slice(0, 20);
|
|
83
|
+
for (let i = 0; i < items.length; i++) {
|
|
84
|
+
const item = items[i];
|
|
85
|
+
console.log(` ${i + 1}. [${item.type.toUpperCase()}] ${item.text.slice(0, 50)}`);
|
|
86
|
+
if (item.category) {
|
|
87
|
+
console.log(` 分類: ${item.category}`);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
if (result.links.downloads.items.length > 20) {
|
|
91
|
+
console.log(` ... 還有 ${result.links.downloads.items.length - 20} 個`);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
console.log();
|
|
95
|
+
console.log(`【連外連結】共 ${result.links.external.count} 個`);
|
|
96
|
+
if (Object.keys(result.links.external.domains).length > 0) {
|
|
97
|
+
const sorted = Object.entries(result.links.external.domains)
|
|
98
|
+
.sort((a, b) => b[1] - a[1])
|
|
99
|
+
.slice(0, 10);
|
|
100
|
+
for (const [domain, count] of sorted) {
|
|
101
|
+
console.log(` - ${domain}: ${count} 個`);
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
// MIME type 統計(如果有)
|
|
105
|
+
if (result.links.external.by_mime_type && Object.keys(result.links.external.by_mime_type).length > 0) {
|
|
106
|
+
console.log(" 按 MIME type 統計:");
|
|
107
|
+
const mimeTypeSorted = Object.entries(result.links.external.by_mime_type)
|
|
108
|
+
.sort((a, b) => b[1] - a[1]);
|
|
109
|
+
for (const [mimeType, count] of mimeTypeSorted) {
|
|
110
|
+
console.log(` - ${mimeType}: ${count} 個`);
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
console.log();
|
|
114
|
+
console.log(`【內部連結】共 ${result.links.internal.count} 個`);
|
|
115
|
+
console.log();
|
|
116
|
+
if (result.content.headings.length > 0) {
|
|
117
|
+
console.log(`【頁面結構】共 ${result.content.headings.length} 個標題`);
|
|
118
|
+
if (verbose) {
|
|
119
|
+
for (const heading of result.content.headings.slice(0, 10)) {
|
|
120
|
+
console.log(` - ${heading.slice(0, 60)}`);
|
|
121
|
+
}
|
|
122
|
+
if (result.content.headings.length > 10) {
|
|
123
|
+
console.log(` ... 還有 ${result.content.headings.length - 10} 個`);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
console.log();
|
|
127
|
+
}
|
|
128
|
+
if (result.content.forms && result.content.forms.length > 0) {
|
|
129
|
+
console.log(`【表單】共 ${result.content.forms.length} 個`);
|
|
130
|
+
if (verbose) {
|
|
131
|
+
for (const form of result.content.forms) {
|
|
132
|
+
console.log(` - ${form.method} ${form.action || "(無 action)"}`);
|
|
133
|
+
console.log(` 欄位: ${form.fields.join(", ")}`);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
console.log();
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
/**
 * Run the `crawl` command: fetch the page, analyse it, optionally write the
 * requested artefacts to disk, then print the result.
 *
 * Exits the process with status 1 when the fetch reports failure. "Saved"
 * confirmations go to stderr so stdout carries only the JSON or the report.
 *
 * @param url     Page to crawl.
 * @param options Parsed CLI options (json, saveHtml, saveText, saveDownloads,
 *                skipMime, verbose, quiet).
 */
export async function runCrawlCommand(url, options) {
    const crawler = new PageCrawler(url);
    const fetched = await crawler.fetch();
    if (!fetched) {
        process.exit(1);
    }
    // MIME type analysis runs by default; --skip-mime selects the plain analysis.
    let result;
    if (options.skipMime) {
        result = crawler.analyze();
    }
    else {
        result = await crawler.analyzeWithMimeTypes();
    }
    // Each save step: target file, lazy content producer, confirmation message.
    const saveSteps = [
        {
            file: options.saveHtml,
            produce: () => crawler.getRawHtml(),
            notice: (file) => `原始 HTML 已儲存: ${file}`,
        },
        {
            file: options.saveText,
            produce: () => crawler.getTextContent(),
            notice: (file) => `純文字已儲存: ${file}`,
        },
        {
            file: options.saveDownloads,
            produce: () => crawler.getDownloadList(),
            notice: (file) => `下載清單已儲存: ${file}`,
        },
    ];
    for (const step of saveSteps) {
        if (!step.file) {
            continue;
        }
        fs.writeFileSync(step.file, step.produce(), "utf-8");
        if (!options.quiet) {
            console.error(step.notice(step.file));
        }
    }
    // Output: machine-readable JSON or the human-readable report.
    if (options.json) {
        console.log(JSON.stringify(result, null, 2));
        return;
    }
    printCrawlReport(result, options.verbose);
}
|
|
179
|
+
//# sourceMappingURL=cli.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/crawler/cli.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,WAAW,EAA2B,MAAM,YAAY,CAAC;AAYlE;;GAEG;AACH,MAAM,UAAU,gBAAgB,CAC9B,MAAmB,EACnB,UAAmB,KAAK;IAExB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC;IAC1C,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC;IAE3D,oBAAoB;IACpB,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;QACxB,MAAM,IAAI,GAAG,MAAM,CAAC,YAAY,CAAC;QACjC,IAAI,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC;QAC5B,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,aAAa,IAAI,CAAC,OAAO,EAAE,CAAC;QACzD,OAAO,CAAC,GAAG,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;IAClC,CAAC;IAED,eAAe;IACf,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACrB,MAAM,CAAC,GAAG,MAAM,CAAC,SAAS,CAAC;QAC3B,IAAI,YAAY,GAAG,OAAO,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QACpG,IAAI,CAAC,CAAC,gBAAgB,KAAK,SAAS,EAAE,CAAC;YACrC,YAAY,IAAI,aAAa,CAAC,CAAC,CAAC,gBAAgB,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QACzE,CAAC;QACD,YAAY,IAAI,SAAS,CAAC,CAAC,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC;QAC3D,OAAO,CAAC,GAAG,CAAC,SAAS,YAAY,EAAE,CAAC,CAAC;IACvC,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAC3D,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,iBAAiB,KAAK,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,SAAS;IACT,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,SAAgB,CAAC;IAChD,MAAM,WAAW,GAAG,SAAS,CAAC,YAAY,IAAI,SAAS,CAAC
,KAAK,CAAC;IAC9D,MAAM,aAAa,GAAG,SAAS,CAAC,cAAc,IAAI,SAAS,CAAC,KAAK,CAAC;IAClE,MAAM,aAAa,GAAG,aAAa,GAAG,WAAW,CAAC;IAElD,IAAI,aAAa,EAAE,CAAC;QAClB,OAAO,CAAC,GAAG,CAAC,YAAY,WAAW,SAAS,aAAa,SAAS,aAAa,GAAG,WAAW,KAAK,CAAC,CAAC;IACtG,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,IAAI,CAAC,CAAC;IAC5D,CAAC;IAED,gBAAgB;IAChB,IAAI,SAAS,CAAC,gBAAgB,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrF,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC;QAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,gBAA0C,CAAC;aAChF,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;SAAM,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClE,0BAA0B;QAC1B,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC;QACzB,KAAK,MAAM,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,OAAO,CAAC;aACvE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,GAAG,CAAC,SAAS,IAAI,CAAC,WAAW,EAAE,KAAK,KAAK,IAAI,CAAC,CAAC;QACzD,CAAC;IACH,CAAC;IAED,sCAAsC;IACtC,IAAI,OAAO,IAAI,SAAS,CAAC,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxF,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,YAAsC,CAAC;aAC5E,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACvC,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IACD,IAAI,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC;QACvB,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACxD,KAAK,IAAI,CAAC,GAAG,CAAC,
EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,OAAO,CAAC,GAAG,CACT,OAAO,CAAC,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CACvE,CAAC;YACF,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBAClB,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC;YAC7C,CAAC;QACH,CAAC;QACD,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC7C,OAAO,CAAC,GAAG,CACT,cAAc,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,IAAI,CAC3D,CAAC;QACJ,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;aACzD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,KAAK,KAAK,IAAI,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IAED,oBAAoB;IACpB,IAAI,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACrG,OAAO,CAAC,GAAG,CAAC,mBAAmB,CAAC,CAAC;QACjC,MAAM,cAAc,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC;aACtE,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,KAAK,MAAM,CAAC,QAAQ,EAAE,KAAK,CAAC,IAAI,cAAc,EAAE,CAAC;YAC/C,OAAO,CAAC,GAAG,CAAC,SAAS,QAAQ,KAAK,KAAK,IAAI,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvC,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,MAAM,CAAC,CAAC;QAC7D,IAAI,OAAO,EAAE,CAAC;YACZ,KAAK,MAAM,O
AAO,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC;gBAC3D,OAAO,CAAC,GAAG,CAAC,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;YAC7C,CAAC;YACD,IAAI,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;gBACxC,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,MAAM,GAAG,EAAE,IAAI,CAAC,CAAC;YACnE,CAAC;QACH,CAAC;QACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5D,OAAO,CAAC,GAAG,CAAC,SAAS,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC;QACtD,IAAI,OAAO,EAAE,CAAC;YACZ,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;gBACxC,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,IAAI,YAAY,EAAE,CAAC,CAAC;gBACjE,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACnD,CAAC;QACH,CAAC;QACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,GAAW,EACX,OAAwB;IAExB,MAAM,OAAO,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC;IAErC,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;IACtC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,mCAAmC;IACnC,MAAM,MAAM,GAAG,OAAO,CAAC,QAAQ;QAC7B,CAAC,CAAC,OAAO,CAAC,OAAO,EAAE;QACnB,CAAC,CAAC,MAAM,OAAO,CAAC,oBAAoB,EAAE,CAAC;IAEzC,OAAO;IACP,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,gBAAgB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,OAAO,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;QACtE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,WAAW,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;QAC1B,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,aAAa,EAAE,OAAO,CAAC,eAAe,EAAE,EAAE,OAAO,CAAC,CAAC;QAC5E,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,YAAY,OAAO,CAAC,aAAa,EAA
E,CAAC,CAAC;QACrD,CAAC;IACH,CAAC;IAED,KAAK;IACL,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC;SAAM,CAAC;QACN,gBAAgB,CAAC,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 單頁完整爬取模組
|
|
3
|
+
*
|
|
4
|
+
* 滿足五個動機:
|
|
5
|
+
* 1. AI 知識庫建置 - 純文字 + metadata
|
|
6
|
+
* 2. 完整離線歸檔 - 下載清單 + 檔案分類
|
|
7
|
+
* 3. 法規變更追蹤 - 頁面指紋 + 時間戳記
|
|
8
|
+
* 4. 申請流程自動化 - 結構解析
|
|
9
|
+
* 5. 跨文件關聯分析 - 連結關係
|
|
10
|
+
*
|
|
11
|
+
* 支援 HTTPS Proxy(沙盒環境)
|
|
12
|
+
*/
|
|
13
|
+
import { MimeTypeInfo } from "../lib/http-client.js";
|
|
14
|
+
export interface CrawlOptions {
|
|
15
|
+
timeout?: number;
|
|
16
|
+
userAgent?: string;
|
|
17
|
+
}
|
|
18
|
+
export interface DownloadLinkItem {
|
|
19
|
+
url: string;
|
|
20
|
+
text: string;
|
|
21
|
+
type: string;
|
|
22
|
+
category?: string;
|
|
23
|
+
mimeType?: MimeTypeInfo;
|
|
24
|
+
fileCategory?: string;
|
|
25
|
+
}
|
|
26
|
+
export interface ExternalLinkItem {
|
|
27
|
+
url: string;
|
|
28
|
+
text: string;
|
|
29
|
+
domain: string;
|
|
30
|
+
mimeType?: MimeTypeInfo;
|
|
31
|
+
}
|
|
32
|
+
export type { MimeTypeInfo };
|
|
33
|
+
export interface LinkItem {
|
|
34
|
+
url: string;
|
|
35
|
+
text: string;
|
|
36
|
+
}
|
|
37
|
+
export interface FormInfo {
|
|
38
|
+
action: string;
|
|
39
|
+
method: string;
|
|
40
|
+
fields: string[];
|
|
41
|
+
}
|
|
42
|
+
export interface BenchmarkInfo {
|
|
43
|
+
fetch_ms: number;
|
|
44
|
+
parse_ms: number;
|
|
45
|
+
mime_analysis_ms?: number;
|
|
46
|
+
total_ms: number;
|
|
47
|
+
}
|
|
48
|
+
export interface CrawlResult {
|
|
49
|
+
url: string;
|
|
50
|
+
title: string;
|
|
51
|
+
description: string;
|
|
52
|
+
crawled_at: string;
|
|
53
|
+
fingerprint: string;
|
|
54
|
+
pageMimeType?: MimeTypeInfo;
|
|
55
|
+
benchmark?: BenchmarkInfo;
|
|
56
|
+
size: {
|
|
57
|
+
raw_html_bytes: number;
|
|
58
|
+
raw_html_readable: string;
|
|
59
|
+
text_bytes: number;
|
|
60
|
+
text_readable: string;
|
|
61
|
+
compression_ratio: string;
|
|
62
|
+
};
|
|
63
|
+
links: {
|
|
64
|
+
downloads: {
|
|
65
|
+
count: number;
|
|
66
|
+
unique_count?: number;
|
|
67
|
+
original_count?: number;
|
|
68
|
+
by_type: Record<string, number>;
|
|
69
|
+
by_real_mime?: Record<string, number>;
|
|
70
|
+
by_file_category?: Record<string, number>;
|
|
71
|
+
items: DownloadLinkItem[];
|
|
72
|
+
};
|
|
73
|
+
external: {
|
|
74
|
+
count: number;
|
|
75
|
+
domains: Record<string, number>;
|
|
76
|
+
by_mime_type?: Record<string, number>;
|
|
77
|
+
items: ExternalLinkItem[];
|
|
78
|
+
};
|
|
79
|
+
internal: {
|
|
80
|
+
count: number;
|
|
81
|
+
items: LinkItem[];
|
|
82
|
+
};
|
|
83
|
+
};
|
|
84
|
+
content: {
|
|
85
|
+
headings: string[];
|
|
86
|
+
sections: string[];
|
|
87
|
+
forms?: FormInfo[];
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
export declare function formatSize(bytes: number): string;
|
|
91
|
+
/**
|
|
92
|
+
* 格式化為台灣時間 (UTC+8)
|
|
93
|
+
*/
|
|
94
|
+
export declare function formatTaiwanTime(date: Date): string;
|
|
95
|
+
export declare function extractText(html: string): string;
|
|
96
|
+
/**
|
|
97
|
+
* 下載連結分析器
|
|
98
|
+
* 用於判斷 URL 是否為下載連結,並分類
|
|
99
|
+
*/
|
|
100
|
+
export declare class DownloadAnalyzer {
|
|
101
|
+
analyze(url: string, linkText: string): {
|
|
102
|
+
isDownload: boolean;
|
|
103
|
+
type: string | null;
|
|
104
|
+
category: string | null;
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
export declare class PageCrawler {
|
|
108
|
+
private url;
|
|
109
|
+
private baseDomain;
|
|
110
|
+
private html;
|
|
111
|
+
private crawledAt;
|
|
112
|
+
private pageMimeType?;
|
|
113
|
+
private responseHeaders?;
|
|
114
|
+
private fetchStartTime;
|
|
115
|
+
private fetchEndTime;
|
|
116
|
+
constructor(url: string);
|
|
117
|
+
getFetchDuration(): number;
|
|
118
|
+
fetch(options?: CrawlOptions): Promise<boolean>;
|
|
119
|
+
private detectPageMimeType;
|
|
120
|
+
getPageMimeType(): MimeTypeInfo | undefined;
|
|
121
|
+
getRawHtml(): string;
|
|
122
|
+
getTextContent(): string;
|
|
123
|
+
getFingerprint(): string;
|
|
124
|
+
private downloadStats;
|
|
125
|
+
getDownloadableLinks(): DownloadLinkItem[];
|
|
126
|
+
getDownloadStats(): {
|
|
127
|
+
originalCount: number;
|
|
128
|
+
uniqueCount: number;
|
|
129
|
+
};
|
|
130
|
+
getExternalLinks(): ExternalLinkItem[];
|
|
131
|
+
getInternalLinks(): LinkItem[];
|
|
132
|
+
analyze(): CrawlResult;
|
|
133
|
+
/**
|
|
134
|
+
* 分析並獲取外部連結和下載連結的 MIME type(需要額外網路請求)
|
|
135
|
+
*/
|
|
136
|
+
analyzeWithMimeTypes(): Promise<CrawlResult>;
|
|
137
|
+
/**
|
|
138
|
+
* 產生下載清單(每行一個 URL)
|
|
139
|
+
*/
|
|
140
|
+
getDownloadList(): string;
|
|
141
|
+
}
|
|
142
|
+
/**
|
|
143
|
+
* 快速爬取單頁
|
|
144
|
+
*/
|
|
145
|
+
export declare function crawlPage(url: string, options?: CrawlOptions): Promise<CrawlResult | null>;
|
|
146
|
+
//# sourceMappingURL=index.d.ts.map
|