botrun-crawler-2 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +126 -0
- package/dist/cli.d.ts +10 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +155 -0
- package/dist/cli.js.map +1 -0
- package/dist/crawler/cli.d.ts +19 -0
- package/dist/crawler/cli.d.ts.map +1 -0
- package/dist/crawler/cli.js +179 -0
- package/dist/crawler/cli.js.map +1 -0
- package/dist/crawler/index.d.ts +146 -0
- package/dist/crawler/index.d.ts.map +1 -0
- package/dist/crawler/index.js +670 -0
- package/dist/crawler/index.js.map +1 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/agent.d.ts +34 -0
- package/dist/lib/agent.d.ts.map +1 -0
- package/dist/lib/agent.js +73 -0
- package/dist/lib/agent.js.map +1 -0
- package/dist/lib/cache.d.ts +49 -0
- package/dist/lib/cache.d.ts.map +1 -0
- package/dist/lib/cache.js +141 -0
- package/dist/lib/cache.js.map +1 -0
- package/dist/lib/filename-decoder.d.ts +62 -0
- package/dist/lib/filename-decoder.d.ts.map +1 -0
- package/dist/lib/filename-decoder.js +229 -0
- package/dist/lib/filename-decoder.js.map +1 -0
- package/dist/lib/http-client.d.ts +86 -0
- package/dist/lib/http-client.d.ts.map +1 -0
- package/dist/lib/http-client.js +373 -0
- package/dist/lib/http-client.js.map +1 -0
- package/dist/lib/index.d.ts +15 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +19 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/logger.d.ts +41 -0
- package/dist/lib/logger.d.ts.map +1 -0
- package/dist/lib/logger.js +122 -0
- package/dist/lib/logger.js.map +1 -0
- package/dist/lib/scene-detector.d.ts +92 -0
- package/dist/lib/scene-detector.d.ts.map +1 -0
- package/dist/lib/scene-detector.js +297 -0
- package/dist/lib/scene-detector.js.map +1 -0
- package/dist/processors/audio.d.ts +20 -0
- package/dist/processors/audio.d.ts.map +1 -0
- package/dist/processors/audio.js +110 -0
- package/dist/processors/audio.js.map +1 -0
- package/dist/processors/base.d.ts +53 -0
- package/dist/processors/base.d.ts.map +1 -0
- package/dist/processors/base.js +194 -0
- package/dist/processors/base.js.map +1 -0
- package/dist/processors/data.d.ts +48 -0
- package/dist/processors/data.d.ts.map +1 -0
- package/dist/processors/data.js +206 -0
- package/dist/processors/data.js.map +1 -0
- package/dist/processors/document.d.ts +20 -0
- package/dist/processors/document.d.ts.map +1 -0
- package/dist/processors/document.js +137 -0
- package/dist/processors/document.js.map +1 -0
- package/dist/processors/image.d.ts +20 -0
- package/dist/processors/image.d.ts.map +1 -0
- package/dist/processors/image.js +92 -0
- package/dist/processors/image.js.map +1 -0
- package/dist/processors/index.d.ts +53 -0
- package/dist/processors/index.d.ts.map +1 -0
- package/dist/processors/index.js +177 -0
- package/dist/processors/index.js.map +1 -0
- package/dist/processors/text.d.ts +44 -0
- package/dist/processors/text.d.ts.map +1 -0
- package/dist/processors/text.js +262 -0
- package/dist/processors/text.js.map +1 -0
- package/dist/processors/video.d.ts +20 -0
- package/dist/processors/video.d.ts.map +1 -0
- package/dist/processors/video.js +93 -0
- package/dist/processors/video.js.map +1 -0
- package/dist/scraper/cli.d.ts +23 -0
- package/dist/scraper/cli.d.ts.map +1 -0
- package/dist/scraper/cli.js +118 -0
- package/dist/scraper/cli.js.map +1 -0
- package/dist/scraper/index.d.ts +120 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +372 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/types/index.d.ts +123 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +40 -0
- package/dist/types/index.js.map +1 -0
- package/package.json +108 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 耳(Ear)- 影片處理器
|
|
3
|
+
*
|
|
4
|
+
* 支援格式:.mov, .mp4, .webm, .avi, .mkv, .m4v
|
|
5
|
+
* 功能:音軌轉文字、關鍵幀描述、字幕萃取、場景分段
|
|
6
|
+
*
|
|
7
|
+
* @version 1.0.0
|
|
8
|
+
*/
|
|
9
|
+
import { ProcessResult, ProcessOptions } from "../types/index.js";
|
|
10
|
+
import { BaseProcessor } from "./base.js";
|
|
11
|
+
/**
 * Type declaration of the video processor, a BaseProcessor subclass.
 */
export declare class VideoProcessor extends BaseProcessor {
|
|
12
|
+
/** Processor name, fixed to "VideoProcessor". */
readonly name = "VideoProcessor";
|
|
13
|
+
/** Processor version string. */
readonly version = "1.0.0";
|
|
14
|
+
/** File extensions this processor declares support for. */
readonly supportedExtensions: string[];
|
|
15
|
+
/**
|
|
16
|
+
* Performs the actual video processing.
* @param filePath - path of the video file to process
* @param options - processing options
* @returns a promise resolving to the processing result
|
|
17
|
+
*/
|
|
18
|
+
protected doProcess(filePath: string, options: ProcessOptions): Promise<ProcessResult>;
|
|
19
|
+
}
|
|
20
|
+
//# sourceMappingURL=video.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"video.d.ts","sourceRoot":"","sources":["../../src/processors/video.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,EAAE,aAAa,EAAE,cAAc,EAAqB,MAAM,mBAAmB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE1C,qBAAa,cAAe,SAAQ,aAAa;IAC/C,QAAQ,CAAC,IAAI,oBAAoB;IACjC,QAAQ,CAAC,OAAO,WAAW;IAC3B,QAAQ,CAAC,mBAAmB,WAA2B;IAEvD;;OAEG;cACa,SAAS,CACvB,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,cAAc,GACtB,OAAO,CAAC,aAAa,CAAC;CAmF1B"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 耳(Ear)- 影片處理器
|
|
3
|
+
*
|
|
4
|
+
* 支援格式:.mov, .mp4, .webm, .avi, .mkv, .m4v
|
|
5
|
+
* 功能:音軌轉文字、關鍵幀描述、字幕萃取、場景分段
|
|
6
|
+
*
|
|
7
|
+
* @version 1.0.0
|
|
8
|
+
*/
|
|
9
|
+
import * as path from "path";
|
|
10
|
+
import * as fs from "fs";
|
|
11
|
+
import { FORMAT_EXTENSIONS } from "../types/index.js";
|
|
12
|
+
import { BaseProcessor } from "./base.js";
|
|
13
|
+
/**
 * Video processor built on BaseProcessor. It hands the video file to the
 * agent together with a fixed analysis prompt and stores the reply as a
 * Markdown file that starts with a front-matter header.
 */
export class VideoProcessor extends BaseProcessor {
|
|
14
|
+
// Processor name; used as the logger child name and written to the front matter.
name = "VideoProcessor";
|
|
15
|
+
// Processor version; written to the front matter.
version = "1.0.0";
|
|
16
|
+
// Supported extensions, taken from FORMAT_EXTENSIONS.video.
supportedExtensions = FORMAT_EXTENSIONS.video;
|
|
17
|
+
/**
|
|
18
|
+
* Performs the actual video processing: asks the agent to analyse the file,
* writes the Markdown result to the output path and returns a result object.
* Errors raised inside the try block are not rethrown; they are reported
* through `success: false` and the `error` field.
* @param filePath - path of the video file
* @param options - processing options, forwarded to getOutputPath
* @returns result object with success, outputPath, content and metadata (plus error on failure)
|
|
19
|
+
*/
|
|
20
|
+
async doProcess(filePath, options) {
|
|
21
|
+
// Start time, handed to createMetadata below.
const startTime = Date.now();
|
|
22
|
+
const outputPath = this.getOutputPath(filePath, options);
|
|
23
|
+
try {
|
|
24
|
+
this.logger.child(this.name).info("處理影片中...");
|
|
25
|
+
// Video processing usually needs the audio track to be extracted first
|
|
26
|
+
// Only the basic framework is implemented here; ffmpeg can be integrated later
|
|
27
|
+
const { content, costUsd } = await this.agent.processWithAgent(filePath, `請分析此影片並輸出以下內容:
|
|
28
|
+
|
|
29
|
+
## 輸出格式
|
|
30
|
+
|
|
31
|
+
### 1. 基本資訊
|
|
32
|
+
- 影片長度
|
|
33
|
+
- 解析度
|
|
34
|
+
- 格式
|
|
35
|
+
|
|
36
|
+
### 2. 音軌轉文字
|
|
37
|
+
將影片中的對話和旁白轉換為文字。
|
|
38
|
+
格式:[時間戳] 說話者:內容
|
|
39
|
+
|
|
40
|
+
### 3. 場景分段
|
|
41
|
+
將影片分成有意義的場景,每個場景包含:
|
|
42
|
+
- 時間範圍
|
|
43
|
+
- 場景描述
|
|
44
|
+
- 主要內容
|
|
45
|
+
|
|
46
|
+
### 4. 關鍵幀描述
|
|
47
|
+
描述影片中的重要視覺元素和場景。
|
|
48
|
+
|
|
49
|
+
### 5. 字幕萃取
|
|
50
|
+
如果影片有內嵌字幕,請萃取出來。
|
|
51
|
+
|
|
52
|
+
### 6. 摘要
|
|
53
|
+
200 字以內的影片內容摘要。`);
|
|
54
|
+
// Save the output
|
|
55
|
+
const outputDir = path.dirname(outputPath);
|
|
56
|
+
// Create the output directory (including parents) when it does not exist yet.
if (!fs.existsSync(outputDir)) {
|
|
57
|
+
fs.mkdirSync(outputDir, { recursive: true });
|
|
58
|
+
}
|
|
59
|
+
const timestamp = this.getTimestamp();
|
|
60
|
+
// File name without its extension; becomes the Markdown heading.
const basename = path.basename(filePath, path.extname(filePath));
|
|
61
|
+
const markdown = `---
|
|
62
|
+
source: ${filePath}
|
|
63
|
+
processed_at: ${timestamp}
|
|
64
|
+
processor: ${this.name}
|
|
65
|
+
version: ${this.version}
|
|
66
|
+
type: video
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
# ${basename}
|
|
70
|
+
|
|
71
|
+
${content}
|
|
72
|
+
`;
|
|
73
|
+
fs.writeFileSync(outputPath, markdown);
|
|
74
|
+
return {
|
|
75
|
+
success: true,
|
|
76
|
+
outputPath,
|
|
77
|
+
content: markdown,
|
|
78
|
+
// NOTE(review): the fourth argument looks like a model identifier — confirm against BaseProcessor.createMetadata.
metadata: await this.createMetadata(filePath, startTime, costUsd, "claude-haiku-4-5"),
|
|
79
|
+
};
|
|
80
|
+
}
|
|
81
|
+
catch (error) {
|
|
82
|
+
// Non-Error throwables are stringified so the result always carries a message.
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
83
|
+
return {
|
|
84
|
+
success: false,
|
|
85
|
+
outputPath: "",
|
|
86
|
+
content: "",
|
|
87
|
+
// 0 is passed in the position that receives costUsd on the success path.
metadata: await this.createMetadata(filePath, startTime, 0),
|
|
88
|
+
error: errorMessage,
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
//# sourceMappingURL=video.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"video.js","sourceRoot":"","sources":["../../src/processors/video.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAiC,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE1C,MAAM,OAAO,cAAe,SAAQ,aAAa;IACtC,IAAI,GAAG,gBAAgB,CAAC;IACxB,OAAO,GAAG,OAAO,CAAC;IAClB,mBAAmB,GAAG,iBAAiB,CAAC,KAAK,CAAC;IAEvD;;OAEG;IACO,KAAK,CAAC,SAAS,CACvB,QAAgB,EAChB,OAAuB;QAEvB,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAEzD,IAAI,CAAC;YACH,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAE9C,gBAAgB;YAChB,yBAAyB;YAEzB,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAC5D,QAAQ,EACR;;;;;;;;;;;;;;;;;;;;;;;;;;wBA0BgB,CACjB,CAAC;YAEF,OAAO;YACP,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;YAC3C,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC9B,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAC/C,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;YACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;YAEjE,MAAM,QAAQ,GAAG;UACb,QAAQ;gBACF,SAAS;aACZ,IAAI,CAAC,IAAI;WACX,IAAI,CAAC,OAAO;;;;IAInB,QAAQ;;EAEV,OAAO;CACR,CAAC;YAEI,EAAE,CAAC,aAAa,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;YAEvC,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,UAAU;gBACV,OAAO,EAAE,QAAQ;gBACjB,QAAQ,EAAE,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,kBAAkB,CAAC;aACtF,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAE5E,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU,EAAE,EAAE;gBACd,OAAO,EAAE,EAAE;gBACX,QAAQ,EAAE,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;gBAC3D,KAAK,EAAE,YAAY;aACpB,CAAC;QACJ,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { ScrapeResult } from "./index.js";
|
|
2
|
+
/**
 * Options read by runScrapeCommand.
 */
export interface ScrapeCliOptions {
|
|
3
|
+
/** When truthy, the analysis result is printed as JSON instead of the report. */
json?: boolean;
|
|
4
|
+
/** File path to which the raw HTML is written, when set. */
saveHtml?: string;
|
|
5
|
+
/** File path to which the extracted text is written, when set. */
saveText?: string;
|
|
6
|
+
/** Passed to printReport as its `verbose` argument. */
verbose?: boolean;
|
|
7
|
+
/** When truthy, the "saved" notices are not printed. */
quiet?: boolean;
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Prints a human-readable report.
* @param result - analysis result to print
* @param verbose - optional flag selecting the more detailed output
|
|
11
|
+
*/
|
|
12
|
+
export declare function printReport(result: ScrapeResult, verbose?: boolean): void;
|
|
13
|
+
/**
|
|
14
|
+
* Runs the scrape command.
* @param url - URL of the page to scrape
* @param options - CLI options for saving and output
* @returns a promise that resolves when the command has finished
|
|
15
|
+
*/
|
|
16
|
+
export declare function runScrapeCommand(url: string, options: ScrapeCliOptions): Promise<void>;
|
|
17
|
+
/**
|
|
18
|
+
* Scrapes multiple URLs in one batch.
* @param urls - URLs to scrape
* @param outputDir - directory that receives the output files
* @param options - optional settings; only `quiet` is declared
* @returns a promise that resolves when the batch has finished
|
|
19
|
+
*/
|
|
20
|
+
export declare function runScrapeBatchCommand(urls: string[], outputDir: string, options?: {
|
|
21
|
+
quiet?: boolean;
|
|
22
|
+
}): Promise<void>;
|
|
23
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/scraper/cli.ts"],"names":[],"mappings":"AAIA,OAAO,EAAmB,YAAY,EAAc,MAAM,YAAY,CAAC;AAEvE,MAAM,WAAW,gBAAgB;IAC/B,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,YAAY,EAAE,OAAO,GAAE,OAAe,GAAG,IAAI,CA0ChF;AAED;;GAEG;AACH,wBAAsB,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,CA+B5F;AAED;;GAEG;AACH,wBAAsB,qBAAqB,CACzC,IAAI,EAAE,MAAM,EAAE,EACd,SAAS,EAAE,MAAM,EACjB,OAAO,GAAE;IAAE,KAAK,CAAC,EAAE,OAAO,CAAA;CAAO,GAChC,OAAO,CAAC,IAAI,CAAC,CAuCf"}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 網頁抓取 CLI 模組
|
|
3
|
+
*/
|
|
4
|
+
import * as fs from "fs";
|
|
5
|
+
import { WebPageAnalyzer } from "./index.js";
|
|
6
|
+
/**
|
|
7
|
+
* Prints a human-readable report of an analysis result to stdout.
* Sections, in order: header with URL (and title when present), size
* statistics, downloadable files, external links, internal links.
* @param result - analysis result; reads url, title, size.* and links.*
* @param verbose - when true, the individual download links are listed too (default false)
|
|
8
|
+
*/
|
|
9
|
+
export function printReport(result, verbose = false) {
|
|
10
|
+
console.log("=".repeat(60));
|
|
11
|
+
console.log("網頁分析報告");
|
|
12
|
+
console.log("=".repeat(60));
|
|
13
|
+
console.log(`網址: ${result.url}`);
|
|
14
|
+
// The title line is omitted when the title is empty.
if (result.title) {
|
|
15
|
+
console.log(`標題: ${result.title}`);
|
|
16
|
+
}
|
|
17
|
+
console.log();
|
|
18
|
+
console.log("【大小統計】");
|
|
19
|
+
console.log(` 原始 HTML: ${result.size.raw_html_readable}`);
|
|
20
|
+
console.log(` 純文字: ${result.size.text_readable}`);
|
|
21
|
+
console.log(` 壓縮比: ${result.size.compression_ratio} 節省`);
|
|
22
|
+
console.log();
|
|
23
|
+
console.log(`【可下載檔案】共 ${result.links.downloads.count} 個`);
|
|
24
|
+
// Individual download links are listed only in verbose mode.
if (verbose && result.links.downloads.items.length > 0) {
|
|
25
|
+
// List at most the first 20 entries.
const items = result.links.downloads.items.slice(0, 20);
|
|
26
|
+
for (let i = 0; i < items.length; i++) {
|
|
27
|
+
const item = items[i];
|
|
28
|
+
// 1-based index, upper-cased type, link text cut to 50 characters.
console.log(` ${i + 1}. [${item.type.toUpperCase()}] ${item.text.slice(0, 50)}`);
|
|
29
|
+
}
|
|
30
|
+
// Tell the reader how many entries were left out.
if (result.links.downloads.items.length > 20) {
|
|
31
|
+
console.log(` ... 還有 ${result.links.downloads.items.length - 20} 個`);
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
console.log();
|
|
35
|
+
console.log(`【連外連結】共 ${result.links.external.count} 個`);
|
|
36
|
+
if (Object.keys(result.links.external.domains).length > 0) {
|
|
37
|
+
// Domains ordered by descending count, limited to the top 10.
const sorted = Object.entries(result.links.external.domains)
|
|
38
|
+
.sort((a, b) => b[1] - a[1])
|
|
39
|
+
.slice(0, 10);
|
|
40
|
+
for (const [domain, count] of sorted) {
|
|
41
|
+
console.log(` - ${domain}: ${count} 個`);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
console.log();
|
|
45
|
+
console.log(`【內部連結】共 ${result.links.internal.count} 個`);
|
|
46
|
+
console.log();
|
|
47
|
+
}
|
|
48
|
+
/**
 * Runs the `scrape` command for a single URL.
 *
 * The page is fetched and analysed; the process exits with status 1 when the
 * fetch reports failure. Optional copies of the raw HTML and the extracted
 * text are written to the paths given in `options`, and the analysis is then
 * printed either as JSON or as the human-readable report.
 *
 * @param url - URL of the page to scrape
 * @param options - CLI options: `saveHtml` / `saveText` output paths, `quiet`
 *   to suppress the "saved" notices (which go to stderr), `json` to print
 *   JSON, `verbose` forwarded to printReport
 * @returns a promise that resolves once the output has been printed
 */
export async function runScrapeCommand(url, options) {
    const page = new WebPageAnalyzer(url);
    const fetched = await page.fetch();
    if (!fetched) {
        process.exit(1);
    }
    const report = page.analyze();
    // Optional on-disk copies: HTML first, then text. The content getters are
    // only invoked when the matching path option is set.
    const outputs = [
        {
            target: options.saveHtml,
            read: () => page.getRawHtml(),
            notice: (savedTo) => `原始 HTML 已儲存: ${savedTo}`,
        },
        {
            target: options.saveText,
            read: () => page.getTextContent(),
            notice: (savedTo) => `純文字已儲存: ${savedTo}`,
        },
    ];
    for (const { target, read, notice } of outputs) {
        if (!target) {
            continue;
        }
        fs.writeFileSync(target, read(), "utf-8");
        if (!options.quiet) {
            console.error(notice(target));
        }
    }
    // Emit the result: machine-readable JSON or the formatted report.
    if (options.json) {
        console.log(JSON.stringify(report, null, 2));
        return;
    }
    printReport(report, options.verbose);
}
|
|
79
|
+
/**
 * Fetches every URL in a list and stores three files per page in `outputDir`:
 * the raw HTML (`.html`), the extracted text (`.txt`) and the analysis result
 * as JSON (`.json`).
 *
 * Entries that are empty after trimming, or that start with "#", are skipped.
 * The file name is the URL with every non-alphanumeric character replaced by
 * "_", cut to 50 characters. Different URLs can collapse to the same name, so
 * a name already written in this batch gets a numeric suffix (`_2`, `_3`, ...)
 * instead of overwriting the earlier page's files.
 *
 * A failed or throwing fetch only produces a warning; the batch continues.
 *
 * @param urls - URLs to fetch; blank entries and "#" comment entries are ignored
 * @param outputDir - directory for the output files; created if missing
 * @param options - `quiet: true` suppresses the progress and summary messages
 * @returns a promise that resolves once every entry has been handled
 */
export async function runScrapeBatchCommand(urls, outputDir, options = {}) {
    fs.mkdirSync(outputDir, { recursive: true });
    // File names already written in this batch, used to keep outputs distinct.
    const writtenNames = new Set();
    let count = 0;
    for (const url of urls) {
        // Skip blank lines and comments.
        const trimmed = url.trim();
        if (!trimmed || trimmed.startsWith("#"))
            continue;
        count++;
        const baseName = trimmed
            .replace(/[^a-zA-Z0-9]/g, "_")
            .slice(0, 50);
        if (!options.quiet) {
            console.log(`[${count}] 抓取: ${trimmed}`);
        }
        try {
            const analyzer = new WebPageAnalyzer(trimmed);
            const success = await analyzer.fetch();
            if (success) {
                const result = analyzer.analyze();
                // A name is reserved only when files are actually written, so
                // URLs that do not collide keep their plain name.
                let filename = baseName;
                for (let suffix = 2; writtenNames.has(filename); suffix++) {
                    filename = `${baseName}_${suffix}`;
                }
                writtenNames.add(filename);
                fs.writeFileSync(`${outputDir}/${filename}.html`, analyzer.getRawHtml(), "utf-8");
                fs.writeFileSync(`${outputDir}/${filename}.txt`, analyzer.getTextContent(), "utf-8");
                fs.writeFileSync(`${outputDir}/${filename}.json`, JSON.stringify(result, null, 2), "utf-8");
            }
            else {
                console.warn(`抓取失敗: ${trimmed}`);
            }
        }
        catch (error) {
            // Keep going, but say why this entry failed instead of hiding it.
            const reason = error instanceof Error ? error.message : String(error);
            console.warn(`抓取失敗: ${trimmed} (${reason})`);
        }
    }
    if (!options.quiet) {
        console.log(`完成批次抓取,共處理 ${count} 個網址`);
    }
}
|
|
118
|
+
//# sourceMappingURL=cli.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/scraper/cli.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,eAAe,EAA4B,MAAM,YAAY,CAAC;AAUvE;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,MAAoB,EAAE,UAAmB,KAAK;IACxE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAC3D,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,iBAAiB,KAAK,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,IAAI,CAAC,CAAC;IAC1D,IAAI,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACxD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC7C,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,IAAI,CAAC,CAAC;QACxE,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;aACzD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,
EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,KAAK,KAAK,IAAI,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,GAAW,EAAE,OAAyB;IAC3E,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;IAE1C,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,CAAC;IACvC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;IAElC,OAAO;IACP,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;QACnE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,gBAAgB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;QACvE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,WAAW,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,KAAK;IACL,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC;SAAM,CAAC;QACN,WAAW,CAAC,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,IAAc,EACd,SAAiB,EACjB,UAA+B,EAAE;IAEjC,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,UAAU;QACV,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QAC3B,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAElD,KAAK,EAAE,CAAC;QACR,MAAM,QAAQ,GAAG,OAAO;aACrB,OAAO,CAAC,eAAe,EAAE,GAAG,CAAC;aAC7B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAEhB,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,SAAS,OAAO,EAAE,CAAC,CAAC;QAC3C,CAAC;QAED,
IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,OAAO,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,CAAC;YAEvC,IAAI,OAAO,EAAE,CAAC;gBACZ,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;gBAElC,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,OAAO,EAAE,QAAQ,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;gBAClF,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,MAAM,EAAE,QAAQ,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;gBACrF,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAC9F,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IAED,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,cAAc,KAAK,MAAM,CAAC,CAAC;IACzC,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 網頁抓取與分析模組 (TypeScript 版本)
|
|
3
|
+
* 遵循 BDD/TDD/SOLID/DRY/KISS/DDD 原則
|
|
4
|
+
*
|
|
5
|
+
* 支援 HTTPS Proxy(沙盒環境)
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Options accepted by WebPageAnalyzer.fetch, scrapeUrl and fetchRawContent.
 */
export interface ScrapeOptions {
|
|
8
|
+
/** Request timeout. NOTE(review): unit is not visible here (presumably milliseconds) — confirm. */
timeout?: number;
|
|
9
|
+
/** User-Agent string to use for the request. */
userAgent?: string;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Base shape of a link entry: a URL plus its text.
 */
export interface LinkItem {
|
|
12
|
+
/** URL of the link. */
url: string;
|
|
13
|
+
/** Text of the link. */
text: string;
|
|
14
|
+
}
|
|
15
|
+
/**
 * A downloadable link: a LinkItem with an added type label.
 */
export interface DownloadLinkItem extends LinkItem {
|
|
16
|
+
/** Type label of the download; printReport prints it upper-cased. */
type: string;
|
|
17
|
+
}
|
|
18
|
+
/**
 * An external link: a LinkItem with an added domain.
 */
export interface ExternalLinkItem extends LinkItem {
|
|
19
|
+
/** Domain of the link. */
domain: string;
|
|
20
|
+
}
|
|
21
|
+
/**
 * Analysis result returned by WebPageAnalyzer.analyze().
 */
export interface ScrapeResult {
|
|
22
|
+
/** URL of the analysed page. */
url: string;
|
|
23
|
+
/** Page title. */
title: string;
|
|
24
|
+
/** Page description. */
description: string;
|
|
25
|
+
/** Size figures for the raw HTML and the extracted text. */
size: {
|
|
26
|
+
raw_html_bytes: number;
|
|
27
|
+
raw_html_readable: string;
|
|
28
|
+
text_bytes: number;
|
|
29
|
+
text_readable: string;
|
|
30
|
+
compression_ratio: string;
|
|
31
|
+
};
|
|
32
|
+
/** Links grouped into downloads, external links and internal links. */
links: {
|
|
33
|
+
downloads: {
|
|
34
|
+
count: number;
|
|
35
|
+
items: DownloadLinkItem[];
|
|
36
|
+
};
|
|
37
|
+
external: {
|
|
38
|
+
count: number;
|
|
39
|
+
/** Map from domain to a number; printReport prints it as a per-domain count. */
domains: Record<string, number>;
|
|
40
|
+
items: ExternalLinkItem[];
|
|
41
|
+
};
|
|
42
|
+
internal: {
|
|
43
|
+
count: number;
|
|
44
|
+
items: LinkItem[];
|
|
45
|
+
};
|
|
46
|
+
};
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Formats a file size.
* @param bytes - size in bytes
* @returns the formatted size string
|
|
50
|
+
*/
|
|
51
|
+
export declare function formatSize(bytes: number): string;
|
|
52
|
+
/**
|
|
53
|
+
* Parses HTML and returns its plain text.
* @param html - HTML source
* @returns the extracted plain text
|
|
54
|
+
*/
|
|
55
|
+
export declare function extractText(html: string): string;
|
|
56
|
+
/**
|
|
57
|
+
* Web page analyzer class.
|
|
58
|
+
*/
|
|
59
|
+
export declare class WebPageAnalyzer {
|
|
60
|
+
private url;
|
|
61
|
+
private baseDomain;
|
|
62
|
+
private html;
|
|
63
|
+
/** @param url - URL of the page to analyse */
constructor(url: string);
|
|
64
|
+
/**
|
|
65
|
+
* Fetches the web page (HTTPS proxy supported).
* @param options - optional fetch settings
* @returns a promise of a boolean; the CLI treats `false` as a failed fetch
|
|
66
|
+
*/
|
|
67
|
+
fetch(options?: ScrapeOptions): Promise<boolean>;
|
|
68
|
+
/**
|
|
69
|
+
* Returns the raw HTML.
|
|
70
|
+
*/
|
|
71
|
+
getRawHtml(): string;
|
|
72
|
+
/**
|
|
73
|
+
* Returns the raw size.
|
|
74
|
+
*/
|
|
75
|
+
getRawSize(): number;
|
|
76
|
+
/**
|
|
77
|
+
* Returns the plain text.
|
|
78
|
+
*/
|
|
79
|
+
getTextContent(): string;
|
|
80
|
+
/**
|
|
81
|
+
* Returns the plain-text size.
|
|
82
|
+
*/
|
|
83
|
+
getTextSize(): number;
|
|
84
|
+
/**
|
|
85
|
+
* Returns the page title.
|
|
86
|
+
*/
|
|
87
|
+
getTitle(): string;
|
|
88
|
+
/**
|
|
89
|
+
* Returns the meta description.
|
|
90
|
+
*/
|
|
91
|
+
getDescription(): string;
|
|
92
|
+
/**
|
|
93
|
+
* Returns the downloadable links.
|
|
94
|
+
*/
|
|
95
|
+
getDownloadableLinks(): DownloadLinkItem[];
|
|
96
|
+
/**
|
|
97
|
+
* Returns the external links.
|
|
98
|
+
*/
|
|
99
|
+
getExternalLinks(): ExternalLinkItem[];
|
|
100
|
+
/**
|
|
101
|
+
* Returns the internal links.
|
|
102
|
+
*/
|
|
103
|
+
getInternalLinks(): LinkItem[];
|
|
104
|
+
/**
|
|
105
|
+
* Runs the full analysis.
* @returns the combined analysis result
|
|
106
|
+
*/
|
|
107
|
+
analyze(): ScrapeResult;
|
|
108
|
+
}
|
|
109
|
+
/**
|
|
110
|
+
* Quickly fetches and analyses a web page.
* @param url - URL of the page
* @param options - optional fetch settings
* @returns a promise of the analysis result, or null (presumably when the fetch fails — confirm)
|
|
111
|
+
*/
|
|
112
|
+
export declare function scrapeUrl(url: string, options?: ScrapeOptions): Promise<ScrapeResult | null>;
|
|
113
|
+
/**
|
|
114
|
+
* Fetches a page and returns its raw content.
* @param url - URL of the page
* @param options - optional fetch settings
* @returns a promise of the page's `html` and `text`, or null (presumably when the fetch fails — confirm)
|
|
115
|
+
*/
|
|
116
|
+
export declare function fetchRawContent(url: string, options?: ScrapeOptions): Promise<{
|
|
117
|
+
html: string;
|
|
118
|
+
text: string;
|
|
119
|
+
} | null>;
|
|
120
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,aAAa;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,QAAQ;IACvB,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,gBAAiB,SAAQ,QAAQ;IAChD,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,gBAAiB,SAAQ,QAAQ;IAChD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,YAAY;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,UAAU,EAAE,MAAM,CAAC;QACnB,aAAa,EAAE,MAAM,CAAC;QACtB,iBAAiB,EAAE,MAAM,CAAC;KAC3B,CAAC;IACF,KAAK,EAAE;QACL,SAAS,EAAE;YACT,KAAK,EAAE,MAAM,CAAC;YACd,KAAK,EAAE,gBAAgB,EAAE,CAAC;SAC3B,CAAC;QACF,QAAQ,EAAE;YACR,KAAK,EAAE,MAAM,CAAC;YACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;YAChC,KAAK,EAAE,gBAAgB,EAAE,CAAC;SAC3B,CAAC;QACF,QAAQ,EAAE;YACR,KAAK,EAAE,MAAM,CAAC;YACd,KAAK,EAAE,QAAQ,EAAE,CAAC;SACnB,CAAC;KACH,CAAC;CACH;AAgBD;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAQhD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiChD;AA+FD;;GAEG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,GAAG,CAAS;IACpB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,IAAI,CAAc;gBAEd,GAAG,EAAE,MAAM;IAMvB;;OAEG;IACG,KAAK,CAAC,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,OAAO,CAAC;IA0B1D;;OAEG;IACH,UAAU,IAAI,MAAM;IAIpB;;OAEG;IACH,UAAU,IAAI,MAAM;IAIpB;;OAEG;IACH,cAAc,IAAI,MAAM;IAIxB;;OAEG;IACH,WAAW,IAAI,MAAM;IAIrB;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,cAAc,IAAI,MAAM;IAIxB;;OAEG;IACH,oBAAoB,IAAI,gBAAgB,EAAE;IAsB1C;;OAEG;IACH,gBAAgB,IAAI,gBAAgB,EAAE;IAgCtC;;OAEG;IACH,gBAAgB,IAAI,QAAQ,EAAE;IA8B9B;;OAEG;IACH,OAAO,IAAI,YAAY;CA6CxB;AAED;;GAEG;AACH,wBAAsB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CAOtG;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC;IACvF,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,IAAI,CAAC,CAUR"}
|