botrun-crawler-2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +126 -0
  2. package/dist/cli.d.ts +10 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +155 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/crawler/cli.d.ts +19 -0
  7. package/dist/crawler/cli.d.ts.map +1 -0
  8. package/dist/crawler/cli.js +179 -0
  9. package/dist/crawler/cli.js.map +1 -0
  10. package/dist/crawler/index.d.ts +146 -0
  11. package/dist/crawler/index.d.ts.map +1 -0
  12. package/dist/crawler/index.js +670 -0
  13. package/dist/crawler/index.js.map +1 -0
  14. package/dist/index.d.ts +17 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +20 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/lib/agent.d.ts +34 -0
  19. package/dist/lib/agent.d.ts.map +1 -0
  20. package/dist/lib/agent.js +73 -0
  21. package/dist/lib/agent.js.map +1 -0
  22. package/dist/lib/cache.d.ts +49 -0
  23. package/dist/lib/cache.d.ts.map +1 -0
  24. package/dist/lib/cache.js +141 -0
  25. package/dist/lib/cache.js.map +1 -0
  26. package/dist/lib/filename-decoder.d.ts +62 -0
  27. package/dist/lib/filename-decoder.d.ts.map +1 -0
  28. package/dist/lib/filename-decoder.js +229 -0
  29. package/dist/lib/filename-decoder.js.map +1 -0
  30. package/dist/lib/http-client.d.ts +86 -0
  31. package/dist/lib/http-client.d.ts.map +1 -0
  32. package/dist/lib/http-client.js +373 -0
  33. package/dist/lib/http-client.js.map +1 -0
  34. package/dist/lib/index.d.ts +15 -0
  35. package/dist/lib/index.d.ts.map +1 -0
  36. package/dist/lib/index.js +19 -0
  37. package/dist/lib/index.js.map +1 -0
  38. package/dist/lib/logger.d.ts +41 -0
  39. package/dist/lib/logger.d.ts.map +1 -0
  40. package/dist/lib/logger.js +122 -0
  41. package/dist/lib/logger.js.map +1 -0
  42. package/dist/lib/scene-detector.d.ts +92 -0
  43. package/dist/lib/scene-detector.d.ts.map +1 -0
  44. package/dist/lib/scene-detector.js +297 -0
  45. package/dist/lib/scene-detector.js.map +1 -0
  46. package/dist/processors/audio.d.ts +20 -0
  47. package/dist/processors/audio.d.ts.map +1 -0
  48. package/dist/processors/audio.js +110 -0
  49. package/dist/processors/audio.js.map +1 -0
  50. package/dist/processors/base.d.ts +53 -0
  51. package/dist/processors/base.d.ts.map +1 -0
  52. package/dist/processors/base.js +194 -0
  53. package/dist/processors/base.js.map +1 -0
  54. package/dist/processors/data.d.ts +48 -0
  55. package/dist/processors/data.d.ts.map +1 -0
  56. package/dist/processors/data.js +206 -0
  57. package/dist/processors/data.js.map +1 -0
  58. package/dist/processors/document.d.ts +20 -0
  59. package/dist/processors/document.d.ts.map +1 -0
  60. package/dist/processors/document.js +137 -0
  61. package/dist/processors/document.js.map +1 -0
  62. package/dist/processors/image.d.ts +20 -0
  63. package/dist/processors/image.d.ts.map +1 -0
  64. package/dist/processors/image.js +92 -0
  65. package/dist/processors/image.js.map +1 -0
  66. package/dist/processors/index.d.ts +53 -0
  67. package/dist/processors/index.d.ts.map +1 -0
  68. package/dist/processors/index.js +177 -0
  69. package/dist/processors/index.js.map +1 -0
  70. package/dist/processors/text.d.ts +44 -0
  71. package/dist/processors/text.d.ts.map +1 -0
  72. package/dist/processors/text.js +262 -0
  73. package/dist/processors/text.js.map +1 -0
  74. package/dist/processors/video.d.ts +20 -0
  75. package/dist/processors/video.d.ts.map +1 -0
  76. package/dist/processors/video.js +93 -0
  77. package/dist/processors/video.js.map +1 -0
  78. package/dist/scraper/cli.d.ts +23 -0
  79. package/dist/scraper/cli.d.ts.map +1 -0
  80. package/dist/scraper/cli.js +118 -0
  81. package/dist/scraper/cli.js.map +1 -0
  82. package/dist/scraper/index.d.ts +120 -0
  83. package/dist/scraper/index.d.ts.map +1 -0
  84. package/dist/scraper/index.js +372 -0
  85. package/dist/scraper/index.js.map +1 -0
  86. package/dist/types/index.d.ts +123 -0
  87. package/dist/types/index.d.ts.map +1 -0
  88. package/dist/types/index.js +40 -0
  89. package/dist/types/index.js.map +1 -0
  90. package/package.json +108 -0
@@ -0,0 +1,20 @@
1
+ /**
2
+ * 耳(Ear)- 影片處理器
3
+ *
4
+ * 支援格式:.mov, .mp4, .webm, .avi, .mkv, .m4v
5
+ * 功能:音軌轉文字、關鍵幀描述、字幕萃取、場景分段
6
+ *
7
+ * @version 1.0.0
8
+ */
9
+ import { ProcessResult, ProcessOptions } from "../types/index.js";
10
+ import { BaseProcessor } from "./base.js";
11
+ export declare class VideoProcessor extends BaseProcessor { // processor for video files; builds on BaseProcessor
12
+ readonly name = "VideoProcessor"; // processor name
13
+ readonly version = "1.0.0"; // processor version string
14
+ readonly supportedExtensions: string[]; // file extensions this processor accepts
15
+ /**
16
+ * Performs the actual video processing for `filePath` and resolves with a ProcessResult.
17
+ */
18
+ protected doProcess(filePath: string, options: ProcessOptions): Promise<ProcessResult>;
19
+ }
20
+ //# sourceMappingURL=video.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"video.d.ts","sourceRoot":"","sources":["../../src/processors/video.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,EAAE,aAAa,EAAE,cAAc,EAAqB,MAAM,mBAAmB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE1C,qBAAa,cAAe,SAAQ,aAAa;IAC/C,QAAQ,CAAC,IAAI,oBAAoB;IACjC,QAAQ,CAAC,OAAO,WAAW;IAC3B,QAAQ,CAAC,mBAAmB,WAA2B;IAEvD;;OAEG;cACa,SAAS,CACvB,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,cAAc,GACtB,OAAO,CAAC,aAAa,CAAC;CAmF1B"}
@@ -0,0 +1,93 @@
1
+ /**
2
+ * 耳(Ear)- 影片處理器
3
+ *
4
+ * 支援格式:.mov, .mp4, .webm, .avi, .mkv, .m4v
5
+ * 功能:音軌轉文字、關鍵幀描述、字幕萃取、場景分段
6
+ *
7
+ * @version 1.0.0
8
+ */
9
+ import * as path from "path";
10
+ import * as fs from "fs";
11
+ import { FORMAT_EXTENSIONS } from "../types/index.js";
12
+ import { BaseProcessor } from "./base.js";
13
/**
 * Video processor.
 *
 * Sends a video file to the agent together with a fixed analysis prompt and
 * stores the agent's answer as a Markdown document with a YAML-style header.
 */
export class VideoProcessor extends BaseProcessor {
    name = "VideoProcessor";
    version = "1.0.0";
    supportedExtensions = FORMAT_EXTENSIONS.video;

    /**
     * Processes a single video file.
     *
     * @param filePath - Path of the video that is handed to the agent.
     * @param options - Processing options; forwarded to `getOutputPath`.
     * @returns A result object. Failures inside the try block are reported
     *   as `success: false` with `error` set instead of being thrown.
     *   `getOutputPath` runs before the try block, so an exception from it
     *   propagates to the caller.
     */
    async doProcess(filePath, options) {
        const startTime = Date.now();
        const outputPath = this.getOutputPath(filePath, options);
        try {
            this.logger.child(this.name).info("處理影片中...");
            // Video processing normally needs the audio track extracted first.
            // This is only the basic scaffold; ffmpeg integration can follow later.
            const { content, costUsd } = await this.agent.processWithAgent(filePath, `請分析此影片並輸出以下內容:

## 輸出格式

### 1. 基本資訊
- 影片長度
- 解析度
- 格式

### 2. 音軌轉文字
將影片中的對話和旁白轉換為文字。
格式:[時間戳] 說話者:內容

### 3. 場景分段
將影片分成有意義的場景,每個場景包含:
- 時間範圍
- 場景描述
- 主要內容

### 4. 關鍵幀描述
描述影片中的重要視覺元素和場景。

### 5. 字幕萃取
如果影片有內嵌字幕,請萃取出來。

### 6. 摘要
200 字以內的影片內容摘要。`);
            // Persist the output. `recursive: true` already succeeds when the
            // directory exists, so no separate existence check (and no
            // check-then-create race) is needed.
            fs.mkdirSync(path.dirname(outputPath), { recursive: true });
            const timestamp = this.getTimestamp();
            const basename = path.basename(filePath, path.extname(filePath));
            const markdown = `---
source: ${filePath}
processed_at: ${timestamp}
processor: ${this.name}
version: ${this.version}
type: video
---

# ${basename}

${content}
`;
            fs.writeFileSync(outputPath, markdown);
            return {
                success: true,
                outputPath,
                content: markdown,
                // NOTE(review): the last argument looks like the model identifier
                // recorded in the metadata — confirm against BaseProcessor.
                metadata: await this.createMetadata(filePath, startTime, costUsd, "claude-haiku-4-5"),
            };
        }
        catch (error) {
            // Non-Error throwables are stringified so `error` is always a string.
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                success: false,
                outputPath: "",
                content: "",
                metadata: await this.createMetadata(filePath, startTime, 0),
                error: errorMessage,
            };
        }
    }
}
93
+ //# sourceMappingURL=video.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"video.js","sourceRoot":"","sources":["../../src/processors/video.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAiC,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACrF,OAAO,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAE1C,MAAM,OAAO,cAAe,SAAQ,aAAa;IACtC,IAAI,GAAG,gBAAgB,CAAC;IACxB,OAAO,GAAG,OAAO,CAAC;IAClB,mBAAmB,GAAG,iBAAiB,CAAC,KAAK,CAAC;IAEvD;;OAEG;IACO,KAAK,CAAC,SAAS,CACvB,QAAgB,EAChB,OAAuB;QAEvB,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAEzD,IAAI,CAAC;YACH,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YAE9C,gBAAgB;YAChB,yBAAyB;YAEzB,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,MAAM,IAAI,CAAC,KAAK,CAAC,gBAAgB,CAC5D,QAAQ,EACR;;;;;;;;;;;;;;;;;;;;;;;;;;wBA0BgB,CACjB,CAAC;YAEF,OAAO;YACP,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;YAC3C,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC9B,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YAC/C,CAAC;YAED,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC;YACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;YAEjE,MAAM,QAAQ,GAAG;UACb,QAAQ;gBACF,SAAS;aACZ,IAAI,CAAC,IAAI;WACX,IAAI,CAAC,OAAO;;;;IAInB,QAAQ;;EAEV,OAAO;CACR,CAAC;YAEI,EAAE,CAAC,aAAa,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;YAEvC,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,UAAU;gBACV,OAAO,EAAE,QAAQ;gBACjB,QAAQ,EAAE,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,kBAAkB,CAAC;aACtF,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAE5E,OAAO;gBACL,OAAO,EAAE,KAAK;gBACd,UAAU,EAAE,EAAE;gBACd,OAAO,EAAE,EAAE;gBACX,QAAQ,EAAE,MAAM,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;gBAC3D,KAAK,EAAE,YAAY;aACpB,CAAC;QACJ,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1,23 @@
1
+ import { ScrapeResult } from "./index.js";
2
+ export interface ScrapeCliOptions { // options accepted by runScrapeCommand
3
+ json?: boolean; // print the analysis result as JSON instead of the human-readable report
4
+ saveHtml?: string; // when set, file path the raw HTML is written to
5
+ saveText?: string; // when set, file path the extracted plain text is written to
6
+ verbose?: boolean; // forwarded to printReport as its `verbose` argument
7
+ quiet?: boolean; // suppress the "saved" notices written after each file is stored
8
+ }
9
+ /**
10
+ * Prints a human-readable report for a scrape result.
11
+ */
12
+ export declare function printReport(result: ScrapeResult, verbose?: boolean): void;
13
+ /**
14
+ * Runs the scrape command for a single URL.
15
+ */
16
+ export declare function runScrapeCommand(url: string, options: ScrapeCliOptions): Promise<void>;
17
+ /**
18
+ * Scrapes multiple URLs in one batch and stores the results under `outputDir`.
19
+ */
20
+ export declare function runScrapeBatchCommand(urls: string[], outputDir: string, options?: {
21
+ quiet?: boolean; // suppress progress output
22
+ }): Promise<void>;
23
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/scraper/cli.ts"],"names":[],"mappings":"AAIA,OAAO,EAAmB,YAAY,EAAc,MAAM,YAAY,CAAC;AAEvE,MAAM,WAAW,gBAAgB;IAC/B,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,YAAY,EAAE,OAAO,GAAE,OAAe,GAAG,IAAI,CA0ChF;AAED;;GAEG;AACH,wBAAsB,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC,CA+B5F;AAED;;GAEG;AACH,wBAAsB,qBAAqB,CACzC,IAAI,EAAE,MAAM,EAAE,EACd,SAAS,EAAE,MAAM,EACjB,OAAO,GAAE;IAAE,KAAK,CAAC,EAAE,OAAO,CAAA;CAAO,GAChC,OAAO,CAAC,IAAI,CAAC,CAuCf"}
@@ -0,0 +1,118 @@
1
+ /**
2
+ * 網頁抓取 CLI 模組
3
+ */
4
+ import * as fs from "fs";
5
+ import { WebPageAnalyzer } from "./index.js";
6
+ /**
7
+ * 輸出人類可讀報告
8
+ */
9
+ export function printReport(result, verbose = false) {
10
+ console.log("=".repeat(60));
11
+ console.log("網頁分析報告");
12
+ console.log("=".repeat(60));
13
+ console.log(`網址: ${result.url}`);
14
+ if (result.title) {
15
+ console.log(`標題: ${result.title}`);
16
+ }
17
+ console.log();
18
+ console.log("【大小統計】");
19
+ console.log(` 原始 HTML: ${result.size.raw_html_readable}`);
20
+ console.log(` 純文字: ${result.size.text_readable}`);
21
+ console.log(` 壓縮比: ${result.size.compression_ratio} 節省`);
22
+ console.log();
23
+ console.log(`【可下載檔案】共 ${result.links.downloads.count} 個`);
24
+ if (verbose && result.links.downloads.items.length > 0) {
25
+ const items = result.links.downloads.items.slice(0, 20);
26
+ for (let i = 0; i < items.length; i++) {
27
+ const item = items[i];
28
+ console.log(` ${i + 1}. [${item.type.toUpperCase()}] ${item.text.slice(0, 50)}`);
29
+ }
30
+ if (result.links.downloads.items.length > 20) {
31
+ console.log(` ... 還有 ${result.links.downloads.items.length - 20} 個`);
32
+ }
33
+ }
34
+ console.log();
35
+ console.log(`【連外連結】共 ${result.links.external.count} 個`);
36
+ if (Object.keys(result.links.external.domains).length > 0) {
37
+ const sorted = Object.entries(result.links.external.domains)
38
+ .sort((a, b) => b[1] - a[1])
39
+ .slice(0, 10);
40
+ for (const [domain, count] of sorted) {
41
+ console.log(` - ${domain}: ${count} 個`);
42
+ }
43
+ }
44
+ console.log();
45
+ console.log(`【內部連結】共 ${result.links.internal.count} 個`);
46
+ console.log();
47
+ }
48
/**
 * Runs the scrape command: fetches one URL, optionally stores the raw HTML
 * and/or the plain text, then prints the analysis.
 *
 * Calls `process.exit(1)` when the fetch reports failure.
 *
 * @param url - Address of the page to analyse.
 * @param options - CLI switches (`json`, `saveHtml`, `saveText`, `verbose`, `quiet`).
 */
export async function runScrapeCommand(url, options) {
    const analyzer = new WebPageAnalyzer(url);
    const fetched = await analyzer.fetch();
    if (!fetched) {
        process.exit(1);
    }
    const result = analyzer.analyze();

    // Writes one artefact when a target path was given. The notice goes to
    // stderr, which keeps stdout free for the report / JSON output.
    const store = (target, readBody, label) => {
        if (!target) {
            return;
        }
        fs.writeFileSync(target, readBody(), "utf-8");
        if (!options.quiet) {
            console.error(`${label}: ${target}`);
        }
    };
    store(options.saveHtml, () => analyzer.getRawHtml(), "原始 HTML 已儲存");
    store(options.saveText, () => analyzer.getTextContent(), "純文字已儲存");

    // Output: the human-readable report unless JSON was requested.
    if (!options.json) {
        printReport(result, options.verbose);
        return;
    }
    console.log(JSON.stringify(result, null, 2));
}
79
/**
 * Scrapes several URLs in sequence and stores, per URL, the raw HTML
 * (`.html`), the plain text (`.txt`) and the analysis (`.json`) in `outputDir`.
 *
 * Blank entries and entries starting with `#` are skipped. A failed URL is
 * reported with `console.warn` and does not stop the batch.
 *
 * File names are the URL with every non-alphanumeric character replaced by
 * `_`, cut to 50 characters. Two URLs that share the same first 50 characters
 * would otherwise map to the same name and silently overwrite each other, so
 * a name already used in this run gets `_<n>` appended. The first URL that
 * produces a name keeps it unchanged.
 *
 * @param urls - URL list, e.g. the lines of a text file.
 * @param outputDir - Target directory; created (recursively) if missing.
 * @param options - `quiet` suppresses the progress and summary lines.
 */
export async function runScrapeBatchCommand(urls, outputDir, options = {}) {
    fs.mkdirSync(outputDir, { recursive: true });
    // Names handed out so far, lower-cased so that names differing only in
    // case do not collide on case-insensitive file systems.
    const usedNames = new Set();
    const claimName = (base, index) => {
        let candidate = base;
        for (let suffix = index; usedNames.has(candidate.toLowerCase()); suffix++) {
            candidate = `${base}_${suffix}`;
        }
        usedNames.add(candidate.toLowerCase());
        return candidate;
    };
    let count = 0;
    for (const url of urls) {
        // Skip blank lines and comments.
        const trimmed = url.trim();
        if (!trimmed || trimmed.startsWith("#"))
            continue;
        count++;
        const filename = claimName(trimmed.replace(/[^a-zA-Z0-9]/g, "_").slice(0, 50), count);
        if (!options.quiet) {
            console.log(`[${count}] 抓取: ${trimmed}`);
        }
        try {
            const analyzer = new WebPageAnalyzer(trimmed);
            const success = await analyzer.fetch();
            if (success) {
                const result = analyzer.analyze();
                fs.writeFileSync(`${outputDir}/${filename}.html`, analyzer.getRawHtml(), "utf-8");
                fs.writeFileSync(`${outputDir}/${filename}.txt`, analyzer.getTextContent(), "utf-8");
                fs.writeFileSync(`${outputDir}/${filename}.json`, JSON.stringify(result, null, 2), "utf-8");
            }
            else {
                console.warn(`抓取失敗: ${trimmed}`);
            }
        }
        catch (error) {
            // Keep the batch going, but report why this URL failed.
            const reason = error instanceof Error ? error.message : String(error);
            console.warn(`抓取失敗: ${trimmed} (${reason})`);
        }
    }
    if (!options.quiet) {
        console.log(`完成批次抓取,共處理 ${count} 個網址`);
    }
}
118
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../../src/scraper/cli.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,KAAK,EAAE,MAAM,IAAI,CAAC;AACzB,OAAO,EAAE,eAAe,EAA4B,MAAM,YAAY,CAAC;AAUvE;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,MAAoB,EAAE,UAAmB,KAAK;IACxE,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,CAAC;IAC5B,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACtB,OAAO,CAAC,GAAG,CAAC,cAAc,MAAM,CAAC,IAAI,CAAC,iBAAiB,EAAE,CAAC,CAAC;IAC3D,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC;IACtD,OAAO,CAAC,GAAG,CAAC,aAAa,MAAM,CAAC,IAAI,CAAC,iBAAiB,KAAK,CAAC,CAAC;IAC7D,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,IAAI,CAAC,CAAC;IAC1D,IAAI,OAAO,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACxD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,CAAC;QACpF,CAAC;QACD,IAAI,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;YAC7C,OAAO,CAAC,GAAG,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,MAAM,GAAG,EAAE,IAAI,CAAC,CAAC;QACxE,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;aACzD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAA
E,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAChB,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,MAAM,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,OAAO,MAAM,KAAK,KAAK,IAAI,CAAC,CAAC;QAC3C,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEd,OAAO,CAAC,GAAG,CAAC,WAAW,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,KAAK,IAAI,CAAC,CAAC;IACxD,OAAO,CAAC,GAAG,EAAE,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CAAC,GAAW,EAAE,OAAyB;IAC3E,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;IAE1C,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,CAAC;IACvC,IAAI,CAAC,OAAO,EAAE,CAAC;QACb,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;IAElC,OAAO;IACP,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;QACnE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,gBAAgB,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QACpD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;QACrB,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,QAAQ,EAAE,QAAQ,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;QACvE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,KAAK,CAAC,WAAW,OAAO,CAAC,QAAQ,EAAE,CAAC,CAAC;QAC/C,CAAC;IACH,CAAC;IAED,KAAK;IACL,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;QACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;IAC/C,CAAC;SAAM,CAAC;QACN,WAAW,CAAC,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CACzC,IAAc,EACd,SAAiB,EACjB,UAA+B,EAAE;IAEjC,EAAE,CAAC,SAAS,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7C,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,UAAU;QACV,MAAM,OAAO,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QAC3B,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;YAAE,SAAS;QAElD,KAAK,EAAE,CAAC;QACR,MAAM,QAAQ,GAAG,OAAO;aACrB,OAAO,CAAC,eAAe,EAAE,GAAG,CAAC;aAC7B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAEhB,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YACnB,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,SAAS,OAAO,EAAE,CAAC,CAAC;QAC3C,CAAC;QAE
D,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC,OAAO,CAAC,CAAC;YAC9C,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,CAAC;YAEvC,IAAI,OAAO,EAAE,CAAC;gBACZ,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;gBAElC,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,OAAO,EAAE,QAAQ,CAAC,UAAU,EAAE,EAAE,OAAO,CAAC,CAAC;gBAClF,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,MAAM,EAAE,QAAQ,CAAC,cAAc,EAAE,EAAE,OAAO,CAAC,CAAC;gBACrF,EAAE,CAAC,aAAa,CAAC,GAAG,SAAS,IAAI,QAAQ,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;YAC9F,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IAED,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QACnB,OAAO,CAAC,GAAG,CAAC,cAAc,KAAK,MAAM,CAAC,CAAC;IACzC,CAAC;AACH,CAAC"}
@@ -0,0 +1,120 @@
1
+ /**
2
+ * 網頁抓取與分析模組 (TypeScript 版本)
3
+ * 遵循 BDD/TDD/SOLID/DRY/KISS/DDD 原則
4
+ *
5
+ * 支援 HTTPS Proxy(沙盒環境)
6
+ */
7
+ export interface ScrapeOptions { // optional settings accepted by WebPageAnalyzer.fetch, scrapeUrl and fetchRawContent
8
+ timeout?: number; // request timeout; the unit is not visible in this declaration — TODO confirm
9
+ userAgent?: string; // presumably the User-Agent value sent with the request — verify against the implementation
10
+ }
11
+ export interface LinkItem { // one link found on the page; base shape for the other link item types
12
+ url: string; // link target
13
+ text: string; // presumably the link's visible text — confirm against the implementation
14
+ }
15
+ export interface DownloadLinkItem extends LinkItem { // a link to a downloadable file
16
+ type: string; // file type label; the CLI report prints it upper-cased in brackets
17
+ }
18
+ export interface ExternalLinkItem extends LinkItem { // a link that leaves the analysed site
19
+ domain: string; // domain of the link target
20
+ }
21
+ export interface ScrapeResult { // complete analysis of one page, as returned by WebPageAnalyzer.analyze
22
+ url: string; // address of the analysed page
23
+ title: string; // page title; the CLI report omits the title line when this is empty
24
+ description: string; // presumably the page's meta description (cf. WebPageAnalyzer.getDescription) — confirm
25
+ size: { // size statistics
26
+ raw_html_bytes: number; // size of the raw HTML in bytes
27
+ raw_html_readable: string; // raw HTML size as a display string
28
+ text_bytes: number; // size of the extracted text in bytes
29
+ text_readable: string; // extracted text size as a display string
30
+ compression_ratio: string; // display string printed by the CLI as the saving of text vs. raw HTML; exact format not visible here
31
+ };
32
+ links: { // links grouped by category
33
+ downloads: { // downloadable files
34
+ count: number; // number reported for this category
35
+ items: DownloadLinkItem[];
36
+ };
37
+ external: { // links to other sites
38
+ count: number; // number reported for this category
39
+ domains: Record<string, number>; // count per domain; the CLI report lists the ten highest
40
+ items: ExternalLinkItem[];
41
+ };
42
+ internal: { // links within the same site
43
+ count: number; // number reported for this category
44
+ items: LinkItem[];
45
+ };
46
+ };
47
+ }
48
+ /**
49
+ * Formats a file size (`bytes`) as a display string.
50
+ */
51
+ export declare function formatSize(bytes: number): string;
52
+ /**
53
+ * Parses HTML and returns its plain text.
54
+ */
55
+ export declare function extractText(html: string): string;
56
+ /**
57
+ * Web page analyzer class.
58
+ */
59
+ export declare class WebPageAnalyzer {
60
+ private url;
61
+ private baseDomain;
62
+ private html;
63
+ constructor(url: string);
64
+ /**
65
+ * Fetches the web page (supports an HTTPS proxy). The CLI callers treat a resolved `false` as a failed fetch.
66
+ */
67
+ fetch(options?: ScrapeOptions): Promise<boolean>;
68
+ /**
69
+ * Returns the raw HTML.
70
+ */
71
+ getRawHtml(): string;
72
+ /**
73
+ * Returns the raw size (presumably in bytes, cf. ScrapeResult.size.raw_html_bytes — confirm).
74
+ */
75
+ getRawSize(): number;
76
+ /**
77
+ * Returns the plain text.
78
+ */
79
+ getTextContent(): string;
80
+ /**
81
+ * Returns the size of the plain text (presumably in bytes — confirm).
82
+ */
83
+ getTextSize(): number;
84
+ /**
85
+ * Returns the page title.
86
+ */
87
+ getTitle(): string;
88
+ /**
89
+ * Returns the meta description.
90
+ */
91
+ getDescription(): string;
92
+ /**
93
+ * Returns the downloadable links.
94
+ */
95
+ getDownloadableLinks(): DownloadLinkItem[];
96
+ /**
97
+ * Returns the external (outbound) links.
98
+ */
99
+ getExternalLinks(): ExternalLinkItem[];
100
+ /**
101
+ * Returns the internal links.
102
+ */
103
+ getInternalLinks(): LinkItem[];
104
+ /**
105
+ * Runs the complete analysis and returns it as a ScrapeResult.
106
+ */
107
+ analyze(): ScrapeResult;
108
+ }
109
+ /**
110
+ * Quickly fetches and analyses a web page. Resolves to `null` instead of a result in some cases (presumably when the fetch fails — confirm).
111
+ */
112
+ export declare function scrapeUrl(url: string, options?: ScrapeOptions): Promise<ScrapeResult | null>;
113
+ /**
114
+ * Fetches a page and returns its raw content. Resolves to `null` instead of content in some cases (presumably when the fetch fails — confirm).
115
+ */
116
+ export declare function fetchRawContent(url: string, options?: ScrapeOptions): Promise<{
117
+ html: string; // raw HTML of the page
118
+ text: string; // plain-text content of the page
119
+ } | null>;
120
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,MAAM,WAAW,aAAa;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,QAAQ;IACvB,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,gBAAiB,SAAQ,QAAQ;IAChD,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,gBAAiB,SAAQ,QAAQ;IAChD,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,YAAY;IAC3B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE;QACJ,cAAc,EAAE,MAAM,CAAC;QACvB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,UAAU,EAAE,MAAM,CAAC;QACnB,aAAa,EAAE,MAAM,CAAC;QACtB,iBAAiB,EAAE,MAAM,CAAC;KAC3B,CAAC;IACF,KAAK,EAAE;QACL,SAAS,EAAE;YACT,KAAK,EAAE,MAAM,CAAC;YACd,KAAK,EAAE,gBAAgB,EAAE,CAAC;SAC3B,CAAC;QACF,QAAQ,EAAE;YACR,KAAK,EAAE,MAAM,CAAC;YACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;YAChC,KAAK,EAAE,gBAAgB,EAAE,CAAC;SAC3B,CAAC;QACF,QAAQ,EAAE;YACR,KAAK,EAAE,MAAM,CAAC;YACd,KAAK,EAAE,QAAQ,EAAE,CAAC;SACnB,CAAC;KACH,CAAC;CACH;AAgBD;;GAEG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAQhD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiChD;AA+FD;;GAEG;AACH,qBAAa,eAAe;IAC1B,OAAO,CAAC,GAAG,CAAS;IACpB,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,IAAI,CAAc;gBAEd,GAAG,EAAE,MAAM;IAMvB;;OAEG;IACG,KAAK,CAAC,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,OAAO,CAAC;IA0B1D;;OAEG;IACH,UAAU,IAAI,MAAM;IAIpB;;OAEG;IACH,UAAU,IAAI,MAAM;IAIpB;;OAEG;IACH,cAAc,IAAI,MAAM;IAIxB;;OAEG;IACH,WAAW,IAAI,MAAM;IAIrB;;OAEG;IACH,QAAQ,IAAI,MAAM;IAIlB;;OAEG;IACH,cAAc,IAAI,MAAM;IAIxB;;OAEG;IACH,oBAAoB,IAAI,gBAAgB,EAAE;IAsB1C;;OAEG;IACH,gBAAgB,IAAI,gBAAgB,EAAE;IAgCtC;;OAEG;IACH,gBAAgB,IAAI,QAAQ,EAAE;IA8B9B;;OAEG;IACH,OAAO,IAAI,YAAY;CA6CxB;AAED;;GAEG;AACH,wBAAsB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,CAOtG;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,aAAkB,GAAG,OAAO,CAAC;IACvF,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd,GAAG,IAAI,CAAC,CAUR"}