@d-zero/replicator 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +60 -7
- package/dist/child-process.js +94 -0
- package/dist/cli.js +21 -8
- package/dist/index.js +153 -265
- package/dist/resource-downloader.js +130 -0
- package/package.json +8 -6
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# `@d-zero/replicator`
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
ウェブページとそのリソースをレスポンシブ画像対応でローカルディレクトリに複製するツールです。複数のURLを並列処理し、効率的にリソースを取得します。
|
|
4
4
|
|
|
5
5
|
## インストール
|
|
6
6
|
|
|
@@ -13,7 +13,7 @@ npm install @d-zero/replicator
|
|
|
13
13
|
### CLI
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
npx @d-zero/replicator <url
|
|
16
|
+
npx @d-zero/replicator <url...> -o <output-directory> [options]
|
|
17
17
|
```
|
|
18
18
|
|
|
19
19
|
#### オプション
|
|
@@ -21,8 +21,16 @@ npx @d-zero/replicator <url> -o <output-directory> [options]
|
|
|
21
21
|
- `-o, --output <dir>`: 出力ディレクトリ(必須)
|
|
22
22
|
- `-t, --timeout <ms>`: リクエストタイムアウト(ミリ秒、デフォルト: 30000)
|
|
23
23
|
- `-d, --devices <devices>`: デバイスプリセット(カンマ区切り、デフォルト: desktop-compact,mobile)
|
|
24
|
+
- `-l, --limit <number>`: 並列処理数の上限(デフォルト: 3)
|
|
25
|
+
- `--only <type>`: ダウンロード対象を限定(`page` または `resource`)
|
|
24
26
|
- `-v, --verbose`: 詳細ログモード
|
|
25
27
|
|
|
28
|
+
##### `--only` オプション
|
|
29
|
+
|
|
30
|
+
- `page`: HTMLページのみをダウンロード(リソーススキャンをスキップして高速化)
|
|
31
|
+
- `resource`: リソース(CSS、JS、画像など)のみをダウンロード(HTMLページを除外)
|
|
32
|
+
- 未指定: すべてのファイルをダウンロード(デフォルト動作)
|
|
33
|
+
|
|
26
34
|
#### 利用可能なデバイスプリセット
|
|
27
35
|
|
|
28
36
|
- `desktop`: 1400px幅
|
|
@@ -36,14 +44,26 @@ npx @d-zero/replicator <url> -o <output-directory> [options]
|
|
|
36
44
|
#### 使用例
|
|
37
45
|
|
|
38
46
|
```bash
|
|
39
|
-
#
|
|
47
|
+
# 単一URL(デフォルトデバイス: desktop-compact, mobile)
|
|
40
48
|
npx @d-zero/replicator https://example.com -o ./output
|
|
41
49
|
|
|
50
|
+
# 複数URLを並列処理
|
|
51
|
+
npx @d-zero/replicator https://example.com/page1 https://example.com/page2 -o ./output
|
|
52
|
+
|
|
53
|
+
# 並列数を制限
|
|
54
|
+
npx @d-zero/replicator https://example.com/page1 https://example.com/page2 -o ./output --limit 2
|
|
55
|
+
|
|
42
56
|
# カスタムデバイス指定
|
|
43
57
|
npx @d-zero/replicator https://example.com -o ./output --devices desktop,tablet,mobile
|
|
44
58
|
|
|
45
59
|
# タイムアウト指定
|
|
46
60
|
npx @d-zero/replicator https://example.com -o ./output --timeout 60000
|
|
61
|
+
|
|
62
|
+
# HTMLページのみダウンロード(高速)
|
|
63
|
+
npx @d-zero/replicator https://example.com -o ./output --only page
|
|
64
|
+
|
|
65
|
+
# リソースのみダウンロード(HTMLを除外)
|
|
66
|
+
npx @d-zero/replicator https://example.com -o ./output --only resource
|
|
47
67
|
```
|
|
48
68
|
|
|
49
69
|
### プログラマティック使用
|
|
@@ -51,11 +71,27 @@ npx @d-zero/replicator https://example.com -o ./output --timeout 60000
|
|
|
51
71
|
```typescript
|
|
52
72
|
import { replicate } from '@d-zero/replicator';
|
|
53
73
|
|
|
54
|
-
//
|
|
55
|
-
await replicate(
|
|
74
|
+
// 単一URL
|
|
75
|
+
await replicate({
|
|
76
|
+
urls: ['https://example.com'],
|
|
77
|
+
outputDir: './output',
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
// 複数URLを並列処理
|
|
81
|
+
await replicate({
|
|
82
|
+
urls: [
|
|
83
|
+
'https://example.com/page1',
|
|
84
|
+
'https://example.com/page2',
|
|
85
|
+
'https://example.com/page3',
|
|
86
|
+
],
|
|
87
|
+
outputDir: './output',
|
|
88
|
+
limit: 2, // 最大2つのURLを同時処理
|
|
89
|
+
});
|
|
56
90
|
|
|
57
91
|
// カスタムデバイス
|
|
58
|
-
await replicate(
|
|
92
|
+
await replicate({
|
|
93
|
+
urls: ['https://example.com'],
|
|
94
|
+
outputDir: './output',
|
|
59
95
|
devices: {
|
|
60
96
|
desktop: { width: 1400 },
|
|
61
97
|
mobile: { width: 375, resolution: 2 },
|
|
@@ -63,17 +99,34 @@ await replicate('https://example.com', './output', {
|
|
|
63
99
|
timeout: 30000,
|
|
64
100
|
verbose: true,
|
|
65
101
|
});
|
|
102
|
+
|
|
103
|
+
// HTMLページのみダウンロード
|
|
104
|
+
await replicate({
|
|
105
|
+
urls: ['https://example.com'],
|
|
106
|
+
outputDir: './output',
|
|
107
|
+
only: 'page',
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
// リソースのみダウンロード
|
|
111
|
+
await replicate({
|
|
112
|
+
urls: ['https://example.com'],
|
|
113
|
+
outputDir: './output',
|
|
114
|
+
only: 'resource',
|
|
115
|
+
});
|
|
66
116
|
```
|
|
67
117
|
|
|
68
118
|
## 機能
|
|
69
119
|
|
|
120
|
+
- **並列処理**: 複数のURLを並列で効率的に処理
|
|
121
|
+
- **メモリ効率**: リソースを直接ディスクに保存してメモリ使用量を最小化
|
|
122
|
+
- **選択的ダウンロード**: `--only`オプションでHTMLページのみまたはリソースのみをダウンロード可能
|
|
70
123
|
- **レスポンシブ画像対応**: 複数のデバイス幅で`<picture>`要素やメディアクエリのリソースを取得
|
|
71
124
|
- **遅延読み込み対応**: ページを自動スクロールして`loading=lazy`や`IntersectionObserver`ベースのコンテンツを取得
|
|
72
125
|
- **マルチデバイスシミュレーション**: 様々なデバイス幅と解像度をシミュレートして包括的なリソース取得を実現
|
|
73
126
|
- HTMLページのディレクトリ構造を保持してダウンロード
|
|
74
127
|
- 関連するすべてのリソース(CSS、JS、画像など)を取得
|
|
75
128
|
- リソース間の相対リンクを維持
|
|
76
|
-
-
|
|
129
|
+
- 同一ホストのリソースのみサポート(複数ホストが検出された場合はエラー)
|
|
77
130
|
- 元のファイル拡張子とパスを保持
|
|
78
131
|
|
|
79
132
|
## License
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { createChildProcess } from '@d-zero/puppeteer-dealer';
|
|
2
|
+
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
3
|
+
import { scrollAllOver } from '@d-zero/puppeteer-scroll';
|
|
4
|
+
/**
 * Build the encoded form of a resource pathname.
 *
 * When the last path segment carries no "." (no file extension) and a MIME
 * type is known, the MIME type is appended after a ":::" separator so the
 * consumer can later derive a file extension. Otherwise the pathname is
 * returned unchanged. An empty pathname is treated as "/".
 * @param pathname - Resource pathname
 * @param mimeType - MIME type from response headers (optional)
 * @returns The pathname, or "pathname:::mimeType" for extension-less paths
 */
function encodeResourcePath(pathname, mimeType) {
    const normalized = pathname === '' ? '/' : pathname;
    // lastIndexOf() yields -1 when there is no slash, so slice(0) keeps the whole string.
    const tail = normalized.slice(normalized.lastIndexOf('/') + 1);
    const needsMimeSuffix = !tail.includes('.') && Boolean(mimeType);
    return needsMimeSuffix ? `${normalized}:::${mimeType}` : normalized;
}
|
|
26
|
+
createChildProcess((param) => {
    const { devices, timeout } = param;
    return {
        /**
         * Scan one page at every target device size and collect the pathnames
         * of same-host resources that were fetched successfully with GET.
         * @param context - Object carrying the Puppeteer `page` and the target `url`
         * @param logger - Function that receives progress messages
         * @returns The page URL and the list of encoded resource paths
         */
        async eachPage({ page, url }, logger) {
            const resourcePaths = new Set();
            // Parse the page URL once; hostname and pathname both come from it.
            const pageUrlObj = new URL(url);
            const pageHostname = pageUrlObj.hostname;
            // Add the page URL itself first (in case response event is missed)
            resourcePaths.add(encodeResourcePath(pageUrlObj.pathname, 'text/html'));
            // Listen to all network responses
            const responseHandler = (response) => {
                const responseUrl = response.url();
                // Skip data URLs
                if (responseUrl.startsWith('data:')) {
                    return;
                }
                // Skip non-GET requests (POST, PUT, etc. cannot be replicated)
                if (response.request().method() !== 'GET') {
                    return;
                }
                // Skip non-successful responses (not 2xx)
                const status = response.status();
                if (status < 200 || status >= 300) {
                    return;
                }
                const resourceUrlObj = new URL(responseUrl);
                // Skip different domain resources
                if (resourceUrlObj.hostname !== pageHostname) {
                    return;
                }
                // Only the MIME part of Content-Type is kept; parameters such as charset are dropped.
                const contentType = response.headers()['content-type'];
                const mimeType = contentType?.split(';')[0]?.trim();
                // Add resource with MIME encoding if needed
                resourcePaths.add(encodeResourcePath(resourceUrlObj.pathname, mimeType));
            };
            const defaultSizes = {
                'desktop-compact': devicePresets['desktop-compact'],
                mobile: devicePresets.mobile,
            };
            const targetSizes = devices ?? defaultSizes;
            page.on('response', responseHandler);
            try {
                // Scan the page across all device sizes
                for (const [sizeName, size] of Object.entries(targetSizes)) {
                    logger(`📱 Scanning with ${sizeName} (${size.width}px)`);
                    await beforePageScan(page, url, {
                        name: sizeName,
                        width: size.width,
                        // Forward the device pixel ratio so resolution-dependent
                        // sources are requested too.
                        // NOTE(review): assumes beforePageScan accepts `resolution`
                        // (undefined when the device has none) — confirm.
                        resolution: size.resolution,
                        timeout,
                    }).catch((error) => {
                        logger(`❌ Failed to scan ${sizeName}: ${error.message}`);
                        throw error;
                    });
                    // Scroll to load lazy resources
                    await scrollAllOver(page).catch((error) => {
                        logger(`❌ Failed to scroll ${sizeName}: ${error.message}`);
                        throw error;
                    });
                }
            }
            finally {
                // Always detach the listener, even when a scan or scroll fails,
                // so it does not stay attached to the page.
                page.off('response', responseHandler);
            }
            logger(`📦 Collected ${resourcePaths.size} resources`);
            return {
                url,
                encodedUrls: [...resourcePaths],
            };
        },
    };
});
|
package/dist/cli.js
CHANGED
|
@@ -8,14 +8,17 @@ const { options, args } = createCLI({
|
|
|
8
8
|
v: 'verbose',
|
|
9
9
|
t: 'timeout',
|
|
10
10
|
d: 'devices',
|
|
11
|
+
l: 'limit',
|
|
11
12
|
},
|
|
12
13
|
usage: [
|
|
13
|
-
'Usage: replicator <
|
|
14
|
+
'Usage: replicator <url1> [url2...] -o <output-directory> [options]',
|
|
14
15
|
'',
|
|
15
16
|
'Options:',
|
|
16
17
|
' -o, --output <dir> Output directory (required)',
|
|
17
18
|
' -t, --timeout <ms> Request timeout in milliseconds (default: 30000)',
|
|
18
19
|
' -d, --devices <devices> Device presets (comma-separated, default: desktop-compact,mobile)',
|
|
20
|
+
' -l, --limit <number> Parallel execution limit (default: 3)',
|
|
21
|
+
' --only <type> Download only specified type: page or resource',
|
|
19
22
|
' -v, --verbose Enable verbose logging',
|
|
20
23
|
'',
|
|
21
24
|
'Available device presets:',
|
|
@@ -23,36 +26,46 @@ const { options, args } = createCLI({
|
|
|
23
26
|
'',
|
|
24
27
|
'Examples:',
|
|
25
28
|
' replicator https://example.com -o ./output',
|
|
29
|
+
' replicator https://example.com/page1 https://example.com/page2 -o ./output',
|
|
26
30
|
' replicator https://example.com -o ./output --devices desktop,tablet',
|
|
27
|
-
' replicator https://example.com -o ./output --timeout 60000',
|
|
31
|
+
' replicator https://example.com -o ./output --timeout 60000 --limit 5',
|
|
32
|
+
' replicator https://example.com -o ./output --only page',
|
|
33
|
+
' replicator https://example.com -o ./output --only resource',
|
|
28
34
|
],
|
|
29
35
|
parseArgs: (cli) => ({
|
|
30
36
|
...parseCommonOptions(cli),
|
|
31
37
|
output: cli.output,
|
|
32
38
|
timeout: cli.timeout ? Number(cli.timeout) : undefined,
|
|
33
39
|
devices: cli.devices,
|
|
40
|
+
limit: cli.limit ? Number(cli.limit) : undefined,
|
|
41
|
+
only: cli.only,
|
|
34
42
|
}),
|
|
35
43
|
validateArgs: (options, cli) => {
|
|
44
|
+
if (options.only && options.only !== 'page' && options.only !== 'resource') {
|
|
45
|
+
return false;
|
|
46
|
+
}
|
|
36
47
|
return !!(cli._.length > 0 && options.output);
|
|
37
48
|
},
|
|
38
49
|
});
|
|
39
|
-
const
|
|
50
|
+
const urls = args.filter((arg) => typeof arg === 'string');
|
|
40
51
|
const outputDir = options.output;
|
|
41
|
-
if (
|
|
52
|
+
if (urls.length === 0) {
|
|
42
53
|
// eslint-disable-next-line no-console
|
|
43
|
-
console.error('❌ Error: URL is required');
|
|
54
|
+
console.error('❌ Error: At least one URL is required');
|
|
44
55
|
process.exit(1);
|
|
45
56
|
}
|
|
46
57
|
try {
|
|
47
58
|
const deviceNames = options.devices ? parseList(options.devices) : undefined;
|
|
48
59
|
const devices = parseDevicesOption(deviceNames);
|
|
49
|
-
await replicate(
|
|
60
|
+
await replicate({
|
|
61
|
+
urls,
|
|
62
|
+
outputDir,
|
|
50
63
|
verbose: options.verbose ?? false,
|
|
51
64
|
timeout: options.timeout,
|
|
52
65
|
devices,
|
|
66
|
+
limit: options.limit,
|
|
67
|
+
only: options.only,
|
|
53
68
|
});
|
|
54
|
-
// eslint-disable-next-line no-console
|
|
55
|
-
console.log(`✅ Successfully replicated ${url} to ${outputDir}`);
|
|
56
69
|
}
|
|
57
70
|
catch (error) {
|
|
58
71
|
if (error instanceof Error) {
|
package/dist/index.js
CHANGED
|
@@ -1,287 +1,175 @@
|
|
|
1
|
-
import { promises as fs } from 'node:fs';
|
|
2
1
|
import path from 'node:path';
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
2
|
+
import { deal, createProcess } from '@d-zero/puppeteer-dealer';
|
|
3
|
+
import { devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
4
|
+
import { validateSameHost } from '@d-zero/shared/validate-same-host';
|
|
5
|
+
import c from 'ansi-colors';
|
|
6
|
+
import { downloadResources } from './resource-downloader.js';
|
|
6
7
|
/**
|
|
7
|
-
*
|
|
8
|
-
* @param
|
|
9
|
-
* @param
|
|
10
|
-
* @
|
|
8
|
+
* Encode resource path with MIME type if needed
|
|
9
|
+
* @param pathname - Resource pathname
|
|
10
|
+
* @param mimeType - MIME type (optional)
|
|
11
|
+
* @returns Encoded resource path
|
|
11
12
|
*/
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
mobile: devicePresets.mobile,
|
|
17
|
-
};
|
|
18
|
-
const targetSizes = devices ?? defaultSizes;
|
|
19
|
-
const log = (message) => {
|
|
20
|
-
if (verbose) {
|
|
21
|
-
// eslint-disable-next-line no-console
|
|
22
|
-
console.log(message);
|
|
23
|
-
}
|
|
24
|
-
};
|
|
25
|
-
// Always show these key progress messages
|
|
26
|
-
const progress = (message) => {
|
|
27
|
-
// eslint-disable-next-line no-console
|
|
28
|
-
console.log(message);
|
|
29
|
-
};
|
|
30
|
-
const baseUrl = new URL(url);
|
|
31
|
-
const allResources = [];
|
|
32
|
-
progress(`🚀 Starting replication of ${url}`);
|
|
33
|
-
log(` Output directory: ${outputDir}`);
|
|
34
|
-
log(` Device sizes: ${Object.keys(targetSizes).join(', ')}`);
|
|
35
|
-
progress(`🌐 Launching browser...`);
|
|
36
|
-
const browser = await launch({
|
|
37
|
-
headless: true,
|
|
38
|
-
});
|
|
39
|
-
try {
|
|
40
|
-
// Process each device size
|
|
41
|
-
for (const [sizeName, sizeConfig] of Object.entries(targetSizes)) {
|
|
42
|
-
const { width } = sizeConfig;
|
|
43
|
-
const resolution = 'resolution' in sizeConfig ? sizeConfig.resolution : undefined;
|
|
44
|
-
progress(`📱 Processing ${sizeName} (${width}px${resolution ? `, ${resolution}x` : ''})...`);
|
|
45
|
-
const page = await browser.newPage();
|
|
46
|
-
const sizeResources = [];
|
|
47
|
-
try {
|
|
48
|
-
await processPageForSize(page, url, baseUrl, sizeResources, {
|
|
49
|
-
sizeName,
|
|
50
|
-
width,
|
|
51
|
-
resolution,
|
|
52
|
-
timeout,
|
|
53
|
-
log,
|
|
54
|
-
progress,
|
|
55
|
-
});
|
|
56
|
-
// Merge resources, avoiding duplicates
|
|
57
|
-
for (const resource of sizeResources) {
|
|
58
|
-
const existing = allResources.find((r) => r.url === resource.url);
|
|
59
|
-
if (!existing) {
|
|
60
|
-
allResources.push(resource);
|
|
61
|
-
}
|
|
62
|
-
else if (!existing.content && resource.content) {
|
|
63
|
-
existing.content = resource.content;
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
finally {
|
|
68
|
-
await page.close().catch((error) => {
|
|
69
|
-
log(`⚠️ Warning: Failed to close page for ${sizeName}: ${error instanceof Error ? error.message : String(error)}`);
|
|
70
|
-
});
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
const resourceCount = allResources.length;
|
|
74
|
-
const downloadedCount = allResources.filter((r) => r.content).length;
|
|
75
|
-
progress(`📄 Found ${resourceCount} total resources (${downloadedCount} downloaded successfully)`);
|
|
76
|
-
// Ensure output directory exists
|
|
77
|
-
progress(`📁 Creating output directory...`);
|
|
78
|
-
await fs.mkdir(outputDir, { recursive: true });
|
|
79
|
-
// Save all resources
|
|
80
|
-
progress(`💾 Saving files to disk...`);
|
|
81
|
-
const savedCount = await saveResources(allResources, outputDir, log, progress);
|
|
82
|
-
progress(`🎉 Replication complete! ${savedCount} files saved to ${outputDir}`);
|
|
13
|
+
function encodeResourcePath(pathname, mimeType) {
    // An empty pathname stands for the site root.
    const resourcePath = pathname === '' ? '/' : pathname;
    // Everything after the final "/" (the whole string when there is no slash).
    const fileName = resourcePath.slice(resourcePath.lastIndexOf('/') + 1);
    // Paths that already carry an extension, or have no known MIME type, pass through untouched.
    if (fileName.includes('.') || !mimeType) {
        return resourcePath;
    }
    // Extension-less path: append the MIME type after the ":::" separator.
    return `${resourcePath}:::${mimeType}`;
}
|
|
92
29
|
/**
|
|
93
|
-
*
|
|
94
|
-
* @param
|
|
95
|
-
* @param
|
|
96
|
-
* @
|
|
97
|
-
* @param resources
|
|
98
|
-
* @param options
|
|
99
|
-
* @param options.sizeName
|
|
100
|
-
* @param options.width
|
|
101
|
-
* @param options.resolution
|
|
102
|
-
* @param options.timeout
|
|
103
|
-
* @param options.log
|
|
104
|
-
* @param options.progress
|
|
30
|
+
* Collect page URLs without resource scanning (page-only mode)
|
|
31
|
+
* @param urls - Array of URLs to process
|
|
32
|
+
* @param progress - Progress logger function
|
|
33
|
+
* @returns Set of encoded URLs
|
|
105
34
|
*/
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
const
|
|
112
|
-
const
|
|
113
|
-
//
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
if (!resources.some((r) => r.url === requestUrl)) {
|
|
120
|
-
resources.push({
|
|
121
|
-
url: requestUrl,
|
|
122
|
-
localPath,
|
|
123
|
-
type: resourceType,
|
|
124
|
-
});
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
else {
|
|
128
|
-
log(`🚫 [${sizeName}] Skipping external resource: ${requestUrl}`);
|
|
129
|
-
}
|
|
130
|
-
});
|
|
131
|
-
page.on('response', (response) => {
|
|
132
|
-
const responseUrl = response.url();
|
|
133
|
-
const responseUrlObj = new URL(responseUrl);
|
|
134
|
-
// Only handle same-host resources
|
|
135
|
-
if (responseUrlObj.hostname === baseUrl.hostname) {
|
|
136
|
-
const promise = (async () => {
|
|
137
|
-
const resource = resources.find((r) => r.url === responseUrl);
|
|
138
|
-
if (resource && response.ok()) {
|
|
139
|
-
await response
|
|
140
|
-
.buffer()
|
|
141
|
-
.then((buffer) => {
|
|
142
|
-
resource.content = buffer;
|
|
143
|
-
log(`✅ [${sizeName}] Downloaded: ${responseUrl}`);
|
|
144
|
-
})
|
|
145
|
-
.catch((error) => {
|
|
146
|
-
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
147
|
-
log(`❌ [${sizeName}] Failed to download: ${responseUrl} - ${errorMessage}`);
|
|
148
|
-
});
|
|
149
|
-
}
|
|
150
|
-
else if (resource) {
|
|
151
|
-
log(`❌ [${sizeName}] Resource failed (${response.status()}): ${responseUrl}`);
|
|
152
|
-
}
|
|
153
|
-
})();
|
|
154
|
-
requestPromises.push(promise);
|
|
155
|
-
}
|
|
156
|
-
});
|
|
157
|
-
// Set viewport and navigate using beforePageScan (which includes scrolling)
|
|
158
|
-
progress(`📡 [${sizeName}] Setting viewport and navigating...`);
|
|
159
|
-
await beforePageScan(page, url, {
|
|
160
|
-
name: sizeName,
|
|
161
|
-
width,
|
|
162
|
-
resolution,
|
|
163
|
-
timeout,
|
|
164
|
-
listener: (phase, data) => {
|
|
165
|
-
switch (phase) {
|
|
166
|
-
case 'setViewport': {
|
|
167
|
-
const setViewportData = data;
|
|
168
|
-
log(`📱 [${sizeName}] Viewport set: ${setViewportData.width}px${setViewportData.resolution ? ` @ ${setViewportData.resolution}x` : ''}`);
|
|
169
|
-
break;
|
|
170
|
-
}
|
|
171
|
-
case 'load': {
|
|
172
|
-
const loadData = data;
|
|
173
|
-
log(`📄 [${sizeName}] Page loaded (${loadData.type})`);
|
|
174
|
-
break;
|
|
175
|
-
}
|
|
176
|
-
case 'scroll': {
|
|
177
|
-
const scrollData = data;
|
|
178
|
-
switch (scrollData.message) {
|
|
179
|
-
case 'Start scrolling': {
|
|
180
|
-
log(`📜 [${sizeName}] Starting scroll to trigger lazy loading...`);
|
|
181
|
-
break;
|
|
182
|
-
}
|
|
183
|
-
case 'End of page': {
|
|
184
|
-
log(`📜 [${sizeName}] Scroll completed (${scrollData.scrollY}/${scrollData.scrollHeight}px)`);
|
|
185
|
-
break;
|
|
186
|
-
}
|
|
187
|
-
case 'Scrolling': {
|
|
188
|
-
const progress = Math.round((scrollData.scrollY / scrollData.scrollHeight) * 100);
|
|
189
|
-
log(`📜 [${sizeName}] Scrolling progress: ${progress}% (${scrollData.scrollY}/${scrollData.scrollHeight}px)`);
|
|
190
|
-
break;
|
|
191
|
-
}
|
|
192
|
-
// No default
|
|
193
|
-
}
|
|
194
|
-
break;
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
},
|
|
198
|
-
});
|
|
199
|
-
progress(`⏳ [${sizeName}] Waiting for all resources to load...`);
|
|
200
|
-
// Wait for all downloads to complete
|
|
201
|
-
await Promise.all(requestPromises);
|
|
202
|
-
const resourceCount = resources.length;
|
|
203
|
-
const downloadedCount = resources.filter((r) => r.content).length;
|
|
204
|
-
progress(`📄 [${sizeName}] Found ${resourceCount} resources (${downloadedCount} downloaded)`);
|
|
35
|
+
function collectPageUrlsOnly(urls, progress) {
    progress(c.bold.yellow('📄 Page-only mode: Skipping resource collection...'));
    progress('');
    // Every requested URL is treated as an HTML page; the Set collapses
    // URLs that map to the same encoded path.
    const pagePaths = new Set();
    for (const pageUrl of urls) {
        const { pathname } = new URL(pageUrl);
        pagePaths.add(encodeResourcePath(pathname || '/', 'text/html'));
    }
    progress(c.bold.green(`✅ Prepared ${pagePaths.size} page(s) for download`));
    return pagePaths;
}
|
|
206
49
|
/**
|
|
207
|
-
*
|
|
208
|
-
* @param
|
|
209
|
-
* @param
|
|
210
|
-
* @param
|
|
211
|
-
* @param
|
|
50
|
+
* Collect all resource URLs using Puppeteer scanning
|
|
51
|
+
* @param urls - Array of URLs to process
|
|
52
|
+
* @param targetSizes - Device sizes for responsive scanning
|
|
53
|
+
* @param timeout - Request timeout
|
|
54
|
+
* @param verbose - Enable verbose logging
|
|
55
|
+
* @param limit - Parallel execution limit
|
|
56
|
+
* @param progress - Progress logger function
|
|
57
|
+
* @returns Set of encoded URLs
|
|
212
58
|
*/
|
|
213
|
-
async function
|
|
214
|
-
|
|
215
|
-
const
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
59
|
+
async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress) {
    progress(c.bold.yellow('📡 Phase 1: Collecting resource metadata...'));
    // One work item per URL; results are gathered through the `each` callback.
    const workItems = urls.map((url) => ({ id: null, url }));
    const pageResults = [];
    // Header line shown while the URLs are being processed.
    const renderHeader = (_, done, total) => {
        const percentage = Math.round((done / total) * 100);
        return `${c.bold.cyan('🌐 Replicating')} ${done}/${total} (${percentage}%)`;
    };
    // Creates the process that runs child-process.js (located next to this module).
    const spawnScanner = () => createProcess(path.resolve(import.meta.dirname, 'child-process.js'), {
        devices: targetSizes,
        timeout,
    }, {});
    await deal(workItems, renderHeader, spawnScanner, {
        verbose,
        limit,
        each: (result) => {
            pageResults.push(result);
        },
    });
    progress('');
    progress(c.bold.green(`✅ Phase 1 complete: Collected metadata from ${pageResults.length} URL(s)`));
    // Log collected URLs in verbose mode
    if (verbose) {
        progress('');
        progress(c.gray('📋 Collected URLs by page:'));
        for (const pageResult of pageResults) {
            progress(c.gray(` ${pageResult.url}:`));
            for (const encodedUrl of pageResult.encodedUrls) {
                progress(c.gray(` - ${encodedUrl}`));
            }
        }
    }
    // Merge the per-page lists into one de-duplicated set.
    return new Set(pageResults.flatMap((pageResult) => [...pageResult.encodedUrls]));
}
|
|
261
97
|
/**
|
|
98
|
+
* Replicate web pages with all their resources to local directories
|
|
262
99
|
*
|
|
263
|
-
*
|
|
100
|
+
* ## Architecture
|
|
101
|
+
*
|
|
102
|
+
* This implementation uses a two-phase architecture for memory efficiency:
|
|
103
|
+
*
|
|
104
|
+
* ### Phase 1: Metadata Collection
|
|
105
|
+
* - Each URL is processed in a separate child process using puppeteer-dealer
|
|
106
|
+
* - Child processes scan pages with Puppeteer and collect resource URLs
|
|
107
|
+
* - For paths whose last segment has no file extension (e.g., https://example.com/ or /about), the MIME type is captured
|
|
108
|
+
* and encoded in "pathname:::MIME/type" format
|
|
109
|
+
* - Only metadata (URLs + MIME types) is returned to parent - no buffer data
|
|
110
|
+
*
|
|
111
|
+
* ### Phase 2: Resource Download
|
|
112
|
+
* - Parent process aggregates all metadata and removes duplicates
|
|
113
|
+
* - Parses encoded URLs to determine correct local paths
|
|
114
|
+
* - Downloads resources via fetch() and immediately writes to disk
|
|
115
|
+
* - No resource content is kept in memory
|
|
116
|
+
*
|
|
117
|
+
* This approach minimizes memory usage by avoiding duplicate I/O operations
|
|
118
|
+
* and keeping buffer data out of inter-process communication.
|
|
119
|
+
* @param options - Replication options
|
|
264
120
|
*/
|
|
265
|
-
function
|
|
266
|
-
const
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
pathname.endsWith('.htm') ||
|
|
270
|
-
pathname === '/' ||
|
|
271
|
-
!pathname.includes('.')) {
|
|
272
|
-
return 'html';
|
|
121
|
+
export async function replicate(options) {
    const {
        urls,
        outputDir,
        verbose = false,
        timeout = 30_000,
        devices,
        limit = 3,
        only,
    } = options;
    if (urls.length === 0) {
        throw new Error('At least one URL is required');
    }
    // Validate that all URLs share the same hostname
    validateSameHost(urls);
    // Always-on progress output.
    const progress = (message) => {
        // eslint-disable-next-line no-console
        console.log(message);
    };
    // Detail output, emitted only in verbose mode.
    const log = (message) => {
        if (verbose) {
            // eslint-disable-next-line no-console
            console.log(message);
        }
    };
    // Fall back to the two default presets when the caller gives no devices.
    const targetSizes = devices ?? {
        'desktop-compact': devicePresets['desktop-compact'],
        mobile: devicePresets.mobile,
    };
    progress(c.bold.cyan(`🌐 Replicating ${urls.length} URL(s)`));
    progress(c.gray(` Output: ${outputDir}`));
    progress(c.gray(` Parallel limit: ${limit}`));
    progress('');
    // Phase 1: page-only mode skips the browser scan; every other valid mode runs it.
    let allEncodedUrls;
    if (only === 'page') {
        allEncodedUrls = collectPageUrlsOnly(urls, progress);
    }
    else if (only === 'resource' || only === undefined) {
        allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress);
    }
    else {
        throw new Error(`Invalid only option: ${only}`);
    }
    progress('');
    // Phase 2: Download resources
    progress(c.bold.yellow('📦 Phase 2: Downloading resources...'));
    log(` Total unique resources: ${allEncodedUrls.size}`);
    // The first URL serves as the base for turning collected pathnames back into full URLs.
    const [baseUrl] = urls;
    await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only);
    progress('');
    progress(c.bold.green(`✅ Replication complete!`));
    progress(c.gray(` All resources saved to: ${outputDir}`));
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import { mkdir, writeFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { deal } from '@d-zero/dealer';
|
|
4
|
+
import { mimeToExtension } from '@d-zero/shared/mime-to-extension';
|
|
5
|
+
import { urlToLocalPath } from '@d-zero/shared/url-to-local-path';
|
|
6
|
+
import c from 'ansi-colors';
|
|
7
|
+
/**
 * Parse an encoded pathname and return the absolute URL to fetch plus the
 * local path to save it under.
 * @param encodedPath - pathname or "pathname:::MIME/type" format
 * @param baseUrl - Base URL to construct full URL from pathname
 * @returns Object with the absolute `url` and the `localPath`
 */
function parseEncodedPath(encodedPath, baseUrl) {
    const parts = encodedPath.split(':::');
    // Exactly one separator means "pathname:::MIME/type"; anything else is a plain pathname.
    const isMimeEncoded = parts.length === 2;
    const pathname = isMimeEncoded ? parts[0] : encodedPath;
    const url = resolvePathnameOnOrigin(pathname, baseUrl);
    // Plain pathnames pass an empty extension to urlToLocalPath.
    const extension = isMimeEncoded ? mimeToExtension(parts[1]) : '';
    const localPath = urlToLocalPath(url, extension);
    return { url, localPath };
}
/**
 * Resolve a pathname against the origin of `baseUrl`.
 *
 * A pathname such as "//cdn/app.js" would be read by `new URL(pathname, base)`
 * as a network-path reference and resolve to a different host. Joining it
 * onto the origin string keeps it on the base host.
 * @param pathname - Pathname to resolve
 * @param baseUrl - URL whose origin is used
 * @returns Absolute URL string
 */
function resolvePathnameOnOrigin(pathname, baseUrl) {
    const { origin } = new URL(baseUrl);
    // Opaque origins serialize as "null"; fall back to ordinary relative
    // resolution for those and for pathnames that are not root-relative.
    if (pathname.startsWith('/') && origin !== 'null') {
        return new URL(`${origin}${pathname}`).href;
    }
    return new URL(pathname, baseUrl).href;
}
|
|
29
|
+
/**
 * Download and save resources to disk.
 *
 * Encoded paths are resolved to URLs and local paths, filtered by `only`,
 * de-duplicated by local path, then fetched and written one file at a time.
 * A failure in any single resource is reported on its status line and
 * counted; it does not abort the remaining downloads.
 * @param encodedPaths - Array of encoded pathnames
 * @param baseUrl - Base URL to construct full URLs
 * @param outputDir - Output directory
 * @param logger - Logger function
 * @param verbose - Enable verbose output
 * @param only - Download only specified type: page or resource
 */
export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only) {
    const uniqueResources = new Map();
    // Parse all encoded pathnames
    for (const encodedPath of encodedPaths) {
        const { url, localPath } = parseEncodedPath(encodedPath, baseUrl);
        // Filter based on 'only' option. A local path ending in .html or .htm
        // (any letter case) counts as an HTML page.
        const isHtmlPage = /\.html?$/i.test(localPath);
        if (only === 'resource' && isHtmlPage) {
            // Skip HTML pages in resource-only mode
            continue;
        }
        if (only === 'page' && !isHtmlPage) {
            // Skip non-HTML resources in page-only mode
            continue;
        }
        // First occurrence wins when several encoded paths map to one local file.
        if (!uniqueResources.has(localPath)) {
            uniqueResources.set(localPath, { url, localPath, encodedPath });
        }
    }
    const tasks = [...uniqueResources.values()];
    if (tasks.length === 0) {
        logger(c.yellow('⚠️ No resources to download'));
        return;
    }
    logger(`📥 Downloading ${tasks.length} unique resources...`);
    logger('');
    let downloaded = 0;
    let failed = 0;
    await deal(tasks, (task, update, index) => {
        const fileId = index.toString().padStart(4, '0');
        const lineHeader = `%braille% ${c.bgWhite(` ${fileId} `)} ${c.gray(task.localPath)}: `;
        // Show the failure on this task's status line and count it.
        const fail = (message) => {
            update(`${lineHeader}${c.red(message)}`);
            failed++;
        };
        return async () => {
            update(`${lineHeader}Fetching%dots%`);
            let response;
            try {
                response = await fetch(task.url);
            }
            catch (error) {
                fail(`❌ Fetch failed: ${error.message}`);
                return;
            }
            if (!response.ok) {
                fail(`❌ HTTP ${response.status}`);
                return;
            }
            update(`${lineHeader}Reading content%dots%`);
            let content;
            try {
                // Reading the body can still fail after the headers arrived;
                // treat that like any other per-file failure.
                content = Buffer.from(await response.arrayBuffer());
            }
            catch (error) {
                fail(`❌ Failed to read content: ${error.message}`);
                return;
            }
            const fullPath = path.join(outputDir, task.localPath);
            update(`${lineHeader}Creating directory%dots%`);
            try {
                await mkdir(path.dirname(fullPath), { recursive: true });
            }
            catch (error) {
                fail(`❌ Failed to create directory: ${error.message}`);
                return;
            }
            update(`${lineHeader}Writing file%dots%`);
            try {
                await writeFile(fullPath, content);
            }
            catch (error) {
                fail(`❌ Failed to write: ${error.message}`);
                return;
            }
            downloaded++;
            update(`${lineHeader}${c.green('✅ Downloaded')}`);
        };
    }, {
        limit: 10,
        verbose,
        header: (progress, done, total, limit) => {
            const percentage = Math.round(progress * 100);
            if (progress === 1) {
                return `${c.bold.green('📥 Download Complete')} ${done}/${total} (${percentage}%) - ${c.green(`✅ ${downloaded}`)} ${c.red(`❌ ${failed}`)}`;
            }
            return `${c.bold.cyan('📥 Downloading')} %earth% %dots% ${done}/${total} (${percentage}%) - Limit: ${limit}`;
        },
    });
    logger('');
    logger(c.bold.green(`✅ Downloaded: ${downloaded}, Failed: ${failed}`));
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/replicator",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"description": "Replicate web pages with all their resources to local directories",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -25,15 +25,17 @@
|
|
|
25
25
|
},
|
|
26
26
|
"dependencies": {
|
|
27
27
|
"@d-zero/cli-core": "1.1.3",
|
|
28
|
-
"@d-zero/
|
|
29
|
-
"@d-zero/puppeteer-
|
|
30
|
-
"@d-zero/
|
|
28
|
+
"@d-zero/dealer": "1.3.2",
|
|
29
|
+
"@d-zero/puppeteer-dealer": "0.5.9",
|
|
30
|
+
"@d-zero/puppeteer-page-scan": "4.2.5",
|
|
31
|
+
"@d-zero/puppeteer-scroll": "3.0.11",
|
|
32
|
+
"@d-zero/shared": "0.12.0",
|
|
31
33
|
"ansi-colors": "4.1.3",
|
|
32
34
|
"minimist": "1.2.8",
|
|
33
|
-
"puppeteer": "24.
|
|
35
|
+
"puppeteer": "24.26.1"
|
|
34
36
|
},
|
|
35
37
|
"devDependencies": {
|
|
36
38
|
"@types/minimist": "1.2.5"
|
|
37
39
|
},
|
|
38
|
-
"gitHead": "
|
|
40
|
+
"gitHead": "fe6d98ee0108b0e53848f28a74e4e08875e31a78"
|
|
39
41
|
}
|