@d-zero/replicator 0.8.6 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -0
- package/dist/child-process.js +4 -1
- package/dist/cli.js +8 -0
- package/dist/index.js +8 -4
- package/dist/resource-downloader.d.ts +3 -1
- package/dist/resource-downloader.js +9 -2
- package/dist/types.d.ts +4 -0
- package/package.json +8 -8
package/README.md
CHANGED
|
@@ -26,6 +26,7 @@ npx @d-zero/replicator <url...> -o <output-directory> [options]
|
|
|
26
26
|
- `--interval <ms>`: 並列実行間の間隔(デフォルト: なし)
|
|
27
27
|
- 数値または"min-max"形式でランダム範囲を指定可能
|
|
28
28
|
- `--only <type>`: ダウンロード対象を限定(`page` または `resource`)
|
|
29
|
+
- `-a, --auth <user:pass>`: Basic認証の認証情報(`ユーザー名:パスワード` 形式)
|
|
29
30
|
- `-v, --verbose`: 詳細ログモード
|
|
30
31
|
|
|
31
32
|
##### `--only` オプション
|
|
@@ -67,6 +68,9 @@ npx @d-zero/replicator https://example.com -o ./output --only page
|
|
|
67
68
|
|
|
68
69
|
# リソースのみダウンロード(HTMLを除外)
|
|
69
70
|
npx @d-zero/replicator https://example.com -o ./output --only resource
|
|
71
|
+
|
|
72
|
+
# Basic認証が必要なページ
|
|
73
|
+
npx @d-zero/replicator https://example.com -o ./output -a username:password
|
|
70
74
|
```
|
|
71
75
|
|
|
72
76
|
### プログラマティック使用
|
|
@@ -116,10 +120,19 @@ await replicate({
|
|
|
116
120
|
outputDir: './output',
|
|
117
121
|
only: 'resource',
|
|
118
122
|
});
|
|
123
|
+
|
|
124
|
+
// Basic認証が必要なページ
|
|
125
|
+
await replicate({
|
|
126
|
+
urls: ['https://example.com'],
|
|
127
|
+
outputDir: './output',
|
|
128
|
+
username: 'username',
|
|
129
|
+
password: 'password',
|
|
130
|
+
});
|
|
119
131
|
```
|
|
120
132
|
|
|
121
133
|
## 機能
|
|
122
134
|
|
|
135
|
+
- **Basic認証対応**: `--auth user:pass`オプションでBasic認証が必要なページにアクセス可能
|
|
123
136
|
- **並列処理**: 複数のURLを並列で効率的に処理
|
|
124
137
|
- **メモリ効率**: リソースを直接ディスクに保存してメモリ使用量を最小化
|
|
125
138
|
- **選択的ダウンロード**: `--only`オプションでHTMLページのみまたはリソースのみをダウンロード可能
|
package/dist/child-process.js
CHANGED
|
@@ -3,7 +3,7 @@ import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
|
3
3
|
import { scrollAllOver } from '@d-zero/puppeteer-scroll';
|
|
4
4
|
import { encodeResourcePath } from '@d-zero/shared/encode-resource-path';
|
|
5
5
|
createChildProcess((param) => {
|
|
6
|
-
const { devices, timeout } = param;
|
|
6
|
+
const { devices, timeout, username, password } = param;
|
|
7
7
|
return {
|
|
8
8
|
async eachPage({ page, url }, logger) {
|
|
9
9
|
const resourcePaths = new Set();
|
|
@@ -38,6 +38,9 @@ createChildProcess((param) => {
|
|
|
38
38
|
resourcePaths.add(encodeResourcePath(resourceUrlObj, mimeType));
|
|
39
39
|
};
|
|
40
40
|
page.on('response', responseHandler);
|
|
41
|
+
if (username && password) {
|
|
42
|
+
await page.authenticate({ username, password });
|
|
43
|
+
}
|
|
41
44
|
const defaultSizes = {
|
|
42
45
|
'desktop-compact': devicePresets['desktop-compact'],
|
|
43
46
|
mobile: devicePresets.mobile,
|
package/dist/cli.js
CHANGED
|
@@ -14,6 +14,7 @@ const { options, args } = createCLI({
|
|
|
14
14
|
t: 'timeout',
|
|
15
15
|
d: 'devices',
|
|
16
16
|
l: 'limit',
|
|
17
|
+
a: 'auth',
|
|
17
18
|
},
|
|
18
19
|
usage: [
|
|
19
20
|
'Usage: replicator <url1> [url2...] -o <output-directory> [options]',
|
|
@@ -26,6 +27,7 @@ const { options, args } = createCLI({
|
|
|
26
27
|
' --interval <ms> Interval between parallel executions (default: none)',
|
|
27
28
|
' Format: number or "min-max" for random range',
|
|
28
29
|
' --only <type> Download only specified type: page or resource',
|
|
30
|
+
' -a, --auth <user:pass> Credentials for Basic authentication',
|
|
29
31
|
' -v, --verbose Enable verbose logging',
|
|
30
32
|
'',
|
|
31
33
|
'Available device presets:',
|
|
@@ -46,6 +48,7 @@ const { options, args } = createCLI({
|
|
|
46
48
|
devices: cli.devices,
|
|
47
49
|
limit: cli.limit ? Number(cli.limit) : undefined,
|
|
48
50
|
only: cli.only,
|
|
51
|
+
auth: cli.auth,
|
|
49
52
|
}),
|
|
50
53
|
validateArgs: (options, cli) => {
|
|
51
54
|
if (options.only && options.only !== 'page' && options.only !== 'resource') {
|
|
@@ -64,6 +67,9 @@ if (urls.length === 0) {
|
|
|
64
67
|
try {
|
|
65
68
|
const deviceNames = options.devices ? parseList(options.devices) : undefined;
|
|
66
69
|
const devices = parseDevicesOption(deviceNames);
|
|
70
|
+
const colonIndex = options.auth ? options.auth.indexOf(':') : -1;
|
|
71
|
+
const username = colonIndex >= 0 ? options.auth.slice(0, colonIndex) : undefined;
|
|
72
|
+
const password = colonIndex >= 0 ? options.auth.slice(colonIndex + 1) : undefined;
|
|
67
73
|
await replicate({
|
|
68
74
|
urls,
|
|
69
75
|
outputDir,
|
|
@@ -73,6 +79,8 @@ try {
|
|
|
73
79
|
limit: options.limit,
|
|
74
80
|
only: options.only,
|
|
75
81
|
interval: options.interval,
|
|
82
|
+
username,
|
|
83
|
+
password,
|
|
76
84
|
});
|
|
77
85
|
}
|
|
78
86
|
catch (error) {
|
package/dist/index.js
CHANGED
|
@@ -33,9 +33,11 @@ function collectPageUrlsOnly(urls, progress) {
|
|
|
33
33
|
* @param limit - Parallel execution limit
|
|
34
34
|
* @param progress - Progress logger function
|
|
35
35
|
* @param interval
|
|
36
|
+
* @param username
|
|
37
|
+
* @param password
|
|
36
38
|
* @returns Set of encoded URLs
|
|
37
39
|
*/
|
|
38
|
-
async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval) {
|
|
40
|
+
async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval, username, password) {
|
|
39
41
|
progress(c.bold.yellow('📡 Phase 1: Collecting resource metadata...'));
|
|
40
42
|
const results = [];
|
|
41
43
|
await deal(urls.map((url) => ({ id: null, url })), (_, done, total) => {
|
|
@@ -44,6 +46,8 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
|
|
|
44
46
|
}, () => createProcess(path.resolve(import.meta.dirname, 'child-process.js'), {
|
|
45
47
|
devices: targetSizes,
|
|
46
48
|
timeout,
|
|
49
|
+
username,
|
|
50
|
+
password,
|
|
47
51
|
}, {}), {
|
|
48
52
|
verbose,
|
|
49
53
|
limit,
|
|
@@ -99,7 +103,7 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
|
|
|
99
103
|
* @param options - Replication options
|
|
100
104
|
*/
|
|
101
105
|
export async function replicate(options) {
|
|
102
|
-
const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, interval, } = options;
|
|
106
|
+
const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, interval, username, password, } = options;
|
|
103
107
|
if (urls.length === 0) {
|
|
104
108
|
throw new Error('At least one URL is required');
|
|
105
109
|
}
|
|
@@ -134,7 +138,7 @@ export async function replicate(options) {
|
|
|
134
138
|
}
|
|
135
139
|
case 'resource':
|
|
136
140
|
case undefined: {
|
|
137
|
-
allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval);
|
|
141
|
+
allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval, username, password);
|
|
138
142
|
break;
|
|
139
143
|
}
|
|
140
144
|
default: {
|
|
@@ -148,7 +152,7 @@ export async function replicate(options) {
|
|
|
148
152
|
// Use the first URL as base URL for constructing full URLs
|
|
149
153
|
const baseUrl = urls[0];
|
|
150
154
|
// Download all resources
|
|
151
|
-
await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only, interval);
|
|
155
|
+
await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only, interval, username, password);
|
|
152
156
|
progress('');
|
|
153
157
|
progress(c.bold.green(`✅ Replication complete!`));
|
|
154
158
|
progress(c.gray(` All resources saved to: ${outputDir}`));
|
package/dist/resource-downloader.d.ts
CHANGED
|
@@ -17,5 +17,7 @@ import type { DelayOptions } from '@d-zero/shared/delay';
|
|
|
17
17
|
* @param verbose
|
|
18
18
|
* @param only
|
|
19
19
|
* @param interval
|
|
20
|
+
* @param username
|
|
21
|
+
* @param password
|
|
20
22
|
*/
|
|
21
|
-
export declare function downloadResources(encodedPaths: string[], baseUrl: string, outputDir: string, logger: (message: string) => void, verbose?: boolean, only?: 'page' | 'resource', interval?: number | DelayOptions): Promise<void>;
|
|
23
|
+
export declare function downloadResources(encodedPaths: string[], baseUrl: string, outputDir: string, logger: (message: string) => void, verbose?: boolean, only?: 'page' | 'resource', interval?: number | DelayOptions, username?: string, password?: string): Promise<void>;
|
package/dist/resource-downloader.js
CHANGED
|
@@ -21,8 +21,10 @@ import c from 'ansi-colors';
|
|
|
21
21
|
* @param verbose
|
|
22
22
|
* @param only
|
|
23
23
|
* @param interval
|
|
24
|
+
* @param username
|
|
25
|
+
* @param password
|
|
24
26
|
*/
|
|
25
|
-
export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only, interval) {
|
|
27
|
+
export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only, interval, username, password) {
|
|
26
28
|
const uniqueResources = new Map();
|
|
27
29
|
// Parse all encoded pathnames
|
|
28
30
|
for (const encodedPath of encodedPaths) {
|
|
@@ -46,6 +48,9 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
|
|
|
46
48
|
}
|
|
47
49
|
}
|
|
48
50
|
const tasks = [...uniqueResources.values()];
|
|
51
|
+
const authHeader = username && password
|
|
52
|
+
? `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`
|
|
53
|
+
: undefined;
|
|
49
54
|
if (tasks.length === 0) {
|
|
50
55
|
logger(c.yellow('⚠️ No resources to download'));
|
|
51
56
|
return;
|
|
@@ -60,7 +65,9 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
|
|
|
60
65
|
setLineHeader(lineHeader);
|
|
61
66
|
return async () => {
|
|
62
67
|
update('Fetching%dots%');
|
|
63
|
-
const response = await fetch(task.url).catch((error) => {
|
|
68
|
+
const response = await fetch(task.url, {
|
|
69
|
+
headers: authHeader ? { Authorization: authHeader } : {},
|
|
70
|
+
}).catch((error) => {
|
|
64
71
|
update(c.red(`❌ Fetch failed: ${error.message}`));
|
|
65
72
|
failed++;
|
|
66
73
|
return null;
|
package/dist/types.d.ts
CHANGED
|
@@ -11,6 +11,8 @@ export interface ReplicateOptions {
|
|
|
11
11
|
limit?: number;
|
|
12
12
|
only?: 'page' | 'resource';
|
|
13
13
|
interval?: number | DelayOptions;
|
|
14
|
+
username?: string;
|
|
15
|
+
password?: string;
|
|
14
16
|
}
|
|
15
17
|
export interface Resource {
|
|
16
18
|
url: string;
|
|
@@ -22,6 +24,8 @@ export interface ChildProcessInput {
|
|
|
22
24
|
resolution?: number;
|
|
23
25
|
}>;
|
|
24
26
|
timeout?: number;
|
|
27
|
+
username?: string;
|
|
28
|
+
password?: string;
|
|
25
29
|
}
|
|
26
30
|
export interface ChildProcessResult {
|
|
27
31
|
url: string;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/replicator",
|
|
3
|
-
"version": "0.8.6",
|
|
3
|
+
"version": "0.9.0",
|
|
4
4
|
"description": "Replicate web pages with all their resources to local directories",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,12 +24,12 @@
|
|
|
24
24
|
"clean": "tsc --build --clean"
|
|
25
25
|
},
|
|
26
26
|
"dependencies": {
|
|
27
|
-
"@d-zero/cli-core": "1.3.
|
|
28
|
-
"@d-zero/dealer": "1.6.
|
|
29
|
-
"@d-zero/puppeteer-dealer": "0.7.
|
|
30
|
-
"@d-zero/puppeteer-page-scan": "4.4.
|
|
31
|
-
"@d-zero/puppeteer-scroll": "3.1.
|
|
32
|
-
"@d-zero/shared": "0.20.
|
|
27
|
+
"@d-zero/cli-core": "1.3.4",
|
|
28
|
+
"@d-zero/dealer": "1.6.5",
|
|
29
|
+
"@d-zero/puppeteer-dealer": "0.7.5",
|
|
30
|
+
"@d-zero/puppeteer-page-scan": "4.4.5",
|
|
31
|
+
"@d-zero/puppeteer-scroll": "3.1.13",
|
|
32
|
+
"@d-zero/shared": "0.20.1",
|
|
33
33
|
"ansi-colors": "4.1.3",
|
|
34
34
|
"minimist": "1.2.8",
|
|
35
35
|
"puppeteer": "24.37.5"
|
|
@@ -37,5 +37,5 @@
|
|
|
37
37
|
"devDependencies": {
|
|
38
38
|
"@types/minimist": "1.2.5"
|
|
39
39
|
},
|
|
40
|
-
"gitHead": "
|
|
40
|
+
"gitHead": "31410767ae6beff5c5dbe21a824406a4e6716868"
|
|
41
41
|
}
|