@d-zero/replicator 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/child-process.d.ts +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +3 -0
- package/dist/index.d.ts +26 -0
- package/dist/index.js +6 -4
- package/dist/resource-downloader.d.ts +21 -0
- package/dist/resource-downloader.js +23 -11
- package/dist/types.d.ts +29 -0
- package/package.json +9 -9
package/dist/child-process.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
CHANGED
|
@@ -18,6 +18,8 @@ const { options, args } = createCLI({
|
|
|
18
18
|
' -t, --timeout <ms> Request timeout in milliseconds (default: 30000)',
|
|
19
19
|
' -d, --devices <devices> Device presets (comma-separated, default: desktop-compact,mobile)',
|
|
20
20
|
' -l, --limit <number> Parallel execution limit (default: 3)',
|
|
21
|
+
' --interval <ms> Interval between parallel executions (default: none)',
|
|
22
|
+
' Format: number or "min-max" for random range',
|
|
21
23
|
' --only <type> Download only specified type: page or resource',
|
|
22
24
|
' -v, --verbose Enable verbose logging',
|
|
23
25
|
'',
|
|
@@ -65,6 +67,7 @@ try {
|
|
|
65
67
|
devices,
|
|
66
68
|
limit: options.limit,
|
|
67
69
|
only: options.only,
|
|
70
|
+
interval: options.interval,
|
|
68
71
|
});
|
|
69
72
|
}
|
|
70
73
|
catch (error) {
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { ReplicateOptions } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Replicate web pages with all their resources to local directories
|
|
4
|
+
*
|
|
5
|
+
* ## Architecture
|
|
6
|
+
*
|
|
7
|
+
* This implementation uses a two-phase architecture for memory efficiency:
|
|
8
|
+
*
|
|
9
|
+
* ### Phase 1: Metadata Collection
|
|
10
|
+
* - Each URL is processed in a separate child process using puppeteer-dealer
|
|
11
|
+
* - Child processes scan pages with Puppeteer and collect resource URLs
|
|
12
|
+
* - For URLs ending with '/' (e.g., https://example.com/), MIME type is captured
|
|
13
|
+
* and encoded as "url:::MIME/type" format
|
|
14
|
+
* - Only metadata (URLs + MIME types) is returned to parent - no buffer data
|
|
15
|
+
*
|
|
16
|
+
* ### Phase 2: Resource Download
|
|
17
|
+
* - Parent process aggregates all metadata and removes duplicates
|
|
18
|
+
* - Parses encoded URLs to determine correct local paths
|
|
19
|
+
* - Downloads resources via fetch() and immediately writes to disk
|
|
20
|
+
* - No resource content is kept in memory
|
|
21
|
+
*
|
|
22
|
+
* This approach minimizes memory usage by avoiding duplicate I/O operations
|
|
23
|
+
* and keeping buffer data out of inter-process communication.
|
|
24
|
+
* @param options - Replication options
|
|
25
|
+
*/
|
|
26
|
+
export declare function replicate(options: ReplicateOptions): Promise<void>;
|
package/dist/index.js
CHANGED
|
@@ -54,9 +54,10 @@ function collectPageUrlsOnly(urls, progress) {
|
|
|
54
54
|
* @param verbose - Enable verbose logging
|
|
55
55
|
* @param limit - Parallel execution limit
|
|
56
56
|
* @param progress - Progress logger function
|
|
57
|
+
* @param interval
|
|
57
58
|
* @returns Set of encoded URLs
|
|
58
59
|
*/
|
|
59
|
-
async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress) {
|
|
60
|
+
async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval) {
|
|
60
61
|
progress(c.bold.yellow('📡 Phase 1: Collecting resource metadata...'));
|
|
61
62
|
const results = [];
|
|
62
63
|
await deal(urls.map((url) => ({ id: null, url })), (_, done, total) => {
|
|
@@ -68,6 +69,7 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
|
|
|
68
69
|
}, {}), {
|
|
69
70
|
verbose,
|
|
70
71
|
limit,
|
|
72
|
+
interval,
|
|
71
73
|
each: (result) => {
|
|
72
74
|
results.push(result);
|
|
73
75
|
},
|
|
@@ -119,7 +121,7 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
|
|
|
119
121
|
* @param options - Replication options
|
|
120
122
|
*/
|
|
121
123
|
export async function replicate(options) {
|
|
122
|
-
const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, } = options;
|
|
124
|
+
const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, interval, } = options;
|
|
123
125
|
if (urls.length === 0) {
|
|
124
126
|
throw new Error('At least one URL is required');
|
|
125
127
|
}
|
|
@@ -154,7 +156,7 @@ export async function replicate(options) {
|
|
|
154
156
|
}
|
|
155
157
|
case 'resource':
|
|
156
158
|
case undefined: {
|
|
157
|
-
allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress);
|
|
159
|
+
allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval);
|
|
158
160
|
break;
|
|
159
161
|
}
|
|
160
162
|
default: {
|
|
@@ -168,7 +170,7 @@ export async function replicate(options) {
|
|
|
168
170
|
// Use the first URL as base URL for constructing full URLs
|
|
169
171
|
const baseUrl = urls[0];
|
|
170
172
|
// Download all resources
|
|
171
|
-
await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only);
|
|
173
|
+
await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only, interval);
|
|
172
174
|
progress('');
|
|
173
175
|
progress(c.bold.green(`✅ Replication complete!`));
|
|
174
176
|
progress(c.gray(` All resources saved to: ${outputDir}`));
|
package/dist/resource-downloader.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { DelayOptions } from '@d-zero/shared/delay';
|
|
2
|
+
/**
|
|
3
|
+
* Download and save resources to disk
|
|
4
|
+
* @param encodedPaths - Array of encoded pathnames
|
|
5
|
+
* @param baseUrl - Base URL to construct full URLs
|
|
6
|
+
* @param outputDir - Output directory
|
|
7
|
+
* @param logger - Logger function
|
|
8
|
+
* @param verbose - Enable verbose output
|
|
9
|
+
* @param only - Download only specified type: page or resource
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
*
|
|
13
|
+
* @param encodedPaths
|
|
14
|
+
* @param baseUrl
|
|
15
|
+
* @param outputDir
|
|
16
|
+
* @param logger
|
|
17
|
+
* @param verbose
|
|
18
|
+
* @param only
|
|
19
|
+
* @param interval
|
|
20
|
+
*/
|
|
21
|
+
export declare function downloadResources(encodedPaths: string[], baseUrl: string, outputDir: string, logger: (message: string) => void, verbose?: boolean, only?: 'page' | 'resource', interval?: number | DelayOptions): Promise<void>;
|
package/dist/resource-downloader.js
CHANGED
|
@@ -35,7 +35,17 @@ function parseEncodedPath(encodedPath, baseUrl) {
|
|
|
35
35
|
* @param verbose - Enable verbose output
|
|
36
36
|
* @param only - Download only specified type: page or resource
|
|
37
37
|
*/
|
|
38
|
-
|
|
38
|
+
/**
|
|
39
|
+
*
|
|
40
|
+
* @param encodedPaths
|
|
41
|
+
* @param baseUrl
|
|
42
|
+
* @param outputDir
|
|
43
|
+
* @param logger
|
|
44
|
+
* @param verbose
|
|
45
|
+
* @param only
|
|
46
|
+
* @param interval
|
|
47
|
+
*/
|
|
48
|
+
export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only, interval) {
|
|
39
49
|
const uniqueResources = new Map();
|
|
40
50
|
// Parse all encoded pathnames
|
|
41
51
|
for (const encodedPath of encodedPaths) {
|
|
@@ -67,13 +77,14 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
|
|
|
67
77
|
logger('');
|
|
68
78
|
let downloaded = 0;
|
|
69
79
|
let failed = 0;
|
|
70
|
-
await deal(tasks, (task, update, index) => {
|
|
80
|
+
await deal(tasks, (task, update, index, setLineHeader) => {
|
|
71
81
|
const fileId = index.toString().padStart(4, '0');
|
|
72
82
|
const lineHeader = `%braille% ${c.bgWhite(` ${fileId} `)} ${c.gray(task.localPath)}: `;
|
|
83
|
+
setLineHeader(lineHeader);
|
|
73
84
|
return async () => {
|
|
74
|
-
update(
|
|
85
|
+
update('Fetching%dots%');
|
|
75
86
|
const response = await fetch(task.url).catch((error) => {
|
|
76
|
-
update(
|
|
87
|
+
update(c.red(`❌ Fetch failed: ${error.message}`));
|
|
77
88
|
failed++;
|
|
78
89
|
return null;
|
|
79
90
|
});
|
|
@@ -81,30 +92,30 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
|
|
|
81
92
|
return;
|
|
82
93
|
}
|
|
83
94
|
if (!response.ok) {
|
|
84
|
-
update(
|
|
95
|
+
update(c.red(`❌ HTTP ${response.status}`));
|
|
85
96
|
failed++;
|
|
86
97
|
return;
|
|
87
98
|
}
|
|
88
|
-
update(
|
|
99
|
+
update('Reading content%dots%');
|
|
89
100
|
const content = Buffer.from(await response.arrayBuffer());
|
|
90
101
|
const fullPath = path.join(outputDir, task.localPath);
|
|
91
102
|
const dir = path.dirname(fullPath);
|
|
92
|
-
update(
|
|
103
|
+
update('Creating directory%dots%');
|
|
93
104
|
const mkdirSuccess = await mkdir(dir, { recursive: true })
|
|
94
105
|
.then(() => true)
|
|
95
106
|
.catch((error) => {
|
|
96
|
-
update(
|
|
107
|
+
update(c.red(`❌ Failed to create directory: ${error.message}`));
|
|
97
108
|
failed++;
|
|
98
109
|
return false;
|
|
99
110
|
});
|
|
100
111
|
if (!mkdirSuccess) {
|
|
101
112
|
return;
|
|
102
113
|
}
|
|
103
|
-
update(
|
|
114
|
+
update('Writing file%dots%');
|
|
104
115
|
const writeSuccess = await writeFile(fullPath, content)
|
|
105
116
|
.then(() => true)
|
|
106
117
|
.catch((error) => {
|
|
107
|
-
update(
|
|
118
|
+
update(c.red(`❌ Failed to write: ${error.message}`));
|
|
108
119
|
failed++;
|
|
109
120
|
return false;
|
|
110
121
|
});
|
|
@@ -112,11 +123,12 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
|
|
|
112
123
|
return;
|
|
113
124
|
}
|
|
114
125
|
downloaded++;
|
|
115
|
-
update(
|
|
126
|
+
update(c.green('✅ Downloaded'));
|
|
116
127
|
};
|
|
117
128
|
}, {
|
|
118
129
|
limit: 10,
|
|
119
130
|
verbose,
|
|
131
|
+
interval,
|
|
120
132
|
header: (progress, done, total, limit) => {
|
|
121
133
|
const percentage = Math.round(progress * 100);
|
|
122
134
|
if (progress === 1) {
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { DelayOptions } from '@d-zero/shared/delay';
|
|
2
|
+
export interface ReplicateOptions {
|
|
3
|
+
urls: string[];
|
|
4
|
+
outputDir: string;
|
|
5
|
+
verbose?: boolean;
|
|
6
|
+
timeout?: number;
|
|
7
|
+
devices?: Record<string, {
|
|
8
|
+
width: number;
|
|
9
|
+
resolution?: number;
|
|
10
|
+
}>;
|
|
11
|
+
limit?: number;
|
|
12
|
+
only?: 'page' | 'resource';
|
|
13
|
+
interval?: number | DelayOptions;
|
|
14
|
+
}
|
|
15
|
+
export interface Resource {
|
|
16
|
+
url: string;
|
|
17
|
+
localPath: string;
|
|
18
|
+
}
|
|
19
|
+
export interface ChildProcessInput {
|
|
20
|
+
devices?: Record<string, {
|
|
21
|
+
width: number;
|
|
22
|
+
resolution?: number;
|
|
23
|
+
}>;
|
|
24
|
+
timeout?: number;
|
|
25
|
+
}
|
|
26
|
+
export interface ChildProcessResult {
|
|
27
|
+
url: string;
|
|
28
|
+
encodedUrls: string[];
|
|
29
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/replicator",
|
|
3
|
-
"version": "0.6.0",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "Replicate web pages with all their resources to local directories",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,18 +24,18 @@
|
|
|
24
24
|
"clean": "tsc --build --clean"
|
|
25
25
|
},
|
|
26
26
|
"dependencies": {
|
|
27
|
-
"@d-zero/cli-core": "1.
|
|
28
|
-
"@d-zero/dealer": "1.
|
|
29
|
-
"@d-zero/puppeteer-dealer": "0.5.
|
|
30
|
-
"@d-zero/puppeteer-page-scan": "4.2.
|
|
31
|
-
"@d-zero/puppeteer-scroll": "3.0
|
|
32
|
-
"@d-zero/shared": "0.
|
|
27
|
+
"@d-zero/cli-core": "1.2.0",
|
|
28
|
+
"@d-zero/dealer": "1.4.0",
|
|
29
|
+
"@d-zero/puppeteer-dealer": "0.5.10",
|
|
30
|
+
"@d-zero/puppeteer-page-scan": "4.2.6",
|
|
31
|
+
"@d-zero/puppeteer-scroll": "3.1.0",
|
|
32
|
+
"@d-zero/shared": "0.13.0",
|
|
33
33
|
"ansi-colors": "4.1.3",
|
|
34
34
|
"minimist": "1.2.8",
|
|
35
|
-
"puppeteer": "24.
|
|
35
|
+
"puppeteer": "24.27.0"
|
|
36
36
|
},
|
|
37
37
|
"devDependencies": {
|
|
38
38
|
"@types/minimist": "1.2.5"
|
|
39
39
|
},
|
|
40
|
-
"gitHead": "
|
|
40
|
+
"gitHead": "85abd39686d2ce02c7b8db071437464d212dd982"
|
|
41
41
|
}
|