@d-zero/replicator 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ export {};
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env node
+ export {};
package/dist/cli.js CHANGED
@@ -18,6 +18,8 @@ const { options, args } = createCLI({
  ' -t, --timeout <ms> Request timeout in milliseconds (default: 30000)',
  ' -d, --devices <devices> Device presets (comma-separated, default: desktop-compact,mobile)',
  ' -l, --limit <number> Parallel execution limit (default: 3)',
+ ' --interval <ms> Interval between parallel executions (default: none)',
+ ' Format: number or "min-max" for random range',
  ' --only <type> Download only specified type: page or resource',
  ' -v, --verbose Enable verbose logging',
  '',
@@ -65,6 +67,7 @@ try {
  devices,
  limit: options.limit,
  only: options.only,
+ interval: options.interval,
  });
  }
  catch (error) {
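
The new --interval flag accepts either a fixed number of milliseconds or a "min-max" random range. The CLI's actual parsing is not included in this diff; the sketch below only illustrates how such a value could map onto the number | DelayOptions union. parseInterval and the { min, max } shape are illustrative assumptions, since the real fields of DelayOptions from @d-zero/shared/delay are not shown here.

// Hypothetical parser for the --interval value; not from the package source.
// The real DelayOptions shape from @d-zero/shared/delay is not shown in this
// diff, so { min, max } is only an assumed stand-in.
type DelayRange = { min: number; max: number };

function parseInterval(value: string): number | DelayRange {
  // "1000-5000" → random range between executions
  const range = /^(\d+)-(\d+)$/.exec(value);
  if (range) {
    return { min: Number(range[1]), max: Number(range[2]) };
  }
  // "3000" → fixed interval in milliseconds
  return Number(value);
}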
package/dist/index.d.ts ADDED
@@ -0,0 +1,26 @@
+ import type { ReplicateOptions } from './types.js';
+ /**
+ * Replicate web pages with all their resources to local directories
+ *
+ * ## Architecture
+ *
+ * This implementation uses a two-phase architecture for memory efficiency:
+ *
+ * ### Phase 1: Metadata Collection
+ * - Each URL is processed in a separate child process using puppeteer-dealer
+ * - Child processes scan pages with Puppeteer and collect resource URLs
+ * - For URLs ending with '/' (e.g., https://example.com/), MIME type is captured
+ * and encoded as "url:::MIME/type" format
+ * - Only metadata (URLs + MIME types) is returned to parent - no buffer data
+ *
+ * ### Phase 2: Resource Download
+ * - Parent process aggregates all metadata and removes duplicates
+ * - Parses encoded URLs to determine correct local paths
+ * - Downloads resources via fetch() and immediately writes to disk
+ * - No resource content is kept in memory
+ *
+ * This approach minimizes memory usage by avoiding duplicate I/O operations
+ * and keeping buffer data out of inter-process communication.
+ * @param options - Replication options
+ */
+ export declare function replicate(options: ReplicateOptions): Promise<void>;
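
The doc comment above describes the "url:::MIME/type" encoding that carries Phase 1 metadata to Phase 2. A minimal sketch of that step, with helper names of my own choosing (the package does not export these):

// Illustrative helpers for the "url:::MIME/type" encoding described above;
// these are not the package's own functions.
function encodeUrlWithMime(url: string, mimeType: string): string {
  // Phase 1 tags directory-style URLs with the MIME type reported by the browser.
  return url.endsWith('/') ? `${url}:::${mimeType}` : url;
}

function decodeEncodedUrl(encoded: string): { url: string; mimeType?: string } {
  // Phase 2 splits the metadata back apart to choose a local path, e.g.
  // 'https://example.com/:::text/html' plausibly maps to example.com/index.html.
  const [url, mimeType] = encoded.split(':::');
  return { url, mimeType };
}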
package/dist/index.js CHANGED
@@ -54,9 +54,10 @@ function collectPageUrlsOnly(urls, progress) {
  * @param verbose - Enable verbose logging
  * @param limit - Parallel execution limit
  * @param progress - Progress logger function
+ * @param interval
  * @returns Set of encoded URLs
  */
- async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress) {
+ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval) {
  progress(c.bold.yellow('📡 Phase 1: Collecting resource metadata...'));
  const results = [];
  await deal(urls.map((url) => ({ id: null, url })), (_, done, total) => {
@@ -68,6 +69,7 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
  }, {}), {
  verbose,
  limit,
+ interval,
  each: (result) => {
  results.push(result);
  },
@@ -119,7 +121,7 @@ async function collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit
  * @param options - Replication options
  */
  export async function replicate(options) {
- const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, } = options;
+ const { urls, outputDir, verbose = false, timeout = 30_000, devices, limit = 3, only, interval, } = options;
  if (urls.length === 0) {
  throw new Error('At least one URL is required');
  }
@@ -154,7 +156,7 @@ export async function replicate(options) {
  }
  case 'resource':
  case undefined: {
- allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress);
+ allEncodedUrls = await collectAllResourceUrls(urls, targetSizes, timeout, verbose, limit, progress, interval);
  break;
  }
  default: {
@@ -168,7 +170,7 @@ export async function replicate(options) {
  // Use the first URL as base URL for constructing full URLs
  const baseUrl = urls[0];
  // Download all resources
- await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only);
+ await downloadResources([...allEncodedUrls], baseUrl, outputDir, progress, verbose, only, interval);
  progress('');
  progress(c.bold.green(`✅ Replication complete!`));
  progress(c.gray(` All resources saved to: ${outputDir}`));
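
Throughout index.js the new interval value is simply threaded down into the deal() options; the pacing itself happens inside the dealer, whose implementation is not part of this diff. The sketch below only illustrates the general technique of spacing out task launches under a parallelism limit:

// Generic illustration of an inter-execution interval under a parallel limit.
// This is not @d-zero/dealer's code; it only demonstrates the technique.
async function runWithInterval<T>(
  tasks: (() => Promise<T>)[],
  limit: number,
  interval: number | { min: number; max: number },
): Promise<T[]> {
  const wait = () => {
    const ms =
      typeof interval === 'number'
        ? interval
        : interval.min + Math.random() * (interval.max - interval.min);
    return new Promise<void>((resolve) => setTimeout(resolve, ms));
  };
  const results: T[] = [];
  const queue = [...tasks];
  const workers = Array.from({ length: limit }, async () => {
    let task: (() => Promise<T>) | undefined;
    while ((task = queue.shift())) {
      results.push(await task()); // completion order, not input order
      if (queue.length > 0) await wait(); // pause before this worker's next launch
    }
  });
  await Promise.all(workers);
  return results;
}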
@@ -0,0 +1,21 @@
+ import type { DelayOptions } from '@d-zero/shared/delay';
+ /**
+ * Download and save resources to disk
+ * @param encodedPaths - Array of encoded pathnames
+ * @param baseUrl - Base URL to construct full URLs
+ * @param outputDir - Output directory
+ * @param logger - Logger function
+ * @param verbose - Enable verbose output
+ * @param only - Download only specified type: page or resource
+ */
+ /**
+ *
+ * @param encodedPaths
+ * @param baseUrl
+ * @param outputDir
+ * @param logger
+ * @param verbose
+ * @param only
+ * @param interval
+ */
+ export declare function downloadResources(encodedPaths: string[], baseUrl: string, outputDir: string, logger: (message: string) => void, verbose?: boolean, only?: 'page' | 'resource', interval?: number | DelayOptions): Promise<void>;
@@ -35,7 +35,17 @@ function parseEncodedPath(encodedPath, baseUrl) {
  * @param verbose - Enable verbose output
  * @param only - Download only specified type: page or resource
  */
- export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only) {
+ /**
+ *
+ * @param encodedPaths
+ * @param baseUrl
+ * @param outputDir
+ * @param logger
+ * @param verbose
+ * @param only
+ * @param interval
+ */
+ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger, verbose = false, only, interval) {
  const uniqueResources = new Map();
  // Parse all encoded pathnames
  for (const encodedPath of encodedPaths) {
@@ -67,13 +77,14 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
  logger('');
  let downloaded = 0;
  let failed = 0;
- await deal(tasks, (task, update, index) => {
+ await deal(tasks, (task, update, index, setLineHeader) => {
  const fileId = index.toString().padStart(4, '0');
  const lineHeader = `%braille% ${c.bgWhite(` ${fileId} `)} ${c.gray(task.localPath)}: `;
+ setLineHeader(lineHeader);
  return async () => {
- update(`${lineHeader}Fetching%dots%`);
+ update('Fetching%dots%');
  const response = await fetch(task.url).catch((error) => {
- update(`${lineHeader}${c.red(`❌ Fetch failed: ${error.message}`)}`);
+ update(c.red(`❌ Fetch failed: ${error.message}`));
  failed++;
  return null;
  });
@@ -81,30 +92,30 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
  return;
  }
  if (!response.ok) {
- update(`${lineHeader}${c.red(`❌ HTTP ${response.status}`)}`);
+ update(c.red(`❌ HTTP ${response.status}`));
  failed++;
  return;
  }
- update(`${lineHeader}Reading content%dots%`);
+ update('Reading content%dots%');
  const content = Buffer.from(await response.arrayBuffer());
  const fullPath = path.join(outputDir, task.localPath);
  const dir = path.dirname(fullPath);
- update(`${lineHeader}Creating directory%dots%`);
+ update('Creating directory%dots%');
  const mkdirSuccess = await mkdir(dir, { recursive: true })
  .then(() => true)
  .catch((error) => {
- update(`${lineHeader}${c.red(`❌ Failed to create directory: ${error.message}`)}`);
+ update(c.red(`❌ Failed to create directory: ${error.message}`));
  failed++;
  return false;
  });
  if (!mkdirSuccess) {
  return;
  }
- update(`${lineHeader}Writing file%dots%`);
+ update('Writing file%dots%');
  const writeSuccess = await writeFile(fullPath, content)
  .then(() => true)
  .catch((error) => {
- update(`${lineHeader}${c.red(`❌ Failed to write: ${error.message}`)}`);
+ update(c.red(`❌ Failed to write: ${error.message}`));
  failed++;
  return false;
  });
@@ -112,11 +123,12 @@ export async function downloadResources(encodedPaths, baseUrl, outputDir, logger
  return;
  }
  downloaded++;
- update(`${lineHeader}${c.green('✅ Downloaded')}`);
+ update(c.green('✅ Downloaded'));
  };
  }, {
  limit: 10,
  verbose,
+ interval,
  header: (progress, done, total, limit) => {
  const percentage = Math.round(progress * 100);
  if (progress === 1) {
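
The deal() callback now receives a fourth setLineHeader argument, so the per-file prefix is registered once instead of being concatenated into every update() call. A usage sketch as inferred from the changed lines above (the import specifier and the task list are assumptions; this is not taken from @d-zero/dealer's documentation):

// Usage sketch of the new setLineHeader callback, inferred from this diff.
import { deal } from '@d-zero/dealer'; // assumed import specifier

const tasks = [{ localPath: 'index.html' }, { localPath: 'style.css' }];

await deal(
  tasks,
  (task, update, index, setLineHeader) => {
    setLineHeader(`${index} ${task.localPath}: `); // fixed prefix, set once per task
    return async () => {
      update('Fetching%dots%'); // status text only; the dealer presumably prepends the header
      // ... fetch and write the resource here ...
      update('✅ Downloaded');
    };
  },
  { limit: 2, interval: 500 },
);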
package/dist/types.d.ts ADDED
@@ -0,0 +1,29 @@
+ import type { DelayOptions } from '@d-zero/shared/delay';
+ export interface ReplicateOptions {
+ urls: string[];
+ outputDir: string;
+ verbose?: boolean;
+ timeout?: number;
+ devices?: Record<string, {
+ width: number;
+ resolution?: number;
+ }>;
+ limit?: number;
+ only?: 'page' | 'resource';
+ interval?: number | DelayOptions;
+ }
+ export interface Resource {
+ url: string;
+ localPath: string;
+ }
+ export interface ChildProcessInput {
+ devices?: Record<string, {
+ width: number;
+ resolution?: number;
+ }>;
+ timeout?: number;
+ }
+ export interface ChildProcessResult {
+ url: string;
+ encodedUrls: string[];
+ }
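
For reference, a consumer-side call exercising the new option, using only fields declared in ReplicateOptions above (the import specifier is assumed from the package name):

// Example call using the ReplicateOptions shape declared above.
import { replicate } from '@d-zero/replicator'; // assumed entry point

await replicate({
  urls: ['https://example.com/'],
  outputDir: './snapshot',
  limit: 3,
  interval: 1000, // fixed 1s pause between executions; a DelayOptions value is also accepted
});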
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@d-zero/replicator",
- "version": "0.6.0",
+ "version": "0.7.0",
  "description": "Replicate web pages with all their resources to local directories",
  "author": "D-ZERO",
  "license": "MIT",
@@ -24,18 +24,18 @@
  "clean": "tsc --build --clean"
  },
  "dependencies": {
- "@d-zero/cli-core": "1.1.3",
- "@d-zero/dealer": "1.3.2",
- "@d-zero/puppeteer-dealer": "0.5.9",
- "@d-zero/puppeteer-page-scan": "4.2.5",
- "@d-zero/puppeteer-scroll": "3.0.11",
- "@d-zero/shared": "0.12.0",
+ "@d-zero/cli-core": "1.2.0",
+ "@d-zero/dealer": "1.4.0",
+ "@d-zero/puppeteer-dealer": "0.5.10",
+ "@d-zero/puppeteer-page-scan": "4.2.6",
+ "@d-zero/puppeteer-scroll": "3.1.0",
+ "@d-zero/shared": "0.13.0",
  "ansi-colors": "4.1.3",
  "minimist": "1.2.8",
- "puppeteer": "24.26.1"
+ "puppeteer": "24.27.0"
  },
  "devDependencies": {
  "@types/minimist": "1.2.5"
  },
- "gitHead": "fe6d98ee0108b0e53848f28a74e4e08875e31a78"
+ "gitHead": "85abd39686d2ce02c7b8db071437464d212dd982"
  }