@d-zero/replicator 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 D-ZERO Co., Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # @d-zero/replicator
2
+
3
+ Replicate web pages with all their resources to local directories
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @d-zero/replicator
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ### CLI
14
+
15
+ ```bash
16
+ npx @d-zero/replicator <url> -o <output-directory> [--verbose]
17
+ ```
18
+
19
+ ### Programmatic
20
+
21
+ ```typescript
22
+ import { replicate } from '@d-zero/replicator';
23
+
24
+ await replicate('https://example.com', './output');
25
+ ```
26
+
27
+ ## Features
28
+
29
+ - Download HTML pages preserving directory structure
30
+ - Fetch all related resources (CSS, JS, images, etc.)
31
+ - Maintain relative links between resources
32
+ - Download same-host resources only (requests to other hosts are skipped)
33
+ - Preserve original file extensions and paths
34
+
35
+ ## License
36
+
37
+ MIT
package/dist/cli.js ADDED
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env node
2
+ import { createCLI, parseCommonOptions } from '@d-zero/cli-core';
3
+ import { replicate } from './index.js';
4
// Command-line entry point: parse argv through the shared CLI helper, then run
// a single replication and translate the outcome into an exit code.
// NOTE(review): how createCLI reacts when validateArgs returns false is defined
// in @d-zero/cli-core and is not visible from this file.
const { options, args } = createCLI({
    aliases: { o: 'output', v: 'verbose' },
    usage: ['Usage: replicator <url> -o <output-directory> [--verbose]'],
    parseArgs: (cli) => ({ ...parseCommonOptions(cli), output: cli.output }),
    // Require at least one positional argument (the URL) and an output directory.
    validateArgs: (options, cli) => Boolean(cli._.length > 0 && options.output),
});
const url = args[0];
const outputDir = options.output;
// Guard against a missing or non-string positional argument.
if (typeof url !== 'string' || !url) {
    // eslint-disable-next-line no-console
    console.error('❌ Error: URL is required');
    process.exit(1);
}
try {
    await replicate(url, outputDir, { verbose: options.verbose ?? false });
    // eslint-disable-next-line no-console
    console.log(`✅ Successfully replicated ${url} to ${outputDir}`);
}
catch (error) {
    if (!(error instanceof Error)) {
        // Something other than an Error was thrown; print it as-is.
        // eslint-disable-next-line no-console
        console.error('❌ Unknown error:', error);
        process.exit(1);
    }
    // eslint-disable-next-line no-console
    console.error('❌ Error:', error.message);
    // The stack trace is only useful when the user asked for verbose output.
    if (options.verbose) {
        // eslint-disable-next-line no-console
        console.error('Stack trace:', error.stack);
    }
    process.exit(1);
}
package/dist/index.js ADDED
@@ -0,0 +1,195 @@
1
+ import { promises as fs } from 'node:fs';
2
+ import path from 'node:path';
3
+ import { URL } from 'node:url';
4
+ import { launch } from 'puppeteer';
5
/**
 * Replicate a web page and its same-host resources into a local directory.
 *
 * Launches a headless browser, navigates to `url`, records every request whose
 * hostname equals the hostname of `url`, buffers the body of each successful
 * response, and finally writes the buffered bodies below `outputDir`.
 * Failures of individual resources are logged and do not abort the run.
 *
 * @param url - Absolute URL of the page to replicate (parsed with `new URL`, which throws on invalid input)
 * @param outputDir - Directory the files are written to; created recursively if missing
 * @param options - Optional settings
 * @param options.verbose - Print per-resource log lines when true (default `false`)
 * @param options.userAgent - User agent applied to the page before navigation, if provided
 * @param options.timeout - Timeout in milliseconds passed to browser launch and navigation (default 30000)
 * @returns A promise that settles after the files are saved and the browser has been closed
 */
export async function replicate(url, outputDir, options = {}) {
    const { verbose = false, userAgent, timeout = 30_000 } = options;
    const log = (message) => {
        if (verbose) {
            // eslint-disable-next-line no-console
            console.log(message);
        }
    };
    // Always show these key progress messages
    const progress = (message) => {
        // eslint-disable-next-line no-console
        console.log(message);
    };
    const baseUrl = new URL(url);
    const resources = [];
    // URL -> resource entry. Gives O(1) lookup when the response arrives and
    // prevents a URL that is requested twice from being registered twice.
    const resourcesByUrl = new Map();
    progress(`🚀 Starting replication of ${url}`);
    log(` Output directory: ${outputDir}`);
    progress(`🌐 Launching browser...`);
    const browser = await launch({
        headless: true,
        timeout,
    });
    // Everything after a successful launch runs inside try/finally so the
    // browser is closed even when page creation or setup fails.
    try {
        progress(`📄 Creating new page...`);
        const page = await browser.newPage();
        if (userAgent) {
            log(` Setting user agent: ${userAgent}`);
            await page.setUserAgent(userAgent);
        }
        progress(`🔍 Setting up resource detection...`);
        // Downloads started by the response handler; awaited after navigation
        const pendingDownloads = [];
        page.on('request', (request) => {
            const requestUrl = request.url();
            // Only handle same-host resources
            if (new URL(requestUrl).hostname !== baseUrl.hostname) {
                log(`🚫 Skipping external resource: ${requestUrl}`);
                return;
            }
            log(`📥 Intercepting: ${requestUrl}`);
            if (resourcesByUrl.has(requestUrl)) {
                return;
            }
            const resource = {
                url: requestUrl,
                localPath: urlToLocalPath(requestUrl),
                type: getResourceType(requestUrl),
            };
            resources.push(resource);
            resourcesByUrl.set(requestUrl, resource);
        });
        page.on('response', (response) => {
            const responseUrl = response.url();
            // Only handle same-host resources
            if (new URL(responseUrl).hostname !== baseUrl.hostname) {
                return;
            }
            const resource = resourcesByUrl.get(responseUrl);
            if (!resource) {
                return;
            }
            if (!response.ok()) {
                log(`❌ Resource failed (${response.status()}): ${responseUrl}`);
                return;
            }
            const download = (async () => {
                try {
                    resource.content = await response.buffer();
                    log(`✅ Downloaded: ${responseUrl}`);
                }
                catch (error) {
                    // Deliberately not rethrown: a single failed resource must
                    // not stop the whole replication.
                    const errorMessage = error instanceof Error ? error.message : String(error);
                    log(`❌ Failed to download: ${responseUrl} - ${errorMessage}`);
                }
            })();
            pendingDownloads.push(download);
        });
        // Navigate to the page
        progress(`📡 Navigating to ${url}...`);
        await page.goto(url, { waitUntil: 'networkidle2', timeout });
        progress(`⏳ Waiting for all resources to load...`);
        // Wait for all downloads to complete
        await Promise.all(pendingDownloads);
        const resourceCount = resources.length;
        const downloadedCount = resources.filter((r) => r.content).length;
        progress(`📄 Found ${resourceCount} resources (${downloadedCount} downloaded successfully)`);
        // Ensure output directory exists
        progress(`📁 Creating output directory...`);
        await fs.mkdir(outputDir, { recursive: true });
        // Save all resources
        progress(`💾 Saving files to disk...`);
        const savedCount = await saveResources(resources, outputDir, log, progress);
        progress(`🎉 Replication complete! ${savedCount} files saved to ${outputDir}`);
    }
    finally {
        progress(`🔧 Cleaning up browser...`);
        await browser.close().catch((error) => {
            // Log browser close errors but don't throw them
            log(`⚠️ Warning: Failed to close browser: ${error instanceof Error ? error.message : String(error)}`);
        });
    }
}
114
/**
 * Write every resource that has downloaded content to disk below `outputDir`.
 *
 * Resources without `content` are ignored. A resource whose `localPath`
 * resolves outside `outputDir` is reported as failed and never written,
 * because the paths are derived from URLs supplied by the remote site.
 * Write errors are reported and do not stop the remaining resources.
 *
 * @param resources - Entries with `localPath` and, when downloaded, `content`
 * @param outputDir - Root directory that receives the files
 * @param log - Callback for detailed, per-file messages
 * @param progress - Callback for messages that are always shown
 * @returns Number of files written successfully
 */
async function saveResources(resources, outputDir, log, progress) {
    const outputRoot = path.resolve(outputDir);
    const totalResources = resources.filter((r) => r.content).length;
    let savedCount = 0;
    for (const resource of resources) {
        if (!resource.content) {
            continue;
        }
        const fullPath = path.join(outputDir, resource.localPath);
        // Containment check: the relative path from the root must not climb upwards
        const relativePath = path.relative(outputRoot, path.resolve(fullPath));
        const escapesRoot = relativePath === '..' ||
            relativePath.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relativePath);
        if (escapesRoot) {
            log(`❌ Failed to save ${resource.localPath}: path escapes output directory`);
            progress(` ⚠️ Failed to save ${resource.localPath}`);
            continue;
        }
        try {
            await fs.mkdir(path.dirname(fullPath), { recursive: true });
            await fs.writeFile(fullPath, resource.content);
            savedCount++;
            // Show progress every 10 files or for the last file
            if (savedCount % 10 === 0 || savedCount === totalResources) {
                progress(` Saved ${savedCount}/${totalResources} files...`);
            }
            log(`💾 Saved: ${resource.localPath}`);
        }
        catch (error) {
            // Continue with other resources instead of failing completely
            const errorMessage = error instanceof Error ? error.message : String(error);
            log(`❌ Failed to save ${resource.localPath}: ${errorMessage}`);
            progress(` ⚠️ Failed to save ${resource.localPath}`);
        }
    }
    return savedCount;
}
148
/**
 * Map a URL to the relative file path used to store it locally.
 *
 * Only the URL's pathname is used; query string and fragment are ignored.
 * An empty path or a path ending in `/` maps to `index.html` inside that
 * directory. A path whose last segment contains no dot gets `.html` appended.
 * Dots in directory names (e.g. `/v1.0/docs`) do not count as a file extension.
 *
 * @param url - Absolute URL of the resource
 * @returns Relative path without a leading slash
 */
function urlToLocalPath(url) {
    const { pathname } = new URL(url);
    // Remove leading slash
    const relativePath = pathname.startsWith('/') ? pathname.slice(1) : pathname;
    // If path is empty or ends with /, treat as index.html
    if (relativePath === '' || relativePath.endsWith('/')) {
        return `${relativePath}index.html`;
    }
    // Decide on the extension from the file name only, not the whole path
    const fileName = relativePath.slice(relativePath.lastIndexOf('/') + 1);
    return fileName.includes('.') ? relativePath : `${relativePath}.html`;
}
169
/**
 * Classify a URL by the file extension of its pathname (case-insensitive).
 *
 * A path whose last segment has no dot (including paths ending in `/`) is
 * treated as an HTML page, so dots in directory names such as `/v1.0/docs`
 * do not affect the result.
 *
 * @param url - Absolute URL of the resource
 * @returns One of `'html'`, `'css'`, `'js'`, `'image'`, `'font'` or `'other'`
 */
function getResourceType(url) {
    const pathname = new URL(url).pathname.toLowerCase();
    // Only the file name decides whether the URL carries an extension
    const fileName = pathname.slice(pathname.lastIndexOf('/') + 1);
    if (!fileName.includes('.') || /\.html?$/.test(fileName)) {
        return 'html';
    }
    if (fileName.endsWith('.css')) {
        return 'css';
    }
    if (/\.m?js$/.test(fileName)) {
        return 'js';
    }
    if (/\.(?:jpg|jpeg|png|gif|svg|webp|ico)$/.test(fileName)) {
        return 'image';
    }
    if (/\.(?:woff|woff2|ttf|otf|eot)$/.test(fileName)) {
        return 'font';
    }
    return 'other';
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "name": "@d-zero/replicator",
3
+ "version": "0.2.0",
4
+ "description": "Replicate web pages with all their resources to local directories",
5
+ "author": "D-ZERO",
6
+ "license": "MIT",
7
+ "publishConfig": {
8
+ "access": "public"
9
+ },
10
+ "type": "module",
11
+ "exports": {
12
+ ".": {
13
+ "import": "./dist/index.js",
14
+ "types": "./dist/index.d.ts"
15
+ }
16
+ },
17
+ "bin": "./dist/cli.js",
18
+ "files": [
19
+ "dist"
20
+ ],
21
+ "scripts": {
22
+ "build": "tsc",
23
+ "watch": "tsc --watch",
24
+ "clean": "tsc --build --clean"
25
+ },
26
+ "dependencies": {
27
+ "@d-zero/cli-core": "1.1.0",
28
+ "@d-zero/shared": "0.9.1",
29
+ "ansi-colors": "4.1.3",
30
+ "minimist": "1.2.8",
31
+ "puppeteer": "24.12.0"
32
+ },
33
+ "devDependencies": {
34
+ "@types/minimist": "1.2.5"
35
+ },
36
+ "gitHead": "7cc778738d8c811adb69cee528655e12eba52e87"
37
+ }