@d-zero/replicator 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +37 -0
- package/dist/cli.js +46 -0
- package/dist/index.js +195 -0
- package/dist/types.js +1 -0
- package/package.json +37 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 D-ZERO Co., Ltd.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# @d-zero/replicator
|
|
2
|
+
|
|
3
|
+
Replicate web pages with all their resources to local directories
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @d-zero/replicator
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
### CLI
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
npx @d-zero/replicator <url> -o <output-directory>
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Programmatic
|
|
20
|
+
|
|
21
|
+
```typescript
|
|
22
|
+
import { replicate } from '@d-zero/replicator';
|
|
23
|
+
|
|
24
|
+
await replicate('https://example.com', './output');
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
- Download HTML pages preserving directory structure
|
|
30
|
+
- Fetch all related resources (CSS, JS, images, etc.)
|
|
31
|
+
- Maintain relative links between resources
|
|
32
|
+
- Support for same-host resources only
|
|
33
|
+
- Preserve original file extensions and paths
|
|
34
|
+
|
|
35
|
+
## License
|
|
36
|
+
|
|
37
|
+
MIT
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { createCLI, parseCommonOptions } from '@d-zero/cli-core';
|
|
3
|
+
import { replicate } from './index.js';
|
|
4
|
+
// Command-line entry point: parse the arguments, validate them, then hand
// the URL and output directory to `replicate`.
const { options, args } = createCLI({
	aliases: {
		o: 'output',
		v: 'verbose',
	},
	usage: ['Usage: replicator <url> -o <output-directory> [--verbose]'],
	// Extend the shared option set with this command's `output` option.
	parseArgs: (cli) => {
		const common = parseCommonOptions(cli);
		return { ...common, output: cli.output };
	},
	// Valid only when at least one positional argument and an output value are given.
	validateArgs: (options, cli) => Boolean(cli._.length > 0 && options.output),
});

const targetUrl = args[0];
const destination = options.output;

// Guard against a missing or non-string first positional argument.
if (typeof targetUrl !== 'string' || targetUrl === '') {
	// eslint-disable-next-line no-console
	console.error('❌ Error: URL is required');
	process.exit(1);
}

try {
	await replicate(targetUrl, destination, {
		verbose: options.verbose ?? false,
	});
	// eslint-disable-next-line no-console
	console.log(`✅ Successfully replicated ${targetUrl} to ${destination}`);
}
catch (error) {
	if (!(error instanceof Error)) {
		// Something other than an Error was thrown; print it as-is.
		// eslint-disable-next-line no-console
		console.error('❌ Unknown error:', error);
	}
	else {
		// eslint-disable-next-line no-console
		console.error('❌ Error:', error.message);
		// The stack trace is only shown in verbose mode.
		if (options.verbose) {
			// eslint-disable-next-line no-console
			console.error('Stack trace:', error.stack);
		}
	}
	// Any failure results in a non-zero exit status.
	process.exit(1);
}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import { promises as fs } from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { URL } from 'node:url';
|
|
4
|
+
import { launch } from 'puppeteer';
|
|
5
|
+
/**
 * Replicate a web page and the same-host resources it loads into a local directory.
 *
 * Opens `url` in a headless browser launched through Puppeteer, records every
 * request whose hostname equals the hostname of `url`, buffers the bodies of
 * the successful responses, and writes them below `outputDir` at the paths
 * produced by `urlToLocalPath`.
 *
 * @param {string} url - Absolute URL of the page to replicate.
 * @param {string} outputDir - Directory the files are written to; created if missing.
 * @param {object} [options]
 * @param {boolean} [options.verbose=false] - Also print per-resource log lines.
 * @param {string} [options.userAgent] - User agent to set on the page, if given.
 * @param {number} [options.timeout=30000] - Timeout in milliseconds passed to the browser launch and to the navigation.
 * @returns {Promise<void>} Resolves after the files are saved and the browser is closed.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 * @throws Errors from launching the browser, creating the page, navigating, or
 *   creating the output directory propagate to the caller. Individual resource
 *   download or save failures are logged and do not reject.
 */
export async function replicate(url, outputDir, options = {}) {
	const { verbose = false, userAgent, timeout = 30_000 } = options;
	// Detailed messages, printed only in verbose mode.
	const log = (message) => {
		if (verbose) {
			// eslint-disable-next-line no-console
			console.log(message);
		}
	};
	// Key progress messages, always printed.
	const progress = (message) => {
		// eslint-disable-next-line no-console
		console.log(message);
	};
	const baseUrl = new URL(url);
	// Keyed by URL so a resource requested several times is tracked once and
	// each response finds its entry without scanning a list.
	const resourcesByUrl = new Map();
	progress(`🚀 Starting replication of ${url}`);
	log(` Output directory: ${outputDir}`);
	progress(`🌐 Launching browser...`);
	const browser = await launch({
		headless: true,
		timeout,
	});
	// Everything after a successful launch runs inside try/finally so the
	// browser is closed even when page creation or set-up fails.
	try {
		progress(`📄 Creating new page...`);
		const page = await browser.newPage();
		if (userAgent) {
			log(` Setting user agent: ${userAgent}`);
			await page.setUserAgent(userAgent);
		}
		progress(`🔍 Setting up resource detection...`);
		// One promise per response body that is being buffered.
		const pendingDownloads = [];
		page.on('request', (request) => {
			const requestUrl = request.url();
			// Only handle same-host resources
			if (new URL(requestUrl).hostname !== baseUrl.hostname) {
				log(`🚫 Skipping external resource: ${requestUrl}`);
				return;
			}
			log(`📥 Intercepting: ${requestUrl}`);
			if (resourcesByUrl.has(requestUrl)) {
				// Already tracked by an earlier request for the same URL.
				return;
			}
			resourcesByUrl.set(requestUrl, {
				url: requestUrl,
				localPath: urlToLocalPath(requestUrl),
				type: getResourceType(requestUrl),
			});
		});
		page.on('response', (response) => {
			const responseUrl = response.url();
			// Entries exist only for same-host requests, so a missing entry
			// means the response is external or was never tracked.
			const resource = resourcesByUrl.get(responseUrl);
			if (!resource) {
				return;
			}
			if (!response.ok()) {
				log(`❌ Resource failed (${response.status()}): ${responseUrl}`);
				return;
			}
			const download = async () => {
				try {
					resource.content = await response.buffer();
					log(`✅ Downloaded: ${responseUrl}`);
				}
				catch (error) {
					// Deliberately not rethrown: one failed resource must not
					// abort the whole replication.
					const errorMessage = error instanceof Error ? error.message : String(error);
					log(`❌ Failed to download: ${responseUrl} - ${errorMessage}`);
				}
			};
			pendingDownloads.push(download());
		});
		// Navigate to the page
		progress(`📡 Navigating to ${url}...`);
		await page.goto(url, { waitUntil: 'networkidle2', timeout });
		progress(`⏳ Waiting for all resources to load...`);
		// Wait for all downloads to complete
		await Promise.all(pendingDownloads);
		const resources = [...resourcesByUrl.values()];
		const resourceCount = resources.length;
		const downloadedCount = resources.filter((r) => r.content).length;
		progress(`📄 Found ${resourceCount} resources (${downloadedCount} downloaded successfully)`);
		// Ensure output directory exists
		progress(`📁 Creating output directory...`);
		await fs.mkdir(outputDir, { recursive: true });
		// Save all resources
		progress(`💾 Saving files to disk...`);
		const savedCount = await saveResources(resources, outputDir, log, progress);
		progress(`🎉 Replication complete! ${savedCount} files saved to ${outputDir}`);
	}
	finally {
		progress(`🔧 Cleaning up browser...`);
		await browser.close().catch((error) => {
			// Log browser close errors but don't throw them
			log(`⚠️ Warning: Failed to close browser: ${error instanceof Error ? error.message : String(error)}`);
		});
	}
}
|
|
114
|
+
/**
 * Write every resource that has downloaded content to disk below `outputDir`.
 *
 * Resources without `content` are ignored. A resource whose target path would
 * fall outside `outputDir` (for example a `localPath` containing `..`) is
 * refused and reported like any other save failure. Failures never abort the
 * loop; the remaining resources are still processed.
 *
 * @param {Array<{localPath: string, content?: Buffer}>} resources - Collected resources; only `localPath` and `content` are read here.
 * @param {string} outputDir - Directory the files are written under.
 * @param {(message: string) => void} log - Receives detailed per-file messages.
 * @param {(message: string) => void} progress - Receives key progress messages.
 * @returns {Promise<number>} Number of files written successfully.
 */
async function saveResources(resources, outputDir, log, progress) {
	let savedCount = 0;
	const totalResources = resources.filter((r) => r.content).length;
	const outputRoot = path.resolve(outputDir);
	for (const resource of resources) {
		if (!resource.content) {
			continue;
		}
		const fullPath = path.join(outputDir, resource.localPath);
		// Paths come from remote URLs, so never write outside the output tree.
		if (!isInsideDirectory(outputRoot, fullPath)) {
			log(`❌ Refusing to save outside the output directory: ${resource.localPath}`);
			progress(` ⚠️ Failed to save ${resource.localPath}`);
			continue;
		}
		try {
			await fs.mkdir(path.dirname(fullPath), { recursive: true });
			await fs.writeFile(fullPath, resource.content);
			savedCount++;
			// Show progress every 10 files or for the last file
			if (savedCount % 10 === 0 || savedCount === totalResources) {
				progress(` Saved ${savedCount}/${totalResources} files...`);
			}
			log(`💾 Saved: ${resource.localPath}`);
		}
		catch (error) {
			const errorMessage = error instanceof Error ? error.message : String(error);
			log(`❌ Failed to save ${resource.localPath}: ${errorMessage}`);
			progress(` ⚠️ Failed to save ${resource.localPath}`);
			// Continue with other resources instead of failing completely
		}
	}
	return savedCount;
}

/**
 * Tell whether `target` resolves to a location strictly inside `root`.
 *
 * @param {string} root - Absolute directory path.
 * @param {string} target - Path to check; resolved against the current working directory.
 * @returns {boolean} `true` only when `target` lies below `root`.
 */
function isInsideDirectory(root, target) {
	const relative = path.relative(root, path.resolve(target));
	return (
		relative !== '' &&
		relative !== '..' &&
		!relative.startsWith(`..${path.sep}`) &&
		// On Windows a different drive yields an absolute "relative" path.
		!path.isAbsolute(relative)
	);
}
|
|
148
|
+
/**
 * Convert a URL into the relative file path it is saved under.
 *
 * Only the URL's pathname is used; query string and fragment are ignored.
 * - The leading slash is removed so the result is relative.
 * - An empty path or a path ending in `/` gets `index.html` appended.
 * - A path whose last segment contains no dot gets `.html` appended. Dots in
 *   directory names (e.g. `/v1.2/guide`) do not count as a file extension.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Relative path using `/` separators.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 */
function urlToLocalPath(url) {
	let pathname = new URL(url).pathname;
	// Remove leading slash
	if (pathname.startsWith('/')) {
		pathname = pathname.slice(1);
	}
	// If path is empty or ends with /, treat as index.html
	if (pathname === '' || pathname.endsWith('/')) {
		return `${pathname}index.html`;
	}
	// If the file name itself has no extension, add .html
	const fileName = pathname.slice(pathname.lastIndexOf('/') + 1);
	return fileName.includes('.') ? pathname : `${pathname}.html`;
}
|
|
169
|
+
/**
 * Classify a URL by the extension of its pathname (case-insensitive).
 *
 * A path whose last segment has no dot, including the root path and any path
 * ending in `/`, is classified as `html`. Dots in directory names are ignored,
 * so `/v1.2/guide` is `html` rather than `other`.
 *
 * @param {string} url - Absolute URL.
 * @returns {'html' | 'css' | 'js' | 'image' | 'font' | 'other'} Resource category.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 */
function getResourceType(url) {
	const pathname = new URL(url).pathname.toLowerCase();
	// Only the last segment can carry a file extension; it is empty for
	// directory-style URLs such as `/` or `/docs/`.
	const fileName = pathname.slice(pathname.lastIndexOf('/') + 1);
	if (!fileName.includes('.') || pathname.endsWith('.html') || pathname.endsWith('.htm')) {
		return 'html';
	}
	if (pathname.endsWith('.css')) {
		return 'css';
	}
	if (pathname.endsWith('.js') || pathname.endsWith('.mjs')) {
		return 'js';
	}
	if (/\.(?:jpg|jpeg|png|gif|svg|webp|ico)$/.test(pathname)) {
		return 'image';
	}
	if (/\.(?:woff|woff2|ttf|otf|eot)$/.test(pathname)) {
		return 'font';
	}
	return 'other';
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@d-zero/replicator",
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Replicate web pages with all their resources to local directories",
|
|
5
|
+
"author": "D-ZERO",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"publishConfig": {
|
|
8
|
+
"access": "public"
|
|
9
|
+
},
|
|
10
|
+
"type": "module",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./dist/index.js",
|
|
14
|
+
"types": "./dist/index.d.ts"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"bin": "./dist/cli.js",
|
|
18
|
+
"files": [
|
|
19
|
+
"dist"
|
|
20
|
+
],
|
|
21
|
+
"scripts": {
|
|
22
|
+
"build": "tsc",
|
|
23
|
+
"watch": "tsc --watch",
|
|
24
|
+
"clean": "tsc --build --clean"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"@d-zero/cli-core": "1.1.0",
|
|
28
|
+
"@d-zero/shared": "0.9.1",
|
|
29
|
+
"ansi-colors": "4.1.3",
|
|
30
|
+
"minimist": "1.2.8",
|
|
31
|
+
"puppeteer": "24.12.0"
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"@types/minimist": "1.2.5"
|
|
35
|
+
},
|
|
36
|
+
"gitHead": "7cc778738d8c811adb69cee528655e12eba52e87"
|
|
37
|
+
}
|