site-mirror 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/cli.mjs CHANGED
@@ -1,286 +1,314 @@
1
- #!/usr/bin/env node
2
- /**
3
- * site-mirror CLI
4
- *
5
- * Usage:
6
- * site-mirror init Interactive setup - creates site-mirror.config.json
7
- * site-mirror run [options] Run the mirror (reads config + CLI overrides)
8
- * site-mirror serve [port] Serve the offline folder locally
9
- *
10
- * Options (for run):
11
- * --start <url> Start URL (required if not in config)
12
- * --out <dir> Output directory (default: ./offline)
13
- * --maxPages <n> Max pages to crawl (0 = unlimited)
14
- * --maxDepth <n> Max link depth (0 = unlimited)
15
- * --sameOriginOnly Only crawl same-origin pages (default: true)
16
- * --seedSitemaps Seed URLs from sitemap.xml (default: false)
17
- * --singlePage Download only this one page + all its assets (no crawling)
18
- */
19
-
20
- import fs from 'node:fs/promises';
21
- import path from 'node:path';
22
- import readline from 'node:readline';
23
- import { fileURLToPath } from 'node:url';
24
- import { execSync, spawn } from 'node:child_process';
25
-
26
/** Name of the per-project config file; it is joined onto the working directory when read or written. */
const CONFIG_FILE = 'site-mirror.config.json';

/** Directory that contains this CLI script. */
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Baseline settings. Values from the config file and from the command line
 * are spread over a copy of this object, so every key listed here always has
 * a value.
 */
const defaultConfig = {
  start: '',
  out: './offline',
  maxPages: 0,
  maxDepth: 0,
  sameOriginOnly: true,
  seedSitemaps: false,
  singlePage: false,
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
};
40
-
41
/**
 * Load the project configuration found in `cwd`, layered over `defaultConfig`.
 *
 * A missing config file is the normal "not initialised yet" case and silently
 * yields the defaults. Any other failure (unreadable file, malformed JSON,
 * top-level value that is not an object) also falls back to the defaults, but
 * prints a warning so that a broken config is not mistaken for a missing one.
 *
 * @param {string} cwd - Directory in which to look for the config file.
 * @returns {Promise<object>} A fresh object: the defaults overridden by the file's values.
 */
async function loadConfig(cwd) {
  const configPath = path.join(cwd, CONFIG_FILE);

  let raw;
  try {
    raw = await fs.readFile(configPath, 'utf-8');
  } catch (err) {
    // ENOENT simply means no config has been created; everything else is worth reporting.
    if (err.code !== 'ENOENT') {
      console.warn(`Warning: could not read ${configPath}: ${err.message}`);
    }
    return { ...defaultConfig };
  }

  try {
    const parsed = JSON.parse(raw);
    // Spreading an array or a string would inject index keys into the config.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new TypeError('expected a JSON object at the top level');
    }
    return { ...defaultConfig, ...parsed };
  } catch (err) {
    console.warn(`Warning: ignoring invalid ${configPath}: ${err.message}`);
    return { ...defaultConfig };
  }
}
50
-
51
/**
 * Parse the options of the `run` command into a plain overrides object.
 *
 * Only options that were given with a usable value appear in the result, so
 * the object can be spread over the loaded configuration without erasing
 * anything. Unknown tokens are ignored.
 *
 * - `--start`, `--out`, `--userAgent` take a string value.
 * - `--maxPages`, `--maxDepth` take a number; a non-numeric value is skipped
 *   with a warning instead of being stored as NaN.
 * - `--sameOriginOnly` and `--seedSitemaps` may be used as bare flags (meaning
 *   true) or be followed by an explicit value. A following token that starts
 *   with `--` is treated as the next option, not as the value.
 * - `--singlePage` is always a bare flag.
 *
 * @param {string[]} argv - Command-line tokens after the sub-command.
 * @returns {object} Overrides keyed by option name (without the leading dashes).
 */
function parseCliArgs(argv) {
  const args = {};
  // A value is present when there is a following token that is not itself an option.
  const hasValue = (token) => token !== undefined && !String(token).startsWith('--');

  for (let i = 0; i < argv.length; i++) {
    const key = argv[i];
    const next = argv[i + 1];

    switch (key) {
      case '--start':
      case '--out':
      case '--userAgent':
        if (hasValue(next)) {
          args[key.slice(2)] = next;
          i++;
        } else {
          console.warn(`Warning: ${key} requires a value; ignoring it.`);
        }
        break;

      case '--maxPages':
      case '--maxDepth': {
        if (!hasValue(next)) {
          console.warn(`Warning: ${key} requires a value; ignoring it.`);
          break;
        }
        const value = Number(next);
        if (Number.isFinite(value)) {
          args[key.slice(2)] = value;
        } else {
          console.warn(`Warning: ${key} expects a number, got "${next}"; ignoring it.`);
        }
        i++;
        break;
      }

      case '--sameOriginOnly':
        if (hasValue(next)) {
          // Everything except an explicit "false"/"0" keeps the restriction on.
          args.sameOriginOnly = next !== 'false' && next !== '0';
          i++;
        } else {
          args.sameOriginOnly = true;
        }
        break;

      case '--seedSitemaps':
        if (hasValue(next)) {
          args.seedSitemaps = next === 'true' || next === '1' || next === 'yes';
          i++;
        } else {
          args.seedSitemaps = true;
        }
        break;

      case '--singlePage':
        args.singlePage = true;
        break;

      default:
        // Unknown token: skipped on purpose.
        break;
    }
  }

  return args;
}
84
-
85
/**
 * Ask one question on a readline interface and resolve with the trimmed reply.
 *
 * When a default is supplied it is shown in parentheses after the question
 * and returned if the reply is empty after trimming.
 *
 * @param {object} rl - Object exposing `question(text, callback)`.
 * @param {string} question - Text shown to the user.
 * @param {*} [defaultValue] - Value used for an empty reply.
 * @returns {Promise<*>} The trimmed reply, or `defaultValue` for an empty reply.
 */
async function prompt(rl, question, defaultValue) {
  const hasDefault = defaultValue !== undefined;
  const label = hasDefault ? `${question} (${defaultValue}): ` : `${question}: `;

  const reply = await new Promise((resolve) => {
    rl.question(label, resolve);
  });

  const cleaned = reply.trim();
  if (cleaned === '' && hasDefault) {
    return defaultValue;
  }
  return cleaned;
}
94
-
95
/**
 * `init` command: ask a few questions and write the answers to the config
 * file in `cwd`.
 *
 * If the config file already exists, a notice is printed and nothing is
 * written. Yes/no answers are matched case-insensitively ("Yes", "Y", "TRUE"
 * all count as yes). Numeric answers that are not valid non-negative integers
 * are stored as 0, which the prompts describe as "unlimited".
 *
 * @param {string} cwd - Directory in which the config file is created.
 * @returns {Promise<void>}
 */
async function cmdInit(cwd) {
  // Accept "yes", "y" or "true" in any letter case.
  const isYes = (answer) => ['yes', 'y', 'true'].includes(String(answer).trim().toLowerCase());
  // Parse a whole number; anything unparsable or negative becomes 0.
  const toCount = (answer) => Math.max(0, Number.parseInt(answer, 10) || 0);

  const configPath = path.join(cwd, CONFIG_FILE);
  const exists = await fs.access(configPath).then(
    () => true,
    () => false,
  );

  if (exists) {
    console.log(`Config already exists: ${configPath}`);
    console.log('Delete it first if you want to reinitialize.');
    return;
  }

  console.log('');
  console.log('Welcome to site-mirror! Let\'s set up your config.');
  console.log('Press Enter to accept the default value shown in parentheses.');
  console.log('');

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    const start = await prompt(rl, 'Website URL to mirror', 'https://example.com/');
    const out = await prompt(rl, 'Output directory', './offline');
    const singlePage = isYes(await prompt(rl, 'Single page only? (yes/no)', 'no'));

    let maxPages = 0;
    let maxDepth = 0;
    let seedSitemaps = false;

    // The crawl limits are only asked for when a crawl will actually happen.
    if (!singlePage) {
      maxPages = toCount(await prompt(rl, 'Max pages to crawl (0 = unlimited)', '200'));
      maxDepth = toCount(await prompt(rl, 'Max link depth (0 = unlimited)', '6'));
      seedSitemaps = isYes(await prompt(rl, 'Seed from sitemap.xml? (yes/no)', 'no'));
    }

    const config = {
      start,
      out,
      singlePage,
      maxPages,
      maxDepth,
      sameOriginOnly: true,
      seedSitemaps,
    };

    await fs.writeFile(configPath, JSON.stringify(config, null, 2) + '\n');
    console.log('');
    console.log(`Created ${CONFIG_FILE}`);
    console.log('');
    console.log('Run "site-mirror run" to start mirroring!');
  } finally {
    // Always release stdin, otherwise the process would keep waiting for input.
    rl.close();
  }
}
158
-
159
/**
 * `run` command: merge the config file with the CLI overrides, validate the
 * start URL, print the effective configuration and start the mirror.
 *
 * The process exits with code 1 when no start URL is available or when the
 * start URL is not a valid http:// or https:// URL.
 *
 * @param {string} cwd - Directory used to find the config file and to resolve `out`.
 * @param {object} cliArgs - Overrides parsed from the command line.
 * @returns {Promise<void>}
 */
async function cmdRun(cwd, cliArgs) {
  const config = await loadConfig(cwd);
  // An override without a value (undefined) must not erase the configured value.
  const overrides = Object.fromEntries(
    Object.entries(cliArgs ?? {}).filter(([, value]) => value !== undefined),
  );
  const merged = { ...config, ...overrides };

  // Validate required fields
  if (!merged.start) {
    console.error('');
    console.error('Error: No start URL specified.');
    console.error('');
    console.error('You can either:');
    console.error('  1. Run "site-mirror init" to create a config file');
    console.error('  2. Pass --start <url> directly, e.g.:');
    console.error('     site-mirror run --start https://example.com/');
    console.error('     site-mirror run --start https://example.com/page --singlePage');
    console.error('');
    process.exit(1);
  }

  // Validate URL format. `new URL` alone also accepts schemes such as file: or
  // ftp:, so the protocol is checked explicitly to match the error message.
  let startUrl = null;
  try {
    startUrl = new URL(merged.start);
  } catch {
    // Reported together with the protocol check below.
  }
  if (startUrl === null || !['http:', 'https:'].includes(startUrl.protocol)) {
    console.error('');
    console.error(`Error: Invalid URL "${merged.start}"`);
    console.error('Please provide a valid URL starting with http:// or https://');
    console.error('');
    process.exit(1);
  }

  console.log('');
  console.log('Configuration:');
  console.log(`  start: ${merged.start}`);
  console.log(`  out: ${merged.out}`);
  if (merged.singlePage) {
    console.log(`  mode: SINGLE PAGE (no crawling)`);
  } else {
    console.log(`  maxPages: ${merged.maxPages || 'unlimited'}`);
    console.log(`  maxDepth: ${merged.maxDepth || 'unlimited'}`);
    console.log(`  seedSitemaps: ${merged.seedSitemaps}`);
  }
  console.log(`  sameOriginOnly: ${merged.sameOriginOnly}`);
  console.log('');

  // Dynamically import the mirror lib
  const { mirrorSite } = await import('../lib/mirror.mjs');
  // Single-page mode overrides the crawl limits: one page, no depth, no sitemap seeding.
  await mirrorSite({
    start: merged.start,
    out: path.resolve(cwd, merged.out),
    maxPages: merged.singlePage ? 1 : merged.maxPages,
    maxDepth: merged.singlePage ? 0 : merged.maxDepth,
    sameOriginOnly: merged.sameOriginOnly,
    seedSitemaps: merged.singlePage ? false : merged.seedSitemaps,
    singlePage: merged.singlePage,
    userAgent: merged.userAgent,
  });
}
215
-
216
/**
 * `serve` command: serve a mirrored site with `npx http-server`.
 *
 * The folder to serve is the first of these that exists:
 *   1. the `out` directory from the config (resolved against `cwd`),
 *   2. `<cwd>/offline`,
 *   3. `cwd` itself, if it contains an `index.html`.
 *
 * The process exits with code 1 when the port is not an integer between 1 and
 * 65535 or when none of the candidate folders exists.
 *
 * @param {string} cwd - Working directory used to locate the output folder.
 * @param {string|number} port - TCP port to listen on.
 * @returns {Promise<void>}
 */
async function cmdServe(cwd, port) {
  const pathExists = (target) =>
    fs.access(target).then(
      () => true,
      () => false,
    );

  // The port becomes part of a shell command line, so only a plain port number is accepted.
  const portNumber = Number(port);
  if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
    console.error(`Invalid port "${port}". Expected an integer between 1 and 65535.`);
    process.exit(1);
  }

  const config = await loadConfig(cwd);
  const configOut =
    typeof config.out === 'string' && config.out !== '' ? path.resolve(cwd, config.out) : null;
  const defaultOut = path.join(cwd, 'offline');

  let outDir = null;
  if (configOut && (await pathExists(configOut))) {
    outDir = configOut;
  } else if (await pathExists(defaultOut)) {
    outDir = defaultOut;
  } else if (await pathExists(path.join(cwd, 'index.html'))) {
    // Probably invoked from inside the output folder.
    outDir = cwd;
  }

  if (!outDir) {
    console.error('No output folder found.');
    console.error('');
    console.error('Tried:');
    if (configOut) console.error(`  - ${configOut} (from config)`);
    console.error(`  - ${defaultOut} (default)`);
    console.error(`  - ${cwd} (current directory)`);
    console.error('');
    console.error('Run "site-mirror run" first to download the site.');
    process.exit(1);
  }

  console.log(`Serving ${outDir} on http://localhost:${portNumber}/`);
  try {
    // NOTE(review): outDir is interpolated inside double quotes; a path containing
    // `"`, `$` or a backtick would still be interpreted by the shell.
    execSync(`npx http-server "${outDir}" -p ${portNumber} -c-1`, { stdio: 'inherit' });
  } catch (err) {
    // Ctrl+C is the normal way to stop the server; anything else is a real failure.
    if (err.signal === 'SIGINT' || err.status === 130) return;
    console.error(`http-server stopped unexpectedly: ${err.message}`);
    process.exitCode = 1;
  }
}
236
-
237
/**
 * CLI entry point: dispatch on the first command-line token.
 *
 * - no token, `help`, `--help`, `-h`: print usage.
 * - `init`, `run`, `serve`: run the matching command.
 * - a token starting with `--`: treated as `run` with the options given directly.
 * - anything else: reported as an unknown command and the process exits with
 *   code 1, so that a mistyped command cannot start a mirror run by accident.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cwd = process.cwd();
  const args = process.argv.slice(2);
  const command = args[0];

  if (!command || command === 'help' || command === '--help' || command === '-h') {
    console.log(`
site-mirror - Mirror websites for offline browsing

Usage:
  site-mirror init              Interactive setup - creates site-mirror.config.json
  site-mirror run [options]     Mirror the website (reads config + CLI overrides)
  site-mirror serve [port]      Serve the offline folder (default port: 8080)

Run options:
  --start <url>                 Start URL (required if not in config)
  --out <dir>                   Output directory (default: ./offline)
  --maxPages <n>                Max pages (0 = unlimited)
  --maxDepth <n>                Max depth (0 = unlimited)
  --sameOriginOnly <bool>       Only crawl same-origin pages (default: true)
  --seedSitemaps true           Seed from sitemap.xml
  --singlePage                  Download only this page + all its assets (no crawling)
  --userAgent <string>          Override the default user agent string

Examples:
  site-mirror run --start https://example.com/page --singlePage
  site-mirror init
  site-mirror run
  site-mirror serve 3000
`);
    return;
  }

  if (command === 'init') {
    await cmdInit(cwd);
  } else if (command === 'run') {
    await cmdRun(cwd, parseCliArgs(args.slice(1)));
  } else if (command === 'serve') {
    const port = args[1] || '8080';
    await cmdServe(cwd, port);
  } else if (command.startsWith('--')) {
    // Options given without a sub-command: behave like "run".
    await cmdRun(cwd, parseCliArgs(args));
  } else {
    console.error(`Unknown command "${command}".`);
    console.error('Run "site-mirror help" to see the available commands.');
    process.exit(1);
  }
}
282
-
283
// Start the CLI. A rejection that escapes main() is printed and turned into exit code 1.
main().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});
1
+ #!/usr/bin/env node
2
+ /**
3
+ * site-mirror CLI
4
+ *
5
+ * Usage:
6
+ * site-mirror init Interactive setup - creates site-mirror.config.json
7
+ * site-mirror run [options] Run the mirror (reads config + CLI overrides)
8
+ * site-mirror serve [port] Serve the offline folder locally
9
+ *
10
+ * Options (for run):
11
+ * --start <url> Start URL (required if not in config)
12
+ * --out <dir> Output directory (default: ./offline)
13
+ * --maxPages <n> Max pages to crawl (0 = unlimited)
14
+ * --maxDepth <n> Max link depth (0 = unlimited)
15
+ * --sameOriginOnly Only crawl same-origin pages (default: true)
16
+ * --seedSitemaps Seed URLs from sitemap.xml (default: false)
17
+ * --singlePage Download only this one page + all its assets (no crawling)
18
+ */
19
+
20
+ import fs from 'node:fs/promises';
21
+ import path from 'node:path';
22
+ import readline from 'node:readline';
23
+ import { fileURLToPath } from 'node:url';
24
+ import { execSync, spawn } from 'node:child_process';
25
+
26
/** Name of the per-project config file; it is joined onto the working directory when read or written. */
const CONFIG_FILE = 'site-mirror.config.json';

/** Directory that contains this CLI script. */
const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Baseline settings. Values from the config file and from the command line
 * are spread over a copy of this object, so every key listed here always has
 * a value.
 */
const defaultConfig = {
  start: '',
  out: './offline',
  maxPages: 0,
  maxDepth: 0,
  sameOriginOnly: true,
  seedSitemaps: false,
  singlePage: false,
  userAgent:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
};
40
+
41
/**
 * Load the project configuration found in `cwd`, layered over `defaultConfig`.
 *
 * A missing config file is the normal "not initialised yet" case and silently
 * yields the defaults. Any other failure (unreadable file, malformed JSON,
 * top-level value that is not an object) also falls back to the defaults, but
 * prints a warning so that a broken config is not mistaken for a missing one.
 *
 * @param {string} cwd - Directory in which to look for the config file.
 * @returns {Promise<object>} A fresh object: the defaults overridden by the file's values.
 */
async function loadConfig(cwd) {
  const configPath = path.join(cwd, CONFIG_FILE);

  let raw;
  try {
    raw = await fs.readFile(configPath, 'utf-8');
  } catch (err) {
    // ENOENT simply means no config has been created; everything else is worth reporting.
    if (err.code !== 'ENOENT') {
      console.warn(`Warning: could not read ${configPath}: ${err.message}`);
    }
    return { ...defaultConfig };
  }

  try {
    const parsed = JSON.parse(raw);
    // Spreading an array or a string would inject index keys into the config.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new TypeError('expected a JSON object at the top level');
    }
    return { ...defaultConfig, ...parsed };
  } catch (err) {
    console.warn(`Warning: ignoring invalid ${configPath}: ${err.message}`);
    return { ...defaultConfig };
  }
}
50
+
51
/**
 * Parse the options of the `run` command into a plain overrides object.
 *
 * Only options that were given with a usable value appear in the result, so
 * the object can be spread over the loaded configuration without erasing
 * anything. Unknown tokens are ignored.
 *
 * - `--start`, `--out`, `--userAgent` take a string value.
 * - `--maxPages`, `--maxDepth` take a number; a non-numeric value is skipped
 *   with a warning instead of being stored as NaN.
 * - `--sameOriginOnly` and `--seedSitemaps` may be used as bare flags (meaning
 *   true) or be followed by an explicit value. A following token that starts
 *   with `--` is treated as the next option, not as the value.
 * - `--singlePage` is always a bare flag.
 *
 * @param {string[]} argv - Command-line tokens after the sub-command.
 * @returns {object} Overrides keyed by option name (without the leading dashes).
 */
function parseCliArgs(argv) {
  const args = {};
  // A value is present when there is a following token that is not itself an option.
  const hasValue = (token) => token !== undefined && !String(token).startsWith('--');

  for (let i = 0; i < argv.length; i++) {
    const key = argv[i];
    const next = argv[i + 1];

    switch (key) {
      case '--start':
      case '--out':
      case '--userAgent':
        if (hasValue(next)) {
          args[key.slice(2)] = next;
          i++;
        } else {
          console.warn(`Warning: ${key} requires a value; ignoring it.`);
        }
        break;

      case '--maxPages':
      case '--maxDepth': {
        if (!hasValue(next)) {
          console.warn(`Warning: ${key} requires a value; ignoring it.`);
          break;
        }
        const value = Number(next);
        if (Number.isFinite(value)) {
          args[key.slice(2)] = value;
        } else {
          console.warn(`Warning: ${key} expects a number, got "${next}"; ignoring it.`);
        }
        i++;
        break;
      }

      case '--sameOriginOnly':
        if (hasValue(next)) {
          // Everything except an explicit "false"/"0" keeps the restriction on.
          args.sameOriginOnly = next !== 'false' && next !== '0';
          i++;
        } else {
          args.sameOriginOnly = true;
        }
        break;

      case '--seedSitemaps':
        if (hasValue(next)) {
          args.seedSitemaps = next === 'true' || next === '1' || next === 'yes';
          i++;
        } else {
          args.seedSitemaps = true;
        }
        break;

      case '--singlePage':
        args.singlePage = true;
        break;

      default:
        // Unknown token: skipped on purpose.
        break;
    }
  }

  return args;
}
84
+
85
/**
 * Ask one question on a readline interface and resolve with the trimmed reply.
 *
 * When a default is supplied it is shown in parentheses after the question
 * and returned if the reply is empty after trimming.
 *
 * @param {object} rl - Object exposing `question(text, callback)`.
 * @param {string} question - Text shown to the user.
 * @param {*} [defaultValue] - Value used for an empty reply.
 * @returns {Promise<*>} The trimmed reply, or `defaultValue` for an empty reply.
 */
async function prompt(rl, question, defaultValue) {
  const hasDefault = defaultValue !== undefined;
  const label = hasDefault ? `${question} (${defaultValue}): ` : `${question}: `;

  const reply = await new Promise((resolve) => {
    rl.question(label, resolve);
  });

  const cleaned = reply.trim();
  if (cleaned === '' && hasDefault) {
    return defaultValue;
  }
  return cleaned;
}
94
+
95
/**
 * `init` command: ask a few questions and write the answers to the config
 * file in `cwd`.
 *
 * If the config file already exists, a notice is printed and nothing is
 * written. Yes/no answers are matched case-insensitively ("Yes", "Y", "TRUE"
 * all count as yes). Numeric answers that are not valid non-negative integers
 * are stored as 0, which the prompts describe as "unlimited".
 *
 * @param {string} cwd - Directory in which the config file is created.
 * @returns {Promise<void>}
 */
async function cmdInit(cwd) {
  // Accept "yes", "y" or "true" in any letter case.
  const isYes = (answer) => ['yes', 'y', 'true'].includes(String(answer).trim().toLowerCase());
  // Parse a whole number; anything unparsable or negative becomes 0.
  const toCount = (answer) => Math.max(0, Number.parseInt(answer, 10) || 0);

  const configPath = path.join(cwd, CONFIG_FILE);
  const exists = await fs.access(configPath).then(
    () => true,
    () => false,
  );

  if (exists) {
    console.log(`Config already exists: ${configPath}`);
    console.log('Delete it first if you want to reinitialize.');
    return;
  }

  console.log('');
  console.log('Welcome to site-mirror! Let\'s set up your config.');
  console.log('Press Enter to accept the default value shown in parentheses.');
  console.log('');

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    const start = await prompt(rl, 'Website URL to mirror', 'https://example.com/');
    const out = await prompt(rl, 'Output directory', './offline');
    const singlePage = isYes(await prompt(rl, 'Single page only? (yes/no)', 'no'));

    let maxPages = 0;
    let maxDepth = 0;
    let seedSitemaps = false;

    // The crawl limits are only asked for when a crawl will actually happen.
    if (!singlePage) {
      maxPages = toCount(await prompt(rl, 'Max pages to crawl (0 = unlimited)', '200'));
      maxDepth = toCount(await prompt(rl, 'Max link depth (0 = unlimited)', '6'));
      seedSitemaps = isYes(await prompt(rl, 'Seed from sitemap.xml? (yes/no)', 'no'));
    }

    const config = {
      start,
      out,
      singlePage,
      maxPages,
      maxDepth,
      sameOriginOnly: true,
      seedSitemaps,
    };

    await fs.writeFile(configPath, JSON.stringify(config, null, 2) + '\n');
    console.log('');
    console.log(`Created ${CONFIG_FILE}`);
    console.log('');
    console.log('Run "site-mirror run" to start mirroring!');
  } finally {
    // Always release stdin, otherwise the process would keep waiting for input.
    rl.close();
  }
}
158
+
159
/**
 * `run` command: merge the config file with the CLI overrides, validate the
 * start URL, print the effective configuration and start the mirror.
 *
 * The process exits with code 1 when no start URL is available or when the
 * start URL is not a valid http:// or https:// URL.
 *
 * @param {string} cwd - Directory used to find the config file and to resolve `out`.
 * @param {object} cliArgs - Overrides parsed from the command line.
 * @returns {Promise<void>}
 */
async function cmdRun(cwd, cliArgs) {
  const config = await loadConfig(cwd);
  // An override without a value (undefined) must not erase the configured value.
  const overrides = Object.fromEntries(
    Object.entries(cliArgs ?? {}).filter(([, value]) => value !== undefined),
  );
  const merged = { ...config, ...overrides };

  // Validate required fields
  if (!merged.start) {
    console.error('');
    console.error('Error: No start URL specified.');
    console.error('');
    console.error('You can either:');
    console.error('  1. Run "site-mirror init" to create a config file');
    console.error('  2. Pass --start <url> directly, e.g.:');
    console.error('     site-mirror run --start https://example.com/');
    console.error('     site-mirror run --start https://example.com/page --singlePage');
    console.error('');
    process.exit(1);
  }

  // Validate URL format. `new URL` alone also accepts schemes such as file: or
  // ftp:, so the protocol is checked explicitly to match the error message.
  let startUrl = null;
  try {
    startUrl = new URL(merged.start);
  } catch {
    // Reported together with the protocol check below.
  }
  if (startUrl === null || !['http:', 'https:'].includes(startUrl.protocol)) {
    console.error('');
    console.error(`Error: Invalid URL "${merged.start}"`);
    console.error('Please provide a valid URL starting with http:// or https://');
    console.error('');
    process.exit(1);
  }

  console.log('');
  console.log('Configuration:');
  console.log(`  start: ${merged.start}`);
  console.log(`  out: ${merged.out}`);
  if (merged.singlePage) {
    console.log(`  mode: SINGLE PAGE (no crawling)`);
  } else {
    console.log(`  maxPages: ${merged.maxPages || 'unlimited'}`);
    console.log(`  maxDepth: ${merged.maxDepth || 'unlimited'}`);
    console.log(`  seedSitemaps: ${merged.seedSitemaps}`);
  }
  console.log(`  sameOriginOnly: ${merged.sameOriginOnly}`);
  console.log('');

  // Dynamically import the mirror lib
  const { mirrorSite } = await import('../lib/mirror.mjs');
  // Single-page mode overrides the crawl limits: one page, no depth, no sitemap seeding.
  await mirrorSite({
    start: merged.start,
    out: path.resolve(cwd, merged.out),
    maxPages: merged.singlePage ? 1 : merged.maxPages,
    maxDepth: merged.singlePage ? 0 : merged.maxDepth,
    sameOriginOnly: merged.sameOriginOnly,
    seedSitemaps: merged.singlePage ? false : merged.seedSitemaps,
    singlePage: merged.singlePage,
    userAgent: merged.userAgent,
  });
}
215
+
216
/**
 * `serve` command: serve a mirrored site with `npx http-server`.
 *
 * The folder to serve is the first of these that exists:
 *   1. the `out` directory from the config (resolved against `cwd`),
 *   2. `<cwd>/offline`,
 *   3. `cwd` itself, if it contains an `index.html`.
 *
 * The process exits with code 1 when the port is not an integer between 1 and
 * 65535 or when none of the candidate folders exists.
 *
 * @param {string} cwd - Working directory used to locate the output folder.
 * @param {string|number} port - TCP port to listen on.
 * @returns {Promise<void>}
 */
async function cmdServe(cwd, port) {
  const pathExists = (target) =>
    fs.access(target).then(
      () => true,
      () => false,
    );

  // The port becomes part of a shell command line, so only a plain port number is accepted.
  const portNumber = Number(port);
  if (!Number.isInteger(portNumber) || portNumber < 1 || portNumber > 65535) {
    console.error(`Invalid port "${port}". Expected an integer between 1 and 65535.`);
    process.exit(1);
  }

  const config = await loadConfig(cwd);
  const configOut =
    typeof config.out === 'string' && config.out !== '' ? path.resolve(cwd, config.out) : null;
  const defaultOut = path.join(cwd, 'offline');

  let outDir = null;
  if (configOut && (await pathExists(configOut))) {
    outDir = configOut;
  } else if (await pathExists(defaultOut)) {
    outDir = defaultOut;
  } else if (await pathExists(path.join(cwd, 'index.html'))) {
    // Probably invoked from inside the output folder.
    outDir = cwd;
  }

  if (!outDir) {
    console.error('No output folder found.');
    console.error('');
    console.error('Tried:');
    if (configOut) console.error(`  - ${configOut} (from config)`);
    console.error(`  - ${defaultOut} (default)`);
    console.error(`  - ${cwd} (current directory)`);
    console.error('');
    console.error('Run "site-mirror run" first to download the site.');
    process.exit(1);
  }

  console.log(`Serving ${outDir} on http://localhost:${portNumber}/`);
  try {
    // NOTE(review): outDir is interpolated inside double quotes; a path containing
    // `"`, `$` or a backtick would still be interpreted by the shell.
    execSync(`npx http-server "${outDir}" -p ${portNumber} -c-1`, { stdio: 'inherit' });
  } catch (err) {
    // Ctrl+C is the normal way to stop the server; anything else is a real failure.
    if (err.signal === 'SIGINT' || err.status === 130) return;
    console.error(`http-server stopped unexpectedly: ${err.message}`);
    process.exitCode = 1;
  }
}
264
+
265
/**
 * CLI entry point: dispatch on the first command-line token.
 *
 * - no token, `help`, `--help`, `-h`: print usage.
 * - `init`, `run`, `serve`: run the matching command.
 * - a token starting with `--`: treated as `run` with the options given directly.
 * - anything else: reported as an unknown command and the process exits with
 *   code 1, so that a mistyped command cannot start a mirror run by accident.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cwd = process.cwd();
  const args = process.argv.slice(2);
  const command = args[0];

  if (!command || command === 'help' || command === '--help' || command === '-h') {
    console.log(`
site-mirror - Mirror websites for offline browsing

Usage:
  site-mirror init              Interactive setup - creates site-mirror.config.json
  site-mirror run [options]     Mirror the website (reads config + CLI overrides)
  site-mirror serve [port]      Serve the offline folder (default port: 8080)

Run options:
  --start <url>                 Start URL (required if not in config)
  --out <dir>                   Output directory (default: ./offline)
  --maxPages <n>                Max pages (0 = unlimited)
  --maxDepth <n>                Max depth (0 = unlimited)
  --sameOriginOnly <bool>       Only crawl same-origin pages (default: true)
  --seedSitemaps true           Seed from sitemap.xml
  --singlePage                  Download only this page + all its assets (no crawling)
  --userAgent <string>          Override the default user agent string

Examples:
  site-mirror run --start https://example.com/page --singlePage
  site-mirror init
  site-mirror run
  site-mirror serve 3000
`);
    return;
  }

  if (command === 'init') {
    await cmdInit(cwd);
  } else if (command === 'run') {
    await cmdRun(cwd, parseCliArgs(args.slice(1)));
  } else if (command === 'serve') {
    const port = args[1] || '8080';
    await cmdServe(cwd, port);
  } else if (command.startsWith('--')) {
    // Options given without a sub-command: behave like "run".
    await cmdRun(cwd, parseCliArgs(args));
  } else {
    console.error(`Unknown command "${command}".`);
    console.error('Run "site-mirror help" to see the available commands.');
    process.exit(1);
  }
}
310
+
311
// Start the CLI. A rejection that escapes main() is printed and turned into exit code 1.
main().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});
@@ -1,15 +1,15 @@
1
- #!/usr/bin/env node
2
- /**
3
- * Post-install script to ensure Playwright browsers are available.
4
- * Runs automatically after `npm install`.
5
- */
6
- import { execSync } from 'node:child_process';
7
-
8
/**
 * Run the Playwright CLI to install Chromium, inheriting this process's stdio.
 * A failure is reported as a warning with the manual command; it is not rethrown.
 */
function installChromium() {
  try {
    execSync('npx playwright install chromium', { stdio: 'inherit' });
    console.log('[site-mirror] Chromium browser ready.');
  } catch {
    console.warn('[site-mirror] Could not auto-install Chromium. Run manually: npx playwright install chromium');
  }
}

console.log('[site-mirror] Checking Playwright browsers...');
installChromium();
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Post-install script to ensure Playwright browsers are available.
4
+ * Runs automatically after `npm install`.
5
+ */
6
+ import { execSync } from 'node:child_process';
7
+
8
/**
 * Run the Playwright CLI to install Chromium, inheriting this process's stdio.
 * A failure is reported as a warning with the manual command; it is not rethrown.
 */
function installChromium() {
  try {
    execSync('npx playwright install chromium', { stdio: 'inherit' });
    console.log('[site-mirror] Chromium browser ready.');
  } catch {
    console.warn('[site-mirror] Could not auto-install Chromium. Run manually: npx playwright install chromium');
  }
}

console.log('[site-mirror] Checking Playwright browsers...');
installChromium();