site-mirror 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Mahesh
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,101 @@
+ # site-mirror
+
+ A CLI tool to mirror websites for offline browsing using Playwright.
+
+ ## Installation
+
+ ```bash
+ # Install globally
+ npm install -g site-mirror
+
+ # Or use directly via npx
+ npx site-mirror --help
+ ```
+
+ ## Quick Start
+
+ ```bash
+ # Download a single page with all its assets (no config needed!)
+ site-mirror run --start https://www.apple.com/iphone/ --singlePage
+
+ # Crawl an entire site
+ site-mirror run --start https://example.com/
+
+ # Or use the interactive config-based workflow:
+ site-mirror init    # Interactive prompts to create site-mirror.config.json
+ site-mirror run     # Runs the mirror using the config
+ site-mirror serve   # Serves locally on port 8080
+ ```
+
+ ## Commands
+
+ | Command                  | Description                                           |
+ | ------------------------ | ----------------------------------------------------- |
+ | `site-mirror init`       | Interactive setup - creates `site-mirror.config.json` |
+ | `site-mirror run`        | Run the mirror (reads config + CLI overrides)         |
+ | `site-mirror serve`      | Serve the `./offline` folder locally                  |
+ | `site-mirror serve 3000` | Serve on a custom port                                |
+
+ ## CLI Options (for `run`)
+
+ | Option              | Description                              | Default     |
+ | ------------------- | ---------------------------------------- | ----------- |
+ | `--start <url>`     | Start URL (required if not in config)    | -           |
+ | `--out <dir>`       | Output directory                         | `./offline` |
+ | `--maxPages <n>`    | Max pages to crawl (0 = unlimited)       | `0`         |
+ | `--maxDepth <n>`    | Max link depth (0 = unlimited)           | `0`         |
+ | `--sameOriginOnly`  | Only crawl same-origin pages             | `true`      |
+ | `--seedSitemaps`    | Seed URLs from sitemap.xml/robots.txt    | `false`     |
+ | `--singlePage`      | Download only this page + all its assets | `false`     |
+
+ ## Config File (`site-mirror.config.json`)
+
+ Created via `site-mirror init` (interactive) or manually:
+
+ ```json
+ {
+   "start": "https://example.com/",
+   "out": "./offline",
+   "singlePage": false,
+   "maxPages": 200,
+   "maxDepth": 6,
+   "sameOriginOnly": true,
+   "seedSitemaps": false
+ }
+ ```
+
+ CLI options override config file settings.
+
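+ In other words, `run` applies a shallow merge in which built-in defaults are overridden by the config file, and the config file is overridden by CLI flags. A minimal sketch of the precedence (`fileConfig` is a placeholder name; the actual merge lives in `bin/cli.mjs`):
+
+ ```js
+ // Later spreads win: defaults < config file < CLI flags.
+ const merged = { ...defaultConfig, ...fileConfig, ...cliArgs };
+ ```
+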
+ ## Output Structure
+
+ ```
+ ./offline/
+ ├── index.html              # Homepage
+ ├── about/
+ │   └── index.html          # /about/ page
+ ├── _next/                  # Same-origin assets (e.g. a Next.js build)
+ │   └── static/
+ ├── _external/              # Cross-origin assets
+ │   └── cdn.example.com/
+ │       └── script.js
+ ```
+
+ ## How It Works
+
+ 1. Launches headless Chromium via Playwright
+ 2. Navigates to each page, waits for network idle
+ 3. Captures all static assets (CSS, JS, images, fonts, videos)
+ 4. Rewrites absolute same-origin URLs to relative paths
+ 5. Injects a script to handle SPA-style navigation offline
+ 6. Discovers new pages via `<a href>` links
+ 7. Saves everything to the output directory
+
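+ Condensed into code, capturing a single page looks roughly like this. This is a minimal sketch built on the same Playwright APIs used by `lib/mirror.mjs`; URL rewriting, crawling, and most error handling are omitted, and `outRoot`/`startUrl` are placeholder values:
+
+ ```js
+ import { chromium } from 'playwright';
+ import fs from 'node:fs/promises';
+ import path from 'node:path';
+
+ const outRoot = './offline';
+ const startUrl = 'https://example.com/';
+
+ const browser = await chromium.launch({ headless: true }); // step 1
+ const page = await browser.newPage();
+
+ // Step 3: save static response bodies as they arrive.
+ page.on('response', async (response) => {
+   try {
+     const u = new URL(response.url());
+     const file = path.join(outRoot, u.hostname,
+       u.pathname === '/' ? 'index.html' : u.pathname.slice(1));
+     await fs.mkdir(path.dirname(file), { recursive: true });
+     await fs.writeFile(file, await response.body());
+   } catch {
+     // Redirects and streamed responses have no retrievable body; skip them.
+   }
+ });
+
+ // Step 2: load the page and wait for the network to go quiet.
+ await page.goto(startUrl, { waitUntil: 'networkidle' });
+
+ // Step 7: persist the rendered HTML (the real tool rewrites URLs and injects
+ // its offline-navigation script before writing).
+ await fs.writeFile(path.join(outRoot, 'index.html'), await page.content());
+ await browser.close();
+ ```
+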
+ ## Notes
+
+ - XHR/fetch API responses are not saved (only rendered HTML + static assets)
+ - Some interactive features requiring live APIs won't work offline
+ - Be mindful of the target site's Terms of Service and robots.txt
+
+ ## License
+
+ MIT
package/bin/cli.mjs ADDED
@@ -0,0 +1,286 @@
+ #!/usr/bin/env node
+ /**
+  * site-mirror CLI
+  *
+  * Usage:
+  *   site-mirror init              Interactive setup - creates site-mirror.config.json
+  *   site-mirror run [options]     Run the mirror (reads config + CLI overrides)
+  *   site-mirror serve [port]      Serve the offline folder locally
+  *
+  * Options (for run):
+  *   --start <url>        Start URL (required if not in config)
+  *   --out <dir>          Output directory (default: ./offline)
+  *   --maxPages <n>       Max pages to crawl (0 = unlimited)
+  *   --maxDepth <n>       Max link depth (0 = unlimited)
+  *   --sameOriginOnly     Only crawl same-origin pages (default: true)
+  *   --seedSitemaps       Seed URLs from sitemap.xml (default: false)
+  *   --singlePage         Download only this one page + all its assets (no crawling)
+  */
+
+ import fs from 'node:fs/promises';
+ import path from 'node:path';
+ import readline from 'node:readline';
+ import { fileURLToPath } from 'node:url';
+ import { execSync, spawn } from 'node:child_process';
+
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
+ const CONFIG_FILE = 'site-mirror.config.json';
+
+ const defaultConfig = {
+   start: '',
+   out: './offline',
+   maxPages: 0,
+   maxDepth: 0,
+   sameOriginOnly: true,
+   seedSitemaps: false,
+   singlePage: false,
+   userAgent:
+     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36',
+ };
+
+ async function loadConfig(cwd) {
+   const configPath = path.join(cwd, CONFIG_FILE);
+   try {
+     const raw = await fs.readFile(configPath, 'utf-8');
+     return { ...defaultConfig, ...JSON.parse(raw) };
+   } catch {
+     return { ...defaultConfig };
+   }
+ }
+
+ function parseCliArgs(argv) {
+   const args = {};
+   for (let i = 0; i < argv.length; i++) {
+     const key = argv[i];
+     const next = argv[i + 1];
+
+     if (key === '--start') {
+       args.start = next;
+       i++;
+     } else if (key === '--out') {
+       args.out = next;
+       i++;
+     } else if (key === '--maxPages') {
+       args.maxPages = Number(next);
+       i++;
+     } else if (key === '--maxDepth') {
+       args.maxDepth = Number(next);
+       i++;
+     } else if (key === '--sameOriginOnly') {
+       // Accept both a bare flag and an explicit value; don't swallow a following flag.
+       if (next !== undefined && !next.startsWith('--')) {
+         args.sameOriginOnly = next !== 'false' && next !== '0';
+         i++;
+       } else {
+         args.sameOriginOnly = true;
+       }
+     } else if (key === '--seedSitemaps') {
+       if (next !== undefined && !next.startsWith('--')) {
+         args.seedSitemaps = next === 'true' || next === '1' || next === 'yes';
+         i++;
+       } else {
+         args.seedSitemaps = true;
+       }
+     } else if (key === '--singlePage') {
+       args.singlePage = true;
+     } else if (key === '--userAgent') {
+       args.userAgent = next;
+       i++;
+     }
+   }
+   return args;
+ }
+
+ async function prompt(rl, question, defaultValue) {
+   return new Promise((resolve) => {
+     const hint = defaultValue !== undefined ? ` (${defaultValue})` : '';
+     rl.question(`${question}${hint}: `, (answer) => {
+       const trimmed = answer.trim();
+       resolve(trimmed === '' && defaultValue !== undefined ? defaultValue : trimmed);
+     });
+   });
+ }
+
+ async function cmdInit(cwd) {
+   const configPath = path.join(cwd, CONFIG_FILE);
+   const exists = await fs
+     .access(configPath)
+     .then(() => true)
+     .catch(() => false);
+
+   if (exists) {
+     console.log(`Config already exists: ${configPath}`);
+     console.log('Delete it first if you want to reinitialize.');
+     return;
+   }
+
+   console.log('');
+   console.log('Welcome to site-mirror! Let\'s set up your config.');
+   console.log('Press Enter to accept the default value shown in parentheses.');
+   console.log('');
+
+   const rl = readline.createInterface({
+     input: process.stdin,
+     output: process.stdout,
+   });
+
+   try {
+     const start = await prompt(rl, 'Website URL to mirror', 'https://example.com/');
+     const out = await prompt(rl, 'Output directory', './offline');
+     const singlePageStr = await prompt(rl, 'Single page only? (yes/no)', 'no');
+     const singlePage = singlePageStr === 'yes' || singlePageStr === 'y' || singlePageStr === 'true';
+
+     let maxPages = 0;
+     let maxDepth = 0;
+     let seedSitemaps = false;
+
+     if (!singlePage) {
+       const maxPagesStr = await prompt(rl, 'Max pages to crawl (0 = unlimited)', '200');
+       maxPages = parseInt(maxPagesStr, 10) || 0;
+
+       const maxDepthStr = await prompt(rl, 'Max link depth (0 = unlimited)', '6');
+       maxDepth = parseInt(maxDepthStr, 10) || 0;
+
+       const seedStr = await prompt(rl, 'Seed from sitemap.xml? (yes/no)', 'no');
+       seedSitemaps = seedStr === 'yes' || seedStr === 'y' || seedStr === 'true';
+     }
+
+     const config = {
+       start,
+       out,
+       singlePage,
+       maxPages,
+       maxDepth,
+       sameOriginOnly: true,
+       seedSitemaps,
+     };
+
+     await fs.writeFile(configPath, JSON.stringify(config, null, 2) + '\n');
+     console.log('');
+     console.log(`Created ${CONFIG_FILE}`);
+     console.log('');
+     console.log('Run "site-mirror run" to start mirroring!');
+   } finally {
+     rl.close();
+   }
+ }
+
+ async function cmdRun(cwd, cliArgs) {
+   const config = await loadConfig(cwd);
+   const merged = { ...config, ...cliArgs };
+
+   // Validate required fields
+   if (!merged.start) {
+     console.error('');
+     console.error('Error: No start URL specified.');
+     console.error('');
+     console.error('You can either:');
+     console.error('  1. Run "site-mirror init" to create a config file');
+     console.error('  2. Pass --start <url> directly, e.g.:');
+     console.error('     site-mirror run --start https://example.com/');
+     console.error('     site-mirror run --start https://example.com/page --singlePage');
+     console.error('');
+     process.exit(1);
+   }
+
+   // Validate URL format
+   try {
+     new URL(merged.start);
+   } catch {
+     console.error('');
+     console.error(`Error: Invalid URL "${merged.start}"`);
+     console.error('Please provide a valid URL starting with http:// or https://');
+     console.error('');
+     process.exit(1);
+   }
+
+   console.log('');
+   console.log('Configuration:');
+   console.log(`  start:          ${merged.start}`);
+   console.log(`  out:            ${merged.out}`);
+   if (merged.singlePage) {
+     console.log(`  mode:           SINGLE PAGE (no crawling)`);
+   } else {
+     console.log(`  maxPages:       ${merged.maxPages || 'unlimited'}`);
+     console.log(`  maxDepth:       ${merged.maxDepth || 'unlimited'}`);
+     console.log(`  seedSitemaps:   ${merged.seedSitemaps}`);
+   }
+   console.log(`  sameOriginOnly: ${merged.sameOriginOnly}`);
+   console.log('');
+
+   // Dynamically import the mirror lib
+   const { mirrorSite } = await import('../lib/mirror.mjs');
+   await mirrorSite({
+     start: merged.start,
+     out: path.resolve(cwd, merged.out),
+     maxPages: merged.singlePage ? 1 : merged.maxPages,
+     maxDepth: merged.singlePage ? 0 : merged.maxDepth,
+     sameOriginOnly: merged.sameOriginOnly,
+     seedSitemaps: merged.singlePage ? false : merged.seedSitemaps,
+     singlePage: merged.singlePage,
+     userAgent: merged.userAgent,
+   });
+ }
+
+ async function cmdServe(cwd, port) {
+   const outDir = path.join(cwd, 'offline');
+   const exists = await fs
+     .access(outDir)
+     .then(() => true)
+     .catch(() => false);
+
+   if (!exists) {
+     console.error(`No offline folder found at ${outDir}`);
+     console.error('Run "site-mirror run" first to download the site.');
+     process.exit(1);
+   }
+
+   console.log(`Serving ${outDir} on http://localhost:${port}/`);
+   try {
+     execSync(`npx http-server "${outDir}" -p ${port} -c-1`, { stdio: 'inherit' });
+   } catch {
+     // User closed with Ctrl+C
+   }
+ }
+
+ async function main() {
+   const cwd = process.cwd();
+   const args = process.argv.slice(2);
+   const command = args[0];
+
+   if (!command || command === 'help' || command === '--help' || command === '-h') {
+     console.log(`
+ site-mirror - Mirror websites for offline browsing
+
+ Usage:
+   site-mirror init              Interactive setup - creates site-mirror.config.json
+   site-mirror run [options]     Mirror the website (reads config + CLI overrides)
+   site-mirror serve [port]      Serve the offline folder (default port: 8080)
+
+ Run options:
+   --start <url>        Start URL (required if not in config)
+   --out <dir>          Output directory (default: ./offline)
+   --maxPages <n>       Max pages (0 = unlimited)
+   --maxDepth <n>       Max depth (0 = unlimited)
+   --seedSitemaps true  Seed from sitemap.xml
+   --singlePage         Download only this page + all its assets (no crawling)
+
+ Examples:
+   site-mirror run --start https://example.com/page --singlePage
+   site-mirror init
+   site-mirror run
+   site-mirror serve 3000
+ `);
+     return;
+   }
+
+   if (command === 'init') {
+     await cmdInit(cwd);
+   } else if (command === 'run') {
+     const cliArgs = parseCliArgs(args.slice(1));
+     await cmdRun(cwd, cliArgs);
+   } else if (command === 'serve') {
+     const port = args[1] || '8080';
+     await cmdServe(cwd, port);
+   } else {
+     // Assume it's run with options directly (no subcommand)
+     const cliArgs = parseCliArgs(args);
+     await cmdRun(cwd, cliArgs);
+   }
+ }
+
+ main().catch((err) => {
+   console.error(err);
+   process.exit(1);
+ });
package/bin/postinstall.mjs ADDED
@@ -0,0 +1,15 @@
+ #!/usr/bin/env node
+ /**
+  * Post-install script to ensure Playwright browsers are available.
+  * Runs automatically after `npm install`.
+  */
+ import { execSync } from 'node:child_process';
+
+ console.log('[site-mirror] Checking Playwright browsers...');
+
+ try {
+   execSync('npx playwright install chromium', { stdio: 'inherit' });
+   console.log('[site-mirror] Chromium browser ready.');
+ } catch (err) {
+   console.warn('[site-mirror] Could not auto-install Chromium. Run manually: npx playwright install chromium');
+ }
package/lib/mirror.mjs ADDED
@@ -0,0 +1,463 @@
+ /**
+  * Core mirroring logic.
+  * Exports mirrorSite() for use by the CLI.
+  */
+ import { chromium } from 'playwright';
+ import * as cheerio from 'cheerio';
+ import fs from 'node:fs/promises';
+ import path from 'node:path';
+ import crypto from 'node:crypto';
+
+ async function ensureDir(dirPath) {
+   await fs.mkdir(dirPath, { recursive: true });
+ }
+
+ function normalizePageUrl(urlString) {
+   const u = new URL(urlString);
+   u.hash = '';
+   return u;
+ }
+
+ function shouldSkipLink(url) {
+   const ext = path.posix.extname(url.pathname).toLowerCase();
+   const skipExt = new Set([
+     '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
+     '.css', '.js', '.mjs', '.map',
+     '.json', '.xml', '.txt',
+     '.woff', '.woff2', '.ttf', '.otf',
+     '.pdf', '.zip', '.rar', '.7z',
+     '.mp4', '.webm', '.mp3', '.wav',
+   ]);
+   return skipExt.has(ext);
+ }
+
+ function pageOutputPath(outRoot, pageUrl) {
+   const pathname = decodeURIComponent(pageUrl.pathname || '/');
+   let rel;
+
+   if (pathname === '/' || pathname === '') {
+     rel = 'index.html';
+   } else if (pathname.endsWith('/')) {
+     rel = path.posix.join(pathname, 'index.html').slice(1);
+   } else {
+     const ext = path.posix.extname(pathname);
+     if (ext) rel = pathname.slice(1);
+     else rel = path.posix.join(pathname, 'index.html').slice(1);
+   }
+
+   if (pageUrl.search) {
+     const hash = crypto.createHash('sha1').update(pageUrl.search).digest('hex').slice(0, 8);
+     const dir = path.posix.dirname(rel);
+     const base = path.posix.basename(rel, '.html');
+     rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}.html`);
+   }
+
+   return path.join(outRoot, rel);
+ }
+
+ function guessExtension(contentType) {
+   const ct = (contentType || '').toLowerCase();
+   if (ct.includes('text/css')) return '.css';
+   if (ct.includes('javascript')) return '.js';
+   if (ct.includes('application/wasm')) return '.wasm';
+   if (ct.includes('image/svg+xml')) return '.svg';
+   if (ct.startsWith('image/')) return `.${ct.split('image/')[1].split(';')[0]}`;
+   if (ct.includes('font/woff2')) return '.woff2';
+   if (ct.includes('font/woff')) return '.woff';
+   if (ct.includes('font/ttf')) return '.ttf';
+   if (ct.includes('font/otf')) return '.otf';
+   return '';
+ }
+
+ function assetOutputPath(outRoot, startOrigin, assetUrl, contentType) {
+   const pathname = assetUrl.pathname || '/';
+   const ext = path.posix.extname(pathname);
+
+   if (assetUrl.origin === startOrigin) {
+     const rel = pathname === '/' ? 'index' : pathname.slice(1);
+     return path.join(outRoot, rel);
+   }
+
+   const hostRoot = path.posix.join('_external', assetUrl.hostname);
+   let rel = pathname === '/' ? 'index' : pathname.slice(1);
+   rel = path.posix.join(hostRoot, rel);
+
+   if (assetUrl.search) {
+     const hash = crypto.createHash('sha1').update(assetUrl.search).digest('hex').slice(0, 8);
+     const dir = path.posix.dirname(rel);
+     const base = path.posix.basename(rel, ext || '');
+     const finalExt = ext || guessExtension(contentType) || '';
+     rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}${finalExt}`);
+   } else if (!ext) {
+     const guessed = guessExtension(contentType);
+     if (guessed) rel = `${rel}${guessed}`;
+   }
+
+   return path.join(outRoot, rel);
+ }
+
+ function rewriteAbsoluteSameOriginUrls(html, startOrigin) {
+   const $ = cheerio.load(html);
+   const attrs = [
+     { selector: 'a', attr: 'href' },
+     { selector: 'link', attr: 'href' },
+     { selector: 'script', attr: 'src' },
+     { selector: 'img', attr: 'src' },
+     { selector: 'source', attr: 'src' },
+     { selector: 'source', attr: 'srcset' },
+     { selector: 'img', attr: 'srcset' },
+     { selector: 'video', attr: 'src' },
+     { selector: 'video', attr: 'poster' },
+     { selector: 'audio', attr: 'src' },
+     { selector: 'form', attr: 'action' },
+   ];
+
+   for (const { selector, attr } of attrs) {
+     $(selector).each((_, el) => {
+       const value = $(el).attr(attr);
+       if (!value) return;
+       if (value.startsWith('mailto:') || value.startsWith('tel:') || value.startsWith('javascript:')) return;
+
+       // Handle srcset specially (comma-separated list of URLs)
+       if (attr === 'srcset') {
+         const parts = value.split(',').map((part) => {
+           const trimmed = part.trim();
+           const [url, descriptor] = trimmed.split(/\s+/);
+           try {
+             const normalized = url.startsWith('//') ? `https:${url}` : url;
+             const u = new URL(normalized, startOrigin);
+             if (u.origin === startOrigin) {
+               return descriptor ? `${u.pathname}${u.search} ${descriptor}` : u.pathname + u.search;
+             }
+           } catch {}
+           return trimmed;
+         });
+         $(el).attr(attr, parts.join(', '));
+         return;
+       }
+
+       try {
+         const normalized = value.startsWith('//') ? `https:${value}` : value;
+         const u = new URL(normalized, startOrigin);
+         if (u.origin === startOrigin) {
+           $(el).attr(attr, `${u.pathname}${u.search}${u.hash}`);
+         }
+       } catch {
+         // ignore
+       }
+     });
+   }
+
+   return $.html();
+ }
+
+ function injectOfflineFullNavigation(html) {
155
+ const $ = cheerio.load(html);
156
+ const script = `\n<script>(function(){function a(e){for(;e&&e!==document.body;){if(e.tagName==='A')return e;e=e.parentElement}return null}document.addEventListener('click',function(e){if(e.defaultPrevented)return;if(e.button!==0)return;if(e.metaKey||e.ctrlKey||e.shiftKey||e.altKey)return;var t=a(e.target);if(!t)return;var h=t.getAttribute('href');if(!h||h.startsWith('#'))return;if(h.startsWith('http://')||h.startsWith('https://')||h.startsWith('mailto:')||h.startsWith('tel:'))return;var tg=t.getAttribute('target');if(tg&&tg!=='_self')return;e.preventDefault();window.location.href=h;},true);})();</script>\n`;
157
+ if ($('body').length) $('body').append(script);
158
+ else $.root().append(script);
159
+ return $.html();
160
+ }
161
+
162
+ async function writeFileSafe(filePath, bufferOrString) {
163
+ await ensureDir(path.dirname(filePath));
164
+ await fs.writeFile(filePath, bufferOrString);
165
+ }
166
+
167
+ function parseSitemapLocs(xmlText) {
168
+ const locs = [];
169
+ const re = /<loc>([^<]+)<\/loc>/gi;
170
+ let m;
171
+ while ((m = re.exec(xmlText)) !== null) {
172
+ const loc = m[1].trim();
173
+ if (loc) locs.push(loc);
174
+ }
175
+ return locs;
176
+ }
177
+
178
+ async function fetchText(url) {
179
+ const controller = new AbortController();
180
+ const timeoutMs = 20000;
181
+ const timeout = setTimeout(() => controller.abort(), timeoutMs);
182
+
183
+ try {
184
+ const res = await fetch(url, {
185
+ redirect: 'follow',
186
+ headers: { 'user-agent': 'offline-mirror/1.0' },
187
+ signal: controller.signal,
188
+ });
189
+ if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
190
+ return await res.text();
191
+ } finally {
192
+ clearTimeout(timeout);
193
+ }
194
+ }
195
+
196
+ async function discoverSitemapUrls(startOrigin) {
197
+ const urls = new Set();
198
+
199
+ try {
200
+ const robots = await fetchText(`${startOrigin}/robots.txt`);
201
+ for (const line of robots.split(/\r?\n/)) {
202
+ const m = /^sitemap:\s*(.+)$/i.exec(line.trim());
203
+ if (m?.[1]) urls.add(m[1].trim());
204
+ }
205
+ } catch {
206
+ // ignore
207
+ }
208
+
209
+ urls.add(`${startOrigin}/sitemap.xml`);
210
+ return [...urls];
211
+ }
212
+
213
+ async function seedFromSitemaps({ startOrigin, enqueue }) {
214
+ const sitemapUrls = await discoverSitemapUrls(startOrigin);
215
+ const seenSitemaps = new Set();
216
+ const queue = [...sitemapUrls];
217
+
218
+ while (queue.length) {
219
+ const sitemapUrl = queue.shift();
220
+ if (seenSitemaps.has(sitemapUrl)) continue;
221
+ seenSitemaps.add(sitemapUrl);
222
+
223
+ console.log(`Sitemap: ${sitemapUrl}`);
224
+
225
+ let xml;
226
+ try {
227
+ xml = await fetchText(sitemapUrl);
228
+ } catch (e) {
229
+ console.log(` (skipped: ${e?.message || 'fetch failed'})`);
230
+ continue;
231
+ }
232
+
233
+ let count = 0;
234
+ for (const loc of parseSitemapLocs(xml)) {
235
+ let u;
236
+ try {
237
+ u = new URL(loc);
238
+ } catch {
239
+ continue;
240
+ }
241
+
242
+ if (u.pathname.endsWith('.xml') && u.origin === startOrigin && /sitemap/i.test(u.pathname)) {
243
+ queue.push(u.toString());
244
+ continue;
245
+ }
246
+
247
+ if (u.origin !== startOrigin) continue;
248
+ u.hash = '';
249
+ enqueue(u.toString());
250
+ count++;
251
+ }
252
+ console.log(` (found ${count} URLs)`);
253
+ }
254
+ }
255
+
256
+ export async function mirrorSite({ start, out, maxPages, maxDepth, sameOriginOnly, seedSitemaps, singlePage, userAgent }) {
257
+ const startUrl = normalizePageUrl(start);
258
+ const startOrigin = startUrl.origin;
259
+ const outRoot = path.resolve(out);
260
+
261
+ await ensureDir(outRoot);
262
+
263
+ const visitedPages = new Set();
264
+ const enqueuedPages = new Set();
265
+ const savedAssets = new Set();
266
+
267
+ const queue = [{ url: startUrl.toString(), depth: 0 }];
268
+ enqueuedPages.add(startUrl.toString());
269
+
270
+ const enqueue = (url, depth = 1) => {
271
+ if (enqueuedPages.has(url) || visitedPages.has(url)) return;
272
+ queue.push({ url, depth });
273
+ enqueuedPages.add(url);
274
+ };
275
+
276
+ if (seedSitemaps) {
277
+ console.log('Seeding from sitemap/robots...');
278
+ try {
279
+ await seedFromSitemaps({
280
+ startOrigin,
281
+ enqueue: (url) => {
282
+ try {
283
+ const u = normalizePageUrl(url);
284
+ if (sameOriginOnly && u.origin !== startOrigin) return;
285
+ if (shouldSkipLink(u)) return;
286
+ enqueue(u.toString(), 1);
287
+ } catch {
288
+ // ignore
289
+ }
290
+ },
291
+ });
292
+ } catch (e) {
293
+ console.warn(`Sitemap seeding failed: ${e?.message || e}`);
294
+ }
295
+ console.log(`Seeding complete. Queue size: ${queue.length}`);
296
+ }
297
+
298
+ let browser = await chromium.launch({ headless: true });
299
+ let context = await browser.newContext({ userAgent });
300
+
301
+ const relaunch = async (reason) => {
302
+ try {
303
+ console.warn(`Relaunching browser context: ${reason}`);
304
+ await context?.close();
305
+ } catch {}
306
+ try {
307
+ await browser?.close();
308
+ } catch {}
309
+ browser = await chromium.launch({ headless: true });
310
+ context = await browser.newContext({ userAgent });
311
+ };
312
+
313
+ const unlimitedPages = !maxPages || maxPages <= 0;
314
+ const unlimitedDepth = !maxDepth || maxDepth <= 0;
315
+
316
+ try {
317
+ while (queue.length > 0 && (unlimitedPages || visitedPages.size < maxPages)) {
318
+ const { url, depth } = queue.shift();
319
+ if (visitedPages.has(url)) continue;
320
+ if (!unlimitedDepth && depth > maxDepth) continue;
321
+
322
+ const pageUrl = normalizePageUrl(url);
323
+ if (sameOriginOnly && pageUrl.origin !== startOrigin) continue;
324
+
325
+ visitedPages.add(pageUrl.toString());
326
+ const progressMax = unlimitedPages ? '∞' : String(maxPages);
327
+ console.log(`[${visitedPages.size}/${progressMax}] ${pageUrl.toString()}`);
328
+
329
+ let page;
330
+ try {
331
+ page = await context.newPage();
332
+ } catch (e) {
333
+ await relaunch(e?.message || 'newPage failed');
334
+ page = await context.newPage();
335
+ }
336
+
337
+ const onResponse = async (response) => {
338
+ try {
339
+ const req = response.request();
340
+ if (req.method() !== 'GET') return;
341
+
342
+ const resourceType = req.resourceType();
343
+ if (['xhr', 'fetch', 'websocket', 'eventsource'].includes(resourceType)) return;
344
+
345
+ const responseUrl = response.url();
346
+ if (!responseUrl.startsWith('http://') && !responseUrl.startsWith('https://')) return;
347
+
348
+ const assetUrl = new URL(responseUrl);
349
+ const ct = response.headers()['content-type'] || '';
350
+
351
+ const isStaticLike = ['stylesheet', 'script', 'image', 'font', 'media', 'other'].includes(resourceType);
352
+ if (!isStaticLike) return;
353
+ if (ct.toLowerCase().includes('text/html') || resourceType === 'document') return;
354
+
355
+ const ctLower = ct.toLowerCase();
356
+ const allowed =
357
+ ctLower.includes('text/css') ||
358
+ ctLower.includes('javascript') ||
359
+ ctLower.startsWith('image/') ||
360
+ ctLower.startsWith('font/') ||
361
+ ctLower.startsWith('video/') ||
362
+ ctLower.startsWith('audio/') ||
363
+ ctLower.includes('font-woff') ||
364
+ ctLower.includes('application/wasm') ||
365
+ ctLower.includes('application/octet-stream');
366
+ if (!allowed) return;
367
+
368
+ const key = assetUrl.toString();
369
+ if (savedAssets.has(key)) return;
370
+
371
+ const body = await response.body();
372
+ const filePath = assetOutputPath(outRoot, startOrigin, assetUrl, ct);
373
+ await writeFileSafe(filePath, body);
374
+ savedAssets.add(key);
375
+ } catch {
376
+ // best effort
377
+ }
378
+ };
379
+
380
+ page.on('response', onResponse);
381
+
382
+ try {
383
+ try {
384
+ await page.goto(pageUrl.toString(), { waitUntil: 'domcontentloaded', timeout: 60000 });
385
+ } catch (e) {
386
+ const message = e?.message || String(e);
387
+ if (message.includes('net::ERR_ABORTED')) {
388
+ await page.goto(pageUrl.toString(), { waitUntil: 'load', timeout: 60000 });
389
+ } else {
390
+ throw e;
391
+ }
392
+ }
393
+
394
+ try {
395
+ await page.waitForLoadState('networkidle', { timeout: 20000 });
396
+ } catch {
397
+ // long-lived connections are fine
398
+ }
399
+
400
+ try {
401
+ await page.waitForTimeout(1500);
402
+ } catch {}
403
+
404
+ // Skip link discovery in single-page mode
405
+ if (!singlePage) {
406
+ const rawLinks = await page.$$eval('a[href]', (els) =>
407
+ els.map((e) => e.getAttribute('href')).filter(Boolean)
408
+ );
409
+
410
+ for (const href of rawLinks) {
411
+ if (!href || href.startsWith('#')) continue;
412
+ if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('javascript:')) continue;
413
+
414
+ let linkUrl;
415
+ try {
416
+ linkUrl = new URL(href, pageUrl.toString());
417
+ } catch {
418
+ continue;
419
+ }
420
+
421
+ if (sameOriginOnly && linkUrl.origin !== startOrigin) continue;
422
+ linkUrl.hash = '';
423
+ if (shouldSkipLink(linkUrl)) continue;
424
+
425
+ const normalized = linkUrl.toString();
426
+ if (!enqueuedPages.has(normalized) && !visitedPages.has(normalized)) {
427
+ enqueue(normalized, depth + 1);
428
+ }
429
+ }
430
+ }
431
+
432
+ let html = await page.content();
433
+ html = rewriteAbsoluteSameOriginUrls(html, startOrigin);
434
+ html = injectOfflineFullNavigation(html);
435
+
436
+ const htmlPath = pageOutputPath(outRoot, pageUrl);
437
+ await writeFileSafe(htmlPath, html);
438
+ } catch (e) {
439
+ const message = e?.message || String(e);
440
+ console.warn(`Failed: ${pageUrl.toString()} (${message})`);
441
+ if (message.includes('Target page, context or browser has been closed')) {
442
+ await relaunch('target closed during navigation');
443
+ }
444
+ } finally {
445
+ page.off('response', onResponse);
446
+ try {
447
+ await page.close();
448
+ } catch {}
449
+ }
450
+ }
451
+ } finally {
452
+ try {
453
+ await context.close();
454
+ } catch {}
455
+ try {
456
+ await browser.close();
457
+ } catch {}
458
+ }
459
+
460
+ console.log('');
461
+ console.log(`Done. Pages saved: ${visitedPages.size}, Assets saved: ${savedAssets.size}`);
462
+ console.log(`Output: ${outRoot}`);
463
+ }
package/package.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "name": "site-mirror",
+   "version": "1.0.0",
+   "description": "CLI tool to mirror websites for offline browsing using Playwright",
+   "type": "module",
+   "bin": {
+     "site-mirror": "bin/cli.mjs"
+   },
+   "scripts": {
+     "postinstall": "node ./bin/postinstall.mjs"
+   },
+   "files": [
+     "bin/",
+     "lib/",
+     "README.md",
+     "LICENSE"
+   ],
+   "keywords": [
+     "mirror",
+     "offline",
+     "website",
+     "crawler",
+     "playwright",
+     "scraper",
+     "web-scraping",
+     "archiver",
+     "static-site"
+   ],
+   "author": "Mahesh Doiphode",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/MaheshDoiphode/site-mirror.git"
+   },
+   "bugs": {
+     "url": "https://github.com/MaheshDoiphode/site-mirror/issues"
+   },
+   "homepage": "https://github.com/MaheshDoiphode/site-mirror#readme",
+   "dependencies": {
+     "cheerio": "^1.0.0-rc.12",
+     "playwright": "^1.49.1"
+   },
+   "engines": {
+     "node": ">=18.0.0"
+   }
+ }