site-mirror 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/mirror.mjs CHANGED
@@ -1,463 +1,511 @@
1
- /**
2
- * Core mirroring logic.
3
- * Exports mirrorSite() for use by the CLI.
4
- */
5
- import { chromium } from 'playwright';
6
- import * as cheerio from 'cheerio';
7
- import fs from 'node:fs/promises';
8
- import path from 'node:path';
9
- import crypto from 'node:crypto';
10
-
11
- async function ensureDir(dirPath) {
12
- await fs.mkdir(dirPath, { recursive: true });
13
- }
14
-
15
- function normalizePageUrl(urlString) {
16
- const u = new URL(urlString);
17
- u.hash = '';
18
- return u;
19
- }
20
-
21
- function shouldSkipLink(url) {
22
- const ext = path.posix.extname(url.pathname).toLowerCase();
23
- const skipExt = new Set([
24
- '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg',
25
- '.css', '.js', '.mjs', '.map',
26
- '.json', '.xml', '.txt',
27
- '.woff', '.woff2', '.ttf', '.otf',
28
- '.pdf', '.zip', '.rar', '.7z',
29
- '.mp4', '.webm', '.mp3', '.wav',
30
- ]);
31
- return skipExt.has(ext);
32
- }
33
-
34
- function pageOutputPath(outRoot, pageUrl) {
35
- const pathname = decodeURIComponent(pageUrl.pathname || '/');
36
- let rel;
37
-
38
- if (pathname === '/' || pathname === '') {
39
- rel = 'index.html';
40
- } else if (pathname.endsWith('/')) {
41
- rel = path.posix.join(pathname, 'index.html').slice(1);
42
- } else {
43
- const ext = path.posix.extname(pathname);
44
- if (ext) rel = pathname.slice(1);
45
- else rel = path.posix.join(pathname, 'index.html').slice(1);
46
- }
47
-
48
- if (pageUrl.search) {
49
- const hash = crypto.createHash('sha1').update(pageUrl.search).digest('hex').slice(0, 8);
50
- const dir = path.posix.dirname(rel);
51
- const base = path.posix.basename(rel, '.html');
52
- rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}.html`);
53
- }
54
-
55
- return path.join(outRoot, rel);
56
- }
57
-
58
- function guessExtension(contentType) {
59
- const ct = (contentType || '').toLowerCase();
60
- if (ct.includes('text/css')) return '.css';
61
- if (ct.includes('javascript')) return '.js';
62
- if (ct.includes('application/wasm')) return '.wasm';
63
- if (ct.includes('image/svg+xml')) return '.svg';
64
- if (ct.startsWith('image/')) return `.${ct.split('image/')[1].split(';')[0]}`;
65
- if (ct.includes('font/woff2')) return '.woff2';
66
- if (ct.includes('font/woff')) return '.woff';
67
- if (ct.includes('font/ttf')) return '.ttf';
68
- if (ct.includes('font/otf')) return '.otf';
69
- return '';
70
- }
71
-
72
- function assetOutputPath(outRoot, startOrigin, assetUrl, contentType) {
73
- const pathname = assetUrl.pathname || '/';
74
- const ext = path.posix.extname(pathname);
75
-
76
- if (assetUrl.origin === startOrigin) {
77
- const rel = pathname === '/' ? 'index' : pathname.slice(1);
78
- return path.join(outRoot, rel);
79
- }
80
-
81
- const hostRoot = path.posix.join('_external', assetUrl.hostname);
82
- let rel = pathname === '/' ? 'index' : pathname.slice(1);
83
- rel = path.posix.join(hostRoot, rel);
84
-
85
- if (assetUrl.search) {
86
- const hash = crypto.createHash('sha1').update(assetUrl.search).digest('hex').slice(0, 8);
87
- const dir = path.posix.dirname(rel);
88
- const base = path.posix.basename(rel, ext || '');
89
- const finalExt = ext || guessExtension(contentType) || '';
90
- rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}${finalExt}`);
91
- } else if (!ext) {
92
- const guessed = guessExtension(contentType);
93
- if (guessed) rel = `${rel}${guessed}`;
94
- }
95
-
96
- return path.join(outRoot, rel);
97
- }
98
-
99
- function rewriteAbsoluteSameOriginUrls(html, startOrigin) {
100
- const $ = cheerio.load(html);
101
- const attrs = [
102
- { selector: 'a', attr: 'href' },
103
- { selector: 'link', attr: 'href' },
104
- { selector: 'script', attr: 'src' },
105
- { selector: 'img', attr: 'src' },
106
- { selector: 'source', attr: 'src' },
107
- { selector: 'source', attr: 'srcset' },
108
- { selector: 'img', attr: 'srcset' },
109
- { selector: 'video', attr: 'src' },
110
- { selector: 'video', attr: 'poster' },
111
- { selector: 'audio', attr: 'src' },
112
- { selector: 'form', attr: 'action' },
113
- ];
114
-
115
- for (const { selector, attr } of attrs) {
116
- $(selector).each((_, el) => {
117
- const value = $(el).attr(attr);
118
- if (!value) return;
119
- if (value.startsWith('mailto:') || value.startsWith('tel:') || value.startsWith('javascript:')) return;
120
-
121
- // Handle srcset specially (comma-separated list of URLs)
122
- if (attr === 'srcset') {
123
- const parts = value.split(',').map((part) => {
124
- const trimmed = part.trim();
125
- const [url, descriptor] = trimmed.split(/\s+/);
126
- try {
127
- const normalized = url.startsWith('//') ? `https:${url}` : url;
128
- const u = new URL(normalized, startOrigin);
129
- if (u.origin === startOrigin) {
130
- return descriptor ? `${u.pathname}${u.search} ${descriptor}` : u.pathname + u.search;
131
- }
132
- } catch {}
133
- return trimmed;
134
- });
135
- $(el).attr(attr, parts.join(', '));
136
- return;
137
- }
138
-
139
- try {
140
- const normalized = value.startsWith('//') ? `https:${value}` : value;
141
- const u = new URL(normalized, startOrigin);
142
- if (u.origin === startOrigin) {
143
- $(el).attr(attr, `${u.pathname}${u.search}${u.hash}`);
144
- }
145
- } catch {
146
- // ignore
147
- }
148
- });
149
- }
150
-
151
- return $.html();
152
- }
153
-
154
- function injectOfflineFullNavigation(html) {
155
- const $ = cheerio.load(html);
156
- const script = `\n<script>(function(){function a(e){for(;e&&e!==document.body;){if(e.tagName==='A')return e;e=e.parentElement}return null}document.addEventListener('click',function(e){if(e.defaultPrevented)return;if(e.button!==0)return;if(e.metaKey||e.ctrlKey||e.shiftKey||e.altKey)return;var t=a(e.target);if(!t)return;var h=t.getAttribute('href');if(!h||h.startsWith('#'))return;if(h.startsWith('http://')||h.startsWith('https://')||h.startsWith('mailto:')||h.startsWith('tel:'))return;var tg=t.getAttribute('target');if(tg&&tg!=='_self')return;e.preventDefault();window.location.href=h;},true);})();</script>\n`;
157
- if ($('body').length) $('body').append(script);
158
- else $.root().append(script);
159
- return $.html();
160
- }
161
-
162
- async function writeFileSafe(filePath, bufferOrString) {
163
- await ensureDir(path.dirname(filePath));
164
- await fs.writeFile(filePath, bufferOrString);
165
- }
166
-
167
- function parseSitemapLocs(xmlText) {
168
- const locs = [];
169
- const re = /<loc>([^<]+)<\/loc>/gi;
170
- let m;
171
- while ((m = re.exec(xmlText)) !== null) {
172
- const loc = m[1].trim();
173
- if (loc) locs.push(loc);
174
- }
175
- return locs;
176
- }
177
-
178
- async function fetchText(url) {
179
- const controller = new AbortController();
180
- const timeoutMs = 20000;
181
- const timeout = setTimeout(() => controller.abort(), timeoutMs);
182
-
183
- try {
184
- const res = await fetch(url, {
185
- redirect: 'follow',
186
- headers: { 'user-agent': 'offline-mirror/1.0' },
187
- signal: controller.signal,
188
- });
189
- if (!res.ok) throw new Error(`HTTP ${res.status} for ${url}`);
190
- return await res.text();
191
- } finally {
192
- clearTimeout(timeout);
193
- }
194
- }
195
-
196
- async function discoverSitemapUrls(startOrigin) {
197
- const urls = new Set();
198
-
199
- try {
200
- const robots = await fetchText(`${startOrigin}/robots.txt`);
201
- for (const line of robots.split(/\r?\n/)) {
202
- const m = /^sitemap:\s*(.+)$/i.exec(line.trim());
203
- if (m?.[1]) urls.add(m[1].trim());
204
- }
205
- } catch {
206
- // ignore
207
- }
208
-
209
- urls.add(`${startOrigin}/sitemap.xml`);
210
- return [...urls];
211
- }
212
-
213
- async function seedFromSitemaps({ startOrigin, enqueue }) {
214
- const sitemapUrls = await discoverSitemapUrls(startOrigin);
215
- const seenSitemaps = new Set();
216
- const queue = [...sitemapUrls];
217
-
218
- while (queue.length) {
219
- const sitemapUrl = queue.shift();
220
- if (seenSitemaps.has(sitemapUrl)) continue;
221
- seenSitemaps.add(sitemapUrl);
222
-
223
- console.log(`Sitemap: ${sitemapUrl}`);
224
-
225
- let xml;
226
- try {
227
- xml = await fetchText(sitemapUrl);
228
- } catch (e) {
229
- console.log(` (skipped: ${e?.message || 'fetch failed'})`);
230
- continue;
231
- }
232
-
233
- let count = 0;
234
- for (const loc of parseSitemapLocs(xml)) {
235
- let u;
236
- try {
237
- u = new URL(loc);
238
- } catch {
239
- continue;
240
- }
241
-
242
- if (u.pathname.endsWith('.xml') && u.origin === startOrigin && /sitemap/i.test(u.pathname)) {
243
- queue.push(u.toString());
244
- continue;
245
- }
246
-
247
- if (u.origin !== startOrigin) continue;
248
- u.hash = '';
249
- enqueue(u.toString());
250
- count++;
251
- }
252
- console.log(` (found ${count} URLs)`);
253
- }
254
- }
255
-
256
- export async function mirrorSite({ start, out, maxPages, maxDepth, sameOriginOnly, seedSitemaps, singlePage, userAgent }) {
257
- const startUrl = normalizePageUrl(start);
258
- const startOrigin = startUrl.origin;
259
- const outRoot = path.resolve(out);
260
-
261
- await ensureDir(outRoot);
262
-
263
- const visitedPages = new Set();
264
- const enqueuedPages = new Set();
265
- const savedAssets = new Set();
266
-
267
- const queue = [{ url: startUrl.toString(), depth: 0 }];
268
- enqueuedPages.add(startUrl.toString());
269
-
270
- const enqueue = (url, depth = 1) => {
271
- if (enqueuedPages.has(url) || visitedPages.has(url)) return;
272
- queue.push({ url, depth });
273
- enqueuedPages.add(url);
274
- };
275
-
276
- if (seedSitemaps) {
277
- console.log('Seeding from sitemap/robots...');
278
- try {
279
- await seedFromSitemaps({
280
- startOrigin,
281
- enqueue: (url) => {
282
- try {
283
- const u = normalizePageUrl(url);
284
- if (sameOriginOnly && u.origin !== startOrigin) return;
285
- if (shouldSkipLink(u)) return;
286
- enqueue(u.toString(), 1);
287
- } catch {
288
- // ignore
289
- }
290
- },
291
- });
292
- } catch (e) {
293
- console.warn(`Sitemap seeding failed: ${e?.message || e}`);
294
- }
295
- console.log(`Seeding complete. Queue size: ${queue.length}`);
296
- }
297
-
298
- let browser = await chromium.launch({ headless: true });
299
- let context = await browser.newContext({ userAgent });
300
-
301
- const relaunch = async (reason) => {
302
- try {
303
- console.warn(`Relaunching browser context: ${reason}`);
304
- await context?.close();
305
- } catch {}
306
- try {
307
- await browser?.close();
308
- } catch {}
309
- browser = await chromium.launch({ headless: true });
310
- context = await browser.newContext({ userAgent });
311
- };
312
-
313
- const unlimitedPages = !maxPages || maxPages <= 0;
314
- const unlimitedDepth = !maxDepth || maxDepth <= 0;
315
-
316
- try {
317
- while (queue.length > 0 && (unlimitedPages || visitedPages.size < maxPages)) {
318
- const { url, depth } = queue.shift();
319
- if (visitedPages.has(url)) continue;
320
- if (!unlimitedDepth && depth > maxDepth) continue;
321
-
322
- const pageUrl = normalizePageUrl(url);
323
- if (sameOriginOnly && pageUrl.origin !== startOrigin) continue;
324
-
325
- visitedPages.add(pageUrl.toString());
326
- const progressMax = unlimitedPages ? '∞' : String(maxPages);
327
- console.log(`[${visitedPages.size}/${progressMax}] ${pageUrl.toString()}`);
328
-
329
- let page;
330
- try {
331
- page = await context.newPage();
332
- } catch (e) {
333
- await relaunch(e?.message || 'newPage failed');
334
- page = await context.newPage();
335
- }
336
-
337
- const onResponse = async (response) => {
338
- try {
339
- const req = response.request();
340
- if (req.method() !== 'GET') return;
341
-
342
- const resourceType = req.resourceType();
343
- if (['xhr', 'fetch', 'websocket', 'eventsource'].includes(resourceType)) return;
344
-
345
- const responseUrl = response.url();
346
- if (!responseUrl.startsWith('http://') && !responseUrl.startsWith('https://')) return;
347
-
348
- const assetUrl = new URL(responseUrl);
349
- const ct = response.headers()['content-type'] || '';
350
-
351
- const isStaticLike = ['stylesheet', 'script', 'image', 'font', 'media', 'other'].includes(resourceType);
352
- if (!isStaticLike) return;
353
- if (ct.toLowerCase().includes('text/html') || resourceType === 'document') return;
354
-
355
- const ctLower = ct.toLowerCase();
356
- const allowed =
357
- ctLower.includes('text/css') ||
358
- ctLower.includes('javascript') ||
359
- ctLower.startsWith('image/') ||
360
- ctLower.startsWith('font/') ||
361
- ctLower.startsWith('video/') ||
362
- ctLower.startsWith('audio/') ||
363
- ctLower.includes('font-woff') ||
364
- ctLower.includes('application/wasm') ||
365
- ctLower.includes('application/octet-stream');
366
- if (!allowed) return;
367
-
368
- const key = assetUrl.toString();
369
- if (savedAssets.has(key)) return;
370
-
371
- const body = await response.body();
372
- const filePath = assetOutputPath(outRoot, startOrigin, assetUrl, ct);
373
- await writeFileSafe(filePath, body);
374
- savedAssets.add(key);
375
- } catch {
376
- // best effort
377
- }
378
- };
379
-
380
- page.on('response', onResponse);
381
-
382
- try {
383
- try {
384
- await page.goto(pageUrl.toString(), { waitUntil: 'domcontentloaded', timeout: 60000 });
385
- } catch (e) {
386
- const message = e?.message || String(e);
387
- if (message.includes('net::ERR_ABORTED')) {
388
- await page.goto(pageUrl.toString(), { waitUntil: 'load', timeout: 60000 });
389
- } else {
390
- throw e;
391
- }
392
- }
393
-
394
- try {
395
- await page.waitForLoadState('networkidle', { timeout: 20000 });
396
- } catch {
397
- // long-lived connections are fine
398
- }
399
-
400
- try {
401
- await page.waitForTimeout(1500);
402
- } catch {}
403
-
404
- // Skip link discovery in single-page mode
405
- if (!singlePage) {
406
- const rawLinks = await page.$$eval('a[href]', (els) =>
407
- els.map((e) => e.getAttribute('href')).filter(Boolean)
408
- );
409
-
410
- for (const href of rawLinks) {
411
- if (!href || href.startsWith('#')) continue;
412
- if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('javascript:')) continue;
413
-
414
- let linkUrl;
415
- try {
416
- linkUrl = new URL(href, pageUrl.toString());
417
- } catch {
418
- continue;
419
- }
420
-
421
- if (sameOriginOnly && linkUrl.origin !== startOrigin) continue;
422
- linkUrl.hash = '';
423
- if (shouldSkipLink(linkUrl)) continue;
424
-
425
- const normalized = linkUrl.toString();
426
- if (!enqueuedPages.has(normalized) && !visitedPages.has(normalized)) {
427
- enqueue(normalized, depth + 1);
428
- }
429
- }
430
- }
431
-
432
- let html = await page.content();
433
- html = rewriteAbsoluteSameOriginUrls(html, startOrigin);
434
- html = injectOfflineFullNavigation(html);
435
-
436
- const htmlPath = pageOutputPath(outRoot, pageUrl);
437
- await writeFileSafe(htmlPath, html);
438
- } catch (e) {
439
- const message = e?.message || String(e);
440
- console.warn(`Failed: ${pageUrl.toString()} (${message})`);
441
- if (message.includes('Target page, context or browser has been closed')) {
442
- await relaunch('target closed during navigation');
443
- }
444
- } finally {
445
- page.off('response', onResponse);
446
- try {
447
- await page.close();
448
- } catch {}
449
- }
450
- }
451
- } finally {
452
- try {
453
- await context.close();
454
- } catch {}
455
- try {
456
- await browser.close();
457
- } catch {}
458
- }
459
-
460
- console.log('');
461
- console.log(`Done. Pages saved: ${visitedPages.size}, Assets saved: ${savedAssets.size}`);
462
- console.log(`Output: ${outRoot}`);
463
- }
1
+ /**
2
+ * Core mirroring logic.
3
+ * Exports mirrorSite() for use by the CLI.
4
+ */
5
+ import { chromium } from 'playwright';
6
+ import * as cheerio from 'cheerio';
7
+ import fs from 'node:fs/promises';
8
+ import path from 'node:path';
9
+ import crypto from 'node:crypto';
10
+
11
/**
 * Create a directory together with any missing parent directories.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {Promise<void>} Settles once the `fs.mkdir` call has completed.
 */
async function ensureDir(dirPath) {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(dirPath, mkdirOptions);
}
14
+
15
/**
 * Parse a URL string and drop its fragment.
 *
 * @param {string} urlString - Absolute URL to parse.
 * @returns {URL} The parsed URL with an empty `hash`.
 * @throws {TypeError} When `urlString` cannot be parsed by the URL constructor.
 */
function normalizePageUrl(urlString) {
  // Object.assign goes through the URL `hash` setter, clearing the fragment.
  return Object.assign(new URL(urlString), { hash: '' });
}
20
+
21
/**
 * File extensions that identify a link target as a static resource or download
 * rather than an HTML page. Built once at module load instead of on every call.
 */
const NON_PAGE_EXTENSIONS = new Set([
  '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.ico', '.avif', '.bmp',
  '.css', '.js', '.mjs', '.map',
  '.json', '.xml', '.txt',
  '.woff', '.woff2', '.ttf', '.otf', '.eot',
  '.pdf', '.zip', '.rar', '.7z', '.gz', '.tar',
  '.mp4', '.webm', '.mp3', '.wav', '.ogg', '.mov',
]);

/**
 * Decide whether a discovered link should be excluded from the page queue
 * because its path ends in a known non-page file extension.
 *
 * @param {URL} url - Link target; only `url.pathname` is inspected.
 * @returns {boolean} `true` when the (case-insensitive) extension is in the skip list.
 */
function shouldSkipLink(url) {
  const ext = path.posix.extname(url.pathname).toLowerCase();
  return NON_PAGE_EXTENSIONS.has(ext);
}
33
+
34
/**
 * Map a page URL to the file path where its HTML snapshot is stored.
 *
 * - `/` maps to `index.html`.
 * - Paths ending in `/` or without a file extension map to `<path>/index.html`.
 * - Paths with an extension are used as-is.
 * - A query string adds a `__qs-<8 hex chars of sha1(search)>` suffix and forces `.html`.
 *
 * @param {string} outRoot - Output directory the result must stay inside.
 * @param {URL} pageUrl - Page URL; `pathname`, `search` and `href` are read.
 * @returns {string} File path under `outRoot`.
 * @throws {Error} When the computed path would still resolve outside `outRoot`.
 */
function pageOutputPath(outRoot, pageUrl) {
  const rawPathname = pageUrl.pathname || '/';
  let pathname;
  try {
    pathname = decodeURIComponent(rawPathname);
  } catch {
    // Malformed escapes (e.g. "%zz") raise URIError; fall back to the
    // still-encoded pathname instead of failing the whole page.
    pathname = rawPathname;
  }

  let rel;
  if (pathname === '/' || pathname === '') {
    rel = 'index.html';
  } else if (pathname.endsWith('/') || !path.posix.extname(pathname)) {
    rel = path.posix.join(pathname, 'index.html').slice(1);
  } else {
    rel = pathname.slice(1);
  }

  // Decoding can surface ".." segments that were hidden behind "%2F".
  // Normalizing as an absolute path clamps them at the root so the result
  // cannot climb out of outRoot.
  rel = path.posix.normalize(`/${rel}`).slice(1);

  if (pageUrl.search) {
    const hash = crypto.createHash('sha1').update(pageUrl.search).digest('hex').slice(0, 8);
    const dir = path.posix.dirname(rel);
    const base = path.posix.basename(rel, '.html');
    rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}.html`);
  }

  const target = path.join(outRoot, rel);

  // Last line of defence for platform-specific separators (e.g. a decoded
  // backslash on Windows) that the POSIX normalization above does not see.
  const fromRoot = path.relative(outRoot, target);
  if (fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot)) {
    throw new Error(`Refusing to write outside output directory: ${pageUrl.href}`);
  }

  return target;
}
57
+
58
/**
 * Guess a file extension (including the leading dot) from a Content-Type value.
 *
 * The header value is controlled by the remote server and the result is
 * appended to a file path, so image subtypes are only accepted when they
 * consist of plain token characters; anything else yields an empty string.
 *
 * @param {string} [contentType] - Raw Content-Type header value; may include parameters.
 * @returns {string} Extension such as `.css` or `.png`, or `''` when unknown.
 */
function guessExtension(contentType) {
  const ct = (contentType || '').toLowerCase();
  // Media type without parameters or surrounding whitespace.
  const mime = ct.split(';')[0].trim();

  if (ct.includes('text/css')) return '.css';
  if (ct.includes('javascript')) return '.js';
  if (ct.includes('application/wasm')) return '.wasm';
  if (ct.includes('image/svg+xml')) return '.svg';
  if (mime === 'image/x-icon' || mime === 'image/vnd.microsoft.icon') return '.ico';
  if (mime.startsWith('image/')) {
    const subtype = mime.slice('image/'.length);
    // Reject subtypes containing separators or other unexpected characters
    // so a crafted header cannot inject path segments into the file name.
    return /^[a-z0-9][a-z0-9.+-]*$/.test(subtype) ? `.${subtype}` : '';
  }
  // Matches font/woff2 as well as legacy application/font-woff2 style types;
  // woff2 must be tested before woff because it contains it.
  if (ct.includes('woff2')) return '.woff2';
  if (ct.includes('woff')) return '.woff';
  if (ct.includes('font/ttf')) return '.ttf';
  if (ct.includes('font/otf')) return '.otf';
  return '';
}
71
+
72
/**
 * Map an asset URL to the file path where its response body is stored.
 *
 * - Assets whose origin equals `startOrigin` are stored at their URL path
 *   under `outRoot`; the query string is ignored.
 * - Other assets are stored under `_external/<hostname>/...`. A query string
 *   adds a `__qs-<8 hex chars of sha1(search)>` suffix, and a missing
 *   extension is guessed from `contentType`.
 * - A pathname ending in `/` is stored as `<dir>/index`, because a path with
 *   a trailing separator cannot be written as a file.
 *
 * @param {string} outRoot - Output directory the result must stay inside.
 * @param {string} startOrigin - Origin treated as "same site".
 * @param {URL} assetUrl - Asset URL; `origin`, `hostname`, `pathname`, `search` are read.
 * @param {string} contentType - Content-Type header value used for extension guessing.
 * @returns {string} File path under `outRoot`.
 * @throws {Error} When the computed path would resolve outside `outRoot`.
 */
function assetOutputPath(outRoot, startOrigin, assetUrl, contentType) {
  const rawPathname = assetUrl.pathname || '/';
  const pathname = rawPathname.endsWith('/') ? `${rawPathname}index` : rawPathname;
  const ext = path.posix.extname(pathname);
  let rel = pathname.slice(1);

  if (assetUrl.origin !== startOrigin) {
    rel = path.posix.join('_external', assetUrl.hostname, rel);

    if (assetUrl.search) {
      const hash = crypto.createHash('sha1').update(assetUrl.search).digest('hex').slice(0, 8);
      const dir = path.posix.dirname(rel);
      const base = path.posix.basename(rel, ext || '');
      const finalExt = ext || guessExtension(contentType) || '';
      rel = path.posix.join(dir === '.' ? '' : dir, `${base}__qs-${hash}${finalExt}`);
    } else if (!ext) {
      const guessed = guessExtension(contentType);
      if (guessed) rel = `${rel}${guessed}`;
    }
  }

  const target = path.join(outRoot, rel);

  // Both the URL and the content type come from the remote server; never
  // hand the writer a path that climbs out of the output directory.
  const fromRoot = path.relative(outRoot, target);
  if (fromRoot === '..' || fromRoot.startsWith(`..${path.sep}`) || path.isAbsolute(fromRoot)) {
    throw new Error(`Refusing to write asset outside output directory: ${assetUrl.pathname}`);
  }

  return target;
}
98
+
99
/**
 * Rewrite URLs in an HTML document so they point into the local mirror.
 *
 * - Same-origin http(s) URLs become root-relative (`/path?query#hash`).
 * - Cross-origin http(s) URLs on resource-loading attributes (stylesheets,
 *   scripts, images, media) become `/_external/<hostname>/path...`.
 * - Cross-origin navigation targets (`<a href>`, `<form action>`) are left
 *   untouched so outbound links keep working.
 * - Non-http(s) URLs (`data:`, `blob:`, `mailto:`, `javascript:` ...) are
 *   never rewritten; their origin is opaque, so they must not be treated as
 *   "cross-origin assets".
 *
 * @param {string} html - Serialized HTML document.
 * @param {string} startOrigin - Origin of the mirrored site, e.g. `https://example.com`.
 * @returns {string} The re-serialized HTML with rewritten attributes.
 */
function rewriteAbsoluteSameOriginUrls(html, startOrigin) {
  const $ = cheerio.load(html);

  // `external` marks attributes that load a sub-resource and may therefore be
  // redirected to the `_external/` tree; navigation targets are excluded.
  const attrs = [
    { selector: 'a', attr: 'href', external: false },
    { selector: 'link', attr: 'href', external: true },
    { selector: 'script', attr: 'src', external: true },
    { selector: 'img', attr: 'src', external: true },
    { selector: 'source', attr: 'src', external: true },
    { selector: 'source', attr: 'srcset', external: true },
    { selector: 'img', attr: 'srcset', external: true },
    { selector: 'video', attr: 'src', external: true },
    { selector: 'video', attr: 'poster', external: true },
    { selector: 'audio', attr: 'src', external: true },
    { selector: 'form', attr: 'action', external: false },
  ];

  // Returns the local replacement for `rawUrl`, or null to leave it as-is.
  const toLocalPath = (rawUrl, allowExternal, keepHash) => {
    let u;
    try {
      // Protocol-relative URLs are resolved as https.
      const normalized = rawUrl.startsWith('//') ? `https:${rawUrl}` : rawUrl;
      u = new URL(normalized, startOrigin);
    } catch {
      return null;
    }
    if (u.protocol !== 'http:' && u.protocol !== 'https:') return null;

    const localPart = `${u.pathname}${u.search}${keepHash ? u.hash : ''}`;
    if (u.origin === startOrigin) return localPart;
    return allowExternal ? `/_external/${u.hostname}${localPart}` : null;
  };

  for (const { selector, attr, external } of attrs) {
    $(selector).each((_, el) => {
      const value = $(el).attr(attr);
      if (!value) return;
      if (value.startsWith('mailto:') || value.startsWith('tel:') || value.startsWith('javascript:')) return;

      // Handle srcset specially (comma-separated list of URLs)
      if (attr === 'srcset') {
        // data: URIs contain commas, so a comma split would corrupt them.
        if (/(^|[\s,])data:/i.test(value)) return;

        const parts = value.split(',').map((part) => {
          const trimmed = part.trim();
          const [url, descriptor] = trimmed.split(/\s+/);
          // An empty candidate (e.g. after a trailing comma) stays empty.
          if (!url) return trimmed;
          const local = toLocalPath(url, external, false);
          if (local === null) return trimmed;
          return descriptor ? `${local} ${descriptor}` : local;
        });
        $(el).attr(attr, parts.join(', '));
        return;
      }

      const local = toLocalPath(value, external, true);
      if (local !== null) $(el).attr(attr, local);
    });
  }

  return $.html();
}
165
+
166
/**
 * Append a small inline click handler to the document.
 *
 * The injected script listens for clicks in the capture phase, finds the
 * enclosing `<a>`, and for plain left-clicks on links whose `href` is not a
 * fragment, `http(s)://`, `mailto:` or `tel:` URL (and whose target is unset
 * or `_self`) it cancels the event and assigns `window.location.href`.
 *
 * @param {string} html - Serialized HTML document.
 * @returns {string} The re-serialized HTML with the script appended to
 *   `<body>`, or to the document root when no body element is found.
 */
function injectOfflineFullNavigation(html) {
  const $ = cheerio.load(html);
  const script = `\n<script>(function(){function a(e){for(;e&&e!==document.body;){if(e.tagName==='A')return e;e=e.parentElement}return null}document.addEventListener('click',function(e){if(e.defaultPrevented)return;if(e.button!==0)return;if(e.metaKey||e.ctrlKey||e.shiftKey||e.altKey)return;var t=a(e.target);if(!t)return;var h=t.getAttribute('href');if(!h||h.startsWith('#'))return;if(h.startsWith('http://')||h.startsWith('https://')||h.startsWith('mailto:')||h.startsWith('tel:'))return;var tg=t.getAttribute('target');if(tg&&tg!=='_self')return;e.preventDefault();window.location.href=h;},true);})();</script>\n`;

  const host = $('body').length ? $('body') : $.root();
  host.append(script);

  return $.html();
}
173
+
174
/**
 * Write a file, creating its parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {Buffer|string} bufferOrString - Content passed straight to `fs.writeFile`.
 * @returns {Promise<void>}
 */
async function writeFileSafe(filePath, bufferOrString) {
  const parentDir = path.dirname(filePath);
  await fs.mkdir(parentDir, { recursive: true });
  await fs.writeFile(filePath, bufferOrString);
}
178
+
179
/** The five predefined XML entities. */
const XML_NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

/**
 * Decode predefined and numeric XML character references in a single pass
 * (so `&amp;lt;` becomes `&lt;`, not `<`).
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#[0-9]+|amp|lt|gt|quot|apos);/gi, (whole, body) => {
    const key = body.toLowerCase();
    if (key[0] !== '#') return XML_NAMED_ENTITIES[key];
    const codePoint = key[1] === 'x'
      ? Number.parseInt(key.slice(2), 16)
      : Number.parseInt(key.slice(1), 10);
    // Leave out-of-range references untouched instead of throwing.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * Extract the text of every `<loc>` element from sitemap XML.
 *
 * Sitemap URLs are entity-escaped (`&` is written as `&amp;`) and some
 * generators wrap them in CDATA; both forms are returned as plain URLs.
 * A `<loc>` whose content contains nested markup is ignored.
 *
 * @param {string} xmlText - Sitemap or sitemap-index XML.
 * @returns {string[]} Trimmed, non-empty `<loc>` values in document order.
 */
function parseSitemapLocs(xmlText) {
  const locs = [];
  const locPattern = /<loc\b[^>]*>([\s\S]*?)<\/loc\s*>/gi;

  for (const match of String(xmlText ?? '').matchAll(locPattern)) {
    const raw = match[1].trim();
    const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(raw);

    let loc;
    if (cdata) {
      // CDATA content is literal text: no entity decoding.
      loc = cdata[1].trim();
    } else if (raw.includes('<')) {
      continue;
    } else {
      loc = decodeXmlEntities(raw).trim();
    }

    if (loc) locs.push(loc);
  }

  return locs;
}
189
+
190
/**
 * Fetch a URL and return its body as text.
 *
 * Redirects are followed, a fixed `offline-mirror/1.0` user agent is sent,
 * and the request is aborted if it has not finished (including reading the
 * body) within 20 seconds.
 *
 * @param {string} url - URL to fetch.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `HTTP <status> for <url>` when the response is not ok;
 *   also rejects with whatever `fetch` raises on abort or failure.
 */
async function fetchText(url) {
  const timeoutMs = 20000;
  const abortController = new AbortController();
  const timer = setTimeout(() => abortController.abort(), timeoutMs);
  const requestInit = {
    redirect: 'follow',
    headers: { 'user-agent': 'offline-mirror/1.0' },
    signal: abortController.signal,
  };

  try {
    const response = await fetch(url, requestInit);
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status} for ${url}`);
  } finally {
    // Always cancel the timer so it cannot keep the process alive.
    clearTimeout(timer);
  }
}
207
+
208
/**
 * Collect candidate sitemap URLs for a site.
 *
 * Reads `<origin>/robots.txt` and takes the value of every `Sitemap:` line
 * (case-insensitive); failures while fetching robots.txt are ignored.
 * `<origin>/sitemap.xml` is always added as a candidate.
 *
 * @param {string} startOrigin - Site origin, e.g. `https://example.com`.
 * @returns {Promise<string[]>} De-duplicated sitemap URLs in discovery order.
 */
async function discoverSitemapUrls(startOrigin) {
  const found = new Set();
  const sitemapDirective = /^sitemap:\s*(.+)$/i;

  try {
    const robotsTxt = await fetchText(`${startOrigin}/robots.txt`);
    robotsTxt
      .split(/\r?\n/)
      .map((line) => sitemapDirective.exec(line.trim()))
      .forEach((hit) => {
        if (hit?.[1]) found.add(hit[1].trim());
      });
  } catch {
    // ignore
  }

  found.add(`${startOrigin}/sitemap.xml`);
  return Array.from(found);
}
224
+
225
/**
 * Walk the site's sitemaps and hand every same-origin page URL to `enqueue`.
 *
 * Sitemaps are processed first-in, first-out. A same-origin `<loc>` whose
 * path ends in `.xml` and contains "sitemap" is treated as a nested sitemap
 * and queued for processing rather than enqueued as a page. Sitemaps that
 * fail to download are logged and skipped.
 *
 * @param {object} options
 * @param {string} options.startOrigin - Origin that page URLs must match.
 * @param {(url: string) => void} options.enqueue - Called once per page URL (fragment removed).
 * @returns {Promise<void>}
 */
async function seedFromSitemaps({ startOrigin, enqueue }) {
  const pending = [...(await discoverSitemapUrls(startOrigin))];
  const processed = new Set();

  // Array iteration also visits entries pushed while looping, which gives
  // the same FIFO order as repeatedly shifting from the front.
  for (const sitemapUrl of pending) {
    if (processed.has(sitemapUrl)) continue;
    processed.add(sitemapUrl);

    console.log(`Sitemap: ${sitemapUrl}`);

    let xml;
    try {
      xml = await fetchText(sitemapUrl);
    } catch (e) {
      console.log(` (skipped: ${e?.message || 'fetch failed'})`);
      continue;
    }

    let count = 0;
    for (const loc of parseSitemapLocs(xml)) {
      let u;
      try {
        u = new URL(loc);
      } catch {
        continue;
      }

      const sameOrigin = u.origin === startOrigin;
      const isNestedSitemap = u.pathname.endsWith('.xml') && sameOrigin && /sitemap/i.test(u.pathname);
      if (isNestedSitemap) {
        pending.push(u.toString());
        continue;
      }

      if (!sameOrigin) continue;
      u.hash = '';
      enqueue(u.toString());
      count++;
    }
    console.log(` (found ${count} URLs)`);
  }
}
267
+
268
/**
 * Crawl a site with a headless Chromium browser and store an offline copy.
 *
 * Pages are taken from a FIFO queue that starts with `start` and, when
 * `seedSitemaps` is set, the URLs found via robots.txt / sitemaps. For each
 * page the function records matching network responses as assets, scrolls to
 * trigger lazy loading, collects `<a href>` links (unless `singlePage`),
 * rewrites URLs for offline use and writes the HTML under `out`.
 *
 * @param {object} options
 * @param {string} options.start - Start URL.
 * @param {string} options.out - Output directory (resolved to an absolute path).
 * @param {number} [options.maxPages] - Page limit; falsy or <= 0 means unlimited.
 * @param {number} [options.maxDepth] - Link-depth limit; falsy or <= 0 means unlimited.
 * @param {boolean} [options.sameOriginOnly] - Only crawl pages on the start origin.
 * @param {boolean} [options.seedSitemaps] - Seed the queue from robots.txt / sitemaps.
 * @param {boolean} [options.singlePage] - Skip link discovery.
 * @param {string} [options.userAgent] - User agent for the browser context.
 * @returns {Promise<void>}
 */
export async function mirrorSite({ start, out, maxPages, maxDepth, sameOriginOnly, seedSitemaps, singlePage, userAgent }) {
  const startUrl = normalizePageUrl(start);
  let startOrigin = startUrl.origin;
  const outRoot = path.resolve(out);

  await ensureDir(outRoot);

  const visitedPages = new Set();
  const enqueuedPages = new Set();
  const savedAssets = new Set();
  // Pages whose HTML was actually written; visitedPages also holds failures.
  let savedPageCount = 0;

  const queue = [{ url: startUrl.toString(), depth: 0 }];
  enqueuedPages.add(startUrl.toString());

  const enqueue = (url, depth = 1) => {
    if (enqueuedPages.has(url) || visitedPages.has(url)) return;
    queue.push({ url, depth });
    enqueuedPages.add(url);
  };

  if (seedSitemaps) {
    console.log('Seeding from sitemap/robots...');
    try {
      await seedFromSitemaps({
        startOrigin,
        enqueue: (url) => {
          try {
            const u = normalizePageUrl(url);
            if (sameOriginOnly && u.origin !== startOrigin) return;
            if (shouldSkipLink(u)) return;
            enqueue(u.toString(), 1);
          } catch {
            // ignore
          }
        },
      });
    } catch (e) {
      console.warn(`Sitemap seeding failed: ${e?.message || e}`);
    }
    console.log(`Seeding complete. Queue size: ${queue.length}`);
  }

  let browser = await chromium.launch({ headless: true });
  let context = await browser.newContext({ userAgent });

  // Tear down and recreate the browser after it has crashed or been closed.
  const relaunch = async (reason) => {
    try {
      console.warn(`Relaunching browser context: ${reason}`);
      await context?.close();
    } catch {}
    try {
      await browser?.close();
    } catch {}
    browser = await chromium.launch({ headless: true });
    context = await browser.newContext({ userAgent });
  };

  const unlimitedPages = !maxPages || maxPages <= 0;
  const unlimitedDepth = !maxDepth || maxDepth <= 0;

  try {
    while (queue.length > 0 && (unlimitedPages || visitedPages.size < maxPages)) {
      const { url, depth } = queue.shift();
      if (visitedPages.has(url)) continue;
      if (!unlimitedDepth && depth > maxDepth) continue;

      const pageUrl = normalizePageUrl(url);
      if (sameOriginOnly && pageUrl.origin !== startOrigin) continue;

      visitedPages.add(pageUrl.toString());
      const progressMax = unlimitedPages ? '∞' : String(maxPages);
      console.log(`[${visitedPages.size}/${progressMax}] ${pageUrl.toString()}`);

      let page;
      try {
        page = await context.newPage();
      } catch (e) {
        await relaunch(e?.message || 'newPage failed');
        page = await context.newPage();
      }

      let assetCount = 0;
      // Saves every static-looking GET response observed while the page loads.
      const onResponse = async (response) => {
        try {
          const req = response.request();
          if (req.method() !== 'GET') return;

          const resourceType = req.resourceType();
          const responseUrl = response.url();
          if (!responseUrl.startsWith('http://') && !responseUrl.startsWith('https://')) return;

          const assetUrl = new URL(responseUrl);
          const ct = response.headers()['content-type'] || '';
          const ctLower = ct.toLowerCase();

          // Skip actual HTML documents
          if (resourceType === 'document') return;
          if (ctLower.includes('text/html')) return;

          // Allow all static assets regardless of how they were loaded (XHR, fetch, etc.)
          const isStatic =
            resourceType === 'stylesheet' ||
            resourceType === 'script' ||
            resourceType === 'image' ||
            resourceType === 'font' ||
            resourceType === 'media' ||
            resourceType === 'other' ||
            ctLower.includes('text/css') ||
            ctLower.includes('javascript') ||
            ctLower.includes('json') ||
            ctLower.startsWith('image/') ||
            ctLower.startsWith('font/') ||
            ctLower.startsWith('video/') ||
            ctLower.startsWith('audio/') ||
            ctLower.includes('woff') ||
            ctLower.includes('wasm') ||
            ctLower.includes('octet-stream');

          if (!isStatic) return;

          const key = assetUrl.toString();
          if (savedAssets.has(key)) return;

          const body = await response.body();
          // Use the page's current origin (after redirects) instead of the initial startOrigin
          const currentOrigin = new URL(page.url()).origin;
          const filePath = assetOutputPath(outRoot, currentOrigin, assetUrl, ct);
          await writeFileSafe(filePath, body);
          savedAssets.add(key);
          assetCount++;
        } catch {
          // best effort
        }
      };

      page.on('response', onResponse);

      try {
        try {
          await page.goto(pageUrl.toString(), { waitUntil: 'domcontentloaded', timeout: 60000 });
        } catch (e) {
          const message = e?.message || String(e);
          if (message.includes('net::ERR_ABORTED')) {
            await page.goto(pageUrl.toString(), { waitUntil: 'load', timeout: 60000 });
          } else {
            throw e;
          }
        }

        try {
          await page.waitForLoadState('networkidle', { timeout: 20000 });
        } catch {
          // long-lived connections are fine
        }

        // Update startOrigin to match the actual origin after redirects (e.g., apple.com -> www.apple.com).
        // Only the start URL is queued at depth 0. The page is already in
        // visitedPages at this point, so a size check cannot identify it.
        if (depth === 0) {
          const actualOrigin = new URL(page.url()).origin;
          if (actualOrigin !== startOrigin) {
            console.log(` (followed redirect: ${startOrigin} → ${actualOrigin})`);
            startOrigin = actualOrigin;
          }
        }

        // Scroll to trigger lazy-loaded content
        try {
          await page.evaluate(() => {
            window.scrollTo(0, document.body.scrollHeight / 2);
          });
          await page.waitForTimeout(1000);
          await page.evaluate(() => {
            window.scrollTo(0, document.body.scrollHeight);
          });
          await page.waitForTimeout(1000);
          await page.evaluate(() => {
            window.scrollTo(0, 0);
          });
        } catch {}

        try {
          await page.waitForTimeout(3000); // extra settle time for dynamic content
        } catch {}

        // Skip link discovery in single-page mode
        if (!singlePage) {
          const rawLinks = await page.$$eval('a[href]', (els) =>
            els.map((e) => e.getAttribute('href')).filter(Boolean)
          );

          // Relative links belong to the document actually loaded, so resolve
          // them against the final URL rather than the requested one.
          const linkBase = page.url();

          for (const href of rawLinks) {
            if (!href || href.startsWith('#')) continue;
            if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('javascript:')) continue;

            let linkUrl;
            try {
              linkUrl = new URL(href, linkBase);
            } catch {
              continue;
            }

            if (sameOriginOnly && linkUrl.origin !== startOrigin) continue;
            linkUrl.hash = '';
            if (shouldSkipLink(linkUrl)) continue;

            // enqueue() ignores URLs that are already queued or visited.
            enqueue(linkUrl.toString(), depth + 1);
          }
        }

        let html = await page.content();
        html = rewriteAbsoluteSameOriginUrls(html, startOrigin);
        html = injectOfflineFullNavigation(html);

        const htmlPath = pageOutputPath(outRoot, pageUrl);
        await writeFileSafe(htmlPath, html);
        savedPageCount++;
        console.log(` → Saved ${assetCount} assets`);
      } catch (e) {
        const message = e?.message || String(e);
        console.warn(`Failed: ${pageUrl.toString()} (${message})`);
        if (message.includes('Target page, context or browser has been closed')) {
          await relaunch('target closed during navigation');
        }
      } finally {
        page.off('response', onResponse);
        try {
          await page.close();
        } catch {}
      }
    }
  } finally {
    try {
      await context.close();
    } catch {}
    try {
      await browser.close();
    } catch {}
  }

  console.log('');
  console.log(`Done. Pages saved: ${savedPageCount}, Assets saved: ${savedAssets.size}`);
  console.log(`Output: ${outRoot}`);
}