@d-zero/replicator 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/child-process.js +4 -27
- package/dist/index.js +2 -24
- package/dist/resource-downloader.js +1 -24
- package/package.json +8 -8
package/dist/child-process.js
CHANGED
|
@@ -1,28 +1,7 @@
|
|
|
1
1
|
import { createChildProcess } from '@d-zero/puppeteer-dealer';
|
|
2
2
|
import { beforePageScan, devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
3
3
|
import { scrollAllOver } from '@d-zero/puppeteer-scroll';
|
|
4
|
-
/**
|
|
5
|
-
* Add resource path to the set with MIME type encoding if needed
|
|
6
|
-
* @param pathname - Resource pathname
|
|
7
|
-
* @param mimeType - MIME type from response headers (optional)
|
|
8
|
-
* @returns Encoded resource path
|
|
9
|
-
*/
|
|
10
|
-
function encodeResourcePath(pathname, mimeType) {
|
|
11
|
-
// Normalize empty pathname to "/"
|
|
12
|
-
if (pathname === '') {
|
|
13
|
-
pathname = '/';
|
|
14
|
-
}
|
|
15
|
-
// Check if the last segment has an extension
|
|
16
|
-
const lastSlashIndex = pathname.lastIndexOf('/');
|
|
17
|
-
const lastSegment = lastSlashIndex === -1 ? pathname : pathname.slice(lastSlashIndex + 1);
|
|
18
|
-
const hasExtension = lastSegment.includes('.');
|
|
19
|
-
// For paths without extension, encode with MIME type if available
|
|
20
|
-
if (!hasExtension && mimeType) {
|
|
21
|
-
return `${pathname}:::${mimeType}`;
|
|
22
|
-
}
|
|
23
|
-
// For paths with extension or without MIME type, return as-is
|
|
24
|
-
return pathname;
|
|
25
|
-
}
|
|
4
|
+
import { encodeResourcePath } from '@d-zero/shared/encode-resource-path';
|
|
26
5
|
createChildProcess((param) => {
|
|
27
6
|
const { devices, timeout } = param;
|
|
28
7
|
return {
|
|
@@ -31,8 +10,7 @@ createChildProcess((param) => {
|
|
|
31
10
|
const pageHostname = new URL(url).hostname;
|
|
32
11
|
// Add the page URL itself first (in case response event is missed)
|
|
33
12
|
const pageUrlObj = new URL(url);
|
|
34
|
-
|
|
35
|
-
resourcePaths.add(encodeResourcePath(pagePathname, 'text/html'));
|
|
13
|
+
resourcePaths.add(encodeResourcePath(pageUrlObj, 'text/html'));
|
|
36
14
|
// Listen to all network responses
|
|
37
15
|
const responseHandler = (response) => {
|
|
38
16
|
const responseUrl = response.url();
|
|
@@ -53,12 +31,11 @@ createChildProcess((param) => {
|
|
|
53
31
|
if (resourceUrlObj.hostname !== pageHostname) {
|
|
54
32
|
return;
|
|
55
33
|
}
|
|
56
|
-
// Get
|
|
57
|
-
const resourcePath = resourceUrlObj.pathname;
|
|
34
|
+
// Get MIME type
|
|
58
35
|
const contentType = response.headers()['content-type'];
|
|
59
36
|
const mimeType = contentType?.split(';')[0]?.trim();
|
|
60
37
|
// Add resource with MIME encoding if needed
|
|
61
|
-
resourcePaths.add(encodeResourcePath(resourcePath, mimeType));
|
|
38
|
+
resourcePaths.add(encodeResourcePath(resourceUrlObj, mimeType));
|
|
62
39
|
};
|
|
63
40
|
page.on('response', responseHandler);
|
|
64
41
|
const defaultSizes = {
|
package/dist/index.js
CHANGED
|
@@ -1,31 +1,10 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
2
|
import { deal, createProcess } from '@d-zero/puppeteer-dealer';
|
|
3
3
|
import { devicePresets } from '@d-zero/puppeteer-page-scan';
|
|
4
|
+
import { encodeResourcePath } from '@d-zero/shared/encode-resource-path';
|
|
4
5
|
import { validateSameHost } from '@d-zero/shared/validate-same-host';
|
|
5
6
|
import c from 'ansi-colors';
|
|
6
7
|
import { downloadResources } from './resource-downloader.js';
|
|
7
|
-
/**
|
|
8
|
-
* Encode resource path with MIME type if needed
|
|
9
|
-
* @param pathname - Resource pathname
|
|
10
|
-
* @param mimeType - MIME type (optional)
|
|
11
|
-
* @returns Encoded resource path
|
|
12
|
-
*/
|
|
13
|
-
function encodeResourcePath(pathname, mimeType) {
|
|
14
|
-
// Normalize empty pathname to "/"
|
|
15
|
-
if (pathname === '') {
|
|
16
|
-
pathname = '/';
|
|
17
|
-
}
|
|
18
|
-
// Check if the last segment has an extension
|
|
19
|
-
const lastSlashIndex = pathname.lastIndexOf('/');
|
|
20
|
-
const lastSegment = lastSlashIndex === -1 ? pathname : pathname.slice(lastSlashIndex + 1);
|
|
21
|
-
const hasExtension = lastSegment.includes('.');
|
|
22
|
-
// For paths without extension, encode with MIME type if available
|
|
23
|
-
if (!hasExtension && mimeType) {
|
|
24
|
-
return `${pathname}:::${mimeType}`;
|
|
25
|
-
}
|
|
26
|
-
// For paths with extension or without MIME type, return as-is
|
|
27
|
-
return pathname;
|
|
28
|
-
}
|
|
29
8
|
/**
|
|
30
9
|
* Collect page URLs without resource scanning (page-only mode)
|
|
31
10
|
* @param urls - Array of URLs to process
|
|
@@ -38,9 +17,8 @@ function collectPageUrlsOnly(urls, progress) {
|
|
|
38
17
|
const encodedUrls = new Set();
|
|
39
18
|
for (const url of urls) {
|
|
40
19
|
const urlObj = new URL(url);
|
|
41
|
-
const pathname = urlObj.pathname || '/';
|
|
42
20
|
// Encode as HTML page
|
|
43
|
-
const encodedPath = encodeResourcePath(pathname, 'text/html');
|
|
21
|
+
const encodedPath = encodeResourcePath(urlObj, 'text/html');
|
|
44
22
|
encodedUrls.add(encodedPath);
|
|
45
23
|
}
|
|
46
24
|
progress(c.bold.green(`✅ Prepared ${encodedUrls.size} page(s) for download`));
|
|
package/dist/resource-downloader.js
CHANGED
|
@@ -1,31 +1,8 @@
|
|
|
1
1
|
import { mkdir, writeFile } from 'node:fs/promises';
|
|
2
2
|
import path from 'node:path';
|
|
3
3
|
import { deal } from '@d-zero/dealer';
|
|
4
|
-
import {
|
|
5
|
-
import { urlToLocalPath } from '@d-zero/shared/url-to-local-path';
|
|
4
|
+
import { parseEncodedPath } from '@d-zero/shared/encode-resource-path';
|
|
6
5
|
import c from 'ansi-colors';
|
|
7
|
-
/**
|
|
8
|
-
* Parse encoded pathname and return the actual URL and local path
|
|
9
|
-
* @param encodedPath - pathname or "pathname:::MIME/type" format
|
|
10
|
-
* @param baseUrl - Base URL to construct full URL from pathname
|
|
11
|
-
*/
|
|
12
|
-
function parseEncodedPath(encodedPath, baseUrl) {
|
|
13
|
-
const parts = encodedPath.split(':::');
|
|
14
|
-
if (parts.length === 2) {
|
|
15
|
-
// Format: "pathname:::MIME/type"
|
|
16
|
-
const pathname = parts[0];
|
|
17
|
-
const mimeType = parts[1];
|
|
18
|
-
const url = new URL(pathname, baseUrl).href;
|
|
19
|
-
const extension = mimeToExtension(mimeType);
|
|
20
|
-
const localPath = urlToLocalPath(url, extension);
|
|
21
|
-
return { url, localPath };
|
|
22
|
-
}
|
|
23
|
-
// Regular pathname without MIME encoding
|
|
24
|
-
const pathname = encodedPath;
|
|
25
|
-
const url = new URL(pathname, baseUrl).href;
|
|
26
|
-
const localPath = urlToLocalPath(url, '');
|
|
27
|
-
return { url, localPath };
|
|
28
|
-
}
|
|
29
6
|
/**
|
|
30
7
|
* Download and save resources to disk
|
|
31
8
|
* @param encodedPaths - Array of encoded pathnames
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@d-zero/replicator",
|
|
3
|
-
"version": "0.7.0",
|
|
3
|
+
"version": "0.7.2",
|
|
4
4
|
"description": "Replicate web pages with all their resources to local directories",
|
|
5
5
|
"author": "D-ZERO",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,12 +24,12 @@
|
|
|
24
24
|
"clean": "tsc --build --clean"
|
|
25
25
|
},
|
|
26
26
|
"dependencies": {
|
|
27
|
-
"@d-zero/cli-core": "1.2.
|
|
28
|
-
"@d-zero/dealer": "1.4.
|
|
29
|
-
"@d-zero/puppeteer-dealer": "0.5.
|
|
30
|
-
"@d-zero/puppeteer-page-scan": "4.2.
|
|
31
|
-
"@d-zero/puppeteer-scroll": "3.1.
|
|
32
|
-
"@d-zero/shared": "0.
|
|
27
|
+
"@d-zero/cli-core": "1.2.2",
|
|
28
|
+
"@d-zero/dealer": "1.4.2",
|
|
29
|
+
"@d-zero/puppeteer-dealer": "0.5.12",
|
|
30
|
+
"@d-zero/puppeteer-page-scan": "4.2.8",
|
|
31
|
+
"@d-zero/puppeteer-scroll": "3.1.2",
|
|
32
|
+
"@d-zero/shared": "0.15.0",
|
|
33
33
|
"ansi-colors": "4.1.3",
|
|
34
34
|
"minimist": "1.2.8",
|
|
35
35
|
"puppeteer": "24.27.0"
|
|
@@ -37,5 +37,5 @@
|
|
|
37
37
|
"devDependencies": {
|
|
38
38
|
"@types/minimist": "1.2.5"
|
|
39
39
|
},
|
|
40
|
-
"gitHead": "
|
|
40
|
+
"gitHead": "858864f8e57494a5a872431e1d772e18e4cc843a"
|
|
41
41
|
}
|