@pagepocket/main-content-unit 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/asset-filename.d.ts +16 -0
- package/dist/asset-filename.js +62 -0
- package/dist/build-content-tree.d.ts +29 -0
- package/dist/build-content-tree.js +86 -0
- package/dist/capture-lookup.d.ts +28 -0
- package/dist/capture-lookup.js +40 -0
- package/dist/collect-asset-urls.d.ts +13 -0
- package/dist/collect-asset-urls.js +79 -0
- package/dist/extract-url-map.d.ts +13 -0
- package/dist/extract-url-map.js +75 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1 -0
- package/dist/main-content-unit.d.ts +20 -0
- package/dist/main-content-unit.js +62 -0
- package/dist/rewrite-content-urls.d.ts +23 -0
- package/dist/rewrite-content-urls.js +87 -0
- package/package.json +28 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Derive a human-readable, filesystem-safe filename from a URL and MIME type.
|
|
3
|
+
*
|
|
4
|
+
* Attempts to preserve the original filename from the URL path. Falls back
|
|
5
|
+
* to a hash-based name. Appends the correct extension from the MIME type
|
|
6
|
+
* when the URL-derived name lacks one.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* ```ts
|
|
10
|
+
* assetFilename("https://cdn.example.com/images/photo.jpg", "image/jpeg")
|
|
11
|
+
* // => "photo.jpg"
|
|
12
|
+
* assetFilename("https://cdn.example.com/api/image?id=42", "image/png")
|
|
13
|
+
* // => "image" plus the extension for image/png (presumably "image.png"); the query string is ignored
|
|
14
|
+
* ```
|
|
15
|
+
*/
|
|
16
|
+
export declare const assetFilename: (url: string, mimeType?: string) => string;
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { extensionFromContentType } from "@pagepocket/shared";
|
|
2
|
+
/**
 * Return the percent-decoded last segment of a URL's path.
 *
 * Only `pathname` is inspected, so the query string and fragment never
 * contribute to the result.
 *
 * @param url - URL string to inspect.
 * @returns The decoded final path segment, or `undefined` when the URL cannot
 *   be parsed, the path ends in "/" (empty final segment), or the segment is
 *   not valid percent-encoding.
 */
const extractPathBasename = (url) => {
    try {
        const { pathname } = new URL(url);
        // Everything after the last "/" (the whole pathname if there is none).
        const tail = pathname.slice(pathname.lastIndexOf("/") + 1);
        return tail ? decodeURIComponent(tail) : undefined;
    }
    catch {
        // Covers both `new URL` (unparseable input) and `decodeURIComponent`
        // (malformed escape sequence) failures.
        return undefined;
    }
};
|
|
16
|
+
/**
 * Turn an arbitrary name into a conservative, filesystem-safe one.
 *
 * Every character outside `[a-zA-Z0-9._-]` becomes "_", runs of "_" are
 * collapsed, leading/trailing "_" are removed, and the result is limited to
 * 120 characters.
 *
 * When the cleaned name is too long, the trailing extension (the final ".xyz",
 * at most 16 characters including the dot) is kept and the stem is shortened
 * instead, so a long name does not lose its file type.
 *
 * @param name - Raw name, e.g. a decoded URL path segment.
 * @returns The sanitized name; may be an empty string.
 */
const sanitize = (name) => {
    const maxLength = 120;
    const maxExtLength = 16;
    const cleaned = name
        .replace(/[^a-zA-Z0-9._-]/g, "_")
        .replace(/_+/g, "_")
        .replace(/^_+|_+$/g, "");
    if (cleaned.length <= maxLength) {
        return cleaned;
    }
    const dotIndex = cleaned.lastIndexOf(".");
    const ext = dotIndex > 0 ? cleaned.slice(dotIndex) : "";
    // Only a short suffix is treated as an extension; a dot far from the end
    // is just part of the name and plain truncation applies.
    if (ext.length > 1 && ext.length <= maxExtLength) {
        return `${cleaned.slice(0, maxLength - ext.length)}${ext}`;
    }
    return cleaned.slice(0, maxLength);
};
|
|
21
|
+
/**
 * Derive a human-readable, filesystem-safe filename from a URL and MIME type.
 *
 * The name comes from the last segment of the URL path (the query string and
 * fragment are ignored) and is sanitized. A "." anywhere in the sanitized name
 * counts as already having an extension; otherwise the extension reported by
 * `extensionFromContentType(mimeType)` is appended when one is available.
 * When the path yields no usable name, a hash of the full URL
 * (`asset_<base36>`) is used instead, again with the MIME-derived extension
 * when available.
 *
 * Different URLs can produce the same filename (for example two URLs that
 * differ only in their query string), so callers that need unique names must
 * deduplicate.
 *
 * Usage:
 * ```ts
 * assetFilename("https://cdn.example.com/images/photo.jpg", "image/jpeg")
 * // => "photo.jpg"
 * assetFilename("https://cdn.example.com/api/image?id=42", "image/png")
 * // => "image" + the extension for image/png (presumably "image.png");
 * //    the query string does not contribute to the name
 * ```
 *
 * @param url - URL of the asset.
 * @param mimeType - Optional MIME type used to choose an extension.
 * @returns The derived filename.
 */
export const assetFilename = (url, mimeType) => {
    const basename = extractPathBasename(url);
    const mimeExt = extensionFromContentType(mimeType);
    const sanitized = basename ? sanitize(basename) : "";
    if (sanitized) {
        const hasExt = sanitized.includes(".");
        return hasExt || !mimeExt ? sanitized : `${sanitized}${mimeExt}`;
    }
    // No usable path segment (or it sanitized down to nothing): fall back to
    // a name derived from the whole URL.
    const hash = simpleHash(url);
    return mimeExt ? `${hash}${mimeExt}` : hash;
};
|
|
55
|
+
const simpleHash = (input) => {
|
|
56
|
+
let hash = 0;
|
|
57
|
+
for (let index = 0; index < input.length; index++) {
|
|
58
|
+
const char = input.charCodeAt(index);
|
|
59
|
+
hash = ((hash << 5) - hash + char) | 0;
|
|
60
|
+
}
|
|
61
|
+
return `asset_${Math.abs(hash).toString(36)}`;
|
|
62
|
+
};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { type CaptureArtifacts, type FileTree } from "@pagepocket/lib";
|
|
2
|
+
export type ContentTreeInput = {
|
|
3
|
+
contentHtml: string;
|
|
4
|
+
baseUrl: string;
|
|
5
|
+
capture: CaptureArtifacts;
|
|
6
|
+
};
|
|
7
|
+
export type ContentTreeResult = {
|
|
8
|
+
files: FileTree;
|
|
9
|
+
rewrittenHtml: string;
|
|
10
|
+
};
|
|
11
|
+
/**
|
|
12
|
+
* Build a self-contained FileTree from extracted main content HTML.
|
|
13
|
+
*
|
|
14
|
+
* 1. Scans `contentHtml` for media asset URLs (img, video, audio, source, poster).
|
|
15
|
+
* 2. Resolves each URL against the capture's ContentStore to download the body.
|
|
16
|
+
* 3. Assigns each asset a filesystem-safe name under `/assets/`.
|
|
17
|
+
* 4. Rewrites all asset URLs in the HTML to `/assets/{filename}`.
|
|
18
|
+
* 5. Returns a FileTree with `/index.html` + `/assets/*`.
|
|
19
|
+
*
|
|
20
|
+
* Usage:
|
|
21
|
+
* ```ts
|
|
22
|
+
* const { files, rewrittenHtml } = await buildContentTree({
|
|
23
|
+
* contentHtml: '<img src="https://example.com/photo.jpg">',
|
|
24
|
+
* baseUrl: 'https://example.com/article',
|
|
25
|
+
* capture,
|
|
26
|
+
* });
|
|
27
|
+
* ```
|
|
28
|
+
*/
|
|
29
|
+
export declare const buildContentTree: (input: ContentTreeInput) => Promise<ContentTreeResult>;
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { streamToUint8Array } from "@pagepocket/lib";
|
|
2
|
+
import { assetFilename } from "./asset-filename.js";
|
|
3
|
+
import { buildCaptureLookup } from "./capture-lookup.js";
|
|
4
|
+
import { collectAssetUrls } from "./collect-asset-urls.js";
|
|
5
|
+
import { rewriteContentUrls } from "./rewrite-content-urls.js";
|
|
6
|
+
/**
 * Assemble a standalone FileTree for a piece of extracted main-content HTML.
 *
 * 1. Collects the asset URLs referenced by `contentHtml` (`collectAssetUrls`).
 * 2. Looks each URL up among the captured responses; URLs without a captured
 *    body are skipped (no file is emitted and no mapping is recorded).
 * 3. Reads each body from `capture.contentStore` and stores it under
 *    `/assets/{filename}`, where the filename is made unique within this tree.
 * 4. Rewrites URLs in the HTML via `rewriteContentUrls`, using the recorded
 *    URL -> `/assets/...` mapping.
 * 5. Returns the rewritten HTML plus a FileTree whose root lists `/index.html`
 *    first, followed by the asset files.
 *
 * Asset bodies are read one at a time, in the order the URLs were collected.
 *
 * Usage:
 * ```ts
 * const { files, rewrittenHtml } = await buildContentTree({
 *   contentHtml: '<img src="https://example.com/photo.jpg">',
 *   baseUrl: 'https://example.com/article',
 *   capture,
 * });
 * ```
 *
 * @param input - `contentHtml`, the `baseUrl` relative URLs resolve against,
 *   and the `capture` artifacts (events plus content store).
 * @returns `{ rewrittenHtml, files }`.
 */
export const buildContentTree = async (input) => {
    const { contentHtml, baseUrl, capture } = input;
    const assetUrls = collectAssetUrls(contentHtml, baseUrl);
    const capturedByUrl = buildCaptureLookup(capture);
    const localPathByUrl = new Map();
    const takenNames = new Set();
    const assetEntries = [];
    for (const assetUrl of assetUrls) {
        const response = capturedByUrl.get(assetUrl);
        if (response?.bodyRef) {
            const filename = deduplicateFilename(assetFilename(assetUrl, response.mimeType), takenNames);
            takenNames.add(filename);
            const assetPath = `/assets/${filename}`;
            localPathByUrl.set(assetUrl, assetPath);
            const body = await capture.contentStore.open(response.bodyRef);
            assetEntries.push({
                kind: "file",
                path: assetPath,
                source: { kind: "bytes", data: await streamToUint8Array(body) }
            });
        }
    }
    const rewrittenHtml = rewriteContentUrls({
        html: contentHtml,
        baseUrl,
        // Only URLs that were actually stored above resolve to a local path.
        resolve: (absoluteUrl) => localPathByUrl.get(absoluteUrl)
    });
    const indexEntry = {
        kind: "file",
        path: "/index.html",
        source: { kind: "bytes", data: new TextEncoder().encode(rewrittenHtml) }
    };
    const root = {
        kind: "directory",
        path: "",
        entries: [indexEntry, ...assetEntries]
    };
    return { rewrittenHtml, files: { root } };
};
|
|
72
|
+
const deduplicateFilename = (name, used) => {
|
|
73
|
+
if (!used.has(name)) {
|
|
74
|
+
return name;
|
|
75
|
+
}
|
|
76
|
+
const dotIndex = name.lastIndexOf(".");
|
|
77
|
+
const stem = dotIndex > 0 ? name.slice(0, dotIndex) : name;
|
|
78
|
+
const ext = dotIndex > 0 ? name.slice(dotIndex) : "";
|
|
79
|
+
for (let suffix = 2; suffix <= 9999; suffix++) {
|
|
80
|
+
const candidate = `${stem}-${suffix}${ext}`;
|
|
81
|
+
if (!used.has(candidate)) {
|
|
82
|
+
return candidate;
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
return `${stem}-${Date.now()}${ext}`;
|
|
86
|
+
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { CaptureArtifacts } from "@pagepocket/lib";
|
|
2
|
+
type ResponseEntry = {
|
|
3
|
+
url: string;
|
|
4
|
+
mimeType?: string;
|
|
5
|
+
bodyRef?: {
|
|
6
|
+
kind: "memory";
|
|
7
|
+
data: Uint8Array;
|
|
8
|
+
} | {
|
|
9
|
+
kind: "store-ref";
|
|
10
|
+
id: string;
|
|
11
|
+
};
|
|
12
|
+
bodySize?: number;
|
|
13
|
+
};
|
|
14
|
+
/**
|
|
15
|
+
* Build a URL-keyed lookup of captured HTTP responses.
|
|
16
|
+
*
|
|
17
|
+
* Matches response events to their request URLs, deduplicating by URL
|
|
18
|
+
* (last response wins). Only successful responses (status 2xx) with
|
|
19
|
+
* a body reference are included.
|
|
20
|
+
*
|
|
21
|
+
* Usage:
|
|
22
|
+
* ```ts
|
|
23
|
+
* const lookup = buildCaptureLookup(capture);
|
|
24
|
+
* const entry = lookup.get("https://example.com/photo.jpg");
|
|
25
|
+
* ```
|
|
26
|
+
*/
|
|
27
|
+
export declare const buildCaptureLookup: (capture: CaptureArtifacts) => Map<string, ResponseEntry>;
|
|
28
|
+
export {};
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build a URL-keyed lookup of captured HTTP responses.
|
|
3
|
+
*
|
|
4
|
+
* Matches response events to their request URLs, deduplicating by URL
|
|
5
|
+
* (last response wins). Only successful responses (status 2xx) with
|
|
6
|
+
* a body reference are included.
|
|
7
|
+
*
|
|
8
|
+
* Usage:
|
|
9
|
+
* ```ts
|
|
10
|
+
* const lookup = buildCaptureLookup(capture);
|
|
11
|
+
* const entry = lookup.get("https://example.com/photo.jpg");
|
|
12
|
+
* ```
|
|
13
|
+
*/
|
|
14
|
+
export const buildCaptureLookup = (capture) => {
|
|
15
|
+
const requestUrls = new Map();
|
|
16
|
+
const lookup = new Map();
|
|
17
|
+
for (const event of capture.events) {
|
|
18
|
+
if (event.type === "http.request") {
|
|
19
|
+
requestUrls.set(event.requestId, event.url);
|
|
20
|
+
continue;
|
|
21
|
+
}
|
|
22
|
+
if (event.type !== "http.response") {
|
|
23
|
+
continue;
|
|
24
|
+
}
|
|
25
|
+
if (event.status < 200 || event.status >= 300) {
|
|
26
|
+
continue;
|
|
27
|
+
}
|
|
28
|
+
if (!event.bodyRef) {
|
|
29
|
+
continue;
|
|
30
|
+
}
|
|
31
|
+
const url = requestUrls.get(event.requestId) ?? event.url;
|
|
32
|
+
lookup.set(url, {
|
|
33
|
+
url,
|
|
34
|
+
mimeType: event.mimeType,
|
|
35
|
+
bodyRef: event.bodyRef,
|
|
36
|
+
bodySize: event.bodySize
|
|
37
|
+
});
|
|
38
|
+
}
|
|
39
|
+
return lookup;
|
|
40
|
+
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Collect all unique absolute asset URLs referenced in an HTML fragment.
|
|
3
|
+
*
|
|
4
|
+
* Scans `img`, `video`, `audio`, `source`, and `poster` attributes
|
|
5
|
+
* including `srcset`. Returns a deduplicated list of absolute URLs.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* ```ts
|
|
9
|
+
* const urls = collectAssetUrls('<img src="/photo.jpg">', 'https://example.com');
|
|
10
|
+
* // => ["https://example.com/photo.jpg"]
|
|
11
|
+
* ```
|
|
12
|
+
*/
|
|
13
|
+
export declare const collectAssetUrls: (html: string, baseUrl: string) => string[];
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
// Element/attribute pairs that are scanned for media asset URLs. Each pair
// expands to a `tag[attr]` CSS selector plus the attribute to read; the order
// below is the order in which the selectors are applied.
const ASSET_SELECTORS = [
    ["img", "src"],
    ["video", "src"],
    ["audio", "src"],
    ["source", "src"],
    ["video", "poster"],
    ["img", "srcset"],
    ["source", "srcset"]
].map(([tag, attr]) => ({ selector: `${tag}[${attr}]`, attr }));
|
|
11
|
+
/**
 * Decide whether an attribute value should be ignored rather than resolved
 * to an absolute URL.
 *
 * Skipped values: empty/whitespace-only strings, same-document references
 * ("#..."), and values using the `data:`, `blob:`, `javascript:`, `mailto:`
 * or `tel:` schemes. Scheme matching is case-insensitive because URL schemes
 * are case-insensitive (`DATA:` and `data:` are the same scheme).
 *
 * @param value - Raw attribute value.
 * @returns `true` when the value must not be treated as a fetchable asset URL.
 */
const shouldSkip = (value) => {
    const normalized = value.trim().toLowerCase();
    if (!normalized) {
        return true;
    }
    const skippedPrefixes = ["data:", "blob:", "javascript:", "mailto:", "tel:", "#"];
    return skippedPrefixes.some((prefix) => normalized.startsWith(prefix));
};
|
|
19
|
+
/**
 * Resolve an attribute value to an absolute URL string.
 *
 * @param value - Raw attribute value (absolute or relative URL).
 * @param baseUrl - Base that relative values are resolved against.
 * @returns The serialized absolute URL, or `undefined` when the value is one
 *   that `shouldSkip` rejects or cannot be parsed as a URL.
 */
const toAbsolute = (value, baseUrl) => {
    let absolute;
    if (!shouldSkip(value)) {
        try {
            absolute = new URL(value, baseUrl).toString();
        }
        catch {
            // Unparseable value: report "no URL" instead of throwing.
            absolute = undefined;
        }
    }
    return absolute;
};
|
|
30
|
+
/**
 * Extract the absolute URLs of all image candidates in a `srcset` value.
 *
 * The value is tokenized along the lines of the HTML "parse a srcset
 * attribute" algorithm instead of being split on every comma: a candidate's
 * URL is the run of non-whitespace characters at its start (commas inside it
 * belong to the URL, trailing commas do not), and the descriptors that follow
 * run up to the next comma outside parentheses. This keeps URLs that contain
 * commas (data URIs, CDN transformation URLs) in one piece.
 *
 * @param srcset - Raw `srcset` attribute value.
 * @param baseUrl - Base that relative candidate URLs are resolved against.
 * @returns Absolute URLs in source order; candidates that `toAbsolute`
 *   rejects are omitted. Duplicates are not removed here.
 */
const parseSrcsetUrls = (srcset, baseUrl) => {
    const separator = /[\s,]/;
    const whitespace = /\s/;
    const length = srcset.length;
    const urls = [];
    let pos = 0;
    while (pos < length) {
        // Skip whitespace and stray commas in front of the next candidate.
        while (pos < length && separator.test(srcset[pos])) {
            pos += 1;
        }
        if (pos >= length) {
            break;
        }
        const urlStart = pos;
        while (pos < length && !whitespace.test(srcset[pos])) {
            pos += 1;
        }
        let rawUrl = srcset.slice(urlStart, pos);
        if (rawUrl.endsWith(",")) {
            // "a.jpg, b.jpg": the comma ends the candidate; no descriptors.
            rawUrl = rawUrl.replace(/,+$/, "");
        }
        else {
            // Skip the descriptors up to the comma that ends this candidate.
            let depth = 0;
            while (pos < length) {
                const ch = srcset[pos];
                pos += 1;
                if (ch === "(") {
                    depth += 1;
                }
                else if (ch === ")" && depth > 0) {
                    depth -= 1;
                }
                else if (ch === "," && depth === 0) {
                    break;
                }
            }
        }
        const absolute = toAbsolute(rawUrl, baseUrl);
        if (absolute) {
            urls.push(absolute);
        }
    }
    return urls;
};
|
|
45
|
+
/**
 * Collect the unique absolute asset URLs referenced by an HTML fragment.
 *
 * The HTML is loaded with cheerio (third `load` argument `false`, i.e. it is
 * not wrapped as a full document) and every selector in `ASSET_SELECTORS` is
 * applied in turn: `src` on `img`/`video`/`audio`/`source`, `poster` on
 * `video`, and `srcset` on `img`/`source`. Values are resolved against
 * `baseUrl`; values that cannot be resolved are ignored.
 *
 * Usage:
 * ```ts
 * const urls = collectAssetUrls('<img src="/photo.jpg">', 'https://example.com');
 * // => ["https://example.com/photo.jpg"]
 * ```
 *
 * @param html - HTML fragment to scan.
 * @param baseUrl - Base URL for relative references.
 * @returns Deduplicated absolute URLs in first-seen order.
 */
export const collectAssetUrls = (html, baseUrl) => {
    const $ = cheerio.load(html, undefined, false);
    const found = new Set();
    const remember = (url) => {
        if (url) {
            found.add(url);
        }
    };
    for (const { selector, attr } of ASSET_SELECTORS) {
        $(selector).each((_, element) => {
            const raw = $(element).attr(attr);
            if (!raw) {
                return;
            }
            if (attr === "srcset") {
                parseSrcsetUrls(raw, baseUrl).forEach(remember);
            }
            else {
                remember(toAbsolute(raw, baseUrl));
            }
        });
    }
    return Array.from(found);
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { FileTree } from "@pagepocket/lib";
|
|
2
|
+
/**
|
|
3
|
+
* Extract the URL-to-local-path mapping from a snapshot FileTree by reading
|
|
4
|
+
* the embedded `resources_path.json` file.
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* ```ts
|
|
8
|
+
* const urlToPath = await extractUrlMap(fileTree);
|
|
9
|
+
* const localPath = urlToPath.get("https://example.com/img.jpg");
|
|
10
|
+
* // => "/resources/img.jpg"
|
|
11
|
+
* ```
|
|
12
|
+
*/
|
|
13
|
+
export declare const extractUrlMap: (files: FileTree) => Promise<Map<string, string>>;
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
 * Extract the URL-to-local-path mapping from a snapshot FileTree by reading
 * the embedded `/resources_path.json` file.
 *
 * The file is expected to contain `{ "items": [{ "url": ..., "path": ... }] }`.
 * Entries whose `url` or `path` is not a string are ignored so that the
 * returned map only ever holds string keys and values.
 *
 * Usage:
 * ```ts
 * const urlToPath = await extractUrlMap(fileTree);
 * const localPath = urlToPath.get("https://example.com/img.jpg");
 * // => "/resources/img.jpg"
 * ```
 *
 * @param files - Snapshot FileTree to search.
 * @returns Map from original URL to local path; empty when the tree has no
 *   `/resources_path.json`.
 * @throws SyntaxError when the file is not valid JSON.
 * @throws TypeError when the parsed JSON has no `items` array.
 */
export const extractUrlMap = async (files) => {
    const resourcesPathEntry = findFile(files.root, "/resources_path.json");
    if (!resourcesPathEntry) {
        return new Map();
    }
    const bytes = await readFileEntry(files, resourcesPathEntry);
    const snapshot = JSON.parse(new TextDecoder().decode(bytes));
    // JSON.parse can legitimately yield null or a primitive, hence `?.`.
    const items = snapshot?.items;
    if (!Array.isArray(items)) {
        throw new TypeError("resources_path.json must contain an `items` array");
    }
    const urlToPath = new Map();
    for (const item of items) {
        if (typeof item?.url === "string" && typeof item?.path === "string") {
            urlToPath.set(item.url, item.path);
        }
    }
    return urlToPath;
};
|
|
26
|
+
/**
 * Depth-first search for a file entry with an exact `path`.
 *
 * Entries are visited in listing order and a directory's contents are
 * searched before its later siblings. Directories themselves never match.
 *
 * @param dir - Directory entry exposing an `entries` list.
 * @param targetPath - Path to compare against each file entry's `path`.
 * @returns The first matching file entry, or `undefined`.
 */
const findFile = (dir, targetPath) => {
    for (const child of dir.entries) {
        switch (child.kind) {
            case "file":
                if (child.path === targetPath) {
                    return child;
                }
                break;
            case "directory": {
                const nested = findFile(child, targetPath);
                if (nested) {
                    return nested;
                }
                break;
            }
            default:
                break;
        }
    }
    return undefined;
};
|
|
40
|
+
const readFileEntry = async (files, entry) => {
|
|
41
|
+
if (entry.kind !== "file") {
|
|
42
|
+
throw new Error("Expected file entry");
|
|
43
|
+
}
|
|
44
|
+
const source = entry.source;
|
|
45
|
+
if (source.kind === "bytes") {
|
|
46
|
+
return source.data;
|
|
47
|
+
}
|
|
48
|
+
if (source.kind === "text") {
|
|
49
|
+
return new TextEncoder().encode(source.text);
|
|
50
|
+
}
|
|
51
|
+
if (source.kind === "content-ref") {
|
|
52
|
+
if (!files.content) {
|
|
53
|
+
throw new Error("Cannot resolve content-ref without files.content handle");
|
|
54
|
+
}
|
|
55
|
+
const stream = await files.content.open(source.ref);
|
|
56
|
+
const reader = stream.getReader();
|
|
57
|
+
const chunks = [];
|
|
58
|
+
for (;;) {
|
|
59
|
+
const { done, value } = await reader.read();
|
|
60
|
+
if (done) {
|
|
61
|
+
break;
|
|
62
|
+
}
|
|
63
|
+
chunks.push(value);
|
|
64
|
+
}
|
|
65
|
+
const totalLength = chunks.reduce((sum, chunk) => sum + chunk.byteLength, 0);
|
|
66
|
+
const result = new Uint8Array(totalLength);
|
|
67
|
+
let offset = 0;
|
|
68
|
+
for (const chunk of chunks) {
|
|
69
|
+
result.set(chunk, offset);
|
|
70
|
+
offset += chunk.byteLength;
|
|
71
|
+
}
|
|
72
|
+
return result;
|
|
73
|
+
}
|
|
74
|
+
throw new Error(`Unhandled file source kind: ${JSON.stringify(source)}`);
|
|
75
|
+
};
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { MainContentUnit, MainContentUnit as default } from "./main-content-unit.js";
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import { Unit, type FileTree, type UnitContext, type UnitPatch, type UnitRuntime } from "@pagepocket/lib";
|
|
2
|
+
export type MainContentUnitOptions = {
|
|
3
|
+
markdown?: boolean;
|
|
4
|
+
};
|
|
5
|
+
export declare class MainContentUnit extends Unit {
|
|
6
|
+
readonly id = "mainContent";
|
|
7
|
+
readonly kind = "extract.mainContent";
|
|
8
|
+
private options;
|
|
9
|
+
constructor(options?: MainContentUnitOptions);
|
|
10
|
+
merge(returnValue: UnitPatch, pluginContributedValue?: UnitPatch): UnitPatch;
|
|
11
|
+
run(ctx: UnitContext, _rt: UnitRuntime): Promise<{
|
|
12
|
+
files: FileTree;
|
|
13
|
+
html: {
|
|
14
|
+
htmlString: string;
|
|
15
|
+
baseUrl: string;
|
|
16
|
+
url?: string;
|
|
17
|
+
};
|
|
18
|
+
}>;
|
|
19
|
+
private extractMainContent;
|
|
20
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { mergeFileTrees, Unit } from "@pagepocket/lib";
|
|
2
|
+
import { Defuddle } from "defuddle/node";
|
|
3
|
+
import { buildContentTree } from "./build-content-tree.js";
|
|
4
|
+
/**
 * Unit that extracts the main content of a captured page with Defuddle and
 * packages it, together with the media assets it references, as a FileTree.
 */
export class MainContentUnit extends Unit {
    /**
     * @param options - Optional settings. `markdown` is forwarded to Defuddle
     *   and defaults to `false`.
     */
    constructor(options) {
        super();
        this.id = "mainContent";
        this.kind = "extract.mainContent";
        this.options = options ?? {};
    }
    /**
     * Combine this unit's return value with a plugin-contributed patch.
     *
     * Plugin fields override the unit's fields. When both sides carry a
     * FileTree under `files`, the two trees are merged with `mergeFileTrees`
     * instead of one replacing the other.
     */
    merge(returnValue, pluginContributedValue = {}) {
        const combined = { ...returnValue, ...pluginContributedValue };
        const ownFiles = returnValue.files;
        const contributedFiles = pluginContributedValue.files;
        const bothAreTrees = isFileTree(ownFiles) && isFileTree(contributedFiles);
        return bothAreTrees
            ? { ...combined, files: mergeFileTrees(ownFiles, contributedFiles) }
            : combined;
    }
    /**
     * Extract the main content and build its FileTree.
     *
     * Requires `ctx.value.capture` and `ctx.value.html`. The HTML string and
     * base URL come from `ctx.value.domHtml` when present, falling back
     * field by field to `ctx.value.html`. The returned `html` is the
     * `ctx.value.html` object as received; the rewritten main-content HTML is
     * only available as `/index.html` inside `files`.
     *
     * @throws Error when `capture` or `html` is missing from `ctx.value`.
     */
    async run(ctx, _rt) {
        const { capture } = ctx.value;
        if (!capture) {
            throw new Error("MainContentUnit requires ctx.value.capture");
        }
        const { html } = ctx.value;
        if (!html) {
            throw new Error("MainContentUnit requires ctx.value.html");
        }
        const { domHtml } = ctx.value;
        const baseUrl = domHtml?.baseUrl ?? html.baseUrl;
        const contentHtml = await this.extractMainContent(domHtml?.htmlString ?? html.htmlString, baseUrl);
        const tree = await buildContentTree({ contentHtml, baseUrl, capture });
        return { files: tree.files, html };
    }
    /**
     * Run Defuddle over the page and return the `content` field of its result.
     */
    async extractMainContent(htmlString, baseUrl) {
        const markdown = this.options.markdown ?? false;
        const extraction = await Defuddle(htmlString, baseUrl, { markdown });
        return extraction.content;
    }
}
|
|
47
|
+
/**
 * Structural check for a FileTree-shaped value.
 *
 * A value qualifies when it is an object with a `root` property that is
 * itself an object whose `kind` is "directory", whose `path` is a string and
 * whose `entries` is an array. The entries themselves are not inspected.
 *
 * @param value - Anything.
 * @returns `true` when `value` has the shape described above.
 */
const isFileTree = (value) => {
    const isObject = (candidate) => Boolean(candidate) && typeof candidate === "object";
    if (!isObject(value) || !("root" in value)) {
        return false;
    }
    const candidateRoot = value.root;
    return (isObject(candidateRoot) &&
        candidateRoot.kind === "directory" &&
        typeof candidateRoot.path === "string" &&
        Array.isArray(candidateRoot.entries));
};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
type UrlResolver = (absoluteUrl: string) => string | undefined;
|
|
2
|
+
/**
|
|
3
|
+
* Rewrite resource URLs in an HTML fragment so they point to local snapshot paths.
|
|
4
|
+
*
|
|
5
|
+
* This handles `src`, `href`, `srcset`, `poster`, `data-src`, `data-href`,
|
|
6
|
+
* `data-poster`, and `data-url` (modelled on the snapshot builder's full-page
|
|
7
|
+
* rewriter). Inline `style` background URLs are not rewritten here.
|
|
8
|
+
*
|
|
9
|
+
* Usage:
|
|
10
|
+
* ```ts
|
|
11
|
+
* const rewritten = rewriteContentUrls({
|
|
12
|
+
* html: '<img src="https://example.com/photo.jpg">',
|
|
13
|
+
* baseUrl: 'https://example.com/article',
|
|
14
|
+
* resolve: (url) => urlToPath.get(url),
|
|
15
|
+
* });
|
|
16
|
+
* ```
|
|
17
|
+
*/
|
|
18
|
+
export declare const rewriteContentUrls: (input: {
|
|
19
|
+
html: string;
|
|
20
|
+
baseUrl: string;
|
|
21
|
+
resolve: UrlResolver;
|
|
22
|
+
}) => string;
|
|
23
|
+
export {};
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
/**
 * Rewrite resource URLs in an HTML fragment so they point to local paths.
 *
 * The following attributes are processed, in this order: `src` (any element),
 * `href` on `link` and `a`, `poster`, `data-src`, `data-href`, `data-poster`,
 * `data-url`, and finally `srcset`. Each value is resolved against `baseUrl`
 * and handed to `resolve`; the attribute is replaced only when `resolve`
 * returns a truthy path. `srcset` attributes are always written back in
 * re-serialized form, with the candidates that resolved replaced.
 *
 * Inline `style` attributes are NOT inspected, so `url(...)` references in
 * inline CSS are left as they are.
 * NOTE(review): the previous comment claimed inline `style` background URLs
 * were handled and that this mirrors the snapshot builder's full-page
 * rewriter; no such code exists in this function - confirm whether that
 * support is intended.
 *
 * Usage:
 * ```ts
 * const rewritten = rewriteContentUrls({
 *   html: '<img src="https://example.com/photo.jpg">',
 *   baseUrl: 'https://example.com/article',
 *   resolve: (url) => urlToPath.get(url),
 * });
 * ```
 *
 * @param input - `html` fragment, `baseUrl` for relative URLs, and `resolve`,
 *   which maps an absolute URL to a local path or `undefined`.
 * @returns The serialized HTML after rewriting.
 */
export const rewriteContentUrls = (input) => {
    const { html, baseUrl, resolve } = input;
    const $ = cheerio.load(html, undefined, false);
    // [selector, attribute] pairs holding a single URL.
    const singleUrlTargets = [
        ["[src]", "src"],
        ["link[href]", "href"],
        ["a[href]", "href"],
        ["[poster]", "poster"],
        ["[data-src]", "data-src"],
        ["[data-href]", "data-href"],
        ["[data-poster]", "data-poster"],
        ["[data-url]", "data-url"]
    ];
    for (const [selector, attr] of singleUrlTargets) {
        $(selector).each((_, element) => {
            const value = $(element).attr(attr);
            if (!value) {
                return;
            }
            const resolved = resolveValue(value, baseUrl, resolve);
            if (resolved) {
                $(element).attr(attr, resolved);
            }
        });
    }
    $("[srcset]").each((_, element) => {
        const value = $(element).attr("srcset");
        if (!value) {
            return;
        }
        $(element).attr("srcset", rewriteSrcset(value, baseUrl, resolve));
    });
    return $.html();
};
|
|
51
|
+
/**
 * Map a raw attribute value to its local replacement, if any.
 *
 * The value is resolved against `baseUrl` and the absolute URL is passed to
 * `resolve`.
 *
 * @param value - Raw attribute value.
 * @param baseUrl - Base that relative values are resolved against.
 * @param resolve - Maps an absolute URL to a local path or `undefined`.
 * @returns Whatever `resolve` returns, or `undefined` when the value is one
 *   that `shouldSkip` rejects, cannot be parsed as a URL, or `resolve` itself
 *   throws (the exception is swallowed).
 */
const resolveValue = (value, baseUrl, resolve) => {
    let localPath;
    if (!shouldSkip(value)) {
        try {
            localPath = resolve(new URL(value, baseUrl).toString());
        }
        catch {
            localPath = undefined;
        }
    }
    return localPath;
};
|
|
63
|
+
/**
 * Rewrite the candidate URLs of a `srcset` value.
 *
 * The value is tokenized along the lines of the HTML "parse a srcset
 * attribute" algorithm instead of being split on every comma: a candidate's
 * URL is the run of non-whitespace characters at its start (commas inside it
 * belong to the URL, trailing commas do not), and its descriptors run up to
 * the next comma outside parentheses. URLs that contain commas - notably
 * `data:...;base64,...` URIs - therefore survive the round trip unchanged
 * instead of being cut in two and re-joined with an inserted space.
 *
 * @param srcset - Raw `srcset` attribute value.
 * @param baseUrl - Base that relative candidate URLs are resolved against.
 * @param resolve - Maps an absolute URL to a local path or `undefined`.
 * @returns The candidates joined with ", "; each is `url descriptors` with
 *   single spaces, where `url` is the resolved local path when there is one
 *   and the original URL otherwise.
 */
const rewriteSrcset = (srcset, baseUrl, resolve) => {
    const separator = /[\s,]/;
    const whitespace = /\s/;
    const length = srcset.length;
    const rewritten = [];
    let pos = 0;
    while (pos < length) {
        // Skip whitespace and stray commas in front of the next candidate.
        while (pos < length && separator.test(srcset[pos])) {
            pos += 1;
        }
        if (pos >= length) {
            break;
        }
        const urlStart = pos;
        while (pos < length && !whitespace.test(srcset[pos])) {
            pos += 1;
        }
        let url = srcset.slice(urlStart, pos);
        let descriptors = "";
        if (url.endsWith(",")) {
            // "a.jpg, b.jpg": the comma ends the candidate; no descriptors.
            url = url.replace(/,+$/, "");
        }
        else {
            const descriptorStart = pos;
            let depth = 0;
            while (pos < length) {
                const ch = srcset[pos];
                if (ch === "(") {
                    depth += 1;
                }
                else if (ch === ")" && depth > 0) {
                    depth -= 1;
                }
                else if (ch === "," && depth === 0) {
                    break;
                }
                pos += 1;
            }
            descriptors = srcset.slice(descriptorStart, pos).trim().replace(/\s+/g, " ");
            // Step over the comma that ended the candidate (harmless at the
            // end of the input).
            pos += 1;
        }
        const resolved = resolveValue(url, baseUrl, resolve);
        const target = resolved ? resolved : url;
        rewritten.push(descriptors ? `${target} ${descriptors}` : target);
    }
    return rewritten.join(", ");
};
|
|
78
|
+
/**
 * Decide whether an attribute value should be left alone instead of being
 * resolved and rewritten.
 *
 * Skipped values: empty/whitespace-only strings, same-document references
 * ("#..."), and values using the `data:`, `blob:`, `javascript:`, `mailto:`
 * or `tel:` schemes. Scheme matching is case-insensitive because URL schemes
 * are case-insensitive (`MAILTO:` and `mailto:` are the same scheme).
 *
 * @param value - Raw attribute value.
 * @returns `true` when the value must not be rewritten.
 */
const shouldSkip = (value) => {
    const normalized = value.trim().toLowerCase();
    if (!normalized) {
        return true;
    }
    const skippedPrefixes = ["data:", "blob:", "javascript:", "#", "mailto:", "tel:"];
    return skippedPrefixes.some((prefix) => normalized.startsWith(prefix));
};
|
package/package.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@pagepocket/main-content-unit",
|
|
3
|
+
"version": "0.12.0",
|
|
4
|
+
"description": "PagePocket unit: extract main content from captured HTML using defuddle and rewrite resource URLs to local paths",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist"
|
|
10
|
+
],
|
|
11
|
+
"license": "ISC",
|
|
12
|
+
"dependencies": {
|
|
13
|
+
"cheerio": "^1.0.0",
|
|
14
|
+
"defuddle": "^0.6.0",
|
|
15
|
+
"jsdom": "^26.0.0",
|
|
16
|
+
"@pagepocket/lib": "0.12.0",
|
|
17
|
+
"@pagepocket/shared": "0.12.0"
|
|
18
|
+
},
|
|
19
|
+
"devDependencies": {
|
|
20
|
+
"typescript": "^5.4.5",
|
|
21
|
+
"tsx": "^4.19.3",
|
|
22
|
+
"@types/node": "^20.17.12"
|
|
23
|
+
},
|
|
24
|
+
"scripts": {
|
|
25
|
+
"build": "tsc -p tsconfig.json",
|
|
26
|
+
"test": "tsx --test specs/*.spec.ts"
|
|
27
|
+
}
|
|
28
|
+
}
|