@pagepocket/lib 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,11 +13,27 @@ export type CaptureTarget = {
13
13
  baseUrl: string;
14
14
  url?: string;
15
15
  };
16
+ import type { ProgressEvent } from "@pagepocket/contracts";
16
17
  import type { PagePocketOptions } from "../types.js";
17
18
  import type { CaptureResult as PagePocketCaptureResult, Plugin as V3Plugin, Unit as V3Unit } from "../units/contracts-bridge.js";
19
+ import type { CaptureOptions } from "../units/types.js";
20
+ export type CaptureEventMap = {
21
+ "unit:start": Extract<ProgressEvent, {
22
+ type: "unit:start";
23
+ }>;
24
+ "unit:end": Extract<ProgressEvent, {
25
+ type: "unit:end";
26
+ }>;
27
+ "unit:log": Extract<ProgressEvent, {
28
+ type: "unit:log";
29
+ }>;
30
+ };
31
+ type CaptureEventName = keyof CaptureEventMap;
32
+ type CaptureEventListener<K extends CaptureEventName> = (event: CaptureEventMap[K]) => void;
18
33
  export declare class PagePocket {
19
34
  private target;
20
35
  private options;
36
+ private listeners;
21
37
  private constructor();
22
38
  static fromURL(url: string, options?: PagePocketOptions): PagePocket;
23
39
  static fromPuppeteerPage(page: unknown, options?: PagePocketOptions): PagePocket;
@@ -31,8 +47,10 @@ export declare class PagePocket {
31
47
  url?: string;
32
48
  serialize?: (doc: unknown) => string;
33
49
  } & PagePocketOptions): PagePocket;
50
+ on<K extends CaptureEventName>(event: K, listener: CaptureEventListener<K>): this;
34
51
  capture(options: {
35
52
  units: V3Unit[];
36
53
  plugins?: V3Plugin[];
37
- } & import("../units/types.js").CaptureOptions): Promise<PagePocketCaptureResult>;
54
+ } & CaptureOptions): Promise<PagePocketCaptureResult>;
38
55
  }
56
+ export {};
@@ -1,6 +1,7 @@
1
1
  import { runCapture } from "../units/index.js";
2
2
  export class PagePocket {
3
3
  constructor(target, options) {
4
+ this.listeners = new Map();
4
5
  this.target = target;
5
6
  this.options = options ?? {};
6
7
  }
@@ -32,6 +33,16 @@ export class PagePocket {
32
33
  const { baseUrl, url, serialize: _serialize, ...rest } = options;
33
34
  return new PagePocket({ kind: "html", htmlString, baseUrl, ...(url ? { url } : {}) }, rest);
34
35
  }
36
+ on(event, listener) {
37
+ const existing = this.listeners.get(event);
38
+ if (existing) {
39
+ existing.push(listener);
40
+ }
41
+ else {
42
+ this.listeners.set(event, [listener]);
43
+ }
44
+ return this;
45
+ }
35
46
  async capture(options) {
36
47
  const entry = this.target.kind === "url"
37
48
  ? { kind: "url", url: this.target.url }
@@ -45,12 +56,25 @@ export class PagePocket {
45
56
  htmlString: this.target.htmlString,
46
57
  ...(this.target.url ? { url: this.target.url } : {})
47
58
  };
59
+ const hasListeners = this.listeners.size > 0;
60
+ const onProgress = hasListeners
61
+ ? (event) => {
62
+ const eventListeners = this.listeners.get(event.type);
63
+ if (!eventListeners) {
64
+ return;
65
+ }
66
+ for (const listener of eventListeners) {
67
+ listener(event);
68
+ }
69
+ }
70
+ : undefined;
48
71
  const result = await runCapture({
49
72
  entry,
50
73
  pocketOptions: this.options,
51
74
  options,
52
75
  units: options.units,
53
- plugins: options.plugins
76
+ plugins: options.plugins,
77
+ onProgress
54
78
  });
55
79
  return result;
56
80
  }
@@ -33,8 +33,8 @@ export const replayCssProxy = {
33
33
  if (!next) {
34
34
  return full;
35
35
  }
36
- const q = quote || "";
37
- return "url(" + q + next + q + ")";
36
+ const quoteSymbol = quote || "";
37
+ return "url(" + quoteSymbol + next + quoteSymbol + ")";
38
38
  } catch {
39
39
  return full;
40
40
  }
package/dist/index.d.ts CHANGED
@@ -1,4 +1,5 @@
1
1
  export { PagePocket } from "./core/pagepocket.js";
2
+ export type { CaptureEventMap } from "./core/pagepocket.js";
2
3
  export * from "./units/index.js";
3
4
  export type { UnitContext, UnitContributeContext, UnitPatch, UnitRuntime } from "./units/contracts-bridge.js";
4
5
  export { TERMINAL_RESULT_KEY } from "./units/contracts-bridge.js";
package/dist/kind-map.js CHANGED
@@ -42,17 +42,13 @@ export const requiredEntryKindError = (unitName, requiredKind, entry) => {
42
42
  * Convenience handler for `mapKind` that throws a standardized
43
43
  * "does not support entry kind" error.
44
44
  */
45
- export const throwUnsupportedEntryKind = (unitName) => {
46
- return (_kind, entry) => {
47
- throw unsupportedEntryKindError(unitName, entry);
48
- };
45
+ export const throwUnsupportedEntryKind = (unitName) => (_kind, entry) => {
46
+ throw unsupportedEntryKindError(unitName, entry);
49
47
  };
50
48
  /**
51
49
  * Convenience handler for `mapKind` that throws a standardized
52
50
  * "requires entry kind" error.
53
51
  */
54
- export const throwRequiredEntryKind = (unitName, requiredKind) => {
55
- return (_kind, entry) => {
56
- throw requiredEntryKindError(unitName, requiredKind, entry);
57
- };
52
+ export const throwRequiredEntryKind = (unitName, requiredKind) => (_kind, entry) => {
53
+ throw requiredEntryKindError(unitName, requiredKind, entry);
58
54
  };
@@ -3,9 +3,7 @@ const defaultApply = {
3
3
  limit: "all",
4
4
  onReplaced: "stop"
5
5
  };
6
- const isPlainObject = (value) => {
7
- return typeof value === "object" && value !== null;
8
- };
6
+ const isPlainObject = (value) => typeof value === "object" && value !== null;
9
7
  const normalizeApply = (apply) => ({
10
8
  scope: apply?.scope ?? defaultApply.scope,
11
9
  limit: apply?.limit ?? defaultApply.limit,
@@ -7,19 +7,19 @@ const runFnRuleOnSelection = async (input) => {
7
7
  const selection = $(item.query).toArray();
8
8
  const limit = item.apply.limit;
9
9
  const max = limit === "all" ? selection.length : Math.max(0, limit);
10
- for (let i = 0; i < selection.length && i < max; i += 1) {
11
- const el = selection[i];
12
- const $el = $(el);
13
- if (!isHtmlElement($, $el)) {
10
+ for (let selectionIndex = 0; selectionIndex < selection.length && selectionIndex < max; selectionIndex += 1) {
11
+ const selectedElement = selection[selectionIndex];
12
+ const $selectedElement = $(selectedElement);
13
+ if (!isHtmlElement($, $selectedElement)) {
14
14
  continue;
15
15
  }
16
16
  const ctx = {
17
17
  $,
18
- $el,
18
+ $el: $selectedElement,
19
19
  url: input.url,
20
20
  entryUrl: input.entryUrl,
21
21
  ruleIndex: item.ruleIndex,
22
- matchIndex: i
22
+ matchIndex: selectionIndex
23
23
  };
24
24
  const result = await item.run(ctx);
25
25
  if (!result) {
@@ -27,7 +27,7 @@ const runFnRuleOnSelection = async (input) => {
27
27
  }
28
28
  const actions = Array.isArray(result) ? result : [result];
29
29
  for (const action of actions) {
30
- applyReplaceAction($, $el, action);
30
+ applyReplaceAction($, $selectedElement, action);
31
31
  }
32
32
  }
33
33
  };
@@ -56,22 +56,22 @@ export const applyReplaceElements = async (input) => {
56
56
  const limit = item.apply.limit;
57
57
  const max = limit === "all" ? selection.length : Math.max(0, limit);
58
58
  let replacedCount = 0;
59
- for (let i = 0; i < selection.length && replacedCount < max; i += 1) {
60
- const el = selection[i];
61
- if (item.apply.onReplaced === "stop" && replacedByIndex.has(el)) {
59
+ for (let selectionIndex = 0; selectionIndex < selection.length && replacedCount < max; selectionIndex += 1) {
60
+ const selectedElementIdentity = selection[selectionIndex];
61
+ if (item.apply.onReplaced === "stop" && replacedByIndex.has(selectedElementIdentity)) {
62
62
  continue;
63
63
  }
64
- const $el = input.$(selection[i]);
65
- if (!isHtmlElement(input.$, $el)) {
64
+ const $selectedElement = input.$(selection[selectionIndex]);
65
+ if (!isHtmlElement(input.$, $selectedElement)) {
66
66
  continue;
67
67
  }
68
- if (!elementMatchesFilter(input.$, $el, filter)) {
68
+ if (!elementMatchesFilter(input.$, $selectedElement, filter)) {
69
69
  continue;
70
70
  }
71
- applyReplaceAction(input.$, $el, item.rule.replace);
71
+ applyReplaceAction(input.$, $selectedElement, item.rule.replace);
72
72
  replacedCount += 1;
73
73
  if (item.apply.onReplaced === "stop") {
74
- replacedByIndex.add(el);
74
+ replacedByIndex.add(selectedElementIdentity);
75
75
  }
76
76
  }
77
77
  }
@@ -1,6 +1,4 @@
1
- const looksAlreadyEscapedForStaticServers = (value) => {
2
- return /%25[0-9a-fA-F]{2}/.test(value);
3
- };
1
+ const looksAlreadyEscapedForStaticServers = (value) => /%25[0-9a-fA-F]{2}/.test(value);
4
2
  export const escapePercentForStaticServersOnce = (value) => {
5
3
  if (!value) {
6
4
  return value;
@@ -14,19 +14,19 @@ const encodeEmbeddedUrlTailIfPresent = (pathname) => {
14
14
  return undefined;
15
15
  }
16
16
  const parts = raw.split("/");
17
- for (let i = 0; i < parts.length; i += 1) {
18
- const scheme = parts[i];
17
+ for (let partIndex = 0; partIndex < parts.length; partIndex += 1) {
18
+ const scheme = parts[partIndex];
19
19
  if (scheme !== "http:" && scheme !== "https:") {
20
20
  continue;
21
21
  }
22
- const hasDoubleSlash = parts[i + 1] === "";
23
- const host = parts[i + 2] || "";
22
+ const hasDoubleSlash = parts[partIndex + 1] === "";
23
+ const host = parts[partIndex + 2] || "";
24
24
  if (!hasDoubleSlash || !isLikelyHostname(host)) {
25
25
  continue;
26
26
  }
27
- const embedded = scheme + "//" + parts.slice(i + 2).join("/");
27
+ const embedded = scheme + "//" + parts.slice(partIndex + 2).join("/");
28
28
  const encoded = encodeURIComponent(embedded);
29
- const nextParts = parts.slice(0, i).concat(encoded);
29
+ const nextParts = parts.slice(0, partIndex).concat(encoded);
30
30
  const rebuilt = nextParts.join("/") || "/";
31
31
  return rebuilt.startsWith("/") ? rebuilt : "/" + rebuilt;
32
32
  }
@@ -92,7 +92,7 @@ const preferSingle = (items, baseUrl, suffixLength) => {
92
92
  }
93
93
  })();
94
94
  if (baseParsed) {
95
- const sameOrigin = items.filter((i) => i.parsed.origin === baseParsed.origin);
95
+ const sameOrigin = items.filter((indexedItem) => indexedItem.parsed.origin === baseParsed.origin);
96
96
  if (sameOrigin.length === 1) {
97
97
  return sameOrigin[0];
98
98
  }
@@ -120,9 +120,9 @@ const tryCandidates = (items, baseUrl, suffixLength) => {
120
120
  const makeSuffixes = (pathname) => {
121
121
  const parts = pathname.split("/").filter(Boolean);
122
122
  const out = [];
123
- for (let i = 0; i < parts.length; i += 1) {
124
- const suffix = "/" + parts.slice(i).join("/");
125
- out.push({ key: suffix, depth: parts.length - i });
123
+ for (let partIndex = 0; partIndex < parts.length; partIndex += 1) {
124
+ const suffix = "/" + parts.slice(partIndex).join("/");
125
+ out.push({ key: suffix, depth: parts.length - partIndex });
126
126
  }
127
127
  return out;
128
128
  };
@@ -175,7 +175,7 @@ export const resolveToLocalPath = (options) => {
175
175
  const pathname = abs.pathname || "/";
176
176
  const pathnameVariants = makePathnameVariants(pathname);
177
177
  const search = abs.search || "";
178
- const pathnameWithSearchVariants = pathnameVariants.map((p) => p + search);
178
+ const pathnameWithSearchVariants = pathnameVariants.map((pathnameVariant) => pathnameVariant + search);
179
179
  for (const key of pathnameWithSearchVariants) {
180
180
  const items = toArray(index.byPathnameWithSearch.get(key));
181
181
  const match = tryCandidates(items, baseUrl, 99);
@@ -24,7 +24,7 @@ const isDescriptorToken = (token) => {
24
24
  const parseSrcset = (input) => {
25
25
  const rawCandidates = input
26
26
  .split(",")
27
- .map((c) => c.trim())
27
+ .map((candidateText) => candidateText.trim())
28
28
  .filter(Boolean);
29
29
  return rawCandidates.map((candidate) => {
30
30
  const tokens = candidate.split(/\s+/).filter(Boolean);
@@ -40,26 +40,24 @@ const parseSrcset = (input) => {
40
40
  return { url: candidate };
41
41
  });
42
42
  };
43
- const stringifySrcset = (candidates) => {
44
- return candidates
45
- .map((c) => {
46
- const url = c.url.trim();
47
- if (!c.descriptor) {
48
- return url;
49
- }
50
- return `${url} ${c.descriptor.trim()}`;
51
- })
52
- .filter(Boolean)
53
- .join(",");
54
- };
43
+ const stringifySrcset = (candidates) => candidates
44
+ .map((candidate) => {
45
+ const url = candidate.url.trim();
46
+ if (!candidate.descriptor) {
47
+ return url;
48
+ }
49
+ return `${url} ${candidate.descriptor.trim()}`;
50
+ })
51
+ .filter(Boolean)
52
+ .join(",");
55
53
  export const rewriteSrcsetValue = (value, baseUrl, resolve) => {
56
54
  if (isUnsafeSrcsetValue(value)) {
57
55
  return "";
58
56
  }
59
57
  const candidates = parseSrcset(value);
60
- const rewritten = candidates.map((c) => {
61
- const resolved = resolveUrlValue(c.url, baseUrl, resolve);
62
- return { url: resolved ?? c.url, descriptor: c.descriptor };
58
+ const rewritten = candidates.map((candidate) => {
59
+ const resolved = resolveUrlValue(candidate.url, baseUrl, resolve);
60
+ return { url: resolved ?? candidate.url, descriptor: candidate.descriptor };
63
61
  });
64
62
  return stringifySrcset(rewritten);
65
63
  };
@@ -1,7 +1,7 @@
1
1
  export const headersListToRecord = (headers) => {
2
2
  const out = {};
3
- for (const h of headers) {
4
- out[h.name] = h.value;
3
+ for (const header of headers) {
4
+ out[header.name] = header.value;
5
5
  }
6
6
  return out;
7
7
  };
@@ -1,16 +1,15 @@
1
1
  import { sanitizePosixPath } from "../utils.js";
2
- export const escapePercentForStaticServers = (value) => {
3
- // Many static servers decode percent-encoding in the request path before
4
- // resolving it to a filesystem path.
5
- //
6
- // Our snapshots can contain literal "%2F" sequences in filenames (e.g.
7
- // Substack image URLs embedded into a path segment). When a server decodes
8
- // "%2F" to "/", it changes the path structure and causes 404s.
9
- //
10
- // Escaping "%" to "%25" makes the request decode back to the original
11
- // filename on disk.
12
- return value.split("%").join("%25");
13
- };
2
+ export const escapePercentForStaticServers = (value) =>
3
+ // Many static servers decode percent-encoding in the request path before
4
+ // resolving it to a filesystem path.
5
+ //
6
+ // Our snapshots can contain literal "%2F" sequences in filenames (e.g.
7
+ // Substack image URLs embedded into a path segment). When a server decodes
8
+ // "%2F" to "/", it changes the path structure and causes 404s.
9
+ //
10
+ // Escaping "%" to "%25" makes the request decode back to the original
11
+ // filename on disk.
12
+ value.split("%").join("%25");
14
13
  export const docDirFromUrl = (url) => {
15
14
  try {
16
15
  const parsed = new URL(url);
@@ -0,0 +1,7 @@
1
+ import type { FileTree } from "../core/file-tree.js";
2
+ import type { ReplaceElementsConfig } from "../types.js";
3
/**
 * Runs a replace-elements configuration over a file tree.
 *
 * @param input - The tree to process (`files`), the replacement rules
 *   (`replaceElements`) and the URL of the captured entry page (`entryUrl`).
 * @returns A promise resolving to a file tree.
 */
export declare const applyReplaceElementsToFileTree: (input: {
    files: FileTree;
    replaceElements: ReplaceElementsConfig;
    entryUrl: string;
}) => Promise<FileTree>;
@@ -0,0 +1,63 @@
1
+ import * as cheerio from "cheerio";
2
+ import { applyReplaceElements } from "../replace-elements.js";
3
+ import { streamToUint8Array } from "../utils/streams.js";
4
+ import { decodeUtf8 } from "../utils.js";
5
/**
 * Reports whether a file-tree path names an HTML document.
 *
 * The extension is compared case-insensitively so that paths such as
 * `INDEX.HTML` or `page.Htm` are handled the same way as their lowercase
 * forms instead of being silently skipped.
 *
 * @param path - File path inside the file tree.
 * @returns `true` when the path ends in `.html` or `.htm` (any letter case).
 */
const isHtmlFile = (path) => {
    const lowered = path.toLowerCase();
    return lowered.endsWith(".html") || lowered.endsWith(".htm");
};
6
+ const readFileSource = async (file, fileTree) => {
7
+ const source = file.source;
8
+ if (source.kind === "bytes") {
9
+ return source.data;
10
+ }
11
+ if (source.kind === "text") {
12
+ return new TextEncoder().encode(source.text);
13
+ }
14
+ if (source.kind === "content-ref" && fileTree.content) {
15
+ const stream = await fileTree.content.open(source.ref);
16
+ return streamToUint8Array(stream);
17
+ }
18
+ return undefined;
19
+ };
20
/**
 * Applies the replace-elements rules to a single HTML file entry.
 *
 * The file is returned untouched when its bytes cannot be read or when
 * `decodeUtf8` yields `undefined`. Otherwise the markup is loaded with
 * cheerio, rewritten in place, and stored back as a `bytes` source.
 *
 * Every file is processed with `url` set to `entryUrl` and
 * `isEntryDocument: true`.
 *
 * @param file - HTML file entry to rewrite.
 * @param fileTree - Tree the file belongs to (used to read its content).
 * @param replaceElements - Replacement configuration to apply.
 * @param entryUrl - URL of the captured entry page.
 * @returns The original entry, or a copy carrying the rewritten bytes.
 */
const processHtmlFile = async (file, fileTree, replaceElements, entryUrl) => {
    const rawBytes = await readFileSource(file, fileTree);
    if (!rawBytes) {
        return file;
    }
    const markup = decodeUtf8(rawBytes);
    if (markup === undefined) {
        return file;
    }
    const dom = cheerio.load(markup);
    await applyReplaceElements({
        $: dom,
        entryUrl,
        url: entryUrl,
        replaceElements,
        isEntryDocument: true
    });
    const rewrittenBytes = new TextEncoder().encode(dom.html());
    return { ...file, source: { kind: "bytes", data: rewrittenBytes } };
};
44
/**
 * Recursively rewrites the HTML files below a directory entry.
 *
 * Children are handled one at a time, in their original order:
 * sub-directories are processed recursively, HTML files go through
 * `processHtmlFile`, and every other entry is carried over unchanged.
 *
 * @param dir - Directory entry to walk.
 * @param fileTree - Tree the directory belongs to.
 * @param replaceElements - Replacement configuration to apply.
 * @param entryUrl - URL of the captured entry page.
 * @returns A copy of `dir` whose `entries` hold the processed children.
 */
const processDirectory = async (dir, fileTree, replaceElements, entryUrl) => {
    const rewrittenChildren = [];
    for (const child of dir.entries) {
        if (child.kind === "directory") {
            rewrittenChildren.push(await processDirectory(child, fileTree, replaceElements, entryUrl));
        }
        else if (child.kind === "file" && isHtmlFile(child.path)) {
            rewrittenChildren.push(await processHtmlFile(child, fileTree, replaceElements, entryUrl));
        }
        else {
            rewrittenChildren.push(child);
        }
    }
    return { ...dir, entries: rewrittenChildren };
};
59
/**
 * Applies a replace-elements configuration to every HTML file in a file tree.
 *
 * The tree is walked from `files.root`; the result is a shallow copy of
 * `files` whose `root` is the processed directory.
 *
 * @param input - Object carrying `files`, `replaceElements` and `entryUrl`.
 * @returns A promise resolving to the updated file tree.
 */
export const applyReplaceElementsToFileTree = async (input) => {
    const { files: tree, replaceElements, entryUrl } = input;
    const root = await processDirectory(tree.root, tree, replaceElements, entryUrl);
    return { ...tree, root };
};
@@ -1,4 +1,5 @@
1
1
  import type { ChannelToken, ReplaceElementsConfig } from "@pagepocket/contracts";
2
+ import type { CaptureOptions, EntryInfo, PagePocketOptions } from "./types.js";
2
3
  export type CaptureResult = {
3
4
  kind: "raw";
4
5
  outputDir: string;
@@ -46,18 +47,26 @@ export interface ElementPatchRegistry {
46
47
  compile(): Promise<ReplaceElementsConfig>;
47
48
  }
48
49
  export interface UnitRuntime {
49
- readonly entry: import("./types.js").EntryInfo;
50
- readonly options: import("./types.js").CaptureOptions;
51
- readonly pocketOptions: import("./types.js").PagePocketOptions;
50
+ readonly entry: EntryInfo;
51
+ readonly options: CaptureOptions;
52
+ readonly pocketOptions: PagePocketOptions;
52
53
  publish<T>(t: ChannelToken<T>, value: T): void;
53
54
  subscribe<T>(t: ChannelToken<T>): AsyncIterable<T>;
54
55
  hasPublisher(t: ChannelToken<unknown>): boolean;
55
56
  readonly elements: ElementPatchRegistry;
56
57
  defer(promise: DeferredHandle): void;
58
+ /**
59
+ * Emit a log message from the currently executing unit.
60
+ *
61
+ * The message is published on the well-known PROGRESS channel as a
62
+ * `unit:log` event so external consumers (e.g. `PagePocket.on("unit:log", …)`)
63
+ * can observe it.
64
+ */
65
+ log(message: string, data?: unknown): void;
57
66
  }
58
67
  export interface PluginHost {
59
- readonly entry: import("./types.js").EntryInfo;
60
- readonly options: import("./types.js").CaptureOptions;
68
+ readonly entry: EntryInfo;
69
+ readonly options: CaptureOptions;
61
70
  subscribe<T>(t: ChannelToken<T>): AsyncIterable<T>;
62
71
  hasPublisher(t: ChannelToken<unknown>): boolean;
63
72
  readonly elements: ElementPatchRegistry;
@@ -65,7 +74,7 @@ export interface PluginHost {
65
74
  }
66
75
  export declare abstract class Unit {
67
76
  abstract readonly id: string;
68
- abstract readonly kind: string;
77
+ abstract readonly description: string;
69
78
  abstract run(ctx: UnitContext, rt: UnitRuntime): Promise<void | UnitPatch>;
70
79
  merge(returnValue: UnitPatch, pluginContributedValue?: UnitPatch): UnitPatch;
71
80
  }
@@ -0,0 +1,28 @@
1
+ import { Unit, type UnitPatch } from "./contracts-bridge.js";
2
+ /**
3
+ * Abstract base class for units whose `run()` produces a `FileTree`.
4
+ *
5
+ * Provides a default `merge` implementation that deep-merges the `files`
6
+ * property when both the unit return value and the plugin-contributed value
7
+ * contain a valid `FileTree`. All other properties are shallow-spread
8
+ * (plugin wins on conflict), matching the base `Unit.merge` behaviour.
9
+ *
10
+ * Subclasses need to implement `id`, `description` and `run()`.
11
+ *
12
+ * Usage:
13
+ * ```ts
14
+ * import { FileTreeUnit } from "@pagepocket/lib";
15
+ *
16
+ * export class MyUnit extends FileTreeUnit {
17
+ * readonly id = "my";
+ * readonly description = "Builds my files";
18
+ *
19
+ * async run(ctx, rt) {
20
+ * const files = buildFiles();
21
+ * return { files };
22
+ * }
23
+ * }
24
+ * ```
25
+ */
26
export declare abstract class FileTreeUnit extends Unit {
    /**
     * Combines the unit's return value with the plugin-contributed value.
     *
     * @param returnValue - Patch produced by the unit.
     * @param pluginContributedValue - Patch contributed by plugins, if any.
     * @returns The merged patch.
     */
    merge(returnValue: UnitPatch, pluginContributedValue?: UnitPatch): UnitPatch;
}
@@ -0,0 +1,53 @@
1
+ import { mergeFileTrees } from "../core/file-tree-merge.js";
2
+ import { Unit } from "./contracts-bridge.js";
3
+ /**
4
+ * Abstract base class for units whose `run()` produces a `FileTree`.
5
+ *
6
+ * Provides a default `merge` implementation that deep-merges the `files`
7
+ * property when both the unit return value and the plugin-contributed value
8
+ * contain a valid `FileTree`. All other properties are shallow-spread
9
+ * (plugin wins on conflict), matching the base `Unit.merge` behaviour.
10
+ *
11
+ * Subclasses need to implement `id`, `description` and `run()`.
12
+ *
13
+ * Usage:
14
+ * ```ts
15
+ * import { FileTreeUnit } from "@pagepocket/lib";
16
+ *
17
+ * export class MyUnit extends FileTreeUnit {
18
+ * readonly id = "my";
+ * readonly description = "Builds my files";
19
+ *
20
+ * async run(ctx, rt) {
21
+ * const files = buildFiles();
22
+ * return { files };
23
+ * }
24
+ * }
25
+ * ```
26
+ */
27
export class FileTreeUnit extends Unit {
    /**
     * Merges the unit's return value with the plugin-contributed value.
     *
     * All properties are shallow-spread, with the plugin value winning on
     * conflicts. When both sides carry a `files` value that passes
     * `isFileTree`, that property is replaced by the result of
     * `mergeFileTrees(unitFiles, pluginFiles)`.
     *
     * @param returnValue - Patch produced by the unit.
     * @param pluginContributedValue - Patch contributed by plugins; defaults to `{}`.
     * @returns The merged patch.
     */
    merge(returnValue, pluginContributedValue = {}) {
        const combined = { ...returnValue, ...pluginContributedValue };
        const { files: unitTree } = returnValue;
        const { files: pluginTree } = pluginContributedValue;
        if (isFileTree(unitTree) && isFileTree(pluginTree)) {
            return { ...combined, files: mergeFileTrees(unitTree, pluginTree) };
        }
        return combined;
    }
}
38
/**
 * Structural check for a file-tree value.
 *
 * A value qualifies when it is a non-null object with a `root` property that
 * is itself a non-null object whose `kind` is `"directory"`, whose `path` is
 * a string and whose `entries` is an array.
 *
 * @param value - Candidate value to inspect.
 * @returns `true` when `value` has the shape described above.
 */
const isFileTree = (value) => {
    const isObjectLike = (candidate) => Boolean(candidate) && typeof candidate === "object";
    if (!isObjectLike(value) || !("root" in value)) {
        return false;
    }
    const { root } = value;
    if (!isObjectLike(root)) {
        return false;
    }
    if (root.kind !== "directory") {
        return false;
    }
    if (typeof root.path !== "string") {
        return false;
    }
    return Array.isArray(root.entries);
};
@@ -1,4 +1,7 @@
1
1
  export type { CaptureOptions, EntryInfo } from "./types.js";
2
2
  export type { CaptureResult, Plugin, PluginHost, UnitRuntime } from "./contracts-bridge.js";
3
3
  export { Unit } from "./contracts-bridge.js";
4
+ export { FileTreeUnit } from "./file-tree-unit.js";
5
+ export { SnapshotUnit } from "./snapshot-unit.js";
4
6
  export { runCapture } from "./runner.js";
7
+ export type { ProgressListener } from "./runner.js";
@@ -1,2 +1,4 @@
1
1
  export { Unit } from "./contracts-bridge.js";
2
+ export { FileTreeUnit } from "./file-tree-unit.js";
3
+ export { SnapshotUnit } from "./snapshot-unit.js";
2
4
  export { runCapture } from "./runner.js";
@@ -35,11 +35,11 @@ export class AsyncQueue {
35
35
  async *iterate() {
36
36
  while (true) {
37
37
  if (this.values.length > 0) {
38
- const v = this.values.shift();
39
- if (v === undefined) {
38
+ const value = this.values.shift();
39
+ if (value === undefined) {
40
40
  continue;
41
41
  }
42
- yield v;
42
+ yield value;
43
43
  continue;
44
44
  }
45
45
  if (this.done) {
@@ -1,4 +1,4 @@
1
- import type { ChannelToken, ReplaceElementsConfig } from "@pagepocket/contracts";
1
+ import { type ChannelToken, type ReplaceElementsConfig } from "@pagepocket/contracts";
2
2
  import { type ElementPatchRegistry, type UnitContext, type UnitPatch, type UnitRuntime } from "../contracts-bridge.js";
3
3
  import type { CaptureOptions, EntryInfo, PagePocketOptions } from "../types.js";
4
4
  declare class ElementPatchRegistryImpl implements ElementPatchRegistry {
@@ -19,17 +19,20 @@ export declare class RuntimeImpl implements UnitRuntime {
19
19
  readonly pocketOptions: PagePocketOptions;
20
20
  private channels;
21
21
  private deferred;
22
+ private currentUnitId;
22
23
  readonly elements: ElementPatchRegistryImpl;
23
24
  constructor(input: {
24
25
  entry: EntryInfo;
25
26
  options: CaptureOptions;
26
27
  pocketOptions: PagePocketOptions;
27
28
  });
28
- publish<T>(t: ChannelToken<T>, value: T): void;
29
- subscribe<T>(t: ChannelToken<T>): AsyncIterable<T>;
30
- hasPublisher(t: ChannelToken<unknown>): boolean;
29
+ _setCurrentUnitId(unitId: string): void;
30
+ log(message: string, data?: unknown): void;
31
+ publish<T>(channelToken: ChannelToken<T>, value: T): void;
32
+ subscribe<T>(channelToken: ChannelToken<T>): AsyncIterable<T>;
33
+ hasPublisher(channelToken: ChannelToken<unknown>): boolean;
31
34
  defer(promise: Promise<unknown>): void;
32
- _ensureChannel(t: ChannelToken<unknown>): void;
35
+ _ensureChannel<T>(channelToken: ChannelToken<T>): void;
33
36
  _closeAllChannels(): Promise<void>;
34
37
  _awaitDeferred(): Promise<void>;
35
38
  }
@@ -1,3 +1,4 @@
1
+ import { PROGRESS } from "@pagepocket/contracts";
1
2
  import { TERMINAL_RESULT_KEY } from "../contracts-bridge.js";
2
3
  import { AsyncQueue, emptyAsyncIterable } from "./async-queue.js";
3
4
  import { DeferredTracker } from "./deferred-tracker.js";
@@ -30,13 +31,26 @@ export class RuntimeImpl {
30
31
  constructor(input) {
31
32
  this.channels = new Map();
32
33
  this.deferred = new DeferredTracker();
34
+ this.currentUnitId = "";
33
35
  this.elements = new ElementPatchRegistryImpl();
34
36
  this.entry = input.entry;
35
37
  this.options = input.options;
36
38
  this.pocketOptions = input.pocketOptions;
37
39
  }
38
- publish(t, value) {
39
- const state = this.channels.get(t.id);
40
+ _setCurrentUnitId(unitId) {
41
+ this.currentUnitId = unitId;
42
+ }
43
+ log(message, data) {
44
+ const event = {
45
+ type: "unit:log",
46
+ unitId: this.currentUnitId,
47
+ message,
48
+ ...(data !== undefined ? { data } : {})
49
+ };
50
+ this.publish(PROGRESS, event);
51
+ }
52
+ publish(channelToken, value) {
53
+ const state = this.channels.get(channelToken.id);
40
54
  if (!state || state.closed) {
41
55
  return;
42
56
  }
@@ -45,45 +59,45 @@ export class RuntimeImpl {
45
59
  sub.push(value);
46
60
  }
47
61
  }
48
- subscribe(t) {
49
- if (!this.channels.has(t.id)) {
50
- this.channels.set(t.id, { hasPublisher: false, subs: new Set(), closed: false });
62
+ subscribe(channelToken) {
63
+ if (!this.channels.has(channelToken.id)) {
64
+ this.channels.set(channelToken.id, { hasPublisher: false, subs: new Set(), closed: false });
51
65
  }
52
- const state = this.channels.get(t.id);
66
+ const state = this.channels.get(channelToken.id);
53
67
  if (!state || state.closed) {
54
68
  return emptyAsyncIterable();
55
69
  }
56
- const q = new AsyncQueue();
70
+ const queue = new AsyncQueue();
57
71
  const sub = {
58
- push: (v) => q.push(v),
59
- close: () => q.close()
72
+ push: (value) => queue.push(value),
73
+ close: () => queue.close()
60
74
  };
61
75
  state.subs.add(sub);
62
76
  const owner = this;
63
77
  return (async function* () {
64
78
  try {
65
- for await (const v of q.iterate()) {
66
- yield v;
79
+ for await (const value of queue.iterate()) {
80
+ yield value;
67
81
  }
68
82
  }
69
83
  finally {
70
- const s = owner.channels.get(t.id);
71
- s?.subs.delete(sub);
72
- q.close();
84
+ const channelState = owner.channels.get(channelToken.id);
85
+ channelState?.subs.delete(sub);
86
+ queue.close();
73
87
  }
74
88
  })();
75
89
  }
76
- hasPublisher(t) {
77
- return this.channels.get(t.id)?.hasPublisher === true;
90
+ hasPublisher(channelToken) {
91
+ return this.channels.get(channelToken.id)?.hasPublisher === true;
78
92
  }
79
93
  defer(promise) {
80
94
  this.deferred.add(promise);
81
95
  }
82
- _ensureChannel(t) {
83
- if (this.channels.has(t.id)) {
96
+ _ensureChannel(channelToken) {
97
+ if (this.channels.has(channelToken.id)) {
84
98
  return;
85
99
  }
86
- this.channels.set(t.id, { hasPublisher: false, subs: new Set(), closed: false });
100
+ this.channels.set(channelToken.id, { hasPublisher: false, subs: new Set(), closed: false });
87
101
  }
88
102
  async _closeAllChannels() {
89
103
  for (const state of this.channels.values()) {
@@ -103,11 +117,11 @@ export class RuntimeImpl {
103
117
  }
104
118
  export const mergePatchIntoFreshContext = (patch) => {
105
119
  const next = { value: {} };
106
- for (const [k, v] of Object.entries(patch)) {
107
- if (k === TERMINAL_RESULT_KEY) {
120
+ for (const [key, value] of Object.entries(patch)) {
121
+ if (key === TERMINAL_RESULT_KEY) {
108
122
  continue;
109
123
  }
110
- next.value[k] = v;
124
+ next.value[key] = value;
111
125
  }
112
126
  return next;
113
127
  };
@@ -1,6 +1,7 @@
1
- import type { ChannelToken } from "@pagepocket/contracts";
1
+ import { type ChannelToken, type ProgressEvent } from "@pagepocket/contracts";
2
2
  import { type CaptureResult, type Plugin, type Unit } from "./contracts-bridge.js";
3
3
  import type { CaptureOptions, EntryInfo, PagePocketOptions } from "./types.js";
4
+ export type ProgressListener = (event: ProgressEvent) => void;
4
5
  export declare const runCapture: (input: {
5
6
  entry: EntryInfo;
6
7
  pocketOptions: PagePocketOptions;
@@ -8,4 +9,5 @@ export declare const runCapture: (input: {
8
9
  units: Unit[];
9
10
  plugins?: Plugin[];
10
11
  declaredChannels?: ChannelToken<unknown>[];
12
+ onProgress?: ProgressListener;
11
13
  }) => Promise<CaptureResult>;
@@ -1,3 +1,5 @@
1
+ import { PROGRESS } from "@pagepocket/contracts";
2
+ import { debugLog } from "../core/debug.js";
1
3
  import { TERMINAL_RESULT_KEY } from "./contracts-bridge.js";
2
4
  import { mergePatchIntoFreshContext, RuntimeImpl } from "./internal/runtime.js";
3
5
  export const runCapture = async (input) => {
@@ -6,34 +8,54 @@ export const runCapture = async (input) => {
6
8
  options: input.options,
7
9
  pocketOptions: input.pocketOptions
8
10
  });
9
- for (const ch of input.declaredChannels ?? []) {
10
- rt._ensureChannel(ch);
11
+ for (const channel of input.declaredChannels ?? []) {
12
+ rt._ensureChannel(channel);
13
+ }
14
+ rt._ensureChannel(PROGRESS);
15
+ if (input.onProgress) {
16
+ const listener = input.onProgress;
17
+ const progressTask = (async () => {
18
+ for await (const event of rt.subscribe(PROGRESS)) {
19
+ listener(event);
20
+ }
21
+ })();
22
+ rt.defer(progressTask);
11
23
  }
12
24
  const pluginHost = {
13
25
  entry: rt.entry,
14
26
  options: rt.options,
15
- subscribe: (t) => rt.subscribe(t),
16
- hasPublisher: (t) => rt.hasPublisher(t),
27
+ subscribe: (channelToken) => rt.subscribe(channelToken),
28
+ hasPublisher: (channelToken) => rt.hasPublisher(channelToken),
17
29
  elements: rt.elements,
18
- defer: (p) => rt.defer(p)
30
+ defer: (deferredPromise) => rt.defer(deferredPromise)
19
31
  };
20
32
  const pluginSetupValues = new Map();
21
33
  for (const plugin of input.plugins ?? []) {
22
- const v = await plugin.setup(pluginHost);
23
- if (typeof v !== "undefined") {
24
- pluginSetupValues.set(plugin, v);
34
+ const setupValue = await plugin.setup(pluginHost);
35
+ if (typeof setupValue !== "undefined") {
36
+ pluginSetupValues.set(plugin, setupValue);
25
37
  }
26
38
  }
27
- const mergePatch = (_ctx, patch) => {
28
- return mergePatchIntoFreshContext(patch);
29
- };
39
+ const mergePatch = (_ctx, patch) => mergePatchIntoFreshContext(patch);
30
40
  let ctx = { value: {} };
31
41
  let result;
42
+ const totalUnits = input.units.length;
32
43
  try {
33
- for (const unit of input.units) {
44
+ for (let i = 0; i < input.units.length; i++) {
34
45
  if (result) {
35
46
  break;
36
47
  }
48
+ const unit = input.units[i];
49
+ const unitIndex = i;
50
+ rt._setCurrentUnitId(unit.id);
51
+ rt.publish(PROGRESS, {
52
+ type: "unit:start",
53
+ unitId: unit.id,
54
+ unitDescription: unit.description,
55
+ index: unitIndex,
56
+ total: totalUnits
57
+ });
58
+ const unitStartTime = Date.now();
37
59
  const baseCtx = ctx;
38
60
  const boundPlugins = (input.plugins ?? []).filter((plugin) => {
39
61
  const unitId = plugin.constructor?.unitId;
@@ -42,13 +64,13 @@ export const runCapture = async (input) => {
42
64
  let pluginContributedValue = {};
43
65
  if (boundPlugins.length > 0) {
44
66
  const pluginResults = await Promise.allSettled(boundPlugins
45
- .filter((p) => typeof p.contribute === "function")
46
- .map(async (p) => p.contribute({ value: ctx.value, setupValue: pluginSetupValues.get(p) }, rt)));
47
- for (const r of pluginResults) {
48
- if (r.status !== "fulfilled") {
67
+ .filter((pluginBinding) => typeof pluginBinding.contribute === "function")
68
+ .map(async (pluginBinding) => pluginBinding.contribute({ value: ctx.value, setupValue: pluginSetupValues.get(pluginBinding) }, rt)));
69
+ for (const pluginResult of pluginResults) {
70
+ if (pluginResult.status !== "fulfilled") {
49
71
  continue;
50
72
  }
51
- const patch = r.value;
73
+ const patch = pluginResult.value;
52
74
  if (!patch || typeof patch !== "object") {
53
75
  continue;
54
76
  }
@@ -64,13 +86,44 @@ export const runCapture = async (input) => {
64
86
  }
65
87
  }
66
88
  if (result) {
89
+ rt.publish(PROGRESS, {
90
+ type: "unit:end",
91
+ unitId: unit.id,
92
+ unitDescription: unit.description,
93
+ index: unitIndex,
94
+ total: totalUnits,
95
+ durationMs: Date.now() - unitStartTime
96
+ });
67
97
  break;
68
98
  }
69
- const out = (await unit.run(baseCtx, rt)) ?? {};
99
+ let out;
100
+ try {
101
+ out = (await unit.run(baseCtx, rt)) ?? {};
102
+ }
103
+ catch (err) {
104
+ debugLog(`[runner] unit "${unit.id}" threw:`, err);
105
+ rt.publish(PROGRESS, {
106
+ type: "unit:end",
107
+ unitId: unit.id,
108
+ unitDescription: unit.description,
109
+ index: unitIndex,
110
+ total: totalUnits,
111
+ durationMs: Date.now() - unitStartTime
112
+ });
113
+ continue;
114
+ }
70
115
  const unitReturnValue = out && typeof out === "object" ? out : {};
71
116
  const merged = unit.merge(unitReturnValue, pluginContributedValue);
72
117
  const mergedPatch = merged && typeof merged === "object" ? merged : {};
73
118
  ctx = { value: mergePatch({ value: {} }, mergedPatch).value };
119
+ rt.publish(PROGRESS, {
120
+ type: "unit:end",
121
+ unitId: unit.id,
122
+ unitDescription: unit.description,
123
+ index: unitIndex,
124
+ total: totalUnits,
125
+ durationMs: Date.now() - unitStartTime
126
+ });
74
127
  const terminal = mergedPatch[TERMINAL_RESULT_KEY];
75
128
  if (terminal) {
76
129
  result = terminal;
@@ -0,0 +1,31 @@
1
+ import type { UnitContext, UnitPatch, UnitRuntime } from "./contracts-bridge.js";
2
+ import { FileTreeUnit } from "./file-tree-unit.js";
3
+ /**
4
+ * Base class for units that produce the primary snapshot FileTree.
5
+ *
6
+ * `run()` delegates to `build()` and automatically:
7
+ * - Injects `snapshotType` into the returned patch.
8
+ * - Compiles all plugin-contributed element-replacement rules
9
+ * (`rt.elements.compile()`) and applies them to every HTML file
10
+ * in the returned FileTree. Subclasses never need to call
11
+ * `rt.elements.compile()` themselves.
12
+ *
13
+ * Subclasses implement `id`, `snapshotType`, and `build()`.
14
+ *
15
+ * ```ts
16
+ * export class MySnapshotUnit extends SnapshotUnit {
17
+ * readonly id = "mySnapshot";
18
+ * readonly snapshotType = "my-type";
19
+ *
20
+ * async build(ctx, rt) {
21
+ * return { files: buildFiles(), html: ctx.value.html };
22
+ * }
23
+ * }
24
+ * ```
25
+ */
26
+ export declare abstract class SnapshotUnit extends FileTreeUnit {
27
+ /** Identifier for the kind of snapshot this unit produces (e.g. "full", "main-content"). */
28
+ abstract readonly snapshotType: string;
29
+ abstract build(ctx: UnitContext, rt: UnitRuntime): Promise<void | UnitPatch>;
30
+ run(ctx: UnitContext, rt: UnitRuntime): Promise<void | UnitPatch>;
31
+ }
@@ -0,0 +1,58 @@
1
+ import { applyReplaceElementsToFileTree } from "./apply-replace-elements-to-file-tree.js";
2
+ import { FileTreeUnit } from "./file-tree-unit.js";
3
+ /**
4
+ * Base class for units that produce the primary snapshot FileTree.
5
+ *
6
+ * `run()` delegates to `build()` and automatically:
7
+ * - Injects `snapshotType` into the returned patch.
8
+ * - Compiles all plugin-contributed element-replacement rules
9
+ * (`rt.elements.compile()`) and applies them to every HTML file
10
+ * in the returned FileTree. Subclasses never need to call
11
+ * `rt.elements.compile()` themselves.
12
+ *
13
+ * Subclasses implement `id`, `snapshotType`, and `build()`.
14
+ *
15
+ * ```ts
16
+ * export class MySnapshotUnit extends SnapshotUnit {
17
+ * readonly id = "mySnapshot";
18
+ * readonly snapshotType = "my-type";
19
+ *
20
+ * async build(ctx, rt) {
21
+ * return { files: buildFiles(), html: ctx.value.html };
22
+ * }
23
+ * }
24
+ * ```
25
+ */
26
+ export class SnapshotUnit extends FileTreeUnit {
27
+ async run(ctx, rt) {
28
+ const patch = await this.build(ctx, rt);
29
+ if (!patch) {
30
+ return;
31
+ }
32
+ const files = patch.files;
33
+ const replaceElements = await rt.elements.compile();
34
+ if (files && replaceElements.length > 0) {
35
+ const entryUrl = resolveEntryUrl(rt);
36
+ const updatedFiles = await applyReplaceElementsToFileTree({
37
+ files,
38
+ replaceElements,
39
+ entryUrl
40
+ });
41
+ return { ...patch, files: updatedFiles, snapshotType: this.snapshotType };
42
+ }
43
+ return { ...patch, snapshotType: this.snapshotType };
44
+ }
45
+ }
46
+ const resolveEntryUrl = (rt) => {
47
+ const entry = rt.entry;
48
+ if (entry.kind === "url") {
49
+ return entry.url;
50
+ }
51
+ if (entry.kind === "html-string" || entry.kind === "document") {
52
+ return entry.url ?? entry.baseUrl;
53
+ }
54
+ if (entry.kind === "puppeteer-page" || entry.kind === "cdp-tab") {
55
+ return entry.url ?? "";
56
+ }
57
+ return "";
58
+ };
@@ -1,4 +1,4 @@
1
- import type { CompletionStrategy, ContentStore, PathResolver, ResourceFilter } from "../types.js";
1
+ import type { CompletionStrategy, ContentStore, PagePocketOptions, PathResolver, ResourceFilter } from "../types.js";
2
2
  export type EntryInfo = {
3
3
  kind: "url";
4
4
  url: string;
@@ -36,4 +36,4 @@ export interface CaptureOptions {
36
36
  maxResources?: number;
37
37
  };
38
38
  }
39
- export type PagePocketOptions = import("../types.js").PagePocketOptions;
39
+ export type { PagePocketOptions };
package/dist/utils.js CHANGED
@@ -4,8 +4,8 @@ const FNV_PRIME = 0x01000193;
4
4
  export const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
5
5
  export const hashString = (value) => {
6
6
  let hash = FNV_OFFSET;
7
- for (let i = 0; i < value.length; i += 1) {
8
- hash ^= value.charCodeAt(i);
7
+ for (let characterIndex = 0; characterIndex < value.length; characterIndex += 1) {
8
+ hash ^= value.charCodeAt(characterIndex);
9
9
  hash = (hash * FNV_PRIME) >>> 0;
10
10
  }
11
11
  return hash.toString(16).padStart(8, "0");
@@ -24,9 +24,7 @@ export const sanitizePosixPath = (value) => {
24
24
  }
25
25
  return clean.join("/");
26
26
  };
27
- const getGlobalBuffer = () => {
28
- return globalThis.Buffer;
29
- };
27
+ const getGlobalBuffer = () => globalThis.Buffer;
30
28
  export const bytesToBase64 = (bytes) => {
31
29
  const BufferCtor = getGlobalBuffer();
32
30
  if (BufferCtor) {
@@ -34,8 +32,8 @@ export const bytesToBase64 = (bytes) => {
34
32
  }
35
33
  let binary = "";
36
34
  const chunkSize = 0x8000;
37
- for (let i = 0; i < bytes.length; i += chunkSize) {
38
- const chunk = bytes.subarray(i, i + chunkSize);
35
+ for (let chunkStart = 0; chunkStart < bytes.length; chunkStart += chunkSize) {
36
+ const chunk = bytes.subarray(chunkStart, chunkStart + chunkSize);
39
37
  binary += String.fromCharCode(...chunk);
40
38
  }
41
39
  return btoa(binary);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pagepocket/lib",
3
- "version": "0.11.1",
3
+ "version": "0.13.0",
4
4
  "description": "Library for rewriting HTML snapshots and inlining local resources.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -20,9 +20,9 @@
20
20
  "dependencies": {
21
21
  "cheerio": "^1.0.0-rc.12",
22
22
  "domhandler": "^5.0.3",
23
- "@pagepocket/contracts": "0.11.1",
24
- "@pagepocket/uni-fs": "0.11.1",
25
- "@pagepocket/shared": "0.11.1"
23
+ "@pagepocket/contracts": "0.13.0",
24
+ "@pagepocket/shared": "0.13.0",
25
+ "@pagepocket/uni-fs": "0.13.0"
26
26
  },
27
27
  "devDependencies": {
28
28
  "@playwright/test": "^1.50.1",