pagerts 0.2.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/.github/codeql/codeql-config.yml +7 -0
  2. package/.github/workflows/ci.yml +146 -0
  3. package/.github/workflows/dependency-update.yml +52 -0
  4. package/.prettierignore +5 -0
  5. package/.prettierrc.json +10 -0
  6. package/MAINTAINERS.md +30 -0
  7. package/POST-INSTALL.md +205 -0
  8. package/README.md +220 -16
  9. package/SECURITY.md +160 -0
  10. package/bin/main.js +24 -19
  11. package/bin/main.js.map +4 -4
  12. package/eslint.config.mjs +83 -0
  13. package/{jest.config.js → jest.config.cjs} +45 -30
  14. package/package.json +34 -13
  15. package/src/__tests__/PageFetcher.test.ts +48 -0
  16. package/src/__tests__/security.test.ts +153 -0
  17. package/src/extractors/AbstractExtractor.ts +4 -5
  18. package/src/extractors/PageExtractor.ts +21 -12
  19. package/src/extractors/ResourceExtractor.ts +31 -25
  20. package/src/extractors/TagExtractor.ts +13 -14
  21. package/src/extractors/index.ts +4 -0
  22. package/src/main.ts +71 -43
  23. package/src/page/Page.ts +24 -19
  24. package/src/page/PageFetcher.ts +81 -30
  25. package/src/page/index.ts +3 -0
  26. package/src/printers/AbstractResourcePrinter.ts +6 -6
  27. package/src/printers/JSONStylePrinter.ts +9 -12
  28. package/src/printers/LogStylePrinter.ts +30 -28
  29. package/src/printers/index.ts +3 -0
  30. package/src/resource.ts +88 -96
  31. package/src/security.ts +184 -0
  32. package/tsconfig.eslint.json +5 -0
  33. package/tsconfig.json +27 -11
  34. package/bin/package.json +0 -40
  35. package/bin/src/extractors/AbstractExtractor.js +0 -11
  36. package/bin/src/extractors/AbstractExtractor.js.map +0 -1
  37. package/bin/src/extractors/PageExtractor.js +0 -13
  38. package/bin/src/extractors/PageExtractor.js.map +0 -1
  39. package/bin/src/extractors/ResourceExtractor.js +0 -32
  40. package/bin/src/extractors/ResourceExtractor.js.map +0 -1
  41. package/bin/src/main.js +0 -36
  42. package/bin/src/main.js.map +0 -1
  43. package/bin/src/page/Page.js +0 -8
  44. package/bin/src/page/Page.js.map +0 -1
  45. package/bin/src/page/PageFetcher.js +0 -26
  46. package/bin/src/page/PageFetcher.js.map +0 -1
  47. package/bin/src/printers/AbstractResourcePrinter.js +0 -8
  48. package/bin/src/printers/AbstractResourcePrinter.js.map +0 -1
  49. package/bin/src/printers/JSONStylePrinter.js +0 -12
  50. package/bin/src/printers/JSONStylePrinter.js.map +0 -1
  51. package/bin/src/printers/LogStylePrinter.js +0 -27
  52. package/bin/src/printers/LogStylePrinter.js.map +0 -1
  53. package/bin/src/resource.js +0 -56
  54. package/bin/src/resource.js.map +0 -1
package/src/main.ts CHANGED
@@ -1,43 +1,71 @@
1
- #!/usr/bin/env node
2
- import { Command, createArgument } from "commander";
3
-
4
- import { description, name, version } from '../package.json';
5
- import { PageExtractor } from "./extractors/PageExtractor";
6
- import { ResourceExtractor } from "./extractors/ResourceExtractor";
7
- import { PageFetcher } from "./page/PageFetcher";
8
- import type { Page, PageMetadata } from "./page/Page";
9
- import { JSONStylePrinter } from "./printers/JSONStylePrinter";
10
- import { LogStylePrinter } from "./printers/LogStylePrinter";
11
-
12
- const program = new Command();
13
-
14
- const url = createArgument("<url | file...>", "remote https://URL or local file://resource.html to extract from");
15
-
16
- (async () => {
17
- await program
18
- .name(name)
19
- .version(version, "-v, --version")
20
- .description(description)
21
- .addArgument(url)
22
- .action(async (urls: string[]) => {
23
- const printer = new JSONStylePrinter();
24
- // simple log style printer
25
- // const printer = new LogStylePrinter();
26
-
27
- const pageFetcher = new PageFetcher()
28
- const pageExtractor = new PageExtractor()
29
- const resourceExtractor = new ResourceExtractor(["a", "meta", "link", "embed"])
30
-
31
- const pageResponses = await pageFetcher.fetchAll(urls);
32
- const pageMetadatas: PageMetadata[] = [];
33
-
34
- for (const { content, url, error } of pageResponses) {
35
- const resources = error in (content) ? [] : await resourceExtractor.extract(content);
36
- const descriptor = error in content ? { url, error } : await pageExtractor.extract(content);
37
- pageMetadatas.push({ ...descriptor, resources });
38
- }
39
-
40
- await printer.print(...pageMetadatas);
41
- })
42
- .parseAsync(process.argv);
43
- })();
1
+ #!/usr/bin/env node
2
+ import { Command, createArgument } from 'commander';
3
+
4
+ import pkg from '../package.json';
5
+ import { PageExtractor, ResourceExtractor } from './extractors/index.js';
6
+ import { PageFetcher, type PageMetadata } from './page/index.js';
7
+ import { JSONStylePrinter } from './printers/index.js';
8
+ import { validateUrls } from './security.js';
9
+
10
+ const { description, name, version } = pkg;
11
+
12
+ const program = new Command();
13
+
14
+ const url = createArgument(
15
+ '<url | file...>',
16
+ 'remote https://URL or local file://resource.html to extract from'
17
+ );
18
+
19
+ (async (): Promise<void> => {
20
+ await program
21
+ .name(name)
22
+ .version(version, '-v, --version')
23
+ .description(description)
24
+ .addArgument(url)
25
+ .action(async (urls: string[]) => {
26
+ try {
27
+ // Validate URLs first
28
+ const { validUrls, errors } = validateUrls(urls);
29
+
30
+ // Report validation errors
31
+ if (errors.length > 0) {
32
+ console.error('\n❌ URL Validation Errors:');
33
+ errors.forEach(({ url: invalidUrl, error }) => {
34
+ console.error(` - ${invalidUrl}: ${error}`);
35
+ });
36
+ }
37
+
38
+ // Exit if no valid URLs
39
+ if (validUrls.length === 0) {
40
+ console.error('\n❌ No valid URLs to process. Exiting.');
41
+ process.exit(1);
42
+ }
43
+
44
+ console.error(`\n✅ Processing ${validUrls.length} valid URL(s)...`);
45
+
46
+ const printer = new JSONStylePrinter();
47
+ const pageFetcher = new PageFetcher();
48
+ const pageExtractor = new PageExtractor();
49
+ const resourceExtractor = new ResourceExtractor(['a', 'meta', 'link', 'embed']);
50
+
51
+ const pageResponses = await pageFetcher.fetchAll(validUrls);
52
+ const pageMetadatas: PageMetadata[] = [];
53
+
54
+ for (const { content, url: responseUrl, error } of pageResponses) {
55
+ const resources =
56
+ error !== undefined || !content ? [] : await resourceExtractor.extract(content);
57
+ const descriptor =
58
+ error !== undefined || !content
59
+ ? { url: responseUrl, error: error ?? 'Unknown error', resources }
60
+ : await pageExtractor.extract(content);
61
+ pageMetadatas.push({ ...descriptor, resources });
62
+ }
63
+
64
+ await printer.print(...pageMetadatas);
65
+ } catch (error) {
66
+ console.error('\n❌ An error occurred:', error instanceof Error ? error.message : error);
67
+ process.exit(1);
68
+ }
69
+ })
70
+ .parseAsync(process.argv);
71
+ })();
package/src/page/Page.ts CHANGED
@@ -1,19 +1,24 @@
1
- import type { ExternalResource } from "../resource";
2
-
3
- type hasTitle = {
4
- title: string;
5
- };
6
-
7
- type hasUrl = {
8
- url: string;
9
- };
10
-
11
- type hasResources = {
12
- resources: ExternalResource[];
13
- };
14
-
15
- export type Page = hasTitle & hasUrl
16
- export type PageMetadata = (Page & hasResources) | { error: string }
17
- export const isError = (page: PageMetadata): page is { error: string } => 'error' in page;
18
- export const isPage = (page: any): page is Page =>
19
- "resources" in page && Array.isArray(page.resources);
1
+ import type { ExternalResource } from '../resource.js';
2
+
3
+ type hasTitle = {
4
+ title: string;
5
+ };
6
+
7
+ type hasUrl = {
8
+ url: string;
9
+ };
10
+
11
+ type hasResources = {
12
+ resources: ExternalResource[];
13
+ };
14
+
15
+ export type Page = hasTitle & hasUrl;
16
+
17
+ export type PageSuccess = Page & hasResources;
18
+ export type PageFailure = hasUrl & hasResources & { error: string };
19
+ export type PageMetadata = PageSuccess | PageFailure;
20
+
21
+ export const isError = (page: PageMetadata): page is PageFailure => 'error' in page;
22
+
23
+ export const isPage = (page: PageMetadata): page is PageSuccess =>
24
+ 'title' in page && typeof page.title === 'string' && Array.isArray(page.resources);
package/src/page/PageFetcher.ts CHANGED
@@ -1,30 +1,81 @@
1
- import { JSDOM, VirtualConsole } from 'jsdom';
2
- import type { Page, PageMetadata } from './Page';
3
-
4
- interface PageResponse {
5
- url: string;
6
- content?: JSDOM;
7
- error?: string;
8
- }
9
-
10
- export class PageFetcher {
11
- private async fetchPage(url: string): Promise<PageResponse> {
12
- let dom: Promise<JSDOM>;
13
- const virtualConsole = new VirtualConsole().on('jsdomError', (error) => {
14
- process.stderr.write(`Error parsing ${url}:${error.message}\n`);
15
- });
16
- if (url.startsWith("file://")) {
17
- dom = JSDOM.fromFile(url, { virtualConsole });
18
- } else {
19
- dom = JSDOM.fromURL(url, { virtualConsole });
20
- }
21
-
22
- return dom.then(content => ({ url, content }))
23
- .catch(({ message }) => ({ url, error: `JSDOM failed to parse: ${message}` }));
24
- }
25
- async fetchAll(urls: string[]): Promise<PageResponse[]> {
26
- const responses = await Promise.all(urls.map(url => this.fetchPage(url)));
27
- return responses.filter(response => response.content !== undefined);
28
- }
29
-
30
- }
1
+ import { JSDOM, VirtualConsole } from 'jsdom';
2
+
3
+ interface PageResponse {
4
+ url: string;
5
+ content?: JSDOM;
6
+ error?: string;
7
+ }
8
+
9
+ export class PageFetcher {
10
+ private readonly timeout: number;
11
+ private readonly maxRetries: number;
12
+
13
+ constructor(timeout = 10000, maxRetries = 2) {
14
+ this.timeout = timeout;
15
+ this.maxRetries = maxRetries;
16
+ }
17
+
18
+ private async fetchPage(url: string, retryCount = 0): Promise<PageResponse> {
19
+ const virtualConsole = new VirtualConsole().on('jsdomError', (error: Error) => {
20
+ process.stderr.write(`Error parsing ${url}: ${error.message}\n`);
21
+ });
22
+
23
+ try {
24
+ let dom: Promise<JSDOM>;
25
+
26
+ if (url.startsWith('file://')) {
27
+ dom = JSDOM.fromFile(url.substring(7), { virtualConsole });
28
+ } else {
29
+ // Add timeout and security options for remote URLs
30
+ dom = JSDOM.fromURL(url, {
31
+ virtualConsole,
32
+ resources: 'usable',
33
+ runScripts: 'outside-only', // More secure - don't execute page scripts
34
+ beforeParse(window) {
35
+ // Prevent infinite loops and resource exhaustion
36
+ window.setTimeout = (() => {
37
+ throw new Error('setTimeout disabled for security');
38
+ }) as typeof window.setTimeout;
39
+ window.setInterval = (() => {
40
+ throw new Error('setInterval disabled for security');
41
+ }) as typeof window.setInterval;
42
+ },
43
+ });
44
+ }
45
+
46
+ const content = await Promise.race([
47
+ dom,
48
+ new Promise<never>((_, reject) =>
49
+ setTimeout(() => reject(new Error('Request timeout')), this.timeout)
50
+ ),
51
+ ]);
52
+
53
+ return { url, content };
54
+ } catch (error) {
55
+ const message = error instanceof Error ? error.message : 'Unknown error';
56
+
57
+ // Retry logic for transient errors
58
+ if (retryCount < this.maxRetries && this.isRetryableError(message)) {
59
+ process.stderr.write(`Retrying ${url} (attempt ${retryCount + 1}/${this.maxRetries})...\n`);
60
+ await this.delay(1000 * (retryCount + 1)); // Exponential backoff
61
+ return this.fetchPage(url, retryCount + 1);
62
+ }
63
+
64
+ return { url, error: `Failed to fetch: ${message}` };
65
+ }
66
+ }
67
+
68
+ private isRetryableError(message: string): boolean {
69
+ const retryablePatterns = [/timeout/i, /ECONNRESET/i, /ETIMEDOUT/i, /ENOTFOUND/i, /network/i];
70
+ return retryablePatterns.some((pattern) => pattern.test(message));
71
+ }
72
+
73
+ private delay(ms: number): Promise<void> {
74
+ return new Promise((resolve) => setTimeout(resolve, ms));
75
+ }
76
+
77
+ async fetchAll(urls: string[]): Promise<PageResponse[]> {
78
+ const responses = await Promise.all(urls.map((url) => this.fetchPage(url)));
79
+ return responses.filter((response) => response.content !== undefined || response.error);
80
+ }
81
+ }
package/src/page/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ export { PageFetcher } from './PageFetcher.js';
2
+ export type { Page, PageMetadata } from './Page.js';
3
+ export { isPage, isError } from './Page.js';
package/src/printers/AbstractResourcePrinter.ts CHANGED
@@ -1,6 +1,6 @@
1
- import type { Page, PageMetadata } from "../page/Page";
2
-
3
- export abstract class AbstractResourcePrinter {
4
- constructor() { }
5
- abstract print(...pages: PageMetadata[]): void | Promise<void>;
6
- }
1
+ import type { PageMetadata } from '../page/index.js';
2
+
3
+ export abstract class AbstractResourcePrinter {
4
+ constructor() {}
5
+ abstract print(...pages: PageMetadata[]): void | Promise<void>;
6
+ }
package/src/printers/JSONStylePrinter.ts CHANGED
@@ -1,12 +1,9 @@
1
- import type { PageMetadata } from "../page/Page";
2
- import { AbstractResourcePrinter } from "./AbstractResourcePrinter";
3
-
4
-
5
- export class JSONStylePrinter extends AbstractResourcePrinter {
6
- print(...pages: PageMetadata[]): void | Promise<void> {
7
- const json = JSON.stringify(pages);
8
- process.stdout.write(json + "\n")
9
- }
10
-
11
-
12
- }
1
+ import type { PageMetadata } from '../page/index.js';
2
+ import { AbstractResourcePrinter } from './AbstractResourcePrinter.js';
3
+
4
+ export class JSONStylePrinter extends AbstractResourcePrinter {
5
+ print(...pages: PageMetadata[]): void | Promise<void> {
6
+ const json = JSON.stringify(pages);
7
+ process.stdout.write(json + '\n');
8
+ }
9
+ }
package/src/printers/LogStylePrinter.ts CHANGED
@@ -1,28 +1,30 @@
1
- import { isPage, type Page, type PageMetadata } from '../page/Page';
2
- import { AbstractResourcePrinter } from './AbstractResourcePrinter';
3
-
4
- export class LogStylePrinter extends AbstractResourcePrinter {
5
-
6
- write(str: string): void {
7
- process.stdout.write(str)
8
- }
9
-
10
- async print(...pages: PageMetadata[]): Promise<void> {
11
- for (const page of pages) {
12
- if (!isPage(page)) {
13
- this.write(page.error)
14
- continue
15
- }
16
-
17
- const {resources, title, url } = page
18
-
19
- this.write(`Title: ${title}\n`)
20
- this.write(`URL: ${url}\n\n`)
21
-
22
- for (const resource of resources) {
23
- const { link: { url }, text: { value } } = resource
24
- this.write(`${value}: ${url}\n`)
25
- }
26
- }
27
- }
28
- }
1
+ import { isPage, type PageMetadata } from '../page/index.js';
2
+ import { AbstractResourcePrinter } from './AbstractResourcePrinter.js';
3
+
4
+ export class LogStylePrinter extends AbstractResourcePrinter {
5
+ write(str: string): void {
6
+ process.stdout.write(str);
7
+ }
8
+
9
+ async print(...pages: PageMetadata[]): Promise<void> {
10
+ for (const page of pages) {
11
+ if (!isPage(page)) {
12
+ this.write(page.error);
13
+ continue;
14
+ }
15
+
16
+ const { resources, title, url } = page;
17
+
18
+ this.write(`Title: ${title}\n`);
19
+ this.write(`URL: ${url}\n\n`);
20
+
21
+ for (const resource of resources) {
22
+ const {
23
+ link: { url },
24
+ text: { value },
25
+ } = resource;
26
+ this.write(`${value}: ${url}\n`);
27
+ }
28
+ }
29
+ }
30
+ }
package/src/printers/index.ts ADDED
@@ -0,0 +1,3 @@
1
+ export { AbstractResourcePrinter } from './AbstractResourcePrinter.js';
2
+ export { JSONStylePrinter } from './JSONStylePrinter.js';
3
+ export { LogStylePrinter } from './LogStylePrinter.js';
package/src/resource.ts CHANGED
@@ -1,96 +1,88 @@
1
- /**
2
- * @license MIT
3
- * We are interested in visualising a page as a collection of tags.
4
- *
5
- * We wish to work with tags that can be compactly previewed on a webpage.
6
- * Here we must declare all of the element types that can be used to represent
7
- * a resource that can be hyperlinked off a webpage.
8
- */
9
-
10
- type Tags = HTMLElementTagNameMap
11
-
12
- function findDefinedKey(element: Resource, keys: LinkKey[]): LinkKey | undefined {
13
- for (const key of keys) {
14
- if (isKeyDefined(key, element)) {
15
- return key;
16
- }
17
- }
18
- }
19
-
20
- export const RESOURCE_DISPLAYABLE_KEYS = [
21
- 'id',
22
- 'innerText',
23
- 'textContent',
24
- 'class',
25
- 'ariaLabel',
26
- 'ariaDescription',
27
- 'alt',
28
- 'rel'
29
- ] as const;
30
-
31
- export type DisplayableKey = (typeof RESOURCE_DISPLAYABLE_KEYS)[number];
32
-
33
- export type ResourceKey = {
34
- key: DisplayableKey;
35
- value: string;
36
- };
37
-
38
- export const RESOURCE_LINK_KEYS = [
39
- "href",
40
- "data-src",
41
- "target",
42
- "action",
43
- "src",
44
- "url"
45
- ] as const;
46
-
47
- export type LinkKey = typeof RESOURCE_LINK_KEYS[number];
48
-
49
- export type ResourceLink = {
50
- key: LinkKey;
51
- url: string;
52
- }
53
-
54
- export function findResourceText(element: Resource): ResourceKey | undefined {
55
- for (const key of RESOURCE_DISPLAYABLE_KEYS) {
56
- const value = element[key]
57
- if (value && typeof value === 'string' && value.trim() !== '')
58
- return { key, value };
59
- }
60
- }
61
-
62
- export function findResourceLink(element: Resource): ResourceLink | undefined {
63
- const key = findDefinedKey(element, [...RESOURCE_LINK_KEYS]);
64
- const url = element[key];
65
- if (url && typeof url === 'string' && url.trim() !== '')
66
- return { key, url };
67
- }
68
-
69
- export type ExternalResource = {
70
- text: ResourceKey;
71
- link: ResourceLink;
72
- };
73
-
74
- export const isResourceKey = (key: string): key is LinkKey => key in RESOURCE_LINK_KEYS;
75
-
76
- export const isKeyDefined = <E extends Tags[keyof Tags]>(key: string, element: E): boolean =>
77
- key in element && element[key] !== undefined;
78
-
79
- export type ResourceElement<T, U> = {
80
- [K in keyof T]: U extends keyof T[K] ? T[K] : never
81
- }[keyof T];
82
-
83
- export type Tag = keyof Tags
84
-
85
- export type Resource = ResourceElement<Tags, (typeof RESOURCE_LINK_KEYS)[number]>;
86
-
87
- export type ResourceByName<T extends keyof Tags> = Tags[T]
88
-
89
- /** tests **/
90
-
91
- type test1 = HTMLAnchorElement extends Resource ? true : false // true
92
- type test2 = HTMLImageElement extends Resource ? true : false // true
93
- type test3 = HTMLDivElement extends Resource ? true : false // false
94
-
95
- type test4 = ResourceElement<Tags, "src">
96
-
1
+ /**
2
+ * @license MIT
3
+ * We are interested in visualising a page as a collection of tags.
4
+ *
5
+ * We wish to work with tags that can be compactly previewed on a webpage.
6
+ * Here we must declare all of the element types that can be used to represent
7
+ * a resource that can be hyperlinked off a webpage.
8
+ */
9
+ type Tags = HTMLElementTagNameMap;
10
+
11
+ function findDefinedKey(element: Resource, keys: LinkKey[]): LinkKey | undefined {
12
+ for (const key of keys) {
13
+ if (isKeyDefined(key, element)) {
14
+ return key;
15
+ }
16
+ }
17
+
18
+ return undefined;
19
+ }
20
+
21
+ export const RESOURCE_DISPLAYABLE_KEYS = [
22
+ 'id',
23
+ 'innerText',
24
+ 'textContent',
25
+ 'class',
26
+ 'ariaLabel',
27
+ 'ariaDescription',
28
+ 'alt',
29
+ ] as const;
30
+
31
+ export type DisplayableKey = (typeof RESOURCE_DISPLAYABLE_KEYS)[number];
32
+
33
+ export type ResourceKey = {
34
+ key: DisplayableKey;
35
+ value: string;
36
+ };
37
+
38
+ export const RESOURCE_LINK_KEYS = ['href', 'data-src', 'target', 'action', 'src', 'url'] as const;
39
+
40
+ export type LinkKey = (typeof RESOURCE_LINK_KEYS)[number];
41
+
42
+ export type ResourceLink = {
43
+ key: LinkKey;
44
+ url: string;
45
+ };
46
+
47
+ export function findResourceText(element: Resource): ResourceKey | undefined {
48
+ for (const key of RESOURCE_DISPLAYABLE_KEYS) {
49
+ const value = element[key];
50
+ if (value && typeof value === 'string' && value.trim() !== '') return { key, value };
51
+ }
52
+
53
+ return undefined;
54
+ }
55
+
56
+ export function findResourceLink(element: Resource): ResourceLink | undefined {
57
+ const key = findDefinedKey(element, [...RESOURCE_LINK_KEYS]);
58
+ if (!key) {
59
+ return undefined;
60
+ }
61
+
62
+ const url = element[key];
63
+ if (url && typeof url === 'string' && url.trim() !== '') return { key, url };
64
+
65
+ return undefined;
66
+ }
67
+
68
+ export type ExternalResource = {
69
+ text: ResourceKey;
70
+ link: ResourceLink;
71
+ };
72
+
73
+ export const isResourceKey = (key: string): key is LinkKey => key in RESOURCE_LINK_KEYS;
74
+
75
+ export const isKeyDefined = (key: DisplayableKey | LinkKey, element: Resource): boolean =>
76
+ key in element && element[key] !== undefined;
77
+
78
+ export type ResourceElement<T, U> = {
79
+ [K in keyof T]: U extends keyof T[K] ? T[K] : never;
80
+ }[keyof T];
81
+
82
+ export type Tag = keyof Tags;
83
+
84
+ export type Resource = HTMLElement & {
85
+ [K in DisplayableKey | LinkKey]?: string | null;
86
+ };
87
+
88
+ export type ResourceByName<T extends keyof Tags> = Tags[T];