pagerts 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +39 -0
  2. package/bin/main.js +11 -10
  3. package/bin/main.js.map +3 -3
  4. package/bin/package.json +40 -0
  5. package/bin/src/extractors/AbstractExtractor.js +11 -0
  6. package/bin/src/extractors/AbstractExtractor.js.map +1 -0
  7. package/bin/src/extractors/PageExtractor.js +13 -0
  8. package/bin/src/extractors/PageExtractor.js.map +1 -0
  9. package/bin/src/extractors/ResourceExtractor.js +32 -0
  10. package/bin/src/extractors/ResourceExtractor.js.map +1 -0
  11. package/bin/src/main.js +36 -0
  12. package/bin/src/main.js.map +1 -0
  13. package/bin/src/page/Page.js +8 -0
  14. package/bin/src/page/Page.js.map +1 -0
  15. package/bin/src/page/PageFetcher.js +26 -0
  16. package/bin/src/page/PageFetcher.js.map +1 -0
  17. package/bin/src/printers/AbstractResourcePrinter.js +8 -0
  18. package/bin/src/printers/AbstractResourcePrinter.js.map +1 -0
  19. package/bin/src/printers/JSONStylePrinter.js +12 -0
  20. package/bin/src/printers/JSONStylePrinter.js.map +1 -0
  21. package/bin/src/printers/LogStylePrinter.js +27 -0
  22. package/bin/src/printers/LogStylePrinter.js.map +1 -0
  23. package/bin/src/resource.js +56 -0
  24. package/bin/src/resource.js.map +1 -0
  25. package/jest.config.js +198 -198
  26. package/package.json +40 -40
  27. package/src/extractors/AbstractExtractor.ts +5 -5
  28. package/src/extractors/PageExtractor.ts +11 -14
  29. package/src/extractors/ResourceExtractor.ts +24 -24
  30. package/src/extractors/TagExtractor.ts +13 -13
  31. package/src/main.ts +43 -47
  32. package/src/page/Page.ts +19 -3
  33. package/src/page/PageFetcher.ts +30 -25
  34. package/src/printers/AbstractResourcePrinter.ts +6 -6
  35. package/src/printers/JSONStylePrinter.ts +12 -12
  36. package/src/printers/LogStylePrinter.ts +28 -23
  37. package/src/resource.ts +96 -96
  38. package/tsconfig.json +11 -11
  39. package/.vscode/launch.json +0 -22
  40. package/src/page/PageMetadata.ts +0 -23
package/src/main.ts CHANGED
@@ -1,47 +1,43 @@
1
- #!/usr/bin/env node
2
- import { Command, createArgument } from "commander";
3
-
4
- import { description, name, version } from '../package.json';
5
- import { PageExtractor } from "./extractors/PageExtractor";
6
- import { ResourceExtractor } from "./extractors/ResourceExtractor";
7
- import { isJSDOM, PageFetcher, type PageResponse } from "./page/PageFetcher";
8
- import { type Page } from "./page/PageMetadata";
9
- import { JSONStylePrinter } from "./printers/JSONStylePrinter";
10
-
11
- const program = new Command();
12
-
13
- const url = createArgument("<url|file...>", "remote URL or local file to extract remote resources from");
14
-
15
- (async () => {
16
- await program
17
- .name(name)
18
- .version(version, "-v, --version")
19
- .description(description)
20
- .addArgument(url)
21
- .action(async (urls: string[]) => {
22
- const printer = new JSONStylePrinter();
23
- try {
24
- const pageFetcher = new PageFetcher()
25
- const pageExtractor = new PageExtractor()
26
- const resourceExtractor = new ResourceExtractor(["a", "meta", "link", "embed"])
27
- const pagesFetched: PageResponse[] = await pageFetcher.fetchAll(urls);
28
- const metadataPages: Page[] = []
29
- for (const page of pagesFetched) {
30
- // check if page has an error
31
- if (isJSDOM(page)) {
32
- const resources = await resourceExtractor.extract(page);
33
- const descriptor = await pageExtractor.extract(page);
34
- metadataPages.push({
35
- ...descriptor, resources
36
- });
37
- } else metadataPages.push(page);
38
- }
39
- await printer.print(...metadataPages);
40
- } catch (error) {
41
- await printer.print({
42
- error: error.message
43
- })
44
- }
45
- })
46
- .parseAsync(process.argv);
47
- })();
1
+ #!/usr/bin/env node
2
+ import { Command, createArgument } from "commander";
3
+
4
+ import { description, name, version } from '../package.json';
5
+ import { PageExtractor } from "./extractors/PageExtractor";
6
+ import { ResourceExtractor } from "./extractors/ResourceExtractor";
7
+ import { PageFetcher } from "./page/PageFetcher";
8
+ import type { Page, PageMetadata } from "./page/Page";
9
+ import { JSONStylePrinter } from "./printers/JSONStylePrinter";
10
+ import { LogStylePrinter } from "./printers/LogStylePrinter";
11
+
12
+ const program = new Command();
13
+
14
+ const url = createArgument("<url | file...>", "remote https://URL or local file://resource.html to extract from");
15
+
16
+ (async () => {
17
+ await program
18
+ .name(name)
19
+ .version(version, "-v, --version")
20
+ .description(description)
21
+ .addArgument(url)
22
+ .action(async (urls: string[]) => {
23
+ const printer = new JSONStylePrinter();
24
+ // simple log style printer
25
+ // const printer = new LogStylePrinter();
26
+
27
+ const pageFetcher = new PageFetcher()
28
+ const pageExtractor = new PageExtractor()
29
+ const resourceExtractor = new ResourceExtractor(["a", "meta", "link", "embed"])
30
+
31
+ const pageResponses = await pageFetcher.fetchAll(urls);
32
+ const pageMetadatas: PageMetadata[] = [];
33
+
34
+ for (const { content, url, error } of pageResponses) {
35
+ const resources = error in (content) ? [] : await resourceExtractor.extract(content);
36
+ const descriptor = error in content ? { url, error } : await pageExtractor.extract(content);
37
+ pageMetadatas.push({ ...descriptor, resources });
38
+ }
39
+
40
+ await printer.print(...pageMetadatas);
41
+ })
42
+ .parseAsync(process.argv);
43
+ })();
package/src/page/Page.ts CHANGED
@@ -1,3 +1,19 @@
1
- import { JSDOM } from 'jsdom';
2
-
3
- export type Page = JSDOM
1
+ import type { ExternalResource } from "../resource";
2
+
3
+ type hasTitle = {
4
+ title: string;
5
+ };
6
+
7
+ type hasUrl = {
8
+ url: string;
9
+ };
10
+
11
+ type hasResources = {
12
+ resources: ExternalResource[];
13
+ };
14
+
15
+ export type Page = hasTitle & hasUrl
16
+ export type PageMetadata = (Page & hasResources) | { error: string }
17
+ export const isError = (page: PageMetadata): page is { error: string } => 'error' in page;
18
+ export const isPage = (page: any): page is Page =>
19
+ "resources" in page && Array.isArray(page.resources);
package/src/page/PageFetcher.ts CHANGED
@@ -1,25 +1,30 @@
1
- import { JSDOM, VirtualConsole } from 'jsdom';
2
- import type { Page } from './PageMetadata';
3
-
4
- export type PageResponse = JSDOM | Page
5
-
6
- export const isJSDOM = (page: PageResponse): page is JSDOM => 'window' in page;
7
-
8
- export class PageFetcher {
9
- async fetchPage(url: string): Promise<PageResponse> {
10
- try {
11
- const dom = await JSDOM.fromURL(url, {
12
- virtualConsole: new VirtualConsole().on('jsdomError', (error) => {
13
- console.error(`Error parsing ${url}:`, error.message);
14
- })
15
- });
16
- return dom;
17
- } catch (error) {
18
- throw error;
19
- }
20
- }
21
- async fetchAll(urls: string[]): Promise<PageResponse[]> {
22
- return await Promise.all(urls.map(url => this.fetchPage(url)));
23
- }
24
- constructor() { }
25
- }
1
+ import { JSDOM, VirtualConsole } from 'jsdom';
2
+ import type { Page, PageMetadata } from './Page';
3
+
4
+ interface PageResponse {
5
+ url: string;
6
+ content?: JSDOM;
7
+ error?: string;
8
+ }
9
+
10
+ export class PageFetcher {
11
+ private async fetchPage(url: string): Promise<PageResponse> {
12
+ let dom: Promise<JSDOM>;
13
+ const virtualConsole = new VirtualConsole().on('jsdomError', (error) => {
14
+ process.stderr.write(`Error parsing ${url}:${error.message}\n`);
15
+ });
16
+ if (url.startsWith("file://")) {
17
+ dom = JSDOM.fromFile(url, { virtualConsole });
18
+ } else {
19
+ dom = JSDOM.fromURL(url, { virtualConsole });
20
+ }
21
+
22
+ return dom.then(content => ({ url, content }))
23
+ .catch(({ message }) => ({ url, error: `JSDOM failed to parse: ${message}` }));
24
+ }
25
+ async fetchAll(urls: string[]): Promise<PageResponse[]> {
26
+ const responses = await Promise.all(urls.map(url => this.fetchPage(url)));
27
+ return responses.filter(response => response.content !== undefined);
28
+ }
29
+
30
+ }
package/src/printers/AbstractResourcePrinter.ts CHANGED
@@ -1,6 +1,6 @@
1
- import type { Page } from "../page/PageMetadata";
2
-
3
- export abstract class AbstractResourcePrinter {
4
- constructor() { }
5
- abstract print(...pages: Page[]): void | Promise<void>;
6
- }
1
+ import type { Page, PageMetadata } from "../page/Page";
2
+
3
+ export abstract class AbstractResourcePrinter {
4
+ constructor() { }
5
+ abstract print(...pages: PageMetadata[]): void | Promise<void>;
6
+ }
package/src/printers/JSONStylePrinter.ts CHANGED
@@ -1,12 +1,12 @@
1
- import type { Page } from "../page/PageMetadata";
2
- import { AbstractResourcePrinter } from "./AbstractResourcePrinter";
3
-
4
-
5
- export class JSONStylePrinter extends AbstractResourcePrinter {
6
- print(...pages: Page[]): void | Promise<void> {
7
- const json = JSON.stringify(pages);
8
- process.stdout.write(json + "\n")
9
- }
10
-
11
-
12
- }
1
+ import type { PageMetadata } from "../page/Page";
2
+ import { AbstractResourcePrinter } from "./AbstractResourcePrinter";
3
+
4
+
5
+ export class JSONStylePrinter extends AbstractResourcePrinter {
6
+ print(...pages: PageMetadata[]): void | Promise<void> {
7
+ const json = JSON.stringify(pages);
8
+ process.stdout.write(json + "\n")
9
+ }
10
+
11
+
12
+ }
package/src/printers/LogStylePrinter.ts CHANGED
@@ -1,23 +1,28 @@
1
- import type { PageMetadata } from '../page/PageMetadata';
2
- import { AbstractResourcePrinter } from './AbstractResourcePrinter';
3
-
4
- export class LogStylePrinter extends AbstractResourcePrinter {
5
-
6
- write(str: string): void {
7
- process.stdout.write(str)
8
- }
9
-
10
- async print(...pages: PageMetadata[]): Promise<void> {
11
- for (const page of pages) {
12
- const { resources, title, url } = page
13
-
14
- this.write(`Title: ${title}\n`)
15
- this.write(`URL: ${url}\n\n`)
16
-
17
- for (const resource of resources) {
18
- const { link: { url }, text: { value } } = resource
19
- this.write(`${value}: ${url}\n`)
20
- }
21
- }
22
- }
23
- }
1
+ import { isPage, type Page, type PageMetadata } from '../page/Page';
2
+ import { AbstractResourcePrinter } from './AbstractResourcePrinter';
3
+
4
+ export class LogStylePrinter extends AbstractResourcePrinter {
5
+
6
+ write(str: string): void {
7
+ process.stdout.write(str)
8
+ }
9
+
10
+ async print(...pages: PageMetadata[]): Promise<void> {
11
+ for (const page of pages) {
12
+ if (!isPage(page)) {
13
+ this.write(page.error)
14
+ continue
15
+ }
16
+
17
+ const {resources, title, url } = page
18
+
19
+ this.write(`Title: ${title}\n`)
20
+ this.write(`URL: ${url}\n\n`)
21
+
22
+ for (const resource of resources) {
23
+ const { link: { url }, text: { value } } = resource
24
+ this.write(`${value}: ${url}\n`)
25
+ }
26
+ }
27
+ }
28
+ }
package/src/resource.ts CHANGED
@@ -1,96 +1,96 @@
1
- /**
2
- * @license MIT
3
- * We are interested in visualising a page as a collection of tags.
4
- *
5
- * We wish to work with tags that can be compactly previewed on a webpage.
6
- * Here we must declare all of the element types that can be used to represent
7
- * a resource that can be hyperlinked off a webpage.
8
- */
9
-
10
- type Tags = HTMLElementTagNameMap
11
-
12
- function findDefinedKey(element: Resource, keys: LinkKey[]): LinkKey | undefined {
13
- for (const key of keys) {
14
- if (isKeyDefined(key, element)) {
15
- return key;
16
- }
17
- }
18
- }
19
-
20
- export const RESOURCE_DISPLAYABLE_KEYS = [
21
- 'id',
22
- 'innerText',
23
- 'textContent',
24
- 'class',
25
- 'ariaLabel',
26
- 'ariaDescription',
27
- 'alt',
28
- 'rel'
29
- ] as const;
30
-
31
- export type DisplayableKey = (typeof RESOURCE_DISPLAYABLE_KEYS)[number];
32
-
33
- export type ResourceKey = {
34
- key: DisplayableKey;
35
- value: string;
36
- };
37
-
38
- export const RESOURCE_LINK_KEYS = [
39
- "href",
40
- "data-src",
41
- "target",
42
- "action",
43
- "src",
44
- "url"
45
- ] as const;
46
-
47
- export type LinkKey = typeof RESOURCE_LINK_KEYS[number];
48
-
49
- export type ResourceLink = {
50
- key: LinkKey;
51
- url: string;
52
- }
53
-
54
- export function findResourceText(element: Resource): ResourceKey | undefined {
55
- for (const key of RESOURCE_DISPLAYABLE_KEYS) {
56
- const value = element[key]
57
- if (value && typeof value === 'string' && value.trim() !== '')
58
- return { key, value };
59
- }
60
- }
61
-
62
- export function findResourceLink(element: Resource): ResourceLink | undefined {
63
- const key = findDefinedKey(element, [...RESOURCE_LINK_KEYS]);
64
- const url = element[key];
65
- if (url && typeof url === 'string' && url.trim() !== '')
66
- return { key, url };
67
- }
68
-
69
- export type ExternalResource = {
70
- text: ResourceKey;
71
- link: ResourceLink;
72
- };
73
-
74
- export const isResourceKey = (key: string): key is LinkKey => key in RESOURCE_LINK_KEYS;
75
-
76
- export const isKeyDefined = <E extends Tags[keyof Tags]>(key: string, element: E): boolean =>
77
- key in element && element[key] !== undefined;
78
-
79
- export type ResourceElement<T, U> = {
80
- [K in keyof T]: U extends keyof T[K] ? T[K] : never
81
- }[keyof T];
82
-
83
- export type Tag = keyof Tags
84
-
85
- export type Resource = ResourceElement<Tags, (typeof RESOURCE_LINK_KEYS)[number]>;
86
-
87
- export type ResourceByName<T extends keyof Tags> = Tags[T]
88
-
89
- /** tests **/
90
-
91
- type test1 = HTMLAnchorElement extends Resource ? true : false // true
92
- type test2 = HTMLImageElement extends Resource ? true : false // true
93
- type test3 = HTMLDivElement extends Resource ? true : false // false
94
-
95
- type test4 = ResourceElement<Tags, "src">
96
-
1
+ /**
2
+ * @license MIT
3
+ * We are interested in visualising a page as a collection of tags.
4
+ *
5
+ * We wish to work with tags that can be compactly previewed on a webpage.
6
+ * Here we must declare all of the element types that can be used to represent
7
+ * a resource that can be hyperlinked off a webpage.
8
+ */
9
+
10
+ type Tags = HTMLElementTagNameMap
11
+
12
+ function findDefinedKey(element: Resource, keys: LinkKey[]): LinkKey | undefined {
13
+ for (const key of keys) {
14
+ if (isKeyDefined(key, element)) {
15
+ return key;
16
+ }
17
+ }
18
+ }
19
+
20
+ export const RESOURCE_DISPLAYABLE_KEYS = [
21
+ 'id',
22
+ 'innerText',
23
+ 'textContent',
24
+ 'class',
25
+ 'ariaLabel',
26
+ 'ariaDescription',
27
+ 'alt',
28
+ 'rel'
29
+ ] as const;
30
+
31
+ export type DisplayableKey = (typeof RESOURCE_DISPLAYABLE_KEYS)[number];
32
+
33
+ export type ResourceKey = {
34
+ key: DisplayableKey;
35
+ value: string;
36
+ };
37
+
38
+ export const RESOURCE_LINK_KEYS = [
39
+ "href",
40
+ "data-src",
41
+ "target",
42
+ "action",
43
+ "src",
44
+ "url"
45
+ ] as const;
46
+
47
+ export type LinkKey = typeof RESOURCE_LINK_KEYS[number];
48
+
49
+ export type ResourceLink = {
50
+ key: LinkKey;
51
+ url: string;
52
+ }
53
+
54
+ export function findResourceText(element: Resource): ResourceKey | undefined {
55
+ for (const key of RESOURCE_DISPLAYABLE_KEYS) {
56
+ const value = element[key]
57
+ if (value && typeof value === 'string' && value.trim() !== '')
58
+ return { key, value };
59
+ }
60
+ }
61
+
62
+ export function findResourceLink(element: Resource): ResourceLink | undefined {
63
+ const key = findDefinedKey(element, [...RESOURCE_LINK_KEYS]);
64
+ const url = element[key];
65
+ if (url && typeof url === 'string' && url.trim() !== '')
66
+ return { key, url };
67
+ }
68
+
69
+ export type ExternalResource = {
70
+ text: ResourceKey;
71
+ link: ResourceLink;
72
+ };
73
+
74
+ export const isResourceKey = (key: string): key is LinkKey => key in RESOURCE_LINK_KEYS;
75
+
76
+ export const isKeyDefined = <E extends Tags[keyof Tags]>(key: string, element: E): boolean =>
77
+ key in element && element[key] !== undefined;
78
+
79
+ export type ResourceElement<T, U> = {
80
+ [K in keyof T]: U extends keyof T[K] ? T[K] : never
81
+ }[keyof T];
82
+
83
+ export type Tag = keyof Tags
84
+
85
+ export type Resource = ResourceElement<Tags, (typeof RESOURCE_LINK_KEYS)[number]>;
86
+
87
+ export type ResourceByName<T extends keyof Tags> = Tags[T]
88
+
89
+ /** tests **/
90
+
91
+ type test1 = HTMLAnchorElement extends Resource ? true : false // true
92
+ type test2 = HTMLImageElement extends Resource ? true : false // true
93
+ type test3 = HTMLDivElement extends Resource ? true : false // false
94
+
95
+ type test4 = ResourceElement<Tags, "src">
96
+
package/tsconfig.json CHANGED
@@ -1,12 +1,12 @@
1
- {
2
- "compilerOptions": {
3
- "module": "NodeNext",
4
- "target": "ESNext",
5
- "resolveJsonModule": true,
6
- "outDir": "dist",
7
- "sourceMap": true,
8
- },
9
- "include": [
10
- "src/**.*",
11
- ],
1
+ {
2
+ "compilerOptions": {
3
+ "module": "NodeNext",
4
+ "target": "ESNext",
5
+ "resolveJsonModule": true,
6
+ "outDir": "bin",
7
+ "sourceMap": true,
8
+ },
9
+ "include": [
10
+ "src/**.*",
11
+ ],
12
12
  }
package/.vscode/launch.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- // Use IntelliSense to learn about possible attributes.
3
- // Hover to view descriptions of existing attributes.
4
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
- "version": "0.2.0",
6
- "configurations": [
7
- {
8
- "type": "node",
9
- "args": ["https://google.com/"],
10
- "request": "launch",
11
- "name": "Launch Program",
12
- "skipFiles": [
13
- "<node_internals>/**"
14
- ],
15
- "program": "${workspaceFolder}/src/main.ts",
16
- "preLaunchTask": "tsc: build - tsconfig.json",
17
- "outFiles": [
18
- "${workspaceFolder}/dist/**/*.js"
19
- ]
20
- }
21
- ]
22
- }
package/src/page/PageMetadata.ts DELETED
@@ -1,23 +0,0 @@
1
- import type { ExternalResource } from "../resource";
2
- import type { PageResponse } from "./PageFetcher";
3
-
4
- type hasTitle = {
5
- title: string;
6
- };
7
-
8
- type hasUrl = {
9
- url: string;
10
- };
11
-
12
- export type PageDescriptor = hasTitle & hasUrl
13
-
14
- export type PageMetadata = {
15
- resources: ExternalResource[];
16
- }
17
-
18
- export type Page = (PageDescriptor & PageMetadata) | {
19
- error: Error
20
- }
21
-
22
- export const isPage = (page: any): page is Page =>
23
- "resources" in page && Array.isArray(page.resources);