pagerts 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -16
- package/bin/main.js +14 -14
- package/bin/main.js.map +4 -4
- package/bin/package.json +40 -0
- package/bin/src/extractors/AbstractExtractor.js +11 -0
- package/bin/src/extractors/AbstractExtractor.js.map +1 -0
- package/bin/src/extractors/PageExtractor.js +13 -0
- package/bin/src/extractors/PageExtractor.js.map +1 -0
- package/bin/src/extractors/ResourceExtractor.js +32 -0
- package/bin/src/extractors/ResourceExtractor.js.map +1 -0
- package/bin/src/main.js +36 -0
- package/bin/src/main.js.map +1 -0
- package/bin/src/page/Page.js +8 -0
- package/bin/src/page/Page.js.map +1 -0
- package/bin/src/page/PageFetcher.js +26 -0
- package/bin/src/page/PageFetcher.js.map +1 -0
- package/bin/src/printers/AbstractResourcePrinter.js +8 -0
- package/bin/src/printers/AbstractResourcePrinter.js.map +1 -0
- package/bin/src/printers/JSONStylePrinter.js +12 -0
- package/bin/src/printers/JSONStylePrinter.js.map +1 -0
- package/bin/src/printers/LogStylePrinter.js +27 -0
- package/bin/src/printers/LogStylePrinter.js.map +1 -0
- package/bin/src/resource.js +56 -0
- package/bin/src/resource.js.map +1 -0
- package/package.json +2 -2
- package/src/extractors/PageExtractor.ts +3 -12
- package/src/main.ts +10 -15
- package/src/page/Page.ts +0 -1
- package/src/page/PageFetcher.ts +20 -14
package/bin/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pagerts",
|
|
3
|
+
"description": "A tool for viewing external relations in a webpage",
|
|
4
|
+
"version": "0.1.9",
|
|
5
|
+
"main": "main.js",
|
|
6
|
+
"bin": {
|
|
7
|
+
"pagerts": "bin/main.js"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"test": "jest",
|
|
11
|
+
"build": "esbuild src/main.ts --external:jsdom --bundle --outdir=bin --minify --sourcemap --platform=node",
|
|
12
|
+
"lint": "tsc",
|
|
13
|
+
"start": "node ./bin/main.js",
|
|
14
|
+
"dev": "npx tsx src/main.ts"
|
|
15
|
+
},
|
|
16
|
+
"keywords": [
|
|
17
|
+
"webpage",
|
|
18
|
+
"hierarchy",
|
|
19
|
+
"management"
|
|
20
|
+
],
|
|
21
|
+
"author": "Kirill kn253 Nevzorov",
|
|
22
|
+
"license": "MIT",
|
|
23
|
+
"bugs": {
|
|
24
|
+
"url": "https://github.com/akinevz0/pagerts/issues"
|
|
25
|
+
},
|
|
26
|
+
"homepage": "https://github.com/akinevz0/pagerts",
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"blessed": "^0.1.81",
|
|
29
|
+
"commander": "^12.1.0",
|
|
30
|
+
"dotenv": "^16.4.5",
|
|
31
|
+
"jsdom": "^26.0.0"
|
|
32
|
+
},
|
|
33
|
+
"devDependencies": {
|
|
34
|
+
"@types/blessed": "^0.1.25",
|
|
35
|
+
"@types/jsdom": "^21.1.7",
|
|
36
|
+
"@types/node": "^22.8.2",
|
|
37
|
+
"esbuild": "^0.25.1",
|
|
38
|
+
"ts-node": "^10.9.2"
|
|
39
|
+
}
|
|
40
|
+
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.AbstractExtractor = void 0;
|
|
4
|
+
/**
 * Base class shared by the extractors in this package.
 * It only records the label handed over by the concrete subclass.
 */
class AbstractExtractor {
    /** Label supplied at construction time. */
    name;

    /**
     * @param name label stored on the instance as `name`.
     */
    constructor(name) {
        this.name = name;
    }
}
|
|
10
|
+
exports.AbstractExtractor = AbstractExtractor;
|
|
11
|
+
//# sourceMappingURL=AbstractExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AbstractExtractor.js","sourceRoot":"","sources":["../../../src/extractors/AbstractExtractor.ts"],"names":[],"mappings":";;;AACA,MAAsB,iBAAiB;IACd;IAArB,YAAqB,IAAW;QAAX,SAAI,GAAJ,IAAI,CAAO;IAAI,CAAC;CAExC;AAHD,8CAGC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.PageExtractor = void 0;
|
|
4
|
+
const AbstractExtractor_1 = require("./AbstractExtractor");
|
|
5
|
+
/**
 * Extractor that reads a document's title and address.
 */
class PageExtractor extends AbstractExtractor_1.AbstractExtractor {
    constructor() {
        super("page-extractor");
    }

    /**
     * Reads `window.document.title` and `window.document.location.href`
     * from the given value.
     *
     * @param value object exposing `window.document` (a JSDOM instance in main.js).
     * @returns promise resolving to `{ title, url }`.
     */
    async extract(value) {
        const doc = value.window.document;
        const title = doc.title;
        const url = doc.location.href;
        return { title, url };
    }
}
|
|
12
|
+
exports.PageExtractor = PageExtractor;
|
|
13
|
+
//# sourceMappingURL=PageExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PageExtractor.js","sourceRoot":"","sources":["../../../src/extractors/PageExtractor.ts"],"names":[],"mappings":";;;AAEA,2DAAwD;AAExD,MAAa,aAAc,SAAQ,qCAA8B;IAC7D,gBAAgB,KAAK,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAE1C,KAAK,CAAC,OAAO,CAAC,KAAY;QACtB,MAAM,EAAE,MAAM,EAAE,EAAE,QAAQ,EAAE,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,EAAE,GAAG,KAAK,CAAA;QAC1E,OAAO,EAAE,KAAK,EAAE,GAAG,EAAE,CAAA;IACzB,CAAC;CACJ;AAPD,sCAOC"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ResourceExtractor = void 0;
|
|
4
|
+
const resource_1 = require("../resource");
|
|
5
|
+
const AbstractExtractor_1 = require("./AbstractExtractor");
|
|
6
|
+
/**
 * Extractor that collects externally linked resources from a document.
 *
 * For every configured tag name it queries the document, resolves a display
 * text and a link for each matching element, and keeps the pairs whose link
 * is an absolute http(s) URL.
 */
class ResourceExtractor extends AbstractExtractor_1.AbstractExtractor {
    /** Tag names / selectors passed to `document.querySelectorAll`. */
    tags;

    /**
     * @param tags selectors to scan (main.js passes ["a", "meta", "link", "embed"]).
     */
    constructor(tags) {
        // The original passed the copy-pasted label "page-extractor" here,
        // which made the two extractors indistinguishable by name.
        super("resource-extractor");
        this.tags = tags;
    }

    /**
     * @param value object exposing `window.document` (a JSDOM instance in main.js).
     * @returns promise resolving to an array of `{ text, link }` entries.
     */
    async extract(value) {
        const { document } = value.window;
        const externalResources = [];
        for (const tag of this.tags) {
            const elements = Array.from(document.querySelectorAll(tag));
            for (const element of elements) {
                const text = (0, resource_1.findResourceText)(element);
                const link = (0, resource_1.findResourceLink)(element);
                // Both a label and a link are required for an entry.
                if (!text || !link)
                    continue;
                // Require a real http:// or https:// scheme; a bare "http"
                // prefix test would also accept strings such as "httpfoo".
                if (!/^https?:\/\//i.test(link.url))
                    continue;
                externalResources.push({ text, link });
            }
        }
        return externalResources;
    }
}
|
|
31
|
+
exports.ResourceExtractor = ResourceExtractor;
|
|
32
|
+
//# sourceMappingURL=ResourceExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ResourceExtractor.js","sourceRoot":"","sources":["../../../src/extractors/ResourceExtractor.ts"],"names":[],"mappings":";;;AACA,0CAAiH;AACjH,2DAAwD;AAExD,MAAa,iBAAkB,SAAQ,qCAA4C;IAClD;IAA7B,YAA6B,IAAW;QACpC,KAAK,CAAC,gBAAgB,CAAC,CAAC;QADC,SAAI,GAAJ,IAAI,CAAO;IAExC,CAAC;IACD,KAAK,CAAC,OAAO,CAAC,KAAY;QACtB,MAAM,EAAE,QAAQ,EAAE,GAAG,KAAK,CAAC,MAAM,CAAC;QAClC,MAAM,iBAAiB,GAAuB,EAAE,CAAC;QACjD,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,QAAQ,CAAC,gBAAgB,CAAW,GAAG,CAAC,CAAA;YACzD,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACrC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;gBAC7B,MAAM,IAAI,GAAG,IAAA,2BAAgB,EAAC,OAAO,CAAC,CAAC;gBACvC,MAAM,IAAI,GAAG,IAAA,2BAAgB,EAAC,OAAO,CAAC,CAAC;gBACvC,IAAG,CAAC,IAAI,IAAI,CAAC,IAAI;oBAAE,SAAQ;gBAC3B,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC;oBAAE,SAAQ;gBAC1C,iBAAiB,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAA;YAC1C,CAAC;QACL,CAAC;QACD,OAAO,iBAAiB,CAAC;IAC7B,CAAC;CACJ;AApBD,8CAoBC"}
|
package/bin/src/main.js
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
const commander_1 = require("commander");
|
|
5
|
+
const package_json_1 = require("../package.json");
|
|
6
|
+
const PageExtractor_1 = require("./extractors/PageExtractor");
|
|
7
|
+
const ResourceExtractor_1 = require("./extractors/ResourceExtractor");
|
|
8
|
+
const PageFetcher_1 = require("./page/PageFetcher");
|
|
9
|
+
const JSONStylePrinter_1 = require("./printers/JSONStylePrinter");
|
|
10
|
+
const program = new commander_1.Command();
// Positional argument; the action below receives its values as `urls`.
const url = (0, commander_1.createArgument)("<url | file...>", "remote https://URL or local file://resource.html to extract from");
// Entry point: configure the CLI, fetch every requested page, extract its
// metadata and resources, then print the collected results.
(async () => {
    await program
        .name(package_json_1.name)
        .version(package_json_1.version, "-v, --version")
        .description(package_json_1.description)
        .addArgument(url)
        .action(async (urls) => {
        const printer = new JSONStylePrinter_1.JSONStylePrinter();
        // simple log style printer
        // const printer = new LogStylePrinter();
        const pageFetcher = new PageFetcher_1.PageFetcher();
        const pageExtractor = new PageExtractor_1.PageExtractor();
        const resourceExtractor = new ResourceExtractor_1.ResourceExtractor(["a", "meta", "link", "embed"]);
        const pageResponses = await pageFetcher.fetchAll(urls);
        const pageMetadatas = [];
        for (const { content, url, error } of pageResponses) {
            // The original tested `error in content`, which uses the error
            // *value* as a property key and throws a TypeError when `content`
            // is undefined. Decide on the presence of content instead.
            if (content === undefined) {
                pageMetadatas.push({ url, error, resources: [] });
                continue;
            }
            const resources = await resourceExtractor.extract(content);
            const descriptor = await pageExtractor.extract(content);
            pageMetadatas.push({ ...descriptor, resources });
        }
        await printer.print(...pageMetadatas);
    })
        .parseAsync(process.argv);
})().catch((reason) => {
    // Report failures on stderr and signal them through the exit code rather
    // than leaving the promise rejection unhandled.
    const message = reason instanceof Error ? reason.message : String(reason);
    process.stderr.write(`${package_json_1.name}: ${message}\n`);
    process.exitCode = 1;
});
|
|
36
|
+
//# sourceMappingURL=main.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"main.js","sourceRoot":"","sources":["../../src/main.ts"],"names":[],"mappings":";;;AACA,yCAAoD;AAEpD,kDAA6D;AAC7D,8DAA2D;AAC3D,sEAAmE;AACnE,oDAAiD;AAEjD,kEAA+D;AAG/D,MAAM,OAAO,GAAG,IAAI,mBAAO,EAAE,CAAC;AAE9B,MAAM,GAAG,GAAG,IAAA,0BAAc,EAAC,iBAAiB,EAAE,kEAAkE,CAAC,CAAC;AAElH,CAAC,KAAK,IAAI,EAAE;IACV,MAAM,OAAO;SACV,IAAI,CAAC,mBAAI,CAAC;SACV,OAAO,CAAC,sBAAO,EAAE,eAAe,CAAC;SACjC,WAAW,CAAC,0BAAW,CAAC;SACxB,WAAW,CAAC,GAAG,CAAC;SAChB,MAAM,CAAC,KAAK,EAAE,IAAc,EAAE,EAAE;QAC/B,MAAM,OAAO,GAAG,IAAI,mCAAgB,EAAE,CAAC;QACvC,2BAA2B;QAC3B,yCAAyC;QAEzC,MAAM,WAAW,GAAG,IAAI,yBAAW,EAAE,CAAA;QACrC,MAAM,aAAa,GAAG,IAAI,6BAAa,EAAE,CAAA;QACzC,MAAM,iBAAiB,GAAG,IAAI,qCAAiB,CAAC,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAA;QAE/E,MAAM,aAAa,GAAG,MAAM,WAAW,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QACvD,MAAM,aAAa,GAAmB,EAAE,CAAC;QAEzC,KAAK,MAAM,EAAE,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,aAAa,EAAE,CAAC;YACpD,MAAM,SAAS,GAAG,KAAK,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,iBAAiB,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YACrF,MAAM,UAAU,GAAG,KAAK,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC,MAAM,aAAa,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAC5F,aAAa,CAAC,IAAI,CAAC,EAAE,GAAG,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC;QACnD,CAAC;QAED,MAAM,OAAO,CAAC,KAAK,CAAC,GAAG,aAAa,CAAC,CAAC;IACxC,CAAC,CAAC;SACD,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC,CAAC,EAAE,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.isPage = exports.isError = void 0;
|
|
4
|
+
/** Guard: true when the given object has an `error` property (own or inherited). */
const isError = (page) => {
    return 'error' in page;
};
|
|
5
|
+
exports.isError = isError;
|
|
6
|
+
/** Guard: true when the given object has a `resources` property holding an array. */
const isPage = (page) => {
    if (!("resources" in page)) {
        return false;
    }
    return Array.isArray(page.resources);
};
|
|
7
|
+
exports.isPage = isPage;
|
|
8
|
+
//# sourceMappingURL=Page.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"Page.js","sourceRoot":"","sources":["../../../src/page/Page.ts"],"names":[],"mappings":";;;AAgBO,MAAM,OAAO,GAAG,CAAC,IAAkB,EAA6B,EAAE,CAAC,OAAO,IAAI,IAAI,CAAC;AAA7E,QAAA,OAAO,WAAsE;AACnF,MAAM,MAAM,GAAG,CAAC,IAAS,EAAgB,EAAE,CAC9C,WAAW,IAAI,IAAI,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;AAD5C,QAAA,MAAM,UACsC"}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.PageFetcher = void 0;
|
|
4
|
+
const jsdom_1 = require("jsdom");
|
|
5
|
+
/**
 * Loads pages into JSDOM, either from a local file (`file://…`) or from a
 * remote URL, and reports the outcome as `{ url, content }` or `{ url, error }`.
 */
class PageFetcher {
    /**
     * Converts a `file://` argument into a filesystem path.
     *
     * `JSDOM.fromFile` expects a path, not a URL string, so the scheme has to
     * be removed before the call. Well-formed file URLs are decoded with
     * `fileURLToPath`; anything it rejects (for example the relative form
     * `file://page.html`) falls back to the text after the `file://` prefix.
     *
     * @param fileUrl string starting with "file://".
     * @returns path to hand to `JSDOM.fromFile`.
     */
    static toLocalPath(fileUrl) {
        const { fileURLToPath } = require("node:url");
        try {
            return fileURLToPath(fileUrl);
        }
        catch {
            return fileUrl.slice("file://".length);
        }
    }

    /**
     * Fetches and parses a single page.
     *
     * @param url remote URL or `file://` reference.
     * @returns promise that always resolves: `{ url, content }` on success,
     *          `{ url, error }` when loading or parsing failed.
     */
    async fetchPage(url) {
        // jsdom reports in-page errors through the virtual console; forward
        // them to stderr so they do not mix with the printed results.
        const virtualConsole = new jsdom_1.VirtualConsole().on('jsdomError', (error) => {
            process.stderr.write(`Error parsing ${url}:${error.message}\n`);
        });
        try {
            const content = url.startsWith("file://")
                ? await jsdom_1.JSDOM.fromFile(PageFetcher.toLocalPath(url), { virtualConsole })
                : await jsdom_1.JSDOM.fromURL(url, { virtualConsole });
            return { url, content };
        }
        catch (reason) {
            // Rejections are not guaranteed to be Error objects.
            const message = reason instanceof Error ? reason.message : String(reason);
            return { url, error: `JSDOM failed to parse: ${message}` };
        }
    }

    /**
     * Fetches all pages concurrently.
     *
     * @param urls list of remote URLs / `file://` references.
     * @returns promise of the responses that carry parsed content. Failed
     *          responses are still left out of the result, but each one is
     *          now reported on stderr instead of disappearing silently.
     */
    async fetchAll(urls) {
        const responses = await Promise.all(urls.map(url => this.fetchPage(url)));
        const fetched = [];
        for (const response of responses) {
            if (response.content !== undefined) {
                fetched.push(response);
                continue;
            }
            process.stderr.write(`Skipping ${response.url}: ${response.error}\n`);
        }
        return fetched;
    }
}
|
|
25
|
+
exports.PageFetcher = PageFetcher;
|
|
26
|
+
//# sourceMappingURL=PageFetcher.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PageFetcher.js","sourceRoot":"","sources":["../../../src/page/PageFetcher.ts"],"names":[],"mappings":";;;AAAA,iCAA8C;AAS9C,MAAa,WAAW;IACZ,KAAK,CAAC,SAAS,CAAC,GAAW;QAC/B,IAAI,GAAmB,CAAC;QACxB,MAAM,cAAc,GAAG,IAAI,sBAAc,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,KAAK,EAAE,EAAE;YACnE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,iBAAiB,GAAG,IAAI,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;QACpE,CAAC,CAAC,CAAC;QACH,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,EAAE,CAAC;YAC5B,GAAG,GAAG,aAAK,CAAC,QAAQ,CAAC,GAAG,EAAE,EAAE,cAAc,EAAE,CAAC,CAAC;QAClD,CAAC;aAAM,CAAC;YACJ,GAAG,GAAG,aAAK,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,cAAc,EAAE,CAAC,CAAC;QACjD,CAAC;QAED,OAAO,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;aACzC,KAAK,CAAC,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,0BAA0B,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;IACvF,CAAC;IACD,KAAK,CAAC,QAAQ,CAAC,IAAc;QACzB,MAAM,SAAS,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QAC1E,OAAO,SAAS,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC,QAAQ,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC;IACxE,CAAC;CAEJ;AApBD,kCAoBC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.AbstractResourcePrinter = void 0;
|
|
4
|
+
/**
 * Common base for the printers in this package.
 * It carries no state and defines no behaviour of its own.
 */
class AbstractResourcePrinter {
}
|
|
7
|
+
exports.AbstractResourcePrinter = AbstractResourcePrinter;
|
|
8
|
+
//# sourceMappingURL=AbstractResourcePrinter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AbstractResourcePrinter.js","sourceRoot":"","sources":["../../../src/printers/AbstractResourcePrinter.ts"],"names":[],"mappings":";;;AAEA,MAAsB,uBAAuB;IACzC,gBAAiB,CAAC;CAErB;AAHD,0DAGC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.JSONStylePrinter = void 0;
|
|
4
|
+
const AbstractResourcePrinter_1 = require("./AbstractResourcePrinter");
|
|
5
|
+
/**
 * Printer that emits all pages as one JSON array on a single stdout line.
 */
class JSONStylePrinter extends AbstractResourcePrinter_1.AbstractResourcePrinter {
    /**
     * @param pages page entries to serialise; they are written together as one array.
     */
    print(...pages) {
        const serialised = JSON.stringify(pages);
        process.stdout.write(`${serialised}\n`);
    }
}
|
|
11
|
+
exports.JSONStylePrinter = JSONStylePrinter;
|
|
12
|
+
//# sourceMappingURL=JSONStylePrinter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"JSONStylePrinter.js","sourceRoot":"","sources":["../../../src/printers/JSONStylePrinter.ts"],"names":[],"mappings":";;;AACA,uEAAoE;AAGpE,MAAa,gBAAiB,SAAQ,iDAAuB;IACzD,KAAK,CAAC,GAAG,KAAqB;QAC1B,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QACnC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,CAAA;IACrC,CAAC;CAGJ;AAPD,4CAOC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.LogStylePrinter = void 0;
|
|
4
|
+
const Page_1 = require("../page/Page");
|
|
5
|
+
const AbstractResourcePrinter_1 = require("./AbstractResourcePrinter");
|
|
6
|
+
/**
 * Printer that writes pages to stdout as plain text: a title line, a URL
 * line, then one "text: url" line per resource.
 */
class LogStylePrinter extends AbstractResourcePrinter_1.AbstractResourcePrinter {
    /**
     * @param str text written verbatim to stdout.
     */
    write(str) {
        process.stdout.write(str);
    }

    /**
     * @param pages page entries to print, in order.
     */
    async print(...pages) {
        for (const page of pages) {
            // Check for an error first: an entry may carry both `error` and a
            // `resources` array, in which case `isPage` alone would accept it
            // and print "Title: undefined".
            if ((0, Page_1.isError)(page) || !(0, Page_1.isPage)(page)) {
                // End the line so the next page's output does not run into the message.
                this.write(`${page.error}\n`);
                continue;
            }
            const { resources, title, url } = page;
            this.write(`Title: ${title}\n`);
            this.write(`URL: ${url}\n\n`);
            for (const resource of resources) {
                const { link: { url }, text: { value } } = resource;
                this.write(`${value}: ${url}\n`);
            }
        }
    }
}
|
|
26
|
+
exports.LogStylePrinter = LogStylePrinter;
|
|
27
|
+
//# sourceMappingURL=LogStylePrinter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"LogStylePrinter.js","sourceRoot":"","sources":["../../../src/printers/LogStylePrinter.ts"],"names":[],"mappings":";;;AAAA,uCAAoE;AACpE,uEAAoE;AAEpE,MAAa,eAAgB,SAAQ,iDAAuB;IAExD,KAAK,CAAC,GAAW;QACb,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;IAC7B,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,GAAG,KAAqB;QAChC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACvB,IAAI,CAAC,IAAA,aAAM,EAAC,IAAI,CAAC,EAAE,CAAC;gBAChB,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;gBACtB,SAAQ;YACZ,CAAC;YAED,MAAM,EAAC,SAAS,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,IAAI,CAAA;YAErC,IAAI,CAAC,KAAK,CAAC,UAAU,KAAK,IAAI,CAAC,CAAA;YAC/B,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,MAAM,CAAC,CAAA;YAE7B,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;gBAC/B,MAAM,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,EAAE,IAAI,EAAE,EAAE,KAAK,EAAE,EAAE,GAAG,QAAQ,CAAA;gBACnD,IAAI,CAAC,KAAK,CAAC,GAAG,KAAK,KAAK,GAAG,IAAI,CAAC,CAAA;YACpC,CAAC;QACL,CAAC;IACL,CAAC;CACJ;AAxBD,0CAwBC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* @license MIT
|
|
4
|
+
* We are interested in visualising a page as a collection of tags.
|
|
5
|
+
*
|
|
6
|
+
* We wish to work with tags that can be compactly previewed on a webpage.
|
|
7
|
+
* Here we must declare all of the element types that can be used to represent
|
|
8
|
+
* a resource that can be hyperlinked off a webpage.
|
|
9
|
+
*/
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
exports.isKeyDefined = exports.isResourceKey = exports.RESOURCE_LINK_KEYS = exports.RESOURCE_DISPLAYABLE_KEYS = void 0;
|
|
12
|
+
exports.findResourceText = findResourceText;
|
|
13
|
+
exports.findResourceLink = findResourceLink;
|
|
14
|
+
/**
 * Returns the first entry of `keys` that `isKeyDefined` accepts for `element`,
 * or `undefined` when none qualifies.
 */
function findDefinedKey(element, keys) {
    return keys.find((candidate) => (0, exports.isKeyDefined)(candidate, element));
}
|
|
21
|
+
exports.RESOURCE_DISPLAYABLE_KEYS = [
|
|
22
|
+
'id',
|
|
23
|
+
'innerText',
|
|
24
|
+
'textContent',
|
|
25
|
+
'class',
|
|
26
|
+
'ariaLabel',
|
|
27
|
+
'ariaDescription',
|
|
28
|
+
'alt',
|
|
29
|
+
'rel'
|
|
30
|
+
];
|
|
31
|
+
exports.RESOURCE_LINK_KEYS = [
|
|
32
|
+
"href",
|
|
33
|
+
"data-src",
|
|
34
|
+
"target",
|
|
35
|
+
"action",
|
|
36
|
+
"src",
|
|
37
|
+
"url"
|
|
38
|
+
];
|
|
39
|
+
/**
 * Finds a display label for an element.
 *
 * Walks RESOURCE_DISPLAYABLE_KEYS in order and returns `{ key, value }` for
 * the first property holding a non-blank string; returns `undefined` when no
 * property qualifies.
 */
function findResourceText(element) {
    for (const key of exports.RESOURCE_DISPLAYABLE_KEYS) {
        const value = element[key];
        if (typeof value !== 'string') {
            continue;
        }
        if (value.trim() === '') {
            continue;
        }
        return { key, value };
    }
}
|
|
46
|
+
/**
 * Finds a link target for an element.
 *
 * Picks the first RESOURCE_LINK_KEYS entry that is defined on the element and
 * returns `{ key, url }` when that property holds a non-blank string.
 * Returns `undefined` when no key is defined or the value is not usable.
 */
function findResourceLink(element) {
    const key = findDefinedKey(element, [...exports.RESOURCE_LINK_KEYS]);
    // Without this guard the lookup below would read `element[undefined]`,
    // i.e. a property literally named "undefined".
    if (key === undefined) {
        return undefined;
    }
    const url = element[key];
    if (typeof url === 'string' && url.trim() !== '') {
        return { key, url };
    }
    return undefined;
}
|
|
52
|
+
/**
 * Guard: true when `key` is one of the names listed in RESOURCE_LINK_KEYS.
 *
 * The original used `key in RESOURCE_LINK_KEYS`; on an array the `in`
 * operator tests indices and inherited property names, so "href" was
 * rejected while "0" and "length" were accepted.
 */
const isResourceKey = (key) => exports.RESOURCE_LINK_KEYS.includes(key);
|
|
53
|
+
exports.isResourceKey = isResourceKey;
|
|
54
|
+
/**
 * True when `key` exists on `element` (own or inherited) and its value is
 * not `undefined`.
 */
const isKeyDefined = (key, element) => {
    if (!(key in element)) {
        return false;
    }
    return element[key] !== undefined;
};
|
|
55
|
+
exports.isKeyDefined = isKeyDefined;
|
|
56
|
+
//# sourceMappingURL=resource.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resource.js","sourceRoot":"","sources":["../../src/resource.ts"],"names":[],"mappings":";AAAA;;;;;;;GAOG;;;AA8CH,4CAMC;AAED,4CAKC;AAvDD,SAAS,cAAc,CAAC,OAAiB,EAAE,IAAe;IACtD,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACrB,IAAI,IAAA,oBAAY,EAAC,GAAG,EAAE,OAAO,CAAC,EAAE,CAAC;YAC7B,OAAO,GAAG,CAAC;QACf,CAAC;IACL,CAAC;AACL,CAAC;AAEY,QAAA,yBAAyB,GAAG;IACrC,IAAI;IACJ,WAAW;IACX,aAAa;IACb,OAAO;IACP,WAAW;IACX,iBAAiB;IACjB,KAAK;IACL,KAAK;CACC,CAAC;AASE,QAAA,kBAAkB,GAAG;IAC9B,MAAM;IACN,UAAU;IACV,QAAQ;IACR,QAAQ;IACR,KAAK;IACL,KAAK;CACC,CAAC;AASX,SAAgB,gBAAgB,CAAC,OAAiB;IAC9C,KAAK,MAAM,GAAG,IAAI,iCAAyB,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,GAAG,CAAC,CAAA;QAC1B,IAAI,KAAK,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,CAAC,IAAI,EAAE,KAAK,EAAE;YACzD,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC;IAC9B,CAAC;AACL,CAAC;AAED,SAAgB,gBAAgB,CAAC,OAAiB;IAC9C,MAAM,GAAG,GAAG,cAAc,CAAC,OAAO,EAAE,CAAC,GAAG,0BAAkB,CAAC,CAAC,CAAC;IAC7D,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC;IACzB,IAAI,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE;QACnD,OAAO,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC;AAC5B,CAAC;AAOM,MAAM,aAAa,GAAG,CAAC,GAAW,EAAkB,EAAE,CAAC,GAAG,IAAI,0BAAkB,CAAC;AAA3E,QAAA,aAAa,iBAA8D;AAEjF,MAAM,YAAY,GAAG,CAA6B,GAAW,EAAE,OAAU,EAAW,EAAE,CACzF,GAAG,IAAI,OAAO,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,SAAS,CAAC;AADpC,QAAA,YAAY,gBACwB"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pagerts",
|
|
3
3
|
"description": "A tool for viewing external relations in a webpage",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.2.0",
|
|
5
5
|
"main": "main.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"pagerts": "bin/main.js"
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
"scripts": {
|
|
10
10
|
"test": "jest",
|
|
11
11
|
"build": "esbuild src/main.ts --external:jsdom --bundle --outdir=bin --minify --sourcemap --platform=node",
|
|
12
|
-
"
|
|
12
|
+
"lint": "tsc",
|
|
13
13
|
"start": "node ./bin/main.js",
|
|
14
14
|
"dev": "npx tsx src/main.ts"
|
|
15
15
|
},
|
|
@@ -1,21 +1,12 @@
|
|
|
1
1
|
import { isError, type Page } from '../page/Page';
|
|
2
2
|
import { JSDOM } from 'jsdom';
|
|
3
3
|
import { AbstractExtractor } from './AbstractExtractor';
|
|
4
|
-
import { isJSDOM, isMetadata, type PageResponse } from '../page/PageFetcher';
|
|
5
4
|
|
|
6
|
-
export class PageExtractor extends AbstractExtractor<
|
|
5
|
+
export class PageExtractor extends AbstractExtractor<JSDOM, Page> {
|
|
7
6
|
constructor() { super("page-extractor"); }
|
|
8
7
|
|
|
9
|
-
async extract(value:
|
|
10
|
-
|
|
11
|
-
const { window: { document: { title, location: { href: url } } } } = value
|
|
12
|
-
return { title, url }
|
|
13
|
-
}
|
|
14
|
-
if (isError(value)) {
|
|
15
|
-
const { error } = value
|
|
16
|
-
throw new Error(error)
|
|
17
|
-
}
|
|
18
|
-
const { title, url } = value
|
|
8
|
+
async extract(value: JSDOM): Promise<Page> {
|
|
9
|
+
const { window: { document: { title, location: { href: url } } } } = value
|
|
19
10
|
return { title, url }
|
|
20
11
|
}
|
|
21
12
|
}
|
package/src/main.ts
CHANGED
|
@@ -4,14 +4,14 @@ import { Command, createArgument } from "commander";
|
|
|
4
4
|
import { description, name, version } from '../package.json';
|
|
5
5
|
import { PageExtractor } from "./extractors/PageExtractor";
|
|
6
6
|
import { ResourceExtractor } from "./extractors/ResourceExtractor";
|
|
7
|
-
import {
|
|
7
|
+
import { PageFetcher } from "./page/PageFetcher";
|
|
8
8
|
import type { Page, PageMetadata } from "./page/Page";
|
|
9
9
|
import { JSONStylePrinter } from "./printers/JSONStylePrinter";
|
|
10
10
|
import { LogStylePrinter } from "./printers/LogStylePrinter";
|
|
11
11
|
|
|
12
12
|
const program = new Command();
|
|
13
13
|
|
|
14
|
-
const url = createArgument("<url|file...>", "remote URL or local file to extract
|
|
14
|
+
const url = createArgument("<url | file...>", "remote https://URL or local file://resource.html to extract from");
|
|
15
15
|
|
|
16
16
|
(async () => {
|
|
17
17
|
await program
|
|
@@ -28,20 +28,15 @@ const url = createArgument("<url|file...>", "remote URL or local file to extract
|
|
|
28
28
|
const pageExtractor = new PageExtractor()
|
|
29
29
|
const resourceExtractor = new ResourceExtractor(["a", "meta", "link", "embed"])
|
|
30
30
|
|
|
31
|
-
const pageResponses
|
|
32
|
-
const pageMetadatas: PageMetadata[] = []
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
pageMetadatas.push({
|
|
39
|
-
...descriptor, resources
|
|
40
|
-
});
|
|
41
|
-
} else pageMetadatas.push({
|
|
42
|
-
...page, resources: []
|
|
43
|
-
});
|
|
31
|
+
const pageResponses = await pageFetcher.fetchAll(urls);
|
|
32
|
+
const pageMetadatas: PageMetadata[] = [];
|
|
33
|
+
|
|
34
|
+
for (const { content, url, error } of pageResponses) {
|
|
35
|
+
const resources = error in (content) ? [] : await resourceExtractor.extract(content);
|
|
36
|
+
const descriptor = error in content ? { url, error } : await pageExtractor.extract(content);
|
|
37
|
+
pageMetadatas.push({ ...descriptor, resources });
|
|
44
38
|
}
|
|
39
|
+
|
|
45
40
|
await printer.print(...pageMetadatas);
|
|
46
41
|
})
|
|
47
42
|
.parseAsync(process.argv);
|
package/src/page/Page.ts
CHANGED
package/src/page/PageFetcher.ts
CHANGED
|
@@ -1,24 +1,30 @@
|
|
|
1
1
|
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
2
2
|
import type { Page, PageMetadata } from './Page';
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
interface PageResponse {
|
|
5
|
+
url: string;
|
|
6
|
+
content?: JSDOM;
|
|
7
|
+
error?: string;
|
|
8
|
+
}
|
|
8
9
|
|
|
9
10
|
export class PageFetcher {
|
|
10
|
-
async fetchPage(url: string): Promise<PageResponse> {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
})
|
|
11
|
+
private async fetchPage(url: string): Promise<PageResponse> {
|
|
12
|
+
let dom: Promise<JSDOM>;
|
|
13
|
+
const virtualConsole = new VirtualConsole().on('jsdomError', (error) => {
|
|
14
|
+
process.stderr.write(`Error parsing ${url}:${error.message}\n`);
|
|
15
15
|
});
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
}
|
|
16
|
+
if (url.startsWith("file://")) {
|
|
17
|
+
dom = JSDOM.fromFile(url, { virtualConsole });
|
|
18
|
+
} else {
|
|
19
|
+
dom = JSDOM.fromURL(url, { virtualConsole });
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
return dom.then(content => ({ url, content }))
|
|
23
|
+
.catch(({ message }) => ({ url, error: `JSDOM failed to parse: ${message}` }));
|
|
19
24
|
}
|
|
20
25
|
async fetchAll(urls: string[]): Promise<PageResponse[]> {
|
|
21
|
-
|
|
26
|
+
const responses = await Promise.all(urls.map(url => this.fetchPage(url)));
|
|
27
|
+
return responses.filter(response => response.content !== undefined);
|
|
22
28
|
}
|
|
23
|
-
|
|
29
|
+
|
|
24
30
|
}
|