cheerio-to-text 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,7 @@
1
+ Copyright 2022 Peter Bengtsson
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # cheerio-to-text
2
+
3
+ Explained by an example:
4
+
5
+ ```js
6
+ import fs from "fs"
7
+ import cheerio from "cheerio"
8
+ import { render } from "cheerio-to-text"
9
+
10
+ const html = fs.readFileSync("page.html", "utf-8")
11
+ console.log(html)
12
+ //
13
+ // <!doctype html>
14
+ // <body>
15
+ // <div id="main">
16
+ // <p>Para<strong>graph</strong>.</p>
17
+ // <ul><li>Foo</li><li>Bar</li></ul><h3>Heading</h3>
18
+ // </div>
19
+ // </body>
20
+ //
21
+
22
+ const $ = cheerio.load(html)
23
+ console.log($("div#main").text())
24
+ //
25
+ // Paragraph.
26
+ // FooBarHeading
27
+ //
28
+
29
+ console.log(render($("div#main")))
30
+ //
31
+ // Paragraph.
32
+ // Foo
33
+ // Bar
34
+ // Heading
35
+ //
36
+ ```
37
+
38
+ Much of the origin of this is that [GitHub Docs](https://docs.github.com) scrapes
39
+ every page with `got` and `cheerio` and then needs to convert that into an
40
+ appropriate string of plain text that it can use for searching
41
+ with Elasticsearch. Using `myCheerioObject.text()` isn't good enough
42
+ because it lumps together block-level HTML tags that have no whitespace
43
+ between the `>` and the next `<`.
44
+
45
+ ## License
46
+
47
+ MIT
48
+
49
+ ## How to hack
50
+
51
+ Run `npm run build:watch` in one terminal, then look at
52
+ `example.mjs` (which you run with `node example.mjs`)
53
+
54
+ ## How to run tests
55
+
56
+ ```sh
57
+ jest
58
+ ```
59
+
60
+ Or
61
+
62
+ ```sh
63
+ jest --watch -t "some test text"
64
+ ```
@@ -0,0 +1,3 @@
1
+ import type { Element, CheerioAPI, Document, Cheerio } from "cheerio";
2
/**
 * Turn HTML into plain text.
 *
 * @param node An HTML string, a loaded Cheerio API, a parsed document, a
 *   single element, or a Cheerio selection of elements.
 * @returns The rendered plain text.
 */
export declare function render(
  node: CheerioAPI | Document | string | Element | Cheerio<Element>
): string;
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAW,OAAO,EAAE,MAAM,SAAS,CAAA;AAa9E,wBAAgB,MAAM,CACpB,IAAI,EAAE,UAAU,GAAG,QAAQ,GAAG,MAAM,GAAG,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,GAChE,MAAM,CA8BR"}
@@ -0,0 +1,46 @@
1
+ import cheerio from "cheerio";
2
// Tag names treated as inline: entering one of these elements does NOT start
// a new line in the rendered text. Every other tag name does.
const inlineElements = new Set([
  "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button",
  "canvas", "cite", "code", "data", "datalist", "del", "dfn", "em", "embed",
  "i", "iframe", "img", "input", "ins", "kbd", "label", "map", "mark",
  "meter", "noscript", "object", "output", "picture", "progress", "q",
  "ruby", "s", "samp", "script", "select", "slot", "small", "span", "strong",
  "sub", "sup", "svg", "template", "textarea", "time", "tt", "u", "var",
  "video", "wbr",
]);
9
/**
 * Render HTML as plain text, starting a new line before every element whose
 * tag name is not listed in `inlineElements`.
 *
 * @param node One of:
 *   - an HTML string (parsed with `cheerio.load`; rendering starts at `<body>`),
 *   - a loaded Cheerio API function (rendering starts at its `<body>`, or at
 *     the document root when there is no `<body>`),
 *   - an indexed object such as a Cheerio selection (its element `0` is used),
 *   - a single parsed node (anything with `children` and `type`).
 * @returns The collected text with leading and trailing whitespace trimmed.
 * @throws {Error} When no starting node can be derived from `node`.
 */
export function render(node) {
  let root = null;
  if (typeof node === "string") {
    root = cheerio.load(node)("body")[0];
  } else if (typeof node === "function" && typeof node.root === "function") {
    // A loaded Cheerio API (`$`) is a function, so it never matches the
    // `typeof node === "object"` branches below. Mirror the string branch.
    root = node("body")[0] || node.root()[0];
  } else if (typeof node === "object" && node !== null) {
    // `typeof null` is "object"; without the null guard the `in` operator
    // would raise a TypeError instead of the descriptive Error below.
    if ("0" in node) {
      root = node[0];
    } else if ("children" in node && "type" in node) {
      root = node;
    }
  }
  if (!root) {
    throw new Error("node was not a string, cheerio loaded document, or a cheerio node");
  }

  let text = "";
  function enter(element) {
    if (element.type === "text") {
      // Whitespace-only text nodes are dropped; all others are kept verbatim.
      if (element.data.trim()) {
        text += element.data;
      }
    } else if (element.type === "tag" && !inlineElements.has(element.tagName)) {
      // Start a new line unless the output already ends with one. `charAt`
      // is used instead of `String.prototype.at`, which older runtimes lack.
      if (text.charAt(text.length - 1) !== "\n") {
        text += "\n";
      }
    }
  }
  walk(root, enter);
  return text.trim();
}
39
+ function walk(root, enter) {
40
+ enter(root);
41
+ if (root.type === "tag") {
42
+ for (const child of root.children) {
43
+ walk(child, enter);
44
+ }
45
+ }
46
+ }
@@ -0,0 +1 @@
1
+ {"type":"module"}
package/lib/index.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ import type { Element, CheerioAPI, Document, Cheerio } from "cheerio";
2
/**
 * Turn HTML into plain text.
 *
 * @param node An HTML string, a loaded Cheerio API, a parsed document, a
 *   single element, or a Cheerio selection of elements.
 * @returns The rendered plain text.
 */
export declare function render(
  node: CheerioAPI | Document | string | Element | Cheerio<Element>
): string;
3
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,EAAW,OAAO,EAAE,MAAM,SAAS,CAAA;AAa9E,wBAAgB,MAAM,CACpB,IAAI,EAAE,UAAU,GAAG,QAAQ,GAAG,MAAM,GAAG,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,GAChE,MAAM,CA8BR"}
package/lib/index.js ADDED
@@ -0,0 +1,71 @@
1
+ "use strict";
2
+ var __values = (this && this.__values) || function(o) {
3
+ var s = typeof Symbol === "function" && Symbol.iterator, m = s && o[s], i = 0;
4
+ if (m) return m.call(o);
5
+ if (o && typeof o.length === "number") return {
6
+ next: function () {
7
+ if (o && i >= o.length) o = void 0;
8
+ return { value: o && o[i++], done: !o };
9
+ }
10
+ };
11
+ throw new TypeError(s ? "Object is not iterable." : "Symbol.iterator is not defined.");
12
+ };
13
+ var __importDefault = (this && this.__importDefault) || function (mod) {
14
+ return (mod && mod.__esModule) ? mod : { "default": mod };
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.render = void 0;
18
+ var cheerio_1 = __importDefault(require("cheerio"));
19
// Tag names treated as inline: entering one of these elements does NOT start
// a new line in the rendered text. Every other tag name does.
var inlineElements = new Set([
    "a", "abbr", "acronym", "audio", "b", "bdi", "bdo", "big", "br", "button",
    "canvas", "cite", "code", "data", "datalist", "del", "dfn", "em", "embed",
    "i", "iframe", "img", "input", "ins", "kbd", "label", "map", "mark",
    "meter", "noscript", "object", "output", "picture", "progress", "q",
    "ruby", "s", "samp", "script", "select", "slot", "small", "span", "strong",
    "sub", "sup", "svg", "template", "textarea", "time", "tt", "u", "var",
    "video", "wbr"
]);
22
/**
 * Render HTML as plain text, starting a new line before every element whose
 * tag name is not listed in `inlineElements`.
 *
 * @param node One of:
 *   - an HTML string (parsed with `cheerio.load`; rendering starts at `<body>`),
 *   - a loaded Cheerio API function (rendering starts at its `<body>`, or at
 *     the document root when there is no `<body>`),
 *   - an indexed object such as a Cheerio selection (its element `0` is used),
 *   - a single parsed node (anything with `children` and `type`).
 * @returns The collected text with leading and trailing whitespace trimmed.
 * @throws {Error} When no starting node can be derived from `node`.
 */
function render(node) {
    var root = null;
    if (typeof node === "string") {
        root = cheerio_1.default.load(node)("body")[0];
    }
    else if (typeof node === "function" && typeof node.root === "function") {
        // A loaded Cheerio API (`$`) is a function, so it never matches the
        // `typeof node === "object"` branches below. Mirror the string branch.
        root = node("body")[0] || node.root()[0];
    }
    else if (typeof node === "object" && node !== null) {
        // `typeof null` is "object"; without the null guard the `in` operator
        // would raise a TypeError instead of the descriptive Error below.
        if ("0" in node) {
            root = node[0];
        }
        else if ("children" in node && "type" in node) {
            root = node;
        }
    }
    if (!root) {
        throw new Error("node was not a string, cheerio loaded document, or a cheerio node");
    }
    var text = "";
    function enter(element) {
        if (element.type === "text") {
            // Whitespace-only text nodes are dropped; all others are kept verbatim.
            if (element.data.trim()) {
                text += element.data;
            }
        }
        else if (element.type === "tag" && !inlineElements.has(element.tagName)) {
            // Start a new line unless the output already ends with one. `charAt`
            // is used instead of `String.prototype.at`, which older runtimes lack.
            if (text.charAt(text.length - 1) !== "\n") {
                text += "\n";
            }
        }
    }
    walk(root, enter);
    return text.trim();
}
52
+ exports.render = render;
53
+ function walk(root, enter) {
54
+ var e_1, _a;
55
+ enter(root);
56
+ if (root.type === "tag") {
57
+ try {
58
+ for (var _b = __values(root.children), _c = _b.next(); !_c.done; _c = _b.next()) {
59
+ var child = _c.value;
60
+ walk(child, enter);
61
+ }
62
+ }
63
+ catch (e_1_1) { e_1 = { error: e_1_1 }; }
64
+ finally {
65
+ try {
66
+ if (_c && !_c.done && (_a = _b.return)) _a.call(_b);
67
+ }
68
+ finally { if (e_1) throw e_1.error; }
69
+ }
70
+ }
71
+ }
package/package.json ADDED
@@ -0,0 +1,82 @@
1
+ {
2
+ "name": "cheerio-to-text",
3
+ "version": "0.1.0",
4
+ "description": "Turn a Cheerio object into plain text",
5
+ "repository": {
6
+ "type": "git",
7
+ "url": "git://github.com/peterbe/cheerio-to-text.git"
8
+ },
9
+ "bugs": {
10
+ "url": "https://github.com/peterbe/cheerio-to-text/issues"
11
+ },
12
+ "main": "lib/index.js",
13
+ "types": "lib/index.d.ts",
14
+ "module": "lib/esm/index.js",
15
+ "files": [
16
+ "lib"
17
+ ],
18
+ "exports": {
19
+ "require": "./lib/index.js",
20
+ "import": "./lib/esm/index.js"
21
+ },
22
+ "scripts": {
23
+ "test": "npm run test:jest && npm run lint",
24
+ "test:jest": "jest",
25
+ "lint": "npm run lint:es && npm run lint:prettier",
26
+ "lint:es": "eslint src",
27
+ "lint:prettier": "npm run prettier -- --check",
28
+ "format": "npm run format:es && npm run format:prettier",
29
+ "format:es": "npm run lint:es -- --fix",
30
+ "format:prettier": "npm run prettier -- --write",
31
+ "prettier": "prettier '**/*.{ts,md,json,yml}'",
32
+ "build:watch": "tsc --watch",
33
+ "build": "npm run build:cjs && npm run build:esm",
34
+ "build:cjs": "tsc",
35
+ "build:esm": "npm run build:cjs -- --module esnext --target es2019 --outDir lib/esm && echo '{\"type\":\"module\"}' > lib/esm/package.json",
36
+ "prepare": "npm run build",
37
+ "release": "npm run prepare && np"
38
+ },
39
+ "keywords": [
40
+ "cheerio",
41
+ "html",
42
+ "text"
43
+ ],
44
+ "author": "Peter Bengtsson",
45
+ "license": "MIT",
46
+ "jest": {
47
+ "preset": "ts-jest",
48
+ "testEnvironment": "node",
49
+ "testMatch": [
50
+ "<rootDir>/test/*.ts"
51
+ ],
52
+ "reporters": [
53
+ "default",
54
+ [
55
+ "jest-junit",
56
+ {
57
+ "suiteName": "jest tests"
58
+ }
59
+ ]
60
+ ]
61
+ },
62
+ "devDependencies": {
63
+ "@types/jest": "29.1.1",
64
+ "@types/node": "18.8.2",
65
+ "@typescript-eslint/eslint-plugin": "5.39.0",
66
+ "@typescript-eslint/parser": "5.39.0",
67
+ "eslint": "8.24.0",
68
+ "eslint-config-prettier": "8.5.0",
69
+ "jest": "29.1.2",
70
+ "jest-junit": "14.0.1",
71
+ "np": "7.6.2",
72
+ "prettier": "2.7.1",
73
+ "ts-jest": "29.0.3",
74
+ "typescript": "4.8.4"
75
+ },
76
+ "peerDependencies": {
77
+ "cheerio": "1.0.0-rc.12"
78
+ },
79
+ "prettier": {
80
+ "semi": false
81
+ }
82
+ }