@jocmp/mercury-parser 2.2.3-dev

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,164 @@
1
+ {
2
+ "name": "@jocmp/mercury-parser",
3
+ "version": "2.2.3-dev",
4
+ "description": "Mercury Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
5
+ "author": "jocmp <hello@jocmp.com>",
6
+ "homepage": "https://github.com/jocmp/mercury-parser",
7
+ "license": "MIT",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "git+https://github.com/jocmp/mercury-parser.git"
11
+ },
12
+ "bugs": {
13
+ "url": "https://github.com/jocmp/mercury-parser/issues"
14
+ },
15
+ "keywords": [
16
+ "mercury",
17
+ "parser",
18
+ "reader",
19
+ "web",
20
+ "content"
21
+ ],
22
+ "files": [
23
+ "dist",
24
+ "cli.js",
25
+ "src/shims/"
26
+ ],
27
+ "main": "./dist/mercury.js",
28
+ "bin": {
29
+ "mercury-parser": "./cli.js"
30
+ },
31
+ "scripts": {
32
+ "lint": "eslint . --fix",
33
+ "lint:ci": "remark . && eslint .",
34
+ "lint-fix-quiet": "eslint --fix --quiet",
35
+ "build": "yarn lint && rollup -c && yarn test:build",
36
+ "build:ci": "rollup -c && yarn test:build",
37
+ "build:web": "yarn lint && rollup -c rollup.config.web.js && yarn test:build:web",
38
+ "build:esm": "yarn lint && rollup -c rollup.config.esm.js && yarn test:build:esm",
39
+ "build:esm:ci": "rollup -c rollup.config.esm.js && yarn test:build:esm",
40
+ "build:web:ci": "rollup -c rollup.config.web.js && yarn test:build:web",
41
+ "release": "yarn build && yarn build:web",
42
+ "build:generator": "rollup -c scripts/rollup.config.js",
43
+ "test_build": "rollup -c",
44
+ "test": "yarn test:node && yarn test:web",
45
+ "test:node": "jest --json --outputFile test-output.json",
46
+ "test:web": "node ./node_modules/karma/bin/karma start karma.conf.js --auto-watch",
47
+ "test:build": "cd ./scripts && jest check-build.test.js",
48
+ "test:build:web": "node ./scripts/proxy-browser-test.js",
49
+ "test:build:esm": "node ./scripts/proxy-browser-test.js",
50
+ "watch:test": "jest --watch",
51
+ "generate-parser": "node ./dist/generate-custom-parser.js"
52
+ },
53
+ "engines": {
54
+ "node": ">=10"
55
+ },
56
+ "devDependencies": {
57
+ "@babel/core": "^7.0.0",
58
+ "@babel/plugin-transform-runtime": "^7.0.0",
59
+ "@babel/polyfill": "^7.0.0",
60
+ "@babel/preset-env": "^7.0.0",
61
+ "@babel/runtime": "^7.0.0",
62
+ "@jesses/circle-github-bot": "^2.1.0",
63
+ "@octokit/rest": "^16.9.0",
64
+ "babel-core": "^7.0.0-bridge.0",
65
+ "babel-eslint": "^10.0.1",
66
+ "babel-jest": "^23.4.2",
67
+ "babel-plugin-module-alias": "^1.6.0",
68
+ "babel-plugin-module-resolver": "^3.1.2",
69
+ "babelify": "^10.0.0",
70
+ "babelrc-rollup": "^3.0.0",
71
+ "brfs": "^2.0.1",
72
+ "brfs-babel": "^2.0.0",
73
+ "browserify": "^16.2.3",
74
+ "changelog-maker": "^2.3.0",
75
+ "eslint": "^5.12.0",
76
+ "eslint-config-airbnb": "^17.1.0",
77
+ "eslint-config-prettier": "^6.1.0",
78
+ "eslint-import-resolver-babel-module": "^2.2.1",
79
+ "eslint-plugin-babel": "^5.3.0",
80
+ "eslint-plugin-import": "^2.14.0",
81
+ "eslint-plugin-jsx-a11y": "^6.1.2",
82
+ "eslint-plugin-react": "^7.12.3",
83
+ "express": "^4.16.4",
84
+ "husky": "^3.0.0",
85
+ "inquirer": "^7.0.0",
86
+ "jasmine-core": "^2.5.2",
87
+ "jest": "^23.6.0",
88
+ "jest-cli": "^23.6.0",
89
+ "karma": "^6.3.16",
90
+ "karma-browserify": "8.1.0",
91
+ "karma-chrome-launcher": "^3.0.0",
92
+ "karma-cli": "^2.0.0",
93
+ "karma-jasmine": "^1.0.2",
94
+ "karma-mocha": "^1.3.0",
95
+ "karma-requirejs": "^1.1.0",
96
+ "lint-staged": "^8.1.0",
97
+ "mocha": "^6.0.0",
98
+ "nock": "^10.0.6",
99
+ "ora": "^4.0.0",
100
+ "prettier": "^1.15.3",
101
+ "remark-cli": "^7.0.0",
102
+ "remark-lint": "^6.0.4",
103
+ "remark-preset-lint-recommended": "^3.0.2",
104
+ "request": "^2.88.2",
105
+ "requirejs": "^2.3.6",
106
+ "rollup": "^1.1.0",
107
+ "rollup-plugin-babel": "^4.0.1",
108
+ "rollup-plugin-commonjs": "^9.2.0",
109
+ "rollup-plugin-node-globals": "^1.4.0",
110
+ "rollup-plugin-node-resolve": "^2.0.0",
111
+ "rollup-plugin-terser": "^6.1.0",
112
+ "rollup-plugin-uglify": "^6.0.1",
113
+ "watchify": "^3.11.1"
114
+ },
115
+ "dependencies": {
116
+ "@babel/runtime-corejs2": "^7.2.0",
117
+ "@postlight/ci-failed-test-reporter": "^1.0",
118
+ "browser-request": "github:postlight/browser-request#feat-add-headers-to-response",
119
+ "cheerio": "^0.22.0",
120
+ "difflib": "github:postlight/difflib.js",
121
+ "ellipsize": "0.1.0",
122
+ "iconv-lite": "0.5.0",
123
+ "jquery": "^3.5.0",
124
+ "moment": "^2.23.0",
125
+ "moment-parseformat": "3.0.0",
126
+ "moment-timezone": "0.5.37",
127
+ "postman-request": "^2.88.1-postman.31",
128
+ "string-direction": "^0.1.2",
129
+ "turndown": "^7.1.1",
130
+ "valid-url": "^1.0.9",
131
+ "wuzzy": "^0.1.4",
132
+ "yargs-parser": "^15.0.1"
133
+ },
134
+ "bundleDependencies": [
135
+ "jquery",
136
+ "moment-timezone",
137
+ "browser-request"
138
+ ],
139
+ "browser": {
140
+ "main": "./dist/mercury.web.js",
141
+ "cheerio": "./src/shims/cheerio-query",
142
+ "jquery": "./node_modules/jquery/dist/jquery.min.js",
143
+ "postman-request": "browser-request",
144
+ "iconv-lite": "./src/shims/iconv-lite",
145
+ "moment-timezone": "./node_modules/moment-timezone/builds/moment-timezone-with-data-2012-2022.min.js"
146
+ },
147
+ "husky": {
148
+ "hooks": {
149
+ "pre-commit": "lint-staged"
150
+ }
151
+ },
152
+ "lint-staged": {
153
+ "*.js": [
154
+ "eslint --fix",
155
+ "prettier --write",
156
+ "git add"
157
+ ],
158
+ "*.{json,css,md}": [
159
+ "remark .",
160
+ "prettier --write",
161
+ "git add"
162
+ ]
163
+ }
164
+ }
@@ -0,0 +1,119 @@
1
+ // This module attempts to square cheerio with jquery
2
+ // so that node-specific quirks/features of cheerio
3
+ // will also work in the browser. This mostly involves
4
+ // shimming a few functions and rewriting the jquery
5
+ // constructor so it sandboxes most of its operations
6
+ // and doesn't mutate existing dom elements in the page.
7
+
8
+ import jQuery from 'jquery';
9
+
10
// CSS class placed on the hidden element that $.load() creates to hold
// the markup being parsed.
const PARSER_CLASS = 'mercury-parsing-container';

// jQuery-wrapped sandbox element used as the default query context by $().
// Remains undefined until the first $.load() call assigns it.
let PARSING_NODE;

// Hand the global `$` alias back to its previous owner so this shim does not
// take over the host page's `$` (see jQuery.noConflict in the jQuery docs).
jQuery.noConflict();
14
/**
 * Sandboxed replacement for the jQuery constructor.
 *
 * Unless `contextOverride` is false, queries are scoped to PARSING_NODE:
 *   - no context         -> PARSING_NODE itself is used as the context
 *   - string context     -> resolved via PARSING_NODE.find(context)
 *   - any other context  -> passed through untouched
 *
 * @param {*} selector - anything jQuery.fn.init accepts as a selector
 * @param {*} [context] - optional context (selector string or other value)
 * @param {*} [rootjQuery] - forwarded to jQuery.fn.init unchanged
 * @param {boolean} [contextOverride=true] - pass false to skip the sandbox
 *   scoping and forward `context` exactly as given
 * @returns {object} the object constructed by jQuery.fn.init
 */
const $ = (selector, context, rootjQuery, contextOverride = true) => {
  let scope = context;

  if (contextOverride) {
    if (!scope) {
      scope = PARSING_NODE;
    } else if (typeof scope === 'string') {
      scope = PARSING_NODE.find(scope);
    }
  }

  // eslint-disable-next-line new-cap
  return new jQuery.fn.init(selector, scope, rootjQuery);
};
25
+
26
// Point both $.prototype and $.fn at jQuery's own prototype object.
$.prototype = jQuery.fn;
$.fn = jQuery.fn;

// Copies jQuery's static members (trim, extend, etc.) onto $.
jQuery.extend($, jQuery);
29
+
30
/**
 * Removes scripts and stylesheets (<script>, <style>, and
 * <link rel="stylesheet">) found beneath the given node.
 *
 * @param {object} $node - jQuery-style object exposing find()
 * @returns {object} the same $node that was passed in
 */
const removeUnusedTags = $node => {
  const $unwanted = $node.find('script, style, link[rel="stylesheet"]');
  $unwanted.remove();

  return $node;
};
36
+
37
/**
 * Clones the page's <html> element, strips scripts and stylesheets from the
 * clone, and returns the clone's children after wrapping each of them in
 * two nested <div> elements.
 *
 * @returns {object} jQuery set containing the wrapped children of the clone
 */
$.cloneHtml = () => {
  // contextOverride = false: query the real document, not the sandbox.
  const $documentClone = $('html', null, null, false).clone();
  const $children = removeUnusedTags($documentClone).children();

  const $wrappedOnce = $children.wrap('<div />');
  return $wrappedOnce.wrap('<div />');
};
45
+
46
// Returns the first element matched by a universal selector within the
// default (sandboxed) query context.
$.root = () => {
  const $everything = $('*');
  return $everything.first();
};

// NOTE(review): presumably lets callers detect that the browser shim is in
// use rather than cheerio; confirm against the call sites.
$.browser = true;
49
+
50
/**
 * Reports whether the first element of a jQuery-style set is a <container>
 * element (tag name compared case-insensitively).
 *
 * @param {object} $node - object exposing get(index)
 * @returns {boolean} true only when the first element has a tag name equal
 *   to "container"; false when the set is empty or the element has no tag
 */
const isContainer = $node => {
  const element = $node.get(0);
  const tagName = element && element.tagName;

  return tagName ? tagName.toLowerCase() === 'container' : false;
};
58
+
59
/**
 * Serializes markup to an HTML string.
 *
 * With a `$node` argument:
 *   - if the node is a <container>, or has a <container> child, the inner
 *     HTML of that child container (or, failing that, of the node itself)
 *     is returned, so the parsing container tag is never part of the output;
 *   - otherwise the outer HTML of the node's first element is returned.
 *
 * Without an argument:
 *   - if a parsing sandbox exists, the inner HTML of its first child is
 *     returned;
 *   - otherwise the live document's <head> and <body> are cloned, stripped
 *     of scripts/stylesheets, and returned as two sibling <container>
 *     elements nested inside an outer <container>.
 *
 * @param {object} [$node] - optional jQuery set to serialize
 * @returns {string} the serialized HTML
 */
$.html = $node => {
  if ($node) {
    // we never want to return a parsing container, only its children
    if (isContainer($node) || isContainer($node.children('container'))) {
      return $node.children('container').html() || $node.html();
    }

    // Append a clone to a scratch <div> so .html() yields the outer HTML.
    return $('<div>')
      .append($node.eq(0).clone())
      .html();
  }

  // Check the sandbox first: cloning the whole document below is expensive
  // and its result is unused whenever the sandbox already holds the markup.
  if (PARSING_NODE && PARSING_NODE.length > 0) {
    return PARSING_NODE.children().html();
  }

  // contextOverride = false: read from the real document, not the sandbox.
  const $body = removeUnusedTags($('body', null, null, false).clone());
  const $head = removeUnusedTags($('head', null, null, false).clone());

  // Wrap once more and step up to the parent so .html() includes the
  // container element that holds the head and body containers.
  return $('<container />')
    .append($(`<container>${$head.html()}</container>`))
    .append($(`<container>${$body.html()}</container>`))
    .wrap('<container />')
    .parent()
    .html();
};
87
+
88
/**
 * Loads markup into the hidden parsing sandbox and returns the sandboxed `$`.
 *
 * When `html` is falsy the current page is cloned via $.cloneHtml();
 * otherwise the supplied markup is placed inside a fresh <container>.
 * Scripts, stylesheets, and HTML comments are stripped before the result
 * replaces the sandbox's contents. The sandbox element is created on the
 * first call and reused afterwards.
 *
 * @param {string} [html] - markup to load; falsy means "use the live page"
 * @param {object} [opts={}] - accepted but not used by this implementation
 *   (presumably kept for parity with cheerio.load; confirm with callers)
 * @param {boolean} [returnHtml=false] - when true, also return the cleaned
 *   markup
 * @returns {Function|{$: Function, html: string}} `$`, or `{ $, html }` when
 *   `returnHtml` is true
 */
// eslint-disable-next-line no-unused-vars
$.load = (html, opts = {}, returnHtml = false) => {
  const $content = html ? $('<container />').html(html) : $.cloneHtml();

  PARSING_NODE =
    PARSING_NODE || $(`<div class="${PARSER_CLASS}" style="display:none;" />`);

  // Strip scripts
  removeUnusedTags($content);

  // Remove comments. addBack() puts the root elements back into the set so
  // that comments sitting directly under them are visited as well;
  // find('*').contents() alone only reaches children of descendants.
  $content
    .find('*')
    .addBack()
    .contents()
    .each(function() {
      // eslint-disable-next-line no-undef
      if (this.nodeType === Node.COMMENT_NODE) {
        $(this).remove();
      }
    });

  PARSING_NODE.html($content);

  if (returnHtml) return { $, html: $content.html() };

  return $;
};
118
+
119
+ export default $;
@@ -0,0 +1,9 @@
1
+ // this is a shim for the browser build;
2
+ // iconv-lite doubles build size, and we
3
+ // don't need it for already rendered text
4
// Minimal stand-in exposing the two iconv-lite functions defined here:
// no encoding is ever reported as available, and decode() hands its input
// back unchanged.
const iconv = {
  encodingExists() {
    return false;
  },
  decode(s) {
    return s;
  },
};
8
+
9
+ export default iconv;