@jocmp/mercury-parser 2.2.3-dev
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +149 -0
- package/cli.js +111 -0
- package/dist/generate-custom-parser.js +8421 -0
- package/dist/generate-custom-parser.js.map +1 -0
- package/dist/mercury.esm.js +5 -0
- package/dist/mercury.esm.js.map +1 -0
- package/dist/mercury.js +8166 -0
- package/dist/mercury.js.map +1 -0
- package/dist/mercury.web.js +2 -0
- package/dist/mercury.web.js.map +1 -0
- package/package.json +164 -0
- package/src/shims/cheerio-query.js +119 -0
- package/src/shims/iconv-lite.js +9 -0
package/package.json
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@jocmp/mercury-parser",
|
|
3
|
+
"version": "2.2.3-dev",
|
|
4
|
+
"description": "Mercury Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
|
|
5
|
+
"author": "jocmp <hello@jocmp.com>",
|
|
6
|
+
"homepage": "https://github.com/jocmp/mercury-parser",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "git+https://github.com/jocmp/mercury-parser.git"
|
|
11
|
+
},
|
|
12
|
+
"bugs": {
|
|
13
|
+
"url": "https://github.com/jocmp/mercury-parser/issues"
|
|
14
|
+
},
|
|
15
|
+
"keywords": [
|
|
16
|
+
"mercury",
|
|
17
|
+
"parser",
|
|
18
|
+
"reader",
|
|
19
|
+
"web",
|
|
20
|
+
"content"
|
|
21
|
+
],
|
|
22
|
+
"files": [
|
|
23
|
+
"dist",
|
|
24
|
+
"cli.js",
|
|
25
|
+
"src/shims/"
|
|
26
|
+
],
|
|
27
|
+
"main": "./dist/mercury.js",
|
|
28
|
+
"bin": {
|
|
29
|
+
"mercury-parser": "./cli.js"
|
|
30
|
+
},
|
|
31
|
+
"scripts": {
|
|
32
|
+
"lint": "eslint . --fix",
|
|
33
|
+
"lint:ci": "remark . && eslint .",
|
|
34
|
+
"lint-fix-quiet": "eslint --fix --quiet",
|
|
35
|
+
"build": "yarn lint && rollup -c && yarn test:build",
|
|
36
|
+
"build:ci": "rollup -c && yarn test:build",
|
|
37
|
+
"build:web": "yarn lint && rollup -c rollup.config.web.js && yarn test:build:web",
|
|
38
|
+
"build:esm": "yarn lint && rollup -c rollup.config.esm.js && yarn test:build:esm",
|
|
39
|
+
"build:esm:ci": "rollup -c rollup.config.esm.js && yarn test:build:esm",
|
|
40
|
+
"build:web:ci": "rollup -c rollup.config.web.js && yarn test:build:web",
|
|
41
|
+
"release": "yarn build && yarn build:web",
|
|
42
|
+
"build:generator": "rollup -c scripts/rollup.config.js",
|
|
43
|
+
"test_build": "rollup -c",
|
|
44
|
+
"test": "yarn test:node && yarn test:web",
|
|
45
|
+
"test:node": "jest --json --outputFile test-output.json",
|
|
46
|
+
"test:web": "node ./node_modules/karma/bin/karma start karma.conf.js --auto-watch",
|
|
47
|
+
"test:build": "cd ./scripts && jest check-build.test.js",
|
|
48
|
+
"test:build:web": "node ./scripts/proxy-browser-test.js",
|
|
49
|
+
"test:build:esm": "node ./scripts/proxy-browser-test.js",
|
|
50
|
+
"watch:test": "jest --watch",
|
|
51
|
+
"generate-parser": "node ./dist/generate-custom-parser.js"
|
|
52
|
+
},
|
|
53
|
+
"engines": {
|
|
54
|
+
"node": ">=10"
|
|
55
|
+
},
|
|
56
|
+
"devDependencies": {
|
|
57
|
+
"@babel/core": "^7.0.0",
|
|
58
|
+
"@babel/plugin-transform-runtime": "^7.0.0",
|
|
59
|
+
"@babel/polyfill": "^7.0.0",
|
|
60
|
+
"@babel/preset-env": "^7.0.0",
|
|
61
|
+
"@babel/runtime": "^7.0.0",
|
|
62
|
+
"@jesses/circle-github-bot": "^2.1.0",
|
|
63
|
+
"@octokit/rest": "^16.9.0",
|
|
64
|
+
"babel-core": "^7.0.0-bridge.0",
|
|
65
|
+
"babel-eslint": "^10.0.1",
|
|
66
|
+
"babel-jest": "^23.4.2",
|
|
67
|
+
"babel-plugin-module-alias": "^1.6.0",
|
|
68
|
+
"babel-plugin-module-resolver": "^3.1.2",
|
|
69
|
+
"babelify": "^10.0.0",
|
|
70
|
+
"babelrc-rollup": "^3.0.0",
|
|
71
|
+
"brfs": "^2.0.1",
|
|
72
|
+
"brfs-babel": "^2.0.0",
|
|
73
|
+
"browserify": "^16.2.3",
|
|
74
|
+
"changelog-maker": "^2.3.0",
|
|
75
|
+
"eslint": "^5.12.0",
|
|
76
|
+
"eslint-config-airbnb": "^17.1.0",
|
|
77
|
+
"eslint-config-prettier": "^6.1.0",
|
|
78
|
+
"eslint-import-resolver-babel-module": "^2.2.1",
|
|
79
|
+
"eslint-plugin-babel": "^5.3.0",
|
|
80
|
+
"eslint-plugin-import": "^2.14.0",
|
|
81
|
+
"eslint-plugin-jsx-a11y": "^6.1.2",
|
|
82
|
+
"eslint-plugin-react": "^7.12.3",
|
|
83
|
+
"express": "^4.16.4",
|
|
84
|
+
"husky": "^3.0.0",
|
|
85
|
+
"inquirer": "^7.0.0",
|
|
86
|
+
"jasmine-core": "^2.5.2",
|
|
87
|
+
"jest": "^23.6.0",
|
|
88
|
+
"jest-cli": "^23.6.0",
|
|
89
|
+
"karma": "^6.3.16",
|
|
90
|
+
"karma-browserify": "8.1.0",
|
|
91
|
+
"karma-chrome-launcher": "^3.0.0",
|
|
92
|
+
"karma-cli": "^2.0.0",
|
|
93
|
+
"karma-jasmine": "^1.0.2",
|
|
94
|
+
"karma-mocha": "^1.3.0",
|
|
95
|
+
"karma-requirejs": "^1.1.0",
|
|
96
|
+
"lint-staged": "^8.1.0",
|
|
97
|
+
"mocha": "^6.0.0",
|
|
98
|
+
"nock": "^10.0.6",
|
|
99
|
+
"ora": "^4.0.0",
|
|
100
|
+
"prettier": "^1.15.3",
|
|
101
|
+
"remark-cli": "^7.0.0",
|
|
102
|
+
"remark-lint": "^6.0.4",
|
|
103
|
+
"remark-preset-lint-recommended": "^3.0.2",
|
|
104
|
+
"request": "^2.88.2",
|
|
105
|
+
"requirejs": "^2.3.6",
|
|
106
|
+
"rollup": "^1.1.0",
|
|
107
|
+
"rollup-plugin-babel": "^4.0.1",
|
|
108
|
+
"rollup-plugin-commonjs": "^9.2.0",
|
|
109
|
+
"rollup-plugin-node-globals": "^1.4.0",
|
|
110
|
+
"rollup-plugin-node-resolve": "^2.0.0",
|
|
111
|
+
"rollup-plugin-terser": "^6.1.0",
|
|
112
|
+
"rollup-plugin-uglify": "^6.0.1",
|
|
113
|
+
"watchify": "^3.11.1"
|
|
114
|
+
},
|
|
115
|
+
"dependencies": {
|
|
116
|
+
"@babel/runtime-corejs2": "^7.2.0",
|
|
117
|
+
"@postlight/ci-failed-test-reporter": "^1.0",
|
|
118
|
+
"browser-request": "github:postlight/browser-request#feat-add-headers-to-response",
|
|
119
|
+
"cheerio": "^0.22.0",
|
|
120
|
+
"difflib": "github:postlight/difflib.js",
|
|
121
|
+
"ellipsize": "0.1.0",
|
|
122
|
+
"iconv-lite": "0.5.0",
|
|
123
|
+
"jquery": "^3.5.0",
|
|
124
|
+
"moment": "^2.23.0",
|
|
125
|
+
"moment-parseformat": "3.0.0",
|
|
126
|
+
"moment-timezone": "0.5.37",
|
|
127
|
+
"postman-request": "^2.88.1-postman.31",
|
|
128
|
+
"string-direction": "^0.1.2",
|
|
129
|
+
"turndown": "^7.1.1",
|
|
130
|
+
"valid-url": "^1.0.9",
|
|
131
|
+
"wuzzy": "^0.1.4",
|
|
132
|
+
"yargs-parser": "^15.0.1"
|
|
133
|
+
},
|
|
134
|
+
"bundleDependencies": [
|
|
135
|
+
"jquery",
|
|
136
|
+
"moment-timezone",
|
|
137
|
+
"browser-request"
|
|
138
|
+
],
|
|
139
|
+
"browser": {
|
|
140
|
+
"main": "./dist/mercury.web.js",
|
|
141
|
+
"cheerio": "./src/shims/cheerio-query",
|
|
142
|
+
"jquery": "./node_modules/jquery/dist/jquery.min.js",
|
|
143
|
+
"postman-request": "browser-request",
|
|
144
|
+
"iconv-lite": "./src/shims/iconv-lite",
|
|
145
|
+
"moment-timezone": "./node_modules/moment-timezone/builds/moment-timezone-with-data-2012-2022.min.js"
|
|
146
|
+
},
|
|
147
|
+
"husky": {
|
|
148
|
+
"hooks": {
|
|
149
|
+
"pre-commit": "lint-staged"
|
|
150
|
+
}
|
|
151
|
+
},
|
|
152
|
+
"lint-staged": {
|
|
153
|
+
"*.js": [
|
|
154
|
+
"eslint --fix",
|
|
155
|
+
"prettier --write",
|
|
156
|
+
"git add"
|
|
157
|
+
],
|
|
158
|
+
"*.{json,css,md}": [
|
|
159
|
+
"remark .",
|
|
160
|
+
"prettier --write",
|
|
161
|
+
"git add"
|
|
162
|
+
]
|
|
163
|
+
}
|
|
164
|
+
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
// This module attempts to square cheerio with jquery
|
|
2
|
+
// so that node-specific quirks/features of cheerio
|
|
3
|
+
// will also work in the browser. This mostly involves
|
|
4
|
+
// shimming a few functions and rewriting the jquery
|
|
5
|
+
// constructor so it sandboxes most of its operations
|
|
6
|
+
// and doesn't mutate existing dom elements in the page.
|
|
7
|
+
|
|
8
|
+
import jQuery from 'jquery';
|
|
9
|
+
|
|
10
|
+
// Class name put on the hidden wrapper <div> that $.load creates.
const PARSER_CLASS = 'mercury-parsing-container';
|
|
11
|
+
// Hidden wrapper element that holds the most recently loaded markup. It stays
// undefined until $.load runs; after that it is the default lookup context for $.
let PARSING_NODE;
|
|
12
|
+
|
|
13
|
+
// Release jQuery's claim on the global `$` so the host page's own `$` is left alone.
jQuery.noConflict();
|
|
14
|
+
/**
 * Drop-in replacement for the jQuery constructor that scopes lookups to the
 * parsing node instead of the live page.
 *
 * @param {*} selector - Anything jQuery's constructor accepts.
 * @param {*} [context] - Lookup context. A string is treated as a selector
 *   resolved inside the parsing node; a falsy value means "the parsing node".
 * @param {*} [rootjQuery] - Forwarded untouched to jQuery's init.
 * @param {boolean} [contextOverride=true] - Pass false to skip the context
 *   rewriting and hand `context` to jQuery as-is.
 * @returns {jQuery} The jQuery collection built by `jQuery.fn.init`.
 */
const $ = (selector, context, rootjQuery, contextOverride = true) => {
  let scope = context;

  if (contextOverride) {
    if (!scope) {
      // Nothing supplied: fall back to the sandbox (may still be undefined
      // when $.load has not run yet).
      scope = PARSING_NODE;
    } else if (typeof scope === 'string') {
      // Selector strings are resolved relative to the sandbox.
      scope = PARSING_NODE.find(scope);
    }
  }

  return new jQuery.fn.init(selector, scope, rootjQuery); // eslint-disable-line new-cap
};
|
|
25
|
+
|
|
26
|
+
// eslint-disable-next-line no-multi-assign
|
|
27
|
+
$.fn = $.prototype = jQuery.fn; // expose jQuery's prototype through the wrapper
|
|
28
|
+
jQuery.extend($, jQuery); // copies jQuery's static helpers (trim, extend, etc.) onto $
|
|
29
|
+
|
|
30
|
+
/**
 * Removes <script>, <style> and stylesheet <link> descendants from a node.
 *
 * @param {jQuery} $node - Collection to clean; it is modified in place.
 * @returns {jQuery} The same `$node`, to allow chaining.
 */
const removeUnusedTags = $node => {
  const $unwanted = $node.find('script, style, link[rel="stylesheet"]');
  $unwanted.remove();

  return $node;
};
|
|
36
|
+
|
|
37
|
+
/**
 * Copies the page's <html> element, strips scripts and stylesheets from the
 * copy, and wraps each of its children in two nested <div>s.
 *
 * The lookup passes `contextOverride = false`, so it is not scoped to the
 * parsing node.
 *
 * @returns {jQuery} The cloned children of <html> (not the wrapper <div>s).
 */
$.cloneHtml = () => {
  const $documentCopy = $('html', null, null, false).clone();
  const $children = removeUnusedTags($documentCopy).children();
  const $wrappedOnce = $children.wrap('<div />');

  return $wrappedOnce.wrap('<div />');
};
|
|
45
|
+
|
|
46
|
+
// Returns the first element matched by `*` in the default context
// (the parsing node once $.load has run).
$.root = () => {
  const $everything = $('*');

  return $everything.first();
};
|
|
47
|
+
|
|
48
|
+
// NOTE(review): presumably lets callers detect that this jQuery-backed shim,
// rather than cheerio, is in use — confirm against the consumers of `$.browser`.
$.browser = true;
|
|
49
|
+
|
|
50
|
+
/**
 * Tells whether the first element of a collection is a <container> tag.
 *
 * @param {jQuery} $node - Collection to inspect; only its first element is read.
 * @returns {boolean} True when that element's tag name is "container"
 *   (case-insensitive); false when the collection is empty or the first
 *   entry has no tag name.
 */
const isContainer = $node => {
  const firstElement = $node.get(0);
  const hasTagName = Boolean(firstElement && firstElement.tagName);

  return hasTagName && firstElement.tagName.toLowerCase() === 'container';
};
|
|
58
|
+
|
|
59
|
+
/**
 * Serializes a node, or the whole parsed document when called without one.
 *
 * @param {jQuery} [$node] - Collection to serialize. When it is, or directly
 *   contains, a <container> wrapper, only the wrapper's inner HTML is
 *   returned; otherwise the outer HTML of its first element is returned.
 * @returns {string} HTML markup. With no `$node`, this is the inner HTML of
 *   the parsing node's first child, or, if nothing has been loaded yet, a
 *   snapshot of the page's <head> and <body> with each one's contents wrapped
 *   in a <container>.
 */
$.html = $node => {
  if ($node) {
    // we never want to return a parsing container, only its children
    if (isContainer($node) || isContainer($node.children('container'))) {
      return $node.children('container').html() || $node.html();
    }

    // Append a clone to a scratch <div> so we get the element's outer HTML
    // without moving the original.
    return $('<div>')
      .append($node.eq(0).clone())
      .html();
  }

  if (PARSING_NODE && PARSING_NODE.length > 0) {
    return PARSING_NODE.children().html();
  }

  // Fallback for when nothing has been loaded: snapshot the live page. The
  // clones are made only here so that the common path above does not pay
  // for deep-copying <body> and <head> just to throw the copies away.
  const $body = removeUnusedTags($('body', null, null, false).clone());
  const $head = removeUnusedTags($('head', null, null, false).clone());

  return $('<container />')
    .append($(`<container>${$head.html()}</container>`))
    .append($(`<container>${$body.html()}</container>`))
    .wrap('<container />')
    .parent()
    .html();
};
|
|
87
|
+
|
|
88
|
+
/**
 * Loads markup into the hidden parsing node and returns the sandboxed `$`.
 *
 * @param {string} [html] - Markup to parse. When falsy, a cleaned copy of the
 *   current page (see `$.cloneHtml`) is used instead.
 * @param {Object} [opts={}] - Accepted for signature compatibility; this
 *   function does not read it.
 * @param {boolean} [returnHtml=false] - When true, also return the cleaned
 *   markup.
 * @returns {Function|{$: Function, html: string}} `$`, or `{ $, html }` when
 *   `returnHtml` is true.
 */
// eslint-disable-next-line no-unused-vars
$.load = (html, opts = {}, returnHtml = false) => {
  const $content = html ? $('<container />').html(html) : $.cloneHtml();

  // The parsing node is created once and reused by later loads.
  PARSING_NODE =
    PARSING_NODE || $(`<div class="${PARSER_CLASS}" style="display:none;" />`);

  // Strip scripts
  removeUnusedTags($content);

  // Remove comments. find('*') alone only reaches descendants, so addBack()
  // is needed to also visit the root element(s) and catch comments that are
  // their direct children.
  $content
    .find('*')
    .addBack()
    .contents()
    // eslint-disable-next-line no-undef
    .filter((index, node) => node.nodeType === Node.COMMENT_NODE)
    .remove();

  PARSING_NODE.html($content);

  if (returnHtml) return { $, html: $content.html() };

  return $;
};
|
|
118
|
+
|
|
119
|
+
export default $;
|