@jocmp/mercury-parser 2.2.3-dev
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +149 -0
- package/cli.js +111 -0
- package/dist/generate-custom-parser.js +8421 -0
- package/dist/generate-custom-parser.js.map +1 -0
- package/dist/mercury.esm.js +5 -0
- package/dist/mercury.esm.js.map +1 -0
- package/dist/mercury.js +8166 -0
- package/dist/mercury.js.map +1 -0
- package/dist/mercury.web.js +2 -0
- package/dist/mercury.web.js.map +1 -0
- package/package.json +164 -0
- package/src/shims/cheerio-query.js +119 -0
- package/src/shims/iconv-lite.js +9 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019 Postlight
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# Mercury Parser - Extracting content from chaos
|
|
2
|
+
|
|
3
|
+
Mercury Parser extracts the bits that humans care about from any URL you give it. That includes article content, titles, authors, published dates, excerpts, lead images, and more.
|
|
4
|
+
|
|
5
|
+
Mercury Parser allows you to easily create custom parsers using simple JavaScript and CSS selectors. This allows you to proactively manage parsing and migration edge cases. There are [many examples available](https://github.com/jocmp/mercury-parser/tree/master/src/extractors/custom) along with [documentation](https://github.com/jocmp/mercury-parser/blob/master/src/extractors/custom/README.md).
|
|
6
|
+
|
|
7
|
+
## How? Like this.
|
|
8
|
+
|
|
9
|
+
### Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# If you're using yarn
|
|
13
|
+
yarn add @jocmp/mercury-parser
|
|
14
|
+
|
|
15
|
+
# If you're using npm
|
|
16
|
+
npm install @jocmp/mercury-parser
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Usage
|
|
20
|
+
|
|
21
|
+
```javascript
|
|
22
|
+
import Parser from '@jocmp/mercury-parser';
|
|
23
|
+
|
|
24
|
+
Parser.parse(url).then(result => console.log(result));
|
|
25
|
+
|
|
26
|
+
// NOTE: When used in the browser, you can omit the URL argument
|
|
27
|
+
// and simply run `Parser.parse()` to parse the current page.
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
The result looks like this:
|
|
31
|
+
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"title": "Thunder (mascot)",
|
|
35
|
+
"content": "... <p><b>Thunder</b> is the <a href=\"https://en.wikipedia.org/wiki/Stage_name\">stage name</a> for the...",
|
|
36
|
+
"author": "Wikipedia Contributors",
|
|
37
|
+
"date_published": "2016-09-16T20:56:00.000Z",
|
|
38
|
+
"lead_image_url": null,
|
|
39
|
+
"dek": null,
|
|
40
|
+
"next_page_url": null,
|
|
41
|
+
"url": "https://en.wikipedia.org/wiki/Thunder_(mascot)",
|
|
42
|
+
"domain": "en.wikipedia.org",
|
|
43
|
+
"excerpt": "Thunder Thunder is the stage name for the horse who is the official live animal mascot for the Denver Broncos",
|
|
44
|
+
"word_count": 4677,
|
|
45
|
+
"direction": "ltr",
|
|
46
|
+
"total_pages": 1,
|
|
47
|
+
"rendered_pages": 1
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
If Parser is unable to find a field, that field will return `null`.
|
|
52
|
+
|
|
53
|
+
#### `parse()` Options
|
|
54
|
+
|
|
55
|
+
##### Content Formats
|
|
56
|
+
|
|
57
|
+
By default, Postlight Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example:
|
|
58
|
+
|
|
59
|
+
```javascript
|
|
60
|
+
Parser.parse(url, { contentType: 'markdown' }).then(result =>
|
|
61
|
+
console.log(result)
|
|
62
|
+
);
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This returns the page's `content` as GitHub-flavored Markdown:
|
|
66
|
+
|
|
67
|
+
```json
|
|
68
|
+
"content": "...**Thunder** is the [stage name](https://en.wikipedia.org/wiki/Stage_name) for the..."
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
##### Custom Request Headers
|
|
72
|
+
|
|
73
|
+
You can include custom headers in requests by passing name-value pairs to the `parse` function as follows:
|
|
74
|
+
|
|
75
|
+
```javascript
|
|
76
|
+
Parser.parse(url, {
|
|
77
|
+
headers: {
|
|
78
|
+
Cookie: 'name=value; name2=value2; name3=value3',
|
|
79
|
+
'User-Agent':
|
|
80
|
+
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
|
|
81
|
+
},
|
|
82
|
+
}).then(result => console.log(result));
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
##### Pre-fetched HTML
|
|
86
|
+
|
|
87
|
+
You can use Postlight Parser to parse custom or pre-fetched HTML by passing an HTML string to the `parse` function as follows:
|
|
88
|
+
|
|
89
|
+
```javascript
|
|
90
|
+
Parser.parse(url, {
|
|
91
|
+
html:
|
|
92
|
+
'<html><body><article><h1>Thunder (mascot)</h1><p>Thunder is the stage name for the horse who is the official live animal mascot for the Denver Broncos</p></article></body></html>',
|
|
93
|
+
}).then(result => console.log(result));
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Note that the URL argument is still supplied, in order to identify the web site and use its custom parser, if it has any, though it will not be used for fetching content.
|
|
97
|
+
|
|
98
|
+
#### The command-line parser
|
|
99
|
+
|
|
100
|
+
Postlight Parser also ships with a CLI, meaning you can use it from your command line like so:
|
|
101
|
+
|
|
102
|
+

|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
# Install Postlight Parser globally
|
|
106
|
+
yarn global add @jocmp/mercury-parser
|
|
107
|
+
# or
|
|
108
|
+
npm -g install @jocmp/mercury-parser
|
|
109
|
+
|
|
110
|
+
# Then
|
|
111
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source
|
|
112
|
+
|
|
113
|
+
# Pass optional --format argument to set content type (html|markdown|text)
|
|
114
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown
|
|
115
|
+
|
|
116
|
+
# Pass optional --header.name=value arguments to include custom headers in the request
|
|
117
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --header.Cookie="name=value; name2=value2; name3=value3" --header.User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
|
|
118
|
+
|
|
119
|
+
# Pass optional --extend argument to add a custom type to the response
|
|
120
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em"
|
|
121
|
+
|
|
122
|
+
# Pass optional --extend-list argument to add a custom type with multiple matches
|
|
123
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a"
|
|
124
|
+
|
|
125
|
+
# Get the value of attributes by adding a pipe to --extend or --extend-list
|
|
126
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
|
|
127
|
+
|
|
128
|
+
# Pass optional --add-extractor argument to add a custom extractor at runtime.
|
|
129
|
+
postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
Licensed under either of the below, at your preference:
|
|
135
|
+
|
|
136
|
+
- Apache License, Version 2.0
|
|
137
|
+
([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
|
|
138
|
+
- MIT license
|
|
139
|
+
([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
|
|
140
|
+
|
|
141
|
+
## Contributing
|
|
142
|
+
|
|
143
|
+
For details on how to contribute to Postlight Parser, including how to write a custom content extractor for any site, see [CONTRIBUTING.md](./CONTRIBUTING.md).
|
|
144
|
+
|
|
145
|
+
Unless it is explicitly stated otherwise, any contribution intentionally submitted for inclusion in the work, as defined in the Apache-2.0 license, shall be dual licensed as above without any additional terms or conditions.
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
🔬 A Labs project from your friends at [Postlight](https://postlight.com). Happy coding!
|
package/cli.js
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#!/usr/bin/env node
/* eslint-disable */

/**
 * Command-line front end for the parser.
 *
 * Reads a URL plus optional flags from the command line, runs
 * `Parser.parse`, and prints the result as pretty-printed JSON on stdout.
 * On failure it prints a diagnostic on stderr and exits with status 1.
 */

const Parser = require('./dist/mercury');
const package_info = require('./package.json');
const argv = require('yargs-parser')(process.argv.slice(2));

// Every option has a long name and a one-letter alias; the first positional
// argument is the URL to parse.
// NOTE(review): `extendList` / `addExtractor` are presumably produced from
// `--extend-list` / `--add-extractor` by yargs-parser's camel-case
// expansion — confirm against the yargs-parser version in use.
const {
  _: [url],
  format,
  f,
  extend,
  e,
  extendList,
  l,
  header,
  h,
  addExtractor,
  x,
  version,
} = argv;

// Accepted --format values mapped to the content type handed to the parser.
const CONTENT_TYPE_MAP = {
  html: 'html',
  markdown: 'markdown',
  md: 'markdown',
  text: 'text',
  txt: 'text',
};

/**
 * Split one `type=selector` (or `type=selector|attribute`) argument.
 *
 * Only the FIRST `=` separates the type name from the selector, so
 * selectors that contain `=` themselves (e.g. `a[rel=author]`) survive
 * intact.
 *
 * @param {*} spec - Raw value of one --extend / --extend-list option.
 * @returns {[string, (string|string[])]} The type name and either the plain
 *   selector or the selector split on `|`.
 * @throws {Error} If the value contains no `=`.
 */
function parseExtension(spec) {
  // A flag given without a value is not a string; normalise so the check
  // below reports it instead of failing on a missing string method.
  const text = String(spec);
  const separatorIndex = text.indexOf('=');
  if (separatorIndex === -1) {
    throw new Error(
      `Invalid extension "${text}": expected the form type=selector`
    );
  }

  const name = text.slice(0, separatorIndex);
  const selector = text.slice(separatorIndex + 1);
  const fullSelector =
    selector.indexOf('|') > 0 ? selector.split('|') : selector;
  return [name, fullSelector];
}

/**
 * Build the `extend` option for `Parser.parse` from the CLI arguments.
 *
 * @param {(string|string[]|undefined)} extendedTypes - --extend value(s).
 * @param {(string|string[]|undefined)} extendedListTypes - --extend-list
 *   value(s); these entries are flagged with `allowMultiple: true`.
 * @returns {Object} Map of type name to extractor definition.
 */
function buildExtensions(extendedTypes, extendedListTypes) {
  const extensions = {};

  // `[].concat(value || [])` accepts a single value, an array, or nothing.
  [].concat(extendedTypes || []).forEach(spec => {
    const [name, fullSelector] = parseExtension(spec);
    extensions[name] = { selectors: [fullSelector] };
  });

  [].concat(extendedListTypes || []).forEach(spec => {
    const [name, fullSelector] = parseExtension(spec);
    extensions[name] = {
      selectors: [fullSelector],
      allowMultiple: true,
    };
  });

  return extensions;
}

/**
 * Run the CLI.
 *
 * @param {string} [urlToParse] - URL to parse; usage is printed when absent.
 * @param {string} [contentType] - Key of CONTENT_TYPE_MAP.
 * @param {(string|string[])} [extendedTypes] - --extend value(s).
 * @param {(string|string[])} [extendedListTypes] - --extend-list value(s).
 * @param {Object} [headers] - Request headers forwarded to the parser.
 * @param {string} [extractorPath] - Module path of a custom extractor.
 * @param {*} [showVersion] - Truthy to print the package version and exit.
 * @returns {Promise<void>}
 */
async function main(
  urlToParse,
  contentType,
  extendedTypes,
  extendedListTypes,
  headers,
  extractorPath,
  showVersion
) {
  if (showVersion) {
    console.log(package_info.version);
    process.exit(0);
  }

  if (!urlToParse) {
    console.log(
      '\n\
postlight-parser\n\n\
The Postlight Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ postlight-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
\n\
'
    );
    return;
  }

  try {
    const extensions = buildExtensions(extendedTypes, extendedListTypes);

    // Attempt to load custom extractor from path.
    // NOTE: `require` resolves a relative path against this file's
    // directory, not the current working directory.
    let customExtractor;
    if (extractorPath) {
      customExtractor = require(extractorPath);
    }

    const result = await Parser.parse(urlToParse, {
      contentType: CONTENT_TYPE_MAP[contentType],
      extend: extensions,
      headers,
      customExtractor,
    });
    console.log(JSON.stringify(result, null, 2));
  } catch (err) {
    if (err.message === 'ETIMEDOUT') {
      console.error(
        '\nPostlight Parser encountered a timeout trying to load that resource.'
      );
    } else {
      console.error(
        '\nPostlight Parser encountered a problem trying to parse that resource.\n'
      );
      console.error(err);
    }
    const reportBug =
      'If you believe this was an error, please file an issue at:\n\n https://github.com/postlight/parser/issues/new';
    console.error(`\n${reportBug}\n`);
    process.exit(1);
  }
}

// `version` must be forwarded here; without it --version can never trigger.
main(
  url,
  format || f,
  extend || e,
  extendList || l,
  header || h,
  addExtractor || x,
  version
);
|