html-get 2.9.13 → 2.9.16-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -13
- package/bin/index.js +13 -4
- package/package.json +26 -24
- package/src/auto-domains.json +16 -16
- package/src/html.js +3 -4
- package/src/index.js +1 -1
package/README.md
CHANGED
|
@@ -38,21 +38,37 @@ $ npm install puppeteer html-get --save
|
|
|
38
38
|
## Usage
|
|
39
39
|
|
|
40
40
|
```js
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
const createBrowserless = require('browserless')
|
|
43
42
|
const getHTML = require('html-get')
|
|
44
43
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
44
|
+
// Spawn Chromium process once
|
|
45
|
+
const browserlessFactory = createBrowserless()
|
|
46
|
+
|
|
47
|
+
// Kill the process when Node.js exit
|
|
48
|
+
process.on('exit', () => {
|
|
49
|
+
console.log('closing resources!')
|
|
50
|
+
browserlessFactory.close()
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
const getContent = async url => {
|
|
54
|
+
// create a browser context inside Chromium process
|
|
55
|
+
const browserContext = browserlessFactory.createContext()
|
|
56
|
+
const getBrowserless = () => browserContext
|
|
57
|
+
const result = await getHTML(url, { getBrowserless })
|
|
58
|
+
// close the browser context after it's used
|
|
59
|
+
await getBrowserless((browser) => browser.destroyContext())
|
|
60
|
+
return result
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
getContent('https://example.com')
|
|
64
|
+
.then(content => {
|
|
65
|
+
console.log(content)
|
|
66
|
+
process.exit()
|
|
67
|
+
})
|
|
68
|
+
.catch(error => {
|
|
69
|
+
console.error(error)
|
|
70
|
+
process.exit(1)
|
|
71
|
+
})
|
|
56
72
|
```
|
|
57
73
|
|
|
58
74
|
### Command Line
|
package/bin/index.js
CHANGED
|
@@ -2,18 +2,24 @@
|
|
|
2
2
|
|
|
3
3
|
'use strict'
|
|
4
4
|
|
|
5
|
+
const createBrowserless = require('browserless')
|
|
5
6
|
const minimist = require('minimist')
|
|
6
7
|
const { URL } = require('url')
|
|
7
8
|
|
|
8
9
|
const getHTML = require('..')
|
|
9
10
|
|
|
11
|
+
const browserlessFactory = createBrowserless()
|
|
12
|
+
|
|
10
13
|
const [input, ...argv] = process.argv.slice(2)
|
|
11
14
|
const url = new URL(input).toString()
|
|
12
15
|
|
|
13
16
|
const { debug: isDebug, ...args } = minimist(argv)
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
|
|
18
|
+
const browserContext = browserlessFactory.createContext()
|
|
19
|
+
const getBrowserless = () => browserContext
|
|
20
|
+
|
|
21
|
+
getHTML(url, { getBrowserless, ...args })
|
|
22
|
+
.then(async ({ html, stats, headers, statusCode }) => {
|
|
17
23
|
if (isDebug) {
|
|
18
24
|
console.log(`
|
|
19
25
|
url: ${url}
|
|
@@ -31,10 +37,13 @@ getHTML(url, args)
|
|
|
31
37
|
} else {
|
|
32
38
|
console.log(html)
|
|
33
39
|
}
|
|
34
|
-
|
|
35
40
|
process.exit(0)
|
|
36
41
|
})
|
|
37
|
-
.catch(err => {
|
|
42
|
+
.catch(async err => {
|
|
38
43
|
console.error(err)
|
|
39
44
|
process.exit(1)
|
|
40
45
|
})
|
|
46
|
+
.finally(async () => {
|
|
47
|
+
await getBrowserless(browser => browser.destroyContext())
|
|
48
|
+
browserlessFactory.close()
|
|
49
|
+
})
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.9.
|
|
5
|
+
"version": "2.9.16-0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -29,14 +29,14 @@
|
|
|
29
29
|
"request"
|
|
30
30
|
],
|
|
31
31
|
"dependencies": {
|
|
32
|
-
"@metascraper/helpers": "~5.
|
|
32
|
+
"@metascraper/helpers": "~5.29.0",
|
|
33
33
|
"cheerio": "~1.0.0-rc.10",
|
|
34
34
|
"css-url-regex": "~4.0.0",
|
|
35
35
|
"debug-logfmt": "~1.0.4",
|
|
36
36
|
"execall": "~2.0.0",
|
|
37
37
|
"got": "~11.8.2",
|
|
38
38
|
"html-encode": "~2.1.6",
|
|
39
|
-
"html-urls": "~2.4.
|
|
39
|
+
"html-urls": "~2.4.34",
|
|
40
40
|
"is-html-content": "~1.0.0",
|
|
41
41
|
"lodash": "~4.17.21",
|
|
42
42
|
"minimist": "~1.2.6",
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
"p-retry": "~4.6.0",
|
|
45
45
|
"replace-string": "~3.1.0",
|
|
46
46
|
"time-span": "~4.0.0",
|
|
47
|
-
"tldts": "~5.7.
|
|
48
|
-
"top-sites": "~1.1.
|
|
47
|
+
"tldts": "~5.7.74",
|
|
48
|
+
"top-sites": "~1.1.96",
|
|
49
49
|
"write-json-file": "~4.3.0"
|
|
50
50
|
},
|
|
51
51
|
"devDependencies": {
|
|
@@ -77,26 +77,12 @@
|
|
|
77
77
|
"scripts",
|
|
78
78
|
"src"
|
|
79
79
|
],
|
|
80
|
-
"scripts": {
|
|
81
|
-
"clean": "rm -rf node_modules",
|
|
82
|
-
"contributors": "(git-authors-cli && finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
83
|
-
"lint": "standard-markdown README.md && standard",
|
|
84
|
-
"postinstall": "node scripts/postinstall",
|
|
85
|
-
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
86
|
-
"prerelease": "npm run update:check && npm run contributors",
|
|
87
|
-
"pretest": "npm run lint",
|
|
88
|
-
"release": "standard-version -a",
|
|
89
|
-
"release:github": "conventional-github-releaser -p angular",
|
|
90
|
-
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
91
|
-
"test": "c8 ava",
|
|
92
|
-
"update": "ncu -u",
|
|
93
|
-
"update:check": "ncu -- --error-level 2"
|
|
94
|
-
},
|
|
95
80
|
"license": "MIT",
|
|
96
81
|
"ava": {
|
|
82
|
+
"workerThreads": false,
|
|
97
83
|
"files": [
|
|
98
|
-
"
|
|
99
|
-
"test
|
|
84
|
+
"test/**/*.js",
|
|
85
|
+
"!test/util.js"
|
|
100
86
|
],
|
|
101
87
|
"timeout": "2m"
|
|
102
88
|
},
|
|
@@ -119,5 +105,21 @@
|
|
|
119
105
|
"simple-git-hooks": {
|
|
120
106
|
"commit-msg": "npx commitlint --edit",
|
|
121
107
|
"pre-commit": "npx nano-staged"
|
|
122
|
-
}
|
|
123
|
-
|
|
108
|
+
},
|
|
109
|
+
"scripts": {
|
|
110
|
+
"clean": "rm -rf node_modules",
|
|
111
|
+
"contributors": "(git-authors-cli && finepack --sort-ignore-object-at ava && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
112
|
+
"lint": "standard-markdown README.md && standard",
|
|
113
|
+
"postinstall": "node scripts/postinstall",
|
|
114
|
+
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
115
|
+
"prerelease": "npm run update:check && npm run contributors",
|
|
116
|
+
"pretest": "npm run lint",
|
|
117
|
+
"release": "standard-version -a",
|
|
118
|
+
"release:github": "conventional-github-releaser -p angular",
|
|
119
|
+
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
120
|
+
"test": "c8 ava",
|
|
121
|
+
"update": "ncu -u",
|
|
122
|
+
"update:check": "ncu -- --error-level 2"
|
|
123
|
+
},
|
|
124
|
+
"readme": "<div align=\"center\">\n <img src=\"https://cdn.microlink.io/logo/banner.png\" alt=\"microlink cdn\">\n <br>\n <br>\n</div>\n\n\n[](https://coveralls.io/github/microlinkhq/html-get)\n[](https://www.npmjs.org/package/html-get)\n\n> Get the HTML from any website, using prerendering when is necessary.\n\n## Features\n\n- Get HTML markup from any website (client side apps as well).\n- Prerendering detection based on domains list.\n- Speed up process blocking ads trackers.\n- Encoding body response properly.\n\n<br>\n\nHeadless technology like [puppeteer](https://github.com/GoogleChrome/puppeteer) brings us to get the HTML markup from any website, even when the target URL is client side app and we need to wait until dom events fire for getting the real markup.\n\nGenerally this approach better than a simple GET request from the target URL, but because you need to wait for dom events, prerendering could be slow and in some scenario unnecessary (sites that use server side rendering could be resolved with a simple GET).\n\n**html-get** bring the best of both worlds, doing the following algorithm:\n\n- Determinate if the target URL actually needs prerendering (internally it has a [list of popular site domains](https://github.com/microlinkhq/html-get/blob/master/src/auto-domains.js) that don't need it).\n- If it needs prerendering, perform the action using Headless technology, blocking ads trackers requests for speed up the process, trying to resolve the main request in the minimum amount of time.\n- If it does not need prerendering or prerendering fails for any reason (for example, timeout), the request will be resolved doing a GET request.\n\n## Install\n\n```bash\n$ npm install puppeteer html-get --save\n```\n\n## Usage\n\n```js\nconst createBrowserless = require('browserless')\nconst getHTML = require('html-get')\n\n// Spawn Chromium process once\nconst browserlessFactory = createBrowserless()\n\n// Kill the process when Node.js exit\nprocess.on('exit', () => {\n console.log('closing resources!')\n browserlessFactory.close()\n})\n\nconst getContent = async url => {\n // create a browser context inside Chromium process\n const browserContext = browserlessFactory.createContext()\n const getBrowserless = () => browserContext\n const result = await getHTML(url, { getBrowserless })\n // close the browser context after it's used\n await getBrowserless((browser) => browser.destroyContext())\n return result\n}\n\ngetContent('https://example.com')\n .then(content => {\n console.log(content)\n process.exit()\n })\n .catch(error => {\n console.error(error)\n process.exit(1)\n })\n```\n\n### Command Line\n\n```\n$ npx html-get https://example.com\n```\n\n## API\n\n### getHTML(url, [options])\n\n#### url\n\n*Required*<br>\nType: `string`\n\nThe target URL for getting the HTML markup.\n\n##### getBrowserless\n\n*Required*<br>\nType: `function`<br>\n\nA function that should return a [browserless](https://browserless.js.org/) instance to be used for interact with puppeteer:\n\n#### options\n\n##### prerender\n\nType: `boolean`|`string`<br>\nDefault: `'auto'`\n\nEnable or disable prerendering as mechanism for getting the HTML markup explicitly.\n\nThe value `auto` means that that internally use a list of websites that don't need to use prerendering by default. This list is used for speedup the process, using `fetch` mode for these websites.\n\nSee [getMode parameter](#getMode) for know more.\n\n##### encoding\n\nType: `string`<br>\nDefault: `'utf-8'`\n\nEncoding the HTML markup properly from the body response.\n\nIt determines the encode to use A Node.js library for converting HTML documents of arbitrary encoding into a target encoding (utf8, utf16, etc).\n\n##### headers\n\nType: `object`<br>\n\nRequest headers that will be passed to fetch/prerender process.\n\n##### getMode\n\nType: `function`<br>\n\nA function evaluation that will be invoked to determinate the resolutive `mode` for getting the HTML markup from the target URL.\n\nThe default `getMode` is:\n\n```js\nconst getMode = (url, { prerender }) => {\n if (prerender === false) return 'fetch'\n if (prerender !== 'auto') return 'prerender'\n return autoDomains.includes(getDomain(url)) ? 'fetch' : 'prerender'\n}\n```\n\n##### gotOptions\n\nType: `object`<br>\n\nUnder `mode=fetch`, pass configuration object to [got](https://www.npmjs.com/package/got).\n\n##### puppeteerOpts\n\nType: `object`\n\nUnder non `mode=fetch`, pass configuration object to [puppeteer](https://www.npmjs.com/package/puppeteer).\n\n##### rewriteUrls\n\nType: `boolean`<br>\nDefault: `false`\n\nWhen is `true`, it will be rewritten CSS/HTML relatives URLs present in the HTML markup into absolutes.\n\n## License\n\n**html-get** © [Microlink](https://microlink.io), Released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>\nAuthored and maintained by [Kiko Beats](https://kikobeats.com) with help from [contributors](https://github.com/microlinkhq/html-get/contributors).\n\n> [microlink.io](https://microlink.io) · GitHub [@MicrolinkHQ](https://github.com/microlinkhq) · Twitter [@microlinkhq](https://twitter.com/microlinkhq)\n"
|
|
125
|
+
}
|
package/src/auto-domains.json
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
1
|
[
|
|
2
|
-
"google",
|
|
3
2
|
"youtube",
|
|
4
|
-
"microsoft",
|
|
5
3
|
"apple",
|
|
4
|
+
"google",
|
|
5
|
+
"microsoft",
|
|
6
6
|
"wikipedia",
|
|
7
7
|
"wordpress",
|
|
8
8
|
"blogspot",
|
|
9
|
-
"github",
|
|
10
9
|
"vimeo",
|
|
11
|
-
"
|
|
12
|
-
"theguardian",
|
|
13
|
-
"bbc",
|
|
10
|
+
"github",
|
|
14
11
|
"nytimes",
|
|
12
|
+
"imdb",
|
|
15
13
|
"slideshare",
|
|
16
|
-
"
|
|
14
|
+
"bbc",
|
|
15
|
+
"theguardian",
|
|
17
16
|
"telegraph",
|
|
18
17
|
"huffingtonpost",
|
|
18
|
+
"pinterest",
|
|
19
|
+
"soundcloud",
|
|
20
|
+
"spotify",
|
|
21
|
+
"eventbrite",
|
|
22
|
+
"stackoverflow",
|
|
19
23
|
"zoom",
|
|
20
24
|
"techcrunch",
|
|
21
|
-
"spotify",
|
|
22
25
|
"yelp",
|
|
23
|
-
"eventbrite",
|
|
24
|
-
"soundcloud",
|
|
25
|
-
"theverge",
|
|
26
26
|
"engadget",
|
|
27
|
-
"
|
|
28
|
-
"
|
|
27
|
+
"theverge",
|
|
28
|
+
"reddit",
|
|
29
29
|
"etsy",
|
|
30
|
-
"csdn",
|
|
31
30
|
"flickr",
|
|
31
|
+
"digg",
|
|
32
|
+
"csdn",
|
|
32
33
|
"ghost",
|
|
33
34
|
"giphy",
|
|
35
|
+
"imgur",
|
|
34
36
|
"meetup",
|
|
35
37
|
"producthunt",
|
|
36
|
-
"reddit",
|
|
37
38
|
"sourceforge",
|
|
38
|
-
"stackoverflow",
|
|
39
39
|
"tumblr",
|
|
40
40
|
"ycombinator"
|
|
41
41
|
]
|
package/src/html.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
const { startsWith, get, split, nth, castArray, forEach } = require('lodash')
|
|
4
3
|
const { date: toDate, isUrl, isMime } = require('@metascraper/helpers')
|
|
4
|
+
const { get, split, nth, castArray, forEach } = require('lodash')
|
|
5
5
|
const { TAGS: URL_TAGS } = require('html-urls')
|
|
6
6
|
const replaceString = require('replace-string')
|
|
7
7
|
const isHTML = require('is-html-content')
|
|
@@ -75,7 +75,8 @@ const rewriteHtmlUrls = ({ $, url }) => {
|
|
|
75
75
|
$(tagName.join(',')).each(function () {
|
|
76
76
|
const el = $(this)
|
|
77
77
|
const attr = el.attr(urlAttr)
|
|
78
|
-
|
|
78
|
+
|
|
79
|
+
if (typeof attr === 'string' && !attr.startsWith('http')) {
|
|
79
80
|
try {
|
|
80
81
|
const newAttr = new URL(attr, url).toString()
|
|
81
82
|
el.attr(urlAttr, newAttr)
|
|
@@ -166,5 +167,3 @@ module.exports = ({
|
|
|
166
167
|
|
|
167
168
|
return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
|
|
168
169
|
}
|
|
169
|
-
|
|
170
|
-
module.exports.isHTML = isHTML
|
package/src/index.js
CHANGED
|
@@ -161,7 +161,7 @@ module.exports = async (
|
|
|
161
161
|
rewriteUrls = false
|
|
162
162
|
} = {}
|
|
163
163
|
) => {
|
|
164
|
-
if (!getBrowserless) {
|
|
164
|
+
if (!getBrowserless && prerender !== false) {
|
|
165
165
|
throw TypeError(
|
|
166
166
|
"Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
|
|
167
167
|
)
|