html-get 2.9.14 → 2.9.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -13
- package/bin/index.js +13 -4
- package/package.json +5 -4
- package/src/auto-domains.json +13 -13
- package/src/html.js +3 -4
- package/src/index.js +1 -1
package/README.md
CHANGED
|
@@ -38,21 +38,37 @@ $ npm install puppeteer html-get --save
|
|
|
38
38
|
## Usage
|
|
39
39
|
|
|
40
40
|
```js
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
const createBrowserless = require('browserless')
|
|
43
42
|
const getHTML = require('html-get')
|
|
44
43
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
44
|
+
// Spawn Chromium process once
|
|
45
|
+
const browserlessFactory = createBrowserless()
|
|
46
|
+
|
|
47
|
+
// Kill the process when Node.js exit
|
|
48
|
+
process.on('exit', () => {
|
|
49
|
+
console.log('closing resources!')
|
|
50
|
+
browserlessFactory.close()
|
|
51
|
+
})
|
|
52
|
+
|
|
53
|
+
const getContent = async url => {
|
|
54
|
+
// create a browser context inside Chromium process
|
|
55
|
+
const browserContext = browserlessFactory.createContext()
|
|
56
|
+
const getBrowserless = () => browserContext
|
|
57
|
+
const result = await getHTML(url, { getBrowserless })
|
|
58
|
+
// close the browser context after it's used
|
|
59
|
+
await getBrowserless((browser) => browser.destroyContext())
|
|
60
|
+
return result
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
getContent('https://example.com')
|
|
64
|
+
.then(content => {
|
|
65
|
+
console.log(content)
|
|
66
|
+
process.exit()
|
|
67
|
+
})
|
|
68
|
+
.catch(error => {
|
|
69
|
+
console.error(error)
|
|
70
|
+
process.exit(1)
|
|
71
|
+
})
|
|
56
72
|
```
|
|
57
73
|
|
|
58
74
|
### Command Line
|
package/bin/index.js
CHANGED
|
@@ -2,18 +2,24 @@
|
|
|
2
2
|
|
|
3
3
|
'use strict'
|
|
4
4
|
|
|
5
|
+
const createBrowserless = require('browserless')
|
|
5
6
|
const minimist = require('minimist')
|
|
6
7
|
const { URL } = require('url')
|
|
7
8
|
|
|
8
9
|
const getHTML = require('..')
|
|
9
10
|
|
|
11
|
+
const browserlessFactory = createBrowserless()
|
|
12
|
+
|
|
10
13
|
const [input, ...argv] = process.argv.slice(2)
|
|
11
14
|
const url = new URL(input).toString()
|
|
12
15
|
|
|
13
16
|
const { debug: isDebug, ...args } = minimist(argv)
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
|
|
18
|
+
const browserContext = browserlessFactory.createContext()
|
|
19
|
+
const getBrowserless = () => browserContext
|
|
20
|
+
|
|
21
|
+
getHTML(url, { getBrowserless, ...args })
|
|
22
|
+
.then(async ({ html, stats, headers, statusCode }) => {
|
|
17
23
|
if (isDebug) {
|
|
18
24
|
console.log(`
|
|
19
25
|
url: ${url}
|
|
@@ -31,10 +37,13 @@ getHTML(url, args)
|
|
|
31
37
|
} else {
|
|
32
38
|
console.log(html)
|
|
33
39
|
}
|
|
34
|
-
|
|
35
40
|
process.exit(0)
|
|
36
41
|
})
|
|
37
|
-
.catch(err => {
|
|
42
|
+
.catch(async err => {
|
|
38
43
|
console.error(err)
|
|
39
44
|
process.exit(1)
|
|
40
45
|
})
|
|
46
|
+
.finally(async () => {
|
|
47
|
+
await getBrowserless(browser => browser.destroyContext())
|
|
48
|
+
browserlessFactory.close()
|
|
49
|
+
})
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.9.14",
|
|
5
|
+
"version": "2.9.16",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -79,7 +79,7 @@
|
|
|
79
79
|
],
|
|
80
80
|
"scripts": {
|
|
81
81
|
"clean": "rm -rf node_modules",
|
|
82
|
-
"contributors": "(git-authors-cli && finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
82
|
+
"contributors": "(git-authors-cli && finepack --sort-ignore-object-at ava && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
83
83
|
"lint": "standard-markdown README.md && standard",
|
|
84
84
|
"postinstall": "node scripts/postinstall",
|
|
85
85
|
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
@@ -94,9 +94,10 @@
|
|
|
94
94
|
},
|
|
95
95
|
"license": "MIT",
|
|
96
96
|
"ava": {
|
|
97
|
+
"workerThreads": false,
|
|
97
98
|
"files": [
|
|
98
|
-
"
|
|
99
|
-
"test
|
|
99
|
+
"test/**/*.js",
|
|
100
|
+
"!test/util.js"
|
|
100
101
|
],
|
|
101
102
|
"timeout": "2m"
|
|
102
103
|
},
|
package/src/auto-domains.json
CHANGED
|
@@ -1,37 +1,37 @@
|
|
|
1
1
|
[
|
|
2
2
|
"youtube",
|
|
3
3
|
"google",
|
|
4
|
-
"microsoft",
|
|
5
4
|
"apple",
|
|
6
|
-
"
|
|
5
|
+
"microsoft",
|
|
7
6
|
"wordpress",
|
|
7
|
+
"wikipedia",
|
|
8
8
|
"blogspot",
|
|
9
|
-
"vimeo",
|
|
10
9
|
"github",
|
|
11
|
-
"
|
|
10
|
+
"vimeo",
|
|
12
11
|
"slideshare",
|
|
13
|
-
"bbc",
|
|
14
12
|
"imdb",
|
|
13
|
+
"bbc",
|
|
15
14
|
"theguardian",
|
|
15
|
+
"nytimes",
|
|
16
|
+
"huffingtonpost",
|
|
16
17
|
"telegraph",
|
|
17
18
|
"pinterest",
|
|
18
|
-
"
|
|
19
|
-
"spotify",
|
|
19
|
+
"yelp",
|
|
20
20
|
"eventbrite",
|
|
21
|
+
"engadget",
|
|
21
22
|
"zoom",
|
|
22
23
|
"techcrunch",
|
|
23
|
-
"yelp",
|
|
24
|
-
"soundcloud",
|
|
25
|
-
"engadget",
|
|
26
24
|
"theverge",
|
|
25
|
+
"spotify",
|
|
26
|
+
"soundcloud",
|
|
27
|
+
"etsy",
|
|
27
28
|
"flickr",
|
|
28
29
|
"stackoverflow",
|
|
29
|
-
"giphy",
|
|
30
|
-
"imgur",
|
|
31
30
|
"csdn",
|
|
32
31
|
"digg",
|
|
33
|
-
"etsy",
|
|
34
32
|
"ghost",
|
|
33
|
+
"giphy",
|
|
34
|
+
"imgur",
|
|
35
35
|
"meetup",
|
|
36
36
|
"producthunt",
|
|
37
37
|
"reddit",
|
package/src/html.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
const { startsWith, get, split, nth, castArray, forEach } = require('lodash')
|
|
4
3
|
const { date: toDate, isUrl, isMime } = require('@metascraper/helpers')
|
|
4
|
+
const { get, split, nth, castArray, forEach } = require('lodash')
|
|
5
5
|
const { TAGS: URL_TAGS } = require('html-urls')
|
|
6
6
|
const replaceString = require('replace-string')
|
|
7
7
|
const isHTML = require('is-html-content')
|
|
@@ -75,7 +75,8 @@ const rewriteHtmlUrls = ({ $, url }) => {
|
|
|
75
75
|
$(tagName.join(',')).each(function () {
|
|
76
76
|
const el = $(this)
|
|
77
77
|
const attr = el.attr(urlAttr)
|
|
78
|
-
|
|
78
|
+
|
|
79
|
+
if (typeof attr === 'string' && !attr.startsWith('http')) {
|
|
79
80
|
try {
|
|
80
81
|
const newAttr = new URL(attr, url).toString()
|
|
81
82
|
el.attr(urlAttr, newAttr)
|
|
@@ -166,5 +167,3 @@ module.exports = ({
|
|
|
166
167
|
|
|
167
168
|
return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
|
|
168
169
|
}
|
|
169
|
-
|
|
170
|
-
module.exports.isHTML = isHTML
|
package/src/index.js
CHANGED
|
@@ -161,7 +161,7 @@ module.exports = async (
|
|
|
161
161
|
rewriteUrls = false
|
|
162
162
|
} = {}
|
|
163
163
|
) => {
|
|
164
|
-
if (!getBrowserless) {
|
|
164
|
+
if (!getBrowserless && prerender !== false) {
|
|
165
165
|
throw TypeError(
|
|
166
166
|
"Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
|
|
167
167
|
)
|