html-get 2.10.6 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/auto-domains.json +12 -12
- package/src/index.js +128 -110
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.10.6",
|
|
5
|
+
"version": "2.11.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -29,14 +29,14 @@
|
|
|
29
29
|
"request"
|
|
30
30
|
],
|
|
31
31
|
"dependencies": {
|
|
32
|
-
"@metascraper/helpers": "~5.34.
|
|
32
|
+
"@metascraper/helpers": "~5.34.7",
|
|
33
33
|
"cheerio": "~1.0.0-rc.12",
|
|
34
34
|
"css-url-regex": "~4.0.0",
|
|
35
35
|
"debug-logfmt": "~1.0.4",
|
|
36
36
|
"execall": "~2.0.0",
|
|
37
37
|
"got": "~11.8.6",
|
|
38
38
|
"html-encode": "~2.1.6",
|
|
39
|
-
"html-urls": "~2.4.
|
|
39
|
+
"html-urls": "~2.4.45",
|
|
40
40
|
"is-html-content": "~1.0.0",
|
|
41
41
|
"lodash": "~4.17.21",
|
|
42
42
|
"mri": "~1.2.0",
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
"p-retry": "~4.6.0",
|
|
45
45
|
"replace-string": "~3.1.0",
|
|
46
46
|
"time-span": "~4.0.0",
|
|
47
|
-
"top-sites": "~1.1.
|
|
47
|
+
"top-sites": "~1.1.169",
|
|
48
48
|
"write-json-file": "~4.3.0"
|
|
49
49
|
},
|
|
50
50
|
"devDependencies": {
|
package/src/auto-domains.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[
|
|
2
|
-
"youtube",
|
|
3
2
|
"google",
|
|
3
|
+
"youtube",
|
|
4
4
|
"microsoft",
|
|
5
5
|
"apple",
|
|
6
6
|
"wikipedia",
|
|
@@ -8,32 +8,32 @@
|
|
|
8
8
|
"blogspot",
|
|
9
9
|
"vimeo",
|
|
10
10
|
"github",
|
|
11
|
-
"imdb",
|
|
12
11
|
"theguardian",
|
|
12
|
+
"imdb",
|
|
13
13
|
"bbc",
|
|
14
|
-
"slideshare",
|
|
15
14
|
"nytimes",
|
|
16
|
-
"
|
|
17
|
-
"telegraph",
|
|
18
|
-
"pinterest",
|
|
15
|
+
"slideshare",
|
|
19
16
|
"soundcloud",
|
|
20
|
-
"
|
|
17
|
+
"pinterest",
|
|
18
|
+
"telegraph",
|
|
19
|
+
"huffingtonpost",
|
|
21
20
|
"engadget",
|
|
22
|
-
"yelp",
|
|
23
|
-
"zoom",
|
|
24
21
|
"techcrunch",
|
|
22
|
+
"zoom",
|
|
23
|
+
"yelp",
|
|
24
|
+
"eventbrite",
|
|
25
25
|
"spotify",
|
|
26
26
|
"theverge",
|
|
27
|
-
"etsy",
|
|
28
27
|
"dribbble",
|
|
28
|
+
"giphy",
|
|
29
|
+
"imgur",
|
|
29
30
|
"csdn",
|
|
30
31
|
"deviantart",
|
|
31
32
|
"digg",
|
|
33
|
+
"etsy",
|
|
32
34
|
"flickr",
|
|
33
35
|
"ghost",
|
|
34
|
-
"giphy",
|
|
35
36
|
"gitlab",
|
|
36
|
-
"imgur",
|
|
37
37
|
"meetup",
|
|
38
38
|
"producthunt",
|
|
39
39
|
"sourceforge",
|
package/src/index.js
CHANGED
|
@@ -13,11 +13,12 @@ const addHtml = require('./html')
|
|
|
13
13
|
|
|
14
14
|
const REQ_TIMEOUT = 8000
|
|
15
15
|
|
|
16
|
-
const fetch = (
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
const fetch = PCancelable.fn(
|
|
17
|
+
async (
|
|
18
|
+
url,
|
|
19
|
+
{ reflect = false, toEncode, timeout = REQ_TIMEOUT, ...opts },
|
|
20
|
+
onCancel
|
|
21
|
+
) => {
|
|
21
22
|
const reqTimeout = reflect ? timeout / 2 : timeout
|
|
22
23
|
|
|
23
24
|
const req = got(url, {
|
|
@@ -36,99 +37,109 @@ const fetch = (
|
|
|
36
37
|
|
|
37
38
|
try {
|
|
38
39
|
const res = await req
|
|
39
|
-
return
|
|
40
|
+
return {
|
|
40
41
|
headers: res.headers,
|
|
41
42
|
html: await toEncode(res.body, res.headers['content-type']),
|
|
42
43
|
mode: 'fetch',
|
|
43
44
|
url: res.url,
|
|
44
45
|
statusCode: res.statusCode
|
|
45
|
-
}
|
|
46
|
+
}
|
|
46
47
|
} catch (error) {
|
|
47
48
|
debug('fetch:error', { url, message: error.message || error, reflect })
|
|
48
49
|
return reflect
|
|
49
|
-
?
|
|
50
|
-
:
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
50
|
+
? { isRejected: true, error }
|
|
51
|
+
: {
|
|
52
|
+
url,
|
|
53
|
+
html: '',
|
|
54
|
+
mode: 'fetch',
|
|
55
|
+
headers: error.response ? error.response.headers : {},
|
|
56
|
+
statusCode: error.response ? error.response.statusCode : undefined
|
|
57
|
+
}
|
|
57
58
|
}
|
|
58
|
-
})
|
|
59
|
-
|
|
60
|
-
const prerender = async (
|
|
61
|
-
url,
|
|
62
|
-
{
|
|
63
|
-
getBrowserless,
|
|
64
|
-
toEncode,
|
|
65
|
-
headers,
|
|
66
|
-
gotOpts,
|
|
67
|
-
timeout = REQ_TIMEOUT,
|
|
68
|
-
abortTypes = ['image', 'stylesheet', 'font'],
|
|
69
|
-
...opts
|
|
70
59
|
}
|
|
71
|
-
)
|
|
72
|
-
let fetchRes
|
|
73
|
-
let data = {}
|
|
74
|
-
let isFetchResRejected = false
|
|
60
|
+
)
|
|
75
61
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
62
|
+
const prerender = PCancelable.fn(
|
|
63
|
+
async (
|
|
64
|
+
url,
|
|
65
|
+
{
|
|
66
|
+
getBrowserless,
|
|
79
67
|
toEncode,
|
|
80
|
-
...gotOpts,
|
|
81
68
|
headers,
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
69
|
+
gotOpts,
|
|
70
|
+
timeout = REQ_TIMEOUT,
|
|
71
|
+
abortTypes = ['image', 'stylesheet', 'font'],
|
|
72
|
+
...opts
|
|
73
|
+
},
|
|
74
|
+
onCancel
|
|
75
|
+
) => {
|
|
76
|
+
let fetchRes
|
|
77
|
+
let data = {}
|
|
78
|
+
let isFetchResRejected = false
|
|
79
|
+
|
|
80
|
+
onCancel(() => fetchRes.cancel())
|
|
81
|
+
|
|
82
|
+
try {
|
|
83
|
+
fetchRes = fetch(url, {
|
|
84
|
+
reflect: true,
|
|
85
|
+
toEncode,
|
|
86
|
+
...gotOpts,
|
|
100
87
|
headers,
|
|
101
|
-
|
|
102
|
-
}
|
|
103
|
-
|
|
88
|
+
timeout
|
|
89
|
+
})
|
|
90
|
+
const browserless = await getBrowserless()
|
|
91
|
+
|
|
92
|
+
const getPayload = browserless.evaluate(
|
|
93
|
+
async (page, response) => {
|
|
94
|
+
if (!response) throw new AbortError('empty response')
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
headers: response.headers(),
|
|
98
|
+
html: await page.content(),
|
|
99
|
+
mode: 'prerender',
|
|
100
|
+
url: response.url(),
|
|
101
|
+
statusCode: response.status()
|
|
102
|
+
}
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
timeout,
|
|
106
|
+
headers,
|
|
107
|
+
abortTypes
|
|
108
|
+
}
|
|
109
|
+
)
|
|
104
110
|
|
|
105
|
-
|
|
111
|
+
onCancel(() => {
|
|
112
|
+
debug('prerender:cancel', { url })
|
|
113
|
+
getPayload.cancel()
|
|
114
|
+
})
|
|
106
115
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
116
|
+
const payload = await getPayload(url, opts)
|
|
117
|
+
await fetchRes.cancel()
|
|
118
|
+
debug('prerender', { url, state: 'success' })
|
|
119
|
+
return payload
|
|
120
|
+
} catch (err) {
|
|
121
|
+
const { isRejected, ...dataProps } = await fetchRes
|
|
112
122
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
123
|
+
debug('prerender:error', {
|
|
124
|
+
url,
|
|
125
|
+
isRejected,
|
|
126
|
+
error: err.message
|
|
127
|
+
})
|
|
118
128
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
129
|
+
isFetchResRejected = isRejected
|
|
130
|
+
data = dataProps
|
|
131
|
+
}
|
|
122
132
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
}
|
|
133
|
+
return isFetchResRejected
|
|
134
|
+
? {
|
|
135
|
+
headers: data.headers || {},
|
|
136
|
+
html: '',
|
|
137
|
+
url,
|
|
138
|
+
mode: 'prerender'
|
|
139
|
+
}
|
|
140
|
+
: data
|
|
141
|
+
}
|
|
142
|
+
)
|
|
132
143
|
|
|
133
144
|
const modes = { fetch, prerender }
|
|
134
145
|
|
|
@@ -160,40 +171,47 @@ const getContent = async (
|
|
|
160
171
|
return { ...content, html }
|
|
161
172
|
}
|
|
162
173
|
|
|
163
|
-
module.exports =
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
174
|
+
module.exports = PCancelable.fn(
|
|
175
|
+
async (
|
|
176
|
+
targetUrl,
|
|
177
|
+
{
|
|
178
|
+
encoding = 'utf-8',
|
|
179
|
+
getBrowserless,
|
|
180
|
+
getMode = determinateMode,
|
|
181
|
+
gotOpts,
|
|
182
|
+
headers,
|
|
183
|
+
prerender = 'auto',
|
|
184
|
+
puppeteerOpts,
|
|
185
|
+
rewriteUrls = false
|
|
186
|
+
} = {},
|
|
187
|
+
onCancel
|
|
188
|
+
) => {
|
|
189
|
+
if (!getBrowserless && prerender !== false) {
|
|
190
|
+
throw TypeError(
|
|
191
|
+
"Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
|
|
192
|
+
)
|
|
193
|
+
}
|
|
181
194
|
|
|
182
|
-
|
|
183
|
-
|
|
195
|
+
const toEncode = htmlEncode(encoding)
|
|
196
|
+
const reqMode = getMode(targetUrl, { prerender })
|
|
184
197
|
|
|
185
|
-
|
|
198
|
+
const time = timeSpan()
|
|
186
199
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
200
|
+
const promise = getContent(targetUrl, reqMode, {
|
|
201
|
+
getBrowserless,
|
|
202
|
+
gotOpts,
|
|
203
|
+
headers,
|
|
204
|
+
puppeteerOpts,
|
|
205
|
+
rewriteUrls,
|
|
206
|
+
toEncode
|
|
207
|
+
})
|
|
195
208
|
|
|
196
|
-
|
|
197
|
-
|
|
209
|
+
onCancel(() => promise.onCancel())
|
|
210
|
+
|
|
211
|
+
const { mode, ...payload } = await promise
|
|
212
|
+
|
|
213
|
+
return { ...payload, stats: { mode, timing: time.rounded() } }
|
|
214
|
+
}
|
|
215
|
+
)
|
|
198
216
|
|
|
199
217
|
module.exports.REQ_TIMEOUT = REQ_TIMEOUT
|