html-get 2.10.7 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/index.js +111 -95
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.10.7",
|
|
5
|
+
"version": "2.11.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -29,14 +29,14 @@
|
|
|
29
29
|
"request"
|
|
30
30
|
],
|
|
31
31
|
"dependencies": {
|
|
32
|
-
"@metascraper/helpers": "~5.34.
|
|
32
|
+
"@metascraper/helpers": "~5.34.7",
|
|
33
33
|
"cheerio": "~1.0.0-rc.12",
|
|
34
34
|
"css-url-regex": "~4.0.0",
|
|
35
35
|
"debug-logfmt": "~1.0.4",
|
|
36
36
|
"execall": "~2.0.0",
|
|
37
37
|
"got": "~11.8.6",
|
|
38
38
|
"html-encode": "~2.1.6",
|
|
39
|
-
"html-urls": "~2.4.
|
|
39
|
+
"html-urls": "~2.4.45",
|
|
40
40
|
"is-html-content": "~1.0.0",
|
|
41
41
|
"lodash": "~4.17.21",
|
|
42
42
|
"mri": "~1.2.0",
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
"p-retry": "~4.6.0",
|
|
45
45
|
"replace-string": "~3.1.0",
|
|
46
46
|
"time-span": "~4.0.0",
|
|
47
|
-
"top-sites": "~1.1.
|
|
47
|
+
"top-sites": "~1.1.169",
|
|
48
48
|
"write-json-file": "~4.3.0"
|
|
49
49
|
},
|
|
50
50
|
"devDependencies": {
|
package/src/index.js
CHANGED
|
@@ -59,78 +59,87 @@ const fetch = PCancelable.fn(
|
|
|
59
59
|
}
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
-
const prerender =
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
headers,
|
|
68
|
-
gotOpts,
|
|
69
|
-
timeout = REQ_TIMEOUT,
|
|
70
|
-
abortTypes = ['image', 'stylesheet', 'font'],
|
|
71
|
-
...opts
|
|
72
|
-
}
|
|
73
|
-
) => {
|
|
74
|
-
let fetchRes
|
|
75
|
-
let data = {}
|
|
76
|
-
let isFetchResRejected = false
|
|
77
|
-
|
|
78
|
-
try {
|
|
79
|
-
fetchRes = fetch(url, {
|
|
80
|
-
reflect: true,
|
|
62
|
+
const prerender = PCancelable.fn(
|
|
63
|
+
async (
|
|
64
|
+
url,
|
|
65
|
+
{
|
|
66
|
+
getBrowserless,
|
|
81
67
|
toEncode,
|
|
82
|
-
...gotOpts,
|
|
83
68
|
headers,
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
html: await page.content(),
|
|
95
|
-
mode: 'prerender',
|
|
96
|
-
url: response.url(),
|
|
97
|
-
statusCode: response.status()
|
|
98
|
-
}
|
|
99
|
-
},
|
|
100
|
-
{
|
|
101
|
-
timeout,
|
|
102
|
-
headers,
|
|
103
|
-
abortTypes
|
|
104
|
-
}
|
|
105
|
-
)
|
|
69
|
+
gotOpts,
|
|
70
|
+
timeout = REQ_TIMEOUT,
|
|
71
|
+
abortTypes = ['image', 'stylesheet', 'font'],
|
|
72
|
+
...opts
|
|
73
|
+
},
|
|
74
|
+
onCancel
|
|
75
|
+
) => {
|
|
76
|
+
let fetchRes
|
|
77
|
+
let data = {}
|
|
78
|
+
let isFetchResRejected = false
|
|
106
79
|
|
|
107
|
-
|
|
80
|
+
onCancel(() => fetchRes.cancel())
|
|
108
81
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
82
|
+
try {
|
|
83
|
+
fetchRes = fetch(url, {
|
|
84
|
+
reflect: true,
|
|
85
|
+
toEncode,
|
|
86
|
+
...gotOpts,
|
|
87
|
+
headers,
|
|
88
|
+
timeout
|
|
89
|
+
})
|
|
90
|
+
const browserless = await getBrowserless()
|
|
91
|
+
|
|
92
|
+
const getPayload = browserless.evaluate(
|
|
93
|
+
async (page, response) => {
|
|
94
|
+
if (!response) throw new AbortError('empty response')
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
headers: response.headers(),
|
|
98
|
+
html: await page.content(),
|
|
99
|
+
mode: 'prerender',
|
|
100
|
+
url: response.url(),
|
|
101
|
+
statusCode: response.status()
|
|
102
|
+
}
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
timeout,
|
|
106
|
+
headers,
|
|
107
|
+
abortTypes
|
|
108
|
+
}
|
|
109
|
+
)
|
|
114
110
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
})
|
|
111
|
+
onCancel(() => {
|
|
112
|
+
debug('prerender:cancel', { url })
|
|
113
|
+
getPayload.cancel()
|
|
114
|
+
})
|
|
120
115
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
116
|
+
const payload = await getPayload(url, opts)
|
|
117
|
+
await fetchRes.cancel()
|
|
118
|
+
debug('prerender', { url, state: 'success' })
|
|
119
|
+
return payload
|
|
120
|
+
} catch (err) {
|
|
121
|
+
const { isRejected, ...dataProps } = await fetchRes
|
|
124
122
|
|
|
125
|
-
|
|
126
|
-
? {
|
|
127
|
-
headers: data.headers || {},
|
|
128
|
-
html: '',
|
|
123
|
+
debug('prerender:error', {
|
|
129
124
|
url,
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
125
|
+
isRejected,
|
|
126
|
+
error: err.message
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
isFetchResRejected = isRejected
|
|
130
|
+
data = dataProps
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
return isFetchResRejected
|
|
134
|
+
? {
|
|
135
|
+
headers: data.headers || {},
|
|
136
|
+
html: '',
|
|
137
|
+
url,
|
|
138
|
+
mode: 'prerender'
|
|
139
|
+
}
|
|
140
|
+
: data
|
|
141
|
+
}
|
|
142
|
+
)
|
|
134
143
|
|
|
135
144
|
const modes = { fetch, prerender }
|
|
136
145
|
|
|
@@ -162,40 +171,47 @@ const getContent = async (
|
|
|
162
171
|
return { ...content, html }
|
|
163
172
|
}
|
|
164
173
|
|
|
165
|
-
module.exports =
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
174
|
+
module.exports = PCancelable.fn(
|
|
175
|
+
async (
|
|
176
|
+
targetUrl,
|
|
177
|
+
{
|
|
178
|
+
encoding = 'utf-8',
|
|
179
|
+
getBrowserless,
|
|
180
|
+
getMode = determinateMode,
|
|
181
|
+
gotOpts,
|
|
182
|
+
headers,
|
|
183
|
+
prerender = 'auto',
|
|
184
|
+
puppeteerOpts,
|
|
185
|
+
rewriteUrls = false
|
|
186
|
+
} = {},
|
|
187
|
+
onCancel
|
|
188
|
+
) => {
|
|
189
|
+
if (!getBrowserless && prerender !== false) {
|
|
190
|
+
throw TypeError(
|
|
191
|
+
"Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
|
|
192
|
+
)
|
|
193
|
+
}
|
|
183
194
|
|
|
184
|
-
|
|
185
|
-
|
|
195
|
+
const toEncode = htmlEncode(encoding)
|
|
196
|
+
const reqMode = getMode(targetUrl, { prerender })
|
|
186
197
|
|
|
187
|
-
|
|
198
|
+
const time = timeSpan()
|
|
188
199
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
200
|
+
const promise = getContent(targetUrl, reqMode, {
|
|
201
|
+
getBrowserless,
|
|
202
|
+
gotOpts,
|
|
203
|
+
headers,
|
|
204
|
+
puppeteerOpts,
|
|
205
|
+
rewriteUrls,
|
|
206
|
+
toEncode
|
|
207
|
+
})
|
|
197
208
|
|
|
198
|
-
|
|
199
|
-
|
|
209
|
+
onCancel(() => promise.onCancel())
|
|
210
|
+
|
|
211
|
+
const { mode, ...payload } = await promise
|
|
212
|
+
|
|
213
|
+
return { ...payload, stats: { mode, timing: time.rounded() } }
|
|
214
|
+
}
|
|
215
|
+
)
|
|
200
216
|
|
|
201
217
|
module.exports.REQ_TIMEOUT = REQ_TIMEOUT
|