html-get 2.10.7 → 2.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/auto-domains.json +12 -12
- package/src/index.js +106 -95
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.10.7",
|
|
5
|
+
"version": "2.11.1",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -29,14 +29,14 @@
|
|
|
29
29
|
"request"
|
|
30
30
|
],
|
|
31
31
|
"dependencies": {
|
|
32
|
-
"@metascraper/helpers": "~5.34.
|
|
32
|
+
"@metascraper/helpers": "~5.34.7",
|
|
33
33
|
"cheerio": "~1.0.0-rc.12",
|
|
34
34
|
"css-url-regex": "~4.0.0",
|
|
35
35
|
"debug-logfmt": "~1.0.4",
|
|
36
36
|
"execall": "~2.0.0",
|
|
37
37
|
"got": "~11.8.6",
|
|
38
38
|
"html-encode": "~2.1.6",
|
|
39
|
-
"html-urls": "~2.4.
|
|
39
|
+
"html-urls": "~2.4.45",
|
|
40
40
|
"is-html-content": "~1.0.0",
|
|
41
41
|
"lodash": "~4.17.21",
|
|
42
42
|
"mri": "~1.2.0",
|
|
@@ -44,7 +44,7 @@
|
|
|
44
44
|
"p-retry": "~4.6.0",
|
|
45
45
|
"replace-string": "~3.1.0",
|
|
46
46
|
"time-span": "~4.0.0",
|
|
47
|
-
"top-sites": "~1.1.
|
|
47
|
+
"top-sites": "~1.1.169",
|
|
48
48
|
"write-json-file": "~4.3.0"
|
|
49
49
|
},
|
|
50
50
|
"devDependencies": {
|
package/src/auto-domains.json
CHANGED
|
@@ -1,39 +1,39 @@
|
|
|
1
1
|
[
|
|
2
2
|
"google",
|
|
3
3
|
"youtube",
|
|
4
|
-
"microsoft",
|
|
5
4
|
"apple",
|
|
6
|
-
"
|
|
5
|
+
"microsoft",
|
|
7
6
|
"wordpress",
|
|
7
|
+
"wikipedia",
|
|
8
8
|
"blogspot",
|
|
9
9
|
"vimeo",
|
|
10
10
|
"github",
|
|
11
11
|
"theguardian",
|
|
12
|
-
"imdb",
|
|
13
|
-
"bbc",
|
|
14
12
|
"nytimes",
|
|
15
13
|
"slideshare",
|
|
16
|
-
"
|
|
14
|
+
"bbc",
|
|
15
|
+
"imdb",
|
|
17
16
|
"pinterest",
|
|
18
17
|
"telegraph",
|
|
18
|
+
"spotify",
|
|
19
19
|
"huffingtonpost",
|
|
20
|
+
"soundcloud",
|
|
20
21
|
"engadget",
|
|
21
|
-
"techcrunch",
|
|
22
22
|
"zoom",
|
|
23
|
+
"techcrunch",
|
|
23
24
|
"yelp",
|
|
24
25
|
"eventbrite",
|
|
25
|
-
"spotify",
|
|
26
26
|
"theverge",
|
|
27
|
-
"
|
|
28
|
-
"
|
|
29
|
-
"imgur",
|
|
27
|
+
"flickr",
|
|
28
|
+
"digg",
|
|
30
29
|
"csdn",
|
|
31
30
|
"deviantart",
|
|
32
|
-
"
|
|
31
|
+
"dribbble",
|
|
33
32
|
"etsy",
|
|
34
|
-
"flickr",
|
|
35
33
|
"ghost",
|
|
34
|
+
"giphy",
|
|
36
35
|
"gitlab",
|
|
36
|
+
"imgur",
|
|
37
37
|
"meetup",
|
|
38
38
|
"producthunt",
|
|
39
39
|
"sourceforge",
|
package/src/index.js
CHANGED
|
@@ -59,78 +59,82 @@ const fetch = PCancelable.fn(
|
|
|
59
59
|
}
|
|
60
60
|
)
|
|
61
61
|
|
|
62
|
-
const prerender =
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
headers,
|
|
68
|
-
gotOpts,
|
|
69
|
-
timeout = REQ_TIMEOUT,
|
|
70
|
-
abortTypes = ['image', 'stylesheet', 'font'],
|
|
71
|
-
...opts
|
|
72
|
-
}
|
|
73
|
-
) => {
|
|
74
|
-
let fetchRes
|
|
75
|
-
let data = {}
|
|
76
|
-
let isFetchResRejected = false
|
|
77
|
-
|
|
78
|
-
try {
|
|
79
|
-
fetchRes = fetch(url, {
|
|
80
|
-
reflect: true,
|
|
62
|
+
const prerender = PCancelable.fn(
|
|
63
|
+
async (
|
|
64
|
+
url,
|
|
65
|
+
{
|
|
66
|
+
getBrowserless,
|
|
81
67
|
toEncode,
|
|
82
|
-
...gotOpts,
|
|
83
68
|
headers,
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
69
|
+
gotOpts,
|
|
70
|
+
timeout = REQ_TIMEOUT,
|
|
71
|
+
abortTypes = ['image', 'stylesheet', 'font'],
|
|
72
|
+
...opts
|
|
73
|
+
},
|
|
74
|
+
onCancel
|
|
75
|
+
) => {
|
|
76
|
+
let fetchRes
|
|
77
|
+
let data = {}
|
|
78
|
+
let isFetchResRejected = false
|
|
79
|
+
|
|
80
|
+
onCancel(() => fetchRes.cancel())
|
|
81
|
+
|
|
82
|
+
try {
|
|
83
|
+
fetchRes = fetch(url, {
|
|
84
|
+
reflect: true,
|
|
85
|
+
toEncode,
|
|
86
|
+
...gotOpts,
|
|
102
87
|
headers,
|
|
103
|
-
|
|
104
|
-
}
|
|
105
|
-
|
|
88
|
+
timeout
|
|
89
|
+
})
|
|
90
|
+
const browserless = await getBrowserless()
|
|
91
|
+
|
|
92
|
+
const getPayload = browserless.evaluate(
|
|
93
|
+
async (page, response) => {
|
|
94
|
+
if (!response) throw new AbortError('empty response')
|
|
95
|
+
|
|
96
|
+
return {
|
|
97
|
+
headers: response.headers(),
|
|
98
|
+
html: await page.content(),
|
|
99
|
+
mode: 'prerender',
|
|
100
|
+
url: response.url(),
|
|
101
|
+
statusCode: response.status()
|
|
102
|
+
}
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
timeout,
|
|
106
|
+
headers,
|
|
107
|
+
abortTypes
|
|
108
|
+
}
|
|
109
|
+
)
|
|
106
110
|
|
|
107
|
-
|
|
111
|
+
const payload = await getPayload(url, opts)
|
|
112
|
+
await fetchRes.cancel()
|
|
113
|
+
debug('prerender', { url, state: 'success' })
|
|
114
|
+
return payload
|
|
115
|
+
} catch (err) {
|
|
116
|
+
const { isRejected, ...dataProps } = await fetchRes
|
|
108
117
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
118
|
+
debug('prerender:error', {
|
|
119
|
+
url,
|
|
120
|
+
isRejected,
|
|
121
|
+
error: err.message
|
|
122
|
+
})
|
|
114
123
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
error: err.message
|
|
119
|
-
})
|
|
124
|
+
isFetchResRejected = isRejected
|
|
125
|
+
data = dataProps
|
|
126
|
+
}
|
|
120
127
|
|
|
121
|
-
isFetchResRejected
|
|
122
|
-
|
|
128
|
+
return isFetchResRejected
|
|
129
|
+
? {
|
|
130
|
+
headers: data.headers || {},
|
|
131
|
+
html: '',
|
|
132
|
+
url,
|
|
133
|
+
mode: 'prerender'
|
|
134
|
+
}
|
|
135
|
+
: data
|
|
123
136
|
}
|
|
124
|
-
|
|
125
|
-
return isFetchResRejected
|
|
126
|
-
? {
|
|
127
|
-
headers: data.headers || {},
|
|
128
|
-
html: '',
|
|
129
|
-
url,
|
|
130
|
-
mode: 'prerender'
|
|
131
|
-
}
|
|
132
|
-
: data
|
|
133
|
-
}
|
|
137
|
+
)
|
|
134
138
|
|
|
135
139
|
const modes = { fetch, prerender }
|
|
136
140
|
|
|
@@ -162,40 +166,47 @@ const getContent = async (
|
|
|
162
166
|
return { ...content, html }
|
|
163
167
|
}
|
|
164
168
|
|
|
165
|
-
module.exports =
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
169
|
+
module.exports = PCancelable.fn(
|
|
170
|
+
async (
|
|
171
|
+
targetUrl,
|
|
172
|
+
{
|
|
173
|
+
encoding = 'utf-8',
|
|
174
|
+
getBrowserless,
|
|
175
|
+
getMode = determinateMode,
|
|
176
|
+
gotOpts,
|
|
177
|
+
headers,
|
|
178
|
+
prerender = 'auto',
|
|
179
|
+
puppeteerOpts,
|
|
180
|
+
rewriteUrls = false
|
|
181
|
+
} = {},
|
|
182
|
+
onCancel
|
|
183
|
+
) => {
|
|
184
|
+
if (!getBrowserless && prerender !== false) {
|
|
185
|
+
throw TypeError(
|
|
186
|
+
"Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
|
|
187
|
+
)
|
|
188
|
+
}
|
|
183
189
|
|
|
184
|
-
|
|
185
|
-
|
|
190
|
+
const toEncode = htmlEncode(encoding)
|
|
191
|
+
const reqMode = getMode(targetUrl, { prerender })
|
|
186
192
|
|
|
187
|
-
|
|
193
|
+
const time = timeSpan()
|
|
188
194
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
195
|
+
const promise = getContent(targetUrl, reqMode, {
|
|
196
|
+
getBrowserless,
|
|
197
|
+
gotOpts,
|
|
198
|
+
headers,
|
|
199
|
+
puppeteerOpts,
|
|
200
|
+
rewriteUrls,
|
|
201
|
+
toEncode
|
|
202
|
+
})
|
|
197
203
|
|
|
198
|
-
|
|
199
|
-
|
|
204
|
+
onCancel(() => promise.onCancel())
|
|
205
|
+
|
|
206
|
+
const { mode, ...payload } = await promise
|
|
207
|
+
|
|
208
|
+
return { ...payload, stats: { mode, timing: time.rounded() } }
|
|
209
|
+
}
|
|
210
|
+
)
|
|
200
211
|
|
|
201
212
|
module.exports.REQ_TIMEOUT = REQ_TIMEOUT
|