html-get 2.10.7 → 2.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +4 -4
  2. package/src/index.js +111 -95
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, using prerendering when is necessary.",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.10.7",
5
+ "version": "2.11.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -29,14 +29,14 @@
29
29
  "request"
30
30
  ],
31
31
  "dependencies": {
32
- "@metascraper/helpers": "~5.34.1",
32
+ "@metascraper/helpers": "~5.34.7",
33
33
  "cheerio": "~1.0.0-rc.12",
34
34
  "css-url-regex": "~4.0.0",
35
35
  "debug-logfmt": "~1.0.4",
36
36
  "execall": "~2.0.0",
37
37
  "got": "~11.8.6",
38
38
  "html-encode": "~2.1.6",
39
- "html-urls": "~2.4.39",
39
+ "html-urls": "~2.4.45",
40
40
  "is-html-content": "~1.0.0",
41
41
  "lodash": "~4.17.21",
42
42
  "mri": "~1.2.0",
@@ -44,7 +44,7 @@
44
44
  "p-retry": "~4.6.0",
45
45
  "replace-string": "~3.1.0",
46
46
  "time-span": "~4.0.0",
47
- "top-sites": "~1.1.132",
47
+ "top-sites": "~1.1.169",
48
48
  "write-json-file": "~4.3.0"
49
49
  },
50
50
  "devDependencies": {
package/src/index.js CHANGED
@@ -59,78 +59,87 @@ const fetch = PCancelable.fn(
59
59
  }
60
60
  )
61
61
 
62
- const prerender = async (
63
- url,
64
- {
65
- getBrowserless,
66
- toEncode,
67
- headers,
68
- gotOpts,
69
- timeout = REQ_TIMEOUT,
70
- abortTypes = ['image', 'stylesheet', 'font'],
71
- ...opts
72
- }
73
- ) => {
74
- let fetchRes
75
- let data = {}
76
- let isFetchResRejected = false
77
-
78
- try {
79
- fetchRes = fetch(url, {
80
- reflect: true,
62
+ const prerender = PCancelable.fn(
63
+ async (
64
+ url,
65
+ {
66
+ getBrowserless,
81
67
  toEncode,
82
- ...gotOpts,
83
68
  headers,
84
- timeout
85
- })
86
- const browserless = await getBrowserless()
87
-
88
- const getPayload = browserless.evaluate(
89
- async (page, response) => {
90
- if (!response) throw new AbortError('empty response')
91
-
92
- return {
93
- headers: response.headers(),
94
- html: await page.content(),
95
- mode: 'prerender',
96
- url: response.url(),
97
- statusCode: response.status()
98
- }
99
- },
100
- {
101
- timeout,
102
- headers,
103
- abortTypes
104
- }
105
- )
69
+ gotOpts,
70
+ timeout = REQ_TIMEOUT,
71
+ abortTypes = ['image', 'stylesheet', 'font'],
72
+ ...opts
73
+ },
74
+ onCancel
75
+ ) => {
76
+ let fetchRes
77
+ let data = {}
78
+ let isFetchResRejected = false
106
79
 
107
- const payload = await getPayload(url, opts)
80
+ onCancel(() => fetchRes.cancel())
108
81
 
109
- await fetchRes.cancel()
110
- debug('prerender', { url, state: 'success' })
111
- return payload
112
- } catch (err) {
113
- const { isRejected, ...dataProps } = await fetchRes
82
+ try {
83
+ fetchRes = fetch(url, {
84
+ reflect: true,
85
+ toEncode,
86
+ ...gotOpts,
87
+ headers,
88
+ timeout
89
+ })
90
+ const browserless = await getBrowserless()
91
+
92
+ const getPayload = browserless.evaluate(
93
+ async (page, response) => {
94
+ if (!response) throw new AbortError('empty response')
95
+
96
+ return {
97
+ headers: response.headers(),
98
+ html: await page.content(),
99
+ mode: 'prerender',
100
+ url: response.url(),
101
+ statusCode: response.status()
102
+ }
103
+ },
104
+ {
105
+ timeout,
106
+ headers,
107
+ abortTypes
108
+ }
109
+ )
114
110
 
115
- debug('prerender:error', {
116
- url,
117
- isRejected,
118
- error: err.message
119
- })
111
+ onCancel(() => {
112
+ debug('prerender:cancel', { url })
113
+ getPayload.cancel()
114
+ })
120
115
 
121
- isFetchResRejected = isRejected
122
- data = dataProps
123
- }
116
+ const payload = await getPayload(url, opts)
117
+ await fetchRes.cancel()
118
+ debug('prerender', { url, state: 'success' })
119
+ return payload
120
+ } catch (err) {
121
+ const { isRejected, ...dataProps } = await fetchRes
124
122
 
125
- return isFetchResRejected
126
- ? {
127
- headers: data.headers || {},
128
- html: '',
123
+ debug('prerender:error', {
129
124
  url,
130
- mode: 'prerender'
131
- }
132
- : data
133
- }
125
+ isRejected,
126
+ error: err.message
127
+ })
128
+
129
+ isFetchResRejected = isRejected
130
+ data = dataProps
131
+ }
132
+
133
+ return isFetchResRejected
134
+ ? {
135
+ headers: data.headers || {},
136
+ html: '',
137
+ url,
138
+ mode: 'prerender'
139
+ }
140
+ : data
141
+ }
142
+ )
134
143
 
135
144
  const modes = { fetch, prerender }
136
145
 
@@ -162,40 +171,47 @@ const getContent = async (
162
171
  return { ...content, html }
163
172
  }
164
173
 
165
- module.exports = async (
166
- targetUrl,
167
- {
168
- encoding = 'utf-8',
169
- getBrowserless,
170
- getMode = determinateMode,
171
- gotOpts,
172
- headers,
173
- prerender = 'auto',
174
- puppeteerOpts,
175
- rewriteUrls = false
176
- } = {}
177
- ) => {
178
- if (!getBrowserless && prerender !== false) {
179
- throw TypeError(
180
- "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
181
- )
182
- }
174
+ module.exports = PCancelable.fn(
175
+ async (
176
+ targetUrl,
177
+ {
178
+ encoding = 'utf-8',
179
+ getBrowserless,
180
+ getMode = determinateMode,
181
+ gotOpts,
182
+ headers,
183
+ prerender = 'auto',
184
+ puppeteerOpts,
185
+ rewriteUrls = false
186
+ } = {},
187
+ onCancel
188
+ ) => {
189
+ if (!getBrowserless && prerender !== false) {
190
+ throw TypeError(
191
+ "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
192
+ )
193
+ }
183
194
 
184
- const toEncode = htmlEncode(encoding)
185
- const reqMode = getMode(targetUrl, { prerender })
195
+ const toEncode = htmlEncode(encoding)
196
+ const reqMode = getMode(targetUrl, { prerender })
186
197
 
187
- const time = timeSpan()
198
+ const time = timeSpan()
188
199
 
189
- const { mode, ...payload } = await getContent(targetUrl, reqMode, {
190
- getBrowserless,
191
- gotOpts,
192
- headers,
193
- puppeteerOpts,
194
- rewriteUrls,
195
- toEncode
196
- })
200
+ const promise = getContent(targetUrl, reqMode, {
201
+ getBrowserless,
202
+ gotOpts,
203
+ headers,
204
+ puppeteerOpts,
205
+ rewriteUrls,
206
+ toEncode
207
+ })
197
208
 
198
- return { ...payload, stats: { mode, timing: time.rounded() } }
199
- }
209
+ onCancel(() => promise.onCancel())
210
+
211
+ const { mode, ...payload } = await promise
212
+
213
+ return { ...payload, stats: { mode, timing: time.rounded() } }
214
+ }
215
+ )
200
216
 
201
217
  module.exports.REQ_TIMEOUT = REQ_TIMEOUT