html-get 2.10.7 → 2.11.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, using prerendering when is necessary.",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.10.7",
5
+ "version": "2.11.1",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -29,14 +29,14 @@
29
29
  "request"
30
30
  ],
31
31
  "dependencies": {
32
- "@metascraper/helpers": "~5.34.1",
32
+ "@metascraper/helpers": "~5.34.7",
33
33
  "cheerio": "~1.0.0-rc.12",
34
34
  "css-url-regex": "~4.0.0",
35
35
  "debug-logfmt": "~1.0.4",
36
36
  "execall": "~2.0.0",
37
37
  "got": "~11.8.6",
38
38
  "html-encode": "~2.1.6",
39
- "html-urls": "~2.4.39",
39
+ "html-urls": "~2.4.45",
40
40
  "is-html-content": "~1.0.0",
41
41
  "lodash": "~4.17.21",
42
42
  "mri": "~1.2.0",
@@ -44,7 +44,7 @@
44
44
  "p-retry": "~4.6.0",
45
45
  "replace-string": "~3.1.0",
46
46
  "time-span": "~4.0.0",
47
- "top-sites": "~1.1.132",
47
+ "top-sites": "~1.1.169",
48
48
  "write-json-file": "~4.3.0"
49
49
  },
50
50
  "devDependencies": {
@@ -1,39 +1,39 @@
1
1
  [
2
2
  "google",
3
3
  "youtube",
4
- "microsoft",
5
4
  "apple",
6
- "wikipedia",
5
+ "microsoft",
7
6
  "wordpress",
7
+ "wikipedia",
8
8
  "blogspot",
9
9
  "vimeo",
10
10
  "github",
11
11
  "theguardian",
12
- "imdb",
13
- "bbc",
14
12
  "nytimes",
15
13
  "slideshare",
16
- "soundcloud",
14
+ "bbc",
15
+ "imdb",
17
16
  "pinterest",
18
17
  "telegraph",
18
+ "spotify",
19
19
  "huffingtonpost",
20
+ "soundcloud",
20
21
  "engadget",
21
- "techcrunch",
22
22
  "zoom",
23
+ "techcrunch",
23
24
  "yelp",
24
25
  "eventbrite",
25
- "spotify",
26
26
  "theverge",
27
- "dribbble",
28
- "giphy",
29
- "imgur",
27
+ "flickr",
28
+ "digg",
30
29
  "csdn",
31
30
  "deviantart",
32
- "digg",
31
+ "dribbble",
33
32
  "etsy",
34
- "flickr",
35
33
  "ghost",
34
+ "giphy",
36
35
  "gitlab",
36
+ "imgur",
37
37
  "meetup",
38
38
  "producthunt",
39
39
  "sourceforge",
package/src/index.js CHANGED
@@ -59,78 +59,82 @@ const fetch = PCancelable.fn(
59
59
  }
60
60
  )
61
61
 
62
- const prerender = async (
63
- url,
64
- {
65
- getBrowserless,
66
- toEncode,
67
- headers,
68
- gotOpts,
69
- timeout = REQ_TIMEOUT,
70
- abortTypes = ['image', 'stylesheet', 'font'],
71
- ...opts
72
- }
73
- ) => {
74
- let fetchRes
75
- let data = {}
76
- let isFetchResRejected = false
77
-
78
- try {
79
- fetchRes = fetch(url, {
80
- reflect: true,
62
+ const prerender = PCancelable.fn(
63
+ async (
64
+ url,
65
+ {
66
+ getBrowserless,
81
67
  toEncode,
82
- ...gotOpts,
83
68
  headers,
84
- timeout
85
- })
86
- const browserless = await getBrowserless()
87
-
88
- const getPayload = browserless.evaluate(
89
- async (page, response) => {
90
- if (!response) throw new AbortError('empty response')
91
-
92
- return {
93
- headers: response.headers(),
94
- html: await page.content(),
95
- mode: 'prerender',
96
- url: response.url(),
97
- statusCode: response.status()
98
- }
99
- },
100
- {
101
- timeout,
69
+ gotOpts,
70
+ timeout = REQ_TIMEOUT,
71
+ abortTypes = ['image', 'stylesheet', 'font'],
72
+ ...opts
73
+ },
74
+ onCancel
75
+ ) => {
76
+ let fetchRes
77
+ let data = {}
78
+ let isFetchResRejected = false
79
+
80
+ onCancel(() => fetchRes.cancel())
81
+
82
+ try {
83
+ fetchRes = fetch(url, {
84
+ reflect: true,
85
+ toEncode,
86
+ ...gotOpts,
102
87
  headers,
103
- abortTypes
104
- }
105
- )
88
+ timeout
89
+ })
90
+ const browserless = await getBrowserless()
91
+
92
+ const getPayload = browserless.evaluate(
93
+ async (page, response) => {
94
+ if (!response) throw new AbortError('empty response')
95
+
96
+ return {
97
+ headers: response.headers(),
98
+ html: await page.content(),
99
+ mode: 'prerender',
100
+ url: response.url(),
101
+ statusCode: response.status()
102
+ }
103
+ },
104
+ {
105
+ timeout,
106
+ headers,
107
+ abortTypes
108
+ }
109
+ )
106
110
 
107
- const payload = await getPayload(url, opts)
111
+ const payload = await getPayload(url, opts)
112
+ await fetchRes.cancel()
113
+ debug('prerender', { url, state: 'success' })
114
+ return payload
115
+ } catch (err) {
116
+ const { isRejected, ...dataProps } = await fetchRes
108
117
 
109
- await fetchRes.cancel()
110
- debug('prerender', { url, state: 'success' })
111
- return payload
112
- } catch (err) {
113
- const { isRejected, ...dataProps } = await fetchRes
118
+ debug('prerender:error', {
119
+ url,
120
+ isRejected,
121
+ error: err.message
122
+ })
114
123
 
115
- debug('prerender:error', {
116
- url,
117
- isRejected,
118
- error: err.message
119
- })
124
+ isFetchResRejected = isRejected
125
+ data = dataProps
126
+ }
120
127
 
121
- isFetchResRejected = isRejected
122
- data = dataProps
128
+ return isFetchResRejected
129
+ ? {
130
+ headers: data.headers || {},
131
+ html: '',
132
+ url,
133
+ mode: 'prerender'
134
+ }
135
+ : data
123
136
  }
124
-
125
- return isFetchResRejected
126
- ? {
127
- headers: data.headers || {},
128
- html: '',
129
- url,
130
- mode: 'prerender'
131
- }
132
- : data
133
- }
137
+ )
134
138
 
135
139
  const modes = { fetch, prerender }
136
140
 
@@ -162,40 +166,47 @@ const getContent = async (
162
166
  return { ...content, html }
163
167
  }
164
168
 
165
- module.exports = async (
166
- targetUrl,
167
- {
168
- encoding = 'utf-8',
169
- getBrowserless,
170
- getMode = determinateMode,
171
- gotOpts,
172
- headers,
173
- prerender = 'auto',
174
- puppeteerOpts,
175
- rewriteUrls = false
176
- } = {}
177
- ) => {
178
- if (!getBrowserless && prerender !== false) {
179
- throw TypeError(
180
- "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
181
- )
182
- }
169
+ module.exports = PCancelable.fn(
170
+ async (
171
+ targetUrl,
172
+ {
173
+ encoding = 'utf-8',
174
+ getBrowserless,
175
+ getMode = determinateMode,
176
+ gotOpts,
177
+ headers,
178
+ prerender = 'auto',
179
+ puppeteerOpts,
180
+ rewriteUrls = false
181
+ } = {},
182
+ onCancel
183
+ ) => {
184
+ if (!getBrowserless && prerender !== false) {
185
+ throw TypeError(
186
+ "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
187
+ )
188
+ }
183
189
 
184
- const toEncode = htmlEncode(encoding)
185
- const reqMode = getMode(targetUrl, { prerender })
190
+ const toEncode = htmlEncode(encoding)
191
+ const reqMode = getMode(targetUrl, { prerender })
186
192
 
187
- const time = timeSpan()
193
+ const time = timeSpan()
188
194
 
189
- const { mode, ...payload } = await getContent(targetUrl, reqMode, {
190
- getBrowserless,
191
- gotOpts,
192
- headers,
193
- puppeteerOpts,
194
- rewriteUrls,
195
- toEncode
196
- })
195
+ const promise = getContent(targetUrl, reqMode, {
196
+ getBrowserless,
197
+ gotOpts,
198
+ headers,
199
+ puppeteerOpts,
200
+ rewriteUrls,
201
+ toEncode
202
+ })
197
203
 
198
- return { ...payload, stats: { mode, timing: time.rounded() } }
199
- }
204
+ onCancel(() => promise.onCancel())
205
+
206
+ const { mode, ...payload } = await promise
207
+
208
+ return { ...payload, stats: { mode, timing: time.rounded() } }
209
+ }
210
+ )
200
211
 
201
212
  module.exports.REQ_TIMEOUT = REQ_TIMEOUT