html-get 2.10.6 → 2.11.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, using prerendering when is necessary.",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.10.6",
5
+ "version": "2.11.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -29,14 +29,14 @@
29
29
  "request"
30
30
  ],
31
31
  "dependencies": {
32
- "@metascraper/helpers": "~5.34.1",
32
+ "@metascraper/helpers": "~5.34.7",
33
33
  "cheerio": "~1.0.0-rc.12",
34
34
  "css-url-regex": "~4.0.0",
35
35
  "debug-logfmt": "~1.0.4",
36
36
  "execall": "~2.0.0",
37
37
  "got": "~11.8.6",
38
38
  "html-encode": "~2.1.6",
39
- "html-urls": "~2.4.39",
39
+ "html-urls": "~2.4.45",
40
40
  "is-html-content": "~1.0.0",
41
41
  "lodash": "~4.17.21",
42
42
  "mri": "~1.2.0",
@@ -44,7 +44,7 @@
44
44
  "p-retry": "~4.6.0",
45
45
  "replace-string": "~3.1.0",
46
46
  "time-span": "~4.0.0",
47
- "top-sites": "~1.1.132",
47
+ "top-sites": "~1.1.169",
48
48
  "write-json-file": "~4.3.0"
49
49
  },
50
50
  "devDependencies": {
@@ -1,6 +1,6 @@
1
1
  [
2
- "youtube",
3
2
  "google",
3
+ "youtube",
4
4
  "microsoft",
5
5
  "apple",
6
6
  "wikipedia",
@@ -8,32 +8,32 @@
8
8
  "blogspot",
9
9
  "vimeo",
10
10
  "github",
11
- "imdb",
12
11
  "theguardian",
12
+ "imdb",
13
13
  "bbc",
14
- "slideshare",
15
14
  "nytimes",
16
- "huffingtonpost",
17
- "telegraph",
18
- "pinterest",
15
+ "slideshare",
19
16
  "soundcloud",
20
- "eventbrite",
17
+ "pinterest",
18
+ "telegraph",
19
+ "huffingtonpost",
21
20
  "engadget",
22
- "yelp",
23
- "zoom",
24
21
  "techcrunch",
22
+ "zoom",
23
+ "yelp",
24
+ "eventbrite",
25
25
  "spotify",
26
26
  "theverge",
27
- "etsy",
28
27
  "dribbble",
28
+ "giphy",
29
+ "imgur",
29
30
  "csdn",
30
31
  "deviantart",
31
32
  "digg",
33
+ "etsy",
32
34
  "flickr",
33
35
  "ghost",
34
- "giphy",
35
36
  "gitlab",
36
- "imgur",
37
37
  "meetup",
38
38
  "producthunt",
39
39
  "sourceforge",
package/src/index.js CHANGED
@@ -13,11 +13,12 @@ const addHtml = require('./html')
13
13
 
14
14
  const REQ_TIMEOUT = 8000
15
15
 
16
- const fetch = (
17
- url,
18
- { reflect = false, toEncode, timeout = REQ_TIMEOUT, ...opts }
19
- ) =>
20
- new PCancelable(async (resolve, reject, onCancel) => {
16
+ const fetch = PCancelable.fn(
17
+ async (
18
+ url,
19
+ { reflect = false, toEncode, timeout = REQ_TIMEOUT, ...opts },
20
+ onCancel
21
+ ) => {
21
22
  const reqTimeout = reflect ? timeout / 2 : timeout
22
23
 
23
24
  const req = got(url, {
@@ -36,99 +37,109 @@ const fetch = (
36
37
 
37
38
  try {
38
39
  const res = await req
39
- return resolve({
40
+ return {
40
41
  headers: res.headers,
41
42
  html: await toEncode(res.body, res.headers['content-type']),
42
43
  mode: 'fetch',
43
44
  url: res.url,
44
45
  statusCode: res.statusCode
45
- })
46
+ }
46
47
  } catch (error) {
47
48
  debug('fetch:error', { url, message: error.message || error, reflect })
48
49
  return reflect
49
- ? resolve({ isRejected: true, error })
50
- : resolve({
51
- url,
52
- html: '',
53
- mode: 'fetch',
54
- headers: error.response ? error.response.headers : {},
55
- statusCode: error.response ? error.response.statusCode : undefined
56
- })
50
+ ? { isRejected: true, error }
51
+ : {
52
+ url,
53
+ html: '',
54
+ mode: 'fetch',
55
+ headers: error.response ? error.response.headers : {},
56
+ statusCode: error.response ? error.response.statusCode : undefined
57
+ }
57
58
  }
58
- })
59
-
60
- const prerender = async (
61
- url,
62
- {
63
- getBrowserless,
64
- toEncode,
65
- headers,
66
- gotOpts,
67
- timeout = REQ_TIMEOUT,
68
- abortTypes = ['image', 'stylesheet', 'font'],
69
- ...opts
70
59
  }
71
- ) => {
72
- let fetchRes
73
- let data = {}
74
- let isFetchResRejected = false
60
+ )
75
61
 
76
- try {
77
- fetchRes = fetch(url, {
78
- reflect: true,
62
+ const prerender = PCancelable.fn(
63
+ async (
64
+ url,
65
+ {
66
+ getBrowserless,
79
67
  toEncode,
80
- ...gotOpts,
81
68
  headers,
82
- timeout
83
- })
84
- const browserless = await getBrowserless()
85
-
86
- const getPayload = browserless.evaluate(
87
- async (page, response) => {
88
- if (!response) throw new AbortError('empty response')
89
-
90
- return {
91
- headers: response.headers(),
92
- html: await page.content(),
93
- mode: 'prerender',
94
- url: response.url(),
95
- statusCode: response.status()
96
- }
97
- },
98
- {
99
- timeout,
69
+ gotOpts,
70
+ timeout = REQ_TIMEOUT,
71
+ abortTypes = ['image', 'stylesheet', 'font'],
72
+ ...opts
73
+ },
74
+ onCancel
75
+ ) => {
76
+ let fetchRes
77
+ let data = {}
78
+ let isFetchResRejected = false
79
+
80
+ onCancel(() => fetchRes.cancel())
81
+
82
+ try {
83
+ fetchRes = fetch(url, {
84
+ reflect: true,
85
+ toEncode,
86
+ ...gotOpts,
100
87
  headers,
101
- abortTypes
102
- }
103
- )
88
+ timeout
89
+ })
90
+ const browserless = await getBrowserless()
91
+
92
+ const getPayload = browserless.evaluate(
93
+ async (page, response) => {
94
+ if (!response) throw new AbortError('empty response')
95
+
96
+ return {
97
+ headers: response.headers(),
98
+ html: await page.content(),
99
+ mode: 'prerender',
100
+ url: response.url(),
101
+ statusCode: response.status()
102
+ }
103
+ },
104
+ {
105
+ timeout,
106
+ headers,
107
+ abortTypes
108
+ }
109
+ )
104
110
 
105
- const payload = await getPayload(url, opts)
111
+ onCancel(() => {
112
+ debug('prerender:cancel', { url })
113
+ getPayload.cancel()
114
+ })
106
115
 
107
- await fetchRes.cancel()
108
- debug('prerender', { url, state: 'success' })
109
- return payload
110
- } catch (err) {
111
- const { isRejected, ...dataProps } = await fetchRes
116
+ const payload = await getPayload(url, opts)
117
+ await fetchRes.cancel()
118
+ debug('prerender', { url, state: 'success' })
119
+ return payload
120
+ } catch (err) {
121
+ const { isRejected, ...dataProps } = await fetchRes
112
122
 
113
- debug('prerender:error', {
114
- url,
115
- isRejected,
116
- error: err.message
117
- })
123
+ debug('prerender:error', {
124
+ url,
125
+ isRejected,
126
+ error: err.message
127
+ })
118
128
 
119
- isFetchResRejected = isRejected
120
- data = dataProps
121
- }
129
+ isFetchResRejected = isRejected
130
+ data = dataProps
131
+ }
122
132
 
123
- return isFetchResRejected
124
- ? {
125
- headers: data.headers || {},
126
- html: '',
127
- url,
128
- mode: 'prerender'
129
- }
130
- : data
131
- }
133
+ return isFetchResRejected
134
+ ? {
135
+ headers: data.headers || {},
136
+ html: '',
137
+ url,
138
+ mode: 'prerender'
139
+ }
140
+ : data
141
+ }
142
+ )
132
143
 
133
144
  const modes = { fetch, prerender }
134
145
 
@@ -160,40 +171,47 @@ const getContent = async (
160
171
  return { ...content, html }
161
172
  }
162
173
 
163
- module.exports = async (
164
- targetUrl,
165
- {
166
- encoding = 'utf-8',
167
- getBrowserless,
168
- getMode = determinateMode,
169
- gotOpts,
170
- headers,
171
- prerender = 'auto',
172
- puppeteerOpts,
173
- rewriteUrls = false
174
- } = {}
175
- ) => {
176
- if (!getBrowserless && prerender !== false) {
177
- throw TypeError(
178
- "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
179
- )
180
- }
174
+ module.exports = PCancelable.fn(
175
+ async (
176
+ targetUrl,
177
+ {
178
+ encoding = 'utf-8',
179
+ getBrowserless,
180
+ getMode = determinateMode,
181
+ gotOpts,
182
+ headers,
183
+ prerender = 'auto',
184
+ puppeteerOpts,
185
+ rewriteUrls = false
186
+ } = {},
187
+ onCancel
188
+ ) => {
189
+ if (!getBrowserless && prerender !== false) {
190
+ throw TypeError(
191
+ "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
192
+ )
193
+ }
181
194
 
182
- const toEncode = htmlEncode(encoding)
183
- const reqMode = getMode(targetUrl, { prerender })
195
+ const toEncode = htmlEncode(encoding)
196
+ const reqMode = getMode(targetUrl, { prerender })
184
197
 
185
- const time = timeSpan()
198
+ const time = timeSpan()
186
199
 
187
- const { mode, ...payload } = await getContent(targetUrl, reqMode, {
188
- getBrowserless,
189
- gotOpts,
190
- headers,
191
- puppeteerOpts,
192
- rewriteUrls,
193
- toEncode
194
- })
200
+ const promise = getContent(targetUrl, reqMode, {
201
+ getBrowserless,
202
+ gotOpts,
203
+ headers,
204
+ puppeteerOpts,
205
+ rewriteUrls,
206
+ toEncode
207
+ })
195
208
 
196
- return { ...payload, stats: { mode, timing: time.rounded() } }
197
- }
209
+ onCancel(() => promise.onCancel())
210
+
211
+ const { mode, ...payload } = await promise
212
+
213
+ return { ...payload, stats: { mode, timing: time.rounded() } }
214
+ }
215
+ )
198
216
 
199
217
  module.exports.REQ_TIMEOUT = REQ_TIMEOUT