html-get 2.20.0 → 2.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/package.json +4 -3
- package/src/auto-domains.json +1 -1
- package/src/html.js +5 -5
- package/src/index.js +30 -14
- package/src/util.js +30 -0
package/README.md
CHANGED
@@ -121,11 +121,14 @@ Type: `object`
 
 Request headers that will be passed to fetch/prerender process.
 
-#####
+##### mutool
 
-Type: `function
+Type: `function`|`boolean`<br>
+Default: `source code`
+
+It returns a function that receives that executes [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup.
 
-It
+It can explicitly disabled passing `false`.
 
 ##### prerender
 
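For context, the new `mutool` option accepts either a replacement function or `false` to disable PDF → HTML conversion entirely. A minimal sketch of opting out, following the quickstart pattern documented in the html-get README; the target URL is made up and teardown/error handling is omitted:

```js
// Sketch: disabling the mutool-based PDF conversion via the new option.
// The browserless wiring mirrors the README quickstart; URL is illustrative.
const createBrowser = require('browserless')
const getHTML = require('html-get')

const browser = createBrowser() // spawn Chromium once, reuse it per request

const main = async () => {
  const { url, html, stats } = await getHTML('https://example.com/report.pdf', {
    getBrowserless: () => browser.createContext(),
    mutool: false // `false` disables the PDF → HTML conversion path
  })
  console.log(stats.mode, url)
  console.log(html.length)
  await browser.close()
}

main()
```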
package/package.json
CHANGED
@@ -2,7 +2,7 @@
   "name": "html-get",
   "description": "Get the HTML from any website, fine-tuned for correction & speed",
   "homepage": "https://nicedoc.com/microlinkhq/html-get",
-  "version": "2.20.0",
+  "version": "2.21.1",
   "main": "src/index.js",
   "bin": {
     "html-get": "bin/index.js"
@@ -38,6 +38,7 @@
     "@kikobeats/time-span": "~1.0.5",
     "@metascraper/helpers": "~5.46.1",
     "cheerio": "~1.0.0",
+    "content-type": "~1.0.5",
     "css-url-regex": "~4.0.0",
     "debug-logfmt": "~1.2.3",
     "execall": "~2.0.0",
@@ -50,7 +51,7 @@
     "mri": "~1.2.0",
     "p-cancelable": "~2.1.0",
     "p-retry": "~4.6.0",
-    "tinyspawn": "~1.
+    "tinyspawn": "~1.4.0",
     "top-sites": "~1.1.220"
   },
   "devDependencies": {
@@ -97,7 +98,7 @@
   "ava": {
     "files": [
       "test/**/*.js",
-      "!test/
+      "!test/helpers.js"
     ],
     "timeout": "2m",
     "workerThreads": false
package/src/auto-domains.json
CHANGED
@@ -1 +1 @@
-[[["domainWithoutSuffix","
+[[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","bbc"]],[["domain","x.com"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
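Each entry in `auto-domains.json` is a list of `[property, value]` conditions checked against the parsed target URL to decide that a plain fetch is enough (no prerender). A rough sketch of how such a rule table could be evaluated, assuming a tldts-style parse result exposing `domain` and `domainWithoutSuffix`; the `matchesAutoDomain` helper below is hypothetical, not part of the package:

```js
// Hypothetical evaluation of auto-domains style rules against a parsed URL.
const { parse } = require('tldts')

const rules = [
  [['domainWithoutSuffix', 'youtube']],
  [['domain', 'x.com']]
]

// A rule matches when every [property, value] condition holds for the URL.
const matchesAutoDomain = (url, table = rules) => {
  const parsed = parse(url)
  return table.some(conditions =>
    conditions.every(([prop, value]) => parsed[prop] === value)
  )
}

console.log(matchesAutoDomain('https://www.youtube.com/watch?v=123')) // true
console.log(matchesAutoDomain('https://x.com/microlinkhq'))           // true
console.log(matchesAutoDomain('https://example.com'))                 // false
```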
package/src/html.js
CHANGED
@@ -1,7 +1,7 @@
 'use strict'
 
-const { get, split, nth, castArray, forEach } = require('lodash')
 const debug = require('debug-logfmt')('html-get:rewrite')
+const { get, castArray, forEach } = require('lodash')
 const isLocalAddress = require('is-local-address')
 const { TAGS: URL_TAGS } = require('html-urls')
 const isHTML = require('is-html-content')
@@ -19,6 +19,8 @@ const {
   parseUrl
 } = require('@metascraper/helpers')
 
+const { getContentType, getCharset } = require('./util')
+
 const has = el => el.length !== 0
 
 const upsert = (el, collection, item) => !has(el) && collection.push(item)
@@ -35,8 +37,7 @@ const getDate = headers => {
 
 const addHead = ({ $, url, headers }) => {
   const tags = []
-  const
-  const charset = nth(split(contentType, 'charset='), 1)
+  const charset = getCharset(headers)
   const date = getDate(headers)
   const { domain } = parseUrl(url)
   const head = $('head')
@@ -73,8 +74,7 @@ const addHead = ({ $, url, headers }) => {
 }
 
 const addBody = ({ url, headers, html }) => {
-  const contentType =
-
+  const contentType = getContentType(headers)
   let element = ''
 
   if (isMime(contentType, 'image')) {
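The practical effect of the `html.js` change is that charset detection now goes through a structured `content-type` parse instead of a naive `split('charset=')`. A small illustrative sketch of why that is more robust; the header value is made up, and `naiveCharset` mirrors the removed lodash approach:

```js
// Sketch: naive charset extraction (old approach) vs. parsing the header
// with the `content-type` package (what the new getCharset relies on).
const { parse } = require('content-type')
const { split, nth } = require('lodash')

const headers = { 'content-type': 'text/html; charset="UTF-8"' }

// Old style: whatever follows "charset=" verbatim — quotes and case included.
const naiveCharset = nth(split(headers['content-type'], 'charset='), 1)
console.log(naiveCharset) // '"UTF-8"'

// New style: structured parse plus lowercasing, as the new getCharset does.
const parsed = parse(headers['content-type'])
console.log(parsed.parameters.charset?.toLowerCase()) // 'utf-8'
```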
package/src/index.js
CHANGED
@@ -1,10 +1,10 @@
 'use strict'
 
 const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
+const { readFile, writeFile } = require('fs/promises')
 const timeSpan = require('@kikobeats/time-span')()
 const debug = require('debug-logfmt')('html-get')
 const { execSync } = require('child_process')
-const { writeFile } = require('fs/promises')
 const PCancelable = require('p-cancelable')
 const { AbortError } = require('p-retry')
 const htmlEncode = require('html-encode')
@@ -14,6 +14,7 @@ const path = require('path')
 const got = require('got')
 const os = require('os')
 
+const { getContentLength, getContentType } = require('./util')
 const autoDomains = require('./auto-domains')
 const addHtml = require('./html')
 
@@ -21,12 +22,14 @@ const REQ_TIMEOUT = 8000
 
 const ABORT_TYPES = ['image', 'stylesheet', 'font']
 
+const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb
+
 const fetch = PCancelable.fn(
   async (
     url,
     {
       getTemporalFile,
-
+      mutool,
       reflect = false,
       timeout = REQ_TIMEOUT,
       toEncode,
@@ -58,14 +61,22 @@ const fetch = PCancelable.fn(
     const res = await req
 
     const html = await (async () => {
-      const contentType = res.headers
-
+      const contentType = getContentType(res.headers)
+
+      if (mutool && contentType === 'application/pdf') {
         const file = getTemporalFile(url, 'pdf')
         await writeFile(file.path, res.body)
-
+        if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) {
+          const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
+          await mutool(`-o ${ofile.path} ${file.path}`)
+          return readFile(ofile.path, 'utf-8')
+        } else {
+          const { stdout } = await mutool(file.path)
+          return stdout
+        }
       }
 
-      return contentType
+      return contentType === 'text/html' || !isMediaUrl(url)
        ? await toEncode(res.body, res.headers['content-type'])
        : res.body.toString()
    })()
@@ -193,8 +204,8 @@ const defaultGetMode = (url, { prerender }) => {
   return isFetchMode(url) ? 'fetch' : 'prerender'
 }
 
-const defaultGetTemporalFile = (
-const hash = crypto.createHash('sha256').update(
+const defaultGetTemporalFile = (input, ext) => {
+  const hash = crypto.createHash('sha256').update(input).digest('hex')
   const filepath = path.join(
     os.tmpdir(),
     ext === undefined ? hash : `${hash}.${ext}`
@@ -202,10 +213,13 @@ const defaultGetTemporalFile = (url, ext) => {
   return { path: filepath }
 }
 
-const
+const defaultMutool = () =>
   (() => {
     try {
-
+      const mutoolPath = execSync('which mutool', { stdio: 'pipe' })
+        .toString()
+        .trim()
+      return (...args) => $(`${mutoolPath} draw -q -F html ${args}`)
     } catch (_) {}
   })()
@@ -218,7 +232,7 @@ const getContent = PCancelable.fn(
     getTemporalFile,
     gotOpts,
     headers,
-
+    mutool,
     puppeteerOpts,
     rewriteUrls,
     rewriteHtml,
@@ -229,7 +243,7 @@ const getContent = PCancelable.fn(
     const isFetchMode = mode === 'fetch'
 
     const fetchOpts = isFetchMode
-      ? { headers, toEncode,
+      ? { headers, toEncode, mutool, getTemporalFile, ...gotOpts }
      : { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
 
    const promise = modes[mode](url, fetchOpts)
@@ -258,7 +272,7 @@ module.exports = PCancelable.fn(
    getTemporalFile = defaultGetTemporalFile,
    gotOpts,
    headers,
-
+    mutool = defaultMutool(),
    prerender = 'auto',
    puppeteerOpts,
    rewriteHtml = false,
@@ -283,7 +297,7 @@ module.exports = PCancelable.fn(
    getTemporalFile,
    gotOpts,
    headers,
-
+    mutool,
    puppeteerOpts,
    rewriteUrls,
    rewriteHtml,
@@ -303,5 +317,7 @@ module.exports = PCancelable.fn(
 
 module.exports.REQ_TIMEOUT = REQ_TIMEOUT
 module.exports.ABORT_TYPES = ABORT_TYPES
+module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD
 module.exports.isFetchMode = isFetchMode
 module.exports.getContent = getContent
+module.exports.defaultMutool = defaultMutool
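To summarise the new PDF path in `fetch`: when `mutool` is available and the response is `application/pdf`, the body is written to a temporal file and converted; responses above the 150 kB threshold are rendered by mutool into a second temporal file (via `-o`) and read back, while smaller ones are taken straight from stdout, presumably to avoid buffering large conversions through stdout. A condensed sketch of that branching, with the `mutool` and `getTemporalFile` arguments stubbed for illustration (in the package they come from `defaultMutool()` and `defaultGetTemporalFile`):

```js
// Condensed sketch of the new PDF branch in fetch(); helpers are stubbed.
const { readFile, writeFile } = require('fs/promises')

const PDF_SIZE_TRESHOLD = 150 * 1024 // same 150kb constant the diff adds

const pdfToHtml = async (url, res, { mutool, getTemporalFile }) => {
  const file = getTemporalFile(url, 'pdf')
  await writeFile(file.path, res.body)

  if (Number(res.headers['content-length']) > PDF_SIZE_TRESHOLD) {
    // Large PDFs: let mutool write the HTML to a second temporal file.
    const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
    await mutool(`-o ${ofile.path} ${file.path}`)
    return readFile(ofile.path, 'utf-8')
  }

  // Small PDFs: take the HTML straight from mutool's stdout.
  const { stdout } = await mutool(file.path)
  return stdout
}
```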
package/src/util.js
ADDED
@@ -0,0 +1,30 @@
+'use strict'
+
+const { parse } = require('content-type')
+
+const CACHE = Object.create(null)
+
+const parseContentType = contentType =>
+  typeof contentType === 'string'
+    ? parse(contentType)
+    : { type: undefined, parameters: {} }
+
+const contentType = headers => {
+  const contentType = headers['content-type']
+  return (
+    CACHE[contentType] || (CACHE[contentType] = parseContentType(contentType))
+  )
+}
+
+const getContentType = headers => contentType(headers).type
+
+const getCharset = headers =>
+  contentType(headers).parameters.charset?.toLowerCase()
+
+const getContentLength = headers => Number(headers['content-length'])
+
+module.exports = {
+  getCharset,
+  getContentLength,
+  getContentType
+}
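A quick usage sketch of the new helpers against a made-up headers object; note that parsed results are memoised per raw `content-type` string in the module-level `CACHE`, and the require path assumes the package root:

```js
// Illustrative usage of the new src/util.js helpers; headers are made up.
const { getCharset, getContentLength, getContentType } = require('./src/util')

const headers = {
  'content-type': 'text/html; charset=UTF-8',
  'content-length': '204800'
}

console.log(getContentType(headers))   // 'text/html'
console.log(getCharset(headers))       // 'utf-8'
console.log(getContentLength(headers)) // 204800

// Missing headers degrade gracefully: type/charset are undefined, length is NaN.
console.log(getContentType({}))   // undefined
console.log(getCharset({}))       // undefined
console.log(getContentLength({})) // NaN
```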