html-get 2.19.0 → 2.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -3
- package/package.json +4 -7
- package/src/auto-domains.json +1 -1
- package/src/html.js +32 -12
- package/src/index.js +39 -19
- package/src/util.js +30 -0
package/README.md
CHANGED
|
@@ -121,11 +121,14 @@ Type: `object`
|
|
|
121
121
|
|
|
122
122
|
Request headers that will be passed to fetch/prerender process.
|
|
123
123
|
|
|
124
|
-
#####
|
|
124
|
+
##### mutool
|
|
125
125
|
|
|
126
|
-
Type: `function
|
|
126
|
+
Type: `function`|`boolean`<br>
|
|
127
|
+
Default: `source code`
|
|
128
|
+
|
|
129
|
+
It returns a function that executes the [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup.
|
|
127
130
|
|
|
128
|
-
It
|
|
131
|
+
It can be explicitly disabled by passing `false`.
|
|
129
132
|
|
|
130
133
|
##### prerender
|
|
131
134
|
|
|
@@ -158,6 +161,12 @@ Default: `false`
|
|
|
158
161
|
|
|
159
162
|
When it is `true`, it will rewrite some common mistakes related to HTML meta tags.
|
|
160
163
|
|
|
164
|
+
##### serializeHtml
|
|
165
|
+
|
|
166
|
+
It determines how HTML should be serialized before returning.
|
|
167
|
+
|
|
168
|
+
It's serialized `$ => ({ html: $.html() })` by default.
|
|
169
|
+
|
|
161
170
|
## License
|
|
162
171
|
|
|
163
172
|
**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, fine-tuned for correction & speed",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.
|
|
5
|
+
"version": "2.21.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
"@kikobeats/time-span": "~1.0.5",
|
|
39
39
|
"@metascraper/helpers": "~5.46.1",
|
|
40
40
|
"cheerio": "~1.0.0",
|
|
41
|
+
"content-type": "~1.0.5",
|
|
41
42
|
"css-url-regex": "~4.0.0",
|
|
42
43
|
"debug-logfmt": "~1.2.3",
|
|
43
44
|
"execall": "~2.0.0",
|
|
@@ -71,7 +72,6 @@
|
|
|
71
72
|
"regex-iso-date": "latest",
|
|
72
73
|
"simple-git-hooks": "latest",
|
|
73
74
|
"standard": "latest",
|
|
74
|
-
"standard-markdown": "latest",
|
|
75
75
|
"standard-version": "latest"
|
|
76
76
|
},
|
|
77
77
|
"engines": {
|
|
@@ -85,7 +85,7 @@
|
|
|
85
85
|
"scripts": {
|
|
86
86
|
"clean": "rm -rf node_modules",
|
|
87
87
|
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
88
|
-
"lint": "standard
|
|
88
|
+
"lint": "standard",
|
|
89
89
|
"postinstall": "node scripts/postinstall",
|
|
90
90
|
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
91
91
|
"pretest": "npm run lint",
|
|
@@ -98,7 +98,7 @@
|
|
|
98
98
|
"ava": {
|
|
99
99
|
"files": [
|
|
100
100
|
"test/**/*.js",
|
|
101
|
-
"!test/
|
|
101
|
+
"!test/helpers.js"
|
|
102
102
|
],
|
|
103
103
|
"timeout": "2m",
|
|
104
104
|
"workerThreads": false
|
|
@@ -118,9 +118,6 @@
|
|
|
118
118
|
"prettier-standard",
|
|
119
119
|
"standard --fix"
|
|
120
120
|
],
|
|
121
|
-
"*.md": [
|
|
122
|
-
"standard-markdown"
|
|
123
|
-
],
|
|
124
121
|
"package.json": [
|
|
125
122
|
"finepack"
|
|
126
123
|
]
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","bbc"]],[["domain","x.com"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/html.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
const { get, split, nth, castArray, forEach } = require('lodash')
|
|
4
3
|
const debug = require('debug-logfmt')('html-get:rewrite')
|
|
4
|
+
const { get, castArray, forEach } = require('lodash')
|
|
5
5
|
const isLocalAddress = require('is-local-address')
|
|
6
6
|
const { TAGS: URL_TAGS } = require('html-urls')
|
|
7
7
|
const isHTML = require('is-html-content')
|
|
@@ -19,6 +19,8 @@ const {
|
|
|
19
19
|
parseUrl
|
|
20
20
|
} = require('@metascraper/helpers')
|
|
21
21
|
|
|
22
|
+
const { getContentType, getCharset } = require('./util')
|
|
23
|
+
|
|
22
24
|
const has = el => el.length !== 0
|
|
23
25
|
|
|
24
26
|
const upsert = (el, collection, item) => !has(el) && collection.push(item)
|
|
@@ -35,8 +37,7 @@ const getDate = headers => {
|
|
|
35
37
|
|
|
36
38
|
const addHead = ({ $, url, headers }) => {
|
|
37
39
|
const tags = []
|
|
38
|
-
const
|
|
39
|
-
const charset = nth(split(contentType, 'charset='), 1)
|
|
40
|
+
const charset = getCharset(headers)
|
|
40
41
|
const date = getDate(headers)
|
|
41
42
|
const { domain } = parseUrl(url)
|
|
42
43
|
const head = $('head')
|
|
@@ -73,8 +74,7 @@ const addHead = ({ $, url, headers }) => {
|
|
|
73
74
|
}
|
|
74
75
|
|
|
75
76
|
const addBody = ({ url, headers, html }) => {
|
|
76
|
-
const contentType =
|
|
77
|
-
|
|
77
|
+
const contentType = getContentType(headers)
|
|
78
78
|
let element = ''
|
|
79
79
|
|
|
80
80
|
if (isMime(contentType, 'image')) {
|
|
@@ -132,24 +132,44 @@ const rewriteHtmlUrls = ({ $, url }) => {
|
|
|
132
132
|
})
|
|
133
133
|
}
|
|
134
134
|
|
|
135
|
-
const
|
|
136
|
-
const cssUrls = Array.from(
|
|
137
|
-
|
|
135
|
+
const replaceCssUrls = (url, stylesheet) => {
|
|
136
|
+
const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
|
|
137
|
+
(acc, match) => {
|
|
138
138
|
match.subMatches.forEach(match => acc.add(match))
|
|
139
139
|
return acc
|
|
140
|
-
},
|
|
140
|
+
},
|
|
141
|
+
new Set()
|
|
141
142
|
)
|
|
142
143
|
|
|
143
144
|
cssUrls.forEach(cssUrl => {
|
|
144
145
|
if (cssUrl.startsWith('/')) {
|
|
145
146
|
try {
|
|
146
147
|
const absoluteUrl = new URL(cssUrl, url).toString()
|
|
147
|
-
|
|
148
|
+
stylesheet = stylesheet.replaceAll(
|
|
149
|
+
`url(${cssUrl})`,
|
|
150
|
+
`url(${absoluteUrl})`
|
|
151
|
+
)
|
|
148
152
|
} catch (_) {}
|
|
149
153
|
}
|
|
150
154
|
})
|
|
151
155
|
|
|
152
|
-
return
|
|
156
|
+
return stylesheet
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
const rewriteCssUrls = ({ $, url }) => {
|
|
160
|
+
// Process <style> tags
|
|
161
|
+
// e.g., <style>body { background-image: url('/image.jpg'); }</style>
|
|
162
|
+
$('style').each((_, element) =>
|
|
163
|
+
$(element).html(replaceCssUrls(url, $(element).html()))
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
// Process elements with style attributes
|
|
167
|
+
// e.g., <div style="background-image: url('/image.jpg');"></div>
|
|
168
|
+
$('[style]').each((_, element) =>
|
|
169
|
+
$(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
return $
|
|
153
173
|
}
|
|
154
174
|
|
|
155
175
|
const injectStyle = ({ $, styles }) =>
|
|
@@ -216,7 +236,7 @@ module.exports = ({
|
|
|
216
236
|
if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
|
|
217
237
|
if (modules) injectScripts({ $, modules, type: 'module' })
|
|
218
238
|
|
|
219
|
-
return rewriteUrls ? rewriteCssUrls({
|
|
239
|
+
return rewriteUrls ? rewriteCssUrls({ $, url }) : $
|
|
220
240
|
}
|
|
221
241
|
|
|
222
242
|
module.exports.getDate = getDate
|
package/src/index.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
3
|
const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
|
|
4
|
+
const { readFile, writeFile } = require('fs/promises')
|
|
4
5
|
const timeSpan = require('@kikobeats/time-span')()
|
|
5
6
|
const debug = require('debug-logfmt')('html-get')
|
|
6
7
|
const { execSync } = require('child_process')
|
|
7
|
-
const { writeFile } = require('fs/promises')
|
|
8
8
|
const PCancelable = require('p-cancelable')
|
|
9
9
|
const { AbortError } = require('p-retry')
|
|
10
10
|
const htmlEncode = require('html-encode')
|
|
@@ -14,6 +14,7 @@ const path = require('path')
|
|
|
14
14
|
const got = require('got')
|
|
15
15
|
const os = require('os')
|
|
16
16
|
|
|
17
|
+
const { getContentLength, getContentType } = require('./util')
|
|
17
18
|
const autoDomains = require('./auto-domains')
|
|
18
19
|
const addHtml = require('./html')
|
|
19
20
|
|
|
@@ -21,12 +22,14 @@ const REQ_TIMEOUT = 8000
|
|
|
21
22
|
|
|
22
23
|
const ABORT_TYPES = ['image', 'stylesheet', 'font']
|
|
23
24
|
|
|
25
|
+
const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb
|
|
26
|
+
|
|
24
27
|
const fetch = PCancelable.fn(
|
|
25
28
|
async (
|
|
26
29
|
url,
|
|
27
30
|
{
|
|
28
31
|
getTemporalFile,
|
|
29
|
-
|
|
32
|
+
mutool,
|
|
30
33
|
reflect = false,
|
|
31
34
|
timeout = REQ_TIMEOUT,
|
|
32
35
|
toEncode,
|
|
@@ -58,14 +61,22 @@ const fetch = PCancelable.fn(
|
|
|
58
61
|
const res = await req
|
|
59
62
|
|
|
60
63
|
const html = await (async () => {
|
|
61
|
-
const contentType = res.headers
|
|
62
|
-
|
|
64
|
+
const contentType = getContentType(res.headers)
|
|
65
|
+
|
|
66
|
+
if (mutool && contentType === 'application/pdf') {
|
|
63
67
|
const file = getTemporalFile(url, 'pdf')
|
|
64
68
|
await writeFile(file.path, res.body)
|
|
65
|
-
|
|
69
|
+
if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) {
|
|
70
|
+
const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
|
|
71
|
+
await mutool(`-o ${ofile.path} ${file.path}`)
|
|
72
|
+
return readFile(ofile.path, 'utf-8')
|
|
73
|
+
} else {
|
|
74
|
+
const { stdout } = await mutool(file.path)
|
|
75
|
+
return stdout
|
|
76
|
+
}
|
|
66
77
|
}
|
|
67
78
|
|
|
68
|
-
return contentType
|
|
79
|
+
return contentType === 'text/html' || !isMediaUrl(url)
|
|
69
80
|
? await toEncode(res.body, res.headers['content-type'])
|
|
70
81
|
: res.body.toString()
|
|
71
82
|
})()
|
|
@@ -193,8 +204,8 @@ const defaultGetMode = (url, { prerender }) => {
|
|
|
193
204
|
return isFetchMode(url) ? 'fetch' : 'prerender'
|
|
194
205
|
}
|
|
195
206
|
|
|
196
|
-
const defaultGetTemporalFile = (
|
|
197
|
-
const hash = crypto.createHash('sha256').update(
|
|
207
|
+
const defaultGetTemporalFile = (input, ext) => {
|
|
208
|
+
const hash = crypto.createHash('sha256').update(input).digest('hex')
|
|
198
209
|
const filepath = path.join(
|
|
199
210
|
os.tmpdir(),
|
|
200
211
|
ext === undefined ? hash : `${hash}.${ext}`
|
|
@@ -202,10 +213,13 @@ const defaultGetTemporalFile = (url, ext) => {
|
|
|
202
213
|
return { path: filepath }
|
|
203
214
|
}
|
|
204
215
|
|
|
205
|
-
const
|
|
216
|
+
const defaultMutool = () =>
|
|
206
217
|
(() => {
|
|
207
218
|
try {
|
|
208
|
-
|
|
219
|
+
const mutoolPath = execSync('which mutool', { stdio: 'pipe' })
|
|
220
|
+
.toString()
|
|
221
|
+
.trim()
|
|
222
|
+
return (...args) => $(`${mutoolPath} draw -q -F html ${args}`)
|
|
209
223
|
} catch (_) {}
|
|
210
224
|
})()
|
|
211
225
|
|
|
@@ -218,7 +232,7 @@ const getContent = PCancelable.fn(
|
|
|
218
232
|
getTemporalFile,
|
|
219
233
|
gotOpts,
|
|
220
234
|
headers,
|
|
221
|
-
|
|
235
|
+
mutool,
|
|
222
236
|
puppeteerOpts,
|
|
223
237
|
rewriteUrls,
|
|
224
238
|
rewriteHtml,
|
|
@@ -229,21 +243,21 @@ const getContent = PCancelable.fn(
|
|
|
229
243
|
const isFetchMode = mode === 'fetch'
|
|
230
244
|
|
|
231
245
|
const fetchOpts = isFetchMode
|
|
232
|
-
? { headers, toEncode,
|
|
246
|
+
? { headers, toEncode, mutool, getTemporalFile, ...gotOpts }
|
|
233
247
|
: { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
|
|
234
248
|
|
|
235
249
|
const promise = modes[mode](url, fetchOpts)
|
|
236
250
|
onCancel(() => promise.cancel())
|
|
237
251
|
|
|
238
252
|
return promise.then(content => {
|
|
239
|
-
const
|
|
253
|
+
const $ = addHtml({
|
|
240
254
|
...content,
|
|
241
255
|
...(isFetchMode ? puppeteerOpts : undefined),
|
|
242
256
|
rewriteUrls,
|
|
243
257
|
rewriteHtml
|
|
244
258
|
})
|
|
245
259
|
|
|
246
|
-
return { ...content,
|
|
260
|
+
return { ...content, $ }
|
|
247
261
|
})
|
|
248
262
|
}
|
|
249
263
|
)
|
|
@@ -258,11 +272,12 @@ module.exports = PCancelable.fn(
|
|
|
258
272
|
getTemporalFile = defaultGetTemporalFile,
|
|
259
273
|
gotOpts,
|
|
260
274
|
headers,
|
|
261
|
-
|
|
275
|
+
mutool = defaultMutool(),
|
|
262
276
|
prerender = 'auto',
|
|
263
277
|
puppeteerOpts,
|
|
278
|
+
rewriteHtml = false,
|
|
264
279
|
rewriteUrls = false,
|
|
265
|
-
|
|
280
|
+
serializeHtml = $ => ({ html: $.html() })
|
|
266
281
|
} = {},
|
|
267
282
|
onCancel
|
|
268
283
|
) => {
|
|
@@ -282,7 +297,7 @@ module.exports = PCancelable.fn(
|
|
|
282
297
|
getTemporalFile,
|
|
283
298
|
gotOpts,
|
|
284
299
|
headers,
|
|
285
|
-
|
|
300
|
+
mutool,
|
|
286
301
|
puppeteerOpts,
|
|
287
302
|
rewriteUrls,
|
|
288
303
|
rewriteHtml,
|
|
@@ -291,13 +306,18 @@ module.exports = PCancelable.fn(
|
|
|
291
306
|
|
|
292
307
|
onCancel(() => promise.cancel())
|
|
293
308
|
|
|
294
|
-
const { mode, ...payload } = await promise
|
|
309
|
+
const { mode, $, ...payload } = await promise
|
|
295
310
|
|
|
296
|
-
return Object.assign(payload, {
|
|
311
|
+
return Object.assign(payload, {
|
|
312
|
+
...serializeHtml($),
|
|
313
|
+
stats: { mode, timing: duration() }
|
|
314
|
+
})
|
|
297
315
|
}
|
|
298
316
|
)
|
|
299
317
|
|
|
300
318
|
module.exports.REQ_TIMEOUT = REQ_TIMEOUT
|
|
301
319
|
module.exports.ABORT_TYPES = ABORT_TYPES
|
|
320
|
+
module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD
|
|
302
321
|
module.exports.isFetchMode = isFetchMode
|
|
303
322
|
module.exports.getContent = getContent
|
|
323
|
+
module.exports.defaultMutool = defaultMutool
|
package/src/util.js
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
'use strict'
|
|
2
|
+
|
|
3
|
+
const { parse } = require('content-type')
|
|
4
|
+
|
|
5
|
+
const CACHE = Object.create(null)
|
|
6
|
+
|
|
7
|
+
const parseContentType = contentType =>
|
|
8
|
+
typeof contentType === 'string'
|
|
9
|
+
? parse(contentType)
|
|
10
|
+
: { type: undefined, parameters: {} }
|
|
11
|
+
|
|
12
|
+
const contentType = headers => {
|
|
13
|
+
const contentType = headers['content-type']
|
|
14
|
+
return (
|
|
15
|
+
CACHE[contentType] || (CACHE[contentType] = parseContentType(contentType))
|
|
16
|
+
)
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
const getContentType = headers => contentType(headers).type
|
|
20
|
+
|
|
21
|
+
const getCharset = headers =>
|
|
22
|
+
contentType(headers).parameters.charset?.toLowerCase()
|
|
23
|
+
|
|
24
|
+
const getContentLength = headers => Number(headers['content-length'])
|
|
25
|
+
|
|
26
|
+
module.exports = {
|
|
27
|
+
getCharset,
|
|
28
|
+
getContentLength,
|
|
29
|
+
getContentType
|
|
30
|
+
}
|