html-get 2.20.0 → 2.21.1

package/README.md CHANGED
@@ -121,11 +121,14 @@ Type: `object`
 
 Request headers that will be passed to fetch/prerender process.
 
-##### mutoolPath
+##### mutool
 
-Type: `function`
+Type: `function`|`boolean`<br>
+Default: `source code`
+
+It's a function that executes the [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup.
 
-It returns the path for [mutool](https://mupdf.com/) binary, used for turning PDF files into HTML markup.
+It can be explicitly disabled by passing `false`.
 
 ##### prerender
 
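A minimal usage sketch of the renamed option (not taken from the package itself), assuming the top-level `html-get` entry point; the custom runner below mirrors the `mutool draw -q -F html` invocation used by the package's default, and the binary path is hypothetical.

```js
const getHTML = require('html-get')
const $ = require('tinyspawn')

// Opt out of PDF → HTML conversion entirely.
getHTML('https://example.com/paper.pdf', { mutool: false })

// Or provide a custom runner (hypothetical binary path).
getHTML('https://example.com/paper.pdf', {
  mutool: (...args) => $(`/usr/local/bin/mutool draw -q -F html ${args}`)
})
```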
package/package.json CHANGED
@@ -2,7 +2,7 @@
   "name": "html-get",
   "description": "Get the HTML from any website, fine-tuned for correction & speed",
   "homepage": "https://nicedoc.com/microlinkhq/html-get",
-  "version": "2.20.0",
+  "version": "2.21.1",
   "main": "src/index.js",
   "bin": {
     "html-get": "bin/index.js"
@@ -38,6 +38,7 @@
     "@kikobeats/time-span": "~1.0.5",
     "@metascraper/helpers": "~5.46.1",
     "cheerio": "~1.0.0",
+    "content-type": "~1.0.5",
     "css-url-regex": "~4.0.0",
     "debug-logfmt": "~1.2.3",
     "execall": "~2.0.0",
@@ -50,7 +51,7 @@
     "mri": "~1.2.0",
     "p-cancelable": "~2.1.0",
     "p-retry": "~4.6.0",
-    "tinyspawn": "~1.3.3",
+    "tinyspawn": "~1.4.0",
     "top-sites": "~1.1.220"
   },
   "devDependencies": {
@@ -97,7 +98,7 @@
   "ava": {
     "files": [
       "test/**/*.js",
-      "!test/util.js"
+      "!test/helpers.js"
     ],
     "timeout": "2m",
     "workerThreads": false
package/src/auto-domains.json CHANGED
@@ -1 +1 @@
-[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
+[[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","bbc"]],[["domain","x.com"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/html.js CHANGED
@@ -1,7 +1,7 @@
 'use strict'
 
-const { get, split, nth, castArray, forEach } = require('lodash')
 const debug = require('debug-logfmt')('html-get:rewrite')
+const { get, castArray, forEach } = require('lodash')
 const isLocalAddress = require('is-local-address')
 const { TAGS: URL_TAGS } = require('html-urls')
 const isHTML = require('is-html-content')
@@ -19,6 +19,8 @@ const {
   parseUrl
 } = require('@metascraper/helpers')
 
+const { getContentType, getCharset } = require('./util')
+
 const has = el => el.length !== 0
 
 const upsert = (el, collection, item) => !has(el) && collection.push(item)
@@ -35,8 +37,7 @@ const getDate = headers => {
 
 const addHead = ({ $, url, headers }) => {
   const tags = []
-  const contentType = get(headers, 'content-type')
-  const charset = nth(split(contentType, 'charset='), 1)
+  const charset = getCharset(headers)
   const date = getDate(headers)
   const { domain } = parseUrl(url)
   const head = $('head')
@@ -73,8 +74,7 @@ const addHead = ({ $, url, headers }) => {
 }
 
 const addBody = ({ url, headers, html }) => {
-  const contentType = get(headers, 'content-type')
-
+  const contentType = getContentType(headers)
   let element = ''
 
   if (isMime(contentType, 'image')) {
package/src/index.js CHANGED
@@ -1,10 +1,10 @@
 'use strict'
 
 const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
+const { readFile, writeFile } = require('fs/promises')
 const timeSpan = require('@kikobeats/time-span')()
 const debug = require('debug-logfmt')('html-get')
 const { execSync } = require('child_process')
-const { writeFile } = require('fs/promises')
 const PCancelable = require('p-cancelable')
 const { AbortError } = require('p-retry')
 const htmlEncode = require('html-encode')
@@ -14,6 +14,7 @@ const path = require('path')
 const got = require('got')
 const os = require('os')
 
+const { getContentLength, getContentType } = require('./util')
 const autoDomains = require('./auto-domains')
 const addHtml = require('./html')
 
@@ -21,12 +22,14 @@ const REQ_TIMEOUT = 8000
 
 const ABORT_TYPES = ['image', 'stylesheet', 'font']
 
+const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb
+
 const fetch = PCancelable.fn(
   async (
     url,
     {
       getTemporalFile,
-      mutoolPath,
+      mutool,
       reflect = false,
       timeout = REQ_TIMEOUT,
       toEncode,
@@ -58,14 +61,22 @@ const fetch = PCancelable.fn(
     const res = await req
 
     const html = await (async () => {
-      const contentType = res.headers['content-type'] ?? ''
-      if (mutoolPath && contentType === 'application/pdf') {
+      const contentType = getContentType(res.headers)
+
+      if (mutool && contentType === 'application/pdf') {
         const file = getTemporalFile(url, 'pdf')
         await writeFile(file.path, res.body)
-        return (await $(`${mutoolPath} draw -q -F html ${file.path}`)).stdout
+        if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) {
+          const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
+          await mutool(`-o ${ofile.path} ${file.path}`)
+          return readFile(ofile.path, 'utf-8')
+        } else {
+          const { stdout } = await mutool(file.path)
+          return stdout
+        }
       }
 
-      return contentType.startsWith('text/html') || !isMediaUrl(url)
+      return contentType === 'text/html' || !isMediaUrl(url)
         ? await toEncode(res.body, res.headers['content-type'])
         : res.body.toString()
     })()
@@ -193,8 +204,8 @@ const defaultGetMode = (url, { prerender }) => {
   return isFetchMode(url) ? 'fetch' : 'prerender'
 }
 
-const defaultGetTemporalFile = (url, ext) => {
-  const hash = crypto.createHash('sha256').update(url).digest('hex')
+const defaultGetTemporalFile = (input, ext) => {
+  const hash = crypto.createHash('sha256').update(input).digest('hex')
   const filepath = path.join(
     os.tmpdir(),
     ext === undefined ? hash : `${hash}.${ext}`
@@ -202,10 +213,13 @@ const defaultGetTemporalFile = (url, ext) => {
   return { path: filepath }
 }
 
-const defaultMutoolPath = () =>
+const defaultMutool = () =>
   (() => {
     try {
-      return execSync('which mutool', { stdio: 'pipe' }).toString().trim()
+      const mutoolPath = execSync('which mutool', { stdio: 'pipe' })
+        .toString()
+        .trim()
+      return (...args) => $(`${mutoolPath} draw -q -F html ${args}`)
     } catch (_) {}
   })()
 
@@ -218,7 +232,7 @@ const getContent = PCancelable.fn(
       getTemporalFile,
       gotOpts,
       headers,
-      mutoolPath,
+      mutool,
      puppeteerOpts,
       rewriteUrls,
       rewriteHtml,
@@ -229,7 +243,7 @@
     const isFetchMode = mode === 'fetch'
 
     const fetchOpts = isFetchMode
-      ? { headers, toEncode, mutoolPath, getTemporalFile, ...gotOpts }
+      ? { headers, toEncode, mutool, getTemporalFile, ...gotOpts }
       : { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
 
     const promise = modes[mode](url, fetchOpts)
@@ -258,7 +272,7 @@ module.exports = PCancelable.fn(
      getTemporalFile = defaultGetTemporalFile,
      gotOpts,
      headers,
-      mutoolPath = defaultMutoolPath(),
+      mutool = defaultMutool(),
      prerender = 'auto',
      puppeteerOpts,
      rewriteHtml = false,
@@ -283,7 +297,7 @@ module.exports = PCancelable.fn(
      getTemporalFile,
      gotOpts,
      headers,
-      mutoolPath,
+      mutool,
      puppeteerOpts,
      rewriteUrls,
      rewriteHtml,
@@ -303,5 +317,7 @@
 
 module.exports.REQ_TIMEOUT = REQ_TIMEOUT
 module.exports.ABORT_TYPES = ABORT_TYPES
+module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD
 module.exports.isFetchMode = isFetchMode
 module.exports.getContent = getContent
+module.exports.defaultMutool = defaultMutool
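For context, a hedged sketch of the two call shapes exercised above: PDFs under `PDF_SIZE_TRESHOLD` (150 KB) are converted in one shot with the HTML read from stdout, while larger ones are rendered to a temporary output file and read back from disk. The file paths here are hypothetical.

```js
const { defaultMutool } = require('html-get')

const convert = async () => {
  const mutool = defaultMutool() // undefined when the mutool binary is not installed
  if (!mutool) return

  // Small PDF: the HTML arrives on stdout.
  const { stdout } = await mutool('/tmp/small.pdf')

  // Large PDF: write the HTML to an output file instead of stdout.
  await mutool('-o /tmp/large.html /tmp/large.pdf')

  return stdout
}
```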
package/src/util.js ADDED
@@ -0,0 +1,30 @@
+'use strict'
+
+const { parse } = require('content-type')
+
+const CACHE = Object.create(null)
+
+const parseContentType = contentType =>
+  typeof contentType === 'string'
+    ? parse(contentType)
+    : { type: undefined, parameters: {} }
+
+const contentType = headers => {
+  const contentType = headers['content-type']
+  return (
+    CACHE[contentType] || (CACHE[contentType] = parseContentType(contentType))
+  )
+}
+
+const getContentType = headers => contentType(headers).type
+
+const getCharset = headers =>
+  contentType(headers).parameters.charset?.toLowerCase()
+
+const getContentLength = headers => Number(headers['content-length'])
+
+module.exports = {
+  getCharset,
+  getContentLength,
+  getContentType
+}
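A quick illustration of what the new helpers return, based on the `content-type` parser used above (expected values shown as comments):

```js
const { getContentType, getCharset, getContentLength } = require('./util')

getContentType({ 'content-type': 'text/html; charset=UTF-8' }) // 'text/html'
getCharset({ 'content-type': 'text/html; charset=UTF-8' }) // 'utf-8'
getContentLength({ 'content-length': '153600' }) // 153600
getContentType({}) // undefined (no content-type header)
```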