html-get 2.19.0 → 2.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -121,11 +121,14 @@ Type: `object`
121
121
 
122
122
  Request headers that will be passed to fetch/prerender process.
123
123
 
124
- ##### mutoolPath
124
+ ##### mutool
125
125
 
126
- Type: `function`
126
+ Type: `function`|`boolean`<br>
127
+ Default: `source code`
128
+
129
+ It returns a function that executes the [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup.
127
130
 
128
- It returns the path for [mutool](https://mupdf.com/) binary, used for turning PDF files into HTML markup.
131
+ It can be explicitly disabled by passing `false`.
129
132
 
130
133
  ##### prerender
131
134
 
@@ -158,6 +161,12 @@ Default: `false`
158
161
 
159
162
  When `true`, it will rewrite some common mistakes related to HTML meta tags.
160
163
 
164
+ ##### serializeHtml
165
+
166
+ It determines how HTML should be serialized before returning.
167
+
168
+ By default, it is serialized using `$ => ({ html: $.html() })`.
169
+
161
170
  ## License
162
171
 
163
172
  **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.19.0",
5
+ "version": "2.21.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -38,6 +38,7 @@
38
38
  "@kikobeats/time-span": "~1.0.5",
39
39
  "@metascraper/helpers": "~5.46.1",
40
40
  "cheerio": "~1.0.0",
41
+ "content-type": "~1.0.5",
41
42
  "css-url-regex": "~4.0.0",
42
43
  "debug-logfmt": "~1.2.3",
43
44
  "execall": "~2.0.0",
@@ -71,7 +72,6 @@
71
72
  "regex-iso-date": "latest",
72
73
  "simple-git-hooks": "latest",
73
74
  "standard": "latest",
74
- "standard-markdown": "latest",
75
75
  "standard-version": "latest"
76
76
  },
77
77
  "engines": {
@@ -85,7 +85,7 @@
85
85
  "scripts": {
86
86
  "clean": "rm -rf node_modules",
87
87
  "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
88
- "lint": "standard-markdown README.md && standard",
88
+ "lint": "standard",
89
89
  "postinstall": "node scripts/postinstall",
90
90
  "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
91
91
  "pretest": "npm run lint",
@@ -98,7 +98,7 @@
98
98
  "ava": {
99
99
  "files": [
100
100
  "test/**/*.js",
101
- "!test/util.js"
101
+ "!test/helpers.js"
102
102
  ],
103
103
  "timeout": "2m",
104
104
  "workerThreads": false
@@ -118,9 +118,6 @@
118
118
  "prettier-standard",
119
119
  "standard --fix"
120
120
  ],
121
- "*.md": [
122
- "standard-markdown"
123
- ],
124
121
  "package.json": [
125
122
  "finepack"
126
123
  ]
@@ -1 +1 @@
1
- [[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","theguardian"]],[["domain","x.com"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","bbc"]],[["domain","x.com"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/html.js CHANGED
@@ -1,7 +1,7 @@
1
1
  'use strict'
2
2
 
3
- const { get, split, nth, castArray, forEach } = require('lodash')
4
3
  const debug = require('debug-logfmt')('html-get:rewrite')
4
+ const { get, castArray, forEach } = require('lodash')
5
5
  const isLocalAddress = require('is-local-address')
6
6
  const { TAGS: URL_TAGS } = require('html-urls')
7
7
  const isHTML = require('is-html-content')
@@ -19,6 +19,8 @@ const {
19
19
  parseUrl
20
20
  } = require('@metascraper/helpers')
21
21
 
22
+ const { getContentType, getCharset } = require('./util')
23
+
22
24
  const has = el => el.length !== 0
23
25
 
24
26
  const upsert = (el, collection, item) => !has(el) && collection.push(item)
@@ -35,8 +37,7 @@ const getDate = headers => {
35
37
 
36
38
  const addHead = ({ $, url, headers }) => {
37
39
  const tags = []
38
- const contentType = get(headers, 'content-type')
39
- const charset = nth(split(contentType, 'charset='), 1)
40
+ const charset = getCharset(headers)
40
41
  const date = getDate(headers)
41
42
  const { domain } = parseUrl(url)
42
43
  const head = $('head')
@@ -73,8 +74,7 @@ const addHead = ({ $, url, headers }) => {
73
74
  }
74
75
 
75
76
  const addBody = ({ url, headers, html }) => {
76
- const contentType = get(headers, 'content-type')
77
-
77
+ const contentType = getContentType(headers)
78
78
  let element = ''
79
79
 
80
80
  if (isMime(contentType, 'image')) {
@@ -132,24 +132,44 @@ const rewriteHtmlUrls = ({ $, url }) => {
132
132
  })
133
133
  }
134
134
 
135
- const rewriteCssUrls = ({ html, url }) => {
136
- const cssUrls = Array.from(
137
- execall(cssUrl(), html).reduce((acc, match) => {
135
+ const replaceCssUrls = (url, stylesheet) => {
136
+ const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
137
+ (acc, match) => {
138
138
  match.subMatches.forEach(match => acc.add(match))
139
139
  return acc
140
- }, new Set())
140
+ },
141
+ new Set()
141
142
  )
142
143
 
143
144
  cssUrls.forEach(cssUrl => {
144
145
  if (cssUrl.startsWith('/')) {
145
146
  try {
146
147
  const absoluteUrl = new URL(cssUrl, url).toString()
147
- html = html.replaceAll(`url(${cssUrl})`, `url(${absoluteUrl})`)
148
+ stylesheet = stylesheet.replaceAll(
149
+ `url(${cssUrl})`,
150
+ `url(${absoluteUrl})`
151
+ )
148
152
  } catch (_) {}
149
153
  }
150
154
  })
151
155
 
152
- return html
156
+ return stylesheet
157
+ }
158
+
159
+ const rewriteCssUrls = ({ $, url }) => {
160
+ // Process <style> tags
161
+ // e.g., <style>body { background-image: url('/image.jpg'); }</style>
162
+ $('style').each((_, element) =>
163
+ $(element).html(replaceCssUrls(url, $(element).html()))
164
+ )
165
+
166
+ // Process elements with style attributes
167
+ // e.g., <div style="background-image: url('/image.jpg');"></div>
168
+ $('[style]').each((_, element) =>
169
+ $(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
170
+ )
171
+
172
+ return $
153
173
  }
154
174
 
155
175
  const injectStyle = ({ $, styles }) =>
@@ -216,7 +236,7 @@ module.exports = ({
216
236
  if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
217
237
  if (modules) injectScripts({ $, modules, type: 'module' })
218
238
 
219
- return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
239
+ return rewriteUrls ? rewriteCssUrls({ $, url }) : $
220
240
  }
221
241
 
222
242
  module.exports.getDate = getDate
package/src/index.js CHANGED
@@ -1,10 +1,10 @@
1
1
  'use strict'
2
2
 
3
3
  const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
4
+ const { readFile, writeFile } = require('fs/promises')
4
5
  const timeSpan = require('@kikobeats/time-span')()
5
6
  const debug = require('debug-logfmt')('html-get')
6
7
  const { execSync } = require('child_process')
7
- const { writeFile } = require('fs/promises')
8
8
  const PCancelable = require('p-cancelable')
9
9
  const { AbortError } = require('p-retry')
10
10
  const htmlEncode = require('html-encode')
@@ -14,6 +14,7 @@ const path = require('path')
14
14
  const got = require('got')
15
15
  const os = require('os')
16
16
 
17
+ const { getContentLength, getContentType } = require('./util')
17
18
  const autoDomains = require('./auto-domains')
18
19
  const addHtml = require('./html')
19
20
 
@@ -21,12 +22,14 @@ const REQ_TIMEOUT = 8000
21
22
 
22
23
  const ABORT_TYPES = ['image', 'stylesheet', 'font']
23
24
 
25
+ const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb
26
+
24
27
  const fetch = PCancelable.fn(
25
28
  async (
26
29
  url,
27
30
  {
28
31
  getTemporalFile,
29
- mutoolPath,
32
+ mutool,
30
33
  reflect = false,
31
34
  timeout = REQ_TIMEOUT,
32
35
  toEncode,
@@ -58,14 +61,22 @@ const fetch = PCancelable.fn(
58
61
  const res = await req
59
62
 
60
63
  const html = await (async () => {
61
- const contentType = res.headers['content-type'] ?? ''
62
- if (mutoolPath && contentType === 'application/pdf') {
64
+ const contentType = getContentType(res.headers)
65
+
66
+ if (mutool && contentType === 'application/pdf') {
63
67
  const file = getTemporalFile(url, 'pdf')
64
68
  await writeFile(file.path, res.body)
65
- return (await $(`${mutoolPath} draw -q -F html ${file.path}`)).stdout
69
+ if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) {
70
+ const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
71
+ await mutool(`-o ${ofile.path} ${file.path}`)
72
+ return readFile(ofile.path, 'utf-8')
73
+ } else {
74
+ const { stdout } = await mutool(file.path)
75
+ return stdout
76
+ }
66
77
  }
67
78
 
68
- return contentType.startsWith('text/html') || !isMediaUrl(url)
79
+ return contentType === 'text/html' || !isMediaUrl(url)
69
80
  ? await toEncode(res.body, res.headers['content-type'])
70
81
  : res.body.toString()
71
82
  })()
@@ -193,8 +204,8 @@ const defaultGetMode = (url, { prerender }) => {
193
204
  return isFetchMode(url) ? 'fetch' : 'prerender'
194
205
  }
195
206
 
196
- const defaultGetTemporalFile = (url, ext) => {
197
- const hash = crypto.createHash('sha256').update(url).digest('hex')
207
+ const defaultGetTemporalFile = (input, ext) => {
208
+ const hash = crypto.createHash('sha256').update(input).digest('hex')
198
209
  const filepath = path.join(
199
210
  os.tmpdir(),
200
211
  ext === undefined ? hash : `${hash}.${ext}`
@@ -202,10 +213,13 @@ const defaultGetTemporalFile = (url, ext) => {
202
213
  return { path: filepath }
203
214
  }
204
215
 
205
- const defaultMutoolPath = () =>
216
+ const defaultMutool = () =>
206
217
  (() => {
207
218
  try {
208
- return execSync('which mutool', { stdio: 'pipe' }).toString().trim()
219
+ const mutoolPath = execSync('which mutool', { stdio: 'pipe' })
220
+ .toString()
221
+ .trim()
222
+ return (...args) => $(`${mutoolPath} draw -q -F html ${args}`)
209
223
  } catch (_) {}
210
224
  })()
211
225
 
@@ -218,7 +232,7 @@ const getContent = PCancelable.fn(
218
232
  getTemporalFile,
219
233
  gotOpts,
220
234
  headers,
221
- mutoolPath,
235
+ mutool,
222
236
  puppeteerOpts,
223
237
  rewriteUrls,
224
238
  rewriteHtml,
@@ -229,21 +243,21 @@ const getContent = PCancelable.fn(
229
243
  const isFetchMode = mode === 'fetch'
230
244
 
231
245
  const fetchOpts = isFetchMode
232
- ? { headers, toEncode, mutoolPath, getTemporalFile, ...gotOpts }
246
+ ? { headers, toEncode, mutool, getTemporalFile, ...gotOpts }
233
247
  : { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
234
248
 
235
249
  const promise = modes[mode](url, fetchOpts)
236
250
  onCancel(() => promise.cancel())
237
251
 
238
252
  return promise.then(content => {
239
- const html = addHtml({
253
+ const $ = addHtml({
240
254
  ...content,
241
255
  ...(isFetchMode ? puppeteerOpts : undefined),
242
256
  rewriteUrls,
243
257
  rewriteHtml
244
258
  })
245
259
 
246
- return { ...content, html }
260
+ return { ...content, $ }
247
261
  })
248
262
  }
249
263
  )
@@ -258,11 +272,12 @@ module.exports = PCancelable.fn(
258
272
  getTemporalFile = defaultGetTemporalFile,
259
273
  gotOpts,
260
274
  headers,
261
- mutoolPath = defaultMutoolPath(),
275
+ mutool = defaultMutool(),
262
276
  prerender = 'auto',
263
277
  puppeteerOpts,
278
+ rewriteHtml = false,
264
279
  rewriteUrls = false,
265
- rewriteHtml = false
280
+ serializeHtml = $ => ({ html: $.html() })
266
281
  } = {},
267
282
  onCancel
268
283
  ) => {
@@ -282,7 +297,7 @@ module.exports = PCancelable.fn(
282
297
  getTemporalFile,
283
298
  gotOpts,
284
299
  headers,
285
- mutoolPath,
300
+ mutool,
286
301
  puppeteerOpts,
287
302
  rewriteUrls,
288
303
  rewriteHtml,
@@ -291,13 +306,18 @@ module.exports = PCancelable.fn(
291
306
 
292
307
  onCancel(() => promise.cancel())
293
308
 
294
- const { mode, ...payload } = await promise
309
+ const { mode, $, ...payload } = await promise
295
310
 
296
- return Object.assign(payload, { stats: { mode, timing: duration() } })
311
+ return Object.assign(payload, {
312
+ ...serializeHtml($),
313
+ stats: { mode, timing: duration() }
314
+ })
297
315
  }
298
316
  )
299
317
 
300
318
  module.exports.REQ_TIMEOUT = REQ_TIMEOUT
301
319
  module.exports.ABORT_TYPES = ABORT_TYPES
320
+ module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD
302
321
  module.exports.isFetchMode = isFetchMode
303
322
  module.exports.getContent = getContent
323
+ module.exports.defaultMutool = defaultMutool
package/src/util.js ADDED
@@ -0,0 +1,30 @@
1
+ 'use strict'
2
+
3
+ const { parse } = require('content-type')
4
+
5
+ const CACHE = Object.create(null)
6
+
7
+ const parseContentType = contentType =>
8
+ typeof contentType === 'string'
9
+ ? parse(contentType)
10
+ : { type: undefined, parameters: {} }
11
+
12
+ const contentType = headers => {
13
+ const contentType = headers['content-type']
14
+ return (
15
+ CACHE[contentType] || (CACHE[contentType] = parseContentType(contentType))
16
+ )
17
+ }
18
+
19
+ const getContentType = headers => contentType(headers).type
20
+
21
+ const getCharset = headers =>
22
+ contentType(headers).parameters.charset?.toLowerCase()
23
+
24
+ const getContentLength = headers => Number(headers['content-length'])
25
+
26
+ module.exports = {
27
+ getCharset,
28
+ getContentLength,
29
+ getContentType
30
+ }