html-get 2.14.3 → 2.15.0-0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/LICENSE CHANGED
File without changes
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, using prerendering when is necessary.",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.14.3",
5
+ "version": "2.15.0-0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -44,6 +44,7 @@
44
44
  "p-cancelable": "~2.1.0",
45
45
  "p-retry": "~4.6.0",
46
46
  "replace-string": "~3.1.0",
47
+ "tinyspawn": "~1.2.6",
47
48
  "top-sites": "~1.1.202"
48
49
  },
49
50
  "devDependencies": {
@@ -76,21 +77,6 @@
76
77
  "scripts",
77
78
  "src"
78
79
  ],
79
- "scripts": {
80
- "clean": "rm -rf node_modules",
81
- "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
82
- "lint": "standard-markdown README.md && standard",
83
- "postinstall": "node scripts/postinstall",
84
- "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
85
- "prerelease": "npm run update:check && npm run contributors",
86
- "pretest": "npm run lint",
87
- "release": "standard-version -a",
88
- "release:github": "github-generate-release",
89
- "release:tags": "git push --follow-tags origin HEAD:master",
90
- "test": "c8 ava",
91
- "update": "ncu -u",
92
- "update:check": "ncu -- --error-level 2"
93
- },
94
80
  "license": "MIT",
95
81
  "ava": {
96
82
  "files": [
@@ -120,5 +106,20 @@
120
106
  "simple-git-hooks": {
121
107
  "commit-msg": "npx commitlint --edit",
122
108
  "pre-commit": "npx nano-staged"
109
+ },
110
+ "scripts": {
111
+ "clean": "rm -rf node_modules",
112
+ "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
113
+ "lint": "standard-markdown README.md && standard",
114
+ "postinstall": "node scripts/postinstall",
115
+ "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
116
+ "prerelease": "npm run update:check && npm run contributors",
117
+ "pretest": "npm run lint",
118
+ "release": "standard-version -a",
119
+ "release:github": "github-generate-release",
120
+ "release:tags": "git push --follow-tags origin HEAD:master",
121
+ "test": "c8 ava",
122
+ "update": "ncu -u",
123
+ "update:check": "ncu -- --error-level 2"
123
124
  }
124
- }
125
+ }
File without changes
@@ -1 +1 @@
1
- [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","soundcloud"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","stackoverflow"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/index.js CHANGED
@@ -1,23 +1,30 @@
1
1
  'use strict'
2
2
 
3
- const { parseUrl, isMediaUrl } = require('@metascraper/helpers')
3
+ const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
4
4
  const timeSpan = require('@kikobeats/time-span')()
5
5
  const debug = require('debug-logfmt')('html-get')
6
+ const { execSync } = require('child_process')
7
+ const { writeFile } = require('fs/promises')
6
8
  const PCancelable = require('p-cancelable')
7
9
  const { AbortError } = require('p-retry')
8
10
  const htmlEncode = require('html-encode')
11
+ const crypto = require('crypto')
12
+ const $ = require('tinyspawn')
13
+ const path = require('path')
9
14
  const got = require('got')
15
+ const os = require('os')
10
16
 
11
17
  const autoDomains = require('./auto-domains')
12
18
  const addHtml = require('./html')
13
19
 
14
20
  const REQ_TIMEOUT = 8000
21
+
15
22
  const ABORT_TYPES = ['image', 'stylesheet', 'font']
16
23
 
17
24
  const fetch = PCancelable.fn(
18
25
  async (
19
26
  url,
20
- { reflect = false, toEncode, timeout = REQ_TIMEOUT, ...opts },
27
+ { reflect = false, toEncode, timeout = REQ_TIMEOUT, mutoolPath, getTemporalFile, ...opts },
21
28
  onCancel
22
29
  ) => {
23
30
  const reqTimeout = reflect ? timeout / 2 : timeout
@@ -37,13 +44,24 @@ const fetch = PCancelable.fn(
37
44
 
38
45
  try {
39
46
  const res = await req
47
+
48
+ const html = await (async () => {
49
+ const contentType = res.headers['content-type'] ?? ''
50
+ if (mutoolPath && contentType === 'application/pdf') {
51
+ const file = getTemporalFile(url, 'pdf')
52
+ await writeFile(file.path, res.body)
53
+ return (await $(`mutool draw -q -F html ${file.path}`)).stdout
54
+ }
55
+
56
+ return contentType.startsWith('text/html') ||
57
+ !isMediaUrl(url)
58
+ ? await toEncode(res.body, res.headers['content-type'])
59
+ : res.body
60
+ })()
61
+
40
62
  return {
41
63
  headers: res.headers,
42
- html:
43
- res.headers['content-type'].startsWith('text/html') ||
44
- !isMediaUrl(url)
45
- ? await toEncode(res.body, res.headers['content-type'])
46
- : res.body,
64
+ html,
47
65
  mode: 'fetch',
48
66
  url: res.url,
49
67
  statusCode: res.statusCode
@@ -149,22 +167,31 @@ const isFetchMode = url => {
149
167
  )
150
168
  }
151
169
 
152
- const determinateMode = (url, { prerender }) => {
153
- if (prerender === false || isMediaUrl(url)) return 'fetch'
170
+ const defaultGetMode = (url, { prerender }) => {
171
+ if (prerender === false || isMediaUrl(url) || isPdfUrl(url)) return 'fetch'
154
172
  if (prerender === true) return 'prerender'
155
173
  return isFetchMode(url) ? 'fetch' : 'prerender'
156
174
  }
157
175
 
176
+ const defaultGetTemporalFile = (url, ext) => {
177
+ const hash = crypto.createHash('sha256').update(url).digest('hex')
178
+ const filepath = path.join(os.tmpdir(), ext === undefined ? hash : `${hash}.${ext}`)
179
+ return { path: filepath }
180
+ }
181
+
182
+ const defaultMutoolPath = () => (() => { try { return execSync('which mutool').toString().trim() } catch (_) {} })()
183
+
158
184
  const getContent = PCancelable.fn(
159
185
  (
160
186
  url,
161
187
  mode,
162
- { getBrowserless, gotOpts, headers, puppeteerOpts, rewriteUrls, toEncode },
188
+ { getBrowserless, gotOpts, headers, puppeteerOpts, rewriteUrls, toEncode, mutoolPath, getTemporalFile },
163
189
  onCancel
164
190
  ) => {
165
191
  const isFetchMode = mode === 'fetch'
192
+
166
193
  const fetchOpts = isFetchMode
167
- ? { headers, toEncode, ...gotOpts }
194
+ ? { headers, toEncode, mutoolPath, getTemporalFile, ...gotOpts }
168
195
  : { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
169
196
 
170
197
  const promise = modes[mode](url, fetchOpts)
@@ -188,9 +215,11 @@ module.exports = PCancelable.fn(
188
215
  {
189
216
  encoding = 'utf-8',
190
217
  getBrowserless,
191
- getMode = determinateMode,
218
+ getMode = defaultGetMode,
219
+ getTemporalFile = defaultGetTemporalFile,
192
220
  gotOpts,
193
221
  headers,
222
+ mutoolPath = defaultMutoolPath(),
194
223
  prerender = 'auto',
195
224
  puppeteerOpts,
196
225
  rewriteUrls = false
@@ -210,8 +239,10 @@ module.exports = PCancelable.fn(
210
239
 
211
240
  const promise = getContent(targetUrl, reqMode, {
212
241
  getBrowserless,
242
+ getTemporalFile,
213
243
  gotOpts,
214
244
  headers,
245
+ mutoolPath,
215
246
  puppeteerOpts,
216
247
  rewriteUrls,
217
248
  toEncode