html-get 2.14.3 → 2.15.0-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +0 -0
- package/package.json +18 -17
- package/scripts/postinstall +0 -0
- package/src/auto-domains.json +1 -1
- package/src/index.js +43 -12
package/LICENSE
CHANGED
|
File without changes
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, using prerendering when is necessary.",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.14.3",
|
|
5
|
+
"version": "2.15.0-0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -44,6 +44,7 @@
|
|
|
44
44
|
"p-cancelable": "~2.1.0",
|
|
45
45
|
"p-retry": "~4.6.0",
|
|
46
46
|
"replace-string": "~3.1.0",
|
|
47
|
+
"tinyspawn": "~1.2.6",
|
|
47
48
|
"top-sites": "~1.1.202"
|
|
48
49
|
},
|
|
49
50
|
"devDependencies": {
|
|
@@ -76,21 +77,6 @@
|
|
|
76
77
|
"scripts",
|
|
77
78
|
"src"
|
|
78
79
|
],
|
|
79
|
-
"scripts": {
|
|
80
|
-
"clean": "rm -rf node_modules",
|
|
81
|
-
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
82
|
-
"lint": "standard-markdown README.md && standard",
|
|
83
|
-
"postinstall": "node scripts/postinstall",
|
|
84
|
-
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
85
|
-
"prerelease": "npm run update:check && npm run contributors",
|
|
86
|
-
"pretest": "npm run lint",
|
|
87
|
-
"release": "standard-version -a",
|
|
88
|
-
"release:github": "github-generate-release",
|
|
89
|
-
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
90
|
-
"test": "c8 ava",
|
|
91
|
-
"update": "ncu -u",
|
|
92
|
-
"update:check": "ncu -- --error-level 2"
|
|
93
|
-
},
|
|
94
80
|
"license": "MIT",
|
|
95
81
|
"ava": {
|
|
96
82
|
"files": [
|
|
@@ -120,5 +106,20 @@
|
|
|
120
106
|
"simple-git-hooks": {
|
|
121
107
|
"commit-msg": "npx commitlint --edit",
|
|
122
108
|
"pre-commit": "npx nano-staged"
|
|
109
|
+
},
|
|
110
|
+
"scripts": {
|
|
111
|
+
"clean": "rm -rf node_modules",
|
|
112
|
+
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
113
|
+
"lint": "standard-markdown README.md && standard",
|
|
114
|
+
"postinstall": "node scripts/postinstall",
|
|
115
|
+
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
116
|
+
"prerelease": "npm run update:check && npm run contributors",
|
|
117
|
+
"pretest": "npm run lint",
|
|
118
|
+
"release": "standard-version -a",
|
|
119
|
+
"release:github": "github-generate-release",
|
|
120
|
+
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
121
|
+
"test": "c8 ava",
|
|
122
|
+
"update": "ncu -u",
|
|
123
|
+
"update:check": "ncu -- --error-level 2"
|
|
123
124
|
}
|
|
124
|
-
}
|
|
125
|
+
}
|
package/scripts/postinstall
CHANGED
|
File without changes
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","stackoverflow"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/index.js
CHANGED
|
@@ -1,23 +1,30 @@
|
|
|
1
1
|
'use strict'
|
|
2
2
|
|
|
3
|
-
const { parseUrl, isMediaUrl } = require('@metascraper/helpers')
|
|
3
|
+
const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
|
|
4
4
|
const timeSpan = require('@kikobeats/time-span')()
|
|
5
5
|
const debug = require('debug-logfmt')('html-get')
|
|
6
|
+
const { execSync } = require('child_process')
|
|
7
|
+
const { writeFile } = require('fs/promises')
|
|
6
8
|
const PCancelable = require('p-cancelable')
|
|
7
9
|
const { AbortError } = require('p-retry')
|
|
8
10
|
const htmlEncode = require('html-encode')
|
|
11
|
+
const crypto = require('crypto')
|
|
12
|
+
const $ = require('tinyspawn')
|
|
13
|
+
const path = require('path')
|
|
9
14
|
const got = require('got')
|
|
15
|
+
const os = require('os')
|
|
10
16
|
|
|
11
17
|
const autoDomains = require('./auto-domains')
|
|
12
18
|
const addHtml = require('./html')
|
|
13
19
|
|
|
14
20
|
const REQ_TIMEOUT = 8000
|
|
21
|
+
|
|
15
22
|
const ABORT_TYPES = ['image', 'stylesheet', 'font']
|
|
16
23
|
|
|
17
24
|
const fetch = PCancelable.fn(
|
|
18
25
|
async (
|
|
19
26
|
url,
|
|
20
|
-
{ reflect = false, toEncode, timeout = REQ_TIMEOUT, ...opts },
|
|
27
|
+
{ reflect = false, toEncode, timeout = REQ_TIMEOUT, mutoolPath, getTemporalFile, ...opts },
|
|
21
28
|
onCancel
|
|
22
29
|
) => {
|
|
23
30
|
const reqTimeout = reflect ? timeout / 2 : timeout
|
|
@@ -37,13 +44,24 @@ const fetch = PCancelable.fn(
|
|
|
37
44
|
|
|
38
45
|
try {
|
|
39
46
|
const res = await req
|
|
47
|
+
|
|
48
|
+
const html = await (async () => {
|
|
49
|
+
const contentType = res.headers['content-type'] ?? ''
|
|
50
|
+
if (mutoolPath && contentType === 'application/pdf') {
|
|
51
|
+
const file = getTemporalFile(url, 'pdf')
|
|
52
|
+
await writeFile(file.path, res.body)
|
|
53
|
+
return (await $(`mutool draw -q -F html ${file.path}`)).stdout
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
return contentType.startsWith('text/html') ||
|
|
57
|
+
!isMediaUrl(url)
|
|
58
|
+
? await toEncode(res.body, res.headers['content-type'])
|
|
59
|
+
: res.body
|
|
60
|
+
})()
|
|
61
|
+
|
|
40
62
|
return {
|
|
41
63
|
headers: res.headers,
|
|
42
|
-
html
|
|
43
|
-
res.headers['content-type'].startsWith('text/html') ||
|
|
44
|
-
!isMediaUrl(url)
|
|
45
|
-
? await toEncode(res.body, res.headers['content-type'])
|
|
46
|
-
: res.body,
|
|
64
|
+
html,
|
|
47
65
|
mode: 'fetch',
|
|
48
66
|
url: res.url,
|
|
49
67
|
statusCode: res.statusCode
|
|
@@ -149,22 +167,31 @@ const isFetchMode = url => {
|
|
|
149
167
|
)
|
|
150
168
|
}
|
|
151
169
|
|
|
152
|
-
const
|
|
153
|
-
if (prerender === false || isMediaUrl(url)) return 'fetch'
|
|
170
|
+
const defaultGetMode = (url, { prerender }) => {
|
|
171
|
+
if (prerender === false || isMediaUrl(url) || isPdfUrl(url)) return 'fetch'
|
|
154
172
|
if (prerender === true) return 'prerender'
|
|
155
173
|
return isFetchMode(url) ? 'fetch' : 'prerender'
|
|
156
174
|
}
|
|
157
175
|
|
|
176
|
+
const defaultGetTemporalFile = (url, ext) => {
|
|
177
|
+
const hash = crypto.createHash('sha256').update(url).digest('hex')
|
|
178
|
+
const filepath = path.join(os.tmpdir(), ext === undefined ? hash : `${hash}.${ext}`)
|
|
179
|
+
return { path: filepath }
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
const defaultMutoolPath = () => (() => { try { return execSync('which mutool').toString().trim() } catch (_) {} })()
|
|
183
|
+
|
|
158
184
|
const getContent = PCancelable.fn(
|
|
159
185
|
(
|
|
160
186
|
url,
|
|
161
187
|
mode,
|
|
162
|
-
{ getBrowserless, gotOpts, headers, puppeteerOpts, rewriteUrls, toEncode },
|
|
188
|
+
{ getBrowserless, gotOpts, headers, puppeteerOpts, rewriteUrls, toEncode, mutoolPath, getTemporalFile },
|
|
163
189
|
onCancel
|
|
164
190
|
) => {
|
|
165
191
|
const isFetchMode = mode === 'fetch'
|
|
192
|
+
|
|
166
193
|
const fetchOpts = isFetchMode
|
|
167
|
-
? { headers, toEncode, ...gotOpts }
|
|
194
|
+
? { headers, toEncode, mutoolPath, getTemporalFile, ...gotOpts }
|
|
168
195
|
: { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
|
|
169
196
|
|
|
170
197
|
const promise = modes[mode](url, fetchOpts)
|
|
@@ -188,9 +215,11 @@ module.exports = PCancelable.fn(
|
|
|
188
215
|
{
|
|
189
216
|
encoding = 'utf-8',
|
|
190
217
|
getBrowserless,
|
|
191
|
-
getMode =
|
|
218
|
+
getMode = defaultGetMode,
|
|
219
|
+
getTemporalFile = defaultGetTemporalFile,
|
|
192
220
|
gotOpts,
|
|
193
221
|
headers,
|
|
222
|
+
mutoolPath = defaultMutoolPath(),
|
|
194
223
|
prerender = 'auto',
|
|
195
224
|
puppeteerOpts,
|
|
196
225
|
rewriteUrls = false
|
|
@@ -210,8 +239,10 @@ module.exports = PCancelable.fn(
|
|
|
210
239
|
|
|
211
240
|
const promise = getContent(targetUrl, reqMode, {
|
|
212
241
|
getBrowserless,
|
|
242
|
+
getTemporalFile,
|
|
213
243
|
gotOpts,
|
|
214
244
|
headers,
|
|
245
|
+
mutoolPath,
|
|
215
246
|
puppeteerOpts,
|
|
216
247
|
rewriteUrls,
|
|
217
248
|
toEncode
|