html-get 2.15.0-0 → 2.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +0 -0
- package/README.md +38 -46
- package/package.json +27 -22
- package/scripts/postinstall +0 -0
- package/src/auto-domains.json +1 -1
- package/src/index.js +29 -6
package/LICENSE
CHANGED
|
File without changes
|
package/README.md
CHANGED
|
@@ -9,26 +9,16 @@
|
|
|
9
9
|
[](https://coveralls.io/github/microlinkhq/html-get)
|
|
10
10
|
[](https://www.npmjs.org/package/html-get)
|
|
11
11
|
|
|
12
|
-
> Get the HTML from any website,
|
|
12
|
+
> Get the HTML from any website, fine-tuned for correction & speed.
|
|
13
13
|
|
|
14
14
|
## Features
|
|
15
15
|
|
|
16
|
-
- Get HTML markup
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
-
|
|
16
|
+
- Get HTML markup for any URL, including images, video, audio, or pdf.
|
|
17
|
+
- Block ad trackers and any other unnecessary network subrequests.
|
|
18
|
+
- Handle unreachable or timeout URLs gracefully.
|
|
19
|
+
- Ensure HTML markup is appropriately encoded.
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
Headless technology like [puppeteer](https://github.com/GoogleChrome/puppeteer) brings us to get the HTML markup from any website, even when the target URL is client side app and we need to wait until dom events fire for getting the real markup.
|
|
24
|
-
|
|
25
|
-
Generally this approach better than a simple GET request from the target URL, but because you need to wait for dom events, prerendering could be slow and in some scenario unnecessary (sites that use server side rendering could be resolved with a simple GET).
|
|
26
|
-
|
|
27
|
-
**html-get** bring the best of both worlds, doing the following algorithm:
|
|
28
|
-
|
|
29
|
-
- Determinate if the target URL actually needs prerendering (internally it has a [list of popular site domains](https://github.com/microlinkhq/html-get/blob/master/src/auto-domains.js) that don't need it).
|
|
30
|
-
- If it needs prerendering, perform the action using Headless technology, blocking ads trackers requests for speed up the process, trying to resolve the main request in the minimum amount of time.
|
|
31
|
-
- If it does not need prerendering or prerendering fails for any reason (for example, timeout), the request will be resolved doing a GET request.
|
|
21
|
+
**html-get** takes advantage of [puppeteer](https://github.com/GoogleChrome/puppeteer) headless technology when needed, such as for client-side apps that need to be prerendered.
|
|
32
22
|
|
|
33
23
|
## Install
|
|
34
24
|
|
|
@@ -89,68 +79,70 @@ Type: `string`
|
|
|
89
79
|
|
|
90
80
|
The target URL for getting the HTML markup.
|
|
91
81
|
|
|
82
|
+
#### options
|
|
83
|
+
|
|
84
|
+
##### encoding
|
|
85
|
+
|
|
86
|
+
Type: `string`
|
|
87
|
+
Default: `'utf-8'`
|
|
88
|
+
|
|
89
|
+
It ensures the HTML markup is encoded using the encoding value provided.
|
|
90
|
+
|
|
91
|
+
The value will be passed to [`html-encode`](https://github.com/kikobeats/html-encode).
|
|
92
|
+
|
|
92
93
|
##### getBrowserless
|
|
93
94
|
|
|
94
95
|
*Required*<br>
|
|
95
|
-
Type: `function
|
|
96
|
+
Type: `function`
|
|
96
97
|
|
|
97
98
|
A function that should return a [browserless](https://browserless.js.org/) instance to be used for interacting with puppeteer:
|
|
98
99
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
##### prerender
|
|
100
|
+
##### getMode
|
|
102
101
|
|
|
103
|
-
Type: `
|
|
104
|
-
Default: `'auto'`
|
|
102
|
+
Type: `function`
|
|
105
103
|
|
|
106
|
-
|
|
104
|
+
It determines the strategy to use based on the `url`; the possible values are `'fetch'` or `'prerender'`.
|
|
107
105
|
|
|
108
|
-
|
|
106
|
+
##### getTemporalFile
|
|
109
107
|
|
|
110
|
-
|
|
108
|
+
Type: `function`
|
|
111
109
|
|
|
112
|
-
|
|
110
|
+
It creates a temporal file.
|
|
113
111
|
|
|
114
|
-
|
|
115
|
-
Default: `'utf-8'`
|
|
112
|
+
##### gotOpts
|
|
116
113
|
|
|
117
|
-
|
|
114
|
+
Type: `object`
|
|
118
115
|
|
|
119
|
-
It
|
|
116
|
+
It passes the configuration object to [got](https://www.npmjs.com/package/got) under the `'fetch'` strategy.
|
|
120
117
|
|
|
121
118
|
##### headers
|
|
122
119
|
|
|
123
|
-
Type: `object
|
|
120
|
+
Type: `object`
|
|
124
121
|
|
|
125
122
|
Request headers that will be passed to fetch/prerender process.
|
|
126
123
|
|
|
127
|
-
#####
|
|
124
|
+
##### mutoolPath
|
|
128
125
|
|
|
129
|
-
Type: `function
|
|
126
|
+
Type: `function`
|
|
130
127
|
|
|
131
|
-
|
|
128
|
+
It returns the path for [mutool](https://mupdf.com/) binary, used for turning PDF files into HTML markup.
|
|
132
129
|
|
|
133
|
-
|
|
130
|
+
##### prerender
|
|
134
131
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
if (prerender === false) return 'fetch'
|
|
138
|
-
if (prerender !== 'auto') return 'prerender'
|
|
139
|
-
return autoDomains.includes(getDomain(url)) ? 'fetch' : 'prerender'
|
|
140
|
-
}
|
|
141
|
-
```
|
|
132
|
+
Type: `boolean`|`string`<br>
|
|
133
|
+
Default: `'auto'`
|
|
142
134
|
|
|
143
|
-
|
|
135
|
+
Enable or disable prerendering as mechanism for getting the HTML markup explicitly.
|
|
144
136
|
|
|
145
|
-
|
|
137
|
+
The value `auto` means that html-get internally uses a list of websites that don't need prerendering by default. This list is used to speed up the process by using `fetch` mode for these websites.
|
|
146
138
|
|
|
147
|
-
|
|
139
|
+
See [getMode parameter](#getMode) to learn more.
|
|
148
140
|
|
|
149
141
|
##### puppeteerOpts
|
|
150
142
|
|
|
151
143
|
Type: `object`
|
|
152
144
|
|
|
153
|
-
|
|
145
|
+
It passes the configuration object to [puppeteer](https://www.npmjs.com/package/puppeteer) under the `'prerender'` strategy.
|
|
154
146
|
|
|
155
147
|
##### rewriteUrls
|
|
156
148
|
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "html-get",
|
|
3
|
-
"description": "Get the HTML from any website,
|
|
3
|
+
"description": "Get the HTML from any website, fine-tuned for correction & speed",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.15.0
|
|
5
|
+
"version": "2.15.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -20,20 +20,25 @@
|
|
|
20
20
|
"url": "https://github.com/microlinkhq/html-get/issues"
|
|
21
21
|
},
|
|
22
22
|
"keywords": [
|
|
23
|
+
"audio",
|
|
23
24
|
"fetch",
|
|
24
25
|
"get",
|
|
25
26
|
"got",
|
|
26
27
|
"headless",
|
|
27
28
|
"html",
|
|
29
|
+
"image",
|
|
30
|
+
"markup",
|
|
31
|
+
"pdf",
|
|
28
32
|
"prerender",
|
|
29
|
-
"request"
|
|
33
|
+
"request",
|
|
34
|
+
"video"
|
|
30
35
|
],
|
|
31
36
|
"dependencies": {
|
|
32
37
|
"@kikobeats/time-span": "~1.0.3",
|
|
33
|
-
"@metascraper/helpers": "~5.43.
|
|
38
|
+
"@metascraper/helpers": "~5.43.4",
|
|
34
39
|
"cheerio": "~1.0.0-rc.12",
|
|
35
40
|
"css-url-regex": "~4.0.0",
|
|
36
|
-
"debug-logfmt": "~1.2.
|
|
41
|
+
"debug-logfmt": "~1.2.2",
|
|
37
42
|
"execall": "~2.0.0",
|
|
38
43
|
"got": "~11.8.6",
|
|
39
44
|
"html-encode": "~2.1.6",
|
|
@@ -45,7 +50,7 @@
|
|
|
45
50
|
"p-retry": "~4.6.0",
|
|
46
51
|
"replace-string": "~3.1.0",
|
|
47
52
|
"tinyspawn": "~1.2.6",
|
|
48
|
-
"top-sites": "~1.1.
|
|
53
|
+
"top-sites": "~1.1.205"
|
|
49
54
|
},
|
|
50
55
|
"devDependencies": {
|
|
51
56
|
"@commitlint/cli": "latest",
|
|
@@ -77,6 +82,21 @@
|
|
|
77
82
|
"scripts",
|
|
78
83
|
"src"
|
|
79
84
|
],
|
|
85
|
+
"scripts": {
|
|
86
|
+
"clean": "rm -rf node_modules",
|
|
87
|
+
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
88
|
+
"lint": "standard-markdown README.md && standard",
|
|
89
|
+
"postinstall": "node scripts/postinstall",
|
|
90
|
+
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
91
|
+
"prerelease": "npm run update:check",
|
|
92
|
+
"pretest": "npm run lint",
|
|
93
|
+
"release": "standard-version -a",
|
|
94
|
+
"release:github": "github-generate-release",
|
|
95
|
+
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
96
|
+
"test": "c8 ava",
|
|
97
|
+
"update": "ncu -u",
|
|
98
|
+
"update:check": "ncu -- --error-level 2"
|
|
99
|
+
},
|
|
80
100
|
"license": "MIT",
|
|
81
101
|
"ava": {
|
|
82
102
|
"files": [
|
|
@@ -106,20 +126,5 @@
|
|
|
106
126
|
"simple-git-hooks": {
|
|
107
127
|
"commit-msg": "npx commitlint --edit",
|
|
108
128
|
"pre-commit": "npx nano-staged"
|
|
109
|
-
},
|
|
110
|
-
"scripts": {
|
|
111
|
-
"clean": "rm -rf node_modules",
|
|
112
|
-
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
113
|
-
"lint": "standard-markdown README.md && standard",
|
|
114
|
-
"postinstall": "node scripts/postinstall",
|
|
115
|
-
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
116
|
-
"prerelease": "npm run update:check && npm run contributors",
|
|
117
|
-
"pretest": "npm run lint",
|
|
118
|
-
"release": "standard-version -a",
|
|
119
|
-
"release:github": "github-generate-release",
|
|
120
|
-
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
121
|
-
"test": "c8 ava",
|
|
122
|
-
"update": "ncu -u",
|
|
123
|
-
"update:check": "ncu -- --error-level 2"
|
|
124
129
|
}
|
|
125
|
-
}
|
|
130
|
+
}
|
package/scripts/postinstall
CHANGED
|
File without changes
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","stackoverflow"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/index.js
CHANGED
|
@@ -24,7 +24,14 @@ const ABORT_TYPES = ['image', 'stylesheet', 'font']
|
|
|
24
24
|
const fetch = PCancelable.fn(
|
|
25
25
|
async (
|
|
26
26
|
url,
|
|
27
|
-
{
|
|
27
|
+
{
|
|
28
|
+
reflect = false,
|
|
29
|
+
toEncode,
|
|
30
|
+
timeout = REQ_TIMEOUT,
|
|
31
|
+
mutoolPath,
|
|
32
|
+
getTemporalFile,
|
|
33
|
+
...opts
|
|
34
|
+
},
|
|
28
35
|
onCancel
|
|
29
36
|
) => {
|
|
30
37
|
const reqTimeout = reflect ? timeout / 2 : timeout
|
|
@@ -53,8 +60,7 @@ const fetch = PCancelable.fn(
|
|
|
53
60
|
return (await $(`mutool draw -q -F html ${file.path}`)).stdout
|
|
54
61
|
}
|
|
55
62
|
|
|
56
|
-
return contentType.startsWith('text/html') ||
|
|
57
|
-
!isMediaUrl(url)
|
|
63
|
+
return contentType.startsWith('text/html') || !isMediaUrl(url)
|
|
58
64
|
? await toEncode(res.body, res.headers['content-type'])
|
|
59
65
|
: res.body
|
|
60
66
|
})()
|
|
@@ -175,17 +181,34 @@ const defaultGetMode = (url, { prerender }) => {
|
|
|
175
181
|
|
|
176
182
|
const defaultGetTemporalFile = (url, ext) => {
|
|
177
183
|
const hash = crypto.createHash('sha256').update(url).digest('hex')
|
|
178
|
-
const filepath = path.join(
|
|
184
|
+
const filepath = path.join(
|
|
185
|
+
os.tmpdir(),
|
|
186
|
+
ext === undefined ? hash : `${hash}.${ext}`
|
|
187
|
+
)
|
|
179
188
|
return { path: filepath }
|
|
180
189
|
}
|
|
181
190
|
|
|
182
|
-
const defaultMutoolPath = () =>
|
|
191
|
+
const defaultMutoolPath = () =>
|
|
192
|
+
(() => {
|
|
193
|
+
try {
|
|
194
|
+
return execSync('which mutool').toString().trim()
|
|
195
|
+
} catch (_) {}
|
|
196
|
+
})()
|
|
183
197
|
|
|
184
198
|
const getContent = PCancelable.fn(
|
|
185
199
|
(
|
|
186
200
|
url,
|
|
187
201
|
mode,
|
|
188
|
-
{
|
|
202
|
+
{
|
|
203
|
+
getBrowserless,
|
|
204
|
+
getTemporalFile,
|
|
205
|
+
gotOpts,
|
|
206
|
+
headers,
|
|
207
|
+
mutoolPath,
|
|
208
|
+
puppeteerOpts,
|
|
209
|
+
rewriteUrls,
|
|
210
|
+
toEncode
|
|
211
|
+
},
|
|
189
212
|
onCancel
|
|
190
213
|
) => {
|
|
191
214
|
const isFetchMode = mode === 'fetch'
|