html-get 2.16.10 → 2.17.0-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +0 -0
- package/README.md +7 -0
- package/package.json +14 -14
- package/scripts/postinstall +0 -0
- package/src/auto-domains.json +1 -1
- package/src/html.js +22 -0
- package/src/index.js +8 -4
package/LICENSE
CHANGED
|
File without changes
|
package/README.md
CHANGED
|
@@ -151,6 +151,13 @@ Default: `false`
|
|
|
151
151
|
|
|
152
152
|
When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten into absolute URLs.
|
|
153
153
|
|
|
154
|
+
##### rewriteHtml
|
|
155
|
+
|
|
156
|
+
Type: `boolean`<br>
|
|
157
|
+
Default: `false`
|
|
158
|
+
|
|
159
|
+
When `true`, it rewrites some common mistakes related to HTML meta tags.
|
|
160
|
+
|
|
154
161
|
## License
|
|
155
162
|
|
|
156
163
|
**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, fine-tuned for correction & speed",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.16.10",
|
|
5
|
+
"version": "2.17.0-0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -81,18 +81,6 @@
|
|
|
81
81
|
"scripts",
|
|
82
82
|
"src"
|
|
83
83
|
],
|
|
84
|
-
"scripts": {
|
|
85
|
-
"clean": "rm -rf node_modules",
|
|
86
|
-
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
87
|
-
"lint": "standard-markdown README.md && standard",
|
|
88
|
-
"postinstall": "node scripts/postinstall",
|
|
89
|
-
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
90
|
-
"pretest": "npm run lint",
|
|
91
|
-
"release": "standard-version -a",
|
|
92
|
-
"release:github": "github-generate-release",
|
|
93
|
-
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
94
|
-
"test": "c8 ava"
|
|
95
|
-
},
|
|
96
84
|
"license": "MIT",
|
|
97
85
|
"ava": {
|
|
98
86
|
"files": [
|
|
@@ -127,5 +115,17 @@
|
|
|
127
115
|
"simple-git-hooks": {
|
|
128
116
|
"commit-msg": "npx commitlint --edit",
|
|
129
117
|
"pre-commit": "npx nano-staged"
|
|
118
|
+
},
|
|
119
|
+
"scripts": {
|
|
120
|
+
"clean": "rm -rf node_modules",
|
|
121
|
+
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
122
|
+
"lint": "standard-markdown README.md && standard",
|
|
123
|
+
"postinstall": "node scripts/postinstall",
|
|
124
|
+
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
125
|
+
"pretest": "npm run lint",
|
|
126
|
+
"release": "standard-version -a",
|
|
127
|
+
"release:github": "github-generate-release",
|
|
128
|
+
"release:tags": "git push --follow-tags origin HEAD:master",
|
|
129
|
+
"test": "c8 ava"
|
|
130
130
|
}
|
|
131
|
-
}
|
|
131
|
+
}
|
package/scripts/postinstall
CHANGED
|
File without changes
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domain","x.com"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/html.js
CHANGED
|
@@ -89,6 +89,22 @@ const addBody = ({ url, headers, html }) => {
|
|
|
89
89
|
return `<!DOCTYPE html><html><head></head><body>${element}</body></html>`
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
+
const rewriteOpenGraph = ({ $ }) =>
|
|
93
|
+
$('meta[name^="og:"]').each((_, element) => {
|
|
94
|
+
const el = $(element)
|
|
95
|
+
const name = el.attr('name')
|
|
96
|
+
el.removeAttr('name')
|
|
97
|
+
el.attr('property', name)
|
|
98
|
+
})
|
|
99
|
+
|
|
100
|
+
const rewriteMetaProperty = ({ $ }) =>
|
|
101
|
+
$('meta[property]:not([property^="og"])').each((_, element) => {
|
|
102
|
+
const el = $(element)
|
|
103
|
+
const property = el.attr('property')
|
|
104
|
+
el.removeAttr('property')
|
|
105
|
+
el.attr('name', property)
|
|
106
|
+
})
|
|
107
|
+
|
|
92
108
|
const rewriteHtmlUrls = ({ $, url }) => {
|
|
93
109
|
forEach(URL_TAGS, (tagName, urlAttr) => {
|
|
94
110
|
$(tagName.join(',')).each(function () {
|
|
@@ -156,6 +172,7 @@ module.exports = ({
|
|
|
156
172
|
hide,
|
|
157
173
|
remove,
|
|
158
174
|
rewriteUrls,
|
|
175
|
+
rewriteHtml,
|
|
159
176
|
scripts,
|
|
160
177
|
modules
|
|
161
178
|
}) => {
|
|
@@ -167,6 +184,11 @@ module.exports = ({
|
|
|
167
184
|
|
|
168
185
|
if (rewriteUrls) rewriteHtmlUrls({ $, url })
|
|
169
186
|
|
|
187
|
+
if (rewriteHtml) {
|
|
188
|
+
rewriteOpenGraph({ $ })
|
|
189
|
+
rewriteMetaProperty({ $ })
|
|
190
|
+
}
|
|
191
|
+
|
|
170
192
|
addHead({ $, url, headers })
|
|
171
193
|
|
|
172
194
|
if (styles) injectStyle({ $, styles })
|
package/src/index.js
CHANGED
|
@@ -57,7 +57,7 @@ const fetch = PCancelable.fn(
|
|
|
57
57
|
if (mutoolPath && contentType === 'application/pdf') {
|
|
58
58
|
const file = getTemporalFile(url, 'pdf')
|
|
59
59
|
await writeFile(file.path, res.body)
|
|
60
|
-
return (await $(
|
|
60
|
+
return (await $(`${mutoolPath} draw -q -F html ${file.path}`)).stdout
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
return contentType.startsWith('text/html') || !isMediaUrl(url)
|
|
@@ -191,7 +191,7 @@ const defaultGetTemporalFile = (url, ext) => {
|
|
|
191
191
|
const defaultMutoolPath = () =>
|
|
192
192
|
(() => {
|
|
193
193
|
try {
|
|
194
|
-
return execSync('which mutool').toString().trim()
|
|
194
|
+
return execSync('which mutool', { stdio: 'pipe' }).toString().trim()
|
|
195
195
|
} catch (_) {}
|
|
196
196
|
})()
|
|
197
197
|
|
|
@@ -207,6 +207,7 @@ const getContent = PCancelable.fn(
|
|
|
207
207
|
mutoolPath,
|
|
208
208
|
puppeteerOpts,
|
|
209
209
|
rewriteUrls,
|
|
210
|
+
rewriteHtml,
|
|
210
211
|
toEncode
|
|
211
212
|
},
|
|
212
213
|
onCancel
|
|
@@ -224,7 +225,8 @@ const getContent = PCancelable.fn(
|
|
|
224
225
|
const html = addHtml({
|
|
225
226
|
...content,
|
|
226
227
|
...(isFetchMode ? puppeteerOpts : undefined),
|
|
227
|
-
rewriteUrls
|
|
228
|
+
rewriteUrls,
|
|
229
|
+
rewriteHtml
|
|
228
230
|
})
|
|
229
231
|
|
|
230
232
|
return { ...content, html }
|
|
@@ -245,7 +247,8 @@ module.exports = PCancelable.fn(
|
|
|
245
247
|
mutoolPath = defaultMutoolPath(),
|
|
246
248
|
prerender = 'auto',
|
|
247
249
|
puppeteerOpts,
|
|
248
|
-
rewriteUrls = false
|
|
250
|
+
rewriteUrls = false,
|
|
251
|
+
rewriteHtml = false
|
|
249
252
|
} = {},
|
|
250
253
|
onCancel
|
|
251
254
|
) => {
|
|
@@ -268,6 +271,7 @@ module.exports = PCancelable.fn(
|
|
|
268
271
|
mutoolPath,
|
|
269
272
|
puppeteerOpts,
|
|
270
273
|
rewriteUrls,
|
|
274
|
+
rewriteHtml,
|
|
271
275
|
toEncode
|
|
272
276
|
})
|
|
273
277
|
|