html-get 2.19.0 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/package.json +2 -6
- package/src/auto-domains.json +1 -1
- package/src/html.js +27 -7
- package/src/index.js +9 -5
package/README.md
CHANGED
|
@@ -158,6 +158,12 @@ Default: `false`
|
|
|
158
158
|
|
|
159
159
|
When it is `true`, it will rewrite some common mistakes related to HTML meta tags.
|
|
160
160
|
|
|
161
|
+
##### serializeHtml
|
|
162
|
+
|
|
163
|
+
It determines how HTML should be serialized before returning.
|
|
164
|
+
|
|
165
|
+
By default, it is serialized with `$ => ({ html: $.html() })`.
|
|
166
|
+
|
|
161
167
|
## License
|
|
162
168
|
|
|
163
169
|
**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, fine-tuned for correction & speed",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.
|
|
5
|
+
"version": "2.20.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -71,7 +71,6 @@
|
|
|
71
71
|
"regex-iso-date": "latest",
|
|
72
72
|
"simple-git-hooks": "latest",
|
|
73
73
|
"standard": "latest",
|
|
74
|
-
"standard-markdown": "latest",
|
|
75
74
|
"standard-version": "latest"
|
|
76
75
|
},
|
|
77
76
|
"engines": {
|
|
@@ -85,7 +84,7 @@
|
|
|
85
84
|
"scripts": {
|
|
86
85
|
"clean": "rm -rf node_modules",
|
|
87
86
|
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
88
|
-
"lint": "standard
|
|
87
|
+
"lint": "standard",
|
|
89
88
|
"postinstall": "node scripts/postinstall",
|
|
90
89
|
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
91
90
|
"pretest": "npm run lint",
|
|
@@ -118,9 +117,6 @@
|
|
|
118
117
|
"prettier-standard",
|
|
119
118
|
"standard --fix"
|
|
120
119
|
],
|
|
121
|
-
"*.md": [
|
|
122
|
-
"standard-markdown"
|
|
123
|
-
],
|
|
124
120
|
"package.json": [
|
|
125
121
|
"finepack"
|
|
126
122
|
]
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/html.js
CHANGED
|
@@ -132,24 +132,44 @@ const rewriteHtmlUrls = ({ $, url }) => {
|
|
|
132
132
|
})
|
|
133
133
|
}
|
|
134
134
|
|
|
135
|
-
const
|
|
136
|
-
const cssUrls = Array.from(
|
|
137
|
-
|
|
135
|
+
const replaceCssUrls = (url, stylesheet) => {
|
|
136
|
+
const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
|
|
137
|
+
(acc, match) => {
|
|
138
138
|
match.subMatches.forEach(match => acc.add(match))
|
|
139
139
|
return acc
|
|
140
|
-
},
|
|
140
|
+
},
|
|
141
|
+
new Set()
|
|
141
142
|
)
|
|
142
143
|
|
|
143
144
|
cssUrls.forEach(cssUrl => {
|
|
144
145
|
if (cssUrl.startsWith('/')) {
|
|
145
146
|
try {
|
|
146
147
|
const absoluteUrl = new URL(cssUrl, url).toString()
|
|
147
|
-
|
|
148
|
+
stylesheet = stylesheet.replaceAll(
|
|
149
|
+
`url(${cssUrl})`,
|
|
150
|
+
`url(${absoluteUrl})`
|
|
151
|
+
)
|
|
148
152
|
} catch (_) {}
|
|
149
153
|
}
|
|
150
154
|
})
|
|
151
155
|
|
|
152
|
-
return
|
|
156
|
+
return stylesheet
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
const rewriteCssUrls = ({ $, url }) => {
|
|
160
|
+
// Process <style> tags
|
|
161
|
+
// e.g., <style>body { background-image: url('/image.jpg'); }</style>
|
|
162
|
+
$('style').each((_, element) =>
|
|
163
|
+
$(element).html(replaceCssUrls(url, $(element).html()))
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
// Process elements with style attributes
|
|
167
|
+
// e.g., <div style="background-image: url('/image.jpg');"></div>
|
|
168
|
+
$('[style]').each((_, element) =>
|
|
169
|
+
$(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
return $
|
|
153
173
|
}
|
|
154
174
|
|
|
155
175
|
const injectStyle = ({ $, styles }) =>
|
|
@@ -216,7 +236,7 @@ module.exports = ({
|
|
|
216
236
|
if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
|
|
217
237
|
if (modules) injectScripts({ $, modules, type: 'module' })
|
|
218
238
|
|
|
219
|
-
return rewriteUrls ? rewriteCssUrls({
|
|
239
|
+
return rewriteUrls ? rewriteCssUrls({ $, url }) : $
|
|
220
240
|
}
|
|
221
241
|
|
|
222
242
|
module.exports.getDate = getDate
|
package/src/index.js
CHANGED
|
@@ -236,14 +236,14 @@ const getContent = PCancelable.fn(
|
|
|
236
236
|
onCancel(() => promise.cancel())
|
|
237
237
|
|
|
238
238
|
return promise.then(content => {
|
|
239
|
-
const
|
|
239
|
+
const $ = addHtml({
|
|
240
240
|
...content,
|
|
241
241
|
...(isFetchMode ? puppeteerOpts : undefined),
|
|
242
242
|
rewriteUrls,
|
|
243
243
|
rewriteHtml
|
|
244
244
|
})
|
|
245
245
|
|
|
246
|
-
return { ...content,
|
|
246
|
+
return { ...content, $ }
|
|
247
247
|
})
|
|
248
248
|
}
|
|
249
249
|
)
|
|
@@ -261,8 +261,9 @@ module.exports = PCancelable.fn(
|
|
|
261
261
|
mutoolPath = defaultMutoolPath(),
|
|
262
262
|
prerender = 'auto',
|
|
263
263
|
puppeteerOpts,
|
|
264
|
+
rewriteHtml = false,
|
|
264
265
|
rewriteUrls = false,
|
|
265
|
-
|
|
266
|
+
serializeHtml = $ => ({ html: $.html() })
|
|
266
267
|
} = {},
|
|
267
268
|
onCancel
|
|
268
269
|
) => {
|
|
@@ -291,9 +292,12 @@ module.exports = PCancelable.fn(
|
|
|
291
292
|
|
|
292
293
|
onCancel(() => promise.cancel())
|
|
293
294
|
|
|
294
|
-
const { mode, ...payload } = await promise
|
|
295
|
+
const { mode, $, ...payload } = await promise
|
|
295
296
|
|
|
296
|
-
return Object.assign(payload, {
|
|
297
|
+
return Object.assign(payload, {
|
|
298
|
+
...serializeHtml($),
|
|
299
|
+
stats: { mode, timing: duration() }
|
|
300
|
+
})
|
|
297
301
|
}
|
|
298
302
|
)
|
|
299
303
|
|