html-get 2.18.5 → 2.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/package.json +6 -7
- package/src/auto-domains.json +1 -1
- package/src/html.js +38 -17
- package/src/index.js +9 -5
package/README.md
CHANGED
|
@@ -158,6 +158,12 @@ Default: `false`
|
|
|
158
158
|
|
|
159
159
|
When it is `true`, it will rewrite some common mistakes related to HTML meta tags.
|
|
160
160
|
|
|
161
|
+
##### serializeHtml
|
|
162
|
+
|
|
163
|
+
It determines how the HTML should be serialized before returning.
|
|
164
|
+
|
|
165
|
+
By default, it's serialized using `$ => ({ html: $.html() })`.
|
|
166
|
+
|
|
161
167
|
## License
|
|
162
168
|
|
|
163
169
|
**html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "html-get",
|
|
3
3
|
"description": "Get the HTML from any website, fine-tuned for correction & speed",
|
|
4
4
|
"homepage": "https://nicedoc.com/microlinkhq/html-get",
|
|
5
|
-
"version": "2.
|
|
5
|
+
"version": "2.20.0",
|
|
6
6
|
"main": "src/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"html-get": "bin/index.js"
|
|
@@ -45,7 +45,7 @@
|
|
|
45
45
|
"html-encode": "~2.1.7",
|
|
46
46
|
"html-urls": "~2.4.62",
|
|
47
47
|
"is-html-content": "~1.0.0",
|
|
48
|
-
"
|
|
48
|
+
"is-local-address": "~2.2.0",
|
|
49
49
|
"lodash": "~4.17.21",
|
|
50
50
|
"mri": "~1.2.0",
|
|
51
51
|
"p-cancelable": "~2.1.0",
|
|
@@ -71,7 +71,6 @@
|
|
|
71
71
|
"regex-iso-date": "latest",
|
|
72
72
|
"simple-git-hooks": "latest",
|
|
73
73
|
"standard": "latest",
|
|
74
|
-
"standard-markdown": "latest",
|
|
75
74
|
"standard-version": "latest"
|
|
76
75
|
},
|
|
77
76
|
"engines": {
|
|
@@ -85,7 +84,7 @@
|
|
|
85
84
|
"scripts": {
|
|
86
85
|
"clean": "rm -rf node_modules",
|
|
87
86
|
"contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
|
|
88
|
-
"lint": "standard
|
|
87
|
+
"lint": "standard",
|
|
89
88
|
"postinstall": "node scripts/postinstall",
|
|
90
89
|
"postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
|
|
91
90
|
"pretest": "npm run lint",
|
|
@@ -118,13 +117,13 @@
|
|
|
118
117
|
"prettier-standard",
|
|
119
118
|
"standard --fix"
|
|
120
119
|
],
|
|
121
|
-
"*.md": [
|
|
122
|
-
"standard-markdown"
|
|
123
|
-
],
|
|
124
120
|
"package.json": [
|
|
125
121
|
"finepack"
|
|
126
122
|
]
|
|
127
123
|
},
|
|
124
|
+
"pnpm": {
|
|
125
|
+
"neverBuiltDependencies": []
|
|
126
|
+
},
|
|
128
127
|
"simple-git-hooks": {
|
|
129
128
|
"commit-msg": "npx commitlint --edit",
|
|
130
129
|
"pre-commit": "npx nano-staged"
|
package/src/auto-domains.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","
|
|
1
|
+
[[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
|
package/src/html.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
const { get, split, nth, castArray, forEach } = require('lodash')
|
|
4
4
|
const debug = require('debug-logfmt')('html-get:rewrite')
|
|
5
|
-
const
|
|
5
|
+
const isLocalAddress = require('is-local-address')
|
|
6
6
|
const { TAGS: URL_TAGS } = require('html-urls')
|
|
7
7
|
const isHTML = require('is-html-content')
|
|
8
8
|
const cssUrl = require('css-url-regex')
|
|
@@ -118,37 +118,58 @@ const rewriteHtmlUrls = ({ $, url }) => {
|
|
|
118
118
|
$(tagName.join(',')).each(function () {
|
|
119
119
|
const el = $(this)
|
|
120
120
|
const attr = el.attr(urlAttr)
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
121
|
+
if (typeof attr !== 'string') return
|
|
122
|
+
try {
|
|
123
|
+
const urlObj = new URL(attr, url)
|
|
124
|
+
if (!urlObj.protocol.startsWith('http')) return
|
|
125
|
+
if (isLocalAddress(urlObj.hostname)) {
|
|
126
|
+
el.remove()
|
|
127
|
+
} else {
|
|
128
|
+
el.attr(urlAttr, urlObj.toString())
|
|
129
|
+
}
|
|
130
|
+
} catch (_) {}
|
|
130
131
|
})
|
|
131
132
|
})
|
|
132
133
|
}
|
|
133
134
|
|
|
134
|
-
const
|
|
135
|
-
const cssUrls = Array.from(
|
|
136
|
-
|
|
135
|
+
const replaceCssUrls = (url, stylesheet) => {
|
|
136
|
+
const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
|
|
137
|
+
(acc, match) => {
|
|
137
138
|
match.subMatches.forEach(match => acc.add(match))
|
|
138
139
|
return acc
|
|
139
|
-
},
|
|
140
|
+
},
|
|
141
|
+
new Set()
|
|
140
142
|
)
|
|
141
143
|
|
|
142
144
|
cssUrls.forEach(cssUrl => {
|
|
143
145
|
if (cssUrl.startsWith('/')) {
|
|
144
146
|
try {
|
|
145
147
|
const absoluteUrl = new URL(cssUrl, url).toString()
|
|
146
|
-
|
|
148
|
+
stylesheet = stylesheet.replaceAll(
|
|
149
|
+
`url(${cssUrl})`,
|
|
150
|
+
`url(${absoluteUrl})`
|
|
151
|
+
)
|
|
147
152
|
} catch (_) {}
|
|
148
153
|
}
|
|
149
154
|
})
|
|
150
155
|
|
|
151
|
-
return
|
|
156
|
+
return stylesheet
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
const rewriteCssUrls = ({ $, url }) => {
|
|
160
|
+
// Process <style> tags
|
|
161
|
+
// e.g., <style>body { background-image: url('/image.jpg'); }</style>
|
|
162
|
+
$('style').each((_, element) =>
|
|
163
|
+
$(element).html(replaceCssUrls(url, $(element).html()))
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
// Process elements with style attributes
|
|
167
|
+
// e.g., <div style="background-image: url('/image.jpg');"></div>
|
|
168
|
+
$('[style]').each((_, element) =>
|
|
169
|
+
$(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
return $
|
|
152
173
|
}
|
|
153
174
|
|
|
154
175
|
const injectStyle = ({ $, styles }) =>
|
|
@@ -215,7 +236,7 @@ module.exports = ({
|
|
|
215
236
|
if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
|
|
216
237
|
if (modules) injectScripts({ $, modules, type: 'module' })
|
|
217
238
|
|
|
218
|
-
return rewriteUrls ? rewriteCssUrls({
|
|
239
|
+
return rewriteUrls ? rewriteCssUrls({ $, url }) : $
|
|
219
240
|
}
|
|
220
241
|
|
|
221
242
|
module.exports.getDate = getDate
|
package/src/index.js
CHANGED
|
@@ -236,14 +236,14 @@ const getContent = PCancelable.fn(
|
|
|
236
236
|
onCancel(() => promise.cancel())
|
|
237
237
|
|
|
238
238
|
return promise.then(content => {
|
|
239
|
-
const
|
|
239
|
+
const $ = addHtml({
|
|
240
240
|
...content,
|
|
241
241
|
...(isFetchMode ? puppeteerOpts : undefined),
|
|
242
242
|
rewriteUrls,
|
|
243
243
|
rewriteHtml
|
|
244
244
|
})
|
|
245
245
|
|
|
246
|
-
return { ...content,
|
|
246
|
+
return { ...content, $ }
|
|
247
247
|
})
|
|
248
248
|
}
|
|
249
249
|
)
|
|
@@ -261,8 +261,9 @@ module.exports = PCancelable.fn(
|
|
|
261
261
|
mutoolPath = defaultMutoolPath(),
|
|
262
262
|
prerender = 'auto',
|
|
263
263
|
puppeteerOpts,
|
|
264
|
+
rewriteHtml = false,
|
|
264
265
|
rewriteUrls = false,
|
|
265
|
-
|
|
266
|
+
serializeHtml = $ => ({ html: $.html() })
|
|
266
267
|
} = {},
|
|
267
268
|
onCancel
|
|
268
269
|
) => {
|
|
@@ -291,9 +292,12 @@ module.exports = PCancelable.fn(
|
|
|
291
292
|
|
|
292
293
|
onCancel(() => promise.cancel())
|
|
293
294
|
|
|
294
|
-
const { mode, ...payload } = await promise
|
|
295
|
+
const { mode, $, ...payload } = await promise
|
|
295
296
|
|
|
296
|
-
return Object.assign(payload, {
|
|
297
|
+
return Object.assign(payload, {
|
|
298
|
+
...serializeHtml($),
|
|
299
|
+
stats: { mode, timing: duration() }
|
|
300
|
+
})
|
|
297
301
|
}
|
|
298
302
|
)
|
|
299
303
|
|