html-get 2.18.5 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -158,6 +158,12 @@ Default: `false`
158
158
 
159
159
  When `true`, it rewrites some common mistakes related to HTML meta tags.
160
160
 
161
+ ##### serializeHtml
162
+
163
+ It determines how the HTML should be serialized before returning.
164
+
165
+ By default, it's serialized using `$ => ({ html: $.html() })`.
166
+
161
167
  ## License
162
168
 
163
169
  **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.18.5",
5
+ "version": "2.20.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -45,7 +45,7 @@
45
45
  "html-encode": "~2.1.7",
46
46
  "html-urls": "~2.4.62",
47
47
  "is-html-content": "~1.0.0",
48
- "localhost-url-regex": "~1.0.13",
48
+ "is-local-address": "~2.2.0",
49
49
  "lodash": "~4.17.21",
50
50
  "mri": "~1.2.0",
51
51
  "p-cancelable": "~2.1.0",
@@ -71,7 +71,6 @@
71
71
  "regex-iso-date": "latest",
72
72
  "simple-git-hooks": "latest",
73
73
  "standard": "latest",
74
- "standard-markdown": "latest",
75
74
  "standard-version": "latest"
76
75
  },
77
76
  "engines": {
@@ -85,7 +84,7 @@
85
84
  "scripts": {
86
85
  "clean": "rm -rf node_modules",
87
86
  "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
88
- "lint": "standard-markdown README.md && standard",
87
+ "lint": "standard",
89
88
  "postinstall": "node scripts/postinstall",
90
89
  "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
91
90
  "pretest": "npm run lint",
@@ -118,13 +117,13 @@
118
117
  "prettier-standard",
119
118
  "standard --fix"
120
119
  ],
121
- "*.md": [
122
- "standard-markdown"
123
- ],
124
120
  "package.json": [
125
121
  "finepack"
126
122
  ]
127
123
  },
124
+ "pnpm": {
125
+ "neverBuiltDependencies": []
126
+ },
128
127
  "simple-git-hooks": {
129
128
  "commit-msg": "npx commitlint --edit",
130
129
  "pre-commit": "npx nano-staged"
@@ -1 +1 @@
1
- [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","theguardian"]],[["domain","x.com"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","theverge"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/html.js CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  const { get, split, nth, castArray, forEach } = require('lodash')
4
4
  const debug = require('debug-logfmt')('html-get:rewrite')
5
- const localhostUrl = require('localhost-url-regex')
5
+ const isLocalAddress = require('is-local-address')
6
6
  const { TAGS: URL_TAGS } = require('html-urls')
7
7
  const isHTML = require('is-html-content')
8
8
  const cssUrl = require('css-url-regex')
@@ -118,37 +118,58 @@ const rewriteHtmlUrls = ({ $, url }) => {
118
118
  $(tagName.join(',')).each(function () {
119
119
  const el = $(this)
120
120
  const attr = el.attr(urlAttr)
121
-
122
- if (localhostUrl().test(attr)) {
123
- el.remove()
124
- } else if (typeof attr === 'string' && !attr.startsWith('http')) {
125
- try {
126
- const newAttr = new URL(attr, url).toString()
127
- el.attr(urlAttr, newAttr)
128
- } catch (_) {}
129
- }
121
+ if (typeof attr !== 'string') return
122
+ try {
123
+ const urlObj = new URL(attr, url)
124
+ if (!urlObj.protocol.startsWith('http')) return
125
+ if (isLocalAddress(urlObj.hostname)) {
126
+ el.remove()
127
+ } else {
128
+ el.attr(urlAttr, urlObj.toString())
129
+ }
130
+ } catch (_) {}
130
131
  })
131
132
  })
132
133
  }
133
134
 
134
- const rewriteCssUrls = ({ html, url }) => {
135
- const cssUrls = Array.from(
136
- execall(cssUrl(), html).reduce((acc, match) => {
135
+ const replaceCssUrls = (url, stylesheet) => {
136
+ const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
137
+ (acc, match) => {
137
138
  match.subMatches.forEach(match => acc.add(match))
138
139
  return acc
139
- }, new Set())
140
+ },
141
+ new Set()
140
142
  )
141
143
 
142
144
  cssUrls.forEach(cssUrl => {
143
145
  if (cssUrl.startsWith('/')) {
144
146
  try {
145
147
  const absoluteUrl = new URL(cssUrl, url).toString()
146
- html = html.replaceAll(`url(${cssUrl})`, `url(${absoluteUrl})`)
148
+ stylesheet = stylesheet.replaceAll(
149
+ `url(${cssUrl})`,
150
+ `url(${absoluteUrl})`
151
+ )
147
152
  } catch (_) {}
148
153
  }
149
154
  })
150
155
 
151
- return html
156
+ return stylesheet
157
+ }
158
+
159
+ const rewriteCssUrls = ({ $, url }) => {
160
+ // Process <style> tags
161
+ // e.g., <style>body { background-image: url('/image.jpg'); }</style>
162
+ $('style').each((_, element) =>
163
+ $(element).html(replaceCssUrls(url, $(element).html()))
164
+ )
165
+
166
+ // Process elements with style attributes
167
+ // e.g., <div style="background-image: url('/image.jpg');"></div>
168
+ $('[style]').each((_, element) =>
169
+ $(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
170
+ )
171
+
172
+ return $
152
173
  }
153
174
 
154
175
  const injectStyle = ({ $, styles }) =>
@@ -215,7 +236,7 @@ module.exports = ({
215
236
  if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
216
237
  if (modules) injectScripts({ $, modules, type: 'module' })
217
238
 
218
- return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
239
+ return rewriteUrls ? rewriteCssUrls({ $, url }) : $
219
240
  }
220
241
 
221
242
  module.exports.getDate = getDate
package/src/index.js CHANGED
@@ -236,14 +236,14 @@ const getContent = PCancelable.fn(
236
236
  onCancel(() => promise.cancel())
237
237
 
238
238
  return promise.then(content => {
239
- const html = addHtml({
239
+ const $ = addHtml({
240
240
  ...content,
241
241
  ...(isFetchMode ? puppeteerOpts : undefined),
242
242
  rewriteUrls,
243
243
  rewriteHtml
244
244
  })
245
245
 
246
- return { ...content, html }
246
+ return { ...content, $ }
247
247
  })
248
248
  }
249
249
  )
@@ -261,8 +261,9 @@ module.exports = PCancelable.fn(
261
261
  mutoolPath = defaultMutoolPath(),
262
262
  prerender = 'auto',
263
263
  puppeteerOpts,
264
+ rewriteHtml = false,
264
265
  rewriteUrls = false,
265
- rewriteHtml = false
266
+ serializeHtml = $ => ({ html: $.html() })
266
267
  } = {},
267
268
  onCancel
268
269
  ) => {
@@ -291,9 +292,12 @@ module.exports = PCancelable.fn(
291
292
 
292
293
  onCancel(() => promise.cancel())
293
294
 
294
- const { mode, ...payload } = await promise
295
+ const { mode, $, ...payload } = await promise
295
296
 
296
- return Object.assign(payload, { stats: { mode, timing: duration() } })
297
+ return Object.assign(payload, {
298
+ ...serializeHtml($),
299
+ stats: { mode, timing: duration() }
300
+ })
297
301
  }
298
302
  )
299
303