html-get 2.19.0 → 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -158,6 +158,12 @@ Default: `false`
158
158
 
159
159
  When it is `true`, it will rewrite some common mistakes related to HTML meta tags.
160
160
 
161
+ ##### serializeHtml
162
+
163
+ It determines how the HTML should be serialized before being returned.
164
+
165
+ By default, the serializer is `$ => ({ html: $.html() })`.
166
+
161
167
  ## License
162
168
 
163
169
  **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.19.0",
5
+ "version": "2.20.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -71,7 +71,6 @@
71
71
  "regex-iso-date": "latest",
72
72
  "simple-git-hooks": "latest",
73
73
  "standard": "latest",
74
- "standard-markdown": "latest",
75
74
  "standard-version": "latest"
76
75
  },
77
76
  "engines": {
@@ -85,7 +84,7 @@
85
84
  "scripts": {
86
85
  "clean": "rm -rf node_modules",
87
86
  "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
88
- "lint": "standard-markdown README.md && standard",
87
+ "lint": "standard",
89
88
  "postinstall": "node scripts/postinstall",
90
89
  "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
91
90
  "pretest": "npm run lint",
@@ -118,9 +117,6 @@
118
117
  "prettier-standard",
119
118
  "standard --fix"
120
119
  ],
121
- "*.md": [
122
- "standard-markdown"
123
- ],
124
120
  "package.json": [
125
121
  "finepack"
126
122
  ]
@@ -1 +1 @@
1
- [[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","theguardian"]],[["domain","x.com"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/html.js CHANGED
@@ -132,24 +132,44 @@ const rewriteHtmlUrls = ({ $, url }) => {
132
132
  })
133
133
  }
134
134
 
135
- const rewriteCssUrls = ({ html, url }) => {
136
- const cssUrls = Array.from(
137
- execall(cssUrl(), html).reduce((acc, match) => {
135
+ const replaceCssUrls = (url, stylesheet) => {
136
+ const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce(
137
+ (acc, match) => {
138
138
  match.subMatches.forEach(match => acc.add(match))
139
139
  return acc
140
- }, new Set())
140
+ },
141
+ new Set()
141
142
  )
142
143
 
143
144
  cssUrls.forEach(cssUrl => {
144
145
  if (cssUrl.startsWith('/')) {
145
146
  try {
146
147
  const absoluteUrl = new URL(cssUrl, url).toString()
147
- html = html.replaceAll(`url(${cssUrl})`, `url(${absoluteUrl})`)
148
+ stylesheet = stylesheet.replaceAll(
149
+ `url(${cssUrl})`,
150
+ `url(${absoluteUrl})`
151
+ )
148
152
  } catch (_) {}
149
153
  }
150
154
  })
151
155
 
152
- return html
156
+ return stylesheet
157
+ }
158
+
159
+ const rewriteCssUrls = ({ $, url }) => {
160
+ // Process <style> tags
161
+ // e.g., <style>body { background-image: url('/image.jpg'); }</style>
162
+ $('style').each((_, element) =>
163
+ $(element).html(replaceCssUrls(url, $(element).html()))
164
+ )
165
+
166
+ // Process elements with style attributes
167
+ // e.g., <div style="background-image: url('/image.jpg');"></div>
168
+ $('[style]').each((_, element) =>
169
+ $(element).attr('style', replaceCssUrls(url, $(element).attr('style')))
170
+ )
171
+
172
+ return $
153
173
  }
154
174
 
155
175
  const injectStyle = ({ $, styles }) =>
@@ -216,7 +236,7 @@ module.exports = ({
216
236
  if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
217
237
  if (modules) injectScripts({ $, modules, type: 'module' })
218
238
 
219
- return rewriteUrls ? rewriteCssUrls({ html: $.html(), url }) : $.html()
239
+ return rewriteUrls ? rewriteCssUrls({ $, url }) : $
220
240
  }
221
241
 
222
242
  module.exports.getDate = getDate
package/src/index.js CHANGED
@@ -236,14 +236,14 @@ const getContent = PCancelable.fn(
236
236
  onCancel(() => promise.cancel())
237
237
 
238
238
  return promise.then(content => {
239
- const html = addHtml({
239
+ const $ = addHtml({
240
240
  ...content,
241
241
  ...(isFetchMode ? puppeteerOpts : undefined),
242
242
  rewriteUrls,
243
243
  rewriteHtml
244
244
  })
245
245
 
246
- return { ...content, html }
246
+ return { ...content, $ }
247
247
  })
248
248
  }
249
249
  )
@@ -261,8 +261,9 @@ module.exports = PCancelable.fn(
261
261
  mutoolPath = defaultMutoolPath(),
262
262
  prerender = 'auto',
263
263
  puppeteerOpts,
264
+ rewriteHtml = false,
264
265
  rewriteUrls = false,
265
- rewriteHtml = false
266
+ serializeHtml = $ => ({ html: $.html() })
266
267
  } = {},
267
268
  onCancel
268
269
  ) => {
@@ -291,9 +292,12 @@ module.exports = PCancelable.fn(
291
292
 
292
293
  onCancel(() => promise.cancel())
293
294
 
294
- const { mode, ...payload } = await promise
295
+ const { mode, $, ...payload } = await promise
295
296
 
296
- return Object.assign(payload, { stats: { mode, timing: duration() } })
297
+ return Object.assign(payload, {
298
+ ...serializeHtml($),
299
+ stats: { mode, timing: duration() }
300
+ })
297
301
  }
298
302
  )
299
303