html-get 2.16.10 → 2.17.0-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
File without changes
package/README.md CHANGED
@@ -151,6 +151,13 @@ Default: `false`
151
151
 
152
152
  When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten as absolute URLs.
153
153
 
154
+ ##### rewriteHtml
155
+
156
+ Type: `boolean`<br>
157
+ Default: `false`
158
+
159
+ When `true`, it rewrites some common mistakes related to HTML meta tags (for example, Open Graph tags declared with `name` instead of `property`).
160
+
154
161
  ## License
155
162
 
156
163
  **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.<br>
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.16.10",
5
+ "version": "2.17.0-0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -81,18 +81,6 @@
81
81
  "scripts",
82
82
  "src"
83
83
  ],
84
- "scripts": {
85
- "clean": "rm -rf node_modules",
86
- "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
87
- "lint": "standard-markdown README.md && standard",
88
- "postinstall": "node scripts/postinstall",
89
- "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
90
- "pretest": "npm run lint",
91
- "release": "standard-version -a",
92
- "release:github": "github-generate-release",
93
- "release:tags": "git push --follow-tags origin HEAD:master",
94
- "test": "c8 ava"
95
- },
96
84
  "license": "MIT",
97
85
  "ava": {
98
86
  "files": [
@@ -127,5 +115,17 @@
127
115
  "simple-git-hooks": {
128
116
  "commit-msg": "npx commitlint --edit",
129
117
  "pre-commit": "npx nano-staged"
118
+ },
119
+ "scripts": {
120
+ "clean": "rm -rf node_modules",
121
+ "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
122
+ "lint": "standard-markdown README.md && standard",
123
+ "postinstall": "node scripts/postinstall",
124
+ "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
125
+ "pretest": "npm run lint",
126
+ "release": "standard-version -a",
127
+ "release:github": "github-generate-release",
128
+ "release:tags": "git push --follow-tags origin HEAD:master",
129
+ "test": "c8 ava"
130
130
  }
131
- }
131
+ }
File without changes
@@ -1 +1 @@
1
- [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domain","x.com"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","eventbrite"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","telegraph"]],[["domain","x.com"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
package/src/html.js CHANGED
@@ -89,6 +89,22 @@ const addBody = ({ url, headers, html }) => {
89
89
  return `<!DOCTYPE html><html><head></head><body>${element}</body></html>`
90
90
  }
91
91
 
92
+ const rewriteOpenGraph = ({ $ }) =>
93
+ $('meta[name^="og:"]').each((_, element) => {
94
+ const el = $(element)
95
+ const name = el.attr('name')
96
+ el.removeAttr('name')
97
+ el.attr('property', name)
98
+ })
99
+
100
+ const rewriteMetaProperty = ({ $ }) =>
101
+ $('meta[property]:not([property^="og"])').each((_, element) => {
102
+ const el = $(element)
103
+ const property = el.attr('property')
104
+ el.removeAttr('property')
105
+ el.attr('name', property)
106
+ })
107
+
92
108
  const rewriteHtmlUrls = ({ $, url }) => {
93
109
  forEach(URL_TAGS, (tagName, urlAttr) => {
94
110
  $(tagName.join(',')).each(function () {
@@ -156,6 +172,7 @@ module.exports = ({
156
172
  hide,
157
173
  remove,
158
174
  rewriteUrls,
175
+ rewriteHtml,
159
176
  scripts,
160
177
  modules
161
178
  }) => {
@@ -167,6 +184,11 @@ module.exports = ({
167
184
 
168
185
  if (rewriteUrls) rewriteHtmlUrls({ $, url })
169
186
 
187
+ if (rewriteHtml) {
188
+ rewriteOpenGraph({ $ })
189
+ rewriteMetaProperty({ $ })
190
+ }
191
+
170
192
  addHead({ $, url, headers })
171
193
 
172
194
  if (styles) injectStyle({ $, styles })
package/src/index.js CHANGED
@@ -57,7 +57,7 @@ const fetch = PCancelable.fn(
57
57
  if (mutoolPath && contentType === 'application/pdf') {
58
58
  const file = getTemporalFile(url, 'pdf')
59
59
  await writeFile(file.path, res.body)
60
- return (await $(`mutool draw -q -F html ${file.path}`)).stdout
60
+ return (await $(`${mutoolPath} draw -q -F html ${file.path}`)).stdout
61
61
  }
62
62
 
63
63
  return contentType.startsWith('text/html') || !isMediaUrl(url)
@@ -191,7 +191,7 @@ const defaultGetTemporalFile = (url, ext) => {
191
191
  const defaultMutoolPath = () =>
192
192
  (() => {
193
193
  try {
194
- return execSync('which mutool').toString().trim()
194
+ return execSync('which mutool', { stdio: 'pipe' }).toString().trim()
195
195
  } catch (_) {}
196
196
  })()
197
197
 
@@ -207,6 +207,7 @@ const getContent = PCancelable.fn(
207
207
  mutoolPath,
208
208
  puppeteerOpts,
209
209
  rewriteUrls,
210
+ rewriteHtml,
210
211
  toEncode
211
212
  },
212
213
  onCancel
@@ -224,7 +225,8 @@ const getContent = PCancelable.fn(
224
225
  const html = addHtml({
225
226
  ...content,
226
227
  ...(isFetchMode ? puppeteerOpts : undefined),
227
- rewriteUrls
228
+ rewriteUrls,
229
+ rewriteHtml
228
230
  })
229
231
 
230
232
  return { ...content, html }
@@ -245,7 +247,8 @@ module.exports = PCancelable.fn(
245
247
  mutoolPath = defaultMutoolPath(),
246
248
  prerender = 'auto',
247
249
  puppeteerOpts,
248
- rewriteUrls = false
250
+ rewriteUrls = false,
251
+ rewriteHtml = false
249
252
  } = {},
250
253
  onCancel
251
254
  ) => {
@@ -268,6 +271,7 @@ module.exports = PCancelable.fn(
268
271
  mutoolPath,
269
272
  puppeteerOpts,
270
273
  rewriteUrls,
274
+ rewriteHtml,
271
275
  toEncode
272
276
  })
273
277