html-get 2.15.1 → 2.16.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.15.1",
5
+ "version": "2.16.0",
6
6
  "main": "src/index.js",
7
7
  "bin": {
8
8
  "html-get": "bin/index.js"
@@ -35,7 +35,7 @@
35
35
  ],
36
36
  "dependencies": {
37
37
  "@kikobeats/time-span": "~1.0.3",
38
- "@metascraper/helpers": "~5.43.4",
38
+ "@metascraper/helpers": "~5.44.0",
39
39
  "cheerio": "~1.0.0-rc.12",
40
40
  "css-url-regex": "~4.0.0",
41
41
  "debug-logfmt": "~1.2.2",
@@ -44,11 +44,11 @@
44
44
  "html-encode": "~2.1.6",
45
45
  "html-urls": "~2.4.55",
46
46
  "is-html-content": "~1.0.0",
47
+ "localhost-url-regex": "~1.0.11",
47
48
  "lodash": "~4.17.21",
48
49
  "mri": "~1.2.0",
49
50
  "p-cancelable": "~2.1.0",
50
51
  "p-retry": "~4.6.0",
51
- "replace-string": "~3.1.0",
52
52
  "tinyspawn": "~1.2.6",
53
53
  "top-sites": "~1.1.205"
54
54
  },
@@ -67,7 +67,7 @@
67
67
  "nano-staged": "latest",
68
68
  "npm-check-updates": "latest",
69
69
  "pretty": "latest",
70
- "puppeteer": "latest",
70
+ "puppeteer": "21",
71
71
  "regex-iso-date": "latest",
72
72
  "simple-git-hooks": "latest",
73
73
  "standard": "latest",
@@ -1 +1,266 @@
1
- [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domainWithoutSuffix","stackoverflow"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [
2
+ [
3
+ [
4
+ "domainWithoutSuffix",
5
+ "google"
6
+ ]
7
+ ],
8
+ [
9
+ [
10
+ "domainWithoutSuffix",
11
+ "youtube"
12
+ ]
13
+ ],
14
+ [
15
+ [
16
+ "domainWithoutSuffix",
17
+ "microsoft"
18
+ ]
19
+ ],
20
+ [
21
+ [
22
+ "domainWithoutSuffix",
23
+ "apple"
24
+ ]
25
+ ],
26
+ [
27
+ [
28
+ "domainWithoutSuffix",
29
+ "wordpress"
30
+ ]
31
+ ],
32
+ [
33
+ [
34
+ "domainWithoutSuffix",
35
+ "wikipedia"
36
+ ]
37
+ ],
38
+ [
39
+ [
40
+ "domainWithoutSuffix",
41
+ "blogspot"
42
+ ]
43
+ ],
44
+ [
45
+ [
46
+ "domainWithoutSuffix",
47
+ "vimeo"
48
+ ]
49
+ ],
50
+ [
51
+ [
52
+ "domainWithoutSuffix",
53
+ "github"
54
+ ]
55
+ ],
56
+ [
57
+ [
58
+ "domainWithoutSuffix",
59
+ "nytimes"
60
+ ]
61
+ ],
62
+ [
63
+ [
64
+ "domainWithoutSuffix",
65
+ "theguardian"
66
+ ]
67
+ ],
68
+ [
69
+ [
70
+ "domainWithoutSuffix",
71
+ "imdb"
72
+ ]
73
+ ],
74
+ [
75
+ [
76
+ "domainWithoutSuffix",
77
+ "bbc"
78
+ ]
79
+ ],
80
+ [
81
+ [
82
+ "domainWithoutSuffix",
83
+ "slideshare"
84
+ ]
85
+ ],
86
+ [
87
+ [
88
+ "domainWithoutSuffix",
89
+ "soundcloud"
90
+ ]
91
+ ],
92
+ [
93
+ [
94
+ "domainWithoutSuffix",
95
+ "huffingtonpost"
96
+ ]
97
+ ],
98
+ [
99
+ [
100
+ "domainWithoutSuffix",
101
+ "spotify"
102
+ ]
103
+ ],
104
+ [
105
+ [
106
+ "domainWithoutSuffix",
107
+ "twitter"
108
+ ]
109
+ ],
110
+ [
111
+ [
112
+ "domainWithoutSuffix",
113
+ "pinterest"
114
+ ]
115
+ ],
116
+ [
117
+ [
118
+ "domainWithoutSuffix",
119
+ "telegraph"
120
+ ]
121
+ ],
122
+ [
123
+ [
124
+ "domainWithoutSuffix",
125
+ "techcrunch"
126
+ ]
127
+ ],
128
+ [
129
+ [
130
+ "domainWithoutSuffix",
131
+ "zoom"
132
+ ]
133
+ ],
134
+ [
135
+ [
136
+ "domainWithoutSuffix",
137
+ "stackoverflow"
138
+ ]
139
+ ],
140
+ [
141
+ [
142
+ "domain",
143
+ "abc.net.au"
144
+ ]
145
+ ],
146
+ [
147
+ [
148
+ "domainWithoutSuffix",
149
+ "yelp"
150
+ ]
151
+ ],
152
+ [
153
+ [
154
+ "domainWithoutSuffix",
155
+ "eventbrite"
156
+ ]
157
+ ],
158
+ [
159
+ [
160
+ "domainWithoutSuffix",
161
+ "engadget"
162
+ ]
163
+ ],
164
+ [
165
+ [
166
+ "domainWithoutSuffix",
167
+ "theverge"
168
+ ]
169
+ ],
170
+ [
171
+ [
172
+ "domainWithoutSuffix",
173
+ "substack"
174
+ ]
175
+ ],
176
+ [
177
+ [
178
+ "domainWithoutSuffix",
179
+ "giphy"
180
+ ]
181
+ ],
182
+ [
183
+ [
184
+ "domainWithoutSuffix",
185
+ "dribbble"
186
+ ]
187
+ ],
188
+ [
189
+ [
190
+ "domainWithoutSuffix",
191
+ "csdn"
192
+ ]
193
+ ],
194
+ [
195
+ [
196
+ "domainWithoutSuffix",
197
+ "deviantart"
198
+ ]
199
+ ],
200
+ [
201
+ [
202
+ "domainWithoutSuffix",
203
+ "digg"
204
+ ]
205
+ ],
206
+ [
207
+ [
208
+ "domainWithoutSuffix",
209
+ "etsy"
210
+ ]
211
+ ],
212
+ [
213
+ [
214
+ "domainWithoutSuffix",
215
+ "flickr"
216
+ ]
217
+ ],
218
+ [
219
+ [
220
+ "domainWithoutSuffix",
221
+ "ghost"
222
+ ]
223
+ ],
224
+ [
225
+ [
226
+ "domainWithoutSuffix",
227
+ "gitlab"
228
+ ]
229
+ ],
230
+ [
231
+ [
232
+ "domainWithoutSuffix",
233
+ "imgur"
234
+ ]
235
+ ],
236
+ [
237
+ [
238
+ "domainWithoutSuffix",
239
+ "meetup"
240
+ ]
241
+ ],
242
+ [
243
+ [
244
+ "domainWithoutSuffix",
245
+ "producthunt"
246
+ ]
247
+ ],
248
+ [
249
+ [
250
+ "domainWithoutSuffix",
251
+ "sourceforge"
252
+ ]
253
+ ],
254
+ [
255
+ [
256
+ "domainWithoutSuffix",
257
+ "tumblr"
258
+ ]
259
+ ],
260
+ [
261
+ [
262
+ "domainWithoutSuffix",
263
+ "ycombinator"
264
+ ]
265
+ ]
266
+ ]
package/src/html.js CHANGED
@@ -1,8 +1,8 @@
1
1
  'use strict'
2
2
 
3
3
  const { get, split, nth, castArray, forEach } = require('lodash')
4
+ const localhostUrl = require('localhost-url-regex')
4
5
  const { TAGS: URL_TAGS } = require('html-urls')
5
- const replaceString = require('replace-string')
6
6
  const isHTML = require('is-html-content')
7
7
  const cssUrl = require('css-url-regex')
8
8
  const execall = require('execall')
@@ -95,7 +95,9 @@ const rewriteHtmlUrls = ({ $, url }) => {
95
95
  const el = $(this)
96
96
  const attr = el.attr(urlAttr)
97
97
 
98
- if (typeof attr === 'string' && !attr.startsWith('http')) {
98
+ if (localhostUrl().test(attr)) {
99
+ el.remove()
100
+ } else if (typeof attr === 'string' && !attr.startsWith('http')) {
99
101
  try {
100
102
  const newAttr = new URL(attr, url).toString()
101
103
  el.attr(urlAttr, newAttr)
@@ -117,7 +119,7 @@ const rewriteCssUrls = ({ html, url }) => {
117
119
  if (cssUrl.startsWith('/')) {
118
120
  try {
119
121
  const absoluteUrl = new URL(cssUrl, url).toString()
120
- html = replaceString(html, `url(${cssUrl})`, `url(${absoluteUrl})`)
122
+ html = html.replaceAll(`url(${cssUrl})`, `url(${absoluteUrl})`)
121
123
  } catch (_) {}
122
124
  }
123
125
  })