html-get 2.9.23 → 2.9.25
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- package/package.json +4 -6
- package/scripts/postinstall +2 -3
- package/src/auto-domains.json +15 -15
- package/src/html.js +2 -2
- package/src/index.js +3 -3
package/package.json
CHANGED
@@ -2,7 +2,7 @@
   "name": "html-get",
   "description": "Get the HTML from any website, using prerendering when is necessary.",
   "homepage": "https://nicedoc.com/microlinkhq/html-get",
-  "version": "2.9.23",
+  "version": "2.9.25",
   "main": "src/index.js",
   "bin": {
     "html-get": "bin/index.js"
@@ -29,14 +29,14 @@
     "request"
   ],
   "dependencies": {
-    "@metascraper/helpers": "~5.
+    "@metascraper/helpers": "~5.32.1",
     "cheerio": "~1.0.0-rc.12",
     "css-url-regex": "~4.0.0",
     "debug-logfmt": "~1.0.4",
     "execall": "~2.0.0",
     "got": "~11.8.5",
     "html-encode": "~2.1.6",
-    "html-urls": "~2.4.
+    "html-urls": "~2.4.39",
     "is-html-content": "~1.0.0",
     "lodash": "~4.17.21",
     "minimist": "~1.2.6",
@@ -44,8 +44,7 @@
     "p-retry": "~4.6.0",
     "replace-string": "~3.1.0",
     "time-span": "~4.0.0",
-    "
-    "top-sites": "~1.1.117",
+    "top-sites": "~1.1.132",
     "write-json-file": "~4.3.0"
   },
   "devDependencies": {
@@ -94,7 +93,6 @@
   },
   "license": "MIT",
   "ava": {
-    "workerThreads": false,
     "files": [
       "test/**/*.js",
       "!test/util.js"
package/scripts/postinstall
CHANGED
@@ -3,11 +3,10 @@
 'use strict'
 
 const { compact, reduce, findIndex } = require('lodash')
+const { parseUrl } = require('@metascraper/helpers')
 const writeJsonFile = require('write-json-file')
 const topsites = require('top-sites')
 
-const { getDomainWithoutSuffix } = require('tldts')
-
 const domains = [
   'apple',
   'bbc',
@@ -55,7 +54,7 @@ const { top, rest } = reduce(
   (acc, domain) => {
     const index = findIndex(
       topsites,
-      ({ rootDomain }) =>
+      ({ rootDomain }) => parseUrl(rootDomain).domainWithoutSuffix === domain
     )
     if (index !== -1) acc.top[index] = domain
     else acc.rest.push(domain)
package/src/auto-domains.json
CHANGED
@@ -1,35 +1,35 @@
 [
-  "apple",
   "google",
   "youtube",
+  "apple",
   "microsoft",
-  "wordpress",
   "wikipedia",
+  "wordpress",
   "blogspot",
   "vimeo",
   "github",
-  "
-  "slideshare",
+  "nytimes",
   "imdb",
   "bbc",
-  "
-  "
-  "pinterest",
-  "soundcloud",
+  "slideshare",
+  "theguardian",
   "huffingtonpost",
-  "
-  "
+  "soundcloud",
+  "pinterest",
+  "telegraph",
+  "spotify",
   "yelp",
   "eventbrite",
-  "
+  "techcrunch",
+  "zoom",
   "engadget",
   "theverge",
-  "
-  "stackoverflow",
+  "etsy",
   "reddit",
-  "
+  "stackoverflow",
   "csdn",
-  "
+  "digg",
+  "flickr",
   "ghost",
   "giphy",
   "imgur",
package/src/html.js
CHANGED
@@ -1,11 +1,11 @@
 'use strict'
 
 const { get, split, nth, castArray, forEach } = require('lodash')
+const { parseUrl } = require('@metascraper/helpers')
 const { TAGS: URL_TAGS } = require('html-urls')
 const replaceString = require('replace-string')
 const isHTML = require('is-html-content')
 const cssUrl = require('css-url-regex')
-const { getDomain } = require('tldts')
 const execall = require('execall')
 const cheerio = require('cheerio')
 const { URL } = require('url')
@@ -36,7 +36,7 @@ const addHead = ({ $, url, headers }) => {
   upsert(
     head.find('meta[property="og:site_name"]'),
     tags,
-    `<meta property="og:site_name" content="${
+    `<meta property="og:site_name" content="${parseUrl(url).domain}">`
   )
 
   if (date) {
package/src/index.js
CHANGED
@@ -1,7 +1,6 @@
 'use strict'
 
-const { isMediaUrl } = require('@metascraper/helpers')
-const { getDomainWithoutSuffix } = require('tldts')
+const { parseUrl, isMediaUrl } = require('@metascraper/helpers')
 const debug = require('debug-logfmt')('html-get')
 const PCancelable = require('p-cancelable')
 const { AbortError } = require('p-retry')
@@ -122,7 +121,8 @@ const prerender = async (
 
 const modes = { fetch, prerender }
 
-const isFetchMode = url =>
+const isFetchMode = url =>
+  autoDomains.includes(parseUrl(url).domainWithoutSuffix)
 
 const determinateMode = (url, { prerender }) => {
   if (prerender === false || isMediaUrl(url)) return 'fetch'
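
A sketch of how the mode decision resolves with the new import. The isFetchMode body matches the diff; requiring auto-domains.json directly and the final fallback to 'prerender' inside determinateMode are assumptions, since those lines are not visible in this diff.

const { parseUrl, isMediaUrl } = require('@metascraper/helpers')
const autoDomains = require('./auto-domains.json') // assumed relative path inside src/

// Body taken from the diff: a URL is fetchable without a headless browser when
// its domain (without public suffix) is on the auto-domains list.
const isFetchMode = url =>
  autoDomains.includes(parseUrl(url).domainWithoutSuffix)

const determinateMode = (url, { prerender }) => {
  if (prerender === false || isMediaUrl(url)) return 'fetch'
  return isFetchMode(url) ? 'fetch' : 'prerender' // assumed fallback, not shown in the diff
}

// 'github' appears in auto-domains.json above, so plain fetching is chosen:
console.log(determinateMode('https://github.com/microlinkhq/html-get', {})) // 'fetch'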
|