html-get 2.22.0-1 → 2.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE CHANGED
File without changes
package/index.d.ts ADDED
@@ -0,0 +1,100 @@
1
+ import type { CheerioAPI } from 'cheerio'
2
+
3
+ /**
4
+ * Result returned by html-get
5
+ */
6
+ export interface HtmlGetResult {
7
+ /** The HTML content */
8
+ html: string
9
+ /** Response headers */
10
+ headers: Record<string, string | string[] | undefined>
11
+ /** Final URL after redirects */
12
+ url: string
13
+ /** HTTP status code */
14
+ statusCode: number
15
+ /** Redirect history */
16
+ redirects: Array<{ statusCode: number; url: string }>
17
+ /** Mode used: 'fetch' or 'prerender' */
18
+ mode: 'fetch' | 'prerender'
19
+ /** Parsed HTML (Cheerio) */
20
+ $?: CheerioAPI
21
+ /** Statistics about the request */
22
+ stats: {
23
+ mode: 'fetch' | 'prerender'
24
+ timing: number
25
+ }
26
+ }
27
+
28
+ /**
29
+ * Options for html-get
30
+ */
31
+ export interface HtmlGetOptions {
32
+ /** Character encoding for HTML (default: 'utf-8') */
33
+ encoding?: string
34
+ /** Function that returns a browserless instance (required unless prerender is false) */
35
+ getBrowserless?: () => Promise<any>
36
+ /** Function to determine the mode ('fetch' or 'prerender') */
37
+ getMode?: (url: string, options: { prerender: boolean | 'auto' }) => 'fetch' | 'prerender'
38
+ /** Function to create temporary files */
39
+ getTemporalFile?: (input: string, ext?: string) => { path: string }
40
+ /** Options passed to got (the HTTP client) */
41
+ gotOpts?: Record<string, any>
42
+ /** Request headers */
43
+ headers?: Record<string, string>
44
+ /** Mutool function for PDF processing, or false to disable */
45
+ mutool?: ((...args: string[]) => any) | false
46
+ /** Prerender mode: true, false, or 'auto' (default) */
47
+ prerender?: boolean | 'auto'
48
+ /** Options passed to Puppeteer */
49
+ puppeteerOpts?: Record<string, any>
50
+ /** Rewrite relative URLs to absolute */
51
+ rewriteUrls?: boolean
52
+ /** Rewrite common HTML meta tag mistakes */
53
+ rewriteHtml?: boolean
54
+ /** Function to serialize HTML (default: $ => ({ html: $.html() })) */
55
+ serializeHtml?: ($: CheerioAPI) => { html: string }
56
+ }
57
+
58
+ /**
59
+ * Main function to get HTML from a URL
60
+ */
61
+ export function htmlGet(
62
+ targetUrl: string,
63
+ options?: HtmlGetOptions
64
+ ): Promise<HtmlGetResult>
65
+
66
+ /**
67
+ * Check if a URL should use 'fetch' mode (no prerender needed)
68
+ */
69
+ export function isFetchMode(url: string): boolean
70
+
71
+ /**
72
+ * Get content directly with a specific mode
73
+ */
74
+ export function getContent(
75
+ url: string,
76
+ mode: 'fetch' | 'prerender',
77
+ options?: HtmlGetOptions
78
+ ): Promise<HtmlGetResult>
79
+
80
+ /**
81
+ * Default mutool function (returns undefined if mutool is not installed)
82
+ */
83
+ export function defaultMutool(): ((...args: string[]) => any) | undefined
84
+
85
+ /**
86
+ * Default request timeout in milliseconds
87
+ */
88
+ export const REQ_TIMEOUT: number
89
+
90
+ /**
91
+ * Default abort types for prerendering
92
+ */
93
+ export const ABORT_TYPES: string[]
94
+
95
+ /**
96
+ * PDF size threshold in bytes (150KB)
97
+ */
98
+ export const PDF_SIZE_TRESHOLD: number
99
+
100
+ export default htmlGet
package/package.json CHANGED
@@ -2,7 +2,8 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.22.0-1",
5
+ "version": "2.22.0",
6
+ "types": "index.d.ts",
6
7
  "main": "src/index.js",
7
8
  "bin": {
8
9
  "html-get": "bin/index.js"
@@ -12,7 +13,16 @@
12
13
  "name": "Kiko Beats",
13
14
  "url": "https://kikobeats.com"
14
15
  },
15
- "contributors": [],
16
+ "contributors": [
17
+ {
18
+ "name": "Divyansh Singh",
19
+ "email": "40380293+brc-dd@users.noreply.github.com"
20
+ },
21
+ {
22
+ "name": "Michael Lip",
23
+ "email": "51033404+theluckystrike@users.noreply.github.com"
24
+ }
25
+ ],
16
26
  "repository": {
17
27
  "type": "git",
18
28
  "url": "git+https://github.com/microlinkhq/html-get.git"
@@ -36,25 +46,27 @@
36
46
  ],
37
47
  "dependencies": {
38
48
  "@kikobeats/time-span": "~1.0.5",
39
- "@metascraper/helpers": "~5.46.1",
40
- "cheerio": "~1.0.0",
49
+ "@metascraper/helpers": "~5.49.1",
50
+ "cheerio": "~1.2.0",
41
51
  "content-type": "~1.0.5",
42
52
  "css-url-regex": "~4.0.0",
43
- "debug-logfmt": "~1.2.3",
53
+ "debug-logfmt": "~1.4.0",
44
54
  "execall": "~2.0.0",
45
55
  "got": "~11.8.6",
46
56
  "html-encode": "~2.1.7",
47
57
  "html-urls": "~2.4.62",
48
58
  "is-html-content": "~1.0.0",
49
- "is-local-address": "~2.2.0",
59
+ "is-local-address": "~2.3.0",
50
60
  "lodash": "~4.17.21",
51
61
  "mri": "~1.2.0",
62
+ "null-prototype-object": "~1.2.0",
52
63
  "p-cancelable": "~2.1.0",
53
64
  "p-retry": "~4.6.0",
54
65
  "tinyspawn": "~1.5.0",
55
66
  "top-sites": "~1.1.220"
56
67
  },
57
68
  "devDependencies": {
69
+ "@browserless/test": "latest",
58
70
  "@commitlint/cli": "latest",
59
71
  "@commitlint/config-conventional": "latest",
60
72
  "@ksmithut/prettier-standard": "latest",
@@ -79,9 +91,22 @@
79
91
  },
80
92
  "files": [
81
93
  "bin",
94
+ "index.d.ts",
82
95
  "scripts",
83
96
  "src"
84
97
  ],
98
+ "scripts": {
99
+ "clean": "rm -rf node_modules",
100
+ "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
101
+ "lint": "standard",
102
+ "postinstall": "node scripts/postinstall",
103
+ "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
104
+ "pretest": "npm run lint",
105
+ "release": "standard-version -a",
106
+ "release:github": "github-generate-release",
107
+ "release:tags": "git push --follow-tags origin HEAD:master",
108
+ "test": "c8 ava"
109
+ },
85
110
  "license": "MIT",
86
111
  "ava": {
87
112
  "files": [
@@ -110,20 +135,11 @@
110
135
  "finepack"
111
136
  ]
112
137
  },
138
+ "pnpm": {
139
+ "neverBuiltDependencies": []
140
+ },
113
141
  "simple-git-hooks": {
114
142
  "commit-msg": "npx commitlint --edit",
115
143
  "pre-commit": "npx nano-staged"
116
- },
117
- "scripts": {
118
- "clean": "rm -rf node_modules",
119
- "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
120
- "lint": "standard",
121
- "postinstall": "node scripts/postinstall",
122
- "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
123
- "pretest": "npm run lint",
124
- "release": "standard-version -a",
125
- "release:github": "github-generate-release",
126
- "release:tags": "git push --follow-tags origin HEAD:master",
127
- "test": "c8 ava"
128
144
  }
129
- }
145
+ }
@@ -9,7 +9,7 @@ const topsites = require('top-sites')
9
9
 
10
10
  const domains = [
11
11
  [['domain', 'abc.net.au']],
12
- [['domain', 'x.com']],
12
+ [['domain', 'bsky.app']],
13
13
  [['domainWithoutSuffix', 'apple']],
14
14
  [['domainWithoutSuffix', 'arxiv']],
15
15
  [['domainWithoutSuffix', 'bbc']],
@@ -17,7 +17,6 @@ const domains = [
17
17
  [['domainWithoutSuffix', 'csdn']],
18
18
  [['domainWithoutSuffix', 'deviantart']],
19
19
  [['domainWithoutSuffix', 'digg']],
20
- [['domainWithoutSuffix', 'dribbble']],
21
20
  [['domainWithoutSuffix', 'engadget']],
22
21
  [['domainWithoutSuffix', 'etsy']],
23
22
  [['domainWithoutSuffix', 'eventbrite']],
@@ -48,7 +47,6 @@ const domains = [
48
47
  [['domainWithoutSuffix', 'theguardian']],
49
48
  [['domainWithoutSuffix', 'theverge']],
50
49
  [['domainWithoutSuffix', 'tumblr']],
51
- [['domainWithoutSuffix', 'twitter']],
52
50
  [['domainWithoutSuffix', 'vimeo']],
53
51
  [['domainWithoutSuffix', 'wikipedia']],
54
52
  [['domainWithoutSuffix', 'wordpress']],
@@ -79,6 +77,7 @@ const { top, rest } = reduce(
79
77
  { top: new Array(topsites.length), rest: [] }
80
78
  )
81
79
 
82
- writeFile('./src/auto-domains.json', JSON.stringify(compact(top).concat(rest)), null, 2).catch(
83
- error => console.log(error)
84
- )
80
+ writeFile(
81
+ './src/auto-domains.json',
82
+ JSON.stringify(compact(top).concat(rest), null, 2)
83
+ ).catch(error => console.log(error))
package/src/auto-domains.json CHANGED
@@ -1 +1,284 @@
1
- [[["domainWithoutSuffix","google"]],[["domainWithoutSuffix","youtube"]],[["domainWithoutSuffix","wordpress"]],[["domainWithoutSuffix","apple"]],[["domainWithoutSuffix","microsoft"]],[["domainWithoutSuffix","wikipedia"]],[["domainWithoutSuffix","vimeo"]],[["domainWithoutSuffix","blogspot"]],[["domainWithoutSuffix","github"]],[["domain","x.com"]],[["domainWithoutSuffix","nytimes"]],[["domainWithoutSuffix","theguardian"]],[["domainWithoutSuffix","twitter"]],[["domainWithoutSuffix","bbc"]],[["domainWithoutSuffix","imdb"]],[["domainWithoutSuffix","slideshare"]],[["domainWithoutSuffix","telegraph"]],[["domainWithoutSuffix","pinterest"]],[["domainWithoutSuffix","spotify"]],[["domainWithoutSuffix","huffingtonpost"]],[["domainWithoutSuffix","instagram"]],[["domainWithoutSuffix","techcrunch"]],[["domainWithoutSuffix","zoom"]],[["domain","abc.net.au"]],[["domainWithoutSuffix","arxiv"]],[["domainWithoutSuffix","yelp"]],[["domainWithoutSuffix","eventbrite"]],[["domainWithoutSuffix","engadget"]],[["domainWithoutSuffix","soundcloud"]],[["domainWithoutSuffix","theverge"]],[["domainWithoutSuffix","digg"]],[["domainWithoutSuffix","imgur"]],[["domainWithoutSuffix","csdn"]],[["domainWithoutSuffix","deviantart"]],[["domainWithoutSuffix","dribbble"]],[["domainWithoutSuffix","etsy"]],[["domainWithoutSuffix","flickr"]],[["domainWithoutSuffix","ghost"]],[["domainWithoutSuffix","giphy"]],[["domainWithoutSuffix","gitlab"]],[["domainWithoutSuffix","meetup"]],[["domainWithoutSuffix","producthunt"]],[["domainWithoutSuffix","reddit"]],[["domainWithoutSuffix","sourceforge"]],[["domainWithoutSuffix","stackoverflow"]],[["domainWithoutSuffix","substack"]],[["domainWithoutSuffix","tumblr"]],[["domainWithoutSuffix","ycombinator"]]]
1
+ [
2
+ [
3
+ [
4
+ "domainWithoutSuffix",
5
+ "google"
6
+ ]
7
+ ],
8
+ [
9
+ [
10
+ "domainWithoutSuffix",
11
+ "youtube"
12
+ ]
13
+ ],
14
+ [
15
+ [
16
+ "domainWithoutSuffix",
17
+ "microsoft"
18
+ ]
19
+ ],
20
+ [
21
+ [
22
+ "domainWithoutSuffix",
23
+ "apple"
24
+ ]
25
+ ],
26
+ [
27
+ [
28
+ "domainWithoutSuffix",
29
+ "wordpress"
30
+ ]
31
+ ],
32
+ [
33
+ [
34
+ "domainWithoutSuffix",
35
+ "wikipedia"
36
+ ]
37
+ ],
38
+ [
39
+ [
40
+ "domainWithoutSuffix",
41
+ "blogspot"
42
+ ]
43
+ ],
44
+ [
45
+ [
46
+ "domainWithoutSuffix",
47
+ "vimeo"
48
+ ]
49
+ ],
50
+ [
51
+ [
52
+ "domainWithoutSuffix",
53
+ "github"
54
+ ]
55
+ ],
56
+ [
57
+ [
58
+ "domainWithoutSuffix",
59
+ "bbc"
60
+ ]
61
+ ],
62
+ [
63
+ [
64
+ "domainWithoutSuffix",
65
+ "nytimes"
66
+ ]
67
+ ],
68
+ [
69
+ [
70
+ "domainWithoutSuffix",
71
+ "theguardian"
72
+ ]
73
+ ],
74
+ [
75
+ [
76
+ "domainWithoutSuffix",
77
+ "imdb"
78
+ ]
79
+ ],
80
+ [
81
+ [
82
+ "domainWithoutSuffix",
83
+ "pinterest"
84
+ ]
85
+ ],
86
+ [
87
+ [
88
+ "domainWithoutSuffix",
89
+ "telegraph"
90
+ ]
91
+ ],
92
+ [
93
+ [
94
+ "domainWithoutSuffix",
95
+ "slideshare"
96
+ ]
97
+ ],
98
+ [
99
+ [
100
+ "domainWithoutSuffix",
101
+ "huffingtonpost"
102
+ ]
103
+ ],
104
+ [
105
+ [
106
+ "domainWithoutSuffix",
107
+ "spotify"
108
+ ]
109
+ ],
110
+ [
111
+ [
112
+ "domainWithoutSuffix",
113
+ "instagram"
114
+ ]
115
+ ],
116
+ [
117
+ [
118
+ "domainWithoutSuffix",
119
+ "soundcloud"
120
+ ]
121
+ ],
122
+ [
123
+ [
124
+ "domainWithoutSuffix",
125
+ "engadget"
126
+ ]
127
+ ],
128
+ [
129
+ [
130
+ "domainWithoutSuffix",
131
+ "techcrunch"
132
+ ]
133
+ ],
134
+ [
135
+ [
136
+ "domainWithoutSuffix",
137
+ "zoom"
138
+ ]
139
+ ],
140
+ [
141
+ [
142
+ "domain",
143
+ "abc.net.au"
144
+ ]
145
+ ],
146
+ [
147
+ [
148
+ "domainWithoutSuffix",
149
+ "arxiv"
150
+ ]
151
+ ],
152
+ [
153
+ [
154
+ "domainWithoutSuffix",
155
+ "eventbrite"
156
+ ]
157
+ ],
158
+ [
159
+ [
160
+ "domainWithoutSuffix",
161
+ "yelp"
162
+ ]
163
+ ],
164
+ [
165
+ [
166
+ "domainWithoutSuffix",
167
+ "theverge"
168
+ ]
169
+ ],
170
+ [
171
+ [
172
+ "domainWithoutSuffix",
173
+ "dribbble"
174
+ ]
175
+ ],
176
+ [
177
+ [
178
+ "domain",
179
+ "bsky.app"
180
+ ]
181
+ ],
182
+ [
183
+ [
184
+ "domainWithoutSuffix",
185
+ "csdn"
186
+ ]
187
+ ],
188
+ [
189
+ [
190
+ "domainWithoutSuffix",
191
+ "deviantart"
192
+ ]
193
+ ],
194
+ [
195
+ [
196
+ "domainWithoutSuffix",
197
+ "digg"
198
+ ]
199
+ ],
200
+ [
201
+ [
202
+ "domainWithoutSuffix",
203
+ "etsy"
204
+ ]
205
+ ],
206
+ [
207
+ [
208
+ "domainWithoutSuffix",
209
+ "flickr"
210
+ ]
211
+ ],
212
+ [
213
+ [
214
+ "domainWithoutSuffix",
215
+ "ghost"
216
+ ]
217
+ ],
218
+ [
219
+ [
220
+ "domainWithoutSuffix",
221
+ "giphy"
222
+ ]
223
+ ],
224
+ [
225
+ [
226
+ "domainWithoutSuffix",
227
+ "gitlab"
228
+ ]
229
+ ],
230
+ [
231
+ [
232
+ "domainWithoutSuffix",
233
+ "imgur"
234
+ ]
235
+ ],
236
+ [
237
+ [
238
+ "domainWithoutSuffix",
239
+ "meetup"
240
+ ]
241
+ ],
242
+ [
243
+ [
244
+ "domainWithoutSuffix",
245
+ "producthunt"
246
+ ]
247
+ ],
248
+ [
249
+ [
250
+ "domainWithoutSuffix",
251
+ "reddit"
252
+ ]
253
+ ],
254
+ [
255
+ [
256
+ "domainWithoutSuffix",
257
+ "sourceforge"
258
+ ]
259
+ ],
260
+ [
261
+ [
262
+ "domainWithoutSuffix",
263
+ "stackoverflow"
264
+ ]
265
+ ],
266
+ [
267
+ [
268
+ "domainWithoutSuffix",
269
+ "substack"
270
+ ]
271
+ ],
272
+ [
273
+ [
274
+ "domainWithoutSuffix",
275
+ "tumblr"
276
+ ]
277
+ ],
278
+ [
279
+ [
280
+ "domainWithoutSuffix",
281
+ "ycombinator"
282
+ ]
283
+ ]
284
+ ]
package/src/index.js CHANGED
@@ -139,10 +139,9 @@ const prerender = PCancelable.fn(
139
139
  async (page, response) => {
140
140
  if (!response) throw new AbortError('empty response')
141
141
 
142
- const duration = debug.duration('payload')
143
- const payload = {
142
+ return {
144
143
  headers: response.headers(),
145
- html: await page.content('html'),
144
+ html: await page.content(),
146
145
  mode: 'prerender',
147
146
  url: response.url(),
148
147
  statusCode: response.status(),
@@ -154,9 +153,6 @@ const prerender = PCancelable.fn(
154
153
  url: req.url()
155
154
  }))
156
155
  }
157
-
158
- duration()
159
- return payload
160
156
  },
161
157
  {
162
158
  timeout,
@@ -165,10 +161,9 @@ const prerender = PCancelable.fn(
165
161
  }
166
162
  )
167
163
 
168
- const duration = debug.duration('prerender')
169
164
  const payload = await getPayload(url, opts)
170
165
  await fetchRes.cancel()
171
- duration({ url, state: 'success' })
166
+ debug('prerender', { url, state: 'success' })
172
167
  return payload
173
168
  } catch (err) {
174
169
  const { isRejected, ...dataProps } = await fetchRes
@@ -221,7 +216,11 @@ const defaultGetTemporalFile = (input, ext) => {
221
216
  const defaultMutool = () =>
222
217
  (() => {
223
218
  try {
224
- const mutoolPath = execSync('which mutool').toString().trim()
219
+ const mutoolPath = execSync('which mutool', {
220
+ stdio: ['pipe', 'pipe', 'ignore']
221
+ })
222
+ .toString()
223
+ .trim()
225
224
  return (...args) => $(`${mutoolPath} draw -q -F html ${args}`)
226
225
  } catch (_) {}
227
226
  })()
package/src/util.js CHANGED
@@ -1,13 +1,22 @@
1
1
  'use strict'
2
2
 
3
+ const NullProtoObj = require('null-prototype-object')
3
4
  const { parse } = require('content-type')
4
5
 
5
- const CACHE = Object.create(null)
6
+ const CACHE = new NullProtoObj()
6
7
 
7
- const parseContentType = contentType =>
8
- typeof contentType === 'string'
9
- ? parse(contentType)
10
- : { type: undefined, parameters: {} }
8
+ const UNKNOWN_CONTENT_TYPE = { type: undefined, parameters: {} }
9
+
10
+ const SEPARATOR = /,|\r?\n/
11
+
12
+ const parseContentType = contentType => {
13
+ if (typeof contentType !== 'string') return UNKNOWN_CONTENT_TYPE
14
+ try {
15
+ return parse(contentType.split(SEPARATOR)[0])
16
+ } catch {
17
+ return UNKNOWN_CONTENT_TYPE
18
+ }
19
+ }
11
20
 
12
21
  const contentType = headers => {
13
22
  const contentType = headers['content-type']