html-get 2.21.20 → 2.21.22

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/index.d.ts ADDED
@@ -0,0 +1,100 @@
1
+ import type { CheerioAPI } from 'cheerio'
2
+
3
+ /**
4
+ * Result returned by html-get
5
+ */
6
+ export interface HtmlGetResult {
7
+ /** The HTML content */
8
+ html: string
9
+ /** Response headers */
10
+ headers: Record<string, string | string[] | undefined>
11
+ /** Final URL after redirects */
12
+ url: string
13
+ /** HTTP status code */
14
+ statusCode: number
15
+ /** Redirect history */
16
+ redirects: Array<{ statusCode: number; url: string }>
17
+ /** Mode used: 'fetch' or 'prerender' */
18
+ mode: 'fetch' | 'prerender'
19
+ /** Parsed HTML (Cheerio) */
20
+ $?: CheerioAPI
21
+ /** Statistics about the request */
22
+ stats: {
23
+ mode: 'fetch' | 'prerender'
24
+ timing: number
25
+ }
26
+ }
27
+
28
+ /**
29
+ * Options for html-get
30
+ */
31
+ export interface HtmlGetOptions {
32
+ /** Character encoding for HTML (default: 'utf-8') */
33
+ encoding?: string
34
+ /** Function that returns a browserless instance (required unless prerender is false) */
35
+ getBrowserless?: () => Promise<any>
36
+ /** Function to determine the mode ('fetch' or 'prerender') */
37
+ getMode?: (url: string, options: { prerender: boolean | 'auto' }) => 'fetch' | 'prerender'
38
+ /** Function to create temporary files */
39
+ getTemporalFile?: (input: string, ext?: string) => { path: string }
40
+ /** Options passed to got (the HTTP client) */
41
+ gotOpts?: Record<string, any>
42
+ /** Request headers */
43
+ headers?: Record<string, string>
44
+ /** Mutool function for PDF processing, or false to disable */
45
+ mutool?: ((...args: string[]) => any) | false
46
+ /** Prerender mode: true, false, or 'auto' (default) */
47
+ prerender?: boolean | 'auto'
48
+ /** Options passed to Puppeteer */
49
+ puppeteerOpts?: Record<string, any>
50
+ /** Rewrite relative URLs to absolute */
51
+ rewriteUrls?: boolean
52
+ /** Rewrite common HTML meta tag mistakes */
53
+ rewriteHtml?: boolean
54
+ /** Function to serialize HTML (default: $ => ({ html: $.html() })) */
55
+ serializeHtml?: ($: CheerioAPI) => { html: string }
56
+ }
57
+
58
+ /**
59
+ * Main function to get HTML from a URL
60
+ */
61
+ export function htmlGet(
62
+ targetUrl: string,
63
+ options?: HtmlGetOptions
64
+ ): Promise<HtmlGetResult>
65
+
66
+ /**
67
+ * Check if a URL should use 'fetch' mode (no prerender needed)
68
+ */
69
+ export function isFetchMode(url: string): boolean
70
+
71
+ /**
72
+ * Get content directly with a specific mode
73
+ */
74
+ export function getContent(
75
+ url: string,
76
+ mode: 'fetch' | 'prerender',
77
+ options?: HtmlGetOptions
78
+ ): Promise<HtmlGetResult>
79
+
80
+ /**
81
+ * Default mutool function (returns undefined if mutool is not installed)
82
+ */
83
+ export function defaultMutool(): ((...args: string[]) => any) | undefined
84
+
85
+ /**
86
+ * Default request timeout in milliseconds
87
+ */
88
+ export const REQ_TIMEOUT: number
89
+
90
+ /**
91
+ * Default abort types for prerendering
92
+ */
93
+ export const ABORT_TYPES: string[]
94
+
95
+ /**
96
+ * PDF size threshold in bytes (150KB)
97
+ */
98
+ export const PDF_SIZE_TRESHOLD: number
99
+
100
+ export default htmlGet
package/package.json CHANGED
@@ -2,7 +2,8 @@
2
2
  "name": "html-get",
3
3
  "description": "Get the HTML from any website, fine-tuned for correction & speed",
4
4
  "homepage": "https://nicedoc.com/microlinkhq/html-get",
5
- "version": "2.21.20",
5
+ "types": "index.d.ts",
6
+ "version": "2.21.22",
6
7
  "main": "src/index.js",
7
8
  "bin": {
8
9
  "html-get": "bin/index.js"
@@ -16,6 +17,10 @@
16
17
  {
17
18
  "name": "Divyansh Singh",
18
19
  "email": "40380293+brc-dd@users.noreply.github.com"
20
+ },
21
+ {
22
+ "name": "Michael Lip",
23
+ "email": "51033404+theluckystrike@users.noreply.github.com"
19
24
  }
20
25
  ],
21
26
  "repository": {
@@ -86,6 +91,7 @@
86
91
  },
87
92
  "files": [
88
93
  "bin",
94
+ "index.d.ts",
89
95
  "scripts",
90
96
  "src"
91
97
  ],
@@ -17,7 +17,6 @@ const domains = [
17
17
  [['domainWithoutSuffix', 'csdn']],
18
18
  [['domainWithoutSuffix', 'deviantart']],
19
19
  [['domainWithoutSuffix', 'digg']],
20
- [['domainWithoutSuffix', 'dribbble']],
21
20
  [['domainWithoutSuffix', 'engadget']],
22
21
  [['domainWithoutSuffix', 'etsy']],
23
22
  [['domainWithoutSuffix', 'eventbrite']],
@@ -78,6 +77,7 @@ const { top, rest } = reduce(
78
77
  { top: new Array(topsites.length), rest: [] }
79
78
  )
80
79
 
81
- writeFile('./src/auto-domains.json', JSON.stringify(compact(top).concat(rest), null, 2)).catch(
82
- error => console.log(error)
83
- )
80
+ writeFile(
81
+ './src/auto-domains.json',
82
+ JSON.stringify(compact(top).concat(rest), null, 2)
83
+ ).catch(error => console.log(error))
@@ -44,67 +44,67 @@
44
44
  [
45
45
  [
46
46
  "domainWithoutSuffix",
47
- "github"
47
+ "vimeo"
48
48
  ]
49
49
  ],
50
50
  [
51
51
  [
52
52
  "domainWithoutSuffix",
53
- "vimeo"
53
+ "github"
54
54
  ]
55
55
  ],
56
56
  [
57
57
  [
58
58
  "domainWithoutSuffix",
59
- "nytimes"
59
+ "bbc"
60
60
  ]
61
61
  ],
62
62
  [
63
63
  [
64
64
  "domainWithoutSuffix",
65
- "bbc"
65
+ "nytimes"
66
66
  ]
67
67
  ],
68
68
  [
69
69
  [
70
70
  "domainWithoutSuffix",
71
- "imdb"
71
+ "theguardian"
72
72
  ]
73
73
  ],
74
74
  [
75
75
  [
76
76
  "domainWithoutSuffix",
77
- "slideshare"
77
+ "imdb"
78
78
  ]
79
79
  ],
80
80
  [
81
81
  [
82
82
  "domainWithoutSuffix",
83
- "theguardian"
83
+ "pinterest"
84
84
  ]
85
85
  ],
86
86
  [
87
87
  [
88
88
  "domainWithoutSuffix",
89
- "spotify"
89
+ "telegraph"
90
90
  ]
91
91
  ],
92
92
  [
93
93
  [
94
94
  "domainWithoutSuffix",
95
- "huffingtonpost"
95
+ "slideshare"
96
96
  ]
97
97
  ],
98
98
  [
99
99
  [
100
100
  "domainWithoutSuffix",
101
- "pinterest"
101
+ "huffingtonpost"
102
102
  ]
103
103
  ],
104
104
  [
105
105
  [
106
106
  "domainWithoutSuffix",
107
- "telegraph"
107
+ "spotify"
108
108
  ]
109
109
  ],
110
110
  [
@@ -122,19 +122,19 @@
122
122
  [
123
123
  [
124
124
  "domainWithoutSuffix",
125
- "techcrunch"
125
+ "engadget"
126
126
  ]
127
127
  ],
128
128
  [
129
129
  [
130
130
  "domainWithoutSuffix",
131
- "zoom"
131
+ "techcrunch"
132
132
  ]
133
133
  ],
134
134
  [
135
135
  [
136
136
  "domainWithoutSuffix",
137
- "engadget"
137
+ "zoom"
138
138
  ]
139
139
  ],
140
140
  [
@@ -146,121 +146,121 @@
146
146
  [
147
147
  [
148
148
  "domainWithoutSuffix",
149
- "eventbrite"
149
+ "arxiv"
150
150
  ]
151
151
  ],
152
152
  [
153
153
  [
154
154
  "domainWithoutSuffix",
155
- "yelp"
155
+ "eventbrite"
156
156
  ]
157
157
  ],
158
158
  [
159
159
  [
160
160
  "domainWithoutSuffix",
161
- "theverge"
161
+ "yelp"
162
162
  ]
163
163
  ],
164
164
  [
165
165
  [
166
166
  "domainWithoutSuffix",
167
- "arxiv"
167
+ "theverge"
168
168
  ]
169
169
  ],
170
170
  [
171
171
  [
172
172
  "domainWithoutSuffix",
173
- "imgur"
173
+ "dribbble"
174
174
  ]
175
175
  ],
176
176
  [
177
177
  [
178
- "domainWithoutSuffix",
179
- "reddit"
178
+ "domain",
179
+ "bsky.app"
180
180
  ]
181
181
  ],
182
182
  [
183
183
  [
184
184
  "domainWithoutSuffix",
185
- "stackoverflow"
185
+ "csdn"
186
186
  ]
187
187
  ],
188
188
  [
189
189
  [
190
190
  "domainWithoutSuffix",
191
- "flickr"
191
+ "deviantart"
192
192
  ]
193
193
  ],
194
194
  [
195
195
  [
196
196
  "domainWithoutSuffix",
197
- "sourceforge"
197
+ "digg"
198
198
  ]
199
199
  ],
200
200
  [
201
201
  [
202
202
  "domainWithoutSuffix",
203
- "dribbble"
203
+ "etsy"
204
204
  ]
205
205
  ],
206
206
  [
207
207
  [
208
- "domain",
209
- "bsky.app"
208
+ "domainWithoutSuffix",
209
+ "flickr"
210
210
  ]
211
211
  ],
212
212
  [
213
213
  [
214
214
  "domainWithoutSuffix",
215
- "csdn"
215
+ "ghost"
216
216
  ]
217
217
  ],
218
218
  [
219
219
  [
220
220
  "domainWithoutSuffix",
221
- "deviantart"
221
+ "giphy"
222
222
  ]
223
223
  ],
224
224
  [
225
225
  [
226
226
  "domainWithoutSuffix",
227
- "digg"
227
+ "gitlab"
228
228
  ]
229
229
  ],
230
230
  [
231
231
  [
232
232
  "domainWithoutSuffix",
233
- "etsy"
233
+ "imgur"
234
234
  ]
235
235
  ],
236
236
  [
237
237
  [
238
238
  "domainWithoutSuffix",
239
- "ghost"
239
+ "meetup"
240
240
  ]
241
241
  ],
242
242
  [
243
243
  [
244
244
  "domainWithoutSuffix",
245
- "giphy"
245
+ "producthunt"
246
246
  ]
247
247
  ],
248
248
  [
249
249
  [
250
250
  "domainWithoutSuffix",
251
- "gitlab"
251
+ "reddit"
252
252
  ]
253
253
  ],
254
254
  [
255
255
  [
256
256
  "domainWithoutSuffix",
257
- "meetup"
257
+ "sourceforge"
258
258
  ]
259
259
  ],
260
260
  [
261
261
  [
262
262
  "domainWithoutSuffix",
263
- "producthunt"
263
+ "stackoverflow"
264
264
  ]
265
265
  ],
266
266
  [