@tothalex/cloud 0.0.41 → 0.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cloud/got.d.ts +64 -0
- package/src/cloud/spider.d.ts +111 -0
- package/src/index.d.ts +2 -0
package/package.json
CHANGED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
 * Ambient typings for the `cloud/got` HTTP client module.
 *
 * The request helpers are available both as named exports (`get`, `post`,
 * `put`, `delete`) and as members of the default-exported `got` object.
 */
declare module 'cloud/got' {
  /** Options accepted by every request helper in this module. */
  export interface GotOptions {
    /** Request timeout in milliseconds */
    timeout?: number
    /** HTTP headers */
    headers?: Record<string, string>
    /** HTTP method (GET, POST, PUT, DELETE, PATCH, HEAD) */
    method?: string
    /** JSON body (automatically sets Content-Type: application/json) */
    json?: unknown
    /** Raw body string */
    body?: string
    /** Response type: 'json' (default) or 'text' */
    responseType?: 'json' | 'text'
  }

  /**
   * `GotOptions` with `responseType` pinned to `'text'`.
   *
   * Used only to select the overloads that resolve to a `GotResponse`
   * instead of parsed JSON.
   */
  export interface GotTextOptions extends GotOptions {
    responseType: 'text'
  }

  /**
   * Response object that the request helpers document as the alternative to
   * a parsed JSON result.
   */
  export interface GotResponse {
    body: string
    statusCode: number
    headers: Record<string, string>
  }

  /**
   * Make a GET request and return the `GotResponse` object.
   *
   * NOTE(review): the mapping "`responseType: 'text'` resolves to
   * `GotResponse`" is taken from the original `@returns` wording
   * ("Parsed JSON response (when responseType is 'json') or GotResponse
   * object") — confirm against the runtime implementation.
   *
   * @param url The URL to request
   * @param options Request options with `responseType: 'text'`
   * @returns The `GotResponse` object
   */
  export function get(url: string, options: GotTextOptions): Promise<GotResponse>
  /**
   * Make a GET request and return parsed JSON
   *
   * `T` is asserted by the caller; these typings do not validate the payload.
   *
   * @param url The URL to request
   * @param options Request options
   * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
   */
  export function get<T = unknown>(url: string, options?: GotOptions): Promise<T>

  /**
   * Make a POST request and return the `GotResponse` object.
   *
   * NOTE(review): `'text'` → `GotResponse` follows the original `@returns`
   * wording — confirm against the runtime implementation.
   *
   * @param url The URL to request
   * @param options Request options with `responseType: 'text'`
   * @returns The `GotResponse` object
   */
  export function post(url: string, options: GotTextOptions): Promise<GotResponse>
  /**
   * Make a POST request and return parsed JSON
   *
   * `T` is asserted by the caller; these typings do not validate the payload.
   *
   * @param url The URL to request
   * @param options Request options
   * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
   */
  export function post<T = unknown>(url: string, options?: GotOptions): Promise<T>

  /**
   * Make a PUT request and return the `GotResponse` object.
   *
   * NOTE(review): `'text'` → `GotResponse` follows the original `@returns`
   * wording — confirm against the runtime implementation.
   *
   * @param url The URL to request
   * @param options Request options with `responseType: 'text'`
   * @returns The `GotResponse` object
   */
  export function put(url: string, options: GotTextOptions): Promise<GotResponse>
  /**
   * Make a PUT request and return parsed JSON
   *
   * `T` is asserted by the caller; these typings do not validate the payload.
   *
   * @param url The URL to request
   * @param options Request options
   * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
   */
  export function put<T = unknown>(url: string, options?: GotOptions): Promise<T>

  /**
   * Make a DELETE request and return the `GotResponse` object.
   *
   * NOTE(review): `'text'` → `GotResponse` follows the original `@returns`
   * wording — confirm against the runtime implementation.
   *
   * @param url The URL to request
   * @param options Request options with `responseType: 'text'`
   * @returns The `GotResponse` object
   */
  function _delete(url: string, options: GotTextOptions): Promise<GotResponse>
  /**
   * Make a DELETE request and return parsed JSON
   *
   * `T` is asserted by the caller; these typings do not validate the payload.
   *
   * @param url The URL to request
   * @param options Request options
   * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
   */
  function _delete<T = unknown>(url: string, options?: GotOptions): Promise<T>
  // `delete` is a reserved word, so the function is declared as `_delete`
  // and re-exported under its public name.
  export { _delete as delete }

  /** Object form of the API: the same four helpers grouped under one value. */
  const got: {
    get: typeof get
    post: typeof post
    put: typeof put
    delete: typeof _delete
  }

  export default got
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
 * Ambient typings for the `cloud/spider` crawling module.
 *
 * The functions are available both as named exports and as members of the
 * default-exported `spider` object. All option and result types are exported
 * so callers can annotate their own variables and callbacks; because this
 * module has an `export default`, nothing is exported implicitly.
 */
declare module 'cloud/spider' {
  /** The rendering driver to use */
  export type SpiderDriver = 'http' | 'chrome' | 'smart'

  /** Redirect policy */
  export type RedirectPolicy = 'loose' | 'strict' | 'none'

  /** Options shared by all spider functions */
  export interface SpiderBaseOptions {
    /** Rendering driver: 'http' (default), 'chrome' (headless Chrome), or 'smart' (auto-detect) */
    driver?: SpiderDriver
    /** Custom HTTP headers */
    headers?: Record<string, string>
    /** User-Agent string */
    userAgent?: string
    /** Proxy URL (e.g., 'http://user:pass@proxy:8080', supports SOCKS) */
    proxy?: string
    /** Cookie string (e.g., 'foo=bar; baz=qux') */
    cookies?: string
    /** Per-request timeout in milliseconds (default: 15000) */
    requestTimeout?: number
    /** Accept invalid TLS certificates (default: false) */
    acceptInsecureCerts?: boolean
    /** Respect robots.txt (default: true for crawl/scrape, false for fetchPage) */
    respectRobotsTxt?: boolean
  }

  /** Options for scrape and crawl */
  export interface CrawlOptions extends SpiderBaseOptions {
    /** Maximum number of pages to visit */
    limit?: number
    /** Maximum crawl depth from the start URL (default: 25) */
    depth?: number
    /** Polite delay between requests in milliseconds */
    delay?: number
    /** Maximum concurrent requests */
    concurrency?: number
    /** Overall crawl timeout in milliseconds */
    crawlTimeout?: number
    /** Include subdomains (default: false) */
    subdomains?: boolean
    /** Include all TLDs for the domain (default: false) */
    tld?: boolean
    /** URL patterns to allow (regex; others are excluded) */
    allowUrls?: string[]
    /** URL patterns to block (regex) */
    blockUrls?: string[]
    /** Redirect policy: 'loose' (default), 'strict', 'none' */
    redirectPolicy?: RedirectPolicy
    /** Only crawl HTML pages, skip other resources (default: false) */
    onlyHtml?: boolean
  }

  /** A crawled page with its content */
  export interface SpiderPage {
    /** The URL of the page */
    url: string
    /** The HTML content */
    html: string
    /** HTTP status code */
    status: number
    /** Response headers */
    headers: Record<string, string>
    /** Final URL after redirects (if different from url) */
    finalUrl?: string
    /** Error message if the request failed */
    error?: string
  }

  /**
   * Fetch a single page and return its content.
   * Defaults to HTTP driver with no robots.txt check.
   *
   * @param url The page to fetch
   * @param options Per-request options
   * @returns The fetched page
   */
  export function fetchPage(url: string, options?: SpiderBaseOptions): Promise<SpiderPage>

  /**
   * Crawl a website and return all pages with their HTML content.
   *
   * @param url The start URL
   * @param options Crawl options
   * @returns Every crawled page
   */
  export function scrape(url: string, options?: CrawlOptions): Promise<SpiderPage[]>

  /**
   * Crawl a website and return only the discovered URLs.
   *
   * @param url The start URL
   * @param options Crawl options
   * @returns The discovered URLs
   */
  export function crawl(url: string, options?: CrawlOptions): Promise<string[]>

  /**
   * Stream-crawl a website, calling the callback for each page as it arrives.
   * Returns the total number of pages streamed.
   *
   * @param url The start URL
   * @param options Crawl options; this positional argument is required, so
   *   pass `undefined` when no options are needed
   * @param callback Called with each page as it arrives
   * @returns The total number of pages streamed
   *
   * @example
   * ```js
   * const count = await scrapeStream('https://example.com', { limit: 100 }, (page) => {
   *   console.log(page.url, page.status)
   * })
   * ```
   */
  export function scrapeStream(
    url: string,
    options: CrawlOptions | undefined,
    callback: (page: SpiderPage) => void,
  ): Promise<number>

  /** Object form of the API: the same four functions grouped under one value. */
  const spider: {
    fetchPage: typeof fetchPage
    scrape: typeof scrape
    crawl: typeof crawl
    scrapeStream: typeof scrapeStream
  }

  export default spider
}
|
package/src/index.d.ts
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
/// <reference types="./cloud/cache.d.ts" />
|
|
2
2
|
/// <reference types="./cloud/event.d.ts" />
|
|
3
|
+
/// <reference types="./cloud/got.d.ts" />
|
|
3
4
|
/// <reference types="./cloud/index.d.ts" />
|
|
4
5
|
/// <reference types="./cloud/postgres.d.ts" />
|
|
5
6
|
/// <reference types="./cloud/secret.d.ts" />
|
|
6
7
|
/// <reference types="./cloud/uuid.d.ts" />
|
|
8
|
+
/// <reference types="./cloud/spider.d.ts" />
|
|
7
9
|
|
|
8
10
|
/// <reference types="./assert.d.ts" />
|
|
9
11
|
/// <reference types="./buffer.d.ts" />
|