@tothalex/cloud 0.0.44 → 0.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cloud/spider.d.ts +122 -0
- package/src/index.d.ts +1 -0
package/package.json
CHANGED

package/src/cloud/spider.d.ts
ADDED
@@ -0,0 +1,122 @@
+declare module 'cloud/spider' {
+  /** The rendering driver to use */
+  type SpiderDriver = 'http' | 'chrome' | 'smart'
+
+  /** Redirect policy */
+  type RedirectPolicy = 'loose' | 'strict' | 'none'
+
+  /** Options shared by all spider functions */
+  interface SpiderBaseOptions {
+    /** Rendering driver: 'http' (default), 'chrome' (headless Chrome), or 'smart' (auto-detect) */
+    driver?: SpiderDriver
+    /** Custom HTTP headers */
+    headers?: Record<string, string>
+    /** User-Agent string */
+    userAgent?: string
+    /** Proxy URL (e.g., 'http://user:pass@proxy:8080', supports SOCKS) */
+    proxy?: string
+    /** Cookie string (e.g., 'foo=bar; baz=qux') */
+    cookies?: string
+    /** Per-request timeout in milliseconds (default: 15000) */
+    requestTimeout?: number
+    /** Accept invalid TLS certificates (default: false) */
+    acceptInsecureCerts?: boolean
+    /** Respect robots.txt (default: true for crawl/scrape, false for fetchPage) */
+    respectRobotsTxt?: boolean
+  }
+
+  /** Options for scrape and crawl */
+  interface CrawlOptions extends SpiderBaseOptions {
+    /** Maximum number of pages to visit */
+    limit?: number
+    /** Maximum crawl depth from the start URL (default: 25) */
+    depth?: number
+    /** Polite delay between requests in milliseconds */
+    delay?: number
+    /** Maximum concurrent requests */
+    concurrency?: number
+    /** Overall crawl timeout in milliseconds */
+    crawlTimeout?: number
+    /** Include subdomains (default: false) */
+    subdomains?: boolean
+    /** Include all TLDs for the domain (default: false) */
+    tld?: boolean
+    /** URL patterns to allow (regex; others are excluded) */
+    allowUrls?: string[]
+    /** URL patterns to block (regex) */
+    blockUrls?: string[]
+    /** Redirect policy: 'loose' (default), 'strict', 'none' */
+    redirectPolicy?: RedirectPolicy
+    /** Only crawl HTML pages, skip other resources (default: false) */
+    onlyHtml?: boolean
+
+    /** Wait for a CSS selector to appear before returning (chrome only) */
+    waitForSelector?: string
+    /** Wait for a fixed delay in milliseconds before returning (chrome only) */
+    waitForDelay?: number
+    /** Max timeout in milliseconds for all waitFor operations (chrome only, default: 30000) */
+    waitForTimeout?: number
+    /** Wait for network to become idle before returning (chrome only) */
+    waitForIdleNetwork?: boolean
+    /** Wait for page navigations/redirects to complete before returning (chrome only) */
+    waitForPageNavigations?: boolean
+  }
+
+  /** A crawled page with its content */
+  interface SpiderPage {
+    /** The URL of the page */
+    url: string
+    /** The HTML content */
+    html: string
+    /** HTTP status code */
+    status: number
+    /** Response headers */
+    headers: Record<string, string>
+    /** Final URL after redirects (if different from url) */
+    finalUrl?: string
+    /** Error message if the request failed */
+    error?: string
+  }
+
+  /**
+   * Fetch a single page and return its content.
+   * Defaults to HTTP driver with no robots.txt check.
+   */
+  export function fetchPage(url: string, options?: SpiderBaseOptions): Promise<SpiderPage>
+
+  /**
+   * Crawl a website and return all pages with their HTML content.
+   */
+  export function scrape(url: string, options?: CrawlOptions): Promise<SpiderPage[]>
+
+  /**
+   * Crawl a website and return only the discovered URLs.
+   */
+  export function crawl(url: string, options?: CrawlOptions): Promise<string[]>
+
+  /**
+   * Stream-crawl a website, calling the callback for each page as it arrives.
+   * Returns the total number of pages streamed.
+   *
+   * @example
+   * ```js
+   * const count = await scrapeStream('https://example.com', { limit: 100 }, (page) => {
+   *   console.log(page.url, page.status)
+   * })
+   * ```
+   */
+  export function scrapeStream(
+    url: string,
+    options: CrawlOptions | undefined,
+    callback: (page: SpiderPage) => void,
+  ): Promise<number>
+
+  const spider: {
+    fetchPage: typeof fetchPage
+    scrape: typeof scrape
+    crawl: typeof crawl
+    scrapeStream: typeof scrapeStream
+  }
+
+  export default spider
+}
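
The declaration above only describes types; the sketch below is a minimal usage example, assuming the `cloud/spider` module is actually resolvable at runtime in the @tothalex/cloud environment. The URL and option values are illustrative, not part of the diff.

```ts
import spider, { crawl, fetchPage } from 'cloud/spider'

// Fetch one page over plain HTTP (the default driver) with a custom header.
const page = await fetchPage('https://example.com', {
  headers: { 'Accept-Language': 'en' },
  requestTimeout: 10_000,
})
console.log(page.status, page.html.length)

// Discover up to 50 URLs, two levels deep, staying on the exact host.
const urls = await crawl('https://example.com', { limit: 50, depth: 2, subdomains: false })
console.log(urls.length, 'URLs found')

// The default export bundles the same functions.
const pages = await spider.scrape('https://example.com', { limit: 10, onlyHtml: true })
```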
package/src/index.d.ts
CHANGED
@@ -5,6 +5,7 @@
 /// <reference types="./cloud/postgres.d.ts" />
 /// <reference types="./cloud/secret.d.ts" />
 /// <reference types="./cloud/uuid.d.ts" />
+/// <reference types="./cloud/spider.d.ts" />
 
 /// <reference types="./assert.d.ts" />
 /// <reference types="./buffer.d.ts" />