@tothalex/cloud 0.0.44 → 0.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@tothalex/cloud",
-   "version": "0.0.44",
+   "version": "0.0.46",
    "main": "",
    "types": "./src/index.d.ts",
    "files": [
package/src/cloud/spider.d.ts ADDED
@@ -0,0 +1,122 @@
+ declare module 'cloud/spider' {
+   /** The rendering driver to use */
+   type SpiderDriver = 'http' | 'chrome' | 'smart'
+
+   /** Redirect policy */
+   type RedirectPolicy = 'loose' | 'strict' | 'none'
+
+   /** Options shared by all spider functions */
+   interface SpiderBaseOptions {
+     /** Rendering driver: 'http' (default), 'chrome' (headless Chrome), or 'smart' (auto-detect) */
+     driver?: SpiderDriver
+     /** Custom HTTP headers */
+     headers?: Record<string, string>
+     /** User-Agent string */
+     userAgent?: string
+     /** Proxy URL (e.g., 'http://user:pass@proxy:8080', supports SOCKS) */
+     proxy?: string
+     /** Cookie string (e.g., 'foo=bar; baz=qux') */
+     cookies?: string
+     /** Per-request timeout in milliseconds (default: 15000) */
+     requestTimeout?: number
+     /** Accept invalid TLS certificates (default: false) */
+     acceptInsecureCerts?: boolean
+     /** Respect robots.txt (default: true for crawl/scrape, false for fetchPage) */
+     respectRobotsTxt?: boolean
+   }
+
+   /** Options for scrape and crawl */
+   interface CrawlOptions extends SpiderBaseOptions {
+     /** Maximum number of pages to visit */
+     limit?: number
+     /** Maximum crawl depth from the start URL (default: 25) */
+     depth?: number
+     /** Polite delay between requests in milliseconds */
+     delay?: number
+     /** Maximum concurrent requests */
+     concurrency?: number
+     /** Overall crawl timeout in milliseconds */
+     crawlTimeout?: number
+     /** Include subdomains (default: false) */
+     subdomains?: boolean
+     /** Include all TLDs for the domain (default: false) */
+     tld?: boolean
+     /** URL patterns to allow (regex; others are excluded) */
+     allowUrls?: string[]
+     /** URL patterns to block (regex) */
+     blockUrls?: string[]
+     /** Redirect policy: 'loose' (default), 'strict', 'none' */
+     redirectPolicy?: RedirectPolicy
+     /** Only crawl HTML pages, skip other resources (default: false) */
+     onlyHtml?: boolean
+
+     /** Wait for a CSS selector to appear before returning (chrome only) */
+     waitForSelector?: string
+     /** Wait for a fixed delay in milliseconds before returning (chrome only) */
+     waitForDelay?: number
+     /** Max timeout in milliseconds for all waitFor operations (chrome only, default: 30000) */
+     waitForTimeout?: number
+     /** Wait for network to become idle before returning (chrome only) */
+     waitForIdleNetwork?: boolean
+     /** Wait for page navigations/redirects to complete before returning (chrome only) */
+     waitForPageNavigations?: boolean
+   }
+
+   /** A crawled page with its content */
+   interface SpiderPage {
+     /** The URL of the page */
+     url: string
+     /** The HTML content */
+     html: string
+     /** HTTP status code */
+     status: number
+     /** Response headers */
+     headers: Record<string, string>
+     /** Final URL after redirects (if different from url) */
+     finalUrl?: string
+     /** Error message if the request failed */
+     error?: string
+   }
+
+   /**
+    * Fetch a single page and return its content.
+    * Defaults to the HTTP driver with no robots.txt check.
+    */
+   export function fetchPage(url: string, options?: SpiderBaseOptions): Promise<SpiderPage>
+
+   /**
+    * Crawl a website and return all pages with their HTML content.
+    */
+   export function scrape(url: string, options?: CrawlOptions): Promise<SpiderPage[]>
+
+   /**
+    * Crawl a website and return only the discovered URLs.
+    */
+   export function crawl(url: string, options?: CrawlOptions): Promise<string[]>
+
+   /**
+    * Stream-crawl a website, calling the callback for each page as it arrives.
+    * Returns the total number of pages streamed.
+    *
+    * @example
+    * ```js
+    * const count = await scrapeStream('https://example.com', { limit: 100 }, (page) => {
+    *   console.log(page.url, page.status)
+    * })
+    * ```
+    */
+   export function scrapeStream(
+     url: string,
+     options: CrawlOptions | undefined,
+     callback: (page: SpiderPage) => void,
+   ): Promise<number>
+
+   const spider: {
+     fetchPage: typeof fetchPage
+     scrape: typeof scrape
+     crawl: typeof crawl
+     scrapeStream: typeof scrapeStream
+   }
+
+   export default spider
+ }
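
For orientation, a minimal usage sketch based on the declarations added above. The import specifier follows the declared module name `cloud/spider`; runtime behavior and defaults beyond what the JSDoc states are assumptions, since this diff contains only type declarations.

```ts
import { fetchPage, scrape, crawl, scrapeStream } from 'cloud/spider'

async function demo() {
  // Fetch one page with the default HTTP driver (fetchPage skips robots.txt by default).
  const page = await fetchPage('https://example.com', { requestTimeout: 10000 })
  console.log(page.status, page.finalUrl ?? page.url)

  // Crawl HTML pages only, up to 50 pages and 2 levels deep, and collect their content.
  const pages = await scrape('https://example.com', { limit: 50, depth: 2, onlyHtml: true })
  console.log(pages.length, 'pages scraped')

  // Collect only the discovered URLs, including subdomains.
  const urls = await crawl('https://example.com', { subdomains: true })
  console.log(urls.length, 'URLs found')

  // Stream pages as they arrive; resolves with the number of pages streamed.
  const count = await scrapeStream('https://example.com', { limit: 100 }, (p) => {
    console.log(p.url, p.status)
  })
  console.log(count, 'pages streamed')
}

demo().catch(console.error)
```
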
package/src/index.d.ts CHANGED
@@ -5,6 +5,7 @@
  /// <reference types="./cloud/postgres.d.ts" />
  /// <reference types="./cloud/secret.d.ts" />
  /// <reference types="./cloud/uuid.d.ts" />
+ /// <reference types="./cloud/spider.d.ts" />

  /// <reference types="./assert.d.ts" />
  /// <reference types="./buffer.d.ts" />