@tothalex/cloud 0.0.44 → 0.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tothalex/cloud",
3
- "version": "0.0.44",
3
+ "version": "0.0.45",
4
4
  "main": "",
5
5
  "types": "./src/index.d.ts",
6
6
  "files": [
package/src/cloud/spider.d.ts ADDED
@@ -0,0 +1,111 @@
1
+ declare module 'cloud/spider' {
2
+ /** The rendering driver to use */
3
+ type SpiderDriver = 'http' | 'chrome' | 'smart'
4
+
5
+ /** Redirect policy */
6
+ type RedirectPolicy = 'loose' | 'strict' | 'none'
7
+
8
+ /** Options shared by all spider functions */
9
+ interface SpiderBaseOptions {
10
+ /** Rendering driver: 'http' (default), 'chrome' (headless Chrome), or 'smart' (auto-detect) */
11
+ driver?: SpiderDriver
12
+ /** Custom HTTP headers */
13
+ headers?: Record<string, string>
14
+ /** User-Agent string */
15
+ userAgent?: string
16
+ /** Proxy URL (e.g., 'http://user:pass@proxy:8080', supports SOCKS) */
17
+ proxy?: string
18
+ /** Cookie string (e.g., 'foo=bar; baz=qux') */
19
+ cookies?: string
20
+ /** Per-request timeout in milliseconds (default: 15000) */
21
+ requestTimeout?: number
22
+ /** Accept invalid TLS certificates (default: false) */
23
+ acceptInsecureCerts?: boolean
24
+ /** Respect robots.txt (default: true for crawl/scrape, false for fetchPage) */
25
+ respectRobotsTxt?: boolean
26
+ }
27
+
28
+ /** Options for scrape and crawl */
29
+ interface CrawlOptions extends SpiderBaseOptions {
30
+ /** Maximum number of pages to visit */
31
+ limit?: number
32
+ /** Maximum crawl depth from the start URL (default: 25) */
33
+ depth?: number
34
+ /** Polite delay between requests in milliseconds */
35
+ delay?: number
36
+ /** Maximum concurrent requests */
37
+ concurrency?: number
38
+ /** Overall crawl timeout in milliseconds */
39
+ crawlTimeout?: number
40
+ /** Include subdomains (default: false) */
41
+ subdomains?: boolean
42
+ /** Include all TLDs for the domain (default: false) */
43
+ tld?: boolean
44
+ /** URL patterns to allow (regex; others are excluded) */
45
+ allowUrls?: string[]
46
+ /** URL patterns to block (regex) */
47
+ blockUrls?: string[]
48
+ /** Redirect policy: 'loose' (default), 'strict', 'none' */
49
+ redirectPolicy?: RedirectPolicy
50
+ /** Only crawl HTML pages, skip other resources (default: false) */
51
+ onlyHtml?: boolean
52
+ }
53
+
54
+ /** A crawled page with its content */
55
+ interface SpiderPage {
56
+ /** The URL of the page */
57
+ url: string
58
+ /** The HTML content */
59
+ html: string
60
+ /** HTTP status code */
61
+ status: number
62
+ /** Response headers */
63
+ headers: Record<string, string>
64
+ /** Final URL after redirects (if different from url) */
65
+ finalUrl?: string
66
+ /** Error message if the request failed */
67
+ error?: string
68
+ }
69
+
70
+ /**
71
+ * Fetch a single page and return its content.
72
+ * Defaults to HTTP driver with no robots.txt check.
73
+ */
74
+ export function fetchPage(url: string, options?: SpiderBaseOptions): Promise<SpiderPage>
75
+
76
+ /**
77
+ * Crawl a website and return all pages with their HTML content.
78
+ */
79
+ export function scrape(url: string, options?: CrawlOptions): Promise<SpiderPage[]>
80
+
81
+ /**
82
+ * Crawl a website and return only the discovered URLs.
83
+ */
84
+ export function crawl(url: string, options?: CrawlOptions): Promise<string[]>
85
+
86
+ /**
87
+ * Stream-crawl a website, calling the callback for each page as it arrives.
88
+ * Returns the total number of pages streamed.
89
+ *
90
+ * @example
91
+ * ```js
92
+ * const count = await scrapeStream('https://example.com', { limit: 100 }, (page) => {
93
+ * console.log(page.url, page.status)
94
+ * })
95
+ * ```
96
+ */
97
+ export function scrapeStream(
98
+ url: string,
99
+ options: CrawlOptions | undefined,
100
+ callback: (page: SpiderPage) => void,
101
+ ): Promise<number>
102
+
103
+ const spider: {
104
+ fetchPage: typeof fetchPage
105
+ scrape: typeof scrape
106
+ crawl: typeof crawl
107
+ scrapeStream: typeof scrapeStream
108
+ }
109
+
110
+ export default spider
111
+ }
package/src/index.d.ts CHANGED
@@ -5,6 +5,7 @@
5
5
  /// <reference types="./cloud/postgres.d.ts" />
6
6
  /// <reference types="./cloud/secret.d.ts" />
7
7
  /// <reference types="./cloud/uuid.d.ts" />
8
+ /// <reference types="./cloud/spider.d.ts" />
8
9
 
9
10
  /// <reference types="./assert.d.ts" />
10
11
  /// <reference types="./buffer.d.ts" />