@tothalex/cloud 0.0.41 → 0.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tothalex/cloud",
3
- "version": "0.0.41",
3
+ "version": "0.0.45",
4
4
  "main": "",
5
5
  "types": "./src/index.d.ts",
6
6
  "files": [
package/src/cloud/got.d.ts ADDED
@@ -0,0 +1,64 @@
1
+ declare module 'cloud/got' {
2
+ export interface GotOptions {
3
+ /** Request timeout in milliseconds */
4
+ timeout?: number
5
+ /** HTTP headers */
6
+ headers?: Record<string, string>
7
+ /** HTTP method (GET, POST, PUT, DELETE, PATCH, HEAD) */
8
+ method?: string
9
+ /** JSON body (automatically sets Content-Type: application/json) */
10
+ json?: unknown
11
+ /** Raw body string */
12
+ body?: string
13
+ /** Response type: 'json' (default) or 'text' */
14
+ responseType?: 'json' | 'text'
15
+ }
16
+
17
+ export interface GotResponse {
18
+ body: string
19
+ statusCode: number
20
+ headers: Record<string, string>
21
+ }
22
+
23
+ /**
24
+ * Make a GET request and return parsed JSON
25
+ * @param url The URL to request
26
+ * @param options Request options
27
+ * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
28
+ */
29
+ export function get<T = unknown>(url: string, options?: GotOptions): Promise<T>
30
+
31
+ /**
32
+ * Make a POST request and return parsed JSON
33
+ * @param url The URL to request
34
+ * @param options Request options
35
+ * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
36
+ */
37
+ export function post<T = unknown>(url: string, options?: GotOptions): Promise<T>
38
+
39
+ /**
40
+ * Make a PUT request and return parsed JSON
41
+ * @param url The URL to request
42
+ * @param options Request options
43
+ * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
44
+ */
45
+ export function put<T = unknown>(url: string, options?: GotOptions): Promise<T>
46
+
47
+ /**
48
+ * Make a DELETE request and return parsed JSON
49
+ * @param url The URL to request
50
+ * @param options Request options
51
+ * @returns Parsed JSON response (when responseType is 'json') or GotResponse object
52
+ */
53
+ function _delete<T = unknown>(url: string, options?: GotOptions): Promise<T>
54
+ export { _delete as delete }
55
+
56
+ const got: {
57
+ get: typeof get
58
+ post: typeof post
59
+ put: typeof put
60
+ delete: typeof _delete
61
+ }
62
+
63
+ export default got
64
+ }
package/src/cloud/spider.d.ts ADDED
@@ -0,0 +1,111 @@
1
+ declare module 'cloud/spider' {
2
+ /** The rendering driver to use */
3
+ type SpiderDriver = 'http' | 'chrome' | 'smart'
4
+
5
+ /** Redirect policy */
6
+ type RedirectPolicy = 'loose' | 'strict' | 'none'
7
+
8
+ /** Options shared by all spider functions */
9
+ interface SpiderBaseOptions {
10
+ /** Rendering driver: 'http' (default), 'chrome' (headless Chrome), or 'smart' (auto-detect) */
11
+ driver?: SpiderDriver
12
+ /** Custom HTTP headers */
13
+ headers?: Record<string, string>
14
+ /** User-Agent string */
15
+ userAgent?: string
16
+ /** Proxy URL (e.g., 'http://user:pass@proxy:8080', supports SOCKS) */
17
+ proxy?: string
18
+ /** Cookie string (e.g., 'foo=bar; baz=qux') */
19
+ cookies?: string
20
+ /** Per-request timeout in milliseconds (default: 15000) */
21
+ requestTimeout?: number
22
+ /** Accept invalid TLS certificates (default: false) */
23
+ acceptInsecureCerts?: boolean
24
+ /** Respect robots.txt (default: true for crawl/scrape, false for fetchPage) */
25
+ respectRobotsTxt?: boolean
26
+ }
27
+
28
+ /** Options for scrape and crawl */
29
+ interface CrawlOptions extends SpiderBaseOptions {
30
+ /** Maximum number of pages to visit */
31
+ limit?: number
32
+ /** Maximum crawl depth from the start URL (default: 25) */
33
+ depth?: number
34
+ /** Polite delay between requests in milliseconds */
35
+ delay?: number
36
+ /** Maximum concurrent requests */
37
+ concurrency?: number
38
+ /** Overall crawl timeout in milliseconds */
39
+ crawlTimeout?: number
40
+ /** Include subdomains (default: false) */
41
+ subdomains?: boolean
42
+ /** Include all TLDs for the domain (default: false) */
43
+ tld?: boolean
44
+ /** URL patterns to allow (regex; others are excluded) */
45
+ allowUrls?: string[]
46
+ /** URL patterns to block (regex) */
47
+ blockUrls?: string[]
48
+ /** Redirect policy: 'loose' (default), 'strict', 'none' */
49
+ redirectPolicy?: RedirectPolicy
50
+ /** Only crawl HTML pages, skip other resources (default: false) */
51
+ onlyHtml?: boolean
52
+ }
53
+
54
+ /** A crawled page with its content */
55
+ interface SpiderPage {
56
+ /** The URL of the page */
57
+ url: string
58
+ /** The HTML content */
59
+ html: string
60
+ /** HTTP status code */
61
+ status: number
62
+ /** Response headers */
63
+ headers: Record<string, string>
64
+ /** Final URL after redirects (if different from url) */
65
+ finalUrl?: string
66
+ /** Error message if the request failed */
67
+ error?: string
68
+ }
69
+
70
+ /**
71
+ * Fetch a single page and return its content.
72
+ * Defaults to HTTP driver with no robots.txt check.
73
+ */
74
+ export function fetchPage(url: string, options?: SpiderBaseOptions): Promise<SpiderPage>
75
+
76
+ /**
77
+ * Crawl a website and return all pages with their HTML content.
78
+ */
79
+ export function scrape(url: string, options?: CrawlOptions): Promise<SpiderPage[]>
80
+
81
+ /**
82
+ * Crawl a website and return only the discovered URLs.
83
+ */
84
+ export function crawl(url: string, options?: CrawlOptions): Promise<string[]>
85
+
86
+ /**
87
+ * Stream-crawl a website, calling the callback for each page as it arrives.
88
+ * Returns the total number of pages streamed.
89
+ *
90
+ * @example
91
+ * ```js
92
+ * const count = await scrapeStream('https://example.com', { limit: 100 }, (page) => {
93
+ * console.log(page.url, page.status)
94
+ * })
95
+ * ```
96
+ */
97
+ export function scrapeStream(
98
+ url: string,
99
+ options: CrawlOptions | undefined,
100
+ callback: (page: SpiderPage) => void,
101
+ ): Promise<number>
102
+
103
+ const spider: {
104
+ fetchPage: typeof fetchPage
105
+ scrape: typeof scrape
106
+ crawl: typeof crawl
107
+ scrapeStream: typeof scrapeStream
108
+ }
109
+
110
+ export default spider
111
+ }
package/src/index.d.ts CHANGED
@@ -1,9 +1,11 @@
1
1
  /// <reference types="./cloud/cache.d.ts" />
2
2
  /// <reference types="./cloud/event.d.ts" />
3
+ /// <reference types="./cloud/got.d.ts" />
3
4
  /// <reference types="./cloud/index.d.ts" />
4
5
  /// <reference types="./cloud/postgres.d.ts" />
5
6
  /// <reference types="./cloud/secret.d.ts" />
6
7
  /// <reference types="./cloud/uuid.d.ts" />
8
+ /// <reference types="./cloud/spider.d.ts" />
7
9
 
8
10
  /// <reference types="./assert.d.ts" />
9
11
  /// <reference types="./buffer.d.ts" />