@spider-cloud/spider-client 0.0.22 → 0.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/client.d.ts CHANGED
@@ -1,3 +1,8 @@
+import { SpiderParams } from "./config";
+/**
+ * Generic params for core request.
+ */
+type GenericParams = Omit<SpiderParams, "url">;
 /**
  * Configuration interface for Spider.
  */
@@ -38,18 +43,18 @@ export declare class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
-    scrapeUrl(url: string, params?: {}): Promise<any>;
+    scrapeUrl(url: string, params?: GenericParams): Promise<any>;
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
-    crawlUrl(url: string, params?: {}, stream?: boolean): Promise<any>;
+    crawlUrl(url: string, params?: GenericParams, stream?: boolean): Promise<any>;
     /**
      * Retrieves all links from the specified URL.
      * @param {string} url - The URL from which to gather links.
@@ -60,17 +65,17 @@ export declare class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
-    screenshot(url: string, params?: {}): Promise<any>;
+    screenshot(url: string, params?: GenericParams): Promise<any>;
     /**
      * Perform a search and gather a list of websites to start crawling and collect resources.
      * @param {string} q - The search query.
-     * @param {object} [params={}] - Configuration parameters for the search.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
      * @returns {Promise<any>} The result of the crawl, either structured data or a Response object if streaming.
      */
-    search(q: string, params?: {}): Promise<any>;
+    search(q: string, params?: GenericParams): Promise<any>;
     /**
      * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
      * @param {object} data - The data to transform, a list of objects with the key 'html' and an optional 'url' key for readability.
@@ -84,24 +89,24 @@ export declare class Spider {
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
-    extractContacts(url: string, params?: {}): Promise<any>;
+    extractContacts(url: string, params?: GenericParams): Promise<any>;
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
-    label(url: string, params?: {}): Promise<any>;
+    label(url: string, params?: GenericParams): Promise<any>;
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
-    getCrawlState(url: string, params?: {}): Promise<any>;
+    getCrawlState(url: string, params?: GenericParams): Promise<any>;
     /**
      * Retrieves the number of credits available on the account.
      * @returns {Promise<any>} The current credit balance.
@@ -113,21 +118,21 @@ export declare class Spider {
      * @param {object} data - The data to be inserted.
      * @returns {Promise<any>} The response from the server.
      */
-    postData(table: string, data: object): Promise<any>;
+    postData(table: string, data: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a GET request to retrieve data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - The query parameters for data retrieval.
      * @returns {Promise<any>} The response from the server.
      */
-    getData(table: string, params: object): Promise<any>;
+    getData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a DELETE request to remove data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - Parameters to identify records to delete.
      * @returns {Promise<any>} The response from the server.
      */
-    deleteData(table: string, params: object): Promise<any>;
+    deleteData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Prepares common headers for each API request.
      * @returns {HeadersInit} A headers object for fetch requests.
@@ -144,3 +149,4 @@ export declare class Spider {
      */
     handleError(response: Response, action: string): void;
 }
+export {};
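
In practice the switch from `params?: {}` to `params?: GenericParams` means option objects are checked at compile time instead of being accepted blindly. A minimal consumer-side sketch (the `apiKey` constructor option and the environment variable are assumptions, not shown in this diff; `GenericParams` itself is not exported, so the alias is reconstructed from `SpiderParams`):

```ts
import { Spider } from "@spider-cloud/spider-client";
import type { SpiderParams } from "@spider-cloud/spider-client";

// GenericParams is module-private in client.d.ts, so consumers can
// reconstruct it the same way the declaration file does.
type GenericParams = Omit<SpiderParams, "url">;

async function main() {
  // Assumption: the constructor takes an apiKey via its configuration
  // object; that interface is not part of the diff above.
  const spider = new Spider({ apiKey: process.env.SPIDER_API_KEY });

  // Typos or wrong value types in `params` are now compile-time errors.
  const params: GenericParams = {
    request: "smart",
    return_format: "markdown",
    limit: 5,
  };

  const scraped = await spider.scrapeUrl("https://example.com", params);
  const crawled = await spider.crawlUrl("https://example.com", params, false);
  console.log(scraped, crawled);
}

main().catch(console.error);
```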
package/dist/client.js CHANGED
@@ -80,7 +80,7 @@ class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
     async scrapeUrl(url, params = {}) {
@@ -89,7 +89,7 @@ class Spider {
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
@@ -108,7 +108,7 @@ class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
     async screenshot(url, params = {}) {
@@ -117,7 +117,7 @@ class Spider {
     /**
      * Perform a search and gather a list of websites to start crawling and collect resources.
      * @param {string} q - The search query.
-     * @param {object} [params={}] - Configuration parameters for the search.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
      * @returns {Promise<any>} The result of the crawl, either structured data or a Response object if streaming.
      */
     async search(q, params = {}) {
@@ -135,7 +135,7 @@ class Spider {
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
     async extractContacts(url, params = {}) {
@@ -144,7 +144,7 @@ class Spider {
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
     async label(url, params = {}) {
@@ -153,7 +153,7 @@ class Spider {
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
     async getCrawlState(url, params = {}) {
package/dist/config.d.ts ADDED
@@ -0,0 +1,144 @@
+/**
+ * Represents viewport dimensions.
+ */
+export interface Viewport {
+    width: number;
+    height: number;
+}
+/**
+ * Represents HTTP headers as a dictionary object.
+ */
+export interface Headers {
+    [key: string]: string;
+}
+/**
+ * Represents a budget for various resources.
+ */
+export interface Budget {
+    [key: string]: number;
+}
+/**
+ * Represents the options available for making a spider request.
+ */
+export interface SpiderParams {
+    /**
+     * The URL to be crawled.
+     */
+    url: string;
+    /**
+     * The type of request to be made.
+     */
+    request?: "http" | "chrome" | "smart";
+    /**
+     * The maximum number of pages the crawler should visit.
+     */
+    limit?: number;
+    /**
+     * The format in which the result should be returned.
+     */
+    return_format?: "markdown" | "raw" | "text" | "html2text" | "bytes";
+    /**
+     * Specifies whether to only visit the top-level domain.
+     */
+    tld?: boolean;
+    /**
+     * The depth of the crawl.
+     */
+    depth?: number;
+    /**
+     * Specifies whether the request should be cached.
+     */
+    cache?: boolean;
+    /**
+     * The budget for various resources.
+     */
+    budget?: Budget;
+    /**
+     * The locale to be used during the crawl.
+     */
+    locale?: string;
+    /**
+     * The cookies to be set for the request, formatted as a single string.
+     */
+    cookies?: string;
+    /**
+     * Specifies whether to use stealth techniques to avoid detection.
+     */
+    stealth?: boolean;
+    /**
+     * The headers to be used for the request.
+     */
+    headers?: Headers;
+    /**
+     * Specifies whether anti-bot measures should be used.
+     */
+    anti_bot?: boolean;
+    /**
+     * Specifies whether to include metadata in the response.
+     */
+    metadata?: boolean;
+    /**
+     * The dimensions of the viewport.
+     */
+    viewport?: Viewport;
+    /**
+     * The encoding to be used for the request.
+     */
+    encoding?: "UTF-8" | "SHIFT_JIS" | string;
+    /**
+     * Specifies whether to include subdomains in the crawl.
+     */
+    subdomains?: boolean;
+    /**
+     * The user agent string to be used for the request.
+     */
+    user_agent?: string;
+    /**
+     * Specifies whether the response data should be stored.
+     */
+    store_data?: boolean;
+    /**
+     * Configuration settings for GPT usage.
+     */
+    gpt_config?: string[];
+    /**
+     * Specifies whether to use fingerprinting protection.
+     */
+    fingerprint?: boolean;
+    /**
+     * Specifies whether to perform the request without using storage.
+     */
+    storageless?: boolean;
+    /**
+     * Specifies whether readability optimizations should be applied.
+     */
+    readability?: boolean;
+    /**
+     * Specifies whether to use a proxy for the request.
+     */
+    proxy_enabled?: boolean;
+    /**
+     * Specifies whether to respect the site's robots.txt file.
+     */
+    respect_robots?: boolean;
+    /**
+     * CSS selector to be used to filter the content.
+     */
+    query_selector?: string;
+    /**
+     * Specifies whether to load all resources of the crawl target.
+     */
+    full_resources?: boolean;
+    /**
+     * The timeout for the request, in milliseconds.
+     */
+    request_timeout?: number;
+    /**
+     * Specifies whether to run the request in the background.
+     */
+    run_in_background?: boolean;
+    /**
+     * Specifies whether to skip configuration checks.
+     */
+    skip_config_checks?: boolean;
+}
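
For reference, a sketch of a fully typed request object built only from the fields declared above (all values are illustrative; the `Budget` keys in particular are assumed to be path patterns, which the index signature permits but does not specify):

```ts
import type { SpiderParams, Budget, Viewport } from "@spider-cloud/spider-client";

// Assumed convention: budget keys are route patterns mapped to page counts.
const budget: Budget = { "*": 100, "/docs": 50 };
const viewport: Viewport = { width: 1280, height: 800 };

// Every field below is declared in the SpiderParams interface; `url` is the
// only required property.
const params: SpiderParams = {
  url: "https://example.com",
  request: "smart",
  limit: 10,
  return_format: "markdown",
  subdomains: false,
  metadata: true,
  budget,
  viewport,
  request_timeout: 30000,
};
```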
package/dist/config.js ADDED
@@ -0,0 +1,2 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
package/dist/index.d.ts CHANGED
@@ -1 +1,2 @@
 export { Spider } from "./client";
+export type { SpiderParams, Budget, Viewport } from "./config";
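
Because the re-export is type-only, it is erased from the compiled JavaScript (consistent with `config.js` above compiling to an essentially empty CommonJS module), so consumers get the new types with no runtime cost:

```ts
// Value import (the class) and type-only imports (erased at compile time).
import { Spider } from "@spider-cloud/spider-client";
import type { SpiderParams, Budget, Viewport } from "@spider-cloud/spider-client";
```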
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@spider-cloud/spider-client",
-  "version": "0.0.22",
+  "version": "0.0.24",
   "description": "A Javascript SDK for Spider Cloud services",
   "scripts": {
     "test": "jest",
@@ -15,6 +15,7 @@
   "keywords": [
     "spider",
     "sdk",
+    "web crawling",
     "web scraping",
     "api",
     "llm scraping"
@@ -24,7 +25,7 @@
   "devDependencies": {
     "@jest/globals": "^29.7.0",
     "@types/jest": "^29.5.12",
-    "@types/node": "20.12.7",
+    "@types/node": "20.14.2",
     "dotenv": "^16.4.5",
     "ts-jest": "^29.1.2",
     "typescript": "5.4.5"