@spider-cloud/spider-client 0.0.21 → 0.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -95,8 +95,10 @@ spider
 
 - **`scrapeUrl(url, params)`**: Scrape data from a specified URL. Optional parameters can be passed to customize the scraping behavior.
 - **`crawlUrl(url, params, stream)`**: Begin crawling from a specific URL with optional parameters for customization and an optional streaming response.
+- **`search(q, params)`**: Perform a search and gather a list of websites to start crawling and collect resources.
 - **`links(url, params)`**: Retrieve all links from the specified URL with optional parameters.
 - **`screenshot(url, params)`**: Take a screenshot of the specified URL.
+- **`transform(data, params)`**: Perform a fast HTML transformation to markdown or text.
 - **`extractContacts(url, params)`**: Extract contact information from the specified URL.
 - **`label(url, params)`**: Apply labeling to data extracted from the specified URL.
 - **`getCrawlState(url, params)`**: Check the website crawl state.
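For orientation, here is a minimal, hypothetical usage sketch of the two newly documented methods. The constructor option name (`apiKey`) and the parameter values are assumptions for illustration, not taken from this diff:

```typescript
import { Spider } from "@spider-cloud/spider-client";

// Assumed constructor option shape; check the package README for the exact config.
const app = new Spider({ apiKey: process.env.SPIDER_API_KEY });

async function demo() {
  // search(q, params): gather a list of websites to start crawling.
  const sites = await app.search("spider cloud open source crawler", { limit: 5 });

  // transform(data, params): fast HTML-to-markdown/text conversion.
  const markdown = await app.transform([
    { html: "<article><h1>Hello</h1><p>World</p></article>", url: "https://example.com" },
  ]);

  console.log(sites, markdown);
}

demo();
```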
package/dist/client.d.ts CHANGED
@@ -1,3 +1,8 @@
+import { SpiderParams } from "./config";
+/**
+ * Generic params for core request.
+ */
+type GenericParams = Omit<SpiderParams, "url">;
 /**
  * Configuration interface for Spider.
  */
@@ -38,18 +43,18 @@ export declare class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
-    scrapeUrl(url: string, params?: {}): Promise<any>;
+    scrapeUrl(url: string, params?: GenericParams): Promise<any>;
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
-    crawlUrl(url: string, params?: {}, stream?: boolean): Promise<any>;
+    crawlUrl(url: string, params?: GenericParams, stream?: boolean): Promise<any>;
     /**
      * Retrieves all links from the specified URL.
      * @param {string} url - The URL from which to gather links.
@@ -60,31 +65,48 @@ export declare class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
-    screenshot(url: string, params?: {}): Promise<any>;
+    screenshot(url: string, params?: GenericParams): Promise<any>;
+    /**
+     * Perform a search and gather a list of websites to start crawling and collect resources.
+     * @param {string} q - The search query.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
+     * @returns {Promise<any>} The search results.
+     */
+    search(q: string, params?: GenericParams): Promise<any>;
+    /**
+     * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
+     * @param {object} data - The data to transform: a list of objects with an 'html' key and an optional 'url' key for readability.
+     * @param {object} [params={}] - Configuration parameters for the transformation.
+     * @returns {Promise<any>} The transformation result.
+     */
+    transform(data: {
+        html: string;
+        url?: string;
+    }[], params?: {}): Promise<any>;
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
-    extractContacts(url: string, params?: {}): Promise<any>;
+    extractContacts(url: string, params?: GenericParams): Promise<any>;
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
-    label(url: string, params?: {}): Promise<any>;
+    label(url: string, params?: GenericParams): Promise<any>;
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
-    getCrawlState(url: string, params?: {}): Promise<any>;
+    getCrawlState(url: string, params?: GenericParams): Promise<any>;
     /**
      * Retrieves the number of credits available on the account.
      * @returns {Promise<any>} The current credit balance.
@@ -96,21 +118,21 @@ export declare class Spider {
      * @param {object} data - The data to be inserted.
      * @returns {Promise<any>} The response from the server.
      */
-    postData(table: string, data: object): Promise<any>;
+    postData(table: string, data: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a GET request to retrieve data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - The query parameters for data retrieval.
      * @returns {Promise<any>} The response from the server.
      */
-    getData(table: string, params: object): Promise<any>;
+    getData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a DELETE request to remove data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - Parameters to identify records to delete.
      * @returns {Promise<any>} The response from the server.
      */
-    deleteData(table: string, params: object): Promise<any>;
+    deleteData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Prepares common headers for each API request.
      * @returns {HeadersInit} A headers object for fetch requests.
@@ -127,3 +149,4 @@ export declare class Spider {
      */
     handleError(response: Response, action: string): void;
 }
+export {};
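Because the new `GenericParams` alias is simply `Omit<SpiderParams, "url">`, callers can now type-check the options they pass to the core methods. A small illustrative sketch; the constructor option name `apiKey` is an assumption:

```typescript
import { Spider } from "@spider-cloud/spider-client";
import type { SpiderParams } from "@spider-cloud/spider-client";

// The options object matches SpiderParams minus "url", which is what the
// methods now accept as GenericParams.
const options: Omit<SpiderParams, "url"> = {
  limit: 10,
  return_format: "markdown",
  metadata: true,
};

const app = new Spider({ apiKey: process.env.SPIDER_API_KEY }); // assumed option name

async function main() {
  // Type-checked calls; a typo such as { limt: 10 } would now fail to compile.
  const scraped = await app.scrapeUrl("https://spider.cloud", options);
  const crawled = await app.crawlUrl("https://spider.cloud", { ...options, depth: 2 });
  console.log(scraped, crawled);
}

main();
```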
package/dist/client.js CHANGED
@@ -80,7 +80,7 @@ class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
     async scrapeUrl(url, params = {}) {
@@ -89,7 +89,7 @@ class Spider {
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
@@ -108,16 +108,34 @@ class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
     async screenshot(url, params = {}) {
         return this._apiPost("screenshot", { url: url, ...params });
     }
+    /**
+     * Perform a search and gather a list of websites to start crawling and collect resources.
+     * @param {string} q - The search query.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
+     * @returns {Promise<any>} The search results.
+     */
+    async search(q, params = {}) {
+        return this._apiPost("search", { search: q, ...params });
+    }
+    /**
+     * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
+     * @param {object} data - The data to transform: a list of objects with an 'html' key and an optional 'url' key for readability.
+     * @param {object} [params={}] - Configuration parameters for the transformation.
+     * @returns {Promise<any>} The transformation result.
+     */
+    async transform(data, params = {}) {
+        return this._apiPost("transform", { data, ...params });
+    }
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
     async extractContacts(url, params = {}) {
@@ -126,7 +144,7 @@ class Spider {
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
     async label(url, params = {}) {
@@ -135,7 +153,7 @@ class Spider {
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
     async getCrawlState(url, params = {}) {
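The two new implementations simply spread the caller's params into the POST body, via `{ search: q, ...params }` and `{ data, ...params }`. A minimal sketch of the resulting request bodies; the extra parameter names shown are assumptions drawn from SpiderParams, since the transform options are not enumerated in this diff:

```typescript
// Minimal sketch of the JSON bodies these two methods build before posting.
const searchBody = {
  search: "spider cloud docs", // the `q` argument
  limit: 5,                    // any extra GenericParams are spread in alongside it
};

const transformBody = {
  data: [{ html: "<h1>Hello</h1><p>World</p>", url: "https://example.com" }],
  return_format: "markdown",   // assumed option, shown only to illustrate the spread
};

console.log(searchBody, transformBody);
```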
package/dist/config.d.ts ADDED
@@ -0,0 +1,144 @@
+/**
+ * Represents viewport dimensions.
+ */
+export interface Viewport {
+    width: number;
+    height: number;
+}
+/**
+ * Represents HTTP headers as a dictionary object.
+ */
+export interface Headers {
+    [key: string]: string;
+}
+/**
+ * Represents a budget for various resources.
+ */
+export interface Budget {
+    [key: string]: number;
+}
+/**
+ * Represents the options available for making a spider request.
+ */
+export interface SpiderParams {
+    /**
+     * The URL to be crawled.
+     */
+    url: string;
+    /**
+     * The type of request to be made.
+     */
+    request?: "http" | "chrome" | "smart";
+    /**
+     * The maximum number of pages the crawler should visit.
+     */
+    limit?: number;
+    /**
+     * The format in which the result should be returned.
+     */
+    return_format?: "markdown" | "raw" | "text" | "html2text";
+    /**
+     * Specifies whether to only visit the top-level domain.
+     */
+    tld?: boolean;
+    /**
+     * The depth of the crawl.
+     */
+    depth?: number;
+    /**
+     * Specifies whether the request should be cached.
+     */
+    cache?: boolean;
+    /**
+     * The budget for various resources.
+     */
+    budget?: Budget;
+    /**
+     * The locale to be used during the crawl.
+     */
+    locale?: string;
+    /**
+     * The cookies to be set for the request, formatted as a single string.
+     */
+    cookies?: string;
+    /**
+     * Specifies whether to use stealth techniques to avoid detection.
+     */
+    stealth?: boolean;
+    /**
+     * The headers to be used for the request.
+     */
+    headers?: Headers;
+    /**
+     * Specifies whether anti-bot measures should be used.
+     */
+    anti_bot?: boolean;
+    /**
+     * Specifies whether to include metadata in the response.
+     */
+    metadata?: boolean;
+    /**
+     * The dimensions of the viewport.
+     */
+    viewport?: Viewport;
+    /**
+     * The encoding to be used for the request.
+     */
+    encoding?: "UTF-8" | "SHIFT_JIS" | string;
+    /**
+     * Specifies whether to include subdomains in the crawl.
+     */
+    subdomains?: boolean;
+    /**
+     * The user agent string to be used for the request.
+     */
+    user_agent?: string;
+    /**
+     * Specifies whether the response data should be stored.
+     */
+    store_data?: boolean;
+    /**
+     * Configuration settings for GPT (general purpose texture mappings).
+     */
+    gpt_config?: string[];
+    /**
+     * Specifies whether to use fingerprinting protection.
+     */
+    fingerprint?: boolean;
+    /**
+     * Specifies whether to perform the request without using storage.
+     */
+    storageless?: boolean;
+    /**
+     * Specifies whether readability optimizations should be applied.
+     */
+    readability?: boolean;
+    /**
+     * Specifies whether to use a proxy for the request.
+     */
+    proxy_enabled?: boolean;
+    /**
+     * Specifies whether to respect the site's robots.txt file.
+     */
+    respect_robots?: boolean;
+    /**
+     * CSS selector to be used to filter the content.
+     */
+    query_selector?: string;
+    /**
+     * Specifies whether to load all resources of the crawl target.
+     */
+    full_resources?: boolean;
+    /**
+     * The timeout for the request, in milliseconds.
+     */
+    request_timeout?: number;
+    /**
+     * Specifies whether to run the request in the background.
+     */
+    run_in_background?: boolean;
+    /**
+     * Specifies whether to skip configuration checks.
+     */
+    skip_config_checks?: boolean;
+}
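For context, a SpiderParams object built against the interface above might look like the following. The values are arbitrary examples, not defaults from the package:

```typescript
import type { SpiderParams } from "@spider-cloud/spider-client";

// Arbitrary example values exercising a subset of the fields declared above.
const crawlConfig: SpiderParams = {
  url: "https://example.com",
  request: "smart",
  limit: 25,
  depth: 2,
  return_format: "markdown",
  cache: true,
  metadata: true,
  subdomains: false,
  respect_robots: true,
  budget: { "*": 100 },                     // string-to-number map per the Budget interface
  headers: { "Accept-Language": "en-US" },
  viewport: { width: 1280, height: 800 },
};

console.log(crawlConfig.url);
```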
package/dist/config.js ADDED
@@ -0,0 +1,2 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
package/dist/index.d.ts CHANGED
@@ -1 +1,2 @@
 export { Spider } from "./client";
+export type { SpiderParams, Budget, Viewport } from "./config";
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@spider-cloud/spider-client",
-  "version": "0.0.21",
+  "version": "0.0.23",
   "description": "A Javascript SDK for Spider Cloud services",
   "scripts": {
     "test": "jest",
@@ -15,6 +15,7 @@
   "keywords": [
     "spider",
     "sdk",
+    "web crawling",
     "web scraping",
     "api",
     "llm scraping"
@@ -24,7 +25,7 @@
   "devDependencies": {
     "@jest/globals": "^29.7.0",
     "@types/jest": "^29.5.12",
-    "@types/node": "20.12.7",
+    "@types/node": "20.14.2",
     "dotenv": "^16.4.5",
     "ts-jest": "^29.1.2",
     "typescript": "5.4.5"