@spider-cloud/spider-client 0.0.21 → 0.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -95,8 +95,10 @@ spider
 
 - **`scrapeUrl(url, params)`**: Scrape data from a specified URL. Optional parameters can be passed to customize the scraping behavior.
 - **`crawlUrl(url, params, stream)`**: Begin crawling from a specific URL with optional parameters for customization and an optional streaming response.
+- **`search(q, params)`**: Perform a search and gather a list of websites to start crawling and collect resources.
 - **`links(url, params)`**: Retrieve all links from the specified URL with optional parameters.
 - **`screenshot(url, params)`**: Take a screenshot of the specified URL.
+- **`transform(data, params)`**: Perform a fast HTML transformation to markdown or text.
 - **`extractContacts(url, params)`**: Extract contact information from the specified URL.
 - **`label(url, params)`**: Apply labeling to data extracted from the specified URL.
 - **`getCrawlState(url, params)`**: Check the website crawl state.
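For orientation, here is a minimal, hypothetical usage sketch of the two newly documented methods. The constructor option name (`apiKey`) and the parameter values are assumptions for illustration, not taken from this diff:

```typescript
import { Spider } from "@spider-cloud/spider-client";

// Assumed constructor option shape; check the package README for the exact config.
const app = new Spider({ apiKey: process.env.SPIDER_API_KEY });

async function demo() {
  // search(q, params): gather a list of websites to start crawling.
  const sites = await app.search("spider cloud open source crawler", { limit: 5 });

  // transform(data, params): fast HTML-to-markdown/text conversion.
  const markdown = await app.transform([
    { html: "<article><h1>Hello</h1><p>World</p></article>", url: "https://example.com" },
  ]);

  console.log(sites, markdown);
}

demo();
```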
package/dist/client.d.ts CHANGED
@@ -1,3 +1,8 @@
+import { SpiderParams } from "./config";
+/**
+ * Generic params for core request.
+ */
+type GenericParams = Omit<SpiderParams, "url">;
 /**
  * Configuration interface for Spider.
  */
@@ -38,18 +43,18 @@ export declare class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
-    scrapeUrl(url: string, params?: {}): Promise<any>;
+    scrapeUrl(url: string, params?: GenericParams): Promise<any>;
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
-    crawlUrl(url: string, params?: {}, stream?: boolean): Promise<any>;
+    crawlUrl(url: string, params?: GenericParams, stream?: boolean): Promise<any>;
     /**
      * Retrieves all links from the specified URL.
      * @param {string} url - The URL from which to gather links.
@@ -60,31 +65,48 @@ export declare class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
-    screenshot(url: string, params?: {}): Promise<any>;
+    screenshot(url: string, params?: GenericParams): Promise<any>;
+    /**
+     * Perform a search and gather a list of websites to start crawling and collect resources.
+     * @param {string} q - The search query.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
+     * @returns {Promise<any>} The search results.
+     */
+    search(q: string, params?: GenericParams): Promise<any>;
+    /**
+     * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
+     * @param {object} data - The data to transform: a list of objects with an 'html' key and an optional 'url' key for readability.
+     * @param {object} [params={}] - Configuration parameters for the transformation.
+     * @returns {Promise<any>} The transformation result.
+     */
+    transform(data: {
+        html: string;
+        url?: string;
+    }[], params?: {}): Promise<any>;
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
-    extractContacts(url: string, params?: {}): Promise<any>;
+    extractContacts(url: string, params?: GenericParams): Promise<any>;
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
-    label(url: string, params?: {}): Promise<any>;
+    label(url: string, params?: GenericParams): Promise<any>;
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
-    getCrawlState(url: string, params?: {}): Promise<any>;
+    getCrawlState(url: string, params?: GenericParams): Promise<any>;
     /**
      * Retrieves the number of credits available on the account.
      * @returns {Promise<any>} The current credit balance.
@@ -96,21 +118,21 @@ export declare class Spider {
      * @param {object} data - The data to be inserted.
      * @returns {Promise<any>} The response from the server.
      */
-    postData(table: string, data: object): Promise<any>;
+    postData(table: string, data: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a GET request to retrieve data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - The query parameters for data retrieval.
      * @returns {Promise<any>} The response from the server.
      */
-    getData(table: string, params: object): Promise<any>;
+    getData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Send a DELETE request to remove data from a specified table.
      * @param {string} table - The table name in the database.
      * @param {object} params - Parameters to identify records to delete.
      * @returns {Promise<any>} The response from the server.
      */
-    deleteData(table: string, params: object): Promise<any>;
+    deleteData(table: string, params: GenericParams | Record<string, any>): Promise<any>;
     /**
      * Prepares common headers for each API request.
      * @returns {HeadersInit} A headers object for fetch requests.
@@ -127,3 +149,4 @@ export declare class Spider {
      */
     handleError(response: Response, action: string): void;
 }
+export {};
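Because the new `GenericParams` alias is simply `Omit<SpiderParams, "url">`, callers can now type-check the options they pass to the core methods. A small illustrative sketch; the constructor option name `apiKey` is an assumption:

```typescript
import { Spider } from "@spider-cloud/spider-client";
import type { SpiderParams } from "@spider-cloud/spider-client";

// The options object matches SpiderParams minus "url", which is what the
// methods now accept as GenericParams.
const options: Omit<SpiderParams, "url"> = {
  limit: 10,
  return_format: "markdown",
  metadata: true,
};

const app = new Spider({ apiKey: process.env.SPIDER_API_KEY }); // assumed option name

async function main() {
  // Type-checked calls; a typo such as { limt: 10 } would now fail to compile.
  const scraped = await app.scrapeUrl("https://spider.cloud", options);
  const crawled = await app.crawlUrl("https://spider.cloud", { ...options, depth: 2 });
  console.log(scraped, crawled);
}

main();
```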
package/dist/client.js CHANGED
@@ -80,7 +80,7 @@ class Spider {
     /**
      * Scrapes data from a specified URL.
      * @param {string} url - The URL to scrape.
-     * @param {object} [params={}] - Additional parameters for the scraping request.
+     * @param {GenericParams} [params={}] - Additional parameters for the scraping request.
      * @returns {Promise<any>} The scraped data from the URL.
      */
     async scrapeUrl(url, params = {}) {
@@ -89,7 +89,7 @@ class Spider {
     /**
      * Initiates a crawling job starting from the specified URL.
      * @param {string} url - The URL to start crawling.
-     * @param {object} [params={}] - Additional parameters for the crawl.
+     * @param {GenericParams} [params={}] - Additional parameters for the crawl.
      * @param {boolean} [stream=false] - Whether to receive the response as a stream.
      * @returns {Promise<any | Response>} The result of the crawl, either structured data or a Response object if streaming.
      */
@@ -108,16 +108,34 @@ class Spider {
     /**
      * Takes a screenshot of the specified URL.
      * @param {string} url - The URL to screenshot.
-     * @param {object} [params={}] - Configuration parameters for the screenshot.
+     * @param {GenericParams} [params={}] - Configuration parameters for the screenshot.
      * @returns {Promise<any>} The screenshot data.
      */
     async screenshot(url, params = {}) {
         return this._apiPost("screenshot", { url: url, ...params });
     }
+    /**
+     * Perform a search and gather a list of websites to start crawling and collect resources.
+     * @param {string} q - The search query.
+     * @param {GenericParams} [params={}] - Configuration parameters for the search.
+     * @returns {Promise<any>} The search results.
+     */
+    async search(q, params = {}) {
+        return this._apiPost("search", { search: q, ...params });
+    }
+    /**
+     * Transform HTML to Markdown or text. You can send up to 10MB of data at once.
+     * @param {object} data - The data to transform: a list of objects with an 'html' key and an optional 'url' key for readability.
+     * @param {object} [params={}] - Configuration parameters for the transformation.
+     * @returns {Promise<any>} The transformation result.
+     */
+    async transform(data, params = {}) {
+        return this._apiPost("transform", { data, ...params });
+    }
     /**
      * Extracts contact information from the specified URL.
      * @param {string} url - The URL from which to extract contacts.
-     * @param {object} [params={}] - Configuration parameters for the extraction.
+     * @param {GenericParams} [params={}] - Configuration parameters for the extraction.
      * @returns {Promise<any>} The contact information extracted.
      */
     async extractContacts(url, params = {}) {
@@ -126,7 +144,7 @@ class Spider {
     /**
      * Applies labeling to data extracted from a specified URL.
      * @param {string} url - The URL to label.
-     * @param {object} [params={}] - Configuration parameters for labeling.
+     * @param {GenericParams} [params={}] - Configuration parameters for labeling.
      * @returns {Promise<any>} The labeled data.
      */
     async label(url, params = {}) {
@@ -135,7 +153,7 @@ class Spider {
     /**
      * Check the crawl state of the website.
      * @param {string} url - The URL to check.
-     * @param {object} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
+     * @param {GenericParams} [params={}] - Configuration parameters for crawl state. Can also pass in "domain" instead of the url to query.
      * @returns {Promise<any>} The crawl state data.
      */
     async getCrawlState(url, params = {}) {
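The two new implementations simply spread the caller's params into the POST body, via `{ search: q, ...params }` and `{ data, ...params }`. A minimal sketch of the resulting request bodies; the extra parameter names shown are assumptions drawn from SpiderParams, since the transform options are not enumerated in this diff:

```typescript
// Minimal sketch of the JSON bodies these two methods build before posting.
const searchBody = {
  search: "spider cloud docs", // the `q` argument
  limit: 5,                    // any extra GenericParams are spread in alongside it
};

const transformBody = {
  data: [{ html: "<h1>Hello</h1><p>World</p>", url: "https://example.com" }],
  return_format: "markdown",   // assumed option, shown only to illustrate the spread
};

console.log(searchBody, transformBody);
```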
package/dist/config.d.ts ADDED
@@ -0,0 +1,144 @@
+/**
+ * Represents viewport dimensions.
+ */
+export interface Viewport {
+    width: number;
+    height: number;
+}
+/**
+ * Represents HTTP headers as a dictionary object.
+ */
+export interface Headers {
+    [key: string]: string;
+}
+/**
+ * Represents a budget for various resources.
+ */
+export interface Budget {
+    [key: string]: number;
+}
+/**
+ * Represents the options available for making a spider request.
+ */
+export interface SpiderParams {
+    /**
+     * The URL to be crawled.
+     */
+    url: string;
+    /**
+     * The type of request to be made.
+     */
+    request?: "http" | "chrome" | "smart";
+    /**
+     * The maximum number of pages the crawler should visit.
+     */
+    limit?: number;
+    /**
+     * The format in which the result should be returned.
+     */
+    return_format?: "markdown" | "raw" | "text" | "html2text";
+    /**
+     * Specifies whether to only visit the top-level domain.
+     */
+    tld?: boolean;
+    /**
+     * The depth of the crawl.
+     */
+    depth?: number;
+    /**
+     * Specifies whether the request should be cached.
+     */
+    cache?: boolean;
+    /**
+     * The budget for various resources.
+     */
+    budget?: Budget;
+    /**
+     * The locale to be used during the crawl.
+     */
+    locale?: string;
+    /**
+     * The cookies to be set for the request, formatted as a single string.
+     */
+    cookies?: string;
+    /**
+     * Specifies whether to use stealth techniques to avoid detection.
+     */
+    stealth?: boolean;
+    /**
+     * The headers to be used for the request.
+     */
+    headers?: Headers;
+    /**
+     * Specifies whether anti-bot measures should be used.
+     */
+    anti_bot?: boolean;
+    /**
+     * Specifies whether to include metadata in the response.
+     */
+    metadata?: boolean;
+    /**
+     * The dimensions of the viewport.
+     */
+    viewport?: Viewport;
+    /**
+     * The encoding to be used for the request.
+     */
+    encoding?: "UTF-8" | "SHIFT_JIS" | string;
+    /**
+     * Specifies whether to include subdomains in the crawl.
+     */
+    subdomains?: boolean;
+    /**
+     * The user agent string to be used for the request.
+     */
+    user_agent?: string;
+    /**
+     * Specifies whether the response data should be stored.
+     */
+    store_data?: boolean;
+    /**
+     * Configuration settings for GPT (general purpose texture mappings).
+     */
+    gpt_config?: string[];
+    /**
+     * Specifies whether to use fingerprinting protection.
+     */
+    fingerprint?: boolean;
+    /**
+     * Specifies whether to perform the request without using storage.
+     */
+    storageless?: boolean;
+    /**
+     * Specifies whether readability optimizations should be applied.
+     */
+    readability?: boolean;
+    /**
+     * Specifies whether to use a proxy for the request.
+     */
+    proxy_enabled?: boolean;
+    /**
+     * Specifies whether to respect the site's robots.txt file.
+     */
+    respect_robots?: boolean;
+    /**
+     * CSS selector to be used to filter the content.
+     */
+    query_selector?: string;
+    /**
+     * Specifies whether to load all resources of the crawl target.
+     */
+    full_resources?: boolean;
+    /**
+     * The timeout for the request, in milliseconds.
+     */
+    request_timeout?: number;
+    /**
+     * Specifies whether to run the request in the background.
+     */
+    run_in_background?: boolean;
+    /**
+     * Specifies whether to skip configuration checks.
+     */
+    skip_config_checks?: boolean;
+}
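For context, a SpiderParams object built against the interface above might look like the following. The values are arbitrary examples, not defaults from the package:

```typescript
import type { SpiderParams } from "@spider-cloud/spider-client";

// Arbitrary example values exercising a subset of the fields declared above.
const crawlConfig: SpiderParams = {
  url: "https://example.com",
  request: "smart",
  limit: 25,
  depth: 2,
  return_format: "markdown",
  cache: true,
  metadata: true,
  subdomains: false,
  respect_robots: true,
  budget: { "*": 100 },                     // string-to-number map per the Budget interface
  headers: { "Accept-Language": "en-US" },
  viewport: { width: 1280, height: 800 },
};

console.log(crawlConfig.url);
```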
package/dist/config.js ADDED
@@ -0,0 +1,2 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
package/dist/index.d.ts CHANGED
@@ -1 +1,2 @@
 export { Spider } from "./client";
+export type { SpiderParams, Budget, Viewport } from "./config";
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@spider-cloud/spider-client",
-  "version": "0.0.21",
+  "version": "0.0.23",
   "description": "A Javascript SDK for Spider Cloud services",
   "scripts": {
     "test": "jest",
@@ -15,6 +15,7 @@
   "keywords": [
     "spider",
     "sdk",
+    "web crawling",
     "web scraping",
     "api",
     "llm scraping"
@@ -24,7 +25,7 @@
   "devDependencies": {
     "@jest/globals": "^29.7.0",
     "@types/jest": "^29.5.12",
-    "@types/node": "20.12.7",
+    "@types/node": "20.14.2",
     "dotenv": "^16.4.5",
     "ts-jest": "^29.1.2",
     "typescript": "5.4.5"