@hyperbrowser/sdk 0.82.3 → 0.83.0

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
package/dist/services/web/crawl.d.ts ADDED
@@ -0,0 +1,26 @@
+ import { BaseService } from "../base";
+ import { GetWebCrawlJobParams, StartWebCrawlJobParams, StartWebCrawlJobResponse, WebCrawlJobResponse, WebCrawlJobStatusResponse } from "../../types/web/crawl";
+ export declare class WebCrawlService extends BaseService {
+     /**
+      * Start a new web crawl job
+      * @param params The parameters for the web crawl job
+      */
+     start(params: StartWebCrawlJobParams): Promise<StartWebCrawlJobResponse>;
+     /**
+      * Get the status of a web crawl job
+      * @param id The ID of the web crawl job to get
+      */
+     getStatus(id: string): Promise<WebCrawlJobStatusResponse>;
+     /**
+      * Get the details of a web crawl job
+      * @param id The ID of the web crawl job to get
+      * @param params Optional parameters to filter the web crawl job
+      */
+     get(id: string, params?: GetWebCrawlJobParams): Promise<WebCrawlJobResponse>;
+     /**
+      * Start a web crawl job and wait for it to complete
+      * @param params The parameters for the web crawl job
+      * @param returnAllPages Whether to return all pages in the web crawl job response
+      */
+     startAndWait(params: StartWebCrawlJobParams, returnAllPages?: boolean): Promise<WebCrawlJobResponse>;
+ }
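The declaration file above is the full public surface of the new crawl service: start a job, poll its status, fetch results in batches, or do all three in one call. A minimal usage sketch, assuming the top-level `Hyperbrowser` client exposes `WebService` as `client.web` (the client-level wiring is not part of this diff):

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";

const client = new Hyperbrowser({ apiKey: process.env.HYPERBROWSER_API_KEY });

async function main() {
  // startAndWait() starts the job, polls until it reaches a terminal state,
  // and by default pages through every batch so `data` holds all crawled pages.
  const job = await client.web.crawl.startAndWait({
    url: "https://example.com",
    crawlOptions: { maxPages: 10, followLinks: true },
  });
  console.log(`status=${job.status}, pages=${job.totalPages}`);
}

main().catch(console.error);
```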
package/dist/services/web/crawl.js ADDED
@@ -0,0 +1,170 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.WebCrawlService = void 0;
+ const zod_to_json_schema_1 = __importDefault(require("zod-to-json-schema"));
+ const zod_1 = require("zod");
+ const base_1 = require("../base");
+ const utils_1 = require("../../utils");
+ const client_1 = require("../../client");
+ const constants_1 = require("../../types/constants");
+ const utils_2 = require("../../utils");
+ class WebCrawlService extends base_1.BaseService {
+     /**
+      * Start a new web crawl job
+      * @param params The parameters for the web crawl job
+      */
+     async start(params) {
+         try {
+             if (params.outputs?.formats) {
+                 for (const output of params.outputs.formats) {
+                     if (typeof output === "object" && "type" in output && output.type === "json") {
+                         const jsonOutput = output;
+                         if (jsonOutput.schema) {
+                             if ((0, utils_2.isZodSchema)(jsonOutput.schema)) {
+                                 try {
+                                     output.schema = (0, zod_1.toJSONSchema)(jsonOutput.schema);
+                                 }
+                                 catch {
+                                     output.schema = (0, zod_to_json_schema_1.default)(jsonOutput.schema);
+                                 }
+                             }
+                         }
+                     }
+                 }
+             }
+             return await this.request("/web/crawl", {
+                 method: "POST",
+                 body: JSON.stringify(params),
+             });
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError("Failed to start web crawl job", undefined);
+         }
+     }
+     /**
+      * Get the status of a web crawl job
+      * @param id The ID of the web crawl job to get
+      */
+     async getStatus(id) {
+         try {
+             return await this.request(`/web/crawl/${id}/status`);
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError(`Failed to get web crawl job ${id} status`, undefined);
+         }
+     }
+     /**
+      * Get the details of a web crawl job
+      * @param id The ID of the web crawl job to get
+      * @param params Optional parameters to filter the web crawl job
+      */
+     async get(id, params) {
+         try {
+             return await this.request(`/web/crawl/${id}`, undefined, {
+                 page: params?.page,
+                 batchSize: params?.batchSize,
+             });
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError(`Failed to get web crawl job ${id}`, undefined);
+         }
+     }
+     /**
+      * Start a web crawl job and wait for it to complete
+      * @param params The parameters for the web crawl job
+      * @param returnAllPages Whether to return all pages in the web crawl job response
+      */
+     async startAndWait(params, returnAllPages = true) {
+         const job = await this.start(params);
+         const jobId = job.jobId;
+         if (!jobId) {
+             throw new client_1.HyperbrowserError("Failed to start web crawl job, could not get job ID");
+         }
+         let failures = 0;
+         let jobStatus = "pending";
+         while (true) {
+             try {
+                 const { status } = await this.getStatus(jobId);
+                 if (status === "completed" || status === "failed") {
+                     jobStatus = status;
+                     break;
+                 }
+                 failures = 0;
+             }
+             catch (error) {
+                 failures++;
+                 if (failures >= constants_1.POLLING_ATTEMPTS) {
+                     throw new client_1.HyperbrowserError(`Failed to poll web crawl job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                 }
+             }
+             await (0, utils_1.sleep)(2000);
+         }
+         failures = 0;
+         if (!returnAllPages) {
+             while (true) {
+                 try {
+                     return await this.get(jobId);
+                 }
+                 catch (error) {
+                     failures++;
+                     if (failures >= constants_1.POLLING_ATTEMPTS) {
+                         throw new client_1.HyperbrowserError(`Failed to get web crawl job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                     }
+                 }
+                 await (0, utils_1.sleep)(500);
+             }
+         }
+         failures = 0;
+         const jobResponse = {
+             jobId,
+             status: jobStatus,
+             data: [],
+             currentPageBatch: 0,
+             totalPageBatches: 0,
+             totalPages: 0,
+             batchSize: 100,
+         };
+         let firstCheck = true;
+         while (firstCheck || jobResponse.currentPageBatch < jobResponse.totalPageBatches) {
+             try {
+                 const tmpJobResponse = await this.get(jobId, {
+                     page: jobResponse.currentPageBatch + 1,
+                     batchSize: 100,
+                 });
+                 if (tmpJobResponse.data) {
+                     jobResponse.data?.push(...tmpJobResponse.data);
+                 }
+                 if (tmpJobResponse.error) {
+                     jobResponse.error = tmpJobResponse.error;
+                 }
+                 jobResponse.currentPageBatch = tmpJobResponse.currentPageBatch;
+                 jobResponse.totalPages = tmpJobResponse.totalPages;
+                 jobResponse.totalPageBatches = tmpJobResponse.totalPageBatches;
+                 jobResponse.batchSize = tmpJobResponse.batchSize;
+                 failures = 0;
+                 firstCheck = false;
+             }
+             catch (error) {
+                 failures++;
+                 if (failures >= constants_1.POLLING_ATTEMPTS) {
+                     throw new client_1.HyperbrowserError(`Failed to get web crawl page ${jobResponse.currentPageBatch + 1} for job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                 }
+             }
+             await (0, utils_1.sleep)(500);
+         }
+         return jobResponse;
+     }
+ }
+ exports.WebCrawlService = WebCrawlService;
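Two implementation details are worth calling out. First, `start()` normalizes schemas: when an output format is an object of shape `{ type: "json", schema }` and the schema is a Zod schema, it is converted to JSON Schema via Zod v4's `toJSONSchema`, falling back to the `zod-to-json-schema` package, so the API receives plain JSON Schema rather than a Zod object. A sketch of params that would take this path (the format shape is inferred from the check above; `FetchOutputOptions` is defined elsewhere):

```typescript
import { z } from "zod";

// Hypothetical params; only the { type: "json", schema } shape is confirmed
// by the check in start() above.
const params = {
  url: "https://docs.example.com",
  outputs: {
    formats: [
      {
        type: "json",
        // Converted to JSON Schema before the POST to /web/crawl.
        schema: z.object({ title: z.string(), links: z.array(z.string()) }),
      },
    ],
  },
};
```

Second, `startAndWait()` polls `getStatus` every 2 s until the job completes or fails, then (unless `returnAllPages` is false) fetches batches of 100 pages every 500 ms and accumulates them into a single `WebCrawlJobResponse`; transient errors are tolerated up to `POLLING_ATTEMPTS` consecutive failures.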
package/dist/services/web/index.d.ts CHANGED
@@ -2,8 +2,10 @@ import { BaseService } from "../base";
  import { FetchParams, FetchResponse } from "../../types/web/fetch";
  import { WebSearchParams, WebSearchResponse } from "../../types/web/search";
  import { BatchFetchService } from "./batch-fetch";
+ import { WebCrawlService } from "./crawl";
  export declare class WebService extends BaseService {
      readonly batchFetch: BatchFetchService;
+     readonly crawl: WebCrawlService;
      constructor(apiKey: string, baseUrl: string, timeout: number);
      /**
       * Fetch a URL and extract content
package/dist/services/web/index.js CHANGED
@@ -10,10 +10,12 @@ const base_1 = require("../base");
  const client_1 = require("../../client");
  const utils_1 = require("../../utils");
  const batch_fetch_1 = require("./batch-fetch");
+ const crawl_1 = require("./crawl");
  class WebService extends base_1.BaseService {
      constructor(apiKey, baseUrl, timeout) {
          super(apiKey, baseUrl, timeout);
          this.batchFetch = new batch_fetch_1.BatchFetchService(apiKey, baseUrl, timeout);
+         this.crawl = new crawl_1.WebCrawlService(apiKey, baseUrl, timeout);
      }
      /**
       * Fetch a URL and extract content
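With this wiring each `WebService` builds one `WebCrawlService` with the same apiKey/baseUrl/timeout, so the lower-level methods sit alongside `startAndWait`. Under the same `client.web` assumption as above:

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";

// Assumed access paths; this diff only shows WebService gaining `crawl`.
async function inspect(client: Hyperbrowser) {
  const { jobId } = await client.web.crawl.start({ url: "https://example.com" });
  const { status } = await client.web.crawl.getStatus(jobId); // lightweight poll
  const batch = await client.web.crawl.get(jobId, { page: 1 }); // one page batch
  console.log(status, batch.totalPageBatches);
}
```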
package/dist/types/web/crawl.d.ts ADDED
@@ -0,0 +1,38 @@
+ import { FetchStealthMode, FetchOutputOptions, FetchBrowserOptions, FetchNavigationOptions, FetchCacheOptions, PageData } from "./common";
+ export type WebCrawlJobStatus = "pending" | "running" | "completed" | "failed";
+ export interface WebCrawlOptions {
+     maxPages?: number;
+     ignoreSitemap?: boolean;
+     followLinks?: boolean;
+     excludePatterns?: string[];
+     includePatterns?: string[];
+ }
+ export interface StartWebCrawlJobParams {
+     url: string;
+     stealth?: FetchStealthMode;
+     outputs?: FetchOutputOptions;
+     browser?: FetchBrowserOptions;
+     navigation?: FetchNavigationOptions;
+     cache?: FetchCacheOptions;
+     crawlOptions?: WebCrawlOptions;
+ }
+ export interface GetWebCrawlJobParams {
+     page?: number;
+     batchSize?: number;
+ }
+ export interface StartWebCrawlJobResponse {
+     jobId: string;
+ }
+ export interface WebCrawlJobStatusResponse {
+     status: WebCrawlJobStatus;
+ }
+ export interface WebCrawlJobResponse {
+     jobId: string;
+     status: WebCrawlJobStatus;
+     data?: PageData[];
+     error?: string;
+     totalPages: number;
+     totalPageBatches: number;
+     currentPageBatch: number;
+     batchSize: number;
+ }
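`GetWebCrawlJobParams` drives the batching that `startAndWait` uses internally: results come back `batchSize` pages at a time, addressed by a 1-based `page` index, with `currentPageBatch`/`totalPageBatches` acting as cursors. A sketch of paging through a finished job manually, under the same assumptions as the earlier examples (the type import path is assumed):

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";
import type { PageData } from "@hyperbrowser/sdk/types"; // import path assumed

async function collectAllPages(client: Hyperbrowser, jobId: string): Promise<PageData[]> {
  const pages: PageData[] = [];
  let page = 1;
  while (true) {
    // Each call returns one batch plus cursors describing overall progress.
    const res = await client.web.crawl.get(jobId, { page, batchSize: 100 });
    if (res.data) pages.push(...res.data);
    if (res.currentPageBatch >= res.totalPageBatches) break;
    page = res.currentPageBatch + 1;
  }
  return pages;
}
```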
package/dist/types/web/crawl.js ADDED
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@hyperbrowser/sdk",
-   "version": "0.82.3",
+   "version": "0.83.0",
    "description": "Node SDK for Hyperbrowser API",
    "author": "",
    "main": "dist/index.js",