@hyperbrowser/sdk 0.82.3 → 0.83.0

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
package/dist/services/web/crawl.d.ts ADDED
@@ -0,0 +1,26 @@
+ import { BaseService } from "../base";
+ import { GetWebCrawlJobParams, StartWebCrawlJobParams, StartWebCrawlJobResponse, WebCrawlJobResponse, WebCrawlJobStatusResponse } from "../../types/web/crawl";
+ export declare class WebCrawlService extends BaseService {
+     /**
+      * Start a new web crawl job
+      * @param params The parameters for the web crawl job
+      */
+     start(params: StartWebCrawlJobParams): Promise<StartWebCrawlJobResponse>;
+     /**
+      * Get the status of a web crawl job
+      * @param id The ID of the web crawl job to get
+      */
+     getStatus(id: string): Promise<WebCrawlJobStatusResponse>;
+     /**
+      * Get the details of a web crawl job
+      * @param id The ID of the web crawl job to get
+      * @param params Optional parameters to filter the web crawl job
+      */
+     get(id: string, params?: GetWebCrawlJobParams): Promise<WebCrawlJobResponse>;
+     /**
+      * Start a web crawl job and wait for it to complete
+      * @param params The parameters for the web crawl job
+      * @param returnAllPages Whether to return all pages in the web crawl job response
+      */
+     startAndWait(params: StartWebCrawlJobParams, returnAllPages?: boolean): Promise<WebCrawlJobResponse>;
+ }
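The declaration file above is the full public surface of the new crawl service: start a job, poll its status, fetch results in batches, or do all three in one call. A minimal usage sketch, assuming the top-level `Hyperbrowser` client exposes `WebService` as `client.web` (the client-level wiring is not part of this diff):

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";

const client = new Hyperbrowser({ apiKey: process.env.HYPERBROWSER_API_KEY });

async function main() {
  // startAndWait() starts the job, polls until it reaches a terminal state,
  // and by default pages through every batch so `data` holds all crawled pages.
  const job = await client.web.crawl.startAndWait({
    url: "https://example.com",
    crawlOptions: { maxPages: 10, followLinks: true },
  });
  console.log(`status=${job.status}, pages=${job.totalPages}`);
}

main().catch(console.error);
```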
package/dist/services/web/crawl.js ADDED
@@ -0,0 +1,170 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+     return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.WebCrawlService = void 0;
+ const zod_to_json_schema_1 = __importDefault(require("zod-to-json-schema"));
+ const zod_1 = require("zod");
+ const base_1 = require("../base");
+ const utils_1 = require("../../utils");
+ const client_1 = require("../../client");
+ const constants_1 = require("../../types/constants");
+ const utils_2 = require("../../utils");
+ class WebCrawlService extends base_1.BaseService {
+     /**
+      * Start a new web crawl job
+      * @param params The parameters for the web crawl job
+      */
+     async start(params) {
+         try {
+             if (params.outputs?.formats) {
+                 for (const output of params.outputs.formats) {
+                     if (typeof output === "object" && "type" in output && output.type === "json") {
+                         const jsonOutput = output;
+                         if (jsonOutput.schema) {
+                             if ((0, utils_2.isZodSchema)(jsonOutput.schema)) {
+                                 try {
+                                     output.schema = (0, zod_1.toJSONSchema)(jsonOutput.schema);
+                                 }
+                                 catch {
+                                     output.schema = (0, zod_to_json_schema_1.default)(jsonOutput.schema);
+                                 }
+                             }
+                         }
+                     }
+                 }
+             }
+             return await this.request("/web/crawl", {
+                 method: "POST",
+                 body: JSON.stringify(params),
+             });
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError("Failed to start web crawl job", undefined);
+         }
+     }
+     /**
+      * Get the status of a web crawl job
+      * @param id The ID of the web crawl job to get
+      */
+     async getStatus(id) {
+         try {
+             return await this.request(`/web/crawl/${id}/status`);
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError(`Failed to get web crawl job ${id} status`, undefined);
+         }
+     }
+     /**
+      * Get the details of a web crawl job
+      * @param id The ID of the web crawl job to get
+      * @param params Optional parameters to filter the web crawl job
+      */
+     async get(id, params) {
+         try {
+             return await this.request(`/web/crawl/${id}`, undefined, {
+                 page: params?.page,
+                 batchSize: params?.batchSize,
+             });
+         }
+         catch (error) {
+             if (error instanceof client_1.HyperbrowserError) {
+                 throw error;
+             }
+             throw new client_1.HyperbrowserError(`Failed to get web crawl job ${id}`, undefined);
+         }
+     }
+     /**
+      * Start a web crawl job and wait for it to complete
+      * @param params The parameters for the web crawl job
+      * @param returnAllPages Whether to return all pages in the web crawl job response
+      */
+     async startAndWait(params, returnAllPages = true) {
+         const job = await this.start(params);
+         const jobId = job.jobId;
+         if (!jobId) {
+             throw new client_1.HyperbrowserError("Failed to start web crawl job, could not get job ID");
+         }
+         let failures = 0;
+         let jobStatus = "pending";
+         while (true) {
+             try {
+                 const { status } = await this.getStatus(jobId);
+                 if (status === "completed" || status === "failed") {
+                     jobStatus = status;
+                     break;
+                 }
+                 failures = 0;
+             }
+             catch (error) {
+                 failures++;
+                 if (failures >= constants_1.POLLING_ATTEMPTS) {
+                     throw new client_1.HyperbrowserError(`Failed to poll web crawl job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                 }
+             }
+             await (0, utils_1.sleep)(2000);
+         }
+         failures = 0;
+         if (!returnAllPages) {
+             while (true) {
+                 try {
+                     return await this.get(jobId);
+                 }
+                 catch (error) {
+                     failures++;
+                     if (failures >= constants_1.POLLING_ATTEMPTS) {
+                         throw new client_1.HyperbrowserError(`Failed to get web crawl job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                     }
+                 }
+                 await (0, utils_1.sleep)(500);
+             }
+         }
+         failures = 0;
+         const jobResponse = {
+             jobId,
+             status: jobStatus,
+             data: [],
+             currentPageBatch: 0,
+             totalPageBatches: 0,
+             totalPages: 0,
+             batchSize: 100,
+         };
+         let firstCheck = true;
+         while (firstCheck || jobResponse.currentPageBatch < jobResponse.totalPageBatches) {
+             try {
+                 const tmpJobResponse = await this.get(jobId, {
+                     page: jobResponse.currentPageBatch + 1,
+                     batchSize: 100,
+                 });
+                 if (tmpJobResponse.data) {
+                     jobResponse.data?.push(...tmpJobResponse.data);
+                 }
+                 if (tmpJobResponse.error) {
+                     jobResponse.error = tmpJobResponse.error;
+                 }
+                 jobResponse.currentPageBatch = tmpJobResponse.currentPageBatch;
+                 jobResponse.totalPages = tmpJobResponse.totalPages;
+                 jobResponse.totalPageBatches = tmpJobResponse.totalPageBatches;
+                 jobResponse.batchSize = tmpJobResponse.batchSize;
+                 failures = 0;
+                 firstCheck = false;
+             }
+             catch (error) {
+                 failures++;
+                 if (failures >= constants_1.POLLING_ATTEMPTS) {
+                     throw new client_1.HyperbrowserError(`Failed to get web crawl page ${jobResponse.currentPageBatch + 1} for job ${jobId} after ${constants_1.POLLING_ATTEMPTS} attempts: ${error}`);
+                 }
+             }
+             await (0, utils_1.sleep)(500);
+         }
+         return jobResponse;
+     }
+ }
+ exports.WebCrawlService = WebCrawlService;
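Two implementation details are worth calling out. First, `start()` normalizes schemas: when an output format is an object of shape `{ type: "json", schema }` and the schema is a Zod schema, it is converted to JSON Schema via Zod v4's `toJSONSchema`, falling back to the `zod-to-json-schema` package, so the API receives plain JSON Schema rather than a Zod object. A sketch of params that would take this path (the format shape is inferred from the check above; `FetchOutputOptions` is defined elsewhere):

```typescript
import { z } from "zod";

// Hypothetical params; only the { type: "json", schema } shape is confirmed
// by the check in start() above.
const params = {
  url: "https://docs.example.com",
  outputs: {
    formats: [
      {
        type: "json",
        // Converted to JSON Schema before the POST to /web/crawl.
        schema: z.object({ title: z.string(), links: z.array(z.string()) }),
      },
    ],
  },
};
```

Second, `startAndWait()` polls `getStatus` every 2 s until the job completes or fails, then (unless `returnAllPages` is false) fetches batches of 100 pages every 500 ms and accumulates them into a single `WebCrawlJobResponse`; transient errors are tolerated up to `POLLING_ATTEMPTS` consecutive failures.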
package/dist/services/web/index.d.ts CHANGED
@@ -2,8 +2,10 @@ import { BaseService } from "../base";
  import { FetchParams, FetchResponse } from "../../types/web/fetch";
  import { WebSearchParams, WebSearchResponse } from "../../types/web/search";
  import { BatchFetchService } from "./batch-fetch";
+ import { WebCrawlService } from "./crawl";
  export declare class WebService extends BaseService {
      readonly batchFetch: BatchFetchService;
+     readonly crawl: WebCrawlService;
      constructor(apiKey: string, baseUrl: string, timeout: number);
      /**
       * Fetch a URL and extract content
package/dist/services/web/index.js CHANGED
@@ -10,10 +10,12 @@ const base_1 = require("../base");
  const client_1 = require("../../client");
  const utils_1 = require("../../utils");
  const batch_fetch_1 = require("./batch-fetch");
+ const crawl_1 = require("./crawl");
  class WebService extends base_1.BaseService {
      constructor(apiKey, baseUrl, timeout) {
          super(apiKey, baseUrl, timeout);
          this.batchFetch = new batch_fetch_1.BatchFetchService(apiKey, baseUrl, timeout);
+         this.crawl = new crawl_1.WebCrawlService(apiKey, baseUrl, timeout);
      }
      /**
       * Fetch a URL and extract content
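With this wiring each `WebService` builds one `WebCrawlService` with the same apiKey/baseUrl/timeout, so the lower-level methods sit alongside `startAndWait`. Under the same `client.web` assumption as above:

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";

// Assumed access paths; this diff only shows WebService gaining `crawl`.
async function inspect(client: Hyperbrowser) {
  const { jobId } = await client.web.crawl.start({ url: "https://example.com" });
  const { status } = await client.web.crawl.getStatus(jobId); // lightweight poll
  const batch = await client.web.crawl.get(jobId, { page: 1 }); // one page batch
  console.log(status, batch.totalPageBatches);
}
```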
package/dist/types/web/crawl.d.ts ADDED
@@ -0,0 +1,38 @@
+ import { FetchStealthMode, FetchOutputOptions, FetchBrowserOptions, FetchNavigationOptions, FetchCacheOptions, PageData } from "./common";
+ export type WebCrawlJobStatus = "pending" | "running" | "completed" | "failed";
+ export interface WebCrawlOptions {
+     maxPages?: number;
+     ignoreSitemap?: boolean;
+     followLinks?: boolean;
+     excludePatterns?: string[];
+     includePatterns?: string[];
+ }
+ export interface StartWebCrawlJobParams {
+     url: string;
+     stealth?: FetchStealthMode;
+     outputs?: FetchOutputOptions;
+     browser?: FetchBrowserOptions;
+     navigation?: FetchNavigationOptions;
+     cache?: FetchCacheOptions;
+     crawlOptions?: WebCrawlOptions;
+ }
+ export interface GetWebCrawlJobParams {
+     page?: number;
+     batchSize?: number;
+ }
+ export interface StartWebCrawlJobResponse {
+     jobId: string;
+ }
+ export interface WebCrawlJobStatusResponse {
+     status: WebCrawlJobStatus;
+ }
+ export interface WebCrawlJobResponse {
+     jobId: string;
+     status: WebCrawlJobStatus;
+     data?: PageData[];
+     error?: string;
+     totalPages: number;
+     totalPageBatches: number;
+     currentPageBatch: number;
+     batchSize: number;
+ }
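`GetWebCrawlJobParams` drives the batching that `startAndWait` uses internally: results come back `batchSize` pages at a time, addressed by a 1-based `page` index, with `currentPageBatch`/`totalPageBatches` acting as cursors. A sketch of paging through a finished job manually, under the same assumptions as the earlier examples (the type import path is assumed):

```typescript
import { Hyperbrowser } from "@hyperbrowser/sdk";
import type { PageData } from "@hyperbrowser/sdk/types"; // import path assumed

async function collectAllPages(client: Hyperbrowser, jobId: string): Promise<PageData[]> {
  const pages: PageData[] = [];
  let page = 1;
  while (true) {
    // Each call returns one batch plus cursors describing overall progress.
    const res = await client.web.crawl.get(jobId, { page, batchSize: 100 });
    if (res.data) pages.push(...res.data);
    if (res.currentPageBatch >= res.totalPageBatches) break;
    page = res.currentPageBatch + 1;
  }
  return pages;
}
```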
package/dist/types/web/crawl.js ADDED
@@ -0,0 +1,2 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@hyperbrowser/sdk",
-   "version": "0.82.3",
+   "version": "0.83.0",
    "description": "Node SDK for Hyperbrowser API",
    "author": "",
    "main": "dist/index.js",