@upcrawl/sdk 1.3.1 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +214 -65
- package/dist/index.d.mts +355 -225
- package/dist/index.d.ts +355 -225
- package/dist/index.js +311 -0
- package/dist/index.mjs +290 -0
- package/package.json +9 -4
package/dist/index.js
CHANGED
|
@@ -30,7 +30,28 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
|
+
BatchScrapeOptionsSchema: () => BatchScrapeOptionsSchema,
|
|
34
|
+
BatchScrapeResponseSchema: () => BatchScrapeResponseSchema,
|
|
35
|
+
Browser: () => Browser,
|
|
36
|
+
BrowserSessionSchema: () => BrowserSessionSchema,
|
|
37
|
+
CreateBrowserSessionOptionsSchema: () => CreateBrowserSessionOptionsSchema,
|
|
38
|
+
ExecuteCodeOptionsSchema: () => ExecuteCodeOptionsSchema,
|
|
39
|
+
ExecuteCodeResponseSchema: () => ExecuteCodeResponseSchema,
|
|
40
|
+
GeneratePdfFromUrlOptionsSchema: () => GeneratePdfFromUrlOptionsSchema,
|
|
41
|
+
GeneratePdfOptionsSchema: () => GeneratePdfOptionsSchema,
|
|
42
|
+
PdfMarginSchema: () => PdfMarginSchema,
|
|
43
|
+
PdfResponseSchema: () => PdfResponseSchema,
|
|
44
|
+
ScrapeMetadataSchema: () => ScrapeMetadataSchema,
|
|
45
|
+
ScrapeOptionsSchema: () => ScrapeOptionsSchema,
|
|
46
|
+
ScrapeResponseSchema: () => ScrapeResponseSchema,
|
|
47
|
+
SearchOptionsSchema: () => SearchOptionsSchema,
|
|
48
|
+
SearchResponseSchema: () => SearchResponseSchema,
|
|
49
|
+
SearchResultItemSchema: () => SearchResultItemSchema,
|
|
50
|
+
SearchResultWebSchema: () => SearchResultWebSchema,
|
|
51
|
+
SummaryQuerySchema: () => SummaryQuerySchema,
|
|
52
|
+
UpcrawlConfigSchema: () => UpcrawlConfigSchema,
|
|
33
53
|
UpcrawlError: () => UpcrawlError,
|
|
54
|
+
UpcrawlErrorResponseSchema: () => UpcrawlErrorResponseSchema,
|
|
34
55
|
batchScrape: () => batchScrape,
|
|
35
56
|
configure: () => configure,
|
|
36
57
|
default: () => index_default,
|
|
@@ -48,6 +69,146 @@ __export(index_exports, {
|
|
|
48
69
|
module.exports = __toCommonJS(index_exports);
|
|
49
70
|
|
|
50
71
|
// src/types.ts
|
|
72
|
+
var import_zod = require("zod");
|
|
73
|
+
// Zod schemas for the SDK's public request/response shapes.
// NOTE: numeric ranges mentioned in the `describe()` texts (e.g. "1000-120000")
// are informational only — the schemas below check the type, not the range.

/** Client configuration; every field is optional. */
var UpcrawlConfigSchema = import_zod.z.object({
  apiKey: import_zod.z.string().optional().describe("Your Upcrawl API key"),
  baseUrl: import_zod.z.string().optional().describe("Custom API base URL"),
  timeout: import_zod.z.number().optional().describe("Request timeout in milliseconds")
});

/** Summarization instruction attached to a scrape request. */
var SummaryQuerySchema = import_zod.z.object({
  query: import_zod.z.string().describe("Query/instruction for content summarization")
});

/** Options for scraping a single URL; only `url` is required and it must parse as a URL. */
var ScrapeOptionsSchema = import_zod.z.object({
  url: import_zod.z.string().url().describe("URL to scrape (required)"),
  type: import_zod.z.enum(["html", "markdown"]).optional().describe('Output format: html or markdown. Defaults to "html"'),
  onlyMainContent: import_zod.z.boolean().optional().describe("Extract only main content (removes nav, ads, footers). Defaults to true"),
  extractMetadata: import_zod.z.boolean().optional().describe("Whether to extract page metadata"),
  summary: SummaryQuerySchema.optional().describe("Summary query for LLM summarization"),
  timeoutMs: import_zod.z.number().optional().describe("Custom timeout in milliseconds (1000-120000)"),
  waitUntil: import_zod.z.enum(["load", "domcontentloaded", "networkidle"]).optional().describe("Wait strategy for page load")
});

/** Page metadata attached to a scrape result; every field is optional. */
var ScrapeMetadataSchema = import_zod.z.object({
  title: import_zod.z.string().optional(),
  description: import_zod.z.string().optional(),
  canonicalUrl: import_zod.z.string().optional(),
  finalUrl: import_zod.z.string().optional(),
  contentType: import_zod.z.string().optional(),
  contentLength: import_zod.z.number().optional()
});

/** Result of scraping one URL. `html`, `markdown` and `content` may be absent or null. */
var ScrapeResponseSchema = import_zod.z.object({
  url: import_zod.z.string().describe("Original URL that was scraped"),
  html: import_zod.z.string().nullable().optional().describe("Rendered HTML content (when type is html)"),
  markdown: import_zod.z.string().nullable().optional().describe("Content converted to Markdown (when type is markdown)"),
  statusCode: import_zod.z.number().nullable().describe("HTTP status code"),
  success: import_zod.z.boolean().describe("Whether scraping was successful"),
  error: import_zod.z.string().optional().describe("Error message if scraping failed"),
  timestamp: import_zod.z.string().describe("ISO timestamp when scraping completed"),
  loadTimeMs: import_zod.z.number().describe("Time taken to load and render the page in milliseconds"),
  metadata: ScrapeMetadataSchema.optional().describe("Additional page metadata"),
  retryCount: import_zod.z.number().describe("Number of retry attempts made"),
  cost: import_zod.z.number().optional().describe("Cost in USD for this scrape operation"),
  content: import_zod.z.string().nullable().optional().describe("Content after summarization (when summary query provided)")
});

/** Options for a batch scrape; each `urls` entry is a plain string or a full ScrapeOptions object. */
var BatchScrapeOptionsSchema = import_zod.z.object({
  urls: import_zod.z.array(import_zod.z.union([import_zod.z.string(), ScrapeOptionsSchema])).describe("Array of URLs to scrape (strings or detailed request objects)"),
  type: import_zod.z.enum(["html", "markdown"]).optional().describe("Output format: html or markdown"),
  onlyMainContent: import_zod.z.boolean().optional().describe("Extract only main content (removes nav, ads, footers)"),
  summary: SummaryQuerySchema.optional().describe("Summary query for LLM summarization"),
  batchTimeoutMs: import_zod.z.number().optional().describe("Global timeout for entire batch operation in milliseconds (10000-600000)"),
  failFast: import_zod.z.boolean().optional().describe("Whether to stop on first error")
});

/** Aggregate result of a batch scrape: per-URL results plus counters and timing. */
var BatchScrapeResponseSchema = import_zod.z.object({
  results: import_zod.z.array(ScrapeResponseSchema).describe("Array of scrape results"),
  total: import_zod.z.number().describe("Total number of URLs processed"),
  successful: import_zod.z.number().describe("Number of successful scrapes"),
  failed: import_zod.z.number().describe("Number of failed scrapes"),
  totalTimeMs: import_zod.z.number().describe("Total time taken for batch operation in milliseconds"),
  timestamp: import_zod.z.string().describe("Timestamp when batch operation completed"),
  cost: import_zod.z.number().optional().describe("Total cost in USD for all scrape operations")
});

/** Options for a search request; only `queries` is required. */
var SearchOptionsSchema = import_zod.z.object({
  queries: import_zod.z.array(import_zod.z.string()).describe("Array of search queries to execute (1-20)"),
  limit: import_zod.z.number().optional().describe("Number of results per query (1-100). Defaults to 10"),
  location: import_zod.z.string().optional().describe('Location for search (e.g., "IN", "US")'),
  includeDomains: import_zod.z.array(import_zod.z.string()).optional().describe("Domains to include (will add site: to query)"),
  excludeDomains: import_zod.z.array(import_zod.z.string()).optional().describe("Domains to exclude (will add -site: to query)")
});

/** One web hit inside a search result. */
var SearchResultWebSchema = import_zod.z.object({
  url: import_zod.z.string().describe("URL of the search result"),
  title: import_zod.z.string().describe("Title of the search result"),
  description: import_zod.z.string().describe("Description/snippet of the search result")
});

/** Outcome of a single search query. */
var SearchResultItemSchema = import_zod.z.object({
  query: import_zod.z.string().describe("The search query"),
  success: import_zod.z.boolean().describe("Whether the search was successful"),
  results: import_zod.z.array(SearchResultWebSchema).describe("Parsed search result links"),
  error: import_zod.z.string().optional().describe("Error message if failed"),
  loadTimeMs: import_zod.z.number().optional().describe("Time taken in milliseconds"),
  cost: import_zod.z.number().optional().describe("Cost in USD for this query")
});

/** Aggregate result of a search request: per-query outcomes plus counters and timing. */
var SearchResponseSchema = import_zod.z.object({
  results: import_zod.z.array(SearchResultItemSchema).describe("Array of search results per query"),
  total: import_zod.z.number().describe("Total number of queries"),
  successful: import_zod.z.number().describe("Number of successful searches"),
  failed: import_zod.z.number().describe("Number of failed searches"),
  totalTimeMs: import_zod.z.number().describe("Total time in milliseconds"),
  timestamp: import_zod.z.string().describe("ISO timestamp"),
  cost: import_zod.z.number().optional().describe("Total cost in USD")
});

/** PDF page margins; each side is an optional string. */
var PdfMarginSchema = import_zod.z.object({
  top: import_zod.z.string().optional(),
  right: import_zod.z.string().optional(),
  bottom: import_zod.z.string().optional(),
  left: import_zod.z.string().optional()
});

/** Options for generating a PDF from an HTML string; only `html` is required. */
var GeneratePdfOptionsSchema = import_zod.z.object({
  html: import_zod.z.string().describe("Complete HTML content to convert to PDF (required)"),
  title: import_zod.z.string().optional().describe("Title used for the exported filename"),
  pageSize: import_zod.z.enum(["A4", "Letter", "Legal"]).optional().describe('Page size. Defaults to "A4"'),
  landscape: import_zod.z.boolean().optional().describe("Landscape orientation. Defaults to false"),
  margin: PdfMarginSchema.optional().describe('Page margins (e.g., { top: "20mm", right: "20mm", bottom: "20mm", left: "20mm" })'),
  printBackground: import_zod.z.boolean().optional().describe("Print background graphics and colors. Defaults to true"),
  skipChartWait: import_zod.z.boolean().optional().describe("Skip waiting for chart rendering signal. Defaults to false"),
  timeoutMs: import_zod.z.number().optional().describe("Timeout in milliseconds (5000-120000). Defaults to 30000")
});

/** Options for generating a PDF from a URL; only `url` is required and it must parse as a URL. */
var GeneratePdfFromUrlOptionsSchema = import_zod.z.object({
  url: import_zod.z.string().url().describe("URL to navigate to and convert to PDF (required)"),
  title: import_zod.z.string().optional().describe("Title used for the exported filename"),
  pageSize: import_zod.z.enum(["A4", "Letter", "Legal"]).optional().describe('Page size. Defaults to "A4"'),
  landscape: import_zod.z.boolean().optional().describe("Landscape orientation. Defaults to false"),
  margin: PdfMarginSchema.optional().describe("Page margins"),
  printBackground: import_zod.z.boolean().optional().describe("Print background graphics and colors. Defaults to true"),
  timeoutMs: import_zod.z.number().optional().describe("Timeout in milliseconds (5000-120000). Defaults to 30000")
});

/** Result of a PDF generation request; `success` and `durationMs` are always present. */
var PdfResponseSchema = import_zod.z.object({
  success: import_zod.z.boolean().describe("Whether PDF generation succeeded"),
  url: import_zod.z.string().optional().describe("Public URL of the generated PDF"),
  filename: import_zod.z.string().optional().describe("Generated filename"),
  blobName: import_zod.z.string().optional().describe("Blob storage path"),
  error: import_zod.z.string().optional().describe("Error message on failure"),
  durationMs: import_zod.z.number().describe("Total time taken in milliseconds")
});

/** Options for remote code execution; `language` currently admits only "python". */
var ExecuteCodeOptionsSchema = import_zod.z.object({
  code: import_zod.z.string().describe("Code to execute (required)"),
  language: import_zod.z.enum(["python"]).optional().describe('Language runtime. Defaults to "python"')
});

/** Result of a code execution request. */
var ExecuteCodeResponseSchema = import_zod.z.object({
  stdout: import_zod.z.string().describe("Standard output from the executed code"),
  stderr: import_zod.z.string().describe("Standard error from the executed code"),
  exitCode: import_zod.z.number().describe("Process exit code (0 = success, 124 = timeout)"),
  executionTimeMs: import_zod.z.number().describe("Execution time in milliseconds"),
  timedOut: import_zod.z.boolean().describe("Whether execution was killed due to timeout"),
  memoryUsageMb: import_zod.z.number().optional().describe("Peak memory usage in megabytes"),
  error: import_zod.z.string().optional().describe("Error message if execution infrastructure failed"),
  cost: import_zod.z.number().optional().describe("Cost in USD for this execution")
});

/** Error payload shape: a nested `{ code, message }` object plus an optional status code. */
var UpcrawlErrorResponseSchema = import_zod.z.object({
  error: import_zod.z.object({
    code: import_zod.z.string(),
    message: import_zod.z.string()
  }),
  statusCode: import_zod.z.number().optional()
});
|
|
51
212
|
var UpcrawlError = class extends Error {
|
|
52
213
|
constructor(message, status, code = "UNKNOWN_ERROR") {
|
|
53
214
|
super(message);
|
|
@@ -56,6 +217,118 @@ var UpcrawlError = class extends Error {
|
|
|
56
217
|
this.code = code;
|
|
57
218
|
}
|
|
58
219
|
};
|
|
220
|
+
/**
 * Options accepted when creating a browser session; every field is optional.
 * The ranges in the descriptions are informational — only the types are checked.
 */
var CreateBrowserSessionOptionsSchema = import_zod.z.object({
  width: import_zod.z.number().optional().describe("Browser viewport width (800-3840). Defaults to 1280"),
  height: import_zod.z.number().optional().describe("Browser viewport height (600-2160). Defaults to 720"),
  headless: import_zod.z.boolean().optional().describe("Run browser in headless mode. Defaults to true")
});

/**
 * Shape of a created browser session.
 * `vncUrl` is nullable (string or null) but not optional; `createdAt` is a Date
 * instance, not a string.
 */
var BrowserSessionSchema = import_zod.z.object({
  sessionId: import_zod.z.string().describe("Unique session identifier"),
  wsEndpoint: import_zod.z.string().describe("WebSocket URL for connecting with Playwright/Puppeteer"),
  vncUrl: import_zod.z.string().nullable().describe("VNC URL for viewing the browser (if available)"),
  affinityCookie: import_zod.z.string().optional().describe("Affinity cookie for sticky session routing (format: SCRAPER_AFFINITY=xxx) - extracted from response headers"),
  createdAt: import_zod.z.date().describe("Session creation timestamp"),
  width: import_zod.z.number().describe("Browser viewport width"),
  height: import_zod.z.number().describe("Browser viewport height")
});
|
|
234
|
+
|
|
235
|
+
// src/browser.ts
|
|
236
|
+
var Browser = class {
  /**
   * @param createClient2 - Factory returning an HTTP client with axios-style
   *   `post(url, body)` and `delete(url)` methods. It is invoked once per call
   *   to `create` / `close`.
   */
  constructor(createClient2) {
    this.createClient = createClient2;
  }
  /**
   * Create a new browser session for remote control
   * @param options - Session options including viewport size and headless mode
   *   (defaults: width 1280, height 720, headless true)
   * @returns Promise with session details including WebSocket URL
   * @throws UpcrawlError when the request fails
   *
   * @example
   * ```typescript
   * const session = await Upcrawl.browser.create({
   *   width: 1280,
   *   height: 720,
   *   headless: true
   * });
   *
   * console.log(session.wsEndpoint); // WebSocket URL for Playwright
   * console.log(session.vncUrl); // VNC URL for viewing (null if not available)
   * ```
   */
  async create(options) {
    try {
      const client = this.createClient();
      const response = await client.post("/browser/session", {
        width: options?.width ?? 1280,
        height: options?.height ?? 720,
        headless: options?.headless ?? true
      });
      // `set-cookie` may be missing, an array of cookie strings, or a single
      // string; normalize to an array so the lookup below cannot throw.
      const rawSetCookie = response.headers?.["set-cookie"];
      let setCookieHeaders = [];
      if (Array.isArray(rawSetCookie)) {
        setCookieHeaders = rawSetCookie;
      } else if (rawSetCookie != null) {
        setCookieHeaders = [String(rawSetCookie)];
      }
      // Keep only the leading `name=value` pair of each cookie (drop attributes).
      const affinityCookie = setCookieHeaders.map((c) => String(c).split(";")[0].trim()).find((c) => c.startsWith("SCRAPER_AFFINITY="));
      return {
        sessionId: response.data.sessionId,
        // The API field is named `wsUrl`; it is exposed as `wsEndpoint`.
        wsEndpoint: response.data.wsUrl,
        // BrowserSessionSchema declares vncUrl as nullable (not optional), so a
        // missing value is reported as null rather than undefined.
        vncUrl: response.data.vncUrl ?? null,
        affinityCookie,
        createdAt: new Date(response.data.createdAt),
        width: response.data.width,
        height: response.data.height
      };
    } catch (error) {
      this.handleError(error);
    }
  }
  /**
   * Close a browser session
   * @param sessionId - The session ID to close
   * @returns Promise that resolves when session is closed
   * @throws UpcrawlError when the request fails
   *
   * @example
   * ```typescript
   * await Upcrawl.browser.close(session.sessionId);
   * ```
   */
  async close(sessionId) {
    try {
      const client = this.createClient();
      // Encode the id so characters such as "/", "?" or "#" cannot alter the
      // request path.
      await client.delete(`/browser/session/${encodeURIComponent(sessionId)}`);
    } catch (error) {
      this.handleError(error);
    }
  }
  /**
   * Handle API errors and convert to UpcrawlError. Always throws.
   *
   * Resolution order for errors carrying a `response`:
   *   1. `response.data.error` as a non-empty string -> used as the message
   *   2. `response.data.error` as an object -> its `message` / `code`
   *   3. well-known HTTP statuses (401, 404, 429, 503)
   *   4. the error's own `message`
   * Errors without a `response` are rethrown as-is when they already are an
   * UpcrawlError, otherwise wrapped with status 500.
   */
  handleError(error) {
    if (error && typeof error === "object" && "response" in error) {
      const axiosError = error;
      const status = axiosError.response?.status || 500;
      const apiError = axiosError.response?.data?.error;
      if (typeof apiError === "string" && apiError !== "") {
        // Plain-string error body: keep the server-provided message.
        throw new UpcrawlError(apiError, status, "UNKNOWN_ERROR");
      }
      if (apiError) {
        throw new UpcrawlError(apiError.message || "Unknown error", status, apiError.code || "UNKNOWN_ERROR");
      }
      switch (status) {
        case 401:
          throw new UpcrawlError("Invalid or missing API key", 401, "UNAUTHORIZED");
        case 404:
          throw new UpcrawlError("Session not found", 404, "NOT_FOUND");
        case 429:
          throw new UpcrawlError("Rate limit exceeded", 429, "RATE_LIMIT_EXCEEDED");
        case 503:
          throw new UpcrawlError("No browser slots available", 503, "SERVICE_UNAVAILABLE");
        default:
          throw new UpcrawlError(axiosError.message || "An unknown error occurred", status, "UNKNOWN_ERROR");
      }
    }
    if (error instanceof UpcrawlError) {
      throw error;
    }
    throw new UpcrawlError(
      error instanceof Error ? error.message : "An unknown error occurred",
      500,
      "UNKNOWN_ERROR"
    );
  }
};
|
|
59
332
|
|
|
60
333
|
// src/client.ts
|
|
61
334
|
var import_axios = __toESM(require("axios"));
|
|
@@ -279,6 +552,23 @@ var Upcrawl = {
|
|
|
279
552
|
* @returns Promise with execution response (stdout, stderr, exit code, memory usage)
|
|
280
553
|
*/
|
|
281
554
|
executeCode,
|
|
555
|
+
/**
|
|
556
|
+
* Browser session management
|
|
557
|
+
* Create and manage browser sessions for remote control with Playwright/Puppeteer
|
|
558
|
+
*
|
|
559
|
+
* @example
|
|
560
|
+
* ```typescript
|
|
561
|
+
* const session = await Upcrawl.browser.create({
|
|
562
|
+
* width: 1280,
|
|
563
|
+
* height: 720,
|
|
564
|
+
* headless: true
|
|
565
|
+
* });
|
|
566
|
+
*
|
|
567
|
+
* const browser = await playwright.connect(session.wsEndpoint);
|
|
568
|
+
* await Upcrawl.browser.close(session.sessionId);
|
|
569
|
+
* ```
|
|
570
|
+
*/
|
|
571
|
+
browser: new Browser(createClient),
|
|
282
572
|
/**
|
|
283
573
|
* Error class for Upcrawl API errors
|
|
284
574
|
*/
|
|
@@ -287,7 +577,28 @@ var Upcrawl = {
|
|
|
287
577
|
var index_default = Upcrawl;
|
|
288
578
|
// Annotate the CommonJS export names for ESM import in node:
|
|
289
579
|
0 && (module.exports = {
|
|
580
|
+
BatchScrapeOptionsSchema,
|
|
581
|
+
BatchScrapeResponseSchema,
|
|
582
|
+
Browser,
|
|
583
|
+
BrowserSessionSchema,
|
|
584
|
+
CreateBrowserSessionOptionsSchema,
|
|
585
|
+
ExecuteCodeOptionsSchema,
|
|
586
|
+
ExecuteCodeResponseSchema,
|
|
587
|
+
GeneratePdfFromUrlOptionsSchema,
|
|
588
|
+
GeneratePdfOptionsSchema,
|
|
589
|
+
PdfMarginSchema,
|
|
590
|
+
PdfResponseSchema,
|
|
591
|
+
ScrapeMetadataSchema,
|
|
592
|
+
ScrapeOptionsSchema,
|
|
593
|
+
ScrapeResponseSchema,
|
|
594
|
+
SearchOptionsSchema,
|
|
595
|
+
SearchResponseSchema,
|
|
596
|
+
SearchResultItemSchema,
|
|
597
|
+
SearchResultWebSchema,
|
|
598
|
+
SummaryQuerySchema,
|
|
599
|
+
UpcrawlConfigSchema,
|
|
290
600
|
UpcrawlError,
|
|
601
|
+
UpcrawlErrorResponseSchema,
|
|
291
602
|
batchScrape,
|
|
292
603
|
configure,
|
|
293
604
|
executeCode,
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,144 @@
|
|
|
1
1
|
// src/types.ts
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
// Zod schemas for the SDK's public request/response shapes.
// NOTE: numeric ranges mentioned in the `describe()` texts (e.g. "1000-120000")
// are informational only — the schemas below check the type, not the range.

/** Client configuration; every field is optional. */
var UpcrawlConfigSchema = z.object({
  apiKey: z.string().optional().describe("Your Upcrawl API key"),
  baseUrl: z.string().optional().describe("Custom API base URL"),
  timeout: z.number().optional().describe("Request timeout in milliseconds")
});

/** Summarization instruction attached to a scrape request. */
var SummaryQuerySchema = z.object({
  query: z.string().describe("Query/instruction for content summarization")
});

/** Options for scraping a single URL; only `url` is required and it must parse as a URL. */
var ScrapeOptionsSchema = z.object({
  url: z.string().url().describe("URL to scrape (required)"),
  type: z.enum(["html", "markdown"]).optional().describe('Output format: html or markdown. Defaults to "html"'),
  onlyMainContent: z.boolean().optional().describe("Extract only main content (removes nav, ads, footers). Defaults to true"),
  extractMetadata: z.boolean().optional().describe("Whether to extract page metadata"),
  summary: SummaryQuerySchema.optional().describe("Summary query for LLM summarization"),
  timeoutMs: z.number().optional().describe("Custom timeout in milliseconds (1000-120000)"),
  waitUntil: z.enum(["load", "domcontentloaded", "networkidle"]).optional().describe("Wait strategy for page load")
});

/** Page metadata attached to a scrape result; every field is optional. */
var ScrapeMetadataSchema = z.object({
  title: z.string().optional(),
  description: z.string().optional(),
  canonicalUrl: z.string().optional(),
  finalUrl: z.string().optional(),
  contentType: z.string().optional(),
  contentLength: z.number().optional()
});

/** Result of scraping one URL. `html`, `markdown` and `content` may be absent or null. */
var ScrapeResponseSchema = z.object({
  url: z.string().describe("Original URL that was scraped"),
  html: z.string().nullable().optional().describe("Rendered HTML content (when type is html)"),
  markdown: z.string().nullable().optional().describe("Content converted to Markdown (when type is markdown)"),
  statusCode: z.number().nullable().describe("HTTP status code"),
  success: z.boolean().describe("Whether scraping was successful"),
  error: z.string().optional().describe("Error message if scraping failed"),
  timestamp: z.string().describe("ISO timestamp when scraping completed"),
  loadTimeMs: z.number().describe("Time taken to load and render the page in milliseconds"),
  metadata: ScrapeMetadataSchema.optional().describe("Additional page metadata"),
  retryCount: z.number().describe("Number of retry attempts made"),
  cost: z.number().optional().describe("Cost in USD for this scrape operation"),
  content: z.string().nullable().optional().describe("Content after summarization (when summary query provided)")
});

/** Options for a batch scrape; each `urls` entry is a plain string or a full ScrapeOptions object. */
var BatchScrapeOptionsSchema = z.object({
  urls: z.array(z.union([z.string(), ScrapeOptionsSchema])).describe("Array of URLs to scrape (strings or detailed request objects)"),
  type: z.enum(["html", "markdown"]).optional().describe("Output format: html or markdown"),
  onlyMainContent: z.boolean().optional().describe("Extract only main content (removes nav, ads, footers)"),
  summary: SummaryQuerySchema.optional().describe("Summary query for LLM summarization"),
  batchTimeoutMs: z.number().optional().describe("Global timeout for entire batch operation in milliseconds (10000-600000)"),
  failFast: z.boolean().optional().describe("Whether to stop on first error")
});

/** Aggregate result of a batch scrape: per-URL results plus counters and timing. */
var BatchScrapeResponseSchema = z.object({
  results: z.array(ScrapeResponseSchema).describe("Array of scrape results"),
  total: z.number().describe("Total number of URLs processed"),
  successful: z.number().describe("Number of successful scrapes"),
  failed: z.number().describe("Number of failed scrapes"),
  totalTimeMs: z.number().describe("Total time taken for batch operation in milliseconds"),
  timestamp: z.string().describe("Timestamp when batch operation completed"),
  cost: z.number().optional().describe("Total cost in USD for all scrape operations")
});

/** Options for a search request; only `queries` is required. */
var SearchOptionsSchema = z.object({
  queries: z.array(z.string()).describe("Array of search queries to execute (1-20)"),
  limit: z.number().optional().describe("Number of results per query (1-100). Defaults to 10"),
  location: z.string().optional().describe('Location for search (e.g., "IN", "US")'),
  includeDomains: z.array(z.string()).optional().describe("Domains to include (will add site: to query)"),
  excludeDomains: z.array(z.string()).optional().describe("Domains to exclude (will add -site: to query)")
});

/** One web hit inside a search result. */
var SearchResultWebSchema = z.object({
  url: z.string().describe("URL of the search result"),
  title: z.string().describe("Title of the search result"),
  description: z.string().describe("Description/snippet of the search result")
});

/** Outcome of a single search query. */
var SearchResultItemSchema = z.object({
  query: z.string().describe("The search query"),
  success: z.boolean().describe("Whether the search was successful"),
  results: z.array(SearchResultWebSchema).describe("Parsed search result links"),
  error: z.string().optional().describe("Error message if failed"),
  loadTimeMs: z.number().optional().describe("Time taken in milliseconds"),
  cost: z.number().optional().describe("Cost in USD for this query")
});

/** Aggregate result of a search request: per-query outcomes plus counters and timing. */
var SearchResponseSchema = z.object({
  results: z.array(SearchResultItemSchema).describe("Array of search results per query"),
  total: z.number().describe("Total number of queries"),
  successful: z.number().describe("Number of successful searches"),
  failed: z.number().describe("Number of failed searches"),
  totalTimeMs: z.number().describe("Total time in milliseconds"),
  timestamp: z.string().describe("ISO timestamp"),
  cost: z.number().optional().describe("Total cost in USD")
});

/** PDF page margins; each side is an optional string. */
var PdfMarginSchema = z.object({
  top: z.string().optional(),
  right: z.string().optional(),
  bottom: z.string().optional(),
  left: z.string().optional()
});

/** Options for generating a PDF from an HTML string; only `html` is required. */
var GeneratePdfOptionsSchema = z.object({
  html: z.string().describe("Complete HTML content to convert to PDF (required)"),
  title: z.string().optional().describe("Title used for the exported filename"),
  pageSize: z.enum(["A4", "Letter", "Legal"]).optional().describe('Page size. Defaults to "A4"'),
  landscape: z.boolean().optional().describe("Landscape orientation. Defaults to false"),
  margin: PdfMarginSchema.optional().describe('Page margins (e.g., { top: "20mm", right: "20mm", bottom: "20mm", left: "20mm" })'),
  printBackground: z.boolean().optional().describe("Print background graphics and colors. Defaults to true"),
  skipChartWait: z.boolean().optional().describe("Skip waiting for chart rendering signal. Defaults to false"),
  timeoutMs: z.number().optional().describe("Timeout in milliseconds (5000-120000). Defaults to 30000")
});

/** Options for generating a PDF from a URL; only `url` is required and it must parse as a URL. */
var GeneratePdfFromUrlOptionsSchema = z.object({
  url: z.string().url().describe("URL to navigate to and convert to PDF (required)"),
  title: z.string().optional().describe("Title used for the exported filename"),
  pageSize: z.enum(["A4", "Letter", "Legal"]).optional().describe('Page size. Defaults to "A4"'),
  landscape: z.boolean().optional().describe("Landscape orientation. Defaults to false"),
  margin: PdfMarginSchema.optional().describe("Page margins"),
  printBackground: z.boolean().optional().describe("Print background graphics and colors. Defaults to true"),
  timeoutMs: z.number().optional().describe("Timeout in milliseconds (5000-120000). Defaults to 30000")
});

/** Result of a PDF generation request; `success` and `durationMs` are always present. */
var PdfResponseSchema = z.object({
  success: z.boolean().describe("Whether PDF generation succeeded"),
  url: z.string().optional().describe("Public URL of the generated PDF"),
  filename: z.string().optional().describe("Generated filename"),
  blobName: z.string().optional().describe("Blob storage path"),
  error: z.string().optional().describe("Error message on failure"),
  durationMs: z.number().describe("Total time taken in milliseconds")
});

/** Options for remote code execution; `language` currently admits only "python". */
var ExecuteCodeOptionsSchema = z.object({
  code: z.string().describe("Code to execute (required)"),
  language: z.enum(["python"]).optional().describe('Language runtime. Defaults to "python"')
});

/** Result of a code execution request. */
var ExecuteCodeResponseSchema = z.object({
  stdout: z.string().describe("Standard output from the executed code"),
  stderr: z.string().describe("Standard error from the executed code"),
  exitCode: z.number().describe("Process exit code (0 = success, 124 = timeout)"),
  executionTimeMs: z.number().describe("Execution time in milliseconds"),
  timedOut: z.boolean().describe("Whether execution was killed due to timeout"),
  memoryUsageMb: z.number().optional().describe("Peak memory usage in megabytes"),
  error: z.string().optional().describe("Error message if execution infrastructure failed"),
  cost: z.number().optional().describe("Cost in USD for this execution")
});

/** Error payload shape: a nested `{ code, message }` object plus an optional status code. */
var UpcrawlErrorResponseSchema = z.object({
  error: z.object({
    code: z.string(),
    message: z.string()
  }),
  statusCode: z.number().optional()
});
|
|
2
142
|
var UpcrawlError = class extends Error {
|
|
3
143
|
constructor(message, status, code = "UNKNOWN_ERROR") {
|
|
4
144
|
super(message);
|
|
@@ -7,6 +147,118 @@ var UpcrawlError = class extends Error {
|
|
|
7
147
|
this.code = code;
|
|
8
148
|
}
|
|
9
149
|
};
|
|
150
|
+
var CreateBrowserSessionOptionsSchema = z.object({
|
|
151
|
+
width: z.number().optional().describe("Browser viewport width (800-3840). Defaults to 1280"),
|
|
152
|
+
height: z.number().optional().describe("Browser viewport height (600-2160). Defaults to 720"),
|
|
153
|
+
headless: z.boolean().optional().describe("Run browser in headless mode. Defaults to true")
|
|
154
|
+
});
|
|
155
|
+
/**
 * Zod schema for a browser session object.
 *
 * `vncUrl` is required but may be null; `affinityCookie` may be omitted;
 * `createdAt` must be a `Date` instance (not an ISO string).
 */
var BrowserSessionSchema = z.object({
  // Identity and connection endpoints.
  sessionId: z
    .string()
    .describe("Unique session identifier"),
  wsEndpoint: z
    .string()
    .describe("WebSocket URL for connecting with Playwright/Puppeteer"),
  vncUrl: z
    .string()
    .nullable()
    .describe("VNC URL for viewing the browser (if available)"),
  affinityCookie: z
    .string()
    .optional()
    .describe("Affinity cookie for sticky session routing (format: SCRAPER_AFFINITY=xxx) - extracted from response headers"),

  // Session metadata.
  createdAt: z
    .date()
    .describe("Session creation timestamp"),
  width: z
    .number()
    .describe("Browser viewport width"),
  height: z
    .number()
    .describe("Browser viewport height")
});
|
|
164
|
+
|
|
165
|
+
// src/browser.ts
|
|
166
|
+
/**
 * Client for the browser-session endpoints (`/browser/session`).
 *
 * Every request obtains a fresh HTTP client from the factory passed to the
 * constructor, and every failure is rethrown as an `UpcrawlError`.
 */
var Browser = class {
  /**
   * @param createClient2 - Zero-argument factory returning an HTTP client that
   *   exposes promise-based `post(url, body)` and `delete(url)` methods.
   */
  constructor(createClient2) {
    this.createClient = createClient2;
  }
  /**
   * Create a new browser session for remote control
   * @param options - Session options including viewport size and headless mode.
   *   Missing fields fall back to width 1280, height 720, headless true.
   * @returns Promise with session details including WebSocket URL
   * @throws UpcrawlError when the request fails
   *
   * @example
   * ```typescript
   * const session = await upcrawl.browser.create({
   *   width: 1280,
   *   height: 720,
   *   headless: true
   * });
   *
   * console.log(session.wsEndpoint); // WebSocket URL for Playwright
   * console.log(session.vncUrl);     // VNC URL for viewing (if available)
   * ```
   */
  async create(options) {
    try {
      const client = this.createClient();
      const response = await client.post("/browser/session", {
        width: options?.width ?? 1280,
        height: options?.height ?? 720,
        headless: options?.headless ?? true
      });
      // `set-cookie` is normally an array, but may be a single string or be
      // absent; normalise so the lookup below never throws.
      const rawSetCookie = response.headers?.["set-cookie"];
      let setCookieHeaders = [];
      if (Array.isArray(rawSetCookie)) {
        setCookieHeaders = rawSetCookie.filter((c) => typeof c === "string");
      } else if (typeof rawSetCookie === "string") {
        setCookieHeaders = [rawSetCookie];
      }
      // Keep only the `name=value` pair, dropping attributes such as Path/Expires.
      const affinityCookie = setCookieHeaders.map((c) => c.split(";")[0]).find((c) => c.startsWith("SCRAPER_AFFINITY="));
      return {
        sessionId: response.data.sessionId,
        // The response field is `wsUrl`; it is exposed to callers as `wsEndpoint`.
        wsEndpoint: response.data.wsUrl,
        vncUrl: response.data.vncUrl,
        affinityCookie: affinityCookie || void 0,
        createdAt: new Date(response.data.createdAt),
        width: response.data.width,
        height: response.data.height
      };
    } catch (error) {
      this.handleError(error);
    }
  }
  /**
   * Close a browser session
   * @param sessionId - The session ID to close
   * @returns Promise that resolves when session is closed
   * @throws UpcrawlError when the request fails
   *
   * @example
   * ```typescript
   * await upcrawl.browser.close(session.sessionId);
   * ```
   */
  async close(sessionId) {
    try {
      const client = this.createClient();
      // Encode the id so characters like "/", "?" or "#" cannot alter the
      // request path or smuggle in a query string.
      await client.delete(`/browser/session/${encodeURIComponent(sessionId)}`);
    } catch (error) {
      this.handleError(error);
    }
  }
  /**
   * Handle API errors and convert to UpcrawlError.
   *
   * Always throws. Errors carrying a `response` property are mapped from the
   * response payload or, failing that, from the HTTP status; an existing
   * UpcrawlError is rethrown unchanged; anything else becomes a 500
   * "UNKNOWN_ERROR".
   *
   * @param error - The value caught from a failed request
   * @throws UpcrawlError
   */
  handleError(error) {
    if (error && typeof error === "object" && "response" in error) {
      const axiosError = error;
      const status = axiosError.response?.status || 500;
      const data = axiosError.response?.data;
      const apiError = data?.error;
      // Payload of the form `{ error: "text" }`: keep the server's text
      // instead of reading `.message` off a string and losing it.
      if (typeof apiError === "string" && apiError !== "") {
        throw new UpcrawlError(apiError, status, "UNKNOWN_ERROR");
      }
      if (apiError) {
        throw new UpcrawlError(apiError.message || "Unknown error", status, apiError.code || "UNKNOWN_ERROR");
      }
      switch (status) {
        case 401:
          throw new UpcrawlError("Invalid or missing API key", 401, "UNAUTHORIZED");
        case 404:
          throw new UpcrawlError("Session not found", 404, "NOT_FOUND");
        case 429:
          throw new UpcrawlError("Rate limit exceeded", 429, "RATE_LIMIT_EXCEEDED");
        case 503:
          throw new UpcrawlError("No browser slots available", 503, "SERVICE_UNAVAILABLE");
        default:
          throw new UpcrawlError(axiosError.message || "An unknown error occurred", status, "UNKNOWN_ERROR");
      }
    }
    if (error instanceof UpcrawlError) {
      throw error;
    }
    throw new UpcrawlError(
      error instanceof Error ? error.message : "An unknown error occurred",
      500,
      "UNKNOWN_ERROR"
    );
  }
};
|
|
10
262
|
|
|
11
263
|
// src/client.ts
|
|
12
264
|
import axios from "axios";
|
|
@@ -230,6 +482,23 @@ var Upcrawl = {
|
|
|
230
482
|
* @returns Promise with execution response (stdout, stderr, exit code, memory usage)
|
|
231
483
|
*/
|
|
232
484
|
executeCode,
|
|
485
|
+
/**
|
|
486
|
+
* Browser session management
|
|
487
|
+
* Create and manage browser sessions for remote control with Playwright/Puppeteer
|
|
488
|
+
*
|
|
489
|
+
* @example
|
|
490
|
+
* ```typescript
|
|
491
|
+
* const session = await Upcrawl.browser.create({
|
|
492
|
+
* width: 1280,
|
|
493
|
+
* height: 720,
|
|
494
|
+
* headless: true
|
|
495
|
+
* });
|
|
496
|
+
*
|
|
497
|
+
* const browser = await playwright.connect(session.wsEndpoint);
|
|
498
|
+
* await Upcrawl.browser.close(session.sessionId);
|
|
499
|
+
* ```
|
|
500
|
+
*/
|
|
501
|
+
browser: new Browser(createClient),
|
|
233
502
|
/**
|
|
234
503
|
* Error class for Upcrawl API errors
|
|
235
504
|
*/
|
|
@@ -237,7 +506,28 @@ var Upcrawl = {
|
|
|
237
506
|
};
|
|
238
507
|
var index_default = Upcrawl;
|
|
239
508
|
export {
|
|
509
|
+
BatchScrapeOptionsSchema,
|
|
510
|
+
BatchScrapeResponseSchema,
|
|
511
|
+
Browser,
|
|
512
|
+
BrowserSessionSchema,
|
|
513
|
+
CreateBrowserSessionOptionsSchema,
|
|
514
|
+
ExecuteCodeOptionsSchema,
|
|
515
|
+
ExecuteCodeResponseSchema,
|
|
516
|
+
GeneratePdfFromUrlOptionsSchema,
|
|
517
|
+
GeneratePdfOptionsSchema,
|
|
518
|
+
PdfMarginSchema,
|
|
519
|
+
PdfResponseSchema,
|
|
520
|
+
ScrapeMetadataSchema,
|
|
521
|
+
ScrapeOptionsSchema,
|
|
522
|
+
ScrapeResponseSchema,
|
|
523
|
+
SearchOptionsSchema,
|
|
524
|
+
SearchResponseSchema,
|
|
525
|
+
SearchResultItemSchema,
|
|
526
|
+
SearchResultWebSchema,
|
|
527
|
+
SummaryQuerySchema,
|
|
528
|
+
UpcrawlConfigSchema,
|
|
240
529
|
UpcrawlError,
|
|
530
|
+
UpcrawlErrorResponseSchema,
|
|
241
531
|
batchScrape,
|
|
242
532
|
configure,
|
|
243
533
|
index_default as default,
|