@crawlgate/sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +397 -0
- package/dist/index.cjs +1299 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1356 -0
- package/dist/index.d.ts +1356 -0
- package/dist/index.js +1255 -0
- package/dist/index.js.map +1 -0
- package/package.json +60 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,1356 @@
import { ZodTypeAny } from 'zod';

/**
 * Scraping engine selection
 * - `static`: Axios + Cheerio (fast, no JS rendering)
 * - `dynamic`: Playwright headless browser (full JS support)
 * - `smart`: Auto-selects static first, falls back to dynamic if needed
 */
type Engine = "static" | "dynamic" | "smart";
/**
 * Proxy options
 * - `iproyal`: IPRoyal residential proxy
 * - `tor`: Tor network proxy
 * - `stealth`: Stealth mode with residential proxy (dynamic engine only)
 */
type ProxyOption = "iproyal" | "tor" | "stealth" | string;
/**
 * Output format types
 */
type FormatType = "markdown" | "html" | "rawHtml" | "text";
/**
 * LLM provider for extraction
 */
type LLMProvider = "openai" | "anthropic";
/**
 * Configuration options for CrawlGateClient
 */
interface CrawlGateClientOptions {
    /**
     * API key for authentication (falls back to CRAWLGATE_API_KEY env variable)
     */
    apiKey?: string;
    /**
     * Base URL for the API (default: https://api.crawlgate.io)
     */
    apiUrl?: string;
    /**
     * Request timeout in milliseconds (default: 90000)
     */
    timeoutMs?: number;
    /**
     * Maximum number of retries for failed requests (default: 3)
     */
    maxRetries?: number;
    /**
     * Backoff factor for retries in seconds (default: 0.5)
     */
    backoffFactor?: number;
}
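/*
 * Usage sketch (illustrative, not part of the published typings): constructing
 * a client with tuned retry behaviour. The values shown are hypothetical; every
 * field is optional, and apiKey falls back to the CRAWLGATE_API_KEY env variable.
 *
 *   const client = new CrawlGateClient({
 *     apiKey: process.env.CRAWLGATE_API_KEY,
 *     timeoutMs: 120000,   // per-request timeout
 *     maxRetries: 5,       // retry failed requests up to 5 times
 *     backoffFactor: 1     // backoff between retries, in seconds
 *   });
 */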
/**
 * JSON Schema definition for LLM extraction
 */
type JsonSchema = Record<string, unknown> | ZodTypeAny;
/**
 * Options for LLM-powered data extraction
 */
interface ExtractOptions {
    /**
     * JSON Schema or Zod schema defining the structure of extracted data
     */
    schema: JsonSchema;
    /**
     * System prompt to guide the LLM extraction
     */
    systemPrompt?: string;
    /**
     * LLM provider to use (default: openai)
     */
    provider?: LLMProvider;
    /**
     * Enable fallback to alternative provider if primary fails
     */
    enableFallback?: boolean;
}
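/*
 * Usage sketch (illustrative, not part of the published typings): since
 * JsonSchema accepts either a plain JSON Schema object or a Zod schema, both
 * of the following are valid ExtractOptions. The field names are hypothetical.
 *
 *   import { z } from 'zod';
 *
 *   const withZod: ExtractOptions = {
 *     schema: z.object({ headline: z.string(), author: z.string() }),
 *     systemPrompt: 'Extract the article headline and author',
 *     provider: 'anthropic',
 *     enableFallback: true // try the other provider if the primary fails
 *   };
 *
 *   const withJsonSchema: ExtractOptions = {
 *     schema: {
 *       type: 'object',
 *       properties: { headline: { type: 'string' } },
 *       required: ['headline']
 *     }
 *   };
 */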
/**
 * Result of LLM extraction
 */
interface ExtractResult {
    /**
     * Extracted structured data matching the schema
     */
    data: unknown;
    /**
     * Provider used for extraction
     */
    provider: string;
    /**
     * Model used for extraction
     */
    model: string;
    /**
     * Token usage statistics
     */
    usage?: {
        promptTokens: number;
        completionTokens: number;
        totalTokens: number;
    };
}
/**
 * Options for scraping a single URL
 */
interface ScrapeOptions {
    /**
     * Scraping engine to use
     */
    engine?: Engine;
    /**
     * Output formats to return
     */
    formats?: FormatType[];
    /**
     * Extract only the main content (removes headers, footers, sidebars)
     */
    onlyMainContent?: boolean;
    /**
     * HTML tags to exclude from output
     */
    excludeTags?: string[];
    /**
     * Wait for specified milliseconds before scraping (dynamic engine)
     */
    waitFor?: number;
    /**
     * Request timeout in milliseconds
     */
    timeout?: number;
    /**
     * Proxy configuration
     */
    proxy?: ProxyOption;
    /**
     * LLM extraction configuration
     */
    extract?: ExtractOptions;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
}
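/*
 * Usage sketch (illustrative, not part of the published typings): options for a
 * JavaScript-heavy page. Per the Engine and ProxyOption docs above, stealth
 * proxying requires the dynamic (Playwright) engine. Tag names are hypothetical.
 *
 *   const jsHeavy: ScrapeOptions = {
 *     engine: 'dynamic',
 *     formats: ['markdown', 'text'],
 *     onlyMainContent: true,
 *     excludeTags: ['nav', 'aside', 'footer'],
 *     waitFor: 3000,    // let client-side rendering settle for 3 s
 *     proxy: 'stealth'
 *   };
 */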
/**
 * Metadata from scraped page
 */
interface DocumentMetadata {
    title?: string;
    description?: string;
    language?: string;
    sourceURL?: string;
    ogTitle?: string;
    ogDescription?: string;
    ogImage?: string;
    favicon?: string;
    [key: string]: unknown;
}
/**
 * Scraped document data
 */
interface Document {
    /**
     * Source URL
     */
    url: string;
    /**
     * Content in Markdown format
     */
    markdown?: string;
    /**
     * Content in HTML format
     */
    html?: string;
    /**
     * Raw HTML content
     */
    rawHtml?: string;
    /**
     * Plain text content
     */
    text?: string;
    /**
     * Page metadata
     */
    metadata?: DocumentMetadata;
    /**
     * Scrape duration in milliseconds
     */
    scrapeTime?: number;
    /**
     * LLM extraction result (if extract option was provided)
     */
    extract?: ExtractResult;
    /**
     * Alias for extract.data (backward compatibility)
     */
    json?: unknown;
}
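/*
 * Usage sketch (illustrative, not part of the published typings): all content
 * fields on Document are optional (only the formats requested via
 * ScrapeOptions.formats should be expected to be populated), so narrow before use.
 *
 *   function preview(doc: Document): string {
 *     const title = doc.metadata?.title ?? doc.url;
 *     const body = doc.markdown ?? doc.text ?? '';
 *     return `${title}: ${body.slice(0, 120)}`;
 *   }
 */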
/**
 * Response from scrape endpoint
 */
interface ScrapeResponse {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Scraped document data
     */
    data?: Document;
    /**
     * Engine that was requested
     */
    engine?: Engine;
    /**
     * Engine that was actually used (for smart engine)
     */
    engineUsed?: "static" | "dynamic";
    /**
     * Request ID for tracking
     */
    requestId?: string;
    /**
     * Error message if failed
     */
    error?: string;
}
/**
 * Crawl job status
 */
type CrawlStatus = "scraping" | "completed" | "failed" | "cancelled";
/**
 * Options for crawling a website
 */
interface CrawlOptions {
    /**
     * Scraping engine to use
     */
    engine?: Engine;
    /**
     * Maximum number of pages to crawl
     */
    limit?: number;
    /**
     * Output formats to return
     */
    formats?: FormatType[];
    /**
     * Extract only the main content
     */
    onlyMainContent?: boolean;
    /**
     * HTML tags to exclude
     */
    excludeTags?: string[];
    /**
     * Proxy configuration
     */
    proxy?: ProxyOption;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
    /**
     * Poll interval in milliseconds (for waiter method)
     */
    pollInterval?: number;
    /**
     * Maximum wait time in seconds (for waiter method)
     */
    timeout?: number;
}
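/*
 * Usage sketch (illustrative, not part of the published typings): a bounded
 * crawl for the waiter-style crawl() method declared below. Note the unit
 * asymmetry documented above: pollInterval is in milliseconds, timeout in seconds.
 *
 *   const crawlOptions: CrawlOptions = {
 *     engine: 'smart',
 *     limit: 25,
 *     formats: ['markdown'],
 *     onlyMainContent: true,
 *     pollInterval: 2000, // poll every 2 s
 *     timeout: 600        // give up after 10 min
 *   };
 */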
/**
 * Response from starting a crawl job
 */
interface CrawlResponse {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Crawl job ID
     */
    id: string;
    /**
     * Alias for id
     */
    jobId?: string;
    /**
     * Initial job status
     */
    status: CrawlStatus;
    /**
     * Engine being used
     */
    engine?: string;
}
/**
 * Crawl job status and data
 */
interface CrawlJob {
    /**
     * Job ID
     */
    id: string;
    /**
     * Current status
     */
    status: CrawlStatus;
    /**
     * Total pages to crawl
     */
    total: number;
    /**
     * Pages completed
     */
    completed: number;
    /**
     * Scraped documents (empty until completed)
     */
    data: Document[];
    /**
     * Engine being used
     */
    engine?: string;
    /**
     * Error message if failed
     */
    error?: string;
}
/**
 * Options for mapping URLs on a website
 */
interface MapOptions {
    /**
     * Scraping engine to use
     */
    engine?: Engine;
    /**
     * Proxy configuration
     */
    proxy?: ProxyOption;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
}
/**
 * Response from map endpoint
 */
interface MapResponse {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Discovered URLs
     */
    links: string[];
    /**
     * Number of URLs found
     */
    count: number;
    /**
     * Engine used
     */
    engine?: string;
    /**
     * Error message if failed
     */
    error?: string;
}
/**
 * Options for web search
 */
interface SearchOptions {
    /**
     * Maximum number of results (default: 10)
     */
    limit?: number;
    /**
     * Language code (default: en)
     */
    lang?: string;
    /**
     * Country code (default: us)
     */
    country?: string;
    /**
     * Search engines to use (default: all available)
     */
    engines?: string[];
    /**
     * Scrape options (if provided, results will be scraped)
     */
    scrapeOptions?: {
        formats?: FormatType[];
    };
    /**
     * Scraping engine for scraping results
     */
    engine?: Engine;
    /**
     * LLM extraction on combined search results
     */
    extract?: ExtractOptions;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
}
/**
 * Individual search result
 */
interface SearchResult {
    /**
     * Result URL
     */
    url: string;
    /**
     * Result title
     */
    title?: string;
    /**
     * Result description/snippet
     */
    description?: string;
    /**
     * Search engine that returned this result
     */
    engine?: string;
    /**
     * Relevance score
     */
    score?: number;
    /**
     * Position in search results
     */
    position?: number;
    /**
     * Scraped markdown content (if scrapeOptions provided)
     */
    markdown?: string | null;
    /**
     * Scraped HTML content (if scrapeOptions provided)
     */
    html?: string | null;
    /**
     * Page metadata (if scraped)
     */
    metadata?: DocumentMetadata;
    /**
     * Whether scraping was successful
     */
    scrapeSuccess?: boolean;
}
/**
 * Response from search endpoint
 */
interface SearchResponse {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Search results
     */
    data: SearchResult[];
    /**
     * Original search query
     */
    query: string;
    /**
     * Total results available
     */
    totalResults?: number;
    /**
     * Search/scrape time in milliseconds
     */
    searchTime?: number;
    /**
     * LLM extraction result (if extract option provided)
     */
    extract?: ExtractResult;
    /**
     * Error message if failed
     */
    error?: string;
}
/**
 * Options for batch scraping multiple URLs
 */
interface BatchScrapeOptions {
    /**
     * Scrape options to apply to all URLs
     */
    options?: ScrapeOptions;
    /**
     * Webhook URL for job status updates
     */
    webhook?: string | WebhookConfig;
    /**
     * Append results to existing batch job
     */
    appendToId?: string;
    /**
     * Skip invalid URLs instead of failing
     */
    ignoreInvalidURLs?: boolean;
    /**
     * Maximum concurrent scrapes
     */
    maxConcurrency?: number;
    /**
     * Idempotency key for deduplication
     */
    idempotencyKey?: string;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
    /**
     * Poll interval in milliseconds (for waiter method)
     */
    pollInterval?: number;
    /**
     * Maximum wait time in seconds (for waiter method)
     */
    timeout?: number;
}
/**
 * Webhook configuration
 */
interface WebhookConfig {
    /**
     * Webhook URL
     */
    url: string;
    /**
     * Custom headers to send with webhook
     */
    headers?: Record<string, string>;
    /**
     * Metadata to include in webhook payload
     */
    metadata?: Record<string, string>;
    /**
     * Events to trigger webhook
     */
    events?: Array<"completed" | "failed" | "page" | "started">;
}
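/*
 * Usage sketch (illustrative, not part of the published typings): a batch job
 * that pushes status updates to a webhook instead of being polled. The endpoint
 * URL, header, and metadata values are hypothetical.
 *
 *   const batchOptions: BatchScrapeOptions = {
 *     options: { formats: ['markdown'] },
 *     maxConcurrency: 4,
 *     ignoreInvalidURLs: true,
 *     idempotencyKey: 'nightly-sync-2024-06-01',
 *     webhook: {
 *       url: 'https://hooks.example.com/crawlgate',
 *       headers: { 'X-Webhook-Secret': 'shared-secret' },
 *       metadata: { source: 'nightly-sync' },
 *       events: ['started', 'page', 'completed', 'failed']
 *     }
 *   };
 */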
/**
 * Response from starting a batch scrape job
 */
interface BatchScrapeResponse {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Batch job ID
     */
    id: string;
    /**
     * URL to check job status
     */
    url?: string;
    /**
     * Invalid URLs that were skipped
     */
    invalidURLs?: string[];
    /**
     * Error message if failed
     */
    error?: string;
}
/**
 * Batch scrape job status and data
 */
interface BatchScrapeJob {
    /**
     * Job ID
     */
    id: string;
    /**
     * Current status
     */
    status: CrawlStatus;
    /**
     * Total URLs to scrape
     */
    total: number;
    /**
     * URLs completed
     */
    completed: number;
    /**
     * Credits used
     */
    creditsUsed?: number;
    /**
     * Job expiration time
     */
    expiresAt?: string;
    /**
     * Next page URL for pagination
     */
    next?: string | null;
    /**
     * Scraped documents
     */
    data: Document[];
    /**
     * Error message if failed
     */
    error?: string;
}
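/*
 * Usage sketch (illustrative, not part of the published typings): reading a
 * finished batch job. A non-null `next` means the result set is paginated; the
 * PaginationConfig type further down holds the auto-pagination knobs.
 *
 *   function summarize(job: BatchScrapeJob): void {
 *     console.log(`${job.status}: ${job.completed}/${job.total} URLs`);
 *     for (const doc of job.data) {
 *       console.log(doc.url, doc.markdown?.length ?? 0);
 *     }
 *     if (job.next) console.log('more results at', job.next);
 *   }
 */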
/**
 * Options for standalone LLM extraction
 */
interface ExtractRequestOptions {
    /**
     * URLs to extract data from
     */
    urls?: string[];
    /**
     * Natural language prompt for extraction
     */
    prompt?: string;
    /**
     * JSON Schema or Zod schema for structured extraction
     */
    schema?: JsonSchema;
    /**
     * System prompt for LLM
     */
    systemPrompt?: string;
    /**
     * Allow following external links
     */
    allowExternalLinks?: boolean;
    /**
     * Enable web search for additional context
     */
    enableWebSearch?: boolean;
    /**
     * Include source URLs in response
     */
    showSources?: boolean;
    /**
     * Scrape options for URLs
     */
    scrapeOptions?: ScrapeOptions;
    /**
     * Skip invalid URLs instead of failing
     */
    ignoreInvalidURLs?: boolean;
    /**
     * LLM provider to use
     */
    provider?: LLMProvider;
    /**
     * Project ID for usage tracking
     */
    projectId?: string;
    /**
     * Poll interval in milliseconds (for waiter method)
     */
    pollInterval?: number;
    /**
     * Maximum wait time in seconds (for waiter method)
     */
    timeout?: number;
}
/**
 * Extract job status
 */
type ExtractStatus = "processing" | "completed" | "failed" | "cancelled";
/**
 * Response from extract endpoint
 */
interface ExtractResponse {
    /**
     * Whether the request was successful
     */
    success?: boolean;
    /**
     * Extract job ID (for async operations)
     */
    id?: string;
    /**
     * Job status
     */
    status?: ExtractStatus;
    /**
     * Extracted data
     */
    data?: unknown;
    /**
     * Error message if failed
     */
    error?: string;
    /**
     * Warning message
     */
    warning?: string;
    /**
     * Source URLs used for extraction
     */
    sources?: Record<string, unknown>;
    /**
     * Job expiration time
     */
    expiresAt?: string;
    /**
     * Credits used
     */
    creditsUsed?: number;
}
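/*
 * Usage sketch (illustrative, not part of the published typings): nearly every
 * field on ExtractResponse is optional because it serves both the async path
 * (an `id` plus a pollable status) and an immediate result (`data`). One way
 * to branch, assuming a CrawlGateClient instance as declared below:
 *
 *   async function resolveExtract(client: CrawlGateClient, res: ExtractResponse) {
 *     if (res.error) throw new Error(res.error);
 *     if (res.data !== undefined) return res.data;                     // finished inline
 *     if (res.id) return (await client.getExtractStatus(res.id)).data; // poll once
 *     return undefined;
 *   }
 */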
/**
 * Concurrency usage information
 */
interface ConcurrencyInfo {
    /**
     * Current active requests
     */
    concurrency: number;
    /**
     * Maximum allowed concurrent requests
     */
    maxConcurrency: number;
}
/**
 * Credit usage information
 */
interface CreditUsage {
    /**
     * Remaining credits
     */
    remainingCredits: number;
    /**
     * Total plan credits
     */
    planCredits?: number;
    /**
     * Billing period start date
     */
    billingPeriodStart?: string | null;
    /**
     * Billing period end date
     */
    billingPeriodEnd?: string | null;
}
/**
 * Token usage information
 */
interface TokenUsage {
    /**
     * Remaining tokens
     */
    remainingTokens: number;
    /**
     * Total plan tokens
     */
    planTokens?: number;
    /**
     * Billing period start date
     */
    billingPeriodStart?: string | null;
    /**
     * Billing period end date
     */
    billingPeriodEnd?: string | null;
}
/**
 * Queue status information
 */
interface QueueStatus {
    /**
     * Whether the request was successful
     */
    success: boolean;
    /**
     * Total jobs in queue
     */
    jobsInQueue: number;
    /**
     * Active jobs currently processing
     */
    activeJobsInQueue: number;
    /**
     * Jobs waiting to be processed
     */
    waitingJobsInQueue: number;
    /**
     * Maximum concurrency
     */
    maxConcurrency: number;
    /**
     * Timestamp of most recent successful job
     */
    mostRecentSuccess: string | null;
}
/**
 * Crawl error information
 */
interface CrawlError {
    /**
     * Error ID
     */
    id: string;
    /**
     * Timestamp of error
     */
    timestamp?: string;
    /**
     * URL that caused the error
     */
    url: string;
    /**
     * Error code
     */
    code?: string;
    /**
     * Error message
     */
    error: string;
}
/**
 * Response from crawl errors endpoint
 */
interface CrawlErrorsResponse {
    /**
     * List of errors encountered
     */
    errors: CrawlError[];
    /**
     * URLs blocked by robots.txt
     */
    robotsBlocked: string[];
}
/**
 * Pagination configuration for large result sets
 */
interface PaginationConfig {
    /**
     * Automatically fetch all pages (default: true)
     */
    autoPaginate?: boolean;
    /**
     * Maximum number of pages to fetch
     */
    maxPages?: number;
    /**
     * Maximum total results to return
     */
    maxResults?: number;
    /**
     * Maximum time to spend fetching pages (seconds)
     */
    maxWaitTime?: number;
}
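/*
 * Usage sketch (illustrative, not part of the published typings): capping how
 * much of a large, paginated result set gets fetched. With autoPaginate left
 * at its default of true, fetching presumably follows `next` links until one
 * of these limits is reached.
 *
 *   const pagination: PaginationConfig = {
 *     maxPages: 5,
 *     maxResults: 500,
 *     maxWaitTime: 60 // stop after 60 s of fetching
 *   };
 */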

/**
 * CrawlGate SDK Client
 *
 * @example
 * ```typescript
 * import { CrawlGateClient } from '@crawlgate/sdk';
 *
 * const client = new CrawlGateClient({
 *   apiKey: 'sk_live_...',
 *   apiUrl: 'https://api.crawlgate.io'
 * });
 *
 * // Scrape a single URL
 * const doc = await client.scrape('https://example.com', {
 *   engine: 'smart',
 *   formats: ['markdown', 'html']
 * });
 *
 * // Batch scrape multiple URLs
 * const job = await client.batchScrape(['https://a.com', 'https://b.com'], {
 *   options: { formats: ['markdown'] }
 * });
 *
 * // Crawl a website
 * const crawlJob = await client.crawl('https://example.com', {
 *   limit: 10,
 *   engine: 'dynamic'
 * });
 *
 * // Extract structured data with LLM
 * const extracted = await client.extract({
 *   urls: ['https://example.com/product'],
 *   schema: { name: 'string', price: 'number' },
 *   provider: 'openai'
 * });
 *
 * // Search the web
 * const results = await client.search('best restaurants', {
 *   limit: 5,
 *   scrapeOptions: { formats: ['markdown'] }
 * });
 * ```
 */
declare class CrawlGateClient {
    private readonly http;
    /**
     * Create a new CrawlGate client
     *
     * @param options - Client configuration options
     * @throws {CrawlGateError} If API key is not provided
     */
    constructor(options?: CrawlGateClientOptions);
    /**
     * Scrape a single URL
     *
     * @param url - URL to scrape
     * @param options - Scrape options
     * @returns Scraped document with requested formats
     *
     * @example
     * ```typescript
     * const doc = await client.scrape('https://example.com', {
     *   engine: 'smart',
     *   formats: ['markdown', 'html'],
     *   onlyMainContent: true
     * });
     * console.log(doc.markdown);
     * ```
     *
     * @example With LLM extraction
     * ```typescript
     * import { z } from 'zod';
     *
     * const schema = z.object({
     *   title: z.string(),
     *   price: z.number(),
     *   inStock: z.boolean()
     * });
     *
     * const doc = await client.scrape('https://example.com/product', {
     *   engine: 'smart',
     *   extract: {
     *     schema,
     *     systemPrompt: 'Extract product details',
     *     provider: 'openai'
     *   }
     * });
     * console.log(doc.extract?.data);
     * ```
     */
    scrape(url: string, options?: ScrapeOptions): Promise<Document>;
    /**
     * Start a batch scrape job (async)
     *
     * @param urls - Array of URLs to scrape
     * @param options - Batch scrape options
     * @returns Batch job ID and initial status
     *
     * @example
     * ```typescript
     * const { id } = await client.startBatchScrape(
     *   ['https://a.com', 'https://b.com', 'https://c.com'],
     *   { options: { formats: ['markdown'] } }
     * );
     *
     * // Poll manually
     * let status = await client.getBatchScrapeStatus(id);
     * while (status.status === 'scraping') {
     *   await new Promise(r => setTimeout(r, 2000));
     *   status = await client.getBatchScrapeStatus(id);
     * }
     * ```
     */
    startBatchScrape(urls: string[], options?: BatchScrapeOptions): Promise<BatchScrapeResponse>;
    /**
     * Get batch scrape job status and data
     *
     * @param jobId - Batch job ID
     * @returns Current job status and scraped data
     */
    getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
    /**
     * Cancel a batch scrape job
     *
     * @param jobId - Batch job ID
     * @returns True if cancelled successfully
     */
    cancelBatchScrape(jobId: string): Promise<boolean>;
    /**
     * Get batch scrape job errors
     *
     * @param jobId - Batch job ID
     * @returns Errors and robots.txt blocked URLs
     */
    getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse>;
    /**
     * Batch scrape multiple URLs and wait for completion
     *
     * @param urls - Array of URLs to scrape
     * @param options - Batch options including pollInterval and timeout
     * @returns Final job with all scraped data
     *
     * @example
     * ```typescript
     * const job = await client.batchScrape(
     *   ['https://a.com', 'https://b.com', 'https://c.com'],
     *   {
     *     options: { formats: ['markdown'], engine: 'smart' },
     *     pollInterval: 2000,
     *     timeout: 300
     *   }
     * );
     *
     * console.log(`Scraped ${job.completed} URLs`);
     * job.data.forEach(doc => console.log(doc.url, doc.markdown?.length));
     * ```
     */
    batchScrape(urls: string[], options?: BatchScrapeOptions): Promise<BatchScrapeJob>;
    /**
     * Start a crawl job (async)
     *
     * Use this method when you want to start a crawl and manage polling yourself.
     * For automatic polling, use the `crawl()` method instead.
     *
     * @param url - Root URL to crawl
     * @param options - Crawl options
     * @returns Crawl job ID and initial status
     *
     * @example
     * ```typescript
     * const { id } = await client.startCrawl('https://example.com', {
     *   limit: 10,
     *   engine: 'dynamic'
     * });
     *
     * // Poll for status manually
     * let status = await client.getCrawlStatus(id);
     * while (status.status === 'scraping') {
     *   await new Promise(r => setTimeout(r, 2000));
     *   status = await client.getCrawlStatus(id);
     * }
     * ```
     */
    startCrawl(url: string, options?: CrawlOptions): Promise<CrawlResponse>;
    /**
     * Get crawl job status and data
     *
     * @param jobId - Crawl job ID
     * @returns Current job status and scraped data
     */
    getCrawlStatus(jobId: string): Promise<CrawlJob>;
    /**
     * Cancel a crawl job
     *
     * @param jobId - Crawl job ID
     * @returns True if cancelled successfully
     */
    cancelCrawl(jobId: string): Promise<boolean>;
    /**
     * Get crawl job errors and robots.txt blocks
     *
     * @param jobId - Crawl job ID
     * @returns Errors and robots.txt blocked URLs
     */
    getCrawlErrors(jobId: string): Promise<CrawlErrorsResponse>;
    /**
     * Crawl a website and wait for completion
     *
     * This method starts a crawl job and automatically polls until completion.
     *
     * @param url - Root URL to crawl
     * @param options - Crawl options including pollInterval and timeout
     * @returns Final crawl job with all scraped data
     *
     * @example
     * ```typescript
     * const job = await client.crawl('https://example.com', {
     *   limit: 10,
     *   engine: 'dynamic',
     *   formats: ['markdown'],
     *   pollInterval: 2000, // Poll every 2 seconds
     *   timeout: 300 // 5 minute timeout
     * });
     *
     * console.log(`Crawled ${job.completed} pages`);
     * job.data.forEach(doc => console.log(doc.url));
     * ```
     */
    crawl(url: string, options?: CrawlOptions): Promise<CrawlJob>;
    /**
     * Start an extract job (async)
     *
     * @param options - Extract request options
     * @returns Extract job ID or immediate result
     *
     * @example
     * ```typescript
     * const { id } = await client.startExtract({
     *   urls: ['https://example.com/product'],
     *   schema: { name: 'string', price: 'number' },
     *   provider: 'openai'
     * });
     *
     * // Poll manually
     * let status = await client.getExtractStatus(id);
     * while (status.status === 'processing') {
     *   await new Promise(r => setTimeout(r, 2000));
     *   status = await client.getExtractStatus(id);
     * }
     * console.log(status.data);
     * ```
     */
    startExtract(options: ExtractRequestOptions): Promise<ExtractResponse>;
    /**
     * Get extract job status and data
     *
     * @param jobId - Extract job ID
     * @returns Current job status and extracted data
     */
    getExtractStatus(jobId: string): Promise<ExtractResponse>;
    /**
     * Extract structured data from URLs using LLM and wait for completion
     *
     * @param options - Extract options including schema, prompt, and timeout
     * @returns Final extract result with structured data
     *
     * @example With Zod schema
     * ```typescript
     * import { z } from 'zod';
     *
     * const result = await client.extract({
     *   urls: ['https://example.com/product'],
     *   schema: z.object({
     *     name: z.string(),
     *     price: z.number(),
     *     inStock: z.boolean(),
     *     features: z.array(z.string())
     *   }),
     *   systemPrompt: 'Extract product information from the page',
     *   provider: 'openai',
     *   timeout: 60
     * });
     *
     * console.log(result.data);
     * ```
     *
     * @example With natural language prompt
     * ```typescript
     * const result = await client.extract({
     *   urls: ['https://example.com/about'],
     *   prompt: 'Extract the company name, founding year, and list of team members',
     *   enableWebSearch: true
     * });
     *
     * console.log(result.data);
     * ```
     */
    extract(options: ExtractRequestOptions): Promise<ExtractResponse>;
    /**
     * Map a website to discover all URLs
     *
     * @param url - Root URL to map
     * @param options - Map options
     * @returns List of discovered URLs
     *
     * @example
     * ```typescript
     * const result = await client.map('https://example.com', {
     *   engine: 'dynamic'
     * });
     *
     * console.log(`Found ${result.count} URLs:`);
     * result.links.forEach(url => console.log(url));
     * ```
     */
    map(url: string, options?: MapOptions): Promise<MapResponse>;
    /**
     * Search the web and optionally scrape results
     *
     * @param query - Search query
     * @param options - Search options
     * @returns Search results with optional scraped content
     *
     * @example Basic search
     * ```typescript
     * const results = await client.search('best restaurants in NYC', {
     *   limit: 10,
     *   lang: 'en',
     *   country: 'us'
     * });
     *
     * results.data.forEach(r => {
     *   console.log(`${r.title}: ${r.url}`);
     * });
     * ```
     *
     * @example Search with scraping
     * ```typescript
     * const results = await client.search('best laptops 2024', {
     *   limit: 5,
     *   scrapeOptions: {
     *     formats: ['markdown']
     *   },
     *   engine: 'smart'
     * });
     *
     * results.data.forEach(r => {
     *   console.log(r.title);
     *   console.log(r.markdown?.substring(0, 200));
     * });
     * ```
     *
     * @example Search with LLM extraction
     * ```typescript
     * import { z } from 'zod';
     *
     * const results = await client.search('iPhone 15 Pro reviews', {
     *   limit: 5,
     *   scrapeOptions: { formats: ['markdown'] },
     *   extract: {
     *     schema: z.object({
     *       pros: z.array(z.string()),
     *       cons: z.array(z.string()),
     *       rating: z.number()
     *     }),
     *     systemPrompt: 'Extract review summary from the content'
     *   }
     * });
     *
     * console.log(results.extract?.data);
     * ```
     */
    search(query: string, options?: SearchOptions): Promise<SearchResponse>;
    /**
     * Get current concurrency usage
     *
     * @returns Current and max concurrency
     *
     * @example
     * ```typescript
     * const { concurrency, maxConcurrency } = await client.getConcurrency();
     * console.log(`Using ${concurrency}/${maxConcurrency} concurrent requests`);
     * ```
     */
    getConcurrency(): Promise<ConcurrencyInfo>;
    /**
     * Get current credit usage
     *
     * @returns Credit usage information
     *
     * @example
     * ```typescript
     * const credits = await client.getCreditUsage();
     * console.log(`Remaining credits: ${credits.remainingCredits}`);
     * ```
     */
    getCreditUsage(): Promise<CreditUsage>;
    /**
     * Get current token usage (for LLM extraction)
     *
     * @returns Token usage information
     *
     * @example
     * ```typescript
     * const tokens = await client.getTokenUsage();
     * console.log(`Remaining tokens: ${tokens.remainingTokens}`);
     * ```
     */
    getTokenUsage(): Promise<TokenUsage>;
    /**
     * Get queue status information
     *
     * @returns Queue status metrics
     *
     * @example
     * ```typescript
     * const queue = await client.getQueueStatus();
     * console.log(`Jobs in queue: ${queue.jobsInQueue}`);
     * console.log(`Active: ${queue.activeJobsInQueue}, Waiting: ${queue.waitingJobsInQueue}`);
     * ```
     */
    getQueueStatus(): Promise<QueueStatus>;
}

/**
 * Base error class for CrawlGate SDK errors
 */
declare class CrawlGateError extends Error {
    /**
     * HTTP status code (if applicable)
     */
    readonly statusCode?: number;
    /**
     * Error code for programmatic handling
     */
    readonly code?: string;
    /**
     * Additional error details
     */
    readonly details?: unknown;
    constructor(message: string, statusCode?: number, code?: string, details?: unknown);
}
/**
 * Error thrown when authentication fails
 */
declare class AuthenticationError extends CrawlGateError {
    constructor(message?: string);
}
/**
 * Error thrown when request validation fails
 */
declare class ValidationError extends CrawlGateError {
    constructor(message: string, details?: unknown);
}
/**
 * Error thrown when a crawl job times out
 */
declare class JobTimeoutError extends CrawlGateError {
    /**
     * Job ID that timed out
     */
    readonly jobId: string;
    /**
     * Timeout duration in seconds
     */
    readonly timeoutSeconds: number;
    constructor(jobId: string, timeoutSeconds: number);
}
/**
 * Error thrown when upstream service is unavailable
 */
declare class ServiceUnavailableError extends CrawlGateError {
    constructor(message?: string);
}
/**
 * Error thrown when rate limit is exceeded
 */
declare class RateLimitError extends CrawlGateError {
    /**
     * Time to wait before retrying (in seconds)
     */
    readonly retryAfter?: number;
    constructor(message?: string, retryAfter?: number);
}
/**
 * Error thrown when LLM extraction fails
 */
declare class ExtractionError extends CrawlGateError {
    /**
     * Provider that failed
     */
    readonly provider?: string;
    constructor(message: string, provider?: string);
}
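/*
 * Usage sketch (illustrative, not part of the published typings): every SDK
 * error extends CrawlGateError, so instanceof checks can route recovery logic.
 * Assumes a `client` constructed as in the class example above.
 *
 *   try {
 *     await client.scrape('https://example.com');
 *   } catch (err) {
 *     if (err instanceof RateLimitError) {
 *       const waitSeconds = err.retryAfter ?? 30;
 *       await new Promise(r => setTimeout(r, waitSeconds * 1000));
 *     } else if (err instanceof JobTimeoutError) {
 *       console.error(`job ${err.jobId} timed out after ${err.timeoutSeconds}s`);
 *     } else if (err instanceof CrawlGateError) {
 *       console.error(err.code, err.statusCode, err.message);
 *     } else {
 *       throw err; // not an SDK error
 *     }
 *   }
 */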
export { AuthenticationError, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse, type ConcurrencyInfo, type CrawlError, type CrawlErrorsResponse, CrawlGateClient, type CrawlGateClientOptions, CrawlGateError, type CrawlJob, type CrawlOptions, type CrawlResponse, type CrawlStatus, type CreditUsage, type Document, type DocumentMetadata, type Engine, type ExtractOptions, type ExtractRequestOptions, type ExtractResponse, type ExtractResult, type ExtractStatus, ExtractionError, type FormatType, JobTimeoutError, type JsonSchema, type LLMProvider, type MapOptions, type MapResponse, type PaginationConfig, type ProxyOption, type QueueStatus, RateLimitError, type ScrapeOptions, type ScrapeResponse, type SearchOptions, type SearchResponse, type SearchResult, ServiceUnavailableError, type TokenUsage, ValidationError, type WebhookConfig, CrawlGateClient as default };