@crawlgate/sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1356 @@
+ import { ZodTypeAny } from 'zod';
+
+ /**
+ * Scraping engine selection
+ * - `static`: Axios + Cheerio (fast, no JS rendering)
+ * - `dynamic`: Playwright headless browser (full JS support)
+ * - `smart`: Auto-selects static first, falls back to dynamic if needed
+ */
+ type Engine = "static" | "dynamic" | "smart";
+ /**
+ * Proxy options
+ * - `iproyal`: IPRoyal residential proxy
+ * - `tor`: Tor network proxy
+ * - `stealth`: Stealth mode with residential proxy (dynamic engine only)
+ */
+ type ProxyOption = "iproyal" | "tor" | "stealth" | string;
+ /**
+ * Output format types
+ */
+ type FormatType = "markdown" | "html" | "rawHtml" | "text";
+ /**
+ * LLM provider for extraction
+ */
+ type LLMProvider = "openai" | "anthropic";
+ /**
+ * Configuration options for CrawlGateClient
+ */
+ interface CrawlGateClientOptions {
+ /**
+ * API key for authentication (falls back to CRAWLGATE_API_KEY env variable)
+ */
+ apiKey?: string;
+ /**
+ * Base URL for the API (default: https://api.crawlgate.io)
+ */
+ apiUrl?: string;
+ /**
+ * Request timeout in milliseconds (default: 90000)
+ */
+ timeoutMs?: number;
+ /**
+ * Maximum number of retries for failed requests (default: 3)
+ */
+ maxRetries?: number;
+ /**
+ * Backoff factor for retries in seconds (default: 0.5)
+ */
+ backoffFactor?: number;
+ }
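+ /**
+ * Illustrative sketch (not part of the published typings): constructing a client.
+ * Per the option docs above, the API key falls back to the CRAWLGATE_API_KEY
+ * environment variable when omitted; the numeric values here are placeholders.
+ *
+ * ```typescript
+ * import { CrawlGateClient } from '@crawlgate/sdk';
+ *
+ * // apiKey omitted: the client reads CRAWLGATE_API_KEY from the environment
+ * const client = new CrawlGateClient({
+ *   timeoutMs: 120000,  // allow slower dynamic-engine scrapes
+ *   maxRetries: 5,      // retry transient failures a few more times
+ *   backoffFactor: 1    // retry backoff factor, in seconds
+ * });
+ * ```
+ */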
+ /**
+ * JSON Schema definition for LLM extraction
+ */
+ type JsonSchema = Record<string, unknown> | ZodTypeAny;
+ /**
+ * Options for LLM-powered data extraction
+ */
+ interface ExtractOptions {
+ /**
+ * JSON Schema or Zod schema defining the structure of extracted data
+ */
+ schema: JsonSchema;
+ /**
+ * System prompt to guide the LLM extraction
+ */
+ systemPrompt?: string;
+ /**
+ * LLM provider to use (default: openai)
+ */
+ provider?: LLMProvider;
+ /**
+ * Enable fallback to alternative provider if primary fails
+ */
+ enableFallback?: boolean;
+ }
+ /**
+ * Result of LLM extraction
+ */
+ interface ExtractResult {
+ /**
+ * Extracted structured data matching the schema
+ */
+ data: unknown;
+ /**
+ * Provider used for extraction
+ */
+ provider: string;
+ /**
+ * Model used for extraction
+ */
+ model: string;
+ /**
+ * Token usage statistics
+ */
+ usage?: {
+ promptTokens: number;
+ completionTokens: number;
+ totalTokens: number;
+ };
+ }
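+ /**
+ * Illustrative sketch (not part of the published typings): inspecting the
+ * ExtractResult attached to a scraped document. Field names follow the
+ * declarations above; the URL and shorthand schema are placeholders.
+ *
+ * ```typescript
+ * const doc = await client.scrape('https://example.com/product', {
+ *   extract: { schema: { name: 'string', price: 'number' } }
+ * });
+ * const result = doc.extract;
+ * if (result) {
+ *   console.log(result.provider, result.model);    // which LLM handled it
+ *   console.log(result.usage?.totalTokens ?? 0);   // token accounting, if reported
+ *   console.log(result.data);                      // structured data matching the schema
+ * }
+ * ```
+ */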
+ /**
+ * Options for scraping a single URL
+ */
+ interface ScrapeOptions {
+ /**
+ * Scraping engine to use
+ */
+ engine?: Engine;
+ /**
+ * Output formats to return
+ */
+ formats?: FormatType[];
+ /**
+ * Extract only the main content (removes headers, footers, sidebars)
+ */
+ onlyMainContent?: boolean;
+ /**
+ * HTML tags to exclude from output
+ */
+ excludeTags?: string[];
+ /**
+ * Wait for specified milliseconds before scraping (dynamic engine)
+ */
+ waitFor?: number;
+ /**
+ * Request timeout in milliseconds
+ */
+ timeout?: number;
+ /**
+ * Proxy configuration
+ */
+ proxy?: ProxyOption;
+ /**
+ * LLM extraction configuration
+ */
+ extract?: ExtractOptions;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ }
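+ /**
+ * Illustrative sketch (not part of the published typings): a ScrapeOptions value
+ * for a JavaScript-heavy page. Only fields declared above are used; the proxy
+ * choice, tag list, and URL are placeholders.
+ *
+ * ```typescript
+ * const options: ScrapeOptions = {
+ *   engine: 'dynamic',             // force the Playwright engine
+ *   waitFor: 3000,                 // give client-side rendering 3s to settle
+ *   formats: ['markdown', 'text'],
+ *   onlyMainContent: true,         // drop headers, footers, sidebars
+ *   excludeTags: ['nav', 'aside'],
+ *   proxy: 'iproyal'
+ * };
+ * const doc = await client.scrape('https://example.com/app', options);
+ * ```
+ */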
+ /**
+ * Metadata from scraped page
+ */
+ interface DocumentMetadata {
+ title?: string;
+ description?: string;
+ language?: string;
+ sourceURL?: string;
+ ogTitle?: string;
+ ogDescription?: string;
+ ogImage?: string;
+ favicon?: string;
+ [key: string]: unknown;
+ }
+ /**
+ * Scraped document data
+ */
+ interface Document {
+ /**
+ * Source URL
+ */
+ url: string;
+ /**
+ * Content in Markdown format
+ */
+ markdown?: string;
+ /**
+ * Content in HTML format
+ */
+ html?: string;
+ /**
+ * Raw HTML content
+ */
+ rawHtml?: string;
+ /**
+ * Plain text content
+ */
+ text?: string;
+ /**
+ * Page metadata
+ */
+ metadata?: DocumentMetadata;
+ /**
+ * Scrape duration in milliseconds
+ */
+ scrapeTime?: number;
+ /**
+ * LLM extraction result (if extract option was provided)
+ */
+ extract?: ExtractResult;
+ /**
+ * Alias for extract.data (backward compatibility)
+ */
+ json?: unknown;
+ }
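+ /**
+ * Illustrative sketch (not part of the published typings): reading fields from a
+ * scraped Document. All property names come from the declarations above.
+ *
+ * ```typescript
+ * const doc = await client.scrape('https://example.com', { formats: ['markdown'] });
+ * console.log(doc.metadata?.title ?? doc.url);       // prefer the page title when present
+ * console.log(`fetched in ${doc.scrapeTime ?? 0} ms`);
+ * console.log(doc.markdown?.slice(0, 200));          // first 200 chars of Markdown
+ * ```
+ */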
+ /**
+ * Response from scrape endpoint
+ */
+ interface ScrapeResponse {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Scraped document data
+ */
+ data?: Document;
+ /**
+ * Engine that was requested
+ */
+ engine?: Engine;
+ /**
+ * Engine that was actually used (for smart engine)
+ */
+ engineUsed?: "static" | "dynamic";
+ /**
+ * Request ID for tracking
+ */
+ requestId?: string;
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Crawl job status
+ */
+ type CrawlStatus = "scraping" | "completed" | "failed" | "cancelled";
+ /**
+ * Options for crawling a website
+ */
+ interface CrawlOptions {
+ /**
+ * Scraping engine to use
+ */
+ engine?: Engine;
+ /**
+ * Maximum number of pages to crawl
+ */
+ limit?: number;
+ /**
+ * Output formats to return
+ */
+ formats?: FormatType[];
+ /**
+ * Extract only the main content
+ */
+ onlyMainContent?: boolean;
+ /**
+ * HTML tags to exclude
+ */
+ excludeTags?: string[];
+ /**
+ * Proxy configuration
+ */
+ proxy?: ProxyOption;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ /**
+ * Poll interval in milliseconds (used by the wait-for-completion `crawl()` method)
+ */
+ pollInterval?: number;
+ /**
+ * Maximum wait time in seconds (used by the wait-for-completion `crawl()` method)
+ */
+ timeout?: number;
+ }
+ /**
+ * Response from starting a crawl job
+ */
+ interface CrawlResponse {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Crawl job ID
+ */
+ id: string;
+ /**
+ * Alias for id
+ */
+ jobId?: string;
+ /**
+ * Initial job status
+ */
+ status: CrawlStatus;
+ /**
+ * Engine being used
+ */
+ engine?: string;
+ }
+ /**
+ * Crawl job status and data
+ */
+ interface CrawlJob {
+ /**
+ * Job ID
+ */
+ id: string;
+ /**
+ * Current status
+ */
+ status: CrawlStatus;
+ /**
+ * Total pages to crawl
+ */
+ total: number;
+ /**
+ * Pages completed
+ */
+ completed: number;
+ /**
+ * Scraped documents (empty until completed)
+ */
+ data: Document[];
+ /**
+ * Engine being used
+ */
+ engine?: string;
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Options for mapping URLs on a website
+ */
+ interface MapOptions {
+ /**
+ * Scraping engine to use
+ */
+ engine?: Engine;
+ /**
+ * Proxy configuration
+ */
+ proxy?: ProxyOption;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ }
+ /**
+ * Response from map endpoint
+ */
+ interface MapResponse {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Discovered URLs
+ */
+ links: string[];
+ /**
+ * Number of URLs found
+ */
+ count: number;
+ /**
+ * Engine used
+ */
+ engine?: string;
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Options for web search
+ */
+ interface SearchOptions {
+ /**
+ * Maximum number of results (default: 10)
+ */
+ limit?: number;
+ /**
+ * Language code (default: en)
+ */
+ lang?: string;
+ /**
+ * Country code (default: us)
+ */
+ country?: string;
+ /**
+ * Search engines to use (default: all available)
+ */
+ engines?: string[];
+ /**
+ * Scrape options (if provided, results will be scraped)
+ */
+ scrapeOptions?: {
+ formats?: FormatType[];
+ };
+ /**
+ * Scraping engine for scraping results
+ */
+ engine?: Engine;
+ /**
+ * LLM extraction on combined search results
+ */
+ extract?: ExtractOptions;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ }
+ /**
+ * Individual search result
+ */
+ interface SearchResult {
+ /**
+ * Result URL
+ */
+ url: string;
+ /**
+ * Result title
+ */
+ title?: string;
+ /**
+ * Result description/snippet
+ */
+ description?: string;
+ /**
+ * Search engine that returned this result
+ */
+ engine?: string;
+ /**
+ * Relevance score
+ */
+ score?: number;
+ /**
+ * Position in search results
+ */
+ position?: number;
+ /**
+ * Scraped markdown content (if scrapeOptions provided)
+ */
+ markdown?: string | null;
+ /**
+ * Scraped HTML content (if scrapeOptions provided)
+ */
+ html?: string | null;
+ /**
+ * Page metadata (if scraped)
+ */
+ metadata?: DocumentMetadata;
+ /**
+ * Whether scraping was successful
+ */
+ scrapeSuccess?: boolean;
+ }
+ /**
+ * Response from search endpoint
+ */
+ interface SearchResponse {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Search results
+ */
+ data: SearchResult[];
+ /**
+ * Original search query
+ */
+ query: string;
+ /**
+ * Total results available
+ */
+ totalResults?: number;
+ /**
+ * Search/scrape time in milliseconds
+ */
+ searchTime?: number;
+ /**
+ * LLM extraction result (if extract option provided)
+ */
+ extract?: ExtractResult;
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Options for batch scraping multiple URLs
+ */
+ interface BatchScrapeOptions {
+ /**
+ * Scrape options to apply to all URLs
+ */
+ options?: ScrapeOptions;
+ /**
+ * Webhook URL or configuration for job status updates
+ */
+ webhook?: string | WebhookConfig;
+ /**
+ * Append results to existing batch job
+ */
+ appendToId?: string;
+ /**
+ * Skip invalid URLs instead of failing
+ */
+ ignoreInvalidURLs?: boolean;
+ /**
+ * Maximum concurrent scrapes
+ */
+ maxConcurrency?: number;
+ /**
+ * Idempotency key for deduplication
+ */
+ idempotencyKey?: string;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ /**
+ * Poll interval in milliseconds (used by the wait-for-completion `batchScrape()` method)
+ */
+ pollInterval?: number;
+ /**
+ * Maximum wait time in seconds (used by the wait-for-completion `batchScrape()` method)
+ */
+ timeout?: number;
+ }
+ /**
+ * Webhook configuration
+ */
+ interface WebhookConfig {
+ /**
+ * Webhook URL
+ */
+ url: string;
+ /**
+ * Custom headers to send with webhook
+ */
+ headers?: Record<string, string>;
+ /**
+ * Metadata to include in webhook payload
+ */
+ metadata?: Record<string, string>;
+ /**
+ * Events to trigger webhook
+ */
+ events?: Array<"completed" | "failed" | "page" | "started">;
+ }
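+ /**
+ * Illustrative sketch (not part of the published typings): starting a batch scrape
+ * with a webhook and concurrency controls. The webhook URL, header, and metadata
+ * values are placeholders.
+ *
+ * ```typescript
+ * const webhook: WebhookConfig = {
+ *   url: 'https://hooks.example.com/crawlgate',
+ *   headers: { 'X-Signature': 'shared-secret' },
+ *   metadata: { source: 'nightly-job' },
+ *   events: ['started', 'completed', 'failed']
+ * };
+ *
+ * const { id, invalidURLs } = await client.startBatchScrape(
+ *   ['https://a.com', 'https://b.com', 'not-a-url'],
+ *   {
+ *     options: { formats: ['markdown'] },
+ *     webhook,
+ *     ignoreInvalidURLs: true,   // skip 'not-a-url' instead of failing the job
+ *     maxConcurrency: 2
+ *   }
+ * );
+ * ```
+ */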
+ /**
+ * Response from starting a batch scrape job
+ */
+ interface BatchScrapeResponse {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Batch job ID
+ */
+ id: string;
+ /**
+ * URL to check job status
+ */
+ url?: string;
+ /**
+ * Invalid URLs that were skipped
+ */
+ invalidURLs?: string[];
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Batch scrape job status and data
+ */
+ interface BatchScrapeJob {
+ /**
+ * Job ID
+ */
+ id: string;
+ /**
+ * Current status
+ */
+ status: CrawlStatus;
+ /**
+ * Total URLs to scrape
+ */
+ total: number;
+ /**
+ * URLs completed
+ */
+ completed: number;
+ /**
+ * Credits used
+ */
+ creditsUsed?: number;
+ /**
+ * Job expiration time
+ */
+ expiresAt?: string;
+ /**
+ * Next page URL for pagination
+ */
+ next?: string | null;
+ /**
+ * Scraped documents
+ */
+ data: Document[];
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ }
+ /**
+ * Options for standalone LLM extraction
+ */
+ interface ExtractRequestOptions {
+ /**
+ * URLs to extract data from
+ */
+ urls?: string[];
+ /**
+ * Natural language prompt for extraction
+ */
+ prompt?: string;
+ /**
+ * JSON Schema or Zod schema for structured extraction
+ */
+ schema?: JsonSchema;
+ /**
+ * System prompt for LLM
+ */
+ systemPrompt?: string;
+ /**
+ * Allow following external links
+ */
+ allowExternalLinks?: boolean;
+ /**
+ * Enable web search for additional context
+ */
+ enableWebSearch?: boolean;
+ /**
+ * Include source URLs in response
+ */
+ showSources?: boolean;
+ /**
+ * Scrape options for URLs
+ */
+ scrapeOptions?: ScrapeOptions;
+ /**
+ * Skip invalid URLs instead of failing
+ */
+ ignoreInvalidURLs?: boolean;
+ /**
+ * LLM provider to use
+ */
+ provider?: LLMProvider;
+ /**
+ * Project ID for usage tracking
+ */
+ projectId?: string;
+ /**
+ * Poll interval in milliseconds (used by the wait-for-completion `extract()` method)
+ */
+ pollInterval?: number;
+ /**
+ * Maximum wait time in seconds (used by the wait-for-completion `extract()` method)
+ */
+ timeout?: number;
+ }
+ /**
+ * Extract job status
+ */
+ type ExtractStatus = "processing" | "completed" | "failed" | "cancelled";
+ /**
+ * Response from extract endpoint
+ */
+ interface ExtractResponse {
+ /**
+ * Whether the request was successful
+ */
+ success?: boolean;
+ /**
+ * Extract job ID (for async operations)
+ */
+ id?: string;
+ /**
+ * Job status
+ */
+ status?: ExtractStatus;
+ /**
+ * Extracted data
+ */
+ data?: unknown;
+ /**
+ * Error message if failed
+ */
+ error?: string;
+ /**
+ * Warning message
+ */
+ warning?: string;
+ /**
+ * Source URLs used for extraction
+ */
+ sources?: Record<string, unknown>;
+ /**
+ * Job expiration time
+ */
+ expiresAt?: string;
+ /**
+ * Credits used
+ */
+ creditsUsed?: number;
+ }
+ /**
+ * Concurrency usage information
+ */
+ interface ConcurrencyInfo {
+ /**
+ * Current active requests
+ */
+ concurrency: number;
+ /**
+ * Maximum allowed concurrent requests
+ */
+ maxConcurrency: number;
+ }
+ /**
+ * Credit usage information
+ */
+ interface CreditUsage {
+ /**
+ * Remaining credits
+ */
+ remainingCredits: number;
+ /**
+ * Total plan credits
+ */
+ planCredits?: number;
+ /**
+ * Billing period start date
+ */
+ billingPeriodStart?: string | null;
+ /**
+ * Billing period end date
+ */
+ billingPeriodEnd?: string | null;
+ }
+ /**
+ * Token usage information
+ */
+ interface TokenUsage {
+ /**
+ * Remaining tokens
+ */
+ remainingTokens: number;
+ /**
+ * Total plan tokens
+ */
+ planTokens?: number;
+ /**
+ * Billing period start date
+ */
+ billingPeriodStart?: string | null;
+ /**
+ * Billing period end date
+ */
+ billingPeriodEnd?: string | null;
+ }
+ /**
+ * Queue status information
+ */
+ interface QueueStatus {
+ /**
+ * Whether the request was successful
+ */
+ success: boolean;
+ /**
+ * Total jobs in queue
+ */
+ jobsInQueue: number;
+ /**
+ * Active jobs currently processing
+ */
+ activeJobsInQueue: number;
+ /**
+ * Jobs waiting to be processed
+ */
+ waitingJobsInQueue: number;
+ /**
+ * Maximum concurrency
+ */
+ maxConcurrency: number;
+ /**
+ * Timestamp of most recent successful job
+ */
+ mostRecentSuccess: string | null;
+ }
+ /**
+ * Crawl error information
+ */
+ interface CrawlError {
+ /**
+ * Error ID
+ */
+ id: string;
+ /**
+ * Timestamp of error
+ */
+ timestamp?: string;
+ /**
+ * URL that caused the error
+ */
+ url: string;
+ /**
+ * Error code
+ */
+ code?: string;
+ /**
+ * Error message
+ */
+ error: string;
+ }
+ /**
+ * Response from crawl errors endpoint
+ */
+ interface CrawlErrorsResponse {
+ /**
+ * List of errors encountered
+ */
+ errors: CrawlError[];
+ /**
+ * URLs blocked by robots.txt
+ */
+ robotsBlocked: string[];
+ }
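+ /**
+ * Illustrative sketch (not part of the published typings): inspecting crawl errors
+ * after a crawl finishes. Method and field names follow the declarations in this
+ * file; the root URL is a placeholder.
+ *
+ * ```typescript
+ * const job = await client.crawl('https://example.com', { limit: 25 });
+ * const report = await client.getCrawlErrors(job.id);
+ *
+ * for (const err of report.errors) {
+ *   console.warn(`${err.url} failed: ${err.error} (code: ${err.code ?? 'n/a'})`);
+ * }
+ * if (report.robotsBlocked.length > 0) {
+ *   console.info('Blocked by robots.txt:', report.robotsBlocked);
+ * }
+ * ```
+ */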
+ /**
+ * Pagination configuration for large result sets
+ */
+ interface PaginationConfig {
+ /**
+ * Automatically fetch all pages (default: true)
+ */
+ autoPaginate?: boolean;
+ /**
+ * Maximum number of pages to fetch
+ */
+ maxPages?: number;
+ /**
+ * Maximum total results to return
+ */
+ maxResults?: number;
+ /**
+ * Maximum time to spend fetching pages (seconds)
+ */
+ maxWaitTime?: number;
+ }
+
+ /**
+ * CrawlGate SDK Client
+ *
+ * @example
+ * ```typescript
+ * import { CrawlGateClient } from '@crawlgate/sdk';
+ *
+ * const client = new CrawlGateClient({
+ * apiKey: 'sk_live_...',
+ * apiUrl: 'https://api.crawlgate.io'
+ * });
+ *
+ * // Scrape a single URL
+ * const doc = await client.scrape('https://example.com', {
+ * engine: 'smart',
+ * formats: ['markdown', 'html']
+ * });
+ *
+ * // Batch scrape multiple URLs
+ * const job = await client.batchScrape(['https://a.com', 'https://b.com'], {
+ * options: { formats: ['markdown'] }
+ * });
+ *
+ * // Crawl a website
+ * const crawlJob = await client.crawl('https://example.com', {
+ * limit: 10,
+ * engine: 'dynamic'
+ * });
+ *
+ * // Extract structured data with LLM
+ * const extracted = await client.extract({
+ * urls: ['https://example.com/product'],
+ * schema: { name: 'string', price: 'number' },
+ * provider: 'openai'
+ * });
+ *
+ * // Search the web
+ * const results = await client.search('best restaurants', {
+ * limit: 5,
+ * scrapeOptions: { formats: ['markdown'] }
+ * });
+ * ```
+ */
+ declare class CrawlGateClient {
+ private readonly http;
+ /**
+ * Create a new CrawlGate client
+ *
+ * @param options - Client configuration options
+ * @throws {CrawlGateError} If API key is not provided
+ */
+ constructor(options?: CrawlGateClientOptions);
+ /**
+ * Scrape a single URL
+ *
+ * @param url - URL to scrape
+ * @param options - Scrape options
+ * @returns Scraped document with requested formats
+ *
+ * @example
+ * ```typescript
+ * const doc = await client.scrape('https://example.com', {
+ * engine: 'smart',
+ * formats: ['markdown', 'html'],
+ * onlyMainContent: true
+ * });
+ * console.log(doc.markdown);
+ * ```
+ *
+ * @example With LLM extraction
+ * ```typescript
+ * import { z } from 'zod';
+ *
+ * const schema = z.object({
+ * title: z.string(),
+ * price: z.number(),
+ * inStock: z.boolean()
+ * });
+ *
+ * const doc = await client.scrape('https://example.com/product', {
+ * engine: 'smart',
+ * extract: {
+ * schema,
+ * systemPrompt: 'Extract product details',
+ * provider: 'openai'
+ * }
+ * });
+ * console.log(doc.extract?.data);
+ * ```
+ */
+ scrape(url: string, options?: ScrapeOptions): Promise<Document>;
+ /**
+ * Start a batch scrape job (async)
+ *
+ * @param urls - Array of URLs to scrape
+ * @param options - Batch scrape options
+ * @returns Batch job ID and initial status
+ *
+ * @example
+ * ```typescript
+ * const { id } = await client.startBatchScrape(
+ * ['https://a.com', 'https://b.com', 'https://c.com'],
+ * { options: { formats: ['markdown'] } }
+ * );
+ *
+ * // Poll manually
+ * let status = await client.getBatchScrapeStatus(id);
+ * while (status.status === 'scraping') {
+ * await new Promise(r => setTimeout(r, 2000));
+ * status = await client.getBatchScrapeStatus(id);
+ * }
+ * ```
+ */
+ startBatchScrape(urls: string[], options?: BatchScrapeOptions): Promise<BatchScrapeResponse>;
+ /**
+ * Get batch scrape job status and data
+ *
+ * @param jobId - Batch job ID
+ * @returns Current job status and scraped data
+ */
+ getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ /**
+ * Cancel a batch scrape job
+ *
+ * @param jobId - Batch job ID
+ * @returns True if cancelled successfully
+ */
+ cancelBatchScrape(jobId: string): Promise<boolean>;
+ /**
+ * Get batch scrape job errors
+ *
+ * @param jobId - Batch job ID
+ * @returns Errors and robots.txt blocked URLs
+ */
+ getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse>;
+ /**
+ * Batch scrape multiple URLs and wait for completion
+ *
+ * @param urls - Array of URLs to scrape
+ * @param options - Batch options including pollInterval and timeout
+ * @returns Final job with all scraped data
+ *
+ * @example
+ * ```typescript
+ * const job = await client.batchScrape(
+ * ['https://a.com', 'https://b.com', 'https://c.com'],
+ * {
+ * options: { formats: ['markdown'], engine: 'smart' },
+ * pollInterval: 2000,
+ * timeout: 300
+ * }
+ * );
+ *
+ * console.log(`Scraped ${job.completed} URLs`);
+ * job.data.forEach(doc => console.log(doc.url, doc.markdown?.length));
+ * ```
+ */
+ batchScrape(urls: string[], options?: BatchScrapeOptions): Promise<BatchScrapeJob>;
+ /**
+ * Start a crawl job (async)
+ *
+ * Use this method when you want to start a crawl and manage polling yourself.
+ * For automatic polling, use the `crawl()` method instead.
+ *
+ * @param url - Root URL to crawl
+ * @param options - Crawl options
+ * @returns Crawl job ID and initial status
+ *
+ * @example
+ * ```typescript
+ * const { id } = await client.startCrawl('https://example.com', {
+ * limit: 10,
+ * engine: 'dynamic'
+ * });
+ *
+ * // Poll for status manually
+ * let status = await client.getCrawlStatus(id);
+ * while (status.status === 'scraping') {
+ * await new Promise(r => setTimeout(r, 2000));
+ * status = await client.getCrawlStatus(id);
+ * }
+ * ```
+ */
+ startCrawl(url: string, options?: CrawlOptions): Promise<CrawlResponse>;
+ /**
+ * Get crawl job status and data
+ *
+ * @param jobId - Crawl job ID
+ * @returns Current job status and scraped data
+ */
+ getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ /**
+ * Cancel a crawl job
+ *
+ * @param jobId - Crawl job ID
+ * @returns True if cancelled successfully
+ */
+ cancelCrawl(jobId: string): Promise<boolean>;
+ /**
+ * Get crawl job errors and robots.txt blocks
+ *
+ * @param jobId - Crawl job ID
+ * @returns Errors and robots.txt blocked URLs
+ */
+ getCrawlErrors(jobId: string): Promise<CrawlErrorsResponse>;
+ /**
+ * Crawl a website and wait for completion
+ *
+ * This method starts a crawl job and automatically polls until completion.
+ *
+ * @param url - Root URL to crawl
+ * @param options - Crawl options including pollInterval and timeout
+ * @returns Final crawl job with all scraped data
+ *
+ * @example
+ * ```typescript
+ * const job = await client.crawl('https://example.com', {
+ * limit: 10,
+ * engine: 'dynamic',
+ * formats: ['markdown'],
+ * pollInterval: 2000, // Poll every 2 seconds
+ * timeout: 300 // 5 minute timeout
+ * });
+ *
+ * console.log(`Crawled ${job.completed} pages`);
+ * job.data.forEach(doc => console.log(doc.url));
+ * ```
+ */
+ crawl(url: string, options?: CrawlOptions): Promise<CrawlJob>;
+ /**
+ * Start an extract job (async)
+ *
+ * @param options - Extract request options
+ * @returns Extract job ID or immediate result
+ *
+ * @example
+ * ```typescript
+ * const { id } = await client.startExtract({
+ * urls: ['https://example.com/product'],
+ * schema: { name: 'string', price: 'number' },
+ * provider: 'openai'
+ * });
+ *
+ * // Poll manually
+ * let status = await client.getExtractStatus(id);
+ * while (status.status === 'processing') {
+ * await new Promise(r => setTimeout(r, 2000));
+ * status = await client.getExtractStatus(id);
+ * }
+ * console.log(status.data);
+ * ```
+ */
+ startExtract(options: ExtractRequestOptions): Promise<ExtractResponse>;
+ /**
+ * Get extract job status and data
+ *
+ * @param jobId - Extract job ID
+ * @returns Current job status and extracted data
+ */
+ getExtractStatus(jobId: string): Promise<ExtractResponse>;
+ /**
+ * Extract structured data from URLs using an LLM and wait for completion
+ *
+ * @param options - Extract options including schema, prompt, and timeout
+ * @returns Final extract result with structured data
+ *
+ * @example With Zod schema
+ * ```typescript
+ * import { z } from 'zod';
+ *
+ * const result = await client.extract({
+ * urls: ['https://example.com/product'],
+ * schema: z.object({
+ * name: z.string(),
+ * price: z.number(),
+ * inStock: z.boolean(),
+ * features: z.array(z.string())
+ * }),
+ * systemPrompt: 'Extract product information from the page',
+ * provider: 'openai',
+ * timeout: 60
+ * });
+ *
+ * console.log(result.data);
+ * ```
+ *
+ * @example With natural language prompt
+ * ```typescript
+ * const result = await client.extract({
+ * urls: ['https://example.com/about'],
+ * prompt: 'Extract the company name, founding year, and list of team members',
+ * enableWebSearch: true
+ * });
+ *
+ * console.log(result.data);
+ * ```
+ */
+ extract(options: ExtractRequestOptions): Promise<ExtractResponse>;
+ /**
+ * Map a website to discover all URLs
+ *
+ * @param url - Root URL to map
+ * @param options - Map options
+ * @returns List of discovered URLs
+ *
+ * @example
+ * ```typescript
+ * const result = await client.map('https://example.com', {
+ * engine: 'dynamic'
+ * });
+ *
+ * console.log(`Found ${result.count} URLs:`);
+ * result.links.forEach(url => console.log(url));
+ * ```
+ */
+ map(url: string, options?: MapOptions): Promise<MapResponse>;
+ /**
+ * Search the web and optionally scrape results
+ *
+ * @param query - Search query
+ * @param options - Search options
+ * @returns Search results with optional scraped content
+ *
+ * @example Basic search
+ * ```typescript
+ * const results = await client.search('best restaurants in NYC', {
+ * limit: 10,
+ * lang: 'en',
+ * country: 'us'
+ * });
+ *
+ * results.data.forEach(r => {
+ * console.log(`${r.title}: ${r.url}`);
+ * });
+ * ```
+ *
+ * @example Search with scraping
+ * ```typescript
+ * const results = await client.search('best laptops 2024', {
+ * limit: 5,
+ * scrapeOptions: {
+ * formats: ['markdown']
+ * },
+ * engine: 'smart'
+ * });
+ *
+ * results.data.forEach(r => {
+ * console.log(r.title);
+ * console.log(r.markdown?.substring(0, 200));
+ * });
+ * ```
+ *
+ * @example Search with LLM extraction
+ * ```typescript
+ * import { z } from 'zod';
+ *
+ * const results = await client.search('iPhone 15 Pro reviews', {
+ * limit: 5,
+ * scrapeOptions: { formats: ['markdown'] },
+ * extract: {
+ * schema: z.object({
+ * pros: z.array(z.string()),
+ * cons: z.array(z.string()),
+ * rating: z.number()
+ * }),
+ * systemPrompt: 'Extract review summary from the content'
+ * }
+ * });
+ *
+ * console.log(results.extract?.data);
+ * ```
+ */
+ search(query: string, options?: SearchOptions): Promise<SearchResponse>;
+ /**
+ * Get current concurrency usage
+ *
+ * @returns Current and max concurrency
+ *
+ * @example
+ * ```typescript
+ * const { concurrency, maxConcurrency } = await client.getConcurrency();
+ * console.log(`Using ${concurrency}/${maxConcurrency} concurrent requests`);
+ * ```
+ */
+ getConcurrency(): Promise<ConcurrencyInfo>;
+ /**
+ * Get current credit usage
+ *
+ * @returns Credit usage information
+ *
+ * @example
+ * ```typescript
+ * const credits = await client.getCreditUsage();
+ * console.log(`Remaining credits: ${credits.remainingCredits}`);
+ * ```
+ */
+ getCreditUsage(): Promise<CreditUsage>;
+ /**
+ * Get current token usage (for LLM extraction)
+ *
+ * @returns Token usage information
+ *
+ * @example
+ * ```typescript
+ * const tokens = await client.getTokenUsage();
+ * console.log(`Remaining tokens: ${tokens.remainingTokens}`);
+ * ```
+ */
+ getTokenUsage(): Promise<TokenUsage>;
+ /**
+ * Get queue status information
+ *
+ * @returns Queue status metrics
+ *
+ * @example
+ * ```typescript
+ * const queue = await client.getQueueStatus();
+ * console.log(`Jobs in queue: ${queue.jobsInQueue}`);
+ * console.log(`Active: ${queue.activeJobsInQueue}, Waiting: ${queue.waitingJobsInQueue}`);
+ * ```
+ */
+ getQueueStatus(): Promise<QueueStatus>;
+ }
+
+ /**
+ * Base error class for CrawlGate SDK errors
+ */
+ declare class CrawlGateError extends Error {
+ /**
+ * HTTP status code (if applicable)
+ */
+ readonly statusCode?: number;
+ /**
+ * Error code for programmatic handling
+ */
+ readonly code?: string;
+ /**
+ * Additional error details
+ */
+ readonly details?: unknown;
+ constructor(message: string, statusCode?: number, code?: string, details?: unknown);
+ }
+ /**
+ * Error thrown when authentication fails
+ */
+ declare class AuthenticationError extends CrawlGateError {
+ constructor(message?: string);
+ }
+ /**
+ * Error thrown when request validation fails
+ */
+ declare class ValidationError extends CrawlGateError {
+ constructor(message: string, details?: unknown);
+ }
+ /**
+ * Error thrown when a crawl job times out
+ */
+ declare class JobTimeoutError extends CrawlGateError {
+ /**
+ * Job ID that timed out
+ */
+ readonly jobId: string;
+ /**
+ * Timeout duration in seconds
+ */
+ readonly timeoutSeconds: number;
+ constructor(jobId: string, timeoutSeconds: number);
+ }
+ /**
+ * Error thrown when upstream service is unavailable
+ */
+ declare class ServiceUnavailableError extends CrawlGateError {
+ constructor(message?: string);
+ }
+ /**
+ * Error thrown when rate limit is exceeded
+ */
+ declare class RateLimitError extends CrawlGateError {
+ /**
+ * Time to wait before retrying (in seconds)
+ */
+ readonly retryAfter?: number;
+ constructor(message?: string, retryAfter?: number);
+ }
+ /**
+ * Error thrown when LLM extraction fails
+ */
+ declare class ExtractionError extends CrawlGateError {
+ /**
+ * Provider that failed
+ */
+ readonly provider?: string;
+ constructor(message: string, provider?: string);
+ }
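+ /**
+ * Illustrative sketch (not part of the published typings): handling SDK errors with
+ * the classes declared above. The fallback values are placeholders, not behavior
+ * documented by the package.
+ *
+ * ```typescript
+ * try {
+ *   const doc = await client.scrape('https://example.com');
+ *   console.log(doc.markdown);
+ * } catch (err) {
+ *   if (err instanceof RateLimitError) {
+ *     console.warn(`Rate limited; retry after ${err.retryAfter ?? 60}s`);
+ *   } else if (err instanceof AuthenticationError) {
+ *     console.error('Check CRAWLGATE_API_KEY or the apiKey option');
+ *   } else if (err instanceof JobTimeoutError) {
+ *     console.error(`Job ${err.jobId} exceeded ${err.timeoutSeconds}s`);
+ *   } else if (err instanceof CrawlGateError) {
+ *     console.error(err.statusCode, err.code, err.message);
+ *   } else {
+ *     throw err;
+ *   }
+ * }
+ * ```
+ */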
+
+ export { AuthenticationError, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse, type ConcurrencyInfo, type CrawlError, type CrawlErrorsResponse, CrawlGateClient, type CrawlGateClientOptions, CrawlGateError, type CrawlJob, type CrawlOptions, type CrawlResponse, type CrawlStatus, type CreditUsage, type Document, type DocumentMetadata, type Engine, type ExtractOptions, type ExtractRequestOptions, type ExtractResponse, type ExtractResult, type ExtractStatus, ExtractionError, type FormatType, JobTimeoutError, type JsonSchema, type LLMProvider, type MapOptions, type MapResponse, type PaginationConfig, type ProxyOption, type QueueStatus, RateLimitError, type ScrapeOptions, type ScrapeResponse, type SearchOptions, type SearchResponse, type SearchResult, ServiceUnavailableError, type TokenUsage, ValidationError, type WebhookConfig, CrawlGateClient as default };