@crawlgate/sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,397 @@
+ # @crawlgate/sdk
+
+ Official JavaScript/TypeScript SDK for the CrawlGate Search Engine API.
+
+ ## Installation
+
+ ```bash
+ npm install @crawlgate/sdk
+ ```
+
+ ## Quick Start
+
+ ```typescript
+ import { CrawlGateClient } from '@crawlgate/sdk';
+
+ const client = new CrawlGateClient({
+   apiKey: 'sk_live_...',
+   apiUrl: 'https://api.crawlgate.io' // or your self-hosted URL
+ });
+
+ // Scrape a single page
+ const doc = await client.scrape('https://example.com');
+ console.log(doc.markdown);
+ ```
+
+ ## Configuration
+
+ ```typescript
+ const client = new CrawlGateClient({
+   // Required: API key (or set CRAWLGATE_API_KEY env var)
+   apiKey: 'sk_live_...',
+
+   // Optional: API URL (default: https://api.crawlgate.io)
+   apiUrl: 'https://api.crawlgate.io',
+
+   // Optional: Request timeout in ms (default: 90000)
+   timeoutMs: 90000,
+
+   // Optional: Max retries for failed requests (default: 3)
+   maxRetries: 3,
+
+   // Optional: Backoff factor for retries, in seconds (default: 0.5)
+   backoffFactor: 0.5
+ });
+ ```
+
+ ## Engines
+
+ CrawlGate supports three scraping engines:
+
+ | Engine | Description | Best For |
+ |--------|-------------|----------|
+ | `static` | Axios + Cheerio (no browser) | Fast, simple pages |
+ | `dynamic` | Playwright headless browser | JavaScript-heavy sites |
+ | `smart` | Auto-selects static/dynamic | Cost optimization |
+
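+ The engine is chosen per request via the `engine` option. A quick sketch reusing the `scrape` call from Quick Start (URLs are illustrative):
+
+ ```typescript
+ // Force the lightweight static engine for a simple page
+ const fast = await client.scrape('https://example.com', { engine: 'static' });
+
+ // Use the headless browser for a JavaScript-heavy app
+ const rendered = await client.scrape('https://app.example.com', { engine: 'dynamic' });
+
+ // Let CrawlGate decide, using the browser only when the page needs it
+ const auto = await client.scrape('https://example.com', { engine: 'smart' });
+ ```
+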
+ ## API Reference
+
+ ### Scrape
+
+ Scrape a single URL.
+
+ ```typescript
+ const doc = await client.scrape('https://example.com', {
+   engine: 'smart', // 'static' | 'dynamic' | 'smart'
+   formats: ['markdown', 'html'],
+   onlyMainContent: true,
+   excludeTags: ['nav', 'footer'],
+   waitFor: 1000, // Wait in ms before scraping (dynamic engine only)
+   timeout: 30000,
+   proxy: 'stealth' // 'iproyal' | 'stealth' | 'tor'
+ });
+
+ console.log(doc.markdown);
+ console.log(doc.metadata?.title);
+ ```
+
+ ### Scrape with LLM Extraction
+
+ Extract structured data using AI.
+
+ ```typescript
+ import { z } from 'zod';
+
+ const schema = z.object({
+   productName: z.string(),
+   price: z.number(),
+   inStock: z.boolean(),
+   features: z.array(z.string())
+ });
+
+ const doc = await client.scrape('https://example.com/product', {
+   engine: 'smart',
+   extract: {
+     schema,
+     systemPrompt: 'Extract product information from the page',
+     provider: 'openai', // 'openai' | 'anthropic'
+     enableFallback: true // Try the other provider if the primary fails
+   }
+ });
+
+ console.log(doc.extract?.data);
+ // { productName: '...', price: 99.99, inStock: true, features: [...] }
+ ```
+
+ ### Batch Scrape
+
+ Scrape multiple URLs in a single job.
+
+ ```typescript
+ // Method 1: Wait for completion (recommended)
+ const job = await client.batchScrape(
+   ['https://a.com', 'https://b.com', 'https://c.com'],
+   {
+     options: {
+       formats: ['markdown'],
+       engine: 'smart'
+     },
+     pollInterval: 2000, // Poll every 2 seconds
+     timeout: 300 // Max wait time in seconds
+   }
+ );
+
+ console.log(`Scraped ${job.completed} URLs`);
+ job.data.forEach(doc => {
+   console.log(doc.url, doc.markdown?.length);
+ });
+
+ // Method 2: Manual polling
+ const { id } = await client.startBatchScrape(
+   ['https://a.com', 'https://b.com'],
+   { options: { formats: ['markdown'] } }
+ );
+
+ let status = await client.getBatchScrapeStatus(id);
+ while (status.status === 'scraping') {
+   console.log(`Progress: ${status.completed}/${status.total}`);
+   await new Promise(r => setTimeout(r, 2000));
+   status = await client.getBatchScrapeStatus(id);
+ }
+
+ // Cancel a batch job
+ await client.cancelBatchScrape(id);
+
+ // Get errors
+ const errors = await client.getBatchScrapeErrors(id);
+ console.log('Failed URLs:', errors.errors.map(e => e.url));
+ ```
+
+ ### Crawl
+
+ Crawl multiple pages from a website.
+
+ ```typescript
+ // Method 1: Wait for completion (recommended)
+ const job = await client.crawl('https://example.com', {
+   limit: 10,
+   engine: 'dynamic',
+   formats: ['markdown'],
+   pollInterval: 2000, // Poll every 2 seconds
+   timeout: 300 // Max wait time in seconds
+ });
+
+ console.log(`Crawled ${job.completed} pages`);
+ job.data.forEach(doc => {
+   console.log(doc.url, doc.markdown?.length);
+ });
+
+ // Method 2: Manual polling
+ const { id } = await client.startCrawl('https://example.com', { limit: 10 });
+
+ let status = await client.getCrawlStatus(id);
+ while (status.status === 'scraping') {
+   console.log(`Progress: ${status.completed}/${status.total}`);
+   await new Promise(r => setTimeout(r, 2000));
+   status = await client.getCrawlStatus(id);
+ }
+
+ // Cancel a crawl job
+ await client.cancelCrawl(id);
+
+ // Get errors
+ const errors = await client.getCrawlErrors(id);
+ console.log('Failed URLs:', errors.errors.map(e => e.url));
+ console.log('Blocked by robots.txt:', errors.robotsBlocked);
+ ```
+
+ ### Extract (Standalone LLM Extraction)
+
+ Extract structured data from URLs using an LLM.
+
+ ```typescript
+ import { z } from 'zod';
+
+ // With a Zod schema
+ const result = await client.extract({
+   urls: ['https://example.com/product'],
+   schema: z.object({
+     name: z.string(),
+     price: z.number(),
+     inStock: z.boolean(),
+     features: z.array(z.string())
+   }),
+   systemPrompt: 'Extract product information from the page',
+   provider: 'openai',
+   timeout: 60
+ });
+
+ console.log(result.data);
+
+ // With a natural language prompt
+ const promptResult = await client.extract({
+   urls: ['https://example.com/about'],
+   prompt: 'Extract the company name, founding year, and list of team members',
+   enableWebSearch: true
+ });
+
+ console.log(promptResult.data);
+
+ // Manual polling
+ const { id } = await client.startExtract({
+   urls: ['https://example.com'],
+   schema: { name: 'string', price: 'number' }
+ });
+
+ let status = await client.getExtractStatus(id);
+ while (status.status === 'processing') {
+   await new Promise(r => setTimeout(r, 2000));
+   status = await client.getExtractStatus(id);
+ }
+ console.log(status.data);
+ ```
+
+ ### Map
+
+ Discover all URLs on a website.
+
+ ```typescript
+ const result = await client.map('https://example.com', {
+   engine: 'dynamic'
+ });
+
+ console.log(`Found ${result.count} URLs:`);
+ result.links.forEach(url => console.log(url));
+ ```
+
+ ### Search
+
+ Search the web with optional scraping.
+
+ ```typescript
+ import { z } from 'zod';
+
+ // Basic search
+ const results = await client.search('best restaurants in NYC', {
+   limit: 10,
+   lang: 'en',
+   country: 'us'
+ });
+
+ results.data.forEach(r => {
+   console.log(`${r.title}: ${r.url}`);
+ });
+
+ // Search + scrape each result
+ const scraped = await client.search('best laptops 2024', {
+   limit: 5,
+   scrapeOptions: {
+     formats: ['markdown']
+   },
+   engine: 'smart'
+ });
+
+ scraped.data.forEach(r => {
+   console.log(r.title);
+   console.log(r.markdown?.substring(0, 500));
+ });
+
+ // Search + LLM extraction
+ const reviews = await client.search('iPhone reviews', {
+   limit: 5,
+   scrapeOptions: { formats: ['markdown'] },
+   extract: {
+     schema: z.object({
+       sentiment: z.enum(['positive', 'negative', 'neutral']),
+       rating: z.number().optional(),
+       summary: z.string()
+     }),
+     systemPrompt: 'Analyze the review sentiment'
+   }
+ });
+
+ console.log(reviews.extract?.data);
+ ```
+
+ ### Usage & Monitoring
+
+ Monitor your API usage.
+
+ ```typescript
+ // Get concurrency usage
+ const { concurrency, maxConcurrency } = await client.getConcurrency();
+ console.log(`Using ${concurrency}/${maxConcurrency} concurrent requests`);
+
+ // Get credit usage
+ const credits = await client.getCreditUsage();
+ console.log(`Remaining credits: ${credits.remainingCredits}`);
+
+ // Get token usage (for LLM extraction)
+ const tokens = await client.getTokenUsage();
+ console.log(`Remaining tokens: ${tokens.remainingTokens}`);
+
+ // Get queue status
+ const queue = await client.getQueueStatus();
+ console.log(`Jobs in queue: ${queue.jobsInQueue}`);
+ console.log(`Active: ${queue.activeJobsInQueue}, Waiting: ${queue.waitingJobsInQueue}`);
+ ```
+
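+ These endpoints compose naturally into client-side throttling. A minimal sketch that holds new submissions until a concurrency slot frees up (the polling interval is an illustrative choice; `waitForSlot` is not part of the SDK):
+
+ ```typescript
+ import { CrawlGateClient } from '@crawlgate/sdk';
+
+ // Poll getConcurrency() until a slot is free, then return.
+ async function waitForSlot(client: CrawlGateClient): Promise<void> {
+   for (;;) {
+     const { concurrency, maxConcurrency } = await client.getConcurrency();
+     if (concurrency < maxConcurrency) return;
+     await new Promise(r => setTimeout(r, 1000)); // check again in 1s
+   }
+ }
+
+ // Usage: await waitForSlot(client) before each scrape call.
+ ```
+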
+ ## Error Handling
+
+ ```typescript
+ import {
+   CrawlGateClient,
+   CrawlGateError,
+   AuthenticationError,
+   ValidationError,
+   JobTimeoutError,
+   RateLimitError
+ } from '@crawlgate/sdk';
+
+ const client = new CrawlGateClient({ apiKey: 'sk_live_...' });
+
+ try {
+   const doc = await client.scrape('https://example.com');
+ } catch (error) {
+   if (error instanceof AuthenticationError) {
+     console.error('Invalid API key');
+   } else if (error instanceof ValidationError) {
+     console.error('Invalid request:', error.message);
+   } else if (error instanceof JobTimeoutError) {
+     console.error(`Job ${error.jobId} timed out after ${error.timeoutSeconds}s`);
+   } else if (error instanceof RateLimitError) {
+     console.error('Rate limited, retry after:', error.retryAfter);
+   } else if (error instanceof CrawlGateError) {
+     console.error('API error:', error.message, error.statusCode);
+   }
+ }
+ ```
+
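+ `RateLimitError` carries a `retryAfter` hint. A minimal sketch of one manual retry that honors it (assuming `retryAfter` is a delay in seconds; `scrapeWithRetry` is not part of the SDK, and the client already retries internally per `maxRetries`):
+
+ ```typescript
+ import { CrawlGateClient, RateLimitError } from '@crawlgate/sdk';
+
+ // Retry a scrape once after a rate limit, waiting out the hinted delay.
+ async function scrapeWithRetry(client: CrawlGateClient, url: string) {
+   try {
+     return await client.scrape(url);
+   } catch (error) {
+     if (error instanceof RateLimitError) {
+       // Assumption: retryAfter is in seconds; fall back to 5s if absent
+       await new Promise(r => setTimeout(r, (error.retryAfter ?? 5) * 1000));
+       return client.scrape(url);
+     }
+     throw error;
+   }
+ }
+ ```
+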
+ ## TypeScript Support
+
+ Full TypeScript support with exported types:
+
+ ```typescript
+ import type {
+   CrawlGateClientOptions,
+   ScrapeOptions,
+   CrawlOptions,
+   MapOptions,
+   SearchOptions,
+   Document,
+   CrawlJob,
+   SearchResponse,
+   Engine,
+   ExtractOptions,
+   // Batch scrape
+   BatchScrapeOptions,
+   BatchScrapeJob,
+   BatchScrapeResponse,
+   // Extract
+   ExtractRequestOptions,
+   ExtractResponse,
+   // Usage
+   ConcurrencyInfo,
+   CreditUsage,
+   TokenUsage,
+   QueueStatus,
+   // Errors
+   CrawlError,
+   CrawlErrorsResponse
+ } from '@crawlgate/sdk';
+ ```
+
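+ The exported types flow through your own helpers. A sketch (assuming `scrape` resolves to a `Document`; `scrapeAll` is not part of the SDK):
+
+ ```typescript
+ import { CrawlGateClient } from '@crawlgate/sdk';
+ import type { Document, ScrapeOptions } from '@crawlgate/sdk';
+
+ // Scrape several URLs concurrently, preserving typing end to end.
+ async function scrapeAll(
+   client: CrawlGateClient,
+   urls: string[],
+   options?: ScrapeOptions
+ ): Promise<Document[]> {
+   return Promise.all(urls.map(url => client.scrape(url, options)));
+ }
+ ```
+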
+ ## Environment Variables
+
+ ```bash
+ # API key (used if not passed to constructor)
+ CRAWLGATE_API_KEY=sk_live_...
+
+ # API URL (used if not passed to constructor)
+ CRAWLGATE_API_URL=https://api.crawlgate.io
+ ```
+
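+ With these set, the constructor options can be omitted (a sketch, assuming the constructor falls back to the environment as described above):
+
+ ```typescript
+ import { CrawlGateClient } from '@crawlgate/sdk';
+
+ // Reads CRAWLGATE_API_KEY and CRAWLGATE_API_URL from the environment
+ const client = new CrawlGateClient();
+ ```
+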
+ ## Documentation
+
+ Full documentation is available at [docs.crawlgate.io](https://docs.crawlgate.io).
+
+ ## Support
+
+ - Website: [crawlgate.io](https://crawlgate.io)
+ - Documentation: [docs.crawlgate.io](https://docs.crawlgate.io)