@crawlgate/sdk 1.0.0
- package/README.md +397 -0
- package/dist/index.cjs +1299 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1356 -0
- package/dist/index.d.ts +1356 -0
- package/dist/index.js +1255 -0
- package/dist/index.js.map +1 -0
- package/package.json +60 -0
package/README.md
ADDED
@@ -0,0 +1,397 @@
# @crawlgate/sdk

Official JavaScript/TypeScript SDK for CrawlGate Search Engine API.

## Installation

```bash
npm install @crawlgate/sdk
```

## Quick Start

```typescript
import { CrawlGateClient } from '@crawlgate/sdk';

const client = new CrawlGateClient({
  apiKey: 'sk_live_...',
  apiUrl: 'https://api.crawlgate.io' // or your self-hosted URL
});

// Scrape a single page
const doc = await client.scrape('https://example.com');
console.log(doc.markdown);
```

## Configuration

```typescript
const client = new CrawlGateClient({
  // Required: API key (or set CRAWLGATE_API_KEY env var)
  apiKey: 'sk_live_...',

  // Optional: API URL (default: https://api.crawlgate.io)
  apiUrl: 'https://api.crawlgate.io',

  // Optional: Request timeout in ms (default: 90000)
  timeoutMs: 90000,

  // Optional: Max retries for failed requests (default: 3)
  maxRetries: 3,

  // Optional: Backoff factor for retries in seconds (default: 0.5)
  backoffFactor: 0.5
});
```

## Engines

CrawlGate supports three scraping engines:

| Engine | Description | Best For |
|--------|-------------|----------|
| `static` | Axios + Cheerio (no browser) | Fast, simple pages |
| `dynamic` | Playwright headless browser | JavaScript-heavy sites |
| `smart` | Auto-selects static/dynamic | Cost optimization |

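The engine is chosen per request via the `engine` option (see the scrape examples below). For instance:

```typescript
// Fast, browserless fetch for a simple page
const article = await client.scrape('https://example.com/article', { engine: 'static' });

// Full browser rendering for a JavaScript-heavy page
const app = await client.scrape('https://example.com/app', { engine: 'dynamic' });

// Let CrawlGate pick the cheaper engine that works for the page
const page = await client.scrape('https://example.com', { engine: 'smart' });
```
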
## API Reference

### Scrape

Scrape a single URL.

```typescript
const doc = await client.scrape('https://example.com', {
  engine: 'smart', // 'static' | 'dynamic' | 'smart'
  formats: ['markdown', 'html'],
  onlyMainContent: true,
  excludeTags: ['nav', 'footer'],
  waitFor: 1000, // Wait ms before scraping (dynamic only)
  timeout: 30000,
  proxy: 'stealth' // 'iproyal' | 'stealth' | 'tor'
});

console.log(doc.markdown);
console.log(doc.metadata?.title);
```

### Scrape with LLM Extraction

Extract structured data using AI.

```typescript
import { z } from 'zod';

const schema = z.object({
  productName: z.string(),
  price: z.number(),
  inStock: z.boolean(),
  features: z.array(z.string())
});

const doc = await client.scrape('https://example.com/product', {
  engine: 'smart',
  extract: {
    schema,
    systemPrompt: 'Extract product information from the page',
    provider: 'openai', // 'openai' | 'anthropic'
    enableFallback: true // Try other provider if primary fails
  }
});

console.log(doc.extract?.data);
// { productName: '...', price: 99.99, inStock: true, features: [...] }
```

### Batch Scrape

Scrape multiple URLs in a single job.

```typescript
// Method 1: Wait for completion (recommended)
const job = await client.batchScrape(
  ['https://a.com', 'https://b.com', 'https://c.com'],
  {
    options: {
      formats: ['markdown'],
      engine: 'smart'
    },
    pollInterval: 2000, // Poll every 2 seconds
    timeout: 300 // Max wait time in seconds
  }
);

console.log(`Scraped ${job.completed} URLs`);
job.data.forEach(doc => {
  console.log(doc.url, doc.markdown?.length);
});

// Method 2: Manual polling
const { id } = await client.startBatchScrape(
  ['https://a.com', 'https://b.com'],
  { options: { formats: ['markdown'] } }
);

let status = await client.getBatchScrapeStatus(id);
while (status.status === 'scraping') {
  console.log(`Progress: ${status.completed}/${status.total}`);
  await new Promise(r => setTimeout(r, 2000));
  status = await client.getBatchScrapeStatus(id);
}

// Cancel a batch job
await client.cancelBatchScrape(id);

// Get errors
const errors = await client.getBatchScrapeErrors(id);
console.log('Failed URLs:', errors.errors.map(e => e.url));
```

### Crawl

Crawl multiple pages from a website.

```typescript
// Method 1: Wait for completion (recommended)
const job = await client.crawl('https://example.com', {
  limit: 10,
  engine: 'dynamic',
  formats: ['markdown'],
  pollInterval: 2000, // Poll every 2 seconds
  timeout: 300 // Max wait time in seconds
});

console.log(`Crawled ${job.completed} pages`);
job.data.forEach(doc => {
  console.log(doc.url, doc.markdown?.length);
});

// Method 2: Manual polling
const { id } = await client.startCrawl('https://example.com', { limit: 10 });

let status = await client.getCrawlStatus(id);
while (status.status === 'scraping') {
  console.log(`Progress: ${status.completed}/${status.total}`);
  await new Promise(r => setTimeout(r, 2000));
  status = await client.getCrawlStatus(id);
}

// Cancel a crawl job
await client.cancelCrawl(id);

// Get errors
const errors = await client.getCrawlErrors(id);
console.log('Failed URLs:', errors.errors.map(e => e.url));
console.log('Blocked by robots.txt:', errors.robotsBlocked);
```

### Extract (Standalone LLM Extraction)

Extract structured data from URLs using an LLM.

```typescript
import { z } from 'zod';

// With a Zod schema
const result = await client.extract({
  urls: ['https://example.com/product'],
  schema: z.object({
    name: z.string(),
    price: z.number(),
    inStock: z.boolean(),
    features: z.array(z.string())
  }),
  systemPrompt: 'Extract product information from the page',
  provider: 'openai',
  timeout: 60
});

console.log(result.data);

// With a natural language prompt
const promptResult = await client.extract({
  urls: ['https://example.com/about'],
  prompt: 'Extract the company name, founding year, and list of team members',
  enableWebSearch: true
});

console.log(promptResult.data);

// Manual polling
const { id } = await client.startExtract({
  urls: ['https://example.com'],
  schema: { name: 'string', price: 'number' }
});

let status = await client.getExtractStatus(id);
while (status.status === 'processing') {
  await new Promise(r => setTimeout(r, 2000));
  status = await client.getExtractStatus(id);
}
console.log(status.data);
```

### Map

Discover all URLs on a website.

```typescript
const result = await client.map('https://example.com', {
  engine: 'dynamic'
});

console.log(`Found ${result.count} URLs:`);
result.links.forEach(url => console.log(url));
```

### Search

Search the web with optional scraping.

```typescript
import { z } from 'zod';

// Basic search
const results = await client.search('best restaurants in NYC', {
  limit: 10,
  lang: 'en',
  country: 'us'
});

results.data.forEach(r => {
  console.log(`${r.title}: ${r.url}`);
});

// Search + scrape each result
const scrapedResults = await client.search('best laptops 2024', {
  limit: 5,
  scrapeOptions: {
    formats: ['markdown']
  },
  engine: 'smart'
});

scrapedResults.data.forEach(r => {
  console.log(r.title);
  console.log(r.markdown?.substring(0, 500));
});

// Search + LLM extraction
const reviewResults = await client.search('iPhone reviews', {
  limit: 5,
  scrapeOptions: { formats: ['markdown'] },
  extract: {
    schema: z.object({
      sentiment: z.enum(['positive', 'negative', 'neutral']),
      rating: z.number().optional(),
      summary: z.string()
    }),
    systemPrompt: 'Analyze the review sentiment'
  }
});

console.log(reviewResults.extract?.data);
```

### Usage & Monitoring

Monitor your API usage.

```typescript
// Get concurrency usage
const { concurrency, maxConcurrency } = await client.getConcurrency();
console.log(`Using ${concurrency}/${maxConcurrency} concurrent requests`);

// Get credit usage
const credits = await client.getCreditUsage();
console.log(`Remaining credits: ${credits.remainingCredits}`);

// Get token usage (for LLM extraction)
const tokens = await client.getTokenUsage();
console.log(`Remaining tokens: ${tokens.remainingTokens}`);

// Get queue status
const queue = await client.getQueueStatus();
console.log(`Jobs in queue: ${queue.jobsInQueue}`);
console.log(`Active: ${queue.activeJobsInQueue}, Waiting: ${queue.waitingJobsInQueue}`);
```

## Error Handling

```typescript
import {
  CrawlGateClient,
  CrawlGateError,
  AuthenticationError,
  ValidationError,
  JobTimeoutError,
  RateLimitError
} from '@crawlgate/sdk';

const client = new CrawlGateClient({ apiKey: 'sk_live_...' });

try {
  const doc = await client.scrape('https://example.com');
} catch (error) {
  if (error instanceof AuthenticationError) {
    console.error('Invalid API key');
  } else if (error instanceof ValidationError) {
    console.error('Invalid request:', error.message);
  } else if (error instanceof JobTimeoutError) {
    console.error(`Job ${error.jobId} timed out after ${error.timeoutSeconds}s`);
  } else if (error instanceof RateLimitError) {
    console.error('Rate limited, retry after:', error.retryAfter);
  } else if (error instanceof CrawlGateError) {
    console.error('API error:', error.message, error.statusCode);
  }
}
```

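For rate limits specifically, a small retry helper can wait out the window before trying again. This is a sketch, not part of the SDK; it assumes `error.retryAfter` is a delay in seconds (check the exported types for the exact unit):

```typescript
import { RateLimitError } from '@crawlgate/sdk';

// Hypothetical helper: retry a call after the rate-limit window passes.
// Assumes `retryAfter` is a delay in seconds; verify against the SDK types.
async function withRateLimitRetry<T>(fn: () => Promise<T>, attempts = 3): Promise<T> {
  for (let attempt = 1; ; attempt++) {
    try {
      return await fn();
    } catch (error) {
      if (!(error instanceof RateLimitError) || attempt >= attempts) throw error;
      const delaySeconds = error.retryAfter ?? 1;
      await new Promise(r => setTimeout(r, delaySeconds * 1000));
    }
  }
}

const doc = await withRateLimitRetry(() => client.scrape('https://example.com'));
```
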
## TypeScript Support

Full TypeScript support with exported types:

```typescript
import type {
  CrawlGateClientOptions,
  ScrapeOptions,
  CrawlOptions,
  MapOptions,
  SearchOptions,
  Document,
  CrawlJob,
  SearchResponse,
  Engine,
  ExtractOptions,
  // Batch scrape
  BatchScrapeOptions,
  BatchScrapeJob,
  BatchScrapeResponse,
  // Extract
  ExtractRequestOptions,
  ExtractResponse,
  // Usage
  ConcurrencyInfo,
  CreditUsage,
  TokenUsage,
  QueueStatus,
  // Errors
  CrawlError,
  CrawlErrorsResponse
} from '@crawlgate/sdk';
```

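As an illustration, these types make it easy to write strongly typed wrappers around the client. The helper below is hypothetical (not part of the SDK) and uses only `ScrapeOptions` and `Document` from the list above:

```typescript
import { CrawlGateClient } from '@crawlgate/sdk';
import type { Document, ScrapeOptions } from '@crawlgate/sdk';

// Hypothetical helper: scrape several URLs one at a time with shared options.
async function scrapeAll(
  client: CrawlGateClient,
  urls: string[],
  options?: ScrapeOptions
): Promise<Document[]> {
  const docs: Document[] = [];
  for (const url of urls) {
    docs.push(await client.scrape(url, options));
  }
  return docs;
}
```
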
## Environment Variables

```bash
# API key (used if not passed to constructor)
CRAWLGATE_API_KEY=sk_live_...

# API URL (used if not passed to constructor)
CRAWLGATE_API_URL=https://api.crawlgate.io
```

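With both variables set, the client can be constructed without explicit configuration (a sketch, assuming the constructor falls back to these variables as the Configuration comments indicate):

```typescript
import { CrawlGateClient } from '@crawlgate/sdk';

// Assumes CRAWLGATE_API_KEY and CRAWLGATE_API_URL are set in the environment;
// the constructor is expected to fall back to them when no options are given.
const client = new CrawlGateClient({});

const doc = await client.scrape('https://example.com');
```
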
## Documentation

Full documentation is available at [docs.crawlgate.io](https://docs.crawlgate.io).

## Support

- Website: [crawlgate.io](https://crawlgate.io)
- Documentation: [docs.crawlgate.io](https://docs.crawlgate.io)