crawl4ai 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -5
- package/dist/index.js +237 -14
- package/dist/sdk.d.ts +63 -4
- package/dist/types.d.ts +100 -6
- package/package.json +2 -1
package/README.md
CHANGED
````diff
@@ -10,8 +10,10 @@ A type-safe TypeScript SDK for the Crawl4AI REST API. Built for modern JavaScrip
 - **Comprehensive Coverage** - All Crawl4AI endpoints including specialized features
 - **Smart Error Handling** - Custom error classes with retry logic and timeouts
 - **Batch Processing** - Efficiently crawl multiple URLs in a single request
+- **Streaming Results** - NDJSON streaming for long-running crawls
 - **Input Validation** - Built-in URL validation and parameter checking
 - **Debug Mode** - Optional request/response logging for development
+- **MCP Endpoints** - Model Context Protocol schema + SSE support
 - **Zero Dependencies** - Uses only native fetch API
 
 ## 📦 Installation
@@ -37,6 +39,8 @@ yarn add crawl4ai
 - **Official Project**: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
 - **Official Documentation**: [https://docs.crawl4ai.com/](https://docs.crawl4ai.com/)
 
+**Compatibility note:** SDK updated against Crawl4AI **v0.8.0** (January 18, 2026).
+
 
 ## 🏗️ Prerequisites
 
@@ -90,7 +94,9 @@ const results = await client.crawl({
 
 const result = results[0]; // API returns array of results
 console.log('Title:', result.metadata?.title);
-
+const markdown =
+  typeof result.markdown === 'string' ? result.markdown : result.markdown?.raw_markdown;
+console.log('Content:', markdown?.slice(0, 200));
 ```
 
 ### Configuration Options
@@ -135,6 +141,30 @@ const results = await client.crawl({
 });
 ```
 
+Need server metadata (processing time, memory stats, etc.)? Use `crawlWithMetadata`:
+
+```typescript
+const response = await client.crawlWithMetadata({
+  urls: ['https://example.com']
+});
+
+console.log(response.success);
+console.log(response.server_processing_time_s);
+```
+
+#### `crawlStream(request)` - Stream Crawl Results (NDJSON)
+Stream results as they arrive:
+
+```typescript
+for await (const chunk of client.crawlStream({ urls: ['https://example.com'] })) {
+  if ('status' in chunk) {
+    console.log('Stream status:', chunk.status);
+  } else {
+    console.log('Crawled:', chunk.url);
+  }
+}
+```
+
 
 ### Content Generation
 
@@ -144,11 +174,21 @@ Extract markdown with various filters:
 ```typescript
 const markdown = await client.markdown({
   url: 'https://example.com',
-
-
+  filter: 'fit', // 'raw' | 'fit' | 'bm25' | 'llm'
+  query: 'search query for bm25/llm filters'
 });
 ```
 
+Want the full response metadata? Use `markdownResult`:
+
+```typescript
+const markdownResult = await client.markdownResult({
+  url: 'https://example.com',
+  filter: 'fit'
+});
+console.log(markdownResult.success, markdownResult.cache);
+```
+
 #### `html(request)` - Get Processed HTML
 Get sanitized HTML for schema extraction:
 
@@ -158,6 +198,13 @@ const html = await client.html({
 });
 ```
 
+Full response metadata via `htmlResult`:
+
+```typescript
+const htmlResult = await client.htmlResult({ url: 'https://example.com' });
+console.log(htmlResult.success);
+```
+
 #### `screenshot(request)` - Capture Screenshot
 Capture full-page screenshots:
 
@@ -169,6 +216,8 @@ const screenshotBase64 = await client.screenshot({
 });
 ```
 
+For response metadata use `screenshotResult()`.
+
 #### `pdf(request)` - Generate PDF
 Generate PDF documents:
 
@@ -179,6 +228,8 @@ const pdfData = await client.pdf({
 });
 ```
 
+For response metadata use `pdfResult()`.
+
 ### JavaScript Execution
 
 #### `executeJs(request)` - Run JavaScript
@@ -239,6 +290,16 @@ const version = await client.version({ throwOnError: true });
 // Get Prometheus metrics
 const metrics = await client.metrics();
 
+// Request an API token
+const token = await client.token({ email: 'user@example.com' });
+
+// Dump server configuration (advanced)
+const configDump = await client.configDump({ code: 'CrawlerRunConfig()' });
+
+// MCP schema + SSE (advanced integrations)
+const mcpSchema = await client.mcpSchema();
+const mcpStream = await client.mcpSse();
+
 // Update configuration
 client.setApiToken('new_token');
 client.setBaseUrl('https://new-url.com');
@@ -345,7 +406,7 @@ const results = await client.crawl({
 The SDK provides custom error handling with detailed information:
 
 ```typescript
-import { Crawl4AIError } from 'crawl4ai
+import { Crawl4AIError } from 'crawl4ai';
 
 try {
   const results = await client.crawl({ urls: 'https://example.com' });
@@ -426,4 +487,4 @@ This SDK is released under the MIT License.
 
 ## 🙏 Acknowledgments
 
-Built for the amazing [Crawl4AI](https://github.com/unclecode/crawl4ai) project by [@unclecode](https://github.com/unclecode) and the Crawl4AI community.
+Built for the amazing [Crawl4AI](https://github.com/unclecode/crawl4ai) project by [@unclecode](https://github.com/unclecode) and the Crawl4AI community.
````
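Taken together, the new crawl surface composes naturally. A minimal sketch (assuming the constructor takes a `Crawl4AIConfig` with a `baseUrl`, and a Crawl4AI server on its usual default port; adjust for your deployment):

```typescript
import { Crawl4AI } from 'crawl4ai';

// Hypothetical deployment URL; Crawl4AI's Docker image usually listens on 11235.
const client = new Crawl4AI({ baseUrl: 'http://localhost:11235' });

const response = await client.crawlWithMetadata({ urls: ['https://example.com'] });
console.log('server time (s):', response.server_processing_time_s);

for (const result of response.results) {
  // `markdown` is now `string | MarkdownGenerationResult`, so narrow before use.
  const md = typeof result.markdown === 'string'
    ? result.markdown
    : result.markdown?.raw_markdown;
  console.log(result.url, md?.length ?? 0);
}
```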
package/dist/index.js
CHANGED
````diff
@@ -233,6 +233,21 @@ class Crawl4AI {
     }
     return [response];
   }
+  normalizeCrawlResponse(response) {
+    if (Array.isArray(response)) {
+      return { results: response };
+    }
+    if (typeof response === "object" && response !== null) {
+      const responseObject = response;
+      if (Array.isArray(responseObject.results)) {
+        return responseObject;
+      }
+      if (Array.isArray(responseObject.result)) {
+        return { results: responseObject.result };
+      }
+    }
+    return { results: [response] };
+  }
   buildQueryParams(params) {
     const searchParams = new URLSearchParams;
     for (const [key, value] of Object.entries(params)) {
@@ -242,6 +257,19 @@ class Crawl4AI {
     }
     return searchParams.toString();
   }
+  async parseResponseBody(response) {
+    const contentType = response.headers.get("content-type") || "";
+    if (contentType.includes("application/json")) {
+      return response.json();
+    }
+    if (contentType.includes("text/html") || contentType.includes("text/plain")) {
+      return response.text();
+    }
+    if (contentType.includes("text/event-stream") || contentType.includes("application/x-ndjson")) {
+      return response;
+    }
+    return response.text();
+  }
   async request(endpoint, options = {}) {
     const url = `${this.config.baseUrl}${endpoint}`;
     const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
@@ -260,17 +288,7 @@ class Crawl4AI {
         signal: requestSignal
       });
       clearTimeout(timeoutId);
-      const contentType = response.headers.get("content-type") || "";
-      let responseData;
-      if (contentType.includes("application/json")) {
-        responseData = await response.json();
-      } else if (contentType.includes("text/html") || contentType.includes("text/plain")) {
-        responseData = await response.text();
-      } else if (contentType.includes("text/event-stream")) {
-        return response;
-      } else {
-        responseData = await response.text();
-      }
+      const responseData = await this.parseResponseBody(response);
       this.log(`Response: ${response.status}`, responseData);
       if (!this.config.validateStatus(response.status)) {
         const headers2 = {};
@@ -300,6 +318,53 @@ class Crawl4AI {
       throw error;
     }
   }
+  async requestRaw(endpoint, options = {}) {
+    const url = `${this.config.baseUrl}${endpoint}`;
+    const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
+    this.log(`Request: ${fetchOptions.method || "GET"} ${url}`, fetchOptions.body);
+    const requestHeaders = {
+      ...this.config.defaultHeaders,
+      ...headers
+    };
+    const controller = new AbortController;
+    const timeoutId = setTimeout(() => controller.abort(), timeout);
+    const requestSignal = signal || controller.signal;
+    try {
+      const response = await fetch(url, {
+        ...fetchOptions,
+        headers: requestHeaders,
+        signal: requestSignal
+      });
+      clearTimeout(timeoutId);
+      if (!this.config.validateStatus(response.status)) {
+        const responseData = await this.parseResponseBody(response);
+        const headers2 = {};
+        response.headers.forEach((value, key) => {
+          headers2[key] = value;
+        });
+        const error = createHttpError(response.status, response.statusText, undefined, responseData, headers2);
+        error.request = {
+          url,
+          method: fetchOptions.method || "GET",
+          headers: requestHeaders,
+          body: fetchOptions.body
+        };
+        if (this.config.throwOnError) {
+          throw error;
+        }
+      }
+      return response;
+    } catch (error) {
+      clearTimeout(timeoutId);
+      if (error instanceof Error && error.name === "AbortError") {
+        throw new TimeoutError(timeout, url);
+      }
+      if (error instanceof TypeError && error.message.includes("fetch")) {
+        throw new NetworkError(`Network request failed: ${error.message}`, error);
+      }
+      throw error;
+    }
+  }
   async requestWithRetry(endpoint, options = {}) {
     let lastError = new Error("No attempts made");
     for (let attempt = 0;attempt <= this.config.retries; attempt++) {
@@ -324,6 +389,30 @@ class Crawl4AI {
     }
     throw lastError;
   }
+  async requestRawWithRetry(endpoint, options = {}) {
+    let lastError = new Error("No attempts made");
+    for (let attempt = 0;attempt <= this.config.retries; attempt++) {
+      try {
+        return await this.requestRaw(endpoint, options);
+      } catch (error) {
+        lastError = error;
+        if (error instanceof Crawl4AIError && error.status && error.status >= CLIENT_ERROR_MIN && error.status < CLIENT_ERROR_MAX && error.status !== RATE_LIMIT_STATUS) {
+          throw error;
+        }
+        if (attempt < this.config.retries) {
+          let delay = this.config.retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt;
+          if (error instanceof RateLimitError && error.retryAfter) {
+            delay = error.retryAfter * 1000;
+            this.log(`Rate limited. Waiting ${error.retryAfter}s before retry (attempt ${attempt + 1}/${this.config.retries})`);
+          } else {
+            this.log(`Retry attempt ${attempt + 1}/${this.config.retries} after ${delay}ms`);
+          }
+          await new Promise((resolve) => setTimeout(resolve, delay));
+        }
+      }
+    }
+    throw lastError;
+  }
   async crawl(request, config) {
     const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
     for (const url of urls) {
@@ -340,23 +429,131 @@ class Crawl4AI {
     });
     return this.normalizeArrayResponse(response);
   }
+  async crawlWithMetadata(request, config) {
+    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
+    for (const url of urls) {
+      this.validateUrl(url);
+    }
+    const normalizedRequest = {
+      ...request,
+      urls
+    };
+    const response = await this.requestWithRetry("/crawl", {
+      method: "POST",
+      body: JSON.stringify(normalizedRequest),
+      ...config
+    });
+    return this.normalizeCrawlResponse(response);
+  }
+  async* crawlStream(request, config) {
+    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
+    for (const url of urls) {
+      this.validateUrl(url);
+    }
+    const normalizedRequest = {
+      ...request,
+      urls
+    };
+    const response = await this.requestRawWithRetry("/crawl/stream", {
+      method: "POST",
+      body: JSON.stringify(normalizedRequest),
+      ...config
+    });
+    if (!response.body) {
+      throw new NetworkError("Streaming response body is not available");
+    }
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder;
+    let buffer = "";
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) {
+        break;
+      }
+      buffer += decoder.decode(value, { stream: true });
+      const lines = buffer.split(`
+`);
+      buffer = lines.pop() || "";
+      for (const line of lines) {
+        const trimmed = line.trim();
+        if (!trimmed) {
+          continue;
+        }
+        const parsed = JSON.parse(trimmed);
+        yield parsed;
+      }
+    }
+    const remaining = buffer.trim();
+    if (remaining) {
+      yield JSON.parse(remaining);
+    }
+  }
   async markdown(request, config) {
+    const response = await this.markdownResult(request, config);
+    return response.markdown;
+  }
+  async markdownResult(request, config) {
     this.validateUrl(request.url);
+    const apiRequest = {
+      url: request.url,
+      ...request.f !== undefined || request.filter !== undefined ? { f: request.f ?? request.filter } : {},
+      ...request.q !== undefined || request.query !== undefined ? { q: request.q ?? request.query } : {},
+      ...request.c !== undefined || request.cache !== undefined ? { c: request.c ?? request.cache } : {}
+    };
     const response = await this.requestWithRetry("/md", {
       method: "POST",
-      body: JSON.stringify(request),
+      body: JSON.stringify(apiRequest),
       ...config
     });
-
+    if (typeof response === "string") {
+      return {
+        url: request.url,
+        markdown: response
+      };
+    }
+    return response;
   }
   async html(request, config) {
+    const response = await this.htmlResult(request, config);
+    return response.html;
+  }
+  async htmlResult(request, config) {
     this.validateUrl(request.url);
     const response = await this.requestWithRetry("/html", {
       method: "POST",
       body: JSON.stringify(request),
       ...config
     });
-
+    if (typeof response === "string") {
+      return { html: response, url: request.url };
+    }
+    return response;
+  }
+  async screenshot(request, config) {
+    const response = await this.screenshotResult(request, config);
+    return response.screenshot;
+  }
+  async screenshotResult(request, config) {
+    this.validateUrl(request.url);
+    const response = await this.requestWithRetry("/screenshot", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+    return response;
+  }
+  async pdf(request, config) {
+    const response = await this.pdfResult(request, config);
+    return response.pdf;
+  }
+  async pdfResult(request, config) {
+    this.validateUrl(request.url);
+    const response = await this.requestWithRetry("/pdf", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+    return response;
   }
   async executeJs(request, config) {
     this.validateUrl(request.url);
@@ -419,12 +616,38 @@ class Crawl4AI {
       ...config
     });
   }
+  async mcpSchema(config) {
+    return this.request("/mcp/schema", {
+      method: "GET",
+      ...config
+    });
+  }
+  async mcpSse(config) {
+    return this.requestRaw("/mcp/sse", {
+      method: "GET",
+      ...config
+    });
+  }
   async getRoot(config) {
     return this.request("/", {
       method: "GET",
       ...config
     });
   }
+  async token(request, config) {
+    return this.request("/token", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+  }
+  async configDump(request, config) {
+    return this.request("/config/dump", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+  }
   async testConnection(options) {
     try {
       await this.health({ timeout: HEALTH_CHECK_TIMEOUT });
````
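The `crawlStream` implementation above frames NDJSON by buffering decoded text and holding back the trailing partial line between reads, while `requestRawWithRetry` reuses the existing backoff policy (`retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt`, with `Retry-After` honored on rate limits). The framing technique in isolation, as a standalone sketch:

```typescript
// Standalone NDJSON framing sketch mirroring crawlStream's approach:
// accumulate decoded text, split on newlines, and keep the last (possibly
// incomplete) line in the buffer until the next chunk arrives.
async function* ndjson<T>(body: ReadableStream<Uint8Array>): AsyncGenerator<T> {
  const reader = body.getReader();
  const decoder = new TextDecoder();
  let buffer = '';
  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop() ?? ''; // may be an incomplete line
    for (const line of lines) {
      if (line.trim()) yield JSON.parse(line) as T;
    }
  }
  if (buffer.trim()) yield JSON.parse(buffer) as T; // flush the final line
}
```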
package/dist/sdk.d.ts
CHANGED
````diff
@@ -2,7 +2,7 @@
  * Crawl4AI TypeScript SDK
  * A comprehensive SDK for interacting with Crawl4AI REST API
  */
-import type { AskRequest, AskResponse, Crawl4AIConfig, CrawlRequest, CrawlResult, ExecuteJsRequest, HealthResponse, HtmlRequest, MarkdownRequest, RequestConfig } from './types';
+import type { AskRequest, AskResponse, ConfigDumpRequest, ConfigDumpResponse, Crawl4AIConfig, CrawlRequest, CrawlResponse, CrawlResult, CrawlStreamChunk, ExecuteJsRequest, HealthResponse, HtmlRequest, HtmlResponse, MarkdownRequest, MarkdownResponse, McpSchemaResponse, PdfRequest, PdfResponse, RequestConfig, ScreenshotRequest, ScreenshotResponse, TokenRequest, TokenResponse } from './types';
 /**
  * Crawl4AI SDK Client - Main class for interacting with Crawl4AI REST API
  *
@@ -59,12 +59,22 @@ export declare class Crawl4AI {
      * Normalize different API response formats to a consistent array
      */
     private normalizeArrayResponse;
+    /**
+     * Normalize crawl response to include metadata when available
+     */
+    private normalizeCrawlResponse;
     /**
      * Build query parameters from an object, filtering out undefined values
      */
     private buildQueryParams;
+    /**
+     * Parse response body based on content type
+     */
+    private parseResponseBody;
     private request;
+    private requestRaw;
     private requestWithRetry;
+    private requestRawWithRetry;
     /**
      * Main crawl endpoint - Extract content from one or more URLs
      *
@@ -86,13 +96,22 @@ export declare class Crawl4AI {
      * @throws {TimeoutError} If request times out
      */
     crawl(request: CrawlRequest, config?: RequestConfig): Promise<CrawlResult[]>;
+    /**
+     * Crawl endpoint returning full metadata payload
+     */
+    crawlWithMetadata(request: CrawlRequest, config?: RequestConfig): Promise<CrawlResponse>;
+    /**
+     * Stream crawl results via NDJSON
+     */
+    crawlStream(request: CrawlRequest, config?: RequestConfig): AsyncIterable<CrawlStreamChunk>;
     /**
      * Get markdown content from URL with optional filtering
      *
      * @param request - Markdown extraction configuration
      * @param request.url - URL to extract markdown from
-     * @param request.
-     * @param request.
+     * @param request.filter - Content filter: 'raw' | 'fit' | 'bm25' | 'llm'
+     * @param request.query - Query for BM25/LLM filtering
+     * @param request.cache - Cache mode (e.g., 'bypass')
      * @param config - Optional request configuration
      * @returns Promise resolving to markdown string
      *
@@ -100,16 +119,40 @@ export declare class Crawl4AI {
      * ```typescript
      * const markdown = await client.markdown({
      *   url: 'https://example.com',
-     *
+     *   filter: 'fit'
     * });
     * ```
     */
    markdown(request: MarkdownRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Get markdown content with response metadata
+     */
+    markdownResult(request: MarkdownRequest, config?: RequestConfig): Promise<MarkdownResponse>;
    /**
     * Get HTML content from URL
     * @param request HTML extraction options
     */
    html(request: HtmlRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Get HTML content with response metadata
+     */
+    htmlResult(request: HtmlRequest, config?: RequestConfig): Promise<HtmlResponse>;
+    /**
+     * Capture screenshot and return base64 or file path
+     */
+    screenshot(request: ScreenshotRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Capture screenshot with response metadata
+     */
+    screenshotResult(request: ScreenshotRequest, config?: RequestConfig): Promise<ScreenshotResponse>;
+    /**
+     * Generate PDF and return base64 or file path
+     */
+    pdf(request: PdfRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Generate PDF with response metadata
+     */
+    pdfResult(request: PdfRequest, config?: RequestConfig): Promise<PdfResponse>;
    /**
     * Execute JavaScript on webpage and return results
     *
@@ -166,10 +209,26 @@ export declare class Crawl4AI {
      * Get API schema
      */
     schema(config?: RequestConfig): Promise<unknown>;
+    /**
+     * Get MCP schema
+     */
+    mcpSchema(config?: RequestConfig): Promise<McpSchemaResponse>;
+    /**
+     * Connect to MCP Server-Sent Events stream
+     */
+    mcpSse(config?: RequestConfig): Promise<Response>;
     /**
      * Get root endpoint information
      */
     getRoot(config?: RequestConfig): Promise<string>;
+    /**
+     * Request an API token
+     */
+    token(request: TokenRequest, config?: RequestConfig): Promise<TokenResponse>;
+    /**
+     * Dump server configuration
+     */
+    configDump(request: ConfigDumpRequest, config?: RequestConfig): Promise<ConfigDumpResponse>;
     /**
      * Test connection to the Crawl4AI API server
      *
````
package/dist/types.d.ts
CHANGED
````diff
@@ -14,7 +14,14 @@ export interface Viewport {
 export interface BrowserConfig {
     headless?: boolean;
     browser_type?: BrowserType;
+    browser_mode?: 'dedicated' | 'builtin' | 'custom' | 'docker';
+    use_managed_browser?: boolean;
+    cdp_url?: string;
+    debugging_port?: number;
+    host?: string;
+    proxy_config?: ProxyConfig;
     user_agent?: string;
+    user_agent_mode?: string;
     proxy?: string;
     page_timeout?: number;
     verbose?: boolean;
@@ -22,8 +29,13 @@ export interface BrowserConfig {
     magic?: boolean;
     override_navigator?: boolean;
     user_data_dir?: string;
-
+    use_persistent_context?: boolean;
+    text_mode?: boolean;
+    light_mode?: boolean;
+    enable_stealth?: boolean;
     viewport?: Viewport;
+    viewport_width?: number;
+    viewport_height?: number;
     headers?: Record<string, string>;
     cookies?: Cookie[];
     extra_args?: string[];
@@ -31,6 +43,7 @@ export interface BrowserConfig {
     java_script_enabled?: boolean;
     accept_downloads?: boolean;
     downloads_path?: string;
+    extra?: Record<string, unknown>;
 }
 export interface Cookie {
     name: string;
@@ -77,6 +90,16 @@ export interface CosineExtractionParams {
     top_k?: number;
     model_name?: string;
 }
+export interface ProxyConfig {
+    server: string;
+    username?: string;
+    password?: string;
+}
+export interface GeolocationConfig {
+    latitude: number;
+    longitude: number;
+    accuracy?: number;
+}
 export type ExtractionStrategy = {
     type: 'json_css';
     params: JsonCssExtractionParams;
@@ -91,20 +114,28 @@ export interface CrawlerRunConfig {
     word_count_threshold?: number;
     extraction_strategy?: ExtractionStrategy;
     chunking_strategy?: ChunkingStrategy;
+    markdown_generator?: Record<string, unknown>;
     css_selector?: string;
     screenshot?: boolean;
     pdf?: boolean;
+    capture_mhtml?: boolean;
     cache_mode?: CacheMode;
     bypass_cache?: boolean;
     disable_cache?: boolean;
     no_cache_read?: boolean;
     no_cache_write?: boolean;
+    capture_network_requests?: boolean;
+    capture_console_messages?: boolean;
     log_console?: boolean;
     stream?: boolean;
     warmup?: boolean;
-    js_code?: string[];
+    js_code?: string | string[];
+    c4a_script?: string | string[];
     js_only?: boolean;
     wait_for?: string;
+    wait_until?: string;
+    scan_full_page?: boolean;
+    scroll_delay?: number;
     page_timeout?: number;
     delay_before_return_html?: number;
     remove_overlay_elements?: boolean;
@@ -116,6 +147,15 @@ export interface CrawlerRunConfig {
     ignore_robots_txt?: boolean;
     anti_bot?: boolean;
     light_mode?: boolean;
+    locale?: string;
+    timezone_id?: string;
+    geolocation?: GeolocationConfig;
+    proxy_config?: ProxyConfig;
+    proxy_rotation_strategy?: string | Record<string, unknown>;
+    url_matcher?: string | string[] | Record<string, unknown>;
+    match_mode?: 'or' | 'and' | 'OR' | 'AND' | string;
+    scraping_strategy?: Record<string, unknown>;
+    verbose?: boolean;
     extra?: Record<string, unknown>;
 }
 export interface ChunkingStrategy {
@@ -133,9 +173,12 @@ export interface CrawlRequest {
 }
 export interface MarkdownRequest {
     url: string;
+    filter?: ContentFilter;
+    query?: string;
+    cache?: string;
     f?: ContentFilter;
-    q?: string;
-    c?: string;
+    q?: string | null;
+    c?: string | null;
 }
 export interface HtmlRequest {
     url: string;
@@ -165,6 +208,46 @@ export interface AskRequest {
     score_ratio?: number;
     max_results?: number;
 }
+export interface MarkdownGenerationResult {
+    raw_markdown: string;
+    markdown_with_citations: string;
+    references_markdown: string;
+    fit_markdown?: string;
+    fit_html?: string;
+}
+export interface CrawlResponse {
+    success?: boolean;
+    results: CrawlResult[];
+    server_processing_time_s?: number;
+    server_memory_delta_mb?: number;
+    server_peak_memory_mb?: number;
+}
+export interface CrawlStreamStatus {
+    status: string;
+    [key: string]: unknown;
+}
+export type CrawlStreamChunk = CrawlResult | CrawlStreamStatus;
+export interface MarkdownResponse {
+    url: string;
+    filter?: string | null;
+    query?: string | null;
+    cache?: string | null;
+    markdown: string;
+    success?: boolean;
+}
+export interface HtmlResponse {
+    html: string;
+    url?: string;
+    success?: boolean;
+}
+export interface ScreenshotResponse {
+    screenshot: string;
+    success?: boolean;
+}
+export interface PdfResponse {
+    pdf: string;
+    success?: boolean;
+}
 export interface CrawlResult {
     url: string;
     html: string;
@@ -177,7 +260,7 @@ export interface CrawlResult {
     screenshot?: string;
     pdf?: string;
     mhtml?: string;
-    markdown?: string;
+    markdown?: string | MarkdownGenerationResult;
     fit_markdown?: string;
     raw_markdown?: string;
     markdown_with_citations?: string;
@@ -197,6 +280,8 @@ export interface CrawlResult {
     crawl_depth?: number;
     text?: string;
     cookies?: Cookie[];
+    tables?: Array<Record<string, unknown>>;
+    server_memory_mb?: number;
 }
 export interface MediaInfo {
     images: MediaItem[];
@@ -207,8 +292,10 @@ export interface MediaItem {
     src: string;
     alt?: string;
     desc?: string;
+    description?: string;
     score?: number;
     type?: string;
+    mime_type?: string;
 }
 export interface LinksInfo {
     internal: LinkItem[];
@@ -218,6 +305,9 @@ export interface LinkItem {
     href: string;
     text?: string;
     title?: string;
+    base_domain?: string;
+    relevance_score?: number;
+    type?: string;
 }
 export interface PageMetadata {
     title?: string;
@@ -246,8 +336,12 @@ export interface HealthResponse {
     timestamp: number;
     version: string;
 }
+export type ConfigDumpResponse = Record<string, unknown> | string;
+export type McpSchemaResponse = Record<string, unknown>;
 export interface TokenResponse {
-
+    email: string;
+    access_token: string;
+    token_type: string;
 }
 export interface AskResponse {
     context: string;
````
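The widened `BrowserConfig` and `CrawlerRunConfig` shapes make richer per-request setups expressible. A sketch using only fields declared above (assuming these types are re-exported from the package root, as the SDK class is; whether the server honors each field depends on the Crawl4AI version you run):

```typescript
import type { BrowserConfig, CrawlerRunConfig } from 'crawl4ai';

const browserConfig: BrowserConfig = {
  browser_mode: 'dedicated',
  enable_stealth: true,
  viewport_width: 1280,
  viewport_height: 800,
  proxy_config: { server: 'http://proxy.example.com:8080' }, // hypothetical proxy
};

const crawlerConfig: CrawlerRunConfig = {
  locale: 'en-US',
  timezone_id: 'America/New_York',
  geolocation: { latitude: 40.7128, longitude: -74.006 },
  js_code: 'window.scrollTo(0, document.body.scrollHeight);', // plain string now allowed
  scan_full_page: true,
  capture_network_requests: true,
};
```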
package/package.json
CHANGED
````diff
@@ -1,6 +1,6 @@
 {
   "name": "crawl4ai",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "description": "TypeScript SDK for Crawl4AI REST API - Bun & Node.js compatible",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -9,6 +9,7 @@
     "build": "bun build ./src/index.ts --outdir ./dist --target node",
     "build:types": "tsc --declaration --emitDeclarationOnly --outDir dist",
     "test": "bun test",
+    "smoke": "bun run smoke-tests/run.ts",
     "lint": "biome check src --assist-enabled=true",
     "fix": "biome check --write src --assist-enabled=true",
     "format": "biome format --write src",
````