crawl4ai 1.0.1 → 1.1.0
This diff compares the publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- package/README.md +64 -3
- package/dist/index.js +233 -16
- package/dist/sdk.d.ts +59 -1
- package/dist/types.d.ts +98 -4
- package/package.json +2 -1
package/README.md
CHANGED
@@ -10,8 +10,10 @@ A type-safe TypeScript SDK for the Crawl4AI REST API. Built for modern JavaScrip
 - **Comprehensive Coverage** - All Crawl4AI endpoints including specialized features
 - **Smart Error Handling** - Custom error classes with retry logic and timeouts
 - **Batch Processing** - Efficiently crawl multiple URLs in a single request
+- **Streaming Results** - NDJSON streaming for long-running crawls
 - **Input Validation** - Built-in URL validation and parameter checking
 - **Debug Mode** - Optional request/response logging for development
+- **MCP Endpoints** - Model Context Protocol schema + SSE support
 - **Zero Dependencies** - Uses only native fetch API
 
 ## 📦 Installation
@@ -37,6 +39,8 @@ yarn add crawl4ai
 - **Official Project**: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
 - **Official Documentation**: [https://docs.crawl4ai.com/](https://docs.crawl4ai.com/)
 
+**Compatibility note:** SDK updated against Crawl4AI **v0.8.0** (January 18, 2026).
+
 
 ## 🏗️ Prerequisites
 
@@ -90,7 +94,9 @@ const results = await client.crawl({
 
 const result = results[0]; // API returns array of results
 console.log('Title:', result.metadata?.title);
-
+const markdown =
+  typeof result.markdown === 'string' ? result.markdown : result.markdown?.raw_markdown;
+console.log('Content:', markdown?.slice(0, 200));
 ```
 
 ### Configuration Options
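The quick-start change above follows from the new `CrawlResult.markdown` union (`string | MarkdownGenerationResult`; see `types.d.ts` below). A small helper can centralize the narrowing. A minimal sketch, assuming the types are re-exported from the package root the way `Crawl4AIError` is:

```typescript
import type { CrawlResult } from 'crawl4ai';

// Narrow the string | MarkdownGenerationResult union, preferring the
// filtered variant when the server produced one.
function extractMarkdown(result: CrawlResult): string | undefined {
  const md = result.markdown;
  if (typeof md === 'string') return md;
  return md?.fit_markdown ?? md?.raw_markdown;
}
```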
@@ -135,6 +141,30 @@ const results = await client.crawl({
 });
 ```
 
+Need server metadata (processing time, memory stats, etc.)? Use `crawlWithMetadata`:
+
+```typescript
+const response = await client.crawlWithMetadata({
+  urls: ['https://example.com']
+});
+
+console.log(response.success);
+console.log(response.server_processing_time_s);
+```
+
+#### `crawlStream(request)` - Stream Crawl Results (NDJSON)
+Stream results as they arrive:
+
+```typescript
+for await (const chunk of client.crawlStream({ urls: ['https://example.com'] })) {
+  if ('status' in chunk) {
+    console.log('Stream status:', chunk.status);
+  } else {
+    console.log('Crawled:', chunk.url);
+  }
+}
+```
+
 
 ### Content Generation
 
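`crawlStream` yields an untagged union of page results and status frames, so consumers filter on the discriminating `status` key. A minimal sketch, assuming `client` is constructed as in the Quick Start and that `CrawlResult`/`CrawlStreamChunk` are importable from the package root:

```typescript
import type { CrawlResult, CrawlStreamChunk } from 'crawl4ai';

// CrawlResult carries no `status` property, so its presence marks a status frame.
function isResult(chunk: CrawlStreamChunk): chunk is CrawlResult {
  return !('status' in chunk);
}

const pages: CrawlResult[] = [];
for await (const chunk of client.crawlStream({ urls: ['https://example.com'] })) {
  if (isResult(chunk)) pages.push(chunk);
}
console.log(`streamed ${pages.length} result(s)`);
```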
@@ -149,6 +179,16 @@ const markdown = await client.markdown({
 });
 ```
 
+Want the full response metadata? Use `markdownResult`:
+
+```typescript
+const markdownResult = await client.markdownResult({
+  url: 'https://example.com',
+  filter: 'fit'
+});
+console.log(markdownResult.success, markdownResult.cache);
+```
+
 #### `html(request)` - Get Processed HTML
 Get sanitized HTML for schema extraction:
 
@@ -158,6 +198,13 @@ const html = await client.html({
 });
 ```
 
+Full response metadata via `htmlResult`:
+
+```typescript
+const htmlResult = await client.htmlResult({ url: 'https://example.com' });
+console.log(htmlResult.success);
+```
+
 #### `screenshot(request)` - Capture Screenshot
 Capture full-page screenshots:
 
@@ -169,6 +216,8 @@ const screenshotBase64 = await client.screenshot({
 });
 ```
 
+For response metadata use `screenshotResult()`.
+
 #### `pdf(request)` - Generate PDF
 Generate PDF documents:
 
@@ -179,6 +228,8 @@ const pdfData = await client.pdf({
 });
 ```
 
+For response metadata use `pdfResult()`.
+
 ### JavaScript Execution
 
 #### `executeJs(request)` - Run JavaScript
@@ -239,6 +290,16 @@ const version = await client.version({ throwOnError: true });
 // Get Prometheus metrics
 const metrics = await client.metrics();
 
+// Request an API token
+const token = await client.token({ email: 'user@example.com' });
+
+// Dump server configuration (advanced)
+const configDump = await client.configDump({ code: 'CrawlerRunConfig()' });
+
+// MCP schema + SSE (advanced integrations)
+const mcpSchema = await client.mcpSchema();
+const mcpStream = await client.mcpSse();
+
 // Update configuration
 client.setApiToken('new_token');
 client.setBaseUrl('https://new-url.com');
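`mcpSse()` returns the raw fetch `Response` (see `dist/sdk.d.ts` below), leaving SSE framing to the caller. A minimal reader sketch; the `data:` line format is the standard Server-Sent Events convention, not something this SDK parses for you:

```typescript
const sse = await client.mcpSse();
if (!sse.body) throw new Error('SSE stream has no body');

const reader = sse.body.getReader();
const decoder = new TextDecoder();
let buffer = '';
while (true) {
  const { value, done } = await reader.read();
  if (done) break;
  buffer += decoder.decode(value, { stream: true });
  const lines = buffer.split('\n');
  buffer = lines.pop() ?? '';
  for (const line of lines) {
    if (line.startsWith('data: ')) console.log('MCP event:', line.slice(6));
  }
}
```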
@@ -345,7 +406,7 @@ const results = await client.crawl({
 The SDK provides custom error handling with detailed information:
 
 ```typescript
-import { Crawl4AIError } from 'crawl4ai
+import { Crawl4AIError } from 'crawl4ai';
 
 try {
   const results = await client.crawl({ urls: 'https://example.com' });
@@ -426,4 +487,4 @@ This SDK is released under the MIT License.
 
 ## 🙏 Acknowledgments
 
-Built for the amazing [Crawl4AI](https://github.com/unclecode/crawl4ai) project by [@unclecode](https://github.com/unclecode) and the Crawl4AI community.
+Built for the amazing [Crawl4AI](https://github.com/unclecode/crawl4ai) project by [@unclecode](https://github.com/unclecode) and the Crawl4AI community.
package/dist/index.js
CHANGED
@@ -233,6 +233,21 @@ class Crawl4AI {
     }
     return [response];
   }
+  normalizeCrawlResponse(response) {
+    if (Array.isArray(response)) {
+      return { results: response };
+    }
+    if (typeof response === "object" && response !== null) {
+      const responseObject = response;
+      if (Array.isArray(responseObject.results)) {
+        return responseObject;
+      }
+      if (Array.isArray(responseObject.result)) {
+        return { results: responseObject.result };
+      }
+    }
+    return { results: [response] };
+  }
   buildQueryParams(params) {
     const searchParams = new URLSearchParams;
     for (const [key, value] of Object.entries(params)) {
@@ -242,6 +257,19 @@ class Crawl4AI {
     }
     return searchParams.toString();
   }
+  async parseResponseBody(response) {
+    const contentType = response.headers.get("content-type") || "";
+    if (contentType.includes("application/json")) {
+      return response.json();
+    }
+    if (contentType.includes("text/html") || contentType.includes("text/plain")) {
+      return response.text();
+    }
+    if (contentType.includes("text/event-stream") || contentType.includes("application/x-ndjson")) {
+      return response;
+    }
+    return response.text();
+  }
   async request(endpoint, options = {}) {
     const url = `${this.config.baseUrl}${endpoint}`;
     const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
@@ -260,17 +288,7 @@ class Crawl4AI {
       signal: requestSignal
     });
     clearTimeout(timeoutId);
-    const contentType = response.headers.get("content-type") || "";
-    let responseData;
-    if (contentType.includes("application/json")) {
-      responseData = await response.json();
-    } else if (contentType.includes("text/html") || contentType.includes("text/plain")) {
-      responseData = await response.text();
-    } else if (contentType.includes("text/event-stream")) {
-      return response;
-    } else {
-      responseData = await response.text();
-    }
+    const responseData = await this.parseResponseBody(response);
     this.log(`Response: ${response.status}`, responseData);
     if (!this.config.validateStatus(response.status)) {
       const headers2 = {};
@@ -300,6 +318,53 @@ class Crawl4AI {
       throw error;
     }
   }
+  async requestRaw(endpoint, options = {}) {
+    const url = `${this.config.baseUrl}${endpoint}`;
+    const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
+    this.log(`Request: ${fetchOptions.method || "GET"} ${url}`, fetchOptions.body);
+    const requestHeaders = {
+      ...this.config.defaultHeaders,
+      ...headers
+    };
+    const controller = new AbortController;
+    const timeoutId = setTimeout(() => controller.abort(), timeout);
+    const requestSignal = signal || controller.signal;
+    try {
+      const response = await fetch(url, {
+        ...fetchOptions,
+        headers: requestHeaders,
+        signal: requestSignal
+      });
+      clearTimeout(timeoutId);
+      if (!this.config.validateStatus(response.status)) {
+        const responseData = await this.parseResponseBody(response);
+        const headers2 = {};
+        response.headers.forEach((value, key) => {
+          headers2[key] = value;
+        });
+        const error = createHttpError(response.status, response.statusText, undefined, responseData, headers2);
+        error.request = {
+          url,
+          method: fetchOptions.method || "GET",
+          headers: requestHeaders,
+          body: fetchOptions.body
+        };
+        if (this.config.throwOnError) {
+          throw error;
+        }
+      }
+      return response;
+    } catch (error) {
+      clearTimeout(timeoutId);
+      if (error instanceof Error && error.name === "AbortError") {
+        throw new TimeoutError(timeout, url);
+      }
+      if (error instanceof TypeError && error.message.includes("fetch")) {
+        throw new NetworkError(`Network request failed: ${error.message}`, error);
+      }
+      throw error;
+    }
+  }
   async requestWithRetry(endpoint, options = {}) {
     let lastError = new Error("No attempts made");
     for (let attempt = 0;attempt <= this.config.retries; attempt++) {
@@ -324,6 +389,30 @@ class Crawl4AI {
     }
     throw lastError;
   }
+  async requestRawWithRetry(endpoint, options = {}) {
+    let lastError = new Error("No attempts made");
+    for (let attempt = 0;attempt <= this.config.retries; attempt++) {
+      try {
+        return await this.requestRaw(endpoint, options);
+      } catch (error) {
+        lastError = error;
+        if (error instanceof Crawl4AIError && error.status && error.status >= CLIENT_ERROR_MIN && error.status < CLIENT_ERROR_MAX && error.status !== RATE_LIMIT_STATUS) {
+          throw error;
+        }
+        if (attempt < this.config.retries) {
+          let delay = this.config.retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt;
+          if (error instanceof RateLimitError && error.retryAfter) {
+            delay = error.retryAfter * 1000;
+            this.log(`Rate limited. Waiting ${error.retryAfter}s before retry (attempt ${attempt + 1}/${this.config.retries})`);
+          } else {
+            this.log(`Retry attempt ${attempt + 1}/${this.config.retries} after ${delay}ms`);
+          }
+          await new Promise((resolve) => setTimeout(resolve, delay));
+        }
+      }
+    }
+    throw lastError;
+  }
   async crawl(request, config) {
     const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
     for (const url of urls) {
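The raw retry path mirrors `requestWithRetry`: non-429 4xx errors short-circuit, otherwise the delay grows geometrically as `retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt`, unless a `RateLimitError` carries a `retryAfter` that takes precedence. Assuming illustrative values of `retryDelay = 1000` ms and a multiplier of 2 (the constants' values are not shown in this diff), the schedule works out as:

```typescript
// Hypothetical values; the dist file only shows the constant names.
const retryDelay = 1000;
const RETRY_BACKOFF_MULTIPLIER = 2;
for (let attempt = 0; attempt < 3; attempt++) {
  console.log(`retry ${attempt + 1} after ${retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt} ms`);
}
// retry 1 after 1000 ms, retry 2 after 2000 ms, retry 3 after 4000 ms
// A RateLimitError with retryAfter = 30 would wait 30000 ms instead.
```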
@@ -340,29 +429,131 @@ class Crawl4AI {
     });
     return this.normalizeArrayResponse(response);
   }
+  async crawlWithMetadata(request, config) {
+    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
+    for (const url of urls) {
+      this.validateUrl(url);
+    }
+    const normalizedRequest = {
+      ...request,
+      urls
+    };
+    const response = await this.requestWithRetry("/crawl", {
+      method: "POST",
+      body: JSON.stringify(normalizedRequest),
+      ...config
+    });
+    return this.normalizeCrawlResponse(response);
+  }
+  async* crawlStream(request, config) {
+    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
+    for (const url of urls) {
+      this.validateUrl(url);
+    }
+    const normalizedRequest = {
+      ...request,
+      urls
+    };
+    const response = await this.requestRawWithRetry("/crawl/stream", {
+      method: "POST",
+      body: JSON.stringify(normalizedRequest),
+      ...config
+    });
+    if (!response.body) {
+      throw new NetworkError("Streaming response body is not available");
+    }
+    const reader = response.body.getReader();
+    const decoder = new TextDecoder;
+    let buffer = "";
+    while (true) {
+      const { value, done } = await reader.read();
+      if (done) {
+        break;
+      }
+      buffer += decoder.decode(value, { stream: true });
+      const lines = buffer.split(`
+`);
+      buffer = lines.pop() || "";
+      for (const line of lines) {
+        const trimmed = line.trim();
+        if (!trimmed) {
+          continue;
+        }
+        const parsed = JSON.parse(trimmed);
+        yield parsed;
+      }
+    }
+    const remaining = buffer.trim();
+    if (remaining) {
+      yield JSON.parse(remaining);
+    }
+  }
   async markdown(request, config) {
+    const response = await this.markdownResult(request, config);
+    return response.markdown;
+  }
+  async markdownResult(request, config) {
     this.validateUrl(request.url);
     const apiRequest = {
       url: request.url,
-      ...request.filter !== undefined ? { f: request.filter } : {},
-      ...request.query !== undefined ? { q: request.query } : {},
-      ...request.cache !== undefined ? { c: request.cache } : {}
+      ...request.f !== undefined || request.filter !== undefined ? { f: request.f ?? request.filter } : {},
+      ...request.q !== undefined || request.query !== undefined ? { q: request.q ?? request.query } : {},
+      ...request.c !== undefined || request.cache !== undefined ? { c: request.c ?? request.cache } : {}
     };
     const response = await this.requestWithRetry("/md", {
       method: "POST",
       body: JSON.stringify(apiRequest),
       ...config
     });
-
+    if (typeof response === "string") {
+      return {
+        url: request.url,
+        markdown: response
+      };
+    }
+    return response;
   }
   async html(request, config) {
+    const response = await this.htmlResult(request, config);
+    return response.html;
+  }
+  async htmlResult(request, config) {
     this.validateUrl(request.url);
     const response = await this.requestWithRetry("/html", {
       method: "POST",
       body: JSON.stringify(request),
       ...config
     });
-
+    if (typeof response === "string") {
+      return { html: response, url: request.url };
+    }
+    return response;
+  }
+  async screenshot(request, config) {
+    const response = await this.screenshotResult(request, config);
+    return response.screenshot;
+  }
+  async screenshotResult(request, config) {
+    this.validateUrl(request.url);
+    const response = await this.requestWithRetry("/screenshot", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+    return response;
+  }
+  async pdf(request, config) {
+    const response = await this.pdfResult(request, config);
+    return response.pdf;
+  }
+  async pdfResult(request, config) {
+    this.validateUrl(request.url);
+    const response = await this.requestWithRetry("/pdf", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+    return response;
   }
   async executeJs(request, config) {
     this.validateUrl(request.url);
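`normalizeCrawlResponse` folds the three response shapes the server has shipped over time into one `{ results }` envelope. A sketch of the accepted inputs (plain data, no network involved):

```typescript
// All three shapes normalize to { results: [...] }:
const bareArray = [{ url: 'https://example.com', html: '<html/>' }];
const modernEnvelope = { success: true, results: bareArray, server_processing_time_s: 0.42 };
const legacyEnvelope = { result: bareArray }; // singular `result` key
// Anything else is wrapped as { results: [response] } as a last resort.
```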
@@ -425,12 +616,38 @@ class Crawl4AI {
     ...config
     });
   }
+  async mcpSchema(config) {
+    return this.request("/mcp/schema", {
+      method: "GET",
+      ...config
+    });
+  }
+  async mcpSse(config) {
+    return this.requestRaw("/mcp/sse", {
+      method: "GET",
+      ...config
+    });
+  }
   async getRoot(config) {
     return this.request("/", {
       method: "GET",
       ...config
     });
   }
+  async token(request, config) {
+    return this.request("/token", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+  }
+  async configDump(request, config) {
+    return this.request("/config/dump", {
+      method: "POST",
+      body: JSON.stringify(request),
+      ...config
+    });
+  }
   async testConnection(options) {
     try {
       await this.health({ timeout: HEALTH_CHECK_TIMEOUT });
package/dist/sdk.d.ts
CHANGED
@@ -2,7 +2,7 @@
  * Crawl4AI TypeScript SDK
  * A comprehensive SDK for interacting with Crawl4AI REST API
  */
-import type { AskRequest, AskResponse, Crawl4AIConfig, CrawlRequest, CrawlResult, ExecuteJsRequest, HealthResponse, HtmlRequest, MarkdownRequest, RequestConfig } from './types';
+import type { AskRequest, AskResponse, ConfigDumpRequest, ConfigDumpResponse, Crawl4AIConfig, CrawlRequest, CrawlResponse, CrawlResult, CrawlStreamChunk, ExecuteJsRequest, HealthResponse, HtmlRequest, HtmlResponse, MarkdownRequest, MarkdownResponse, McpSchemaResponse, PdfRequest, PdfResponse, RequestConfig, ScreenshotRequest, ScreenshotResponse, TokenRequest, TokenResponse } from './types';
 /**
  * Crawl4AI SDK Client - Main class for interacting with Crawl4AI REST API
  *
@@ -59,12 +59,22 @@ export declare class Crawl4AI {
      * Normalize different API response formats to a consistent array
      */
     private normalizeArrayResponse;
+    /**
+     * Normalize crawl response to include metadata when available
+     */
+    private normalizeCrawlResponse;
     /**
      * Build query parameters from an object, filtering out undefined values
      */
     private buildQueryParams;
+    /**
+     * Parse response body based on content type
+     */
+    private parseResponseBody;
     private request;
+    private requestRaw;
     private requestWithRetry;
+    private requestRawWithRetry;
     /**
      * Main crawl endpoint - Extract content from one or more URLs
      *
@@ -86,6 +96,14 @@ export declare class Crawl4AI {
      * @throws {TimeoutError} If request times out
      */
     crawl(request: CrawlRequest, config?: RequestConfig): Promise<CrawlResult[]>;
+    /**
+     * Crawl endpoint returning full metadata payload
+     */
+    crawlWithMetadata(request: CrawlRequest, config?: RequestConfig): Promise<CrawlResponse>;
+    /**
+     * Stream crawl results via NDJSON
+     */
+    crawlStream(request: CrawlRequest, config?: RequestConfig): AsyncIterable<CrawlStreamChunk>;
     /**
      * Get markdown content from URL with optional filtering
      *
@@ -106,11 +124,35 @@ export declare class Crawl4AI {
      * ```
      */
     markdown(request: MarkdownRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Get markdown content with response metadata
+     */
+    markdownResult(request: MarkdownRequest, config?: RequestConfig): Promise<MarkdownResponse>;
     /**
      * Get HTML content from URL
      * @param request HTML extraction options
      */
     html(request: HtmlRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Get HTML content with response metadata
+     */
+    htmlResult(request: HtmlRequest, config?: RequestConfig): Promise<HtmlResponse>;
+    /**
+     * Capture screenshot and return base64 or file path
+     */
+    screenshot(request: ScreenshotRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Capture screenshot with response metadata
+     */
+    screenshotResult(request: ScreenshotRequest, config?: RequestConfig): Promise<ScreenshotResponse>;
+    /**
+     * Generate PDF and return base64 or file path
+     */
+    pdf(request: PdfRequest, config?: RequestConfig): Promise<string>;
+    /**
+     * Generate PDF with response metadata
+     */
+    pdfResult(request: PdfRequest, config?: RequestConfig): Promise<PdfResponse>;
     /**
      * Execute JavaScript on webpage and return results
      *
@@ -167,10 +209,26 @@ export declare class Crawl4AI {
      * Get API schema
      */
     schema(config?: RequestConfig): Promise<unknown>;
+    /**
+     * Get MCP schema
+     */
+    mcpSchema(config?: RequestConfig): Promise<McpSchemaResponse>;
+    /**
+     * Connect to MCP Server-Sent Events stream
+     */
+    mcpSse(config?: RequestConfig): Promise<Response>;
     /**
      * Get root endpoint information
      */
     getRoot(config?: RequestConfig): Promise<string>;
+    /**
+     * Request an API token
+     */
+    token(request: TokenRequest, config?: RequestConfig): Promise<TokenResponse>;
+    /**
+     * Dump server configuration
+     */
+    configDump(request: ConfigDumpRequest, config?: RequestConfig): Promise<ConfigDumpResponse>;
     /**
      * Test connection to the Crawl4AI API server
      *
package/dist/types.d.ts
CHANGED
@@ -14,7 +14,14 @@ export interface Viewport {
 export interface BrowserConfig {
     headless?: boolean;
     browser_type?: BrowserType;
+    browser_mode?: 'dedicated' | 'builtin' | 'custom' | 'docker';
+    use_managed_browser?: boolean;
+    cdp_url?: string;
+    debugging_port?: number;
+    host?: string;
+    proxy_config?: ProxyConfig;
     user_agent?: string;
+    user_agent_mode?: string;
     proxy?: string;
     page_timeout?: number;
     verbose?: boolean;
@@ -22,8 +29,13 @@ export interface BrowserConfig {
     magic?: boolean;
     override_navigator?: boolean;
     user_data_dir?: string;
-
+    use_persistent_context?: boolean;
+    text_mode?: boolean;
+    light_mode?: boolean;
+    enable_stealth?: boolean;
     viewport?: Viewport;
+    viewport_width?: number;
+    viewport_height?: number;
     headers?: Record<string, string>;
     cookies?: Cookie[];
     extra_args?: string[];
@@ -31,6 +43,7 @@ export interface BrowserConfig {
     java_script_enabled?: boolean;
     accept_downloads?: boolean;
     downloads_path?: string;
+    extra?: Record<string, unknown>;
 }
 export interface Cookie {
     name: string;
@@ -77,6 +90,16 @@ export interface CosineExtractionParams {
     top_k?: number;
     model_name?: string;
 }
+export interface ProxyConfig {
+    server: string;
+    username?: string;
+    password?: string;
+}
+export interface GeolocationConfig {
+    latitude: number;
+    longitude: number;
+    accuracy?: number;
+}
 export type ExtractionStrategy = {
     type: 'json_css';
     params: JsonCssExtractionParams;
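`ProxyConfig` and `GeolocationConfig` pair with the new `BrowserConfig` and `CrawlerRunConfig` fields below. A hedged sketch of the config objects; the field names come straight from these types, but whether a given Crawl4AI server honors each one depends on the server version:

```typescript
import type { BrowserConfig, CrawlerRunConfig } from 'crawl4ai';

const browserConfig: BrowserConfig = {
  headless: true,
  enable_stealth: true,
  proxy_config: { server: 'http://proxy.example.com:8080', username: 'user', password: 'secret' },
};

const crawlerConfig: CrawlerRunConfig = {
  locale: 'de-DE',
  timezone_id: 'Europe/Berlin',
  geolocation: { latitude: 52.52, longitude: 13.405 },
};
```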
@@ -91,20 +114,28 @@ export interface CrawlerRunConfig {
     word_count_threshold?: number;
     extraction_strategy?: ExtractionStrategy;
     chunking_strategy?: ChunkingStrategy;
+    markdown_generator?: Record<string, unknown>;
     css_selector?: string;
     screenshot?: boolean;
     pdf?: boolean;
+    capture_mhtml?: boolean;
     cache_mode?: CacheMode;
     bypass_cache?: boolean;
     disable_cache?: boolean;
     no_cache_read?: boolean;
     no_cache_write?: boolean;
+    capture_network_requests?: boolean;
+    capture_console_messages?: boolean;
     log_console?: boolean;
     stream?: boolean;
     warmup?: boolean;
-    js_code?: string[];
+    js_code?: string | string[];
+    c4a_script?: string | string[];
     js_only?: boolean;
     wait_for?: string;
+    wait_until?: string;
+    scan_full_page?: boolean;
+    scroll_delay?: number;
     page_timeout?: number;
     delay_before_return_html?: number;
     remove_overlay_elements?: boolean;
@@ -116,6 +147,15 @@ export interface CrawlerRunConfig {
     ignore_robots_txt?: boolean;
     anti_bot?: boolean;
     light_mode?: boolean;
+    locale?: string;
+    timezone_id?: string;
+    geolocation?: GeolocationConfig;
+    proxy_config?: ProxyConfig;
+    proxy_rotation_strategy?: string | Record<string, unknown>;
+    url_matcher?: string | string[] | Record<string, unknown>;
+    match_mode?: 'or' | 'and' | 'OR' | 'AND' | string;
+    scraping_strategy?: Record<string, unknown>;
+    verbose?: boolean;
     extra?: Record<string, unknown>;
 }
 export interface ChunkingStrategy {
@@ -136,6 +176,9 @@ export interface MarkdownRequest {
     filter?: ContentFilter;
     query?: string;
     cache?: string;
+    f?: ContentFilter;
+    q?: string | null;
+    c?: string | null;
 }
 export interface HtmlRequest {
     url: string;
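`MarkdownRequest` now accepts the `/md` endpoint's short parameter names alongside the long ones; per the client change above, the SDK resolves `f ?? filter`, `q ?? query`, and `c ?? cache` before posting, so these two calls produce the same wire request:

```typescript
// Long names, as in earlier releases:
await client.markdown({ url: 'https://example.com', filter: 'fit', query: 'pricing' });
// Short names matching the raw API; they win if both spellings are given:
await client.markdown({ url: 'https://example.com', f: 'fit', q: 'pricing' });
```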
@@ -165,6 +208,46 @@ export interface AskRequest {
     score_ratio?: number;
     max_results?: number;
 }
+export interface MarkdownGenerationResult {
+    raw_markdown: string;
+    markdown_with_citations: string;
+    references_markdown: string;
+    fit_markdown?: string;
+    fit_html?: string;
+}
+export interface CrawlResponse {
+    success?: boolean;
+    results: CrawlResult[];
+    server_processing_time_s?: number;
+    server_memory_delta_mb?: number;
+    server_peak_memory_mb?: number;
+}
+export interface CrawlStreamStatus {
+    status: string;
+    [key: string]: unknown;
+}
+export type CrawlStreamChunk = CrawlResult | CrawlStreamStatus;
+export interface MarkdownResponse {
+    url: string;
+    filter?: string | null;
+    query?: string | null;
+    cache?: string | null;
+    markdown: string;
+    success?: boolean;
+}
+export interface HtmlResponse {
+    html: string;
+    url?: string;
+    success?: boolean;
+}
+export interface ScreenshotResponse {
+    screenshot: string;
+    success?: boolean;
+}
+export interface PdfResponse {
+    pdf: string;
+    success?: boolean;
+}
 export interface CrawlResult {
     url: string;
     html: string;
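`CrawlResponse` surfaces the server's own accounting next to the result list, with every metadata field optional. A small consumer sketch that treats them as best-effort:

```typescript
import type { CrawlResponse } from 'crawl4ai';

function summarize(response: CrawlResponse): string {
  const pages = response.results.length;
  const secs = response.server_processing_time_s?.toFixed(2) ?? 'n/a';
  const peak = response.server_peak_memory_mb ?? 'n/a';
  return `${pages} page(s), ${secs}s server time, peak ${peak} MB`;
}
```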
@@ -177,7 +260,7 @@ export interface CrawlResult {
     screenshot?: string;
     pdf?: string;
     mhtml?: string;
-    markdown?: string;
+    markdown?: string | MarkdownGenerationResult;
     fit_markdown?: string;
     raw_markdown?: string;
     markdown_with_citations?: string;
@@ -197,6 +280,8 @@ export interface CrawlResult {
     crawl_depth?: number;
     text?: string;
     cookies?: Cookie[];
+    tables?: Array<Record<string, unknown>>;
+    server_memory_mb?: number;
 }
 export interface MediaInfo {
     images: MediaItem[];
@@ -207,8 +292,10 @@ export interface MediaItem {
     src: string;
     alt?: string;
     desc?: string;
+    description?: string;
     score?: number;
     type?: string;
+    mime_type?: string;
 }
 export interface LinksInfo {
     internal: LinkItem[];
@@ -218,6 +305,9 @@ export interface LinkItem {
     href: string;
     text?: string;
     title?: string;
+    base_domain?: string;
+    relevance_score?: number;
+    type?: string;
 }
 export interface PageMetadata {
     title?: string;
@@ -246,8 +336,12 @@ export interface HealthResponse {
     timestamp: number;
     version: string;
 }
+export type ConfigDumpResponse = Record<string, unknown> | string;
+export type McpSchemaResponse = Record<string, unknown>;
 export interface TokenResponse {
-
+    email: string;
+    access_token: string;
+    token_type: string;
 }
 export interface AskResponse {
     context: string;
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "crawl4ai",
-  "version": "1.0.1",
+  "version": "1.1.0",
   "description": "TypeScript SDK for Crawl4AI REST API - Bun & Node.js compatible",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -9,6 +9,7 @@
     "build": "bun build ./src/index.ts --outdir ./dist --target node",
     "build:types": "tsc --declaration --emitDeclarationOnly --outDir dist",
     "test": "bun test",
+    "smoke": "bun run smoke-tests/run.ts",
     "lint": "biome check src --assist-enabled=true",
     "fix": "biome check --write src --assist-enabled=true",
     "format": "biome format --write src",