crawl4ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Crawl4AI Community
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,429 @@
+ # Crawl4AI TypeScript SDK
+
+ A type-safe TypeScript SDK for the Crawl4AI REST API. Built for modern JavaScript/TypeScript environments with full Bun and Node.js compatibility.
+
+ ## 🚀 Features
+
+ - **Full TypeScript Support** - Complete type definitions for all API endpoints and responses
+ - **Bun & Node.js Compatible** - Works seamlessly in both runtimes
+ - **Modern Async/Await** - Promise-based API for all operations
+ - **Comprehensive Coverage** - All Crawl4AI endpoints including specialized features
+ - **Smart Error Handling** - Custom error classes with retry logic and timeouts
+ - **Batch Processing** - Efficiently crawl multiple URLs in a single request
+ - **Input Validation** - Built-in URL validation and parameter checking
+ - **Debug Mode** - Optional request/response logging for development
+ - **Zero Dependencies** - Uses only native fetch API
+
+ ## 📦 Installation
+
+ ### Using Bun (Recommended)
+
+ ```bash
+ bun add crawl4ai
+ ```
+
+ ### Using npm/yarn
+
+ ```bash
+ npm install crawl4ai
+ # or
+ yarn add crawl4ai
+ ```
+
+ ## 📚 About Crawl4AI
+
+ > ⚠️ **Unofficial Package**: This is an unofficial TypeScript SDK, originally created for personal use, that provides a type-safe way to interact with Crawl4AI's REST API.
+
+ - **Official Project**: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
+ - **Official Documentation**: [https://docs.crawl4ai.com/](https://docs.crawl4ai.com/)
+
+ ## 🏗️ Prerequisites
+
+ 1. **Crawl4AI Server Running**
+
+    You can use the hosted version or run your own:
+
+    ```bash
+    # Using Docker
+    docker run -p 11235:11235 unclecode/crawl4ai:latest
+
+    # With LLM support
+    docker run -p 11235:11235 \
+      -e OPENAI_API_KEY=your_openai_key \
+      -e ANTHROPIC_API_KEY=your_anthropic_key \
+      unclecode/crawl4ai:latest
+    ```
+
+ 2. **TypeScript** (if using TypeScript)
+
+    ```bash
+    bun add -d typescript
+    ```
+
+ ## 🚀 Quick Start
+
+ ### Basic Usage
+
+ ```typescript
+ import Crawl4AI from 'crawl4ai';
+
+ // Initialize the client
+ const client = new Crawl4AI({
+   baseUrl: 'http://localhost:11235', // your Crawl4AI server (local or hosted)
+   apiToken: 'your_token_here', // Optional
+   timeout: 30000,
+   debug: true // Enable request/response logging
+ });
+
+ // Perform a basic crawl
+ const results = await client.crawl({
+   urls: 'https://example.com',
+   browser_config: {
+     headless: true,
+     viewport: { width: 1920, height: 1080 }
+   },
+   crawler_config: {
+     cache_mode: 'bypass',
+     word_count_threshold: 10
+   }
+ });
+
+ const result = results[0]; // API returns array of results
+ console.log('Title:', result.metadata?.title);
+ console.log('Content:', result.markdown?.slice(0, 200));
+ ```
+
+ ### Configuration Options
+
+ ```typescript
+ const client = new Crawl4AI({
+   baseUrl: 'http://localhost:11235',
+   apiToken: 'optional_api_token',
+   timeout: 60000, // Request timeout in ms
+   retries: 3, // Number of retry attempts
+   retryDelay: 1000, // Delay between retries in ms
+   throwOnError: true, // Throw on HTTP errors
+   debug: false, // Enable debug logging
+   defaultHeaders: { // Additional headers
+     'User-Agent': 'MyApp/1.0'
+   }
+ });
+ ```
+
+ ## 📖 API Reference
+
+ ### Core Methods
+
+ #### `crawl(request)` - Main Crawl Endpoint
+ Crawl one or more URLs with full configuration options:
+
+ ```typescript
+ const results = await client.crawl({
+   urls: ['https://example.com', 'https://example.org'],
+   browser_config: {
+     headless: true,
+     simulate_user: true,
+     magic: true // Anti-detection features
+   },
+   crawler_config: {
+     cache_mode: 'bypass',
+     extraction_strategy: {
+       type: 'json_css',
+       params: { /* CSS extraction config */ }
+     }
+   }
+ });
+ ```
+
+ ### Content Generation
+
+ #### `markdown(request)` - Get Markdown
+ Extract markdown with various filters:
+
+ ```typescript
+ const markdown = await client.markdown({
+   url: 'https://example.com',
+   f: 'fit', // 'raw' | 'fit' | 'bm25' | 'llm'
+   q: 'search query for bm25/llm filters'
+ });
+ ```
+
+ #### `html(request)` - Get Processed HTML
+ Get sanitized HTML for schema extraction:
+
+ ```typescript
+ const html = await client.html({
+   url: 'https://example.com'
+ });
+ ```
+
+ #### `screenshot(request)` - Capture Screenshot
+ Capture full-page screenshots:
+
+ ```typescript
+ const screenshotBase64 = await client.screenshot({
+   url: 'https://example.com',
+   screenshot_wait_for: 2, // Wait 2 seconds before capture
+   output_path: '/path/to/save.png' // Optional: save to file
+ });
+ ```
+
+ #### `pdf(request)` - Generate PDF
+ Generate PDF documents:
+
+ ```typescript
+ const pdfData = await client.pdf({
+   url: 'https://example.com',
+   output_path: '/path/to/save.pdf' // Optional: save to file
+ });
+ ```
+
+ ### JavaScript Execution
+
+ #### `executeJs(request)` - Run JavaScript
+ Execute JavaScript on the page and get full crawl results:
+
+ ```typescript
+ const result = await client.executeJs({
+   url: 'https://example.com',
+   scripts: [
+     'return document.title;',
+     'return document.querySelectorAll("a").length;',
+     'window.scrollTo(0, document.body.scrollHeight);'
+   ]
+ });
+
+ console.log('JS Results:', result.js_execution_result);
+ ```
+
+ ### AI/LLM Features
+
+ #### `ask(params)` - Get Library Context
+ Get Crawl4AI documentation for AI assistants:
+
+ ```typescript
+ const answer = await client.ask({
+   query: 'extraction strategies',
+   context_type: 'doc', // 'code' | 'doc' | 'all'
+   max_results: 10
+ });
+ ```
+
+ #### `llm(url, query)` - LLM Endpoint
+ Process URLs with LLM:
+
+ ```typescript
+ const response = await client.llm(
+   'https://example.com',
+   'What is the main purpose of this website?'
+ );
+ ```
+
+ ### Utility Methods
+
+ ```typescript
+ // Test connection
+ const isConnected = await client.testConnection();
+ // Same check, but throw with error details on failure
+ const isConnectedOrThrow = await client.testConnection({ throwOnError: true });
+
+ // Get health status
+ const health = await client.health();
+
+ // Get API version
+ const version = await client.version();
+ // Same call, but throw with error details on failure
+ const versionOrThrow = await client.version({ throwOnError: true });
+
+ // Get Prometheus metrics
+ const metrics = await client.metrics();
+
+ // Update configuration
+ client.setApiToken('new_token');
+ client.setBaseUrl('https://new-url.com');
+ client.setDebug(true);
+ ```
+
+ ## 🎯 Data Extraction Strategies
+
+ ### CSS Selector Extraction
+
+ Extract structured data using CSS selectors:
+
+ ```typescript
+ const results = await client.crawl({
+   urls: 'https://news.ycombinator.com',
+   crawler_config: {
+     extraction_strategy: {
+       type: 'json_css',
+       params: {
+         schema: {
+           baseSelector: '.athing',
+           fields: [
+             {
+               name: 'title',
+               selector: '.titleline > a',
+               type: 'text'
+             },
+             {
+               name: 'url',
+               selector: '.titleline > a',
+               type: 'href'
+             },
+             {
+               name: 'score',
+               selector: '+ tr .score',
+               type: 'text'
+             }
+           ]
+         }
+       }
+     }
+   }
+ });
+
+ const posts = JSON.parse(results[0].extracted_content || '[]');
+ ```
+
+ ### LLM-Based Extraction
+
+ Use AI models for intelligent data extraction:
+
+ ```typescript
+ const results = await client.crawl({
+   urls: 'https://www.bbc.com/news',
+   crawler_config: {
+     extraction_strategy: {
+       type: 'llm',
+       params: {
+         provider: 'openai/gpt-4o-mini',
+         api_token: process.env.OPENAI_API_KEY,
+         schema: {
+           type: 'object',
+           properties: {
+             headline: { type: 'string' },
+             summary: { type: 'string' },
+             author: { type: 'string' },
+             tags: {
+               type: 'array',
+               items: { type: 'string' }
+             }
+           }
+         },
+         extraction_type: 'schema',
+         instruction: 'Extract news articles with their key information'
+       }
+     }
+   }
+ });
+ ```
+
+ ### Cosine Similarity Extraction
+
+ Filter content based on semantic similarity:
+
+ ```typescript
+ const results = await client.crawl({
+   urls: 'https://example.com/blog',
+   crawler_config: {
+     extraction_strategy: {
+       type: 'cosine',
+       params: {
+         semantic_filter: 'artificial intelligence machine learning',
+         word_count_threshold: 50,
+         max_dist: 0.3,
+         top_k: 5
+       }
+     }
+   }
+ });
+ ```
+
+ ## 🛠️ Error Handling
+
+ The SDK provides custom error handling with detailed information:
+
+ ```typescript
+ import { Crawl4AIError } from 'crawl4ai';
+
+ try {
+   const results = await client.crawl({ urls: 'https://example.com' });
+ } catch (error) {
+   if (error instanceof Crawl4AIError) {
+     console.error('API Error:', error.message);
+     console.error('Status:', error.status);
+     console.error('Details:', error.data);
+   } else {
+     console.error('Unexpected error:', error);
+   }
+ }
+ ```
+
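+ The package also exports more specific error classes and type guards (`isRateLimitError`, `isAuthError`, `isNetworkError`, and more). As a minimal sketch, a retry helper that backs off on rate limits (the `crawlWithBackoff` name is illustrative, and the sketch assumes `retryAfter` is reported in seconds):
+
+ ```typescript
+ import Crawl4AI, { isNetworkError, isRateLimitError } from 'crawl4ai';
+
+ const client = new Crawl4AI({ baseUrl: 'http://localhost:11235' });
+
+ async function crawlWithBackoff(url: string) {
+   try {
+     return await client.crawl({ urls: url });
+   } catch (error) {
+     if (isRateLimitError(error)) {
+       // Wait for the suggested interval (fall back to 5 seconds), then retry once
+       const waitMs = (error.retryAfter ?? 5) * 1000;
+       await new Promise(resolve => setTimeout(resolve, waitMs));
+       return client.crawl({ urls: url });
+     }
+     if (isNetworkError(error)) {
+       console.error('Could not reach the Crawl4AI server:', error.message);
+     }
+     throw error;
+   }
+ }
+ ```
+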
363
+ ## 🧪 Testing
364
+
365
+ Run the test suite:
366
+
367
+ ```bash
368
+ # Run all tests
369
+ bun test
370
+
371
+ # Run specific test file
372
+ bun test src/sdk.test.ts
373
+ ```
374
+
375
+ ## 📚 Examples
376
+
377
+ Run the included examples:
378
+
379
+ ```bash
380
+ # Basic usage
381
+ bun run example:basic
382
+
383
+ # Advanced features
384
+ bun run example:advanced
385
+
386
+ # LLM extraction
387
+ bun run example:llm
388
+ ```
389
+
390
+ ## 🔒 Security & Best Practices
391
+
392
+ ### Authentication
393
+
394
+ Always use API tokens in production:
395
+
396
+ ```typescript
397
+ const client = new Crawl4AI({
398
+ baseUrl: 'https://your-crawl4ai-server.com',
399
+ apiToken: process.env.CRAWL4AI_API_TOKEN
400
+ });
401
+ ```
402
+
403
+ ### Rate Limiting
404
+
405
+ Implement client-side throttling:
406
+
407
+ ```typescript
408
+ // Sequential processing with delays
409
+ for (const url of urls) {
410
+ const results = await client.crawl({ urls: url });
411
+ await new Promise(resolve => setTimeout(resolve, 1000)); // 1s delay
412
+ }
413
+ ```
414
+
415
+ ### Input Validation
416
+
417
+ The SDK automatically validates URLs before making requests. Invalid URLs will throw a `Crawl4AIError`.
418
+
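+ For instance, a malformed URL is rejected before any request is sent. A short sketch (depending on the input, the error may be the more specific `RequestValidationError` subclass, which extends `Crawl4AIError`):
+
+ ```typescript
+ import Crawl4AI, { Crawl4AIError } from 'crawl4ai';
+
+ const client = new Crawl4AI({ baseUrl: 'http://localhost:11235' });
+
+ try {
+   await client.crawl({ urls: 'not-a-valid-url' });
+ } catch (error) {
+   if (error instanceof Crawl4AIError) {
+     console.error('Rejected before the request was sent:', error.message);
+   }
+ }
+ ```
+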
+ ## 🤝 Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## 📄 License
+
+ This SDK is released under the MIT License.
+
+ ## 🙏 Acknowledgments
+
+ Built for the amazing [Crawl4AI](https://github.com/unclecode/crawl4ai) project by [@unclecode](https://github.com/unclecode) and the Crawl4AI community.
@@ -0,0 +1,96 @@
+ /**
+  * Custom error classes for Crawl4AI SDK
+  */
+ import type { ValidationError } from './types';
+ /**
+  * Base error class for all Crawl4AI errors
+  */
+ export declare class Crawl4AIError extends Error {
+     status?: number;
+     statusText?: string;
+     data?: ValidationError | Record<string, unknown>;
+     request?: {
+         url: string;
+         method: string;
+         headers?: Record<string, string>;
+         body?: unknown;
+     };
+     constructor(message: string, status?: number, statusText?: string, data?: ValidationError | Record<string, unknown>);
+ }
+ /**
+  * Network-related errors (timeouts, connection failures)
+  */
+ export declare class NetworkError extends Crawl4AIError {
+     constructor(message: string, cause?: Error);
+ }
+ /**
+  * Request timeout error
+  */
+ export declare class TimeoutError extends NetworkError {
+     timeout: number;
+     constructor(timeout: number, url?: string);
+ }
+ /**
+  * Validation errors for request parameters
+  */
+ export declare class RequestValidationError extends Crawl4AIError {
+     field?: string;
+     value?: unknown;
+     constructor(message: string, field?: string, value?: unknown);
+ }
+ /**
+  * Rate limiting error
+  */
+ export declare class RateLimitError extends Crawl4AIError {
+     retryAfter?: number;
+     limit?: number;
+     remaining?: number;
+     reset?: Date;
+     constructor(message: string, retryAfter?: number, headers?: Record<string, string>);
+ }
+ /**
+  * Authentication/Authorization errors
+  */
+ export declare class AuthError extends Crawl4AIError {
+     constructor(message?: string, status?: number);
+ }
+ /**
+  * Server errors (5xx)
+  */
+ export declare class ServerError extends Crawl4AIError {
+     constructor(message?: string, status?: number, statusText?: string);
+ }
+ /**
+  * Resource not found error
+  */
+ export declare class NotFoundError extends Crawl4AIError {
+     resource?: string;
+     constructor(resource?: string);
+ }
+ /**
+  * Response parsing error
+  */
+ export declare class ParseError extends Crawl4AIError {
+     responseText?: string;
+     constructor(message: string, responseText?: string);
+ }
+ /**
+  * Type guard to check if an error is a Crawl4AI error
+  */
+ export declare function isCrawl4AIError(error: unknown): error is Crawl4AIError;
+ /**
+  * Type guard to check if an error is a rate limit error
+  */
+ export declare function isRateLimitError(error: unknown): error is RateLimitError;
+ /**
+  * Type guard to check if an error is an auth error
+  */
+ export declare function isAuthError(error: unknown): error is AuthError;
+ /**
+  * Type guard to check if an error is a network error
+  */
+ export declare function isNetworkError(error: unknown): error is NetworkError;
+ /**
+  * Helper to create appropriate error based on status code
+  */
+ export declare function createHttpError(status: number, statusText: string, message?: string, data?: unknown, headers?: Record<string, string>): Crawl4AIError;
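+ /**
+  * Usage sketch (illustrative only; the endpoint path and server URL below are
+  * assumptions, not part of this package): normalize a raw HTTP response into
+  * one of the error classes above and narrow it with the type guards.
+  *
+  * ```ts
+  * import { createHttpError, isAuthError, isRateLimitError } from 'crawl4ai';
+  *
+  * const res = await fetch('http://localhost:11235/health');
+  * if (!res.ok) {
+  *   const err = createHttpError(res.status, res.statusText);
+  *   if (isRateLimitError(err)) {
+  *     // back off, optionally using err.retryAfter
+  *   } else if (isAuthError(err)) {
+  *     // supply or refresh the API token
+  *   }
+  *   throw err;
+  * }
+  * ```
+  */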
@@ -0,0 +1,7 @@
+ /**
+  * Crawl4AI TypeScript SDK
+  * Export all types and classes
+  */
+ export * from './errors';
+ export { Crawl4AI, default } from './sdk';
+ export * from './types';