Crawl4AI 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,580 @@
1
+ Metadata-Version: 2.1
2
+ Name: Crawl4AI
3
+ Version: 0.3.0
4
+ Summary: 🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper
5
+ Home-page: https://github.com/unclecode/crawl4ai
6
+ Author: Unclecode
7
+ Author-email: unclecode@kidocode.com
8
+ License: Apache-2.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.7
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Requires-Python: >=3.7
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: aiohappyeyeballs==2.4.0
21
+ Requires-Dist: aiohttp==3.10.5
22
+ Requires-Dist: aiosignal==1.3.1
23
+ Requires-Dist: aiosqlite==0.20.0
24
+ Requires-Dist: annotated-types==0.7.0
25
+ Requires-Dist: anyio==4.6.0
26
+ Requires-Dist: async-timeout==4.0.3
27
+ Requires-Dist: attrs==24.2.0
28
+ Requires-Dist: beautifulsoup4==4.12.3
29
+ Requires-Dist: certifi==2024.8.30
30
+ Requires-Dist: charset-normalizer==3.3.2
31
+ Requires-Dist: click==8.1.7
32
+ Requires-Dist: distro==1.9.0
33
+ Requires-Dist: exceptiongroup==1.2.2
34
+ Requires-Dist: filelock==3.16.1
35
+ Requires-Dist: frozenlist==1.4.1
36
+ Requires-Dist: fsspec==2024.9.0
37
+ Requires-Dist: greenlet==3.0.3
38
+ Requires-Dist: h11==0.14.0
39
+ Requires-Dist: html2text==2024.2.26
40
+ Requires-Dist: httpcore==1.0.5
41
+ Requires-Dist: httpx==0.27.2
42
+ Requires-Dist: huggingface-hub==0.25.1
43
+ Requires-Dist: idna==3.10
44
+ Requires-Dist: importlib_metadata==8.5.0
45
+ Requires-Dist: Jinja2==3.1.4
46
+ Requires-Dist: jiter==0.5.0
47
+ Requires-Dist: jsonschema==4.23.0
48
+ Requires-Dist: jsonschema-specifications==2023.12.1
49
+ Requires-Dist: litellm==1.48.0
50
+ Requires-Dist: lxml==5.3.0
51
+ Requires-Dist: MarkupSafe==2.1.5
52
+ Requires-Dist: multidict==6.1.0
53
+ Requires-Dist: nest-asyncio==1.6.0
54
+ Requires-Dist: numpy==2.1.1
55
+ Requires-Dist: openai==1.47.1
56
+ Requires-Dist: outcome==1.3.0.post0
57
+ Requires-Dist: packaging==24.1
58
+ Requires-Dist: pillow==10.4.0
59
+ Requires-Dist: playwright==1.47.0
60
+ Requires-Dist: psutil==6.0.0
61
+ Requires-Dist: pydantic==2.9.2
62
+ Requires-Dist: pydantic_core==2.23.4
63
+ Requires-Dist: pyee==12.0.0
64
+ Requires-Dist: PySocks==1.7.1
65
+ Requires-Dist: python-dotenv==1.0.1
66
+ Requires-Dist: PyYAML==6.0.2
67
+ Requires-Dist: referencing==0.35.1
68
+ Requires-Dist: regex==2024.9.11
69
+ Requires-Dist: requests==2.32.3
70
+ Requires-Dist: rpds-py==0.20.0
71
+ Requires-Dist: sniffio==1.3.1
72
+ Requires-Dist: sortedcontainers==2.4.0
73
+ Requires-Dist: soupsieve==2.6
74
+ Requires-Dist: tiktoken==0.7.0
75
+ Requires-Dist: tqdm==4.66.5
76
+ Requires-Dist: trio==0.26.2
77
+ Requires-Dist: trio-websocket==0.11.1
78
+ Requires-Dist: typing_extensions==4.12.2
79
+ Requires-Dist: urllib3==2.2.3
80
+ Requires-Dist: websocket-client==1.8.0
81
+ Requires-Dist: wsproto==1.2.0
82
+ Requires-Dist: yarl==1.12.1
83
+ Requires-Dist: zipp==3.20.2
84
+ Provides-Extra: torch
85
+ Requires-Dist: numpy==2.1.1; extra == "torch"
86
+ Provides-Extra: transformer
87
+ Requires-Dist: tokenizers==0.20.0; extra == "transformer"
88
+ Provides-Extra: sync
89
+ Requires-Dist: selenium; extra == "sync"
90
+ Provides-Extra: cosine
91
+ Requires-Dist: torch; extra == "cosine"
92
+ Requires-Dist: transformers; extra == "cosine"
93
+ Requires-Dist: nltk; extra == "cosine"
94
+ Requires-Dist: spacy; extra == "cosine"
95
+ Provides-Extra: all
96
+ Requires-Dist: aiohappyeyeballs==2.4.0; extra == "all"
97
+ Requires-Dist: aiohttp==3.10.5; extra == "all"
98
+ Requires-Dist: aiosignal==1.3.1; extra == "all"
99
+ Requires-Dist: aiosqlite==0.20.0; extra == "all"
100
+ Requires-Dist: annotated-types==0.7.0; extra == "all"
101
+ Requires-Dist: anyio==4.6.0; extra == "all"
102
+ Requires-Dist: async-timeout==4.0.3; extra == "all"
103
+ Requires-Dist: attrs==24.2.0; extra == "all"
104
+ Requires-Dist: beautifulsoup4==4.12.3; extra == "all"
105
+ Requires-Dist: certifi==2024.8.30; extra == "all"
106
+ Requires-Dist: charset-normalizer==3.3.2; extra == "all"
107
+ Requires-Dist: click==8.1.7; extra == "all"
108
+ Requires-Dist: distro==1.9.0; extra == "all"
109
+ Requires-Dist: exceptiongroup==1.2.2; extra == "all"
110
+ Requires-Dist: filelock==3.16.1; extra == "all"
111
+ Requires-Dist: frozenlist==1.4.1; extra == "all"
112
+ Requires-Dist: fsspec==2024.9.0; extra == "all"
113
+ Requires-Dist: greenlet==3.0.3; extra == "all"
114
+ Requires-Dist: h11==0.14.0; extra == "all"
115
+ Requires-Dist: html2text==2024.2.26; extra == "all"
116
+ Requires-Dist: httpcore==1.0.5; extra == "all"
117
+ Requires-Dist: httpx==0.27.2; extra == "all"
118
+ Requires-Dist: huggingface-hub==0.25.1; extra == "all"
119
+ Requires-Dist: idna==3.10; extra == "all"
120
+ Requires-Dist: importlib_metadata==8.5.0; extra == "all"
121
+ Requires-Dist: Jinja2==3.1.4; extra == "all"
122
+ Requires-Dist: jiter==0.5.0; extra == "all"
123
+ Requires-Dist: jsonschema==4.23.0; extra == "all"
124
+ Requires-Dist: jsonschema-specifications==2023.12.1; extra == "all"
125
+ Requires-Dist: litellm==1.48.0; extra == "all"
126
+ Requires-Dist: lxml==5.3.0; extra == "all"
127
+ Requires-Dist: MarkupSafe==2.1.5; extra == "all"
128
+ Requires-Dist: multidict==6.1.0; extra == "all"
129
+ Requires-Dist: nest-asyncio==1.6.0; extra == "all"
130
+ Requires-Dist: numpy==2.1.1; extra == "all"
131
+ Requires-Dist: openai==1.47.1; extra == "all"
132
+ Requires-Dist: outcome==1.3.0.post0; extra == "all"
133
+ Requires-Dist: packaging==24.1; extra == "all"
134
+ Requires-Dist: pillow==10.4.0; extra == "all"
135
+ Requires-Dist: playwright==1.47.0; extra == "all"
136
+ Requires-Dist: psutil==6.0.0; extra == "all"
137
+ Requires-Dist: pydantic==2.9.2; extra == "all"
138
+ Requires-Dist: pydantic_core==2.23.4; extra == "all"
139
+ Requires-Dist: pyee==12.0.0; extra == "all"
140
+ Requires-Dist: PySocks==1.7.1; extra == "all"
141
+ Requires-Dist: python-dotenv==1.0.1; extra == "all"
142
+ Requires-Dist: PyYAML==6.0.2; extra == "all"
143
+ Requires-Dist: referencing==0.35.1; extra == "all"
144
+ Requires-Dist: regex==2024.9.11; extra == "all"
145
+ Requires-Dist: requests==2.32.3; extra == "all"
146
+ Requires-Dist: rpds-py==0.20.0; extra == "all"
147
+ Requires-Dist: selenium==4.25.0; extra == "all"
148
+ Requires-Dist: sniffio==1.3.1; extra == "all"
149
+ Requires-Dist: sortedcontainers==2.4.0; extra == "all"
150
+ Requires-Dist: soupsieve==2.6; extra == "all"
151
+ Requires-Dist: tiktoken==0.7.0; extra == "all"
152
+ Requires-Dist: tokenizers==0.20.0; extra == "all"
153
+ Requires-Dist: tqdm==4.66.5; extra == "all"
154
+ Requires-Dist: trio==0.26.2; extra == "all"
155
+ Requires-Dist: trio-websocket==0.11.1; extra == "all"
156
+ Requires-Dist: typing_extensions==4.12.2; extra == "all"
157
+ Requires-Dist: urllib3==2.2.3; extra == "all"
158
+ Requires-Dist: websocket-client==1.8.0; extra == "all"
159
+ Requires-Dist: wsproto==1.2.0; extra == "all"
160
+ Requires-Dist: yarl==1.12.1; extra == "all"
161
+ Requires-Dist: zipp==3.20.2; extra == "all"
162
+ Requires-Dist: selenium; extra == "all"
163
+ Requires-Dist: torch; extra == "all"
164
+ Requires-Dist: transformers; extra == "all"
165
+ Requires-Dist: nltk; extra == "all"
166
+ Requires-Dist: spacy; extra == "all"
167
+
168
+ # Crawl4AI 0.3.0 Async Version 🕷️🤖
169
+
170
+ [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
171
+ [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
172
+ [![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues)
173
+ [![GitHub Pull Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls)
174
+ [![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
175
+
176
+ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐
177
+
178
+ > Looking for the synchronous version? Check out [README.sync.md](./README.sync.md).
179
+
180
+ ## Try it Now!
181
+
182
+ ✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1REChY6fXQf-EaVYLv0eHEWvzlYxGm0pd?usp=sharing)
183
+
184
+ ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
185
+
186
+ ✨ Check out the [Demo](https://crawl4ai.com/mkdocs/demo)
187
+
188
+ ## Features ✨
189
+
190
+ - 🆓 Completely free and open-source
191
+ - 🚀 Blazing fast performance, outperforming many paid services
192
+ - 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
193
+ - 🌍 Supports crawling multiple URLs simultaneously
194
+ - 🎨 Extracts and returns all media tags (Images, Audio, and Video)
195
+ - 🔗 Extracts all external and internal links
196
+ - 📚 Extracts metadata from the page
197
+ - 🔄 Custom hooks for authentication, headers, and page modifications before crawling
198
+ - 🕵️ User-agent customization
199
+ - 🖼️ Takes screenshots of the page
200
+ - 📜 Executes multiple custom JavaScripts before crawling
201
+ - 📊 Generates structured output without LLM using JsonCssExtractionStrategy
202
+ - 📚 Various chunking strategies: topic-based, regex, sentence, and more
203
+ - 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
204
+ - 🎯 CSS selector support for precise data extraction
205
+ - 📝 Passes instructions/keywords to refine extraction
206
+ - 🔒 Proxy support for enhanced privacy and access
207
+ - 🔄 Session management for complex multi-page crawling scenarios
208
+ - 🌐 Asynchronous architecture for improved performance and scalability
209
+
210
+
211
+ ## Installation 🛠️
212
+
213
+ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
214
+
215
+ ### Using pip 🐍
216
+
217
+ Choose the installation option that best fits your needs:
218
+
219
+ #### Basic Installation
220
+
221
+ For basic web crawling and scraping tasks:
222
+
223
+ ```bash
224
+ pip install crawl4ai
225
+ ```
226
+
227
+ #### Installation with PyTorch
228
+
229
+ For advanced text clustering (includes CosineSimilarity cluster strategy):
230
+
231
+ ```bash
232
+ pip install crawl4ai[torch]
233
+ ```
234
+
235
+ #### Installation with Transformers
236
+
237
+ For text summarization and Hugging Face models:
238
+
239
+ ```bash
240
+ pip install crawl4ai[transformer]
241
+ ```
242
+
243
+ #### Installation with Synchronous Version
244
+
245
+ If you need the synchronous version using Selenium:
246
+
247
+ ```bash
248
+ pip install crawl4ai[sync]
249
+ ```
250
+
251
+ #### Installation with Cosine Similarity
252
+
253
+ For using the cosine similarity strategy:
254
+
255
+ ```bash
256
+ pip install crawl4ai[cosine]
257
+ ```
258
+
259
+ #### Full Installation
260
+
261
+ For all features:
262
+
263
+ ```bash
264
+ pip install crawl4ai[all]
265
+ ```
266
+
267
+ After installation, run the following command to install Playwright dependencies:
268
+
269
+ ```bash
270
+ playwright install
271
+ ```
272
+
273
+ If you've installed the "torch", "transformer", or "all" options, it's recommended to run:
274
+
275
+ ```bash
276
+ crawl4ai-download-models
277
+ ```
278
+
279
+ ### Using Docker 🐳
280
+
281
+ ```bash
282
+ # For Mac users (M1/M2)
283
+ docker build --platform linux/amd64 -t crawl4ai .
284
+ # For other users
285
+ docker build -t crawl4ai .
286
+ docker run -d -p 8000:80 crawl4ai
287
+ ```
288
+
289
+ ### Using Docker Hub 🐳
290
+
291
+ ```bash
292
+ docker pull unclecode/crawl4ai:latest
293
+ docker run -d -p 8000:80 unclecode/crawl4ai:latest
294
+ ```
295
+
296
+ For more detailed installation instructions and options, please refer to our [Installation Guide](https://crawl4ai.com/mkdocs/installation).
297
+
298
+ ## Quick Start 🚀
299
+
300
+ ```python
301
+ import asyncio
302
+ from crawl4ai import AsyncWebCrawler
303
+
304
+ async def main():
305
+ async with AsyncWebCrawler(verbose=True) as crawler:
306
+ result = await crawler.arun(url="https://www.nbcnews.com/business")
307
+ print(result.markdown)
308
+
309
+ if __name__ == "__main__":
310
+ asyncio.run(main())
311
+ ```
312
+
313
+ ## Advanced Usage 🔬
314
+
315
+ ### Executing JavaScript and Using CSS Selectors
316
+
317
+ ```python
318
+ import asyncio
319
+ from crawl4ai import AsyncWebCrawler
320
+
321
+ async def main():
322
+ async with AsyncWebCrawler(verbose=True) as crawler:
323
+ js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
324
+ result = await crawler.arun(
325
+ url="https://www.nbcnews.com/business",
326
+ js_code=js_code,
327
+ css_selector="article.tease-card",
328
+ bypass_cache=True
329
+ )
330
+ print(result.extracted_content)
331
+
332
+ if __name__ == "__main__":
333
+ asyncio.run(main())
334
+ ```
335
+
336
+ ### Using a Proxy
337
+
338
+ ```python
339
+ import asyncio
340
+ from crawl4ai import AsyncWebCrawler
341
+
342
+ async def main():
343
+ async with AsyncWebCrawler(verbose=True, proxy="http://127.0.0.1:7890") as crawler:
344
+ result = await crawler.arun(
345
+ url="https://www.nbcnews.com/business",
346
+ bypass_cache=True
347
+ )
348
+ print(result.markdown)
349
+
350
+ if __name__ == "__main__":
351
+ asyncio.run(main())
352
+ ```
353
+
354
+ ### Extracting Structured Data with OpenAI
355
+
356
+ ```python
357
+ import os
358
+ import asyncio
359
+ from crawl4ai import AsyncWebCrawler
360
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
361
+ from pydantic import BaseModel, Field
362
+
363
+ class OpenAIModelFee(BaseModel):
364
+ model_name: str = Field(..., description="Name of the OpenAI model.")
365
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
366
+ output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
367
+
368
+ async def main():
369
+ async with AsyncWebCrawler(verbose=True) as crawler:
370
+ result = await crawler.arun(
371
+ url='https://openai.com/api/pricing/',
372
+ word_count_threshold=1,
373
+ extraction_strategy=LLMExtractionStrategy(
374
+ provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
375
+ schema=OpenAIModelFee.schema(),
376
+ extraction_type="schema",
377
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
378
+ Do not miss any models in the entire content. One extracted model JSON format should look like this:
379
+ {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
380
+ ),
381
+ bypass_cache=True,
382
+ )
383
+ print(result.extracted_content)
384
+
385
+ if __name__ == "__main__":
386
+ asyncio.run(main())
387
+ ```
388
+
389
+ ### Advanced Multi-Page Crawling with JavaScript Execution
390
+
391
+ Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages:
392
+
393
+ ```python
394
+ import asyncio
395
+ import re
396
+ from bs4 import BeautifulSoup
397
+ from crawl4ai import AsyncWebCrawler
398
+
399
+ async def crawl_typescript_commits():
400
+ first_commit = ""
401
+ async def on_execution_started(page):
402
+ nonlocal first_commit
403
+ try:
404
+ while True:
405
+ await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
406
+ commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
407
+ commit = await commit.evaluate('(element) => element.textContent')
408
+ commit = re.sub(r'\s+', '', commit)
409
+ if commit and commit != first_commit:
410
+ first_commit = commit
411
+ break
412
+ await asyncio.sleep(0.5)
413
+ except Exception as e:
414
+ print(f"Warning: New content didn't appear after JavaScript execution: {e}")
415
+
416
+ async with AsyncWebCrawler(verbose=True) as crawler:
417
+ crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)
418
+
419
+ url = "https://github.com/microsoft/TypeScript/commits/main"
420
+ session_id = "typescript_commits_session"
421
+ all_commits = []
422
+
423
+ js_next_page = """
424
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
425
+ if (button) button.click();
426
+ """
427
+
428
+ for page in range(3): # Crawl 3 pages
429
+ result = await crawler.arun(
430
+ url=url,
431
+ session_id=session_id,
432
+ css_selector="li.Box-sc-g0xbh4-0",
433
+ js=js_next_page if page > 0 else None,
434
+ bypass_cache=True,
435
+ js_only=page > 0
436
+ )
437
+
438
+ assert result.success, f"Failed to crawl page {page + 1}"
439
+
440
+ soup = BeautifulSoup(result.cleaned_html, 'html.parser')
441
+ commits = soup.select("li")
442
+ all_commits.extend(commits)
443
+
444
+ print(f"Page {page + 1}: Found {len(commits)} commits")
445
+
446
+ await crawler.crawler_strategy.kill_session(session_id)
447
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
448
+
449
+ if __name__ == "__main__":
450
+ asyncio.run(crawl_typescript_commits())
451
+ ```
452
+
453
+ This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure data is loaded before proceeding.
454
+
455
+ ### Using JsonCssExtractionStrategy
456
+
457
+ The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.
458
+
459
+ ```python
460
+ import asyncio
461
+ import json
462
+ from crawl4ai import AsyncWebCrawler
463
+ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
464
+
465
+ async def extract_news_teasers():
466
+ schema = {
467
+ "name": "News Teaser Extractor",
468
+ "baseSelector": ".wide-tease-item__wrapper",
469
+ "fields": [
470
+ {
471
+ "name": "category",
472
+ "selector": ".unibrow span[data-testid='unibrow-text']",
473
+ "type": "text",
474
+ },
475
+ {
476
+ "name": "headline",
477
+ "selector": ".wide-tease-item__headline",
478
+ "type": "text",
479
+ },
480
+ {
481
+ "name": "summary",
482
+ "selector": ".wide-tease-item__description",
483
+ "type": "text",
484
+ },
485
+ {
486
+ "name": "time",
487
+ "selector": "[data-testid='wide-tease-date']",
488
+ "type": "text",
489
+ },
490
+ {
491
+ "name": "image",
492
+ "type": "nested",
493
+ "selector": "picture.teasePicture img",
494
+ "fields": [
495
+ {"name": "src", "type": "attribute", "attribute": "src"},
496
+ {"name": "alt", "type": "attribute", "attribute": "alt"},
497
+ ],
498
+ },
499
+ {
500
+ "name": "link",
501
+ "selector": "a[href]",
502
+ "type": "attribute",
503
+ "attribute": "href",
504
+ },
505
+ ],
506
+ }
507
+
508
+ extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
509
+
510
+ async with AsyncWebCrawler(verbose=True) as crawler:
511
+ result = await crawler.arun(
512
+ url="https://www.nbcnews.com/business",
513
+ extraction_strategy=extraction_strategy,
514
+ bypass_cache=True,
515
+ )
516
+
517
+ assert result.success, "Failed to crawl the page"
518
+
519
+ news_teasers = json.loads(result.extracted_content)
520
+ print(f"Successfully extracted {len(news_teasers)} news teasers")
521
+ print(json.dumps(news_teasers[0], indent=2))
522
+
523
+ if __name__ == "__main__":
524
+ asyncio.run(extract_news_teasers())
525
+ ```
526
+
527
+ ## Speed Comparison 🚀
528
+
529
+ Crawl4AI is designed with speed as a primary focus. Our goal is to provide the fastest possible response with high-quality data extraction, minimizing abstractions between the data and the user.
530
+
531
+ We've conducted a speed comparison between Crawl4AI and Firecrawl, a paid service. The results demonstrate Crawl4AI's superior performance:
532
+
533
+ ```
534
+ Firecrawl:
535
+ Time taken: 7.02 seconds
536
+ Content length: 42074 characters
537
+ Images found: 49
538
+
539
+ Crawl4AI (simple crawl):
540
+ Time taken: 1.60 seconds
541
+ Content length: 18238 characters
542
+ Images found: 49
543
+
544
+ Crawl4AI (with JavaScript execution):
545
+ Time taken: 4.64 seconds
546
+ Content length: 40869 characters
547
+ Images found: 89
548
+ ```
549
+
550
+ As you can see, Crawl4AI outperforms Firecrawl significantly:
551
+ - Simple crawl: Crawl4AI is over 4 times faster than Firecrawl.
552
+ - With JavaScript execution: Even when executing JavaScript to load more content (doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.
553
+
554
+ You can find the full comparison code in our repository at `docs/examples/crawl4ai_vs_firecrawl.py`.
555
+
556
+ ## Documentation 📚
557
+
558
+ For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
559
+
560
+ ## Contributing 🤝
561
+
562
+ We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.
563
+
564
+ ## License 📄
565
+
566
+ Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).
567
+
568
+ ## Contact 📧
569
+
570
+ For questions, suggestions, or feedback, feel free to reach out:
571
+
572
+ - GitHub: [unclecode](https://github.com/unclecode)
573
+ - Twitter: [@unclecode](https://twitter.com/unclecode)
574
+ - Website: [crawl4ai.com](https://crawl4ai.com)
575
+
576
+ Happy Crawling! 🕸️🚀
577
+
578
+ ## Star History
579
+
580
+ [![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
@@ -0,0 +1,31 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ requirements.txt
5
+ setup.cfg
6
+ setup.py
7
+ Crawl4AI.egg-info/PKG-INFO
8
+ Crawl4AI.egg-info/SOURCES.txt
9
+ Crawl4AI.egg-info/dependency_links.txt
10
+ Crawl4AI.egg-info/entry_points.txt
11
+ Crawl4AI.egg-info/requires.txt
12
+ Crawl4AI.egg-info/top_level.txt
13
+ crawl4ai/__init__.py
14
+ crawl4ai/async_crawler_strategy.py
15
+ crawl4ai/async_database.py
16
+ crawl4ai/async_webcrawler.py
17
+ crawl4ai/chunking_strategy.py
18
+ crawl4ai/config.py
19
+ crawl4ai/content_scrapping_strategy.py
20
+ crawl4ai/crawler_strategy.py
21
+ crawl4ai/database.py
22
+ crawl4ai/extraction_strategy.py
23
+ crawl4ai/model_loader.py
24
+ crawl4ai/models.py
25
+ crawl4ai/prompts.py
26
+ crawl4ai/train.py
27
+ crawl4ai/utils.py
28
+ crawl4ai/web_crawler.back.py
29
+ crawl4ai/web_crawler.py
30
+ tests/__init__.py
31
+ tests/test_web_crawler.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ crawl4ai-download-models = crawl4ai.model_loader:main