Crawl4AI 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4ai-0.3.0/Crawl4AI.egg-info/PKG-INFO +580 -0
- crawl4ai-0.3.0/Crawl4AI.egg-info/SOURCES.txt +31 -0
- crawl4ai-0.3.0/Crawl4AI.egg-info/dependency_links.txt +1 -0
- crawl4ai-0.3.0/Crawl4AI.egg-info/entry_points.txt +2 -0
- crawl4ai-0.3.0/Crawl4AI.egg-info/requires.txt +152 -0
- crawl4ai-0.3.0/Crawl4AI.egg-info/top_level.txt +2 -0
- crawl4ai-0.3.0/LICENSE +51 -0
- crawl4ai-0.3.0/MANIFEST.in +1 -0
- crawl4ai-0.3.0/PKG-INFO +580 -0
- crawl4ai-0.3.0/README.md +413 -0
- crawl4ai-0.3.0/crawl4ai/__init__.py +30 -0
- crawl4ai-0.3.0/crawl4ai/async_crawler_strategy.py +385 -0
- crawl4ai-0.3.0/crawl4ai/async_database.py +97 -0
- crawl4ai-0.3.0/crawl4ai/async_webcrawler.py +276 -0
- crawl4ai-0.3.0/crawl4ai/chunking_strategy.py +106 -0
- crawl4ai-0.3.0/crawl4ai/config.py +40 -0
- crawl4ai-0.3.0/crawl4ai/content_scrapping_strategy.py +296 -0
- crawl4ai-0.3.0/crawl4ai/crawler_strategy.py +360 -0
- crawl4ai-0.3.0/crawl4ai/database.py +135 -0
- crawl4ai-0.3.0/crawl4ai/extraction_strategy.py +863 -0
- crawl4ai-0.3.0/crawl4ai/model_loader.py +248 -0
- crawl4ai-0.3.0/crawl4ai/models.py +22 -0
- crawl4ai-0.3.0/crawl4ai/prompts.py +204 -0
- crawl4ai-0.3.0/crawl4ai/train.py +146 -0
- crawl4ai-0.3.0/crawl4ai/utils.py +916 -0
- crawl4ai-0.3.0/crawl4ai/web_crawler.back.py +357 -0
- crawl4ai-0.3.0/crawl4ai/web_crawler.py +237 -0
- crawl4ai-0.3.0/requirements.txt +66 -0
- crawl4ai-0.3.0/setup.cfg +7 -0
- crawl4ai-0.3.0/setup.py +91 -0
- crawl4ai-0.3.0/tests/__init__.py +0 -0
- crawl4ai-0.3.0/tests/test_web_crawler.py +111 -0
@@ -0,0 +1,580 @@
Metadata-Version: 2.1
Name: Crawl4AI
Version: 0.3.0
Summary: 🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper
Home-page: https://github.com/unclecode/crawl4ai
Author: Unclecode
Author-email: unclecode@kidocode.com
License: MIT
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: aiohappyeyeballs==2.4.0
Requires-Dist: aiohttp==3.10.5
Requires-Dist: aiosignal==1.3.1
Requires-Dist: aiosqlite==0.20.0
Requires-Dist: annotated-types==0.7.0
Requires-Dist: anyio==4.6.0
Requires-Dist: async-timeout==4.0.3
Requires-Dist: attrs==24.2.0
Requires-Dist: beautifulsoup4==4.12.3
Requires-Dist: certifi==2024.8.30
Requires-Dist: charset-normalizer==3.3.2
Requires-Dist: click==8.1.7
Requires-Dist: distro==1.9.0
Requires-Dist: exceptiongroup==1.2.2
Requires-Dist: filelock==3.16.1
Requires-Dist: frozenlist==1.4.1
Requires-Dist: fsspec==2024.9.0
Requires-Dist: greenlet==3.0.3
Requires-Dist: h11==0.14.0
Requires-Dist: html2text==2024.2.26
Requires-Dist: httpcore==1.0.5
Requires-Dist: httpx==0.27.2
Requires-Dist: huggingface-hub==0.25.1
Requires-Dist: idna==3.10
Requires-Dist: importlib_metadata==8.5.0
Requires-Dist: Jinja2==3.1.4
Requires-Dist: jiter==0.5.0
Requires-Dist: jsonschema==4.23.0
Requires-Dist: jsonschema-specifications==2023.12.1
Requires-Dist: litellm==1.48.0
Requires-Dist: lxml==5.3.0
Requires-Dist: MarkupSafe==2.1.5
Requires-Dist: multidict==6.1.0
Requires-Dist: nest-asyncio==1.6.0
Requires-Dist: numpy==2.1.1
Requires-Dist: openai==1.47.1
Requires-Dist: outcome==1.3.0.post0
Requires-Dist: packaging==24.1
Requires-Dist: pillow==10.4.0
Requires-Dist: playwright==1.47.0
Requires-Dist: psutil==6.0.0
Requires-Dist: pydantic==2.9.2
Requires-Dist: pydantic_core==2.23.4
Requires-Dist: pyee==12.0.0
Requires-Dist: PySocks==1.7.1
Requires-Dist: python-dotenv==1.0.1
Requires-Dist: PyYAML==6.0.2
Requires-Dist: referencing==0.35.1
Requires-Dist: regex==2024.9.11
Requires-Dist: requests==2.32.3
Requires-Dist: rpds-py==0.20.0
Requires-Dist: sniffio==1.3.1
Requires-Dist: sortedcontainers==2.4.0
Requires-Dist: soupsieve==2.6
Requires-Dist: tiktoken==0.7.0
Requires-Dist: tqdm==4.66.5
Requires-Dist: trio==0.26.2
Requires-Dist: trio-websocket==0.11.1
Requires-Dist: typing_extensions==4.12.2
Requires-Dist: urllib3==2.2.3
Requires-Dist: websocket-client==1.8.0
Requires-Dist: wsproto==1.2.0
Requires-Dist: yarl==1.12.1
Requires-Dist: zipp==3.20.2
Provides-Extra: torch
Requires-Dist: numpy==2.1.1; extra == "torch"
Provides-Extra: transformer
Requires-Dist: tokenizers==0.20.0; extra == "transformer"
Provides-Extra: sync
Requires-Dist: selenium; extra == "sync"
Provides-Extra: cosine
Requires-Dist: torch; extra == "cosine"
Requires-Dist: transformers; extra == "cosine"
Requires-Dist: nltk; extra == "cosine"
Requires-Dist: spacy; extra == "cosine"
Provides-Extra: all
Requires-Dist: aiohappyeyeballs==2.4.0; extra == "all"
Requires-Dist: aiohttp==3.10.5; extra == "all"
Requires-Dist: aiosignal==1.3.1; extra == "all"
Requires-Dist: aiosqlite==0.20.0; extra == "all"
Requires-Dist: annotated-types==0.7.0; extra == "all"
Requires-Dist: anyio==4.6.0; extra == "all"
Requires-Dist: async-timeout==4.0.3; extra == "all"
Requires-Dist: attrs==24.2.0; extra == "all"
Requires-Dist: beautifulsoup4==4.12.3; extra == "all"
Requires-Dist: certifi==2024.8.30; extra == "all"
Requires-Dist: charset-normalizer==3.3.2; extra == "all"
Requires-Dist: click==8.1.7; extra == "all"
Requires-Dist: distro==1.9.0; extra == "all"
Requires-Dist: exceptiongroup==1.2.2; extra == "all"
Requires-Dist: filelock==3.16.1; extra == "all"
Requires-Dist: frozenlist==1.4.1; extra == "all"
Requires-Dist: fsspec==2024.9.0; extra == "all"
Requires-Dist: greenlet==3.0.3; extra == "all"
Requires-Dist: h11==0.14.0; extra == "all"
Requires-Dist: html2text==2024.2.26; extra == "all"
Requires-Dist: httpcore==1.0.5; extra == "all"
Requires-Dist: httpx==0.27.2; extra == "all"
Requires-Dist: huggingface-hub==0.25.1; extra == "all"
Requires-Dist: idna==3.10; extra == "all"
Requires-Dist: importlib_metadata==8.5.0; extra == "all"
Requires-Dist: Jinja2==3.1.4; extra == "all"
Requires-Dist: jiter==0.5.0; extra == "all"
Requires-Dist: jsonschema==4.23.0; extra == "all"
Requires-Dist: jsonschema-specifications==2023.12.1; extra == "all"
Requires-Dist: litellm==1.48.0; extra == "all"
Requires-Dist: lxml==5.3.0; extra == "all"
Requires-Dist: MarkupSafe==2.1.5; extra == "all"
Requires-Dist: multidict==6.1.0; extra == "all"
Requires-Dist: nest-asyncio==1.6.0; extra == "all"
Requires-Dist: numpy==2.1.1; extra == "all"
Requires-Dist: openai==1.47.1; extra == "all"
Requires-Dist: outcome==1.3.0.post0; extra == "all"
Requires-Dist: packaging==24.1; extra == "all"
Requires-Dist: pillow==10.4.0; extra == "all"
Requires-Dist: playwright==1.47.0; extra == "all"
Requires-Dist: psutil==6.0.0; extra == "all"
Requires-Dist: pydantic==2.9.2; extra == "all"
Requires-Dist: pydantic_core==2.23.4; extra == "all"
Requires-Dist: pyee==12.0.0; extra == "all"
Requires-Dist: PySocks==1.7.1; extra == "all"
Requires-Dist: python-dotenv==1.0.1; extra == "all"
Requires-Dist: PyYAML==6.0.2; extra == "all"
Requires-Dist: referencing==0.35.1; extra == "all"
Requires-Dist: regex==2024.9.11; extra == "all"
Requires-Dist: requests==2.32.3; extra == "all"
Requires-Dist: rpds-py==0.20.0; extra == "all"
Requires-Dist: selenium==4.25.0; extra == "all"
Requires-Dist: sniffio==1.3.1; extra == "all"
Requires-Dist: sortedcontainers==2.4.0; extra == "all"
Requires-Dist: soupsieve==2.6; extra == "all"
Requires-Dist: tiktoken==0.7.0; extra == "all"
Requires-Dist: tokenizers==0.20.0; extra == "all"
Requires-Dist: tqdm==4.66.5; extra == "all"
Requires-Dist: trio==0.26.2; extra == "all"
Requires-Dist: trio-websocket==0.11.1; extra == "all"
Requires-Dist: typing_extensions==4.12.2; extra == "all"
Requires-Dist: urllib3==2.2.3; extra == "all"
Requires-Dist: websocket-client==1.8.0; extra == "all"
Requires-Dist: wsproto==1.2.0; extra == "all"
Requires-Dist: yarl==1.12.1; extra == "all"
Requires-Dist: zipp==3.20.2; extra == "all"
Requires-Dist: selenium; extra == "all"
Requires-Dist: torch; extra == "all"
Requires-Dist: transformers; extra == "all"
Requires-Dist: nltk; extra == "all"
Requires-Dist: spacy; extra == "all"

# Crawl4AI 0.3.0 Async Version 🕷️🤖

[GitHub Stars](https://github.com/unclecode/crawl4ai/stargazers)
[GitHub Forks](https://github.com/unclecode/crawl4ai/network/members)
[Open Issues](https://github.com/unclecode/crawl4ai/issues)
[Pull Requests](https://github.com/unclecode/crawl4ai/pulls)
[License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)

Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐

> Looking for the synchronous version? Check out [README.sync.md](./README.sync.md).

## Try it Now!

✨ Play around with this [Colab notebook](https://colab.research.google.com/drive/1REChY6fXQf-EaVYLv0eHEWvzlYxGm0pd?usp=sharing)

✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)

✨ Check out the [Demo](https://crawl4ai.com/mkdocs/demo)

## Features ✨

- 🆓 Completely free and open-source
- 🚀 Blazing-fast performance, outperforming many paid services
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
- 🌍 Supports crawling multiple URLs simultaneously
- 🎨 Extracts and returns all media tags (images, audio, and video)
- 🔗 Extracts all external and internal links
- 📚 Extracts metadata from the page
- 🔄 Custom hooks for authentication, headers, and page modifications before crawling
- 🕵️ User-agent customization
- 🖼️ Takes screenshots of the page
- 📜 Executes multiple custom JavaScript snippets before crawling
- 📊 Generates structured output without an LLM using JsonCssExtractionStrategy
- 📚 Various chunking strategies: topic-based, regex, sentence, and more
- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more (see the sketch after this list)
- 🎯 CSS selector support for precise data extraction
- 📝 Passes instructions/keywords to refine extraction
- 🔒 Proxy support for enhanced privacy and access
- 🔄 Session management for complex multi-page crawling scenarios
- 🌐 Asynchronous architecture for improved performance and scalability
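
The chunking and extraction strategies listed above plug directly into `arun`. As a minimal sketch of how they combine (assuming the `RegexChunking` and `CosineStrategy` classes shipped in `crawl4ai.chunking_strategy` and `crawl4ai.extraction_strategy`, with the `[cosine]` extra installed; the parameter values are illustrative, not recommendations):

```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import CosineStrategy

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            # Split page text into chunks on blank lines before extraction.
            chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
            # Cluster chunks by cosine similarity around an illustrative
            # keyword query (hypothetical values; tune for your content).
            extraction_strategy=CosineStrategy(
                semantic_filter="business finance economy",
                word_count_threshold=10,
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)

if __name__ == "__main__":
    asyncio.run(main())
```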
## Installation 🛠️

Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.

### Using pip 🐍

Choose the installation option that best fits your needs:

#### Basic Installation

For basic web crawling and scraping tasks:

```bash
pip install crawl4ai
```

#### Installation with PyTorch

For advanced text clustering (includes the CosineSimilarity cluster strategy):

```bash
pip install crawl4ai[torch]
```

#### Installation with Transformers

For text summarization and Hugging Face models:

```bash
pip install crawl4ai[transformer]
```

#### Installation with the Synchronous Version

If you need the synchronous version using Selenium:

```bash
pip install crawl4ai[sync]
```

#### Installation with Cosine Similarity

For using the cosine similarity strategy:

```bash
pip install crawl4ai[cosine]
```

#### Full Installation

For all features:

```bash
pip install crawl4ai[all]
```

After installation, run the following command to install the browsers Playwright needs:

```bash
playwright install
```

If you've installed the "torch", "transformer", or "all" options, it's recommended to run:

```bash
crawl4ai-download-models
```

### Using Docker 🐳

```bash
# For Mac users (M1/M2)
docker build --platform linux/amd64 -t crawl4ai .
# For other users
docker build -t crawl4ai .
docker run -d -p 8000:80 crawl4ai
```

### Using Docker Hub 🐳

```bash
docker pull unclecode/crawl4ai:latest
docker run -d -p 8000:80 unclecode/crawl4ai:latest
```
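
The container serves its API on the mapped port (8000 above). The routes aren't documented in this README; as an unverified sketch, assuming a `/crawl` endpoint that accepts a list of URLs (consult the Installation Guide below for the image's actual API):

```python
import requests

# Hypothetical request against the container started above; the "/crawl"
# route and payload shape are assumptions, not documented guarantees.
response = requests.post(
    "http://localhost:8000/crawl",
    json={"urls": ["https://www.nbcnews.com/business"]},
)
print(response.status_code)
print(response.text)
```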
For more detailed installation instructions and options, please refer to our [Installation Guide](https://crawl4ai.com/mkdocs/installation).

## Quick Start 🚀

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://www.nbcnews.com/business")
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```
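
The same `arun` call can also capture the page as an image, one of the features listed above. A minimal sketch, assuming the screenshot comes back as a base64-encoded PNG string on `result.screenshot`:

```python
import asyncio
import base64
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            screenshot=True,  # ask the crawler for a screenshot as well
        )
        # Assumed: result.screenshot holds a base64-encoded PNG.
        with open("screenshot.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

if __name__ == "__main__":
    asyncio.run(main())
```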
## Advanced Usage 🔬

### Executing JavaScript and Using CSS Selectors

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            css_selector="article.tease-card",
            bypass_cache=True
        )
        print(result.extracted_content)

if __name__ == "__main__":
    asyncio.run(main())
```

### Using a Proxy

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True, proxy="http://127.0.0.1:7890") as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True
        )
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```

### Extracting Structured Data with OpenAI

```python
import os
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url='https://openai.com/api/pricing/',
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'),
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)

if __name__ == "__main__":
    asyncio.run(main())
```

### Advanced Multi-Page Crawling with JavaScript Execution

Crawl4AI excels at handling complex scenarios, such as crawling multiple pages with dynamic content loaded via JavaScript. Here's an example of crawling GitHub commits across multiple pages:

```python
import asyncio
import re
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

async def crawl_typescript_commits():
    first_commit = ""

    async def on_execution_started(page):
        # Hook that runs after the injected JS executes: poll until the first
        # commit title changes, signalling that the next page has loaded.
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
                commit = await commit.evaluate('(element) => element.textContent')
                commit = re.sub(r'\s+', '', commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if page > 0 else None,
                bypass_cache=True,
                js_only=page > 0  # after page 1, only run JS in the existing session
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

if __name__ == "__main__":
    asyncio.run(crawl_typescript_commits())
```

This example demonstrates Crawl4AI's ability to handle complex scenarios where content is loaded asynchronously. It crawls multiple pages of GitHub commits, executing JavaScript to load new content and using custom hooks to ensure the data has loaded before proceeding.

### Using JsonCssExtractionStrategy

The `JsonCssExtractionStrategy` allows for precise extraction of structured data from web pages using CSS selectors.

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def extract_news_teasers():
    schema = {
        "name": "News Teaser Extractor",
        "baseSelector": ".wide-tease-item__wrapper",
        "fields": [
            {
                "name": "category",
                "selector": ".unibrow span[data-testid='unibrow-text']",
                "type": "text",
            },
            {
                "name": "headline",
                "selector": ".wide-tease-item__headline",
                "type": "text",
            },
            {
                "name": "summary",
                "selector": ".wide-tease-item__description",
                "type": "text",
            },
            {
                "name": "time",
                "selector": "[data-testid='wide-tease-date']",
                "type": "text",
            },
            {
                "name": "image",
                "type": "nested",
                "selector": "picture.teasePicture img",
                "fields": [
                    {"name": "src", "type": "attribute", "attribute": "src"},
                    {"name": "alt", "type": "attribute", "attribute": "alt"},
                ],
            },
            {
                "name": "link",
                "selector": "a[href]",
                "type": "attribute",
                "attribute": "href",
            },
        ],
    }

    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            extraction_strategy=extraction_strategy,
            bypass_cache=True,
        )

        assert result.success, "Failed to crawl the page"

        news_teasers = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(news_teasers)} news teasers")
        print(json.dumps(news_teasers[0], indent=2))

if __name__ == "__main__":
    asyncio.run(extract_news_teasers())
```

## Speed Comparison 🚀

Crawl4AI is designed with speed as a primary focus. Our goal is to provide the fastest possible response with high-quality data extraction, minimizing abstractions between the data and the user.

We've conducted a speed comparison between Crawl4AI and Firecrawl, a paid service. The results demonstrate Crawl4AI's superior performance:

```
Firecrawl:
Time taken: 7.02 seconds
Content length: 42074 characters
Images found: 49

Crawl4AI (simple crawl):
Time taken: 1.60 seconds
Content length: 18238 characters
Images found: 49

Crawl4AI (with JavaScript execution):
Time taken: 4.64 seconds
Content length: 40869 characters
Images found: 89
```

As you can see, Crawl4AI outperforms Firecrawl significantly:

- Simple crawl: Crawl4AI is over 4 times faster than Firecrawl.
- With JavaScript execution: Even when executing JavaScript to load more content (doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.
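
The numbers above come from the full benchmark script. A stripped-down sketch of the kind of measurement it performs (absolute figures will vary with network and hardware; the `media` field is assumed here to be a dict of lists keyed by type, e.g. "images"):

```python
import asyncio
import time
from crawl4ai import AsyncWebCrawler

async def time_simple_crawl():
    async with AsyncWebCrawler(verbose=False) as crawler:
        start = time.perf_counter()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,  # measure a real fetch, not a cache hit
        )
        elapsed = time.perf_counter() - start
        print(f"Time taken: {elapsed:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {len(result.media.get('images', []))}")

if __name__ == "__main__":
    asyncio.run(time_simple_crawl())
```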
You can find the full comparison code in our repository at `docs/examples/crawl4ai_vs_firecrawl.py`.

## Documentation 📚

For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).

## Contributing 🤝

We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information.

## License 📄

Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE).

## Contact 📧

For questions, suggestions, or feedback, feel free to reach out:

- GitHub: [unclecode](https://github.com/unclecode)
- Twitter: [@unclecode](https://twitter.com/unclecode)
- Website: [crawl4ai.com](https://crawl4ai.com)

Happy Crawling! 🕸️🚀

## Star History

[Star History Chart](https://star-history.com/#unclecode/crawl4ai&Date)

@@ -0,0 +1,31 @@
LICENSE
MANIFEST.in
README.md
requirements.txt
setup.cfg
setup.py
Crawl4AI.egg-info/PKG-INFO
Crawl4AI.egg-info/SOURCES.txt
Crawl4AI.egg-info/dependency_links.txt
Crawl4AI.egg-info/entry_points.txt
Crawl4AI.egg-info/requires.txt
Crawl4AI.egg-info/top_level.txt
crawl4ai/__init__.py
crawl4ai/async_crawler_strategy.py
crawl4ai/async_database.py
crawl4ai/async_webcrawler.py
crawl4ai/chunking_strategy.py
crawl4ai/config.py
crawl4ai/content_scrapping_strategy.py
crawl4ai/crawler_strategy.py
crawl4ai/database.py
crawl4ai/extraction_strategy.py
crawl4ai/model_loader.py
crawl4ai/models.py
crawl4ai/prompts.py
crawl4ai/train.py
crawl4ai/utils.py
crawl4ai/web_crawler.back.py
crawl4ai/web_crawler.py
tests/__init__.py
tests/test_web_crawler.py

@@ -0,0 +1 @@
