webweavex 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webweavex-0.1.0/.gitignore +84 -0
- webweavex-0.1.0/.gitkeep +0 -0
- webweavex-0.1.0/PKG-INFO +180 -0
- webweavex-0.1.0/README.md +153 -0
- webweavex-0.1.0/pyproject.toml +53 -0
- webweavex-0.1.0/webweavex/__init__.py +33 -0
- webweavex-0.1.0/webweavex/api_server.py +40 -0
- webweavex-0.1.0/webweavex/async_crawler.py +56 -0
- webweavex-0.1.0/webweavex/async_engine.py +86 -0
- webweavex-0.1.0/webweavex/async_fetcher.py +69 -0
- webweavex-0.1.0/webweavex/cli.py +232 -0
- webweavex-0.1.0/webweavex/config.py +43 -0
- webweavex-0.1.0/webweavex/crawler.py +56 -0
- webweavex-0.1.0/webweavex/engine.py +23 -0
- webweavex-0.1.0/webweavex/exceptions.py +10 -0
- webweavex-0.1.0/webweavex/fetcher.py +66 -0
- webweavex-0.1.0/webweavex/js_renderer.py +38 -0
- webweavex-0.1.0/webweavex/logging.py +29 -0
- webweavex-0.1.0/webweavex/models.py +21 -0
- webweavex-0.1.0/webweavex/plugin_interface.py +15 -0
- webweavex-0.1.0/webweavex/plugin_loader.py +82 -0
- webweavex-0.1.0/webweavex/rate_limiter.py +35 -0
- webweavex-0.1.0/webweavex/robots.py +68 -0
- webweavex-0.1.0/webweavex/sitemap.py +107 -0
- webweavex-0.1.0/webweavex/ssl_utils.py +60 -0
- webweavex-0.1.0/webweavex/url_utils.py +30 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# OS
|
|
2
|
+
.DS_Store
|
|
3
|
+
Thumbs.db
|
|
4
|
+
|
|
5
|
+
# Python
|
|
6
|
+
__pycache__/
|
|
7
|
+
*.py[cod]
|
|
8
|
+
*.pyo
|
|
9
|
+
*.pyd
|
|
10
|
+
*.egg-info/
|
|
11
|
+
.venv/
|
|
12
|
+
.venv-*/
|
|
13
|
+
venv/
|
|
14
|
+
.env
|
|
15
|
+
|
|
16
|
+
# Node
|
|
17
|
+
node_modules/
|
|
18
|
+
.npm/
|
|
19
|
+
.pnpm-store/
|
|
20
|
+
.yarn/
|
|
21
|
+
.yarn-cache/
|
|
22
|
+
.yarnrc.yml
|
|
23
|
+
pnpm-debug.log*
|
|
24
|
+
npm-debug.log*
|
|
25
|
+
|
|
26
|
+
# Java/Kotlin/Gradle
|
|
27
|
+
.gradle/
|
|
28
|
+
build/
|
|
29
|
+
*.class
|
|
30
|
+
*.jar
|
|
31
|
+
*.war
|
|
32
|
+
*.ear
|
|
33
|
+
|
|
34
|
+
# Maven
|
|
35
|
+
target/
|
|
36
|
+
|
|
37
|
+
# Dart/Flutter
|
|
38
|
+
.dart_tool/
|
|
39
|
+
.flutter-plugins
|
|
40
|
+
.flutter-plugins-dependencies
|
|
41
|
+
.packages
|
|
42
|
+
.pub-cache/
|
|
43
|
+
build/
|
|
44
|
+
|
|
45
|
+
# IDE
|
|
46
|
+
.idea/
|
|
47
|
+
.vscode/
|
|
48
|
+
*.iml
|
|
49
|
+
|
|
50
|
+
# Logs
|
|
51
|
+
*.log
|
|
52
|
+
|
|
53
|
+
# Coverage
|
|
54
|
+
coverage/
|
|
55
|
+
htmlcov/
|
|
56
|
+
|
|
57
|
+
# Test artifacts
|
|
58
|
+
.pytest_cache/
|
|
59
|
+
|
|
60
|
+
# Terraform or infra
|
|
61
|
+
*.tfstate
|
|
62
|
+
*.tfstate.backup
|
|
63
|
+
|
|
64
|
+
# Misc
|
|
65
|
+
*.tmp
|
|
66
|
+
*.swp
|
|
67
|
+
.m2/
|
|
68
|
+
.pkg-verify*/
|
|
69
|
+
.tmp_dart_publish_check/
|
|
70
|
+
|
|
71
|
+
# Runtime artifacts
|
|
72
|
+
rag_dataset.jsonl
|
|
73
|
+
knowledge_graph.graphml
|
|
74
|
+
repo_dataset.jsonl
|
|
75
|
+
repo_graph.graphml
|
|
76
|
+
repo_summary.md
|
|
77
|
+
.tmp_site/
|
|
78
|
+
website/.docusaurus/
|
|
79
|
+
website/.npm-cache/
|
|
80
|
+
sdk/node/.npm-cache/
|
|
81
|
+
sdk/node/*.tgz
|
|
82
|
+
|
|
83
|
+
!sdk/dart/bin/
|
|
84
|
+
!sdk/dart/lib/
|
webweavex-0.1.0/.gitkeep
ADDED
|
File without changes
|
webweavex-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: webweavex
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AI-native web crawling platform
|
|
5
|
+
Author: Piyush Mishra
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: ai,crawler,data extraction,knowledge graph,rag,web scraping
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
14
|
+
Requires-Python: >=3.8
|
|
15
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
16
|
+
Requires-Dist: certifi>=2024.9.0
|
|
17
|
+
Requires-Dist: fastapi>=0.110.0
|
|
18
|
+
Requires-Dist: httpx>=0.25.0
|
|
19
|
+
Requires-Dist: markdownify>=0.13.1
|
|
20
|
+
Requires-Dist: networkx>=3.0
|
|
21
|
+
Requires-Dist: playwright>=1.42.0
|
|
22
|
+
Requires-Dist: pydantic>=2.6.0
|
|
23
|
+
Requires-Dist: redis>=5.0
|
|
24
|
+
Requires-Dist: spacy>=3.7.0
|
|
25
|
+
Requires-Dist: uvicorn>=0.29.0
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# WebWeaveX 🚀
|
|
29
|
+
|
|
30
|
+
> The Next-Gen AI-Powered Web Crawling Engine
|
|
31
|
+
> Multi-language SDKs • Structured Data • Knowledge Graphs • RAG-ready
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 🌍 Why WebWeaveX?
|
|
36
|
+
|
|
37
|
+
WebWeaveX is not just a crawler.
|
|
38
|
+
|
|
39
|
+
It is a **data intelligence engine** designed for:
|
|
40
|
+
|
|
41
|
+
* 🤖 AI / LLM pipelines (RAG datasets)
|
|
42
|
+
* 📊 Structured web extraction
|
|
43
|
+
* 🧠 Knowledge graph generation
|
|
44
|
+
* ⚡ High-performance crawling
|
|
45
|
+
* 🌐 Multi-language developer ecosystem
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## ⚡ Features
|
|
50
|
+
|
|
51
|
+
* 🔍 Smart crawling (HTML, metadata, links)
|
|
52
|
+
* 🧠 AI-ready outputs (JSON, text, markdown)
|
|
53
|
+
* 🌐 Multi-language SDKs:
|
|
54
|
+
|
|
55
|
+
* Python (pip)
|
|
56
|
+
* Node.js (npm)
|
|
57
|
+
* Dart (pub.dev)
|
|
58
|
+
* Java (Maven)
|
|
59
|
+
* Kotlin
|
|
60
|
+
* ⚡ Async + high-performance engine
|
|
61
|
+
* 🔐 SSL + secure crawling
|
|
62
|
+
* 📦 CLI + API server
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 📦 Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install webweavex
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## 🚀 Quick Start
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import asyncio

from webweavex import AsyncWebWeaveX

crawler = AsyncWebWeaveX()
result = asyncio.run(crawler.crawl("https://example.com"))
|
|
81
|
+
|
|
82
|
+
print(result["metadata"]["title"])
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## 🧪 CLI Usage
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
webweavex crawl https://example.com
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 🧠 Output Example
|
|
96
|
+
|
|
97
|
+
```json
|
|
98
|
+
{
|
|
99
|
+
"url": "https://example.com",
|
|
100
|
+
"status": 200,
|
|
101
|
+
"metadata": {
|
|
102
|
+
"title": "Example Domain"
|
|
103
|
+
},
|
|
104
|
+
"links": [
|
|
105
|
+
{
|
|
106
|
+
"url": "https://iana.org/domains/example",
|
|
107
|
+
"text": "Learn more"
|
|
108
|
+
}
|
|
109
|
+
]
|
|
110
|
+
}
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## 🌐 Multi-Language SDKs
|
|
116
|
+
|
|
117
|
+
WebWeaveX is built for **global developer adoption**:
|
|
118
|
+
|
|
119
|
+
| Language | Package |
|
|
120
|
+
| -------- | --------------------- |
|
|
121
|
+
| Python | pip install webweavex |
|
|
122
|
+
| Node | npm install webweavex |
|
|
123
|
+
| Dart | pub.dev |
|
|
124
|
+
| Java | Maven |
|
|
125
|
+
| Kotlin | Gradle |
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## ⚡ Benchmarks
|
|
130
|
+
|
|
131
|
+
* ⚡ Fast async crawling
|
|
132
|
+
* 📉 Low memory usage
|
|
133
|
+
* 🔁 Concurrent processing
|
|
134
|
+
* 🔍 Optimized parsing
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 🔐 Security
|
|
139
|
+
|
|
140
|
+
* SSL verification enabled by default
|
|
141
|
+
* Safe crawling practices
|
|
142
|
+
* Configurable policies
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## 🤝 Contributing
|
|
147
|
+
|
|
148
|
+
We welcome contributions 🚀
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
git clone https://github.com/PIYUSH-MISHRA-00/WebWeaveX.git
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## 📜 License
|
|
157
|
+
|
|
158
|
+
Apache License 2.0
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## ⭐ Support
|
|
163
|
+
|
|
164
|
+
If you like this project:
|
|
165
|
+
|
|
166
|
+
👉 Star the repo
|
|
167
|
+
👉 Share with developers
|
|
168
|
+
👉 Use in production
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## 🚀 Vision
|
|
173
|
+
|
|
174
|
+
WebWeaveX is built for the future of:
|
|
175
|
+
|
|
176
|
+
* AI Agents
|
|
177
|
+
* Autonomous systems
|
|
178
|
+
* Data intelligence platforms
|
|
179
|
+
|
|
180
|
+
---
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# WebWeaveX 🚀
|
|
2
|
+
|
|
3
|
+
> The Next-Gen AI-Powered Web Crawling Engine
|
|
4
|
+
> Multi-language SDKs • Structured Data • Knowledge Graphs • RAG-ready
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 🌍 Why WebWeaveX?
|
|
9
|
+
|
|
10
|
+
WebWeaveX is not just a crawler.
|
|
11
|
+
|
|
12
|
+
It is a **data intelligence engine** designed for:
|
|
13
|
+
|
|
14
|
+
* 🤖 AI / LLM pipelines (RAG datasets)
|
|
15
|
+
* 📊 Structured web extraction
|
|
16
|
+
* 🧠 Knowledge graph generation
|
|
17
|
+
* ⚡ High-performance crawling
|
|
18
|
+
* 🌐 Multi-language developer ecosystem
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## ⚡ Features
|
|
23
|
+
|
|
24
|
+
* 🔍 Smart crawling (HTML, metadata, links)
|
|
25
|
+
* 🧠 AI-ready outputs (JSON, text, markdown)
|
|
26
|
+
* 🌐 Multi-language SDKs:
|
|
27
|
+
|
|
28
|
+
* Python (pip)
|
|
29
|
+
* Node.js (npm)
|
|
30
|
+
* Dart (pub.dev)
|
|
31
|
+
* Java (Maven)
|
|
32
|
+
* Kotlin
|
|
33
|
+
* ⚡ Async + high-performance engine
|
|
34
|
+
* 🔐 SSL + secure crawling
|
|
35
|
+
* 📦 CLI + API server
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## 📦 Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install webweavex
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 🚀 Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import asyncio

from webweavex import AsyncWebWeaveX

crawler = AsyncWebWeaveX()
result = asyncio.run(crawler.crawl("https://example.com"))
|
|
54
|
+
|
|
55
|
+
print(result["metadata"]["title"])
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## 🧪 CLI Usage
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
webweavex crawl https://example.com
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 🧠 Output Example
|
|
69
|
+
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"url": "https://example.com",
|
|
73
|
+
"status": 200,
|
|
74
|
+
"metadata": {
|
|
75
|
+
"title": "Example Domain"
|
|
76
|
+
},
|
|
77
|
+
"links": [
|
|
78
|
+
{
|
|
79
|
+
"url": "https://iana.org/domains/example",
|
|
80
|
+
"text": "Learn more"
|
|
81
|
+
}
|
|
82
|
+
]
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## 🌐 Multi-Language SDKs
|
|
89
|
+
|
|
90
|
+
WebWeaveX is built for **global developer adoption**:
|
|
91
|
+
|
|
92
|
+
| Language | Package |
|
|
93
|
+
| -------- | --------------------- |
|
|
94
|
+
| Python | pip install webweavex |
|
|
95
|
+
| Node | npm install webweavex |
|
|
96
|
+
| Dart | pub.dev |
|
|
97
|
+
| Java | Maven |
|
|
98
|
+
| Kotlin | Gradle |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## ⚡ Benchmarks
|
|
103
|
+
|
|
104
|
+
* ⚡ Fast async crawling
|
|
105
|
+
* 📉 Low memory usage
|
|
106
|
+
* 🔁 Concurrent processing
|
|
107
|
+
* 🔍 Optimized parsing
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 🔐 Security
|
|
112
|
+
|
|
113
|
+
* SSL verification enabled by default
|
|
114
|
+
* Safe crawling practices
|
|
115
|
+
* Configurable policies
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## 🤝 Contributing
|
|
120
|
+
|
|
121
|
+
We welcome contributions 🚀
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
git clone https://github.com/PIYUSH-MISHRA-00/WebWeaveX.git
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## 📜 License
|
|
130
|
+
|
|
131
|
+
Apache License 2.0
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## ⭐ Support
|
|
136
|
+
|
|
137
|
+
If you like this project:
|
|
138
|
+
|
|
139
|
+
👉 Star the repo
|
|
140
|
+
👉 Share with developers
|
|
141
|
+
👉 Use in production
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## 🚀 Vision
|
|
146
|
+
|
|
147
|
+
WebWeaveX is built for the future of:
|
|
148
|
+
|
|
149
|
+
* AI Agents
|
|
150
|
+
* Autonomous systems
|
|
151
|
+
* Data intelligence platforms
|
|
152
|
+
|
|
153
|
+
---
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.21.0"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "webweavex"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "AI-native web crawling platform"
|
|
9
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Piyush Mishra" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
keywords = [
|
|
18
|
+
"crawler",
|
|
19
|
+
"web scraping",
|
|
20
|
+
"rag",
|
|
21
|
+
"knowledge graph",
|
|
22
|
+
"ai",
|
|
23
|
+
"data extraction"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
classifiers = [
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"License :: OSI Approved :: Apache Software License",
|
|
29
|
+
"Operating System :: OS Independent",
|
|
30
|
+
"Intended Audience :: Developers",
|
|
31
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
32
|
+
"Topic :: Software Development :: Libraries"
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
dependencies = [
|
|
36
|
+
"httpx>=0.25.0",
|
|
37
|
+
"beautifulsoup4>=4.12.0",
|
|
38
|
+
"markdownify>=0.13.1",
|
|
39
|
+
"playwright>=1.42.0",
|
|
40
|
+
"pydantic>=2.6.0",
|
|
41
|
+
"fastapi>=0.110.0",
|
|
42
|
+
"uvicorn>=0.29.0",
|
|
43
|
+
"spacy>=3.7.0",
|
|
44
|
+
"redis>=5.0",
|
|
45
|
+
"networkx>=3.0",
|
|
46
|
+
"certifi>=2024.9.0"
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.scripts]
|
|
50
|
+
webweavex = "webweavex.cli:main"
|
|
51
|
+
|
|
52
|
+
[tool.hatch.build.targets.wheel]
|
|
53
|
+
packages = ["webweavex"]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""WebWeaveX core package."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import ssl
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
# Ensure CA certificates are available for HTTPS requests.
|
|
9
|
+
# Some environments (including certain CI containers) may lack a default CA bundle.
|
|
10
|
+
try:
|
|
11
|
+
import certifi
|
|
12
|
+
cafile = ssl.get_default_verify_paths().cafile
|
|
13
|
+
if not cafile or not os.path.exists(cafile):
|
|
14
|
+
os.environ.setdefault("SSL_CERT_FILE", certifi.where())
|
|
15
|
+
except ImportError: # pragma: no cover
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
ROOT = Path(__file__).resolve().parents[1] # core
|
|
19
|
+
sys.path.insert(0, str(ROOT.parent)) # root
|
|
20
|
+
|
|
21
|
+
from .async_engine import AsyncWebWeaveX
|
|
22
|
+
from .config import CrawlConfig
|
|
23
|
+
from .engine import WebWeaveX
|
|
24
|
+
from .models import Link, Metadata, PageResult
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"AsyncWebWeaveX",
|
|
28
|
+
"CrawlConfig",
|
|
29
|
+
"WebWeaveX",
|
|
30
|
+
"Link",
|
|
31
|
+
"Metadata",
|
|
32
|
+
"PageResult",
|
|
33
|
+
]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from fastapi import FastAPI
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from .async_engine import AsyncWebWeaveX
|
|
7
|
+
from .models import PageResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UrlRequest(BaseModel):
|
|
11
|
+
url: str
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
app = FastAPI(title="WebWeaveX API", version="0.1.0")
|
|
15
|
+
_engine = AsyncWebWeaveX()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@app.on_event("shutdown")
|
|
19
|
+
async def _shutdown_engine() -> None:
|
|
20
|
+
await _engine.aclose()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.post("/crawl", response_model=PageResult)
|
|
24
|
+
async def crawl(request: UrlRequest) -> PageResult:
|
|
25
|
+
return await _engine.crawl(request.url)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@app.post("/crawl_site", response_model=list[PageResult])
|
|
29
|
+
async def crawl_site(request: UrlRequest) -> list[PageResult]:
|
|
30
|
+
return await _engine.crawl_site(request.url)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@app.post("/rag_dataset")
|
|
34
|
+
async def rag_dataset(request: UrlRequest) -> list[dict[str, object]]:
|
|
35
|
+
return await _engine.build_rag_dataset(request.url)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@app.post("/knowledge_graph")
|
|
39
|
+
async def knowledge_graph(request: UrlRequest) -> dict[str, list[dict[str, str]]]:
|
|
40
|
+
return await _engine.build_knowledge_graph(request.url)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
from .async_fetcher import AsyncFetcher
|
|
6
|
+
from .config import CrawlConfig
|
|
7
|
+
from .crawler import parse_html
|
|
8
|
+
from .js_renderer import JSRenderer
|
|
9
|
+
from .logging import get_logger
|
|
10
|
+
from .models import Metadata, PageResult
|
|
11
|
+
from .robots import RobotsHandler
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AsyncCrawler:
    """Concurrent async crawler for multiple URLs.

    Honours robots.txt (when a handler is supplied), optionally renders pages
    with JavaScript, and bounds in-flight requests with a semaphore.
    """

    def __init__(
        self,
        fetcher: AsyncFetcher,
        config: CrawlConfig,
        robots: RobotsHandler | None = None,
        renderer: JSRenderer | None = None,
    ) -> None:
        self._fetcher = fetcher
        self._config = config
        self._robots = robots
        self._renderer = renderer
        # Cap concurrent fetches; guarantee at least one slot.
        self._semaphore = asyncio.Semaphore(max(1, config.max_concurrency))

    async def _retrieve(self, url: str) -> tuple[int, str | None]:
        """Fetch or JS-render *url*, returning ``(status, html)``."""
        if not self._config.enable_js:
            return await self._fetcher.fetch(url)
        if self._renderer is None:
            logger.warning("JS rendering enabled but no renderer configured; falling back to fetcher")
            return await self._fetcher.fetch(url)
        logger.info("JS rendering used for %s", url)
        rendered = await self._renderer.render(url)
        # Rendered pages carry no transport status; report success.
        return 200, rendered

    async def crawl(self, url: str) -> PageResult:
        """Crawl one URL; robots-blocked URLs yield an empty 403 result."""
        logger.info("Async crawling %s", url)
        if self._robots and not await self._robots.allowed(url):
            logger.info("Robots blocked %s", url)
            return PageResult(url=url, status=403, html=None, links=[], metadata=Metadata())

        async with self._semaphore:
            status, html = await self._retrieve(url)
            links, metadata = parse_html(html)
            return PageResult(url=url, status=status, html=html, links=links, metadata=metadata)

    async def crawl_many(self, urls: list[str]) -> list[PageResult]:
        """Crawl every URL concurrently; results follow input order."""
        pending = [asyncio.create_task(self.crawl(target)) for target in urls]
        return await asyncio.gather(*pending)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .async_crawler import AsyncCrawler
|
|
4
|
+
from .async_fetcher import AsyncFetcher
|
|
5
|
+
from .config import CrawlConfig
|
|
6
|
+
from .js_renderer import JSRenderer
|
|
7
|
+
from .logging import get_logger
|
|
8
|
+
from .models import PageResult
|
|
9
|
+
from .robots import RobotsHandler
|
|
10
|
+
|
|
11
|
+
logger = get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AsyncWebWeaveX:
    """Async entry point for concurrent crawling.

    Wires together the fetcher, optional JS renderer, robots handler and the
    underlying AsyncCrawler. Only resources constructed here (rather than
    injected by the caller) are closed by :meth:`aclose`.
    """

    def __init__(
        self,
        config: CrawlConfig | None = None,
        fetcher: AsyncFetcher | None = None,
        renderer: JSRenderer | None = None,
        robots: RobotsHandler | None = None,
    ) -> None:
        self.config = config or CrawlConfig()
        self.fetcher = fetcher or AsyncFetcher(self.config)
        # Ownership flag: aclose() closes the fetcher only if we built it.
        self._owns_fetcher = fetcher is None
        # Renderer is constructed only when JS rendering is enabled in config.
        self.renderer = renderer or (JSRenderer(self.config) if self.config.enable_js else None)
        self.robots = robots or RobotsHandler(self.config)
        # Ownership flag: aclose() closes the robots handler only if we built it.
        self._owns_robots = robots is None
        self._crawler = AsyncCrawler(
            self.fetcher,
            self.config,
            robots=self.robots,
            renderer=self.renderer,
        )
        logger.debug("AsyncWebWeaveX engine initialized")

    async def crawl(self, url: str) -> PageResult:
        """Crawl a single URL and return its parsed result."""
        logger.info("Async engine crawl requested for %s", url)
        return await self._crawler.crawl(url)

    async def crawl_many(self, urls: list[str]) -> list[PageResult]:
        """Crawl many URLs concurrently via the underlying crawler."""
        logger.info("Async engine crawl_many requested for %s urls", len(urls))
        return await self._crawler.crawl_many(urls)

    async def crawl_site(self, url: str) -> list[PageResult]:
        """Crawl an entire site starting from *url*.

        NOTE(review): ``crawler_engine`` is imported as a top-level package —
        it is not inside the ``webweavex`` package and appears to rely on the
        ``sys.path`` tweak in ``__init__``. Confirm it ships with the
        distribution; it is absent from this package's file listing.
        """
        logger.info("Async engine site crawl requested for %s", url)
        from crawler_engine.site_crawler import SiteCrawler
        # A fresh SiteCrawler per call, sharing our fetcher/robots/renderer;
        # it is always closed, even when the crawl raises.
        site_crawler = SiteCrawler(
            self.config,
            fetcher=self.fetcher,
            robots=self.robots,
            renderer=self.renderer,
        )
        try:
            return await site_crawler.crawl_site(url)
        finally:
            await site_crawler.aclose()

    async def build_rag_dataset(self, url: str) -> list[dict[str, object]]:
        """Crawl the site at *url* and turn the pages into a RAG dataset.

        NOTE(review): ``rag`` is a top-level package outside ``webweavex`` —
        verify it is installed alongside this distribution.
        """
        logger.info("Dataset generation started for %s", url)
        from rag.rag_pipeline import build_dataset
        pages = await self.crawl_site(url)
        dataset = build_dataset(pages)
        logger.info("Dataset size %s", len(dataset))
        return dataset

    async def build_knowledge_graph(self, url: str) -> dict[str, list[dict[str, str]]]:
        """Crawl the site at *url* and build a nodes/edges knowledge graph.

        NOTE(review): ``knowledge_graph`` is a top-level package outside
        ``webweavex`` — verify it is installed alongside this distribution.
        """
        logger.info("Knowledge graph generation started for %s", url)
        from knowledge_graph.graph_pipeline import build_graph
        pages = await self.crawl_site(url)
        graph = build_graph(pages)
        logger.info("Knowledge graph nodes %s edges %s", len(graph.get("nodes", [])), len(graph.get("edges", [])))
        return graph

    async def aclose(self) -> None:
        """Close resources this engine owns; injected ones are left alone."""
        if self._owns_fetcher:
            await self.fetcher.close()
        if self._owns_robots:
            await self.robots.close()

    async def __aenter__(self) -> "AsyncWebWeaveX":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        # Context-manager exit delegates to aclose(); exceptions propagate.
        await self.aclose()
|