spiderforce4ai 0.1.7__tar.gz → 0.1.9__tar.gz
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/PKG-INFO +106 -75
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/README.md +106 -75
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/pyproject.toml +1 -1
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/setup.py +1 -1
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai/__init__.py +223 -22
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai.egg-info/PKG-INFO +106 -75
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/setup.cfg +0 -0
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-0.1.7 → spiderforce4ai-0.1.9}/spiderforce4ai.egg-info/top_level.txt +0 -0
--- spiderforce4ai-0.1.7/PKG-INFO
+++ spiderforce4ai-0.1.9/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.7
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -24,75 +24,73 @@ Dynamic: requires-python

 # SpiderForce4AI Python Wrapper

-A Python
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

 ## Quick Start (Minimal Setup)

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig

-# Initialize with your
+# Initialize with your service URL
 spider = SpiderForce4AI("http://localhost:3004")

-#
+# Create default config
 config = CrawlConfig()

 # Crawl a single URL
 result = spider.crawl_url("https://example.com", config)
 ```

+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
 ## Crawling Methods

-### 1. Single URL
+### 1. Single URL

 ```python
-#
+# Basic usage
 result = spider.crawl_url("https://example.com", config)

-#
+# Async version
 async def crawl():
     result = await spider.crawl_url_async("https://example.com", config)
 ```

-### 2. Multiple URLs
+### 2. Multiple URLs

 ```python
-# List of URLs
 urls = [
     "https://example.com/page1",
-    "https://example.com/page2"
-    "https://example.com/page3"
+    "https://example.com/page2"
 ]

-#
-results = spider.
+# Client-side parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+
+# Server-side parallel (single request)
+results = spider.crawl_urls_server_parallel(urls, config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_urls_async(urls, config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
 ```

 ### 3. Sitemap Crawling

 ```python
-#
-results = spider.
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 ```

 ## Configuration Options
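The async snippets in the hunk above only define coroutines; they still need an event loop to run. A minimal sketch of driving one of them with the standard-library `asyncio.run()`, recreating the `spider` and `config` objects from the Quick Start (URLs are placeholders):

```python
import asyncio

from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()

async def crawl():
    # Same call as in the async example above
    return await spider.crawl_urls_async(
        ["https://example.com/page1", "https://example.com/page2"],
        config,
    )

# asyncio.run() starts an event loop, runs the coroutine, and shuts the loop down.
results = asyncio.run(crawl())
print(f"Crawled {len(results)} URLs")
```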
@@ -100,9 +98,11 @@ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 All configuration options are optional with sensible defaults:

 ```python
+from pathlib import Path
+
 config = CrawlConfig(
     # Content Selection (all optional)
-    target_selector="article", # Specific element to
+    target_selector="article", # Specific element to extract
     remove_selectors=[ # Elements to remove
         ".ads",
         "#popup",
@@ -112,21 +112,34 @@ config = CrawlConfig(
     remove_selectors_regex=["modal-\\d+"], # Regex patterns for removal

     # Processing Settings
-    max_concurrent_requests=1, #
-    request_delay=0.5, # Delay between requests
-    timeout=30, # Request timeout
+    max_concurrent_requests=1, # For client-side parallel processing
+    request_delay=0.5, # Delay between requests (seconds)
+    timeout=30, # Request timeout (seconds)

     # Output Settings
-    output_dir="
-
-
-
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```

 ## Real-World Examples

-### 1. Basic
+### 1. Basic Blog Crawling

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
@@ -134,78 +147,77 @@ from pathlib import Path

 spider = SpiderForce4AI("http://localhost:3004")
 config = CrawlConfig(
+    target_selector="article.post-content",
     output_dir=Path("blog_content")
 )

-result = spider.crawl_url("https://example.com/blog", config)
-print(f"Content saved to: {result.url}.md")
+result = spider.crawl_url("https://example.com/blog-post", config)
 ```

-### 2.
+### 2. Parallel Website Crawling

 ```python
 config = CrawlConfig(
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
     remove_selectors=[
         ".navigation",
         ".footer",
         ".ads",
         "#cookie-notice"
     ],
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
     webhook_url="https://your-webhook.com/endpoint"
 )

-
-
-
-
+# Using server-side parallel processing
+results = spider.crawl_urls_server_parallel([
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+], config)
 ```

-### 3.
+### 3. Full Sitemap Processing

 ```python
-
-
-
-
-
-
-)
-
-async with spider:
-    results = await spider.crawl_urls_async([
-        "https://example.com/1",
-        "https://example.com/2",
-        "https://example.com/3"
-    ], config)
-
-    return results
+config = CrawlConfig(
+    target_selector="main",
+    remove_selectors=[".sidebar", ".comments"],
+    output_dir=Path("site_content"),
+    report_file=Path("crawl_report.json")
+)

-results =
+results = spider.crawl_sitemap_server_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
 ```

 ## Output Structure

-### 1.
+### 1. Directory Layout
 ```
-
-├── example-com-page1.md
+spiderforce_reports/ # Default output directory
+├── example-com-page1.md # Converted markdown files
 ├── example-com-page2.md
-└── crawl_report.json
+└── crawl_report.json # Crawl report
 ```

 ### 2. Markdown Files
-Each
+Each file is named using a slugified version of the URL:
+```markdown
+# Page Title
+
+Content converted to clean markdown...
+```

-### 3. Report
+### 3. Crawl Report
 ```json
 {
   "timestamp": "2025-02-15T10:30:00.123456",
   "config": {
     "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-    "remove_selectors_regex": ["modal-\\d+"]
+    "remove_selectors": [".ads", "#popup"]
   },
   "results": {
     "successful": [
@@ -234,7 +246,7 @@ Each markdown file is named using a slugified version of the URL and contains th
 ```

 ### 4. Webhook Notifications
-If configured,
+If configured, real-time updates are sent for each processed URL:
 ```json
 {
   "url": "https://example.com/page1",
@@ -250,7 +262,7 @@ If configured, webhooks receive real-time updates in JSON format:

 ## Error Handling

-The package handles various types of errors:
+The package handles various types of errors gracefully:
 - Network errors
 - Timeout errors
 - Invalid URLs
@@ -269,6 +281,25 @@ All errors are:
 - Running SpiderForce4AI service
 - Internet connection

+## Performance Considerations
+
+1. Server-side Parallel Processing
+   - Best for most cases
+   - Single HTTP request for multiple URLs
+   - Less network overhead
+   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
+
+2. Client-side Parallel Processing
+   - Good for special cases requiring local control
+   - Uses Python multiprocessing
+   - More network overhead
+   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
+
+3. Async Processing
+   - Best for integration with async applications
+   - Good for real-time processing
+   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
+
 ## License

 MIT License
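The crawl report documented above can be consumed with the standard `json` module. A small sketch, assuming `save_reports=True`, the default `spiderforce_reports/` location, and that a `failed` list sits alongside the `successful` one (only `successful` is visible in the excerpt, so that key is an assumption):

```python
import json
from pathlib import Path

report_path = Path("spiderforce_reports/crawl_report.json")
report = json.loads(report_path.read_text(encoding="utf-8"))

successful = report["results"].get("successful", [])
failed = report["results"].get("failed", [])  # assumed bucket, mirroring "successful"

print(f"Report from {report['timestamp']}: {len(successful)} succeeded, {len(failed)} failed")
for entry in failed:
    # Each entry is expected to carry at least a URL and an error message.
    print(f"  ✗ {entry.get('url')}: {entry.get('error')}")
```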
--- spiderforce4ai-0.1.7/pyproject.toml
+++ spiderforce4ai-0.1.9/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "spiderforce4ai"
-version = "0.1.7"
+version = "0.1.9"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
--- spiderforce4ai-0.1.7/spiderforce4ai/__init__.py
+++ spiderforce4ai-0.1.9/spiderforce4ai/__init__.py
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)

     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}

         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

-        #
-        if self.
-            self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)

     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
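A quick sketch of how the new `save_reports` and `report_file` fields behave after `__post_init__`, following the logic added in the hunk above (paths are illustrative):

```python
from pathlib import Path

from spiderforce4ai import CrawlConfig

# Default: reports are disabled and no report path is set up.
quiet = CrawlConfig()
print(quiet.save_reports, quiet.report_file)   # False None

# Reports enabled without an explicit path: falls back to <output_dir>/crawl_report.json.
default_report = CrawlConfig(save_reports=True)
print(default_report.report_file)              # spiderforce_reports/crawl_report.json

# Reports enabled with an explicit path: the value is simply coerced to a Path.
custom = CrawlConfig(save_reports=True, report_file=Path("reports/run_42.json"))
print(custom.report_file)                      # reports/run_42.json
```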
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return

-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }

     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
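As a standalone illustration of the templating mechanism above (`str.format()` followed by `json.loads()`), here is a minimal sketch. Note that `str.format()` treats `{` and `}` as placeholder delimiters, so the literal JSON braces in this sketch are escaped as `{{` and `}}`; the values are made up:

```python
import json

# Literal braces are doubled so str.format() leaves them alone;
# {url}, {status} and {timestamp} are the placeholders being filled.
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}",
    "crawl_time": "{timestamp}"
}}'''

payload_str = template.format(
    url="https://example.com/page1",
    status="success",
    timestamp="2025-02-15T10:30:00",
)
payload = json.loads(payload_str)  # parse the filled-in template as JSON
print(payload["crawled_url"])      # -> https://example.com/page1
```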
@@ -196,6 +216,113 @@ class SpiderForce4AI:
             await f.write(markdown)
         return filepath

+
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        # Process URLs using server-side parallel endpoint
+        return self.crawl_urls_server_parallel(urls, config)
+
+
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        print(f"Sending {len(urls)} URLs for parallel processing...")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json() # Assuming server returns JSON array of results
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    filepath = config.output_dir / f"{slugify(result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(result.markdown)
+
+                # Send webhook if configured
+                if config.webhook_url:
+                    _send_webhook_sync(result, config)
+
+                results.append(result)
+
+            # Save report if enabled
+            if config.save_reports:
+                self._save_report_sync(results, config)
+                print(f"\nReport saved to: {config.report_file}")
+
+            # Print summary
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+            print(f"\nParallel processing completed:")
+            print(f"✓ Successful: {successful}")
+            print(f"✗ Failed: {failed}")
+
+            return results
+
+        except Exception as e:
+            print(f"Error during parallel processing: {str(e)}")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+
     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
         """Send webhook with crawl results."""
         if not config.webhook_url:
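A short usage sketch for the server-side parallel method added above; it inspects the `status`, `markdown`, and `error` fields each `CrawlResult` carries (the URLs are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(save_reports=True)

# One request to the service's /convert_parallel endpoint covers all URLs.
results = spider.crawl_urls_server_parallel(
    ["https://example.com/page1", "https://example.com/page2"],
    config,
)

for result in results:
    if result.status == "success":
        print(f"✓ {result.url}: {len(result.markdown or '')} characters of markdown")
    else:
        print(f"✗ {result.url}: {result.error}")
```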
@@ -313,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))

+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -338,15 +514,27 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result

-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results

         # Save final report
         await self._save_report(config)

-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")

@@ -436,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

-        #
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")

         return results
