spiderforce4ai: 1.0-py3-none-any.whl → 1.2-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- spiderforce4ai/__init__.py +32 -6
- spiderforce4ai-1.2.dist-info/METADATA +298 -0
- spiderforce4ai-1.2.dist-info/RECORD +5 -0
- spiderforce4ai-1.0.dist-info/METADATA +0 -309
- spiderforce4ai-1.0.dist-info/RECORD +0 -5
- {spiderforce4ai-1.0.dist-info → spiderforce4ai-1.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.0.dist-info → spiderforce4ai-1.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED

@@ -445,7 +445,11 @@ class SpiderForce4AI:
         if not failed_results:
             return []

-
+        failed_count = len(failed_results)
+        total_count = len([r for r in self.crawl_results])
+        failure_ratio = (failed_count / total_count) * 100
+
+        console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         retry_results = []

         # Create a new progress bar if one wasn't provided
@@ -531,8 +535,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                #
-                results =
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results

@@ -652,15 +661,32 @@ class SpiderForce4AI:
             if failure_ratio > 20:
                 console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
             else:
-
+                failed_count = len(failed_results)
+                failure_ratio = (failed_count / total_urls) * 100
+                console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
                 for result in failed_results:
                     new_result = _process_url_parallel((result.url, self.base_url, config))
+
+                    # Save markdown and trigger webhook for successful retries
                     if new_result.status == "success":
                         console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                        #
-
+                        # Save markdown if output directory is configured
+                        if config.output_dir and new_result.markdown:
+                            filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                            with open(filepath, 'w', encoding='utf-8') as f:
+                                f.write(new_result.markdown)
+                        # Send webhook for successful retry
+                        _send_webhook_sync(new_result, config)
                     else:
                         console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                        # Send webhook for failed retry
+                        _send_webhook_sync(new_result, config)
+
+                    # Update results list
+                    for i, r in enumerate(results):
+                        if r.url == new_result.url:
+                            results[i] = new_result
+                            break

             # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
spiderforce4ai-1.2.dist-info/METADATA
ADDED (+298 lines)

Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.2
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Features

- HTML to Markdown conversion
- Parallel and async crawling support
- Sitemap processing
- Custom content selection
- Automatic retry mechanism
- Detailed progress tracking
- Webhook notifications
- Customizable reporting

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure crawling options
config = CrawlConfig(
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],
    max_concurrent_requests=5,
    save_reports=True
)

# Crawl a sitemap
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```
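Each crawl call returns a list of result objects whose fields mirror the report format shown further below (`url`, `status`, `markdown`, `error`). A minimal sketch of inspecting them after the Quick Start run above, assuming those attribute names:

```python
# Sketch: inspect the results returned by the Quick Start crawl above.
# Assumes each result exposes url/status/markdown/error, matching the
# report format documented later in this README.
for result in results:
    if result.status == "success":
        print(f"OK   {result.url} ({len(result.markdown or '')} markdown chars)")
    else:
        print(f"FAIL {result.url}: {result.error}")
```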
## Key Features

### 1. Smart Retry Mechanism
- Automatically retries failed URLs
- Monitors failure ratio to prevent server overload
- Detailed retry statistics and progress tracking
- Aborts retries if failure rate exceeds 20%

```python
# Retry behavior is automatic
config = CrawlConfig(
    max_concurrent_requests=5,
    request_delay=1.0  # Delay between retries
)
results = spider.crawl_urls_async(urls, config)
```

### 2. Custom Webhook Integration
- Flexible payload formatting
- Custom headers support
- Variable substitution in templates

```python
config = CrawlConfig(
    webhook_url="https://your-webhook.com",
    webhook_headers={
        "Authorization": "Bearer token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "custom_field": "value"
    }'''
)
```

### 3. Flexible Report Generation
- Optional report saving
- Customizable report location
- Detailed success/failure statistics

```python
config = CrawlConfig(
    save_reports=True,
    report_file=Path("custom_report.json"),
    output_dir=Path("content")
)
```

## Crawling Methods

### 1. Single URL Processing

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = ["https://example.com/page1", "https://example.com/page2"]

# Server-side parallel (recommended)
results = spider.crawl_urls_server_parallel(urls, config)

# Client-side parallel
results = spider.crawl_urls_parallel(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Processing

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

```python
config = CrawlConfig(
    # Content Selection
    target_selector="article",              # Target element to extract
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing
    max_concurrent_requests=5,              # Parallel processing limit
    request_delay=0.5,                      # Delay between requests
    timeout=30,                             # Request timeout

    # Output
    output_dir=Path("content"),             # Output directory
    save_reports=False,                     # Enable/disable report saving
    report_file=Path("report.json"),        # Report location

    # Webhook
    webhook_url="https://webhook.com",      # Webhook endpoint
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Custom headers
        "Authorization": "Bearer token"
    },
    webhook_payload_template='''            # Custom payload format
    {
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "error": "{error}",
        "time": "{timestamp}"
    }'''
)
```

## Progress Tracking

The package provides detailed progress information:

```
Fetching sitemap from https://example.com/sitemap.xml...
Found 156 URLs in sitemap
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs

Retrying failed URLs: 18 (11.5% failed)
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries

Crawling Summary:
Total URLs processed: 156
Initial failures: 18 (11.5%)
Final results:
  ✓ Successful: 150
  ✗ Failed: 6
Retry success rate: 12/18 (66.7%)
```

## Output Structure

### 1. Directory Layout
```
content/                   # Output directory
├── example-com-page1.md   # Markdown files
├── example-com-page2.md
└── report.json            # Crawl report
```

### 2. Report Format
```json
{
  "timestamp": "2025-02-15T10:30:00",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads"]
  },
  "results": {
    "successful": [...],
    "failed": [...]
  },
  "summary": {
    "total": 156,
    "successful": 150,
    "failed": 6
  }
}
```

## Performance Optimization

1. Server-side Parallel Processing
   - Recommended for most cases
   - Single HTTP request
   - Reduced network overhead
   - Built-in load balancing

2. Client-side Parallel Processing
   - Better control over processing
   - Customizable concurrency
   - Progress tracking per URL
   - Automatic retry handling

3. Asynchronous Processing
   - Ideal for async applications
   - Non-blocking operation
   - Real-time progress updates
   - Efficient resource usage

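A rough sketch of picking a mode in practice, reusing the methods shown above (the service address and URLs are placeholders):

```python
# Sketch: choosing a crawling mode depending on the workload (placeholder URLs).
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(max_concurrent_requests=5)

# Whole sitemap, one request to the service: server-side parallel.
site_results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# A small hand-picked list where local control matters: client-side parallel.
page_results = spider.crawl_urls_parallel(
    ["https://example.com/page1", "https://example.com/page2"], config
)

# Inside an async application: the async variants.
async def crawl_async():
    return await spider.crawl_urls_async(["https://example.com/page3"], config)
```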
## Error Handling

The package provides comprehensive error handling:

- Automatic retry for failed URLs
- Failure ratio monitoring
- Detailed error reporting
- Webhook error notifications
- Progress tracking during retries

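Failed URLs also remain visible in the returned results and in the report's `failed` list, so they can be filtered after a run. A short sketch, assuming the same result fields used in the report format above:

```python
# Sketch: collect failures after a crawl for logging or a follow-up run.
# Assumes result objects expose url/status/error as in the report format.
failed = [r for r in results if r.status != "success"]
for r in failed:
    print(f"{r.url} failed: {r.error}")

retry_urls = [r.url for r in failed]  # could be fed back into a crawl_urls_* call
```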
## Requirements

- Python 3.11+
- Running SpiderForce4AI service
- Internet connection

## Dependencies

- aiohttp
- asyncio
- rich
- aiofiles
- httpx

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.2.dist-info/RECORD
ADDED (+5 lines)

spiderforce4ai/__init__.py,sha256=BHsdGGxEyS4RHbHyTnYRBE4oRy2i1pGSrEt_LT4vKWc,29384
spiderforce4ai-1.2.dist-info/METADATA,sha256=I5gmglzuRXSKwRc0lWk2Vslnx_4PIffIwjJ-SOTeYpU,7183
spiderforce4ai-1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.2.dist-info/RECORD,,
spiderforce4ai-1.0.dist-info/METADATA
REMOVED (-309 lines)

Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.0
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your service URL
spider = SpiderForce4AI("http://localhost:3004")

# Create default config
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```

## Installation

```bash
pip install spiderforce4ai
```

## Crawling Methods

### 1. Single URL

```python
# Basic usage
result = spider.crawl_url("https://example.com", config)

# Async version
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = [
    "https://example.com/page1",
    "https://example.com/page2"
]

# Client-side parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)

# Server-side parallel (single request)
results = spider.crawl_urls_server_parallel(urls, config)

# Async version
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Crawling

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Async version
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
from pathlib import Path

config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",               # Specific element to extract
    remove_selectors=[                       # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],   # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,               # For client-side parallel processing
    request_delay=0.5,                       # Delay between requests (seconds)
    timeout=30,                              # Request timeout (seconds)

    # Output Settings
    output_dir=Path("spiderforce_reports"),  # Default directory for files
    webhook_url="https://your-webhook.com",  # Real-time notifications
    webhook_timeout=10,                      # Webhook timeout
    webhook_headers={                        # Optional custom headers for webhook
        "Authorization": "Bearer your-token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{            # Optional custom webhook payload template
        "crawled_url": "{url}",
        "content": "{markdown}",
        "crawl_status": "{status}",
        "crawl_error": "{error}",
        "crawl_time": "{timestamp}",
        "custom_field": "your-value"
    }''',
    save_reports=False,                      # Whether to save crawl reports (default: False)
    report_file=Path("crawl_report.json")    # Report location (used only if save_reports=True)
)
```

## Real-World Examples

### 1. Basic Blog Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    target_selector="article.post-content",
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog-post", config)
```

### 2. Parallel Website Crawling

```python
config = CrawlConfig(
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    webhook_url="https://your-webhook.com/endpoint"
)

# Using server-side parallel processing
results = spider.crawl_urls_server_parallel([
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
], config)
```

### 3. Full Sitemap Processing

```python
config = CrawlConfig(
    target_selector="main",
    remove_selectors=[".sidebar", ".comments"],
    output_dir=Path("site_content"),
    report_file=Path("crawl_report.json")
)

results = spider.crawl_sitemap_server_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

## Output Structure

### 1. Directory Layout
```
spiderforce_reports/          # Default output directory
├── example-com-page1.md      # Converted markdown files
├── example-com-page2.md
└── crawl_report.json         # Crawl report
```

### 2. Markdown Files
Each file is named using a slugified version of the URL:
```markdown
# Page Title

Content converted to clean markdown...
```

### 3. Crawl Report
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

### 4. Webhook Notifications
If configured, real-time updates are sent for each processed URL:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```

## Error Handling

The package handles various types of errors gracefully:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list

## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## Performance Considerations

1. Server-side Parallel Processing
   - Best for most cases
   - Single HTTP request for multiple URLs
   - Less network overhead
   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`

2. Client-side Parallel Processing
   - Good for special cases requiring local control
   - Uses Python multiprocessing
   - More network overhead
   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`

3. Async Processing
   - Best for integration with async applications
   - Good for real-time processing
   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.0.dist-info/RECORD
REMOVED (-5 lines)

spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.0.dist-info/RECORD,,
{spiderforce4ai-1.0.dist-info → spiderforce4ai-1.2.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.0.dist-info → spiderforce4ai-1.2.dist-info}/top_level.txt
File without changes