spiderforce4ai 1.1__py3-none-any.whl → 1.3__py3-none-any.whl
- spiderforce4ai/__init__.py +43 -12
- spiderforce4ai-1.3.dist-info/METADATA +298 -0
- spiderforce4ai-1.3.dist-info/RECORD +5 -0
- spiderforce4ai-1.1.dist-info/METADATA +0 -309
- spiderforce4ai-1.1.dist-info/RECORD +0 -5
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -350,17 +350,23 @@ class SpiderForce4AI:
 
     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
         """Save crawl report synchronously."""
+        # Separate successful and failed results
+        successful_results = [r for r in results if r.status == "success"]
+        failed_results = [r for r in results if r.status == "failed"]
+
+        # Create report with only final state
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
 
@@ -372,17 +378,22 @@ class SpiderForce4AI:
         if not config.report_file:
             return
 
+        # Separate successful and failed results
+        successful_results = [r for r in self.crawl_results if r.status == "success"]
+        failed_results = [r for r in self.crawl_results if r.status == "failed"]
+
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(self.crawl_results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
 
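Both report paths above now write the same JSON shape, with `retry_info` folded into the summary. As a hedged illustration of how such a report can be consumed, the snippet below reads the file back; the report location is an assumption based on the package's documented defaults, not something stated in this diff.

```python
import json
from pathlib import Path

# Assumed default location; pass report_file in CrawlConfig to change it.
report_path = Path("spiderforce_reports") / "crawl_report.json"
report = json.loads(report_path.read_text(encoding="utf-8"))

summary = report["summary"]
print(f"total={summary['total']} ok={summary['successful']} failed={summary['failed']}")
print("retry info:", summary.get("retry_info", {}))

# Only URLs that still failed after the retry pass end up here.
for entry in report["results"]["failed"]:
    print(entry["url"], "->", entry.get("error"))
```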
@@ -535,8 +546,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                #
-                results =
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results
 
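The nested loop added here swaps each failed entry for its retry outcome, matched by URL. A self-contained sketch of the same replace-by-URL idea follows; the `CrawlResult` below is a simplified stand-in for the package's own result type, not its actual definition.

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class CrawlResult:  # simplified stand-in, for illustration only
    url: str
    status: str
    error: Optional[str] = None

def merge_retries(initial: List[CrawlResult], retries: List[CrawlResult]) -> List[CrawlResult]:
    """Replace entries in `initial` with their retry outcomes, matched by URL."""
    by_url = {r.url: r for r in retries}
    return [by_url.get(r.url, r) for r in initial]

initial = [
    CrawlResult("https://example.com/a", "success"),
    CrawlResult("https://example.com/b", "failed", "HTTP 500"),
]
retries = [CrawlResult("https://example.com/b", "success")]
assert [r.status for r in merge_retries(initial, retries)] == ["success", "success"]
```

Keying by URL keeps the original ordering while avoiding the quadratic inner scan; the diff's explicit loop does the same job and is fine for small retry lists.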
@@ -661,12 +677,27 @@ class SpiderForce4AI:
         console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         for result in failed_results:
             new_result = _process_url_parallel((result.url, self.base_url, config))
+
+            # Save markdown and trigger webhook for successful retries
             if new_result.status == "success":
                 console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                #
-
+                # Save markdown if output directory is configured
+                if config.output_dir and new_result.markdown:
+                    filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(new_result.markdown)
+                # Send webhook for successful retry
+                _send_webhook_sync(new_result, config)
             else:
                 console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                # Send webhook for failed retry
+                _send_webhook_sync(new_result, config)
+
+            # Update results list
+            for i, r in enumerate(results):
+                if r.url == new_result.url:
+                    results[i] = new_result
+                    break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
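The retry path writes each recovered page to `{slugify(url)}.md`, matching the naming convention the README describes ("each file is named using a slugified version of the URL"). The package ships its own `slugify` helper; the version below is only an assumed, rough equivalent to show what the resulting filenames look like.

```python
import re

def slugify(url: str) -> str:
    """Illustrative only: turn a URL into a filesystem-friendly name."""
    url = re.sub(r"^https?://", "", url)      # drop the scheme
    url = re.sub(r"[^a-zA-Z0-9]+", "-", url)  # collapse non-alphanumerics to '-'
    return url.strip("-").lower()

assert slugify("https://example.com/page1") == "example-com-page1"
```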
spiderforce4ai-1.3.dist-info/METADATA
ADDED

@@ -0,0 +1,298 @@
+Metadata-Version: 2.2
+Name: spiderforce4ai
+Version: 1.3
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Home-page: https://petertam.pro
+Author: Piotr Tamulewicz
+Author-email: Piotr Tamulewicz <pt@petertam.pro>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: rich>=10.0.0
+Requires-Dist: aiofiles>=0.8.0
+Requires-Dist: httpx>=0.24.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# SpiderForce4AI Python Wrapper
+
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with the SpiderForce4AI service.
+
+## Features
+
+- HTML to Markdown conversion
+- Parallel and async crawling support
+- Sitemap processing
+- Custom content selection
+- Automatic retry mechanism
+- Detailed progress tracking
+- Webhook notifications
+- Customizable reporting
+
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
+## Quick Start
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+from pathlib import Path
+
+# Initialize crawler
+spider = SpiderForce4AI("http://localhost:3004")
+
+# Configure crawling options
+config = CrawlConfig(
+    target_selector="article",
+    remove_selectors=[".ads", ".navigation"],
+    max_concurrent_requests=5,
+    save_reports=True
+)
+
+# Crawl a sitemap
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+```
+
+## Key Features
+
+### 1. Smart Retry Mechanism
+- Automatically retries failed URLs
+- Monitors failure ratio to prevent server overload
+- Detailed retry statistics and progress tracking
+- Aborts retries if failure rate exceeds 20%
+
+```python
+# Retry behavior is automatic
+config = CrawlConfig(
+    max_concurrent_requests=5,
+    request_delay=1.0  # Delay between retries
+)
+results = spider.crawl_urls_async(urls, config)
+```
+
+### 2. Custom Webhook Integration
+- Flexible payload formatting
+- Custom headers support
+- Variable substitution in templates
+
+```python
+config = CrawlConfig(
+    webhook_url="https://your-webhook.com",
+    webhook_headers={
+        "Authorization": "Bearer token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "custom_field": "value"
+    }'''
+)
+```
+
+### 3. Flexible Report Generation
+- Optional report saving
+- Customizable report location
+- Detailed success/failure statistics
+
+```python
+config = CrawlConfig(
+    save_reports=True,
+    report_file=Path("custom_report.json"),
+    output_dir=Path("content")
+)
+```
+
+## Crawling Methods
+
+### 1. Single URL Processing
+
+```python
+# Synchronous
+result = spider.crawl_url("https://example.com", config)
+
+# Asynchronous
+async def crawl():
+    result = await spider.crawl_url_async("https://example.com", config)
+```
+
+### 2. Multiple URLs
+
+```python
+urls = ["https://example.com/page1", "https://example.com/page2"]
+
+# Server-side parallel (recommended)
+results = spider.crawl_urls_server_parallel(urls, config)
+
+# Client-side parallel
+results = spider.crawl_urls_parallel(urls, config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_urls_async(urls, config)
+```
+
+### 3. Sitemap Processing
+
+```python
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+```
+
+## Configuration Options
+
+```python
+config = CrawlConfig(
+    # Content Selection
+    target_selector="article",              # Target element to extract
+    remove_selectors=[".ads", "#popup"],    # Elements to remove
+    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+    # Processing
+    max_concurrent_requests=5,              # Parallel processing limit
+    request_delay=0.5,                      # Delay between requests
+    timeout=30,                             # Request timeout
+
+    # Output
+    output_dir=Path("content"),             # Output directory
+    save_reports=False,                     # Enable/disable report saving
+    report_file=Path("report.json"),        # Report location
+
+    # Webhook
+    webhook_url="https://webhook.com",      # Webhook endpoint
+    webhook_timeout=10,                     # Webhook timeout
+    webhook_headers={                       # Custom headers
+        "Authorization": "Bearer token"
+    },
+    # Custom payload format
+    webhook_payload_template='''{
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "error": "{error}",
+        "time": "{timestamp}"
+    }'''
+)
+```
+
+## Progress Tracking
+
+The package provides detailed progress information:
+
+```
+Fetching sitemap from https://example.com/sitemap.xml...
+Found 156 URLs in sitemap
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs
+
+Retrying failed URLs: 18 (11.5% failed)
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries
+
+Crawling Summary:
+Total URLs processed: 156
+Initial failures: 18 (11.5%)
+Final results:
+  ✓ Successful: 150
+  ✗ Failed: 6
+Retry success rate: 12/18 (66.7%)
+```
+
+## Output Structure
+
+### 1. Directory Layout
+```
+content/                   # Output directory
+├── example-com-page1.md   # Markdown files
+├── example-com-page2.md
+└── report.json            # Crawl report
+```
+
+### 2. Report Format
+```json
+{
+  "timestamp": "2025-02-15T10:30:00",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads"]
+  },
+  "results": {
+    "successful": [...],
+    "failed": [...]
+  },
+  "summary": {
+    "total": 156,
+    "successful": 150,
+    "failed": 6
+  }
+}
+```
+
+## Performance Optimization
+
+1. Server-side Parallel Processing
+   - Recommended for most cases
+   - Single HTTP request
+   - Reduced network overhead
+   - Built-in load balancing
+
+2. Client-side Parallel Processing
+   - Better control over processing
+   - Customizable concurrency
+   - Progress tracking per URL
+   - Automatic retry handling
+
+3. Asynchronous Processing
+   - Ideal for async applications
+   - Non-blocking operation
+   - Real-time progress updates
+   - Efficient resource usage
+
+## Error Handling
+
+The package provides comprehensive error handling:
+
+- Automatic retry for failed URLs
+- Failure ratio monitoring
+- Detailed error reporting
+- Webhook error notifications
+- Progress tracking during retries
+
+## Requirements
+
+- Python 3.11+
+- Running SpiderForce4AI service
+- Internet connection
+
+## Dependencies
+
+- aiohttp
+- asyncio
+- rich
+- aiofiles
+- httpx
+
+## License
+
+MIT License
+
+## Credits
+
+Created by [Peter Tam](https://petertam.pro)
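The Error Handling section above describes where failures surface; in code, that amounts to checking each result's `status` after a crawl. A short hedged example (attribute names follow the `CrawlResult` usage visible in the `__init__.py` diff):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(max_concurrent_requests=5)

results = spider.crawl_urls_server_parallel(
    ["https://example.com/page1", "https://example.com/page2"], config
)

# Anything still marked failed here has already been through the automatic retry pass.
for result in results:
    if result.status == "failed":
        print(f"Gave up on {result.url}: {result.error}")
```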
spiderforce4ai-1.3.dist-info/RECORD
ADDED

@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=Fbgqu9uPg0wuWZgiVYNTv6CkkcOHgU_f5-uoXRKhgn4,29922
+spiderforce4ai-1.3.dist-info/METADATA,sha256=ybuwcVE62JFnWJKcCdHDrOOqmbuh8PEzF69_yFK-eV0,7183
+spiderforce4ai-1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.3.dist-info/RECORD,,
spiderforce4ai-1.1.dist-info/METADATA
DELETED

@@ -1,309 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 1.1
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper
-
-A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
-
-## Quick Start (Minimal Setup)
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-# Initialize with your service URL
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Create default config
-config = CrawlConfig()
-
-# Crawl a single URL
-result = spider.crawl_url("https://example.com", config)
-```
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Crawling Methods
-
-### 1. Single URL
-
-```python
-# Basic usage
-result = spider.crawl_url("https://example.com", config)
-
-# Async version
-async def crawl():
-    result = await spider.crawl_url_async("https://example.com", config)
-```
-
-### 2. Multiple URLs
-
-```python
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2"
-]
-
-# Client-side parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
-
-# Server-side parallel (single request)
-results = spider.crawl_urls_server_parallel(urls, config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_urls_async(urls, config)
-```
-
-### 3. Sitemap Crawling
-
-```python
-# Server-side parallel (recommended)
-results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
-
-# Client-side parallel
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-```
-
-## Configuration Options
-
-All configuration options are optional with sensible defaults:
-
-```python
-from pathlib import Path
-
-config = CrawlConfig(
-    # Content Selection (all optional)
-    target_selector="article",               # Specific element to extract
-    remove_selectors=[                       # Elements to remove
-        ".ads",
-        "#popup",
-        ".navigation",
-        ".footer"
-    ],
-    remove_selectors_regex=["modal-\\d+"],   # Regex patterns for removal
-
-    # Processing Settings
-    max_concurrent_requests=1,               # For client-side parallel processing
-    request_delay=0.5,                       # Delay between requests (seconds)
-    timeout=30,                              # Request timeout (seconds)
-
-    # Output Settings
-    output_dir=Path("spiderforce_reports"),  # Default directory for files
-    webhook_url="https://your-webhook.com",  # Real-time notifications
-    webhook_timeout=10,                      # Webhook timeout
-    webhook_headers={                        # Optional custom headers for webhook
-        "Authorization": "Bearer your-token",
-        "X-Custom-Header": "value"
-    },
-    webhook_payload_template='''{            # Optional custom webhook payload template
-        "crawled_url": "{url}",
-        "content": "{markdown}",
-        "crawl_status": "{status}",
-        "crawl_error": "{error}",
-        "crawl_time": "{timestamp}",
-        "custom_field": "your-value"
-    }''',
-    save_reports=False,                      # Whether to save crawl reports (default: False)
-    report_file=Path("crawl_report.json")    # Report location (used only if save_reports=True)
-)
-```
-
-## Real-World Examples
-
-### 1. Basic Blog Crawling
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-from pathlib import Path
-
-spider = SpiderForce4AI("http://localhost:3004")
-config = CrawlConfig(
-    target_selector="article.post-content",
-    output_dir=Path("blog_content")
-)
-
-result = spider.crawl_url("https://example.com/blog-post", config)
-```
-
-### 2. Parallel Website Crawling
-
-```python
-config = CrawlConfig(
-    remove_selectors=[
-        ".navigation",
-        ".footer",
-        ".ads",
-        "#cookie-notice"
-    ],
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
-    webhook_url="https://your-webhook.com/endpoint"
-)
-
-# Using server-side parallel processing
-results = spider.crawl_urls_server_parallel([
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-], config)
-```
-
-### 3. Full Sitemap Processing
-
-```python
-config = CrawlConfig(
-    target_selector="main",
-    remove_selectors=[".sidebar", ".comments"],
-    output_dir=Path("site_content"),
-    report_file=Path("crawl_report.json")
-)
-
-results = spider.crawl_sitemap_server_parallel(
-    "https://example.com/sitemap.xml",
-    config
-)
-```
-
-## Output Structure
-
-### 1. Directory Layout
-```
-spiderforce_reports/       # Default output directory
-├── example-com-page1.md   # Converted markdown files
-├── example-com-page2.md
-└── crawl_report.json      # Crawl report
-```
-
-### 2. Markdown Files
-Each file is named using a slugified version of the URL:
-```markdown
-# Page Title
-
-Content converted to clean markdown...
-```
-
-### 3. Crawl Report
-```json
-{
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-  },
-  "results": {
-    "successful": [
-      {
-        "url": "https://example.com/page1",
-        "status": "success",
-        "markdown": "# Page Title\n\nContent...",
-        "timestamp": "2025-02-15T10:30:00.123456"
-      }
-    ],
-    "failed": [
-      {
-        "url": "https://example.com/page2",
-        "status": "failed",
-        "error": "HTTP 404: Not Found",
-        "timestamp": "2025-02-15T10:30:01.123456"
-      }
-    ]
-  },
-  "summary": {
-    "total": 2,
-    "successful": 1,
-    "failed": 1
-  }
-}
-```
-
-### 4. Webhook Notifications
-If configured, real-time updates are sent for each processed URL:
-```json
-{
-  "url": "https://example.com/page1",
-  "status": "success",
-  "markdown": "# Page Title\n\nContent...",
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-  }
-}
-```
-
-## Error Handling
-
-The package handles various types of errors gracefully:
-- Network errors
-- Timeout errors
-- Invalid URLs
-- Missing content
-- Service errors
-
-All errors are:
-1. Logged in the console
-2. Included in the JSON report
-3. Sent via webhook (if configured)
-4. Available in the results list
-
-## Requirements
-
-- Python 3.11 or later
-- Running SpiderForce4AI service
-- Internet connection
-
-## Performance Considerations
-
-1. Server-side Parallel Processing
-   - Best for most cases
-   - Single HTTP request for multiple URLs
-   - Less network overhead
-   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
-
-2. Client-side Parallel Processing
-   - Good for special cases requiring local control
-   - Uses Python multiprocessing
-   - More network overhead
-   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
-
-3. Async Processing
-   - Best for integration with async applications
-   - Good for real-time processing
-   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.1.dist-info/RECORD
DELETED

@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=lCviRhfLngSMehFJZwyK4LirPwbWEyZ0RJjCt5FkBcY,28304
-spiderforce4ai-1.1.dist-info/METADATA,sha256=lQfqXn0ifJOmOmLkgr8YTSYUFiu6-HS3YsRD0togylo,7769
-spiderforce4ai-1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.1.dist-info/RECORD,,
{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt
File without changes