spiderforce4ai-1.1-py3-none-any.whl → spiderforce4ai-1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +43 -12
- spiderforce4ai-1.3.dist-info/METADATA +298 -0
- spiderforce4ai-1.3.dist-info/RECORD +5 -0
- spiderforce4ai-1.1.dist-info/METADATA +0 -309
- spiderforce4ai-1.1.dist-info/RECORD +0 -5
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -350,17 +350,23 @@ class SpiderForce4AI:
 
     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
         """Save crawl report synchronously."""
+        # Separate successful and failed results
+        successful_results = [r for r in results if r.status == "success"]
+        failed_results = [r for r in results if r.status == "failed"]
+
+        # Create report with only final state
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
@@ -372,17 +378,22 @@ class SpiderForce4AI:
         if not config.report_file:
             return
 
+        # Separate successful and failed results
+        successful_results = [r for r in self.crawl_results if r.status == "success"]
+        failed_results = [r for r in self.crawl_results if r.status == "failed"]
+
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in
-                "failed": [asdict(r) for r in
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(self.crawl_results),
-                "successful": len(
-                "failed": len(
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
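The two hunks above make the same change in the synchronous and asynchronous report writers: results are split by status once, the two lists are reused for both the detailed listing and the summary counts, and the summary gains a `retry_info` field sourced from `self._retry_stats` when it exists. Below is a minimal standalone sketch of that report shape; the `CrawlResult` fields are assumed from the diff context and the package README, not taken verbatim from the package source.

```python
# Illustrative sketch only; CrawlResult fields are assumptions, not the package's actual class.
from dataclasses import dataclass, asdict, field
from datetime import datetime
from typing import List, Optional


@dataclass
class CrawlResult:
    url: str
    status: str                    # "success" or "failed"
    markdown: Optional[str] = None
    error: Optional[str] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


def build_report(results: List[CrawlResult], config_dict: dict, retry_stats: dict) -> dict:
    # Split once, then reuse the two lists for both sections of the report
    successful = [r for r in results if r.status == "success"]
    failed = [r for r in results if r.status == "failed"]
    return {
        "timestamp": datetime.now().isoformat(),
        "config": config_dict,
        "results": {
            "successful": [asdict(r) for r in successful],
            "failed": [asdict(r) for r in failed],  # only URLs still failed after retries
        },
        "summary": {
            "total": len(results),
            "successful": len(successful),
            "failed": len(failed),
            "retry_info": retry_stats,  # mirrors getattr(self, '_retry_stats', {})
        },
    }
```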
@@ -535,8 +546,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                #
-                results =
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results
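The retry merge above walks the full results list once per retried URL. The same behaviour in isolation, written as a URL-keyed lookup, looks like the sketch below; it assumes each URL appears at most once in the initial results, which is what makes the two forms equivalent.

```python
# Sketch of the retry merge in isolation (not the package's code).
# Assumes each URL occurs at most once in initial_results.
def merge_retries(initial_results, retry_results):
    retried_by_url = {r.url: r for r in retry_results}
    # Preserve the original ordering; swap in the retried outcome where one exists
    return [retried_by_url.get(r.url, r) for r in initial_results]
```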
@@ -661,12 +677,27 @@ class SpiderForce4AI:
             console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
             for result in failed_results:
                 new_result = _process_url_parallel((result.url, self.base_url, config))
+
+                # Save markdown and trigger webhook for successful retries
                 if new_result.status == "success":
                     console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                    #
-
+                    # Save markdown if output directory is configured
+                    if config.output_dir and new_result.markdown:
+                        filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                        with open(filepath, 'w', encoding='utf-8') as f:
+                            f.write(new_result.markdown)
+                    # Send webhook for successful retry
+                    _send_webhook_sync(new_result, config)
                 else:
                     console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                    # Send webhook for failed retry
+                    _send_webhook_sync(new_result, config)
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
             # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
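Taken together, these hunks mean a retried URL ends up exactly once in the final results and in the saved report, whose summary now also exposes whatever the crawler stored in `_retry_stats`. A hedged example of consuming such a report follows; the path is an assumption based on the defaults shown in the package README, and the keys inside `retry_info` depend on the crawler, so they are printed generically.

```python
# Read a saved crawl report and print the summary, including retry_info.
# The report path is an assumption based on the README defaults, not a fixed location.
import json
from pathlib import Path

report_path = Path("spiderforce_reports/crawl_report.json")
report = json.loads(report_path.read_text(encoding="utf-8"))

summary = report["summary"]
print(f"total={summary['total']} successful={summary['successful']} failed={summary['failed']}")
for key, value in summary.get("retry_info", {}).items():  # keys depend on the crawler's _retry_stats
    print(f"retry stat {key}: {value}")
```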
spiderforce4ai-1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,298 @@
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.3
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Features

- HTML to Markdown conversion
- Parallel and async crawling support
- Sitemap processing
- Custom content selection
- Automatic retry mechanism
- Detailed progress tracking
- Webhook notifications
- Customizable reporting

## Installation

```bash
pip install spiderforce4ai
```

## Quick Start

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

# Initialize crawler
spider = SpiderForce4AI("http://localhost:3004")

# Configure crawling options
config = CrawlConfig(
    target_selector="article",
    remove_selectors=[".ads", ".navigation"],
    max_concurrent_requests=5,
    save_reports=True
)

# Crawl a sitemap
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```

## Key Features

### 1. Smart Retry Mechanism
- Automatically retries failed URLs
- Monitors failure ratio to prevent server overload
- Detailed retry statistics and progress tracking
- Aborts retries if failure rate exceeds 20%

```python
# Retry behavior is automatic
config = CrawlConfig(
    max_concurrent_requests=5,
    request_delay=1.0  # Delay between retries
)
results = spider.crawl_urls_async(urls, config)
```

### 2. Custom Webhook Integration
- Flexible payload formatting
- Custom headers support
- Variable substitution in templates

```python
config = CrawlConfig(
    webhook_url="https://your-webhook.com",
    webhook_headers={
        "Authorization": "Bearer token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "custom_field": "value"
    }'''
)
```

### 3. Flexible Report Generation
- Optional report saving
- Customizable report location
- Detailed success/failure statistics

```python
config = CrawlConfig(
    save_reports=True,
    report_file=Path("custom_report.json"),
    output_dir=Path("content")
)
```

## Crawling Methods

### 1. Single URL Processing

```python
# Synchronous
result = spider.crawl_url("https://example.com", config)

# Asynchronous
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = ["https://example.com/page1", "https://example.com/page2"]

# Server-side parallel (recommended)
results = spider.crawl_urls_server_parallel(urls, config)

# Client-side parallel
results = spider.crawl_urls_parallel(urls, config)

# Asynchronous
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Processing

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Asynchronous
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

```python
config = CrawlConfig(
    # Content Selection
    target_selector="article",              # Target element to extract
    remove_selectors=[".ads", "#popup"],    # Elements to remove
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing
    max_concurrent_requests=5,              # Parallel processing limit
    request_delay=0.5,                      # Delay between requests
    timeout=30,                             # Request timeout

    # Output
    output_dir=Path("content"),             # Output directory
    save_reports=False,                     # Enable/disable report saving
    report_file=Path("report.json"),        # Report location

    # Webhook
    webhook_url="https://webhook.com",      # Webhook endpoint
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Custom headers
        "Authorization": "Bearer token"
    },
    webhook_payload_template='''            # Custom payload format
    {
        "url": "{url}",
        "content": "{markdown}",
        "status": "{status}",
        "error": "{error}",
        "time": "{timestamp}"
    }'''
)
```

## Progress Tracking

The package provides detailed progress information:

```
Fetching sitemap from https://example.com/sitemap.xml...
Found 156 URLs in sitemap
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs

Retrying failed URLs: 18 (11.5% failed)
[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries

Crawling Summary:
Total URLs processed: 156
Initial failures: 18 (11.5%)
Final results:
  ✓ Successful: 150
  ✗ Failed: 6
Retry success rate: 12/18 (66.7%)
```

## Output Structure

### 1. Directory Layout
```
content/                   # Output directory
├── example-com-page1.md   # Markdown files
├── example-com-page2.md
└── report.json            # Crawl report
```

### 2. Report Format
```json
{
  "timestamp": "2025-02-15T10:30:00",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads"]
  },
  "results": {
    "successful": [...],
    "failed": [...]
  },
  "summary": {
    "total": 156,
    "successful": 150,
    "failed": 6
  }
}
```

## Performance Optimization

1. Server-side Parallel Processing
   - Recommended for most cases
   - Single HTTP request
   - Reduced network overhead
   - Built-in load balancing

2. Client-side Parallel Processing
   - Better control over processing
   - Customizable concurrency
   - Progress tracking per URL
   - Automatic retry handling

3. Asynchronous Processing
   - Ideal for async applications
   - Non-blocking operation
   - Real-time progress updates
   - Efficient resource usage

## Error Handling

The package provides comprehensive error handling:

- Automatic retry for failed URLs
- Failure ratio monitoring
- Detailed error reporting
- Webhook error notifications
- Progress tracking during retries

## Requirements

- Python 3.11+
- Running SpiderForce4AI service
- Internet connection

## Dependencies

- aiohttp
- asyncio
- rich
- aiofiles
- httpx

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
spiderforce4ai/__init__.py,sha256=Fbgqu9uPg0wuWZgiVYNTv6CkkcOHgU_f5-uoXRKhgn4,29922
spiderforce4ai-1.3.dist-info/METADATA,sha256=ybuwcVE62JFnWJKcCdHDrOOqmbuh8PEzF69_yFK-eV0,7183
spiderforce4ai-1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.3.dist-info/RECORD,,
spiderforce4ai-1.1.dist-info/METADATA
REMOVED
@@ -1,309 +0,0 @@
Metadata-Version: 2.2
Name: spiderforce4ai
Version: 1.1
Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
Home-page: https://petertam.pro
Author: Piotr Tamulewicz
Author-email: Piotr Tamulewicz <pt@petertam.pro>
License: MIT
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.11
Description-Content-Type: text/markdown
Requires-Dist: aiohttp>=3.8.0
Requires-Dist: asyncio>=3.4.3
Requires-Dist: rich>=10.0.0
Requires-Dist: aiofiles>=0.8.0
Requires-Dist: httpx>=0.24.0
Dynamic: author
Dynamic: home-page
Dynamic: requires-python

# SpiderForce4AI Python Wrapper

A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

## Quick Start (Minimal Setup)

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Initialize with your service URL
spider = SpiderForce4AI("http://localhost:3004")

# Create default config
config = CrawlConfig()

# Crawl a single URL
result = spider.crawl_url("https://example.com", config)
```

## Installation

```bash
pip install spiderforce4ai
```

## Crawling Methods

### 1. Single URL

```python
# Basic usage
result = spider.crawl_url("https://example.com", config)

# Async version
async def crawl():
    result = await spider.crawl_url_async("https://example.com", config)
```

### 2. Multiple URLs

```python
urls = [
    "https://example.com/page1",
    "https://example.com/page2"
]

# Client-side parallel (using multiprocessing)
results = spider.crawl_urls_parallel(urls, config)

# Server-side parallel (single request)
results = spider.crawl_urls_server_parallel(urls, config)

# Async version
async def crawl():
    results = await spider.crawl_urls_async(urls, config)
```

### 3. Sitemap Crawling

```python
# Server-side parallel (recommended)
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)

# Client-side parallel
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

# Async version
async def crawl():
    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
```

## Configuration Options

All configuration options are optional with sensible defaults:

```python
from pathlib import Path

config = CrawlConfig(
    # Content Selection (all optional)
    target_selector="article",              # Specific element to extract
    remove_selectors=[                      # Elements to remove
        ".ads",
        "#popup",
        ".navigation",
        ".footer"
    ],
    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal

    # Processing Settings
    max_concurrent_requests=1,              # For client-side parallel processing
    request_delay=0.5,                      # Delay between requests (seconds)
    timeout=30,                             # Request timeout (seconds)

    # Output Settings
    output_dir=Path("spiderforce_reports"), # Default directory for files
    webhook_url="https://your-webhook.com", # Real-time notifications
    webhook_timeout=10,                     # Webhook timeout
    webhook_headers={                       # Optional custom headers for webhook
        "Authorization": "Bearer your-token",
        "X-Custom-Header": "value"
    },
    webhook_payload_template='''{           # Optional custom webhook payload template
        "crawled_url": "{url}",
        "content": "{markdown}",
        "crawl_status": "{status}",
        "crawl_error": "{error}",
        "crawl_time": "{timestamp}",
        "custom_field": "your-value"
    }''',
    save_reports=False,                     # Whether to save crawl reports (default: False)
    report_file=Path("crawl_report.json")   # Report location (used only if save_reports=True)
)
```

## Real-World Examples

### 1. Basic Blog Crawling

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig
from pathlib import Path

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig(
    target_selector="article.post-content",
    output_dir=Path("blog_content")
)

result = spider.crawl_url("https://example.com/blog-post", config)
```

### 2. Parallel Website Crawling

```python
config = CrawlConfig(
    remove_selectors=[
        ".navigation",
        ".footer",
        ".ads",
        "#cookie-notice"
    ],
    max_concurrent_requests=5,
    output_dir=Path("website_content"),
    webhook_url="https://your-webhook.com/endpoint"
)

# Using server-side parallel processing
results = spider.crawl_urls_server_parallel([
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
], config)
```

### 3. Full Sitemap Processing

```python
config = CrawlConfig(
    target_selector="main",
    remove_selectors=[".sidebar", ".comments"],
    output_dir=Path("site_content"),
    report_file=Path("crawl_report.json")
)

results = spider.crawl_sitemap_server_parallel(
    "https://example.com/sitemap.xml",
    config
)
```

## Output Structure

### 1. Directory Layout
```
spiderforce_reports/          # Default output directory
├── example-com-page1.md      # Converted markdown files
├── example-com-page2.md
└── crawl_report.json         # Crawl report
```

### 2. Markdown Files
Each file is named using a slugified version of the URL:
```markdown
# Page Title

Content converted to clean markdown...
```

### 3. Crawl Report
```json
{
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  },
  "results": {
    "successful": [
      {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page Title\n\nContent...",
        "timestamp": "2025-02-15T10:30:00.123456"
      }
    ],
    "failed": [
      {
        "url": "https://example.com/page2",
        "status": "failed",
        "error": "HTTP 404: Not Found",
        "timestamp": "2025-02-15T10:30:01.123456"
      }
    ]
  },
  "summary": {
    "total": 2,
    "successful": 1,
    "failed": 1
  }
}
```

### 4. Webhook Notifications
If configured, real-time updates are sent for each processed URL:
```json
{
  "url": "https://example.com/page1",
  "status": "success",
  "markdown": "# Page Title\n\nContent...",
  "timestamp": "2025-02-15T10:30:00.123456",
  "config": {
    "target_selector": "article",
    "remove_selectors": [".ads", "#popup"]
  }
}
```

## Error Handling

The package handles various types of errors gracefully:
- Network errors
- Timeout errors
- Invalid URLs
- Missing content
- Service errors

All errors are:
1. Logged in the console
2. Included in the JSON report
3. Sent via webhook (if configured)
4. Available in the results list

## Requirements

- Python 3.11 or later
- Running SpiderForce4AI service
- Internet connection

## Performance Considerations

1. Server-side Parallel Processing
   - Best for most cases
   - Single HTTP request for multiple URLs
   - Less network overhead
   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`

2. Client-side Parallel Processing
   - Good for special cases requiring local control
   - Uses Python multiprocessing
   - More network overhead
   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`

3. Async Processing
   - Best for integration with async applications
   - Good for real-time processing
   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`

## License

MIT License

## Credits

Created by [Peter Tam](https://petertam.pro)
spiderforce4ai-1.1.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
spiderforce4ai/__init__.py,sha256=lCviRhfLngSMehFJZwyK4LirPwbWEyZ0RJjCt5FkBcY,28304
spiderforce4ai-1.1.dist-info/METADATA,sha256=lQfqXn0ifJOmOmLkgr8YTSYUFiu6-HS3YsRD0togylo,7769
spiderforce4ai-1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
spiderforce4ai-1.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
spiderforce4ai-1.1.dist-info/RECORD,,
{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.1.dist-info → spiderforce4ai-1.3.dist-info}/top_level.txt
File without changes