spiderforce4ai 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +223 -22
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA +106 -75
- spiderforce4ai-0.1.9.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.7.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)

     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}

         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

-        #
-        if self.
-        self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)

     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
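Taken together, the new fields make report writing opt-in and webhook delivery configurable. A minimal usage sketch, not part of the diff; the field names are the ones added above, the values are placeholders:

```python
from pathlib import Path
from spiderforce4ai import CrawlConfig

# Reports are only written when save_reports is True; report_file then
# defaults to <output_dir>/crawl_report.json unless overridden.
config = CrawlConfig(
    save_reports=True,
    report_file=Path("reports/my_crawl.json"),            # placeholder path
    webhook_headers={"Authorization": "Bearer <token>"},  # placeholder token
)
```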
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return

-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }

     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
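The custom template is run through Python's str.format() with url, status, markdown, error, timestamp and config, then parsed with json.loads(), so after substitution it has to be valid JSON. A hedged sketch of a template (endpoint and JSON field names are placeholders); note that in this sketch the literal JSON braces are doubled so str.format() does not treat them as replacement fields:

```python
from spiderforce4ai import CrawlConfig

# {url}, {status}, {timestamp} are substituted by str.format() in the
# branch added above; {{ and }} render as literal braces in the JSON.
config = CrawlConfig(
    webhook_url="https://example.com/hooks/crawl",  # placeholder endpoint
    webhook_payload_template='''{{
        "page": "{url}",
        "state": "{status}",
        "finished_at": "{timestamp}"
    }}''',
)
```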
@@ -196,6 +216,113 @@ class SpiderForce4AI:
             await f.write(markdown)
         return filepath

+
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        # Process URLs using server-side parallel endpoint
+        return self.crawl_urls_server_parallel(urls, config)
+
+
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        print(f"Sending {len(urls)} URLs for parallel processing...")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()  # Assuming server returns JSON array of results
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    filepath = config.output_dir / f"{slugify(result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(result.markdown)
+
+                # Send webhook if configured
+                if config.webhook_url:
+                    _send_webhook_sync(result, config)
+
+                results.append(result)
+
+            # Save report if enabled
+            if config.save_reports:
+                self._save_report_sync(results, config)
+                print(f"\nReport saved to: {config.report_file}")
+
+            # Print summary
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+            print(f"\nParallel processing completed:")
+            print(f"✓ Successful: {successful}")
+            print(f"✗ Failed: {failed}")
+
+            return results
+
+        except Exception as e:
+            print(f"Error during parallel processing: {str(e)}")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+
     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
         """Send webhook with crawl results."""
         if not config.webhook_url:
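Both new methods send a single POST to the service's /convert_parallel endpoint instead of fanning requests out from the client. A rough usage sketch (service URL and page URLs are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
config = CrawlConfig(save_reports=True)

# One request carries every URL; the server handles the parallelism.
results = spider.crawl_urls_server_parallel(
    ["https://example.com/a", "https://example.com/b"], config
)

# Or let the sitemap variant collect the URLs first.
results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
```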
@@ -313,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))

+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -338,15 +514,27 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result

-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results

         # Save final report
         await self._save_report(config)

-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")

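With this change a URL that fails in crawl_urls_async is retried once via _retry_failed_urls above before the final summary is printed; callers do not need to do anything extra. A small async sketch (service URL and page URLs are placeholders, the async-with usage follows the README):

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main():
    spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
    config = CrawlConfig()
    async with spider:
        # Failed URLs are retried once automatically before the summary prints.
        return await spider.crawl_urls_async(
            ["https://example.com/a", "https://example.com/b"], config
        )

results = asyncio.run(main())
```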
@@ -436,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

-        #
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")

         return results

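The multiprocessing path gets the same one-shot retry, so from the caller's side nothing changes. A sketch, assuming this hunk sits in the client-side parallel crawl that the README's crawl_urls_parallel() exposes (URLs and service address are placeholders):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # placeholder service URL
config = CrawlConfig(max_concurrent_requests=5)

# Client-side (multiprocessing) crawl; failed URLs are retried once in-process.
results = spider.crawl_urls_parallel(
    ["https://example.com/a", "https://example.com/b"], config
)
failed = [r for r in results if r.status == "failed"]
```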
{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.7
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -24,75 +24,73 @@ Dynamic: requires-python

 # SpiderForce4AI Python Wrapper

-A Python
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.

 ## Quick Start (Minimal Setup)

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig

-# Initialize with your
+# Initialize with your service URL
 spider = SpiderForce4AI("http://localhost:3004")

-#
+# Create default config
 config = CrawlConfig()

 # Crawl a single URL
 result = spider.crawl_url("https://example.com", config)
 ```

+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
 ## Crawling Methods

-### 1. Single URL
+### 1. Single URL

 ```python
-#
+# Basic usage
 result = spider.crawl_url("https://example.com", config)

-#
+# Async version
 async def crawl():
     result = await spider.crawl_url_async("https://example.com", config)
 ```

-### 2. Multiple URLs
+### 2. Multiple URLs

 ```python
-# List of URLs
 urls = [
     "https://example.com/page1",
-    "https://example.com/page2"
-    "https://example.com/page3"
+    "https://example.com/page2"
 ]

-#
-results = spider.
+# Client-side parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+
+# Server-side parallel (single request)
+results = spider.crawl_urls_server_parallel(urls, config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_urls_async(urls, config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
 ```

 ### 3. Sitemap Crawling

 ```python
-#
-results = spider.
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

-#
+# Async version
 async def crawl():
     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 ```

 ## Configuration Options
@@ -100,9 +98,11 @@ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 All configuration options are optional with sensible defaults:

 ```python
+from pathlib import Path
+
 config = CrawlConfig(
     # Content Selection (all optional)
-    target_selector="article", # Specific element to
+    target_selector="article", # Specific element to extract
     remove_selectors=[ # Elements to remove
         ".ads",
         "#popup",
@@ -112,21 +112,34 @@ config = CrawlConfig(
     remove_selectors_regex=["modal-\\d+"], # Regex patterns for removal

     # Processing Settings
-    max_concurrent_requests=1, #
-    request_delay=0.5, # Delay between requests
-    timeout=30, # Request timeout
+    max_concurrent_requests=1, # For client-side parallel processing
+    request_delay=0.5, # Delay between requests (seconds)
+    timeout=30, # Request timeout (seconds)

     # Output Settings
-    output_dir="
-
-
-
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```

 ## Real-World Examples

-### 1. Basic
+### 1. Basic Blog Crawling

 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
@@ -134,78 +147,77 @@ from pathlib import Path

 spider = SpiderForce4AI("http://localhost:3004")
 config = CrawlConfig(
+    target_selector="article.post-content",
     output_dir=Path("blog_content")
 )

-result = spider.crawl_url("https://example.com/blog", config)
-print(f"Content saved to: {result.url}.md")
+result = spider.crawl_url("https://example.com/blog-post", config)
 ```

-### 2.
+### 2. Parallel Website Crawling

 ```python
 config = CrawlConfig(
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
     remove_selectors=[
         ".navigation",
         ".footer",
         ".ads",
         "#cookie-notice"
     ],
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
     webhook_url="https://your-webhook.com/endpoint"
 )

-
-
-
-
+# Using server-side parallel processing
+results = spider.crawl_urls_server_parallel([
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+], config)
 ```

-### 3.
+### 3. Full Sitemap Processing

 ```python
-
-
-
-
-
-
-)
-
-async with spider:
-    results = await spider.crawl_urls_async([
-        "https://example.com/1",
-        "https://example.com/2",
-        "https://example.com/3"
-    ], config)
-
-    return results
+config = CrawlConfig(
+    target_selector="main",
+    remove_selectors=[".sidebar", ".comments"],
+    output_dir=Path("site_content"),
+    report_file=Path("crawl_report.json")
+)

-results =
+results = spider.crawl_sitemap_server_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
 ```

 ## Output Structure

-### 1.
+### 1. Directory Layout
 ```
-
-├── example-com-page1.md
+spiderforce_reports/ # Default output directory
+├── example-com-page1.md # Converted markdown files
 ├── example-com-page2.md
-└── crawl_report.json
+└── crawl_report.json # Crawl report
 ```

 ### 2. Markdown Files
-Each
+Each file is named using a slugified version of the URL:
+```markdown
+# Page Title
+
+Content converted to clean markdown...
+```

-### 3. Report
+### 3. Crawl Report
 ```json
 {
   "timestamp": "2025-02-15T10:30:00.123456",
   "config": {
     "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-    "remove_selectors_regex": ["modal-\\d+"]
+    "remove_selectors": [".ads", "#popup"]
   },
   "results": {
     "successful": [
@@ -234,7 +246,7 @@ Each markdown file is named using a slugified version of the URL and contains th
 ```

 ### 4. Webhook Notifications
-If configured,
+If configured, real-time updates are sent for each processed URL:
 ```json
 {
   "url": "https://example.com/page1",
@@ -250,7 +262,7 @@ If configured, webhooks receive real-time updates in JSON format:

 ## Error Handling

-The package handles various types of errors:
+The package handles various types of errors gracefully:
 - Network errors
 - Timeout errors
 - Invalid URLs
@@ -269,6 +281,25 @@ All errors are:
 - Running SpiderForce4AI service
 - Internet connection

+## Performance Considerations
+
+1. Server-side Parallel Processing
+   - Best for most cases
+   - Single HTTP request for multiple URLs
+   - Less network overhead
+   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
+
+2. Client-side Parallel Processing
+   - Good for special cases requiring local control
+   - Uses Python multiprocessing
+   - More network overhead
+   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
+
+3. Async Processing
+   - Best for integration with async applications
+   - Good for real-time processing
+   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
+
 ## License

 MIT License
spiderforce4ai-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=oU_UIdzsQxExaVgD7NCaVm4G-9zMtKGnREfY6xL1uFY,26041
+spiderforce4ai-0.1.9.dist-info/METADATA,sha256=poV1i_-H3AgzFhs9juRDJSfaWO0gVePb5JXN7ynL4Y4,7771
+spiderforce4ai-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.9.dist-info/RECORD,,
spiderforce4ai-0.1.7.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=qLYHahjvFutdGmibbVZ7cfTd1mMM1FZNd_7nv-EMPtQ,17649
-spiderforce4ai-0.1.7.dist-info/METADATA,sha256=-eWd9exoMxMAYClp6rWHaX_H3md4hBlRq6CHhTJ1ACg,6575
-spiderforce4ai-0.1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.7.dist-info/RECORD,,
{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.7.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt
File without changes