spiderforce4ai 0.1.8__py3-none-any.whl → 1.0__py3-none-any.whl
- spiderforce4ai/__init__.py +158 -30
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA +18 -5
- spiderforce4ai-1.0.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None  # Optional webhook endpoint
     webhook_timeout: int = 10  # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
+    webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
+    save_reports: bool = False  # Whether to save crawl reports
+    report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
-        if self.
-            self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
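For orientation, a minimal sketch of how the new configuration fields might be set. This is an assumption-laden example, not part of the release: it presumes `CrawlConfig` is importable from the package root and that all fields not shown keep their defaults; the token value is a placeholder.

```python
from pathlib import Path
from spiderforce4ai import CrawlConfig  # assumed import path

# Sketch only: exercise the fields added in this release.
config = CrawlConfig(
    webhook_headers={"Authorization": "Bearer your-token"},   # placeholder token
    save_reports=True,                         # opt in to report saving
    report_file=Path("my_crawl_report.json"),  # otherwise defaults to output_dir / "crawl_report.json"
)
```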
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str)  # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
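The template branch above runs the string through `str.format()` and then `json.loads()`. A standalone sketch of that substitution follows; note that `str.format()` treats single braces as placeholders, so in this sketch the literal JSON braces are doubled so the round-trip parses (an assumption about how a custom template would need to be written, not something the diff states).

```python
import json

# Hypothetical template: outer braces are doubled so format() leaves them literal.
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}"
}}'''

payload_str = template.format(url="https://example.com", status="success")
payload = json.loads(payload_str)
print(payload)  # {'crawled_url': 'https://example.com', 'crawl_status': 'success'}
```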
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
             results.append(result)
 
-        # Save report if
-        if config.
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
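The new helper reuses a caller-supplied Rich progress bar when one is passed and only creates (and later stops) its own otherwise. Below is a self-contained sketch of that reuse-or-create pattern, with placeholder work standing in for the actual retries; it is an illustration, not code from the package.

```python
import time
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn

console = Console()

def run_with_progress(items, progress=None):
    # Create a progress bar only if the caller did not pass one in.
    should_close_progress = progress is None
    if progress is None:
        progress = Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        )
        progress.start()

    task = progress.add_task("[yellow]Retrying...", total=len(items))
    for item in items:
        time.sleep(0.1)  # placeholder for the real retry work
        progress.update(task, advance=1)

    # Only stop the bar if this function created it.
    if should_close_progress:
        progress.stop()

run_with_progress(["https://example.com/a", "https://example.com/b"])
```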
@@ -445,17 +514,46 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result
 
-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Calculate initial failure ratio
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+                results = initial_results
+            else:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                # Replace failed results with retry results
+                results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        #
-
-
-
-
-        console.print(f"
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         if config.report_file:
             console.print(f"📊 Report saved to: {config.report_file}")
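A quick worked example of the 20% gate above: with 50 URLs and 8 initial failures the ratio is 16%, so a single retry pass runs; with 11 failures it is 22% and the retry pass is skipped to avoid hammering a possibly overloaded server. The helper below is illustrative only and mirrors the arithmetic shown in the hunk.

```python
def should_retry(total_urls: int, initial_failed: int, threshold: float = 20.0) -> bool:
    # Retry only when the failure ratio stays at or below the threshold.
    failure_ratio = (initial_failed / total_urls) * 100
    return initial_failed > 0 and failure_ratio <= threshold

print(should_retry(50, 8))   # True  (16.0%)
print(should_retry(50, 11))  # False (22.0%)
```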
@@ -543,12 +641,42 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        #
-
-
-
-
-
+        # Calculate initial failure statistics
+        failed_results = [r for r in results if r.status == "failed"]
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+            else:
+                console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+                for result in failed_results:
+                    new_result = _process_url_parallel((result.url, self.base_url, config))
+                    if new_result.status == "success":
+                        console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                        # Replace the failed result with the successful retry
+                        results[results.index(result)] = new_result
+                    else:
+                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         return results
 
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 1.0
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30,  # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"),
-    webhook_url="https://your-webhook.com",
-    webhook_timeout=10,
-
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,                      # Webhook timeout
+    webhook_headers={                        # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{            # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False,                      # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json")    # Report location (used only if save_reports=True)
 )
 ```
 
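For context, a hypothetical end-to-end run with such a config: the service URL and constructor argument below are assumptions, while `crawl_urls_async` and `CrawlConfig` are the names shown in the diff above.

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # assumed SpiderForce4AI service URL
config = CrawlConfig(webhook_url="https://your-webhook.com", save_reports=True)

# Failed URLs are retried once automatically as long as the failure ratio stays at or below 20%.
results = asyncio.run(spider.crawl_urls_async(
    ["https://example.com", "https://example.com/about"], config
))
for r in results:
    print(r.url, r.status)
```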
spiderforce4ai-1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
+spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
+spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.0.dist-info/RECORD,,
spiderforce4ai-0.1.8.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt
File without changes