spiderforce4ai 0.1.8__py3-none-any.whl → 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +158 -30
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA +18 -5
- spiderforce4ai-1.0.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None  # Optional webhook endpoint
     webhook_timeout: int = 10  # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
+    webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
+    save_reports: bool = False  # Whether to save crawl reports
+    report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
-        if self.
-        self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
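In 1.0 the report path is resolved only when report saving is enabled; otherwise `report_file` is left untouched. A minimal sketch of that rule, rewritten as a standalone helper purely for illustration (the helper name is hypothetical, not part of the package):

```python
from pathlib import Path
from typing import Optional

def resolve_report_file(save_reports: bool, report_file: Optional[Path], output_dir: Path) -> Optional[Path]:
    # Mirrors the __post_init__ logic above: no report file unless save_reports is True.
    if not save_reports:
        return report_file
    return Path(report_file) if report_file is not None else output_dir / "crawl_report.json"

print(resolve_report_file(False, None, Path("spiderforce_reports")))  # None
print(resolve_report_file(True, None, Path("spiderforce_reports")))   # spiderforce_reports/crawl_report.json
```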
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str)  # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
             results.append(result)
 
-        # Save report if
-        if config.
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -445,17 +514,46 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result
 
-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Calculate initial failure ratio
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+                results = initial_results
+            else:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                # Replace failed results with retry results
+                results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        #
-
-
-
-
-        console.print(f"
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         if config.report_file:
             console.print(f"📊 Report saved to: {config.report_file}")
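The batch retry is gated on the share of URLs that failed on the first pass: above 20% the retry is skipped on the assumption that the server itself is overloaded. A tiny standalone sketch of that gate (the helper name is hypothetical, not part of the package):

```python
def should_retry(total_urls: int, failed: int, max_failure_pct: float = 20.0) -> bool:
    # Mirrors the gate in crawl_urls_async: skip retries when too much of the batch failed.
    if failed == 0 or total_urls == 0:
        return False
    return (failed / total_urls) * 100 <= max_failure_pct

print(should_retry(total_urls=50, failed=5))   # True  -> 10.0% failed, retry once
print(should_retry(total_urls=50, failed=15))  # False -> 30.0% failed, possible server overload
```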
@@ -543,12 +641,42 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        #
-
-
-
-
-
+        # Calculate initial failure statistics
+        failed_results = [r for r in results if r.status == "failed"]
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+            else:
+                console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+                for result in failed_results:
+                    new_result = _process_url_parallel((result.url, self.base_url, config))
+                    if new_result.status == "success":
+                        console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                        # Replace the failed result with the successful retry
+                        results[results.index(result)] = new_result
+                    else:
+                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         return results
 
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version:
+Version: 1.0
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30,  # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"),
-    webhook_url="https://your-webhook.com",
-    webhook_timeout=10,
-
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,  # Webhook timeout
+    webhook_headers={  # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{  # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False,  # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json")  # Report location (used only if save_reports=True)
 )
 ```
 
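For reference, a minimal receiver for the webhook configured above could look like the sketch below, using only the standard library; the port, the bearer-token check, and the payload field names (which follow the custom template in the README snippet) are assumptions for illustration, not part of the package.

```python
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Reject requests that don't carry the bearer token set in webhook_headers.
        if self.headers.get("Authorization") != "Bearer your-token":
            self.send_response(401)
            self.end_headers()
            return
        length = int(self.headers.get("Content-Length", 0))
        payload = json.loads(self.rfile.read(length))
        # Field names match the custom payload template shown above.
        print(payload.get("crawl_status"), payload.get("crawled_url"))
        self.send_response(200)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 8000), WebhookHandler).serve_forever()
```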
spiderforce4ai-1.0.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
+spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
+spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.0.dist-info/RECORD,,
spiderforce4ai-0.1.8.dist-info/RECORD
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-1.0.dist-info}/top_level.txt
File without changes