spiderforce4ai 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
- spiderforce4ai/__init__.py +118 -24
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA +18 -5
- spiderforce4ai-0.1.9.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)

     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}

         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

-        #
-        if self.
-            self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)

     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
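
Taken together, the new fields make report writing opt-in and let the webhook carry custom headers and a custom payload shape. A minimal sketch of how the updated CrawlConfig might be constructed — only the fields introduced in this release are shown, and the values are placeholders:

```python
from pathlib import Path

from spiderforce4ai import CrawlConfig

# Sketch of the 0.1.9 additions only; values are placeholders.
config = CrawlConfig(
    output_dir=Path("spiderforce_reports"),
    webhook_url="https://your-webhook.example/hook",
    webhook_headers={"Authorization": "Bearer your-token"},  # sent with every webhook POST
    save_reports=True,                      # reports are skipped entirely when False (the default)
    report_file=Path("crawl_report.json"),  # optional; defaults to output_dir / "crawl_report.json"
)
```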
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return

-    payload = {
-        "url": result.url,
-        "status": result.status,
-        "markdown": result.markdown if result.status == "success" else None,
-        "error": result.error if result.status == "failed" else None,
-        "timestamp": result.timestamp,
-        "config": config.to_dict()
-    }
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }

     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
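
When webhook_payload_template is set, the template is rendered with str.format and parsed back with json.loads, so whatever the template renders to must be valid JSON. A standalone sketch of that substitution step (placeholder values; the literal JSON braces are doubled here only because a bare str.format call treats single braces as replacement fields):

```python
import json

# Placeholder values standing in for a CrawlResult.
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}",
    "crawl_time": "{timestamp}"
}}'''

payload_str = template.format(
    url="https://example.com/page",
    status="success",
    timestamp="2025-02-01T12:00:00",
)
payload = json.loads(payload_str)  # fails here if the rendered template is not valid JSON
print(payload["crawl_status"])     # -> success
```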
@@ -276,8 +296,8 @@ class SpiderForce4AI:

             results.append(result)

-        # Save report if
-        if config.
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))

+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
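
_retry_failed_urls accepts an existing Rich Progress instance so retries can render inside the caller's progress display, and it only creates and stops a bar of its own when none is passed in. A condensed sketch of that ownership pattern (the retry call itself is elided; the item names are placeholders):

```python
from typing import Optional

from rich.console import Console
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskProgressColumn, TextColumn

console = Console()

def retry_items(items, progress: Optional[Progress] = None) -> None:
    # Reuse the caller's bar if one was provided; otherwise own one locally.
    owns_progress = progress is None
    if progress is None:
        progress = Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        )
        progress.start()

    task = progress.add_task("[yellow]Retrying failed URLs...", total=len(items))
    for item in items:
        progress.update(task, description=f"[yellow]Retrying: {item}")
        # ... the actual retry would happen here ...
        progress.update(task, advance=1)

    if owns_progress:
        progress.stop()  # only stop a bar this function started itself

retry_items(["https://example.com/a", "https://example.com/b"])
```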
@@ -445,15 +514,27 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result

-        results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results

         # Save final report
         await self._save_report(config)

-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")

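
The async batch crawl now keeps the first-pass results, retries only the failures once through _retry_failed_urls, and builds the final list from the first-pass successes plus whatever the retry pass returned. A standalone sketch of that merge logic with a simulated fetch (not the library API):

```python
import asyncio
import random

async def fetch(url: str) -> dict:
    # Simulated crawl that fails roughly a third of the time.
    status = "success" if random.random() > 0.3 else "failed"
    return {"url": url, "status": status}

async def crawl_all(urls: list) -> list:
    initial = await asyncio.gather(*(fetch(u) for u in urls))
    failed = [r for r in initial if r["status"] == "failed"]

    retried = []
    for r in failed:  # retried one at a time, as in the library's retry helper
        retried.append(await fetch(r["url"]))

    # First-pass successes plus retry outcomes, successful or not.
    return [r for r in initial if r["status"] == "success"] + retried

results = asyncio.run(crawl_all([f"https://example.com/{i}" for i in range(5)]))
print(sum(r["status"] == "success" for r in results), "of", len(results), "succeeded")
```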
@@ -543,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

-        #
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")

         return results

{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30, # Request timeout (seconds)

     # Output Settings
-    output_dir=Path("spiderforce_reports"),
-    webhook_url="https://your-webhook.com",
-    webhook_timeout=10,
-
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```

spiderforce4ai-0.1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=oU_UIdzsQxExaVgD7NCaVm4G-9zMtKGnREfY6xL1uFY,26041
+spiderforce4ai-0.1.9.dist-info/METADATA,sha256=poV1i_-H3AgzFhs9juRDJSfaWO0gVePb5JXN7ynL4Y4,7771
+spiderforce4ai-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.9.dist-info/RECORD,,
spiderforce4ai-0.1.8.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt
File without changes