spiderforce4ai 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +118 -24
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA +18 -5
- spiderforce4ai-0.1.9.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)

     def __post_init__(self):
-        # Initialize empty lists for
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}

         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

-        #
-        if self.
-            self.report_file
-
-
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)

     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
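The four new CrawlConfig fields above are all optional, and __post_init__ only resolves report_file when save_reports is enabled. A minimal sketch of that behavior, assuming the remaining CrawlConfig fields keep their defaults (the constructor calls below are illustrative, not taken from the package's docs):

```python
from spiderforce4ai import CrawlConfig

# save_reports defaults to False, so __post_init__ leaves report_file as None
cfg = CrawlConfig()
print(cfg.report_file)  # None

# With save_reports=True and no explicit report_file, the report falls back
# to <output_dir>/crawl_report.json (spiderforce_reports/crawl_report.json here)
cfg = CrawlConfig(save_reports=True)
print(cfg.report_file)
```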
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return

-    payload
-
-
-
-
-
-
-
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }

     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
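The custom-template branch above is plain str.format substitution followed by json.loads, so whatever the template produces has to stay valid JSON. A standalone sketch of that mechanic (the template and values here are made up; note that in a raw Python string passed straight to str.format, literal JSON braces have to be doubled so they are not treated as substitution fields):

```python
import json

template = '{{"crawled_url": "{url}", "crawl_status": "{status}"}}'
payload_str = template.format(url="https://example.com", status="success")
payload = json.loads(payload_str)
print(payload)  # {'crawled_url': 'https://example.com', 'crawl_status': 'success'}
```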
@@ -276,8 +296,8 @@ class SpiderForce4AI:

             results.append(result)

-        # Save report if
-        if config.
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))

+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -445,15 +514,27 @@ class SpiderForce4AI:
                 await asyncio.sleep(config.request_delay)
                 return result

-
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results

         # Save final report
         await self._save_report(config)

-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")

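With this hunk, crawl_urls_async gathers one pass over the URL list, collects the failures, and runs them through _retry_failed_urls before printing the final counts. A hedged usage sketch; the service URL, the URLs, and session handling below are placeholders rather than values from this diff:

```python
import asyncio
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main():
    spider = SpiderForce4AI("http://localhost:3004")  # placeholder SpiderForce4AI service URL
    config = CrawlConfig(output_dir=Path("spiderforce_reports"))
    urls = ["https://example.com", "https://example.com/about"]

    # First pass plus one retry pass over any failed URLs
    results = await spider.crawl_urls_async(urls, config)
    print(f"successful: {len([r for r in results if r.status == 'success'])}")

asyncio.run(main())
```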
@@ -543,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")

-        #
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")

         return results

{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30, # Request timeout (seconds)

     # Output Settings
-    output_dir=Path("spiderforce_reports"),
-    webhook_url="https://your-webhook.com",
-    webhook_timeout=10,
-
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```

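The webhook_headers and webhook_payload_template options only shape what gets POSTed; the receiving end is up to you. As a sketch only, a minimal Flask endpoint (Flask is an assumption here, any HTTP handler works) that would accept the custom payload keys shown in the template above:

```python
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/webhook", methods=["POST"])
def webhook():
    payload = request.get_json()
    # Keys below follow the custom webhook_payload_template example above
    print(payload.get("crawled_url"), payload.get("crawl_status"))
    return jsonify({"received": True}), 200

if __name__ == "__main__":
    app.run(port=8000)
```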
spiderforce4ai-0.1.9.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=oU_UIdzsQxExaVgD7NCaVm4G-9zMtKGnREfY6xL1uFY,26041
+spiderforce4ai-0.1.9.dist-info/METADATA,sha256=poV1i_-H3AgzFhs9juRDJSfaWO0gVePb5JXN7ynL4Y4,7771
+spiderforce4ai-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.9.dist-info/RECORD,,
spiderforce4ai-0.1.8.dist-info/RECORD
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.8.dist-info → spiderforce4ai-0.1.9.dist-info}/top_level.txt
File without changes