spiderforce4ai-0.1.8-py3-none-any.whl → spiderforce4ai-1.0-py3-none-any.whl

@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None  # Optional webhook endpoint
     webhook_timeout: int = 10  # Webhook timeout
-    report_file: Optional[Path] = None  # Optional report file location
+    webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
+    webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
+    save_reports: bool = False  # Whether to save crawl reports
+    report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for selectors if None
+        # Initialize empty lists/dicts for None values
        self.remove_selectors = self.remove_selectors or []
        self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        # If report_file is not specified, create it in output_dir
-        if self.report_file is None:
-            self.report_file = self.output_dir / "crawl_report.json"
-        else:
-            self.report_file = Path(self.report_file)
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload = {
-        "url": result.url,
-        "status": result.status,
-        "markdown": result.markdown if result.status == "success" else None,
-        "error": result.error if result.status == "failed" else None,
-        "timestamp": result.timestamp,
-        "config": config.to_dict()
-    }
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str)  # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
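The new template branch formats `webhook_payload_template` with `str.format` and then parses the result with `json.loads`, so a JSON-shaped template generally needs its literal braces doubled (`{{` / `}}`) to survive formatting. A minimal sketch of that substitution under this assumption, with illustrative values:

```python
import json

# Template mirroring the webhook_payload_template idea; literal JSON braces are
# doubled so str.format() keeps them, while {url}, {status}, {timestamp} are placeholders.
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}",
    "crawl_time": "{timestamp}"
}}'''

payload_str = template.format(
    url="https://example.com/page",
    status="success",
    timestamp="2025-01-01T00:00:00Z",
)
payload = json.loads(payload_str)  # dict ready to POST as the webhook body
print(payload["crawled_url"])      # https://example.com/page
```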
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
         results.append(result)
 
-        # Save report if configured
-        if config.report_file:
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -445,17 +514,46 @@ class SpiderForce4AI:
                 await asyncio.sleep(config.request_delay)
                 return result
 
-        results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Calculate initial failure ratio
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+                results = initial_results
+            else:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                # Replace failed results with retry results
+                results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        # Print summary
-        successful = len([r for r in results if r.status == "success"])
-        failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]Crawling completed:[/green]")
-        console.print(f"✓ Successful: {successful}")
-        console.print(f"✗ Failed: {failed}")
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f" ✓ Successful: {final_successful}")
+        console.print(f" ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         if config.report_file:
             console.print(f"📊 Report saved to: {config.report_file}")
@@ -543,12 +641,42 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        # Print summary
-        successful = len([r for r in results if r.status == "success"])
-        failed = len([r for r in results if r.status == "failed"])
-        print(f"\nCrawling completed:")
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        # Calculate initial failure statistics
+        failed_results = [r for r in results if r.status == "failed"]
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+            else:
+                console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+                for result in failed_results:
+                    new_result = _process_url_parallel((result.url, self.base_url, config))
+                    if new_result.status == "success":
+                        console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                        # Replace the failed result with the successful retry
+                        results[results.index(result)] = new_result
+                    else:
+                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f" ✓ Successful: {final_successful}")
+        console.print(f" ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         return results
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 1.0
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30, # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"), # Default directory for files
-    webhook_url="https://your-webhook.com", # Real-time notifications
-    webhook_timeout=10, # Webhook timeout
-    report_file=Path("crawl_report.json") # Final report location
+    output_dir=Path("spiderforce_reports"),    # Default directory for files
+    webhook_url="https://your-webhook.com",    # Real-time notifications
+    webhook_timeout=10,                        # Webhook timeout
+    webhook_headers={                          # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{              # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False,                        # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json")      # Report location (used only if save_reports=True)
 )
 ```
 
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
+spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
+spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.0.dist-info/RECORD,,
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,