spiderforce4ai 0.1.8__py3-none-any.whl → 1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None  # Optional webhook endpoint
     webhook_timeout: int = 10  # Webhook timeout
-    report_file: Optional[Path] = None  # Optional report file location
+    webhook_headers: Optional[Dict[str, str]] = None  # Optional webhook headers
+    webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
+    save_reports: bool = False  # Whether to save crawl reports
+    report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for selectors if None
+        # Initialize empty lists/dicts for None values
        self.remove_selectors = self.remove_selectors or []
        self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
        # Ensure output_dir is a Path and exists
        self.output_dir = Path(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        # If report_file is not specified, create it in output_dir
-        if self.report_file is None:
-            self.report_file = self.output_dir / "crawl_report.json"
-        else:
-            self.report_file = Path(self.report_file)
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
        """Convert config to dictionary for API requests."""
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload = {
-        "url": result.url,
-        "status": result.status,
-        "markdown": result.markdown if result.status == "success" else None,
-        "error": result.error if result.status == "failed" else None,
-        "timestamp": result.timestamp,
-        "config": config.to_dict()
-    }
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str)  # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
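Worth noting about the custom-template path: the template is filled with Python's `str.format()` and then parsed with `json.loads()`, so literal JSON braces in a template need to be doubled (`{{` / `}}`) or `format()` will try to read them as placeholders. A self-contained sketch of that flow, with placeholder values:

```python
import json

# Literal braces are escaped as {{ and }}; {url}, {status}, {markdown} are the
# substitution fields the wrapper passes to str.format().
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}",
    "content": "{markdown}"
}}'''

payload_str = template.format(
    url="https://example.com/page",   # placeholder values
    status="success",
    markdown="Example page body",
)
payload = json.loads(payload_str)
print(payload["crawled_url"])  # https://example.com/page
```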
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
         results.append(result)
 
-        # Save report if configured
-        if config.report_file:
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
@@ -445,17 +514,46 @@ class SpiderForce4AI:
                 await asyncio.sleep(config.request_delay)
                 return result
 
-        results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Calculate initial failure ratio
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+                results = initial_results
+            else:
+                retry_results = await self._retry_failed_urls(failed_results, config, progress)
+                # Replace failed results with retry results
+                results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        # Print summary
-        successful = len([r for r in results if r.status == "success"])
-        failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]Crawling completed:[/green]")
-        console.print(f"✓ Successful: {successful}")
-        console.print(f"✗ Failed: {failed}")
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         if config.report_file:
             console.print(f"📊 Report saved to: {config.report_file}")
@@ -543,12 +641,42 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        # Print summary
-        successful = len([r for r in results if r.status == "success"])
-        failed = len([r for r in results if r.status == "failed"])
-        print(f"\nCrawling completed:")
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        # Calculate initial failure statistics
+        failed_results = [r for r in results if r.status == "failed"]
+        initial_failed = len(failed_results)
+        total_urls = len(urls)
+        failure_ratio = (initial_failed / total_urls) * 100
+
+        # Retry failed URLs if ratio is acceptable
+        if failed_results:
+            if failure_ratio > 20:
+                console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
+            else:
+                console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+                for result in failed_results:
+                    new_result = _process_url_parallel((result.url, self.base_url, config))
+                    if new_result.status == "success":
+                        console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                        # Replace the failed result with the successful retry
+                        results[results.index(result)] = new_result
+                    else:
+                        console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Calculate final statistics
+        final_successful = len([r for r in results if r.status == "success"])
+        final_failed = len([r for r in results if r.status == "failed"])
+
+        # Print detailed summary
+        console.print(f"\n[green]Crawling Summary:[/green]")
+        console.print(f"Total URLs processed: {total_urls}")
+        console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
+        console.print(f"Final results:")
+        console.print(f"  ✓ Successful: {final_successful}")
+        console.print(f"  ✗ Failed: {final_failed}")
+
+        if initial_failed > 0:
+            retry_successful = initial_failed - final_failed
+            console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
 
         return results
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 1.0
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30,                              # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"),  # Default directory for files
-    webhook_url="https://your-webhook.com",  # Real-time notifications
-    webhook_timeout=10,                      # Webhook timeout
-    report_file=Path("crawl_report.json")    # Final report location
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,                      # Webhook timeout
+    webhook_headers={                        # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{            # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False,                      # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json")    # Report location (used only if save_reports=True)
 )
 ```
 
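For orientation, a hedged end-to-end sketch that exercises the 1.0 options from the README snippet above. The service address, webhook endpoint, and page URLs are placeholders, and only names that appear elsewhere in this diff (`SpiderForce4AI`, `CrawlConfig`, `crawl_urls_async`) are used:

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig  # import path as the README suggests

spider = SpiderForce4AI("http://localhost:3004")  # placeholder SpiderForce4AI service URL
config = CrawlConfig(
    webhook_url="https://example.com/hooks/crawl",            # placeholder webhook endpoint
    webhook_headers={"Authorization": "Bearer your-token"},   # sent with every webhook call
    save_reports=True,                                        # reports are opt-in as of 1.0
)

# crawl_urls_async is the batch entry point shown in this diff; it retries failed
# URLs once when the initial failure ratio is 20% or lower.
results = asyncio.run(spider.crawl_urls_async(
    ["https://example.com/a", "https://example.com/b"],       # placeholder pages
    config,
))
print(sum(1 for r in results if r.status == "success"), "pages converted")
```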
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
+spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
+spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.0.dist-info/RECORD,,
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
-spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
-spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.8.dist-info/RECORD,,