spiderforce4ai 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -57,22 +57,27 @@ class CrawlConfig:
57
57
  output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
58
58
  webhook_url: Optional[str] = None # Optional webhook endpoint
59
59
  webhook_timeout: int = 10 # Webhook timeout
60
- report_file: Optional[Path] = None # Optional report file location
60
+ webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
61
+ webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
62
+ save_reports: bool = False # Whether to save crawl reports
63
+ report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
61
64
 
62
65
  def __post_init__(self):
63
- # Initialize empty lists for selectors if None
66
+ # Initialize empty lists/dicts for None values
64
67
  self.remove_selectors = self.remove_selectors or []
65
68
  self.remove_selectors_regex = self.remove_selectors_regex or []
69
+ self.webhook_headers = self.webhook_headers or {}
66
70
 
67
71
  # Ensure output_dir is a Path and exists
68
72
  self.output_dir = Path(self.output_dir)
69
73
  self.output_dir.mkdir(parents=True, exist_ok=True)
70
74
 
71
- # If report_file is not specified, create it in output_dir
72
- if self.report_file is None:
73
- self.report_file = self.output_dir / "crawl_report.json"
74
- else:
75
- self.report_file = Path(self.report_file)
75
+ # Only setup report file if save_reports is True
76
+ if self.save_reports:
77
+ if self.report_file is None:
78
+ self.report_file = self.output_dir / "crawl_report.json"
79
+ else:
80
+ self.report_file = Path(self.report_file)
76
81
 
77
82
  def to_dict(self) -> Dict:
78
83
  """Convert config to dictionary for API requests."""
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
92
97
  if not config.webhook_url:
93
98
  return
94
99
 
95
- payload = {
96
- "url": result.url,
97
- "status": result.status,
98
- "markdown": result.markdown if result.status == "success" else None,
99
- "error": result.error if result.status == "failed" else None,
100
- "timestamp": result.timestamp,
101
- "config": config.to_dict()
102
- }
100
+ # Use custom payload template if provided, otherwise use default
101
+ if config.webhook_payload_template:
102
+ # Replace variables in the template
103
+ payload_str = config.webhook_payload_template.format(
104
+ url=result.url,
105
+ status=result.status,
106
+ markdown=result.markdown if result.status == "success" else None,
107
+ error=result.error if result.status == "failed" else None,
108
+ timestamp=result.timestamp,
109
+ config=config.to_dict()
110
+ )
111
+ payload = json.loads(payload_str) # Parse the formatted JSON string
112
+ else:
113
+ # Use default payload format
114
+ payload = {
115
+ "url": result.url,
116
+ "status": result.status,
117
+ "markdown": result.markdown if result.status == "success" else None,
118
+ "error": result.error if result.status == "failed" else None,
119
+ "timestamp": result.timestamp,
120
+ "config": config.to_dict()
121
+ }
103
122
 
104
123
  try:
105
124
  response = requests.post(
106
125
  config.webhook_url,
107
126
  json=payload,
127
+ headers=config.webhook_headers,
108
128
  timeout=config.webhook_timeout
109
129
  )
110
130
  response.raise_for_status()
@@ -276,8 +296,8 @@ class SpiderForce4AI:
276
296
 
277
297
  results.append(result)
278
298
 
279
- # Save report if configured
280
- if config.report_file:
299
+ # Save report if enabled
300
+ if config.save_reports:
281
301
  self._save_report_sync(results, config)
282
302
  print(f"\nReport saved to: {config.report_file}")
283
303
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
420
440
  """Synchronous version of crawl_url_async."""
421
441
  return asyncio.run(self.crawl_url_async(url, config))
422
442
 
443
+ async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
444
+ """Retry failed URLs once."""
445
+ if not failed_results:
446
+ return []
447
+
448
+ console.print("\n[yellow]Retrying failed URLs...[/yellow]")
449
+ retry_results = []
450
+
451
+ # Create a new progress bar if one wasn't provided
452
+ should_close_progress = progress is None
453
+ if progress is None:
454
+ progress = Progress(
455
+ SpinnerColumn(),
456
+ TextColumn("[progress.description]{task.description}"),
457
+ BarColumn(),
458
+ TaskProgressColumn(),
459
+ console=console
460
+ )
461
+ progress.start()
462
+
463
+ retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
464
+
465
+ for result in failed_results:
466
+ progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
467
+
468
+ try:
469
+ new_result = await self.crawl_url_async(result.url, config)
470
+ if new_result.status == "success":
471
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
472
+ else:
473
+ console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
474
+ retry_results.append(new_result)
475
+ except Exception as e:
476
+ console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
477
+ retry_results.append(CrawlResult(
478
+ url=result.url,
479
+ status="failed",
480
+ error=f"Retry error: {str(e)}",
481
+ config=config.to_dict()
482
+ ))
483
+
484
+ progress.update(retry_task, advance=1)
485
+ await asyncio.sleep(config.request_delay)
486
+
487
+ if should_close_progress:
488
+ progress.stop()
489
+
490
+ return retry_results
491
+
423
492
  async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
424
493
  """Crawl multiple URLs asynchronously with progress bar."""
425
494
  await self._ensure_session()
@@ -445,15 +514,27 @@ class SpiderForce4AI:
445
514
  await asyncio.sleep(config.request_delay)
446
515
  return result
447
516
 
448
- results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
517
+ initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
518
+
519
+ # Identify failed URLs
520
+ failed_results = [r for r in initial_results if r.status == "failed"]
521
+
522
+ # Retry failed URLs
523
+ if failed_results:
524
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
525
+
526
+ # Replace failed results with retry results
527
+ results = [r for r in initial_results if r.status == "success"] + retry_results
528
+ else:
529
+ results = initial_results
449
530
 
450
531
  # Save final report
451
532
  await self._save_report(config)
452
533
 
453
- # Print summary
534
+ # Print final summary
454
535
  successful = len([r for r in results if r.status == "success"])
455
536
  failed = len([r for r in results if r.status == "failed"])
456
- console.print(f"\n[green]Crawling completed:[/green]")
537
+ console.print(f"\n[green]Final crawling results:[/green]")
457
538
  console.print(f"✓ Successful: {successful}")
458
539
  console.print(f"✗ Failed: {failed}")
459
540
 
@@ -543,12 +624,25 @@ class SpiderForce4AI:
543
624
  self._save_report_sync(results, config)
544
625
  print(f"\nReport saved to: {config.report_file}")
545
626
 
546
- # Print summary
627
+ # Identify failed URLs and retry them
628
+ failed_results = [r for r in results if r.status == "failed"]
629
+ if failed_results:
630
+ console.print("\n[yellow]Retrying failed URLs...[/yellow]")
631
+ for result in failed_results:
632
+ new_result = _process_url_parallel((result.url, self.base_url, config))
633
+ if new_result.status == "success":
634
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
635
+ # Replace the failed result with the successful retry
636
+ results[results.index(result)] = new_result
637
+ else:
638
+ console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
639
+
640
+ # Print final summary
547
641
  successful = len([r for r in results if r.status == "success"])
548
642
  failed = len([r for r in results if r.status == "failed"])
549
- print(f"\nCrawling completed:")
550
- print(f"✓ Successful: {successful}")
551
- print(f"✗ Failed: {failed}")
643
+ console.print(f"\n[green]Final crawling results:[/green]")
644
+ console.print(f"✓ Successful: {successful}")
645
+ console.print(f"✗ Failed: {failed}")
552
646
 
553
647
  return results
554
648
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
117
117
  timeout=30, # Request timeout (seconds)
118
118
 
119
119
  # Output Settings
120
- output_dir=Path("spiderforce_reports"), # Default directory for files
121
- webhook_url="https://your-webhook.com", # Real-time notifications
122
- webhook_timeout=10, # Webhook timeout
123
- report_file=Path("crawl_report.json") # Final report location
120
+ output_dir=Path("spiderforce_reports"), # Default directory for files
121
+ webhook_url="https://your-webhook.com", # Real-time notifications
122
+ webhook_timeout=10, # Webhook timeout
123
+ webhook_headers={ # Optional custom headers for webhook
124
+ "Authorization": "Bearer your-token",
125
+ "X-Custom-Header": "value"
126
+ },
127
+ webhook_payload_template='''{ # Optional custom webhook payload template
128
+ "crawled_url": "{url}",
129
+ "content": "{markdown}",
130
+ "crawl_status": "{status}",
131
+ "crawl_error": "{error}",
132
+ "crawl_time": "{timestamp}",
133
+ "custom_field": "your-value"
134
+ }''',
135
+ save_reports=False, # Whether to save crawl reports (default: False)
136
+ report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
124
137
  )
125
138
  ```
126
139
 
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=oU_UIdzsQxExaVgD7NCaVm4G-9zMtKGnREfY6xL1uFY,26041
2
+ spiderforce4ai-0.1.9.dist-info/METADATA,sha256=poV1i_-H3AgzFhs9juRDJSfaWO0gVePb5JXN7ynL4Y4,7771
3
+ spiderforce4ai-0.1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-0.1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-0.1.9.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
2
- spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
3
- spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-0.1.8.dist-info/RECORD,,