spiderforce4ai 0.1.8.tar.gz → 0.1.9.tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.8
+Version: 0.1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -117,10 +117,23 @@ config = CrawlConfig(
     timeout=30, # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"), # Default directory for files
-    webhook_url="https://your-webhook.com", # Real-time notifications
-    webhook_timeout=10, # Webhook timeout
-    report_file=Path("crawl_report.json") # Final report location
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```
 
@@ -93,10 +93,23 @@ config = CrawlConfig(
     timeout=30, # Request timeout (seconds)
 
     # Output Settings
-    output_dir=Path("spiderforce_reports"), # Default directory for files
-    webhook_url="https://your-webhook.com", # Real-time notifications
-    webhook_timeout=10, # Webhook timeout
-    report_file=Path("crawl_report.json") # Final report location
+    output_dir=Path("spiderforce_reports"), # Default directory for files
+    webhook_url="https://your-webhook.com", # Real-time notifications
+    webhook_timeout=10, # Webhook timeout
+    webhook_headers={ # Optional custom headers for webhook
+        "Authorization": "Bearer your-token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{ # Optional custom webhook payload template
+        "crawled_url": "{url}",
+        "content": "{markdown}",
+        "crawl_status": "{status}",
+        "crawl_error": "{error}",
+        "crawl_time": "{timestamp}",
+        "custom_field": "your-value"
+    }''',
+    save_reports=False, # Whether to save crawl reports (default: False)
+    report_file=Path("crawl_report.json") # Report location (used only if save_reports=True)
 )
 ```
 
@@ -269,4 +282,4 @@ MIT License
 
 ## Credits
 
-Created by [Peter Tam](https://petertam.pro)
+Created by [Peter Tam](https://petertam.pro)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.8"
+version = "0.1.9"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="spiderforce4ai",
-    version="0.1.8",
+    version="0.1.9",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -57,22 +57,27 @@ class CrawlConfig:
     output_dir: Path = Path("spiderforce_reports") # Default to spiderforce_reports in current directory
     webhook_url: Optional[str] = None # Optional webhook endpoint
     webhook_timeout: int = 10 # Webhook timeout
-    report_file: Optional[Path] = None # Optional report file location
+    webhook_headers: Optional[Dict[str, str]] = None # Optional webhook headers
+    webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
+    save_reports: bool = False # Whether to save crawl reports
+    report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
 
     def __post_init__(self):
-        # Initialize empty lists for selectors if None
+        # Initialize empty lists/dicts for None values
         self.remove_selectors = self.remove_selectors or []
         self.remove_selectors_regex = self.remove_selectors_regex or []
+        self.webhook_headers = self.webhook_headers or {}
 
         # Ensure output_dir is a Path and exists
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        # If report_file is not specified, create it in output_dir
-        if self.report_file is None:
-            self.report_file = self.output_dir / "crawl_report.json"
-        else:
-            self.report_file = Path(self.report_file)
+        # Only setup report file if save_reports is True
+        if self.save_reports:
+            if self.report_file is None:
+                self.report_file = self.output_dir / "crawl_report.json"
+            else:
+                self.report_file = Path(self.report_file)
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
@@ -92,19 +97,34 @@ def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
     if not config.webhook_url:
         return
 
-    payload = {
-        "url": result.url,
-        "status": result.status,
-        "markdown": result.markdown if result.status == "success" else None,
-        "error": result.error if result.status == "failed" else None,
-        "timestamp": result.timestamp,
-        "config": config.to_dict()
-    }
+    # Use custom payload template if provided, otherwise use default
+    if config.webhook_payload_template:
+        # Replace variables in the template
+        payload_str = config.webhook_payload_template.format(
+            url=result.url,
+            status=result.status,
+            markdown=result.markdown if result.status == "success" else None,
+            error=result.error if result.status == "failed" else None,
+            timestamp=result.timestamp,
+            config=config.to_dict()
+        )
+        payload = json.loads(payload_str) # Parse the formatted JSON string
+    else:
+        # Use default payload format
+        payload = {
+            "url": result.url,
+            "status": result.status,
+            "markdown": result.markdown if result.status == "success" else None,
+            "error": result.error if result.status == "failed" else None,
+            "timestamp": result.timestamp,
+            "config": config.to_dict()
+        }
 
     try:
         response = requests.post(
             config.webhook_url,
             json=payload,
+            headers=config.webhook_headers,
             timeout=config.webhook_timeout
         )
         response.raise_for_status()
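
The templated branch above is ordinary `str.format` followed by `json.loads`. A standalone illustration of that flow (not the package's own code): because `str.format` reserves `{` and `}` for placeholders, literal JSON braces are doubled in this sketch.

```python
import json

# Illustration of the format-then-parse flow; the template and values are
# hypothetical. Literal JSON braces are written as {{ and }} so that
# str.format does not treat them as placeholders.
template = '''{{
    "crawled_url": "{url}",
    "crawl_status": "{status}",
    "crawl_time": "{timestamp}"
}}'''

payload_str = template.format(
    url="https://example.com",
    status="success",
    timestamp="2024-01-01T00:00:00Z",
)
payload = json.loads(payload_str)   # dict ready to be POSTed as JSON
print(payload["crawl_status"])      # -> success
```
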
@@ -276,8 +296,8 @@ class SpiderForce4AI:
 
             results.append(result)
 
-        # Save report if configured
-        if config.report_file:
+        # Save report if enabled
+        if config.save_reports:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
@@ -420,6 +440,55 @@ class SpiderForce4AI:
         """Synchronous version of crawl_url_async."""
         return asyncio.run(self.crawl_url_async(url, config))
 
+    async def _retry_failed_urls(self, failed_results: List[CrawlResult], config: CrawlConfig, progress=None) -> List[CrawlResult]:
+        """Retry failed URLs once."""
+        if not failed_results:
+            return []
+
+        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        retry_results = []
+
+        # Create a new progress bar if one wasn't provided
+        should_close_progress = progress is None
+        if progress is None:
+            progress = Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+                console=console
+            )
+            progress.start()
+
+        retry_task = progress.add_task("[yellow]Retrying failed URLs...", total=len(failed_results))
+
+        for result in failed_results:
+            progress.update(retry_task, description=f"[yellow]Retrying: {result.url}")
+
+            try:
+                new_result = await self.crawl_url_async(result.url, config)
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                retry_results.append(new_result)
+            except Exception as e:
+                console.print(f"[red]✗ Retry error: {result.url} - {str(e)}[/red]")
+                retry_results.append(CrawlResult(
+                    url=result.url,
+                    status="failed",
+                    error=f"Retry error: {str(e)}",
+                    config=config.to_dict()
+                ))
+
+            progress.update(retry_task, advance=1)
+            await asyncio.sleep(config.request_delay)
+
+        if should_close_progress:
+            progress.stop()
+
+        return retry_results
+
     async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
         """Crawl multiple URLs asynchronously with progress bar."""
         await self._ensure_session()
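
`_retry_failed_urls` either borrows a caller-supplied rich `Progress` or creates, starts, and later stops its own. A compact standalone sketch of that ownership pattern (the function and task names are illustrative, not part of the package API):

```python
# Standalone illustration of the "reuse or own" Progress pattern used above.
import time
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TaskProgressColumn

def retry_items(items, progress=None):
    owns_progress = progress is None
    if owns_progress:
        progress = Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
        )
        progress.start()          # started manually, so it must be stopped manually
    try:
        task = progress.add_task("Retrying...", total=len(items))
        for item in items:
            time.sleep(0.1)       # stand-in for the real retry call
            progress.update(task, advance=1)
    finally:
        if owns_progress:
            progress.stop()       # only stop the bar this function created

retry_items(["a", "b", "c"])
```
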
@@ -445,15 +514,27 @@ class SpiderForce4AI:
             await asyncio.sleep(config.request_delay)
             return result
 
-        results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+        initial_results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])
+
+        # Identify failed URLs
+        failed_results = [r for r in initial_results if r.status == "failed"]
+
+        # Retry failed URLs
+        if failed_results:
+            retry_results = await self._retry_failed_urls(failed_results, config, progress)
+
+            # Replace failed results with retry results
+            results = [r for r in initial_results if r.status == "success"] + retry_results
+        else:
+            results = initial_results
 
         # Save final report
         await self._save_report(config)
 
-        # Print summary
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        console.print(f"\n[green]Crawling completed:[/green]")
+        console.print(f"\n[green]Final crawling results:[/green]")
         console.print(f"✓ Successful: {successful}")
         console.print(f"✗ Failed: {failed}")
 
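
From the caller's side the retry pass is transparent: `crawl_urls_async` still returns one `CrawlResult` per URL, with failures retried once before the final summary is printed. A usage sketch, assuming the constructor takes the service URL and that both classes are importable from the package root as in the README:

```python
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Usage sketch; the base URL and target URLs are placeholders, and the
# constructor signature is assumed from the README examples.
async def main():
    spider = SpiderForce4AI("http://localhost:3004")
    config = CrawlConfig(webhook_url="https://your-webhook.com")
    results = await spider.crawl_urls_async(
        ["https://example.com/page1", "https://example.com/page2"],
        config,
    )
    failed = [r for r in results if r.status == "failed"]  # already retried once
    print(f"{len(results) - len(failed)} succeeded, {len(failed)} failed")

asyncio.run(main())
```
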
@@ -543,12 +624,25 @@ class SpiderForce4AI:
             self._save_report_sync(results, config)
             print(f"\nReport saved to: {config.report_file}")
 
-        # Print summary
+        # Identify failed URLs and retry them
+        failed_results = [r for r in results if r.status == "failed"]
+        if failed_results:
+            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            for result in failed_results:
+                new_result = _process_url_parallel((result.url, self.base_url, config))
+                if new_result.status == "success":
+                    console.print(f"[green]✓ Retry successful: {result.url}[/green]")
+                    # Replace the failed result with the successful retry
+                    results[results.index(result)] = new_result
+                else:
+                    console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+
+        # Print final summary
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
-        print(f"\nCrawling completed:")
-        print(f"✓ Successful: {successful}")
-        print(f"✗ Failed: {failed}")
+        console.print(f"\n[green]Final crawling results:[/green]")
+        console.print(f"✓ Successful: {successful}")
+        console.print(f"✗ Failed: {failed}")
 
         return results
 