spiderforce4ai 0.1.9__tar.gz → 1.0__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 0.1.9
3
+ Version: 1.0
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "0.1.9"
7
+ version = "1.0"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name="spiderforce4ai",
6
- version="0.1.9",
6
+ version="1.0",
7
7
  author="Piotr Tamulewicz",
8
8
  author_email="pt@petertam.pro",
9
9
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -519,24 +519,41 @@ class SpiderForce4AI:
519
519
  # Identify failed URLs
520
520
  failed_results = [r for r in initial_results if r.status == "failed"]
521
521
 
522
- # Retry failed URLs
522
+ # Calculate initial failure ratio
523
+ initial_failed = len(failed_results)
524
+ total_urls = len(urls)
525
+ failure_ratio = (initial_failed / total_urls) * 100
526
+
527
+ # Retry failed URLs if ratio is acceptable
523
528
  if failed_results:
524
- retry_results = await self._retry_failed_urls(failed_results, config, progress)
525
-
526
- # Replace failed results with retry results
527
- results = [r for r in initial_results if r.status == "success"] + retry_results
529
+ if failure_ratio > 20:
530
+ console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
531
+ results = initial_results
532
+ else:
533
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
534
+ # Replace failed results with retry results
535
+ results = [r for r in initial_results if r.status == "success"] + retry_results
528
536
  else:
529
537
  results = initial_results
530
538
 
531
539
  # Save final report
532
540
  await self._save_report(config)
533
541
 
534
- # Print final summary
535
- successful = len([r for r in results if r.status == "success"])
536
- failed = len([r for r in results if r.status == "failed"])
537
- console.print(f"\n[green]Final crawling results:[/green]")
538
- console.print(f"✓ Successful: {successful}")
539
- console.print(f" Failed: {failed}")
542
+ # Calculate final statistics
543
+ final_successful = len([r for r in results if r.status == "success"])
544
+ final_failed = len([r for r in results if r.status == "failed"])
545
+
546
+ # Print detailed summary
547
+ console.print(f"\n[green]Crawling Summary:[/green]")
548
+ console.print(f"Total URLs processed: {total_urls}")
549
+ console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
550
+ console.print(f"Final results:")
551
+ console.print(f" ✓ Successful: {final_successful}")
552
+ console.print(f" ✗ Failed: {final_failed}")
553
+
554
+ if initial_failed > 0:
555
+ retry_successful = initial_failed - final_failed
556
+ console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
540
557
 
541
558
  if config.report_file:
542
559
  console.print(f"📊 Report saved to: {config.report_file}")
@@ -624,25 +641,42 @@ class SpiderForce4AI:
624
641
  self._save_report_sync(results, config)
625
642
  print(f"\nReport saved to: {config.report_file}")
626
643
 
627
- # Identify failed URLs and retry them
644
+ # Calculate initial failure statistics
628
645
  failed_results = [r for r in results if r.status == "failed"]
629
- if failed_results:
630
- console.print("\n[yellow]Retrying failed URLs...[/yellow]")
631
- for result in failed_results:
632
- new_result = _process_url_parallel((result.url, self.base_url, config))
633
- if new_result.status == "success":
634
- console.print(f"[green]✓ Retry successful: {result.url}[/green]")
635
- # Replace the failed result with the successful retry
636
- results[results.index(result)] = new_result
637
- else:
638
- console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
646
+ initial_failed = len(failed_results)
647
+ total_urls = len(urls)
648
+ failure_ratio = (initial_failed / total_urls) * 100
639
649
 
640
- # Print final summary
641
- successful = len([r for r in results if r.status == "success"])
642
- failed = len([r for r in results if r.status == "failed"])
643
- console.print(f"\n[green]Final crawling results:[/green]")
644
- console.print(f"✓ Successful: {successful}")
645
- console.print(f" Failed: {failed}")
650
+ # Retry failed URLs if ratio is acceptable
651
+ if failed_results:
652
+ if failure_ratio > 20:
653
+ console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
654
+ else:
655
+ console.print("\n[yellow]Retrying failed URLs...[/yellow]")
656
+ for result in failed_results:
657
+ new_result = _process_url_parallel((result.url, self.base_url, config))
658
+ if new_result.status == "success":
659
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
660
+ # Replace the failed result with the successful retry
661
+ results[results.index(result)] = new_result
662
+ else:
663
+ console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
664
+
665
+ # Calculate final statistics
666
+ final_successful = len([r for r in results if r.status == "success"])
667
+ final_failed = len([r for r in results if r.status == "failed"])
668
+
669
+ # Print detailed summary
670
+ console.print(f"\n[green]Crawling Summary:[/green]")
671
+ console.print(f"Total URLs processed: {total_urls}")
672
+ console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
673
+ console.print(f"Final results:")
674
+ console.print(f" ✓ Successful: {final_successful}")
675
+ console.print(f" ✗ Failed: {final_failed}")
676
+
677
+ if initial_failed > 0:
678
+ retry_successful = initial_failed - final_failed
679
+ console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
646
680
 
647
681
  return results
648
682
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 0.1.9
3
+ Version: 1.0
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes