spiderforce4ai 0.1.9__tar.gz → 1.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 0.1.9
3
+ Version: 1.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "0.1.9"
7
+ version = "1.1"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name="spiderforce4ai",
6
- version="0.1.9",
6
+ version="1.1",
7
7
  author="Piotr Tamulewicz",
8
8
  author_email="pt@petertam.pro",
9
9
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -445,7 +445,11 @@ class SpiderForce4AI:
445
445
  if not failed_results:
446
446
  return []
447
447
 
448
- console.print("\n[yellow]Retrying failed URLs...[/yellow]")
448
+ failed_count = len(failed_results)
449
+ total_count = len([r for r in self.crawl_results])
450
+ failure_ratio = (failed_count / total_count) * 100
451
+
452
+ console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
449
453
  retry_results = []
450
454
 
451
455
  # Create a new progress bar if one wasn't provided
@@ -519,24 +523,41 @@ class SpiderForce4AI:
519
523
  # Identify failed URLs
520
524
  failed_results = [r for r in initial_results if r.status == "failed"]
521
525
 
522
- # Retry failed URLs
526
+ # Calculate initial failure ratio
527
+ initial_failed = len(failed_results)
528
+ total_urls = len(urls)
529
+ failure_ratio = (initial_failed / total_urls) * 100
530
+
531
+ # Retry failed URLs if ratio is acceptable
523
532
  if failed_results:
524
- retry_results = await self._retry_failed_urls(failed_results, config, progress)
525
-
526
- # Replace failed results with retry results
527
- results = [r for r in initial_results if r.status == "success"] + retry_results
533
+ if failure_ratio > 20:
534
+ console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
535
+ results = initial_results
536
+ else:
537
+ retry_results = await self._retry_failed_urls(failed_results, config, progress)
538
+ # Replace failed results with retry results
539
+ results = [r for r in initial_results if r.status == "success"] + retry_results
528
540
  else:
529
541
  results = initial_results
530
542
 
531
543
  # Save final report
532
544
  await self._save_report(config)
533
545
 
534
- # Print final summary
535
- successful = len([r for r in results if r.status == "success"])
536
- failed = len([r for r in results if r.status == "failed"])
537
- console.print(f"\n[green]Final crawling results:[/green]")
538
- console.print(f"✓ Successful: {successful}")
539
- console.print(f" Failed: {failed}")
546
+ # Calculate final statistics
547
+ final_successful = len([r for r in results if r.status == "success"])
548
+ final_failed = len([r for r in results if r.status == "failed"])
549
+
550
+ # Print detailed summary
551
+ console.print(f"\n[green]Crawling Summary:[/green]")
552
+ console.print(f"Total URLs processed: {total_urls}")
553
+ console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
554
+ console.print(f"Final results:")
555
+ console.print(f" ✓ Successful: {final_successful}")
556
+ console.print(f" ✗ Failed: {final_failed}")
557
+
558
+ if initial_failed > 0:
559
+ retry_successful = initial_failed - final_failed
560
+ console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
540
561
 
541
562
  if config.report_file:
542
563
  console.print(f"📊 Report saved to: {config.report_file}")
@@ -624,25 +645,44 @@ class SpiderForce4AI:
624
645
  self._save_report_sync(results, config)
625
646
  print(f"\nReport saved to: {config.report_file}")
626
647
 
627
- # Identify failed URLs and retry them
648
+ # Calculate initial failure statistics
628
649
  failed_results = [r for r in results if r.status == "failed"]
629
- if failed_results:
630
- console.print("\n[yellow]Retrying failed URLs...[/yellow]")
631
- for result in failed_results:
632
- new_result = _process_url_parallel((result.url, self.base_url, config))
633
- if new_result.status == "success":
634
- console.print(f"[green]✓ Retry successful: {result.url}[/green]")
635
- # Replace the failed result with the successful retry
636
- results[results.index(result)] = new_result
637
- else:
638
- console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
650
+ initial_failed = len(failed_results)
651
+ total_urls = len(urls)
652
+ failure_ratio = (initial_failed / total_urls) * 100
639
653
 
640
- # Print final summary
641
- successful = len([r for r in results if r.status == "success"])
642
- failed = len([r for r in results if r.status == "failed"])
643
- console.print(f"\n[green]Final crawling results:[/green]")
644
- console.print(f"✓ Successful: {successful}")
645
- console.print(f"✗ Failed: {failed}")
654
+ # Retry failed URLs if ratio is acceptable
655
+ if failed_results:
656
+ if failure_ratio > 20:
657
+ console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
658
+ else:
659
+ failed_count = len(failed_results)
660
+ failure_ratio = (failed_count / total_urls) * 100
661
+ console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
662
+ for result in failed_results:
663
+ new_result = _process_url_parallel((result.url, self.base_url, config))
664
+ if new_result.status == "success":
665
+ console.print(f"[green]✓ Retry successful: {result.url}[/green]")
666
+ # Replace the failed result with the successful retry
667
+ results[results.index(result)] = new_result
668
+ else:
669
+ console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
670
+
671
+ # Calculate final statistics
672
+ final_successful = len([r for r in results if r.status == "success"])
673
+ final_failed = len([r for r in results if r.status == "failed"])
674
+
675
+ # Print detailed summary
676
+ console.print(f"\n[green]Crawling Summary:[/green]")
677
+ console.print(f"Total URLs processed: {total_urls}")
678
+ console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
679
+ console.print(f"Final results:")
680
+ console.print(f" ✓ Successful: {final_successful}")
681
+ console.print(f" ✗ Failed: {final_failed}")
682
+
683
+ if initial_failed > 0:
684
+ retry_successful = initial_failed - final_failed
685
+ console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
646
686
 
647
687
  return results
648
688
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 0.1.9
3
+ Version: 1.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes