spiderforce4ai 0.1.9__tar.gz → 1.1__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/PKG-INFO +1 -1
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/pyproject.toml +1 -1
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/setup.py +1 -1
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai/__init__.py +69 -29
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/README.md +0 -0
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/setup.cfg +0 -0
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-0.1.9 → spiderforce4ai-1.1}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "0.1.9"
|
7
|
+
version = "1.1"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
|
@@ -445,7 +445,11 @@ class SpiderForce4AI:
|
|
445
445
|
if not failed_results:
|
446
446
|
return []
|
447
447
|
|
448
|
-
|
448
|
+
failed_count = len(failed_results)
|
449
|
+
total_count = len([r for r in self.crawl_results])
|
450
|
+
failure_ratio = (failed_count / total_count) * 100
|
451
|
+
|
452
|
+
console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
|
449
453
|
retry_results = []
|
450
454
|
|
451
455
|
# Create a new progress bar if one wasn't provided
|
@@ -519,24 +523,41 @@ class SpiderForce4AI:
|
|
519
523
|
# Identify failed URLs
|
520
524
|
failed_results = [r for r in initial_results if r.status == "failed"]
|
521
525
|
|
522
|
-
#
|
526
|
+
# Calculate initial failure ratio
|
527
|
+
initial_failed = len(failed_results)
|
528
|
+
total_urls = len(urls)
|
529
|
+
failure_ratio = (initial_failed / total_urls) * 100
|
530
|
+
|
531
|
+
# Retry failed URLs if ratio is acceptable
|
523
532
|
if failed_results:
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
533
|
+
if failure_ratio > 20:
|
534
|
+
console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
|
535
|
+
results = initial_results
|
536
|
+
else:
|
537
|
+
retry_results = await self._retry_failed_urls(failed_results, config, progress)
|
538
|
+
# Replace failed results with retry results
|
539
|
+
results = [r for r in initial_results if r.status == "success"] + retry_results
|
528
540
|
else:
|
529
541
|
results = initial_results
|
530
542
|
|
531
543
|
# Save final report
|
532
544
|
await self._save_report(config)
|
533
545
|
|
534
|
-
#
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
console.print(f"
|
546
|
+
# Calculate final statistics
|
547
|
+
final_successful = len([r for r in results if r.status == "success"])
|
548
|
+
final_failed = len([r for r in results if r.status == "failed"])
|
549
|
+
|
550
|
+
# Print detailed summary
|
551
|
+
console.print(f"\n[green]Crawling Summary:[/green]")
|
552
|
+
console.print(f"Total URLs processed: {total_urls}")
|
553
|
+
console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
|
554
|
+
console.print(f"Final results:")
|
555
|
+
console.print(f" ✓ Successful: {final_successful}")
|
556
|
+
console.print(f" ✗ Failed: {final_failed}")
|
557
|
+
|
558
|
+
if initial_failed > 0:
|
559
|
+
retry_successful = initial_failed - final_failed
|
560
|
+
console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
|
540
561
|
|
541
562
|
if config.report_file:
|
542
563
|
console.print(f"📊 Report saved to: {config.report_file}")
|
@@ -624,25 +645,44 @@ class SpiderForce4AI:
|
|
624
645
|
self._save_report_sync(results, config)
|
625
646
|
print(f"\nReport saved to: {config.report_file}")
|
626
647
|
|
627
|
-
#
|
648
|
+
# Calculate initial failure statistics
|
628
649
|
failed_results = [r for r in results if r.status == "failed"]
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
new_result = _process_url_parallel((result.url, self.base_url, config))
|
633
|
-
if new_result.status == "success":
|
634
|
-
console.print(f"[green]✓ Retry successful: {result.url}[/green]")
|
635
|
-
# Replace the failed result with the successful retry
|
636
|
-
results[results.index(result)] = new_result
|
637
|
-
else:
|
638
|
-
console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
|
650
|
+
initial_failed = len(failed_results)
|
651
|
+
total_urls = len(urls)
|
652
|
+
failure_ratio = (initial_failed / total_urls) * 100
|
639
653
|
|
640
|
-
#
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
654
|
+
# Retry failed URLs if ratio is acceptable
|
655
|
+
if failed_results:
|
656
|
+
if failure_ratio > 20:
|
657
|
+
console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
|
658
|
+
else:
|
659
|
+
failed_count = len(failed_results)
|
660
|
+
failure_ratio = (failed_count / total_urls) * 100
|
661
|
+
console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
|
662
|
+
for result in failed_results:
|
663
|
+
new_result = _process_url_parallel((result.url, self.base_url, config))
|
664
|
+
if new_result.status == "success":
|
665
|
+
console.print(f"[green]✓ Retry successful: {result.url}[/green]")
|
666
|
+
# Replace the failed result with the successful retry
|
667
|
+
results[results.index(result)] = new_result
|
668
|
+
else:
|
669
|
+
console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
|
670
|
+
|
671
|
+
# Calculate final statistics
|
672
|
+
final_successful = len([r for r in results if r.status == "success"])
|
673
|
+
final_failed = len([r for r in results if r.status == "failed"])
|
674
|
+
|
675
|
+
# Print detailed summary
|
676
|
+
console.print(f"\n[green]Crawling Summary:[/green]")
|
677
|
+
console.print(f"Total URLs processed: {total_urls}")
|
678
|
+
console.print(f"Initial failures: {initial_failed} ({failure_ratio:.1f}%)")
|
679
|
+
console.print(f"Final results:")
|
680
|
+
console.print(f" ✓ Successful: {final_successful}")
|
681
|
+
console.print(f" ✗ Failed: {final_failed}")
|
682
|
+
|
683
|
+
if initial_failed > 0:
|
684
|
+
retry_successful = initial_failed - final_failed
|
685
|
+
console.print(f"Retry success rate: {retry_successful}/{initial_failed} ({(retry_successful/initial_failed)*100:.1f}%)")
|
646
686
|
|
647
687
|
return results
|
648
688
|
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|