spiderforce4ai 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
312
312
  config=config.to_dict()
313
313
  )
314
314
 
315
- # Handle post-extraction if configured
316
- if config.post_extraction_agent:
317
- try:
318
- post_config = PostExtractionConfig(
319
- model=config.post_extraction_agent["model"],
320
- messages=config.post_extraction_agent["messages"],
321
- api_key=config.post_extraction_agent["api_key"],
322
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
323
- temperature=config.post_extraction_agent.get("temperature", 0.7),
324
- base_url=config.post_extraction_agent.get("base_url"),
325
- combine_output=bool(config.post_extraction_agent_save_to_file),
326
- output_file=config.post_extraction_agent_save_to_file,
327
- custom_transform_function=config.post_agent_transformer_function
328
- )
329
-
330
- agent = PostExtractionAgent(post_config)
331
- extraction_result = asyncio.run(agent.process_content(url, markdown))
332
- if extraction_result:
333
- result.extraction_result = extraction_result
334
- except Exception as e:
335
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
336
-
337
315
  # Send webhook for successful result
338
316
  _send_webhook_sync(result, config)
339
317
 
@@ -460,28 +438,6 @@ class SpiderForce4AI:
460
438
  if config.output_dir:
461
439
  await _save_markdown_async(url, markdown, config)
462
440
 
463
- # Handle post-extraction if configured
464
- if config.post_extraction_agent and result.status == "success":
465
- try:
466
- post_config = PostExtractionConfig(
467
- model=config.post_extraction_agent["model"],
468
- messages=config.post_extraction_agent["messages"],
469
- api_key=config.post_extraction_agent["api_key"],
470
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
471
- temperature=config.post_extraction_agent.get("temperature", 0.7),
472
- base_url=config.post_extraction_agent.get("base_url"),
473
- combine_output=bool(config.post_extraction_agent_save_to_file),
474
- output_file=config.post_extraction_agent_save_to_file,
475
- custom_transform_function=config.post_agent_transformer_function
476
- )
477
-
478
- agent = PostExtractionAgent(post_config)
479
- extraction_result = await agent.process_content(url, markdown)
480
- if extraction_result:
481
- result.extraction_result = extraction_result
482
- except Exception as e:
483
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
484
-
485
441
  await _send_webhook_async(result, config)
486
442
 
487
443
  self.crawl_results.append(result)
@@ -635,10 +591,40 @@ class SpiderForce4AI:
635
591
  except Exception as e:
636
592
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
593
 
594
+ # Process LLM requests sequentially after all crawling is complete
595
+ llm_successful = 0
596
+ if config.post_extraction_agent:
597
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
598
+ successful_results = [r for r in results if r.status == "success"]
599
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
600
+
601
+ post_config = PostExtractionConfig(
602
+ model=config.post_extraction_agent["model"],
603
+ messages=config.post_extraction_agent["messages"],
604
+ api_key=config.post_extraction_agent["api_key"],
605
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
606
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
607
+ base_url=config.post_extraction_agent.get("base_url"),
608
+ combine_output=bool(config.post_extraction_agent_save_to_file),
609
+ output_file=config.post_extraction_agent_save_to_file,
610
+ custom_transform_function=config.post_agent_transformer_function
611
+ )
612
+ agent = PostExtractionAgent(post_config)
613
+
614
+ for result in successful_results:
615
+ try:
616
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
617
+ if result.extraction_result:
618
+ llm_successful += 1
619
+ progress.update(llm_task, advance=1)
620
+ except Exception as e:
621
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
622
+ await asyncio.sleep(1) # Add delay after error
623
+ await asyncio.sleep(0.5) # Rate limiting between requests
624
+
638
625
  # Calculate final statistics
639
626
  final_successful = len([r for r in results if r.status == "success"])
640
627
  final_failed = len([r for r in results if r.status == "failed"])
641
- llm_successful = len([r for r in results if r.extraction_result is not None])
642
628
 
643
629
  # Update retry stats
644
630
  self._retry_stats = {
@@ -725,13 +711,48 @@ class SpiderForce4AI:
725
711
  TextColumn("({task.completed}/{task.total})"),
726
712
  ) as progress:
727
713
  task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
728
-
714
+
729
715
  for result in pool.imap_unordered(_process_url_parallel, process_args):
730
716
  results.append(result)
731
717
  progress.update(task, advance=1)
732
718
  status = "✓" if result.status == "success" else "✗"
733
719
  progress.description = f"[cyan]Last: {status} {result.url}"
734
720
 
721
+ # Process LLM requests sequentially after all crawling is complete
722
+ if config.post_extraction_agent:
723
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
724
+ successful_results = [r for r in results if r.status == "success"]
725
+
726
+ with Progress(
727
+ SpinnerColumn(),
728
+ TextColumn("[progress.description]{task.description}"),
729
+ BarColumn(),
730
+ TaskProgressColumn(),
731
+ ) as progress:
732
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
733
+
734
+ post_config = PostExtractionConfig(
735
+ model=config.post_extraction_agent["model"],
736
+ messages=config.post_extraction_agent["messages"],
737
+ api_key=config.post_extraction_agent["api_key"],
738
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
739
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
740
+ base_url=config.post_extraction_agent.get("base_url"),
741
+ combine_output=bool(config.post_extraction_agent_save_to_file),
742
+ output_file=config.post_extraction_agent_save_to_file,
743
+ custom_transform_function=config.post_agent_transformer_function
744
+ )
745
+ agent = PostExtractionAgent(post_config)
746
+
747
+ for result in successful_results:
748
+ try:
749
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
750
+ progress.update(llm_task, advance=1)
751
+ except Exception as e:
752
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
753
+ time.sleep(1) # Add delay after error
754
+ time.sleep(0.5) # Rate limiting between requests
755
+
735
756
  # Calculate statistics and handle retries
736
757
  failed_results = [r for r in results if r.status == "failed"]
737
758
  initial_failed = len(failed_results)
@@ -823,31 +844,44 @@ class SpiderForce4AI:
823
844
  if result.status == "success" and config.output_dir and result.markdown:
824
845
  _save_markdown_sync(result.url, result.markdown, config)
825
846
 
826
- # Handle post-extraction if configured
827
- if config.post_extraction_agent and result.status == "success":
828
- try:
829
- post_config = PostExtractionConfig(
830
- model=config.post_extraction_agent["model"],
831
- messages=config.post_extraction_agent["messages"],
832
- api_key=config.post_extraction_agent["api_key"],
833
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
834
- temperature=config.post_extraction_agent.get("temperature", 0.7),
835
- base_url=config.post_extraction_agent.get("base_url"),
836
- combine_output=bool(config.post_extraction_agent_save_to_file),
837
- output_file=config.post_extraction_agent_save_to_file,
838
- custom_transform_function=config.post_agent_transformer_function
839
- )
840
-
841
- agent = PostExtractionAgent(post_config)
842
- extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
843
- if extraction_result:
844
- result.extraction_result = extraction_result
845
- except Exception as e:
846
- console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
847
-
848
847
  # Send webhook if configured
849
848
  _send_webhook_sync(result, config)
850
849
  results.append(result)
850
+
851
+ # Process LLM requests sequentially after all crawling is complete
852
+ if config.post_extraction_agent:
853
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
854
+ successful_results = [r for r in results if r.status == "success"]
855
+
856
+ with Progress(
857
+ SpinnerColumn(),
858
+ TextColumn("[progress.description]{task.description}"),
859
+ BarColumn(),
860
+ TaskProgressColumn(),
861
+ ) as progress:
862
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
863
+
864
+ post_config = PostExtractionConfig(
865
+ model=config.post_extraction_agent["model"],
866
+ messages=config.post_extraction_agent["messages"],
867
+ api_key=config.post_extraction_agent["api_key"],
868
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
869
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
870
+ base_url=config.post_extraction_agent.get("base_url"),
871
+ combine_output=bool(config.post_extraction_agent_save_to_file),
872
+ output_file=config.post_extraction_agent_save_to_file,
873
+ custom_transform_function=config.post_agent_transformer_function
874
+ )
875
+ agent = PostExtractionAgent(post_config)
876
+
877
+ for result in successful_results:
878
+ try:
879
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
880
+ progress.update(llm_task, advance=1)
881
+ except Exception as e:
882
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
883
+ time.sleep(1) # Add delay after error
884
+ time.sleep(0.5) # Rate limiting between requests
851
885
 
852
886
  # Calculate statistics
853
887
  successful = len([r for r in results if r.status == "success"])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.1
3
+ Version: 2.4.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
4
+ spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
2
- spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
- spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
4
- spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.1.dist-info/RECORD,,