spiderforce4ai-2.4.1-py3-none-any.whl → spiderforce4ai-2.4.3-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
312
312
  config=config.to_dict()
313
313
  )
314
314
 
315
- # Handle post-extraction if configured
316
- if config.post_extraction_agent:
317
- try:
318
- post_config = PostExtractionConfig(
319
- model=config.post_extraction_agent["model"],
320
- messages=config.post_extraction_agent["messages"],
321
- api_key=config.post_extraction_agent["api_key"],
322
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
323
- temperature=config.post_extraction_agent.get("temperature", 0.7),
324
- base_url=config.post_extraction_agent.get("base_url"),
325
- combine_output=bool(config.post_extraction_agent_save_to_file),
326
- output_file=config.post_extraction_agent_save_to_file,
327
- custom_transform_function=config.post_agent_transformer_function
328
- )
329
-
330
- agent = PostExtractionAgent(post_config)
331
- extraction_result = asyncio.run(agent.process_content(url, markdown))
332
- if extraction_result:
333
- result.extraction_result = extraction_result
334
- except Exception as e:
335
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
336
-
337
315
  # Send webhook for successful result
338
316
  _send_webhook_sync(result, config)
339
317
 
@@ -460,28 +438,6 @@ class SpiderForce4AI:
460
438
  if config.output_dir:
461
439
  await _save_markdown_async(url, markdown, config)
462
440
 
463
- # Handle post-extraction if configured
464
- if config.post_extraction_agent and result.status == "success":
465
- try:
466
- post_config = PostExtractionConfig(
467
- model=config.post_extraction_agent["model"],
468
- messages=config.post_extraction_agent["messages"],
469
- api_key=config.post_extraction_agent["api_key"],
470
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
471
- temperature=config.post_extraction_agent.get("temperature", 0.7),
472
- base_url=config.post_extraction_agent.get("base_url"),
473
- combine_output=bool(config.post_extraction_agent_save_to_file),
474
- output_file=config.post_extraction_agent_save_to_file,
475
- custom_transform_function=config.post_agent_transformer_function
476
- )
477
-
478
- agent = PostExtractionAgent(post_config)
479
- extraction_result = await agent.process_content(url, markdown)
480
- if extraction_result:
481
- result.extraction_result = extraction_result
482
- except Exception as e:
483
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
484
-
485
441
  await _send_webhook_async(result, config)
486
442
 
487
443
  self.crawl_results.append(result)
@@ -635,10 +591,40 @@ class SpiderForce4AI:
635
591
  except Exception as e:
636
592
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
593
 
594
+ # Process LLM requests sequentially after all crawling is complete
595
+ llm_successful = 0
596
+ if config.post_extraction_agent:
597
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
598
+ successful_results = [r for r in results if r.status == "success"]
599
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
600
+
601
+ post_config = PostExtractionConfig(
602
+ model=config.post_extraction_agent["model"],
603
+ messages=config.post_extraction_agent["messages"],
604
+ api_key=config.post_extraction_agent["api_key"],
605
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
606
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
607
+ base_url=config.post_extraction_agent.get("base_url"),
608
+ combine_output=bool(config.post_extraction_agent_save_to_file),
609
+ output_file=config.post_extraction_agent_save_to_file,
610
+ custom_transform_function=config.post_agent_transformer_function
611
+ )
612
+ agent = PostExtractionAgent(post_config)
613
+
614
+ for result in successful_results:
615
+ try:
616
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
617
+ if result.extraction_result:
618
+ llm_successful += 1
619
+ progress.update(llm_task, advance=1)
620
+ except Exception as e:
621
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
622
+ await asyncio.sleep(1) # Add delay after error
623
+ await asyncio.sleep(0.5) # Rate limiting between requests
624
+
638
625
  # Calculate final statistics
639
626
  final_successful = len([r for r in results if r.status == "success"])
640
627
  final_failed = len([r for r in results if r.status == "failed"])
641
- llm_successful = len([r for r in results if r.extraction_result is not None])
642
628
 
643
629
  # Update retry stats
644
630
  self._retry_stats = {
@@ -725,13 +711,48 @@ class SpiderForce4AI:
725
711
  TextColumn("({task.completed}/{task.total})"),
726
712
  ) as progress:
727
713
  task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
728
-
714
+
729
715
  for result in pool.imap_unordered(_process_url_parallel, process_args):
730
716
  results.append(result)
731
717
  progress.update(task, advance=1)
732
718
  status = "✓" if result.status == "success" else "✗"
733
719
  progress.description = f"[cyan]Last: {status} {result.url}"
734
720
 
721
+ # Process LLM requests sequentially after all crawling is complete
722
+ if config.post_extraction_agent:
723
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
724
+ successful_results = [r for r in results if r.status == "success"]
725
+
726
+ with Progress(
727
+ SpinnerColumn(),
728
+ TextColumn("[progress.description]{task.description}"),
729
+ BarColumn(),
730
+ TaskProgressColumn(),
731
+ ) as progress:
732
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
733
+
734
+ post_config = PostExtractionConfig(
735
+ model=config.post_extraction_agent["model"],
736
+ messages=config.post_extraction_agent["messages"],
737
+ api_key=config.post_extraction_agent["api_key"],
738
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
739
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
740
+ base_url=config.post_extraction_agent.get("base_url"),
741
+ combine_output=bool(config.post_extraction_agent_save_to_file),
742
+ output_file=config.post_extraction_agent_save_to_file,
743
+ custom_transform_function=config.post_agent_transformer_function
744
+ )
745
+ agent = PostExtractionAgent(post_config)
746
+
747
+ for result in successful_results:
748
+ try:
749
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
750
+ progress.update(llm_task, advance=1)
751
+ except Exception as e:
752
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
753
+ time.sleep(1) # Add delay after error
754
+ time.sleep(0.5) # Rate limiting between requests
755
+
735
756
  # Calculate statistics and handle retries
736
757
  failed_results = [r for r in results if r.status == "failed"]
737
758
  initial_failed = len(failed_results)
@@ -823,31 +844,44 @@ class SpiderForce4AI:
823
844
  if result.status == "success" and config.output_dir and result.markdown:
824
845
  _save_markdown_sync(result.url, result.markdown, config)
825
846
 
826
- # Handle post-extraction if configured
827
- if config.post_extraction_agent and result.status == "success":
828
- try:
829
- post_config = PostExtractionConfig(
830
- model=config.post_extraction_agent["model"],
831
- messages=config.post_extraction_agent["messages"],
832
- api_key=config.post_extraction_agent["api_key"],
833
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
834
- temperature=config.post_extraction_agent.get("temperature", 0.7),
835
- base_url=config.post_extraction_agent.get("base_url"),
836
- combine_output=bool(config.post_extraction_agent_save_to_file),
837
- output_file=config.post_extraction_agent_save_to_file,
838
- custom_transform_function=config.post_agent_transformer_function
839
- )
840
-
841
- agent = PostExtractionAgent(post_config)
842
- extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
843
- if extraction_result:
844
- result.extraction_result = extraction_result
845
- except Exception as e:
846
- console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
847
-
848
847
  # Send webhook if configured
849
848
  _send_webhook_sync(result, config)
850
849
  results.append(result)
850
+
851
+ # Process LLM requests sequentially after all crawling is complete
852
+ if config.post_extraction_agent:
853
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
854
+ successful_results = [r for r in results if r.status == "success"]
855
+
856
+ with Progress(
857
+ SpinnerColumn(),
858
+ TextColumn("[progress.description]{task.description}"),
859
+ BarColumn(),
860
+ TaskProgressColumn(),
861
+ ) as progress:
862
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
863
+
864
+ post_config = PostExtractionConfig(
865
+ model=config.post_extraction_agent["model"],
866
+ messages=config.post_extraction_agent["messages"],
867
+ api_key=config.post_extraction_agent["api_key"],
868
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
869
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
870
+ base_url=config.post_extraction_agent.get("base_url"),
871
+ combine_output=bool(config.post_extraction_agent_save_to_file),
872
+ output_file=config.post_extraction_agent_save_to_file,
873
+ custom_transform_function=config.post_agent_transformer_function
874
+ )
875
+ agent = PostExtractionAgent(post_config)
876
+
877
+ for result in successful_results:
878
+ try:
879
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
880
+ progress.update(llm_task, advance=1)
881
+ except Exception as e:
882
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
883
+ time.sleep(1) # Add delay after error
884
+ time.sleep(0.5) # Rate limiting between requests
851
885
 
852
886
  # Calculate statistics
853
887
  successful = len([r for r in results if r.status == "success"])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.1
3
+ Version: 2.4.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
4
+ spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
2
- spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
- spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
4
- spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.1.dist-info/RECORD,,