spiderforce4ai-2.4.1-py3-none-any.whl → spiderforce4ai-2.4.2-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -460,28 +460,6 @@ class SpiderForce4AI:
460
460
  if config.output_dir:
461
461
  await _save_markdown_async(url, markdown, config)
462
462
 
463
- # Handle post-extraction if configured
464
- if config.post_extraction_agent and result.status == "success":
465
- try:
466
- post_config = PostExtractionConfig(
467
- model=config.post_extraction_agent["model"],
468
- messages=config.post_extraction_agent["messages"],
469
- api_key=config.post_extraction_agent["api_key"],
470
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
471
- temperature=config.post_extraction_agent.get("temperature", 0.7),
472
- base_url=config.post_extraction_agent.get("base_url"),
473
- combine_output=bool(config.post_extraction_agent_save_to_file),
474
- output_file=config.post_extraction_agent_save_to_file,
475
- custom_transform_function=config.post_agent_transformer_function
476
- )
477
-
478
- agent = PostExtractionAgent(post_config)
479
- extraction_result = await agent.process_content(url, markdown)
480
- if extraction_result:
481
- result.extraction_result = extraction_result
482
- except Exception as e:
483
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
484
-
485
463
  await _send_webhook_async(result, config)
486
464
 
487
465
  self.crawl_results.append(result)
@@ -635,10 +613,40 @@ class SpiderForce4AI:
635
613
  except Exception as e:
636
614
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
615
 
616
+ # Process LLM requests sequentially after all crawling is complete
617
+ llm_successful = 0
618
+ if config.post_extraction_agent:
619
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
620
+ successful_results = [r for r in results if r.status == "success"]
621
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
622
+
623
+ post_config = PostExtractionConfig(
624
+ model=config.post_extraction_agent["model"],
625
+ messages=config.post_extraction_agent["messages"],
626
+ api_key=config.post_extraction_agent["api_key"],
627
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
628
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
629
+ base_url=config.post_extraction_agent.get("base_url"),
630
+ combine_output=bool(config.post_extraction_agent_save_to_file),
631
+ output_file=config.post_extraction_agent_save_to_file,
632
+ custom_transform_function=config.post_agent_transformer_function
633
+ )
634
+ agent = PostExtractionAgent(post_config)
635
+
636
+ for result in successful_results:
637
+ try:
638
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
639
+ if result.extraction_result:
640
+ llm_successful += 1
641
+ progress.update(llm_task, advance=1)
642
+ except Exception as e:
643
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
644
+ await asyncio.sleep(1) # Add delay after error
645
+ await asyncio.sleep(0.5) # Rate limiting between requests
646
+
638
647
  # Calculate final statistics
639
648
  final_successful = len([r for r in results if r.status == "success"])
640
649
  final_failed = len([r for r in results if r.status == "failed"])
641
- llm_successful = len([r for r in results if r.extraction_result is not None])
642
650
 
643
651
  # Update retry stats
644
652
  self._retry_stats = {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.1
3
+ Version: 2.4.2
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
4
+ spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.2.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
2
- spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
- spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
4
- spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.1.dist-info/RECORD,,