spiderforce4ai 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -460,28 +460,6 @@ class SpiderForce4AI:
460
460
  if config.output_dir:
461
461
  await _save_markdown_async(url, markdown, config)
462
462
 
463
- # Handle post-extraction if configured
464
- if config.post_extraction_agent and result.status == "success":
465
- try:
466
- post_config = PostExtractionConfig(
467
- model=config.post_extraction_agent["model"],
468
- messages=config.post_extraction_agent["messages"],
469
- api_key=config.post_extraction_agent["api_key"],
470
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
471
- temperature=config.post_extraction_agent.get("temperature", 0.7),
472
- base_url=config.post_extraction_agent.get("base_url"),
473
- combine_output=bool(config.post_extraction_agent_save_to_file),
474
- output_file=config.post_extraction_agent_save_to_file,
475
- custom_transform_function=config.post_agent_transformer_function
476
- )
477
-
478
- agent = PostExtractionAgent(post_config)
479
- extraction_result = await agent.process_content(url, markdown)
480
- if extraction_result:
481
- result.extraction_result = extraction_result
482
- except Exception as e:
483
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
484
-
485
463
  await _send_webhook_async(result, config)
486
464
 
487
465
  self.crawl_results.append(result)
@@ -576,8 +554,11 @@ class SpiderForce4AI:
576
554
  # Set up concurrency control
577
555
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
578
556
 
557
+ # Semaphore for crawling
558
+ crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
559
+
579
560
  async def crawl_with_semaphore(url):
580
- async with semaphore:
561
+ async with crawl_semaphore:
581
562
  result = await crawl_with_progress(url)
582
563
  await asyncio.sleep(config.request_delay)
583
564
  return result
@@ -606,6 +587,63 @@ class SpiderForce4AI:
606
587
  results[i] = retry_result
607
588
  break
608
589
 
590
+ # Process LLM requests sequentially after all crawling is complete
591
+ if config.post_extraction_agent:
592
+ console.print("\n[cyan]Processing content with LLM...[/cyan]")
593
+ llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
594
+
595
+ post_config = PostExtractionConfig(
596
+ model=config.post_extraction_agent["model"],
597
+ messages=config.post_extraction_agent["messages"],
598
+ api_key=config.post_extraction_agent["api_key"],
599
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
600
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
601
+ base_url=config.post_extraction_agent.get("base_url"),
602
+ combine_output=bool(config.post_extraction_agent_save_to_file),
603
+ output_file=config.post_extraction_agent_save_to_file,
604
+ custom_transform_function=config.post_agent_transformer_function
605
+ )
606
+ agent = PostExtractionAgent(post_config)
607
+
608
+ for result in results:
609
+ if result.status == "success":
610
+ try:
611
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
612
+ progress.update(llm_task, advance=1)
613
+ except Exception as e:
614
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
615
+
616
+ # Process LLM requests sequentially after all crawling is complete
617
+ llm_successful = 0
618
+ if config.post_extraction_agent:
619
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
620
+ successful_results = [r for r in results if r.status == "success"]
621
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
622
+
623
+ post_config = PostExtractionConfig(
624
+ model=config.post_extraction_agent["model"],
625
+ messages=config.post_extraction_agent["messages"],
626
+ api_key=config.post_extraction_agent["api_key"],
627
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
628
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
629
+ base_url=config.post_extraction_agent.get("base_url"),
630
+ combine_output=bool(config.post_extraction_agent_save_to_file),
631
+ output_file=config.post_extraction_agent_save_to_file,
632
+ custom_transform_function=config.post_agent_transformer_function
633
+ )
634
+ agent = PostExtractionAgent(post_config)
635
+
636
+ for result in successful_results:
637
+ try:
638
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
639
+ if result.extraction_result:
640
+ llm_successful += 1
641
+ progress.update(llm_task, advance=1)
642
+ except Exception as e:
643
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
644
+ await asyncio.sleep(1) # Add delay after error
645
+ await asyncio.sleep(0.5) # Rate limiting between requests
646
+
609
647
  # Calculate final statistics
610
648
  final_successful = len([r for r in results if r.status == "success"])
611
649
  final_failed = len([r for r in results if r.status == "failed"])
@@ -616,7 +654,7 @@ class SpiderForce4AI:
616
654
  "failure_ratio": failure_ratio,
617
655
  "retry_successful": retry_successful if initial_failed > 0 else 0,
618
656
  "retry_failed": final_failed,
619
- "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
657
+ "llm_successful": llm_successful
620
658
  }
621
659
 
622
660
  # Print summary
@@ -894,4 +932,4 @@ class SpiderForce4AI:
894
932
  # Version info
895
933
  #__version__ = "2.3.1"
896
934
  #__author__ = "Piotr Tamulewicz"
897
- #__email__ = "pt@petertam.pro"
935
+ #__email__ = "pt@petertam.pro"
@@ -176,23 +176,38 @@ class PostExtractionAgent:
176
176
  for msg in self.config.messages
177
177
  ]
178
178
 
179
- # Make LLM request
180
- response = await completion(
181
- model=self.config.model,
182
- messages=messages,
183
- max_tokens=self.config.max_tokens,
184
- temperature=self.config.temperature,
185
- api_key=self.config.api_key,
186
- api_base=self.config.base_url
187
- )
179
+ # Make LLM request with retries
180
+ max_retries = 3
181
+ retry_delay = 1.0
182
+ last_error = None
188
183
 
189
- # Parse response
190
- try:
191
- extracted_data = json.loads(response.choices[0].message.content)
192
- self.buffer.remove_request(url) # Remove from buffer if successful
193
- return extracted_data
194
- except json.JSONDecodeError as e:
195
- raise ValueError(f"Invalid JSON response from LLM: {e}")
184
+ for attempt in range(max_retries):
185
+ try:
186
+ response = await completion(
187
+ model=self.config.model,
188
+ messages=messages,
189
+ max_tokens=self.config.max_tokens,
190
+ temperature=self.config.temperature,
191
+ api_key=self.config.api_key,
192
+ api_base=self.config.base_url
193
+ )
194
+
195
+ # Parse response
196
+ extracted_data = json.loads(response.choices[0].message.content)
197
+ self.buffer.remove_request(url) # Remove from buffer if successful
198
+ return extracted_data
199
+
200
+ except json.JSONDecodeError as e:
201
+ last_error = f"Invalid JSON response from LLM: {e}"
202
+ if attempt < max_retries - 1:
203
+ await asyncio.sleep(retry_delay * (attempt + 1))
204
+ except Exception as e:
205
+ last_error = str(e)
206
+ if attempt < max_retries - 1:
207
+ await asyncio.sleep(retry_delay * (attempt + 1))
208
+
209
+ # If we get here, all retries failed
210
+ raise Exception(last_error)
196
211
 
197
212
  except Exception as e:
198
213
  logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
256
271
  "failed_requests": len(self.buffer.get_failed_requests()),
257
272
  "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
258
273
  "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
259
- }
274
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4
3
+ Version: 2.4.2
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
4
+ spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.2.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
2
- spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
3
- spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
4
- spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.dist-info/RECORD,,