spiderforce4ai 2.4__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -576,8 +576,11 @@ class SpiderForce4AI:
576
576
  # Set up concurrency control
577
577
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
578
578
 
579
+ # Semaphore for crawling
580
+ crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
581
+
579
582
  async def crawl_with_semaphore(url):
580
- async with semaphore:
583
+ async with crawl_semaphore:
581
584
  result = await crawl_with_progress(url)
582
585
  await asyncio.sleep(config.request_delay)
583
586
  return result
@@ -606,9 +609,36 @@ class SpiderForce4AI:
606
609
  results[i] = retry_result
607
610
  break
608
611
 
612
+ # Process LLM requests sequentially after all crawling is complete
613
+ if config.post_extraction_agent:
614
+ console.print("\n[cyan]Processing content with LLM...[/cyan]")
615
+ llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
616
+
617
+ post_config = PostExtractionConfig(
618
+ model=config.post_extraction_agent["model"],
619
+ messages=config.post_extraction_agent["messages"],
620
+ api_key=config.post_extraction_agent["api_key"],
621
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
622
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
623
+ base_url=config.post_extraction_agent.get("base_url"),
624
+ combine_output=bool(config.post_extraction_agent_save_to_file),
625
+ output_file=config.post_extraction_agent_save_to_file,
626
+ custom_transform_function=config.post_agent_transformer_function
627
+ )
628
+ agent = PostExtractionAgent(post_config)
629
+
630
+ for result in results:
631
+ if result.status == "success":
632
+ try:
633
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
634
+ progress.update(llm_task, advance=1)
635
+ except Exception as e:
636
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
+
609
638
  # Calculate final statistics
610
639
  final_successful = len([r for r in results if r.status == "success"])
611
640
  final_failed = len([r for r in results if r.status == "failed"])
641
+ llm_successful = len([r for r in results if r.extraction_result is not None])
612
642
 
613
643
  # Update retry stats
614
644
  self._retry_stats = {
@@ -616,7 +646,7 @@ class SpiderForce4AI:
616
646
  "failure_ratio": failure_ratio,
617
647
  "retry_successful": retry_successful if initial_failed > 0 else 0,
618
648
  "retry_failed": final_failed,
619
- "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
649
+ "llm_successful": llm_successful
620
650
  }
621
651
 
622
652
  # Print summary
@@ -894,4 +924,4 @@ class SpiderForce4AI:
894
924
  # Version info
895
925
  #__version__ = "2.3.1"
896
926
  #__author__ = "Piotr Tamulewicz"
897
- #__email__ = "pt@petertam.pro"
927
+ #__email__ = "pt@petertam.pro"
@@ -176,23 +176,38 @@ class PostExtractionAgent:
176
176
  for msg in self.config.messages
177
177
  ]
178
178
 
179
- # Make LLM request
180
- response = await completion(
181
- model=self.config.model,
182
- messages=messages,
183
- max_tokens=self.config.max_tokens,
184
- temperature=self.config.temperature,
185
- api_key=self.config.api_key,
186
- api_base=self.config.base_url
187
- )
179
+ # Make LLM request with retries
180
+ max_retries = 3
181
+ retry_delay = 1.0
182
+ last_error = None
188
183
 
189
- # Parse response
190
- try:
191
- extracted_data = json.loads(response.choices[0].message.content)
192
- self.buffer.remove_request(url) # Remove from buffer if successful
193
- return extracted_data
194
- except json.JSONDecodeError as e:
195
- raise ValueError(f"Invalid JSON response from LLM: {e}")
184
+ for attempt in range(max_retries):
185
+ try:
186
+ response = await completion(
187
+ model=self.config.model,
188
+ messages=messages,
189
+ max_tokens=self.config.max_tokens,
190
+ temperature=self.config.temperature,
191
+ api_key=self.config.api_key,
192
+ api_base=self.config.base_url
193
+ )
194
+
195
+ # Parse response
196
+ extracted_data = json.loads(response.choices[0].message.content)
197
+ self.buffer.remove_request(url) # Remove from buffer if successful
198
+ return extracted_data
199
+
200
+ except json.JSONDecodeError as e:
201
+ last_error = f"Invalid JSON response from LLM: {e}"
202
+ if attempt < max_retries - 1:
203
+ await asyncio.sleep(retry_delay * (attempt + 1))
204
+ except Exception as e:
205
+ last_error = str(e)
206
+ if attempt < max_retries - 1:
207
+ await asyncio.sleep(retry_delay * (attempt + 1))
208
+
209
+ # If we get here, all retries failed
210
+ raise Exception(last_error)
196
211
 
197
212
  except Exception as e:
198
213
  logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
256
271
  "failed_requests": len(self.buffer.get_failed_requests()),
257
272
  "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
258
273
  "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
259
- }
274
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4
3
+ Version: 2.4.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
4
+ spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.1.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
2
- spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
3
- spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
4
- spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.dist-info/RECORD,,