spiderforce4ai 2.4__tar.gz → 2.4.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4
3
+ Version: 2.4.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "2.4"
7
+ version = "2.4.1"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
7
7
 
8
8
  setup(
9
9
  name="spiderforce4ai",
10
- version="2.4",
10
+ version="2.4.1",
11
11
  author="Piotr Tamulewicz",
12
12
  author_email="pt@petertam.pro",
13
13
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
@@ -576,8 +576,11 @@ class SpiderForce4AI:
576
576
  # Set up concurrency control
577
577
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)
578
578
 
579
+ # Semaphore for crawling
580
+ crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
581
+
579
582
  async def crawl_with_semaphore(url):
580
- async with semaphore:
583
+ async with crawl_semaphore:
581
584
  result = await crawl_with_progress(url)
582
585
  await asyncio.sleep(config.request_delay)
583
586
  return result
@@ -606,9 +609,36 @@ class SpiderForce4AI:
606
609
  results[i] = retry_result
607
610
  break
608
611
 
612
+ # Process LLM requests sequentially after all crawling is complete
613
+ if config.post_extraction_agent:
614
+ console.print("\n[cyan]Processing content with LLM...[/cyan]")
615
+ llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
616
+
617
+ post_config = PostExtractionConfig(
618
+ model=config.post_extraction_agent["model"],
619
+ messages=config.post_extraction_agent["messages"],
620
+ api_key=config.post_extraction_agent["api_key"],
621
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
622
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
623
+ base_url=config.post_extraction_agent.get("base_url"),
624
+ combine_output=bool(config.post_extraction_agent_save_to_file),
625
+ output_file=config.post_extraction_agent_save_to_file,
626
+ custom_transform_function=config.post_agent_transformer_function
627
+ )
628
+ agent = PostExtractionAgent(post_config)
629
+
630
+ for result in results:
631
+ if result.status == "success":
632
+ try:
633
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
634
+ progress.update(llm_task, advance=1)
635
+ except Exception as e:
636
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
+
609
638
  # Calculate final statistics
610
639
  final_successful = len([r for r in results if r.status == "success"])
611
640
  final_failed = len([r for r in results if r.status == "failed"])
641
+ llm_successful = len([r for r in results if r.extraction_result is not None])
612
642
 
613
643
  # Update retry stats
614
644
  self._retry_stats = {
@@ -616,7 +646,7 @@ class SpiderForce4AI:
616
646
  "failure_ratio": failure_ratio,
617
647
  "retry_successful": retry_successful if initial_failed > 0 else 0,
618
648
  "retry_failed": final_failed,
619
- "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
649
+ "llm_successful": llm_successful
620
650
  }
621
651
 
622
652
  # Print summary
@@ -894,4 +924,4 @@ class SpiderForce4AI:
894
924
  # Version info
895
925
  #__version__ = "2.3.1"
896
926
  #__author__ = "Piotr Tamulewicz"
897
- #__email__ = "pt@petertam.pro"
927
+ #__email__ = "pt@petertam.pro"
@@ -176,23 +176,38 @@ class PostExtractionAgent:
176
176
  for msg in self.config.messages
177
177
  ]
178
178
 
179
- # Make LLM request
180
- response = await completion(
181
- model=self.config.model,
182
- messages=messages,
183
- max_tokens=self.config.max_tokens,
184
- temperature=self.config.temperature,
185
- api_key=self.config.api_key,
186
- api_base=self.config.base_url
187
- )
179
+ # Make LLM request with retries
180
+ max_retries = 3
181
+ retry_delay = 1.0
182
+ last_error = None
188
183
 
189
- # Parse response
190
- try:
191
- extracted_data = json.loads(response.choices[0].message.content)
192
- self.buffer.remove_request(url) # Remove from buffer if successful
193
- return extracted_data
194
- except json.JSONDecodeError as e:
195
- raise ValueError(f"Invalid JSON response from LLM: {e}")
184
+ for attempt in range(max_retries):
185
+ try:
186
+ response = await completion(
187
+ model=self.config.model,
188
+ messages=messages,
189
+ max_tokens=self.config.max_tokens,
190
+ temperature=self.config.temperature,
191
+ api_key=self.config.api_key,
192
+ api_base=self.config.base_url
193
+ )
194
+
195
+ # Parse response
196
+ extracted_data = json.loads(response.choices[0].message.content)
197
+ self.buffer.remove_request(url) # Remove from buffer if successful
198
+ return extracted_data
199
+
200
+ except json.JSONDecodeError as e:
201
+ last_error = f"Invalid JSON response from LLM: {e}"
202
+ if attempt < max_retries - 1:
203
+ await asyncio.sleep(retry_delay * (attempt + 1))
204
+ except Exception as e:
205
+ last_error = str(e)
206
+ if attempt < max_retries - 1:
207
+ await asyncio.sleep(retry_delay * (attempt + 1))
208
+
209
+ # If we get here, all retries failed
210
+ raise Exception(last_error)
196
211
 
197
212
  except Exception as e:
198
213
  logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
256
271
  "failed_requests": len(self.buffer.get_failed_requests()),
257
272
  "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
258
273
  "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
259
- }
274
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4
3
+ Version: 2.4.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes