PyPI - spiderforce4ai - Versions diffs - 2.4__tar.gz → 2.4.1__tar.gz - Mend

spiderforce4ai 2.4 (tar.gz) → 2.4.1 (tar.gz)

Files changed (14) hide show

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4
+Version: 2.4.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "spiderforce4ai"
-version = "2.4"
+version = "2.4.1"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/setup.py RENAMED Viewed

@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 setup(
     name="spiderforce4ai",
-    version="2.4",
+    version="2.4.1",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/spiderforce4ai/__init__.py RENAMED Viewed

@@ -576,8 +576,11 @@ class SpiderForce4AI:
             # Set up concurrency control
             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            # Semaphore for crawling
+            crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
             async def crawl_with_semaphore(url):
-                async with semaphore:
+                async with crawl_semaphore:
                     result = await crawl_with_progress(url)
                     await asyncio.sleep(config.request_delay)
                     return result
@@ -606,9 +609,36 @@ class SpiderForce4AI:
                             results[i] = retry_result
                             break
+            # Process LLM requests sequentially after all crawling is complete
+            if config.post_extraction_agent:
+                console.print("\n[cyan]Processing content with LLM...[/cyan]")
+                llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+                for result in results:
+                    if result.status == "success":
+                        try:
+                            result.extraction_result = await agent.process_content(result.url, result.markdown)
+                            progress.update(llm_task, advance=1)
+                        except Exception as e:
+                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
             # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
             final_failed = len([r for r in results if r.status == "failed"])
+            llm_successful = len([r for r in results if r.extraction_result is not None])
             # Update retry stats
             self._retry_stats = {
@@ -616,7 +646,7 @@ class SpiderForce4AI:
                 "failure_ratio": failure_ratio,
                 "retry_successful": retry_successful if initial_failed > 0 else 0,
                 "retry_failed": final_failed,
-                "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+                "llm_successful": llm_successful
             }
             # Print summary
@@ -894,4 +924,4 @@ class SpiderForce4AI:
 # Version info
 #__version__ = "2.3.1"
 #__author__ = "Piotr Tamulewicz"
-#__email__ = "pt@petertam.pro"
+#__email__ = "pt@petertam.pro"

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/spiderforce4ai/post_extraction_agent.py RENAMED Viewed

@@ -176,23 +176,38 @@ class PostExtractionAgent:
                 for msg in self.config.messages
             ]
-            # Make LLM request
-            response = await completion(
-                model=self.config.model,
-                messages=messages,
-                max_tokens=self.config.max_tokens,
-                temperature=self.config.temperature,
-                api_key=self.config.api_key,
-                api_base=self.config.base_url
-            )
+            # Make LLM request with retries
+            max_retries = 3
+            retry_delay = 1.0
+            last_error = None
-            # Parse response
-            try:
-                extracted_data = json.loads(response.choices[0].message.content)
-                self.buffer.remove_request(url)  # Remove from buffer if successful
-                return extracted_data
-            except json.JSONDecodeError as e:
-                raise ValueError(f"Invalid JSON response from LLM: {e}")
+            for attempt in range(max_retries):
+                try:
+                    response = await completion(
+                        model=self.config.model,
+                        messages=messages,
+                        max_tokens=self.config.max_tokens,
+                        temperature=self.config.temperature,
+                        api_key=self.config.api_key,
+                        api_base=self.config.base_url
+                    )
+                    # Parse response
+                    extracted_data = json.loads(response.choices[0].message.content)
+                    self.buffer.remove_request(url)  # Remove from buffer if successful
+                    return extracted_data
+                except json.JSONDecodeError as e:
+                    last_error = f"Invalid JSON response from LLM: {e}"
+                    if attempt < max_retries - 1:
+                        await asyncio.sleep(retry_delay * (attempt + 1))
+                except Exception as e:
+                    last_error = str(e)
+                    if attempt < max_retries - 1:
+                        await asyncio.sleep(retry_delay * (attempt + 1))
+            # If we get here, all retries failed
+            raise Exception(last_error)
         except Exception as e:
             logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
             "failed_requests": len(self.buffer.get_failed_requests()),
             "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
             "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
-        }
+        }

{spiderforce4ai-2.4 → spiderforce4ai-2.4.1}/spiderforce4ai.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4
+Version: 2.4.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz