PyPI - spiderforce4ai - Versions diffs - 2.4.2.tar.gz → 2.4.5.tar.gz - Mend

spiderforce4ai 2.4.2.tar.gz → 2.4.5.tar.gz

Files changed (14) hide show

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4.2
+Version: 2.4.5
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "spiderforce4ai"
-version = "2.4.2"
+version = "2.4.5"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/setup.py RENAMED Viewed

@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 setup(
     name="spiderforce4ai",
-    version="2.4.2",
+    version="2.4.5",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/spiderforce4ai/__init__.py RENAMED Viewed

@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             config=config.to_dict()
         )
-        # Handle post-extraction if configured
-        if config.post_extraction_agent:
-            try:
-                post_config = PostExtractionConfig(
-                    model=config.post_extraction_agent["model"],
-                    messages=config.post_extraction_agent["messages"],
-                    api_key=config.post_extraction_agent["api_key"],
-                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                    temperature=config.post_extraction_agent.get("temperature", 0.7),
-                    base_url=config.post_extraction_agent.get("base_url"),
-                    combine_output=bool(config.post_extraction_agent_save_to_file),
-                    output_file=config.post_extraction_agent_save_to_file,
-                    custom_transform_function=config.post_agent_transformer_function
-                )
-                agent = PostExtractionAgent(post_config)
-                extraction_result = asyncio.run(agent.process_content(url, markdown))
-                if extraction_result:
-                    result.extraction_result = extraction_result
-            except Exception as e:
-                console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
         # Send webhook for successful result
         _send_webhook_sync(result, config)
@@ -608,7 +586,7 @@ class SpiderForce4AI:
                 for result in results:
                     if result.status == "success":
                         try:
-                            result.extraction_result = await agent.process_content(result.url, result.markdown)
+                            result.extraction_result = agent.process_content(result.url, result.markdown)
                             progress.update(llm_task, advance=1)
                         except Exception as e:
                             console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -733,13 +711,48 @@ class SpiderForce4AI:
                 TextColumn("({task.completed}/{task.total})"),
             ) as progress:
                 task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
                 for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
                     progress.description = f"[cyan]Last: {status} {result.url}"
+        # Process LLM requests sequentially after all crawling is complete
+        if config.post_extraction_agent:
+            console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+            successful_results = [r for r in results if r.status == "success"]
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TaskProgressColumn(),
+            ) as progress:
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+                for result in successful_results:
+                    try:
+                        result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        time.sleep(1)  # Add delay after error
+                    time.sleep(0.5)  # Rate limiting between requests
         # Calculate statistics and handle retries
         failed_results = [r for r in results if r.status == "failed"]
         initial_failed = len(failed_results)
@@ -831,31 +844,44 @@ class SpiderForce4AI:
                 if result.status == "success" and config.output_dir and result.markdown:
                     _save_markdown_sync(result.url, result.markdown, config)
-                # Handle post-extraction if configured
-                if config.post_extraction_agent and result.status == "success":
-                    try:
-                        post_config = PostExtractionConfig(
-                            model=config.post_extraction_agent["model"],
-                            messages=config.post_extraction_agent["messages"],
-                            api_key=config.post_extraction_agent["api_key"],
-                            max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                            temperature=config.post_extraction_agent.get("temperature", 0.7),
-                            base_url=config.post_extraction_agent.get("base_url"),
-                            combine_output=bool(config.post_extraction_agent_save_to_file),
-                            output_file=config.post_extraction_agent_save_to_file,
-                            custom_transform_function=config.post_agent_transformer_function
-                        )
-                        agent = PostExtractionAgent(post_config)
-                        extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
-                        if extraction_result:
-                            result.extraction_result = extraction_result
-                    except Exception as e:
-                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
                 # Send webhook if configured
                 _send_webhook_sync(result, config)
                 results.append(result)
+            # Process LLM requests sequentially after all crawling is complete
+            if config.post_extraction_agent:
+                console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+                successful_results = [r for r in results if r.status == "success"]
+                with Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    TaskProgressColumn(),
+                ) as progress:
+                    llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+                    post_config = PostExtractionConfig(
+                        model=config.post_extraction_agent["model"],
+                        messages=config.post_extraction_agent["messages"],
+                        api_key=config.post_extraction_agent["api_key"],
+                        max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                        temperature=config.post_extraction_agent.get("temperature", 0.7),
+                        base_url=config.post_extraction_agent.get("base_url"),
+                        combine_output=bool(config.post_extraction_agent_save_to_file),
+                        output_file=config.post_extraction_agent_save_to_file,
+                        custom_transform_function=config.post_agent_transformer_function
+                    )
+                    agent = PostExtractionAgent(post_config)
+                    for result in successful_results:
+                        try:
+                            result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
+                            progress.update(llm_task, advance=1)
+                        except Exception as e:
+                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                            time.sleep(1)  # Add delay after error
+                        time.sleep(0.5)  # Rate limiting between requests
             # Calculate statistics
             successful = len([r for r in results if r.status == "success"])

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/spiderforce4ai/post_extraction_agent.py RENAMED Viewed

@@ -164,12 +164,9 @@ class PostExtractionAgent:
                 self.config.output_file.rename(backup_path)
             self.config.output_file.touch()
-    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+    def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
         try:
-            # Apply rate limiting
-            await self.rate_limiter.acquire()
             # Replace placeholder in messages with actual content
             messages = [
                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
@@ -183,7 +180,8 @@ class PostExtractionAgent:
             for attempt in range(max_retries):
                 try:
-                    response = await completion(
+                    # Call completion synchronously
+                    response = completion(
                         model=self.config.model,
                         messages=messages,
                         max_tokens=self.config.max_tokens,
@@ -200,11 +198,11 @@ class PostExtractionAgent:
                 except json.JSONDecodeError as e:
                     last_error = f"Invalid JSON response from LLM: {e}"
                     if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                        time.sleep(retry_delay * (attempt + 1))
                 except Exception as e:
                     last_error = str(e)
                     if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                        time.sleep(retry_delay * (attempt + 1))
             # If we get here, all retries failed
             raise Exception(last_error)
@@ -214,6 +212,20 @@ class PostExtractionAgent:
             self.buffer.add_failed_request(url, content, str(e))
             return None
+    def _save_result_sync(self, url: str, result: Dict) -> None:
+        """Save individual or combined results synchronously."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                with open(self.config.output_file, 'w') as f:
+                    json.dump(self.results, f, indent=2)
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                with open(individual_file, 'w') as f:
+                    json.dump(result, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
     async def _save_result(self, url: str, result: Dict) -> None:
         """Save individual or combined results."""
         try:
@@ -228,10 +240,10 @@ class PostExtractionAgent:
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")
-    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+    def process_content(self, url: str, content: str) -> Optional[Dict]:
         """Process content with retry mechanism."""
         for attempt in range(self.config.max_retries):
-            result = await self._process_single_content(url, content)
+            result = self._process_single_content(url, content)
             if result:
                 # Apply custom transformation if provided
                 if self.config.custom_transform_function:
@@ -240,12 +252,13 @@ class PostExtractionAgent:
                     except Exception as e:
                         logger.error(f"Error in custom transform for {url}: {str(e)}")
-                await self._save_result(url, result)
+                # Save result synchronously
+                self._save_result_sync(url, result)
                 return result
             # Wait before retry
             if attempt < self.config.max_retries - 1:
-                await asyncio.sleep(self.config.retry_delay)
+                time.sleep(self.config.retry_delay)
         return None

{spiderforce4ai-2.4.2 → spiderforce4ai-2.4.5}/spiderforce4ai.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4.2
+Version: 2.4.5
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz