PyPI - spiderforce4ai - Versions diffs - 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl - Mend

spiderforce4ai 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

spiderforce4ai/__init__.py CHANGED Viewed

@@ -460,28 +460,6 @@ class SpiderForce4AI:
                     if config.output_dir:
                         await _save_markdown_async(url, markdown, config)
-                    # Handle post-extraction if configured
-                    if config.post_extraction_agent and result.status == "success":
-                        try:
-                            post_config = PostExtractionConfig(
-                                model=config.post_extraction_agent["model"],
-                                messages=config.post_extraction_agent["messages"],
-                                api_key=config.post_extraction_agent["api_key"],
-                                max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-                                temperature=config.post_extraction_agent.get("temperature", 0.7),
-                                base_url=config.post_extraction_agent.get("base_url"),
-                                combine_output=bool(config.post_extraction_agent_save_to_file),
-                                output_file=config.post_extraction_agent_save_to_file,
-                                custom_transform_function=config.post_agent_transformer_function
-                            )
-                            agent = PostExtractionAgent(post_config)
-                            extraction_result = await agent.process_content(url, markdown)
-                            if extraction_result:
-                                result.extraction_result = extraction_result
-                        except Exception as e:
-                            console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
                     await _send_webhook_async(result, config)
                 self.crawl_results.append(result)
@@ -576,8 +554,11 @@ class SpiderForce4AI:
             # Set up concurrency control
             semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+            # Semaphore for crawling
+            crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
             async def crawl_with_semaphore(url):
-                async with semaphore:
+                async with crawl_semaphore:
                     result = await crawl_with_progress(url)
                     await asyncio.sleep(config.request_delay)
                     return result
@@ -606,6 +587,63 @@ class SpiderForce4AI:
                             results[i] = retry_result
                             break
+            # Process LLM requests sequentially after all crawling is complete
+            if config.post_extraction_agent:
+                console.print("\n[cyan]Processing content with LLM...[/cyan]")
+                llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+                for result in results:
+                    if result.status == "success":
+                        try:
+                            result.extraction_result = await agent.process_content(result.url, result.markdown)
+                            progress.update(llm_task, advance=1)
+                        except Exception as e:
+                            console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+            # Process LLM requests sequentially after all crawling is complete
+            llm_successful = 0
+            if config.post_extraction_agent:
+                console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+                successful_results = [r for r in results if r.status == "success"]
+                llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+                post_config = PostExtractionConfig(
+                    model=config.post_extraction_agent["model"],
+                    messages=config.post_extraction_agent["messages"],
+                    api_key=config.post_extraction_agent["api_key"],
+                    max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+                    temperature=config.post_extraction_agent.get("temperature", 0.7),
+                    base_url=config.post_extraction_agent.get("base_url"),
+                    combine_output=bool(config.post_extraction_agent_save_to_file),
+                    output_file=config.post_extraction_agent_save_to_file,
+                    custom_transform_function=config.post_agent_transformer_function
+                )
+                agent = PostExtractionAgent(post_config)
+                for result in successful_results:
+                    try:
+                        result.extraction_result = await agent.process_content(result.url, result.markdown)
+                        if result.extraction_result:
+                            llm_successful += 1
+                        progress.update(llm_task, advance=1)
+                    except Exception as e:
+                        console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+                        await asyncio.sleep(1)  # Add delay after error
+                    await asyncio.sleep(0.5)  # Rate limiting between requests
             # Calculate final statistics
             final_successful = len([r for r in results if r.status == "success"])
             final_failed = len([r for r in results if r.status == "failed"])
@@ -616,7 +654,7 @@ class SpiderForce4AI:
                 "failure_ratio": failure_ratio,
                 "retry_successful": retry_successful if initial_failed > 0 else 0,
                 "retry_failed": final_failed,
-                "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+                "llm_successful": llm_successful
             }
             # Print summary
@@ -894,4 +932,4 @@ class SpiderForce4AI:
 # Version info
 #__version__ = "2.3.1"
 #__author__ = "Piotr Tamulewicz"
-#__email__ = "pt@petertam.pro"
+#__email__ = "pt@petertam.pro"

spiderforce4ai/post_extraction_agent.py CHANGED Viewed

@@ -176,23 +176,38 @@ class PostExtractionAgent:
                 for msg in self.config.messages
             ]
-            # Make LLM request
-            response = await completion(
-                model=self.config.model,
-                messages=messages,
-                max_tokens=self.config.max_tokens,
-                temperature=self.config.temperature,
-                api_key=self.config.api_key,
-                api_base=self.config.base_url
-            )
+            # Make LLM request with retries
+            max_retries = 3
+            retry_delay = 1.0
+            last_error = None
-            # Parse response
-            try:
-                extracted_data = json.loads(response.choices[0].message.content)
-                self.buffer.remove_request(url)  # Remove from buffer if successful
-                return extracted_data
-            except json.JSONDecodeError as e:
-                raise ValueError(f"Invalid JSON response from LLM: {e}")
+            for attempt in range(max_retries):
+                try:
+                    response = await completion(
+                        model=self.config.model,
+                        messages=messages,
+                        max_tokens=self.config.max_tokens,
+                        temperature=self.config.temperature,
+                        api_key=self.config.api_key,
+                        api_base=self.config.base_url
+                    )
+                    # Parse response
+                    extracted_data = json.loads(response.choices[0].message.content)
+                    self.buffer.remove_request(url)  # Remove from buffer if successful
+                    return extracted_data
+                except json.JSONDecodeError as e:
+                    last_error = f"Invalid JSON response from LLM: {e}"
+                    if attempt < max_retries - 1:
+                        await asyncio.sleep(retry_delay * (attempt + 1))
+                except Exception as e:
+                    last_error = str(e)
+                    if attempt < max_retries - 1:
+                        await asyncio.sleep(retry_delay * (attempt + 1))
+            # If we get here, all retries failed
+            raise Exception(last_error)
         except Exception as e:
             logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
             "failed_requests": len(self.buffer.get_failed_requests()),
             "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
             "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
-        }
+        }

{spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4
+Version: 2.4.2
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

spiderforce4ai-2.4.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
+spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
+spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
+spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.2.dist-info/RECORD,,

spiderforce4ai-2.4.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
-spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
-spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
-spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.dist-info/RECORD,,

{spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{spiderforce4ai-2.4.dist-info → spiderforce4ai-2.4.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

spiderforce4ai 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl

spiderforce4ai 2.4__py3-none-any.whl → 2.4.2__py3-none-any.whl