spiderforce4ai 2.4.1__py3-none-any.whl → 2.4.3__py3-none-any.whl

This diff represents the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
312
312
  config=config.to_dict()
313
313
  )
314
314
 
315
- # Handle post-extraction if configured
316
- if config.post_extraction_agent:
317
- try:
318
- post_config = PostExtractionConfig(
319
- model=config.post_extraction_agent["model"],
320
- messages=config.post_extraction_agent["messages"],
321
- api_key=config.post_extraction_agent["api_key"],
322
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
323
- temperature=config.post_extraction_agent.get("temperature", 0.7),
324
- base_url=config.post_extraction_agent.get("base_url"),
325
- combine_output=bool(config.post_extraction_agent_save_to_file),
326
- output_file=config.post_extraction_agent_save_to_file,
327
- custom_transform_function=config.post_agent_transformer_function
328
- )
329
-
330
- agent = PostExtractionAgent(post_config)
331
- extraction_result = asyncio.run(agent.process_content(url, markdown))
332
- if extraction_result:
333
- result.extraction_result = extraction_result
334
- except Exception as e:
335
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
336
-
337
315
  # Send webhook for successful result
338
316
  _send_webhook_sync(result, config)
339
317
 
@@ -460,28 +438,6 @@ class SpiderForce4AI:
460
438
  if config.output_dir:
461
439
  await _save_markdown_async(url, markdown, config)
462
440
 
463
- # Handle post-extraction if configured
464
- if config.post_extraction_agent and result.status == "success":
465
- try:
466
- post_config = PostExtractionConfig(
467
- model=config.post_extraction_agent["model"],
468
- messages=config.post_extraction_agent["messages"],
469
- api_key=config.post_extraction_agent["api_key"],
470
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
471
- temperature=config.post_extraction_agent.get("temperature", 0.7),
472
- base_url=config.post_extraction_agent.get("base_url"),
473
- combine_output=bool(config.post_extraction_agent_save_to_file),
474
- output_file=config.post_extraction_agent_save_to_file,
475
- custom_transform_function=config.post_agent_transformer_function
476
- )
477
-
478
- agent = PostExtractionAgent(post_config)
479
- extraction_result = await agent.process_content(url, markdown)
480
- if extraction_result:
481
- result.extraction_result = extraction_result
482
- except Exception as e:
483
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
484
-
485
441
  await _send_webhook_async(result, config)
486
442
 
487
443
  self.crawl_results.append(result)
@@ -635,10 +591,40 @@ class SpiderForce4AI:
635
591
  except Exception as e:
636
592
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
637
593
 
594
+ # Process LLM requests sequentially after all crawling is complete
595
+ llm_successful = 0
596
+ if config.post_extraction_agent:
597
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
598
+ successful_results = [r for r in results if r.status == "success"]
599
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
600
+
601
+ post_config = PostExtractionConfig(
602
+ model=config.post_extraction_agent["model"],
603
+ messages=config.post_extraction_agent["messages"],
604
+ api_key=config.post_extraction_agent["api_key"],
605
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
606
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
607
+ base_url=config.post_extraction_agent.get("base_url"),
608
+ combine_output=bool(config.post_extraction_agent_save_to_file),
609
+ output_file=config.post_extraction_agent_save_to_file,
610
+ custom_transform_function=config.post_agent_transformer_function
611
+ )
612
+ agent = PostExtractionAgent(post_config)
613
+
614
+ for result in successful_results:
615
+ try:
616
+ result.extraction_result = await agent.process_content(result.url, result.markdown)
617
+ if result.extraction_result:
618
+ llm_successful += 1
619
+ progress.update(llm_task, advance=1)
620
+ except Exception as e:
621
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
622
+ await asyncio.sleep(1) # Add delay after error
623
+ await asyncio.sleep(0.5) # Rate limiting between requests
624
+
638
625
  # Calculate final statistics
639
626
  final_successful = len([r for r in results if r.status == "success"])
640
627
  final_failed = len([r for r in results if r.status == "failed"])
641
- llm_successful = len([r for r in results if r.extraction_result is not None])
642
628
 
643
629
  # Update retry stats
644
630
  self._retry_stats = {
@@ -725,13 +711,48 @@ class SpiderForce4AI:
725
711
  TextColumn("({task.completed}/{task.total})"),
726
712
  ) as progress:
727
713
  task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
728
-
714
+
729
715
  for result in pool.imap_unordered(_process_url_parallel, process_args):
730
716
  results.append(result)
731
717
  progress.update(task, advance=1)
732
718
  status = "✓" if result.status == "success" else "✗"
733
719
  progress.description = f"[cyan]Last: {status} {result.url}"
734
720
 
721
+ # Process LLM requests sequentially after all crawling is complete
722
+ if config.post_extraction_agent:
723
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
724
+ successful_results = [r for r in results if r.status == "success"]
725
+
726
+ with Progress(
727
+ SpinnerColumn(),
728
+ TextColumn("[progress.description]{task.description}"),
729
+ BarColumn(),
730
+ TaskProgressColumn(),
731
+ ) as progress:
732
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
733
+
734
+ post_config = PostExtractionConfig(
735
+ model=config.post_extraction_agent["model"],
736
+ messages=config.post_extraction_agent["messages"],
737
+ api_key=config.post_extraction_agent["api_key"],
738
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
739
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
740
+ base_url=config.post_extraction_agent.get("base_url"),
741
+ combine_output=bool(config.post_extraction_agent_save_to_file),
742
+ output_file=config.post_extraction_agent_save_to_file,
743
+ custom_transform_function=config.post_agent_transformer_function
744
+ )
745
+ agent = PostExtractionAgent(post_config)
746
+
747
+ for result in successful_results:
748
+ try:
749
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
750
+ progress.update(llm_task, advance=1)
751
+ except Exception as e:
752
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
753
+ time.sleep(1) # Add delay after error
754
+ time.sleep(0.5) # Rate limiting between requests
755
+
735
756
  # Calculate statistics and handle retries
736
757
  failed_results = [r for r in results if r.status == "failed"]
737
758
  initial_failed = len(failed_results)
@@ -823,31 +844,44 @@ class SpiderForce4AI:
823
844
  if result.status == "success" and config.output_dir and result.markdown:
824
845
  _save_markdown_sync(result.url, result.markdown, config)
825
846
 
826
- # Handle post-extraction if configured
827
- if config.post_extraction_agent and result.status == "success":
828
- try:
829
- post_config = PostExtractionConfig(
830
- model=config.post_extraction_agent["model"],
831
- messages=config.post_extraction_agent["messages"],
832
- api_key=config.post_extraction_agent["api_key"],
833
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
834
- temperature=config.post_extraction_agent.get("temperature", 0.7),
835
- base_url=config.post_extraction_agent.get("base_url"),
836
- combine_output=bool(config.post_extraction_agent_save_to_file),
837
- output_file=config.post_extraction_agent_save_to_file,
838
- custom_transform_function=config.post_agent_transformer_function
839
- )
840
-
841
- agent = PostExtractionAgent(post_config)
842
- extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
843
- if extraction_result:
844
- result.extraction_result = extraction_result
845
- except Exception as e:
846
- console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
847
-
848
847
  # Send webhook if configured
849
848
  _send_webhook_sync(result, config)
850
849
  results.append(result)
850
+
851
+ # Process LLM requests sequentially after all crawling is complete
852
+ if config.post_extraction_agent:
853
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
854
+ successful_results = [r for r in results if r.status == "success"]
855
+
856
+ with Progress(
857
+ SpinnerColumn(),
858
+ TextColumn("[progress.description]{task.description}"),
859
+ BarColumn(),
860
+ TaskProgressColumn(),
861
+ ) as progress:
862
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
863
+
864
+ post_config = PostExtractionConfig(
865
+ model=config.post_extraction_agent["model"],
866
+ messages=config.post_extraction_agent["messages"],
867
+ api_key=config.post_extraction_agent["api_key"],
868
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
869
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
870
+ base_url=config.post_extraction_agent.get("base_url"),
871
+ combine_output=bool(config.post_extraction_agent_save_to_file),
872
+ output_file=config.post_extraction_agent_save_to_file,
873
+ custom_transform_function=config.post_agent_transformer_function
874
+ )
875
+ agent = PostExtractionAgent(post_config)
876
+
877
+ for result in successful_results:
878
+ try:
879
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
880
+ progress.update(llm_task, advance=1)
881
+ except Exception as e:
882
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
883
+ time.sleep(1) # Add delay after error
884
+ time.sleep(0.5) # Rate limiting between requests
851
885
 
852
886
  # Calculate statistics
853
887
  successful = len([r for r in results if r.status == "success"])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.1
3
+ Version: 2.4.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
2
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
+ spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
4
+ spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=IjoJSE-7PX8zxBF0Pl1ELQUraLU3agAtY_J6NvQSPf4,40533
2
- spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
- spiderforce4ai-2.4.1.dist-info/METADATA,sha256=xVm-JdLz6Kx73Bi0DA1QG6D9Ya_OLqWd_80PNWHXLsA,9012
4
- spiderforce4ai-2.4.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.1.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.1.dist-info/RECORD,,