spiderforce4ai-2.4-py3-none-any.whl → spiderforce4ai-2.4.2-py3-none-any.whl

@@ -460,28 +460,6 @@ class SpiderForce4AI:
  if config.output_dir:
      await _save_markdown_async(url, markdown, config)

- # Handle post-extraction if configured
- if config.post_extraction_agent and result.status == "success":
-     try:
-         post_config = PostExtractionConfig(
-             model=config.post_extraction_agent["model"],
-             messages=config.post_extraction_agent["messages"],
-             api_key=config.post_extraction_agent["api_key"],
-             max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
-             temperature=config.post_extraction_agent.get("temperature", 0.7),
-             base_url=config.post_extraction_agent.get("base_url"),
-             combine_output=bool(config.post_extraction_agent_save_to_file),
-             output_file=config.post_extraction_agent_save_to_file,
-             custom_transform_function=config.post_agent_transformer_function
-         )
-
-         agent = PostExtractionAgent(post_config)
-         extraction_result = await agent.process_content(url, markdown)
-         if extraction_result:
-             result.extraction_result = extraction_result
-     except Exception as e:
-         console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
-
  await _send_webhook_async(result, config)

  self.crawl_results.append(result)
@@ -576,8 +554,11 @@ class SpiderForce4AI:
  # Set up concurrency control
  semaphore = asyncio.Semaphore(config.max_concurrent_requests)

+ # Semaphore for crawling
+ crawl_semaphore = asyncio.Semaphore(config.max_concurrent_requests)
+
  async def crawl_with_semaphore(url):
-     async with semaphore:
+     async with crawl_semaphore:
          result = await crawl_with_progress(url)
          await asyncio.sleep(config.request_delay)
          return result
@@ -606,6 +587,63 @@ class SpiderForce4AI:
      results[i] = retry_result
      break

+ # Process LLM requests sequentially after all crawling is complete
+ if config.post_extraction_agent:
+     console.print("\n[cyan]Processing content with LLM...[/cyan]")
+     llm_task = progress.add_task("[cyan]LLM Processing...", total=len([r for r in results if r.status == "success"]))
+
+     post_config = PostExtractionConfig(
+         model=config.post_extraction_agent["model"],
+         messages=config.post_extraction_agent["messages"],
+         api_key=config.post_extraction_agent["api_key"],
+         max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+         temperature=config.post_extraction_agent.get("temperature", 0.7),
+         base_url=config.post_extraction_agent.get("base_url"),
+         combine_output=bool(config.post_extraction_agent_save_to_file),
+         output_file=config.post_extraction_agent_save_to_file,
+         custom_transform_function=config.post_agent_transformer_function
+     )
+     agent = PostExtractionAgent(post_config)
+
+     for result in results:
+         if result.status == "success":
+             try:
+                 result.extraction_result = await agent.process_content(result.url, result.markdown)
+                 progress.update(llm_task, advance=1)
+             except Exception as e:
+                 console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+
+ # Process LLM requests sequentially after all crawling is complete
+ llm_successful = 0
+ if config.post_extraction_agent:
+     console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
+     successful_results = [r for r in results if r.status == "success"]
+     llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
+
+     post_config = PostExtractionConfig(
+         model=config.post_extraction_agent["model"],
+         messages=config.post_extraction_agent["messages"],
+         api_key=config.post_extraction_agent["api_key"],
+         max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
+         temperature=config.post_extraction_agent.get("temperature", 0.7),
+         base_url=config.post_extraction_agent.get("base_url"),
+         combine_output=bool(config.post_extraction_agent_save_to_file),
+         output_file=config.post_extraction_agent_save_to_file,
+         custom_transform_function=config.post_agent_transformer_function
+     )
+     agent = PostExtractionAgent(post_config)
+
+     for result in successful_results:
+         try:
+             result.extraction_result = await agent.process_content(result.url, result.markdown)
+             if result.extraction_result:
+                 llm_successful += 1
+             progress.update(llm_task, advance=1)
+         except Exception as e:
+             console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
+             await asyncio.sleep(1)  # Add delay after error
+         await asyncio.sleep(0.5)  # Rate limiting between requests
+
  # Calculate final statistics
  final_successful = len([r for r in results if r.status == "success"])
  final_failed = len([r for r in results if r.status == "failed"])
@@ -616,7 +654,7 @@ class SpiderForce4AI:
      "failure_ratio": failure_ratio,
      "retry_successful": retry_successful if initial_failed > 0 else 0,
      "retry_failed": final_failed,
-     "post_extraction_successful": len(post_extraction_results) if post_extraction_results else 0
+     "llm_successful": llm_successful
  }

  # Print summary
@@ -894,4 +932,4 @@ class SpiderForce4AI:
  # Version info
  #__version__ = "2.3.1"
  #__author__ = "Piotr Tamulewicz"
- #__email__ = "pt@petertam.pro"
+ #__email__ = "pt@petertam.pro"
@@ -176,23 +176,38 @@ class PostExtractionAgent:
          for msg in self.config.messages
      ]

-     # Make LLM request
-     response = await completion(
-         model=self.config.model,
-         messages=messages,
-         max_tokens=self.config.max_tokens,
-         temperature=self.config.temperature,
-         api_key=self.config.api_key,
-         api_base=self.config.base_url
-     )
+     # Make LLM request with retries
+     max_retries = 3
+     retry_delay = 1.0
+     last_error = None

-     # Parse response
-     try:
-         extracted_data = json.loads(response.choices[0].message.content)
-         self.buffer.remove_request(url)  # Remove from buffer if successful
-         return extracted_data
-     except json.JSONDecodeError as e:
-         raise ValueError(f"Invalid JSON response from LLM: {e}")
+     for attempt in range(max_retries):
+         try:
+             response = await completion(
+                 model=self.config.model,
+                 messages=messages,
+                 max_tokens=self.config.max_tokens,
+                 temperature=self.config.temperature,
+                 api_key=self.config.api_key,
+                 api_base=self.config.base_url
+             )
+
+             # Parse response
+             extracted_data = json.loads(response.choices[0].message.content)
+             self.buffer.remove_request(url)  # Remove from buffer if successful
+             return extracted_data
+
+         except json.JSONDecodeError as e:
+             last_error = f"Invalid JSON response from LLM: {e}"
+             if attempt < max_retries - 1:
+                 await asyncio.sleep(retry_delay * (attempt + 1))
+         except Exception as e:
+             last_error = str(e)
+             if attempt < max_retries - 1:
+                 await asyncio.sleep(retry_delay * (attempt + 1))
+
+     # If we get here, all retries failed
+     raise Exception(last_error)

  except Exception as e:
      logger.error(f"Error processing {url}: {str(e)}")
@@ -256,4 +271,4 @@ class PostExtractionAgent:
      "failed_requests": len(self.buffer.get_failed_requests()),
      "retryable_requests": len(self.buffer.get_retryable_requests(self.config.max_retries)),
      "success_rate": len(self.results) / (len(self.results) + len(self.buffer.get_failed_requests())) * 100 if self.results else 0
- }
+ }
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: spiderforce4ai
- Version: 2.4
+ Version: 2.4.2
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
  Home-page: https://petertam.pro
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
+ spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
+ spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
+ spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
+ spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+ spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-2.4.2.dist-info/RECORD,,
@@ -1,7 +0,0 @@
- spiderforce4ai/__init__.py,sha256=JClWyqGGCVC6yxuK4TpJ7a-7iP6ueD20oKc0ERHxnyU,38701
- spiderforce4ai/post_extraction_agent.py,sha256=yZ17xdOtkNMDRGqqudNBZIb6N9bcsjOwbzPB6D5kJHg,10540
- spiderforce4ai-2.4.dist-info/METADATA,sha256=UNtth74KAHCNOngozhN2es3z4vY6J7SiKfTaIi0fYTI,9010
- spiderforce4ai-2.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-2.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
- spiderforce4ai-2.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-2.4.dist-info/RECORD,,
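
For reference, a minimal usage sketch of the post-extraction configuration surfaced by this diff. The dictionary keys (model, messages, api_key, max_tokens, temperature, base_url) and the post_extraction_agent_save_to_file / post_agent_transformer_function fields appear verbatim in the code above; the CrawlConfig constructor, the crawl_urls_async method name, and the service URL are assumptions made for illustration only, not confirmed by this diff.

# Hedged sketch only: CrawlConfig, crawl_urls_async and the service URL are assumed;
# the post_extraction_agent keys mirror the fields read by the code in this diff.
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig  # CrawlConfig name assumed

def add_source_url(url, data):
    # Hook passed as post_agent_transformer_function above.
    data["source_url"] = url
    return data

config = CrawlConfig(
    output_dir="./markdown",
    max_concurrent_requests=5,
    request_delay=0.5,
    post_extraction_agent={
        "model": "gpt-4o-mini",   # any LiteLLM-compatible model id
        "messages": [{"role": "system",
                      "content": "Return the page title and a one-line summary as JSON."}],
        "api_key": "sk-...",
        "max_tokens": 1000,       # defaults shown in the diff
        "temperature": 0.7,
        "base_url": None,         # optional custom endpoint
    },
    post_extraction_agent_save_to_file="extraction_results.json",
    post_agent_transformer_function=add_source_url,
)

async def main():
    crawler = SpiderForce4AI("http://localhost:3004")  # service URL assumed
    results = await crawler.crawl_urls_async(["https://example.com"], config)  # method name assumed
    for r in results:
        print(r.url, r.status, getattr(r, "extraction_result", None))

asyncio.run(main())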