spiderforce4ai 2.4.2__py3-none-any.whl → 2.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
312
312
  config=config.to_dict()
313
313
  )
314
314
 
315
- # Handle post-extraction if configured
316
- if config.post_extraction_agent:
317
- try:
318
- post_config = PostExtractionConfig(
319
- model=config.post_extraction_agent["model"],
320
- messages=config.post_extraction_agent["messages"],
321
- api_key=config.post_extraction_agent["api_key"],
322
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
323
- temperature=config.post_extraction_agent.get("temperature", 0.7),
324
- base_url=config.post_extraction_agent.get("base_url"),
325
- combine_output=bool(config.post_extraction_agent_save_to_file),
326
- output_file=config.post_extraction_agent_save_to_file,
327
- custom_transform_function=config.post_agent_transformer_function
328
- )
329
-
330
- agent = PostExtractionAgent(post_config)
331
- extraction_result = asyncio.run(agent.process_content(url, markdown))
332
- if extraction_result:
333
- result.extraction_result = extraction_result
334
- except Exception as e:
335
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
336
-
337
315
  # Send webhook for successful result
338
316
  _send_webhook_sync(result, config)
339
317
 
@@ -608,7 +586,7 @@ class SpiderForce4AI:
608
586
  for result in results:
609
587
  if result.status == "success":
610
588
  try:
611
- result.extraction_result = await agent.process_content(result.url, result.markdown)
589
+ result.extraction_result = agent.process_content(result.url, result.markdown)
612
590
  progress.update(llm_task, advance=1)
613
591
  except Exception as e:
614
592
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -733,13 +711,48 @@ class SpiderForce4AI:
733
711
  TextColumn("({task.completed}/{task.total})"),
734
712
  ) as progress:
735
713
  task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
736
-
714
+
737
715
  for result in pool.imap_unordered(_process_url_parallel, process_args):
738
716
  results.append(result)
739
717
  progress.update(task, advance=1)
740
718
  status = "✓" if result.status == "success" else "✗"
741
719
  progress.description = f"[cyan]Last: {status} {result.url}"
742
720
 
721
+ # Process LLM requests sequentially after all crawling is complete
722
+ if config.post_extraction_agent:
723
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
724
+ successful_results = [r for r in results if r.status == "success"]
725
+
726
+ with Progress(
727
+ SpinnerColumn(),
728
+ TextColumn("[progress.description]{task.description}"),
729
+ BarColumn(),
730
+ TaskProgressColumn(),
731
+ ) as progress:
732
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
733
+
734
+ post_config = PostExtractionConfig(
735
+ model=config.post_extraction_agent["model"],
736
+ messages=config.post_extraction_agent["messages"],
737
+ api_key=config.post_extraction_agent["api_key"],
738
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
739
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
740
+ base_url=config.post_extraction_agent.get("base_url"),
741
+ combine_output=bool(config.post_extraction_agent_save_to_file),
742
+ output_file=config.post_extraction_agent_save_to_file,
743
+ custom_transform_function=config.post_agent_transformer_function
744
+ )
745
+ agent = PostExtractionAgent(post_config)
746
+
747
+ for result in successful_results:
748
+ try:
749
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
750
+ progress.update(llm_task, advance=1)
751
+ except Exception as e:
752
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
753
+ time.sleep(1) # Add delay after error
754
+ time.sleep(0.5) # Rate limiting between requests
755
+
743
756
  # Calculate statistics and handle retries
744
757
  failed_results = [r for r in results if r.status == "failed"]
745
758
  initial_failed = len(failed_results)
@@ -831,31 +844,44 @@ class SpiderForce4AI:
831
844
  if result.status == "success" and config.output_dir and result.markdown:
832
845
  _save_markdown_sync(result.url, result.markdown, config)
833
846
 
834
- # Handle post-extraction if configured
835
- if config.post_extraction_agent and result.status == "success":
836
- try:
837
- post_config = PostExtractionConfig(
838
- model=config.post_extraction_agent["model"],
839
- messages=config.post_extraction_agent["messages"],
840
- api_key=config.post_extraction_agent["api_key"],
841
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
842
- temperature=config.post_extraction_agent.get("temperature", 0.7),
843
- base_url=config.post_extraction_agent.get("base_url"),
844
- combine_output=bool(config.post_extraction_agent_save_to_file),
845
- output_file=config.post_extraction_agent_save_to_file,
846
- custom_transform_function=config.post_agent_transformer_function
847
- )
848
-
849
- agent = PostExtractionAgent(post_config)
850
- extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
851
- if extraction_result:
852
- result.extraction_result = extraction_result
853
- except Exception as e:
854
- console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
855
-
856
847
  # Send webhook if configured
857
848
  _send_webhook_sync(result, config)
858
849
  results.append(result)
850
+
851
+ # Process LLM requests sequentially after all crawling is complete
852
+ if config.post_extraction_agent:
853
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
854
+ successful_results = [r for r in results if r.status == "success"]
855
+
856
+ with Progress(
857
+ SpinnerColumn(),
858
+ TextColumn("[progress.description]{task.description}"),
859
+ BarColumn(),
860
+ TaskProgressColumn(),
861
+ ) as progress:
862
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
863
+
864
+ post_config = PostExtractionConfig(
865
+ model=config.post_extraction_agent["model"],
866
+ messages=config.post_extraction_agent["messages"],
867
+ api_key=config.post_extraction_agent["api_key"],
868
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
869
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
870
+ base_url=config.post_extraction_agent.get("base_url"),
871
+ combine_output=bool(config.post_extraction_agent_save_to_file),
872
+ output_file=config.post_extraction_agent_save_to_file,
873
+ custom_transform_function=config.post_agent_transformer_function
874
+ )
875
+ agent = PostExtractionAgent(post_config)
876
+
877
+ for result in successful_results:
878
+ try:
879
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
880
+ progress.update(llm_task, advance=1)
881
+ except Exception as e:
882
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
883
+ time.sleep(1) # Add delay after error
884
+ time.sleep(0.5) # Rate limiting between requests
859
885
 
860
886
  # Calculate statistics
861
887
  successful = len([r for r in results if r.status == "success"])
@@ -164,12 +164,9 @@ class PostExtractionAgent:
164
164
  self.config.output_file.rename(backup_path)
165
165
  self.config.output_file.touch()
166
166
 
167
- async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
167
+ def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
168
168
  """Process a single piece of content through the LLM."""
169
169
  try:
170
- # Apply rate limiting
171
- await self.rate_limiter.acquire()
172
-
173
170
  # Replace placeholder in messages with actual content
174
171
  messages = [
175
172
  {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
@@ -183,7 +180,8 @@ class PostExtractionAgent:
183
180
 
184
181
  for attempt in range(max_retries):
185
182
  try:
186
- response = await completion(
183
+ # Call completion synchronously
184
+ response = completion(
187
185
  model=self.config.model,
188
186
  messages=messages,
189
187
  max_tokens=self.config.max_tokens,
@@ -200,11 +198,11 @@ class PostExtractionAgent:
200
198
  except json.JSONDecodeError as e:
201
199
  last_error = f"Invalid JSON response from LLM: {e}"
202
200
  if attempt < max_retries - 1:
203
- await asyncio.sleep(retry_delay * (attempt + 1))
201
+ time.sleep(retry_delay * (attempt + 1))
204
202
  except Exception as e:
205
203
  last_error = str(e)
206
204
  if attempt < max_retries - 1:
207
- await asyncio.sleep(retry_delay * (attempt + 1))
205
+ time.sleep(retry_delay * (attempt + 1))
208
206
 
209
207
  # If we get here, all retries failed
210
208
  raise Exception(last_error)
@@ -214,6 +212,20 @@ class PostExtractionAgent:
214
212
  self.buffer.add_failed_request(url, content, str(e))
215
213
  return None
216
214
 
215
+ def _save_result_sync(self, url: str, result: Dict) -> None:
216
+ """Save individual or combined results synchronously."""
217
+ try:
218
+ if self.config.combine_output and self.config.output_file:
219
+ self.results[url] = result
220
+ with open(self.config.output_file, 'w') as f:
221
+ json.dump(self.results, f, indent=2)
222
+ elif not self.config.combine_output and self.config.output_file:
223
+ individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
224
+ with open(individual_file, 'w') as f:
225
+ json.dump(result, f, indent=2)
226
+ except Exception as e:
227
+ logger.error(f"Error saving results for {url}: {str(e)}")
228
+
217
229
  async def _save_result(self, url: str, result: Dict) -> None:
218
230
  """Save individual or combined results."""
219
231
  try:
@@ -228,10 +240,10 @@ class PostExtractionAgent:
228
240
  except Exception as e:
229
241
  logger.error(f"Error saving results for {url}: {str(e)}")
230
242
 
231
- async def process_content(self, url: str, content: str) -> Optional[Dict]:
243
+ def process_content(self, url: str, content: str) -> Optional[Dict]:
232
244
  """Process content with retry mechanism."""
233
245
  for attempt in range(self.config.max_retries):
234
- result = await self._process_single_content(url, content)
246
+ result = self._process_single_content(url, content)
235
247
  if result:
236
248
  # Apply custom transformation if provided
237
249
  if self.config.custom_transform_function:
@@ -240,12 +252,13 @@ class PostExtractionAgent:
240
252
  except Exception as e:
241
253
  logger.error(f"Error in custom transform for {url}: {str(e)}")
242
254
 
243
- await self._save_result(url, result)
255
+ # Save result synchronously
256
+ self._save_result_sync(url, result)
244
257
  return result
245
258
 
246
259
  # Wait before retry
247
260
  if attempt < self.config.max_retries - 1:
248
- await asyncio.sleep(self.config.retry_delay)
261
+ time.sleep(self.config.retry_delay)
249
262
 
250
263
  return None
251
264
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.2
3
+ Version: 2.4.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
2
+ spiderforce4ai/post_extraction_agent.py,sha256=t9KxjuNw16-6kige6ULPLyykNkiGmKhpCi8QjskdaTk,11959
3
+ spiderforce4ai-2.4.5.dist-info/METADATA,sha256=q3VBuGb5wxsi9OPkzEMwFMyg9f_vT2RamWYIgu2JbLc,9012
4
+ spiderforce4ai-2.4.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.5.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.5.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=6hqYztIqL_jRuKmQOGnap2-hP8Lq1YXarUQXTFwIVxY,40841
2
- spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
3
- spiderforce4ai-2.4.2.dist-info/METADATA,sha256=hyIp437hoWVVkbN88P6yNcKwvkvf2NpP6fyOsWxhM_I,9012
4
- spiderforce4ai-2.4.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.2.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.2.dist-info/RECORD,,