spiderforce4ai 2.4.2__tar.gz → 2.4.5__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.2
3
+ Version: 2.4.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "2.4.2"
7
+ version = "2.4.5"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
7
7
 
8
8
  setup(
9
9
  name="spiderforce4ai",
10
- version="2.4.2",
10
+ version="2.4.5",
11
11
  author="Piotr Tamulewicz",
12
12
  author_email="pt@petertam.pro",
13
13
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
@@ -312,28 +312,6 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
312
312
  config=config.to_dict()
313
313
  )
314
314
 
315
- # Handle post-extraction if configured
316
- if config.post_extraction_agent:
317
- try:
318
- post_config = PostExtractionConfig(
319
- model=config.post_extraction_agent["model"],
320
- messages=config.post_extraction_agent["messages"],
321
- api_key=config.post_extraction_agent["api_key"],
322
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
323
- temperature=config.post_extraction_agent.get("temperature", 0.7),
324
- base_url=config.post_extraction_agent.get("base_url"),
325
- combine_output=bool(config.post_extraction_agent_save_to_file),
326
- output_file=config.post_extraction_agent_save_to_file,
327
- custom_transform_function=config.post_agent_transformer_function
328
- )
329
-
330
- agent = PostExtractionAgent(post_config)
331
- extraction_result = asyncio.run(agent.process_content(url, markdown))
332
- if extraction_result:
333
- result.extraction_result = extraction_result
334
- except Exception as e:
335
- console.print(f"[red]Error in post-extraction processing for {url}: {str(e)}[/red]")
336
-
337
315
  # Send webhook for successful result
338
316
  _send_webhook_sync(result, config)
339
317
 
@@ -608,7 +586,7 @@ class SpiderForce4AI:
608
586
  for result in results:
609
587
  if result.status == "success":
610
588
  try:
611
- result.extraction_result = await agent.process_content(result.url, result.markdown)
589
+ result.extraction_result = agent.process_content(result.url, result.markdown)
612
590
  progress.update(llm_task, advance=1)
613
591
  except Exception as e:
614
592
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -733,13 +711,48 @@ class SpiderForce4AI:
733
711
  TextColumn("({task.completed}/{task.total})"),
734
712
  ) as progress:
735
713
  task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))
736
-
714
+
737
715
  for result in pool.imap_unordered(_process_url_parallel, process_args):
738
716
  results.append(result)
739
717
  progress.update(task, advance=1)
740
718
  status = "✓" if result.status == "success" else "✗"
741
719
  progress.description = f"[cyan]Last: {status} {result.url}"
742
720
 
721
+ # Process LLM requests sequentially after all crawling is complete
722
+ if config.post_extraction_agent:
723
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
724
+ successful_results = [r for r in results if r.status == "success"]
725
+
726
+ with Progress(
727
+ SpinnerColumn(),
728
+ TextColumn("[progress.description]{task.description}"),
729
+ BarColumn(),
730
+ TaskProgressColumn(),
731
+ ) as progress:
732
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
733
+
734
+ post_config = PostExtractionConfig(
735
+ model=config.post_extraction_agent["model"],
736
+ messages=config.post_extraction_agent["messages"],
737
+ api_key=config.post_extraction_agent["api_key"],
738
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
739
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
740
+ base_url=config.post_extraction_agent.get("base_url"),
741
+ combine_output=bool(config.post_extraction_agent_save_to_file),
742
+ output_file=config.post_extraction_agent_save_to_file,
743
+ custom_transform_function=config.post_agent_transformer_function
744
+ )
745
+ agent = PostExtractionAgent(post_config)
746
+
747
+ for result in successful_results:
748
+ try:
749
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
750
+ progress.update(llm_task, advance=1)
751
+ except Exception as e:
752
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
753
+ time.sleep(1) # Add delay after error
754
+ time.sleep(0.5) # Rate limiting between requests
755
+
743
756
  # Calculate statistics and handle retries
744
757
  failed_results = [r for r in results if r.status == "failed"]
745
758
  initial_failed = len(failed_results)
@@ -831,31 +844,44 @@ class SpiderForce4AI:
831
844
  if result.status == "success" and config.output_dir and result.markdown:
832
845
  _save_markdown_sync(result.url, result.markdown, config)
833
846
 
834
- # Handle post-extraction if configured
835
- if config.post_extraction_agent and result.status == "success":
836
- try:
837
- post_config = PostExtractionConfig(
838
- model=config.post_extraction_agent["model"],
839
- messages=config.post_extraction_agent["messages"],
840
- api_key=config.post_extraction_agent["api_key"],
841
- max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
842
- temperature=config.post_extraction_agent.get("temperature", 0.7),
843
- base_url=config.post_extraction_agent.get("base_url"),
844
- combine_output=bool(config.post_extraction_agent_save_to_file),
845
- output_file=config.post_extraction_agent_save_to_file,
846
- custom_transform_function=config.post_agent_transformer_function
847
- )
848
-
849
- agent = PostExtractionAgent(post_config)
850
- extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
851
- if extraction_result:
852
- result.extraction_result = extraction_result
853
- except Exception as e:
854
- console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
855
-
856
847
  # Send webhook if configured
857
848
  _send_webhook_sync(result, config)
858
849
  results.append(result)
850
+
851
+ # Process LLM requests sequentially after all crawling is complete
852
+ if config.post_extraction_agent:
853
+ console.print("\n[cyan]Starting post-extraction processing...[/cyan]")
854
+ successful_results = [r for r in results if r.status == "success"]
855
+
856
+ with Progress(
857
+ SpinnerColumn(),
858
+ TextColumn("[progress.description]{task.description}"),
859
+ BarColumn(),
860
+ TaskProgressColumn(),
861
+ ) as progress:
862
+ llm_task = progress.add_task("[cyan]Post-extraction processing...", total=len(successful_results))
863
+
864
+ post_config = PostExtractionConfig(
865
+ model=config.post_extraction_agent["model"],
866
+ messages=config.post_extraction_agent["messages"],
867
+ api_key=config.post_extraction_agent["api_key"],
868
+ max_tokens=config.post_extraction_agent.get("max_tokens", 1000),
869
+ temperature=config.post_extraction_agent.get("temperature", 0.7),
870
+ base_url=config.post_extraction_agent.get("base_url"),
871
+ combine_output=bool(config.post_extraction_agent_save_to_file),
872
+ output_file=config.post_extraction_agent_save_to_file,
873
+ custom_transform_function=config.post_agent_transformer_function
874
+ )
875
+ agent = PostExtractionAgent(post_config)
876
+
877
+ for result in successful_results:
878
+ try:
879
+ result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
880
+ progress.update(llm_task, advance=1)
881
+ except Exception as e:
882
+ console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
883
+ time.sleep(1) # Add delay after error
884
+ time.sleep(0.5) # Rate limiting between requests
859
885
 
860
886
  # Calculate statistics
861
887
  successful = len([r for r in results if r.status == "success"])
@@ -164,12 +164,9 @@ class PostExtractionAgent:
164
164
  self.config.output_file.rename(backup_path)
165
165
  self.config.output_file.touch()
166
166
 
167
- async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
167
+ def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
168
168
  """Process a single piece of content through the LLM."""
169
169
  try:
170
- # Apply rate limiting
171
- await self.rate_limiter.acquire()
172
-
173
170
  # Replace placeholder in messages with actual content
174
171
  messages = [
175
172
  {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
@@ -183,7 +180,8 @@ class PostExtractionAgent:
183
180
 
184
181
  for attempt in range(max_retries):
185
182
  try:
186
- response = await completion(
183
+ # Call completion synchronously
184
+ response = completion(
187
185
  model=self.config.model,
188
186
  messages=messages,
189
187
  max_tokens=self.config.max_tokens,
@@ -200,11 +198,11 @@ class PostExtractionAgent:
200
198
  except json.JSONDecodeError as e:
201
199
  last_error = f"Invalid JSON response from LLM: {e}"
202
200
  if attempt < max_retries - 1:
203
- await asyncio.sleep(retry_delay * (attempt + 1))
201
+ time.sleep(retry_delay * (attempt + 1))
204
202
  except Exception as e:
205
203
  last_error = str(e)
206
204
  if attempt < max_retries - 1:
207
- await asyncio.sleep(retry_delay * (attempt + 1))
205
+ time.sleep(retry_delay * (attempt + 1))
208
206
 
209
207
  # If we get here, all retries failed
210
208
  raise Exception(last_error)
@@ -214,6 +212,20 @@ class PostExtractionAgent:
214
212
  self.buffer.add_failed_request(url, content, str(e))
215
213
  return None
216
214
 
215
+ def _save_result_sync(self, url: str, result: Dict) -> None:
216
+ """Save individual or combined results synchronously."""
217
+ try:
218
+ if self.config.combine_output and self.config.output_file:
219
+ self.results[url] = result
220
+ with open(self.config.output_file, 'w') as f:
221
+ json.dump(self.results, f, indent=2)
222
+ elif not self.config.combine_output and self.config.output_file:
223
+ individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
224
+ with open(individual_file, 'w') as f:
225
+ json.dump(result, f, indent=2)
226
+ except Exception as e:
227
+ logger.error(f"Error saving results for {url}: {str(e)}")
228
+
217
229
  async def _save_result(self, url: str, result: Dict) -> None:
218
230
  """Save individual or combined results."""
219
231
  try:
@@ -228,10 +240,10 @@ class PostExtractionAgent:
228
240
  except Exception as e:
229
241
  logger.error(f"Error saving results for {url}: {str(e)}")
230
242
 
231
- async def process_content(self, url: str, content: str) -> Optional[Dict]:
243
+ def process_content(self, url: str, content: str) -> Optional[Dict]:
232
244
  """Process content with retry mechanism."""
233
245
  for attempt in range(self.config.max_retries):
234
- result = await self._process_single_content(url, content)
246
+ result = self._process_single_content(url, content)
235
247
  if result:
236
248
  # Apply custom transformation if provided
237
249
  if self.config.custom_transform_function:
@@ -240,12 +252,13 @@ class PostExtractionAgent:
240
252
  except Exception as e:
241
253
  logger.error(f"Error in custom transform for {url}: {str(e)}")
242
254
 
243
- await self._save_result(url, result)
255
+ # Save result synchronously
256
+ self._save_result_sync(url, result)
244
257
  return result
245
258
 
246
259
  # Wait before retry
247
260
  if attempt < self.config.max_retries - 1:
248
- await asyncio.sleep(self.config.retry_delay)
261
+ time.sleep(self.config.retry_delay)
249
262
 
250
263
  return None
251
264
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.2
3
+ Version: 2.4.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes