spiderforce4ai 2.4.3__py3-none-any.whl → 2.4.6__py3-none-any.whl

spiderforce4ai/__init__.py
@@ -586,7 +586,7 @@ class SpiderForce4AI:
             for result in results:
                 if result.status == "success":
                     try:
-                        result.extraction_result = await agent.process_content(result.url, result.markdown)
+                        result.extraction_result = agent.process_content(result.url, result.markdown)
                         progress.update(llm_task, advance=1)
                     except Exception as e:
                         console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
spiderforce4ai/post_extraction_agent.py
@@ -164,12 +164,9 @@ class PostExtractionAgent:
             self.config.output_file.rename(backup_path)
             self.config.output_file.touch()
 
-    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+    def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
        """Process a single piece of content through the LLM."""
        try:
-            # Apply rate limiting
-            await self.rate_limiter.acquire()
-
            # Replace placeholder in messages with actual content
            messages = [
                {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
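
The message templating that survives the rewrite is plain string substitution; a self-contained sketch of just that step:

    # Each configured message has its {here_markdown_content} placeholder
    # replaced with the page's markdown before being sent to the LLM.
    msg = {"role": "user", "content": "Extract fields from: {here_markdown_content}"}
    content = "# Example page"
    rendered = {**msg, "content": msg["content"].replace("{here_markdown_content}", content)}
    assert rendered["content"] == "Extract fields from: # Example page"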
@@ -183,7 +180,8 @@ class PostExtractionAgent:
 
            for attempt in range(max_retries):
                try:
-                    response = await completion(
+                    # Call completion synchronously
+                    response = completion(
                        model=self.config.model,
                        messages=messages,
                        max_tokens=self.config.max_tokens,
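
The keyword arguments here match litellm's synchronous completion() API; assuming that is the client in use (an assumption, not stated in the diff), the synchronous call pattern looks like this, with placeholder model and endpoint values:

    from litellm import completion  # assumed source of completion()

    response = completion(
        model="openai/gpt-4o-mini",  # placeholder model identifier
        messages=[{"role": "user", "content": "Reply with {} as JSON."}],
        max_tokens=100,
        api_base=None,               # or a custom endpoint URL
    )
    print(response.choices[0].message.content)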
@@ -192,19 +190,31 @@ class PostExtractionAgent:
                        api_base=self.config.base_url
                    )
 
-                    # Parse response
-                    extracted_data = json.loads(response.choices[0].message.content)
-                    self.buffer.remove_request(url)  # Remove from buffer if successful
-                    return extracted_data
+                    # Log raw response for debugging
+                    raw_content = response.choices[0].message.content
+                    logger.debug(f"Raw LLM response for {url}: {raw_content}")
 
-                except json.JSONDecodeError as e:
-                    last_error = f"Invalid JSON response from LLM: {e}"
-                    if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                    try:
+                        extracted_data = json.loads(raw_content)
+                        self.buffer.remove_request(url)  # Remove from buffer if successful
+                        return extracted_data
+                    except json.JSONDecodeError as e:
+                        error_msg = (
+                            f"Invalid JSON response from LLM for {url}:\n"
+                            f"Error: {str(e)}\n"
+                            f"Raw content: {raw_content[:500]}..."  # First 500 chars of response
+                        )
+                        logger.error(error_msg)
+                        last_error = error_msg
+                        if attempt < max_retries - 1:
+                            time.sleep(retry_delay * (attempt + 1))
+
                except Exception as e:
-                    last_error = str(e)
+                    error_msg = f"LLM processing error for {url}: {str(e)}"
+                    logger.error(error_msg)
+                    last_error = error_msg
                    if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                        time.sleep(retry_delay * (attempt + 1))
 
            # If we get here, all retries failed
            raise Exception(last_error)
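
The retry loop above uses linear backoff: the sleep grows as retry_delay * (attempt + 1), i.e. 1x, 2x, 3x the base delay. Stripped of the LLM specifics, the pattern reduces to:

    import json
    import time

    def parse_with_retries(get_raw, max_retries=3, retry_delay=1.0):
        """Retry JSON parsing, sleeping longer after each failed attempt."""
        last_error = None
        for attempt in range(max_retries):
            try:
                return json.loads(get_raw())
            except json.JSONDecodeError as e:
                last_error = f"Invalid JSON response: {e}"
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))  # linear backoff
        raise Exception(last_error)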
@@ -214,6 +224,20 @@ class PostExtractionAgent:
            self.buffer.add_failed_request(url, content, str(e))
            return None
 
+    def _save_result_sync(self, url: str, result: Dict) -> None:
+        """Save individual or combined results synchronously."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                with open(self.config.output_file, 'w') as f:
+                    json.dump(self.results, f, indent=2)
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                with open(individual_file, 'w') as f:
+                    json.dump(result, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
    async def _save_result(self, url: str, result: Dict) -> None:
        """Save individual or combined results."""
        try:
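
In combined mode, _save_result_sync rewrites a single JSON file holding every result keyed by URL; otherwise each URL gets its own file, with slashes replaced so the URL is filename-safe. The naming scheme can be checked in isolation:

    from pathlib import Path

    output_file = Path("output/results.json")  # stands in for config.output_file
    url = "https://example.com/docs/page"
    individual_file = output_file.parent / f"{url.replace('/', '_')}.json"
    print(individual_file)  # output/https:__example.com_docs_page.json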
@@ -228,25 +252,44 @@ class PostExtractionAgent:
        except Exception as e:
            logger.error(f"Error saving results for {url}: {str(e)}")
 
-    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+    def process_content(self, url: str, content: str) -> Optional[Dict]:
        """Process content with retry mechanism."""
+        logger.info(f"Starting content processing for {url}")
+
        for attempt in range(self.config.max_retries):
-            result = await self._process_single_content(url, content)
+            logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
+
+            result = self._process_single_content(url, content)
            if result:
+                logger.info(f"Successfully processed content for {url}")
+
                # Apply custom transformation if provided
                if self.config.custom_transform_function:
                    try:
                        result = self.config.custom_transform_function(result)
+                        logger.info(f"Applied custom transformation for {url}")
                    except Exception as e:
-                        logger.error(f"Error in custom transform for {url}: {str(e)}")
+                        error_msg = f"Error in custom transform for {url}: {str(e)}"
+                        logger.error(error_msg)
+                        console.print(f"[red]{error_msg}[/red]")
+
+                # Save result synchronously
+                try:
+                    self._save_result_sync(url, result)
+                    logger.info(f"Saved results for {url}")
+                except Exception as e:
+                    error_msg = f"Error saving results for {url}: {str(e)}"
+                    logger.error(error_msg)
+                    console.print(f"[red]{error_msg}[/red]")
 
-                await self._save_result(url, result)
                return result
 
            # Wait before retry
            if attempt < self.config.max_retries - 1:
-                await asyncio.sleep(self.config.retry_delay)
+                logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
+                time.sleep(self.config.retry_delay)
 
+        logger.error(f"All processing attempts failed for {url}")
        return None
 
    async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
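
Because process_content now blocks (both the completion call and time.sleep are synchronous), any caller that still runs inside an event loop, such as the async process_bulk_content below, would need to hand the work to a thread to avoid stalling the loop. One standard approach, sketched here rather than taken from the package:

    import asyncio

    async def process_without_blocking(agent, url, markdown):
        # Run the synchronous process_content in a worker thread (Python 3.9+)
        return await asyncio.to_thread(agent.process_content, url, markdown)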
{spiderforce4ai-2.4.3.dist-info → spiderforce4ai-2.4.6.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4.3
+Version: 2.4.6
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

spiderforce4ai-2.4.6.dist-info/RECORD
@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
+spiderforce4ai/post_extraction_agent.py,sha256=Ty9-Ai3_RVT86RrPUxKEzf4oUh-Wr7bk2aM87Je_WvE,13580
+spiderforce4ai-2.4.6.dist-info/METADATA,sha256=7FaEgAHdD-8a0XmuDMkpAUjAQ7ZmFTD89IqQM17nllI,9012
+spiderforce4ai-2.4.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.6.dist-info/RECORD,,

spiderforce4ai-2.4.3.dist-info/RECORD
@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
-spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
-spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
-spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.3.dist-info/RECORD,,