spiderforce4ai: spiderforce4ai-2.4.5-py3-none-any.whl → spiderforce4ai-2.4.7-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -12,6 +12,9 @@ from pydantic import BaseModel, Field
12
12
  import logging
13
13
  from datetime import datetime
14
14
  import re
15
+ from rich.console import Console
16
+
17
+ console = Console()
15
18
 
16
19
  logger = logging.getLogger(__name__)
17
20
 
@@ -190,17 +193,45 @@ class PostExtractionAgent:
190
193
  api_base=self.config.base_url
191
194
  )
192
195
 
193
- # Parse response
194
- extracted_data = json.loads(response.choices[0].message.content)
195
- self.buffer.remove_request(url) # Remove from buffer if successful
196
- return extracted_data
196
+ # Log raw response for debugging
197
+ raw_content = response.choices[0].message.content
198
+ logger.debug(f"Raw LLM response for {url}: {raw_content}")
197
199
 
198
- except json.JSONDecodeError as e:
199
- last_error = f"Invalid JSON response from LLM: {e}"
200
- if attempt < max_retries - 1:
201
- time.sleep(retry_delay * (attempt + 1))
200
+ try:
201
+ # First try direct JSON parsing
202
+ try:
203
+ extracted_data = json.loads(raw_content)
204
+ except json.JSONDecodeError:
205
+ # Look for JSON in markdown code blocks
206
+ json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
207
+ if json_match:
208
+ json_content = json_match.group(1).strip()
209
+ extracted_data = json.loads(json_content)
210
+ else:
211
+ # If no JSON found, try to extract structured data in any format
212
+ extracted_data = {
213
+ "raw_content": raw_content,
214
+ "format": "text",
215
+ "timestamp": datetime.now().isoformat()
216
+ }
217
+
218
+ self.buffer.remove_request(url) # Remove from buffer if successful
219
+ return extracted_data
220
+ except Exception as e:
221
+ error_msg = (
222
+ f"Error processing LLM response for {url}:\n"
223
+ f"Error: {str(e)}\n"
224
+ f"Raw content: {raw_content[:500]}..." # First 500 chars of response
225
+ )
226
+ logger.error(error_msg)
227
+ last_error = error_msg
228
+ if attempt < max_retries - 1:
229
+ time.sleep(retry_delay * (attempt + 1))
230
+
202
231
  except Exception as e:
203
- last_error = str(e)
232
+ error_msg = f"LLM processing error for {url}: {str(e)}"
233
+ logger.error(error_msg)
234
+ last_error = error_msg
204
235
  if attempt < max_retries - 1:
205
236
  time.sleep(retry_delay * (attempt + 1))
206
237
 
@@ -242,24 +273,42 @@ class PostExtractionAgent:
242
273
 
243
274
  def process_content(self, url: str, content: str) -> Optional[Dict]:
244
275
  """Process content with retry mechanism."""
276
+ logger.info(f"Starting content processing for {url}")
277
+
245
278
  for attempt in range(self.config.max_retries):
279
+ logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
280
+
246
281
  result = self._process_single_content(url, content)
247
282
  if result:
283
+ logger.info(f"Successfully processed content for {url}")
284
+
248
285
  # Apply custom transformation if provided
249
286
  if self.config.custom_transform_function:
250
287
  try:
251
288
  result = self.config.custom_transform_function(result)
289
+ logger.info(f"Applied custom transformation for {url}")
252
290
  except Exception as e:
253
- logger.error(f"Error in custom transform for {url}: {str(e)}")
291
+ error_msg = f"Error in custom transform for {url}: {str(e)}"
292
+ logger.error(error_msg)
293
+ console.print(f"[red]{error_msg}[/red]")
254
294
 
255
295
  # Save result synchronously
256
- self._save_result_sync(url, result)
296
+ try:
297
+ self._save_result_sync(url, result)
298
+ logger.info(f"Saved results for {url}")
299
+ except Exception as e:
300
+ error_msg = f"Error saving results for {url}: {str(e)}"
301
+ logger.error(error_msg)
302
+ console.print(f"[red]{error_msg}[/red]")
303
+
257
304
  return result
258
305
 
259
306
  # Wait before retry
260
307
  if attempt < self.config.max_retries - 1:
308
+ logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
261
309
  time.sleep(self.config.retry_delay)
262
310
 
311
+ logger.error(f"All processing attempts failed for {url}")
263
312
  return None
264
313
 
265
314
  async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.5
3
+ Version: 2.4.7
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
2
+ spiderforce4ai/post_extraction_agent.py,sha256=q2ohsqw_F1e5rT2H9eSzCWzstJLbwGyCtwLsC6eMufs,14560
3
+ spiderforce4ai-2.4.7.dist-info/METADATA,sha256=r273h2ogI76aXTd8XN9b81EWtQLuhdSjZkXD2Ks8GnM,9012
4
+ spiderforce4ai-2.4.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.4.7.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.4.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.4.7.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
2
- spiderforce4ai/post_extraction_agent.py,sha256=t9KxjuNw16-6kige6ULPLyykNkiGmKhpCi8QjskdaTk,11959
3
- spiderforce4ai-2.4.5.dist-info/METADATA,sha256=q3VBuGb5wxsi9OPkzEMwFMyg9f_vT2RamWYIgu2JbLc,9012
4
- spiderforce4ai-2.4.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.5.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.5.dist-info/RECORD,,