spiderforce4ai 2.4.3__py3-none-any.whl → 2.4.6__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
spiderforce4ai/__init__.py

@@ -586,7 +586,7 @@ class SpiderForce4AI:
         for result in results:
             if result.status == "success":
                 try:
-                    result.extraction_result = await agent.process_content(result.url, result.markdown)
+                    result.extraction_result = agent.process_content(result.url, result.markdown)
                     progress.update(llm_task, advance=1)
                 except Exception as e:
                     console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
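Since `process_content` is now synchronous (see the `PostExtractionAgent` hunks below), the call site drops the `await`. If the surrounding crawl loop is itself a coroutine, the blocking LLM call will now stall the event loop for its full duration; a minimal sketch of one way a caller could avoid that, assuming nothing beyond the names used in the hunk above:

    import asyncio

    async def run_post_extraction(agent, results):
        # Sketch only: run each blocking process_content() call on a worker
        # thread so an asyncio-based caller stays responsive (Python 3.9+).
        for result in results:
            if result.status == "success":
                result.extraction_result = await asyncio.to_thread(
                    agent.process_content, result.url, result.markdown
                )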
spiderforce4ai/post_extraction_agent.py

@@ -164,12 +164,9 @@ class PostExtractionAgent:
             self.config.output_file.rename(backup_path)
             self.config.output_file.touch()

-    async def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
+    def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
         try:
-            # Apply rate limiting
-            await self.rate_limiter.acquire()
-
             # Replace placeholder in messages with actual content
             messages = [
                 {**msg, 'content': msg['content'].replace('{here_markdown_content}', content)}
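Note that the rate-limiting call is removed outright rather than replaced with a synchronous equivalent, so this version no longer throttles LLM requests client-side. If throttling were still wanted around the now-synchronous pipeline, a thread-safe stand-in could look like the following (hypothetical, not part of the package):

    import threading
    import time

    class SimpleRateLimiter:
        """Hypothetical sync limiter: one acquire() per min_interval seconds."""

        def __init__(self, min_interval: float = 1.0):
            self.min_interval = min_interval
            self._lock = threading.Lock()
            self._last = 0.0

        def acquire(self) -> None:
            # Block until at least min_interval has passed since the last call.
            with self._lock:
                wait = self.min_interval - (time.monotonic() - self._last)
                if wait > 0:
                    time.sleep(wait)
                self._last = time.monotonic()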
@@ -183,7 +180,8 @@ class PostExtractionAgent:

             for attempt in range(max_retries):
                 try:
-                    response = await completion(
+                    # Call completion synchronously
+                    response = completion(
                         model=self.config.model,
                         messages=messages,
                         max_tokens=self.config.max_tokens,
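The keyword set here (`model`, `messages`, `max_tokens`, `api_base`) matches litellm's `completion()`, which is synchronous (its async counterpart is `acompletion()`). If that is indeed the function in use, the 2.4.3 code was awaiting a plain response object, and dropping the `await` is a bug fix rather than a style change. A standalone sketch of the corrected calling convention, with an illustrative model name:

    from litellm import completion  # assumption: litellm is the provider here

    response = completion(
        model="openai/gpt-4o-mini",  # hypothetical model identifier
        messages=[{"role": "user", "content": "Reply with a JSON object."}],
        max_tokens=100,
    )
    print(response.choices[0].message.content)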
@@ -192,19 +190,31 @@ class PostExtractionAgent:
                         api_base=self.config.base_url
                     )

-                    # Parse response
-                    extracted_data = json.loads(response.choices[0].message.content)
-                    self.buffer.remove_request(url) # Remove from buffer if successful
-                    return extracted_data
+                    # Log raw response for debugging
+                    raw_content = response.choices[0].message.content
+                    logger.debug(f"Raw LLM response for {url}: {raw_content}")

-                except json.JSONDecodeError as e:
-                    last_error = f"Invalid JSON response from LLM: {e}"
-                    if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                    try:
+                        extracted_data = json.loads(raw_content)
+                        self.buffer.remove_request(url) # Remove from buffer if successful
+                        return extracted_data
+                    except json.JSONDecodeError as e:
+                        error_msg = (
+                            f"Invalid JSON response from LLM for {url}:\n"
+                            f"Error: {str(e)}\n"
+                            f"Raw content: {raw_content[:500]}..." # First 500 chars of response
+                        )
+                        logger.error(error_msg)
+                        last_error = error_msg
+                        if attempt < max_retries - 1:
+                            time.sleep(retry_delay * (attempt + 1))
+
                 except Exception as e:
-                    last_error = str(e)
+                    error_msg = f"LLM processing error for {url}: {str(e)}"
+                    logger.error(error_msg)
+                    last_error = error_msg
                     if attempt < max_retries - 1:
-                        await asyncio.sleep(retry_delay * (attempt + 1))
+                        time.sleep(retry_delay * (attempt + 1))

             # If we get here, all retries failed
             raise Exception(last_error)
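Two things change shape in this hunk: the raw LLM text is captured once and logged before `json.loads`, so malformed output can be inspected, and failed attempts back off linearly, sleeping `retry_delay * (attempt + 1)` seconds (2 s, then 4 s, then 6 s for `retry_delay = 2`). The retry skeleton in isolation, as a runnable sketch (`fetch_raw` is a stand-in for the completion call above):

    import json
    import time

    def parse_with_retry(fetch_raw, max_retries: int = 3, retry_delay: float = 2.0):
        # Mirrors the hunk: linear backoff between attempts, last error re-raised.
        last_error = None
        for attempt in range(max_retries):
            raw = fetch_raw()
            try:
                return json.loads(raw)
            except json.JSONDecodeError as e:
                last_error = f"Invalid JSON: {e}; raw: {raw[:500]}..."
                if attempt < max_retries - 1:
                    time.sleep(retry_delay * (attempt + 1))  # 2s, 4s, ...
        raise Exception(last_error)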
@@ -214,6 +224,20 @@ class PostExtractionAgent:
             self.buffer.add_failed_request(url, content, str(e))
             return None

+    def _save_result_sync(self, url: str, result: Dict) -> None:
+        """Save individual or combined results synchronously."""
+        try:
+            if self.config.combine_output and self.config.output_file:
+                self.results[url] = result
+                with open(self.config.output_file, 'w') as f:
+                    json.dump(self.results, f, indent=2)
+            elif not self.config.combine_output and self.config.output_file:
+                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
+                with open(individual_file, 'w') as f:
+                    json.dump(result, f, indent=2)
+        except Exception as e:
+            logger.error(f"Error saving results for {url}: {str(e)}")
+
     async def _save_result(self, url: str, result: Dict) -> None:
         """Save individual or combined results."""
         try:
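`_save_result_sync` mirrors the existing async `_save_result`: in combined mode it rewrites the whole accumulated `self.results` dict to `output_file` on every save; in individual mode it derives a flat filename by replacing each `/` in the URL with `_`. A worked example of the filename logic:

    from pathlib import Path

    output_file = Path("output/results.json")
    url = "https://example.com/docs/page"

    # Same expression as in the hunk above:
    individual_file = output_file.parent / f"{url.replace('/', '_')}.json"
    print(individual_file)  # output/https:__example.com_docs_page.json

Note that the `:` from the URL scheme survives the substitution, which is legal on POSIX filesystems but not on Windows.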
@@ -228,25 +252,44 @@ class PostExtractionAgent:
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")

-    async def process_content(self, url: str, content: str) -> Optional[Dict]:
+    def process_content(self, url: str, content: str) -> Optional[Dict]:
         """Process content with retry mechanism."""
+        logger.info(f"Starting content processing for {url}")
+
         for attempt in range(self.config.max_retries):
-            result = await self._process_single_content(url, content)
+            logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
+
+            result = self._process_single_content(url, content)
             if result:
+                logger.info(f"Successfully processed content for {url}")
+
                 # Apply custom transformation if provided
                 if self.config.custom_transform_function:
                     try:
                         result = self.config.custom_transform_function(result)
+                        logger.info(f"Applied custom transformation for {url}")
                     except Exception as e:
-                        logger.error(f"Error in custom transform for {url}: {str(e)}")
+                        error_msg = f"Error in custom transform for {url}: {str(e)}"
+                        logger.error(error_msg)
+                        console.print(f"[red]{error_msg}[/red]")
+
+                # Save result synchronously
+                try:
+                    self._save_result_sync(url, result)
+                    logger.info(f"Saved results for {url}")
+                except Exception as e:
+                    error_msg = f"Error saving results for {url}: {str(e)}"
+                    logger.error(error_msg)
+                    console.print(f"[red]{error_msg}[/red]")

-                await self._save_result(url, result)
                 return result

             # Wait before retry
             if attempt < self.config.max_retries - 1:
-                await asyncio.sleep(self.config.retry_delay)
+                logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
+                time.sleep(self.config.retry_delay)

+        logger.error(f"All processing attempts failed for {url}")
         return None

     async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
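With `process_content` synchronous, single-page post-extraction no longer needs an event loop at all. A hypothetical driver (only `PostExtractionAgent`, `process_content`, and its `Optional[Dict]` return come from this diff; everything else is illustrative):

    from spiderforce4ai.post_extraction_agent import PostExtractionAgent

    def extract_one(agent: PostExtractionAgent, url: str, markdown: str) -> None:
        data = agent.process_content(url, markdown)  # plain call, no await
        if data is None:
            # All config.max_retries attempts failed; details go to the logger
            # and the agent's failed-request buffer.
            print(f"extraction failed for {url}")
        else:
            print(f"extracted {len(data)} top-level fields from {url}")

Note that `process_bulk_content` below remains `async`, so the two calling styles now coexist on the same class.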
spiderforce4ai-2.4.6.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.4.3
+Version: 2.4.6
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
spiderforce4ai-2.4.6.dist-info/RECORD

@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=PPpJLowJhgoRijsF2ebmdkFbIriI_yIFlCi1wL6hSP8,42267
+spiderforce4ai/post_extraction_agent.py,sha256=Ty9-Ai3_RVT86RrPUxKEzf4oUh-Wr7bk2aM87Je_WvE,13580
+spiderforce4ai-2.4.6.dist-info/METADATA,sha256=7FaEgAHdD-8a0XmuDMkpAUjAQ7ZmFTD89IqQM17nllI,9012
+spiderforce4ai-2.4.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-2.4.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.4.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.4.6.dist-info/RECORD,,
spiderforce4ai-2.4.3.dist-info/RECORD

@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=iwCLSvooHtFAo-rU52-nsFgyn99Dflpt_OpSrIW-PqA,42273
-spiderforce4ai/post_extraction_agent.py,sha256=m00-y0SCoutUnxsMwHxPaW-qRm4o5alQWjggDStUSrg,11249
-spiderforce4ai-2.4.3.dist-info/METADATA,sha256=-i_vH6DDs4xVFVdDfaFG_Xka0pqXCSQdCrKgym5r5b0,9012
-spiderforce4ai-2.4.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.4.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.4.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.4.3.dist-info/RECORD,,