spiderforce4ai 2.4.8__py3-none-any.whl → 2.5__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -5,6 +5,9 @@ import asyncio
5
5
  import aiohttp
6
6
  import json
7
7
  import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+ import logging
8
11
  from typing import List, Dict, Union, Optional, Tuple, Callable, Any
9
12
  from dataclasses import dataclass, asdict
10
13
  from urllib.parse import urljoin, urlparse
@@ -877,7 +880,20 @@ class SpiderForce4AI:
877
880
 
878
881
  for result in successful_results:
879
882
  try:
880
- result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
883
+ # Get LLM response
884
+ llm_response = agent.process_content(result.url, result.markdown)
885
+ if llm_response:
886
+ # Add URL to the response before transformation
887
+ llm_response['url'] = result.url
888
+ # Apply transformation if provided
889
+ if config.post_agent_transformer_function:
890
+ try:
891
+ result.extraction_result = config.post_agent_transformer_function(llm_response)
892
+ except Exception as e:
893
+ logger.error(f"Transformer error for {result.url}: {str(e)}")
894
+ result.extraction_result = llm_response # Use original response if transform fails
895
+ else:
896
+ result.extraction_result = llm_response
881
897
  progress.update(llm_task, advance=1)
882
898
  except Exception as e:
883
899
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -199,31 +199,35 @@ class PostExtractionAgent:
199
199
  completion_args["response_format"] = {"type": self.config.response_format}
200
200
 
201
201
  response = completion(**completion_args)
202
-
203
- # Log raw response for debugging
204
202
  raw_content = response.choices[0].message.content
205
203
  logger.debug(f"Raw LLM response for {url}: {raw_content}")
206
-
204
+
205
+ # Handle response based on response_format
207
206
  try:
208
- # First try direct JSON parsing
209
- try:
210
- extracted_data = json.loads(raw_content)
211
- except json.JSONDecodeError:
212
- # Look for JSON in markdown code blocks
213
- json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
214
- if json_match:
215
- json_content = json_match.group(1).strip()
216
- extracted_data = json.loads(json_content)
217
- else:
218
- # If no JSON found, try to extract structured data in any format
219
- extracted_data = {
220
- "raw_content": raw_content,
221
- "format": "text",
222
- "timestamp": datetime.now().isoformat()
223
- }
207
+ if self.config.response_format == "json_object":
208
+ # For json_object format, response should already be valid JSON
209
+ extracted_data = raw_content if isinstance(raw_content, dict) else json.loads(raw_content)
210
+ else:
211
+ # For text format or unspecified, try parsing JSON or use as text
212
+ try:
213
+ extracted_data = json.loads(raw_content)
214
+ except json.JSONDecodeError:
215
+ # Look for JSON in markdown code blocks
216
+ json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
217
+ if json_match:
218
+ json_content = json_match.group(1).strip()
219
+ extracted_data = json.loads(json_content)
220
+ else:
221
+ # If no JSON found and not json_object format, use raw content
222
+ extracted_data = {
223
+ "raw_content": raw_content,
224
+ "format": "text",
225
+ "timestamp": datetime.now().isoformat()
226
+ }
224
227
 
225
228
  self.buffer.remove_request(url) # Remove from buffer if successful
226
229
  return extracted_data
230
+
227
231
  except Exception as e:
228
232
  error_msg = (
229
233
  f"Error processing LLM response for {url}:\n"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.8
3
+ Version: 2.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=T0H2nqzhvXuxYMKgWAVoRrSIUV72H7yZ8SHIsbG9I4g,43327
2
+ spiderforce4ai/post_extraction_agent.py,sha256=so5Ze7Vz3konpQ0iT7ZxDGE9kIYeTwPTFyzezRc5oys,15392
3
+ spiderforce4ai-2.5.dist-info/METADATA,sha256=4iP462Pmx5GikzNhhPFHhm89BdkGqTzFDTeiPN1Xp4U,9010
4
+ spiderforce4ai-2.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.5.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.5.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=DUPOKF7-vCVQi7JimsStU1qjk5x3yVUoMnUVOJxOrGk,42360
2
- spiderforce4ai/post_extraction_agent.py,sha256=5M7pBU8O44Khfub2jSSPboSbrcsAPw6nnp576qIA2pY,14988
3
- spiderforce4ai-2.4.8.dist-info/METADATA,sha256=zsEmCfjL_ueJzIeTJ9BIdUEF3R-4uOPDqrRxox70Cto,9012
4
- spiderforce4ai-2.4.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.4.8.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.4.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.4.8.dist-info/RECORD,,