spiderforce4ai 2.4.8__tar.gz → 2.5__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.8
3
+ Version: 2.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "2.4.8"
7
+ version = "2.5"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
7
7
 
8
8
  setup(
9
9
  name="spiderforce4ai",
10
- version="2.4.8",
10
+ version="2.5",
11
11
  author="Piotr Tamulewicz",
12
12
  author_email="pt@petertam.pro",
13
13
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
@@ -5,6 +5,9 @@ import asyncio
5
5
  import aiohttp
6
6
  import json
7
7
  import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+ import logging
8
11
  from typing import List, Dict, Union, Optional, Tuple, Callable, Any
9
12
  from dataclasses import dataclass, asdict
10
13
  from urllib.parse import urljoin, urlparse
@@ -877,7 +880,20 @@ class SpiderForce4AI:
877
880
 
878
881
  for result in successful_results:
879
882
  try:
880
- result.extraction_result = asyncio.run(agent.process_content(result.url, result.markdown))
883
+ # Get LLM response
884
+ llm_response = agent.process_content(result.url, result.markdown)
885
+ if llm_response:
886
+ # Add URL to the response before transformation
887
+ llm_response['url'] = result.url
888
+ # Apply transformation if provided
889
+ if config.post_agent_transformer_function:
890
+ try:
891
+ result.extraction_result = config.post_agent_transformer_function(llm_response)
892
+ except Exception as e:
893
+ logger.error(f"Transformer error for {result.url}: {str(e)}")
894
+ result.extraction_result = llm_response # Use original response if transform fails
895
+ else:
896
+ result.extraction_result = llm_response
881
897
  progress.update(llm_task, advance=1)
882
898
  except Exception as e:
883
899
  console.print(f"[red]Error in post-extraction processing for {result.url}: {str(e)}[/red]")
@@ -199,31 +199,35 @@ class PostExtractionAgent:
199
199
  completion_args["response_format"] = {"type": self.config.response_format}
200
200
 
201
201
  response = completion(**completion_args)
202
-
203
- # Log raw response for debugging
204
202
  raw_content = response.choices[0].message.content
205
203
  logger.debug(f"Raw LLM response for {url}: {raw_content}")
206
-
204
+
205
+ # Handle response based on response_format
207
206
  try:
208
- # First try direct JSON parsing
209
- try:
210
- extracted_data = json.loads(raw_content)
211
- except json.JSONDecodeError:
212
- # Look for JSON in markdown code blocks
213
- json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
214
- if json_match:
215
- json_content = json_match.group(1).strip()
216
- extracted_data = json.loads(json_content)
217
- else:
218
- # If no JSON found, try to extract structured data in any format
219
- extracted_data = {
220
- "raw_content": raw_content,
221
- "format": "text",
222
- "timestamp": datetime.now().isoformat()
223
- }
207
+ if self.config.response_format == "json_object":
208
+ # For json_object format, response should already be valid JSON
209
+ extracted_data = raw_content if isinstance(raw_content, dict) else json.loads(raw_content)
210
+ else:
211
+ # For text format or unspecified, try parsing JSON or use as text
212
+ try:
213
+ extracted_data = json.loads(raw_content)
214
+ except json.JSONDecodeError:
215
+ # Look for JSON in markdown code blocks
216
+ json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
217
+ if json_match:
218
+ json_content = json_match.group(1).strip()
219
+ extracted_data = json.loads(json_content)
220
+ else:
221
+ # If no JSON found and not json_object format, use raw content
222
+ extracted_data = {
223
+ "raw_content": raw_content,
224
+ "format": "text",
225
+ "timestamp": datetime.now().isoformat()
226
+ }
224
227
 
225
228
  self.buffer.remove_request(url) # Remove from buffer if successful
226
229
  return extracted_data
230
+
227
231
  except Exception as e:
228
232
  error_msg = (
229
233
  f"Error processing LLM response for {url}:\n"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.4.8
3
+ Version: 2.5
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes