spiderforce4ai 2.4.7__tar.gz → 2.4.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/PKG-INFO +1 -1
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/pyproject.toml +1 -1
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/setup.py +1 -1
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai/__init__.py +2 -1
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai/post_extraction_agent.py +37 -26
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/README.md +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/setup.cfg +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.4.7 → spiderforce4ai-2.4.9}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "2.4.7"
|
7
|
+
version = "2.4.9"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [
|
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name="spiderforce4ai",
|
10
|
-
version="2.4.7",
|
10
|
+
version="2.4.9",
|
11
11
|
author="Piotr Tamulewicz",
|
12
12
|
author_email="pt@petertam.pro",
|
13
13
|
description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
|
@@ -870,7 +870,8 @@ class SpiderForce4AI:
|
|
870
870
|
base_url=config.post_extraction_agent.get("base_url"),
|
871
871
|
combine_output=bool(config.post_extraction_agent_save_to_file),
|
872
872
|
output_file=config.post_extraction_agent_save_to_file,
|
873
|
-
custom_transform_function=config.post_agent_transformer_function
|
873
|
+
custom_transform_function=config.post_agent_transformer_function,
|
874
|
+
response_format=config.post_extraction_agent.get("response_format")
|
874
875
|
)
|
875
876
|
agent = PostExtractionAgent(post_config)
|
876
877
|
|
@@ -118,6 +118,7 @@ class PostExtractionConfig:
|
|
118
118
|
output_file: Optional[Path] = None
|
119
119
|
custom_transform_function: Optional[Callable] = None
|
120
120
|
buffer_file: Optional[Path] = None
|
121
|
+
response_format: Optional[str] = None # 'json' or 'text'
|
121
122
|
|
122
123
|
def __post_init__(self):
|
123
124
|
if self.output_file:
|
@@ -184,39 +185,49 @@ class PostExtractionAgent:
|
|
184
185
|
for attempt in range(max_retries):
|
185
186
|
try:
|
186
187
|
# Call completion synchronously
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
188
|
+
# Add response_format if specified
|
189
|
+
completion_args = {
|
190
|
+
"model": self.config.model,
|
191
|
+
"messages": messages,
|
192
|
+
"max_tokens": self.config.max_tokens,
|
193
|
+
"temperature": self.config.temperature,
|
194
|
+
"api_key": self.config.api_key,
|
195
|
+
}
|
196
|
+
if self.config.base_url:
|
197
|
+
completion_args["api_base"] = self.config.base_url
|
198
|
+
if self.config.response_format:
|
199
|
+
completion_args["response_format"] = {"type": self.config.response_format}
|
195
200
|
|
196
|
-
|
201
|
+
response = completion(**completion_args)
|
197
202
|
raw_content = response.choices[0].message.content
|
198
203
|
logger.debug(f"Raw LLM response for {url}: {raw_content}")
|
199
|
-
|
204
|
+
|
205
|
+
# Handle response based on response_format
|
200
206
|
try:
|
201
|
-
|
202
|
-
|
203
|
-
extracted_data = json.loads(raw_content)
|
204
|
-
|
205
|
-
#
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
207
|
+
if self.config.response_format == "json_object":
|
208
|
+
# For json_object format, response should already be valid JSON
|
209
|
+
extracted_data = raw_content if isinstance(raw_content, dict) else json.loads(raw_content)
|
210
|
+
else:
|
211
|
+
# For text format or unspecified, try parsing JSON or use as text
|
212
|
+
try:
|
213
|
+
extracted_data = json.loads(raw_content)
|
214
|
+
except json.JSONDecodeError:
|
215
|
+
# Look for JSON in markdown code blocks
|
216
|
+
json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
|
217
|
+
if json_match:
|
218
|
+
json_content = json_match.group(1).strip()
|
219
|
+
extracted_data = json.loads(json_content)
|
220
|
+
else:
|
221
|
+
# If no JSON found and not json_object format, use raw content
|
222
|
+
extracted_data = {
|
223
|
+
"raw_content": raw_content,
|
224
|
+
"format": "text",
|
225
|
+
"timestamp": datetime.now().isoformat()
|
226
|
+
}
|
217
227
|
|
218
228
|
self.buffer.remove_request(url) # Remove from buffer if successful
|
219
229
|
return extracted_data
|
230
|
+
|
220
231
|
except Exception as e:
|
221
232
|
error_msg = (
|
222
233
|
f"Error processing LLM response for {url}:\n"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|