spiderforce4ai 2.4.5__tar.gz → 2.4.7__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/PKG-INFO +1 -1
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/pyproject.toml +1 -1
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/setup.py +1 -1
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai/post_extraction_agent.py +60 -11
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/README.md +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/setup.cfg +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai/__init__.py +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.4.5 → spiderforce4ai-2.4.7}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "2.4.5"
|
7
|
+
version = "2.4.7"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [
|
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name="spiderforce4ai",
|
10
|
-
version="2.4.5",
|
10
|
+
version="2.4.7",
|
11
11
|
author="Piotr Tamulewicz",
|
12
12
|
author_email="pt@petertam.pro",
|
13
13
|
description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
|
@@ -12,6 +12,9 @@ from pydantic import BaseModel, Field
|
|
12
12
|
import logging
|
13
13
|
from datetime import datetime
|
14
14
|
import re
|
15
|
+
from rich.console import Console
|
16
|
+
|
17
|
+
console = Console()
|
15
18
|
|
16
19
|
logger = logging.getLogger(__name__)
|
17
20
|
|
@@ -190,17 +193,45 @@ class PostExtractionAgent:
|
|
190
193
|
api_base=self.config.base_url
|
191
194
|
)
|
192
195
|
|
193
|
-
#
|
194
|
-
|
195
|
-
|
196
|
-
return extracted_data
|
196
|
+
# Log raw response for debugging
|
197
|
+
raw_content = response.choices[0].message.content
|
198
|
+
logger.debug(f"Raw LLM response for {url}: {raw_content}")
|
197
199
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
200
|
+
try:
|
201
|
+
# First try direct JSON parsing
|
202
|
+
try:
|
203
|
+
extracted_data = json.loads(raw_content)
|
204
|
+
except json.JSONDecodeError:
|
205
|
+
# Look for JSON in markdown code blocks
|
206
|
+
json_match = re.search(r'```(?:json)?\s*\n([\s\S]*?)\n```', raw_content)
|
207
|
+
if json_match:
|
208
|
+
json_content = json_match.group(1).strip()
|
209
|
+
extracted_data = json.loads(json_content)
|
210
|
+
else:
|
211
|
+
# If no JSON found, try to extract structured data in any format
|
212
|
+
extracted_data = {
|
213
|
+
"raw_content": raw_content,
|
214
|
+
"format": "text",
|
215
|
+
"timestamp": datetime.now().isoformat()
|
216
|
+
}
|
217
|
+
|
218
|
+
self.buffer.remove_request(url) # Remove from buffer if successful
|
219
|
+
return extracted_data
|
220
|
+
except Exception as e:
|
221
|
+
error_msg = (
|
222
|
+
f"Error processing LLM response for {url}:\n"
|
223
|
+
f"Error: {str(e)}\n"
|
224
|
+
f"Raw content: {raw_content[:500]}..." # First 500 chars of response
|
225
|
+
)
|
226
|
+
logger.error(error_msg)
|
227
|
+
last_error = error_msg
|
228
|
+
if attempt < max_retries - 1:
|
229
|
+
time.sleep(retry_delay * (attempt + 1))
|
230
|
+
|
202
231
|
except Exception as e:
|
203
|
-
|
232
|
+
error_msg = f"LLM processing error for {url}: {str(e)}"
|
233
|
+
logger.error(error_msg)
|
234
|
+
last_error = error_msg
|
204
235
|
if attempt < max_retries - 1:
|
205
236
|
time.sleep(retry_delay * (attempt + 1))
|
206
237
|
|
@@ -242,24 +273,42 @@ class PostExtractionAgent:
|
|
242
273
|
|
243
274
|
def process_content(self, url: str, content: str) -> Optional[Dict]:
|
244
275
|
"""Process content with retry mechanism."""
|
276
|
+
logger.info(f"Starting content processing for {url}")
|
277
|
+
|
245
278
|
for attempt in range(self.config.max_retries):
|
279
|
+
logger.info(f"Processing attempt {attempt + 1}/{self.config.max_retries} for {url}")
|
280
|
+
|
246
281
|
result = self._process_single_content(url, content)
|
247
282
|
if result:
|
283
|
+
logger.info(f"Successfully processed content for {url}")
|
284
|
+
|
248
285
|
# Apply custom transformation if provided
|
249
286
|
if self.config.custom_transform_function:
|
250
287
|
try:
|
251
288
|
result = self.config.custom_transform_function(result)
|
289
|
+
logger.info(f"Applied custom transformation for {url}")
|
252
290
|
except Exception as e:
|
253
|
-
|
291
|
+
error_msg = f"Error in custom transform for {url}: {str(e)}"
|
292
|
+
logger.error(error_msg)
|
293
|
+
console.print(f"[red]{error_msg}[/red]")
|
254
294
|
|
255
295
|
# Save result synchronously
|
256
|
-
|
296
|
+
try:
|
297
|
+
self._save_result_sync(url, result)
|
298
|
+
logger.info(f"Saved results for {url}")
|
299
|
+
except Exception as e:
|
300
|
+
error_msg = f"Error saving results for {url}: {str(e)}"
|
301
|
+
logger.error(error_msg)
|
302
|
+
console.print(f"[red]{error_msg}[/red]")
|
303
|
+
|
257
304
|
return result
|
258
305
|
|
259
306
|
# Wait before retry
|
260
307
|
if attempt < self.config.max_retries - 1:
|
308
|
+
logger.info(f"Attempt {attempt + 1} failed for {url}, waiting {self.config.retry_delay}s before retry")
|
261
309
|
time.sleep(self.config.retry_delay)
|
262
310
|
|
311
|
+
logger.error(f"All processing attempts failed for {url}")
|
263
312
|
return None
|
264
313
|
|
265
314
|
async def process_bulk_content(self, content_map: Dict[str, str]) -> Dict[str, Optional[Dict]]:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|