spiderforce4ai 2.5.7__tar.gz → 2.5.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/PKG-INFO +1 -1
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/pyproject.toml +1 -1
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/setup.py +1 -1
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai/post_extraction_agent.py +31 -4
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/README.md +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/setup.cfg +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai/__init__.py +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.5.7 → spiderforce4ai-2.5.9}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "2.5.7"
|
7
|
+
version = "2.5.9"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [
|
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name="spiderforce4ai",
|
10
|
-
version="2.5.7",
|
10
|
+
version="2.5.9",
|
11
11
|
author="Piotr Tamulewicz",
|
12
12
|
author_email="pt@petertam.pro",
|
13
13
|
description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
|
@@ -167,6 +167,8 @@ class PostExtractionAgent:
|
|
167
167
|
backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
|
168
168
|
self.config.output_file.rename(backup_path)
|
169
169
|
self.config.output_file.touch()
|
170
|
+
# Initialize empty results dictionary
|
171
|
+
self.results = {}
|
170
172
|
|
171
173
|
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
172
174
|
"""Process a single piece of content through the LLM."""
|
@@ -258,13 +260,33 @@ class PostExtractionAgent:
|
|
258
260
|
"""Save individual or combined results synchronously."""
|
259
261
|
try:
|
260
262
|
if self.config.combine_output and self.config.output_file:
|
263
|
+
# Update the results dictionary
|
261
264
|
self.results[url] = result
|
262
|
-
|
265
|
+
|
266
|
+
# Ensure output directory exists
|
267
|
+
self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
|
268
|
+
|
269
|
+
# Save combined results atomically
|
270
|
+
temp_file = self.config.output_file.with_suffix('.tmp')
|
271
|
+
with open(temp_file, 'w') as f:
|
263
272
|
json.dump(self.results, f, indent=2)
|
273
|
+
temp_file.replace(self.config.output_file)
|
274
|
+
|
275
|
+
logger.info(f"Updated combined results file with {url}")
|
276
|
+
|
277
|
+
# Cleanup backup files after successful save
|
278
|
+
for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
|
279
|
+
try:
|
280
|
+
backup_file.unlink()
|
281
|
+
logger.info(f"Cleaned up backup file: {backup_file}")
|
282
|
+
except Exception as e:
|
283
|
+
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
284
|
+
|
264
285
|
elif not self.config.combine_output and self.config.output_file:
|
265
286
|
individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
|
266
287
|
with open(individual_file, 'w') as f:
|
267
288
|
json.dump(result, f, indent=2)
|
289
|
+
logger.info(f"Saved individual result file for {url}")
|
268
290
|
except Exception as e:
|
269
291
|
logger.error(f"Error saving results for {url}: {str(e)}")
|
270
292
|
|
@@ -311,14 +333,19 @@ class PostExtractionAgent:
|
|
311
333
|
|
312
334
|
# Save result synchronously
|
313
335
|
try:
|
314
|
-
|
315
|
-
|
336
|
+
# Save both original and transformed result
|
337
|
+
if self.config.custom_transform_function:
|
338
|
+
self._save_result_sync(url, transformed_result)
|
339
|
+
logger.info(f"Saved transformed results for {url}")
|
340
|
+
else:
|
341
|
+
self._save_result_sync(url, result)
|
342
|
+
logger.info(f"Saved original results for {url}")
|
316
343
|
except Exception as e:
|
317
344
|
error_msg = f"Error saving results for {url}: {str(e)}"
|
318
345
|
logger.error(error_msg)
|
319
346
|
console.print(f"[red]{error_msg}[/red]")
|
320
347
|
|
321
|
-
return result
|
348
|
+
return transformed_result if self.config.custom_transform_function else result
|
322
349
|
|
323
350
|
# Wait before retry
|
324
351
|
if attempt < self.config.max_retries - 1:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|