spiderforce4ai 2.6.3__tar.gz → 2.6.5__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/PKG-INFO +1 -1
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/pyproject.toml +1 -1
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/setup.py +1 -1
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai/post_extraction_agent.py +40 -49
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/README.md +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/setup.cfg +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai/__init__.py +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.6.3 → spiderforce4ai-2.6.5}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "2.6.3"
|
7
|
+
version = "2.6.5"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [
|
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
|
|
7
7
|
|
8
8
|
setup(
|
9
9
|
name="spiderforce4ai",
|
10
|
-
version="2.6.3",
|
10
|
+
version="2.6.5",
|
11
11
|
author="Piotr Tamulewicz",
|
12
12
|
author_email="pt@petertam.pro",
|
13
13
|
description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
|
@@ -159,6 +159,12 @@ class PostExtractionAgent:
|
|
159
159
|
# Convert string path to Path object if needed
|
160
160
|
if isinstance(self.config.output_file, str):
|
161
161
|
self.config.output_file = Path(self.config.output_file)
|
162
|
+
# Ensure parent directory exists
|
163
|
+
self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
|
164
|
+
# Create empty JSON file if it doesn't exist
|
165
|
+
if not self.config.output_file.exists():
|
166
|
+
with open(self.config.output_file, 'w') as f:
|
167
|
+
json.dump({}, f)
|
162
168
|
self._setup_output()
|
163
169
|
|
164
170
|
def _setup_output(self) -> None:
|
@@ -272,58 +278,35 @@ class PostExtractionAgent:
|
|
272
278
|
return None
|
273
279
|
|
274
280
|
def _save_result_sync(self, url: str, result: Dict) -> None:
|
275
|
-
"""Save
|
281
|
+
"""Save results synchronously to combined output file."""
|
276
282
|
try:
|
277
|
-
if self.config.combine_output and self.config.output_file:
|
278
|
-
#
|
279
|
-
output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
|
280
|
-
|
281
|
-
# Ensure output directory exists
|
282
|
-
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
|
283
|
-
|
284
|
-
# Load existing results or create new
|
283
|
+
if self.config.output_file:
|
284
|
+
# Load existing results
|
285
285
|
try:
|
286
|
-
|
287
|
-
|
288
|
-
self.results = json.load(f)
|
289
|
-
else:
|
290
|
-
self.results = {}
|
286
|
+
with open(self.config.output_file, 'r', encoding='utf-8') as f:
|
287
|
+
current_results = json.load(f)
|
291
288
|
except (json.JSONDecodeError, FileNotFoundError):
|
292
|
-
|
293
|
-
|
294
|
-
# Update
|
295
|
-
|
296
|
-
|
297
|
-
# Save
|
298
|
-
temp_file =
|
289
|
+
current_results = {}
|
290
|
+
|
291
|
+
# Update with new result
|
292
|
+
current_results[url] = result
|
293
|
+
|
294
|
+
# Save atomically using temporary file
|
295
|
+
temp_file = self.config.output_file.with_suffix('.tmp')
|
299
296
|
with open(temp_file, 'w', encoding='utf-8') as f:
|
300
|
-
json.dump(
|
301
|
-
|
302
|
-
# Atomic replace
|
303
|
-
|
304
|
-
logger.info(f"Updated combined results file with {url}
|
305
|
-
|
306
|
-
# Cleanup all backup files
|
307
|
-
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
308
|
-
try:
|
309
|
-
backup_file.unlink()
|
310
|
-
logger.info(f"Cleaned up backup file: {backup_file}")
|
311
|
-
except Exception as e:
|
312
|
-
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
313
|
-
|
297
|
+
json.dump(current_results, f, indent=2, ensure_ascii=False)
|
298
|
+
|
299
|
+
# Atomic replace
|
300
|
+
temp_file.replace(self.config.output_file)
|
301
|
+
logger.info(f"Updated combined results file with {url}")
|
302
|
+
|
314
303
|
# Cleanup backup files
|
315
|
-
for backup_file in
|
304
|
+
for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
|
316
305
|
try:
|
317
306
|
backup_file.unlink()
|
318
307
|
logger.info(f"Cleaned up backup file: {backup_file}")
|
319
308
|
except Exception as e:
|
320
309
|
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
321
|
-
|
322
|
-
elif not self.config.combine_output and self.config.output_file:
|
323
|
-
individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
|
324
|
-
with open(individual_file, 'w') as f:
|
325
|
-
json.dump(result, f, indent=2)
|
326
|
-
logger.info(f"Saved individual result file for {url}")
|
327
310
|
except Exception as e:
|
328
311
|
logger.error(f"Error saving results for {url}: {str(e)}")
|
329
312
|
|
@@ -358,26 +341,34 @@ class PostExtractionAgent:
|
|
358
341
|
# Add URL to result before transformation
|
359
342
|
result['url'] = url
|
360
343
|
|
361
|
-
logger.info(f"Executing
|
344
|
+
logger.info(f"Executing transformer function for {url}")
|
362
345
|
transformed_result = self.config.custom_transform_function(result)
|
363
346
|
logger.info(f"Successfully applied custom transformation for {url}")
|
347
|
+
|
348
|
+
# Save the transformed result to combined output
|
349
|
+
if self.config.output_file:
|
350
|
+
self._save_result_sync(url, transformed_result)
|
351
|
+
logger.info(f"Saved transformed result to combined output for {url}")
|
352
|
+
|
364
353
|
logger.info(f"Webhook response sent for {url}")
|
365
|
-
return transformed_result
|
354
|
+
return transformed_result
|
366
355
|
except Exception as e:
|
367
356
|
error_msg = f"Warning: Issue in custom transform for {url}: {str(e)}"
|
368
357
|
logger.warning(error_msg)
|
369
358
|
console.print(f"[yellow]{error_msg}[/yellow]")
|
359
|
+
|
360
|
+
# Save original result if transformation fails
|
361
|
+
if self.config.output_file:
|
362
|
+
self._save_result_sync(url, result)
|
363
|
+
logger.info(f"Saved original result to combined output for {url}")
|
370
364
|
|
371
365
|
# Save result synchronously
|
372
366
|
try:
|
373
367
|
# Always save the result, whether transformed or original
|
374
368
|
result_to_save = transformed_result if self.config.custom_transform_function else result
|
375
|
-
if self.config.
|
376
|
-
self._save_result_sync(url, result_to_save)
|
377
|
-
logger.info(f"Saved results for {url} to combined output")
|
378
|
-
else:
|
369
|
+
if self.config.output_file:
|
379
370
|
self._save_result_sync(url, result_to_save)
|
380
|
-
logger.info(f"Saved
|
371
|
+
logger.info(f"Saved results for {url} to {self.config.output_file}")
|
381
372
|
except Exception as e:
|
382
373
|
error_msg = f"Error saving results for {url}: {str(e)}"
|
383
374
|
logger.error(error_msg)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|