spiderforce4ai 2.6.tar.gz → 2.6.4.tar.gz
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/PKG-INFO +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/pyproject.toml +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.py +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/post_extraction_agent.py +53 -46
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/README.md +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.cfg +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/__init__.py +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/entry_points.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/not-zip-safe +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai.egg-info/top_level.txt +0 -0
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.6"
+version = "2.6.4"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/setup.py

@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="spiderforce4ai",
-    version="2.6",
+    version="2.6.4",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
{spiderforce4ai-2.6 → spiderforce4ai-2.6.4}/spiderforce4ai/post_extraction_agent.py

@@ -156,19 +156,40 @@ class PostExtractionAgent:
         self.buffer = PostExtractionBuffer(config.buffer_file)
         self.results: Dict[str, Any] = {}
         self.rate_limiter = RateLimiter()
+        # Convert string path to Path object if needed
+        if isinstance(self.config.output_file, str):
+            self.config.output_file = Path(self.config.output_file)
+        # Ensure parent directory exists
+        self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+        # Create empty JSON file if it doesn't exist
+        if not self.config.output_file.exists():
+            with open(self.config.output_file, 'w') as f:
+                json.dump({}, f)
         self._setup_output()
 
     def _setup_output(self) -> None:
         """Setup output file if combining results."""
         if self.config.combine_output and self.config.output_file:
+            # Ensure parent directory exists
             self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+            # Load existing results if file exists
             if self.config.output_file.exists():
-
-
-
-
-
-
+                try:
+                    with open(self.config.output_file, 'r') as f:
+                        self.results = json.load(f)
+                except json.JSONDecodeError:
+                    # If file is corrupted, backup and start fresh
+                    backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                    self.config.output_file.rename(backup_path)
+                    self.results = {}
+
+            # Create file if it doesn't exist
+            if not self.config.output_file.exists():
+                self.config.output_file.touch()
+                self.results = {}
+
+            logger.info(f"Initialized output file at {self.config.output_file}")
 
     def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
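Taken together, the new lines make output-file setup defensive: a plain string path is coerced to Path, parent directories are created, an empty JSON object seeds the file, and a corrupted results file is quarantined under a timestamped .bak_* name instead of crashing the agent. A minimal standalone sketch of that pattern (function names here are illustrative, not part of the spiderforce4ai API):

import json
import time
from pathlib import Path
from typing import Any, Dict, Union

def init_results_file(output_file: Union[str, Path]) -> Path:
    # Accept str or Path, create parent directories, and seed a valid
    # (empty) JSON document so later loads never hit a missing file.
    path = Path(output_file)
    path.parent.mkdir(parents=True, exist_ok=True)
    if not path.exists():
        path.write_text("{}")
    return path

def load_or_reset(path: Path) -> Dict[str, Any]:
    # Load existing results; on corruption, rename the bad file to a
    # timestamped backup (e.g. results.bak_1700000000) and start fresh.
    try:
        with open(path, "r") as f:
            return json.load(f)
    except json.JSONDecodeError:
        backup = path.with_suffix(f".bak_{int(time.time())}")
        path.rename(backup)
        return {}

The timestamped suffix matters later: _save_result_sync globs for *.bak_* files and deletes them once a good write has succeeded.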
@@ -257,48 +278,35 @@ class PostExtractionAgent:
         return None
 
     def _save_result_sync(self, url: str, result: Dict) -> None:
-        """Save
+        """Save results synchronously to combined output file."""
         try:
-            if self.config.
-                #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                Path(output_file).parent.mkdir(parents=True, exist_ok=True)
-
-                # Save combined results atomically
-                temp_file = f"{output_file}.tmp"
-                with open(temp_file, 'w') as f:
-                    json.dump(self.results, f, indent=2)
-
+            if self.config.output_file:
+                # Load existing results
+                try:
+                    with open(self.config.output_file, 'r', encoding='utf-8') as f:
+                        current_results = json.load(f)
+                except (json.JSONDecodeError, FileNotFoundError):
+                    current_results = {}
+
+                # Update with new result
+                current_results[url] = result
+
+                # Save atomically using temporary file
+                temp_file = self.config.output_file.with_suffix('.tmp')
+                with open(temp_file, 'w', encoding='utf-8') as f:
+                    json.dump(current_results, f, indent=2, ensure_ascii=False)
+
                 # Atomic replace
-
+                temp_file.replace(self.config.output_file)
                 logger.info(f"Updated combined results file with {url}")
-
+
                 # Cleanup backup files
-                for backup_file in
+                for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
                     try:
                         backup_file.unlink()
                         logger.info(f"Cleaned up backup file: {backup_file}")
                     except Exception as e:
                         logger.warning(f"Failed to remove backup file {backup_file}: {e}")
-
-            elif not self.config.combine_output and self.config.output_file:
-                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
-                with open(individual_file, 'w') as f:
-                    json.dump(result, f, indent=2)
-                logger.info(f"Saved individual result file for {url}")
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")
 
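The rewritten body replaces the old dump-everything approach with a read-modify-write cycle that is atomic at the filesystem level: the updated results go to a sibling .tmp file first, and Path.replace then swaps it over the real output file in a single rename. A condensed sketch of the pattern (standalone function, illustrative name):

import json
from pathlib import Path
from typing import Any, Dict

def save_result_atomically(output_file: Path, url: str, result: Dict[str, Any]) -> None:
    # Read the current combined results, tolerating a missing or corrupt file.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            current: Dict[str, Any] = json.load(f)
    except (json.JSONDecodeError, FileNotFoundError):
        current = {}

    current[url] = result

    # Write to a temp file next to the target, then rename over it.
    # The rename is a single operation, so readers never observe a
    # half-written JSON file.
    temp = output_file.with_suffix(".tmp")
    with open(temp, "w", encoding="utf-8") as f:
        json.dump(current, f, indent=2, ensure_ascii=False)
    temp.replace(output_file)

Keeping the temp file in the same directory keeps the rename on one filesystem, which is what makes the final step atomic on POSIX; Path.replace also overwrites an existing target on Windows.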
@@ -345,18 +353,17 @@ class PostExtractionAgent:
 
                 # Save result synchronously
                 try:
-                    #
-                    if self.config.custom_transform_function
-
-
-
-                    self._save_result_sync(url, result)
-                    logger.info(f"Saved original results for {url}")
+                    # Always save the result, whether transformed or original
+                    result_to_save = transformed_result if self.config.custom_transform_function else result
+                    if self.config.output_file:
+                        self._save_result_sync(url, result_to_save)
+                        logger.info(f"Saved results for {url} to {self.config.output_file}")
                 except Exception as e:
                     error_msg = f"Error saving results for {url}: {str(e)}"
                     logger.error(error_msg)
                     console.print(f"[red]{error_msg}[/red]")
 
+                # Return the appropriate result
                 return transformed_result if self.config.custom_transform_function else result
 
                 # Wait before retry
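Net effect of this hunk: 2.6 saved the untransformed result (and logged "Saved original results") even when a custom_transform_function was configured; 2.6.4 persists the transformed payload and skips saving entirely when no output_file is set. A hedged usage sketch — ConfigSketch below is a hypothetical stand-in for the package's config class, but output_file and custom_transform_function are the field names this diff actually reads:

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, Union

@dataclass
class ConfigSketch:
    # Hypothetical stand-in; the real spiderforce4ai config class may differ.
    output_file: Optional[Union[str, Path]] = None
    custom_transform_function: Optional[Callable[[dict], dict]] = None

def keep_title_only(result: dict) -> dict:
    # Custom transform: reduce each LLM result to the fields wanted on disk.
    return {"title": result.get("title", "")}

config = ConfigSketch(
    output_file="results/combined.json",       # a str is accepted as of 2.6.4
    custom_transform_function=keep_title_only,
)
# With 2.6.4, the combined JSON maps each URL to keep_title_only(result);
# 2.6 wrote the untransformed result despite the configured transform.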