spiderforce4ai 2.6__py3-none-any.whl → 2.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/post_extraction_agent.py +51 -24
- {spiderforce4ai-2.6.dist-info → spiderforce4ai-2.6.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.6.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.6.dist-info/RECORD +0 -7
- {spiderforce4ai-2.6.dist-info → spiderforce4ai-2.6.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.6.dist-info → spiderforce4ai-2.6.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.6.dist-info → spiderforce4ai-2.6.3.dist-info}/top_level.txt +0 -0
@@ -156,19 +156,34 @@ class PostExtractionAgent:
|
|
156
156
|
self.buffer = PostExtractionBuffer(config.buffer_file)
|
157
157
|
self.results: Dict[str, Any] = {}
|
158
158
|
self.rate_limiter = RateLimiter()
|
159
|
+
# Convert string path to Path object if needed
|
160
|
+
if isinstance(self.config.output_file, str):
|
161
|
+
self.config.output_file = Path(self.config.output_file)
|
159
162
|
self._setup_output()
|
160
163
|
|
161
164
|
def _setup_output(self) -> None:
|
162
165
|
"""Setup output file if combining results."""
|
163
166
|
if self.config.combine_output and self.config.output_file:
|
167
|
+
# Ensure parent directory exists
|
164
168
|
self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
|
169
|
+
|
170
|
+
# Load existing results if file exists
|
165
171
|
if self.config.output_file.exists():
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
+
try:
|
173
|
+
with open(self.config.output_file, 'r') as f:
|
174
|
+
self.results = json.load(f)
|
175
|
+
except json.JSONDecodeError:
|
176
|
+
# If file is corrupted, backup and start fresh
|
177
|
+
backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
|
178
|
+
self.config.output_file.rename(backup_path)
|
179
|
+
self.results = {}
|
180
|
+
|
181
|
+
# Create file if it doesn't exist
|
182
|
+
if not self.config.output_file.exists():
|
183
|
+
self.config.output_file.touch()
|
184
|
+
self.results = {}
|
185
|
+
|
186
|
+
logger.info(f"Initialized output file at {self.config.output_file}")
|
172
187
|
|
173
188
|
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
174
189
|
"""Process a single piece of content through the LLM."""
|
@@ -263,28 +278,38 @@ class PostExtractionAgent:
|
|
263
278
|
# Convert Path to string if needed
|
264
279
|
output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
|
265
280
|
|
266
|
-
#
|
267
|
-
|
268
|
-
|
269
|
-
|
281
|
+
# Ensure output directory exists
|
282
|
+
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
|
283
|
+
|
284
|
+
# Load existing results or create new
|
285
|
+
try:
|
286
|
+
if Path(output_file).exists():
|
287
|
+
with open(output_file, 'r', encoding='utf-8') as f:
|
270
288
|
self.results = json.load(f)
|
271
|
-
|
289
|
+
else:
|
272
290
|
self.results = {}
|
291
|
+
except (json.JSONDecodeError, FileNotFoundError):
|
292
|
+
self.results = {}
|
273
293
|
|
274
294
|
# Update results with new data
|
275
295
|
self.results[url] = result
|
276
296
|
|
277
|
-
# Ensure output directory exists
|
278
|
-
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
|
279
|
-
|
280
297
|
# Save combined results atomically
|
281
298
|
temp_file = f"{output_file}.tmp"
|
282
|
-
with open(temp_file, 'w') as f:
|
283
|
-
json.dump(self.results, f, indent=2)
|
299
|
+
with open(temp_file, 'w', encoding='utf-8') as f:
|
300
|
+
json.dump(self.results, f, indent=2, ensure_ascii=False)
|
284
301
|
|
285
|
-
# Atomic replace
|
302
|
+
# Atomic replace and cleanup backup files
|
286
303
|
Path(temp_file).replace(output_file)
|
287
|
-
logger.info(f"Updated combined results file with {url}")
|
304
|
+
logger.info(f"Updated combined results file with {url} in {output_file}")
|
305
|
+
|
306
|
+
# Cleanup all backup files
|
307
|
+
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
308
|
+
try:
|
309
|
+
backup_file.unlink()
|
310
|
+
logger.info(f"Cleaned up backup file: {backup_file}")
|
311
|
+
except Exception as e:
|
312
|
+
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
288
313
|
|
289
314
|
# Cleanup backup files
|
290
315
|
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
@@ -345,18 +370,20 @@ class PostExtractionAgent:
|
|
345
370
|
|
346
371
|
# Save result synchronously
|
347
372
|
try:
|
348
|
-
#
|
349
|
-
if self.config.custom_transform_function
|
350
|
-
|
351
|
-
|
373
|
+
# Always save the result, whether transformed or original
|
374
|
+
result_to_save = transformed_result if self.config.custom_transform_function else result
|
375
|
+
if self.config.combine_output and self.config.output_file:
|
376
|
+
self._save_result_sync(url, result_to_save)
|
377
|
+
logger.info(f"Saved results for {url} to combined output")
|
352
378
|
else:
|
353
|
-
self._save_result_sync(url,
|
354
|
-
logger.info(f"Saved
|
379
|
+
self._save_result_sync(url, result_to_save)
|
380
|
+
logger.info(f"Saved individual result for {url}")
|
355
381
|
except Exception as e:
|
356
382
|
error_msg = f"Error saving results for {url}: {str(e)}"
|
357
383
|
logger.error(error_msg)
|
358
384
|
console.print(f"[red]{error_msg}[/red]")
|
359
385
|
|
386
|
+
# Return the appropriate result
|
360
387
|
return transformed_result if self.config.custom_transform_function else result
|
361
388
|
|
362
389
|
# Wait before retry
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
|
3
|
+
spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
|
4
|
+
spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.6.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=AysrHIoD-IreqbvWqCDxyN7v8EPSdLOG9yxABamTZSg,17827
|
3
|
-
spiderforce4ai-2.6.dist-info/METADATA,sha256=JtIZ1-ojRvfm773-yF1a_M_x6eB5kbnb6WT5XT04KDA,9010
|
4
|
-
spiderforce4ai-2.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|