spiderforce4ai 2.5.9__py3-none-any.whl → 2.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/post_extraction_agent.py +61 -22
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.6.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.5.9.dist-info/RECORD +0 -7
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/top_level.txt +0 -0
@@ -156,19 +156,34 @@ class PostExtractionAgent:
|
|
156
156
|
self.buffer = PostExtractionBuffer(config.buffer_file)
|
157
157
|
self.results: Dict[str, Any] = {}
|
158
158
|
self.rate_limiter = RateLimiter()
|
159
|
+
# Convert string path to Path object if needed
|
160
|
+
if isinstance(self.config.output_file, str):
|
161
|
+
self.config.output_file = Path(self.config.output_file)
|
159
162
|
self._setup_output()
|
160
163
|
|
161
164
|
def _setup_output(self) -> None:
    """Prepare the combined-output file and load any results already in it.

    Does nothing unless ``self.config.combine_output`` is truthy and
    ``self.config.output_file`` is set. Otherwise it:

    * creates the parent directory of the output file if it is missing;
    * loads existing JSON content of the file into ``self.results``;
    * treats an empty (or whitespace-only) file as "no results yet" --
      this is exactly what the placeholder created below looks like, so
      it must not be mistaken for corruption;
    * renames a file that cannot be decoded/parsed to
      ``<stem>.bak_<unix-time>`` and starts with empty results;
    * creates an empty placeholder file when none exists.
    """
    if not (self.config.combine_output and self.config.output_file):
        return

    # Path() is a no-op for Path instances and tolerates a plain string.
    output_path = Path(self.config.output_file)

    # Ensure parent directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load existing results if file exists
    if output_path.exists():
        try:
            # Read as UTF-8 explicitly: the combined file is written as
            # UTF-8 with ensure_ascii=False, so the platform default
            # encoding is not a safe assumption.
            raw = output_path.read_text(encoding="utf-8")
            # A zero-byte placeholder is a fresh file, not a corrupted one.
            self.results = json.loads(raw) if raw.strip() else {}
        except (json.JSONDecodeError, UnicodeDecodeError):
            # If file is corrupted, backup and start fresh
            backup_path = output_path.with_suffix(f".bak_{int(time.time())}")
            output_path.rename(backup_path)
            self.results = {}

    # Create file if it doesn't exist (also true right after a backup rename)
    if not output_path.exists():
        output_path.touch()
        self.results = {}

    logger.info(f"Initialized output file at {self.config.output_file}")
|
172
187
|
|
173
188
|
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
174
189
|
"""Process a single piece of content through the LLM."""
|
@@ -260,22 +275,44 @@ class PostExtractionAgent:
|
|
260
275
|
"""Save individual or combined results synchronously."""
|
261
276
|
try:
|
262
277
|
if self.config.combine_output and self.config.output_file:
|
263
|
-
#
|
264
|
-
self.
|
278
|
+
# Convert Path to string if needed
|
279
|
+
output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
|
265
280
|
|
266
281
|
# Ensure output directory exists
|
267
|
-
|
282
|
+
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
|
283
|
+
|
284
|
+
# Load existing results or create new
|
285
|
+
try:
|
286
|
+
if Path(output_file).exists():
|
287
|
+
with open(output_file, 'r', encoding='utf-8') as f:
|
288
|
+
self.results = json.load(f)
|
289
|
+
else:
|
290
|
+
self.results = {}
|
291
|
+
except (json.JSONDecodeError, FileNotFoundError):
|
292
|
+
self.results = {}
|
293
|
+
|
294
|
+
# Update results with new data
|
295
|
+
self.results[url] = result
|
268
296
|
|
269
297
|
# Save combined results atomically
|
270
|
-
temp_file =
|
271
|
-
with open(temp_file, 'w') as f:
|
272
|
-
json.dump(self.results, f, indent=2)
|
273
|
-
temp_file.replace(self.config.output_file)
|
298
|
+
temp_file = f"{output_file}.tmp"
|
299
|
+
with open(temp_file, 'w', encoding='utf-8') as f:
|
300
|
+
json.dump(self.results, f, indent=2, ensure_ascii=False)
|
274
301
|
|
275
|
-
|
302
|
+
# Atomic replace and cleanup backup files
|
303
|
+
Path(temp_file).replace(output_file)
|
304
|
+
logger.info(f"Updated combined results file with {url} in {output_file}")
|
305
|
+
|
306
|
+
# Cleanup all backup files
|
307
|
+
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
308
|
+
try:
|
309
|
+
backup_file.unlink()
|
310
|
+
logger.info(f"Cleaned up backup file: {backup_file}")
|
311
|
+
except Exception as e:
|
312
|
+
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
276
313
|
|
277
|
-
# Cleanup backup files
|
278
|
-
for backup_file in
|
314
|
+
# Cleanup backup files
|
315
|
+
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
279
316
|
try:
|
280
317
|
backup_file.unlink()
|
281
318
|
logger.info(f"Cleaned up backup file: {backup_file}")
|
@@ -333,18 +370,20 @@ class PostExtractionAgent:
|
|
333
370
|
|
334
371
|
# Save result synchronously
|
335
372
|
try:
|
336
|
-
#
|
337
|
-
if self.config.custom_transform_function
|
338
|
-
|
339
|
-
|
373
|
+
# Always save the result, whether transformed or original
|
374
|
+
result_to_save = transformed_result if self.config.custom_transform_function else result
|
375
|
+
if self.config.combine_output and self.config.output_file:
|
376
|
+
self._save_result_sync(url, result_to_save)
|
377
|
+
logger.info(f"Saved results for {url} to combined output")
|
340
378
|
else:
|
341
|
-
self._save_result_sync(url,
|
342
|
-
logger.info(f"Saved
|
379
|
+
self._save_result_sync(url, result_to_save)
|
380
|
+
logger.info(f"Saved individual result for {url}")
|
343
381
|
except Exception as e:
|
344
382
|
error_msg = f"Error saving results for {url}: {str(e)}"
|
345
383
|
logger.error(error_msg)
|
346
384
|
console.print(f"[red]{error_msg}[/red]")
|
347
385
|
|
386
|
+
# Return the appropriate result
|
348
387
|
return transformed_result if self.config.custom_transform_function else result
|
349
388
|
|
350
389
|
# Wait before retry
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
|
3
|
+
spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
|
4
|
+
spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.6.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=K6AGqeDO_MZ4pQMlkmnfK6Y5Sa1BWkUWv9u7_LMxsuM,17314
|
3
|
-
spiderforce4ai-2.5.9.dist-info/METADATA,sha256=4qXFZ6sEYnqsjULabDNc0ez0ZTuTPa1FuUTXpGuXG0I,9012
|
4
|
-
spiderforce4ai-2.5.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.5.9.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.5.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.5.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|