spiderforce4ai 2.5.9__py3-none-any.whl → 2.6.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- spiderforce4ai/post_extraction_agent.py +61 -22
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/METADATA +1 -1
- spiderforce4ai-2.6.3.dist-info/RECORD +7 -0
- spiderforce4ai-2.5.9.dist-info/RECORD +0 -7
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/WHEEL +0 -0
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.5.9.dist-info → spiderforce4ai-2.6.3.dist-info}/top_level.txt +0 -0
@@ -156,19 +156,34 @@ class PostExtractionAgent:
|
|
156
156
|
self.buffer = PostExtractionBuffer(config.buffer_file)
|
157
157
|
self.results: Dict[str, Any] = {}
|
158
158
|
self.rate_limiter = RateLimiter()
|
159
|
+
# Convert string path to Path object if needed
|
160
|
+
if isinstance(self.config.output_file, str):
|
161
|
+
self.config.output_file = Path(self.config.output_file)
|
159
162
|
self._setup_output()
|
160
163
|
|
161
164
|
def _setup_output(self) -> None:
|
162
165
|
"""Setup output file if combining results."""
|
163
166
|
if self.config.combine_output and self.config.output_file:
|
167
|
+
# Ensure parent directory exists
|
164
168
|
self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
|
169
|
+
|
170
|
+
# Load existing results if file exists
|
165
171
|
if self.config.output_file.exists():
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
+
try:
|
173
|
+
with open(self.config.output_file, 'r') as f:
|
174
|
+
self.results = json.load(f)
|
175
|
+
except json.JSONDecodeError:
|
176
|
+
# If file is corrupted, backup and start fresh
|
177
|
+
backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
|
178
|
+
self.config.output_file.rename(backup_path)
|
179
|
+
self.results = {}
|
180
|
+
|
181
|
+
# Create file if it doesn't exist
|
182
|
+
if not self.config.output_file.exists():
|
183
|
+
self.config.output_file.touch()
|
184
|
+
self.results = {}
|
185
|
+
|
186
|
+
logger.info(f"Initialized output file at {self.config.output_file}")
|
172
187
|
|
173
188
|
def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
|
174
189
|
"""Process a single piece of content through the LLM."""
|
@@ -260,22 +275,44 @@ class PostExtractionAgent:
|
|
260
275
|
"""Save individual or combined results synchronously."""
|
261
276
|
try:
|
262
277
|
if self.config.combine_output and self.config.output_file:
|
263
|
-
#
|
264
|
-
self.
|
278
|
+
# Convert Path to string if needed
|
279
|
+
output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
|
265
280
|
|
266
281
|
# Ensure output directory exists
|
267
|
-
|
282
|
+
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
|
283
|
+
|
284
|
+
# Load existing results or create new
|
285
|
+
try:
|
286
|
+
if Path(output_file).exists():
|
287
|
+
with open(output_file, 'r', encoding='utf-8') as f:
|
288
|
+
self.results = json.load(f)
|
289
|
+
else:
|
290
|
+
self.results = {}
|
291
|
+
except (json.JSONDecodeError, FileNotFoundError):
|
292
|
+
self.results = {}
|
293
|
+
|
294
|
+
# Update results with new data
|
295
|
+
self.results[url] = result
|
268
296
|
|
269
297
|
# Save combined results atomically
|
270
|
-
temp_file =
|
271
|
-
with open(temp_file, 'w') as f:
|
272
|
-
json.dump(self.results, f, indent=2)
|
273
|
-
temp_file.replace(self.config.output_file)
|
298
|
+
temp_file = f"{output_file}.tmp"
|
299
|
+
with open(temp_file, 'w', encoding='utf-8') as f:
|
300
|
+
json.dump(self.results, f, indent=2, ensure_ascii=False)
|
274
301
|
|
275
|
-
|
302
|
+
# Atomic replace and cleanup backup files
|
303
|
+
Path(temp_file).replace(output_file)
|
304
|
+
logger.info(f"Updated combined results file with {url} in {output_file}")
|
305
|
+
|
306
|
+
# Cleanup all backup files
|
307
|
+
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
308
|
+
try:
|
309
|
+
backup_file.unlink()
|
310
|
+
logger.info(f"Cleaned up backup file: {backup_file}")
|
311
|
+
except Exception as e:
|
312
|
+
logger.warning(f"Failed to remove backup file {backup_file}: {e}")
|
276
313
|
|
277
|
-
# Cleanup backup files
|
278
|
-
for backup_file in
|
314
|
+
# Cleanup backup files
|
315
|
+
for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
|
279
316
|
try:
|
280
317
|
backup_file.unlink()
|
281
318
|
logger.info(f"Cleaned up backup file: {backup_file}")
|
@@ -333,18 +370,20 @@ class PostExtractionAgent:
|
|
333
370
|
|
334
371
|
# Save result synchronously
|
335
372
|
try:
|
336
|
-
#
|
337
|
-
if self.config.custom_transform_function
|
338
|
-
|
339
|
-
|
373
|
+
# Always save the result, whether transformed or original
|
374
|
+
result_to_save = transformed_result if self.config.custom_transform_function else result
|
375
|
+
if self.config.combine_output and self.config.output_file:
|
376
|
+
self._save_result_sync(url, result_to_save)
|
377
|
+
logger.info(f"Saved results for {url} to combined output")
|
340
378
|
else:
|
341
|
-
self._save_result_sync(url,
|
342
|
-
logger.info(f"Saved
|
379
|
+
self._save_result_sync(url, result_to_save)
|
380
|
+
logger.info(f"Saved individual result for {url}")
|
343
381
|
except Exception as e:
|
344
382
|
error_msg = f"Error saving results for {url}: {str(e)}"
|
345
383
|
logger.error(error_msg)
|
346
384
|
console.print(f"[red]{error_msg}[/red]")
|
347
385
|
|
386
|
+
# Return the appropriate result
|
348
387
|
return transformed_result if self.config.custom_transform_function else result
|
349
388
|
|
350
389
|
# Wait before retry
|
@@ -0,0 +1,7 @@
|
|
1
|
+
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
+
spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
|
3
|
+
spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
|
4
|
+
spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
+
spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
+
spiderforce4ai-2.6.3.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
|
2
|
-
spiderforce4ai/post_extraction_agent.py,sha256=K6AGqeDO_MZ4pQMlkmnfK6Y5Sa1BWkUWv9u7_LMxsuM,17314
|
3
|
-
spiderforce4ai-2.5.9.dist-info/METADATA,sha256=4qXFZ6sEYnqsjULabDNc0ez0ZTuTPa1FuUTXpGuXG0I,9012
|
4
|
-
spiderforce4ai-2.5.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
-
spiderforce4ai-2.5.9.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
|
6
|
-
spiderforce4ai-2.5.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
7
|
-
spiderforce4ai-2.5.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|