spiderforce4ai 2.6.3__py3-none-any.whl → 2.6.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -159,6 +159,12 @@ class PostExtractionAgent:
159
159
  # Convert string path to Path object if needed
160
160
  if isinstance(self.config.output_file, str):
161
161
  self.config.output_file = Path(self.config.output_file)
162
+ # Ensure parent directory exists
163
+ self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
164
+ # Create empty JSON file if it doesn't exist
165
+ if not self.config.output_file.exists():
166
+ with open(self.config.output_file, 'w') as f:
167
+ json.dump({}, f)
162
168
  self._setup_output()
163
169
 
164
170
  def _setup_output(self) -> None:
@@ -272,58 +278,35 @@ class PostExtractionAgent:
272
278
  return None
273
279
 
274
280
  def _save_result_sync(self, url: str, result: Dict) -> None:
275
- """Save individual or combined results synchronously."""
281
+ """Save results synchronously to combined output file."""
276
282
  try:
277
- if self.config.combine_output and self.config.output_file:
278
- # Convert Path to string if needed
279
- output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
280
-
281
- # Ensure output directory exists
282
- Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
-
284
- # Load existing results or create new
283
+ if self.config.output_file:
284
+ # Load existing results
285
285
  try:
286
- if Path(output_file).exists():
287
- with open(output_file, 'r', encoding='utf-8') as f:
288
- self.results = json.load(f)
289
- else:
290
- self.results = {}
286
+ with open(self.config.output_file, 'r', encoding='utf-8') as f:
287
+ current_results = json.load(f)
291
288
  except (json.JSONDecodeError, FileNotFoundError):
292
- self.results = {}
293
-
294
- # Update results with new data
295
- self.results[url] = result
296
-
297
- # Save combined results atomically
298
- temp_file = f"{output_file}.tmp"
289
+ current_results = {}
290
+
291
+ # Update with new result
292
+ current_results[url] = result
293
+
294
+ # Save atomically using temporary file
295
+ temp_file = self.config.output_file.with_suffix('.tmp')
299
296
  with open(temp_file, 'w', encoding='utf-8') as f:
300
- json.dump(self.results, f, indent=2, ensure_ascii=False)
301
-
302
- # Atomic replace and cleanup backup files
303
- Path(temp_file).replace(output_file)
304
- logger.info(f"Updated combined results file with {url} in {output_file}")
305
-
306
- # Cleanup all backup files
307
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
- try:
309
- backup_file.unlink()
310
- logger.info(f"Cleaned up backup file: {backup_file}")
311
- except Exception as e:
312
- logger.warning(f"Failed to remove backup file {backup_file}: {e}")
313
-
297
+ json.dump(current_results, f, indent=2, ensure_ascii=False)
298
+
299
+ # Atomic replace
300
+ temp_file.replace(self.config.output_file)
301
+ logger.info(f"Updated combined results file with {url}")
302
+
314
303
  # Cleanup backup files
315
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
304
+ for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
316
305
  try:
317
306
  backup_file.unlink()
318
307
  logger.info(f"Cleaned up backup file: {backup_file}")
319
308
  except Exception as e:
320
309
  logger.warning(f"Failed to remove backup file {backup_file}: {e}")
321
-
322
- elif not self.config.combine_output and self.config.output_file:
323
- individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
324
- with open(individual_file, 'w') as f:
325
- json.dump(result, f, indent=2)
326
- logger.info(f"Saved individual result file for {url}")
327
310
  except Exception as e:
328
311
  logger.error(f"Error saving results for {url}: {str(e)}")
329
312
 
@@ -372,12 +355,9 @@ class PostExtractionAgent:
372
355
  try:
373
356
  # Always save the result, whether transformed or original
374
357
  result_to_save = transformed_result if self.config.custom_transform_function else result
375
- if self.config.combine_output and self.config.output_file:
376
- self._save_result_sync(url, result_to_save)
377
- logger.info(f"Saved results for {url} to combined output")
378
- else:
358
+ if self.config.output_file:
379
359
  self._save_result_sync(url, result_to_save)
380
- logger.info(f"Saved individual result for {url}")
360
+ logger.info(f"Saved results for {url} to {self.config.output_file}")
381
361
  except Exception as e:
382
362
  error_msg = f"Error saving results for {url}: {str(e)}"
383
363
  logger.error(error_msg)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.6.3
3
+ Version: 2.6.4
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
+ spiderforce4ai/post_extraction_agent.py,sha256=GJXV-qTi9xAwil8YSUBaB0OUDDPDzWfcYScldxKsenw,18121
3
+ spiderforce4ai-2.6.4.dist-info/METADATA,sha256=pzdSGVryHkPAOuf3UcBrimThEi4paBzYKFoWCM-wVvY,9012
4
+ spiderforce4ai-2.6.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.6.4.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.6.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.6.4.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
- spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
3
- spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
4
- spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.6.3.dist-info/RECORD,,