spiderforce4ai-2.5.9-py3-none-any.whl → spiderforce4ai-2.6.3-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -156,19 +156,34 @@ class PostExtractionAgent:
156
156
  self.buffer = PostExtractionBuffer(config.buffer_file)
157
157
  self.results: Dict[str, Any] = {}
158
158
  self.rate_limiter = RateLimiter()
159
+ # Convert string path to Path object if needed
160
+ if isinstance(self.config.output_file, str):
161
+ self.config.output_file = Path(self.config.output_file)
159
162
  self._setup_output()
160
163
 
161
164
  def _setup_output(self) -> None:
162
165
  """Setup output file if combining results."""
163
166
  if self.config.combine_output and self.config.output_file:
167
+ # Ensure parent directory exists
164
168
  self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
169
+
170
+ # Load existing results if file exists
165
171
  if self.config.output_file.exists():
166
- # Backup existing file
167
- backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
168
- self.config.output_file.rename(backup_path)
169
- self.config.output_file.touch()
170
- # Initialize empty results dictionary
171
- self.results = {}
172
+ try:
173
+ with open(self.config.output_file, 'r') as f:
174
+ self.results = json.load(f)
175
+ except json.JSONDecodeError:
176
+ # If file is corrupted, backup and start fresh
177
+ backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
178
+ self.config.output_file.rename(backup_path)
179
+ self.results = {}
180
+
181
+ # Create file if it doesn't exist
182
+ if not self.config.output_file.exists():
183
+ self.config.output_file.touch()
184
+ self.results = {}
185
+
186
+ logger.info(f"Initialized output file at {self.config.output_file}")
172
187
 
173
188
  def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
174
189
  """Process a single piece of content through the LLM."""
@@ -260,22 +275,44 @@ class PostExtractionAgent:
260
275
  """Save individual or combined results synchronously."""
261
276
  try:
262
277
  if self.config.combine_output and self.config.output_file:
263
- # Update the results dictionary
264
- self.results[url] = result
278
+ # Convert Path to string if needed
279
+ output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
265
280
 
266
281
  # Ensure output directory exists
267
- self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
282
+ Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
+
284
+ # Load existing results or create new
285
+ try:
286
+ if Path(output_file).exists():
287
+ with open(output_file, 'r', encoding='utf-8') as f:
288
+ self.results = json.load(f)
289
+ else:
290
+ self.results = {}
291
+ except (json.JSONDecodeError, FileNotFoundError):
292
+ self.results = {}
293
+
294
+ # Update results with new data
295
+ self.results[url] = result
268
296
 
269
297
  # Save combined results atomically
270
- temp_file = self.config.output_file.with_suffix('.tmp')
271
- with open(temp_file, 'w') as f:
272
- json.dump(self.results, f, indent=2)
273
- temp_file.replace(self.config.output_file)
298
+ temp_file = f"{output_file}.tmp"
299
+ with open(temp_file, 'w', encoding='utf-8') as f:
300
+ json.dump(self.results, f, indent=2, ensure_ascii=False)
274
301
 
275
- logger.info(f"Updated combined results file with {url}")
302
+ # Atomic replace and cleanup backup files
303
+ Path(temp_file).replace(output_file)
304
+ logger.info(f"Updated combined results file with {url} in {output_file}")
305
+
306
+ # Cleanup all backup files
307
+ for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
+ try:
309
+ backup_file.unlink()
310
+ logger.info(f"Cleaned up backup file: {backup_file}")
311
+ except Exception as e:
312
+ logger.warning(f"Failed to remove backup file {backup_file}: {e}")
276
313
 
277
- # Cleanup backup files after successful save
278
- for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
314
+ # Cleanup backup files
315
+ for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
279
316
  try:
280
317
  backup_file.unlink()
281
318
  logger.info(f"Cleaned up backup file: {backup_file}")
@@ -333,18 +370,20 @@ class PostExtractionAgent:
333
370
 
334
371
  # Save result synchronously
335
372
  try:
336
- # Save both original and transformed result
337
- if self.config.custom_transform_function:
338
- self._save_result_sync(url, transformed_result)
339
- logger.info(f"Saved transformed results for {url}")
373
+ # Always save the result, whether transformed or original
374
+ result_to_save = transformed_result if self.config.custom_transform_function else result
375
+ if self.config.combine_output and self.config.output_file:
376
+ self._save_result_sync(url, result_to_save)
377
+ logger.info(f"Saved results for {url} to combined output")
340
378
  else:
341
- self._save_result_sync(url, result)
342
- logger.info(f"Saved original results for {url}")
379
+ self._save_result_sync(url, result_to_save)
380
+ logger.info(f"Saved individual result for {url}")
343
381
  except Exception as e:
344
382
  error_msg = f"Error saving results for {url}: {str(e)}"
345
383
  logger.error(error_msg)
346
384
  console.print(f"[red]{error_msg}[/red]")
347
385
 
386
+ # Return the appropriate result
348
387
  return transformed_result if self.config.custom_transform_function else result
349
388
 
350
389
  # Wait before retry
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.5.9
3
+ Version: 2.6.3
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,7 @@
1
+ spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
+ spiderforce4ai/post_extraction_agent.py,sha256=3HB54YrPCbQTUoZWINr7yHvwXwQywmq0f_RBJwKr2gg,19355
3
+ spiderforce4ai-2.6.3.dist-info/METADATA,sha256=L5GCJHggqks18Z31ru5DbDdXT3mdS8pYEDhfdR9igms,9012
4
+ spiderforce4ai-2.6.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
+ spiderforce4ai-2.6.3.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
+ spiderforce4ai-2.6.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
+ spiderforce4ai-2.6.3.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
2
- spiderforce4ai/post_extraction_agent.py,sha256=K6AGqeDO_MZ4pQMlkmnfK6Y5Sa1BWkUWv9u7_LMxsuM,17314
3
- spiderforce4ai-2.5.9.dist-info/METADATA,sha256=4qXFZ6sEYnqsjULabDNc0ez0ZTuTPa1FuUTXpGuXG0I,9012
4
- spiderforce4ai-2.5.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- spiderforce4ai-2.5.9.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
6
- spiderforce4ai-2.5.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
7
- spiderforce4ai-2.5.9.dist-info/RECORD,,