spiderforce4ai 2.6.3__tar.gz → 2.6.4__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.6.3
3
+ Version: 2.6.4
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "2.6.3"
7
+ version = "2.6.4"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
9
9
  readme = "README.md"
10
10
  authors = [
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
7
7
 
8
8
  setup(
9
9
  name="spiderforce4ai",
10
- version="2.6.3",
10
+ version="2.6.4",
11
11
  author="Piotr Tamulewicz",
12
12
  author_email="pt@petertam.pro",
13
13
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
@@ -159,6 +159,12 @@ class PostExtractionAgent:
159
159
  # Convert string path to Path object if needed
160
160
  if isinstance(self.config.output_file, str):
161
161
  self.config.output_file = Path(self.config.output_file)
162
+ # Ensure parent directory exists
163
+ self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
164
+ # Create empty JSON file if it doesn't exist
165
+ if not self.config.output_file.exists():
166
+ with open(self.config.output_file, 'w') as f:
167
+ json.dump({}, f)
162
168
  self._setup_output()
163
169
 
164
170
  def _setup_output(self) -> None:
@@ -272,58 +278,35 @@ class PostExtractionAgent:
272
278
  return None
273
279
 
274
280
  def _save_result_sync(self, url: str, result: Dict) -> None:
275
- """Save individual or combined results synchronously."""
281
+ """Save results synchronously to combined output file."""
276
282
  try:
277
- if self.config.combine_output and self.config.output_file:
278
- # Convert Path to string if needed
279
- output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
280
-
281
- # Ensure output directory exists
282
- Path(output_file).parent.mkdir(parents=True, exist_ok=True)
283
-
284
- # Load existing results or create new
283
+ if self.config.output_file:
284
+ # Load existing results
285
285
  try:
286
- if Path(output_file).exists():
287
- with open(output_file, 'r', encoding='utf-8') as f:
288
- self.results = json.load(f)
289
- else:
290
- self.results = {}
286
+ with open(self.config.output_file, 'r', encoding='utf-8') as f:
287
+ current_results = json.load(f)
291
288
  except (json.JSONDecodeError, FileNotFoundError):
292
- self.results = {}
293
-
294
- # Update results with new data
295
- self.results[url] = result
296
-
297
- # Save combined results atomically
298
- temp_file = f"{output_file}.tmp"
289
+ current_results = {}
290
+
291
+ # Update with new result
292
+ current_results[url] = result
293
+
294
+ # Save atomically using temporary file
295
+ temp_file = self.config.output_file.with_suffix('.tmp')
299
296
  with open(temp_file, 'w', encoding='utf-8') as f:
300
- json.dump(self.results, f, indent=2, ensure_ascii=False)
301
-
302
- # Atomic replace and cleanup backup files
303
- Path(temp_file).replace(output_file)
304
- logger.info(f"Updated combined results file with {url} in {output_file}")
305
-
306
- # Cleanup all backup files
307
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
308
- try:
309
- backup_file.unlink()
310
- logger.info(f"Cleaned up backup file: {backup_file}")
311
- except Exception as e:
312
- logger.warning(f"Failed to remove backup file {backup_file}: {e}")
313
-
297
+ json.dump(current_results, f, indent=2, ensure_ascii=False)
298
+
299
+ # Atomic replace
300
+ temp_file.replace(self.config.output_file)
301
+ logger.info(f"Updated combined results file with {url}")
302
+
314
303
  # Cleanup backup files
315
- for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
304
+ for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
316
305
  try:
317
306
  backup_file.unlink()
318
307
  logger.info(f"Cleaned up backup file: {backup_file}")
319
308
  except Exception as e:
320
309
  logger.warning(f"Failed to remove backup file {backup_file}: {e}")
321
-
322
- elif not self.config.combine_output and self.config.output_file:
323
- individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
324
- with open(individual_file, 'w') as f:
325
- json.dump(result, f, indent=2)
326
- logger.info(f"Saved individual result file for {url}")
327
310
  except Exception as e:
328
311
  logger.error(f"Error saving results for {url}: {str(e)}")
329
312
 
@@ -372,12 +355,9 @@ class PostExtractionAgent:
372
355
  try:
373
356
  # Always save the result, whether transformed or original
374
357
  result_to_save = transformed_result if self.config.custom_transform_function else result
375
- if self.config.combine_output and self.config.output_file:
376
- self._save_result_sync(url, result_to_save)
377
- logger.info(f"Saved results for {url} to combined output")
378
- else:
358
+ if self.config.output_file:
379
359
  self._save_result_sync(url, result_to_save)
380
- logger.info(f"Saved individual result for {url}")
360
+ logger.info(f"Saved results for {url} to {self.config.output_file}")
381
361
  except Exception as e:
382
362
  error_msg = f"Error saving results for {url}: {str(e)}"
383
363
  logger.error(error_msg)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.6.3
3
+ Version: 2.6.4
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes