spiderforce4ai 2.6.tar.gz → 2.6.4.tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.6
+Version: 2.6.4
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.6"
+version = "2.6.4"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing"
 readme = "README.md"
 authors = [
@@ -7,7 +7,7 @@ with open("README.md", encoding="utf-8") as f:
 
 setup(
     name="spiderforce4ai",
-    version="2.6",
+    version="2.6.4",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing",
@@ -156,19 +156,40 @@ class PostExtractionAgent:
         self.buffer = PostExtractionBuffer(config.buffer_file)
         self.results: Dict[str, Any] = {}
         self.rate_limiter = RateLimiter()
+        # Convert string path to Path object if needed
+        if isinstance(self.config.output_file, str):
+            self.config.output_file = Path(self.config.output_file)
+        # Ensure parent directory exists
+        self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+        # Create empty JSON file if it doesn't exist
+        if not self.config.output_file.exists():
+            with open(self.config.output_file, 'w') as f:
+                json.dump({}, f)
         self._setup_output()
 
     def _setup_output(self) -> None:
         """Setup output file if combining results."""
         if self.config.combine_output and self.config.output_file:
+            # Ensure parent directory exists
             self.config.output_file.parent.mkdir(parents=True, exist_ok=True)
+
+            # Load existing results if file exists
             if self.config.output_file.exists():
-                # Backup existing file
-                backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
-                self.config.output_file.rename(backup_path)
-                self.config.output_file.touch()
-                # Initialize empty results dictionary
-                self.results = {}
+                try:
+                    with open(self.config.output_file, 'r') as f:
+                        self.results = json.load(f)
+                except json.JSONDecodeError:
+                    # If file is corrupted, backup and start fresh
+                    backup_path = self.config.output_file.with_suffix(f".bak_{int(time.time())}")
+                    self.config.output_file.rename(backup_path)
+                    self.results = {}
+
+            # Create file if it doesn't exist
+            if not self.config.output_file.exists():
+                self.config.output_file.touch()
+                self.results = {}
+
+            logger.info(f"Initialized output file at {self.config.output_file}")
 
     def _process_single_content(self, url: str, content: str) -> Optional[Dict]:
         """Process a single piece of content through the LLM."""
@@ -257,48 +278,35 @@ class PostExtractionAgent:
             return None
 
     def _save_result_sync(self, url: str, result: Dict) -> None:
-        """Save individual or combined results synchronously."""
+        """Save results synchronously to combined output file."""
         try:
-            if self.config.combine_output and self.config.output_file:
-                # Convert Path to string if needed
-                output_file = str(self.config.output_file) if isinstance(self.config.output_file, Path) else self.config.output_file
-
-                # Load existing results if file exists
-                if Path(output_file).exists():
-                    try:
-                        with open(output_file, 'r') as f:
-                            self.results = json.load(f)
-                    except json.JSONDecodeError:
-                        self.results = {}
-
-                # Update results with new data
-                self.results[url] = result
-
-                # Ensure output directory exists
-                Path(output_file).parent.mkdir(parents=True, exist_ok=True)
-
-                # Save combined results atomically
-                temp_file = f"{output_file}.tmp"
-                with open(temp_file, 'w') as f:
-                    json.dump(self.results, f, indent=2)
-
+            if self.config.output_file:
+                # Load existing results
+                try:
+                    with open(self.config.output_file, 'r', encoding='utf-8') as f:
+                        current_results = json.load(f)
+                except (json.JSONDecodeError, FileNotFoundError):
+                    current_results = {}
+
+                # Update with new result
+                current_results[url] = result
+
+                # Save atomically using temporary file
+                temp_file = self.config.output_file.with_suffix('.tmp')
+                with open(temp_file, 'w', encoding='utf-8') as f:
+                    json.dump(current_results, f, indent=2, ensure_ascii=False)
+
                 # Atomic replace
-                Path(temp_file).replace(output_file)
+                temp_file.replace(self.config.output_file)
                 logger.info(f"Updated combined results file with {url}")
-
+
                 # Cleanup backup files
-                for backup_file in Path(output_file).parent.glob(f"{Path(output_file).stem}.bak_*"):
+                for backup_file in self.config.output_file.parent.glob(f"{self.config.output_file.stem}.bak_*"):
                     try:
                         backup_file.unlink()
                         logger.info(f"Cleaned up backup file: {backup_file}")
                     except Exception as e:
                         logger.warning(f"Failed to remove backup file {backup_file}: {e}")
-
-            elif not self.config.combine_output and self.config.output_file:
-                individual_file = self.config.output_file.parent / f"{url.replace('/', '_')}.json"
-                with open(individual_file, 'w') as f:
-                    json.dump(result, f, indent=2)
-                logger.info(f"Saved individual result file for {url}")
         except Exception as e:
             logger.error(f"Error saving results for {url}: {str(e)}")
 
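The rewritten _save_result_sync keeps the write-to-temp-then-replace pattern, so a crash mid-write cannot leave the combined JSON file half-written. A minimal standalone sketch of that pattern follows; the helper name and file paths are illustrative, not part of the package:

import json
from pathlib import Path

def atomic_json_update(path: Path, key: str, value: dict) -> None:
    # Load whatever is already on disk; fall back to an empty mapping.
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        data = {}

    data[key] = value

    # Write the full document to a sibling temp file, then swap it in.
    # Path.replace() is atomic on the same filesystem.
    tmp = path.with_suffix(".tmp")
    tmp.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
    tmp.replace(path)

# Example: atomic_json_update(Path("combined_results.json"), "https://example.com", {"title": "..."})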
@@ -345,18 +353,17 @@ class PostExtractionAgent:
 
                     # Save result synchronously
                     try:
-                        # Save both original and transformed result
-                        if self.config.custom_transform_function:
-                            self._save_result_sync(url, transformed_result)
-                            logger.info(f"Saved transformed results for {url}")
-                        else:
-                            self._save_result_sync(url, result)
-                            logger.info(f"Saved original results for {url}")
+                        # Always save the result, whether transformed or original
+                        result_to_save = transformed_result if self.config.custom_transform_function else result
+                        if self.config.output_file:
+                            self._save_result_sync(url, result_to_save)
+                            logger.info(f"Saved results for {url} to {self.config.output_file}")
                     except Exception as e:
                         error_msg = f"Error saving results for {url}: {str(e)}"
                         logger.error(error_msg)
                         console.print(f"[red]{error_msg}[/red]")
 
+                    # Return the appropriate result
                     return transformed_result if self.config.custom_transform_function else result
 
                 # Wait before retry
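With the change above, whatever custom_transform_function returns is exactly what gets written for a URL; without a transform, the raw LLM result is written. A hedged illustration of that selection logic, with a made-up transform that is not part of the package:

# Hypothetical transform that keeps only the fields a consumer cares about.
def strip_to_summary(result: dict) -> dict:
    return {"title": result.get("title"), "summary": result.get("summary")}

custom_transform_function = strip_to_summary  # or None to save raw results

result = {"title": "Example page", "summary": "Short text", "raw_markdown": "..."}
transformed_result = custom_transform_function(result) if custom_transform_function else result

# Mirrors the diff: the transformed result wins when a transform is configured.
result_to_save = transformed_result if custom_transform_function else result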
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.6
+Version: 2.6.4
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
File without changes
File without changes