spiderforce4ai 1.4__py3-none-any.whl → 1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,6 +22,30 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
def extract_metadata_headers(markdown: str) -> str:
    """Extract the leading metadata block and the heading lines from markdown.

    The result keeps, in their original order:

    * the front-matter block delimited by ``---`` lines, but only when the
      opening ``---`` is the first non-blank line of the document, and
    * every line whose stripped text starts with ``#``, except lines that sit
      inside a fenced code block (``` or ~~~).

    Args:
        markdown: Markdown text of a single page.

    Returns:
        The selected lines joined with ``'\\n'``; an empty string when nothing
        matches.
    """
    extracted = []
    in_metadata = False
    seen_content = False  # True once any non-blank line has been read
    fence = None  # (fence_char, run_length) while inside a fenced code block

    for line in markdown.split('\n'):
        stripped = line.strip()

        # Inside the front matter: keep every line up to and including the
        # closing delimiter. An unterminated block runs to the end of input.
        if in_metadata:
            extracted.append(line)
            if stripped == '---':
                in_metadata = False
            continue

        # A '---' only opens a metadata block at the very top of the
        # document; anywhere else it is a horizontal rule and must not
        # switch metadata mode on for the text that follows it.
        if stripped == '---' and not seen_content:
            in_metadata = True
            seen_content = True
            extracted.append(line)
            continue

        if stripped:
            seen_content = True

        # Skip fenced code so that '# comment' lines in code samples are not
        # mistaken for headings.
        if fence is not None:
            fence_char, fence_len = fence
            only_fence_chars = bool(stripped) and not stripped.strip(fence_char)
            if only_fence_chars and len(stripped) >= fence_len:
                fence = None
            continue

        if stripped.startswith(('```', '~~~')):
            fence_char = stripped[0]
            fence_len = len(stripped) - len(stripped.lstrip(fence_char))
            fence = (fence_char, fence_len)
            continue

        # Include headers (lines starting with #)
        if stripped.startswith('#'):
            extracted.append(line)

    return '\n'.join(extracted)
48
+
25
49
  def slugify(url: str) -> str:
26
50
  """Convert URL to a valid filename."""
27
51
  parsed = urlparse(url)
@@ -61,6 +85,8 @@ class CrawlConfig:
61
85
  webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
62
86
  save_reports: bool = False # Whether to save crawl reports
63
87
  report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
88
+ combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
89
+ combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
64
90
 
65
91
  def __post_init__(self):
66
92
  # Initialize empty lists/dicts for None values
@@ -72,12 +98,21 @@ class CrawlConfig:
72
98
  self.output_dir = Path(self.output_dir)
73
99
  self.output_dir.mkdir(parents=True, exist_ok=True)
74
100
 
75
- # Only setup report file if save_reports is True
101
+ # Setup report file if save_reports is True
76
102
  if self.save_reports:
77
103
  if self.report_file is None:
78
104
  self.report_file = self.output_dir / "crawl_report.json"
79
105
  else:
80
106
  self.report_file = Path(self.report_file)
107
+
108
+ # Setup combined markdown file if needed
109
+ if self.combine_to_one_markdown:
110
+ if self.combined_markdown_file is None:
111
+ self.combined_markdown_file = self.output_dir / "combined_content.md"
112
+ else:
113
+ self.combined_markdown_file = Path(self.combined_markdown_file)
114
+ # Create or clear the combined file
115
+ self.combined_markdown_file.write_text('')
81
116
 
82
117
  def to_dict(self) -> Dict:
83
118
  """Convert config to dictionary for API requests."""
@@ -158,9 +193,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
158
193
 
159
194
  # Save markdown if output directory is configured
160
195
  if config.output_dir:
161
- filepath = config.output_dir / f"{slugify(url)}.md"
162
- with open(filepath, 'w', encoding='utf-8') as f:
163
- f.write(markdown)
196
+ # Save individual file if not combining or if combining in full mode
197
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
198
+ filepath = config.output_dir / f"{slugify(url)}.md"
199
+ with open(filepath, 'w', encoding='utf-8') as f:
200
+ f.write(markdown)
201
+
202
+ # Handle combined markdown file
203
+ if config.combine_to_one_markdown:
204
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
205
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
206
+
207
+ with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
208
+ f.write(combined_content)
164
209
 
165
210
  result = CrawlResult(
166
211
  url=url,
@@ -209,12 +254,21 @@ class SpiderForce4AI:
209
254
  await self.session.close()
210
255
 
211
256
  async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
212
- """Save markdown content to file."""
213
- filename = f"{slugify(url)}.md"
214
- filepath = output_dir / filename
215
- async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
216
- await f.write(markdown)
217
- return filepath
257
+ """Save markdown content to file and/or append to combined file."""
258
+ # Save individual file if not combining or if combining in full mode
259
+ if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
260
+ filename = f"{slugify(url)}.md"
261
+ filepath = output_dir / filename
262
+ async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
263
+ await f.write(markdown)
264
+
265
+ # Handle combined markdown file
266
+ if self.config.combine_to_one_markdown:
267
+ content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
268
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
269
+
270
+ async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
271
+ await f.write(combined_content)
218
272
 
219
273
 
220
274
 
@@ -296,17 +350,25 @@ class SpiderForce4AI:
296
350
 
297
351
  results.append(result)
298
352
 
299
- # Save report if enabled
300
- if config.save_reports:
301
- self._save_report_sync(results, config)
302
- print(f"\nReport saved to: {config.report_file}")
303
-
304
- # Print summary
353
+ # Calculate statistics
305
354
  successful = len([r for r in results if r.status == "success"])
306
355
  failed = len([r for r in results if r.status == "failed"])
356
+
357
+ # Print summary
307
358
  print(f"\nParallel processing completed:")
308
359
  print(f"✓ Successful: {successful}")
309
360
  print(f"✗ Failed: {failed}")
361
+
362
+ # Save report if enabled
363
+ if config.save_reports and config.report_file:
364
+ self._retry_stats = {
365
+ "initial_failures": failed,
366
+ "failure_ratio": (failed / len(urls)) * 100,
367
+ "retry_successful": 0, # No retries in server parallel mode
368
+ "retry_failed": failed
369
+ }
370
+ self._save_report_sync(results, config)
371
+ console.print(f"📊 Report saved to: {config.report_file}")
310
372
 
311
373
  return results
312
374
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.4
3
+ Version: 1.6
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=yNyBz8-HjENlAZ1NSy072Ir71T68xulTRj19Yxr1aEQ,33573
2
+ spiderforce4ai-1.6.dist-info/METADATA,sha256=pelYvJYMzC8W-P2ORQNWwP2Fyc5KshnzQ6edoYEYZQU,7183
3
+ spiderforce4ai-1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.6.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=7EMEEfFY3WLq58-vnK1Yhcb1trF2ZXU-Ny3licz45Yk,30585
2
- spiderforce4ai-1.4.dist-info/METADATA,sha256=7GRBz_bTtXOQ2N-gHRPJFEWW8mmOB_1gwrJCf-el8LM,7183
3
- spiderforce4ai-1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.4.dist-info/RECORD,,