spiderforce4ai 1.4.tar.gz → 1.6.tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 1.4
+Version: 1.6
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "1.4"
+version = "1.6"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="spiderforce4ai",
-    version="1.4",
+    version="1.6",
    author="Piotr Tamulewicz",
    author_email="pt@petertam.pro",
    description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -22,6 +22,30 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+
+    for line in lines:
+        # Check for metadata block
+        if line.strip() == '---':
+            in_metadata = not in_metadata
+            extracted.append(line)
+            continue
+
+        # Include metadata
+        if in_metadata:
+            extracted.append(line)
+            continue
+
+        # Include headers (lines starting with #)
+        if line.strip().startswith('#'):
+            extracted.append(line)
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
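
For illustration, a minimal sketch of what the new extract_metadata_headers helper keeps and drops; the import path is an assumption about how the package exports the function.

# Hypothetical import path; the function is defined in the module diffed above.
from spiderforce4ai import extract_metadata_headers

sample = """---
title: Example Page
---
# Heading

Body text that the helper drops.

## Subheading
"""

print(extract_metadata_headers(sample))
# Prints only the front matter and the headers:
# ---
# title: Example Page
# ---
# # Heading
# ## Subheading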
@@ -61,6 +85,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -72,12 +98,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        # Only setup report file if save_reports is True
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
@@ -158,9 +193,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
        # Save markdown if output directory is configured
        if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
 
        result = CrawlResult(
            url=url,
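
Because every page is wrapped in the ----PAGE---- / ----PAGE END---- delimiters shown above, the combined file can be split back into per-URL chunks. A standalone sketch, not part of the package:

import re
from pathlib import Path

def read_combined(path: Path) -> list[tuple[str, str]]:
    """Split a combined markdown file back into (url, content) pairs."""
    text = path.read_text(encoding='utf-8')
    # Each block is "\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n".
    return [
        (url.strip(), content)
        for url, content in re.findall(
            r"----PAGE----\n(.*?)\n\n(.*?)\n----PAGE END----",
            text,
            flags=re.DOTALL,
        )
    ]

for url, content in read_combined(Path("output/combined_content.md")):
    print(url, len(content))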
@@ -209,12 +254,21 @@ class SpiderForce4AI:
         await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-        return filepath
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
 
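The async path writes the same delimiter format, appending with aiofiles in 'a' mode. A standalone sketch of that pattern (the function name here is illustrative, not package API); building the whole block as one string keeps each page to a single write call:

import asyncio
import aiofiles

async def append_page(path: str, url: str, content: str) -> None:
    # Same block format as _save_markdown above.
    block = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
    async with aiofiles.open(path, 'a', encoding='utf-8') as f:
        await f.write(block)

asyncio.run(append_page("combined_content.md", "https://example.com", "# Title"))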
@@ -296,17 +350,25 @@ class SpiderForce4AI:
 
            results.append(result)
 
-        # Save report if enabled
-        if config.save_reports:
-            self._save_report_sync(results, config)
-            print(f"\nReport saved to: {config.report_file}")
-
-        # Print summary
+        # Calculate statistics
        successful = len([r for r in results if r.status == "success"])
        failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
        print(f"\nParallel processing completed:")
        print(f"✓ Successful: {successful}")
        print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports and config.report_file:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,  # No retries in server parallel mode
+                "retry_failed": failed
+            }
+            self._save_report_sync(results, config)
+            console.print(f"📊 Report saved to: {config.report_file}")
 
        return results
 
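A hedged end-to-end sketch of the server-parallel path whose summary and report logic appears above; the client constructor argument and method name are assumptions, and only the CrawlResult fields url and status are taken from this diff.

from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # service URL is illustrative
config = CrawlConfig(
    output_dir=Path("output"),
    save_reports=True,  # enables the report block above
)

results = spider.crawl_urls_server_parallel(  # method name assumed
    ["https://example.com/a", "https://example.com/b"],
    config,
)
failed_urls = [r.url for r in results if r.status == "failed"]
print(f"{len(failed_urls)} of {len(results)} URLs failed")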
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 1.4
+Version: 1.6
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz