spiderforce4ai-1.5-py3-none-any.whl → spiderforce4ai-1.7-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
spiderforce4ai/__init__.py

@@ -22,6 +22,59 @@ from multiprocessing import Pool
 
  console = Console()
 
+ def extract_metadata_headers(markdown: str) -> str:
+     """Extract metadata and headers from markdown content with SEO formatting."""
+     lines = markdown.split('\n')
+     extracted = []
+     in_metadata = False
+     metadata = {
+         'title': '',
+         'description': '',
+         'canonical_url': '',
+         'language': ''
+     }
+ 
+     # First pass - collect metadata
+     for line in lines:
+         if line.strip().startswith('title:'):
+             metadata['title'] = line.split(':', 1)[1].strip()
+         elif line.strip().startswith('description:'):
+             metadata['description'] = line.split(':', 1)[1].strip()
+         elif line.strip().startswith('canonical_url:'):
+             metadata['canonical_url'] = line.split(':', 1)[1].strip()
+         elif line.strip().startswith('language:'):
+             metadata['language'] = line.split(':', 1)[1].strip()
+ 
+     # Add formatted metadata section
+     extracted.append(f"Title: {metadata['title']}")
+     extracted.append(f"Description: {metadata['description']}")
+     extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
+     extracted.append(f"Language: {metadata['language']}")
+     extracted.append("") # Empty line after metadata
+ 
+     # Second pass - process headers
+     for line in lines:
+         if line.strip().startswith('#'):
+             # Count the number of # symbols
+             level = len(line) - len(line.lstrip('#'))
+             text = line.lstrip('#').strip()
+ 
+             # Format header according to level
+             if level == 1:
+                 extracted.append(f"H1: {text}")
+             elif level == 2:
+                 extracted.append(f"H2: {text}")
+             elif level == 3:
+                 extracted.append(f"H3: {text}")
+             elif level == 4:
+                 extracted.append(f"H4: {text}")
+             elif level == 5:
+                 extracted.append(f"H5: {text}")
+             elif level == 6:
+                 extracted.append(f"H6: {text}")
+ 
+     return '\n'.join(extracted)
+ 
  def slugify(url: str) -> str:
      """Convert URL to a valid filename."""
      parsed = urlparse(url)
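Note: a minimal sketch of what the new extract_metadata_headers helper produces. The sample input below is hypothetical; its frontmatter-style keys (title:, description:, canonical_url:, language:) are the ones the function scans for.

sample = """title: Pricing
description: Plans and pricing overview
canonical_url: https://example.com/pricing
language: en

# Pricing
## Plans
### Enterprise
"""

print(extract_metadata_headers(sample))
# Title: Pricing
# Description: Plans and pricing overview
# CanonicalUrl: https://example.com/pricing
# Language: en
#
# H1: Pricing
# H2: Plans
# H3: Enterprise

Only the scanned metadata keys and the markdown headers survive this mode; all body text is dropped.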
@@ -61,6 +114,8 @@ class CrawlConfig:
      webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
      save_reports: bool = False # Whether to save crawl reports
      report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
+     combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
+     combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
 
      def __post_init__(self):
          # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
          self.output_dir = Path(self.output_dir)
          self.output_dir.mkdir(parents=True, exist_ok=True)
 
-         # Only setup report file if save_reports is True
+         # Setup report file if save_reports is True
          if self.save_reports:
              if self.report_file is None:
                  self.report_file = self.output_dir / "crawl_report.json"
              else:
                  self.report_file = Path(self.report_file)
+ 
+         # Setup combined markdown file if needed
+         if self.combine_to_one_markdown:
+             if self.combined_markdown_file is None:
+                 self.combined_markdown_file = self.output_dir / "combined_content.md"
+             else:
+                 self.combined_markdown_file = Path(self.combined_markdown_file)
+             # Create or clear the combined file
+             self.combined_markdown_file.write_text('')
 
      def to_dict(self) -> Dict:
          """Convert config to dictionary for API requests."""
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
      # Save markdown if output directory is configured
      if config.output_dir:
-         filepath = config.output_dir / f"{slugify(url)}.md"
-         with open(filepath, 'w', encoding='utf-8') as f:
-             f.write(markdown)
+         # Save individual file if not combining or if combining in full mode
+         if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+             filepath = config.output_dir / f"{slugify(url)}.md"
+             with open(filepath, 'w', encoding='utf-8') as f:
+                 f.write(markdown)
+ 
+         # Handle combined markdown file
+         if config.combine_to_one_markdown:
+             content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+ 
+             with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                 f.write(combined_content)
 
      result = CrawlResult(
          url=url,
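Note: this hunk (and its async counterpart in the next hunk) appends each page to the combined file between ----PAGE---- and ----PAGE END---- markers, with the URL and content separated by a blank line. A rough sketch of how a consumer could split that file back into per-page records follows; read_combined is a hypothetical helper, not part of the package, and it assumes page content never contains the literal marker strings.

from pathlib import Path

def read_combined(path: str) -> list[tuple[str, str]]:
    """Split a combined_content.md file into (url, content) pairs."""
    pages = []
    for block in Path(path).read_text(encoding='utf-8').split('----PAGE----'):
        block = block.split('----PAGE END----')[0].strip()
        if not block:
            continue
        # Each block starts with the URL, then a blank line, then the content
        url, _, content = block.partition('\n\n')
        pages.append((url.strip(), content.strip()))
    return pages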
@@ -209,12 +283,21 @@ class SpiderForce4AI:
          await self.session.close()
 
      async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-         """Save markdown content to file."""
-         filename = f"{slugify(url)}.md"
-         filepath = output_dir / filename
-         async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-             await f.write(markdown)
-         return filepath
+         """Save markdown content to file and/or append to combined file."""
+         # Save individual file if not combining or if combining in full mode
+         if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+             filename = f"{slugify(url)}.md"
+             filepath = output_dir / filename
+             async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                 await f.write(markdown)
+ 
+         # Handle combined markdown file
+         if self.config.combine_to_one_markdown:
+             content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+ 
+             async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                 await f.write(combined_content)
 
 
 
spiderforce4ai-1.5.dist-info/METADATA → spiderforce4ai-1.7.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: spiderforce4ai
- Version: 1.5
+ Version: 1.7
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
  Home-page: https://petertam.pro
  Author: Piotr Tamulewicz
spiderforce4ai-1.5.dist-info/RECORD → spiderforce4ai-1.7.dist-info/RECORD

@@ -0,0 +1,5 @@
+ spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
+ spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
+ spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-1.7.dist-info/RECORD,,
@@ -1,5 +0,0 @@
- spiderforce4ai/__init__.py,sha256=lUCuNzYKjsYRLgP1ULSfAXnDSoLn91vYf71zOZFGmPg,30936
- spiderforce4ai-1.5.dist-info/METADATA,sha256=uHgxa-sPwP805d0jM3-LrDOLayfKqxtpxqvZMsFnizo,7183
- spiderforce4ai-1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-1.5.dist-info/RECORD,,