spiderforce4ai 1.5__tar.gz → 1.7__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.5
3
+ Version: 1.7
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "1.5"
7
+ version = "1.7"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name="spiderforce4ai",
6
- version="1.5",
6
+ version="1.7",
7
7
  author="Piotr Tamulewicz",
8
8
  author_email="pt@petertam.pro",
9
9
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -22,6 +22,59 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
def extract_metadata_headers(markdown: str) -> str:
    """Extract metadata and headers from markdown content with SEO formatting.

    The summary starts with four metadata lines (``Title``, ``Description``,
    ``CanonicalUrl``, ``Language``) and a blank line, followed by one
    ``H<level>: <text>`` line for every ``#``-prefixed header of level 1-6,
    in document order.

    Args:
        markdown: Markdown text. Lines of the form ``title: ...``,
            ``description: ...``, ``canonical_url: ...`` and ``language: ...``
            (leading/trailing whitespace ignored) supply the metadata values;
            when a key occurs more than once, the last occurrence wins.

    Returns:
        The newline-joined summary. Metadata keys that never occur are
        emitted with an empty value.
    """
    metadata = {
        'title': '',
        'description': '',
        'canonical_url': '',
        'language': '',
    }
    headers = []

    for raw_line in markdown.split('\n'):
        line = raw_line.strip()

        if line.startswith('#'):
            # Count the '#' run on the *stripped* line. Counting on the raw
            # line yields level 0 for an indented header (its first character
            # is a space), which silently dropped such headers.
            text = line.lstrip('#')
            level = len(line) - len(text)
            # Seven or more '#' is not a header level; skip it.
            if 1 <= level <= 6:
                headers.append(f"H{level}: {text.strip()}")
            continue

        # Split on the first ':' only, so values containing ':' (e.g. URLs)
        # are kept whole. The key must match exactly, as with the
        # ``startswith('title:')`` style checks.
        key, sep, value = line.partition(':')
        if sep and key in metadata:
            metadata[key] = value.strip()

    extracted = [
        f"Title: {metadata['title']}",
        f"Description: {metadata['description']}",
        f"CanonicalUrl: {metadata['canonical_url']}",
        f"Language: {metadata['language']}",
        "",  # Empty line after metadata
    ]
    extracted.extend(headers)
    return '\n'.join(extracted)
77
+
25
78
  def slugify(url: str) -> str:
26
79
  """Convert URL to a valid filename."""
27
80
  parsed = urlparse(url)
@@ -61,6 +114,8 @@ class CrawlConfig:
61
114
  webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
62
115
  save_reports: bool = False # Whether to save crawl reports
63
116
  report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
117
+ combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
118
+ combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
64
119
 
65
120
  def __post_init__(self):
66
121
  # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
72
127
  self.output_dir = Path(self.output_dir)
73
128
  self.output_dir.mkdir(parents=True, exist_ok=True)
74
129
 
75
- # Only setup report file if save_reports is True
130
+ # Setup report file if save_reports is True
76
131
  if self.save_reports:
77
132
  if self.report_file is None:
78
133
  self.report_file = self.output_dir / "crawl_report.json"
79
134
  else:
80
135
  self.report_file = Path(self.report_file)
136
+
137
+ # Setup combined markdown file if needed
138
+ if self.combine_to_one_markdown:
139
+ if self.combined_markdown_file is None:
140
+ self.combined_markdown_file = self.output_dir / "combined_content.md"
141
+ else:
142
+ self.combined_markdown_file = Path(self.combined_markdown_file)
143
+ # Create or clear the combined file
144
+ self.combined_markdown_file.write_text('')
81
145
 
82
146
  def to_dict(self) -> Dict:
83
147
  """Convert config to dictionary for API requests."""
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
158
222
 
159
223
  # Save markdown if output directory is configured
160
224
  if config.output_dir:
161
- filepath = config.output_dir / f"{slugify(url)}.md"
162
- with open(filepath, 'w', encoding='utf-8') as f:
163
- f.write(markdown)
225
+ # Save individual file if not combining or if combining in full mode
226
+ if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
227
+ filepath = config.output_dir / f"{slugify(url)}.md"
228
+ with open(filepath, 'w', encoding='utf-8') as f:
229
+ f.write(markdown)
230
+
231
+ # Handle combined markdown file
232
+ if config.combine_to_one_markdown:
233
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
234
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
235
+
236
+ with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
237
+ f.write(combined_content)
164
238
 
165
239
  result = CrawlResult(
166
240
  url=url,
@@ -209,12 +283,21 @@ class SpiderForce4AI:
209
283
  await self.session.close()
210
284
 
211
285
  async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
212
- """Save markdown content to file."""
213
- filename = f"{slugify(url)}.md"
214
- filepath = output_dir / filename
215
- async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
216
- await f.write(markdown)
217
- return filepath
286
+ """Save markdown content to file and/or append to combined file."""
287
+ # Save individual file if not combining or if combining in full mode
288
+ if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
289
+ filename = f"{slugify(url)}.md"
290
+ filepath = output_dir / filename
291
+ async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
292
+ await f.write(markdown)
293
+
294
+ # Handle combined markdown file
295
+ if self.config.combine_to_one_markdown:
296
+ content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
297
+ combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
298
+
299
+ async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
300
+ await f.write(combined_content)
218
301
 
219
302
 
220
303
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.5
3
+ Version: 1.7
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes