spiderforce4ai-1.5-py3-none-any.whl → spiderforce4ai-1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +93 -10
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/METADATA +1 -1
- spiderforce4ai-1.7.dist-info/RECORD +5 -0
- spiderforce4ai-1.5.dist-info/RECORD +0 -5
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -22,6 +22,59 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content with SEO formatting."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+    metadata = {
+        'title': '',
+        'description': '',
+        'canonical_url': '',
+        'language': ''
+    }
+
+    # First pass - collect metadata
+    for line in lines:
+        if line.strip().startswith('title:'):
+            metadata['title'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('description:'):
+            metadata['description'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('canonical_url:'):
+            metadata['canonical_url'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('language:'):
+            metadata['language'] = line.split(':', 1)[1].strip()
+
+    # Add formatted metadata section
+    extracted.append(f"Title: {metadata['title']}")
+    extracted.append(f"Description: {metadata['description']}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
+    extracted.append(f"Language: {metadata['language']}")
+    extracted.append("")  # Empty line after metadata
+
+    # Second pass - process headers
+    for line in lines:
+        if line.strip().startswith('#'):
+            # Count the number of # symbols
+            level = len(line) - len(line.lstrip('#'))
+            text = line.lstrip('#').strip()
+
+            # Format header according to level
+            if level == 1:
+                extracted.append(f"H1: {text}")
+            elif level == 2:
+                extracted.append(f"H2: {text}")
+            elif level == 3:
+                extracted.append(f"H3: {text}")
+            elif level == 4:
+                extracted.append(f"H4: {text}")
+            elif level == 5:
+                extracted.append(f"H5: {text}")
+            elif level == 6:
+                extracted.append(f"H6: {text}")
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
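For context on the new helper: given markdown whose leading lines carry title: / description: / canonical_url: / language: fields, extract_metadata_headers emits a flat metadata block followed by one H1:–H6: line per heading. A minimal sketch of calling it (the sample page below is hypothetical, not taken from the package):

# Illustrative input/output for extract_metadata_headers (sample data is hypothetical)
sample = (
    "title: Example Page\n"
    "description: A demo page\n"
    "canonical_url: https://example.com/page\n"
    "language: en\n"
    "\n"
    "# Welcome\n"
    "## Features\n"
)
print(extract_metadata_headers(sample))
# Title: Example Page
# Description: A demo page
# CanonicalUrl: https://example.com/page
# Language: en
#
# H1: Welcome
# H2: Features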
@@ -61,6 +114,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
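A hedged usage sketch for the two new CrawlConfig fields. Only fields visible in this diff are shown; CrawlConfig accepts more parameters (webhook settings, report options, etc.), so treat the constructor call as illustrative rather than complete:

from pathlib import Path
from spiderforce4ai import CrawlConfig  # import path assumed from the package name

config = CrawlConfig(
    output_dir=Path("crawl_output"),             # field visible in the diff above
    combine_to_one_markdown="metadata_headers",  # 'full' also keeps per-page .md files
    combined_markdown_file=Path("crawl_output/site_summary.md"),  # defaults to output_dir / "combined_content.md"
)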
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
 
         result = CrawlResult(
             url=url,
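Both this parallel path and the async _save_markdown below append page blocks in the same delimiter format, so a combined file ends up looking like this (illustrative content):

----PAGE----
https://example.com/page

Title: Example Page
Description: A demo page
CanonicalUrl: https://example.com/page
Language: en

H1: Welcome
H2: Features
----PAGE END----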
@@ -209,12 +283,21 @@ class SpiderForce4AI:
         await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
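Because pages are wrapped in literal ----PAGE---- / ----PAGE END---- markers, a downstream consumer can split the combined file back into (url, content) pairs. A minimal sketch, assuming the default file name and that page content never contains the end marker:

import re
from pathlib import Path

# Hypothetical reader for the combined file produced above
text = Path("crawl_output/combined_content.md").read_text(encoding="utf-8")
pattern = re.compile(
    r"----PAGE----\n(?P<url>[^\n]+)\n\n(?P<content>.*?)\n----PAGE END----",
    re.DOTALL,
)
for m in pattern.finditer(text):
    print(m.group("url"), len(m.group("content")))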
spiderforce4ai-1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
+spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
+spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.7.dist-info/RECORD,,
spiderforce4ai-1.5.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=lUCuNzYKjsYRLgP1ULSfAXnDSoLn91vYf71zOZFGmPg,30936
-spiderforce4ai-1.5.dist-info/METADATA,sha256=uHgxa-sPwP805d0jM3-LrDOLayfKqxtpxqvZMsFnizo,7183
-spiderforce4ai-1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.5.dist-info/RECORD,,
{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL
File without changes
{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt
File without changes