PyPI - spiderforce4ai - Versions diffs - 1.5__tar.gz → 1.7__tar.gz - Mend

spiderforce4ai 1.5.tar.gz → 1.7.tar.gz

Files changed (11) hide show

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 1.5
+Version: 1.7
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "spiderforce4ai"
-version = "1.5"
+version = "1.7"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/setup.py RENAMED Viewed

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
     name="spiderforce4ai",
-    version="1.5",
+    version="1.7",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai/__init__.py RENAMED Viewed

@@ -22,6 +22,59 @@ from multiprocessing import Pool
 console = Console()
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content with SEO formatting.
+
+    Lines of the form ``title:``, ``description:``, ``canonical_url:`` and
+    ``language:`` fill a fixed four-line metadata section; they are matched
+    anywhere in the document and the last occurrence of each key wins.
+    After one empty line, every heading line of level 1-6 is listed as
+    ``H<level>: <text>`` in document order.
+
+    Args:
+        markdown: Raw markdown text of a single page.
+
+    Returns:
+        The metadata section, an empty line and the heading outline, joined
+        with newlines. Metadata fields that were not found are emitted with
+        an empty value.
+    """
+    metadata = {
+        'title': '',
+        'description': '',
+        'canonical_url': '',
+        'language': ''
+    }
+    headers = []
+    for line in markdown.split('\n'):
+        stripped = line.strip()
+        if stripped.startswith('#'):
+            # Measure the run of '#' on the stripped line: measuring it on the
+            # raw line made an indented heading count as level 0 and vanish.
+            text = stripped.lstrip('#')
+            level = len(stripped) - len(text)
+            # Only six heading levels are emitted; longer '#' runs are ignored.
+            if level <= 6:
+                headers.append(f"H{level}: {text.strip()}")
+            continue
+        # A metadata line is "<key>:<value>" with the key directly before the
+        # first colon; everything after that colon is the value.
+        key, sep, value = stripped.partition(':')
+        if sep and key in metadata:
+            metadata[key] = value.strip()
+    extracted = [
+        f"Title: {metadata['title']}",
+        f"Description: {metadata['description']}",
+        f"CanonicalUrl: {metadata['canonical_url']}",
+        f"Language: {metadata['language']}",
+        "",  # Empty line after metadata
+    ]
+    extracted.extend(headers)
+    return '\n'.join(extracted)
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
@@ -61,6 +114,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
-        # Only setup report file if save_reports is True
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
         # Save markdown if output directory is configured
         if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
         result = CrawlResult(
             url=url,
@@ -209,12 +283,21 @@ class SpiderForce4AI:
             await self.session.close()
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-        return filepath
+        """Save markdown content to file and/or append to combined file.
+
+        An individual ``<slug>.md`` file is written under ``output_dir`` when
+        ``config.combine_to_one_markdown`` is unset or equals ``'full'``.
+        When ``config.combine_to_one_markdown`` is set, the page is also
+        appended to ``config.combined_markdown_file`` between page markers:
+        the whole markdown in ``'full'`` mode, otherwise the output of
+        ``extract_metadata_headers``.
+
+        Returns:
+            Path of the individual file, or ``None`` when no individual file
+            was written. The previous implementation returned the path, so it
+            is kept for callers that rely on it.
+        """
+        mode = self.config.combine_to_one_markdown
+        filepath = None
+        # Save individual file if not combining or if combining in full mode
+        if not mode or mode == 'full':
+            filepath = output_dir / f"{slugify(url)}.md"
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+        # Handle combined markdown file
+        if mode:
+            content = markdown if mode == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
+        return filepath

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 1.5
+Version: 1.7
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/README.md RENAMED Viewed

File without changes

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/setup.cfg RENAMED Viewed

File without changes

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai.egg-info/requires.txt RENAMED Viewed

File without changes

{spiderforce4ai-1.5 → spiderforce4ai-1.7}/spiderforce4ai.egg-info/top_level.txt RENAMED Viewed

File without changes

spiderforce4ai 1.5__tar.gz → 1.7__tar.gz

spiderforce4ai 1.5.tar.gz → 1.7.tar.gz