PyPI - spiderforce4ai - Versions diffs - 1.7__py3-none-any.whl → 1.9__py3-none-any.whl - Mend

spiderforce4ai 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

spiderforce4ai/__init__.py CHANGED Viewed

@@ -22,7 +22,7 @@ from multiprocessing import Pool
 console = Console()
-def extract_metadata_headers(markdown: str) -> str:
+def extract_metadata_headers(markdown: str, url: str = '') -> str:
     """Extract metadata and headers from markdown content with SEO formatting."""
     lines = markdown.split('\n')
     extracted = []
@@ -35,21 +35,38 @@ def extract_metadata_headers(markdown: str) -> str:
     }
     # First pass - collect metadata
-    for line in lines:
-        if line.strip().startswith('title:'):
-            metadata['title'] = line.split(':', 1)[1].strip()
-        elif line.strip().startswith('description:'):
-            metadata['description'] = line.split(':', 1)[1].strip()
-        elif line.strip().startswith('canonical_url:'):
-            metadata['canonical_url'] = line.split(':', 1)[1].strip()
-        elif line.strip().startswith('language:'):
-            metadata['language'] = line.split(':', 1)[1].strip()
+    for i, line in enumerate(lines):
+        # Check for metadata block boundaries
+        if line.strip() == '---':
+            if not in_metadata:
+                in_metadata = True
+                continue
+            else:
+                in_metadata = False
+                break
+        # Extract metadata within the block
+        if in_metadata:
+            if ':' in line:
+                key, value = line.split(':', 1)
+                key = key.strip().lower()
+                value = value.strip()
+                if key == 'title':
+                    metadata['title'] = value
+                elif key == 'description':
+                    metadata['description'] = value
+                elif key == 'canonical_url':
+                    metadata['canonical_url'] = value
+                elif key == 'language':
+                    metadata['language'] = value
-    # Add formatted metadata section
-    extracted.append(f"Title: {metadata['title']}")
+    # Add formatted metadata section with URL first
+    extracted.append(f"URL: {url}")  # Use the actual crawled URL
+    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")  # Fallback to URL segment
     extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
-    extracted.append(f"Language: {metadata['language']}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")  # Fallback to crawled URL
+    extracted.append(f"Language: {metadata['language'] or 'en'}")  # Default to 'en' if not specified
     extracted.append("")  # Empty line after metadata
     # Second pass - process headers
@@ -230,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             # Handle combined markdown file
             if config.combine_to_one_markdown:
-                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
                 combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
                 with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:

{spiderforce4ai-1.7.dist-info → spiderforce4ai-1.9.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 1.7
+Version: 1.9
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz

spiderforce4ai-1.9.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=JL3APcu08DJWqeVBssPODQ8zqZdislI-qiOah_7xnus,35564
+spiderforce4ai-1.9.dist-info/METADATA,sha256=St2DWVpNEWX22A9x7aizkUtRtTOk8tnva0izcXRNL5o,7183
+spiderforce4ai-1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.9.dist-info/RECORD,,

spiderforce4ai-1.7.dist-info/RECORD DELETED Viewed

@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
-spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
-spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.7.dist-info/RECORD,,

{spiderforce4ai-1.7.dist-info → spiderforce4ai-1.9.dist-info}/WHEEL RENAMED Viewed

File without changes

{spiderforce4ai-1.7.dist-info → spiderforce4ai-1.9.dist-info}/top_level.txt RENAMED Viewed

File without changes

spiderforce4ai 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

spiderforce4ai 1.7__py3-none-any.whl → 1.9__py3-none-any.whl