spiderforce4ai 1.8__py3-none-any.whl → 1.9__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -22,7 +22,7 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
- def extract_metadata_headers(markdown: str) -> str:
25
+ def extract_metadata_headers(markdown: str, url: str = '') -> str:
26
26
  """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
@@ -62,10 +62,10 @@ def extract_metadata_headers(markdown: str) -> str:
62
62
  metadata['language'] = value
63
63
 
64
64
  # Add formatted metadata section with URL first
65
- extracted.append(f"URL: {metadata.get('url', '')}")
66
- extracted.append(f"Title: {metadata['title']}")
65
+ extracted.append(f"URL: {url}") # Use the actual crawled URL
66
+ extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
67
67
  extracted.append(f"Description: {metadata['description']}")
68
- extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
68
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
69
69
  extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
70
70
  extracted.append("") # Empty line after metadata
71
71
 
@@ -247,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
247
247
 
248
248
  # Handle combined markdown file
249
249
  if config.combine_to_one_markdown:
250
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
250
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
251
251
  combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
252
252
 
253
253
  with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.8
3
+ Version: 1.9
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=JL3APcu08DJWqeVBssPODQ8zqZdislI-qiOah_7xnus,35564
2
+ spiderforce4ai-1.9.dist-info/METADATA,sha256=St2DWVpNEWX22A9x7aizkUtRtTOk8tnva0izcXRNL5o,7183
3
+ spiderforce4ai-1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.9.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=GaW2zVESi1pi13pD0Dky4g0Yuj9hEj7_4eP_eyoBnWM,35425
2
- spiderforce4ai-1.8.dist-info/METADATA,sha256=T1K4wWbagvh0ZW_vsYNAAhSAqRH7bLDOF6lr7Yy1pfg,7183
3
- spiderforce4ai-1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.8.dist-info/RECORD,,