spiderforce4ai 1.8__py3-none-any.whl → 1.9__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -22,7 +22,7 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
- def extract_metadata_headers(markdown: str) -> str:
25
+ def extract_metadata_headers(markdown: str, url: str = '') -> str:
26
26
  """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
@@ -62,10 +62,10 @@ def extract_metadata_headers(markdown: str) -> str:
62
62
  metadata['language'] = value
63
63
 
64
64
  # Add formatted metadata section with URL first
65
- extracted.append(f"URL: {metadata.get('url', '')}")
66
- extracted.append(f"Title: {metadata['title']}")
65
+ extracted.append(f"URL: {url}") # Use the actual crawled URL
66
+ extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
67
67
  extracted.append(f"Description: {metadata['description']}")
68
- extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
68
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
69
69
  extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
70
70
  extracted.append("") # Empty line after metadata
71
71
 
@@ -247,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
247
247
 
248
248
  # Handle combined markdown file
249
249
  if config.combine_to_one_markdown:
250
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
250
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
251
251
  combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
252
252
 
253
253
  with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.8
3
+ Version: 1.9
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=JL3APcu08DJWqeVBssPODQ8zqZdislI-qiOah_7xnus,35564
2
+ spiderforce4ai-1.9.dist-info/METADATA,sha256=St2DWVpNEWX22A9x7aizkUtRtTOk8tnva0izcXRNL5o,7183
3
+ spiderforce4ai-1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.9.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=GaW2zVESi1pi13pD0Dky4g0Yuj9hEj7_4eP_eyoBnWM,35425
2
- spiderforce4ai-1.8.dist-info/METADATA,sha256=T1K4wWbagvh0ZW_vsYNAAhSAqRH7bLDOF6lr7Yy1pfg,7183
3
- spiderforce4ai-1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.8.dist-info/RECORD,,