spiderforce4ai 2.0__py3-none-any.whl → 2.1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -23,7 +23,7 @@ from multiprocessing import Pool
23
23
  console = Console()
24
24
 
25
25
  def extract_metadata_headers(markdown: str, url: str = '') -> str:
26
- """Extract metadata and headers from markdown content with SEO formatting."""
26
+ """Extract metadata and headers from markdown content with enhanced SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
29
29
  in_metadata = False
@@ -33,8 +33,9 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
33
33
  'canonical_url': '',
34
34
  'language': ''
35
35
  }
36
+ first_paragraph = ''
36
37
 
37
- # First pass - collect metadata
38
+ # First pass - collect metadata and first paragraph
38
39
  for i, line in enumerate(lines):
39
40
  # Check for metadata block boundaries
40
41
  if line.strip() == '---':
@@ -61,41 +62,34 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
61
62
 
62
63
  if key == 'title':
63
64
  metadata['title'] = value
64
- elif key == 'description' or key == 'meta_description':
65
+ elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
65
66
  metadata['description'] = value
66
- elif key == 'canonical_url' or key == 'canonical':
67
+ elif key in ['canonical_url', 'canonical']:
67
68
  metadata['canonical_url'] = value
68
- elif key == 'language' or key == 'lang':
69
+ elif key in ['language', 'lang']:
69
70
  metadata['language'] = value
71
+ elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
72
+ first_paragraph = line.strip()
70
73
 
71
- # Add formatted metadata section with URL first
72
- extracted.append(f"URL: {url}") # Use the actual crawled URL
73
- extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
74
+ # Use first paragraph as fallback description if none found
75
+ if not metadata['description'] and first_paragraph:
76
+ metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
77
+
78
+ # Add formatted metadata section
79
+ extracted.append(f"URL: {url}")
80
+ extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
74
81
  extracted.append(f"Description: {metadata['description']}")
75
- extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
76
- extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
82
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
83
+ extracted.append(f"Language: {metadata['language'] or 'en'}")
77
84
  extracted.append("") # Empty line after metadata
78
85
 
79
86
  # Second pass - process headers
80
87
  for line in lines:
81
88
  if line.strip().startswith('#'):
82
- # Count the number of # symbols
83
89
  level = len(line) - len(line.lstrip('#'))
84
90
  text = line.lstrip('#').strip()
85
-
86
- # Format header according to level
87
- if level == 1:
88
- extracted.append(f"H1: {text}")
89
- elif level == 2:
90
- extracted.append(f"H2: {text}")
91
- elif level == 3:
92
- extracted.append(f"H3: {text}")
93
- elif level == 4:
94
- extracted.append(f"H4: {text}")
95
- elif level == 5:
96
- extracted.append(f"H5: {text}")
97
- elif level == 6:
98
- extracted.append(f"H6: {text}")
91
+ if 1 <= level <= 6:
92
+ extracted.append(f"H{level}: {text}")
99
93
 
100
94
  return '\n'.join(extracted)
101
95
 
@@ -317,7 +311,7 @@ class SpiderForce4AI:
317
311
 
318
312
  # Handle combined markdown file
319
313
  if self.config.combine_to_one_markdown:
320
- content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
314
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
321
315
  combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
322
316
 
323
317
  async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 2.0
3
+ Version: 2.1
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=6WskofS5vOJuPhdwoCRvcOmWVimCKJxtkkP_pshrrlo,35805
2
+ spiderforce4ai-2.1.dist-info/METADATA,sha256=bK_85RBFEAmDTZgo2oCPKgDNd-dqfYvRJoBl92Zk-i8,7183
3
+ spiderforce4ai-2.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-2.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-2.1.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=CiZBCoRGCfu8587NbW_rtU6kFZEC0R7i_lZwJLesH3M,35975
2
- spiderforce4ai-2.0.dist-info/METADATA,sha256=nOYUQWRl46UwW3HybozwqcFdc2JPotPLXsEBiL4GuqI,7183
3
- spiderforce4ai-2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-2.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-2.0.dist-info/RECORD,,