spiderforce4ai 1.6__py3-none-any.whl → 1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,26 +23,55 @@ from multiprocessing import Pool
23
23
  console = Console()
24
24
 
25
25
  def extract_metadata_headers(markdown: str) -> str:
26
- """Extract metadata and headers from markdown content."""
26
+ """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
29
29
  in_metadata = False
30
+ metadata = {
31
+ 'title': '',
32
+ 'description': '',
33
+ 'canonical_url': '',
34
+ 'language': ''
35
+ }
30
36
 
37
+ # First pass - collect metadata
38
+ for line in lines:
39
+ if line.strip().startswith('title:'):
40
+ metadata['title'] = line.split(':', 1)[1].strip()
41
+ elif line.strip().startswith('description:'):
42
+ metadata['description'] = line.split(':', 1)[1].strip()
43
+ elif line.strip().startswith('canonical_url:'):
44
+ metadata['canonical_url'] = line.split(':', 1)[1].strip()
45
+ elif line.strip().startswith('language:'):
46
+ metadata['language'] = line.split(':', 1)[1].strip()
47
+
48
+ # Add formatted metadata section
49
+ extracted.append(f"Title: {metadata['title']}")
50
+ extracted.append(f"Description: {metadata['description']}")
51
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
52
+ extracted.append(f"Language: {metadata['language']}")
53
+ extracted.append("") # Empty line after metadata
54
+
55
+ # Second pass - process headers
31
56
  for line in lines:
32
- # Check for metadata block
33
- if line.strip() == '---':
34
- in_metadata = not in_metadata
35
- extracted.append(line)
36
- continue
37
-
38
- # Include metadata
39
- if in_metadata:
40
- extracted.append(line)
41
- continue
42
-
43
- # Include headers (lines starting with #)
44
57
  if line.strip().startswith('#'):
45
- extracted.append(line)
58
+ # Count the number of # symbols
59
+ level = len(line) - len(line.lstrip('#'))
60
+ text = line.lstrip('#').strip()
61
+
62
+ # Format header according to level
63
+ if level == 1:
64
+ extracted.append(f"H1: {text}")
65
+ elif level == 2:
66
+ extracted.append(f"H2: {text}")
67
+ elif level == 3:
68
+ extracted.append(f"H3: {text}")
69
+ elif level == 4:
70
+ extracted.append(f"H4: {text}")
71
+ elif level == 5:
72
+ extracted.append(f"H5: {text}")
73
+ elif level == 6:
74
+ extracted.append(f"H6: {text}")
46
75
 
47
76
  return '\n'.join(extracted)
48
77
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.6
3
+ Version: 1.7
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
2
+ spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
3
+ spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.7.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=yNyBz8-HjENlAZ1NSy072Ir71T68xulTRj19Yxr1aEQ,33573
2
- spiderforce4ai-1.6.dist-info/METADATA,sha256=pelYvJYMzC8W-P2ORQNWwP2Fyc5KshnzQ6edoYEYZQU,7183
3
- spiderforce4ai-1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.6.dist-info/RECORD,,