spiderforce4ai 1.6__py3-none-any.whl → 1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,26 +23,72 @@ from multiprocessing import Pool
23
23
  console = Console()
24
24
 
25
25
  def extract_metadata_headers(markdown: str) -> str:
26
- """Extract metadata and headers from markdown content."""
26
+ """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
29
29
  in_metadata = False
30
+ metadata = {
31
+ 'title': '',
32
+ 'description': '',
33
+ 'canonical_url': '',
34
+ 'language': ''
35
+ }
30
36
 
31
- for line in lines:
32
- # Check for metadata block
37
+ # First pass - collect metadata
38
+ for i, line in enumerate(lines):
39
+ # Check for metadata block boundaries
33
40
  if line.strip() == '---':
34
- in_metadata = not in_metadata
35
- extracted.append(line)
36
- continue
41
+ if not in_metadata:
42
+ in_metadata = True
43
+ continue
44
+ else:
45
+ in_metadata = False
46
+ break
37
47
 
38
- # Include metadata
48
+ # Extract metadata within the block
39
49
  if in_metadata:
40
- extracted.append(line)
41
- continue
42
-
43
- # Include headers (lines starting with #)
50
+ if ':' in line:
51
+ key, value = line.split(':', 1)
52
+ key = key.strip().lower()
53
+ value = value.strip()
54
+
55
+ if key == 'title':
56
+ metadata['title'] = value
57
+ elif key == 'description':
58
+ metadata['description'] = value
59
+ elif key == 'canonical_url':
60
+ metadata['canonical_url'] = value
61
+ elif key == 'language':
62
+ metadata['language'] = value
63
+
64
+ # Add formatted metadata section with URL first
65
+ extracted.append(f"URL: {metadata.get('url', '')}")
66
+ extracted.append(f"Title: {metadata['title']}")
67
+ extracted.append(f"Description: {metadata['description']}")
68
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
69
+ extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
70
+ extracted.append("") # Empty line after metadata
71
+
72
+ # Second pass - process headers
73
+ for line in lines:
44
74
  if line.strip().startswith('#'):
45
- extracted.append(line)
75
+ # Count the number of # symbols
76
+ level = len(line) - len(line.lstrip('#'))
77
+ text = line.lstrip('#').strip()
78
+
79
+ # Format header according to level
80
+ if level == 1:
81
+ extracted.append(f"H1: {text}")
82
+ elif level == 2:
83
+ extracted.append(f"H2: {text}")
84
+ elif level == 3:
85
+ extracted.append(f"H3: {text}")
86
+ elif level == 4:
87
+ extracted.append(f"H4: {text}")
88
+ elif level == 5:
89
+ extracted.append(f"H5: {text}")
90
+ elif level == 6:
91
+ extracted.append(f"H6: {text}")
46
92
 
47
93
  return '\n'.join(extracted)
48
94
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.6
3
+ Version: 1.8
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=GaW2zVESi1pi13pD0Dky4g0Yuj9hEj7_4eP_eyoBnWM,35425
2
+ spiderforce4ai-1.8.dist-info/METADATA,sha256=T1K4wWbagvh0ZW_vsYNAAhSAqRH7bLDOF6lr7Yy1pfg,7183
3
+ spiderforce4ai-1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.8.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=yNyBz8-HjENlAZ1NSy072Ir71T68xulTRj19Yxr1aEQ,33573
2
- spiderforce4ai-1.6.dist-info/METADATA,sha256=pelYvJYMzC8W-P2ORQNWwP2Fyc5KshnzQ6edoYEYZQU,7183
3
- spiderforce4ai-1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.6.dist-info/RECORD,,