spiderforce4ai 1.6__tar.gz → 1.8__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.6
3
+ Version: 1.8
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "1.6"
7
+ version = "1.8"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name="spiderforce4ai",
6
- version="1.6",
6
+ version="1.8",
7
7
  author="Piotr Tamulewicz",
8
8
  author_email="pt@petertam.pro",
9
9
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -23,26 +23,72 @@ from multiprocessing import Pool
23
23
  console = Console()
24
24
 
25
25
  def extract_metadata_headers(markdown: str) -> str:
26
- """Extract metadata and headers from markdown content."""
26
+ """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
29
29
  in_metadata = False
30
+ metadata = {
31
+ 'title': '',
32
+ 'description': '',
33
+ 'canonical_url': '',
34
+ 'language': ''
35
+ }
30
36
 
31
- for line in lines:
32
- # Check for metadata block
37
+ # First pass - collect metadata
38
+ for i, line in enumerate(lines):
39
+ # Check for metadata block boundaries
33
40
  if line.strip() == '---':
34
- in_metadata = not in_metadata
35
- extracted.append(line)
36
- continue
41
+ if not in_metadata:
42
+ in_metadata = True
43
+ continue
44
+ else:
45
+ in_metadata = False
46
+ break
37
47
 
38
- # Include metadata
48
+ # Extract metadata within the block
39
49
  if in_metadata:
40
- extracted.append(line)
41
- continue
42
-
43
- # Include headers (lines starting with #)
50
+ if ':' in line:
51
+ key, value = line.split(':', 1)
52
+ key = key.strip().lower()
53
+ value = value.strip()
54
+
55
+ if key == 'title':
56
+ metadata['title'] = value
57
+ elif key == 'description':
58
+ metadata['description'] = value
59
+ elif key == 'canonical_url':
60
+ metadata['canonical_url'] = value
61
+ elif key == 'language':
62
+ metadata['language'] = value
63
+
64
+ # Add formatted metadata section with URL first
65
+ extracted.append(f"URL: {metadata.get('url', '')}")
66
+ extracted.append(f"Title: {metadata['title']}")
67
+ extracted.append(f"Description: {metadata['description']}")
68
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
69
+ extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
70
+ extracted.append("") # Empty line after metadata
71
+
72
+ # Second pass - process headers
73
+ for line in lines:
44
74
  if line.strip().startswith('#'):
45
- extracted.append(line)
75
+ # Count the number of # symbols
76
+ level = len(line) - len(line.lstrip('#'))
77
+ text = line.lstrip('#').strip()
78
+
79
+ # Format header according to level
80
+ if level == 1:
81
+ extracted.append(f"H1: {text}")
82
+ elif level == 2:
83
+ extracted.append(f"H2: {text}")
84
+ elif level == 3:
85
+ extracted.append(f"H3: {text}")
86
+ elif level == 4:
87
+ extracted.append(f"H4: {text}")
88
+ elif level == 5:
89
+ extracted.append(f"H5: {text}")
90
+ elif level == 6:
91
+ extracted.append(f"H6: {text}")
46
92
 
47
93
  return '\n'.join(extracted)
48
94
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.6
3
+ Version: 1.8
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes