spiderforce4ai 2.1.tar.gz → 2.3.1.tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.1
+Version: 2.3.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.1"
+version = "2.3.1"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="spiderforce4ai",
-    version="2.1",
+    version="2.3.1",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
     description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -23,76 +23,55 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content with enhanced SEO formatting."""
+    """Extract metadata and headers from markdown content."""
     lines = markdown.split('\n')
-    extracted = []
-    in_metadata = False
-    metadata = {
-        'title': '',
-        'description': '',
-        'canonical_url': '',
-        'language': ''
-    }
-    first_paragraph = ''
+    metadata = {}
+    headers = []
 
-    # First pass - collect metadata and first paragraph
-    for i, line in enumerate(lines):
-        # Check for metadata block boundaries
-        if line.strip() == '---':
-            if not in_metadata:
-                in_metadata = True
-                continue
-            else:
-                in_metadata = False
-                break
+    def parse_metadata_line(line):
+        """Parse a single metadata line correctly."""
+        first_colon = line.find(':')
+        if first_colon == -1:
+            return None, None
+
+        key = line[:first_colon].strip()
+        value = line[first_colon + 1:].strip()
 
-        # Extract metadata within the block
-        if in_metadata:
-            if ':' in line:
-                key, value = [part.strip() for part in line.split(':', 1)]
-                key = key.lower()
-
-                # Handle multi-line values
-                if value.startswith('>'):
-                    value = value[1:].strip()
-                    j = i + 1
-                    while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
-                        value += ' ' + lines[j].strip()
-                        j += 1
-
-                if key == 'title':
-                    metadata['title'] = value
-                elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
-                    metadata['description'] = value
-                elif key in ['canonical_url', 'canonical']:
-                    metadata['canonical_url'] = value
-                elif key in ['language', 'lang']:
-                    metadata['language'] = value
-        elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
-            first_paragraph = line.strip()
-
-    # Use first paragraph as fallback description if none found
-    if not metadata['description'] and first_paragraph:
-        metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
-    # Add formatted metadata section
-    extracted.append(f"URL: {url}")
-    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
-    extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
-    extracted.append(f"Language: {metadata['language'] or 'en'}")
-    extracted.append("")  # Empty line after metadata
+        # Handle the case where value starts with "URL:" - this means it's a missing description
+        if value.startswith('URL:'):
+            return key, ''
+
+        return key, value
 
-    # Second pass - process headers
+    # Process each line
     for line in lines:
-        if line.strip().startswith('#'):
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if it's a metadata line (contains : but isn't a header)
+        if ':' in line and not line.startswith('#'):
+            key, value = parse_metadata_line(line)
+            if key:
+                metadata[key] = value
+        # Check if it's a header
+        elif line.startswith('#'):
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
             if 1 <= level <= 6:
-                extracted.append(f"H{level}: {text}")
+                headers.append(f"H{level}: {text}")
 
-    return '\n'.join(extracted)
-
+    # Construct output
+    output = []
+    output.append(f"URL: {url}")
+    output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+    output.append(f"Description: {metadata.get('Description', '')}")  # Now this will be empty string for missing descriptions
+    output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+    output.append(f"Language: {metadata.get('Language', 'en')}")
+    output.append("")  # Empty line
+    output.extend(headers)
+
+    return '\n'.join(output)
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
    parsed = urlparse(url)
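
For reference, here is a minimal usage sketch of the rewritten extract_metadata_headers shown in the hunk above. It assumes the function is importable from the package root (the diff places it at module level in the package); the sample markdown and URL are illustrative only, and the exact metadata format emitted by the conversion service is not shown in this diff.

from spiderforce4ai import extract_metadata_headers  # assumes a module-level export; not confirmed by this diff

# Illustrative input: plain "Key: value" metadata lines followed by markdown headers.
sample_markdown = """Title: Example Page
Description: A short example description
CanonicalUrl: https://example.com/blog/example-page/
Language: en

# Main Heading
Some intro text without a colon.
## Subsection
"""

summary = extract_metadata_headers(
    sample_markdown,
    url="https://example.com/blog/example-page/",
)
print(summary)
# Expected shape of the result: a metadata block, a blank line, then H1-H6 entries:
# URL: https://example.com/blog/example-page/
# Title: Example Page
# Description: A short example description
# CanonicalUrl: https://example.com/blog/example-page/
# Language: en
#
# H1: Main Heading
# H2: Subsection
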
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.1
+Version: 2.3.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
File without changes
File without changes