spiderforce4ai 2.0.tar.gz → 2.3.1.tar.gz

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.0
+Version: 2.3.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.0"
+version = "2.3.1"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="spiderforce4ai",
-    version="2.0",
+    version="2.3.1",
     author="Piotr Tamulewicz",
     author_email="pt@petertam.pro",
    description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -23,82 +23,55 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content with SEO formatting."""
+    """Extract metadata and headers from markdown content."""
     lines = markdown.split('\n')
-    extracted = []
-    in_metadata = False
-    metadata = {
-        'title': '',
-        'description': '',
-        'canonical_url': '',
-        'language': ''
-    }
+    metadata = {}
+    headers = []
 
-    # First pass - collect metadata
-    for i, line in enumerate(lines):
-        # Check for metadata block boundaries
-        if line.strip() == '---':
-            if not in_metadata:
-                in_metadata = True
-                continue
-            else:
-                in_metadata = False
-                break
+    def parse_metadata_line(line):
+        """Parse a single metadata line correctly."""
+        first_colon = line.find(':')
+        if first_colon == -1:
+            return None, None
+
+        key = line[:first_colon].strip()
+        value = line[first_colon + 1:].strip()
 
-        # Extract metadata within the block
-        if in_metadata:
-            if ':' in line:
-                key, value = [part.strip() for part in line.split(':', 1)]
-                key = key.lower()
-
-                # Handle multi-line values
-                if value.startswith('>'):
-                    value = value[1:].strip()
-                    j = i + 1
-                    while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
-                        value += ' ' + lines[j].strip()
-                        j += 1
-
-                if key == 'title':
-                    metadata['title'] = value
-                elif key == 'description' or key == 'meta_description':
-                    metadata['description'] = value
-                elif key == 'canonical_url' or key == 'canonical':
-                    metadata['canonical_url'] = value
-                elif key == 'language' or key == 'lang':
-                    metadata['language'] = value
-
-    # Add formatted metadata section with URL first
-    extracted.append(f"URL: {url}") # Use the actual crawled URL
-    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
-    extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
-    extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
-    extracted.append("") # Empty line after metadata
+        # Handle the case where value starts with "URL:" - this means it's a missing description
+        if value.startswith('URL:'):
+            return key, ''
+
+        return key, value
 
-    # Second pass - process headers
+    # Process each line
     for line in lines:
-        if line.strip().startswith('#'):
-            # Count the number of # symbols
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if it's a metadata line (contains : but isn't a header)
+        if ':' in line and not line.startswith('#'):
+            key, value = parse_metadata_line(line)
+            if key:
+                metadata[key] = value
+        # Check if it's a header
+        elif line.startswith('#'):
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
-
-            # Format header according to level
-            if level == 1:
-                extracted.append(f"H1: {text}")
-            elif level == 2:
-                extracted.append(f"H2: {text}")
-            elif level == 3:
-                extracted.append(f"H3: {text}")
-            elif level == 4:
-                extracted.append(f"H4: {text}")
-            elif level == 5:
-                extracted.append(f"H5: {text}")
-            elif level == 6:
-                extracted.append(f"H6: {text}")
+            if 1 <= level <= 6:
+                headers.append(f"H{level}: {text}")
 
-    return '\n'.join(extracted)
-
+    # Construct output
+    output = []
+    output.append(f"URL: {url}")
+    output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+    output.append(f"Description: {metadata.get('Description', '')}") # Now this will be empty string for missing descriptions
+    output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+    output.append(f"Language: {metadata.get('Language', 'en')}")
+    output.append("") # Empty line
+    output.extend(headers)
+
+    return '\n'.join(output)
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
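
For context: the rewritten extract_metadata_headers replaces the two-pass `---`-delimited front-matter walk with a single pass that treats any non-header line containing a colon as a `Key: value` metadata pair, collects `#`-prefixed headers separately, and emits a fixed-order metadata block followed by the headers. A minimal usage sketch; the sample markdown and the package-level import are illustrative assumptions, not taken from the package docs:

```python
from spiderforce4ai import extract_metadata_headers  # assumes the function is importable at package level

sample = """Title: Example Page
Description: A short description.
CanonicalUrl: https://example.com/example-page/
Language: en

# Example Page
## First Section
"""

print(extract_metadata_headers(sample, url="https://example.com/example-page/"))
# Expected shape of the output:
# URL: https://example.com/example-page/
# Title: Example Page
# Description: A short description.
# CanonicalUrl: https://example.com/example-page/
# Language: en
#
# H1: Example Page
# H2: First Section
```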
@@ -317,7 +290,7 @@ class SpiderForce4AI:
 
             # Handle combined markdown file
             if self.config.combine_to_one_markdown:
-                content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
                 combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
 
                 async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
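
When combine_to_one_markdown is enabled, each crawled page is appended to self.config.combined_markdown_file wrapped in page markers; in the non-'full' mode, content is the metadata/header summary shown earlier. Based on the f-string above, one appended entry has roughly this shape (URL and field values invented for illustration):

```text
----PAGE----
https://example.com/example-page/

URL: https://example.com/example-page/
Title: Example Page
Description: A short description.
CanonicalUrl: https://example.com/example-page/
Language: en

H1: Example Page
----PAGE END----
```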
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 2.0
+Version: 2.3.1
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz