spiderforce4ai 2.1.tar.gz → 2.3.1.tar.gz
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/PKG-INFO +1 -1
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/pyproject.toml +1 -1
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/setup.py +1 -1
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai/__init__.py +40 -61
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/README.md +0 -0
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/setup.cfg +0 -0
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.1 → spiderforce4ai-2.3.1}/spiderforce4ai.egg-info/top_level.txt +0 -0
pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.1"
+version = "2.3.1"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
```
spiderforce4ai/__init__.py

```diff
@@ -23,76 +23,55 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content
+    """Extract metadata and headers from markdown content."""
     lines = markdown.split('\n')
-
-
-    metadata = {
-        'title': '',
-        'description': '',
-        'canonical_url': '',
-        'language': ''
-    }
-    first_paragraph = ''
+    metadata = {}
+    headers = []
 
-
-
-
-    if
-
-
-
-
-            in_metadata = False
-            break
+    def parse_metadata_line(line):
+        """Parse a single metadata line correctly."""
+        first_colon = line.find(':')
+        if first_colon == -1:
+            return None, None
+
+        key = line[:first_colon].strip()
+        value = line[first_colon + 1:].strip()
 
-    #
-    if
-
-
-
-        # Handle multi-line values
-        if value.startswith('>'):
-            value = value[1:].strip()
-            j = i + 1
-            while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
-                value += ' ' + lines[j].strip()
-                j += 1
-
-        if key == 'title':
-            metadata['title'] = value
-        elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
-            metadata['description'] = value
-        elif key in ['canonical_url', 'canonical']:
-            metadata['canonical_url'] = value
-        elif key in ['language', 'lang']:
-            metadata['language'] = value
-        elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
-            first_paragraph = line.strip()
-
-    # Use first paragraph as fallback description if none found
-    if not metadata['description'] and first_paragraph:
-        metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
-
-    # Add formatted metadata section
-    extracted.append(f"URL: {url}")
-    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
-    extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
-    extracted.append(f"Language: {metadata['language'] or 'en'}")
-    extracted.append("") # Empty line after metadata
+        # Handle the case where value starts with "URL:" - this means it's a missing description
+        if value.startswith('URL:'):
+            return key, ''
+
+        return key, value
 
-    #
+    # Process each line
     for line in lines:
-
+        line = line.strip()
+        if not line:
+            continue
+
+        # Check if it's a metadata line (contains : but isn't a header)
+        if ':' in line and not line.startswith('#'):
+            key, value = parse_metadata_line(line)
+            if key:
+                metadata[key] = value
+        # Check if it's a header
+        elif line.startswith('#'):
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
             if 1 <= level <= 6:
-
+                headers.append(f"H{level}: {text}")
 
-
-
+    # Construct output
+    output = []
+    output.append(f"URL: {url}")
+    output.append(f"Title: {metadata.get('Title', url.split('/')[-2].replace('-', ' ').title())}")
+    output.append(f"Description: {metadata.get('Description', '')}") # Now this will be empty string for missing descriptions
+    output.append(f"CanonicalUrl: {metadata.get('CanonicalUrl', url)}")
+    output.append(f"Language: {metadata.get('Language', 'en')}")
+    output.append("") # Empty line
+    output.extend(headers)
+
+    return '\n'.join(output)
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
```