spiderforce4ai 2.0.tar.gz → 2.1.tar.gz
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/PKG-INFO +1 -1
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/pyproject.toml +1 -1
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/setup.py +1 -1
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai/__init__.py +20 -26
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/README.md +0 -0
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/setup.cfg +0 -0
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai.egg-info/top_level.txt +0 -0
{spiderforce4ai-2.0 → spiderforce4ai-2.1}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "2.0"
+version = "2.1"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
{spiderforce4ai-2.0 → spiderforce4ai-2.1}/spiderforce4ai/__init__.py
@@ -23,7 +23,7 @@ from multiprocessing import Pool
 console = Console()
 
 def extract_metadata_headers(markdown: str, url: str = '') -> str:
-    """Extract metadata and headers from markdown content with SEO formatting."""
+    """Extract metadata and headers from markdown content with enhanced SEO formatting."""
     lines = markdown.split('\n')
     extracted = []
     in_metadata = False
@@ -33,8 +33,9 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
         'canonical_url': '',
         'language': ''
     }
+    first_paragraph = ''
 
-    # First pass - collect metadata
+    # First pass - collect metadata and first paragraph
    for i, line in enumerate(lines):
         # Check for metadata block boundaries
         if line.strip() == '---':
@@ -61,41 +62,34 @@ def extract_metadata_headers(markdown: str, url: str = '') -> str:
 
             if key == 'title':
                 metadata['title'] = value
-            elif key
+            elif key in ['description', 'meta_description', 'og:description', 'meta-description']:
                 metadata['description'] = value
-            elif key
+            elif key in ['canonical_url', 'canonical']:
                 metadata['canonical_url'] = value
-            elif key
+            elif key in ['language', 'lang']:
                 metadata['language'] = value
+        elif not in_metadata and not first_paragraph and line.strip() and not line.startswith('#'):
+            first_paragraph = line.strip()
 
-    #
-
-
+    # Use first paragraph as fallback description if none found
+    if not metadata['description'] and first_paragraph:
+        metadata['description'] = first_paragraph[:160] + ('...' if len(first_paragraph) > 160 else '')
+
+    # Add formatted metadata section
+    extracted.append(f"URL: {url}")
+    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")
     extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
-    extracted.append(f"Language: {metadata['language'] or 'en'}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")
+    extracted.append(f"Language: {metadata['language'] or 'en'}")
     extracted.append("") # Empty line after metadata
 
     # Second pass - process headers
     for line in lines:
         if line.strip().startswith('#'):
-            # Count the number of # symbols
             level = len(line) - len(line.lstrip('#'))
             text = line.lstrip('#').strip()
-
-
-            if level == 1:
-                extracted.append(f"H1: {text}")
-            elif level == 2:
-                extracted.append(f"H2: {text}")
-            elif level == 3:
-                extracted.append(f"H3: {text}")
-            elif level == 4:
-                extracted.append(f"H4: {text}")
-            elif level == 5:
-                extracted.append(f"H5: {text}")
-            elif level == 6:
-                extracted.append(f"H6: {text}")
+            if 1 <= level <= 6:
+                extracted.append(f"H{level}: {text}")
 
     return '\n'.join(extracted)
 
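Taken together, the rewritten function front-loads a URL/Title block, backfills a missing description from the page's first paragraph (capped at 160 characters with an ellipsis), and collapses the old six-branch header chain into a single f-string. A minimal sketch of the resulting behavior, assuming the front-matter parsing not shown in this hunk splits lines into key: value pairs; the sample page and URL below are invented for illustration:

from spiderforce4ai import extract_metadata_headers  # module-level in 2.1's __init__.py

# Invented sample: front matter with a title but no description, so the
# first paragraph should be promoted to the Description field.
sample = """---
title: Crawling Guide
lang: en
---
SpiderForce4AI converts rendered HTML into clean markdown for LLM pipelines.

# Getting Started
## Installation
"""

print(extract_metadata_headers(sample, "https://example.com/crawling-guide/"))
# Expected output, per the formatting added above:
#   URL: https://example.com/crawling-guide/
#   Title: Crawling Guide
#   Description: SpiderForce4AI converts rendered HTML into clean markdown for LLM pipelines.
#   CanonicalUrl: https://example.com/crawling-guide/
#   Language: en
#   (blank line)
#   H1: Getting Started
#   H2: Installation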
@@ -317,7 +311,7 @@ class SpiderForce4AI:
 
         # Handle combined markdown file
         if self.config.combine_to_one_markdown:
-            content = markdown if
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
             combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
 
         async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
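The functional change in this hunk threads the page url into extract_metadata_headers(), so the non-'full' combined output keeps its source URL. A standalone sketch of how the framed blocks accumulate; the helper name, file path, URLs, and the write call following the aiofiles.open shown above are assumptions, not the package's exact code:

import asyncio

import aiofiles

async def append_page(path: str, url: str, content: str) -> None:
    # Frame each page the way the diff does, so downstream tooling can split
    # the combined file on the ----PAGE---- / ----PAGE END---- markers.
    combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
    async with aiofiles.open(path, 'a', encoding='utf-8') as f:
        await f.write(combined_content)

# Hypothetical two-page crawl appending to one combined file.
asyncio.run(append_page("combined.md", "https://example.com/page-1/", "H1: Page One"))
asyncio.run(append_page("combined.md", "https://example.com/page-2/", "H1: Page Two"))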