spiderforce4ai 1.7__tar.gz → 1.9__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/PKG-INFO +1 -1
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/pyproject.toml +1 -1
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/setup.py +1 -1
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai/__init__.py +32 -15
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/README.md +0 -0
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/setup.cfg +0 -0
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-1.7 → spiderforce4ai-1.9}/spiderforce4ai.egg-info/top_level.txt +0 -0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "spiderforce4ai"
|
7
|
-
version = "1.7"
|
7
|
+
version = "1.9"
|
8
8
|
description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
|
9
9
|
readme = "README.md"
|
10
10
|
authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
|
@@ -22,7 +22,7 @@ from multiprocessing import Pool
|
|
22
22
|
|
23
23
|
console = Console()
|
24
24
|
|
25
|
-
def extract_metadata_headers(markdown: str) -> str:
|
25
|
+
def extract_metadata_headers(markdown: str, url: str = '') -> str:
|
26
26
|
"""Extract metadata and headers from markdown content with SEO formatting."""
|
27
27
|
lines = markdown.split('\n')
|
28
28
|
extracted = []
|
@@ -35,21 +35,38 @@ def extract_metadata_headers(markdown: str) -> str:
|
|
35
35
|
}
|
36
36
|
|
37
37
|
# First pass - collect metadata
|
38
|
-
for line in lines:
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
38
|
+
for i, line in enumerate(lines):
|
39
|
+
# Check for metadata block boundaries
|
40
|
+
if line.strip() == '---':
|
41
|
+
if not in_metadata:
|
42
|
+
in_metadata = True
|
43
|
+
continue
|
44
|
+
else:
|
45
|
+
in_metadata = False
|
46
|
+
break
|
47
|
+
|
48
|
+
# Extract metadata within the block
|
49
|
+
if in_metadata:
|
50
|
+
if ':' in line:
|
51
|
+
key, value = line.split(':', 1)
|
52
|
+
key = key.strip().lower()
|
53
|
+
value = value.strip()
|
54
|
+
|
55
|
+
if key == 'title':
|
56
|
+
metadata['title'] = value
|
57
|
+
elif key == 'description':
|
58
|
+
metadata['description'] = value
|
59
|
+
elif key == 'canonical_url':
|
60
|
+
metadata['canonical_url'] = value
|
61
|
+
elif key == 'language':
|
62
|
+
metadata['language'] = value
|
47
63
|
|
48
|
-
# Add formatted metadata section
|
49
|
-
extracted.append(f"Title: {metadata['title']}")
|
64
|
+
# Add formatted metadata section with URL first
|
65
|
+
extracted.append(f"URL: {url}") # Use the actual crawled URL
|
66
|
+
extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
|
50
67
|
extracted.append(f"Description: {metadata['description']}")
|
51
|
-
extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
|
52
|
-
extracted.append(f"Language: {metadata['language']}")
|
68
|
+
extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
|
69
|
+
extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
|
53
70
|
extracted.append("") # Empty line after metadata
|
54
71
|
|
55
72
|
# Second pass - process headers
|
@@ -230,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
|
|
230
247
|
|
231
248
|
# Handle combined markdown file
|
232
249
|
if config.combine_to_one_markdown:
|
233
|
-
content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
|
250
|
+
content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
|
234
251
|
combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
|
235
252
|
|
236
253
|
with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|