spiderforce4ai 1.8-py3-none-any.whl → 1.9-py3-none-any.whl
- spiderforce4ai/__init__.py +5 -5
- {spiderforce4ai-1.8.dist-info → spiderforce4ai-1.9.dist-info}/METADATA +1 -1
- spiderforce4ai-1.9.dist-info/RECORD +5 -0
- spiderforce4ai-1.8.dist-info/RECORD +0 -5
- {spiderforce4ai-1.8.dist-info → spiderforce4ai-1.9.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.8.dist-info → spiderforce4ai-1.9.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -22,7 +22,7 @@ from multiprocessing import Pool
 
 console = Console()
 
-def extract_metadata_headers(markdown: str) -> str:
+def extract_metadata_headers(markdown: str, url: str = '') -> str:
     """Extract metadata and headers from markdown content with SEO formatting."""
     lines = markdown.split('\n')
     extracted = []
@@ -62,10 +62,10 @@ def extract_metadata_headers(markdown: str) -> str:
             metadata['language'] = value
 
     # Add formatted metadata section with URL first
-    extracted.append(f"URL: {
-    extracted.append(f"Title: {metadata['title']}")
+    extracted.append(f"URL: {url}")  # Use the actual crawled URL
+    extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}")  # Fallback to URL segment
     extracted.append(f"Description: {metadata['description']}")
-    extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}")  # Fallback to crawled URL
     extracted.append(f"Language: {metadata['language'] or 'en'}")  # Default to 'en' if not specified
     extracted.append("")  # Empty line after metadata
 
@@ -247,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
     # Handle combined markdown file
     if config.combine_to_one_markdown:
-        content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+        content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
         combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
 
         with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
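For context, the new url parameter drives the fallbacks visible in the second hunk: the URL line always carries the crawled URL, the title falls back to the second-to-last URL path segment, and the canonical URL falls back to the crawled URL. Below is a minimal standalone sketch of that fallback logic, assuming a hypothetical sample URL and helper name (neither is part of the package):

# Sketch of the fallback behaviour introduced in 1.9 (illustrative only).

def _title_from_url(url: str) -> str:
    # Mirrors url.split('/')[-2].replace('-', ' ').title() from the diff above.
    return url.split('/')[-2].replace('-', ' ').title()

url = "https://example.com/blog/my-first-post/"   # hypothetical crawled URL
metadata = {"title": "", "canonical_url": "", "language": ""}

print(f"URL: {url}")
print(f"Title: {metadata['title'] or _title_from_url(url)}")       # -> "My First Post"
print(f"CanonicalUrl: {metadata['canonical_url'] or url}")         # falls back to crawled URL
print(f"Language: {metadata['language'] or 'en'}")                 # defaults to 'en'

Note that the [-2] index only picks the last path segment when the URL ends with a trailing slash; without one it would select the parent segment instead.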
spiderforce4ai-1.9.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=JL3APcu08DJWqeVBssPODQ8zqZdislI-qiOah_7xnus,35564
+spiderforce4ai-1.9.dist-info/METADATA,sha256=St2DWVpNEWX22A9x7aizkUtRtTOk8tnva0izcXRNL5o,7183
+spiderforce4ai-1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.9.dist-info/RECORD,,
spiderforce4ai-1.8.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=GaW2zVESi1pi13pD0Dky4g0Yuj9hEj7_4eP_eyoBnWM,35425
-spiderforce4ai-1.8.dist-info/METADATA,sha256=T1K4wWbagvh0ZW_vsYNAAhSAqRH7bLDOF6lr7Yy1pfg,7183
-spiderforce4ai-1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.8.dist-info/RECORD,,
{spiderforce4ai-1.8.dist-info → spiderforce4ai-1.9.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.8.dist-info → spiderforce4ai-1.9.dist-info}/top_level.txt
File without changes
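The third hunk in spiderforce4ai/__init__.py keeps appending each crawled page to config.combined_markdown_file between ----PAGE---- and ----PAGE END---- markers, now passing the crawled url through to extract_metadata_headers when combine_to_one_markdown is not 'full'. The sketch below shows one way such a combined file could be split back into (url, content) pairs, assuming the delimiter format from the hunk; the reader function and file path are illustrative, not part of the package:

# Sketch: parsing a combined markdown file built with the
# ----PAGE---- / ----PAGE END---- delimiters shown in the diff.

def read_combined_markdown(path: str) -> list[tuple[str, str]]:
    with open(path, encoding="utf-8") as f:
        text = f.read()
    pages = []
    for block in text.split("----PAGE----")[1:]:
        # Each block starts with the URL line, then a blank line, then the content.
        body = block.split("----PAGE END----")[0].strip()
        url, _, content = body.partition("\n")
        pages.append((url.strip(), content.strip()))
    return pages

for url, content in read_combined_markdown("combined.markdown"):
    print(url, len(content), "chars")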