spiderforce4ai 1.7__py3-none-any.whl → 1.9__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
- def extract_metadata_headers(markdown: str) -> str:
25
+ def extract_metadata_headers(markdown: str, url: str = '') -> str:
26
26
  """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
@@ -35,21 +35,38 @@ def extract_metadata_headers(markdown: str) -> str:
35
35
  }
36
36
 
37
37
  # First pass - collect metadata
38
- for line in lines:
39
- if line.strip().startswith('title:'):
40
- metadata['title'] = line.split(':', 1)[1].strip()
41
- elif line.strip().startswith('description:'):
42
- metadata['description'] = line.split(':', 1)[1].strip()
43
- elif line.strip().startswith('canonical_url:'):
44
- metadata['canonical_url'] = line.split(':', 1)[1].strip()
45
- elif line.strip().startswith('language:'):
46
- metadata['language'] = line.split(':', 1)[1].strip()
38
+ for i, line in enumerate(lines):
39
+ # Check for metadata block boundaries
40
+ if line.strip() == '---':
41
+ if not in_metadata:
42
+ in_metadata = True
43
+ continue
44
+ else:
45
+ in_metadata = False
46
+ break
47
+
48
+ # Extract metadata within the block
49
+ if in_metadata:
50
+ if ':' in line:
51
+ key, value = line.split(':', 1)
52
+ key = key.strip().lower()
53
+ value = value.strip()
54
+
55
+ if key == 'title':
56
+ metadata['title'] = value
57
+ elif key == 'description':
58
+ metadata['description'] = value
59
+ elif key == 'canonical_url':
60
+ metadata['canonical_url'] = value
61
+ elif key == 'language':
62
+ metadata['language'] = value
47
63
 
48
- # Add formatted metadata section
49
- extracted.append(f"Title: {metadata['title']}")
64
+ # Add formatted metadata section with URL first
65
+ extracted.append(f"URL: {url}") # Use the actual crawled URL
66
+ extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
50
67
  extracted.append(f"Description: {metadata['description']}")
51
- extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
52
- extracted.append(f"Language: {metadata['language']}")
68
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
69
+ extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
53
70
  extracted.append("") # Empty line after metadata
54
71
 
55
72
  # Second pass - process headers
@@ -230,7 +247,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
230
247
 
231
248
  # Handle combined markdown file
232
249
  if config.combine_to_one_markdown:
233
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
250
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
234
251
  combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
235
252
 
236
253
  with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.7
3
+ Version: 1.9
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -0,0 +1,5 @@
1
+ spiderforce4ai/__init__.py,sha256=JL3APcu08DJWqeVBssPODQ8zqZdislI-qiOah_7xnus,35564
2
+ spiderforce4ai-1.9.dist-info/METADATA,sha256=St2DWVpNEWX22A9x7aizkUtRtTOk8tnva0izcXRNL5o,7183
3
+ spiderforce4ai-1.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
+ spiderforce4ai-1.9.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
+ spiderforce4ai-1.9.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
2
- spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
3
- spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
4
- spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
5
- spiderforce4ai-1.7.dist-info/RECORD,,