spiderforce4ai 1.8__tar.gz → 2.0__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.8
3
+ Version: 2.0
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "spiderforce4ai"
7
- version = "1.8"
7
+ version = "2.0"
8
8
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
3
3
 
4
4
  setup(
5
5
  name="spiderforce4ai",
6
- version="1.8",
6
+ version="2.0",
7
7
  author="Piotr Tamulewicz",
8
8
  author_email="pt@petertam.pro",
9
9
  description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -22,7 +22,7 @@ from multiprocessing import Pool
22
22
 
23
23
  console = Console()
24
24
 
25
- def extract_metadata_headers(markdown: str) -> str:
25
+ def extract_metadata_headers(markdown: str, url: str = '') -> str:
26
26
  """Extract metadata and headers from markdown content with SEO formatting."""
27
27
  lines = markdown.split('\n')
28
28
  extracted = []
@@ -48,24 +48,31 @@ def extract_metadata_headers(markdown: str) -> str:
48
48
  # Extract metadata within the block
49
49
  if in_metadata:
50
50
  if ':' in line:
51
- key, value = line.split(':', 1)
52
- key = key.strip().lower()
53
- value = value.strip()
51
+ key, value = [part.strip() for part in line.split(':', 1)]
52
+ key = key.lower()
53
+
54
+ # Handle multi-line values
55
+ if value.startswith('>'):
56
+ value = value[1:].strip()
57
+ j = i + 1
58
+ while j < len(lines) and lines[j].strip() and not lines[j].strip() == '---':
59
+ value += ' ' + lines[j].strip()
60
+ j += 1
54
61
 
55
62
  if key == 'title':
56
63
  metadata['title'] = value
57
- elif key == 'description':
64
+ elif key == 'description' or key == 'meta_description':
58
65
  metadata['description'] = value
59
- elif key == 'canonical_url':
66
+ elif key == 'canonical_url' or key == 'canonical':
60
67
  metadata['canonical_url'] = value
61
- elif key == 'language':
68
+ elif key == 'language' or key == 'lang':
62
69
  metadata['language'] = value
63
70
 
64
71
  # Add formatted metadata section with URL first
65
- extracted.append(f"URL: {metadata.get('url', '')}")
66
- extracted.append(f"Title: {metadata['title']}")
72
+ extracted.append(f"URL: {url}") # Use the actual crawled URL
73
+ extracted.append(f"Title: {metadata['title'] or url.split('/')[-2].replace('-', ' ').title()}") # Fallback to URL segment
67
74
  extracted.append(f"Description: {metadata['description']}")
68
- extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
75
+ extracted.append(f"CanonicalUrl: {metadata['canonical_url'] or url}") # Fallback to crawled URL
69
76
  extracted.append(f"Language: {metadata['language'] or 'en'}") # Default to 'en' if not specified
70
77
  extracted.append("") # Empty line after metadata
71
78
 
@@ -247,7 +254,7 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
247
254
 
248
255
  # Handle combined markdown file
249
256
  if config.combine_to_one_markdown:
250
- content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
257
+ content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown, url)
251
258
  combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
252
259
 
253
260
  with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: spiderforce4ai
3
- Version: 1.8
3
+ Version: 2.0
4
4
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
5
5
  Home-page: https://petertam.pro
6
6
  Author: Piotr Tamulewicz
File without changes
File without changes