spiderforce4ai-1.5-py3-none-any.whl → spiderforce4ai-1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +93 -10
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/METADATA +1 -1
- spiderforce4ai-1.7.dist-info/RECORD +5 -0
- spiderforce4ai-1.5.dist-info/RECORD +0 -5
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -22,6 +22,59 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content with SEO formatting."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+    metadata = {
+        'title': '',
+        'description': '',
+        'canonical_url': '',
+        'language': ''
+    }
+
+    # First pass - collect metadata
+    for line in lines:
+        if line.strip().startswith('title:'):
+            metadata['title'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('description:'):
+            metadata['description'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('canonical_url:'):
+            metadata['canonical_url'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('language:'):
+            metadata['language'] = line.split(':', 1)[1].strip()
+
+    # Add formatted metadata section
+    extracted.append(f"Title: {metadata['title']}")
+    extracted.append(f"Description: {metadata['description']}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
+    extracted.append(f"Language: {metadata['language']}")
+    extracted.append("")  # Empty line after metadata
+
+    # Second pass - process headers
+    for line in lines:
+        if line.strip().startswith('#'):
+            # Count the number of # symbols
+            level = len(line) - len(line.lstrip('#'))
+            text = line.lstrip('#').strip()
+
+            # Format header according to level
+            if level == 1:
+                extracted.append(f"H1: {text}")
+            elif level == 2:
+                extracted.append(f"H2: {text}")
+            elif level == 3:
+                extracted.append(f"H3: {text}")
+            elif level == 4:
+                extracted.append(f"H4: {text}")
+            elif level == 5:
+                extracted.append(f"H5: {text}")
+            elif level == 6:
+                extracted.append(f"H6: {text}")
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
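For context on the new helper: given markdown whose leading lines carry title: / description: / canonical_url: / language: fields, extract_metadata_headers emits a flat metadata block followed by one H1:–H6: line per heading. A minimal sketch of calling it (the sample page below is hypothetical, not taken from the package):

# Illustrative input/output for extract_metadata_headers (sample data is hypothetical)
sample = (
    "title: Example Page\n"
    "description: A demo page\n"
    "canonical_url: https://example.com/page\n"
    "language: en\n"
    "\n"
    "# Welcome\n"
    "## Features\n"
)
print(extract_metadata_headers(sample))
# Title: Example Page
# Description: A demo page
# CanonicalUrl: https://example.com/page
# Language: en
#
# H1: Welcome
# H2: Features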
@@ -61,6 +114,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
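A hedged usage sketch for the two new CrawlConfig fields. Only fields visible in this diff are shown; CrawlConfig accepts more parameters (webhook settings, report options, etc.), so treat the constructor call as illustrative rather than complete:

from pathlib import Path
from spiderforce4ai import CrawlConfig  # import path assumed from the package name

config = CrawlConfig(
    output_dir=Path("crawl_output"),             # field visible in the diff above
    combine_to_one_markdown="metadata_headers",  # 'full' also keeps per-page .md files
    combined_markdown_file=Path("crawl_output/site_summary.md"),  # defaults to output_dir / "combined_content.md"
)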
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
 
         result = CrawlResult(
             url=url,
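Both this parallel path and the async _save_markdown below append page blocks in the same delimiter format, so a combined file ends up looking like this (illustrative content):

----PAGE----
https://example.com/page

Title: Example Page
Description: A demo page
CanonicalUrl: https://example.com/page
Language: en

H1: Welcome
H2: Features
----PAGE END----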
@@ -209,12 +283,21 @@ class SpiderForce4AI:
         await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
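Because pages are wrapped in literal ----PAGE---- / ----PAGE END---- markers, a downstream consumer can split the combined file back into (url, content) pairs. A minimal sketch, assuming the default file name and that page content never contains the end marker:

import re
from pathlib import Path

# Hypothetical reader for the combined file produced above
text = Path("crawl_output/combined_content.md").read_text(encoding="utf-8")
pattern = re.compile(
    r"----PAGE----\n(?P<url>[^\n]+)\n\n(?P<content>.*?)\n----PAGE END----",
    re.DOTALL,
)
for m in pattern.finditer(text):
    print(m.group("url"), len(m.group("content")))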
spiderforce4ai-1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
+spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
+spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.7.dist-info/RECORD,,
spiderforce4ai-1.5.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=lUCuNzYKjsYRLgP1ULSfAXnDSoLn91vYf71zOZFGmPg,30936
-spiderforce4ai-1.5.dist-info/METADATA,sha256=uHgxa-sPwP805d0jM3-LrDOLayfKqxtpxqvZMsFnizo,7183
-spiderforce4ai-1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.5.dist-info/RECORD,,
{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL
File without changes
{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt
File without changes