spiderforce4ai-1.5-py3-none-any.whl → spiderforce4ai-1.7-py3-none-any.whl
- spiderforce4ai/__init__.py +93 -10
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/METADATA +1 -1
- spiderforce4ai-1.7.dist-info/RECORD +5 -0
- spiderforce4ai-1.5.dist-info/RECORD +0 -5
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -22,6 +22,59 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content with SEO formatting."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+    metadata = {
+        'title': '',
+        'description': '',
+        'canonical_url': '',
+        'language': ''
+    }
+
+    # First pass - collect metadata
+    for line in lines:
+        if line.strip().startswith('title:'):
+            metadata['title'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('description:'):
+            metadata['description'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('canonical_url:'):
+            metadata['canonical_url'] = line.split(':', 1)[1].strip()
+        elif line.strip().startswith('language:'):
+            metadata['language'] = line.split(':', 1)[1].strip()
+
+    # Add formatted metadata section
+    extracted.append(f"Title: {metadata['title']}")
+    extracted.append(f"Description: {metadata['description']}")
+    extracted.append(f"CanonicalUrl: {metadata['canonical_url']}")
+    extracted.append(f"Language: {metadata['language']}")
+    extracted.append("") # Empty line after metadata
+
+    # Second pass - process headers
+    for line in lines:
+        if line.strip().startswith('#'):
+            # Count the number of # symbols
+            level = len(line) - len(line.lstrip('#'))
+            text = line.lstrip('#').strip()
+
+            # Format header according to level
+            if level == 1:
+                extracted.append(f"H1: {text}")
+            elif level == 2:
+                extracted.append(f"H2: {text}")
+            elif level == 3:
+                extracted.append(f"H3: {text}")
+            elif level == 4:
+                extracted.append(f"H4: {text}")
+            elif level == 5:
+                extracted.append(f"H5: {text}")
+            elif level == 6:
+                extracted.append(f"H6: {text}")
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
@@ -61,6 +114,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None # Optional custom webhook payload template
     save_reports: bool = False # Whether to save crawl reports
     report_file: Optional[Path] = None # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
@@ -72,12 +127,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
+        # Setup report file if save_reports is True
         if self.save_reports:
            if self.report_file is None:
                self.report_file = self.output_dir / "crawl_report.json"
            else:
                self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
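The two new CrawlConfig fields are all that is needed to opt in. A minimal sketch, assuming the remaining fields keep their defaults:

    from pathlib import Path
    from spiderforce4ai import CrawlConfig

    config = CrawlConfig(
        output_dir=Path("./crawl_output"),
        combine_to_one_markdown="metadata_headers",  # or "full"
        # combined_markdown_file is optional; __post_init__ falls back to
        # output_dir / "combined_content.md".
    )

Note the write_text('') in __post_init__: constructing the config truncates any existing combined file at that path, so every new config starts a fresh combined document.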
@@ -158,9 +222,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
     # Save markdown if output directory is configured
     if config.output_dir:
-        filepath = config.output_dir / f"{slugify(url)}.md"
-        with open(filepath, 'w', encoding='utf-8') as f:
-            f.write(markdown)
+        # Save individual file if not combining or if combining in full mode
+        if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Handle combined markdown file
+        if config.combine_to_one_markdown:
+            content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                f.write(combined_content)
 
     result = CrawlResult(
         url=url,
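Each page is appended inside the same ----PAGE---- / ----PAGE END---- envelope with its URL on the first line, so the combined file can be split back into per-page records. A hypothetical reader, not part of the package:

    import re
    from pathlib import Path

    def read_combined_markdown(path: Path) -> list[tuple[str, str]]:
        """Split a combined markdown file into (url, content) pairs."""
        text = path.read_text(encoding='utf-8')
        # Matches the wrapper written by the crawler:
        # \n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n
        pattern = re.compile(
            r"----PAGE----\n(?P<url>[^\n]+)\n\n(?P<content>.*?)\n----PAGE END----",
            re.DOTALL,
        )
        return [(m.group('url'), m.group('content')) for m in pattern.finditer(text)]

    for url, content in read_combined_markdown(Path("crawl_output/combined_content.md")):
        print(url, len(content))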
@@ -209,12 +283,21 @@ class SpiderForce4AI:
         await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
 
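In 'metadata_headers' mode the combined file stays compact: each entry carries only the metadata block and header outline produced by extract_metadata_headers. For the hypothetical sample page used earlier, the appended entry would read:

    ----PAGE----
    https://example.com/demo

    Title: Example Page
    Description: A short demo page
    CanonicalUrl: https://example.com/demo
    Language: en

    H1: Welcome
    H2: Details
    ----PAGE END----

In 'full' mode the same envelope carries the complete page markdown instead of this outline, alongside the usual per-page .md files.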
spiderforce4ai-1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=7YpJdZfmy4z5wUFGTBsvi5VOxGGX594oVul3Q5Ngdko,34906
+spiderforce4ai-1.7.dist-info/METADATA,sha256=ON-lQ4BARmNOrHwT2Xbl2oc1hoo8FyMQWxl6T0LbClA,7183
+spiderforce4ai-1.7.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.7.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.7.dist-info/RECORD,,
spiderforce4ai-1.5.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=lUCuNzYKjsYRLgP1ULSfAXnDSoLn91vYf71zOZFGmPg,30936
-spiderforce4ai-1.5.dist-info/METADATA,sha256=uHgxa-sPwP805d0jM3-LrDOLayfKqxtpxqvZMsFnizo,7183
-spiderforce4ai-1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.5.dist-info/RECORD,,
{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.5.dist-info → spiderforce4ai-1.7.dist-info}/top_level.txt
File without changes