spiderforce4ai 1.4-py3-none-any.whl → 1.6-py3-none-any.whl
- spiderforce4ai/__init__.py +78 -16
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/METADATA +1 -1
- spiderforce4ai-1.6.dist-info/RECORD +5 -0
- spiderforce4ai-1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
```diff
@@ -22,6 +22,30 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+
+    for line in lines:
+        # Check for metadata block
+        if line.strip() == '---':
+            in_metadata = not in_metadata
+            extracted.append(line)
+            continue
+
+        # Include metadata
+        if in_metadata:
+            extracted.append(line)
+            continue
+
+        # Include headers (lines starting with #)
+        if line.strip().startswith('#'):
+            extracted.append(line)
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
```
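To make the new helper concrete, here is a minimal sketch of what it keeps and drops (the sample page content is made up):

```python
from spiderforce4ai import extract_metadata_headers

# Hypothetical page markdown: YAML front matter, body text, and a heading.
page = "\n".join([
    "---",
    "title: Example Page",
    "---",
    "Intro paragraph (dropped).",
    "## Section heading (kept)",
    "Body text (dropped).",
])

print(extract_metadata_headers(page))
# ---
# title: Example Page
# ---
# ## Section heading (kept)
```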
```diff
@@ -61,6 +85,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
```
```diff
@@ -72,12 +98,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
```
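Taken together, the two new options are used like this; a minimal sketch, assuming the remaining `CrawlConfig` fields (not shown in this diff) keep their defaults:

```python
from pathlib import Path
from spiderforce4ai import CrawlConfig

# 'full' appends each page's complete markdown to the combined file and still
# writes per-URL .md files; 'metadata_headers' appends only front matter and
# headings, and skips the per-URL files.
config = CrawlConfig(
    output_dir=Path("./crawl_output"),           # hypothetical path
    combine_to_one_markdown="metadata_headers",
    # combined_markdown_file is optional; __post_init__ defaults it to
    # output_dir / "combined_content.md" and truncates it on startup.
)
```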
```diff
@@ -158,9 +193,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
 
         result = CrawlResult(
             url=url,
```
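With combining enabled, each processed page is appended between fixed delimiters, so the combined file takes this shape (URLs and content are placeholders):

```
----PAGE----
https://example.com/

<full markdown, or just metadata and headers>
----PAGE END----

----PAGE----
https://example.com/about

<...>
----PAGE END----
```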
```diff
@@ -209,12 +254,21 @@ class SpiderForce4AI:
         await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
 
```
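Note that this async path and the parallel path above both open the combined file in append mode (`'a'`), while the file itself is created or truncated once in `__post_init__`, so pages accumulate in the order they finish processing.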
```diff
@@ -296,17 +350,25 @@ class SpiderForce4AI:
 
             results.append(result)
 
-        #
-        if config.save_reports:
-            self._save_report_sync(results, config)
-            print(f"\nReport saved to: {config.report_file}")
-
-        # Print summary
+        # Calculate statistics
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
         print(f"\nParallel processing completed:")
         print(f"✓ Successful: {successful}")
         print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports and config.report_file:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,  # No retries in server parallel mode
+                "retry_failed": failed
+            }
+            self._save_report_sync(results, config)
+            console.print(f"📊 Report saved to: {config.report_file}")
 
         return results
 
```
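The new report block is simple bookkeeping; for example, 2 failures across 10 URLs would be recorded as follows (hypothetical numbers):

```python
# Stats recorded by the diff above for 2 failures across 10 URLs:
stats = {
    "initial_failures": 2,
    "failure_ratio": (2 / 10) * 100,  # 20.0
    "retry_successful": 0,            # no retries in server parallel mode
    "retry_failed": 2,
}
```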
spiderforce4ai-1.6.dist-info/RECORD
ADDED
```diff
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=yNyBz8-HjENlAZ1NSy072Ir71T68xulTRj19Yxr1aEQ,33573
+spiderforce4ai-1.6.dist-info/METADATA,sha256=pelYvJYMzC8W-P2ORQNWwP2Fyc5KshnzQ6edoYEYZQU,7183
+spiderforce4ai-1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.6.dist-info/RECORD,,
```
spiderforce4ai-1.4.dist-info/RECORD
DELETED
```diff
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=7EMEEfFY3WLq58-vnK1Yhcb1trF2ZXU-Ny3licz45Yk,30585
-spiderforce4ai-1.4.dist-info/METADATA,sha256=7GRBz_bTtXOQ2N-gHRPJFEWW8mmOB_1gwrJCf-el8LM,7183
-spiderforce4ai-1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.4.dist-info/RECORD,,
```
{spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/WHEEL
File without changes
{spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/top_level.txt
File without changes