spiderforce4ai-1.4-py3-none-any.whl → spiderforce4ai-1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spiderforce4ai/__init__.py +78 -16
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/METADATA +1 -1
- spiderforce4ai-1.6.dist-info/RECORD +5 -0
- spiderforce4ai-1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/WHEEL +0 -0
- {spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -22,6 +22,30 @@ from multiprocessing import Pool
 
 console = Console()
 
+def extract_metadata_headers(markdown: str) -> str:
+    """Extract metadata and headers from markdown content."""
+    lines = markdown.split('\n')
+    extracted = []
+    in_metadata = False
+
+    for line in lines:
+        # Check for metadata block
+        if line.strip() == '---':
+            in_metadata = not in_metadata
+            extracted.append(line)
+            continue
+
+        # Include metadata
+        if in_metadata:
+            extracted.append(line)
+            continue
+
+        # Include headers (lines starting with #)
+        if line.strip().startswith('#'):
+            extracted.append(line)
+
+    return '\n'.join(extracted)
+
 def slugify(url: str) -> str:
     """Convert URL to a valid filename."""
     parsed = urlparse(url)
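To make the new helper concrete, here is a hypothetical input/output pair (the sample document is invented for illustration): the function keeps the `---`-fenced front-matter block and any heading lines, and drops all body text.

    doc = "\n".join([
        "---",
        "title: Example Page",
        "---",
        "# Heading",
        "",
        "Body text that will be dropped.",
        "",
        "## Subheading",
        "More body text.",
    ])
    print(extract_metadata_headers(doc))
    # ---
    # title: Example Page
    # ---
    # # Heading
    # ## Subheading

Note that the flag simply toggles on every `---` line, so a later horizontal rule written as `---` would re-open "metadata" mode and pull in the text that follows it.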
@@ -61,6 +85,8 @@ class CrawlConfig:
     webhook_payload_template: Optional[str] = None  # Optional custom webhook payload template
     save_reports: bool = False  # Whether to save crawl reports
     report_file: Optional[Path] = None  # Optional report file location (used only if save_reports is True)
+    combine_to_one_markdown: Optional[str] = None  # 'full' or 'metadata_headers' to combine all pages into one file
+    combined_markdown_file: Optional[Path] = None  # Optional path for combined markdown file
 
     def __post_init__(self):
         # Initialize empty lists/dicts for None values
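A minimal construction sketch for the two new options (field names come from the diff; the values are illustrative, and it is assumed here that the remaining CrawlConfig fields all have defaults):

    config = CrawlConfig(
        output_dir="./crawls",
        combine_to_one_markdown='metadata_headers',  # or 'full' to keep complete pages
        # combined_markdown_file is optional; __post_init__ derives a default (see below)
    )

As the save-path changes further down show, 'full' writes each page to its own file and appends it to the combined file, while 'metadata_headers' skips the per-page files and appends only the reduced content.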
@@ -72,12 +98,21 @@ class CrawlConfig:
         self.output_dir = Path(self.output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
 
-        #
+        # Setup report file if save_reports is True
         if self.save_reports:
             if self.report_file is None:
                 self.report_file = self.output_dir / "crawl_report.json"
             else:
                 self.report_file = Path(self.report_file)
+
+        # Setup combined markdown file if needed
+        if self.combine_to_one_markdown:
+            if self.combined_markdown_file is None:
+                self.combined_markdown_file = self.output_dir / "combined_content.md"
+            else:
+                self.combined_markdown_file = Path(self.combined_markdown_file)
+            # Create or clear the combined file
+            self.combined_markdown_file.write_text('')
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary for API requests."""
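Because `__post_init__` ends with `write_text('')`, merely constructing a config with combining enabled creates the combined file, and empties it if one already exists at that path. A small sketch of the resulting defaults (paths are illustrative):

    cfg = CrawlConfig(output_dir="./out", combine_to_one_markdown='full')
    print(cfg.combined_markdown_file)              # out/combined_content.md
    print(cfg.combined_markdown_file.read_text())  # '' — created or cleared by write_text('')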
@@ -158,9 +193,19 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         # Save markdown if output directory is configured
         if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
+            # Save individual file if not combining or if combining in full mode
+            if not config.combine_to_one_markdown or config.combine_to_one_markdown == 'full':
+                filepath = config.output_dir / f"{slugify(url)}.md"
+                with open(filepath, 'w', encoding='utf-8') as f:
+                    f.write(markdown)
+
+            # Handle combined markdown file
+            if config.combine_to_one_markdown:
+                content = markdown if config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+                combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+                with open(config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                    f.write(combined_content)
 
         result = CrawlResult(
             url=url,
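Each appended record is framed by `----PAGE----` and `----PAGE END----` markers with the URL on the first line, so the combined file can be split back into per-page records. A hypothetical reader, not part of the package, assuming page content never contains the markers themselves:

    from pathlib import Path

    def read_combined(path):
        """Yield (url, content) pairs from a combined markdown file."""
        text = Path(path).read_text(encoding='utf-8')
        for block in text.split('----PAGE----')[1:]:
            body = block.split('----PAGE END----')[0]
            # Record layout: "\n{url}\n\n{content}\n"
            url, _, content = body.lstrip('\n').partition('\n\n')
            yield url.strip(), content.strip()

    for url, content in read_combined('./out/combined_content.md'):
        print(url, len(content))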
@@ -209,12 +254,21 @@ class SpiderForce4AI:
             await self.session.close()
 
     async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
-        """Save markdown content to file."""
-        filename = f"{slugify(url)}.md"
-        filepath = output_dir / filename
-        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
-            await f.write(markdown)
-
+        """Save markdown content to file and/or append to combined file."""
+        # Save individual file if not combining or if combining in full mode
+        if not self.config.combine_to_one_markdown or self.config.combine_to_one_markdown == 'full':
+            filename = f"{slugify(url)}.md"
+            filepath = output_dir / filename
+            async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
+                await f.write(markdown)
+
+        # Handle combined markdown file
+        if self.config.combine_to_one_markdown:
+            content = markdown if self.config.combine_to_one_markdown == 'full' else extract_metadata_headers(markdown)
+            combined_content = f"\n----PAGE----\n{url}\n\n{content}\n----PAGE END----\n"
+
+            async with aiofiles.open(self.config.combined_markdown_file, 'a', encoding='utf-8') as f:
+                await f.write(combined_content)
 
 
 
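The async variant mirrors the parallel worker but writes through aiofiles. Because every call re-opens the combined file in append mode, concurrent `_save_markdown` coroutines could in principle interleave their records; if ordering matters, a caller-side `asyncio.Lock` is one way to serialize the appends (a sketch under that assumption, not something the package itself does):

    import asyncio

    combined_write_lock = asyncio.Lock()

    async def save_page_serialized(crawler, url, markdown, output_dir):
        # Hypothetical wrapper: serialize appends to the shared combined file
        async with combined_write_lock:
            await crawler._save_markdown(url, markdown, output_dir)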
@@ -296,17 +350,25 @@ class SpiderForce4AI:
 
                 results.append(result)
 
-        #
-        if config.save_reports:
-            self._save_report_sync(results, config)
-            print(f"\nReport saved to: {config.report_file}")
-
-        # Print summary
+        # Calculate statistics
         successful = len([r for r in results if r.status == "success"])
         failed = len([r for r in results if r.status == "failed"])
+
+        # Print summary
         print(f"\nParallel processing completed:")
         print(f"✓ Successful: {successful}")
         print(f"✗ Failed: {failed}")
+
+        # Save report if enabled
+        if config.save_reports and config.report_file:
+            self._retry_stats = {
+                "initial_failures": failed,
+                "failure_ratio": (failed / len(urls)) * 100,
+                "retry_successful": 0,  # No retries in server parallel mode
+                "retry_failed": failed
+            }
+            self._save_report_sync(results, config)
+            console.print(f"📊 Report saved to: {config.report_file}")
 
         return results
 
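For concreteness, on a hypothetical run of 8 URLs with 2 failures the new bookkeeping would record the values below (note that an empty URL list would make `failure_ratio` divide by zero):

    urls = [f"https://example.com/page-{i}" for i in range(8)]  # hypothetical
    failed = 2
    retry_stats = {
        "initial_failures": failed,                   # 2
        "failure_ratio": (failed / len(urls)) * 100,  # 25.0
        "retry_successful": 0,                        # no retries in server parallel mode
        "retry_failed": failed,                       # 2
    }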
spiderforce4ai-1.6.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=yNyBz8-HjENlAZ1NSy072Ir71T68xulTRj19Yxr1aEQ,33573
+spiderforce4ai-1.6.dist-info/METADATA,sha256=pelYvJYMzC8W-P2ORQNWwP2Fyc5KshnzQ6edoYEYZQU,7183
+spiderforce4ai-1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.6.dist-info/RECORD,,
@@ -1,5 +0,0 @@
|
|
1
|
-
spiderforce4ai/__init__.py,sha256=7EMEEfFY3WLq58-vnK1Yhcb1trF2ZXU-Ny3licz45Yk,30585
|
2
|
-
spiderforce4ai-1.4.dist-info/METADATA,sha256=7GRBz_bTtXOQ2N-gHRPJFEWW8mmOB_1gwrJCf-el8LM,7183
|
3
|
-
spiderforce4ai-1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
4
|
-
spiderforce4ai-1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
|
5
|
-
spiderforce4ai-1.4.dist-info/RECORD,,
|
{spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/WHEEL
File without changes

{spiderforce4ai-1.4.dist-info → spiderforce4ai-1.6.dist-info}/top_level.txt
File without changes