spiderforce4ai 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
- spiderforce4ai/__init__.py +123 -1
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA +2 -2
- spiderforce4ai-0.1.2.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.0.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing import Pool
 
 console = Console()
 
@@ -300,4 +301,125 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Create process pool and execute crawls
+        results = []
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"Last: {status} {result.url}"
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print(f"\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
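The hunk above adds a synchronous entry point, crawl_sitemap_parallel(), which fans the sitemap URLs out over a multiprocessing Pool instead of an asyncio event loop. A minimal usage sketch follows; the CrawlConfig keyword names are assumptions inferred from the attributes referenced in the diff (timeout, max_concurrent_requests, request_delay, output_dir, report_file), and the service address is a placeholder, so treat this as an illustration rather than the documented constructor.

# Hypothetical usage sketch for the new crawl_sitemap_parallel() method.
# CrawlConfig keyword names and the service URL are assumptions inferred
# from the diff above, not a confirmed public API.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

config = CrawlConfig(
    max_concurrent_requests=4,        # size of the multiprocessing Pool
    request_delay=0.5,                # optional per-request sleep, in seconds
    timeout=30,                       # HTTP timeout passed to requests
    output_dir=Path("output"),        # one .md file per crawled URL
    report_file=Path("report.json"),  # JSON summary written by _save_report_sync()
)

client = SpiderForce4AI("http://localhost:3004")  # base_url of the conversion service (assumed)
results = client.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(1 for r in results if r.status == "success"), "pages converted")

Because each URL is dispatched to a worker process from the Pool, the caller never touches an event loop, which is the point of the "(no asyncio required)" docstring.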
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.2
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
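The description above summarizes the conversion service the wrapper talks to. Judging from the /convert call added in the __init__.py diff, each page conversion reduces to a single HTTP POST; the sketch below reconstructs that call under stated assumptions (the service address is a placeholder, and the extra payload keys produced by CrawlConfig.to_dict() are not visible in this diff, so only the url field is shown).

# Hedged sketch of the per-URL HTTP request that crawl_sitemap_parallel() issues,
# reconstructed from the diff above. base_url is an assumed placeholder and the
# additional CrawlConfig.to_dict() payload keys are omitted because they are not
# visible here.
import requests

base_url = "http://localhost:3004"                  # assumed service address
payload = {"url": "https://example.com/page.html"}  # real code also merges **config.to_dict()

response = requests.post(f"{base_url}/convert", json=payload, timeout=30)
response.raise_for_status()
markdown = response.text  # the service returns Markdown as the response body
print(markdown[:200])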
spiderforce4ai-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
+spiderforce4ai-0.1.2.dist-info/METADATA,sha256=DmzqJ_eAXf4XEL3b8ZjOgaqIFwwz2DVHQheyBwXTSGY,6214
+spiderforce4ai-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.2.dist-info/RECORD,,
spiderforce4ai-0.1.0.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
-spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
-spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.0.dist-info/RECORD,,
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt
File without changes