spiderforce4ai 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- spiderforce4ai/__init__.py +123 -1
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA +2 -2
- spiderforce4ai-0.1.2.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.0.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing import Pool

 console = Console()

@@ -300,4 +301,125 @@ class SpiderForce4AI:

     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Create process pool and execute crawls
+        results = []
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"Last: {status} {result.url}"
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print(f"\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
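Note on the added code: crawl_sitemap_parallel fetches the sitemap, extracts every <loc> URL, and POSTs each one to the service's /convert endpoint from a multiprocessing Pool, optionally writing per-URL Markdown files and a JSON report. A minimal usage sketch follows; the SpiderForce4AI and CrawlConfig constructor signatures are not part of this diff, so the arguments shown (base URL, max_concurrent_requests, output_dir, report_file, request_delay, timeout) are assumptions based on the attributes the new method reads.

from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Assumed constructor arguments -- not shown in this diff.
service = SpiderForce4AI("http://localhost:3004")   # base_url of the conversion service
config = CrawlConfig(
    max_concurrent_requests=4,                # number of worker processes in the Pool
    output_dir=Path("./markdown"),            # one <slug>.md file per successfully crawled URL
    report_file=Path("./crawl_report.json"),  # written by _save_report_sync
    request_delay=0.5,                        # per-request pause inside each worker, in seconds
    timeout=30,                               # used for the sitemap fetch and each /convert call
)

results = service.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
for r in results:
    if r.status == "failed":
        print(f"{r.url}: {r.error}")

One caveat worth flagging: _crawl_single is defined inside the method, and multiprocessing.Pool pickles the callable it dispatches to worker processes; locally defined functions are generally not picklable, so this path may raise a pickling error at runtime and should be verified against the published wheel.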
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.2
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python

-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina ai reader, fFrecrawl alternative)

 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.

spiderforce4ai-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
+spiderforce4ai-0.1.2.dist-info/METADATA,sha256=DmzqJ_eAXf4XEL3b8ZjOgaqIFwwz2DVHQheyBwXTSGY,6214
+spiderforce4ai-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.2.dist-info/RECORD,,
spiderforce4ai-0.1.0.dist-info/RECORD
REMOVED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
-spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
-spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.0.dist-info/RECORD,,
{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/WHEEL
File without changes

{spiderforce4ai-0.1.0.dist-info → spiderforce4ai-0.1.2.dist-info}/top_level.txt
File without changes