spiderforce4ai 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- spiderforce4ai-0.1.0/spiderforce4ai/__init__.py
+++ spiderforce4ai-0.1.2/spiderforce4ai/__init__.py
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing import Pool
 
 console = Console()
 
@@ -300,4 +301,125 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Create process pool and execute crawls
+        results = []
+        with Pool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    progress.update(task, advance=1)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.description = f"Last: {status} {result.url}"
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print(f"\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
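
One caveat in the new code: _crawl_single is defined inside crawl_sitemap_parallel and closes over self and config, but multiprocessing.Pool pickles each (function, arguments) task to ship it to its worker processes, and locally defined closures cannot be pickled by the standard pickle module. As written, pool.imap_unordered(_crawl_single, urls) would therefore be expected to fail with a "Can't pickle local object" error. A minimal sketch of the usual workaround, a module-level worker that receives everything it needs as a plain-data task tuple (all names below are illustrative, not part of the package):

import requests
from multiprocessing import Pool

# Module-level worker: picklable because it is defined at top level and
# takes only plain-data arguments instead of closing over self/config.
def _crawl_one(task):
    url, endpoint, payload, timeout = task
    try:
        resp = requests.post(endpoint, json={"url": url, **payload}, timeout=timeout)
        status = "success" if resp.status_code == 200 else "failed"
        return (url, status, resp.text)
    except Exception as exc:
        return (url, "failed", str(exc))

def crawl_parallel(urls, endpoint, payload, timeout=30, processes=4):
    tasks = [(url, endpoint, payload, timeout) for url in urls]
    with Pool(processes=processes) as pool:
        # imap_unordered yields each result as soon as a worker finishes it
        return list(pool.imap_unordered(_crawl_one, tasks))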
--- spiderforce4ai-0.1.0.dist-info/METADATA
+++ spiderforce4ai-0.1.2.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.2
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
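
For context, a minimal sketch of how the new parallel method might be called. The CrawlConfig field names are inferred from the attributes the new method reads (max_concurrent_requests, request_delay, timeout, output_dir, report_file), and the service URL is hypothetical; neither is confirmed by this diff:

from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# Assumed constructor arguments, inferred from the method body above.
config = CrawlConfig(
    max_concurrent_requests=4,        # size of the multiprocessing Pool
    request_delay=0.5,                # seconds to sleep after each request
    timeout=30,
    output_dir=Path("./markdown"),    # one slugified .md file per URL
    report_file=Path("./report.json"),
)

spider = SpiderForce4AI("http://localhost:3004")  # hypothetical service URL
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(r.status == "success" for r in results), "pages converted")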
--- /dev/null
+++ spiderforce4ai-0.1.2.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
+spiderforce4ai-0.1.2.dist-info/METADATA,sha256=DmzqJ_eAXf4XEL3b8ZjOgaqIFwwz2DVHQheyBwXTSGY,6214
+spiderforce4ai-0.1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.2.dist-info/RECORD,,
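
Each RECORD row follows the wheel format: path,sha256=<digest>,<size-in-bytes>, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with trailing '=' padding stripped (note __init__.py growing from 11815 to 16633 bytes, consistent with the added methods). A small sketch of how such a row can be checked locally:

import base64
import hashlib
from pathlib import Path

def record_hash(path):
    # Hash the file and encode it the way wheel RECORD files expect:
    # urlsafe base64 with the trailing '=' padding removed.
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("spiderforce4ai/__init__.py") should return
# "sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg" for the 0.1.2 wheel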
--- spiderforce4ai-0.1.0.dist-info/RECORD
+++ /dev/null
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=TTUtXHp4QvFLhh4vgh0bCvYAyJEAZ-8xguoBNVcQUZI,11815
-spiderforce4ai-0.1.0.dist-info/METADATA,sha256=X2Y8tb-sgJ_8fnilV9yHA_qM3xE1OQmTZPtXohT2nsg,6174
-spiderforce4ai-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.0.dist-info/RECORD,,