spiderforce4ai 0.1.0.tar.gz → 0.1.3.tar.gz

{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.3
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/README.md
@@ -1,4 +1,4 @@
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
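For context, here is a minimal usage sketch of the parallel sitemap crawl that 0.1.3 adds (shown in the module hunk below). Only `crawl_sitemap_parallel(sitemap_url, config)` and the `CrawlConfig` fields it reads are visible in this diff; the `SpiderForce4AI` constructor argument and the `CrawlConfig` keyword names are assumptions inferred from how the code uses them.

```python
# Hypothetical usage sketch; the constructor argument and CrawlConfig keyword
# names are inferred from the 0.1.3 diff, not from documented API.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")  # assumed service base URL

output_dir = Path("./markdown")
output_dir.mkdir(parents=True, exist_ok=True)  # the crawler writes into output_dir but does not create it

config = CrawlConfig(
    max_concurrent_requests=4,        # worker pool size
    request_delay=0.5,                # seconds slept after each request
    timeout=30,                       # per-request timeout in seconds
    output_dir=output_dir,            # one slugified .md file per URL
    report_file=Path("./crawl_report.json"),
)

results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
for r in results:
    print(r.status, r.url)
```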
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.0"
+version = "0.1.3"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/spiderforce4ai/__init__.py
@@ -20,6 +20,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+from multiprocessing.pool import ThreadPool
 
 console = Console()
 
@@ -300,4 +301,126 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
+
+    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs in parallel using a thread pool (no asyncio required).
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap, deriving the XML namespace from the root tag
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        def _crawl_single(url: str) -> CrawlResult:
+            try:
+                endpoint = f"{self.base_url}/convert"
+                payload = {
+                    "url": url,
+                    **config.to_dict()
+                }
+
+                response = requests.post(endpoint, json=payload, timeout=config.timeout)
+                if response.status_code != 200:
+                    return CrawlResult(
+                        url=url,
+                        status="failed",
+                        error=f"HTTP {response.status_code}: {response.text}",
+                        config=config.to_dict()
+                    )
+
+                markdown = response.text
+
+                # Save markdown if output directory is configured
+                if config.output_dir:
+                    filepath = config.output_dir / f"{slugify(url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(markdown)
+
+                # Add delay if configured
+                if config.request_delay:
+                    time.sleep(config.request_delay)
+
+                return CrawlResult(
+                    url=url,
+                    status="success",
+                    markdown=markdown,
+                    config=config.to_dict()
+                )
+
+            except Exception as e:
+                return CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                )
+
+        # Crawl in parallel. A thread pool (not a process Pool) is used here
+        # because _crawl_single is a local closure, which multiprocessing
+        # cannot pickle; the work is I/O-bound, so threads parallelize it well.
+        results = []
+        with ThreadPool(processes=config.max_concurrent_requests) as pool:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                BarColumn(),
+                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
+                TextColumn("({task.completed}/{task.total})"),
+            ) as progress:
+                task = progress.add_task("Crawling URLs...", total=len(urls))
+
+                for result in pool.imap_unordered(_crawl_single, urls):
+                    results.append(result)
+                    status = "✓" if result.status == "success" else "✗"
+                    progress.update(task, advance=1, description=f"Last: {status} {result.url}")
+
+        # Save final report
+        if config.report_file:
+            self._save_report_sync(results, config)
+            print(f"\nReport saved to: {config.report_file}")
+
+        # Print summary
+        successful = len([r for r in results if r.status == "success"])
+        failed = len([r for r in results if r.status == "failed"])
+        print("\nCrawling completed:")
+        print(f"✓ Successful: {successful}")
+        print(f"✗ Failed: {failed}")
+
+        return results
+
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
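Two details of the code above are worth unpacking. First, the sitemap parser derives the XML namespace from the root tag instead of hard-coding the sitemaps.org URI, so it tolerates whatever namespace the server declares. A self-contained sketch of that technique (the sitemap literal is illustrative):

```python
import xml.etree.ElementTree as ET

# Illustrative sitemap; any namespace URI declared by the server would work.
sitemap_text = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

root = ET.fromstring(sitemap_text)
# root.tag is '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset', so
# splitting on '}' and stripping '{' recovers the namespace URI.
namespace = {'ns': root.tag.split('}')[0].strip('{')}
urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
print(urls)  # ['https://example.com/', 'https://example.com/about']
```

Second, `pool.imap_unordered` yields results as workers finish rather than in submission order, which is what lets the progress bar advance steadily even when individual URLs are slow.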
{spiderforce4ai-0.1.0 → spiderforce4ai-0.1.3}/spiderforce4ai.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.0
+Version: 0.1.3
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -22,7 +22,7 @@ Dynamic: author
 Dynamic: home-page
 Dynamic: requires-python
 
-# SpiderForce4AI Python Wrapper
+# SpiderForce4AI Python Wrapper (Jina AI Reader, Firecrawl alternative)
 
 A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
 
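Finally, a sketch of consuming the JSON report that `_save_report_sync` writes. The key structure mirrors the dict built in the module hunk above; the filename is an assumption (it is whatever `report_file` was configured to).

```python
import json

with open("crawl_report.json", encoding="utf-8") as f:  # assumed report_file value
    report = json.load(f)

# Keys mirror the report dict assembled in _save_report_sync.
print(report["timestamp"], report["summary"])
for failed in report["results"]["failed"]:
    print(failed["url"], failed["error"])
```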