PyPI - pyaccelerate - Versions diffs - 0.1.0__py3-none-any.whl - Mend

pyaccelerate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

pyaccelerate/__init__.py +52 -0
pyaccelerate/benchmark.py +308 -0
pyaccelerate/cli.py +156 -0
pyaccelerate/cpu.py +324 -0
pyaccelerate/engine.py +326 -0
pyaccelerate/gpu/__init__.py +44 -0
pyaccelerate/gpu/cuda.py +135 -0
pyaccelerate/gpu/detector.py +363 -0
pyaccelerate/gpu/dispatch.py +171 -0
pyaccelerate/gpu/intel.py +93 -0
pyaccelerate/gpu/opencl.py +161 -0
pyaccelerate/memory.py +144 -0
pyaccelerate/profiler.py +253 -0
pyaccelerate/py.typed +0 -0
pyaccelerate/threads.py +410 -0
pyaccelerate/virt.py +248 -0
pyaccelerate-0.1.0.dist-info/METADATA +271 -0
pyaccelerate-0.1.0.dist-info/RECORD +22 -0
pyaccelerate-0.1.0.dist-info/WHEEL +5 -0
pyaccelerate-0.1.0.dist-info/entry_points.txt +2 -0
pyaccelerate-0.1.0.dist-info/licenses/LICENSE +21 -0
pyaccelerate-0.1.0.dist-info/top_level.txt +1 -0

pyaccelerate/__init__.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""
+PyAccelerate — High-performance Python acceleration engine.
+Modules
+-------
+- **cpu**        : CPU detection, core count, frequency, affinity, NUMA topology
+- **threads**    : Virtual thread pool, sliding-window executor, async bridge
+- **gpu**        : Multi-vendor GPU detection, ranking, dispatch (CUDA/OpenCL/Intel)
+- **virt**       : Virtualization detection (Hyper-V, VT-x/AMD-V, WSL2, Docker)
+- **memory**     : Memory monitoring, pressure detection, pool allocator
+- **profiler**   : Decorator-based profiling & timing utilities
+- **benchmark**  : Built-in micro-benchmarks for the current host
+- **engine**     : Unified orchestrator that auto-tunes all subsystems
+Quick start::
+    from pyaccelerate import Engine
+    engine = Engine()          # auto-detects hardware
+    print(engine.summary())    # human-readable report
+    # Use the shared virtual-thread pool
+    from pyaccelerate.threads import get_pool, run_parallel
+    pool = get_pool()
+    fut = pool.submit(my_io_func, arg1, arg2)
+    # GPU compute
+    from pyaccelerate.gpu import detect_all, best_gpu, dispatch
+    gpus = detect_all()
+    result = dispatch(my_kernel, data, gpus=gpus)
+"""
+from importlib.metadata import PackageNotFoundError, version as _version
+# Resolve the package version: installed distribution metadata first, then a
+# VERSION file located two directories above the one containing this module.
+try:
+    __version__: str = _version("pyaccelerate")
+except PackageNotFoundError:
+    # No distribution metadata found (e.g. running from a source checkout).
+    from pathlib import Path as _Path
+    _vf = _Path(__file__).resolve().parent.parent.parent / "VERSION"
+    if _vf.exists():
+        __version__ = _vf.read_text().strip()
+    else:
+        # Neither metadata nor a VERSION file: use a placeholder version.
+        __version__ = "0.0.0-dev"
+# Convenience re-exports
+from pyaccelerate.engine import Engine  # noqa: E402, F401
+__all__ = [
+    "__version__",
+    "Engine",
+]

pyaccelerate/benchmark.py ADDED Viewed

@@ -0,0 +1,308 @@
+"""
+pyaccelerate.benchmark — Built-in micro-benchmarks for the current host.
+Runs quick benchmarks to characterise host performance:
+  - CPU single-thread & multi-thread throughput
+  - Memory bandwidth
+  - I/O thread pool latency
+  - GPU compute throughput (if available)
+Results are returned as structured dicts suitable for logging, dashboards
+or automated tuning decisions.
+Usage::
+    from pyaccelerate.benchmark import run_all, run_cpu, run_gpu
+    report = run_all()
+    print(report)
+"""
+from __future__ import annotations
+import hashlib
+import logging
+import math
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import Any, Dict, List, Optional
+# Module-level logger registered under the "pyaccelerate.benchmark" name.
+log = logging.getLogger("pyaccelerate.benchmark")
+# ═══════════════════════════════════════════════════════════════════════════
+#  CPU benchmarks
+# ═══════════════════════════════════════════════════════════════════════════
+def run_cpu(
+    iterations: int = 500_000,
+    hash_bytes: int = 4096,
+) -> Dict[str, Any]:
+    """Time a single-threaded math loop followed by an MD5 hashing loop.
+    Parameters
+    ----------
+    iterations : int
+        Number of passes executed by each of the two loops.
+    hash_bytes : int
+        Size of the random payload that is hashed on every pass.
+    Returns
+    -------
+    dict
+        Per-workload timings in seconds (rounded to 4 places), their sum,
+        and integer ops/sec rates. A rate is 0 when its timing is not positive.
+    """
+    def _rate(seconds: float) -> int:
+        # Avoid dividing by a zero-length measurement.
+        return int(iterations / seconds) if seconds > 0 else 0
+    # Math workload: accumulate sqrt(n) * sin(n) for n in 1..iterations.
+    started = time.perf_counter()
+    acc = 0.0
+    for n in range(1, iterations + 1):
+        acc += math.sqrt(n) * math.sin(n)
+    math_elapsed = time.perf_counter() - started
+    # Hash workload: the payload is generated before the clock starts.
+    payload = os.urandom(hash_bytes)
+    started = time.perf_counter()
+    for _ in range(iterations):
+        hashlib.md5(payload).hexdigest()
+    hash_elapsed = time.perf_counter() - started
+    report: Dict[str, Any] = {
+        "benchmark": "cpu_single_thread",
+        "iterations": iterations,
+    }
+    report["math_time_s"] = round(math_elapsed, 4)
+    report["hash_time_s"] = round(hash_elapsed, 4)
+    report["total_s"] = round(math_elapsed + hash_elapsed, 4)
+    report["math_ops_per_sec"] = _rate(math_elapsed)
+    report["hash_ops_per_sec"] = _rate(hash_elapsed)
+    return report
+def run_cpu_multithread(
+    iterations: int = 200_000,
+    workers: int = 0,
+) -> Dict[str, Any]:
+    """Time the math workload split across a standard thread pool.
+    The index range ``[0, iterations)`` is divided into ``workers`` contiguous
+    slices whose sizes differ by at most one, so every requested iteration is
+    executed even when ``iterations`` is not a multiple of ``workers`` or is
+    smaller than it.
+    NOTE(review): the work is pure-Python math, so on interpreter builds with
+    a GIL the result presumably reflects thread overhead rather than parallel
+    speed-up -- confirm before using it for tuning decisions.
+    Parameters
+    ----------
+    iterations : int
+        Total number of math operations to run across all threads.
+    workers : int
+        Thread count; values <= 0 select ``os.cpu_count()`` (4 if unknown).
+    Returns
+    -------
+    dict
+        The requested ``iterations``, the effective ``workers``, the elapsed
+        time in seconds (rounded to 4 places) and an integer ``ops_per_sec``
+        (0 when the elapsed time is not positive).
+    """
+    if workers <= 0:
+        workers = os.cpu_count() or 4
+    def _work(start: int, end: int) -> float:
+        total = 0.0
+        for i in range(start, end):
+            total += math.sqrt(i) * math.sin(i)
+        return total
+    # Spread the remainder over the first slices instead of dropping it:
+    # truncating with ``iterations // workers`` runs fewer operations than
+    # the ops/sec figure below is computed from (none at all when
+    # iterations < workers).
+    base, extra = divmod(max(iterations, 0), workers)
+    ranges = []
+    lo = 0
+    for idx in range(workers):
+        hi = lo + base + (1 if idx < extra else 0)
+        ranges.append((lo, hi))
+        lo = hi
+    t0 = time.perf_counter()
+    with ThreadPoolExecutor(max_workers=workers) as pool:
+        futures = [pool.submit(_work, s, e) for s, e in ranges]
+        # Wait for every slice; result() also re-raises worker exceptions.
+        for fut in futures:
+            fut.result()
+    elapsed = time.perf_counter() - t0
+    return {
+        "benchmark": "cpu_multi_thread",
+        "iterations": iterations,
+        "workers": workers,
+        "time_s": round(elapsed, 4),
+        "ops_per_sec": int(iterations / elapsed) if elapsed > 0 else 0,
+    }
+# ═══════════════════════════════════════════════════════════════════════════
+#  Thread pool latency
+# ═══════════════════════════════════════════════════════════════════════════
+def run_thread_pool_latency(
+    tasks: int = 1000,
+    pool_size: int = 0,
+) -> Dict[str, Any]:
+    """Measure the submit-to-result round trip of no-op tasks on the shared pool.
+    Each task is submitted to the pool returned by ``get_pool()`` and awaited
+    before the next one is submitted, so the samples are taken one at a time.
+    Parameters
+    ----------
+    tasks : int
+        Number of no-op tasks to time.
+    pool_size : int
+        Value echoed in the report; values <= 0 are replaced by
+        ``io_pool_size()``. It is not passed to ``get_pool()``.
+    Returns
+    -------
+    dict
+        Average, 95th-percentile, minimum and maximum latency in microseconds
+        (rounded to 1 place); all are 0 when no task was timed.
+    """
+    from pyaccelerate.threads import get_pool, io_pool_size
+    if pool_size <= 0:
+        pool_size = io_pool_size()
+    pool = get_pool()
+    def _us(seconds: float) -> float:
+        # Seconds -> microseconds, one decimal place.
+        return round(seconds * 1_000_000, 1)
+    samples: list[float] = []
+    for _ in range(tasks):
+        began = time.perf_counter()
+        pool.submit(lambda: None).result()
+        samples.append(time.perf_counter() - began)
+    ordered = sorted(samples)
+    if ordered:
+        mean = sum(samples) / len(samples)
+        p95 = _us(ordered[int(len(samples) * 0.95)])
+        fastest = _us(ordered[0])
+        slowest = _us(ordered[-1])
+    else:
+        mean = 0
+        p95 = fastest = slowest = 0
+    return {
+        "benchmark": "thread_pool_latency",
+        "tasks": tasks,
+        "pool_size": pool_size,
+        "avg_latency_us": _us(mean),
+        "p95_latency_us": p95,
+        "min_latency_us": fastest,
+        "max_latency_us": slowest,
+    }
+# ═══════════════════════════════════════════════════════════════════════════
+#  Memory bandwidth
+# ═══════════════════════════════════════════════════════════════════════════
+def run_memory_bandwidth(
+    size_mb: int = 64,
+    iterations: int = 10,
+) -> Dict[str, Any]:
+    """Time repeated buffer allocation and repeated hashing of one buffer.
+    The "write" phase allocates a zero-filled ``bytearray`` of ``size_mb``
+    MiB on every pass; the "read" phase computes an MD5 digest of the last
+    buffer allocated on every pass.
+    Parameters
+    ----------
+    size_mb : int
+        Buffer size in units of 1024 * 1024 bytes.
+    iterations : int
+        Number of passes in each phase.
+    Returns
+    -------
+    dict
+        ``write_gbps`` and ``read_gbps`` computed as bytes / seconds / 1024**3
+        and rounded to 2 places (0 when a phase's time is not positive).
+    """
+    nbytes = size_mb * 1024 * 1024
+    # Write phase: allocate a fresh buffer on every pass.
+    started = time.perf_counter()
+    for _ in range(iterations):
+        block = bytearray(nbytes)
+    write_elapsed = time.perf_counter() - started
+    # Read phase: hashing forces the whole buffer to be read.
+    started = time.perf_counter()
+    for _ in range(iterations):
+        hashlib.md5(block).digest()
+    read_elapsed = time.perf_counter() - started
+    moved = nbytes * iterations
+    def _gbps(seconds: float) -> Any:
+        if seconds > 0:
+            return round(moved / seconds / (1024 ** 3), 2)
+        return 0
+    return {
+        "benchmark": "memory_bandwidth",
+        "size_mb": size_mb,
+        "iterations": iterations,
+        "write_gbps": _gbps(write_elapsed),
+        "read_gbps": _gbps(read_elapsed),
+    }
+# ═══════════════════════════════════════════════════════════════════════════
+#  GPU benchmark
+# ═══════════════════════════════════════════════════════════════════════════
+def run_gpu(
+    size: int = 10_000_000,
+    iterations: int = 100,
+) -> Dict[str, Any]:
+    """Run the element-wise compute benchmark for the best detected GPU.
+    The backend reported by ``best_gpu()`` selects one of the ``cuda``,
+    ``opencl`` or ``intel`` benchmark helpers. When ``gpu_available()`` is
+    false, or the backend is none of those three, nothing is benchmarked and
+    a record with ``"available": False`` and an explanatory ``note`` is
+    returned.
+    Parameters
+    ----------
+    size : int
+        Number of array elements handed to the backend benchmark.
+    iterations : int
+        Number of timed passes handed to the backend benchmark.
+    """
+    from pyaccelerate.gpu import gpu_available, best_gpu
+    if not gpu_available():
+        return {
+            "benchmark": "gpu_compute",
+            "available": False,
+            "note": "No usable GPU — skipped",
+        }
+    gpu = best_gpu()
+    backend = gpu.backend if gpu else "none"
+    # Checked in the same order, and with the same equality test, as an
+    # if/elif chain over the three backend names.
+    runners = (
+        ("cuda", _bench_cuda),
+        ("opencl", _bench_opencl),
+        ("intel", _bench_intel),
+    )
+    for label, runner in runners:
+        if backend == label:
+            return runner(size, iterations, gpu)
+    return {"benchmark": "gpu_compute", "available": False, "note": "Unsupported backend"}
+def _bench_cuda(size: int, iterations: int, gpu: Any) -> Dict[str, Any]:
+    """Time ``a * b + a`` on CuPy float32 arrays on the device ``gpu._index``.
+    Reads ``gpu._index`` and ``gpu.name`` from the device record. Any
+    exception, including a missing ``cupy`` import, is reported through an
+    ``error`` entry in the returned dict instead of being raised.
+    """
+    try:
+        import cupy as cp  # type: ignore[import-untyped]
+        with cp.cuda.Device(gpu._index):
+            lhs = cp.random.random(size, dtype=cp.float32)
+            rhs = cp.random.random(size, dtype=cp.float32)
+            # Synchronize before starting the clock so that array creation
+            # is not included in the measurement.
+            cp.cuda.Device(gpu._index).synchronize()
+            started = time.perf_counter()
+            for _ in range(iterations):
+                product = lhs * rhs + lhs
+            # Synchronize again so queued work is finished before stopping.
+            cp.cuda.Device(gpu._index).synchronize()
+            elapsed = time.perf_counter() - started
+        # One multiply and one add per element per pass.
+        flop_count = size * 2 * iterations
+        gflops = round(flop_count / elapsed / 1e9, 2) if elapsed > 0 else 0
+        return {
+            "benchmark": "gpu_compute",
+            "available": True,
+            "backend": "cuda",
+            "device": gpu.name,
+            "elements": size,
+            "iterations": iterations,
+            "time_s": round(elapsed, 4),
+            "gflops": gflops,
+        }
+    except Exception as exc:
+        return {"benchmark": "gpu_compute", "available": True, "error": str(exc)}
+def _bench_opencl(size: int, iterations: int, gpu: Any) -> Dict[str, Any]:
+    """Time ``a * b + a`` on host-side NumPy float32 arrays as an OpenCL proxy.
+    No OpenCL call is made here, which the ``note`` entry of the result
+    states. Only ``gpu.name`` is read from the device record. Any exception
+    is reported through an ``error`` entry instead of being raised.
+    """
+    try:
+        import numpy as np  # type: ignore[import-untyped]
+        lhs = np.random.random(size).astype(np.float32)
+        rhs = np.random.random(size).astype(np.float32)
+        started = time.perf_counter()
+        for _ in range(iterations):
+            product = lhs * rhs + lhs
+        elapsed = time.perf_counter() - started
+        # One multiply and one add per element per pass.
+        flop_count = size * 2 * iterations
+        gflops = round(flop_count / elapsed / 1e9, 2) if elapsed > 0 else 0
+        return {
+            "benchmark": "gpu_compute",
+            "available": True,
+            "backend": "opencl",
+            "device": gpu.name,
+            "elements": size,
+            "iterations": iterations,
+            "time_s": round(elapsed, 4),
+            "gflops": gflops,
+            "note": "OpenCL benchmark uses numpy host-side as proxy",
+        }
+    except Exception as exc:
+        return {"benchmark": "gpu_compute", "available": True, "error": str(exc)}
+def _bench_intel(size: int, iterations: int, gpu: Any) -> Dict[str, Any]:
+    """Time ``a * b + a`` on ``dpnp`` float32 arrays.
+    Only ``gpu.name`` is read from the device record. Any exception,
+    including a missing ``dpnp`` import, is reported through an ``error``
+    entry in the returned dict instead of being raised.
+    NOTE(review): no explicit device synchronization brackets the timed
+    loop -- confirm whether dpnp needs one for the timing to be meaningful.
+    """
+    try:
+        import dpnp  # type: ignore[import-untyped]
+        lhs = dpnp.random.random(size).astype(dpnp.float32)
+        rhs = dpnp.random.random(size).astype(dpnp.float32)
+        started = time.perf_counter()
+        for _ in range(iterations):
+            product = lhs * rhs + lhs
+        elapsed = time.perf_counter() - started
+        # One multiply and one add per element per pass.
+        flop_count = size * 2 * iterations
+        gflops = round(flop_count / elapsed / 1e9, 2) if elapsed > 0 else 0
+        return {
+            "benchmark": "gpu_compute",
+            "available": True,
+            "backend": "intel",
+            "device": gpu.name,
+            "elements": size,
+            "iterations": iterations,
+            "time_s": round(elapsed, 4),
+            "gflops": gflops,
+        }
+    except Exception as exc:
+        return {"benchmark": "gpu_compute", "available": True, "error": str(exc)}
+# ═══════════════════════════════════════════════════════════════════════════
+#  Full suite
+# ═══════════════════════════════════════════════════════════════════════════
+def run_all(quick: bool = True) -> Dict[str, Any]:
+    """Run every benchmark in sequence and collect the reports in one dict.
+    Parameters
+    ----------
+    quick : bool
+        If True the base workload sizes are used; otherwise every size and
+        iteration count is multiplied by 5.
+    Returns
+    -------
+    dict
+        Reports keyed ``cpu_single``, ``cpu_multi``, ``thread_latency``,
+        ``memory`` and ``gpu``, in the order the benchmarks ran.
+    """
+    factor = 1 if quick else 5
+    # Dict values are evaluated top to bottom, so the benchmarks execute in
+    # the order they are listed.
+    return {
+        "cpu_single": run_cpu(iterations=100_000 * factor),
+        "cpu_multi": run_cpu_multithread(iterations=100_000 * factor),
+        "thread_latency": run_thread_pool_latency(tasks=200 * factor),
+        "memory": run_memory_bandwidth(size_mb=16 * factor, iterations=3 * factor),
+        "gpu": run_gpu(size=1_000_000 * factor, iterations=20 * factor),
+    }

pyaccelerate/cli.py ADDED Viewed

@@ -0,0 +1,156 @@
+"""
+pyaccelerate.cli — Command-line interface.
+Provides quick access to hardware detection, benchmarks and diagnostics::
+    pyaccelerate info          # Show full engine report
+    pyaccelerate benchmark     # Run micro-benchmarks
+    pyaccelerate gpu           # GPU detection details
+    pyaccelerate status        # One-line status
+"""
+from __future__ import annotations
+import argparse
+import json
+import logging
+import sys
+def _build_parser() -> argparse.ArgumentParser:
+    """Create the top-level parser with the global flag and all sub-commands."""
+    parser = argparse.ArgumentParser(
+        prog="pyaccelerate",
+        description="PyAccelerate — High-performance Python acceleration engine",
+    )
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Enable debug logging",
+    )
+    sub = parser.add_subparsers(dest="command")
+    sub.add_parser("info", help="Full engine report")
+    sub.add_parser("status", help="One-line status")
+    bench_p = sub.add_parser("benchmark", help="Run micro-benchmarks")
+    bench_p.add_argument("--full", action="store_true", help="Run full (slower) suite")
+    bench_p.add_argument("--json", action="store_true", dest="as_json", help="Output as JSON")
+    sub.add_parser("gpu", help="GPU detection details")
+    sub.add_parser("cpu", help="CPU detection details")
+    sub.add_parser("virt", help="Virtualization detection")
+    sub.add_parser("memory", help="Memory stats")
+    sub.add_parser("version", help="Show version")
+    return parser
+def _cmd_version(args: argparse.Namespace) -> None:
+    """Print the package name and version."""
+    from pyaccelerate import __version__
+    print(f"pyaccelerate {__version__}")
+def _cmd_info(args: argparse.Namespace) -> None:
+    """Print the engine's full summary report."""
+    from pyaccelerate.engine import Engine
+    engine = Engine()
+    print(engine.summary())
+def _cmd_status(args: argparse.Namespace) -> None:
+    """Print the engine's one-line status."""
+    from pyaccelerate.engine import Engine
+    engine = Engine()
+    print(engine.status_line())
+def _cmd_benchmark(args: argparse.Namespace) -> None:
+    """Run the benchmark suite and print it as JSON or as a sectioned listing."""
+    from pyaccelerate.benchmark import run_all
+    print("Running benchmarks...")
+    results = run_all(quick=not args.full)
+    if args.as_json:
+        print(json.dumps(results, indent=2))
+        return
+    for name, data in results.items():
+        print(f"\n{'─' * 50}")
+        print(f"  {name}")
+        print(f"{'─' * 50}")
+        if isinstance(data, dict):
+            for k, v in data.items():
+                print(f"  {k}: {v}")
+def _cmd_gpu(args: argparse.Namespace) -> None:
+    """Print one entry per detected GPU, then the install hint if there is one."""
+    from pyaccelerate.gpu import detect_all, get_install_hint
+    gpus = detect_all()
+    if not gpus:
+        print("No GPU detected.")
+    for i, g in enumerate(gpus):
+        print(f"\n[{i}] {g.short_label()}")
+        print(f"    Vendor: {g.vendor}  |  Backend: {g.backend}")
+        print(f"    VRAM: {g.memory_gb:.1f} GB  |  CUs: {g.compute_units}")
+        print(f"    Discrete: {g.is_discrete}  |  Score: {g.score}")
+        print(f"    Usable: {g.usable}")
+    # The hint is looked up whether or not a GPU was detected.
+    hint = get_install_hint()
+    if hint:
+        print(f"\n{hint}")
+def _cmd_cpu(args: argparse.Namespace) -> None:
+    """Print the fields of the detected CPU description."""
+    from pyaccelerate.cpu import detect
+    info = detect()
+    print(f"Brand:          {info.brand}")
+    print(f"Architecture:   {info.arch}")
+    print(f"Physical cores: {info.physical_cores}")
+    print(f"Logical cores:  {info.logical_cores}")
+    print(f"Frequency:      {info.frequency_mhz:.0f} MHz (boost: {info.frequency_max_mhz:.0f} MHz)")
+    print(f"NUMA nodes:     {info.numa_nodes}")
+    print(f"SMT ratio:      {info.smt_ratio:.1f}x")
+    if info.flags:
+        print(f"ISA flags:      {', '.join(info.flags)}")
+def _cmd_virt(args: argparse.Namespace) -> None:
+    """Print the detected virtualization features, or a message when none."""
+    from pyaccelerate.virt import detect
+    vi = detect()
+    parts = vi.summary_parts()
+    if parts:
+        print("Detected:", ", ".join(parts))
+    else:
+        print("No virtualization features detected.")
+def _cmd_memory(args: argparse.Namespace) -> None:
+    """Print the memory pressure level followed by each memory statistic."""
+    from pyaccelerate.memory import get_stats, get_pressure
+    stats = get_stats()
+    pressure = get_pressure()
+    print(f"Pressure: {pressure.name}")
+    for k, v in stats.items():
+        # The "error" entry is skipped rather than formatted as a number.
+        if k == "error":
+            continue
+        # NOTE(review): assumes every remaining value is numeric -- confirm
+        # against get_stats().
+        print(f"  {k}: {v:.2f}")
+# Sub-command name -> handler; every handler takes the parsed namespace.
+_COMMANDS = {
+    "version": _cmd_version,
+    "info": _cmd_info,
+    "status": _cmd_status,
+    "benchmark": _cmd_benchmark,
+    "gpu": _cmd_gpu,
+    "cpu": _cmd_cpu,
+    "virt": _cmd_virt,
+    "memory": _cmd_memory,
+}
+def main(argv: list[str] | None = None) -> None:
+    """Parse the command line and run the selected sub-command.
+    Parameters
+    ----------
+    argv : list[str] | None
+        Arguments to parse; ``None`` lets argparse read ``sys.argv``.
+    Logging is configured at DEBUG level with ``-v/--verbose`` and at WARNING
+    level otherwise. When no sub-command is given, the help text is printed
+    and the function returns.
+    """
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG, format="%(name)s %(message)s")
+    else:
+        logging.basicConfig(level=logging.WARNING)
+    if args.command is None:
+        parser.print_help()
+        return
+    # Project modules are imported inside the handlers, so only the selected
+    # command's dependencies are loaded.
+    handler = _COMMANDS.get(args.command)
+    if handler is not None:
+        handler(args)
+# Run the CLI when this module is executed as a script.
+if __name__ == "__main__":
+    main()