autoinference-utils 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autoinference_utils-0.1.0/.gitignore +245 -0
- autoinference_utils-0.1.0/PKG-INFO +5 -0
- autoinference_utils-0.1.0/README.md +13 -0
- autoinference_utils-0.1.0/autoinference_utils/__init__.py +0 -0
- autoinference_utils-0.1.0/autoinference_utils/endpoint.py +506 -0
- autoinference_utils-0.1.0/pyproject.toml +9 -0
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
|
|
204
|
+
# Ruff stuff:
|
|
205
|
+
.ruff_cache/
|
|
206
|
+
|
|
207
|
+
# PyPI configuration file
|
|
208
|
+
.pypirc
|
|
209
|
+
|
|
210
|
+
# Marimo
|
|
211
|
+
marimo/_static/
|
|
212
|
+
marimo/_lsp/
|
|
213
|
+
__marimo__/
|
|
214
|
+
|
|
215
|
+
# Streamlit
|
|
216
|
+
.streamlit/secrets.toml
|
|
217
|
+
# General
|
|
218
|
+
.DS_Store
|
|
219
|
+
__MACOSX/
|
|
220
|
+
.AppleDouble
|
|
221
|
+
.LSOverride
|
|
222
|
+
Icon[
|
|
223
|
+
]
|
|
224
|
+
|
|
225
|
+
# Thumbnails
|
|
226
|
+
._*
|
|
227
|
+
|
|
228
|
+
# Files that might appear in the root of a volume
|
|
229
|
+
.DocumentRevisions-V100
|
|
230
|
+
.fseventsd
|
|
231
|
+
.Spotlight-V100
|
|
232
|
+
.TemporaryItems
|
|
233
|
+
.Trashes
|
|
234
|
+
.VolumeIcon.icns
|
|
235
|
+
.com.apple.timemachine.donotpresent
|
|
236
|
+
|
|
237
|
+
# Directories potentially created on remote AFP share
|
|
238
|
+
.AppleDB
|
|
239
|
+
.AppleDesktop
|
|
240
|
+
Network Trash Folder
|
|
241
|
+
Temporary Items
|
|
242
|
+
.apdisk
|
|
243
|
+
|
|
244
|
+
benchmark_results/
|
|
245
|
+
docs/
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# autoinference-utils
|
|
2
|
+
|
|
3
|
+
Shared endpoint abstractions (`SGLangEndpoint`, `VLLMEndpoint`) for autoinference deployments.
|
|
4
|
+
|
|
5
|
+
## Publishing
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
cd autoinference_utils
|
|
9
|
+
uv build
|
|
10
|
+
uv publish # requires PyPI token via UV_PUBLISH_TOKEN or --token
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Bump `version` in `pyproject.toml` before each release.
|
|
File without changes
|
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
"""Composable endpoint abstractions for inference server subprocess deployments.
|
|
2
|
+
|
|
3
|
+
Adapted from the timmy branch's utils/endpoint.py. Key differences from the
|
|
4
|
+
old SGLangFlash inheritance pattern: these are plain Python objects, not Modal
|
|
5
|
+
classes. Deployments compose them rather than inheriting from them.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import shlex
|
|
12
|
+
import subprocess
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
import urllib.error
|
|
16
|
+
import urllib.request
|
|
17
|
+
from abc import ABC
|
|
18
|
+
from typing import Any, Callable, Literal, Mapping, Optional, Sequence
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Endpoint(ABC):
    """A thing with a URL that you can start and stop.

    Subclasses override :meth:`start` / :meth:`stop`; the base class makes
    every endpoint usable as a context manager.
    """

    def __init__(self, base_url: str):
        # Normalize so callers can safely append "/path" fragments.
        self.base_url = base_url.rstrip("/")

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()
        # Never suppress exceptions raised inside the `with` body.
        return False

    def start(self):
        """Bring the endpoint up. Default: nothing to do."""

    def stop(self):
        """Tear the endpoint down. Default: nothing to do."""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class SGLangEndpoint(Endpoint):
    """Manages an SGLang server subprocess on the local machine.

    Builds the ``python -m sglang.launch_server`` command line from the
    configured options, spawns it, and blocks in :meth:`start` until the
    server's ``/health`` endpoint reports ready.
    """

    # Flags merged into every launch; entries in extra_server_args override
    # these. An empty-string value means "bare flag with no argument".
    DEFAULT_OPERATIONAL_ARGS: dict[str, str] = {
        "--enable-metrics": "",
        "--decode-log-interval": "1",
        "--enable-cache-report": "",
        "--model-loader-extra-config": '{"enable_multithread_load":true,"num_threads":64}',
    }

    def __init__(
        self,
        *,
        model_path: str,
        worker_port: int = 8000,
        tp: Optional[int] = None,
        ep: Optional[int] = None,
        dp: Optional[int] = None,
        speculative_model_path: Optional[str] = None,
        load_format: Optional[str] = None,
        nnodes: int = 1,
        node_rank: int = 0,
        dist_init_host: Optional[str] = None,
        dist_init_port: int = 1234,
        disaggregation_mode: Optional[Literal["prefill", "decode"]] = None,
        prefill_bootstrap_port: int = 8998,
        launcher_module: str = "sglang.launch_server",
        extra_server_args: Optional[dict[str, str]] = None,
        health_timeout: float = 20 * 60,
        health_poll_interval: float = 5.0,
        health_request_timeout: float = 5.0,
    ):
        super().__init__(base_url=f"http://localhost:{worker_port}")
        self.model_path = model_path
        self.worker_port = worker_port
        self.tp = tp
        self.ep = ep
        self.dp = dp
        self.speculative_model_path = speculative_model_path
        self.load_format = load_format
        self.nnodes = nnodes
        self.node_rank = node_rank
        self.dist_init_host = dist_init_host
        self.dist_init_port = dist_init_port
        self.disaggregation_mode = disaggregation_mode
        self.prefill_bootstrap_port = prefill_bootstrap_port
        self.launcher_module = launcher_module
        self.extra_server_args = dict(extra_server_args) if extra_server_args else {}
        self.health_timeout = health_timeout
        self.health_poll_interval = health_poll_interval
        self.health_request_timeout = health_request_timeout
        self._proc: Optional[subprocess.Popen] = None

        if self.disaggregation_mode not in (None, "prefill", "decode"):
            raise ValueError("disaggregation_mode must be None, 'prefill', or 'decode'")

    def _build_cmd(self) -> list[str]:
        """Assemble the sglang launch command from the configured options.

        Raises:
            ValueError: if ``nnodes > 1`` without a ``dist_init_host``.
        """
        cmd: list[str] = ["python", "-m", self.launcher_module]
        cmd += ["--host", "0.0.0.0"]
        cmd += ["--port", str(self.worker_port)]
        cmd += ["--model-path", self.model_path]

        # Optional string-valued flags, emitted only when configured.
        for flag, setting in (
            ("--speculative-draft-model-path", self.speculative_model_path),
            ("--load-format", self.load_format),
        ):
            if setting is not None:
                cmd += [flag, setting]

        if self.tp is not None:
            cmd += ["--tp", str(self.tp)]
        if self.ep is not None:
            cmd += ["--ep", str(self.ep)]
        if self.dp is not None:
            # DP attention must be turned on alongside data parallelism.
            cmd += ["--dp", str(self.dp), "--enable-dp-attention"]

        if self.disaggregation_mode is not None:
            cmd += ["--disaggregation-mode", self.disaggregation_mode]
            if self.disaggregation_mode == "prefill":
                # Only prefill workers expose a bootstrap port.
                cmd += ["--disaggregation-bootstrap-port", str(self.prefill_bootstrap_port)]

        if self.nnodes > 1:
            if self.dist_init_host is None:
                raise ValueError("dist_init_host is required when nnodes > 1")
            cmd += [
                "--nnodes", str(self.nnodes),
                "--node-rank", str(self.node_rank),
                "--dist-init-addr", f"{self.dist_init_host}:{self.dist_init_port}",
            ]

        # Caller-supplied args override the operational defaults key-by-key.
        for flag, setting in {**self.DEFAULT_OPERATIONAL_ARGS, **self.extra_server_args}.items():
            cmd.append(flag)
            if setting:
                cmd += setting.split()

        return cmd

    def health_check(self) -> str | None:
        """One health probe; None when healthy, else an error description.

        Raises subprocess.CalledProcessError if the server process exited.
        """
        return _health_check(
            f"http://127.0.0.1:{self.worker_port}/health",
            request_timeout=self.health_request_timeout,
            process=self._proc,
        )

    def start(self):
        """Spawn the server and block until its health endpoint is ready."""
        cmd = self._build_cmd()
        print(f"[endpoint] starting: {shlex.join(cmd)}")
        self._proc = subprocess.Popen(cmd)
        wait_ready(
            self._proc,
            port=self.worker_port,
            timeout=self.health_timeout,
            poll_interval=self.health_poll_interval,
            request_timeout=self.health_request_timeout,
        )

    def stop(self):
        """Terminate the server subprocess (no-op if not running)."""
        terminate_process(self._proc)
        self._proc = None
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class VLLMEndpoint(Endpoint):
    """Manages a vLLM server subprocess on the local machine.

    Mirrors SGLangEndpoint: build the launch command, spawn the server,
    poll its ``/health`` endpoint until ready, and terminate on stop().
    """

    def __init__(
        self,
        *,
        model: str,
        worker_port: int = 8000,
        extra_server_args: Optional[dict[str, str]] = None,
        health_timeout: float = 20 * 60,
        health_poll_interval: float = 5.0,
        health_request_timeout: float = 5.0,
    ):
        super().__init__(base_url=f"http://localhost:{worker_port}")
        self.model = model
        self.worker_port = worker_port
        self.extra_server_args = dict(extra_server_args) if extra_server_args else {}
        self.health_timeout = health_timeout
        self.health_poll_interval = health_poll_interval
        self.health_request_timeout = health_request_timeout
        self._proc: Optional[subprocess.Popen] = None

    def _build_cmd(self) -> list[str]:
        """Assemble the vLLM OpenAI-compatible server launch command."""
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--host", "0.0.0.0",
            "--port", str(self.worker_port),
            "--model", self.model,
        ]
        for key, value in self.extra_server_args.items():
            if value == "":
                # Bare flag with no argument.
                cmd.append(key)
            else:
                cmd.extend([key, *value.split()])
        return cmd

    def health_check(self) -> str | None:
        """One health probe; None when healthy, else an error description.

        Added for parity with SGLangEndpoint.health_check so heartbeat
        monitoring (start_heartbeat_thread) works uniformly across endpoint
        types. Raises subprocess.CalledProcessError if the server exited.
        """
        url = f"http://127.0.0.1:{self.worker_port}/health"
        return _health_check(url, request_timeout=self.health_request_timeout, process=self._proc)

    def start(self):
        """Spawn the server and block until its health endpoint is ready."""
        cmd = self._build_cmd()
        print(f"[vllm] starting: {shlex.join(cmd)}")
        self._proc = subprocess.Popen(cmd)
        wait_ready(
            self._proc,
            port=self.worker_port,
            timeout=self.health_timeout,
            poll_interval=self.health_poll_interval,
            request_timeout=self.health_request_timeout,
        )

    def stop(self):
        """Terminate the server subprocess (no-op if not running)."""
        terminate_process(self._proc)
        self._proc = None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class RouterEndpoint(Endpoint):
    """Manages an SGLang router process for PD disaggregation.

    ``pd_config`` is a sequence of ``(role, node_ip)`` pairs where role is
    one of ``"prefill"``, ``"decode"``, or ``"worker"``. start() waits for
    every node's health endpoint before launching the router itself.
    """

    def __init__(
        self,
        *,
        pd_config: Sequence[tuple[str, str]],
        worker_port: int = 8000,
        router_port: int = 9000,
        prefill_bootstrap_port: int = 8998,
        api_key: Optional[str] = None,
        health_timeout: float = 10 * 60,
        health_poll_interval: float = 5.0,
    ):
        super().__init__(base_url=f"http://localhost:{router_port}")
        self.pd_config = list(pd_config)
        self.worker_port = worker_port
        self.router_port = router_port
        self.prefill_bootstrap_port = prefill_bootstrap_port
        self.api_key = api_key
        self.health_timeout = health_timeout
        self.health_poll_interval = health_poll_interval
        self._proc: Optional[subprocess.Popen] = None

    def _build_cmd(self) -> list[str]:
        """Assemble the router launch command.

        Raises:
            ValueError: if a pd_config entry has an unknown role.
        """
        cmd = [
            "python", "-m", "sglang_router.launch_router",
            "--host", "0.0.0.0",
            "--port", str(self.router_port),
            "--prefill-policy", "cache_aware",
            "--decode-policy", "round_robin",
            "--max-concurrent-requests", "128",
            "--rate-limit-tokens-per-second", "0",
            "--queue-size", "0",
            "--health-check-timeout-secs", "600",
            "--log-level", "info",
            "--disable-circuit-breaker",
            "--request-timeout-secs", "3600",
        ]

        if self.api_key is not None:
            cmd += ["--api-key", self.api_key]

        # PD disaggregation is enabled when any node plays a prefill/decode role.
        roles = [role for role, _ in self.pd_config]
        if any(role in ("prefill", "decode") for role in roles):
            cmd.append("--pd-disaggregation")

        role_flags = {
            "prefill": lambda u: ["--prefill", u, str(self.prefill_bootstrap_port)],
            "decode": lambda u: ["--decode", u],
            "worker": lambda u: ["--worker-urls", u],
        }
        for role, node_ip in self.pd_config:
            node_url = f"http://{node_ip}:{self.worker_port}"
            try:
                cmd.extend(role_flags[role](node_url))
            except KeyError:
                raise ValueError(f"invalid pd_config role: {role}") from None

        return cmd

    def start(self):
        """Wait for all worker nodes, then launch and wait for the router."""
        for _, node_ip in self.pd_config:
            _wait_ready_url(
                f"http://{node_ip}:{self.worker_port}/health",
                timeout=self.health_timeout,
                poll_interval=self.health_poll_interval,
            )

        cmd = self._build_cmd()
        print(f"[router] starting: {shlex.join(cmd)}")
        self._proc = subprocess.Popen(cmd)
        _wait_ready_url(
            f"http://localhost:{self.router_port}/health",
            timeout=self.health_timeout,
            poll_interval=self.health_poll_interval,
        )

    def stop(self):
        """Terminate the router subprocess (no-op if not running)."""
        terminate_process(self._proc)
        self._proc = None
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# ---------------------------------------------------------------------------
|
|
298
|
+
# Shared helpers
|
|
299
|
+
# ---------------------------------------------------------------------------
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def wait_ready(
    process: subprocess.Popen,
    *,
    port: int,
    timeout: float,
    health_path: str = "/health",
    poll_interval: float = 5.0,
    request_timeout: float = 5.0,
) -> None:
    """Poll SGLang's HTTP health endpoint until ready, raising if the process dies.

    Args:
        process: the server subprocess to watch for premature exit.
        port: local port the server listens on.
        timeout: overall deadline in seconds.
        health_path: path of the health endpoint.
        poll_interval: seconds between probes.
        request_timeout: per-probe HTTP timeout.

    Raises:
        subprocess.CalledProcessError: if *process* exits before becoming
            healthy (propagated unchanged from _health_check).
        TimeoutError: if the endpoint is not healthy within *timeout* seconds.
    """
    deadline = time.time() + timeout
    url = f"http://127.0.0.1:{port}{health_path}"
    last_error = "no response yet"

    while time.time() < deadline:
        # _health_check raises CalledProcessError itself when the process has
        # exited; the previous try/except that only re-raised it was a no-op
        # and has been removed.
        error = _health_check(url, request_timeout=request_timeout, process=process)
        if error is None:
            return
        last_error = error
        time.sleep(poll_interval)

    raise TimeoutError(
        f"SGLang health check timed out after {timeout}s for {url}. "
        f"Last error: {last_error}"
    )
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def warmup_chat_completions(
    *,
    port: int,
    payload: Mapping[str, Any],
    headers: Mapping[str, str] | None = None,
    successful_requests: int = 3,
    request_timeout: float = 30.0,
    max_attempts_per_request: int = 2,
    retry_delay: float = 1.0,
) -> None:
    """Warm the OpenAI chat completions endpoint with strict retries.

    Sends *successful_requests* POSTs to /v1/chat/completions. Each request
    is retried up to *max_attempts_per_request* times (sleeping *retry_delay*
    between attempts); exhausting the attempts raises RuntimeError.
    """
    url = f"http://127.0.0.1:{port}/v1/chat/completions"
    request_headers = {"Content-Type": "application/json", **(headers or {})}

    for request_idx in range(successful_requests):
        for attempt in range(1, max_attempts_per_request + 1):
            try:
                _post_json(url, payload=payload, headers=request_headers, timeout=request_timeout)
            except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError, OSError) as exc:
                if attempt == max_attempts_per_request:
                    if isinstance(exc, urllib.error.HTTPError):
                        detail = _format_http_error(exc)
                    else:
                        detail = f"{type(exc).__name__}: {exc}"
                    raise RuntimeError(
                        f"warmup request {request_idx + 1}/{successful_requests}: {detail}"
                    ) from exc
                time.sleep(retry_delay)
            else:
                # Attempt succeeded; move on to the next warmup request.
                break
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def start_heartbeat_thread(
|
|
364
|
+
health_check_fn: Callable[[], str | None],
|
|
365
|
+
*,
|
|
366
|
+
on_failure: Callable[[], None],
|
|
367
|
+
poll_interval: float = 5.0,
|
|
368
|
+
max_consecutive_failures: int = 3,
|
|
369
|
+
) -> threading.Thread:
|
|
370
|
+
def _loop():
|
|
371
|
+
consecutive_failures = 0
|
|
372
|
+
while True:
|
|
373
|
+
time.sleep(poll_interval)
|
|
374
|
+
try:
|
|
375
|
+
error = health_check_fn()
|
|
376
|
+
except subprocess.CalledProcessError as exc:
|
|
377
|
+
print(f"[heartbeat] server process exited with code {exc.returncode}")
|
|
378
|
+
on_failure()
|
|
379
|
+
return
|
|
380
|
+
|
|
381
|
+
if error is None:
|
|
382
|
+
consecutive_failures = 0
|
|
383
|
+
continue
|
|
384
|
+
|
|
385
|
+
consecutive_failures += 1
|
|
386
|
+
print(f"[heartbeat] {error} ({consecutive_failures}/{max_consecutive_failures})")
|
|
387
|
+
if consecutive_failures >= max_consecutive_failures:
|
|
388
|
+
print("[heartbeat] sustained health-check failure, invoking on_failure")
|
|
389
|
+
on_failure()
|
|
390
|
+
return
|
|
391
|
+
|
|
392
|
+
t = threading.Thread(target=_loop, daemon=True)
|
|
393
|
+
t.start()
|
|
394
|
+
return t
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def terminate_process(
|
|
398
|
+
process: subprocess.Popen | None,
|
|
399
|
+
*,
|
|
400
|
+
terminate_timeout: float = 10.0,
|
|
401
|
+
) -> None:
|
|
402
|
+
"""Terminate a subprocess cleanly, then kill it if needed."""
|
|
403
|
+
if process is None or process.poll() is not None:
|
|
404
|
+
return
|
|
405
|
+
process.terminate()
|
|
406
|
+
try:
|
|
407
|
+
process.wait(timeout=terminate_timeout)
|
|
408
|
+
except subprocess.TimeoutExpired:
|
|
409
|
+
process.kill()
|
|
410
|
+
process.wait()
|
|
411
|
+
|
|
412
|
+
# ---------------------------------------------------------------------------
|
|
413
|
+
# SGLang snapshot helpers
|
|
414
|
+
# ---------------------------------------------------------------------------
|
|
415
|
+
def sgl_sleep(port: int) -> None:
    """Ask the local SGLang server on *port* to release its memory occupation."""
    url = f"http://127.0.0.1:{port}/release_memory_occupation"
    _post_json(url, payload={}, headers={"Content-Type": "application/json"})
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def sgl_wake(port: int) -> None:
    """Ask the local SGLang server on *port* to resume its memory occupation."""
    url = f"http://127.0.0.1:{port}/resume_memory_occupation"
    _post_json(url, payload={}, headers={"Content-Type": "application/json"})
|
|
429
|
+
|
|
430
|
+
# ---------------------------------------------------------------------------
|
|
431
|
+
# Internal
|
|
432
|
+
# ---------------------------------------------------------------------------
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _health_check(
    url: str,
    *,
    request_timeout: float = 5.0,
    process: subprocess.Popen | None = None,
) -> str | None:
    """Single health-check attempt.

    Returns None on success, or an error description string on failure.
    Raises subprocess.CalledProcessError if *process* has exited.
    """
    if process is not None:
        _raise_if_exited(process)

    try:
        status = _get_status(url, timeout=request_timeout)
    except urllib.error.HTTPError as exc:
        return _format_http_error(exc)
    except (urllib.error.URLError, TimeoutError, OSError) as exc:
        return f"{type(exc).__name__}: {exc}"

    # The comparison itself cannot raise, so it lives outside the try block.
    return None if 200 <= status < 300 else f"health check returned status {status}"
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _raise_if_exited(process: subprocess.Popen) -> None:
|
|
460
|
+
if (rc := process.poll()) is not None:
|
|
461
|
+
raise subprocess.CalledProcessError(rc, cmd=process.args)
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def _wait_ready_url(
|
|
465
|
+
url: str,
|
|
466
|
+
*,
|
|
467
|
+
timeout: float,
|
|
468
|
+
poll_interval: float = 5.0,
|
|
469
|
+
request_timeout: float = 5.0,
|
|
470
|
+
) -> None:
|
|
471
|
+
"""Poll a URL until it returns 2xx (no subprocess to watch)."""
|
|
472
|
+
deadline = time.time() + timeout
|
|
473
|
+
last_error = "no response yet"
|
|
474
|
+
while time.time() < deadline:
|
|
475
|
+
error = _health_check(url, request_timeout=request_timeout)
|
|
476
|
+
if error is None:
|
|
477
|
+
return
|
|
478
|
+
last_error = error
|
|
479
|
+
time.sleep(poll_interval)
|
|
480
|
+
raise TimeoutError(f"Timed out after {timeout}s waiting for {url}. Last error: {last_error}")
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _get_status(url: str, *, timeout: float) -> int:
    """GET *url* and return the HTTP status code (transport errors propagate)."""
    request = urllib.request.Request(url, method="GET")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.getcode()
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def _post_json(
    url: str,
    *,
    payload: Mapping[str, Any],
    headers: Mapping[str, str],
    timeout: float | None = None,
) -> int:
    """POST *payload* as UTF-8 JSON to *url*; return the HTTP status code.

    HTTP errors and transport failures propagate to the caller.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers=dict(headers),
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.getcode()
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _format_http_error(exc: urllib.error.HTTPError) -> str:
|
|
503
|
+
body = exc.read().decode("utf-8", errors="replace").strip()
|
|
504
|
+
if body:
|
|
505
|
+
return f"status {exc.code}: {body[:500]}"
|
|
506
|
+
return f"status {exc.code}"
|