vec-inf 0.7.1.tar.gz → 0.7.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/code_checks.yml +4 -2
  2. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docker.yml +7 -2
  3. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docs.yml +7 -7
  4. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/publish.yml +1 -1
  5. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/unit_tests.yml +5 -5
  6. {vec_inf-0.7.1 → vec_inf-0.7.3}/.pre-commit-config.yaml +1 -1
  7. {vec_inf-0.7.1 → vec_inf-0.7.3}/Dockerfile +12 -8
  8. {vec_inf-0.7.1 → vec_inf-0.7.3}/MODEL_TRACKING.md +8 -2
  9. {vec_inf-0.7.1 → vec_inf-0.7.3}/PKG-INFO +7 -6
  10. {vec_inf-0.7.1 → vec_inf-0.7.3}/README.md +3 -3
  11. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/index.md +1 -1
  12. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/user_guide.md +35 -18
  13. {vec_inf-0.7.1 → vec_inf-0.7.3}/pyproject.toml +5 -4
  14. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_cli.py +107 -1
  15. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_helper.py +251 -0
  16. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_api.py +186 -0
  17. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_slurm_script_generator.py +13 -13
  18. vec_inf-0.7.3/uv.lock +6357 -0
  19. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/README.md +2 -1
  20. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_cli.py +39 -10
  21. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_helper.py +100 -19
  22. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_helper.py +80 -31
  23. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_script_generator.py +58 -30
  24. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_templates.py +27 -12
  25. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_utils.py +58 -6
  26. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/api.py +55 -2
  27. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/models.py +6 -0
  28. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/models.yaml +47 -99
  29. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/find_port.sh +10 -1
  30. vec_inf-0.7.1/uv.lock +0 -5260
  31. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  32. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  33. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  34. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/model-request.md +0 -0
  35. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/dependabot.yml +0 -0
  36. {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/pull_request_template.md +0 -0
  37. {vec_inf-0.7.1 → vec_inf-0.7.3}/.gitignore +0 -0
  38. {vec_inf-0.7.1 → vec_inf-0.7.3}/.python-version +0 -0
  39. {vec_inf-0.7.1 → vec_inf-0.7.3}/LICENSE +0 -0
  40. {vec_inf-0.7.1 → vec_inf-0.7.3}/codecov.yml +0 -0
  41. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/Makefile +0 -0
  42. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/api.md +0 -0
  43. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/favicon-48x48.svg +0 -0
  44. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/favicon.ico +0 -0
  45. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/vector-logo.svg +0 -0
  46. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/contributing.md +0 -0
  47. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/make.bat +0 -0
  48. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/overrides/partials/copyright.html +0 -0
  49. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/overrides/partials/logo.html +0 -0
  50. {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/stylesheets/extra.css +0 -0
  51. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/README.md +0 -0
  52. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/api/basic_usage.py +0 -0
  53. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/chat_completions.py +0 -0
  54. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/completions.py +0 -0
  55. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/completions.sh +0 -0
  56. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/text_embedding/embeddings.py +0 -0
  57. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/vlm/vision_completions.py +0 -0
  58. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/logits/logits.py +0 -0
  59. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/README.md +0 -0
  60. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/downstream_job.sbatch +0 -0
  61. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/run_downstream.py +0 -0
  62. {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/run_workflow.sh +0 -0
  63. {vec_inf-0.7.1 → vec_inf-0.7.3}/mkdocs.yml +0 -0
  64. {vec_inf-0.7.1 → vec_inf-0.7.3}/profile/avg_throughput.py +0 -0
  65. {vec_inf-0.7.1 → vec_inf-0.7.3}/profile/gen.py +0 -0
  66. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/__init__.py +0 -0
  67. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/test_imports.py +0 -0
  68. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/__init__.py +0 -0
  69. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/__init__.py +0 -0
  70. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_utils.py +0 -0
  71. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/__init__.py +0 -0
  72. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_examples.py +0 -0
  73. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_helper.py +0 -0
  74. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_models.py +0 -0
  75. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_utils.py +0 -0
  76. {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_vars.env +0 -0
  77. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/__init__.py +0 -0
  78. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/__init__.py +0 -0
  79. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_utils.py +0 -0
  80. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_vars.py +0 -0
  81. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/__init__.py +0 -0
  82. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_client_vars.py +0 -0
  83. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_exceptions.py +0 -0
  84. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_vars.py +0 -0
  85. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/config.py +0 -0
  86. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/README.md +0 -0
  87. {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/environment.yaml +0 -0
  88. {vec_inf-0.7.1 → vec_inf-0.7.3}/venv.sh +0 -0
--- vec_inf-0.7.1/.github/workflows/code_checks.yml
+++ vec_inf-0.7.3/.github/workflows/code_checks.yml
@@ -30,7 +30,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5.0.0
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           # Install a specific version of uv.
           version: "0.5.21"
@@ -40,7 +40,7 @@ jobs:
         with:
           python-version-file: ".python-version"
       - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow
      - name: Install dependencies and check code
        run: |
          source .venv/bin/activate
@@ -49,3 +49,5 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           virtual-environment: .venv/
+          # Temporary: ignore pip advisory until fixed in pip>=25.3
+          ignore-vulns: GHSA-4xh5-x5gv-qwph
--- vec_inf-0.7.1/.github/workflows/docker.yml
+++ vec_inf-0.7.3/.github/workflows/docker.yml
@@ -21,7 +21,9 @@ on:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    runs-on: ubuntu-latest
+    runs-on:
+      - self-hosted
+      - docker
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5.0.0
@@ -32,6 +34,9 @@ jobs:
           VERSION=$(grep -A 1 'name = "vllm"' uv.lock | grep version | cut -d '"' -f 2)
           echo "version=$VERSION" >> $GITHUB_OUTPUT
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
       - name: Log in to Docker Hub
         uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
         with:
@@ -40,7 +45,7 @@ jobs:
 
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f
+        uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893
         with:
           images: vectorinstitute/vector-inference
 
--- vec_inf-0.7.1/.github/workflows/docs.yml
+++ vec_inf-0.7.3/.github/workflows/docs.yml
@@ -56,7 +56,7 @@ jobs:
           fetch-depth: 0 # Fetch all history for proper versioning
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.5.21"
           enable-cache: true
@@ -67,16 +67,16 @@ jobs:
           python-version-file: ".python-version"
 
       - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --prerelease=allow
 
       - name: Build docs
-        run: uv run mkdocs build
+        run: uv run --frozen mkdocs build
 
       - name: Create .nojekyll file
         run: touch site/.nojekyll
 
       - name: Upload artifact
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
         with:
           name: docs-site
           path: site/
@@ -93,7 +93,7 @@ jobs:
           fetch-depth: 0 # Fetch all history for proper versioning
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.5.21"
           enable-cache: true
@@ -104,7 +104,7 @@ jobs:
           python-version-file: ".python-version"
 
       - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --frozen
 
       - name: Configure Git Credentials
         run: |
@@ -112,7 +112,7 @@ jobs:
           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
 
       - name: Download artifact
-        uses: actions/download-artifact@v5
+        uses: actions/download-artifact@v6
         with:
           name: docs-site
           path: site
--- vec_inf-0.7.1/.github/workflows/publish.yml
+++ vec_inf-0.7.3/.github/workflows/publish.yml
@@ -16,7 +16,7 @@ jobs:
       - uses: actions/checkout@v5.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.6.6"
           enable-cache: true
--- vec_inf-0.7.1/.github/workflows/unit_tests.yml
+++ vec_inf-0.7.3/.github/workflows/unit_tests.yml
@@ -46,7 +46,7 @@ jobs:
       - uses: actions/checkout@v5.0.0
 
       - name: Install uv
-        uses: astral-sh/setup-uv@v6
+        uses: astral-sh/setup-uv@v7
         with:
           # Install a specific version of uv.
           version: "0.5.21"
@@ -58,18 +58,18 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow
 
       - name: Install dependencies and check code
         run: |
-          uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
+          uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
 
       - name: Install the core package only
         run: uv sync --no-dev
 
       - name: Run package import tests
         run: |
-          uv run pytest tests/test_imports.py
+          uv run --frozen pytest tests/test_imports.py
 
       - name: Import Codecov GPG public key
         run: |
@@ -79,7 +79,7 @@ jobs:
         uses: codecov/codecov-action@v5.5.1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          file: ./coverage.xml
+          files: ./coverage.xml
           name: codecov-umbrella
           fail_ci_if_error: true
           verbose: true
--- vec_inf-0.7.1/.pre-commit-config.yaml
+++ vec_inf-0.7.3/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
       - id: check-toml
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.13.2'
+    rev: 'v0.14.5'
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
--- vec_inf-0.7.1/Dockerfile
+++ vec_inf-0.7.3/Dockerfile
@@ -35,29 +35,33 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py && \
     python3.10 -m pip install --upgrade pip setuptools wheel uv
 
-# Install Infiniband/RDMA support
+# Install RDMA support
 RUN apt-get update && apt-get install -y \
     libibverbs1 libibverbs-dev ibverbs-utils \
     librdmacm1 librdmacm-dev rdmacm-utils \
+    rdma-core ibverbs-providers infiniband-diags perftest \
     && rm -rf /var/lib/apt/lists/*
 
 # Set up RDMA environment (these will persist in the final container)
 ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
-ENV UCX_NET_DEVICES=all
 ENV NCCL_IB_DISABLE=0
+ENV NCCL_SOCKET_IFNAME="^lo,docker0"
+ENV NCCL_NET_GDR_LEVEL=PHB
+ENV NCCL_IB_TIMEOUT=22
+ENV NCCL_IB_RETRY_CNT=7
+ENV NCCL_DEBUG=INFO
 
 # Set up project
 WORKDIR /vec-inf
 COPY . /vec-inf
 
 # Install project dependencies with build requirements
-RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu128" uv pip install --system -e .[dev]
+RUN uv pip install --system -e .[dev] --prerelease=allow
 
-# Final configuration
-RUN mkdir -p /vec-inf/nccl && \
-    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /vec-inf/nccl/libnccl.so.2.18.1
-ENV VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
-ENV NCCL_DEBUG=INFO
+# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
+RUN apt-get update && apt-get install -y --allow-change-held-packages \
+    libnccl2 libnccl-dev \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set the default command to start an interactive shell
 CMD ["bash"]
--- vec_inf-0.7.1/MODEL_TRACKING.md
+++ vec_inf-0.7.3/MODEL_TRACKING.md
@@ -40,6 +40,7 @@ This document tracks all model weights available in the `/model-weights` directo
 | `gemma-2b-it` | ❌ |
 | `gemma-7b` | ❌ |
 | `gemma-7b-it` | ❌ |
+| `gemma-2-2b-it` | ✅ |
 | `gemma-2-9b` | ✅ |
 | `gemma-2-9b-it` | ✅ |
 | `gemma-2-27b` | ✅ |
@@ -165,8 +166,8 @@ This document tracks all model weights available in the `/model-weights` directo
 | Model | Configuration |
 |:------|:-------------|
 | `Qwen3-14B` | ✅ |
-| `Qwen3-8B` | |
-| `Qwen3-32B` | |
+| `Qwen3-8B` | |
+| `Qwen3-32B` | |
 | `Qwen3-235B-A22B` | ❌ |
 | `Qwen3-Embedding-8B` | ❌ |
 
@@ -186,6 +187,11 @@ This document tracks all model weights available in the `/model-weights` directo
 | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ |
 | `deepseek-math-7b-instruct` | ❌ |
 
+### OpenAI: GPT-OSS
+| Model | Configuration |
+|:------|:-------------|
+| `gpt-oss-120b` | ✅ |
+
 ### Other LLM Models
 | Model | Configuration |
 |:------|:-------------|
--- vec_inf-0.7.1/PKG-INFO
+++ vec_inf-0.7.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vec-inf
-Version: 0.7.1
+Version: 0.7.3
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
 License-Expression: MIT
@@ -13,9 +13,10 @@ Requires-Dist: requests>=2.31.0
 Requires-Dist: rich>=13.7.0
 Provides-Extra: dev
 Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
-Requires-Dist: ray>=2.40.0; extra == 'dev'
+Requires-Dist: flashinfer-python>=0.4.0; extra == 'dev'
+Requires-Dist: ray[default]>=2.50.0; extra == 'dev'
+Requires-Dist: sglang>=0.5.0; extra == 'dev'
 Requires-Dist: torch>=2.7.0; extra == 'dev'
-Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
 Requires-Dist: vllm>=0.10.0; extra == 'dev'
 Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
 Description-Content-Type: text/markdown
@@ -29,7 +30,7 @@ Description-Content-Type: text/markdown
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[![vLLM](https://img.shields.io/badge/vLLM-0.10.1.1-blue)](https://docs.vllm.ai/en/v0.10.1.1/)
+[![vLLM](https://img.shields.io/badge/vLLM-0.11.0-blue)](https://docs.vllm.ai/en/v0.11.0/)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
 
 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -42,7 +43,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.10.1.1`.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
@@ -75,7 +76,7 @@ Models that are already supported by `vec-inf` would be launched using the cache
 #### Other commands
 
 * `batch-launch`: Launch multiple model inference servers at once, currently ONLY single node models supported,
-* `status`: Check the model status by providing its Slurm job ID.
+* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 * `list`: List all available model names, or view the default/cached configuration of a specific model.
--- vec_inf-0.7.1/README.md
+++ vec_inf-0.7.3/README.md
@@ -7,7 +7,7 @@
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[![vLLM](https://img.shields.io/badge/vLLM-0.10.1.1-blue)](https://docs.vllm.ai/en/v0.10.1.1/)
+[![vLLM](https://img.shields.io/badge/vLLM-0.11.0-blue)](https://docs.vllm.ai/en/v0.11.0/)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
 
 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -20,7 +20,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.10.1.1`.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
@@ -53,7 +53,7 @@ Models that are already supported by `vec-inf` would be launched using the cache
 #### Other commands
 
 * `batch-launch`: Launch multiple model inference servers at once, currently ONLY single node models supported,
-* `status`: Check the model status by providing its Slurm job ID.
+* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 * `list`: List all available model names, or view the default/cached configuration of a specific model.
--- vec_inf-0.7.1/docs/index.md
+++ vec_inf-0.7.3/docs/index.md
@@ -12,7 +12,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 pip install vec-inf
 ```
 
-Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.10.1.1`.
+Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config), then install from source by running `pip install .`.
--- vec_inf-0.7.1/docs/user_guide.md
+++ vec_inf-0.7.3/docs/user_guide.md
@@ -149,35 +149,52 @@ Since batch launches use heterogeneous jobs, users can request different partiti
 
 ### `status` command
 
-You can check the inference server status by providing the Slurm job ID to the `status` command:
+You can check the status of all inference servers launched through `vec-inf` by running the `status` command:
+```bash
+vec-inf status
+```
+
+And you should see an output like this:
+```
+┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Job ID    ┃ Model Name ┃ Status  ┃ Base URL              ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ 1434429   │ Qwen3-8B   │ READY   │ http://gpu113:8080/v1 │
+│ 1434584   │ Qwen3-14B  │ READY   │ http://gpu053:8080/v1 │
+│ 1435035+0 │ Qwen3-32B  │ PENDING │ UNAVAILABLE           │
+│ 1435035+1 │ Qwen3-14B  │ PENDING │ UNAVAILABLE           │
+└───────────┴────────────┴─────────┴───────────────────────┘
+```
+
+If you want to check why a specific job is pending or failing, append the job ID to the status command:
 
 ```bash
-vec-inf status 15373800
+vec-inf status 1435035+1
 ```
 
 If the server is pending for resources, you should see an output like this:
 
 ```
-┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
-┃ Job Status     ┃ Value                      ┃
-┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
-│ Model Name     │ Meta-Llama-3.1-8B-Instruct │
-│ Model Status   │ PENDING                    │
-│ Pending Reason │ Resources                  │
-│ Base URL       │ UNAVAILABLE                │
-└────────────────┴────────────────────────────┘
+┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
+┃ Job Status     ┃ Value       ┃
+┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
+│ Model Name     │ Qwen3-14B   │
+│ Model Status   │ PENDING     │
+│ Pending Reason │ Resources   │
+│ Base URL       │ UNAVAILABLE │
+└────────────────┴─────────────┘
 ```
 
 When the server is ready, you should see an output like this:
 
 ```
-┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
-┃ Job Status   ┃ Value                      ┃
-┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
-│ Model Name   │ Meta-Llama-3.1-8B-Instruct │
-│ Model Status │ READY                      │
-│ Base URL     │ http://gpu042:8080/v1      │
-└──────────────┴────────────────────────────┘
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Job Status   ┃ Value                 ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ Model Name   │ Qwen3-14B             │
+│ Model Status │ READY                 │
+│ Base URL     │ http://gpu105:8080/v1 │
+└──────────────┴───────────────────────┘
 ```
 
 There are 5 possible states:
@@ -190,7 +207,7 @@ There are 5 possible states:
 
 **Note**
 * The base URL is only available when model is in `READY` state.
-* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g. 17480109+0, 17480109+1).
+* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g. 1435035+0, 1435035+1).
 
 ### `metrics` command
 
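The Base URL shown in the status tables above is the OpenAI-compatible endpoint served by vLLM. As a rough illustration of how a READY server would be consumed, here is a minimal Python sketch; the `openai` client usage and the `api_key="EMPTY"` placeholder are assumptions about a typical vLLM deployment, not something pinned by this diff (host and model name are taken from the example table above).

```python
# Minimal sketch: query a READY vec-inf server via its Base URL.
# Assumption: the server exposes vLLM's OpenAI-compatible API and does not
# require a real API key.
from openai import OpenAI

client = OpenAI(base_url="http://gpu105:8080/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen3-14B",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```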
--- vec_inf-0.7.1/pyproject.toml
+++ vec_inf-0.7.3/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "vec-inf"
-version = "0.7.1"
+version = "0.7.3"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 readme = "README.md"
 authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
@@ -42,9 +42,10 @@ dev = [
     "xgrammar>=0.1.11",
     "torch>=2.7.0",
     "vllm>=0.10.0",
-    "vllm-nccl-cu12>=2.18,<2.19",
-    "ray>=2.40.0",
-    "cupy-cuda12x==12.1.0"
+    "ray[default]>=2.50.0",
+    "cupy-cuda12x==12.1.0",
+    "flashinfer-python>=0.4.0",
+    "sglang>=0.5.0",
 ]
 
 [project.scripts]
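The test additions below (and the `vec_inf/client/api.py` changes in the file list) exercise a new fetch-all-running-jobs path on the client. As a rough sketch of the flow those tests mock, something like the following is implied; the import path and attribute names are taken from the test code and may not match the released API exactly.

```python
# Sketch of the status-for-all-jobs flow mocked by the CLI tests below.
# Assumption: VecInfClient is importable from vec_inf.client and exposes
# fetch_running_jobs() plus get_status(job_id), as the tests suggest.
from vec_inf.client import VecInfClient

client = VecInfClient()
for job_id in client.fetch_running_jobs():   # all vec-inf Slurm jobs
    status = client.get_status(job_id)       # per-job status object
    print(job_id, status.model_name, status.server_status, status.base_url)
```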
--- vec_inf-0.7.1/tests/vec_inf/cli/test_cli.py
+++ vec_inf-0.7.3/tests/vec_inf/cli/test_cli.py
@@ -39,6 +39,7 @@ def test_launch_command_success(runner):
         "mem_per_node": "32G",
         "model_weights_parent_dir": "/model-weights",
         "vocab_size": "128000",
+        "venv": "/path/to/venv",
         "vllm_args": {"max_model_len": 8192},
         "env": {"CACHE": "/cache"},
     }
@@ -134,7 +135,7 @@ def test_list_single_model(runner):
 
 
 def test_status_command(runner):
-    """Test status command."""
+    """Test status command with job ID argument."""
     with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
         mock_client = MagicMock()
         mock_client_class.return_value = mock_client
@@ -153,6 +154,111 @@ def test_status_command(runner):
         assert "Meta-Llama-3.1-8B" in result.output
 
 
+def test_status_command_no_job_id_no_running_jobs(runner):
+    """Test status command with no argument when no jobs are running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = []
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "No running jobs found." in result.output
+
+
+def test_status_command_no_job_id_single_running_job(runner):
+    """Test status command with no argument when one job is running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345"]
+
+        mock_status = MagicMock()
+        mock_status.model_name = "test-model-1"
+        mock_status.server_status = "READY"
+        mock_status.base_url = "http://localhost:8000"
+        mock_status.pending_reason = None
+        mock_status.failed_reason = None
+        mock_client.get_status.return_value = mock_status
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "test-model-1" in result.output
+        mock_client.fetch_running_jobs.assert_called_once()
+        mock_client.get_status.assert_called_once_with("12345")
+
+
+def test_status_command_no_job_id_multiple_running_jobs(runner):
+    """Test status command with no argument when multiple jobs are running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]
+
+        mock_status_1 = MagicMock()
+        mock_status_1.model_name = "test-model-1"
+        mock_status_1.server_status = "READY"
+        mock_status_1.base_url = "http://localhost:8000"
+        mock_status_1.pending_reason = None
+        mock_status_1.failed_reason = None
+
+        mock_status_2 = MagicMock()
+        mock_status_2.model_name = "test-model-2"
+        mock_status_2.server_status = "PENDING"
+        mock_status_2.base_url = None
+        mock_status_2.pending_reason = "Waiting for resources"
+        mock_status_2.failed_reason = None
+
+        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "test-model-1" in result.output
+        assert "test-model-2" in result.output
+        assert "12345" in result.output
+        assert "67890" in result.output
+        mock_client.fetch_running_jobs.assert_called_once()
+        assert mock_client.get_status.call_count == 2
+
+
+def test_status_command_no_job_id_multiple_jobs_json_mode(runner):
+    """Test status command with no argument and JSON mode for multiple jobs."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]
+
+        mock_status_1 = MagicMock()
+        mock_status_1.model_name = "test-model-1"
+        mock_status_1.server_status = "READY"
+        mock_status_1.base_url = "http://localhost:8000"
+        mock_status_1.pending_reason = None
+        mock_status_1.failed_reason = None
+
+        mock_status_2 = MagicMock()
+        mock_status_2.model_name = "test-model-2"
+        mock_status_2.server_status = "FAILED"
+        mock_status_2.base_url = None
+        mock_status_2.pending_reason = None
+        mock_status_2.failed_reason = "Out of memory"
+
+        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]
+
+        result = runner.invoke(cli, ["status", "--json-mode"])
+
+        assert result.exit_code == 0
+        output = json.loads(result.output)
+        assert isinstance(output, list)
+        assert len(output) == 2
+        assert output[0]["model_name"] == "test-model-1"
+        assert output[0]["model_status"] == "READY"
+        assert output[1]["model_name"] == "test-model-2"
+        assert output[1]["model_status"] == "FAILED"
+
+
 def test_shutdown_command(runner):
     """Test shutdown command."""
     with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class: