vec-inf 0.7.1.tar.gz → 0.7.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/code_checks.yml +4 -2
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docker.yml +7 -2
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docs.yml +7 -7
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/publish.yml +1 -1
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/unit_tests.yml +5 -5
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.pre-commit-config.yaml +1 -1
- {vec_inf-0.7.1 → vec_inf-0.7.3}/Dockerfile +12 -8
- {vec_inf-0.7.1 → vec_inf-0.7.3}/MODEL_TRACKING.md +8 -2
- {vec_inf-0.7.1 → vec_inf-0.7.3}/PKG-INFO +7 -6
- {vec_inf-0.7.1 → vec_inf-0.7.3}/README.md +3 -3
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/index.md +1 -1
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/user_guide.md +35 -18
- {vec_inf-0.7.1 → vec_inf-0.7.3}/pyproject.toml +5 -4
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_cli.py +107 -1
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_helper.py +251 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_api.py +186 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_slurm_script_generator.py +13 -13
- vec_inf-0.7.3/uv.lock +6357 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/README.md +2 -1
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_cli.py +39 -10
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_helper.py +100 -19
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_helper.py +80 -31
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_script_generator.py +58 -30
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_templates.py +27 -12
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_utils.py +58 -6
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/api.py +55 -2
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/models.py +6 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/models.yaml +47 -99
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/find_port.sh +10 -1
- vec_inf-0.7.1/uv.lock +0 -5260
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/ISSUE_TEMPLATE/model-request.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/dependabot.yml +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.github/pull_request_template.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.gitignore +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/.python-version +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/LICENSE +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/codecov.yml +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/Makefile +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/api.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/favicon-48x48.svg +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/favicon.ico +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/assets/vector-logo.svg +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/contributing.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/make.bat +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/overrides/partials/copyright.html +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/overrides/partials/logo.html +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/docs/stylesheets/extra.css +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/README.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/api/basic_usage.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/chat_completions.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/completions.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/llm/completions.sh +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/text_embedding/embeddings.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/inference/vlm/vision_completions.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/logits/logits.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/README.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/downstream_job.sbatch +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/run_downstream.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/examples/slurm_dependency/run_workflow.sh +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/mkdocs.yml +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/profile/avg_throughput.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/profile/gen.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/test_imports.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_utils.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_examples.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_helper.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_models.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_utils.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/client/test_vars.env +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_utils.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/cli/_vars.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/__init__.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_client_vars.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_exceptions.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/_slurm_vars.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/client/config.py +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/README.md +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/vec_inf/config/environment.yaml +0 -0
- {vec_inf-0.7.1 → vec_inf-0.7.3}/venv.sh +0 -0
{vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/code_checks.yml

@@ -30,7 +30,7 @@ jobs:
     steps:
       - uses: actions/checkout@v5.0.0
       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           # Install a specific version of uv.
           version: "0.5.21"
@@ -40,7 +40,7 @@ jobs:
         with:
          python-version-file: ".python-version"
      - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow
      - name: Install dependencies and check code
        run: |
          source .venv/bin/activate
@@ -49,3 +49,5 @@ jobs:
        uses: pypa/gh-action-pip-audit@v1.1.0
        with:
          virtual-environment: .venv/
+          # Temporary: ignore pip advisory until fixed in pip>=25.3
+          ignore-vulns: GHSA-4xh5-x5gv-qwph
{vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docker.yml

@@ -21,7 +21,9 @@ on:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    runs-on:
+    runs-on:
+      - self-hosted
+      - docker
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5.0.0
@@ -32,6 +34,9 @@ jobs:
          VERSION=$(grep -A 1 'name = "vllm"' uv.lock | grep version | cut -d '"' -f 2)
          echo "version=$VERSION" >> $GITHUB_OUTPUT
 
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
      - name: Log in to Docker Hub
        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
        with:
@@ -40,7 +45,7 @@ jobs:
 
      - name: Extract metadata (tags, labels) for Docker
        id: meta
-        uses: docker/metadata-action@
+        uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893
        with:
          images: vectorinstitute/vector-inference
 
{vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/docs.yml

@@ -56,7 +56,7 @@ jobs:
          fetch-depth: 0 # Fetch all history for proper versioning
 
      - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
        with:
          version: "0.5.21"
          enable-cache: true
@@ -67,16 +67,16 @@ jobs:
          python-version-file: ".python-version"
 
      - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --prerelease=allow
 
      - name: Build docs
-        run: uv run mkdocs build
+        run: uv run --frozen mkdocs build
 
      - name: Create .nojekyll file
        run: touch site/.nojekyll
 
      - name: Upload artifact
-        uses: actions/upload-artifact@
+        uses: actions/upload-artifact@v5
        with:
          name: docs-site
          path: site/
@@ -93,7 +93,7 @@ jobs:
          fetch-depth: 0 # Fetch all history for proper versioning
 
      - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
        with:
          version: "0.5.21"
          enable-cache: true
@@ -104,7 +104,7 @@ jobs:
          python-version-file: ".python-version"
 
      - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --frozen
 
      - name: Configure Git Credentials
        run: |
@@ -112,7 +112,7 @@ jobs:
          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
 
      - name: Download artifact
-        uses: actions/download-artifact@
+        uses: actions/download-artifact@v6
        with:
          name: docs-site
          path: site
{vec_inf-0.7.1 → vec_inf-0.7.3}/.github/workflows/unit_tests.yml

@@ -46,7 +46,7 @@ jobs:
      - uses: actions/checkout@v5.0.0
 
      - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
        with:
          # Install a specific version of uv.
          version: "0.5.21"
@@ -58,18 +58,18 @@ jobs:
          python-version: ${{ matrix.python-version }}
 
      - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow
 
      - name: Install dependencies and check code
        run: |
-          uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
+          uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
 
      - name: Install the core package only
        run: uv sync --no-dev
 
      - name: Run package import tests
        run: |
-          uv run pytest tests/test_imports.py
+          uv run --frozen pytest tests/test_imports.py
 
      - name: Import Codecov GPG public key
        run: |
@@ -79,7 +79,7 @@ jobs:
        uses: codecov/codecov-action@v5.5.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
-
+          files: ./coverage.xml
          name: codecov-umbrella
          fail_ci_if_error: true
          verbose: true
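The updated unit-test workflow pins dependency resolution with `--prerelease=allow` and `--frozen`. As a rough local reproduction of those CI steps (a sketch that assumes `uv` is installed and that you run it from the repository root):

```bash
# Sketch: reproduce the CI unit-test steps locally (assumes uv is installed)
uv sync --dev --prerelease=allow
uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests

# Core-package import check, as in the workflow
uv sync --no-dev
uv run --frozen pytest tests/test_imports.py
```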
{vec_inf-0.7.1 → vec_inf-0.7.3}/Dockerfile

@@ -35,29 +35,33 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
    rm get-pip.py && \
    python3.10 -m pip install --upgrade pip setuptools wheel uv
 
-# Install
+# Install RDMA support
 RUN apt-get update && apt-get install -y \
    libibverbs1 libibverbs-dev ibverbs-utils \
    librdmacm1 librdmacm-dev rdmacm-utils \
+    rdma-core ibverbs-providers infiniband-diags perftest \
    && rm -rf /var/lib/apt/lists/*
 
 # Set up RDMA environment (these will persist in the final container)
 ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
-ENV UCX_NET_DEVICES=all
 ENV NCCL_IB_DISABLE=0
+ENV NCCL_SOCKET_IFNAME="^lo,docker0"
+ENV NCCL_NET_GDR_LEVEL=PHB
+ENV NCCL_IB_TIMEOUT=22
+ENV NCCL_IB_RETRY_CNT=7
+ENV NCCL_DEBUG=INFO
 
 # Set up project
 WORKDIR /vec-inf
 COPY . /vec-inf
 
 # Install project dependencies with build requirements
-RUN
+RUN uv pip install --system -e .[dev] --prerelease=allow
 
-#
-RUN
-
-
-ENV NCCL_DEBUG=INFO
+# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
+RUN apt-get update && apt-get install -y --allow-change-held-packages\
+    libnccl2 libnccl-dev \
+    && rm -rf /var/lib/apt/lists/*
 
 # Set the default command to start an interactive shell
 CMD ["bash"]
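For context, this is the image that the `docker.yml` workflow above pushes to Docker Hub as `vectorinstitute/vector-inference`. A minimal sketch of building and entering the container locally (the tag name is illustrative, and `--gpus` assumes Docker with the NVIDIA container toolkit):

```bash
# Sketch: build the image from the repository root and open a shell in it
# (tag name is illustrative; --gpus all assumes the NVIDIA container toolkit)
docker build -t vector-inference:local .
docker run --rm -it --gpus all vector-inference:local
```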
{vec_inf-0.7.1 → vec_inf-0.7.3}/MODEL_TRACKING.md

@@ -40,6 +40,7 @@ This document tracks all model weights available in the `/model-weights` directo
 | `gemma-2b-it` | ❌ |
 | `gemma-7b` | ❌ |
 | `gemma-7b-it` | ❌ |
+| `gemma-2-2b-it` | ✅ |
 | `gemma-2-9b` | ✅ |
 | `gemma-2-9b-it` | ✅ |
 | `gemma-2-27b` | ✅ |
@@ -165,8 +166,8 @@ This document tracks all model weights available in the `/model-weights` directo
 | Model | Configuration |
 |:------|:-------------|
 | `Qwen3-14B` | ✅ |
-| `Qwen3-8B` |
-| `Qwen3-32B` |
+| `Qwen3-8B` | ✅ |
+| `Qwen3-32B` | ✅ |
 | `Qwen3-235B-A22B` | ❌ |
 | `Qwen3-Embedding-8B` | ❌ |
 
@@ -186,6 +187,11 @@ This document tracks all model weights available in the `/model-weights` directo
 | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ |
 | `deepseek-math-7b-instruct` | ❌ |
 
+### OpenAI: GPT-OSS
+| Model | Configuration |
+|:------|:-------------|
+| `gpt-oss-120b` | ✅ |
+
 ### Other LLM Models
 | Model | Configuration |
 |:------|:-------------|
{vec_inf-0.7.1 → vec_inf-0.7.3}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vec-inf
-Version: 0.7.1
+Version: 0.7.3
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
 License-Expression: MIT
@@ -13,9 +13,10 @@ Requires-Dist: requests>=2.31.0
 Requires-Dist: rich>=13.7.0
 Provides-Extra: dev
 Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
-Requires-Dist:
+Requires-Dist: flashinfer-python>=0.4.0; extra == 'dev'
+Requires-Dist: ray[default]>=2.50.0; extra == 'dev'
+Requires-Dist: sglang>=0.5.0; extra == 'dev'
 Requires-Dist: torch>=2.7.0; extra == 'dev'
-Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
 Requires-Dist: vllm>=0.10.0; extra == 'dev'
 Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
 Description-Content-Type: text/markdown
@@ -29,7 +30,7 @@ Description-Content-Type: text/markdown
 [](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[](https://docs.vllm.ai/en/v0.11.0/)
 
 
 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -42,7 +43,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
@@ -75,7 +76,7 @@ Models that are already supported by `vec-inf` would be launched using the cache
 #### Other commands
 
 * `batch-launch`: Launch multiple model inference servers at once, currently ONLY single node models supported,
-* `status`: Check the
+* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 * `list`: List all available model names, or view the default/cached configuration of a specific model.
{vec_inf-0.7.1 → vec_inf-0.7.3}/README.md

@@ -7,7 +7,7 @@
 [](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[](https://docs.vllm.ai/en/v0.11.0/)
 
 
 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -20,7 +20,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
@@ -53,7 +53,7 @@ Models that are already supported by `vec-inf` would be launched using the cache
 #### Other commands
 
 * `batch-launch`: Launch multiple model inference servers at once, currently ONLY single node models supported,
-* `status`: Check the
+* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 * `list`: List all available model names, or view the default/cached configuration of a specific model.
{vec_inf-0.7.1 → vec_inf-0.7.3}/docs/index.md

@@ -12,7 +12,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 pip install vec-inf
 ```
 
-Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.
+Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config), then install from source by running `pip install .`.
{vec_inf-0.7.1 → vec_inf-0.7.3}/docs/user_guide.md

@@ -149,35 +149,52 @@ Since batch launches use heterogeneous jobs, users can request different partiti
 
 ### `status` command
 
-You can check the
+You can check the status of all inference servers launched through `vec-inf` by running the `status` command:
+```bash
+vec-inf status
+```
+
+And you should see an output like this:
+```
+┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Job ID    ┃ Model Name ┃ Status  ┃ Base URL              ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ 1434429   │ Qwen3-8B   │ READY   │ http://gpu113:8080/v1 │
+│ 1434584   │ Qwen3-14B  │ READY   │ http://gpu053:8080/v1 │
+│ 1435035+0 │ Qwen3-32B  │ PENDING │ UNAVAILABLE           │
+│ 1435035+1 │ Qwen3-14B  │ PENDING │ UNAVAILABLE           │
+└───────────┴────────────┴─────────┴───────────────────────┘
+```
+
+If you want to check why a specific job is pending or failing, append the job ID to the status command:
 
 ```bash
-vec-inf status
+vec-inf status 1435035+1
 ```
 
 If the server is pending for resources, you should see an output like this:
 
 ```
-
-┃ Job Status ┃ Value
-
-│ Model Name │
-│ Model Status │ PENDING
-│ Pending Reason │ Resources
-│ Base URL │ UNAVAILABLE
-
+┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
+┃ Job Status     ┃ Value       ┃
+┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
+│ Model Name     │ Qwen3-14B   │
+│ Model Status   │ PENDING     │
+│ Pending Reason │ Resources   │
+│ Base URL       │ UNAVAILABLE │
+└────────────────┴─────────────┘
 ```
 
 When the server is ready, you should see an output like this:
 
 ```
-
-┃ Job Status ┃ Value
-
-│ Model Name │
-│ Model Status │ READY
-│ Base URL │ http://
-
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Job Status   ┃ Value                 ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ Model Name   │ Qwen3-14B             │
+│ Model Status │ READY                 │
+│ Base URL     │ http://gpu105:8080/v1 │
+└──────────────┴───────────────────────┘
 ```
 
 There are 5 possible states:
@@ -190,7 +207,7 @@ There are 5 possible states:
 
 **Note**
 * The base URL is only available when model is in `READY` state.
-* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g.
+* For servers launched with `batch-launch`, the job ID should follow the format of "MAIN_JOB_ID+OFFSET" (e.g. 1435035+0, 1435035+1).
 
 ### `metrics` command
 
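The new CLI tests further below also exercise a `--json-mode` flag for `status`. A small usage sketch combining the commands shown in this guide (the job ID is the illustrative one from the output above):

```bash
# Check all running vec-inf jobs (plain table, or JSON with --json-mode as in the new tests)
vec-inf status
vec-inf status --json-mode

# Inspect a specific job (ID is the illustrative one from the guide)
vec-inf status 1435035+1
```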
{vec_inf-0.7.1 → vec_inf-0.7.3}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "vec-inf"
-version = "0.7.1"
+version = "0.7.3"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 readme = "README.md"
 authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
@@ -42,9 +42,10 @@ dev = [
    "xgrammar>=0.1.11",
    "torch>=2.7.0",
    "vllm>=0.10.0",
-    "
-    "
-    "
+    "ray[default]>=2.50.0",
+    "cupy-cuda12x==12.1.0",
+    "flashinfer-python>=0.4.0",
+    "sglang>=0.5.0",
 ]
 
 [project.scripts]
{vec_inf-0.7.1 → vec_inf-0.7.3}/tests/vec_inf/cli/test_cli.py

@@ -39,6 +39,7 @@ def test_launch_command_success(runner):
        "mem_per_node": "32G",
        "model_weights_parent_dir": "/model-weights",
        "vocab_size": "128000",
+        "venv": "/path/to/venv",
        "vllm_args": {"max_model_len": 8192},
        "env": {"CACHE": "/cache"},
    }
@@ -134,7 +135,7 @@ def test_list_single_model(runner):
 
 
 def test_status_command(runner):
-    """Test status command."""
+    """Test status command with job ID argument."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
        mock_client = MagicMock()
        mock_client_class.return_value = mock_client
@@ -153,6 +154,111 @@ def test_status_command(runner):
        assert "Meta-Llama-3.1-8B" in result.output
 
 
+def test_status_command_no_job_id_no_running_jobs(runner):
+    """Test status command with no argument when no jobs are running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = []
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "No running jobs found." in result.output
+
+
+def test_status_command_no_job_id_single_running_job(runner):
+    """Test status command with no argument when one job is running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345"]
+
+        mock_status = MagicMock()
+        mock_status.model_name = "test-model-1"
+        mock_status.server_status = "READY"
+        mock_status.base_url = "http://localhost:8000"
+        mock_status.pending_reason = None
+        mock_status.failed_reason = None
+        mock_client.get_status.return_value = mock_status
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "test-model-1" in result.output
+        mock_client.fetch_running_jobs.assert_called_once()
+        mock_client.get_status.assert_called_once_with("12345")
+
+
+def test_status_command_no_job_id_multiple_running_jobs(runner):
+    """Test status command with no argument when multiple jobs are running."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]
+
+        mock_status_1 = MagicMock()
+        mock_status_1.model_name = "test-model-1"
+        mock_status_1.server_status = "READY"
+        mock_status_1.base_url = "http://localhost:8000"
+        mock_status_1.pending_reason = None
+        mock_status_1.failed_reason = None
+
+        mock_status_2 = MagicMock()
+        mock_status_2.model_name = "test-model-2"
+        mock_status_2.server_status = "PENDING"
+        mock_status_2.base_url = None
+        mock_status_2.pending_reason = "Waiting for resources"
+        mock_status_2.failed_reason = None
+
+        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]
+
+        result = runner.invoke(cli, ["status"])
+
+        assert result.exit_code == 0
+        assert "test-model-1" in result.output
+        assert "test-model-2" in result.output
+        assert "12345" in result.output
+        assert "67890" in result.output
+        mock_client.fetch_running_jobs.assert_called_once()
+        assert mock_client.get_status.call_count == 2
+
+
+def test_status_command_no_job_id_multiple_jobs_json_mode(runner):
+    """Test status command with no argument and JSON mode for multiple jobs."""
+    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class:
+        mock_client = MagicMock()
+        mock_client_class.return_value = mock_client
+        mock_client.fetch_running_jobs.return_value = ["12345", "67890"]
+
+        mock_status_1 = MagicMock()
+        mock_status_1.model_name = "test-model-1"
+        mock_status_1.server_status = "READY"
+        mock_status_1.base_url = "http://localhost:8000"
+        mock_status_1.pending_reason = None
+        mock_status_1.failed_reason = None
+
+        mock_status_2 = MagicMock()
+        mock_status_2.model_name = "test-model-2"
+        mock_status_2.server_status = "FAILED"
+        mock_status_2.base_url = None
+        mock_status_2.pending_reason = None
+        mock_status_2.failed_reason = "Out of memory"
+
+        mock_client.get_status.side_effect = [mock_status_1, mock_status_2]
+
+        result = runner.invoke(cli, ["status", "--json-mode"])
+
+        assert result.exit_code == 0
+        output = json.loads(result.output)
+        assert isinstance(output, list)
+        assert len(output) == 2
+        assert output[0]["model_name"] == "test-model-1"
+        assert output[0]["model_status"] == "READY"
+        assert output[1]["model_name"] == "test-model-2"
+        assert output[1]["model_status"] == "FAILED"
+
+
 def test_shutdown_command(runner):
    """Test shutdown command."""
    with patch("vec_inf.cli._cli.VecInfClient") as mock_client_class: