vec-inf 0.7.0.tar.gz → 0.7.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/code_checks.yml +5 -3
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docker.yml +1 -1
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docs.yml +9 -9
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/publish.yml +2 -2
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/unit_tests.yml +7 -7
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.pre-commit-config.yaml +2 -2
- {vec_inf-0.7.0 → vec_inf-0.7.2}/Dockerfile +12 -8
- {vec_inf-0.7.0 → vec_inf-0.7.2}/MODEL_TRACKING.md +8 -2
- vec_inf-0.7.0/README.md → vec_inf-0.7.2/PKG-INFO +44 -3
- vec_inf-0.7.0/PKG-INFO → vec_inf-0.7.2/README.md +21 -25
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/user_guide.md +3 -3
- {vec_inf-0.7.0 → vec_inf-0.7.2}/pyproject.toml +5 -4
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_cli.py +1 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_helper.py +2 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_slurm_script_generator.py +7 -8
- vec_inf-0.7.2/uv.lock +6357 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_cli.py +15 -1
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_helper.py +44 -19
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_client_vars.py +0 -7
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_helper.py +66 -26
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_script_generator.py +36 -19
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_templates.py +20 -3
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_vars.py +4 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_utils.py +56 -7
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/api.py +8 -2
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/models.py +6 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/environment.yaml +4 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/models.yaml +48 -99
- vec_inf-0.7.0/uv.lock +0 -5260
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/model-request.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/dependabot.yml +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/pull_request_template.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.gitignore +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/.python-version +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/LICENSE +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/codecov.yml +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/Makefile +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/api.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/favicon-48x48.svg +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/favicon.ico +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/vector-logo.svg +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/contributing.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/index.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/make.bat +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/overrides/partials/copyright.html +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/overrides/partials/logo.html +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/stylesheets/extra.css +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/README.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/api/basic_usage.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/chat_completions.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/completions.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/completions.sh +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/text_embedding/embeddings.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/vlm/vision_completions.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/logits/logits.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/README.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/downstream_job.sbatch +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/run_downstream.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/run_workflow.sh +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/mkdocs.yml +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/profile/avg_throughput.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/profile/gen.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/test_imports.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_utils.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_api.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_examples.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_helper.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_models.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_utils.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_vars.env +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/README.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_utils.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_vars.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/__init__.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_exceptions.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/config.py +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/README.md +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/find_port.sh +0 -0
- {vec_inf-0.7.0 → vec_inf-0.7.2}/venv.sh +0 -0
{vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/code_checks.yml

````diff
@@ -30,17 +30,17 @@ jobs:
     steps:
       - uses: actions/checkout@v5.0.0
       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           # Install a specific version of uv.
           version: "0.5.21"
           enable-cache: true
       - name: "Set up Python"
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version-file: ".python-version"
       - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow
       - name: Install dependencies and check code
         run: |
           source .venv/bin/activate
@@ -49,3 +49,5 @@ jobs:
         uses: pypa/gh-action-pip-audit@v1.1.0
         with:
           virtual-environment: .venv/
+          # Temporary: ignore pip advisory until fixed in pip>=25.3
+          ignore-vulns: GHSA-4xh5-x5gv-qwph
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docker.yml

````diff
@@ -33,7 +33,7 @@
           echo "version=$VERSION" >> $GITHUB_OUTPUT

       - name: Log in to Docker Hub
-        uses: docker/login-action@
+        uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docs.yml

````diff
@@ -56,27 +56,27 @@
           fetch-depth: 0 # Fetch all history for proper versioning

       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.5.21"
           enable-cache: true

       - name: Set up Python
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version-file: ".python-version"

       - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --prerelease=allow

       - name: Build docs
-        run: uv run mkdocs build
+        run: uv run --frozen mkdocs build

       - name: Create .nojekyll file
         run: touch site/.nojekyll

       - name: Upload artifact
-        uses: actions/upload-artifact@
+        uses: actions/upload-artifact@v5
         with:
           name: docs-site
           path: site/
@@ -93,18 +93,18 @@
           fetch-depth: 0 # Fetch all history for proper versioning

       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.5.21"
           enable-cache: true

       - name: Set up Python
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version-file: ".python-version"

       - name: Install the project
-        run: uv sync --all-extras --group docs
+        run: uv sync --all-extras --group docs --frozen

       - name: Configure Git Credentials
         run: |
@@ -112,7 +112,7 @@
           git config user.email 41898282+github-actions[bot]@users.noreply.github.com

       - name: Download artifact
-        uses: actions/download-artifact@
+        uses: actions/download-artifact@v6
         with:
           name: docs-site
           path: site
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/publish.yml

````diff
@@ -16,12 +16,12 @@
       - uses: actions/checkout@v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           version: "0.6.6"
           enable-cache: true

-      - uses: actions/setup-python@
+      - uses: actions/setup-python@v6
         with:
           python-version: '3.10'

````
{vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/unit_tests.yml

````diff
@@ -46,40 +46,40 @@
       - uses: actions/checkout@v5.0.0

       - name: Install uv
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v7
         with:
           # Install a specific version of uv.
           version: "0.5.21"
           enable-cache: true

       - name: "Set up Python ${{ matrix.python-version }}"
-        uses: actions/setup-python@
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}

       - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --prerelease=allow

       - name: Install dependencies and check code
         run: |
-          uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
+          uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests

       - name: Install the core package only
         run: uv sync --no-dev

       - name: Run package import tests
         run: |
-          uv run pytest tests/test_imports.py
+          uv run --frozen pytest tests/test_imports.py

       - name: Import Codecov GPG public key
         run: |
           gpg --keyserver keyserver.ubuntu.com --recv-keys 806BB28AED779869

       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5.5.
+        uses: codecov/codecov-action@v5.5.1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-
+          files: ./coverage.xml
           name: codecov-umbrella
           fail_ci_if_error: true
           verbose: true
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/.pre-commit-config.yaml

````diff
@@ -17,7 +17,7 @@ repos:
   - id: check-toml

 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: 'v0.
+  rev: 'v0.14.3'
   hooks:
   - id: ruff
     args: [--fix, --exit-non-zero-on-fix]
@@ -26,7 +26,7 @@ repos:
     types_or: [python, jupyter]

 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v1.
+  rev: v1.18.2
   hooks:
   - id: mypy
     entry: python3 -m mypy --config-file pyproject.toml
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/Dockerfile

````diff
@@ -35,29 +35,33 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
     rm get-pip.py && \
     python3.10 -m pip install --upgrade pip setuptools wheel uv

-# Install
+# Install RDMA support
 RUN apt-get update && apt-get install -y \
     libibverbs1 libibverbs-dev ibverbs-utils \
     librdmacm1 librdmacm-dev rdmacm-utils \
+    rdma-core ibverbs-providers infiniband-diags perftest \
     && rm -rf /var/lib/apt/lists/*

 # Set up RDMA environment (these will persist in the final container)
 ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
-ENV UCX_NET_DEVICES=all
 ENV NCCL_IB_DISABLE=0
+ENV NCCL_SOCKET_IFNAME="^lo,docker0"
+ENV NCCL_NET_GDR_LEVEL=PHB
+ENV NCCL_IB_TIMEOUT=22
+ENV NCCL_IB_RETRY_CNT=7
+ENV NCCL_DEBUG=INFO

 # Set up project
 WORKDIR /vec-inf
 COPY . /vec-inf

 # Install project dependencies with build requirements
-RUN
+RUN uv pip install --system -e .[dev] --prerelease=allow

-#
-RUN
-
-
-ENV NCCL_DEBUG=INFO
+# Install a single, system NCCL (from NVIDIA CUDA repo in base image)
+RUN apt-get update && apt-get install -y --allow-change-held-packages\
+    libnccl2 libnccl-dev \
+    && rm -rf /var/lib/apt/lists/*

 # Set the default command to start an interactive shell
 CMD ["bash"]
````
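The new `ENV` lines above are plain process environment, so they can be sanity-checked from inside a running container before launching a server. A minimal sketch; the helper below is hypothetical, not part of the package, and the expected values are copied from the Dockerfile:

```python
import os

# Expected values copied from the 0.7.2 Dockerfile ENV lines above.
EXPECTED_NCCL_ENV = {
    "NCCL_IB_DISABLE": "0",
    "NCCL_SOCKET_IFNAME": "^lo,docker0",
    "NCCL_NET_GDR_LEVEL": "PHB",
    "NCCL_IB_TIMEOUT": "22",
    "NCCL_IB_RETRY_CNT": "7",
    "NCCL_DEBUG": "INFO",
}


def check_nccl_env() -> None:
    """Print each NCCL variable, flagging values that differ from the Dockerfile defaults."""
    for key, expected in EXPECTED_NCCL_ENV.items():
        actual = os.environ.get(key)
        note = "" if actual == expected else f"  <-- expected {expected!r}"
        print(f"{key}={actual!r}{note}")


if __name__ == "__main__":
    check_nccl_env()
```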
{vec_inf-0.7.0 → vec_inf-0.7.2}/MODEL_TRACKING.md

````diff
@@ -40,6 +40,7 @@ This document tracks all model weights available in the `/model-weights` directory
 | `gemma-2b-it` | ❌ |
 | `gemma-7b` | ❌ |
 | `gemma-7b-it` | ❌ |
+| `gemma-2-2b-it` | ✅ |
 | `gemma-2-9b` | ✅ |
 | `gemma-2-9b-it` | ✅ |
 | `gemma-2-27b` | ✅ |
@@ -165,8 +166,8 @@ This document tracks all model weights available in the `/model-weights` directory
 | Model | Configuration |
 |:------|:-------------|
 | `Qwen3-14B` | ✅ |
-| `Qwen3-8B` |
-| `Qwen3-32B` |
+| `Qwen3-8B` | ✅ |
+| `Qwen3-32B` | ✅ |
 | `Qwen3-235B-A22B` | ❌ |
 | `Qwen3-Embedding-8B` | ❌ |

@@ -186,6 +187,11 @@ This document tracks all model weights available in the `/model-weights` directory
 | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ |
 | `deepseek-math-7b-instruct` | ❌ |

+### OpenAI: GPT-OSS
+| Model | Configuration |
+|:------|:-------------|
+| `gpt-oss-120b` | ✅ |
+
 ### Other LLM Models
 | Model | Configuration |
 |:------|:-------------|
````
vec_inf-0.7.0/README.md → vec_inf-0.7.2/PKG-INFO

````diff
@@ -1,3 +1,26 @@
+Metadata-Version: 2.4
+Name: vec-inf
+Version: 0.7.2
+Summary: Efficient LLM inference on Slurm clusters using vLLM.
+Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: click>=8.1.0
+Requires-Dist: pydantic>=2.10.6
+Requires-Dist: pyyaml>=6.0.2
+Requires-Dist: requests>=2.31.0
+Requires-Dist: rich>=13.7.0
+Provides-Extra: dev
+Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
+Requires-Dist: flashinfer-python>=0.4.0; extra == 'dev'
+Requires-Dist: ray[default]>=2.50.0; extra == 'dev'
+Requires-Dist: sglang>=0.5.0; extra == 'dev'
+Requires-Dist: torch>=2.7.0; extra == 'dev'
+Requires-Dist: vllm>=0.10.0; extra == 'dev'
+Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
+Description-Content-Type: text/markdown
+
 # Vector Inference: Easy inference on Slurm clusters

 ----------------------------------------------------
@@ -44,7 +67,7 @@ You should see an output like the following:

 <img width="720" alt="launch_image" src="https://github.com/user-attachments/assets/c1e0c60c-cf7a-49ed-a426-fdb38ebf88ee" />

-**NOTE**: On Vector Killarney Cluster environment, the
+**NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
 * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
 * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.
@@ -74,6 +97,11 @@ Example:
 >>> status = client.get_status(job_id)
 >>> if status.status == ModelStatus.READY:
 ...     print(f"Model is ready at {status.base_url}")
+>>> # Alternatively, use wait_until_ready which will either return a StatusResponse or throw a ServerError
+>>> try:
+>>>     status = wait_until_ready(job_id)
+>>> except ServerError as e:
+>>>     print(f"Model launch failed: {e}")
 >>> client.shutdown_model(job_id)
 ```
@@ -124,6 +152,19 @@ Once the inference server is ready, you can start sending in inference requests.
 ## SSH tunnel from your local device
 If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
 ```bash
-ssh -L 8081:
+ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
+```
+The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example).
+
+## Reference
+If you found Vector Inference useful in your research or applications, please cite using the following BibTeX template:
+```
+@software{vector_inference,
+  title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
+  author = {Wang, Marshall},
+  organization = {Vector Institute},
+  year = {<YEAR_OF_RELEASE>},
+  version = {<VERSION_TAG>},
+  url = {https://github.com/VectorInstitute/vector-inference}
+}
 ```
-Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment
````
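With an SSH tunnel like the one in the snippet above open, the inference server answers on `localhost:8081`. A short sketch of a request through the tunnel, assuming the launched server exposes vLLM's OpenAI-compatible REST API; the model name and prompt are placeholders:

```python
import requests

# The SSH tunnel forwards localhost:8081 to port 8081 on the compute node.
BASE_URL = "http://localhost:8081/v1"

resp = requests.post(
    f"{BASE_URL}/completions",
    json={
        "model": "Meta-Llama-3.1-8B-Instruct",  # placeholder: your model's served name
        "prompt": "What is the capital of Canada?",
        "max_tokens": 32,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```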
vec_inf-0.7.0/PKG-INFO → vec_inf-0.7.2/README.md

````diff
@@ -1,25 +1,3 @@
-Metadata-Version: 2.4
-Name: vec-inf
-Version: 0.7.0
-Summary: Efficient LLM inference on Slurm clusters using vLLM.
-Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
-License-Expression: MIT
-License-File: LICENSE
-Requires-Python: >=3.10
-Requires-Dist: click>=8.1.0
-Requires-Dist: pydantic>=2.10.6
-Requires-Dist: pyyaml>=6.0.2
-Requires-Dist: requests>=2.31.0
-Requires-Dist: rich>=13.7.0
-Provides-Extra: dev
-Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
-Requires-Dist: ray>=2.40.0; extra == 'dev'
-Requires-Dist: torch>=2.7.0; extra == 'dev'
-Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
-Requires-Dist: vllm>=0.10.0; extra == 'dev'
-Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
-Description-Content-Type: text/markdown
-
 # Vector Inference: Easy inference on Slurm clusters

 ----------------------------------------------------
@@ -66,7 +44,7 @@ You should see an output like the following:

 <img width="720" alt="launch_image" src="https://github.com/user-attachments/assets/c1e0c60c-cf7a-49ed-a426-fdb38ebf88ee" />

-**NOTE**: On Vector Killarney Cluster environment, the
+**NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
 * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
 * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.
@@ -96,6 +74,11 @@ Example:
 >>> status = client.get_status(job_id)
 >>> if status.status == ModelStatus.READY:
 ...     print(f"Model is ready at {status.base_url}")
+>>> # Alternatively, use wait_until_ready which will either return a StatusResponse or throw a ServerError
+>>> try:
+>>>     status = wait_until_ready(job_id)
+>>> except ServerError as e:
+>>>     print(f"Model launch failed: {e}")
 >>> client.shutdown_model(job_id)
 ```
@@ -146,6 +129,19 @@ Once the inference server is ready, you can start sending in inference requests.
 ## SSH tunnel from your local device
 If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
 ```bash
-ssh -L 8081:
+ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
+```
+The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example).
+
+## Reference
+If you found Vector Inference useful in your research or applications, please cite using the following BibTeX template:
+```
+@software{vector_inference,
+  title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
+  author = {Wang, Marshall},
+  organization = {Vector Institute},
+  year = {<YEAR_OF_RELEASE>},
+  version = {<VERSION_TAG>},
+  url = {https://github.com/VectorInstitute/vector-inference}
+}
 ```
-Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment
````
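The `wait_until_ready` addition in the README snippet above replaces hand-rolled polling of `get_status`. A hedged sketch of the full launch, wait, and shutdown flow; the `vec_inf.client` import path, the `launch_model` method, the `slurm_job_id` field on its response, and `wait_until_ready` living on the client (the snippet calls it bare) are assumptions, not confirmed by this diff:

```python
# Hedged sketch of the launch -> wait -> shutdown flow shown in the README.
# Assumptions (not confirmed by this diff): the vec_inf.client import path,
# client.launch_model(), response.slurm_job_id, and client.wait_until_ready().
from vec_inf.client import ServerError, VecInfClient

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")  # illustrative model name
job_id = response.slurm_job_id

try:
    # Blocks until the server reports READY, or raises ServerError on launch failure.
    status = client.wait_until_ready(job_id)
    print(f"Model is ready at {status.base_url}")
except ServerError as e:
    print(f"Model launch failed: {e}")
finally:
    client.shutdown_model(job_id)
```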
{vec_inf-0.7.0 → vec_inf-0.7.2}/docs/user_guide.md

````diff
@@ -37,7 +37,7 @@ You should see an output like the following:
 └─────────────────────────┴───────────────────────────────────────────┘
 ```

-**NOTE**: On Vector Killarney Cluster environment, the
+**NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
 * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
 * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.

@@ -334,9 +334,9 @@ Once the inference server is ready, you can start sending in inference requests.

 If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
 ```bash
-ssh -L 8081:
+ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
 ```
-
+The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example). Similarly, for Bon Echo it's `172.17.8.XX`, where `XX` is from `gpuXX`.

 ## Python API Usage

````
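The NOTE above maps the required `--account` and `--work-dir` arguments to the `VEC_INF_ACCOUNT` and `VEC_INF_WORK_DIR` environment variables. A tiny sketch of setting those defaults before launching; the values are placeholders:

```python
import os

# Placeholders: point these at your Slurm account and a non-home working
# directory so --account / --work-dir can be omitted from each launch.
os.environ["VEC_INF_ACCOUNT"] = "my-slurm-account"
os.environ["VEC_INF_WORK_DIR"] = "/scratch/my-user/vec-inf"
```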
{vec_inf-0.7.0 → vec_inf-0.7.2}/pyproject.toml

````diff
@@ -1,6 +1,6 @@
 [project]
 name = "vec-inf"
-version = "0.7.
+version = "0.7.2"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 readme = "README.md"
 authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
@@ -42,9 +42,10 @@ dev = [
     "xgrammar>=0.1.11",
     "torch>=2.7.0",
     "vllm>=0.10.0",
-    "
-    "
-    "
+    "ray[default]>=2.50.0",
+    "cupy-cuda12x==12.1.0",
+    "flashinfer-python>=0.4.0",
+    "sglang>=0.5.0",
 ]

 [project.scripts]
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_helper.py

````diff
@@ -35,6 +35,7 @@ class TestLaunchResponseFormatter:
             "mem_per_node": "32G",
             "model_weights_parent_dir": "/model-weights",
             "log_dir": "/tmp/logs",
+            "venv": "/path/to/venv",
             "vllm_args": {"max_model_len": 8192, "enable_prefix_caching": True},
             "env": {"CACHE": "/cache"},
         }
@@ -63,6 +64,7 @@ class TestLaunchResponseFormatter:
             "mem_per_node": "16G",
             "model_weights_parent_dir": "/weights",
             "log_dir": "/logs",
+            "venv": "/path/to/venv",
             "vllm_args": {},
             "env": {},
         }
````
{vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_slurm_script_generator.py

````diff
@@ -53,7 +53,7 @@ class TestSlurmScriptGenerator:
         singularity = basic_params.copy()
         singularity.update(
             {
-                "venv": "
+                "venv": "apptainer",
                 "bind": "/scratch:/scratch,/data:/data",
                 "env": {
                     "CACHE_DIR": "/cache",
@@ -109,7 +109,7 @@ class TestSlurmScriptGenerator:
     def test_init_singularity_no_bind(self, basic_params):
         """Test Singularity initialization without additional binds."""
         params = basic_params.copy()
-        params["venv"] = "
+        params["venv"] = "apptainer"
         generator = SlurmScriptGenerator(params)

         assert generator.params == params
@@ -173,7 +173,6 @@ class TestSlurmScriptGenerator:
         generator = SlurmScriptGenerator(basic_params)
         launch_cmd = generator._generate_launch_cmd()

-        assert "source /path/to/venv/bin/activate" in launch_cmd
         assert "vllm serve /path/to/model_weights/test-model" in launch_cmd
         assert "--served-model-name test-model" in launch_cmd
         assert "--tensor-parallel-size 4" in launch_cmd
@@ -185,7 +184,7 @@ class TestSlurmScriptGenerator:
         generator = SlurmScriptGenerator(singularity_params)
         launch_cmd = generator._generate_launch_cmd()

-        assert "exec --nv" in launch_cmd
+        assert "apptainer exec --nv" in launch_cmd
         assert "--bind /path/to/model_weights/test-model" in launch_cmd
         assert "--bind /scratch:/scratch,/data:/data" in launch_cmd
         assert "source" not in launch_cmd
@@ -306,9 +305,9 @@ class TestBatchSlurmScriptGenerator:
     def batch_singularity_params(self, batch_params):
         """Generate batch SLURM configuration parameters with Singularity."""
         singularity_params = batch_params.copy()
-        singularity_params["venv"] = "
+        singularity_params["venv"] = "apptainer"  # Set top-level venv to apptainer
         for model_name in singularity_params["models"]:
-            singularity_params["models"][model_name]["venv"] = "
+            singularity_params["models"][model_name]["venv"] = "apptainer"
             singularity_params["models"][model_name]["bind"] = (
                 "/scratch:/scratch,/data:/data"
             )
@@ -341,9 +340,9 @@ class TestBatchSlurmScriptGenerator:
     def test_init_singularity_no_bind(self, batch_params):
         """Test Singularity initialization without additional binds."""
         params = batch_params.copy()
-        params["venv"] = "
+        params["venv"] = "apptainer"  # Set top-level venv to apptainer
         for model_name in params["models"]:
-            params["models"][model_name]["venv"] = "
+            params["models"][model_name]["venv"] = "apptainer"

         generator = BatchSlurmScriptGenerator(params)
````