vec-inf 0.7.0.tar.gz → 0.7.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/code_checks.yml +5 -3
  2. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docker.yml +1 -1
  3. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/docs.yml +9 -9
  4. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/publish.yml +2 -2
  5. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/workflows/unit_tests.yml +7 -7
  6. {vec_inf-0.7.0 → vec_inf-0.7.2}/.pre-commit-config.yaml +2 -2
  7. {vec_inf-0.7.0 → vec_inf-0.7.2}/Dockerfile +12 -8
  8. {vec_inf-0.7.0 → vec_inf-0.7.2}/MODEL_TRACKING.md +8 -2
  9. vec_inf-0.7.0/README.md → vec_inf-0.7.2/PKG-INFO +44 -3
  10. vec_inf-0.7.0/PKG-INFO → vec_inf-0.7.2/README.md +21 -25
  11. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/user_guide.md +3 -3
  12. {vec_inf-0.7.0 → vec_inf-0.7.2}/pyproject.toml +5 -4
  13. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_cli.py +1 -0
  14. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_helper.py +2 -0
  15. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_slurm_script_generator.py +7 -8
  16. vec_inf-0.7.2/uv.lock +6357 -0
  17. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_cli.py +15 -1
  18. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_helper.py +44 -19
  19. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_client_vars.py +0 -7
  20. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_helper.py +66 -26
  21. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_script_generator.py +36 -19
  22. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_templates.py +20 -3
  23. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_slurm_vars.py +4 -0
  24. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_utils.py +56 -7
  25. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/api.py +8 -2
  26. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/models.py +6 -0
  27. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/environment.yaml +4 -0
  28. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/models.yaml +48 -99
  29. vec_inf-0.7.0/uv.lock +0 -5260
  30. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  31. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  32. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  33. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/ISSUE_TEMPLATE/model-request.md +0 -0
  34. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/dependabot.yml +0 -0
  35. {vec_inf-0.7.0 → vec_inf-0.7.2}/.github/pull_request_template.md +0 -0
  36. {vec_inf-0.7.0 → vec_inf-0.7.2}/.gitignore +0 -0
  37. {vec_inf-0.7.0 → vec_inf-0.7.2}/.python-version +0 -0
  38. {vec_inf-0.7.0 → vec_inf-0.7.2}/LICENSE +0 -0
  39. {vec_inf-0.7.0 → vec_inf-0.7.2}/codecov.yml +0 -0
  40. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/Makefile +0 -0
  41. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/api.md +0 -0
  42. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/favicon-48x48.svg +0 -0
  43. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/favicon.ico +0 -0
  44. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/assets/vector-logo.svg +0 -0
  45. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/contributing.md +0 -0
  46. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/index.md +0 -0
  47. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/make.bat +0 -0
  48. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/overrides/partials/copyright.html +0 -0
  49. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/overrides/partials/logo.html +0 -0
  50. {vec_inf-0.7.0 → vec_inf-0.7.2}/docs/stylesheets/extra.css +0 -0
  51. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/README.md +0 -0
  52. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/api/basic_usage.py +0 -0
  53. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/chat_completions.py +0 -0
  54. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/completions.py +0 -0
  55. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/llm/completions.sh +0 -0
  56. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/text_embedding/embeddings.py +0 -0
  57. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/inference/vlm/vision_completions.py +0 -0
  58. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/logits/logits.py +0 -0
  59. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/README.md +0 -0
  60. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/downstream_job.sbatch +0 -0
  61. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/run_downstream.py +0 -0
  62. {vec_inf-0.7.0 → vec_inf-0.7.2}/examples/slurm_dependency/run_workflow.sh +0 -0
  63. {vec_inf-0.7.0 → vec_inf-0.7.2}/mkdocs.yml +0 -0
  64. {vec_inf-0.7.0 → vec_inf-0.7.2}/profile/avg_throughput.py +0 -0
  65. {vec_inf-0.7.0 → vec_inf-0.7.2}/profile/gen.py +0 -0
  66. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/__init__.py +0 -0
  67. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/test_imports.py +0 -0
  68. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/__init__.py +0 -0
  69. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/__init__.py +0 -0
  70. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/cli/test_utils.py +0 -0
  71. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/__init__.py +0 -0
  72. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_api.py +0 -0
  73. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_examples.py +0 -0
  74. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_helper.py +0 -0
  75. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_models.py +0 -0
  76. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_utils.py +0 -0
  77. {vec_inf-0.7.0 → vec_inf-0.7.2}/tests/vec_inf/client/test_vars.env +0 -0
  78. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/README.md +0 -0
  79. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/__init__.py +0 -0
  80. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/__init__.py +0 -0
  81. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_utils.py +0 -0
  82. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/cli/_vars.py +0 -0
  83. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/__init__.py +0 -0
  84. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/_exceptions.py +0 -0
  85. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/client/config.py +0 -0
  86. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/config/README.md +0 -0
  87. {vec_inf-0.7.0 → vec_inf-0.7.2}/vec_inf/find_port.sh +0 -0
  88. {vec_inf-0.7.0 → vec_inf-0.7.2}/venv.sh +0 -0
.github/workflows/code_checks.yml

@@ -30,17 +30,17 @@ jobs:
  steps:
  - uses: actions/checkout@v5.0.0
  - name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v7
  with:
  # Install a specific version of uv.
  version: "0.5.21"
  enable-cache: true
  - name: "Set up Python"
- uses: actions/setup-python@v5.5.0
+ uses: actions/setup-python@v6
  with:
  python-version-file: ".python-version"
  - name: Install the project
- run: uv sync --dev
+ run: uv sync --dev --prerelease=allow
  - name: Install dependencies and check code
  run: |
  source .venv/bin/activate
@@ -49,3 +49,5 @@ jobs:
  uses: pypa/gh-action-pip-audit@v1.1.0
  with:
  virtual-environment: .venv/
+ # Temporary: ignore pip advisory until fixed in pip>=25.3
+ ignore-vulns: GHSA-4xh5-x5gv-qwph
.github/workflows/docker.yml

@@ -33,7 +33,7 @@ jobs:
  echo "version=$VERSION" >> $GITHUB_OUTPUT

  - name: Log in to Docker Hub
- uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1
+ uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
  with:
  username: ${{ secrets.DOCKER_USERNAME }}
  password: ${{ secrets.DOCKER_PASSWORD }}
.github/workflows/docs.yml

@@ -56,27 +56,27 @@ jobs:
  fetch-depth: 0 # Fetch all history for proper versioning

  - name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v7
  with:
  version: "0.5.21"
  enable-cache: true

  - name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
  with:
  python-version-file: ".python-version"

  - name: Install the project
- run: uv sync --all-extras --group docs
+ run: uv sync --all-extras --group docs --prerelease=allow

  - name: Build docs
- run: uv run mkdocs build
+ run: uv run --frozen mkdocs build

  - name: Create .nojekyll file
  run: touch site/.nojekyll

  - name: Upload artifact
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v5
  with:
  name: docs-site
  path: site/
@@ -93,18 +93,18 @@ jobs:
  fetch-depth: 0 # Fetch all history for proper versioning

  - name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v7
  with:
  version: "0.5.21"
  enable-cache: true

  - name: Set up Python
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
  with:
  python-version-file: ".python-version"

  - name: Install the project
- run: uv sync --all-extras --group docs
+ run: uv sync --all-extras --group docs --frozen

  - name: Configure Git Credentials
  run: |
@@ -112,7 +112,7 @@ jobs:
  git config user.email 41898282+github-actions[bot]@users.noreply.github.com

  - name: Download artifact
- uses: actions/download-artifact@v5
+ uses: actions/download-artifact@v6
  with:
  name: docs-site
  path: site
.github/workflows/publish.yml

@@ -16,12 +16,12 @@ jobs:
  - uses: actions/checkout@v5.0.0

  - name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v7
  with:
  version: "0.6.6"
  enable-cache: true

- - uses: actions/setup-python@v5.5.0
+ - uses: actions/setup-python@v6
  with:
  python-version: '3.10'

.github/workflows/unit_tests.yml

@@ -46,40 +46,40 @@ jobs:
  - uses: actions/checkout@v5.0.0

  - name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v7
  with:
  # Install a specific version of uv.
  version: "0.5.21"
  enable-cache: true

  - name: "Set up Python ${{ matrix.python-version }}"
- uses: actions/setup-python@v5.5.0
+ uses: actions/setup-python@v6
  with:
  python-version: ${{ matrix.python-version }}

  - name: Install the project
- run: uv sync --dev
+ run: uv sync --dev --prerelease=allow

  - name: Install dependencies and check code
  run: |
- uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests
+ uv run --frozen pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests

  - name: Install the core package only
  run: uv sync --no-dev

  - name: Run package import tests
  run: |
- uv run pytest tests/test_imports.py
+ uv run --frozen pytest tests/test_imports.py

  - name: Import Codecov GPG public key
  run: |
  gpg --keyserver keyserver.ubuntu.com --recv-keys 806BB28AED779869

  - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5.5.0
+ uses: codecov/codecov-action@v5.5.1
  with:
  token: ${{ secrets.CODECOV_TOKEN }}
- file: ./coverage.xml
+ files: ./coverage.xml
  name: codecov-umbrella
  fail_ci_if_error: true
  verbose: true
.pre-commit-config.yaml

@@ -17,7 +17,7 @@ repos:
  - id: check-toml

  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: 'v0.12.10'
+ rev: 'v0.14.3'
  hooks:
  - id: ruff
  args: [--fix, --exit-non-zero-on-fix]
@@ -26,7 +26,7 @@ repos:
  types_or: [python, jupyter]

  - repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.17.1
+ rev: v1.18.2
  hooks:
  - id: mypy
  entry: python3 -m mypy --config-file pyproject.toml
Dockerfile

@@ -35,29 +35,33 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \
  rm get-pip.py && \
  python3.10 -m pip install --upgrade pip setuptools wheel uv

- # Install Infiniband/RDMA support
+ # Install RDMA support
  RUN apt-get update && apt-get install -y \
  libibverbs1 libibverbs-dev ibverbs-utils \
  librdmacm1 librdmacm-dev rdmacm-utils \
+ rdma-core ibverbs-providers infiniband-diags perftest \
  && rm -rf /var/lib/apt/lists/*

  # Set up RDMA environment (these will persist in the final container)
  ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"
- ENV UCX_NET_DEVICES=all
  ENV NCCL_IB_DISABLE=0
+ ENV NCCL_SOCKET_IFNAME="^lo,docker0"
+ ENV NCCL_NET_GDR_LEVEL=PHB
+ ENV NCCL_IB_TIMEOUT=22
+ ENV NCCL_IB_RETRY_CNT=7
+ ENV NCCL_DEBUG=INFO

  # Set up project
  WORKDIR /vec-inf
  COPY . /vec-inf

  # Install project dependencies with build requirements
- RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu128" uv pip install --system -e .[dev]
+ RUN uv pip install --system -e .[dev] --prerelease=allow

- # Final configuration
- RUN mkdir -p /vec-inf/nccl && \
- mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /vec-inf/nccl/libnccl.so.2.18.1
- ENV VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
- ENV NCCL_DEBUG=INFO
+ # Install a single, system NCCL (from NVIDIA CUDA repo in base image)
+ RUN apt-get update && apt-get install -y --allow-change-held-packages\
+ libnccl2 libnccl-dev \
+ && rm -rf /var/lib/apt/lists/*

  # Set the default command to start an interactive shell
  CMD ["bash"]
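
The updated image replaces the vendored vLLM NCCL shared object with the system NCCL packages and pins NCCL behaviour through environment variables instead. A minimal sketch (not part of the package) for confirming those settings from inside a running container; the variable names are taken directly from the ENV lines in the diff above:

```python
import os

# NCCL variables baked into the updated image (names from the Dockerfile diff above).
nccl_vars = [
    "NCCL_IB_DISABLE",
    "NCCL_SOCKET_IFNAME",
    "NCCL_NET_GDR_LEVEL",
    "NCCL_IB_TIMEOUT",
    "NCCL_IB_RETRY_CNT",
    "NCCL_DEBUG",
]

for name in nccl_vars:
    # Prints e.g. "NCCL_IB_DISABLE=0"; "<unset>" means the image did not set it.
    print(f"{name}={os.environ.get(name, '<unset>')}")
```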
MODEL_TRACKING.md

@@ -40,6 +40,7 @@ This document tracks all model weights available in the `/model-weights` directo
  | `gemma-2b-it` | ❌ |
  | `gemma-7b` | ❌ |
  | `gemma-7b-it` | ❌ |
+ | `gemma-2-2b-it` | ✅ |
  | `gemma-2-9b` | ✅ |
  | `gemma-2-9b-it` | ✅ |
  | `gemma-2-27b` | ✅ |
@@ -165,8 +166,8 @@ This document tracks all model weights available in the `/model-weights` directo
  | Model | Configuration |
  |:------|:-------------|
  | `Qwen3-14B` | ✅ |
- | `Qwen3-8B` | |
- | `Qwen3-32B` | |
+ | `Qwen3-8B` | |
+ | `Qwen3-32B` | |
  | `Qwen3-235B-A22B` | ❌ |
  | `Qwen3-Embedding-8B` | ❌ |

@@ -186,6 +187,11 @@ This document tracks all model weights available in the `/model-weights` directo
  | `DeepSeek-Coder-V2-Lite-Instruct` | ❌ |
  | `deepseek-math-7b-instruct` | ❌ |

+ ### OpenAI: GPT-OSS
+ | Model | Configuration |
+ |:------|:-------------|
+ | `gpt-oss-120b` | ✅ |
+
  ### Other LLM Models
  | Model | Configuration |
  |:------|:-------------|
vec_inf-0.7.0/README.md → vec_inf-0.7.2/PKG-INFO

@@ -1,3 +1,26 @@
+ Metadata-Version: 2.4
+ Name: vec-inf
+ Version: 0.7.2
+ Summary: Efficient LLM inference on Slurm clusters using vLLM.
+ Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: >=3.10
+ Requires-Dist: click>=8.1.0
+ Requires-Dist: pydantic>=2.10.6
+ Requires-Dist: pyyaml>=6.0.2
+ Requires-Dist: requests>=2.31.0
+ Requires-Dist: rich>=13.7.0
+ Provides-Extra: dev
+ Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
+ Requires-Dist: flashinfer-python>=0.4.0; extra == 'dev'
+ Requires-Dist: ray[default]>=2.50.0; extra == 'dev'
+ Requires-Dist: sglang>=0.5.0; extra == 'dev'
+ Requires-Dist: torch>=2.7.0; extra == 'dev'
+ Requires-Dist: vllm>=0.10.0; extra == 'dev'
+ Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
+ Description-Content-Type: text/markdown
+
  # Vector Inference: Easy inference on Slurm clusters

  ----------------------------------------------------
@@ -44,7 +67,7 @@ You should see an output like the following:

  <img width="720" alt="launch_image" src="https://github.com/user-attachments/assets/c1e0c60c-cf7a-49ed-a426-fdb38ebf88ee" />

- **NOTE**: On Vector Killarney Cluster environment, the following fields are required:
+ **NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
  * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
  * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.

@@ -74,6 +97,11 @@ Example:
  >>> status = client.get_status(job_id)
  >>> if status.status == ModelStatus.READY:
  ... print(f"Model is ready at {status.base_url}")
+ >>> # Alternatively, use wait_until_ready which will either return a StatusResponse or throw a ServerError
+ >>> try:
+ >>> status = wait_until_ready(job_id)
+ >>> except ServerError as e:
+ >>> print(f"Model launch failed: {e}")
  >>> client.shutdown_model(job_id)
  ```

@@ -124,6 +152,19 @@ Once the inference server is ready, you can start sending in inference requests.
  ## SSH tunnel from your local device
  If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
  ```bash
- ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
+ ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
+ ```
+ The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example).
+
+ ## Reference
+ If you found Vector Inference useful in your research or applications, please cite using the following BibTeX template:
+ ```
+ @software{vector_inference,
+ title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
+ author = {Wang, Marshall},
+ organization = {Vector Institute},
+ year = {<YEAR_OF_RELEASE>},
+ version = {<VERSION_TAG>},
+ url = {https://github.com/VectorInstitute/vector-inference}
+ }
  ```
- Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment
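
For context on the `wait_until_ready` addition above, here is a self-contained sketch of the full launch/wait/shutdown flow. The `VecInfClient` import path matches the package layout (`vec_inf/client/api.py`), but the `launch_model` call, the `slurm_job_id` attribute, the model name, and `wait_until_ready` being a client method are assumptions inferred from the README excerpt, not confirmed by this diff:

```python
# Hypothetical usage sketch; names not shown in this diff are assumptions.
from vec_inf.client import VecInfClient

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")  # assumed model name
job_id = response.slurm_job_id  # assumed attribute on the launch response

try:
    # Blocks until the server is READY; raises ServerError if the launch fails.
    status = client.wait_until_ready(job_id)
    print(f"Model is ready at {status.base_url}")
finally:
    # Always release the Slurm allocation, even if the launch failed.
    client.shutdown_model(job_id)
```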
vec_inf-0.7.0/PKG-INFO → vec_inf-0.7.2/README.md

@@ -1,25 +1,3 @@
- Metadata-Version: 2.4
- Name: vec-inf
- Version: 0.7.0
- Summary: Efficient LLM inference on Slurm clusters using vLLM.
- Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
- License-Expression: MIT
- License-File: LICENSE
- Requires-Python: >=3.10
- Requires-Dist: click>=8.1.0
- Requires-Dist: pydantic>=2.10.6
- Requires-Dist: pyyaml>=6.0.2
- Requires-Dist: requests>=2.31.0
- Requires-Dist: rich>=13.7.0
- Provides-Extra: dev
- Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
- Requires-Dist: ray>=2.40.0; extra == 'dev'
- Requires-Dist: torch>=2.7.0; extra == 'dev'
- Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
- Requires-Dist: vllm>=0.10.0; extra == 'dev'
- Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
- Description-Content-Type: text/markdown
-
  # Vector Inference: Easy inference on Slurm clusters

  ----------------------------------------------------
@@ -66,7 +44,7 @@ You should see an output like the following:

  <img width="720" alt="launch_image" src="https://github.com/user-attachments/assets/c1e0c60c-cf7a-49ed-a426-fdb38ebf88ee" />

- **NOTE**: On Vector Killarney Cluster environment, the following fields are required:
+ **NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
  * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
  * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.

@@ -96,6 +74,11 @@ Example:
  >>> status = client.get_status(job_id)
  >>> if status.status == ModelStatus.READY:
  ... print(f"Model is ready at {status.base_url}")
+ >>> # Alternatively, use wait_until_ready which will either return a StatusResponse or throw a ServerError
+ >>> try:
+ >>> status = wait_until_ready(job_id)
+ >>> except ServerError as e:
+ >>> print(f"Model launch failed: {e}")
  >>> client.shutdown_model(job_id)
  ```

@@ -146,6 +129,19 @@ Once the inference server is ready, you can start sending in inference requests.
  ## SSH tunnel from your local device
  If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
  ```bash
- ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
+ ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
+ ```
+ The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example).
+
+ ## Reference
+ If you found Vector Inference useful in your research or applications, please cite using the following BibTeX template:
+ ```
+ @software{vector_inference,
+ title = {Vector Inference: Efficient LLM inference on Slurm clusters using vLLM},
+ author = {Wang, Marshall},
+ organization = {Vector Institute},
+ year = {<YEAR_OF_RELEASE>},
+ version = {<VERSION_TAG>},
+ url = {https://github.com/VectorInstitute/vector-inference}
+ }
  ```
- Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment
docs/user_guide.md

@@ -37,7 +37,7 @@ You should see an output like the following:
  └─────────────────────────┴───────────────────────────────────────────┘
  ```

- **NOTE**: On Vector Killarney Cluster environment, the following fields are required:
+ **NOTE**: You can set the required fields in the environment configuration (`environment.yaml`), it's a mapping between required arguments and their corresponding environment variables. On the Vector **Killarney** Cluster environment, the required fields are:
  * `--account`, `-A`: The Slurm account, this argument can be set to default by setting environment variable `VEC_INF_ACCOUNT`.
  * `--work-dir`, `-D`: A working directory other than your home directory, this argument can be set to default by seeting environment variable `VEC_INF_WORK_DIR`.

@@ -334,9 +334,9 @@ Once the inference server is ready, you can start sending in inference requests.

  If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
  ```bash
- ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
+ ssh -L 8081:10.1.1.29:8081 username@v.vectorinstitute.ai -N
  ```
- Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment
+ The example provided above is for the Vector Killarney cluster, change the variables accordingly for your environment. The IP address for the compute nodes on Killarney follow `10.1.1.XX` pattern, where `XX` is the GPU number (`kn029` -> `29` in this example). Similarly, for Bon Echo it's `172.17.8.XX`, where `XX` is from `gpuXX`.

  ## Python API Usage

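As an aside on the node-to-IP patterns documented above, here is a small illustrative helper (not part of vec-inf) that derives the tunnel command from a compute-node name. The `kn`/`gpu` naming prefixes and the two IP patterns are taken from the examples in the diff; everything else is an assumption:

```python
def tunnel_command(node: str, port: int = 8081, user: str = "username") -> str:
    """Build the ssh tunnel command for a given compute node (illustrative only)."""
    if node.startswith("kn"):        # Killarney: knXX -> 10.1.1.XX
        ip = f"10.1.1.{int(node[2:])}"
    elif node.startswith("gpu"):     # Bon Echo: gpuXX -> 172.17.8.XX
        ip = f"172.17.8.{int(node[3:])}"
    else:
        raise ValueError(f"Unknown node naming scheme: {node}")
    return f"ssh -L {port}:{ip}:{port} {user}@v.vectorinstitute.ai -N"

# Reproduces the documented example: kn029 -> 10.1.1.29
print(tunnel_command("kn029"))
```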
pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "vec-inf"
- version = "0.7.0"
+ version = "0.7.2"
  description = "Efficient LLM inference on Slurm clusters using vLLM."
  readme = "README.md"
  authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
@@ -42,9 +42,10 @@ dev = [
  "xgrammar>=0.1.11",
  "torch>=2.7.0",
  "vllm>=0.10.0",
- "vllm-nccl-cu12>=2.18,<2.19",
- "ray>=2.40.0",
- "cupy-cuda12x==12.1.0"
+ "ray[default]>=2.50.0",
+ "cupy-cuda12x==12.1.0",
+ "flashinfer-python>=0.4.0",
+ "sglang>=0.5.0",
  ]

  [project.scripts]
tests/vec_inf/cli/test_cli.py

@@ -39,6 +39,7 @@ def test_launch_command_success(runner):
  "mem_per_node": "32G",
  "model_weights_parent_dir": "/model-weights",
  "vocab_size": "128000",
+ "venv": "/path/to/venv",
  "vllm_args": {"max_model_len": 8192},
  "env": {"CACHE": "/cache"},
  }
tests/vec_inf/cli/test_helper.py

@@ -35,6 +35,7 @@ class TestLaunchResponseFormatter:
  "mem_per_node": "32G",
  "model_weights_parent_dir": "/model-weights",
  "log_dir": "/tmp/logs",
+ "venv": "/path/to/venv",
  "vllm_args": {"max_model_len": 8192, "enable_prefix_caching": True},
  "env": {"CACHE": "/cache"},
  }
@@ -63,6 +64,7 @@ class TestLaunchResponseFormatter:
  "mem_per_node": "16G",
  "model_weights_parent_dir": "/weights",
  "log_dir": "/logs",
+ "venv": "/path/to/venv",
  "vllm_args": {},
  "env": {},
  }
tests/vec_inf/client/test_slurm_script_generator.py

@@ -53,7 +53,7 @@ class TestSlurmScriptGenerator:
  singularity = basic_params.copy()
  singularity.update(
  {
- "venv": "singularity",
+ "venv": "apptainer",
  "bind": "/scratch:/scratch,/data:/data",
  "env": {
  "CACHE_DIR": "/cache",
@@ -109,7 +109,7 @@ class TestSlurmScriptGenerator:
  def test_init_singularity_no_bind(self, basic_params):
  """Test Singularity initialization without additional binds."""
  params = basic_params.copy()
- params["venv"] = "singularity"
+ params["venv"] = "apptainer"
  generator = SlurmScriptGenerator(params)

  assert generator.params == params
@@ -173,7 +173,6 @@ class TestSlurmScriptGenerator:
  generator = SlurmScriptGenerator(basic_params)
  launch_cmd = generator._generate_launch_cmd()

- assert "source /path/to/venv/bin/activate" in launch_cmd
  assert "vllm serve /path/to/model_weights/test-model" in launch_cmd
  assert "--served-model-name test-model" in launch_cmd
  assert "--tensor-parallel-size 4" in launch_cmd
@@ -185,7 +184,7 @@ class TestSlurmScriptGenerator:
  generator = SlurmScriptGenerator(singularity_params)
  launch_cmd = generator._generate_launch_cmd()

- assert "exec --nv" in launch_cmd
+ assert "apptainer exec --nv" in launch_cmd
  assert "--bind /path/to/model_weights/test-model" in launch_cmd
  assert "--bind /scratch:/scratch,/data:/data" in launch_cmd
  assert "source" not in launch_cmd
@@ -306,9 +305,9 @@ class TestBatchSlurmScriptGenerator:
  def batch_singularity_params(self, batch_params):
  """Generate batch SLURM configuration parameters with Singularity."""
  singularity_params = batch_params.copy()
- singularity_params["venv"] = "singularity" # Set top-level venv to singularity
+ singularity_params["venv"] = "apptainer" # Set top-level venv to apptainer
  for model_name in singularity_params["models"]:
- singularity_params["models"][model_name]["venv"] = "singularity"
+ singularity_params["models"][model_name]["venv"] = "apptainer"
  singularity_params["models"][model_name]["bind"] = (
  "/scratch:/scratch,/data:/data"
  )
@@ -341,9 +340,9 @@ class TestBatchSlurmScriptGenerator:
  def test_init_singularity_no_bind(self, batch_params):
  """Test Singularity initialization without additional binds."""
  params = batch_params.copy()
- params["venv"] = "singularity" # Set top-level venv to singularity
+ params["venv"] = "apptainer" # Set top-level venv to apptainer
  for model_name in params["models"]:
- params["models"][model_name]["venv"] = "singularity"
+ params["models"][model_name]["venv"] = "apptainer"

  generator = BatchSlurmScriptGenerator(params)