vec-inf 0.5.0.tar.gz → 0.6.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/workflows/code_checks.yml +2 -2
  2. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/workflows/docker.yml +3 -2
  3. vec_inf-0.6.1/.github/workflows/docs.yml +176 -0
  4. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/workflows/publish.yml +2 -2
  5. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/workflows/unit_tests.yml +3 -3
  6. {vec_inf-0.5.0 → vec_inf-0.6.1}/.pre-commit-config.yaml +2 -1
  7. {vec_inf-0.5.0 → vec_inf-0.6.1}/Dockerfile +4 -2
  8. {vec_inf-0.5.0 → vec_inf-0.6.1}/PKG-INFO +52 -63
  9. {vec_inf-0.5.0 → vec_inf-0.6.1}/README.md +51 -62
  10. {vec_inf-0.5.0 → vec_inf-0.6.1}/codecov.yml +1 -1
  11. vec_inf-0.6.1/docs/api.md +18 -0
  12. vec_inf-0.6.1/docs/assets/favicon-48x48.svg +9 -0
  13. vec_inf-0.6.1/docs/assets/favicon.ico +0 -0
  14. vec_inf-0.6.1/docs/assets/vector-logo.svg +172 -0
  15. vec_inf-0.6.1/docs/contributing.md +174 -0
  16. vec_inf-0.6.1/docs/index.md +13 -0
  17. vec_inf-0.6.1/docs/overrides/partials/copyright.html +22 -0
  18. vec_inf-0.6.1/docs/overrides/partials/logo.html +5 -0
  19. vec_inf-0.6.1/docs/stylesheets/extra.css +235 -0
  20. vec_inf-0.6.1/docs/user_guide.md +276 -0
  21. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/README.md +3 -0
  22. vec_inf-0.6.1/examples/api/basic_usage.py +43 -0
  23. vec_inf-0.6.1/examples/slurm_dependency/README.md +33 -0
  24. vec_inf-0.6.1/examples/slurm_dependency/downstream_job.sbatch +18 -0
  25. vec_inf-0.6.1/examples/slurm_dependency/run_downstream.py +26 -0
  26. vec_inf-0.6.1/examples/slurm_dependency/run_workflow.sh +14 -0
  27. vec_inf-0.6.1/mkdocs.yml +99 -0
  28. {vec_inf-0.5.0 → vec_inf-0.6.1}/pyproject.toml +11 -13
  29. vec_inf-0.6.1/tests/test_imports.py +32 -0
  30. {vec_inf-0.5.0 → vec_inf-0.6.1}/tests/vec_inf/cli/test_cli.py +94 -77
  31. vec_inf-0.6.1/tests/vec_inf/cli/test_utils.py +17 -0
  32. vec_inf-0.6.1/tests/vec_inf/client/__init__.py +1 -0
  33. vec_inf-0.6.1/tests/vec_inf/client/test_api.py +212 -0
  34. vec_inf-0.6.1/tests/vec_inf/client/test_examples.py +99 -0
  35. vec_inf-0.6.1/tests/vec_inf/client/test_helper.py +578 -0
  36. vec_inf-0.6.1/tests/vec_inf/client/test_models.py +56 -0
  37. {vec_inf-0.5.0/tests/vec_inf/cli → vec_inf-0.6.1/tests/vec_inf/client}/test_utils.py +11 -25
  38. vec_inf-0.6.1/uv.lock +4701 -0
  39. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/README.md +3 -3
  40. vec_inf-0.6.1/vec_inf/cli/_cli.py +365 -0
  41. vec_inf-0.6.1/vec_inf/cli/_helper.py +400 -0
  42. vec_inf-0.6.1/vec_inf/cli/_utils.py +38 -0
  43. vec_inf-0.6.1/vec_inf/cli/_vars.py +32 -0
  44. vec_inf-0.6.1/vec_inf/client/__init__.py +31 -0
  45. vec_inf-0.6.1/vec_inf/client/_client_vars.py +231 -0
  46. vec_inf-0.6.1/vec_inf/client/_exceptions.py +37 -0
  47. vec_inf-0.6.1/vec_inf/client/_helper.py +661 -0
  48. vec_inf-0.6.1/vec_inf/client/_slurm_script_generator.py +178 -0
  49. vec_inf-0.6.1/vec_inf/client/_utils.py +287 -0
  50. vec_inf-0.6.1/vec_inf/client/api.py +302 -0
  51. vec_inf-0.6.1/vec_inf/client/config.py +138 -0
  52. vec_inf-0.6.1/vec_inf/client/models.py +234 -0
  53. vec_inf-0.6.1/vec_inf/client/slurm_vars.py +49 -0
  54. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/config/README.md +0 -12
  55. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/config/models.yaml +410 -391
  56. vec_inf-0.5.0/.github/workflows/docs_build.yml +0 -44
  57. vec_inf-0.5.0/.github/workflows/docs_deploy.yml +0 -59
  58. vec_inf-0.5.0/docs/source/_static/custom.js +0 -6
  59. vec_inf-0.5.0/docs/source/_static/logos/vector_logo.png +0 -0
  60. vec_inf-0.5.0/docs/source/_static/require.min.js +0 -1
  61. vec_inf-0.5.0/docs/source/_templates/base.html +0 -120
  62. vec_inf-0.5.0/docs/source/_templates/custom-class-template.rst +0 -34
  63. vec_inf-0.5.0/docs/source/_templates/custom-module-template.rst +0 -66
  64. vec_inf-0.5.0/docs/source/_templates/page.html +0 -219
  65. vec_inf-0.5.0/docs/source/conf.py +0 -113
  66. vec_inf-0.5.0/docs/source/index.md +0 -24
  67. vec_inf-0.5.0/docs/source/user_guide.md +0 -181
  68. vec_inf-0.5.0/tests/test_imports.py +0 -17
  69. vec_inf-0.5.0/uv.lock +0 -4511
  70. vec_inf-0.5.0/vec_inf/cli/_cli.py +0 -230
  71. vec_inf-0.5.0/vec_inf/cli/_config.py +0 -87
  72. vec_inf-0.5.0/vec_inf/cli/_helper.py +0 -675
  73. vec_inf-0.5.0/vec_inf/cli/_utils.py +0 -162
  74. vec_inf-0.5.0/vec_inf/multinode_vllm.slurm +0 -154
  75. vec_inf-0.5.0/vec_inf/vllm.slurm +0 -90
  76. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  77. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  78. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  79. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/dependabot.yml +0 -0
  80. {vec_inf-0.5.0 → vec_inf-0.6.1}/.github/pull_request_template.md +0 -0
  81. {vec_inf-0.5.0 → vec_inf-0.6.1}/.gitignore +0 -0
  82. {vec_inf-0.5.0 → vec_inf-0.6.1}/.python-version +0 -0
  83. {vec_inf-0.5.0 → vec_inf-0.6.1}/LICENSE +0 -0
  84. {vec_inf-0.5.0 → vec_inf-0.6.1}/docs/Makefile +0 -0
  85. {vec_inf-0.5.0 → vec_inf-0.6.1}/docs/make.bat +0 -0
  86. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/inference/llm/chat_completions.py +0 -0
  87. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/inference/llm/completions.py +0 -0
  88. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/inference/llm/completions.sh +0 -0
  89. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/inference/text_embedding/embeddings.py +0 -0
  90. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/inference/vlm/vision_completions.py +0 -0
  91. {vec_inf-0.5.0 → vec_inf-0.6.1}/examples/logits/logits.py +0 -0
  92. {vec_inf-0.5.0 → vec_inf-0.6.1}/profile/avg_throughput.py +0 -0
  93. {vec_inf-0.5.0 → vec_inf-0.6.1}/profile/gen.py +0 -0
  94. {vec_inf-0.5.0 → vec_inf-0.6.1}/tests/__init__.py +0 -0
  95. {vec_inf-0.5.0 → vec_inf-0.6.1}/tests/vec_inf/__init__.py +0 -0
  96. {vec_inf-0.5.0 → vec_inf-0.6.1}/tests/vec_inf/cli/__init__.py +0 -0
  97. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/__init__.py +0 -0
  98. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/cli/__init__.py +0 -0
  99. {vec_inf-0.5.0 → vec_inf-0.6.1}/vec_inf/find_port.sh +0 -0
  100. {vec_inf-0.5.0 → vec_inf-0.6.1}/venv.sh +0 -0
.github/workflows/code_checks.yml
@@ -30,13 +30,13 @@ jobs:
  steps:
  - uses: actions/checkout@v4.2.2
  - name: Install uv
- uses: astral-sh/setup-uv@v5.3.1
+ uses: astral-sh/setup-uv@v6
  with:
  # Install a specific version of uv.
  version: "0.5.21"
  enable-cache: true
  - name: "Set up Python"
- uses: actions/setup-python@v5.4.0
+ uses: actions/setup-python@v5.5.0
  with:
  python-version-file: ".python-version"
  - name: Install the project
.github/workflows/docker.yml
@@ -9,13 +9,14 @@ on:
  paths:
  - Dockerfile
  - .github/workflows/docker.yml
+ - uv.lock
  pull_request:
  branches:
  - main
- - develop
  paths:
  - Dockerfile
  - .github/workflows/docker.yml
+ - uv.lock

  jobs:
  push_to_registry:
@@ -44,7 +45,7 @@ jobs:
  images: vectorinstitute/vector-inference

  - name: Build and push Docker image
- uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4
+ uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0
  with:
  context: .
  file: ./Dockerfile
.github/workflows/docs.yml
@@ -0,0 +1,176 @@
+ name: docs
+ permissions:
+ contents: write
+ pull-requests: write
+
+ on:
+ push:
+ branches:
+ - main
+ paths:
+ - .pre-commit-config.yaml
+ - .github/workflows/docs.yml
+ - '**.py'
+ - '**.ipynb'
+ - '**.html'
+ - '**.js'
+ - '**.md'
+ - uv.lock
+ - pyproject.toml
+ - mkdocs.yml
+ - '**.png'
+ - '**.svg'
+ pull_request:
+ branches:
+ - main
+ paths:
+ - .pre-commit-config.yaml
+ - .github/workflows/docs.yml
+ - '**.py'
+ - '**.ipynb'
+ - '**.js'
+ - '**.html'
+ - uv.lock
+ - pyproject.toml
+ - '**.md'
+ - mkdocs.yml
+ - '**.png'
+ - '**.svg'
+ release:
+ types: [published]
+ # Allow manual trigger
+ workflow_dispatch:
+ inputs:
+ version:
+ description: 'Version to deploy (e.g., 0.5.0, latest)'
+ required: true
+ default: 'latest'
+
+ jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4.2.2
+ with:
+ fetch-depth: 0 # Fetch all history for proper versioning
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v6
+ with:
+ version: "0.5.21"
+ enable-cache: true
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: ".python-version"
+
+ - name: Install the project
+ run: uv sync --all-extras --group docs
+
+ - name: Build docs
+ run: uv run mkdocs build
+
+ - name: Create .nojekyll file
+ run: touch site/.nojekyll
+
+ - name: Upload artifact
+ uses: actions/upload-artifact@v4
+ with:
+ name: docs-site
+ path: site/
+ retention-days: 1
+
+ deploy:
+ needs: build
+ if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4.2.2
+ with:
+ fetch-depth: 0 # Fetch all history for proper versioning
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v6
+ with:
+ version: "0.5.21"
+ enable-cache: true
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version-file: ".python-version"
+
+ - name: Install the project
+ run: uv sync --all-extras --group docs
+
+ - name: Configure Git Credentials
+ run: |
+ git config user.name github-actions[bot]
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+
+ - name: Download artifact
+ uses: actions/download-artifact@v4
+ with:
+ name: docs-site
+ path: site
+
+ - name: Ensure .nojekyll exists
+ run: touch site/.nojekyll
+
+ - name: Determine version
+ id: version
+ run: |
+ if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+ # Use the version provided in the workflow dispatch
+ echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
+ echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+ elif [[ "${{ github.event_name }}" == "release" ]]; then
+ # Use the tag from the release
+ VERSION="${{ github.ref_name }}"
+ # Remove 'v' prefix if present
+ VERSION="${VERSION#v}"
+ echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+ echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+ elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
+ # For pushes to main, tag as "main"
+ echo "VERSION=main" >> $GITHUB_OUTPUT
+ # No alias for main
+ echo "VERSION_ALIAS=" >> $GITHUB_OUTPUT
+ else
+ # Get version from pyproject.toml as fallback
+ VERSION=$(grep -m 1 '^version = ' pyproject.toml | sed 's/^version = "\(.*\)"$/\1/')
+ echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+ echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+ fi
+
+ - name: Deploy docs with mike
+ run: |
+ VERSION=${{ steps.version.outputs.VERSION }}
+ ALIAS=${{ steps.version.outputs.VERSION_ALIAS }}
+
+ # Add a temporary remote to fetch gh-pages if it exists
+ git remote add temp https://github.com/${{ github.repository }}.git || true
+ git fetch temp gh-pages || true
+
+ DEPLOY_ARGS="--push --update-aliases $VERSION"
+
+ if [[ ! -z "$ALIAS" ]]; then
+ DEPLOY_ARGS="$DEPLOY_ARGS $ALIAS"
+ fi
+
+ # Activate the virtual environment
+ source .venv/bin/activate
+
+ echo "Running: mike deploy $DEPLOY_ARGS"
+ mike deploy $DEPLOY_ARGS
+
+ # Set default version to latest only if we're deploying a version with the latest alias
+ if [[ ! -z "$ALIAS" && "$ALIAS" == "latest" ]]; then
+ mike set-default --push latest
+ fi
+
+ # Remove the temporary remote
+ git remote remove temp || true
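The deploy job above publishes versioned docs with mike. As an editorial aside, a minimal local sketch of the same flow, using the `docs` dependency group the workflow installs (the version tag `0.6.1` is illustrative; in the workflow it is derived from the triggering event):

```bash
# Install the project plus the docs dependency group, as the workflow does
uv sync --all-extras --group docs
source .venv/bin/activate

# Build the static site once to surface mkdocs.yml errors early
mkdocs build

# Publish a version to the gh-pages branch and point the "latest" alias at it
mike deploy --push --update-aliases 0.6.1 latest
mike set-default --push latest

# Preview the versioned site locally
mike serve
```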
.github/workflows/publish.yml
@@ -16,12 +16,12 @@ jobs:
  - uses: actions/checkout@v4.2.2

  - name: Install uv
- uses: astral-sh/setup-uv@v5
+ uses: astral-sh/setup-uv@v6
  with:
  version: "0.6.6"
  enable-cache: true

- - uses: actions/setup-python@v5.4.0
+ - uses: actions/setup-python@v5.5.0
  with:
  python-version: '3.10'

.github/workflows/unit_tests.yml
@@ -46,14 +46,14 @@ jobs:
  - uses: actions/checkout@v4.2.2

  - name: Install uv
- uses: astral-sh/setup-uv@v5.3.1
+ uses: astral-sh/setup-uv@v6
  with:
  # Install a specific version of uv.
  version: "0.5.21"
  enable-cache: true

  - name: "Set up Python ${{ matrix.python-version }}"
- uses: actions/setup-python@v5.4.0
+ uses: actions/setup-python@v5.5.0
  with:
  python-version: ${{ matrix.python-version }}

@@ -72,7 +72,7 @@ jobs:
  uv run pytest tests/test_imports.py

  - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5.4.0
+ uses: codecov/codecov-action@v5.4.3
  with:
  token: ${{ secrets.CODECOV_TOKEN }}
  file: ./coverage.xml
.pre-commit-config.yaml
@@ -13,10 +13,11 @@ repos:
  args: [--fix=lf]
  - id: requirements-txt-fixer
  - id: check-yaml
+ args: [--unsafe]
  - id: check-toml

  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: 'v0.11.0'
+ rev: 'v0.11.11'
  hooks:
  - id: ruff
  args: [--fix, --exit-non-zero-on-fix]
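As a side note, the updated hooks can be exercised locally before pushing; a minimal sketch (assumes `pre-commit` itself is installed, e.g. via `pip install pre-commit`):

```bash
pre-commit install               # register the git hook once per clone
pre-commit run --all-files       # run every hook, including check-yaml with --unsafe
pre-commit run ruff --all-files  # or run a single hook by id
```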
Dockerfile
@@ -1,4 +1,4 @@
- FROM nvidia/cuda:12.3.1-devel-ubuntu20.04
+ FROM nvidia/cuda:12.4.1-devel-ubuntu20.04

  # Non-interactive apt-get commands
  ARG DEBIAN_FRONTEND=noninteractive
@@ -41,8 +41,10 @@ COPY . /vec-inf

  # Install project dependencies with build requirements
  RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu121" uv pip install --system -e .[dev]
- # Install Flash Attention
+ # Install FlashAttention
  RUN python3.10 -m pip install flash-attn --no-build-isolation
+ # Install FlashInfer
+ RUN python3.10 -m pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/

  # Final configuration
  RUN mkdir -p /vec-inf/nccl && \
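For reference, a hedged sketch of building and running the updated CUDA 12.4 image locally (the `vec-inf:local` tag is illustrative; GPU access requires the NVIDIA Container Toolkit on the host):

```bash
# Build the image from the repository root
docker build -t vec-inf:local .

# Start an interactive shell with all GPUs attached
docker run --gpus all -it --rm vec-inf:local bash
```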
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vec-inf
- Version: 0.5.0
+ Version: 0.6.1
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
  Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
  License-Expression: MIT
@@ -25,12 +25,14 @@ Description-Content-Type: text/markdown
  ----------------------------------------------------

  [![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
+ [![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
  [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
- [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml)
- [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/develop/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/develop)
+ [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
+ [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
+ [![vLLM](https://img.shields.io/badge/vllm-0.8.5.post1-blue)](https://docs.vllm.ai/en/v0.8.5.post1/index.html)
  ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`cli/_helper.py`](vec_inf/cli/_helper.py), [`cli/_config.py`](vec_inf/cli/_config.py), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.yaml`](vec_inf/config/models.yaml) accordingly.
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py), and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.

  ## Installation
  If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -38,11 +40,13 @@ If you are using the Vector cluster environment, and you don't need any customiz
  ```bash
  pip install vec-inf
  ```
- Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package
+ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.

  ## Usage

- ### `launch` command
+ Vector Inference provides 2 user interfaces, a CLI and an API
+
+ ### CLI

  The `launch` command allows users to deploy a model as a slurm job. If the job successfully launches, a URL endpoint is exposed for the user to send requests for inference.

@@ -53,18 +57,26 @@ vec-inf launch Meta-Llama-3.1-8B-Instruct
  ```
  You should see an output like the following:

- <img width="600" alt="launch_img" src="https://github.com/user-attachments/assets/883e6a5b-8016-4837-8fdf-39097dfb18bf">
+ <img width="600" alt="launch_image" src="https://github.com/user-attachments/assets/a72a99fd-4bf2-408e-8850-359761d96c4f">


  #### Overrides

- Models that are already supported by `vec-inf` would be launched using the cached configuration or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
+ Models that are already supported by `vec-inf` would be launched using the cached configuration (set in [slurm_vars.py](vec_inf/client/slurm_vars.py)) or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
  overriden. For example, if `qos` is to be overriden:

  ```bash
  vec-inf launch Meta-Llama-3.1-8B-Instruct --qos <new_qos>
  ```

+ To overwrite default vLLM engine arguments, you can specify the engine arguments in a comma separated string:
+
+ ```bash
+ vec-inf launch Meta-Llama-3.1-8B-Instruct --vllm-args '--max-model-len=65536,--compilation-config=3'
+ ```
+
+ For the full list of vLLM engine arguments, you can find them [here](https://docs.vllm.ai/en/stable/serving/engine_args.html), make sure you select the correct vLLM version.
+
  #### Custom models

  You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
@@ -89,14 +101,14 @@ models:
  gpus_per_node: 1
  num_nodes: 1
  vocab_size: 152064
- max_model_len: 1010000
- max_num_seqs: 256
- pipeline_parallelism: true
- enforce_eager: false
  qos: m2
  time: 08:00:00
  partition: a40
  model_weights_parent_dir: /h/<username>/model-weights
+ vllm_args:
+ --max-model-len: 1010000
+ --max-num-seqs: 256
+ --compilation-config: 3
  ```

  You would then set the `VEC_INF_CONFIG` path using:
@@ -105,68 +117,44 @@ You would then set the `VEC_INF_CONFIG` path using:
  export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
  ```

- Note that there are other parameters that can also be added to the config but not shown in this example, such as `data_type` and `log_dir`.
+ **NOTE**
+ * There are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.
+ * Check [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/serving/engine_args.html) for the full list of available vLLM engine arguments, the default parallel size for any parallelization is default to 1, so none of the sizes were set specifically in this example
+ * For GPU partitions with non-Ampere architectures, e.g. `rtx6000`, `t4v2`, BF16 isn't supported. For models that have BF16 as the default type, when using a non-Ampere GPU, use FP16 instead, i.e. `--dtype: float16`
+ * Setting `--compilation-config` to `3` currently breaks multi-node model launches, so we don't set them for models that require multiple nodes of GPUs.

- ### `status` command
- You can check the inference server status by providing the Slurm job ID to the `status` command:
- ```bash
- vec-inf status 15373800
- ```
-
- If the server is pending for resources, you should see an output like this:
-
- <img width="400" alt="status_pending_img" src="https://github.com/user-attachments/assets/b659c302-eae1-4560-b7a9-14eb3a822a2f">
+ #### Other commands

- When the server is ready, you should see an output like this:
+ * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+ * `metrics`: Streams performance metrics to the console.
+ * `shutdown`: Shutdown a model by providing its Slurm job ID.
+ * `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.

- <img width="400" alt="status_ready_img" src="https://github.com/user-attachments/assets/672986c2-736c-41ce-ac7c-1fb585cdcb0d">
+ For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)

- There are 5 possible states:
+ ### API

- * **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
- * **LAUNCHING**: Job is running but the server is not ready yet.
- * **READY**: Inference server running and ready to take requests.
- * **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
- * **SHUTDOWN**: Inference server is shutdown/cancelled.
+ Example:

- Note that the base URL is only available when model is in `READY` state, and if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.
-
- ### `metrics` command
- Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
- ```bash
- vec-inf metrics 15373800
+ ```python
+ >>> from vec_inf.api import VecInfClient
+ >>> client = VecInfClient()
+ >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+ >>> job_id = response.slurm_job_id
+ >>> status = client.get_status(job_id)
+ >>> if status.status == ModelStatus.READY:
+ ... print(f"Model is ready at {status.base_url}")
+ >>> client.shutdown_model(job_id)
  ```

- And you will see the performance metrics streamed to your console, note that the metrics are updated with a 2-second interval.
+ For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/)

- <img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/3ee143d0-1a71-4944-bbd7-4c3299bf0339">
+ ## Check Job Configuration

- ### `shutdown` command
- Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
- ```bash
- vec-inf shutdown 15373800
-
- > Shutting down model with Slurm Job ID: 15373800
- ```
-
- ### `list` command
- You call view the full list of available models by running the `list` command:
- ```bash
- vec-inf list
- ```
- <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
-
- NOTE: The above screenshot does not represent the full list of models supported.
-
- You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
- ```bash
- vec-inf list Meta-Llama-3.1-70B-Instruct
- ```
- <img width="500" alt="list_model_img" src="https://github.com/user-attachments/assets/34e53937-2d86-443e-85f6-34e408653ddb">
-
- `launch`, `list`, and `status` command supports `--json-mode`, where the command output would be structured as a JSON string.
+ With every model launch, a Slurm script will be generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script will be moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration, and will have an entry of server URL once the server is ready.

  ## Send inference requests
+
  Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:

  ```json
@@ -199,8 +187,9 @@ Once the inference server is ready, you can start sending in inference requests.
  },
  "prompt_logprobs":null
  }
+
  ```
- **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
+ **NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH`.

  ## SSH tunnel from your local device
  If you want to run inference from your local device, you can open a SSH tunnel to your cluster environment like the following:
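The SSH command itself falls outside this hunk; as an illustrative sketch only (host, node, and port are placeholders, not values from the README):

```bash
# Forward local port 8081 to the compute node serving the model, keeping the session open
ssh -L 8081:<compute_node>:8081 <username>@<cluster_login_host> -N
```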