vec-inf 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/code_checks.yml +2 -2
  2. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/docker.yml +3 -2
  3. vec_inf-0.6.0/.github/workflows/docs.yml +176 -0
  4. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/publish.yml +2 -2
  5. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/unit_tests.yml +3 -3
  6. {vec_inf-0.5.0 → vec_inf-0.6.0}/.pre-commit-config.yaml +2 -1
  7. {vec_inf-0.5.0 → vec_inf-0.6.0}/Dockerfile +4 -2
  8. {vec_inf-0.5.0 → vec_inf-0.6.0}/PKG-INFO +44 -61
  9. {vec_inf-0.5.0 → vec_inf-0.6.0}/README.md +43 -60
  10. {vec_inf-0.5.0 → vec_inf-0.6.0}/codecov.yml +1 -1
  11. vec_inf-0.6.0/docs/api.md +18 -0
  12. vec_inf-0.6.0/docs/assets/favicon-48x48.svg +9 -0
  13. vec_inf-0.6.0/docs/assets/favicon.ico +0 -0
  14. vec_inf-0.6.0/docs/assets/vector-logo.svg +172 -0
  15. vec_inf-0.6.0/docs/contributing.md +174 -0
  16. {vec_inf-0.5.0/docs/source → vec_inf-0.6.0/docs}/index.md +1 -12
  17. vec_inf-0.6.0/docs/overrides/partials/copyright.html +22 -0
  18. vec_inf-0.6.0/docs/overrides/partials/logo.html +5 -0
  19. vec_inf-0.6.0/docs/stylesheets/extra.css +235 -0
  20. vec_inf-0.6.0/docs/user_guide.md +273 -0
  21. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/README.md +2 -0
  22. vec_inf-0.6.0/examples/api/basic_usage.py +43 -0
  23. vec_inf-0.6.0/mkdocs.yml +99 -0
  24. {vec_inf-0.5.0 → vec_inf-0.6.0}/pyproject.toml +11 -13
  25. vec_inf-0.6.0/tests/test_imports.py +32 -0
  26. {vec_inf-0.5.0 → vec_inf-0.6.0}/tests/vec_inf/cli/test_cli.py +94 -77
  27. vec_inf-0.6.0/tests/vec_inf/cli/test_utils.py +17 -0
  28. vec_inf-0.6.0/tests/vec_inf/client/__init__.py +1 -0
  29. vec_inf-0.6.0/tests/vec_inf/client/test_api.py +130 -0
  30. vec_inf-0.6.0/tests/vec_inf/client/test_examples.py +99 -0
  31. vec_inf-0.6.0/tests/vec_inf/client/test_models.py +56 -0
  32. {vec_inf-0.5.0/tests/vec_inf/cli → vec_inf-0.6.0/tests/vec_inf/client}/test_utils.py +11 -25
  33. vec_inf-0.6.0/uv.lock +4701 -0
  34. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/README.md +3 -3
  35. vec_inf-0.6.0/vec_inf/cli/_cli.py +340 -0
  36. vec_inf-0.6.0/vec_inf/cli/_helper.py +400 -0
  37. vec_inf-0.6.0/vec_inf/cli/_utils.py +38 -0
  38. vec_inf-0.6.0/vec_inf/cli/_vars.py +32 -0
  39. vec_inf-0.6.0/vec_inf/client/__init__.py +31 -0
  40. vec_inf-0.6.0/vec_inf/client/_client_vars.py +213 -0
  41. vec_inf-0.6.0/vec_inf/client/_exceptions.py +37 -0
  42. vec_inf-0.6.0/vec_inf/client/_helper.py +674 -0
  43. vec_inf-0.6.0/vec_inf/client/_slurm_script_generator.py +179 -0
  44. vec_inf-0.6.0/vec_inf/client/_utils.py +287 -0
  45. vec_inf-0.6.0/vec_inf/client/api.py +302 -0
  46. vec_inf-0.6.0/vec_inf/client/config.py +128 -0
  47. vec_inf-0.6.0/vec_inf/client/models.py +225 -0
  48. vec_inf-0.6.0/vec_inf/client/slurm_vars.py +49 -0
  49. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/config/README.md +0 -12
  50. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/config/models.yaml +417 -391
  51. vec_inf-0.5.0/.github/workflows/docs_build.yml +0 -44
  52. vec_inf-0.5.0/.github/workflows/docs_deploy.yml +0 -59
  53. vec_inf-0.5.0/docs/source/_static/custom.js +0 -6
  54. vec_inf-0.5.0/docs/source/_static/logos/vector_logo.png +0 -0
  55. vec_inf-0.5.0/docs/source/_static/require.min.js +0 -1
  56. vec_inf-0.5.0/docs/source/_templates/base.html +0 -120
  57. vec_inf-0.5.0/docs/source/_templates/custom-class-template.rst +0 -34
  58. vec_inf-0.5.0/docs/source/_templates/custom-module-template.rst +0 -66
  59. vec_inf-0.5.0/docs/source/_templates/page.html +0 -219
  60. vec_inf-0.5.0/docs/source/conf.py +0 -113
  61. vec_inf-0.5.0/docs/source/user_guide.md +0 -181
  62. vec_inf-0.5.0/tests/test_imports.py +0 -17
  63. vec_inf-0.5.0/uv.lock +0 -4511
  64. vec_inf-0.5.0/vec_inf/cli/_cli.py +0 -230
  65. vec_inf-0.5.0/vec_inf/cli/_config.py +0 -87
  66. vec_inf-0.5.0/vec_inf/cli/_helper.py +0 -675
  67. vec_inf-0.5.0/vec_inf/cli/_utils.py +0 -162
  68. vec_inf-0.5.0/vec_inf/multinode_vllm.slurm +0 -154
  69. vec_inf-0.5.0/vec_inf/vllm.slurm +0 -90
  70. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  71. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  72. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  73. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/dependabot.yml +0 -0
  74. {vec_inf-0.5.0 → vec_inf-0.6.0}/.github/pull_request_template.md +0 -0
  75. {vec_inf-0.5.0 → vec_inf-0.6.0}/.gitignore +0 -0
  76. {vec_inf-0.5.0 → vec_inf-0.6.0}/.python-version +0 -0
  77. {vec_inf-0.5.0 → vec_inf-0.6.0}/LICENSE +0 -0
  78. {vec_inf-0.5.0 → vec_inf-0.6.0}/docs/Makefile +0 -0
  79. {vec_inf-0.5.0 → vec_inf-0.6.0}/docs/make.bat +0 -0
  80. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/inference/llm/chat_completions.py +0 -0
  81. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/inference/llm/completions.py +0 -0
  82. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/inference/llm/completions.sh +0 -0
  83. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/inference/text_embedding/embeddings.py +0 -0
  84. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/inference/vlm/vision_completions.py +0 -0
  85. {vec_inf-0.5.0 → vec_inf-0.6.0}/examples/logits/logits.py +0 -0
  86. {vec_inf-0.5.0 → vec_inf-0.6.0}/profile/avg_throughput.py +0 -0
  87. {vec_inf-0.5.0 → vec_inf-0.6.0}/profile/gen.py +0 -0
  88. {vec_inf-0.5.0 → vec_inf-0.6.0}/tests/__init__.py +0 -0
  89. {vec_inf-0.5.0 → vec_inf-0.6.0}/tests/vec_inf/__init__.py +0 -0
  90. {vec_inf-0.5.0 → vec_inf-0.6.0}/tests/vec_inf/cli/__init__.py +0 -0
  91. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/__init__.py +0 -0
  92. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/cli/__init__.py +0 -0
  93. {vec_inf-0.5.0 → vec_inf-0.6.0}/vec_inf/find_port.sh +0 -0
  94. {vec_inf-0.5.0 → vec_inf-0.6.0}/venv.sh +0 -0
{vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/code_checks.yml
@@ -30,13 +30,13 @@ jobs:
     steps:
       - uses: actions/checkout@v4.2.2
       - name: Install uv
-        uses: astral-sh/setup-uv@v5.3.1
+        uses: astral-sh/setup-uv@v6
         with:
           # Install a specific version of uv.
           version: "0.5.21"
           enable-cache: true
       - name: "Set up Python"
-        uses: actions/setup-python@v5.4.0
+        uses: actions/setup-python@v5.5.0
         with:
           python-version-file: ".python-version"
       - name: Install the project
{vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/docker.yml
@@ -9,13 +9,14 @@ on:
     paths:
      - Dockerfile
      - .github/workflows/docker.yml
+     - uv.lock
   pull_request:
     branches:
      - main
-     - develop
     paths:
      - Dockerfile
      - .github/workflows/docker.yml
+     - uv.lock

 jobs:
   push_to_registry:
@@ -44,7 +45,7 @@ jobs:
           images: vectorinstitute/vector-inference

       - name: Build and push Docker image
-        uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4
+        uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1
         with:
           context: .
           file: ./Dockerfile
vec_inf-0.6.0/.github/workflows/docs.yml (new file)
@@ -0,0 +1,176 @@
+name: docs
+permissions:
+  contents: write
+  pull-requests: write
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - .pre-commit-config.yaml
+      - .github/workflows/docs.yml
+      - '**.py'
+      - '**.ipynb'
+      - '**.html'
+      - '**.js'
+      - '**.md'
+      - uv.lock
+      - pyproject.toml
+      - mkdocs.yml
+      - '**.png'
+      - '**.svg'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .pre-commit-config.yaml
+      - .github/workflows/docs.yml
+      - '**.py'
+      - '**.ipynb'
+      - '**.js'
+      - '**.html'
+      - uv.lock
+      - pyproject.toml
+      - '**.md'
+      - mkdocs.yml
+      - '**.png'
+      - '**.svg'
+  release:
+    types: [published]
+  # Allow manual trigger
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version to deploy (e.g., 0.5.0, latest)'
+        required: true
+        default: 'latest'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4.2.2
+        with:
+          fetch-depth: 0 # Fetch all history for proper versioning
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          version: "0.5.21"
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install the project
+        run: uv sync --all-extras --group docs
+
+      - name: Build docs
+        run: uv run mkdocs build
+
+      - name: Create .nojekyll file
+        run: touch site/.nojekyll
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: docs-site
+          path: site/
+          retention-days: 1
+
+  deploy:
+    needs: build
+    if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4.2.2
+        with:
+          fetch-depth: 0 # Fetch all history for proper versioning
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          version: "0.5.21"
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install the project
+        run: uv sync --all-extras --group docs
+
+      - name: Configure Git Credentials
+        run: |
+          git config user.name github-actions[bot]
+          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: docs-site
+          path: site
+
+      - name: Ensure .nojekyll exists
+        run: touch site/.nojekyll
+
+      - name: Determine version
+        id: version
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            # Use the version provided in the workflow dispatch
+            echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "release" ]]; then
+            # Use the tag from the release
+            VERSION="${{ github.ref_name }}"
+            # Remove 'v' prefix if present
+            VERSION="${VERSION#v}"
+            echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
+            # For pushes to main, tag as "main"
+            echo "VERSION=main" >> $GITHUB_OUTPUT
+            # No alias for main
+            echo "VERSION_ALIAS=" >> $GITHUB_OUTPUT
+          else
+            # Get version from pyproject.toml as fallback
+            VERSION=$(grep -m 1 '^version = ' pyproject.toml | sed 's/^version = "\(.*\)"$/\1/')
+            echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Deploy docs with mike
+        run: |
+          VERSION=${{ steps.version.outputs.VERSION }}
+          ALIAS=${{ steps.version.outputs.VERSION_ALIAS }}
+
+          # Add a temporary remote to fetch gh-pages if it exists
+          git remote add temp https://github.com/${{ github.repository }}.git || true
+          git fetch temp gh-pages || true
+
+          DEPLOY_ARGS="--push --update-aliases $VERSION"
+
+          if [[ ! -z "$ALIAS" ]]; then
+            DEPLOY_ARGS="$DEPLOY_ARGS $ALIAS"
+          fi
+
+          # Activate the virtual environment
+          source .venv/bin/activate
+
+          echo "Running: mike deploy $DEPLOY_ARGS"
+          mike deploy $DEPLOY_ARGS
+
+          # Set default version to latest only if we're deploying a version with the latest alias
+          if [[ ! -z "$ALIAS" && "$ALIAS" == "latest" ]]; then
+            mike set-default --push latest
+          fi
+
+          # Remove the temporary remote
+          git remote remove temp || true
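The deploy job above picks which docs version mike publishes based on the triggering event. A minimal Python sketch of that same event-to-version mapping, for readers following the shell logic (the function and its signature are illustrative and not part of vec-inf; the pyproject.toml fallback mirrors the grep/sed in the workflow):

```python
# Hedged sketch: mirrors the "Determine version" step of the workflow above.
# The event -> (version, alias) mapping is taken from the workflow; the helper
# itself is illustrative only.
import re
from pathlib import Path


def determine_version(event_name: str, ref: str = "", ref_name: str = "",
                      dispatch_version: str = "latest") -> tuple[str, str]:
    if event_name == "workflow_dispatch":
        return dispatch_version, "latest"
    if event_name == "release":
        # Strip a leading "v" from the release tag, as the workflow does
        return ref_name.removeprefix("v"), "latest"
    if event_name == "push" and ref == "refs/heads/main":
        return "main", ""  # deployed as "main", no alias
    # Fallback: read the version from pyproject.toml
    match = re.search(r'^version = "(.*)"$', Path("pyproject.toml").read_text(), re.M)
    return (match.group(1) if match else "unknown"), "latest"


print(determine_version("release", ref_name="v0.6.0"))  # ('0.6.0', 'latest')
```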
{vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/publish.yml
@@ -16,12 +16,12 @@ jobs:
       - uses: actions/checkout@v4.2.2

       - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
         with:
           version: "0.6.6"
           enable-cache: true

-      - uses: actions/setup-python@v5.4.0
+      - uses: actions/setup-python@v5.5.0
         with:
           python-version: '3.10'

{vec_inf-0.5.0 → vec_inf-0.6.0}/.github/workflows/unit_tests.yml
@@ -46,14 +46,14 @@ jobs:
       - uses: actions/checkout@v4.2.2

       - name: Install uv
-        uses: astral-sh/setup-uv@v5.3.1
+        uses: astral-sh/setup-uv@v6
         with:
           # Install a specific version of uv.
           version: "0.5.21"
           enable-cache: true

       - name: "Set up Python ${{ matrix.python-version }}"
-        uses: actions/setup-python@v5.4.0
+        uses: actions/setup-python@v5.5.0
         with:
           python-version: ${{ matrix.python-version }}

@@ -72,7 +72,7 @@ jobs:
           uv run pytest tests/test_imports.py

       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5.4.0
+        uses: codecov/codecov-action@v5.4.2
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           file: ./coverage.xml
{vec_inf-0.5.0 → vec_inf-0.6.0}/.pre-commit-config.yaml
@@ -13,10 +13,11 @@ repos:
         args: [--fix=lf]
       - id: requirements-txt-fixer
       - id: check-yaml
+        args: [--unsafe]
       - id: check-toml

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.11.0'
+    rev: 'v0.11.8'
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
{vec_inf-0.5.0 → vec_inf-0.6.0}/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.3.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.4.1-devel-ubuntu20.04

 # Non-interactive apt-get commands
 ARG DEBIAN_FRONTEND=noninteractive
@@ -41,8 +41,10 @@ COPY . /vec-inf

 # Install project dependencies with build requirements
 RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu121" uv pip install --system -e .[dev]
-# Install Flash Attention
+# Install FlashAttention
 RUN python3.10 -m pip install flash-attn --no-build-isolation
+# Install FlashInfer
+RUN python3.10 -m pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/

 # Final configuration
 RUN mkdir -p /vec-inf/nccl && \
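The updated image installs FlashAttention and FlashInfer on top of the CUDA 12.4.1 base. A hedged sanity check you could run inside the built container to confirm both backends import (the import names `flash_attn` and `flashinfer` are the conventional ones for the flash-attn and flashinfer-python wheels and are assumed here):

```python
# Hedged sketch: verify the attention backends installed by the Dockerfile above.
# Adjust the module names if the image differs; this is not part of vec-inf.
import importlib

for module_name in ("flash_attn", "flashinfer"):
    try:
        module = importlib.import_module(module_name)
        print(f"{module_name} {getattr(module, '__version__', '?')} imported OK")
    except ImportError as err:
        print(f"{module_name} failed to import: {err}")
```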
{vec_inf-0.5.0 → vec_inf-0.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vec-inf
-Version: 0.5.0
+Version: 0.6.0
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
 License-Expression: MIT
@@ -25,12 +25,13 @@ Description-Content-Type: text/markdown
 ----------------------------------------------------

 [![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
+[![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
-[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml)
-[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/develop/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/develop)
+[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
+[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

-This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`cli/_helper.py`](vec_inf/cli/_helper.py), [`cli/_config.py`](vec_inf/cli/_config.py), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.yaml`](vec_inf/config/models.yaml) accordingly.
+This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py), and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.

 ## Installation
 If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -42,7 +43,9 @@ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up

 ## Usage

-### `launch` command
+Vector Inference provides 2 user interfaces, a CLI and an API
+
+### CLI

 The `launch` command allows users to deploy a model as a slurm job. If the job successfully launches, a URL endpoint is exposed for the user to send requests for inference.

@@ -53,18 +56,26 @@ vec-inf launch Meta-Llama-3.1-8B-Instruct
 ```
 You should see an output like the following:

-<img width="600" alt="launch_img" src="https://github.com/user-attachments/assets/883e6a5b-8016-4837-8fdf-39097dfb18bf">
+<img width="600" alt="launch_image" src="https://github.com/user-attachments/assets/a72a99fd-4bf2-408e-8850-359761d96c4f">


 #### Overrides

-Models that are already supported by `vec-inf` would be launched using the cached configuration or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
+Models that are already supported by `vec-inf` would be launched using the cached configuration (set in [slurm_vars.py](vec_inf/client/slurm_vars.py)) or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
 overriden. For example, if `qos` is to be overriden:

 ```bash
 vec-inf launch Meta-Llama-3.1-8B-Instruct --qos <new_qos>
 ```

+To overwrite default vLLM engine arguments, you can specify the engine arguments in a comma separated string:
+
+```bash
+vec-inf launch Meta-Llama-3.1-8B-Instruct --vllm-args '--max-model-len=65536,--compilation-config=3'
+```
+
+For the full list of vLLM engine arguments, you can find them [here](https://docs.vllm.ai/en/stable/serving/engine_args.html), make sure you select the correct vLLM version.
+
 #### Custom models

 You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
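The new `--vllm-args` flag shown in this hunk takes vLLM engine arguments as a single comma-separated string. If you want to drive the same launch from a script rather than a shell, a hedged sketch that shells out to the documented CLI (the subprocess wrapper is illustrative, not a vec-inf API; only flags shown above are used):

```python
# Hedged sketch: calling the documented vec-inf CLI from Python via subprocess.
import subprocess

result = subprocess.run(
    [
        "vec-inf", "launch", "Meta-Llama-3.1-8B-Instruct",
        "--vllm-args", "--max-model-len=65536,--compilation-config=3",
    ],
    capture_output=True,
    text=True,
    check=False,
)
# Print whatever the CLI reported (launch table on success, error text otherwise).
print(result.stdout or result.stderr)
```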
{vec_inf-0.5.0 → vec_inf-0.6.0}/PKG-INFO
@@ -89,14 +100,14 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 1010000
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
     model_weights_parent_dir: /h/<username>/model-weights
+    vllm_args:
+      --max-model-len: 1010000
+      --max-num-seqs: 256
+      --compilation-confi: 3
 ```

 You would then set the `VEC_INF_CONFIG` path using:
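Since `VEC_INF_CONFIG` points to plain YAML, a custom config like the one in this hunk can be sanity-checked before launching. A hedged sketch using PyYAML (the top-level `models` key and the fields inspected come from the example above; the script itself, including the placeholder path, is illustrative only):

```python
# Hedged sketch: load a custom model config and print the fields used above.
# Requires PyYAML; replace the placeholder path with your actual config file.
import os
import yaml

config_path = os.environ.get("VEC_INF_CONFIG", "/h/<username>/my-model-config.yaml")
with open(config_path) as f:
    config = yaml.safe_load(f)

for name, model in config["models"].items():
    vllm_args = model.get("vllm_args", {})
    print(name, model.get("partition"), model.get("qos"),
          vllm_args.get("--max-model-len"))
```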
{vec_inf-0.5.0 → vec_inf-0.6.0}/PKG-INFO
@@ -105,68 +116,40 @@ You would then set the `VEC_INF_CONFIG` path using:
 export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
 ```

-Note that there are other parameters that can also be added to the config but not shown in this example, such as `data_type` and `log_dir`.
-
-### `status` command
-You can check the inference server status by providing the Slurm job ID to the `status` command:
-```bash
-vec-inf status 15373800
-```
-
-If the server is pending for resources, you should see an output like this:
-
-<img width="400" alt="status_pending_img" src="https://github.com/user-attachments/assets/b659c302-eae1-4560-b7a9-14eb3a822a2f">
-
-When the server is ready, you should see an output like this:
+Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.

-<img width="400" alt="status_ready_img" src="https://github.com/user-attachments/assets/672986c2-736c-41ce-ac7c-1fb585cdcb0d">
+#### Other commands

-There are 5 possible states:
+* `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
+* `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.

-* **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
-* **LAUNCHING**: Job is running but the server is not ready yet.
-* **READY**: Inference server running and ready to take requests.
-* **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
-* **SHUTDOWN**: Inference server is shutdown/cancelled.
+For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)

-Note that the base URL is only available when model is in `READY` state, and if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.
+### API

-### `metrics` command
-Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
-```bash
-vec-inf metrics 15373800
-```
-
-And you will see the performance metrics streamed to your console, note that the metrics are updated with a 2-second interval.
-
-<img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/3ee143d0-1a71-4944-bbd7-4c3299bf0339">
-
-### `shutdown` command
-Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
-```bash
-vec-inf shutdown 15373800
+Example:

-> Shutting down model with Slurm Job ID: 15373800
+```python
+>>> from vec_inf.api import VecInfClient
+>>> client = VecInfClient()
+>>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+>>> job_id = response.slurm_job_id
+>>> status = client.get_status(job_id)
+>>> if status.status == ModelStatus.READY:
+... print(f"Model is ready at {status.base_url}")
+>>> client.shutdown_model(job_id)
 ```

-### `list` command
-You call view the full list of available models by running the `list` command:
-```bash
-vec-inf list
-```
-<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
-
-NOTE: The above screenshot does not represent the full list of models supported.
+For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/)

-You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
-```bash
-vec-inf list Meta-Llama-3.1-70B-Instruct
-```
-<img width="500" alt="list_model_img" src="https://github.com/user-attachments/assets/34e53937-2d86-443e-85f6-34e408653ddb">
+## Check Job Configuration

-`launch`, `list`, and `status` command supports `--json-mode`, where the command output would be structured as a JSON string.
+With every model launch, a Slurm script will be generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script will be moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration, and will have an entry of server URL once the server is ready.

 ## Send inference requests
+
 Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:

 ```json
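The doctest-style API snippet added in this hunk uses `ModelStatus` without showing where it is imported from and checks the status only once. A slightly fuller, hedged sketch of the same flow with a polling loop (the method and field names come from the snippet; the `ModelStatus` import location and the polling logic are assumptions, not documented API):

```python
# Hedged sketch of the API flow from the README snippet above.
# launch_model, get_status, shutdown_model, slurm_job_id, status and base_url
# come from the snippet; the ModelStatus import path and the polling loop are
# assumptions made for this example.
import time

from vec_inf.api import VecInfClient, ModelStatus  # import location assumed

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
job_id = response.slurm_job_id

# Poll until the Slurm job reports the inference server as READY.
while True:
    status = client.get_status(job_id)
    if status.status == ModelStatus.READY:
        print(f"Model is ready at {status.base_url}")
        break
    time.sleep(30)

client.shutdown_model(job_id)
```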
{vec_inf-0.5.0 → vec_inf-0.6.0}/README.md
@@ -3,12 +3,13 @@
 ----------------------------------------------------

 [![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
+[![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
-[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_deploy.yml)
-[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/develop/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/develop)
+[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
+[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

-This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`cli/_helper.py`](vec_inf/cli/_helper.py), [`cli/_config.py`](vec_inf/cli/_config.py), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.yaml`](vec_inf/config/models.yaml) accordingly.
+This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py), and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.

 ## Installation
 If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -20,7 +21,9 @@ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up

 ## Usage

-### `launch` command
+Vector Inference provides 2 user interfaces, a CLI and an API
+
+### CLI

 The `launch` command allows users to deploy a model as a slurm job. If the job successfully launches, a URL endpoint is exposed for the user to send requests for inference.

@@ -31,18 +34,26 @@ vec-inf launch Meta-Llama-3.1-8B-Instruct
 ```
 You should see an output like the following:

-<img width="600" alt="launch_img" src="https://github.com/user-attachments/assets/883e6a5b-8016-4837-8fdf-39097dfb18bf">
+<img width="600" alt="launch_image" src="https://github.com/user-attachments/assets/a72a99fd-4bf2-408e-8850-359761d96c4f">


 #### Overrides

-Models that are already supported by `vec-inf` would be launched using the cached configuration or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
+Models that are already supported by `vec-inf` would be launched using the cached configuration (set in [slurm_vars.py](vec_inf/client/slurm_vars.py)) or [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be
 overriden. For example, if `qos` is to be overriden:

 ```bash
 vec-inf launch Meta-Llama-3.1-8B-Instruct --qos <new_qos>
 ```

+To overwrite default vLLM engine arguments, you can specify the engine arguments in a comma separated string:
+
+```bash
+vec-inf launch Meta-Llama-3.1-8B-Instruct --vllm-args '--max-model-len=65536,--compilation-config=3'
+```
+
+For the full list of vLLM engine arguments, you can find them [here](https://docs.vllm.ai/en/stable/serving/engine_args.html), make sure you select the correct vLLM version.
+
 #### Custom models

 You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
@@ -67,14 +78,14 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 1010000
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
     model_weights_parent_dir: /h/<username>/model-weights
+    vllm_args:
+      --max-model-len: 1010000
+      --max-num-seqs: 256
+      --compilation-confi: 3
 ```

 You would then set the `VEC_INF_CONFIG` path using:
@@ -83,68 +94,40 @@ You would then set the `VEC_INF_CONFIG` path using:
 export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
 ```

-Note that there are other parameters that can also be added to the config but not shown in this example, such as `data_type` and `log_dir`.
-
-### `status` command
-You can check the inference server status by providing the Slurm job ID to the `status` command:
-```bash
-vec-inf status 15373800
-```
-
-If the server is pending for resources, you should see an output like this:
-
-<img width="400" alt="status_pending_img" src="https://github.com/user-attachments/assets/b659c302-eae1-4560-b7a9-14eb3a822a2f">
-
-When the server is ready, you should see an output like this:
+Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.

-<img width="400" alt="status_ready_img" src="https://github.com/user-attachments/assets/672986c2-736c-41ce-ac7c-1fb585cdcb0d">
+#### Other commands

-There are 5 possible states:
+* `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
+* `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.

-* **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
-* **LAUNCHING**: Job is running but the server is not ready yet.
-* **READY**: Inference server running and ready to take requests.
-* **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
-* **SHUTDOWN**: Inference server is shutdown/cancelled.
+For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)

-Note that the base URL is only available when model is in `READY` state, and if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.
+### API

-### `metrics` command
-Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
-```bash
-vec-inf metrics 15373800
-```
-
-And you will see the performance metrics streamed to your console, note that the metrics are updated with a 2-second interval.
-
-<img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/3ee143d0-1a71-4944-bbd7-4c3299bf0339">
-
-### `shutdown` command
-Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
-```bash
-vec-inf shutdown 15373800
+Example:

-> Shutting down model with Slurm Job ID: 15373800
+```python
+>>> from vec_inf.api import VecInfClient
+>>> client = VecInfClient()
+>>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+>>> job_id = response.slurm_job_id
+>>> status = client.get_status(job_id)
+>>> if status.status == ModelStatus.READY:
+... print(f"Model is ready at {status.base_url}")
+>>> client.shutdown_model(job_id)
 ```

-### `list` command
-You call view the full list of available models by running the `list` command:
-```bash
-vec-inf list
-```
-<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
-
-NOTE: The above screenshot does not represent the full list of models supported.
+For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/)

-You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
-```bash
-vec-inf list Meta-Llama-3.1-70B-Instruct
-```
-<img width="500" alt="list_model_img" src="https://github.com/user-attachments/assets/34e53937-2d86-443e-85f6-34e408653ddb">
+## Check Job Configuration

-`launch`, `list`, and `status` command supports `--json-mode`, where the command output would be structured as a JSON string.
+With every model launch, a Slurm script will be generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script will be moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration, and will have an entry of server URL once the server is ready.

 ## Send inference requests
+
 Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:

 ```json
{vec_inf-0.5.0 → vec_inf-0.6.0}/codecov.yml
@@ -1,5 +1,5 @@
 codecov:
-  branch: develop
+  branch: main
   require_ci_to_pass: true
   notify:
     after_n_builds: 2