vec-inf 0.4.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/code_checks.yml +3 -3
  2. vec_inf-0.6.0/.github/workflows/docker.yml +56 -0
  3. vec_inf-0.6.0/.github/workflows/docs.yml +176 -0
  4. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/publish.yml +14 -9
  5. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/unit_tests.yml +18 -8
  6. {vec_inf-0.4.1 → vec_inf-0.6.0}/.pre-commit-config.yaml +2 -1
  7. vec_inf-0.6.0/Dockerfile +54 -0
  8. vec_inf-0.6.0/PKG-INFO +193 -0
  9. vec_inf-0.6.0/README.md +171 -0
  10. {vec_inf-0.4.1 → vec_inf-0.6.0}/codecov.yml +1 -0
  11. vec_inf-0.6.0/docs/api.md +18 -0
  12. vec_inf-0.6.0/docs/assets/favicon-48x48.svg +9 -0
  13. vec_inf-0.6.0/docs/assets/favicon.ico +0 -0
  14. vec_inf-0.6.0/docs/assets/vector-logo.svg +172 -0
  15. vec_inf-0.6.0/docs/contributing.md +174 -0
  16. {vec_inf-0.4.1/docs/source → vec_inf-0.6.0/docs}/index.md +1 -12
  17. vec_inf-0.6.0/docs/overrides/partials/copyright.html +22 -0
  18. vec_inf-0.6.0/docs/overrides/partials/logo.html +5 -0
  19. vec_inf-0.6.0/docs/stylesheets/extra.css +235 -0
  20. vec_inf-0.6.0/docs/user_guide.md +273 -0
  21. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/README.md +2 -0
  22. vec_inf-0.6.0/examples/api/basic_usage.py +43 -0
  23. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/inference/llm/chat_completions.py +1 -1
  24. vec_inf-0.6.0/mkdocs.yml +99 -0
  25. {vec_inf-0.4.1 → vec_inf-0.6.0}/pyproject.toml +17 -17
  26. vec_inf-0.6.0/tests/test_imports.py +32 -0
  27. vec_inf-0.6.0/tests/vec_inf/cli/test_cli.py +533 -0
  28. vec_inf-0.6.0/tests/vec_inf/cli/test_utils.py +17 -0
  29. vec_inf-0.6.0/tests/vec_inf/client/__init__.py +1 -0
  30. vec_inf-0.6.0/tests/vec_inf/client/test_api.py +130 -0
  31. vec_inf-0.6.0/tests/vec_inf/client/test_examples.py +99 -0
  32. vec_inf-0.6.0/tests/vec_inf/client/test_models.py +56 -0
  33. {vec_inf-0.4.1/tests/vec_inf/cli → vec_inf-0.6.0/tests/vec_inf/client}/test_utils.py +92 -91
  34. vec_inf-0.6.0/uv.lock +4701 -0
  35. {vec_inf-0.4.1 → vec_inf-0.6.0}/vec_inf/README.md +3 -3
  36. vec_inf-0.6.0/vec_inf/cli/_cli.py +340 -0
  37. vec_inf-0.6.0/vec_inf/cli/_helper.py +400 -0
  38. vec_inf-0.6.0/vec_inf/cli/_utils.py +38 -0
  39. vec_inf-0.6.0/vec_inf/cli/_vars.py +32 -0
  40. vec_inf-0.6.0/vec_inf/client/__init__.py +31 -0
  41. vec_inf-0.6.0/vec_inf/client/_client_vars.py +213 -0
  42. vec_inf-0.6.0/vec_inf/client/_exceptions.py +37 -0
  43. vec_inf-0.6.0/vec_inf/client/_helper.py +674 -0
  44. vec_inf-0.6.0/vec_inf/client/_slurm_script_generator.py +179 -0
  45. vec_inf-0.6.0/vec_inf/client/_utils.py +287 -0
  46. vec_inf-0.6.0/vec_inf/client/api.py +302 -0
  47. vec_inf-0.6.0/vec_inf/client/config.py +128 -0
  48. vec_inf-0.6.0/vec_inf/client/models.py +225 -0
  49. vec_inf-0.6.0/vec_inf/client/slurm_vars.py +49 -0
  50. {vec_inf-0.4.1/vec_inf/models → vec_inf-0.6.0/vec_inf/config}/README.md +30 -12
  51. vec_inf-0.6.0/vec_inf/config/models.yaml +1300 -0
  52. vec_inf-0.4.1/.github/workflows/docs_build.yml +0 -44
  53. vec_inf-0.4.1/.github/workflows/docs_deploy.yml +0 -59
  54. vec_inf-0.4.1/Dockerfile +0 -79
  55. vec_inf-0.4.1/PKG-INFO +0 -121
  56. vec_inf-0.4.1/README.md +0 -101
  57. vec_inf-0.4.1/docs/source/_static/custom.js +0 -6
  58. vec_inf-0.4.1/docs/source/_static/logos/vector_logo.png +0 -0
  59. vec_inf-0.4.1/docs/source/_static/require.min.js +0 -1
  60. vec_inf-0.4.1/docs/source/_templates/base.html +0 -120
  61. vec_inf-0.4.1/docs/source/_templates/custom-class-template.rst +0 -34
  62. vec_inf-0.4.1/docs/source/_templates/custom-module-template.rst +0 -66
  63. vec_inf-0.4.1/docs/source/_templates/page.html +0 -219
  64. vec_inf-0.4.1/docs/source/conf.py +0 -113
  65. vec_inf-0.4.1/docs/source/user_guide.md +0 -123
  66. vec_inf-0.4.1/uv.lock +0 -3336
  67. vec_inf-0.4.1/vec_inf/cli/_cli.py +0 -438
  68. vec_inf-0.4.1/vec_inf/cli/_utils.py +0 -147
  69. vec_inf-0.4.1/vec_inf/launch_server.sh +0 -145
  70. vec_inf-0.4.1/vec_inf/models/models.csv +0 -85
  71. vec_inf-0.4.1/vec_inf/multinode_vllm.slurm +0 -124
  72. vec_inf-0.4.1/vec_inf/vllm.slurm +0 -59
  73. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  74. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  75. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  76. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/dependabot.yml +0 -0
  77. {vec_inf-0.4.1 → vec_inf-0.6.0}/.github/pull_request_template.md +0 -0
  78. {vec_inf-0.4.1 → vec_inf-0.6.0}/.gitignore +0 -0
  79. {vec_inf-0.4.1 → vec_inf-0.6.0}/.python-version +0 -0
  80. {vec_inf-0.4.1 → vec_inf-0.6.0}/LICENSE +0 -0
  81. {vec_inf-0.4.1 → vec_inf-0.6.0}/docs/Makefile +0 -0
  82. {vec_inf-0.4.1 → vec_inf-0.6.0}/docs/make.bat +0 -0
  83. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/inference/llm/completions.py +0 -0
  84. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/inference/llm/completions.sh +0 -0
  85. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/inference/text_embedding/embeddings.py +0 -0
  86. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/inference/vlm/vision_completions.py +0 -0
  87. {vec_inf-0.4.1 → vec_inf-0.6.0}/examples/logits/logits.py +0 -0
  88. {vec_inf-0.4.1 → vec_inf-0.6.0}/profile/avg_throughput.py +0 -0
  89. {vec_inf-0.4.1 → vec_inf-0.6.0}/profile/gen.py +0 -0
  90. {vec_inf-0.4.1 → vec_inf-0.6.0}/tests/__init__.py +0 -0
  91. {vec_inf-0.4.1 → vec_inf-0.6.0}/tests/vec_inf/__init__.py +0 -0
  92. {vec_inf-0.4.1 → vec_inf-0.6.0}/tests/vec_inf/cli/__init__.py +0 -0
  93. {vec_inf-0.4.1 → vec_inf-0.6.0}/vec_inf/__init__.py +0 -0
  94. {vec_inf-0.4.1 → vec_inf-0.6.0}/vec_inf/cli/__init__.py +0 -0
  95. {vec_inf-0.4.1 → vec_inf-0.6.0}/vec_inf/find_port.sh +0 -0
  96. {vec_inf-0.4.1 → vec_inf-0.6.0}/venv.sh +0 -0
{vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/code_checks.yml CHANGED
@@ -30,13 +30,13 @@ jobs:
     steps:
       - uses: actions/checkout@v4.2.2
       - name: Install uv
-        uses: astral-sh/setup-uv@v5.2.2
+        uses: astral-sh/setup-uv@v6
        with:
          # Install a specific version of uv.
          version: "0.5.21"
          enable-cache: true
       - name: "Set up Python"
-        uses: actions/setup-python@v5.4.0
+        uses: actions/setup-python@v5.5.0
        with:
          python-version-file: ".python-version"
       - name: Install the project
@@ -46,6 +46,6 @@ jobs:
          source .venv/bin/activate
          pre-commit run --all-files
       - name: pip-audit (gh-action-pip-audit)
-        uses: pypa/gh-action-pip-audit@v1.0.8
+        uses: pypa/gh-action-pip-audit@v1.1.0
        with:
          virtual-environment: .venv/
vec_inf-0.6.0/.github/workflows/docker.yml ADDED
@@ -0,0 +1,56 @@
+name: docker
+
+on:
+  release:
+    types: [published]
+  push:
+    branches:
+      - main
+    paths:
+      - Dockerfile
+      - .github/workflows/docker.yml
+      - uv.lock
+  pull_request:
+    branches:
+      - main
+    paths:
+      - Dockerfile
+      - .github/workflows/docker.yml
+      - uv.lock
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4.2.2
+
+      - name: Extract vLLM version
+        id: vllm-version
+        run: |
+          VERSION=$(grep -A 1 'name = "vllm"' uv.lock | grep version | cut -d '"' -f 2)
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@902fa8ec7d6ecbf8d84d538b9b233a880e428804
+        with:
+          images: vectorinstitute/vector-inference
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+            vectorinstitute/vector-inference:${{ steps.vllm-version.outputs.version }}
+          labels: ${{ steps.meta.outputs.labels }}
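The version-extraction step above relies on each `uv.lock` entry recording its version on the line directly after the package name. A minimal sketch of the pipeline's behavior, using an illustrative lock-file fragment (the pinned version shown is made up):

```bash
# Recreate an illustrative uv.lock fragment and run the extraction pipeline.
cat > /tmp/uv.lock.sample <<'EOF'
[[package]]
name = "vllm"
version = "0.7.3"
EOF

# grep -A 1 prints the matched line plus the one after it; the second grep
# keeps only the version line, and cut takes the value between the quotes.
grep -A 1 'name = "vllm"' /tmp/uv.lock.sample | grep version | cut -d '"' -f 2
# prints: 0.7.3
```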
vec_inf-0.6.0/.github/workflows/docs.yml ADDED
@@ -0,0 +1,176 @@
+name: docs
+permissions:
+  contents: write
+  pull-requests: write
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - .pre-commit-config.yaml
+      - .github/workflows/docs.yml
+      - '**.py'
+      - '**.ipynb'
+      - '**.html'
+      - '**.js'
+      - '**.md'
+      - uv.lock
+      - pyproject.toml
+      - mkdocs.yml
+      - '**.png'
+      - '**.svg'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - .pre-commit-config.yaml
+      - .github/workflows/docs.yml
+      - '**.py'
+      - '**.ipynb'
+      - '**.js'
+      - '**.html'
+      - uv.lock
+      - pyproject.toml
+      - '**.md'
+      - mkdocs.yml
+      - '**.png'
+      - '**.svg'
+  release:
+    types: [published]
+  # Allow manual trigger
+  workflow_dispatch:
+    inputs:
+      version:
+        description: 'Version to deploy (e.g., 0.5.0, latest)'
+        required: true
+        default: 'latest'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4.2.2
+        with:
+          fetch-depth: 0 # Fetch all history for proper versioning
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          version: "0.5.21"
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install the project
+        run: uv sync --all-extras --group docs
+
+      - name: Build docs
+        run: uv run mkdocs build
+
+      - name: Create .nojekyll file
+        run: touch site/.nojekyll
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: docs-site
+          path: site/
+          retention-days: 1
+
+  deploy:
+    needs: build
+    if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4.2.2
+        with:
+          fetch-depth: 0 # Fetch all history for proper versioning
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          version: "0.5.21"
+          enable-cache: true
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: ".python-version"
+
+      - name: Install the project
+        run: uv sync --all-extras --group docs
+
+      - name: Configure Git Credentials
+        run: |
+          git config user.name github-actions[bot]
+          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: docs-site
+          path: site
+
+      - name: Ensure .nojekyll exists
+        run: touch site/.nojekyll
+
+      - name: Determine version
+        id: version
+        run: |
+          if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            # Use the version provided in the workflow dispatch
+            echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "release" ]]; then
+            # Use the tag from the release
+            VERSION="${{ github.ref_name }}"
+            # Remove 'v' prefix if present
+            VERSION="${VERSION#v}"
+            echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then
+            # For pushes to main, tag as "main"
+            echo "VERSION=main" >> $GITHUB_OUTPUT
+            # No alias for main
+            echo "VERSION_ALIAS=" >> $GITHUB_OUTPUT
+          else
+            # Get version from pyproject.toml as fallback
+            VERSION=$(grep -m 1 '^version = ' pyproject.toml | sed 's/^version = "\(.*\)"$/\1/')
+            echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+            echo "VERSION_ALIAS=latest" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Deploy docs with mike
+        run: |
+          VERSION=${{ steps.version.outputs.VERSION }}
+          ALIAS=${{ steps.version.outputs.VERSION_ALIAS }}
+
+          # Add a temporary remote to fetch gh-pages if it exists
+          git remote add temp https://github.com/${{ github.repository }}.git || true
+          git fetch temp gh-pages || true
+
+          DEPLOY_ARGS="--push --update-aliases $VERSION"
+
+          if [[ ! -z "$ALIAS" ]]; then
+            DEPLOY_ARGS="$DEPLOY_ARGS $ALIAS"
+          fi
+
+          # Activate the virtual environment
+          source .venv/bin/activate
+
+          echo "Running: mike deploy $DEPLOY_ARGS"
+          mike deploy $DEPLOY_ARGS
+
+          # Set default version to latest only if we're deploying a version with the latest alias
+          if [[ ! -z "$ALIAS" && "$ALIAS" == "latest" ]]; then
+            mike set-default --push latest
+          fi
+
+          # Remove the temporary remote
+          git remote remove temp || true
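To make the deploy step concrete: for a hypothetical `v0.6.0` release, `VERSION` resolves to `0.6.0` and `ALIAS` to `latest`, so the step effectively runs:

```bash
# Effective mike invocations for a hypothetical v0.6.0 release (alias = latest).
mike deploy --push --update-aliases 0.6.0 latest
mike set-default --push latest
```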
{vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/publish.yml CHANGED
@@ -12,16 +12,21 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install libcurl4-openssl-dev libssl-dev
-      - uses: actions/checkout@v4.1.1
-      - name: Install poetry
-        run: python3 -m pip install --upgrade pip && python3 -m pip install poetry
-      - uses: actions/setup-python@v5.0.0
+
+      - uses: actions/checkout@v4.2.2
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          version: "0.6.6"
+          enable-cache: true
+
+      - uses: actions/setup-python@v5.5.0
        with:
          python-version: '3.10'
+
       - name: Build package
-        run: poetry build
+        run: uv build
+
       - name: Publish package
-        uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
-        with:
-          user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
+        run: uv publish --token ${{ secrets.PYPI_API_TOKEN }}
{vec_inf-0.4.1 → vec_inf-0.6.0}/.github/workflows/unit_tests.yml CHANGED
@@ -39,33 +39,43 @@ on:
 jobs:
   unit-tests:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@v4.2.2

       - name: Install uv
-        uses: astral-sh/setup-uv@v5.2.2
+        uses: astral-sh/setup-uv@v6
        with:
          # Install a specific version of uv.
          version: "0.5.21"
          enable-cache: true

-      - name: "Set up Python"
-        uses: actions/setup-python@v5.4.0
+      - name: "Set up Python ${{ matrix.python-version }}"
+        uses: actions/setup-python@v5.5.0
        with:
-          python-version-file: ".python-version"
+          python-version: ${{ matrix.python-version }}

       - name: Install the project
-        run: uv sync --all-extras --dev
+        run: uv sync --dev

       - name: Install dependencies and check code
        run: |
          uv run pytest -m "not integration_test" --cov vec_inf --cov-report=xml tests

-      # Uncomment this once this repo is configured on Codecov
+      - name: Install the core package only
+        run: uv sync --no-dev
+
+      - name: Run package import tests
+        run: |
+          uv run pytest tests/test_imports.py
+
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5.3.1
+        uses: codecov/codecov-action@v5.4.2
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
-          slug: VectorInstitute/vec-inf
+          file: ./coverage.xml
+          name: codecov-umbrella
          fail_ci_if_error: true
          verbose: true
{vec_inf-0.4.1 → vec_inf-0.6.0}/.pre-commit-config.yaml CHANGED
@@ -13,10 +13,11 @@ repos:
        args: [--fix=lf]
       - id: requirements-txt-fixer
       - id: check-yaml
+        args: [--unsafe]
       - id: check-toml

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: 'v0.9.6'
+    rev: 'v0.11.8'
     hooks:
       - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
vec_inf-0.6.0/Dockerfile ADDED
@@ -0,0 +1,54 @@
+FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
+
+# Non-interactive apt-get commands
+ARG DEBIAN_FRONTEND=noninteractive
+
+# No GPUs visible during build
+ARG CUDA_VISIBLE_DEVICES=none
+
+# Specify CUDA architectures -> 7.5: RTX 6000 & T4, 8.0: A100, 8.6+PTX
+ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6+PTX"
+
+# Set the Python version
+ARG PYTHON_VERSION=3.10.12
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget build-essential libssl-dev zlib1g-dev libbz2-dev \
+    libreadline-dev libsqlite3-dev libffi-dev libncursesw5-dev \
+    xz-utils tk-dev libxml2-dev libxmlsec1-dev liblzma-dev git vim \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python
+RUN wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
+    tar -xzf Python-$PYTHON_VERSION.tgz && \
+    cd Python-$PYTHON_VERSION && \
+    ./configure --enable-optimizations && \
+    make -j$(nproc) && \
+    make altinstall && \
+    cd .. && \
+    rm -rf Python-$PYTHON_VERSION.tgz Python-$PYTHON_VERSION
+
+# Install pip and core Python tools
+RUN wget https://bootstrap.pypa.io/get-pip.py && \
+    python3.10 get-pip.py && \
+    rm get-pip.py && \
+    python3.10 -m pip install --upgrade pip setuptools wheel uv
+
+# Set up project
+WORKDIR /vec-inf
+COPY . /vec-inf
+
+# Install project dependencies with build requirements
+RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu121" uv pip install --system -e .[dev]
+# Install FlashAttention
+RUN python3.10 -m pip install flash-attn --no-build-isolation
+# Install FlashInfer
+RUN python3.10 -m pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/
+
+# Final configuration
+RUN mkdir -p /vec-inf/nccl && \
+    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /vec-inf/nccl/libnccl.so.2.18.1
+
+# Set the default command to start an interactive shell
+CMD ["bash"]
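To build and try the image locally, something like the following should work (a sketch: the tag name is arbitrary, `--gpus all` assumes the NVIDIA Container Toolkit is installed, and the CI workflow above pushes to `vectorinstitute/vector-inference` instead):

```bash
# Build the image from the repository root and open an interactive shell in it.
docker build -t vector-inference .
docker run --gpus all -it vector-inference
```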
vec_inf-0.6.0/PKG-INFO ADDED
@@ -0,0 +1,193 @@
+Metadata-Version: 2.4
+Name: vec-inf
+Version: 0.6.0
+Summary: Efficient LLM inference on Slurm clusters using vLLM.
+Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: click>=8.1.0
+Requires-Dist: pydantic>=2.10.6
+Requires-Dist: pyyaml>=6.0.2
+Requires-Dist: requests>=2.31.0
+Requires-Dist: rich>=13.7.0
+Provides-Extra: dev
+Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
+Requires-Dist: ray>=2.40.0; extra == 'dev'
+Requires-Dist: torch>=2.5.1; extra == 'dev'
+Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
+Requires-Dist: vllm>=0.7.3; extra == 'dev'
+Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
+Description-Content-Type: text/markdown
+
+# Vector Inference: Easy inference on Slurm clusters
+
+----------------------------------------------------
+
+[![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
+[![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
+[![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
+[![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
+[![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
+![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
+
+This repository provides an easy-to-use solution for running inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment.** To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py) and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.
+
+## Installation
+If you are using the Vector cluster environment and don't need any customization to the inference server environment, run the following to install the package:
+
+```bash
+pip install vec-inf
+```
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package.
+
+## Usage
+
+Vector Inference provides two user interfaces: a CLI and an API.
+
+### CLI
+
+The `launch` command allows users to deploy a model as a Slurm job. If the job successfully launches, a URL endpoint is exposed for the user to send requests for inference.
+
+We will use the Llama 3.1 model as an example. To launch an OpenAI-compatible inference server for Meta-Llama-3.1-8B-Instruct, run:
+
+```bash
+vec-inf launch Meta-Llama-3.1-8B-Instruct
+```
+You should see an output like the following:
+
+<img width="600" alt="launch_image" src="https://github.com/user-attachments/assets/a72a99fd-4bf2-408e-8850-359761d96c4f">
+
+
+#### Overrides
+
+Models that are already supported by `vec-inf` are launched using the cached configuration (set in [slurm_vars.py](vec_inf/client/slurm_vars.py)) or the [default configuration](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be overridden. For example, if `qos` is to be overridden:
+
+```bash
+vec-inf launch Meta-Llama-3.1-8B-Instruct --qos <new_qos>
+```
+
+To overwrite default vLLM engine arguments, you can specify the engine arguments in a comma-separated string:
+
+```bash
+vec-inf launch Meta-Llama-3.1-8B-Instruct --vllm-args '--max-model-len=65536,--compilation-config=3'
+```
+
+The full list of vLLM engine arguments can be found [here](https://docs.vllm.ai/en/stable/serving/engine_args.html); make sure you select the correct vLLM version.
+
+#### Custom models
+
+You can also launch your own custom model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html); make sure to follow the instructions below:
+* Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT` ($MODEL_VARIANT is OPTIONAL).
+* Your model weights directory should contain HuggingFace-format weights.
+* You should specify your model configuration by:
+  * Creating a custom configuration file for your model and specifying its path via the environment variable `VEC_INF_CONFIG`. Check the [default parameters](vec_inf/config/models.yaml) file for the format of the config file. All the parameters for the model should be specified in that config file.
+  * Using launch command options to specify your model setup.
+* For other model launch parameters, you can reference the default values for similar models using the [`list` command](#list-command).
+
+Here is an example of deploying a custom [Qwen2.5-7B-Instruct-1M](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M) model, which is not in the default list of supported models, using a custom user config. In this case, the model weights are assumed to have been downloaded to a `model-weights` directory inside the user's home directory. The weights directory follows the naming convention, so it would be named `Qwen2.5-7B-Instruct-1M`. The following YAML file would need to be created; let's say it is named `/h/<username>/my-model-config.yaml`.
+
+```yaml
+models:
+  Qwen2.5-7B-Instruct-1M:
+    model_family: Qwen2.5
+    model_variant: 7B-Instruct-1M
+    model_type: LLM
+    gpus_per_node: 1
+    num_nodes: 1
+    vocab_size: 152064
+    qos: m2
+    time: 08:00:00
+    partition: a40
+    model_weights_parent_dir: /h/<username>/model-weights
+    vllm_args:
+      --max-model-len: 1010000
+      --max-num-seqs: 256
+      --compilation-config: 3
+```
+
+You would then set the `VEC_INF_CONFIG` path using:
+
+```bash
+export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
+```
+
+Note that there are other parameters that can also be added to the config but are not shown in this example; check [`ModelConfig`](vec_inf/client/config.py) for details.
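With the config file in place and `VEC_INF_CONFIG` exported, the custom model can then be launched by name just like a supported model:

```bash
# Launch the custom model defined in my-model-config.yaml.
vec-inf launch Qwen2.5-7B-Instruct-1M
```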
+
+#### Other commands
+
+* `status`: Check the model status by providing its Slurm job ID; `--json-mode` supported.
+* `metrics`: Stream performance metrics to the console.
+* `shutdown`: Shut down a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model; `--json-mode` supported.
+
+For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)
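As a rough sketch of how these commands are invoked (the Slurm job ID `13898765` is a placeholder; see the User Guide for exact options):

```bash
vec-inf status 13898765 --json-mode      # check the status of a launched model
vec-inf metrics 13898765                 # stream performance metrics
vec-inf list                             # list all available model names
vec-inf list Meta-Llama-3.1-8B-Instruct  # view one model's configuration
vec-inf shutdown 13898765                # shut the model down
```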
+
+### API
+
+Example:
+
+```python
+>>> from vec_inf.client import VecInfClient, ModelStatus
+>>> client = VecInfClient()
+>>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+>>> job_id = response.slurm_job_id
+>>> status = client.get_status(job_id)
+>>> if status.status == ModelStatus.READY:
+...     print(f"Model is ready at {status.base_url}")
+>>> client.shutdown_model(job_id)
+```
+
+For details on the usage of the API, refer to the [API Reference](https://vectorinstitute.github.io/vector-inference/api/)
+
+## Check Job Configuration
+
+With every model launch, a Slurm script is generated dynamically based on the job and model configuration. Once the Slurm job is queued, the generated Slurm script is moved to the log directory for reproducibility, located at `$log_dir/$model_family/$model_name.$slurm_job_id/$model_name.$slurm_job_id.slurm`. In the same directory you can also find a JSON file with the same name that captures the launch configuration and, once the server is ready, an entry with the server URL.
+
+## Send inference requests
+
+Once the inference server is ready, you can start sending inference requests. We provide example scripts for sending inference requests in the [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/chat_completions.py`, and you should expect to see an output like the following:
+
+```json
+{
+  "id":"chatcmpl-387c2579231948ffaf66cdda5439d3dc",
+  "choices": [
+    {
+      "finish_reason":"stop",
+      "index":0,
+      "logprobs":null,
+      "message": {
+        "content":"Arrr, I be Captain Chatbeard, the scurviest chatbot on the seven seas! Ye be wantin' to know me identity, eh? Well, matey, I be a swashbucklin' AI, here to provide ye with answers and swappin' tales, savvy?",
+        "role":"assistant",
+        "function_call":null,
+        "tool_calls":[],
+        "reasoning_content":null
+      },
+      "stop_reason":null
+    }
+  ],
+  "created":1742496683,
+  "model":"Meta-Llama-3.1-8B-Instruct",
+  "object":"chat.completion",
+  "system_fingerprint":null,
+  "usage": {
+    "completion_tokens":66,
+    "prompt_tokens":32,
+    "total_tokens":98,
+    "prompt_tokens_details":null
+  },
+  "prompt_logprobs":null
+}
+```
+**NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
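The output above comes from a standard OpenAI-style chat completion request. A minimal `curl` sketch, assuming the server URL for your job is `http://gpu029:8081` (substitute the URL reported for your own job):

```bash
# Send a chat completion request to the OpenAI-compatible endpoint.
curl http://gpu029:8081/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Meta-Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "Who are you?"}]
      }'
```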
+
+## SSH tunnel from your local device
+If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
+```bash
+ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
+```
+The last number in the IP address corresponds to the GPU node (gpu029 in this case). The example provided above is for the Vector cluster; change the variables accordingly for your environment.
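With the tunnel open, the server is reachable from your local machine. One quick check, assuming the server exposes the standard OpenAI-compatible `/v1/models` endpoint:

```bash
# List the models served through the tunnel.
curl http://localhost:8081/v1/models
```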