vec-inf 0.6.0.tar.gz → 0.6.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/workflows/docker.yml +1 -1
  2. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/workflows/unit_tests.yml +1 -1
  3. {vec_inf-0.6.0 → vec_inf-0.6.1}/.pre-commit-config.yaml +1 -1
  4. {vec_inf-0.6.0 → vec_inf-0.6.1}/PKG-INFO +11 -5
  5. {vec_inf-0.6.0 → vec_inf-0.6.1}/README.md +10 -4
  6. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/index.md +1 -1
  7. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/user_guide.md +6 -3
  8. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/README.md +1 -0
  9. vec_inf-0.6.1/examples/slurm_dependency/README.md +33 -0
  10. vec_inf-0.6.1/examples/slurm_dependency/downstream_job.sbatch +18 -0
  11. vec_inf-0.6.1/examples/slurm_dependency/run_downstream.py +26 -0
  12. vec_inf-0.6.1/examples/slurm_dependency/run_workflow.sh +14 -0
  13. {vec_inf-0.6.0 → vec_inf-0.6.1}/pyproject.toml +1 -1
  14. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/client/test_api.py +82 -0
  15. vec_inf-0.6.1/tests/vec_inf/client/test_helper.py +578 -0
  16. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/cli/_cli.py +26 -1
  17. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/_client_vars.py +23 -5
  18. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/_helper.py +0 -13
  19. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/_slurm_script_generator.py +7 -8
  20. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/config.py +10 -0
  21. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/models.py +9 -0
  22. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/config/models.yaml +0 -7
  23. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  24. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  25. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  26. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/dependabot.yml +0 -0
  27. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/pull_request_template.md +0 -0
  28. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/workflows/code_checks.yml +0 -0
  29. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/workflows/docs.yml +0 -0
  30. {vec_inf-0.6.0 → vec_inf-0.6.1}/.github/workflows/publish.yml +0 -0
  31. {vec_inf-0.6.0 → vec_inf-0.6.1}/.gitignore +0 -0
  32. {vec_inf-0.6.0 → vec_inf-0.6.1}/.python-version +0 -0
  33. {vec_inf-0.6.0 → vec_inf-0.6.1}/Dockerfile +0 -0
  34. {vec_inf-0.6.0 → vec_inf-0.6.1}/LICENSE +0 -0
  35. {vec_inf-0.6.0 → vec_inf-0.6.1}/codecov.yml +0 -0
  36. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/Makefile +0 -0
  37. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/api.md +0 -0
  38. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/assets/favicon-48x48.svg +0 -0
  39. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/assets/favicon.ico +0 -0
  40. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/assets/vector-logo.svg +0 -0
  41. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/contributing.md +0 -0
  42. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/make.bat +0 -0
  43. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/overrides/partials/copyright.html +0 -0
  44. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/overrides/partials/logo.html +0 -0
  45. {vec_inf-0.6.0 → vec_inf-0.6.1}/docs/stylesheets/extra.css +0 -0
  46. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/api/basic_usage.py +0 -0
  47. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/inference/llm/chat_completions.py +0 -0
  48. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/inference/llm/completions.py +0 -0
  49. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/inference/llm/completions.sh +0 -0
  50. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/inference/text_embedding/embeddings.py +0 -0
  51. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/inference/vlm/vision_completions.py +0 -0
  52. {vec_inf-0.6.0 → vec_inf-0.6.1}/examples/logits/logits.py +0 -0
  53. {vec_inf-0.6.0 → vec_inf-0.6.1}/mkdocs.yml +0 -0
  54. {vec_inf-0.6.0 → vec_inf-0.6.1}/profile/avg_throughput.py +0 -0
  55. {vec_inf-0.6.0 → vec_inf-0.6.1}/profile/gen.py +0 -0
  56. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/__init__.py +0 -0
  57. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/test_imports.py +0 -0
  58. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/__init__.py +0 -0
  59. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/cli/__init__.py +0 -0
  60. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/cli/test_cli.py +0 -0
  61. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/cli/test_utils.py +0 -0
  62. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/client/__init__.py +0 -0
  63. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/client/test_examples.py +0 -0
  64. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/client/test_models.py +0 -0
  65. {vec_inf-0.6.0 → vec_inf-0.6.1}/tests/vec_inf/client/test_utils.py +0 -0
  66. {vec_inf-0.6.0 → vec_inf-0.6.1}/uv.lock +0 -0
  67. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/README.md +0 -0
  68. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/__init__.py +0 -0
  69. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/cli/__init__.py +0 -0
  70. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/cli/_helper.py +0 -0
  71. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/cli/_utils.py +0 -0
  72. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/cli/_vars.py +0 -0
  73. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/__init__.py +0 -0
  74. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/_exceptions.py +0 -0
  75. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/_utils.py +0 -0
  76. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/api.py +0 -0
  77. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/client/slurm_vars.py +0 -0
  78. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/config/README.md +0 -0
  79. {vec_inf-0.6.0 → vec_inf-0.6.1}/vec_inf/find_port.sh +0 -0
  80. {vec_inf-0.6.0 → vec_inf-0.6.1}/venv.sh +0 -0
.github/workflows/docker.yml
@@ -45,7 +45,7 @@ jobs:
  images: vectorinstitute/vector-inference

  - name: Build and push Docker image
- uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1
+ uses: docker/build-push-action@1dc73863535b631f98b2378be8619f83b136f4a0
  with:
  context: .
  file: ./Dockerfile
.github/workflows/unit_tests.yml
@@ -72,7 +72,7 @@ jobs:
  uv run pytest tests/test_imports.py

  - name: Upload coverage to Codecov
- uses: codecov/codecov-action@v5.4.2
+ uses: codecov/codecov-action@v5.4.3
  with:
  token: ${{ secrets.CODECOV_TOKEN }}
  file: ./coverage.xml
.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
  - id: check-toml

  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: 'v0.11.8'
+ rev: 'v0.11.11'
  hooks:
  - id: ruff
  args: [--fix, --exit-non-zero-on-fix]
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: vec-inf
- Version: 0.6.0
+ Version: 0.6.1
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
  Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
  License-Expression: MIT
@@ -29,6 +29,7 @@ Description-Content-Type: text/markdown
  [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
  [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
  [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
+ [![vLLM](https://img.shields.io/badge/vllm-0.8.5.post1-blue)](https://docs.vllm.ai/en/v0.8.5.post1/index.html)
  ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

  This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py) and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.
@@ -39,7 +40,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
  ```bash
  pip install vec-inf
  ```
- Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package
+ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.

  ## Usage

@@ -107,7 +108,7 @@ models:
  vllm_args:
  --max-model-len: 1010000
  --max-num-seqs: 256
- --compilation-confi: 3
+ --compilation-config: 3
  ```

  You would then set the `VEC_INF_CONFIG` path using:
@@ -116,7 +117,11 @@ You would then set the `VEC_INF_CONFIG` path using:
  export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
  ```

- Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.
+ **NOTE**
+ * There are other parameters that can be added to the config but are not shown in this example; check [`ModelConfig`](vec_inf/client/config.py) for details.
+ * Check [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/serving/engine_args.html) for the full list of available vLLM engine arguments. The default parallel size for any parallelization defaults to 1, so none of the sizes were set specifically in this example.
+ * For GPU partitions with non-Ampere architectures, e.g. `rtx6000`, `t4v2`, BF16 isn't supported. For models that have BF16 as the default dtype, use FP16 instead on these GPUs, i.e. `--dtype: float16`.
+ * Setting `--compilation-config` to `3` currently breaks multi-node model launches, so we don't set it for models that require multiple nodes of GPUs.

  #### Other commands

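To make the notes above concrete, here is a hypothetical `VEC_INF_CONFIG` entry that applies them. The model name is a placeholder and the exact set of fields around `vllm_args` is only sketched; check [`ModelConfig`](vec_inf/client/config.py) and the bundled `models.yaml` for the real schema.

```yaml
models:
  My-Model-8B:                  # hypothetical model name
    vllm_args:
      --max-model-len: 1010000
      --max-num-seqs: 256
      # Omit --compilation-config: 3 for models that span multiple nodes (see note above).
      --compilation-config: 3
      # Only needed on non-Ampere partitions (e.g. rtx6000, t4v2) where BF16 is unsupported.
      --dtype: float16
```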
@@ -182,8 +187,9 @@ Once the inference server is ready, you can start sending in inference requests.
  },
  "prompt_logprobs":null
  }
+
  ```
- **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
+ **NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH`.

  ## SSH tunnel from your local device
  If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
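To illustrate the chat route that the note above refers to, a minimal request against the OpenAI-compatible server could look like the sketch below. The base URL and model name are placeholders borrowed from other parts of this release (the test suite and the SLURM dependency example), not values to expect verbatim.

```python
from openai import OpenAI

# Placeholder endpoint; use the base URL reported for your launched server.
client = OpenAI(base_url="http://gpu123:8080/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Meta-Llama-3.1-8B-Instruct",  # placeholder model name
    messages=[{"role": "user", "content": "Where is the capital of Canada?"}],
    max_tokens=20,
)
print(response.choices[0].message.content)
```

For models that don't follow OpenAI's chat template, the same call works once a template is supplied via `--chat-template: TEMPLATE_PATH`, or you can format the prompt to the model's own template and use the completions endpoint instead.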
README.md
@@ -7,6 +7,7 @@
  [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
  [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
  [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
+ [![vLLM](https://img.shields.io/badge/vllm-0.8.5.post1-blue)](https://docs.vllm.ai/en/v0.8.5.post1/index.html)
  ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)

  This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update the environment variables in [`vec_inf/client/slurm_vars.py`](vec_inf/client/slurm_vars.py) and the model config for cached model weights in [`vec_inf/config/models.yaml`](vec_inf/config/models.yaml) accordingly.
@@ -17,7 +18,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
  ```bash
  pip install vec-inf
  ```
- Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package
+ Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.

  ## Usage

@@ -85,7 +86,7 @@ models:
  vllm_args:
  --max-model-len: 1010000
  --max-num-seqs: 256
- --compilation-confi: 3
+ --compilation-config: 3
  ```

  You would then set the `VEC_INF_CONFIG` path using:
@@ -94,7 +95,11 @@ You would then set the `VEC_INF_CONFIG` path using:
  export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
  ```

- Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](vec_inf/client/config.py) for details.
+ **NOTE**
+ * There are other parameters that can be added to the config but are not shown in this example; check [`ModelConfig`](vec_inf/client/config.py) for details.
+ * Check [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/serving/engine_args.html) for the full list of available vLLM engine arguments. The default parallel size for any parallelization defaults to 1, so none of the sizes were set specifically in this example.
+ * For GPU partitions with non-Ampere architectures, e.g. `rtx6000`, `t4v2`, BF16 isn't supported. For models that have BF16 as the default dtype, use FP16 instead on these GPUs, i.e. `--dtype: float16`.
+ * Setting `--compilation-config` to `3` currently breaks multi-node model launches, so we don't set it for models that require multiple nodes of GPUs.

  #### Other commands

@@ -160,8 +165,9 @@ Once the inference server is ready, you can start sending in inference requests.
  },
  "prompt_logprobs":null
  }
+
  ```
- **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
+ **NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH`.

  ## SSH tunnel from your local device
  If you want to run inference from your local device, you can open an SSH tunnel to your cluster environment like the following:
docs/index.md
@@ -10,4 +10,4 @@ If you are using the Vector cluster environment, and you don't need any customiz
  pip install vec-inf
  ```

- Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package.
+ Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.8.5.post1`.
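Since the Docker image is the recommended route off-cluster, a typical invocation might look like the sketch below. The `latest` tag and the mount path are assumptions, not something this release pins down; check the registry for the tags actually published for `vectorinstitute/vector-inference`.

```bash
# Pull the published image (tag is an assumption; check the registry for available tags).
docker pull vectorinstitute/vector-inference:latest

# Run an interactive shell with GPU access and a local model-weights cache mounted.
docker run --gpus all -it --rm \
  -v "$HOME/model-weights:/model-weights" \
  vectorinstitute/vector-inference:latest bash
```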
docs/user_guide.md
@@ -91,7 +91,11 @@ You would then set the `VEC_INF_CONFIG` path using:
  export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
  ```

- Note that there are other parameters that can also be added to the config but not shown in this example, check the [`ModelConfig`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/client/config.py) for details.
+ **NOTE**
+ * There are other parameters that can be added to the config but are not shown in this example; check [`ModelConfig`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/client/config.py) for details.
+ * Check [vLLM Engine Arguments](https://docs.vllm.ai/en/stable/serving/engine_args.html) for the full list of available vLLM engine arguments. The default parallel size for any parallelization defaults to 1, so none of the sizes were set specifically in this example.
+ * For GPU partitions with non-Ampere architectures, e.g. `rtx6000`, `t4v2`, BF16 isn't supported. For models that have BF16 as the default dtype, use FP16 instead on these GPUs, i.e. `--dtype: float16`.
+ * Setting `--compilation-config` to `3` currently breaks multi-node model launches, so we don't set it for models that require multiple nodes of GPUs.

  ### `status` command

@@ -254,8 +258,7 @@ Once the inference server is ready, you can start sending in inference requests.
  }
  ```

-
- **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
+ **NOTE**: Certain models don't adhere to OpenAI's chat template, e.g. the Mistral family. For these models, you can either change your prompt to follow the model's default chat template or provide your own chat template via `--chat-template: TEMPLATE_PATH`.

  ## SSH tunnel from your local device

examples/README.md
@@ -9,3 +9,4 @@
  - [`logits.py`](logits/logits.py): Python example of getting logits from hosted model.
  - [`api`](api): Examples for using the Python API
  - [`basic_usage.py`](api/basic_usage.py): Basic Python example demonstrating the Vector Inference API
+ - [`slurm_dependency`](slurm_dependency): Example of launching a model with `vec-inf` and running a downstream SLURM job that waits for the server to be ready before sending a request.
examples/slurm_dependency/README.md (new file)
@@ -0,0 +1,33 @@
+ # SLURM Dependency Workflow Example
+
+ This example demonstrates how to launch a model server using `vec-inf`, and run a downstream SLURM job that waits for the server to become ready before querying it.
+
+ ## Files
+
+ This directory contains the following:
+
+ 1. [run_workflow.sh](run_workflow.sh)
+    Launches the model server and submits the downstream job with a dependency, so it starts only after the server job begins running.
+
+ 2. [downstream_job.sbatch](downstream_job.sbatch)
+    A SLURM job script that runs the downstream logic (e.g., prompting the model).
+
+ 3. [run_downstream.py](run_downstream.py)
+    A Python script that waits until the inference server is ready, then sends a request using the OpenAI-compatible API.
+
+ ## What to update
+
+ Before running this example, update the following in [downstream_job.sbatch](downstream_job.sbatch):
+
+ - `--job-name`, `--output`, and `--error` paths
+ - Virtual environment path in the `source` line
+ - SLURM resource configuration (e.g., partition, memory, GPU)
+
+ Also update the model name in [run_downstream.py](run_downstream.py) to match what you're launching.
+
+ ## Running the example
+
+ First, activate a virtual environment where `vec-inf` is installed. Then, from this directory, run:
+
+ ```bash
+ bash run_workflow.sh
examples/slurm_dependency/downstream_job.sbatch (new file)
@@ -0,0 +1,18 @@
+ #!/bin/bash
+ #SBATCH --job-name=Meta-Llama-3.1-8B-Instruct-downstream
+ #SBATCH --partition=a40
+ #SBATCH --qos=m2
+ #SBATCH --time=08:00:00
+ #SBATCH --nodes=1
+ #SBATCH --gpus-per-node=1
+ #SBATCH --cpus-per-task=4
+ #SBATCH --mem=8G
+ #SBATCH --output=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.out
+ #SBATCH --error=$HOME/.vec-inf-logs/Meta-Llama-3.1-8B-Instruct-downstream.%j.err
+
+ # Activate your environment
+ # TODO: update this path to match your venv location
+ source $HOME/vector-inference/.venv/bin/activate
+
+ # Wait for the server to be ready using the job ID passed as CLI arg
+ python run_downstream.py "$SERVER_JOB_ID"
examples/slurm_dependency/run_downstream.py (new file)
@@ -0,0 +1,26 @@
+ """Example script to query a launched model via the OpenAI-compatible API."""
+
+ import sys
+
+ from openai import OpenAI
+
+ from vec_inf.client import VecInfClient
+
+
+ if len(sys.argv) < 2:
+     raise ValueError("Expected server job ID as the first argument.")
+ job_id = int(sys.argv[1])
+
+ vi_client = VecInfClient()
+ print(f"Waiting for SLURM job {job_id} to be ready...")
+ status = vi_client.wait_until_ready(slurm_job_id=job_id)
+ print(f"Server is ready at {status.base_url}")
+
+ api_client = OpenAI(base_url=status.base_url, api_key="EMPTY")
+ resp = api_client.completions.create(
+     model="Meta-Llama-3.1-8B-Instruct",
+     prompt="Where is the capital of Canada?",
+     max_tokens=20,
+ )
+
+ print(resp)
examples/slurm_dependency/run_workflow.sh (new file)
@@ -0,0 +1,14 @@
+ #!/bin/bash
+
+ # ---- Config ----
+ MODEL_NAME="Meta-Llama-3.1-8B-Instruct"
+ LAUNCH_ARGS="$MODEL_NAME"
+
+ # ---- Step 1: Launch the server
+ RAW_JSON=$(vec-inf launch $LAUNCH_ARGS --json-mode)
+ SERVER_JOB_ID=$(echo "$RAW_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin)['slurm_job_id'])")
+ echo "Launched server as job $SERVER_JOB_ID"
+ echo "$RAW_JSON"
+
+ # ---- Step 2: Submit downstream job
+ sbatch --dependency=after:$SERVER_JOB_ID --export=SERVER_JOB_ID=$SERVER_JOB_ID downstream_job.sbatch
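For comparison, roughly the same workflow can be driven from a single Python process using the client API. This is a sketch that assumes `VecInfClient` exposes a `launch_model` method whose response carries `slurm_job_id`; the `wait_until_ready` call is the one used in `run_downstream.py` above.

```python
"""Sketch: launch, wait for, and query a model entirely through the Python client."""

from openai import OpenAI

from vec_inf.client import VecInfClient

client = VecInfClient()

# Assumption: launch_model exists and returns a response carrying the SLURM job ID.
launch = client.launch_model("Meta-Llama-3.1-8B-Instruct")
job_id = launch.slurm_job_id
print(f"Launched server as job {job_id}")

# Same readiness wait as run_downstream.py, just without a separate SLURM job.
status = client.wait_until_ready(slurm_job_id=job_id)

api = OpenAI(base_url=status.base_url, api_key="EMPTY")
resp = api.completions.create(
    model="Meta-Llama-3.1-8B-Instruct",
    prompt="Where is the capital of Canada?",
    max_tokens=20,
)
print(resp)
```

The SLURM-dependency version above is still preferable when the downstream work itself needs cluster resources, since this sketch keeps the waiting process on the submit host.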
pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "vec-inf"
- version = "0.6.0"
+ version = "0.6.1"
  description = "Efficient LLM inference on Slurm clusters using vLLM."
  readme = "README.md"
  authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
tests/vec_inf/client/test_api.py
@@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch
  import pytest

  from vec_inf.client import ModelStatus, ModelType, VecInfClient
+ from vec_inf.client._exceptions import ServerError, SlurmJobError


  @pytest.fixture
@@ -128,3 +129,84 @@ def test_wait_until_ready():
  assert result.server_status == ModelStatus.READY
  assert result.base_url == "http://gpu123:8080/v1"
  assert mock_status.call_count == 2
+
+
+ def test_shutdown_model_success():
+     """Test model shutdown success."""
+     client = VecInfClient()
+     with patch("vec_inf.client.api.run_bash_command") as mock_command:
+         mock_command.return_value = ("", "")
+         result = client.shutdown_model(12345)
+
+     assert result is True
+     mock_command.assert_called_once_with("scancel 12345")
+
+
+ def test_shutdown_model_failure():
+     """Test model shutdown failure."""
+     client = VecInfClient()
+     with patch("vec_inf.client.api.run_bash_command") as mock_command:
+         mock_command.return_value = ("", "Error: Job not found")
+         with pytest.raises(
+             SlurmJobError, match="Failed to shutdown model: Error: Job not found"
+         ):
+             client.shutdown_model(12345)
+
+
+ def test_wait_until_ready_timeout():
+     """Test timeout in wait_until_ready."""
+     client = VecInfClient()
+
+     with patch.object(client, "get_status") as mock_status:
+         mock_response = MagicMock()
+         mock_response.server_status = ModelStatus.LAUNCHING
+         mock_status.return_value = mock_response
+
+         with (
+             patch("time.sleep"),
+             pytest.raises(ServerError, match="Timed out waiting for model"),
+         ):
+             client.wait_until_ready(12345, timeout_seconds=1, poll_interval_seconds=0.5)
+
+
+ def test_wait_until_ready_failed_status():
+     """Test wait_until_ready when model fails."""
+     client = VecInfClient()
+
+     with patch.object(client, "get_status") as mock_status:
+         mock_response = MagicMock()
+         mock_response.server_status = ModelStatus.FAILED
+         mock_response.failed_reason = "Out of memory"
+         mock_status.return_value = mock_response
+
+         with pytest.raises(ServerError, match="Model failed to start: Out of memory"):
+             client.wait_until_ready(12345)
+
+
+ def test_wait_until_ready_failed_no_reason():
+     """Test wait_until_ready when model fails without reason."""
+     client = VecInfClient()
+
+     with patch.object(client, "get_status") as mock_status:
+         mock_response = MagicMock()
+         mock_response.server_status = ModelStatus.FAILED
+         mock_response.failed_reason = None
+         mock_status.return_value = mock_response
+
+         with pytest.raises(ServerError, match="Model failed to start: Unknown error"):
+             client.wait_until_ready(12345)
+
+
+ def test_wait_until_ready_shutdown():
+     """Test wait_until_ready when model is shutdown."""
+     client = VecInfClient()
+
+     with patch.object(client, "get_status") as mock_status:
+         mock_response = MagicMock()
+         mock_response.server_status = ModelStatus.SHUTDOWN
+         mock_status.return_value = mock_response
+
+         with pytest.raises(
+             ServerError, match="Model was shutdown before it became ready"
+         ):
+             client.wait_until_ready(12345)
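These tests pin down the client's failure behavior: `shutdown_model` raises `SlurmJobError` when `scancel` reports an error, and `wait_until_ready` raises `ServerError` on timeout, on a `FAILED` status, or if the model is shut down first. A small usage sketch of that contract (the job ID and timing values are placeholders):

```python
from vec_inf.client import VecInfClient
from vec_inf.client._exceptions import ServerError, SlurmJobError

client = VecInfClient()
job_id = 12345  # placeholder SLURM job ID

try:
    # Poll every 10 s and give up after 30 minutes (placeholder values).
    status = client.wait_until_ready(
        job_id, timeout_seconds=1800, poll_interval_seconds=10
    )
    print(f"Server ready at {status.base_url}")
except ServerError as err:
    # Raised on timeout, a FAILED status, or a shutdown before readiness.
    print(f"Server never became ready: {err}")

try:
    client.shutdown_model(job_id)
except SlurmJobError as err:
    # Raised when scancel reports an error for the job.
    print(f"Could not cancel job {job_id}: {err}")
```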