vec-inf 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,213 @@
+ """Global variables for Vector Inference.
+
+ This module contains configuration constants and templates used throughout the
+ Vector Inference package, including SLURM script templates, model configurations,
+ and metric definitions.
+
+ Constants
+ ---------
+ MODEL_READY_SIGNATURE : str
+     Signature string indicating successful model server startup
+ SRC_DIR : str
+     Absolute path to the package source directory
+ REQUIRED_FIELDS : set
+     Set of required fields for model configuration
+ KEY_METRICS : dict
+     Mapping of vLLM metrics to their human-readable names
+ SLURM_JOB_CONFIG_ARGS : dict
+     Mapping of SLURM configuration arguments to their parameter names
+ """
+
+ from pathlib import Path
+ from typing import TypedDict
+
+ from vec_inf.client.slurm_vars import SINGULARITY_LOAD_CMD
+
+
+ MODEL_READY_SIGNATURE = "INFO: Application startup complete."
+ SRC_DIR = str(Path(__file__).parent.parent)
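MODEL_READY_SIGNATURE is the line the server prints to its log once startup completes, so a launcher can poll the job's log file for it. A minimal sketch of that polling loop, assuming a hypothetical wait_until_ready helper (not part of the package):

import time
from pathlib import Path

# Value from the module above (as shown in this diff).
MODEL_READY_SIGNATURE = "INFO: Application startup complete."

def wait_until_ready(log_file: Path, timeout_s: float = 600.0, poll_s: float = 5.0) -> bool:
    """Poll a server log until the startup signature appears (hypothetical helper)."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if log_file.exists() and MODEL_READY_SIGNATURE in log_file.read_text(errors="replace"):
            return True
        time.sleep(poll_s)
    return False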
+
+
+ # Required fields for model configuration
+ REQUIRED_FIELDS = {
+     "model_family",
+     "model_type",
+     "gpus_per_node",
+     "num_nodes",
+     "vocab_size",
+ }
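REQUIRED_FIELDS pairs naturally with MissingRequiredFieldsError from the second hunk of this diff. A sketch of the obvious validation step (validate_config is illustrative; the exception class is re-declared so the snippet runs standalone):

REQUIRED_FIELDS = {"model_family", "model_type", "gpus_per_node", "num_nodes", "vocab_size"}

class MissingRequiredFieldsError(ValueError):
    """Stand-in for the exception defined later in this diff."""

def validate_config(config: dict) -> None:
    # Set difference yields exactly the fields the caller forgot to supply.
    missing = REQUIRED_FIELDS - config.keys()
    if missing:
        raise MissingRequiredFieldsError(f"missing required fields: {sorted(missing)}")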
+
+
+ # Key production metrics for inference servers
+ KEY_METRICS = {
+     "vllm:prompt_tokens_total": "total_prompt_tokens",
+     "vllm:generation_tokens_total": "total_generation_tokens",
+     "vllm:e2e_request_latency_seconds_sum": "request_latency_sum",
+     "vllm:e2e_request_latency_seconds_count": "request_latency_count",
+     "vllm:request_queue_time_seconds_sum": "queue_time_sum",
+     "vllm:request_success_total": "successful_requests_total",
+     "vllm:num_requests_running": "requests_running",
+     "vllm:num_requests_waiting": "requests_waiting",
+     "vllm:num_requests_swapped": "requests_swapped",
+     "vllm:gpu_cache_usage_perc": "gpu_cache_usage",
+     "vllm:cpu_cache_usage_perc": "cpu_cache_usage",
+ }
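KEY_METRICS renames raw vLLM Prometheus counters and gauges to friendlier keys. A rough sketch of how a scrape of the server's /metrics text could be reduced through that mapping (the parser is deliberately naive: it strips labels, skips comment lines, and ignores timestamps):

KEY_METRICS = {
    "vllm:num_requests_running": "requests_running",
    "vllm:gpu_cache_usage_perc": "gpu_cache_usage",
}  # abbreviated; see the full mapping above

def parse_metrics(prometheus_text: str) -> dict[str, float]:
    parsed: dict[str, float] = {}
    for line in prometheus_text.splitlines():
        if not line or line.startswith("#"):
            continue
        name_part, _, value_part = line.rpartition(" ")
        name = name_part.partition("{")[0]  # drop labels such as {model_name="..."}
        if name in KEY_METRICS:
            try:
                parsed[KEY_METRICS[name]] = float(value_part)
            except ValueError:
                continue
    return parsed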
+
+ # Slurm job configuration arguments
+ SLURM_JOB_CONFIG_ARGS = {
+     "job-name": "model_name",
+     "partition": "partition",
+     "account": "account",
+     "qos": "qos",
+     "time": "time",
+     "nodes": "num_nodes",
+     "gpus-per-node": "gpus_per_node",
+     "cpus-per-task": "cpus_per_task",
+     "mem": "mem_per_node",
+     "output": "out_file",
+     "error": "err_file",
+ }
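Each key in SLURM_JOB_CONFIG_ARGS is an sbatch long option and each value names the resolved launch parameter it is filled from. A sketch of the rendering step this implies (render_sbatch_directives and the --option=value form are assumptions, not the package's code):

SLURM_JOB_CONFIG_ARGS = {"job-name": "model_name", "nodes": "num_nodes"}  # abbreviated

def render_sbatch_directives(params: dict[str, str]) -> list[str]:
    # Emit one #SBATCH line per configured argument, skipping unset parameters.
    return [
        f"#SBATCH --{arg}={params[param]}"
        for arg, param in SLURM_JOB_CONFIG_ARGS.items()
        if params.get(param) is not None
    ]

print(render_sbatch_directives({"model_name": "Meta-Llama-3.1-8B", "num_nodes": "1"}))
# ['#SBATCH --job-name=Meta-Llama-3.1-8B', '#SBATCH --nodes=1']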
+
+ # vLLM engine args mapping between short and long names
+ VLLM_SHORT_TO_LONG_MAP = {
+     "-tp": "--tensor-parallel-size",
+     "-pp": "--pipeline-parallel-size",
+     "-O": "--compilation-config",
+ }
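The short-to-long map lets users pass vLLM's abbreviated flags (-tp, -pp, -O) while the launcher stores one canonical spelling. A sketch of the normalization it enables (normalize_engine_args is illustrative):

VLLM_SHORT_TO_LONG_MAP = {
    "-tp": "--tensor-parallel-size",
    "-pp": "--pipeline-parallel-size",
    "-O": "--compilation-config",
}

def normalize_engine_args(args: list[str]) -> list[str]:
    """Rewrite short flags to long form; both '-tp 4' and '-tp=4' styles pass through."""
    out = []
    for token in args:
        flag, sep, value = token.partition("=")
        out.append(VLLM_SHORT_TO_LONG_MAP.get(flag, flag) + sep + value)
    return out

print(normalize_engine_args(["-tp=4", "--max-model-len", "8192"]))
# ['--tensor-parallel-size=4', '--max-model-len', '8192']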
+
+
+ # Slurm script templates
+ class ShebangConfig(TypedDict):
+     """TypedDict for SLURM script shebang configuration.
+
+     Parameters
+     ----------
+     base : str
+         Base shebang line for all SLURM scripts
+     multinode : list[str]
+         Additional SLURM directives for multi-node configurations
+     """
+
+     base: str
+     multinode: list[str]
+
+
+ class ServerSetupConfig(TypedDict):
+     """TypedDict for server setup configuration.
+
+     Parameters
+     ----------
+     single_node : list[str]
+         Setup commands for single-node deployments
+     multinode : list[str]
+         Setup commands for multi-node deployments, including Ray initialization
+     """
+
+     single_node: list[str]
+     multinode: list[str]
+
+
+ class SlurmScriptTemplate(TypedDict):
+     """TypedDict for complete SLURM script template configuration.
+
+     Parameters
+     ----------
+     shebang : ShebangConfig
+         Shebang and SLURM directive configuration
+     singularity_setup : list[str]
+         Commands for Singularity container setup
+     imports : str
+         Import statements and source commands
+     singularity_command : str
+         Template for Singularity execution command
+     activate_venv : str
+         Template for virtual environment activation
+     server_setup : ServerSetupConfig
+         Server initialization commands for different deployment modes
+     find_vllm_port : list[str]
+         Commands to find available ports for vLLM server
+     write_to_json : list[str]
+         Commands to write server configuration to JSON
+     launch_cmd : list[str]
+         vLLM server launch commands
+     """
+
+     shebang: ShebangConfig
+     singularity_setup: list[str]
+     imports: str
+     singularity_command: str
+     activate_venv: str
+     server_setup: ServerSetupConfig
+     find_vllm_port: list[str]
+     write_to_json: list[str]
+     launch_cmd: list[str]
+
+
+ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
+     "shebang": {
+         "base": "#!/bin/bash",
+         "multinode": [
+             "#SBATCH --exclusive",
+             "#SBATCH --tasks-per-node=1",
+         ],
+     },
+     "singularity_setup": [
+         SINGULARITY_LOAD_CMD,
+         "singularity exec {singularity_image} ray stop",
+     ],
+     "imports": "source {src_dir}/find_port.sh",
+     "singularity_command": "singularity exec --nv --bind {model_weights_path}:{model_weights_path} --containall {singularity_image}",
+     "activate_venv": "source {venv}/bin/activate",
+     "server_setup": {
+         "single_node": [
+             "\n# Find available port",
+             "head_node_ip=${SLURMD_NODENAME}",
+         ],
+         "multinode": [
+             "\n# Get list of nodes",
+             'nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")',
+             "nodes_array=($nodes)",
+             "head_node=${nodes_array[0]}",
+             'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
+             "\n# Start Ray head node",
+             "head_node_port=$(find_available_port $head_node_ip 8080 65535)",
+             "ray_head=$head_node_ip:$head_node_port",
+             'echo "Ray Head IP: $ray_head"',
+             'echo "Starting HEAD at $head_node"',
+             'srun --nodes=1 --ntasks=1 -w "$head_node" \\',
+             " SINGULARITY_PLACEHOLDER \\",
+             ' ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \\',
+             ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &',
+             "sleep 10",
+             "\n# Start Ray worker nodes",
+             "worker_num=$((SLURM_JOB_NUM_NODES - 1))",
+             "for ((i = 1; i <= worker_num; i++)); do",
+             " node_i=${nodes_array[$i]}",
+             ' echo "Starting WORKER $i at $node_i"',
+             ' srun --nodes=1 --ntasks=1 -w "$node_i" \\',
+             " SINGULARITY_PLACEHOLDER \\",
+             ' ray start --address "$ray_head" \\',
+             ' --num-cpus "$SLURM_CPUS_PER_TASK" --num-gpus "$SLURM_GPUS_PER_NODE" --block &',
+             " sleep 5",
+             "done",
+         ],
+     },
+     "find_vllm_port": [
+         "\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)",
+         'server_address="http://${head_node_ip}:${vllm_port_number}/v1"',
+     ],
+     "write_to_json": [
+         '\njson_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"',
+         'jq --arg server_addr "$server_address" \\',
+         " '. + {{\"server_address\": $server_addr}}' \\",
+         ' "$json_path" > temp.json \\',
+         ' && mv temp.json "$json_path"',
+     ],
+     "launch_cmd": [
+         "python3.10 -m vllm.entrypoints.openai.api_server \\",
+         " --model {model_weights_path} \\",
+         " --served-model-name {model_name} \\",
+         ' --host "0.0.0.0" \\',
+         " --port $vllm_port_number \\",
+         " --trust-remote-code \\",
+     ],
+ }
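Taken together, the template is assembled fragment by fragment into a batch script: Python str.format fills the {placeholders}, while the bash $variables are left for SLURM to resolve at run time. A condensed sketch of that assembly for the single-node path (the ordering, build_single_node_script, and the example values are assumptions; only a few fragments are reproduced so the snippet runs standalone):

TEMPLATE = {  # trimmed copy of SLURM_SCRIPT_TEMPLATE above
    "shebang": {"base": "#!/bin/bash"},
    "imports": "source {src_dir}/find_port.sh",
    "server_setup": {"single_node": ["\n# Find available port", "head_node_ip=${SLURMD_NODENAME}"]},
    "find_vllm_port": [
        "\nvllm_port_number=$(find_available_port $head_node_ip 8080 65535)",
        'server_address="http://${head_node_ip}:${vllm_port_number}/v1"',
    ],
    "launch_cmd": [
        "python3.10 -m vllm.entrypoints.openai.api_server \\",
        " --model {model_weights_path} \\",
        " --served-model-name {model_name} \\",
        " --port $vllm_port_number",
    ],
}

def build_single_node_script(params: dict[str, str]) -> str:
    lines = [TEMPLATE["shebang"]["base"]]
    lines.append(TEMPLATE["imports"].format(**params))   # Python-side placeholder
    lines += TEMPLATE["server_setup"]["single_node"]     # bash variables, kept verbatim
    lines += TEMPLATE["find_vllm_port"]
    lines += [line.format(**params) for line in TEMPLATE["launch_cmd"]]
    return "\n".join(lines)

print(build_single_node_script({
    "src_dir": "/opt/vec-inf/vec_inf",                   # illustrative values
    "model_weights_path": "/model-weights/Meta-Llama-3.1-8B",
    "model_name": "Meta-Llama-3.1-8B",
}))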
@@ -0,0 +1,37 @@
+ """Exceptions for the vector inference package."""
+
+
+ class ModelConfigurationError(Exception):
+     """Raised when the model config or weights are missing or invalid."""
+
+     pass
+
+
+ class MissingRequiredFieldsError(ValueError):
+     """Raised when required fields are missing from the provided parameters."""
+
+     pass
+
+
+ class ModelNotFoundError(KeyError):
+     """Raised when the specified model name is not found in the configuration."""
+
+     pass
+
+
+ class SlurmJobError(RuntimeError):
+     """Raised when there's an error with a Slurm job."""
+
+     pass
+
+
+ class APIError(Exception):
+     """Base exception for API errors."""
+
+     pass
+
+
+ class ServerError(Exception):
+     """Exception raised when there's an error with the inference server."""
+
+     pass
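Note the design choice: each exception subclasses the closest built-in (ValueError, KeyError, RuntimeError), so callers with generic handlers keep working while new code can catch the precise type. A standalone sketch of that behavior (the class is re-declared from the hunk above; lookup is illustrative):

class ModelNotFoundError(KeyError):
    """Raised when the specified model name is not found in the configuration."""

def lookup(configs: dict, name: str) -> dict:
    try:
        return configs[name]
    except KeyError:
        # Re-raise as the package-specific type; still a KeyError to old callers.
        raise ModelNotFoundError(name) from None

try:
    lookup({}, "Meta-Llama-3.1-8B")
except KeyError as err:  # a plain KeyError handler catches the subclass too
    print(f"model not found: {err}")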