@draht/pods 2026.3.2-2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +511 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +346 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/models.d.ts +39 -0
- package/dist/commands/models.d.ts.map +1 -0
- package/dist/commands/models.js +658 -0
- package/dist/commands/models.js.map +1 -0
- package/dist/commands/pods.d.ts +21 -0
- package/dist/commands/pods.d.ts.map +1 -0
- package/dist/commands/pods.js +175 -0
- package/dist/commands/pods.js.map +1 -0
- package/dist/commands/prompt.d.ts +7 -0
- package/dist/commands/prompt.d.ts.map +1 -0
- package/dist/commands/prompt.js +54 -0
- package/dist/commands/prompt.js.map +1 -0
- package/dist/config.d.ts +11 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +74 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/model-configs.d.ts +22 -0
- package/dist/model-configs.d.ts.map +1 -0
- package/dist/model-configs.js +75 -0
- package/dist/model-configs.js.map +1 -0
- package/dist/models.json +295 -0
- package/dist/scripts/model_run.sh +83 -0
- package/dist/scripts/pod_setup.sh +336 -0
- package/dist/ssh.d.ts +24 -0
- package/dist/ssh.d.ts.map +1 -0
- package/dist/ssh.js +115 -0
- package/dist/ssh.js.map +1 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +40 -0
- package/scripts/model_run.sh +83 -0
- package/scripts/pod_setup.sh +336 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-configs.d.ts","sourceRoot":"","sources":["../src/model-configs.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,YAAY,CAAC;AA2BtC;;GAEG;AACH,eAAO,MAAM,cAAc;;;;QAwD1B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,YAAY,8BAExB,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,cAAc,gBAE1B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,YAAY,6BAExB,CAAC","sourcesContent":["import { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport type { GPU } from \"./types.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\ninterface ModelConfig {\n\tgpuCount: number;\n\tgpuTypes?: string[];\n\targs: string[];\n\tenv?: Record<string, string>;\n\tnotes?: string;\n}\n\ninterface ModelInfo {\n\tname: string;\n\tconfigs: ModelConfig[];\n\tnotes?: string;\n}\n\ninterface ModelsData {\n\tmodels: Record<string, ModelInfo>;\n}\n\n// Load models configuration - resolve relative to this file\nconst modelsJsonPath = join(__dirname, \"models.json\");\nconst modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, \"utf-8\"));\n\n/**\n * Get the best configuration for a model based on available GPUs\n */\nexport const getModelConfig = (\n\tmodelId: string,\n\tgpus: GPU[],\n\trequestedGpuCount: number,\n): { args: string[]; env?: Record<string, string>; notes?: string } | null => {\n\tconst modelInfo = modelsData.models[modelId];\n\tif (!modelInfo) {\n\t\t// Unknown model, no default config\n\t\treturn null;\n\t}\n\n\t// Extract GPU type from the first GPU name (e.g., \"NVIDIA H200\" -> \"H200\")\n\tconst gpuType = gpus[0]?.name?.replace(\"NVIDIA\", \"\")?.trim()?.split(\" \")[0] || \"\";\n\n\t// Find best matching config\n\tlet bestConfig: ModelConfig | null = null;\n\n\tfor (const config of modelInfo.configs) {\n\t\t// Check GPU count\n\t\tif (config.gpuCount !== requestedGpuCount) {\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Check GPU type if specified\n\t\tif (config.gpuTypes && 
config.gpuTypes.length > 0) {\n\t\t\tconst typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));\n\t\t\tif (!typeMatches) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t}\n\n\t\t// This config matches\n\t\tbestConfig = config;\n\t\tbreak;\n\t}\n\n\t// If no exact match, try to find a config with just the right GPU count\n\tif (!bestConfig) {\n\t\tfor (const config of modelInfo.configs) {\n\t\t\tif (config.gpuCount === requestedGpuCount) {\n\t\t\t\tbestConfig = config;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (!bestConfig) {\n\t\t// No suitable config found\n\t\treturn null;\n\t}\n\n\treturn {\n\t\targs: [...bestConfig.args],\n\t\tenv: bestConfig.env ? { ...bestConfig.env } : undefined,\n\t\tnotes: bestConfig.notes || modelInfo.notes,\n\t};\n};\n\n/**\n * Check if a model is known\n */\nexport const isKnownModel = (modelId: string): boolean => {\n\treturn modelId in modelsData.models;\n};\n\n/**\n * Get all known models\n */\nexport const getKnownModels = (): string[] => {\n\treturn Object.keys(modelsData.models);\n};\n\n/**\n * Get model display name\n */\nexport const getModelName = (modelId: string): string => {\n\treturn modelsData.models[modelId]?.name || modelId;\n};\n"]}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { readFileSync } from "fs";
|
|
2
|
+
import { dirname, join } from "path";
|
|
3
|
+
import { fileURLToPath } from "url";
|
|
4
|
+
// ESM modules have no __dirname builtin; derive this module's directory
// from import.meta.url so the bundled models.json can be located next to
// the compiled file regardless of the process working directory.
const moduleDir = dirname(fileURLToPath(import.meta.url));
// Parse the model catalog once at module load; every exported helper
// reads from this shared object.
const modelsData = JSON.parse(readFileSync(join(moduleDir, "models.json"), "utf-8"));
|
|
9
|
+
/**
 * Get the best configuration for a model based on available GPUs.
 *
 * Selection order:
 *   1. First config whose gpuCount matches AND whose gpuTypes (if any)
 *      match the GPU type detected from the first GPU's name.
 *   2. Otherwise the first config whose gpuCount matches, ignoring type.
 *
 * @param {string} modelId - HuggingFace-style model identifier.
 * @param {Array<{name?: string}>} gpus - Detected GPUs; only the first
 *   entry's name is used (e.g. "NVIDIA H200" -> "H200").
 * @param {number} requestedGpuCount - Number of GPUs requested.
 * @returns {{args: string[], env?: Object, notes?: string} | null} A copy
 *   of the matched config (args/env cloned so callers may mutate), or null
 *   when the model is unknown or no config fits.
 */
export const getModelConfig = (modelId, gpus, requestedGpuCount) => {
	// Guard with an own-property check: modelsData comes from JSON.parse,
	// so a bare bracket lookup would resolve inherited keys such as
	// "toString"/"constructor" to prototype members, which are truthy and
	// then crash below when .configs is iterated.
	const modelInfo = Object.prototype.hasOwnProperty.call(modelsData.models, modelId)
		? modelsData.models[modelId]
		: undefined;
	if (!modelInfo) {
		// Unknown model, no default config
		return null;
	}

	// Extract GPU type from the first GPU name (e.g., "NVIDIA H200" -> "H200")
	const gpuType = gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";

	// Configs that satisfy the requested GPU count, in declaration order.
	const countMatches = modelInfo.configs.filter((config) => config.gpuCount === requestedGpuCount);

	// Prefer a config whose GPU-type list also matches; an absent or empty
	// gpuTypes list matches any GPU. (When gpuType is "", type.includes("")
	// is true, so the first count match wins — same as the fallback below.)
	const typeMatch = countMatches.find(
		(config) =>
			!config.gpuTypes ||
			config.gpuTypes.length === 0 ||
			config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType)),
	);

	// If no exact match, fall back to the first config with the right GPU count.
	const bestConfig = typeMatch ?? countMatches[0];
	if (!bestConfig) {
		// No suitable config found
		return null;
	}

	return {
		args: [...bestConfig.args],
		env: bestConfig.env ? { ...bestConfig.env } : undefined,
		notes: bestConfig.notes || modelInfo.notes,
	};
};
|
|
57
|
+
/**
 * Check whether a model id has an entry in the bundled catalog.
 *
 * Uses an own-property check rather than the `in` operator: `in` walks the
 * prototype chain, so ids like "toString" or "hasOwnProperty" would
 * incorrectly report as known models.
 *
 * @param {string} modelId - HuggingFace-style model identifier.
 * @returns {boolean} True when the catalog contains modelId.
 */
export const isKnownModel = (modelId) => {
	return Object.prototype.hasOwnProperty.call(modelsData.models, modelId);
};
|
|
63
|
+
/**
 * List every model id present in the bundled catalog.
 *
 * @returns {string[]} Catalog keys, in catalog declaration order.
 */
export const getKnownModels = () => Object.keys(modelsData.models);
|
|
69
|
+
/**
 * Get the human-readable display name for a model id.
 *
 * Guards the lookup with an own-property check so inherited keys are not
 * resolved (e.g. "constructor" would otherwise hit the Object constructor
 * and return "Object" as its display name).
 *
 * @param {string} modelId - HuggingFace-style model identifier.
 * @returns {string} The catalog display name, or modelId itself when the
 *   model is unknown or has no name.
 */
export const getModelName = (modelId) => {
	if (!Object.prototype.hasOwnProperty.call(modelsData.models, modelId)) {
		return modelId;
	}
	return modelsData.models[modelId]?.name || modelId;
};
|
|
75
|
+
//# sourceMappingURL=model-configs.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-configs.js","sourceRoot":"","sources":["../src/model-configs.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAGpC,MAAM,UAAU,GAAG,aAAa,CAAC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAoBtC,4DAA4D;AAC5D,MAAM,cAAc,GAAG,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;AACtD,MAAM,UAAU,GAAe,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;AAEjF;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAC7B,OAAe,EACf,IAAW,EACX,iBAAyB,EACiD,EAAE,CAAC;IAC7E,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC7C,IAAI,CAAC,SAAS,EAAE,CAAC;QAChB,mCAAmC;QACnC,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2EAA2E;IAC3E,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElF,4BAA4B;IAC5B,IAAI,UAAU,GAAuB,IAAI,CAAC;IAE1C,KAAK,MAAM,MAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;QACxC,kBAAkB;QAClB,IAAI,MAAM,CAAC,QAAQ,KAAK,iBAAiB,EAAE,CAAC;YAC3C,SAAS;QACV,CAAC;QAED,8BAA8B;QAC9B,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnD,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;YACrG,IAAI,CAAC,WAAW,EAAE,CAAC;gBAClB,SAAS;YACV,CAAC;QACF,CAAC;QAED,sBAAsB;QACtB,UAAU,GAAG,MAAM,CAAC;QACpB,MAAM;IACP,CAAC;IAED,wEAAwE;IACxE,IAAI,CAAC,UAAU,EAAE,CAAC;QACjB,KAAK,MAAM,MAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YACxC,IAAI,MAAM,CAAC,QAAQ,KAAK,iBAAiB,EAAE,CAAC;gBAC3C,UAAU,GAAG,MAAM,CAAC;gBACpB,MAAM;YACP,CAAC;QACF,CAAC;IACF,CAAC;IAED,IAAI,CAAC,UAAU,EAAE,CAAC;QACjB,2BAA2B;QAC3B,OAAO,IAAI,CAAC;IACb,CAAC;IAED,OAAO;QACN,IAAI,EAAE,CAAC,GAAG,UAAU,CAAC,IAAI,CAAC;QAC1B,GAAG,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,SAAS;QACvD,KAAK,EAAE,UAAU,CAAC,KAAK,IAAI,SAAS,CAAC,KAAK;KAC1C,CAAC;AAAA,CACF,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,OAAe,E
AAW,EAAE,CAAC;IACzD,OAAO,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC;AAAA,CACpC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,GAAa,EAAE,CAAC;IAC7C,OAAO,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;AAAA,CACtC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,OAAe,EAAU,EAAE,CAAC;IACxD,OAAO,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,IAAI,IAAI,OAAO,CAAC;AAAA,CACnD,CAAC","sourcesContent":["import { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport type { GPU } from \"./types.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\ninterface ModelConfig {\n\tgpuCount: number;\n\tgpuTypes?: string[];\n\targs: string[];\n\tenv?: Record<string, string>;\n\tnotes?: string;\n}\n\ninterface ModelInfo {\n\tname: string;\n\tconfigs: ModelConfig[];\n\tnotes?: string;\n}\n\ninterface ModelsData {\n\tmodels: Record<string, ModelInfo>;\n}\n\n// Load models configuration - resolve relative to this file\nconst modelsJsonPath = join(__dirname, \"models.json\");\nconst modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, \"utf-8\"));\n\n/**\n * Get the best configuration for a model based on available GPUs\n */\nexport const getModelConfig = (\n\tmodelId: string,\n\tgpus: GPU[],\n\trequestedGpuCount: number,\n): { args: string[]; env?: Record<string, string>; notes?: string } | null => {\n\tconst modelInfo = modelsData.models[modelId];\n\tif (!modelInfo) {\n\t\t// Unknown model, no default config\n\t\treturn null;\n\t}\n\n\t// Extract GPU type from the first GPU name (e.g., \"NVIDIA H200\" -> \"H200\")\n\tconst gpuType = gpus[0]?.name?.replace(\"NVIDIA\", \"\")?.trim()?.split(\" \")[0] || \"\";\n\n\t// Find best matching config\n\tlet bestConfig: ModelConfig | null = null;\n\n\tfor (const config of modelInfo.configs) {\n\t\t// Check GPU count\n\t\tif (config.gpuCount !== requestedGpuCount) {\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Check GPU type if specified\n\t\tif (config.gpuTypes && 
config.gpuTypes.length > 0) {\n\t\t\tconst typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));\n\t\t\tif (!typeMatches) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t}\n\n\t\t// This config matches\n\t\tbestConfig = config;\n\t\tbreak;\n\t}\n\n\t// If no exact match, try to find a config with just the right GPU count\n\tif (!bestConfig) {\n\t\tfor (const config of modelInfo.configs) {\n\t\t\tif (config.gpuCount === requestedGpuCount) {\n\t\t\t\tbestConfig = config;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (!bestConfig) {\n\t\t// No suitable config found\n\t\treturn null;\n\t}\n\n\treturn {\n\t\targs: [...bestConfig.args],\n\t\tenv: bestConfig.env ? { ...bestConfig.env } : undefined,\n\t\tnotes: bestConfig.notes || modelInfo.notes,\n\t};\n};\n\n/**\n * Check if a model is known\n */\nexport const isKnownModel = (modelId: string): boolean => {\n\treturn modelId in modelsData.models;\n};\n\n/**\n * Get all known models\n */\nexport const getKnownModels = (): string[] => {\n\treturn Object.keys(modelsData.models);\n};\n\n/**\n * Get model display name\n */\nexport const getModelName = (modelId: string): string => {\n\treturn modelsData.models[modelId]?.name || modelId;\n};\n"]}
|
package/dist/models.json
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
{
|
|
2
|
+
"models": {
|
|
3
|
+
"Qwen/Qwen2.5-Coder-32B-Instruct": {
|
|
4
|
+
"name": "Qwen2.5-Coder-32B",
|
|
5
|
+
"configs": [
|
|
6
|
+
{
|
|
7
|
+
"gpuCount": 1,
|
|
8
|
+
"gpuTypes": ["H100", "H200"],
|
|
9
|
+
"args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"gpuCount": 2,
|
|
13
|
+
"gpuTypes": ["H100", "H200"],
|
|
14
|
+
"args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
|
|
15
|
+
}
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
"Qwen/Qwen3-Coder-30B-A3B-Instruct": {
|
|
19
|
+
"name": "Qwen3-Coder-30B",
|
|
20
|
+
"configs": [
|
|
21
|
+
{
|
|
22
|
+
"gpuCount": 1,
|
|
23
|
+
"gpuTypes": ["H100", "H200"],
|
|
24
|
+
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
|
25
|
+
"notes": "Fits comfortably on single GPU. ~60GB model weight."
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
"gpuCount": 2,
|
|
29
|
+
"gpuTypes": ["H100", "H200"],
|
|
30
|
+
"args": [
|
|
31
|
+
"--tensor-parallel-size",
|
|
32
|
+
"2",
|
|
33
|
+
"--enable-auto-tool-choice",
|
|
34
|
+
"--tool-call-parser",
|
|
35
|
+
"qwen3_coder"
|
|
36
|
+
],
|
|
37
|
+
"notes": "For higher throughput/longer context."
|
|
38
|
+
}
|
|
39
|
+
]
|
|
40
|
+
},
|
|
41
|
+
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
|
|
42
|
+
"name": "Qwen3-Coder-30B-FP8",
|
|
43
|
+
"configs": [
|
|
44
|
+
{
|
|
45
|
+
"gpuCount": 1,
|
|
46
|
+
"gpuTypes": ["H100", "H200"],
|
|
47
|
+
"args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
|
|
48
|
+
"env": {
|
|
49
|
+
"VLLM_USE_DEEP_GEMM": "1"
|
|
50
|
+
},
|
|
51
|
+
"notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
},
|
|
55
|
+
"Qwen/Qwen3-Coder-480B-A35B-Instruct": {
|
|
56
|
+
"name": "Qwen3-Coder-480B",
|
|
57
|
+
"configs": [
|
|
58
|
+
{
|
|
59
|
+
"gpuCount": 8,
|
|
60
|
+
"gpuTypes": ["H200", "H20"],
|
|
61
|
+
"args": [
|
|
62
|
+
"--tensor-parallel-size",
|
|
63
|
+
"8",
|
|
64
|
+
"--max-model-len",
|
|
65
|
+
"32000",
|
|
66
|
+
"--enable-auto-tool-choice",
|
|
67
|
+
"--tool-call-parser",
|
|
68
|
+
"qwen3_coder"
|
|
69
|
+
],
|
|
70
|
+
"notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
|
|
71
|
+
}
|
|
72
|
+
]
|
|
73
|
+
},
|
|
74
|
+
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
|
|
75
|
+
"name": "Qwen3-Coder-480B-FP8",
|
|
76
|
+
"configs": [
|
|
77
|
+
{
|
|
78
|
+
"gpuCount": 8,
|
|
79
|
+
"gpuTypes": ["H200", "H20"],
|
|
80
|
+
"args": [
|
|
81
|
+
"--max-model-len",
|
|
82
|
+
"131072",
|
|
83
|
+
"--enable-expert-parallel",
|
|
84
|
+
"--data-parallel-size",
|
|
85
|
+
"8",
|
|
86
|
+
"--enable-auto-tool-choice",
|
|
87
|
+
"--tool-call-parser",
|
|
88
|
+
"qwen3_coder"
|
|
89
|
+
],
|
|
90
|
+
"env": {
|
|
91
|
+
"VLLM_USE_DEEP_GEMM": "1"
|
|
92
|
+
},
|
|
93
|
+
"notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
|
|
94
|
+
}
|
|
95
|
+
]
|
|
96
|
+
},
|
|
97
|
+
"openai/gpt-oss-20b": {
|
|
98
|
+
"name": "GPT-OSS-20B",
|
|
99
|
+
"configs": [
|
|
100
|
+
{
|
|
101
|
+
"gpuCount": 1,
|
|
102
|
+
"gpuTypes": ["H100", "H200"],
|
|
103
|
+
"args": ["--async-scheduling"]
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"gpuCount": 1,
|
|
107
|
+
"gpuTypes": ["B200"],
|
|
108
|
+
"args": ["--async-scheduling"],
|
|
109
|
+
"env": {
|
|
110
|
+
"VLLM_USE_TRTLLM_ATTENTION": "1",
|
|
111
|
+
"VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
|
|
112
|
+
"VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
|
|
113
|
+
"VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
],
|
|
117
|
+
"notes": "Tools/function calls only via /v1/responses endpoint."
|
|
118
|
+
},
|
|
119
|
+
"openai/gpt-oss-120b": {
|
|
120
|
+
"name": "GPT-OSS-120B",
|
|
121
|
+
"configs": [
|
|
122
|
+
{
|
|
123
|
+
"gpuCount": 1,
|
|
124
|
+
"gpuTypes": ["H100", "H200"],
|
|
125
|
+
"args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
|
|
126
|
+
"notes": "Single GPU deployment. Tools/function calls only via /v1/responses endpoint."
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
|
+
"gpuCount": 2,
|
|
130
|
+
"gpuTypes": ["H100", "H200"],
|
|
131
|
+
"args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
|
|
132
|
+
"notes": "Recommended for H100/H200. Tools/function calls only via /v1/responses endpoint."
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
"gpuCount": 4,
|
|
136
|
+
"gpuTypes": ["H100", "H200"],
|
|
137
|
+
"args": ["--tensor-parallel-size", "4", "--async-scheduling"],
|
|
138
|
+
"notes": "Higher throughput. Tools/function calls only via /v1/responses endpoint."
|
|
139
|
+
},
|
|
140
|
+
{
|
|
141
|
+
"gpuCount": 8,
|
|
142
|
+
"gpuTypes": ["H100", "H200"],
|
|
143
|
+
"args": ["--tensor-parallel-size", "8", "--async-scheduling"],
|
|
144
|
+
"notes": "Maximum throughput for evaluation workloads. Tools/function calls only via /v1/responses endpoint."
|
|
145
|
+
}
|
|
146
|
+
]
|
|
147
|
+
},
|
|
148
|
+
"zai-org/GLM-4.5": {
|
|
149
|
+
"name": "GLM-4.5",
|
|
150
|
+
"configs": [
|
|
151
|
+
{
|
|
152
|
+
"gpuCount": 16,
|
|
153
|
+
"gpuTypes": ["H100"],
|
|
154
|
+
"args": [
|
|
155
|
+
"--tensor-parallel-size",
|
|
156
|
+
"16",
|
|
157
|
+
"--tool-call-parser",
|
|
158
|
+
"glm45",
|
|
159
|
+
"--reasoning-parser",
|
|
160
|
+
"glm45",
|
|
161
|
+
"--enable-auto-tool-choice"
|
|
162
|
+
]
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
"gpuCount": 8,
|
|
166
|
+
"gpuTypes": ["H200"],
|
|
167
|
+
"args": [
|
|
168
|
+
"--tensor-parallel-size",
|
|
169
|
+
"8",
|
|
170
|
+
"--tool-call-parser",
|
|
171
|
+
"glm45",
|
|
172
|
+
"--reasoning-parser",
|
|
173
|
+
"glm45",
|
|
174
|
+
"--enable-auto-tool-choice"
|
|
175
|
+
]
|
|
176
|
+
}
|
|
177
|
+
],
|
|
178
|
+
"notes": "Models default to thinking mode. For full 128K context, double the GPU count."
|
|
179
|
+
},
|
|
180
|
+
"zai-org/GLM-4.5-FP8": {
|
|
181
|
+
"name": "GLM-4.5-FP8",
|
|
182
|
+
"configs": [
|
|
183
|
+
{
|
|
184
|
+
"gpuCount": 8,
|
|
185
|
+
"gpuTypes": ["H100"],
|
|
186
|
+
"args": [
|
|
187
|
+
"--tensor-parallel-size",
|
|
188
|
+
"8",
|
|
189
|
+
"--tool-call-parser",
|
|
190
|
+
"glm45",
|
|
191
|
+
"--reasoning-parser",
|
|
192
|
+
"glm45",
|
|
193
|
+
"--enable-auto-tool-choice"
|
|
194
|
+
]
|
|
195
|
+
},
|
|
196
|
+
{
|
|
197
|
+
"gpuCount": 4,
|
|
198
|
+
"gpuTypes": ["H200"],
|
|
199
|
+
"args": [
|
|
200
|
+
"--tensor-parallel-size",
|
|
201
|
+
"4",
|
|
202
|
+
"--tool-call-parser",
|
|
203
|
+
"glm45",
|
|
204
|
+
"--reasoning-parser",
|
|
205
|
+
"glm45",
|
|
206
|
+
"--enable-auto-tool-choice"
|
|
207
|
+
]
|
|
208
|
+
}
|
|
209
|
+
]
|
|
210
|
+
},
|
|
211
|
+
"zai-org/GLM-4.5-Air-FP8": {
|
|
212
|
+
"name": "GLM-4.5-Air-FP8",
|
|
213
|
+
"configs": [
|
|
214
|
+
{
|
|
215
|
+
"gpuCount": 2,
|
|
216
|
+
"gpuTypes": ["H100"],
|
|
217
|
+
"args": [
|
|
218
|
+
"--tensor-parallel-size",
|
|
219
|
+
"2",
|
|
220
|
+
"--tool-call-parser",
|
|
221
|
+
"glm45",
|
|
222
|
+
"--reasoning-parser",
|
|
223
|
+
"glm45",
|
|
224
|
+
"--enable-auto-tool-choice"
|
|
225
|
+
],
|
|
226
|
+
"env": {
|
|
227
|
+
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
|
228
|
+
},
|
|
229
|
+
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"gpuCount": 1,
|
|
233
|
+
"gpuTypes": ["H200"],
|
|
234
|
+
"args": ["--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice"],
|
|
235
|
+
"env": {
|
|
236
|
+
"VLLM_ATTENTION_BACKEND": "XFORMERS"
|
|
237
|
+
},
|
|
238
|
+
"notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
|
|
239
|
+
}
|
|
240
|
+
]
|
|
241
|
+
},
|
|
242
|
+
"zai-org/GLM-4.5-Air": {
|
|
243
|
+
"name": "GLM-4.5-Air",
|
|
244
|
+
"configs": [
|
|
245
|
+
{
|
|
246
|
+
"gpuCount": 2,
|
|
247
|
+
"gpuTypes": ["H100", "H200"],
|
|
248
|
+
"args": [
|
|
249
|
+
"--tensor-parallel-size",
|
|
250
|
+
"2",
|
|
251
|
+
"--tool-call-parser",
|
|
252
|
+
"glm45",
|
|
253
|
+
"--reasoning-parser",
|
|
254
|
+
"glm45",
|
|
255
|
+
"--enable-auto-tool-choice"
|
|
256
|
+
],
|
|
257
|
+
"notes": "Non-quantized BF16 version, more compatible"
|
|
258
|
+
},
|
|
259
|
+
{
|
|
260
|
+
"gpuCount": 1,
|
|
261
|
+
"gpuTypes": ["H200"],
|
|
262
|
+
"args": [
|
|
263
|
+
"--tool-call-parser",
|
|
264
|
+
"glm45",
|
|
265
|
+
"--reasoning-parser",
|
|
266
|
+
"glm45",
|
|
267
|
+
"--enable-auto-tool-choice",
|
|
268
|
+
"--gpu-memory-utilization",
|
|
269
|
+
"0.95"
|
|
270
|
+
],
|
|
271
|
+
"notes": "Single H200 can fit the BF16 model with high memory utilization"
|
|
272
|
+
}
|
|
273
|
+
]
|
|
274
|
+
},
|
|
275
|
+
"moonshotai/Kimi-K2-Instruct": {
|
|
276
|
+
"name": "Kimi-K2",
|
|
277
|
+
"configs": [
|
|
278
|
+
{
|
|
279
|
+
"gpuCount": 16,
|
|
280
|
+
"gpuTypes": ["H200", "H20"],
|
|
281
|
+
"args": [
|
|
282
|
+
"--tensor-parallel-size",
|
|
283
|
+
"16",
|
|
284
|
+
"--trust-remote-code",
|
|
285
|
+
"--enable-auto-tool-choice",
|
|
286
|
+
"--tool-call-parser",
|
|
287
|
+
"kimi_k2"
|
|
288
|
+
],
|
|
289
|
+
"notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
|
|
290
|
+
}
|
|
291
|
+
],
|
|
292
|
+
"notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Model runner script - runs sequentially, killed by pi stop
#
# Downloads the target model from HuggingFace, then serves it with vLLM
# until the pod is stopped or vLLM exits on its own.
set -euo pipefail

# These values are replaced before upload by pi CLI
MODEL_ID="{{MODEL_ID}}"
NAME="{{NAME}}"
PORT="{{PORT}}"
VLLM_ARGS="{{VLLM_ARGS}}"

# Trap to ensure cleanup on exit and kill any child processes
cleanup() {
	local exit_code=$?
	echo "Model runner exiting with code $exit_code"
	# Kill any child processes (e.g. a still-running vLLM server)
	pkill -P $$ 2>/dev/null || true
	exit $exit_code
}
trap cleanup EXIT TERM INT

# Force colored output even when not a TTY
export FORCE_COLOR=1
export PYTHONUNBUFFERED=1
export TERM=xterm-256color
export RICH_FORCE_TERMINAL=1
export CLICOLOR_FORCE=1

# Source virtual environment (provides the vllm and hf CLIs)
source /root/venv/bin/activate

echo "========================================="
echo "Model Run: $NAME"
echo "Model ID: $MODEL_ID"
echo "Port: $PORT"
if [ -n "$VLLM_ARGS" ]; then
	echo "vLLM Args: $VLLM_ARGS"
fi
echo "========================================="
echo ""

# Download model (with color progress bars).
# NOTE: under `set -e`, a bare command followed by `if [ $? -ne 0 ]` never
# reaches the check (the script aborts first), so test the command directly.
echo "Downloading model (will skip if cached)..."
if ! HF_HUB_ENABLE_HF_TRANSFER=1 hf download "$MODEL_ID"; then
	echo "❌ ERROR: Failed to download model" >&2
	exit 1
fi

echo ""
echo "✅ Model download complete"
echo ""

# Build vLLM command (PI_API_KEY must be exported by the pod environment;
# with `set -u` an unset key aborts here with a clear message)
VLLM_CMD="vllm serve '$MODEL_ID' --port $PORT --api-key '$PI_API_KEY'"
if [ -n "$VLLM_ARGS" ]; then
	VLLM_CMD="$VLLM_CMD $VLLM_ARGS"
fi

echo "Starting vLLM server..."
# Never echo the raw command: it embeds the PI_API_KEY secret, which would
# end up in pod logs. Print a redacted copy instead.
echo "Command: ${VLLM_CMD//$PI_API_KEY/<redacted>}"
echo "========================================="
echo ""

# Run vLLM in background so we can monitor it
echo "Starting vLLM process..."
bash -c "$VLLM_CMD" &
VLLM_PID=$!

# Monitor the vLLM process. Capture the child's status with `|| ...` so a
# non-zero exit does not trip errexit before we can report it below.
echo "Monitoring vLLM process (PID: $VLLM_PID)..."
VLLM_EXIT_CODE=0
wait "$VLLM_PID" || VLLM_EXIT_CODE=$?

if [ "$VLLM_EXIT_CODE" -ne 0 ]; then
	echo "❌ ERROR: vLLM exited with code $VLLM_EXIT_CODE" >&2
	# Make sure to exit the script command too
	kill -TERM $$ 2>/dev/null || true
	exit "$VLLM_EXIT_CODE"
fi

echo "✅ vLLM exited normally"
exit 0
|