@draht/pods 2026.3.2-2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +511 -0
  2. package/dist/cli.d.ts +3 -0
  3. package/dist/cli.d.ts.map +1 -0
  4. package/dist/cli.js +346 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/commands/models.d.ts +39 -0
  7. package/dist/commands/models.d.ts.map +1 -0
  8. package/dist/commands/models.js +658 -0
  9. package/dist/commands/models.js.map +1 -0
  10. package/dist/commands/pods.d.ts +21 -0
  11. package/dist/commands/pods.d.ts.map +1 -0
  12. package/dist/commands/pods.js +175 -0
  13. package/dist/commands/pods.js.map +1 -0
  14. package/dist/commands/prompt.d.ts +7 -0
  15. package/dist/commands/prompt.d.ts.map +1 -0
  16. package/dist/commands/prompt.js +54 -0
  17. package/dist/commands/prompt.js.map +1 -0
  18. package/dist/config.d.ts +11 -0
  19. package/dist/config.d.ts.map +1 -0
  20. package/dist/config.js +74 -0
  21. package/dist/config.js.map +1 -0
  22. package/dist/index.d.ts +2 -0
  23. package/dist/index.d.ts.map +1 -0
  24. package/dist/index.js +3 -0
  25. package/dist/index.js.map +1 -0
  26. package/dist/model-configs.d.ts +22 -0
  27. package/dist/model-configs.d.ts.map +1 -0
  28. package/dist/model-configs.js +75 -0
  29. package/dist/model-configs.js.map +1 -0
  30. package/dist/models.json +295 -0
  31. package/dist/scripts/model_run.sh +83 -0
  32. package/dist/scripts/pod_setup.sh +336 -0
  33. package/dist/ssh.d.ts +24 -0
  34. package/dist/ssh.d.ts.map +1 -0
  35. package/dist/ssh.js +115 -0
  36. package/dist/ssh.js.map +1 -0
  37. package/dist/types.d.ts +23 -0
  38. package/dist/types.d.ts.map +1 -0
  39. package/dist/types.js +3 -0
  40. package/dist/types.js.map +1 -0
  41. package/package.json +40 -0
  42. package/scripts/model_run.sh +83 -0
  43. package/scripts/pod_setup.sh +336 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"model-configs.d.ts","sourceRoot":"","sources":["../src/model-configs.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,GAAG,EAAE,MAAM,YAAY,CAAC;AA2BtC;;GAEG;AACH,eAAO,MAAM,cAAc;;;;QAwD1B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,YAAY,8BAExB,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,cAAc,gBAE1B,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,YAAY,6BAExB,CAAC","sourcesContent":["import { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport type { GPU } from \"./types.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\ninterface ModelConfig {\n\tgpuCount: number;\n\tgpuTypes?: string[];\n\targs: string[];\n\tenv?: Record<string, string>;\n\tnotes?: string;\n}\n\ninterface ModelInfo {\n\tname: string;\n\tconfigs: ModelConfig[];\n\tnotes?: string;\n}\n\ninterface ModelsData {\n\tmodels: Record<string, ModelInfo>;\n}\n\n// Load models configuration - resolve relative to this file\nconst modelsJsonPath = join(__dirname, \"models.json\");\nconst modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, \"utf-8\"));\n\n/**\n * Get the best configuration for a model based on available GPUs\n */\nexport const getModelConfig = (\n\tmodelId: string,\n\tgpus: GPU[],\n\trequestedGpuCount: number,\n): { args: string[]; env?: Record<string, string>; notes?: string } | null => {\n\tconst modelInfo = modelsData.models[modelId];\n\tif (!modelInfo) {\n\t\t// Unknown model, no default config\n\t\treturn null;\n\t}\n\n\t// Extract GPU type from the first GPU name (e.g., \"NVIDIA H200\" -> \"H200\")\n\tconst gpuType = gpus[0]?.name?.replace(\"NVIDIA\", \"\")?.trim()?.split(\" \")[0] || \"\";\n\n\t// Find best matching config\n\tlet bestConfig: ModelConfig | null = null;\n\n\tfor (const config of modelInfo.configs) {\n\t\t// Check GPU count\n\t\tif (config.gpuCount !== requestedGpuCount) {\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Check GPU type if specified\n\t\tif (config.gpuTypes && 
config.gpuTypes.length > 0) {\n\t\t\tconst typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));\n\t\t\tif (!typeMatches) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t}\n\n\t\t// This config matches\n\t\tbestConfig = config;\n\t\tbreak;\n\t}\n\n\t// If no exact match, try to find a config with just the right GPU count\n\tif (!bestConfig) {\n\t\tfor (const config of modelInfo.configs) {\n\t\t\tif (config.gpuCount === requestedGpuCount) {\n\t\t\t\tbestConfig = config;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (!bestConfig) {\n\t\t// No suitable config found\n\t\treturn null;\n\t}\n\n\treturn {\n\t\targs: [...bestConfig.args],\n\t\tenv: bestConfig.env ? { ...bestConfig.env } : undefined,\n\t\tnotes: bestConfig.notes || modelInfo.notes,\n\t};\n};\n\n/**\n * Check if a model is known\n */\nexport const isKnownModel = (modelId: string): boolean => {\n\treturn modelId in modelsData.models;\n};\n\n/**\n * Get all known models\n */\nexport const getKnownModels = (): string[] => {\n\treturn Object.keys(modelsData.models);\n};\n\n/**\n * Get model display name\n */\nexport const getModelName = (modelId: string): string => {\n\treturn modelsData.models[modelId]?.name || modelId;\n};\n"]}
@@ -0,0 +1,75 @@
1
+ import { readFileSync } from "fs";
2
+ import { dirname, join } from "path";
3
+ import { fileURLToPath } from "url";
4
+ const __filename = fileURLToPath(import.meta.url);
5
+ const __dirname = dirname(__filename);
6
+ // Load models configuration - resolve relative to this file
7
+ const modelsJsonPath = join(__dirname, "models.json");
8
+ const modelsData = JSON.parse(readFileSync(modelsJsonPath, "utf-8"));
9
+ /**
10
+ * Get the best configuration for a model based on available GPUs
11
+ */
12
+ export const getModelConfig = (modelId, gpus, requestedGpuCount) => {
13
+ const modelInfo = modelsData.models[modelId];
14
+ if (!modelInfo) {
15
+ // Unknown model, no default config
16
+ return null;
17
+ }
18
+ // Extract GPU type from the first GPU name (e.g., "NVIDIA H200" -> "H200")
19
+ const gpuType = gpus[0]?.name?.replace("NVIDIA", "")?.trim()?.split(" ")[0] || "";
20
+ // Find best matching config
21
+ let bestConfig = null;
22
+ for (const config of modelInfo.configs) {
23
+ // Check GPU count
24
+ if (config.gpuCount !== requestedGpuCount) {
25
+ continue;
26
+ }
27
+ // Check GPU type if specified
28
+ if (config.gpuTypes && config.gpuTypes.length > 0) {
29
+ const typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));
30
+ if (!typeMatches) {
31
+ continue;
32
+ }
33
+ }
34
+ // This config matches
35
+ bestConfig = config;
36
+ break;
37
+ }
38
+ // If no exact match, try to find a config with just the right GPU count
39
+ if (!bestConfig) {
40
+ for (const config of modelInfo.configs) {
41
+ if (config.gpuCount === requestedGpuCount) {
42
+ bestConfig = config;
43
+ break;
44
+ }
45
+ }
46
+ }
47
+ if (!bestConfig) {
48
+ // No suitable config found
49
+ return null;
50
+ }
51
+ return {
52
+ args: [...bestConfig.args],
53
+ env: bestConfig.env ? { ...bestConfig.env } : undefined,
54
+ notes: bestConfig.notes || modelInfo.notes,
55
+ };
56
+ };
57
+ /**
58
+ * Check if a model is known
59
+ */
60
+ export const isKnownModel = (modelId) => {
61
+ return modelId in modelsData.models;
62
+ };
63
+ /**
64
+ * Get all known models
65
+ */
66
+ export const getKnownModels = () => {
67
+ return Object.keys(modelsData.models);
68
+ };
69
+ /**
70
+ * Get model display name
71
+ */
72
+ export const getModelName = (modelId) => {
73
+ return modelsData.models[modelId]?.name || modelId;
74
+ };
75
+ //# sourceMappingURL=model-configs.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"model-configs.js","sourceRoot":"","sources":["../src/model-configs.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,IAAI,CAAC;AAClC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AACrC,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAGpC,MAAM,UAAU,GAAG,aAAa,CAAC,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAoBtC,4DAA4D;AAC5D,MAAM,cAAc,GAAG,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;AACtD,MAAM,UAAU,GAAe,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;AAEjF;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,CAC7B,OAAe,EACf,IAAW,EACX,iBAAyB,EACiD,EAAE,CAAC;IAC7E,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;IAC7C,IAAI,CAAC,SAAS,EAAE,CAAC;QAChB,mCAAmC;QACnC,OAAO,IAAI,CAAC;IACb,CAAC;IAED,2EAA2E;IAC3E,MAAM,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IAElF,4BAA4B;IAC5B,IAAI,UAAU,GAAuB,IAAI,CAAC;IAE1C,KAAK,MAAM,MAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;QACxC,kBAAkB;QAClB,IAAI,MAAM,CAAC,QAAQ,KAAK,iBAAiB,EAAE,CAAC;YAC3C,SAAS;QACV,CAAC;QAED,8BAA8B;QAC9B,IAAI,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnD,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;YACrG,IAAI,CAAC,WAAW,EAAE,CAAC;gBAClB,SAAS;YACV,CAAC;QACF,CAAC;QAED,sBAAsB;QACtB,UAAU,GAAG,MAAM,CAAC;QACpB,MAAM;IACP,CAAC;IAED,wEAAwE;IACxE,IAAI,CAAC,UAAU,EAAE,CAAC;QACjB,KAAK,MAAM,MAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YACxC,IAAI,MAAM,CAAC,QAAQ,KAAK,iBAAiB,EAAE,CAAC;gBAC3C,UAAU,GAAG,MAAM,CAAC;gBACpB,MAAM;YACP,CAAC;QACF,CAAC;IACF,CAAC;IAED,IAAI,CAAC,UAAU,EAAE,CAAC;QACjB,2BAA2B;QAC3B,OAAO,IAAI,CAAC;IACb,CAAC;IAED,OAAO;QACN,IAAI,EAAE,CAAC,GAAG,UAAU,CAAC,IAAI,CAAC;QAC1B,GAAG,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,UAAU,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,SAAS;QACvD,KAAK,EAAE,UAAU,CAAC,KAAK,IAAI,SAAS,CAAC,KAAK;KAC1C,CAAC;AAAA,CACF,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,OAAe
,EAAW,EAAE,CAAC;IACzD,OAAO,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC;AAAA,CACpC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG,GAAa,EAAE,CAAC;IAC7C,OAAO,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;AAAA,CACtC,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,OAAe,EAAU,EAAE,CAAC;IACxD,OAAO,UAAU,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,IAAI,IAAI,OAAO,CAAC;AAAA,CACnD,CAAC","sourcesContent":["import { readFileSync } from \"fs\";\nimport { dirname, join } from \"path\";\nimport { fileURLToPath } from \"url\";\nimport type { GPU } from \"./types.js\";\n\nconst __filename = fileURLToPath(import.meta.url);\nconst __dirname = dirname(__filename);\n\ninterface ModelConfig {\n\tgpuCount: number;\n\tgpuTypes?: string[];\n\targs: string[];\n\tenv?: Record<string, string>;\n\tnotes?: string;\n}\n\ninterface ModelInfo {\n\tname: string;\n\tconfigs: ModelConfig[];\n\tnotes?: string;\n}\n\ninterface ModelsData {\n\tmodels: Record<string, ModelInfo>;\n}\n\n// Load models configuration - resolve relative to this file\nconst modelsJsonPath = join(__dirname, \"models.json\");\nconst modelsData: ModelsData = JSON.parse(readFileSync(modelsJsonPath, \"utf-8\"));\n\n/**\n * Get the best configuration for a model based on available GPUs\n */\nexport const getModelConfig = (\n\tmodelId: string,\n\tgpus: GPU[],\n\trequestedGpuCount: number,\n): { args: string[]; env?: Record<string, string>; notes?: string } | null => {\n\tconst modelInfo = modelsData.models[modelId];\n\tif (!modelInfo) {\n\t\t// Unknown model, no default config\n\t\treturn null;\n\t}\n\n\t// Extract GPU type from the first GPU name (e.g., \"NVIDIA H200\" -> \"H200\")\n\tconst gpuType = gpus[0]?.name?.replace(\"NVIDIA\", \"\")?.trim()?.split(\" \")[0] || \"\";\n\n\t// Find best matching config\n\tlet bestConfig: ModelConfig | null = null;\n\n\tfor (const config of modelInfo.configs) {\n\t\t// Check GPU count\n\t\tif (config.gpuCount !== requestedGpuCount) {\n\t\t\tcontinue;\n\t\t}\n\n\t\t// Check GPU type if specified\n\t\tif (config.gpuTypes && 
config.gpuTypes.length > 0) {\n\t\t\tconst typeMatches = config.gpuTypes.some((type) => gpuType.includes(type) || type.includes(gpuType));\n\t\t\tif (!typeMatches) {\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t}\n\n\t\t// This config matches\n\t\tbestConfig = config;\n\t\tbreak;\n\t}\n\n\t// If no exact match, try to find a config with just the right GPU count\n\tif (!bestConfig) {\n\t\tfor (const config of modelInfo.configs) {\n\t\t\tif (config.gpuCount === requestedGpuCount) {\n\t\t\t\tbestConfig = config;\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t}\n\n\tif (!bestConfig) {\n\t\t// No suitable config found\n\t\treturn null;\n\t}\n\n\treturn {\n\t\targs: [...bestConfig.args],\n\t\tenv: bestConfig.env ? { ...bestConfig.env } : undefined,\n\t\tnotes: bestConfig.notes || modelInfo.notes,\n\t};\n};\n\n/**\n * Check if a model is known\n */\nexport const isKnownModel = (modelId: string): boolean => {\n\treturn modelId in modelsData.models;\n};\n\n/**\n * Get all known models\n */\nexport const getKnownModels = (): string[] => {\n\treturn Object.keys(modelsData.models);\n};\n\n/**\n * Get model display name\n */\nexport const getModelName = (modelId: string): string => {\n\treturn modelsData.models[modelId]?.name || modelId;\n};\n"]}
@@ -0,0 +1,295 @@
1
+ {
2
+ "models": {
3
+ "Qwen/Qwen2.5-Coder-32B-Instruct": {
4
+ "name": "Qwen2.5-Coder-32B",
5
+ "configs": [
6
+ {
7
+ "gpuCount": 1,
8
+ "gpuTypes": ["H100", "H200"],
9
+ "args": ["--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
10
+ },
11
+ {
12
+ "gpuCount": 2,
13
+ "gpuTypes": ["H100", "H200"],
14
+ "args": ["--tensor-parallel-size", "2", "--tool-call-parser", "hermes", "--enable-auto-tool-choice"]
15
+ }
16
+ ]
17
+ },
18
+ "Qwen/Qwen3-Coder-30B-A3B-Instruct": {
19
+ "name": "Qwen3-Coder-30B",
20
+ "configs": [
21
+ {
22
+ "gpuCount": 1,
23
+ "gpuTypes": ["H100", "H200"],
24
+ "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
25
+ "notes": "Fits comfortably on single GPU. ~60GB model weight."
26
+ },
27
+ {
28
+ "gpuCount": 2,
29
+ "gpuTypes": ["H100", "H200"],
30
+ "args": [
31
+ "--tensor-parallel-size",
32
+ "2",
33
+ "--enable-auto-tool-choice",
34
+ "--tool-call-parser",
35
+ "qwen3_coder"
36
+ ],
37
+ "notes": "For higher throughput/longer context."
38
+ }
39
+ ]
40
+ },
41
+ "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8": {
42
+ "name": "Qwen3-Coder-30B-FP8",
43
+ "configs": [
44
+ {
45
+ "gpuCount": 1,
46
+ "gpuTypes": ["H100", "H200"],
47
+ "args": ["--enable-auto-tool-choice", "--tool-call-parser", "qwen3_coder"],
48
+ "env": {
49
+ "VLLM_USE_DEEP_GEMM": "1"
50
+ },
51
+ "notes": "FP8 quantized, ~30GB model weight. Excellent for single GPU deployment."
52
+ }
53
+ ]
54
+ },
55
+ "Qwen/Qwen3-Coder-480B-A35B-Instruct": {
56
+ "name": "Qwen3-Coder-480B",
57
+ "configs": [
58
+ {
59
+ "gpuCount": 8,
60
+ "gpuTypes": ["H200", "H20"],
61
+ "args": [
62
+ "--tensor-parallel-size",
63
+ "8",
64
+ "--max-model-len",
65
+ "32000",
66
+ "--enable-auto-tool-choice",
67
+ "--tool-call-parser",
68
+ "qwen3_coder"
69
+ ],
70
+ "notes": "Cannot serve full 262K context on single node. Reduce max-model-len or increase gpu-memory-utilization."
71
+ }
72
+ ]
73
+ },
74
+ "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
75
+ "name": "Qwen3-Coder-480B-FP8",
76
+ "configs": [
77
+ {
78
+ "gpuCount": 8,
79
+ "gpuTypes": ["H200", "H20"],
80
+ "args": [
81
+ "--max-model-len",
82
+ "131072",
83
+ "--enable-expert-parallel",
84
+ "--data-parallel-size",
85
+ "8",
86
+ "--enable-auto-tool-choice",
87
+ "--tool-call-parser",
88
+ "qwen3_coder"
89
+ ],
90
+ "env": {
91
+ "VLLM_USE_DEEP_GEMM": "1"
92
+ },
93
+ "notes": "Use data-parallel mode (not tensor-parallel) to avoid weight quantization errors."
94
+ }
95
+ ]
96
+ },
97
+ "openai/gpt-oss-20b": {
98
+ "name": "GPT-OSS-20B",
99
+ "configs": [
100
+ {
101
+ "gpuCount": 1,
102
+ "gpuTypes": ["H100", "H200"],
103
+ "args": ["--async-scheduling"]
104
+ },
105
+ {
106
+ "gpuCount": 1,
107
+ "gpuTypes": ["B200"],
108
+ "args": ["--async-scheduling"],
109
+ "env": {
110
+ "VLLM_USE_TRTLLM_ATTENTION": "1",
111
+ "VLLM_USE_TRTLLM_DECODE_ATTENTION": "1",
112
+ "VLLM_USE_TRTLLM_CONTEXT_ATTENTION": "1",
113
+ "VLLM_USE_FLASHINFER_MXFP4_MOE": "1"
114
+ }
115
+ }
116
+ ],
117
+ "notes": "Tools/function calls only via /v1/responses endpoint."
118
+ },
119
+ "openai/gpt-oss-120b": {
120
+ "name": "GPT-OSS-120B",
121
+ "configs": [
122
+ {
123
+ "gpuCount": 1,
124
+ "gpuTypes": ["H100", "H200"],
125
+ "args": ["--async-scheduling", "--gpu-memory-utilization", "0.95", "--max-num-batched-tokens", "1024"],
126
+ "notes": "Single GPU deployment. Tools/function calls only via /v1/responses endpoint."
127
+ },
128
+ {
129
+ "gpuCount": 2,
130
+ "gpuTypes": ["H100", "H200"],
131
+ "args": ["--tensor-parallel-size", "2", "--async-scheduling", "--gpu-memory-utilization", "0.94"],
132
+ "notes": "Recommended for H100/H200. Tools/function calls only via /v1/responses endpoint."
133
+ },
134
+ {
135
+ "gpuCount": 4,
136
+ "gpuTypes": ["H100", "H200"],
137
+ "args": ["--tensor-parallel-size", "4", "--async-scheduling"],
138
+ "notes": "Higher throughput. Tools/function calls only via /v1/responses endpoint."
139
+ },
140
+ {
141
+ "gpuCount": 8,
142
+ "gpuTypes": ["H100", "H200"],
143
+ "args": ["--tensor-parallel-size", "8", "--async-scheduling"],
144
+ "notes": "Maximum throughput for evaluation workloads. Tools/function calls only via /v1/responses endpoint."
145
+ }
146
+ ]
147
+ },
148
+ "zai-org/GLM-4.5": {
149
+ "name": "GLM-4.5",
150
+ "configs": [
151
+ {
152
+ "gpuCount": 16,
153
+ "gpuTypes": ["H100"],
154
+ "args": [
155
+ "--tensor-parallel-size",
156
+ "16",
157
+ "--tool-call-parser",
158
+ "glm45",
159
+ "--reasoning-parser",
160
+ "glm45",
161
+ "--enable-auto-tool-choice"
162
+ ]
163
+ },
164
+ {
165
+ "gpuCount": 8,
166
+ "gpuTypes": ["H200"],
167
+ "args": [
168
+ "--tensor-parallel-size",
169
+ "8",
170
+ "--tool-call-parser",
171
+ "glm45",
172
+ "--reasoning-parser",
173
+ "glm45",
174
+ "--enable-auto-tool-choice"
175
+ ]
176
+ }
177
+ ],
178
+ "notes": "Models default to thinking mode. For full 128K context, double the GPU count."
179
+ },
180
+ "zai-org/GLM-4.5-FP8": {
181
+ "name": "GLM-4.5-FP8",
182
+ "configs": [
183
+ {
184
+ "gpuCount": 8,
185
+ "gpuTypes": ["H100"],
186
+ "args": [
187
+ "--tensor-parallel-size",
188
+ "8",
189
+ "--tool-call-parser",
190
+ "glm45",
191
+ "--reasoning-parser",
192
+ "glm45",
193
+ "--enable-auto-tool-choice"
194
+ ]
195
+ },
196
+ {
197
+ "gpuCount": 4,
198
+ "gpuTypes": ["H200"],
199
+ "args": [
200
+ "--tensor-parallel-size",
201
+ "4",
202
+ "--tool-call-parser",
203
+ "glm45",
204
+ "--reasoning-parser",
205
+ "glm45",
206
+ "--enable-auto-tool-choice"
207
+ ]
208
+ }
209
+ ]
210
+ },
211
+ "zai-org/GLM-4.5-Air-FP8": {
212
+ "name": "GLM-4.5-Air-FP8",
213
+ "configs": [
214
+ {
215
+ "gpuCount": 2,
216
+ "gpuTypes": ["H100"],
217
+ "args": [
218
+ "--tensor-parallel-size",
219
+ "2",
220
+ "--tool-call-parser",
221
+ "glm45",
222
+ "--reasoning-parser",
223
+ "glm45",
224
+ "--enable-auto-tool-choice"
225
+ ],
226
+ "env": {
227
+ "VLLM_ATTENTION_BACKEND": "XFORMERS"
228
+ },
229
+ "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
230
+ },
231
+ {
232
+ "gpuCount": 1,
233
+ "gpuTypes": ["H200"],
234
+ "args": ["--tool-call-parser", "glm45", "--reasoning-parser", "glm45", "--enable-auto-tool-choice"],
235
+ "env": {
236
+ "VLLM_ATTENTION_BACKEND": "XFORMERS"
237
+ },
238
+ "notes": "FP8 model requires vLLM with proper FP8 support or MTP module"
239
+ }
240
+ ]
241
+ },
242
+ "zai-org/GLM-4.5-Air": {
243
+ "name": "GLM-4.5-Air",
244
+ "configs": [
245
+ {
246
+ "gpuCount": 2,
247
+ "gpuTypes": ["H100", "H200"],
248
+ "args": [
249
+ "--tensor-parallel-size",
250
+ "2",
251
+ "--tool-call-parser",
252
+ "glm45",
253
+ "--reasoning-parser",
254
+ "glm45",
255
+ "--enable-auto-tool-choice"
256
+ ],
257
+ "notes": "Non-quantized BF16 version, more compatible"
258
+ },
259
+ {
260
+ "gpuCount": 1,
261
+ "gpuTypes": ["H200"],
262
+ "args": [
263
+ "--tool-call-parser",
264
+ "glm45",
265
+ "--reasoning-parser",
266
+ "glm45",
267
+ "--enable-auto-tool-choice",
268
+ "--gpu-memory-utilization",
269
+ "0.95"
270
+ ],
271
+ "notes": "Single H200 can fit the BF16 model with high memory utilization"
272
+ }
273
+ ]
274
+ },
275
+ "moonshotai/Kimi-K2-Instruct": {
276
+ "name": "Kimi-K2",
277
+ "configs": [
278
+ {
279
+ "gpuCount": 16,
280
+ "gpuTypes": ["H200", "H20"],
281
+ "args": [
282
+ "--tensor-parallel-size",
283
+ "16",
284
+ "--trust-remote-code",
285
+ "--enable-auto-tool-choice",
286
+ "--tool-call-parser",
287
+ "kimi_k2"
288
+ ],
289
+ "notes": "Pure TP mode. For >16 GPUs, combine with pipeline-parallelism."
290
+ }
291
+ ],
292
+ "notes": "Requires vLLM v0.10.0rc1+. Minimum 16 GPUs for FP8 with 128k context."
293
+ }
294
+ }
295
+ }
@@ -0,0 +1,83 @@
1
+ #!/usr/bin/env bash
2
+ # Model runner script - runs sequentially, killed by pi stop
3
+ set -euo pipefail
4
+
5
+ # These values are replaced before upload by pi CLI
6
+ MODEL_ID="{{MODEL_ID}}"
7
+ NAME="{{NAME}}"
8
+ PORT="{{PORT}}"
9
+ VLLM_ARGS="{{VLLM_ARGS}}"
10
+
11
+ # Trap to ensure cleanup on exit and kill any child processes
12
+ cleanup() {
13
+ local exit_code=$?
14
+ echo "Model runner exiting with code $exit_code"
15
+ # Kill any child processes
16
+ pkill -P $$ 2>/dev/null || true
17
+ exit $exit_code
18
+ }
19
+ trap cleanup EXIT TERM INT
20
+
21
+ # Force colored output even when not a TTY
22
+ export FORCE_COLOR=1
23
+ export PYTHONUNBUFFERED=1
24
+ export TERM=xterm-256color
25
+ export RICH_FORCE_TERMINAL=1
26
+ export CLICOLOR_FORCE=1
27
+
28
+ # Source virtual environment
29
+ source /root/venv/bin/activate
30
+
31
+ echo "========================================="
32
+ echo "Model Run: $NAME"
33
+ echo "Model ID: $MODEL_ID"
34
+ echo "Port: $PORT"
35
+ if [ -n "$VLLM_ARGS" ]; then
36
+ echo "vLLM Args: $VLLM_ARGS"
37
+ fi
38
+ echo "========================================="
39
+ echo ""
40
+
41
+ # Download model (with color progress bars)
42
+ echo "Downloading model (will skip if cached)..."
43
+ HF_HUB_ENABLE_HF_TRANSFER=1 hf download "$MODEL_ID"
44
+
45
+ if [ $? -ne 0 ]; then
46
+ echo "❌ ERROR: Failed to download model" >&2
47
+ exit 1
48
+ fi
49
+
50
+ echo ""
51
+ echo "✅ Model download complete"
52
+ echo ""
53
+
54
+ # Build vLLM command
55
+ VLLM_CMD="vllm serve '$MODEL_ID' --port $PORT --api-key '$PI_API_KEY'"
56
+ if [ -n "$VLLM_ARGS" ]; then
57
+ VLLM_CMD="$VLLM_CMD $VLLM_ARGS"
58
+ fi
59
+
60
+ echo "Starting vLLM server..."
61
+ echo "Command: $VLLM_CMD"
62
+ echo "========================================="
63
+ echo ""
64
+
65
+ # Run vLLM in background so we can monitor it
66
+ echo "Starting vLLM process..."
67
+ bash -c "$VLLM_CMD" &
68
+ VLLM_PID=$!
69
+
70
+ # Monitor the vLLM process
71
+ echo "Monitoring vLLM process (PID: $VLLM_PID)..."
72
+ wait $VLLM_PID
73
+ VLLM_EXIT_CODE=$?
74
+
75
+ if [ $VLLM_EXIT_CODE -ne 0 ]; then
76
+ echo "❌ ERROR: vLLM exited with code $VLLM_EXIT_CODE" >&2
77
+ # Make sure to exit the script command too
78
+ kill -TERM $$ 2>/dev/null || true
79
+ exit $VLLM_EXIT_CODE
80
+ fi
81
+
82
+ echo "✅ vLLM exited normally"
83
+ exit 0