@aws/ml-container-creator 0.13.3 → 0.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/servers/instance-sizer/lib/model-resolver.js +127 -185
- package/servers/instance-sizer/lib/vram-estimator.js +86 -0
- package/servers/lib/catalogs/instances.json +0 -27
- package/src/lib/bootstrap-command-handler.js +2 -2
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/prompt-runner.js +14 -31
- package/templates/IAM_PERMISSIONS.md +64 -13
- package/templates/do/.tune_helper.py +5 -2
- package/templates/do/README.md +50 -604
- package/templates/do/adapter +1 -4
- package/templates/do/build +2 -5
- package/templates/do/clean.d/async-inference.ejs +2 -5
- package/templates/do/clean.d/batch-transform.ejs +2 -5
- package/templates/do/clean.d/hyperpod-eks.ejs +2 -5
- package/templates/do/clean.d/managed-inference.ejs +2 -5
- package/templates/do/deploy.d/async-inference.ejs +6 -9
- package/templates/do/deploy.d/batch-transform.ejs +4 -7
- package/templates/do/deploy.d/hyperpod-eks.ejs +1 -4
- package/templates/do/deploy.d/managed-inference.ejs +15 -6
- package/templates/do/lib/profile.sh +19 -15
- package/templates/do/push +2 -5
- package/templates/do/register +2 -5
- package/templates/do/stage +36 -33
- package/templates/do/submit +1 -4
- package/templates/do/tune +1 -4
- package/templates/MIGRATION.md +0 -488
- package/templates/TEMPLATE_SYSTEM.md +0 -243
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aws/ml-container-creator",
|
|
3
|
-
"version": "0.13.3",
|
|
3
|
+
"version": "0.13.4",
|
|
4
4
|
"description": "Build and deploy custom ML containers on AWS SageMaker with minimal configuration.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -119,7 +119,7 @@
|
|
|
119
119
|
"@aws-sdk/client-sagemaker": "^3.700.0",
|
|
120
120
|
"@aws-sdk/client-service-quotas": "^3.700.0",
|
|
121
121
|
"@microsoft/eslint-formatter-sarif": "^3.1.0",
|
|
122
|
-
"eslint": "^8.57.0",
|
|
122
|
+
"eslint": "^8.57.1",
|
|
123
123
|
"eslint-plugin-property-test-rules": "file:eslint-rules",
|
|
124
124
|
"fast-check": "^4.5.2",
|
|
125
125
|
"husky": "^9.1.7",
|
|
@@ -2,137 +2,124 @@
|
|
|
2
2
|
// SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
|
-
* Model Metadata Resolver
|
|
5
|
+
* Model Metadata Resolver for the instance-sizer MCP server.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
7
|
+
* Resolution pipeline:
|
|
8
|
+
* 1. Load model-sizes catalog (pre-built, offline-safe)
|
|
9
|
+
* 2. Attempt catalog lookup via glob pattern matching
|
|
10
|
+
* 3. If discover=true and no catalog hit, fetch config.json from HuggingFace Hub
|
|
11
|
+
* 4. Extract/estimate parameter count, dtype, architecture, context length
|
|
12
|
+
*
|
|
13
|
+
* Exports:
|
|
14
|
+
* - resolveModelMetadata(modelName, options) — full resolution pipeline
|
|
15
|
+
* - globMatch(pattern, string) — glob-style pattern matching (case-insensitive)
|
|
16
|
+
* - loadCatalog(path) — loads the model-sizes JSON catalog
|
|
17
|
+
* - catalogLookup(modelName, catalog) — finds a model in the catalog
|
|
18
|
+
* - estimateParamsFromConfig(config) — estimates params from architecture dimensions
|
|
19
|
+
* - extractFromHuggingFaceConfig(config) — extracts metadata from HF config.json
|
|
11
20
|
*/
|
|
12
21
|
|
|
13
|
-
import {
|
|
22
|
+
import { readFileSync } from 'node:fs';
|
|
23
|
+
import { resolve, dirname } from 'node:path';
|
|
14
24
|
import { fileURLToPath } from 'node:url';
|
|
15
|
-
import { dirname, join } from 'node:path';
|
|
16
|
-
|
|
17
|
-
// ── Constants ────────────────────────────────────────────────────────────────
|
|
18
25
|
|
|
19
26
|
const __filename = fileURLToPath(import.meta.url);
|
|
20
27
|
const __dirname = dirname(__filename);
|
|
21
28
|
|
|
22
|
-
const DEFAULT_CATALOG_PATH =
|
|
23
|
-
const HUGGINGFACE_BASE_URL = 'https://huggingface.co';
|
|
24
|
-
const HUGGINGFACE_TIMEOUT_MS = 5000;
|
|
29
|
+
const DEFAULT_CATALOG_PATH = resolve(__dirname, '../../lib/catalogs/model-sizes.json');
|
|
25
30
|
|
|
26
|
-
|
|
31
|
+
/**
|
|
32
|
+
* Known protocol prefixes that indicate non-HuggingFace model sources.
|
|
33
|
+
* Models with these prefixes are never fetched from HuggingFace Hub.
|
|
34
|
+
*/
|
|
35
|
+
export const PROTOCOL_PREFIXES = [
|
|
36
|
+
's3://',
|
|
37
|
+
'registry://',
|
|
38
|
+
'marketplace://',
|
|
39
|
+
'jumpstart://',
|
|
40
|
+
'jumpstart-hub://'
|
|
41
|
+
];
|
|
27
42
|
|
|
28
43
|
/**
|
|
29
|
-
*
|
|
30
|
-
*
|
|
44
|
+
* Determine if a model name matches the HuggingFace pattern: org/model-name.
|
|
45
|
+
* Must contain exactly one `/` and must NOT start with a protocol prefix.
|
|
31
46
|
*
|
|
32
|
-
* @param {string}
|
|
33
|
-
* @
|
|
34
|
-
* @returns {boolean} Whether the text matches the pattern
|
|
47
|
+
* @param {string} modelName - The model identifier to test
|
|
48
|
+
* @returns {boolean} Whether the model name is a HuggingFace model ID
|
|
35
49
|
*/
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
}
|
|
50
|
+
export function isHuggingFacePattern(modelName) {
|
|
51
|
+
if (!modelName || typeof modelName !== 'string') {
|
|
52
|
+
return false;
|
|
53
|
+
}
|
|
54
|
+
if (PROTOCOL_PREFIXES.some(prefix => modelName.startsWith(prefix))) {
|
|
55
|
+
return false;
|
|
56
|
+
}
|
|
57
|
+
const slashCount = (modelName.match(/\//g) || []).length;
|
|
58
|
+
return slashCount === 1;
|
|
59
|
+
}
|
|
43
60
|
|
|
44
|
-
|
|
61
|
+
/**
|
|
62
|
+
* Glob-style pattern matching (case-insensitive).
|
|
63
|
+
* Supports `*` as a wildcard that matches any sequence of characters.
|
|
64
|
+
*
|
|
65
|
+
* @param {string} pattern - Pattern with optional `*` wildcards
|
|
66
|
+
* @param {string} string - String to test against the pattern
|
|
67
|
+
* @returns {boolean} Whether the string matches the pattern
|
|
68
|
+
*/
|
|
69
|
+
export function globMatch(pattern, string) {
|
|
70
|
+
// Escape regex special characters except `*`
|
|
71
|
+
const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&');
|
|
72
|
+
// Replace `*` with `.*` for regex matching
|
|
73
|
+
const regex = new RegExp(`^${escaped.replace(/\*/g, '.*')}$`, 'i');
|
|
74
|
+
return regex.test(string);
|
|
75
|
+
}
|
|
45
76
|
|
|
46
77
|
/**
|
|
47
|
-
* Load the model-sizes catalog from
|
|
78
|
+
* Load the model-sizes catalog from a JSON file.
|
|
48
79
|
*
|
|
49
|
-
* @param {string} [catalogPath] -
|
|
50
|
-
* @returns {Promise<
|
|
80
|
+
* @param {string} [catalogPath] - Absolute path to the catalog JSON file
|
|
81
|
+
* @returns {Promise<Object|null>} Parsed catalog object, or null if file not found/invalid
|
|
51
82
|
*/
|
|
52
|
-
|
|
83
|
+
export async function loadCatalog(catalogPath = DEFAULT_CATALOG_PATH) {
|
|
53
84
|
try {
|
|
54
|
-
const raw =
|
|
85
|
+
const raw = readFileSync(catalogPath, 'utf8');
|
|
55
86
|
return JSON.parse(raw);
|
|
56
87
|
} catch {
|
|
57
88
|
return null;
|
|
58
89
|
}
|
|
59
|
-
}
|
|
90
|
+
}
|
|
60
91
|
|
|
61
92
|
/**
|
|
62
|
-
* Look up a model in the catalog
|
|
93
|
+
* Look up a model in the catalog using glob pattern matching.
|
|
94
|
+
* Iterates over catalog keys (which may contain `*` wildcards) and
|
|
95
|
+
* returns the first matching entry.
|
|
63
96
|
*
|
|
64
|
-
* @param {string} modelName - HuggingFace model ID
|
|
65
|
-
* @param {
|
|
66
|
-
* @returns {
|
|
97
|
+
* @param {string} modelName - HuggingFace model ID (e.g., "meta-llama/Llama-3.1-8B-Instruct")
|
|
98
|
+
* @param {Object|null} catalog - Loaded catalog object with a `models` field
|
|
99
|
+
* @returns {Object|null} Matching catalog entry, or null
|
|
67
100
|
*/
|
|
68
|
-
|
|
69
|
-
if (!catalog) {
|
|
101
|
+
export function catalogLookup(modelName, catalog) {
|
|
102
|
+
if (!catalog || !catalog.models) {
|
|
70
103
|
return null;
|
|
71
104
|
}
|
|
72
105
|
|
|
73
|
-
|
|
74
|
-
const models = catalog.models || catalog;
|
|
75
|
-
|
|
76
|
-
// Try exact match first
|
|
77
|
-
if (models[modelName]) {
|
|
78
|
-
return models[modelName];
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
// Try glob pattern matching
|
|
82
|
-
for (const pattern of Object.keys(models)) {
|
|
106
|
+
for (const [pattern, entry] of Object.entries(catalog.models)) {
|
|
83
107
|
if (globMatch(pattern, modelName)) {
|
|
84
|
-
return
|
|
108
|
+
return entry;
|
|
85
109
|
}
|
|
86
110
|
}
|
|
87
111
|
|
|
88
112
|
return null;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// ── HuggingFace API ──────────────────────────────────────────────────────────
|
|
113
|
+
}
|
|
92
114
|
|
|
93
115
|
/**
|
|
94
|
-
*
|
|
116
|
+
* Estimate parameter count from model architecture dimensions.
|
|
117
|
+
* Uses the approximation: hidden_size × num_hidden_layers × 12.
|
|
95
118
|
*
|
|
96
|
-
* @param {
|
|
97
|
-
* @returns {
|
|
119
|
+
* @param {Object} config - Model configuration (HuggingFace config.json format)
|
|
120
|
+
* @returns {number|null} Estimated parameter count, or null if dimensions are missing
|
|
98
121
|
*/
|
|
99
|
-
|
|
100
|
-
const url = `${HUGGINGFACE_BASE_URL}/${modelName}/resolve/main/config.json`;
|
|
101
|
-
|
|
102
|
-
try {
|
|
103
|
-
const controller = new AbortController();
|
|
104
|
-
const timeout = setTimeout(() => controller.abort(), HUGGINGFACE_TIMEOUT_MS);
|
|
105
|
-
|
|
106
|
-
const response = await fetch(url, {
|
|
107
|
-
signal: controller.signal,
|
|
108
|
-
headers: { 'Accept': 'application/json' }
|
|
109
|
-
});
|
|
110
|
-
|
|
111
|
-
clearTimeout(timeout);
|
|
112
|
-
|
|
113
|
-
if (!response.ok) {
|
|
114
|
-
return null;
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
return await response.json();
|
|
118
|
-
} catch {
|
|
119
|
-
return null;
|
|
120
|
-
}
|
|
121
|
-
};
|
|
122
|
-
|
|
123
|
-
/**
|
|
124
|
-
* Estimate parameter count from architecture dimensions.
|
|
125
|
-
* Uses the approximation: hidden_size × num_hidden_layers × 12
|
|
126
|
-
*
|
|
127
|
-
* This accounts for:
|
|
128
|
-
* - Attention weights (Q, K, V, O projections = 4 × hidden_size²)
|
|
129
|
-
* - FFN weights (typically 8 × hidden_size²)
|
|
130
|
-
* - Embeddings and other components
|
|
131
|
-
*
|
|
132
|
-
* @param {object} config - HuggingFace config.json contents
|
|
133
|
-
* @returns {number|null} Estimated parameter count or null if dimensions unavailable
|
|
134
|
-
*/
|
|
135
|
-
const estimateParamsFromConfig = (config) => {
|
|
122
|
+
export function estimateParamsFromConfig(config) {
|
|
136
123
|
const hiddenSize = config.hidden_size;
|
|
137
124
|
const numLayers = config.num_hidden_layers;
|
|
138
125
|
|
|
@@ -141,20 +128,18 @@ const estimateParamsFromConfig = (config) => {
|
|
|
141
128
|
}
|
|
142
129
|
|
|
143
130
|
return hiddenSize * numLayers * 12;
|
|
144
|
-
}
|
|
131
|
+
}
|
|
145
132
|
|
|
146
133
|
/**
|
|
147
|
-
* Extract model metadata from a HuggingFace config.json.
|
|
134
|
+
* Extract model metadata from a HuggingFace config.json object.
|
|
148
135
|
*
|
|
149
|
-
* @param {
|
|
150
|
-
* @returns {
|
|
136
|
+
* @param {Object} config - Parsed config.json from HuggingFace Hub
|
|
137
|
+
* @returns {Object} Extracted metadata with parameterCount, dtype, architecture, maxPositionEmbeddings, source
|
|
151
138
|
*/
|
|
152
|
-
|
|
153
|
-
const parameterCount = config.num_parameters
|
|
154
|
-
?? estimateParamsFromConfig(config);
|
|
155
|
-
|
|
139
|
+
export function extractFromHuggingFaceConfig(config) {
|
|
140
|
+
const parameterCount = config.num_parameters || estimateParamsFromConfig(config);
|
|
156
141
|
const dtype = config.torch_dtype || 'float16';
|
|
157
|
-
const architecture = config.architectures
|
|
142
|
+
const architecture = (config.architectures && config.architectures[0]) || 'unknown';
|
|
158
143
|
const maxPositionEmbeddings = config.max_position_embeddings || 4096;
|
|
159
144
|
|
|
160
145
|
return {
|
|
@@ -164,106 +149,63 @@ const extractFromHuggingFaceConfig = (config) => {
|
|
|
164
149
|
maxPositionEmbeddings,
|
|
165
150
|
source: 'huggingface_api'
|
|
166
151
|
};
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
// ── In-memory cache for discover mode ────────────────────────────────────────
|
|
170
|
-
|
|
171
|
-
const discoverCache = new Map();
|
|
172
|
-
|
|
173
|
-
// ── Protocol prefix detection ────────────────────────────────────────────────
|
|
174
|
-
|
|
175
|
-
const PROTOCOL_PREFIXES = ['jumpstart://', 'jumpstart-hub://', 's3://', 'registry://'];
|
|
152
|
+
}
|
|
176
153
|
|
|
177
154
|
/**
|
|
178
|
-
*
|
|
179
|
-
*
|
|
155
|
+
* Resolve model metadata through the full pipeline:
|
|
156
|
+
* 1. Catalog lookup (offline, fast)
|
|
157
|
+
* 2. HuggingFace Hub fetch (if discover=true and no catalog hit)
|
|
180
158
|
*
|
|
181
|
-
* @param {string} modelName -
|
|
182
|
-
* @
|
|
159
|
+
* @param {string} modelName - HuggingFace model ID
|
|
160
|
+
* @param {Object} [options] - Resolution options
|
|
161
|
+
* @param {string} [options.catalogPath] - Path to model-sizes catalog
|
|
162
|
+
* @param {boolean} [options.discover=true] - Whether to fetch from HuggingFace Hub on cache miss
|
|
163
|
+
* @param {number} [options.timeout=5000] - HTTP timeout for HuggingFace API (ms)
|
|
164
|
+
* @returns {Promise<Object|null>} Resolved metadata, or null if unresolvable
|
|
183
165
|
*/
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
// ── Main Resolver ────────────────────────────────────────────────────────────
|
|
194
|
-
|
|
195
|
-
/**
|
|
196
|
-
* Resolve model metadata from available sources.
|
|
197
|
-
*
|
|
198
|
-
* Three-tier resolution:
|
|
199
|
-
* 1. Check model-sizes catalog (exact match or pattern match)
|
|
200
|
-
* 2. If discover mode enabled AND model matches HuggingFace pattern, fetch config.json
|
|
201
|
-
* 3. If neither available, return null
|
|
202
|
-
*
|
|
203
|
-
* @param {string} modelName - HuggingFace model ID or catalog key
|
|
204
|
-
* @param {object} [options={}]
|
|
205
|
-
* @param {boolean} [options.discover=false] - Enable HuggingFace API lookups
|
|
206
|
-
* @param {string} [options.catalogPath] - Path to model-sizes catalog (for testing)
|
|
207
|
-
* @returns {Promise<{ parameterCount: number, dtype: string, architecture: string, maxPositionEmbeddings: number, source: string } | null>}
|
|
208
|
-
*/
|
|
209
|
-
const resolveModelMetadata = async (modelName, options = {}) => {
|
|
210
|
-
const { discover = true, catalogPath } = options;
|
|
211
|
-
|
|
212
|
-
// Tier 1: Catalog lookup
|
|
166
|
+
export async function resolveModelMetadata(modelName, options = {}) {
|
|
167
|
+
const {
|
|
168
|
+
catalogPath = DEFAULT_CATALOG_PATH,
|
|
169
|
+
discover = true,
|
|
170
|
+
timeout = 5000
|
|
171
|
+
} = options;
|
|
172
|
+
|
|
173
|
+
// Step 1: Try catalog lookup
|
|
213
174
|
const catalog = await loadCatalog(catalogPath);
|
|
214
175
|
const catalogEntry = catalogLookup(modelName, catalog);
|
|
215
176
|
|
|
216
177
|
if (catalogEntry) {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
|
|
225
|
-
source: 'catalog'
|
|
226
|
-
};
|
|
227
|
-
}
|
|
178
|
+
return {
|
|
179
|
+
parameterCount: catalogEntry.parameterCount,
|
|
180
|
+
dtype: catalogEntry.defaultDtype || 'float16',
|
|
181
|
+
architecture: catalogEntry.architecture || 'unknown',
|
|
182
|
+
maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings || 4096,
|
|
183
|
+
source: 'catalog'
|
|
184
|
+
};
|
|
228
185
|
}
|
|
229
186
|
|
|
230
|
-
//
|
|
187
|
+
// Step 2: If discover mode, try HuggingFace Hub
|
|
231
188
|
if (discover && isHuggingFacePattern(modelName)) {
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
}
|
|
189
|
+
try {
|
|
190
|
+
const controller = new AbortController();
|
|
191
|
+
const timer = setTimeout(() => controller.abort(), timeout);
|
|
236
192
|
|
|
237
|
-
|
|
193
|
+
const url = `https://huggingface.co/${modelName}/resolve/main/config.json`;
|
|
194
|
+
const response = await fetch(url, {
|
|
195
|
+
signal: controller.signal,
|
|
196
|
+
headers: { 'User-Agent': 'ml-container-creator/instance-sizer' }
|
|
197
|
+
});
|
|
238
198
|
|
|
239
|
-
|
|
240
|
-
const metadata = extractFromHuggingFaceConfig(config);
|
|
199
|
+
clearTimeout(timer);
|
|
241
200
|
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
discoverCache.set(modelName, metadata);
|
|
246
|
-
return metadata;
|
|
201
|
+
if (response.ok) {
|
|
202
|
+
const config = await response.json();
|
|
203
|
+
return extractFromHuggingFaceConfig(config);
|
|
247
204
|
}
|
|
205
|
+
} catch {
|
|
206
|
+
// Network error or timeout — fall through to null
|
|
248
207
|
}
|
|
249
208
|
}
|
|
250
209
|
|
|
251
|
-
// Tier 3: No metadata available
|
|
252
210
|
return null;
|
|
253
|
-
}
|
|
254
|
-
|
|
255
|
-
export {
|
|
256
|
-
resolveModelMetadata,
|
|
257
|
-
globMatch,
|
|
258
|
-
loadCatalog,
|
|
259
|
-
catalogLookup,
|
|
260
|
-
fetchHuggingFaceConfig,
|
|
261
|
-
estimateParamsFromConfig,
|
|
262
|
-
extractFromHuggingFaceConfig,
|
|
263
|
-
isHuggingFacePattern,
|
|
264
|
-
discoverCache,
|
|
265
|
-
PROTOCOL_PREFIXES,
|
|
266
|
-
DEFAULT_CATALOG_PATH,
|
|
267
|
-
HUGGINGFACE_BASE_URL,
|
|
268
|
-
HUGGINGFACE_TIMEOUT_MS
|
|
269
|
-
};
|
|
211
|
+
}
|
|
@@ -31,6 +31,17 @@ const BYTES_IN_GB = 1024 ** 3;
|
|
|
31
31
|
const DEFAULT_MAX_SEQUENCE_LENGTH = 4096;
|
|
32
32
|
const DEFAULT_BATCH_SIZE = 1;
|
|
33
33
|
const OVERHEAD_FACTOR = 0.1;
|
|
34
|
+
const MIN_USEFUL_CONTEXT_LEN = 2048;
|
|
35
|
+
|
|
36
|
+
const KV_CACHE_DTYPE_BYTES = {
|
|
37
|
+
'auto': 2, // fp16/bf16
|
|
38
|
+
'fp16': 2,
|
|
39
|
+
'bfloat16': 2,
|
|
40
|
+
'fp8': 1,
|
|
41
|
+
'fp8_e5m2': 1,
|
|
42
|
+
'fp8_e4m3': 1,
|
|
43
|
+
'int8': 1
|
|
44
|
+
};
|
|
34
45
|
|
|
35
46
|
// ── Helper Functions ─────────────────────────────────────────────────────────
|
|
36
47
|
|
|
@@ -132,6 +143,78 @@ const estimateVram = (modelInfo) => {
|
|
|
132
143
|
};
|
|
133
144
|
};
|
|
134
145
|
|
|
146
|
+
// ── Max Model Length Computation ─────────────────────────────────────────────
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Compute the maximum model length (context window) that fits in available KV cache memory.
|
|
150
|
+
*
|
|
151
|
+
* KV cache per token = 2 × num_layers × num_kv_heads × head_dim × dtype_bytes
|
|
152
|
+
* Available KV memory = (gpu_memory × gpu_count × utilization) - model_weight_memory - overhead
|
|
153
|
+
* max_model_len = available_kv_memory / kv_per_token
|
|
154
|
+
*
|
|
155
|
+
* For GQA models (like Llama 3.2), num_kv_heads is the GQA head count, not the
|
|
156
|
+
* full attention head count.
|
|
157
|
+
*
|
|
158
|
+
* @param {object} params
|
|
159
|
+
* @param {number} params.modelWeightGb - Model weight memory in GB
|
|
160
|
+
* @param {number} params.totalGpuMemoryGb - Total GPU memory for the instance
|
|
161
|
+
* @param {number} params.gpuCount - Number of GPUs
|
|
162
|
+
* @param {number} [params.gpuMemoryUtilization=0.9] - vLLM gpu_memory_utilization
|
|
163
|
+
* @param {number} params.numLayers - Number of transformer layers
|
|
164
|
+
* @param {number} params.numKvHeads - Number of KV attention heads (after GQA)
|
|
165
|
+
* @param {number} params.headDim - Dimension per head
|
|
166
|
+
* @param {string} [params.kvCacheDtype='auto'] - KV cache dtype
|
|
167
|
+
* @param {number} [params.overheadFactor=0.15] - Fraction for activations/CUDA overhead (conservative)
|
|
168
|
+
* @returns {{ maxModelLen: number, availableForKvGb: number } | null}
|
|
169
|
+
*/
|
|
170
|
+
const computeMaxModelLen = (params) => {
|
|
171
|
+
const {
|
|
172
|
+
modelWeightGb,
|
|
173
|
+
totalGpuMemoryGb,
|
|
174
|
+
gpuCount = 1,
|
|
175
|
+
gpuMemoryUtilization = 0.9,
|
|
176
|
+
numLayers,
|
|
177
|
+
numKvHeads,
|
|
178
|
+
headDim,
|
|
179
|
+
kvCacheDtype = 'auto',
|
|
180
|
+
overheadFactor = 0.15
|
|
181
|
+
} = params;
|
|
182
|
+
|
|
183
|
+
// Require architecture params — graceful degradation if missing
|
|
184
|
+
if (!numLayers || !numKvHeads || !headDim) {
|
|
185
|
+
return null;
|
|
186
|
+
}
|
|
187
|
+
if (!totalGpuMemoryGb || !modelWeightGb) {
|
|
188
|
+
return null;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Compute available memory for KV cache
|
|
192
|
+
// Conservative: gpu_memory_utilization × (1 - overhead) gives usable memory
|
|
193
|
+
const usableMemoryGb = totalGpuMemoryGb * gpuCount * gpuMemoryUtilization * (1 - overheadFactor);
|
|
194
|
+
const availableForKvGb = usableMemoryGb - modelWeightGb;
|
|
195
|
+
|
|
196
|
+
if (availableForKvGb <= 0) {
|
|
197
|
+
return { maxModelLen: 0, availableForKvGb: 0 };
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// KV cache per token (bytes)
|
|
201
|
+
// Formula: 2 (K+V) × num_layers × num_kv_heads × head_dim × dtype_bytes
|
|
202
|
+
// For TP: heads are distributed but so is memory — net effect is the same per-GPU
|
|
203
|
+
const dtypeBytes = KV_CACHE_DTYPE_BYTES[kvCacheDtype] || 2;
|
|
204
|
+
const kvPerTokenBytes = 2 * numLayers * numKvHeads * headDim * dtypeBytes;
|
|
205
|
+
|
|
206
|
+
// Convert available GB to bytes and divide
|
|
207
|
+
const availableBytes = availableForKvGb * BYTES_IN_GB;
|
|
208
|
+
const maxModelLen = Math.floor(availableBytes / kvPerTokenBytes);
|
|
209
|
+
|
|
210
|
+
return {
|
|
211
|
+
maxModelLen: Math.max(0, maxModelLen),
|
|
212
|
+
availableForKvGb
|
|
213
|
+
};
|
|
214
|
+
};
|
|
215
|
+
|
|
216
|
+
// ── Confidence ───────────────────────────────────────────────────────────────
|
|
217
|
+
|
|
135
218
|
/**
|
|
136
219
|
* Determine confidence level based on which parameters were explicitly provided.
|
|
137
220
|
*
|
|
@@ -165,13 +248,16 @@ const determineConfidence = (modelInfo) => {
|
|
|
165
248
|
|
|
166
249
|
export {
|
|
167
250
|
estimateVram,
|
|
251
|
+
computeMaxModelLen,
|
|
168
252
|
bytesPerParam,
|
|
169
253
|
estimateKvCache,
|
|
170
254
|
determineConfidence,
|
|
171
255
|
BYTES_PER_PARAM,
|
|
172
256
|
QUANTIZATION_BYTES,
|
|
257
|
+
KV_CACHE_DTYPE_BYTES,
|
|
173
258
|
DEFAULT_MAX_SEQUENCE_LENGTH,
|
|
174
259
|
DEFAULT_BATCH_SIZE,
|
|
175
260
|
OVERHEAD_FACTOR,
|
|
261
|
+
MIN_USEFUL_CONTEXT_LEN,
|
|
176
262
|
BYTES_IN_GB
|
|
177
263
|
};
|
|
@@ -228,33 +228,6 @@
|
|
|
228
228
|
"gpuMemoryGb": 24,
|
|
229
229
|
"gpuType": "NVIDIA A10G",
|
|
230
230
|
"costTier": "medium"
|
|
231
|
-
},
|
|
232
|
-
"ml.p6-b200.48xlarge": {
|
|
233
|
-
"category": "gpu",
|
|
234
|
-
"gpus": 8,
|
|
235
|
-
"vcpus": 192,
|
|
236
|
-
"memGb": 1536,
|
|
237
|
-
"accelerator": "8x B200 1440GB",
|
|
238
|
-
"cudaVersions": [
|
|
239
|
-
"12.4",
|
|
240
|
-
"12.6"
|
|
241
|
-
],
|
|
242
|
-
"tags": [
|
|
243
|
-
"gpu",
|
|
244
|
-
"multi-gpu",
|
|
245
|
-
"b200",
|
|
246
|
-
"cuda-12",
|
|
247
|
-
"high-performance"
|
|
248
|
-
],
|
|
249
|
-
"family": "p6",
|
|
250
|
-
"acceleratorType": "cuda",
|
|
251
|
-
"hardware": "NVIDIA B200",
|
|
252
|
-
"gpuArchitecture": "Blackwell",
|
|
253
|
-
"defaultCudaVersion": "12.6",
|
|
254
|
-
"notes": "8x NVIDIA B200 GPUs (1440GB total). Next-gen Blackwell architecture",
|
|
255
|
-
"gpuMemoryGb": 180,
|
|
256
|
-
"gpuType": "NVIDIA B200",
|
|
257
|
-
"costTier": "high"
|
|
258
231
|
}
|
|
259
232
|
},
|
|
260
233
|
"recommendations": {
|
|
@@ -170,7 +170,7 @@ export default class BootstrapCommandHandler {
|
|
|
170
170
|
console.log('\n🚀 Bootstrap — Shared AWS Infrastructure Setup\n');
|
|
171
171
|
|
|
172
172
|
// Verify AWS CLI v2 is installed
|
|
173
|
-
if (!this.
|
|
173
|
+
if (!this._verifyCliV2()) {
|
|
174
174
|
return;
|
|
175
175
|
}
|
|
176
176
|
|
|
@@ -261,7 +261,7 @@ export default class BootstrapCommandHandler {
|
|
|
261
261
|
}
|
|
262
262
|
|
|
263
263
|
profileData.roleArn = stackOutputs.RoleArn;
|
|
264
|
-
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName;
|
|
264
|
+
profileData.ecrRepositoryName = stackOutputs.EcrRepositoryName || 'ml-container-creator';
|
|
265
265
|
profileData.stackName = stackName;
|
|
266
266
|
profileData.sharedInfraFrom = otherStack; // Track that this profile reuses another's stack
|
|
267
267
|
if (stackOutputs.AsyncS3BucketName) profileData.asyncS3Bucket = stackOutputs.AsyncS3BucketName;
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// AUTO-GENERATED by scripts/codegen-parameter-matrix.js — DO NOT EDIT
|
|
2
2
|
// Source: config/parameter-schema-v2.json
|
|
3
|
-
// Generated: 2026-06-
|
|
3
|
+
// Generated: 2026-06-15T20:16:03.952Z
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
6
|
* Parameter matrix defining how each parameter is loaded from various sources.
|