@aws/ml-container-creator 0.10.0 → 0.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +5 -21
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +837 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +51 -66
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +138 -138
- package/servers/instance-sizer/lib/instance-ranker.js +76 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +4 -2
- package/src/lib/bootstrap-command-handler.js +579 -14
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +315 -315
- package/src/lib/generated/parameter-matrix.js +661 -661
- package/src/lib/generated/validation-rules.js +71 -71
- package/src/lib/path-prover-brain.js +607 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/template-variable-resolver.js +25 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +149 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/config +108 -5
- package/templates/do/deploy.d/managed-inference.ejs +192 -11
- package/templates/do/optimize +106 -37
- package/templates/do/register +89 -0
- package/templates/do/test +13 -0
- package/templates/do/tune +378 -59
- package/templates/do/validate +44 -4
|
@@ -18,43 +18,43 @@
|
|
|
18
18
|
* Returns: { values, choices, metadata }
|
|
19
19
|
*/
|
|
20
20
|
|
|
21
|
-
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
|
|
22
|
-
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
|
|
23
|
-
import { z } from 'zod'
|
|
24
|
-
import { readFileSync } from 'node:fs'
|
|
25
|
-
import { fileURLToPath } from 'node:url'
|
|
26
|
-
import { resolve, dirname } from 'node:path'
|
|
27
|
-
import { resolveModelMetadata } from './lib/model-resolver.js'
|
|
28
|
-
import { estimateVram } from './lib/vram-estimator.js'
|
|
29
|
-
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
|
|
30
|
-
import { QuotaResolver } from './lib/quota-resolver.js'
|
|
31
|
-
import { queryBedrock } from '../lib/bedrock-client.js'
|
|
21
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
22
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
23
|
+
import { z } from 'zod';
|
|
24
|
+
import { readFileSync } from 'node:fs';
|
|
25
|
+
import { fileURLToPath } from 'node:url';
|
|
26
|
+
import { resolve, dirname } from 'node:path';
|
|
27
|
+
import { resolveModelMetadata } from './lib/model-resolver.js';
|
|
28
|
+
import { estimateVram } from './lib/vram-estimator.js';
|
|
29
|
+
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
|
|
30
|
+
import { QuotaResolver } from './lib/quota-resolver.js';
|
|
31
|
+
import { queryBedrock } from '../lib/bedrock-client.js';
|
|
32
32
|
|
|
33
33
|
// ── Path setup ───────────────────────────────────────────────────────────────
|
|
34
34
|
|
|
35
|
-
const __filename = fileURLToPath(import.meta.url)
|
|
36
|
-
const __dirname = dirname(__filename)
|
|
35
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
36
|
+
const __dirname = dirname(__filename);
|
|
37
37
|
|
|
38
38
|
// ── Load instance catalog from shared lib ────────────────────────────────────
|
|
39
39
|
|
|
40
|
-
let INSTANCE_CATALOG
|
|
40
|
+
let INSTANCE_CATALOG;
|
|
41
41
|
|
|
42
42
|
try {
|
|
43
|
-
const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json')
|
|
44
|
-
const raw = readFileSync(catalogPath, 'utf8')
|
|
45
|
-
const data = JSON.parse(raw)
|
|
46
|
-
INSTANCE_CATALOG = data.catalog
|
|
43
|
+
const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json');
|
|
44
|
+
const raw = readFileSync(catalogPath, 'utf8');
|
|
45
|
+
const data = JSON.parse(raw);
|
|
46
|
+
INSTANCE_CATALOG = data.catalog;
|
|
47
47
|
} catch (err) {
|
|
48
|
-
process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`)
|
|
49
|
-
process.exit(1)
|
|
48
|
+
process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`);
|
|
49
|
+
process.exit(1);
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
// ── Mode configuration ───────────────────────────────────────────────────────
|
|
53
53
|
|
|
54
|
-
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
|
|
55
|
-
const SMART_MODE = process.env.BEDROCK_SMART === 'true'
|
|
56
|
-
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
|
|
57
|
-
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
|
|
54
|
+
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover');
|
|
55
|
+
const SMART_MODE = process.env.BEDROCK_SMART === 'true';
|
|
56
|
+
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0';
|
|
57
|
+
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1';
|
|
58
58
|
|
|
59
59
|
// ── Bedrock server config ─────────────────────────────────────────────────────
|
|
60
60
|
|
|
@@ -95,7 +95,7 @@ Rules:
|
|
|
95
95
|
maxTokens: 1024,
|
|
96
96
|
modelId: BEDROCK_MODEL,
|
|
97
97
|
region: BEDROCK_REGION
|
|
98
|
-
}
|
|
98
|
+
};
|
|
99
99
|
|
|
100
100
|
// ── Logging ──────────────────────────────────────────────────────────────────
|
|
101
101
|
|
|
@@ -103,7 +103,7 @@ Rules:
|
|
|
103
103
|
* Log to stderr so it doesn't interfere with MCP stdio protocol on stdout.
|
|
104
104
|
*/
|
|
105
105
|
function log(message) {
|
|
106
|
-
process.stderr.write(`[instance-sizer] ${message}\n`)
|
|
106
|
+
process.stderr.write(`[instance-sizer] ${message}\n`);
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
// ── Tag-based search filtering ───────────────────────────────────────────────
|
|
@@ -119,76 +119,76 @@ function log(message) {
|
|
|
119
119
|
* @returns {string[]} Matching instance type names, sorted by relevance
|
|
120
120
|
*/
|
|
121
121
|
function searchInstancesByTag(search, instanceCatalog, options = {}) {
|
|
122
|
-
const { limit = 10 } = options
|
|
123
|
-
const candidates = Object.entries(instanceCatalog)
|
|
122
|
+
const { limit = 10 } = options;
|
|
123
|
+
const candidates = Object.entries(instanceCatalog);
|
|
124
124
|
|
|
125
125
|
// Tokenize search into lowercase keywords
|
|
126
|
-
const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean)
|
|
126
|
+
const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean);
|
|
127
127
|
|
|
128
128
|
// Detect compound terms
|
|
129
|
-
const rawLower = search.toLowerCase()
|
|
130
|
-
const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu')
|
|
129
|
+
const rawLower = search.toLowerCase();
|
|
130
|
+
const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu');
|
|
131
131
|
|
|
132
132
|
// Detect CUDA version requests: "cuda 12", "cuda 11.8", "cuda-12.1"
|
|
133
|
-
const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/)
|
|
134
|
-
const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null
|
|
133
|
+
const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/);
|
|
134
|
+
const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null;
|
|
135
135
|
|
|
136
136
|
// Score each instance
|
|
137
137
|
const scored = candidates.map(([name, meta]) => {
|
|
138
|
-
let score = 0
|
|
139
|
-
const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : ''
|
|
140
|
-
const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ')
|
|
138
|
+
let score = 0;
|
|
139
|
+
const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : '';
|
|
140
|
+
const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ');
|
|
141
141
|
|
|
142
142
|
// Compound term: multi-gpu
|
|
143
143
|
if (wantsMultiGpu) {
|
|
144
144
|
if (meta.gpus > 1) {
|
|
145
|
-
score += 5
|
|
145
|
+
score += 5;
|
|
146
146
|
} else {
|
|
147
|
-
return { name, meta, score: 0 }
|
|
147
|
+
return { name, meta, score: 0 };
|
|
148
148
|
}
|
|
149
149
|
}
|
|
150
150
|
|
|
151
151
|
// Compound term: cuda version
|
|
152
152
|
if (wantsCudaVersion) {
|
|
153
|
-
if (!meta.cudaVersions) return { name, meta, score: 0 }
|
|
154
|
-
const hasExact = meta.cudaVersions.includes(wantsCudaVersion)
|
|
155
|
-
const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion))
|
|
153
|
+
if (!meta.cudaVersions) return { name, meta, score: 0 };
|
|
154
|
+
const hasExact = meta.cudaVersions.includes(wantsCudaVersion);
|
|
155
|
+
const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion));
|
|
156
156
|
if (hasExact) {
|
|
157
|
-
score += 4
|
|
157
|
+
score += 4;
|
|
158
158
|
} else if (hasMajor) {
|
|
159
|
-
score += 3
|
|
159
|
+
score += 3;
|
|
160
160
|
} else {
|
|
161
|
-
return { name, meta, score: 0 }
|
|
161
|
+
return { name, meta, score: 0 };
|
|
162
162
|
}
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
for (const token of tokens) {
|
|
166
|
-
if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue
|
|
167
|
-
if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue
|
|
166
|
+
if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue;
|
|
167
|
+
if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue;
|
|
168
168
|
|
|
169
|
-
if (haystack.includes(token)) score += 1
|
|
170
|
-
if (meta.gpus > 1 && token === 'parallel') score += 2
|
|
171
|
-
if (token === 'gpu' && meta.gpus > 0) score += 1
|
|
172
|
-
if (token === 'cpu' && meta.gpus === 0) score += 1
|
|
169
|
+
if (haystack.includes(token)) score += 1;
|
|
170
|
+
if (meta.gpus > 1 && token === 'parallel') score += 2;
|
|
171
|
+
if (token === 'gpu' && meta.gpus > 0) score += 1;
|
|
172
|
+
if (token === 'cpu' && meta.gpus === 0) score += 1;
|
|
173
173
|
if (token === 'cheap' || token === 'budget' || token === 'cost') {
|
|
174
|
-
if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1
|
|
174
|
+
if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1;
|
|
175
175
|
}
|
|
176
176
|
if (token === 'memory' || token === 'high-memory') {
|
|
177
|
-
if (meta.memGb >= 32) score += 1
|
|
177
|
+
if (meta.memGb >= 32) score += 1;
|
|
178
178
|
}
|
|
179
|
-
if (token === 'large' && meta.vcpus >= 16) score += 1
|
|
180
|
-
if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2
|
|
179
|
+
if (token === 'large' && meta.vcpus >= 16) score += 1;
|
|
180
|
+
if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2;
|
|
181
181
|
}
|
|
182
|
-
return { name, meta, score }
|
|
183
|
-
})
|
|
182
|
+
return { name, meta, score };
|
|
183
|
+
});
|
|
184
184
|
|
|
185
|
-
const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score)
|
|
185
|
+
const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score);
|
|
186
186
|
|
|
187
187
|
if (matched.length === 0) {
|
|
188
|
-
return []
|
|
188
|
+
return [];
|
|
189
189
|
}
|
|
190
190
|
|
|
191
|
-
return matched.slice(0, limit).map(s => s.name)
|
|
191
|
+
return matched.slice(0, limit).map(s => s.name);
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
// ── CUDA version filtering ───────────────────────────────────────────────────
|
|
@@ -201,22 +201,22 @@ function searchInstancesByTag(search, instanceCatalog, options = {}) {
|
|
|
201
201
|
* @returns {object} Filtered instance catalog
|
|
202
202
|
*/
|
|
203
203
|
function filterByCudaVersion(instanceCatalog, requiredCuda) {
|
|
204
|
-
const majorRequired = requiredCuda.split('.')[0]
|
|
205
|
-
const filtered = {}
|
|
204
|
+
const majorRequired = requiredCuda.split('.')[0];
|
|
205
|
+
const filtered = {};
|
|
206
206
|
|
|
207
207
|
for (const [name, meta] of Object.entries(instanceCatalog)) {
|
|
208
|
-
if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue
|
|
208
|
+
if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue;
|
|
209
209
|
const hasCompatible = meta.cudaVersions.some(v => {
|
|
210
|
-
if (v === requiredCuda) return true
|
|
211
|
-
if (v.startsWith(majorRequired + '.')) return true
|
|
212
|
-
return false
|
|
213
|
-
})
|
|
210
|
+
if (v === requiredCuda) return true;
|
|
211
|
+
if (v.startsWith(`${majorRequired }.`)) return true;
|
|
212
|
+
return false;
|
|
213
|
+
});
|
|
214
214
|
if (hasCompatible) {
|
|
215
|
-
filtered[name] = meta
|
|
215
|
+
filtered[name] = meta;
|
|
216
216
|
}
|
|
217
217
|
}
|
|
218
218
|
|
|
219
|
-
return filtered
|
|
219
|
+
return filtered;
|
|
220
220
|
}
|
|
221
221
|
|
|
222
222
|
// ── Tool handler ─────────────────────────────────────────────────────────────
|
|
@@ -239,26 +239,26 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
239
239
|
cudaVersion,
|
|
240
240
|
limit = 10,
|
|
241
241
|
context
|
|
242
|
-
} = params
|
|
242
|
+
} = params;
|
|
243
243
|
|
|
244
244
|
// Apply profile ENV overrides to sequence length and batch size
|
|
245
|
-
let effectiveMaxSeqLen = maxSequenceLength
|
|
246
|
-
let effectiveBatchSize = batchSize
|
|
245
|
+
let effectiveMaxSeqLen = maxSequenceLength;
|
|
246
|
+
let effectiveBatchSize = batchSize;
|
|
247
247
|
if (context?.profileEnvVars) {
|
|
248
248
|
if (context.profileEnvVars.VLLM_MAX_MODEL_LEN) {
|
|
249
|
-
effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen
|
|
249
|
+
effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen;
|
|
250
250
|
}
|
|
251
251
|
if (context.profileEnvVars.VLLM_MAX_NUM_SEQS) {
|
|
252
|
-
effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize
|
|
252
|
+
effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize;
|
|
253
253
|
}
|
|
254
254
|
}
|
|
255
255
|
|
|
256
256
|
// Apply CUDA version filtering to instance catalog
|
|
257
|
-
let effectiveCatalog = INSTANCE_CATALOG
|
|
257
|
+
let effectiveCatalog = INSTANCE_CATALOG;
|
|
258
258
|
if (cudaVersion) {
|
|
259
|
-
effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion)
|
|
259
|
+
effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion);
|
|
260
260
|
if (Object.keys(effectiveCatalog).length === 0) {
|
|
261
|
-
log(`CUDA version ${cudaVersion} filter eliminated all instances`)
|
|
261
|
+
log(`CUDA version ${cudaVersion} filter eliminated all instances`);
|
|
262
262
|
return {
|
|
263
263
|
content: [{
|
|
264
264
|
type: 'text',
|
|
@@ -272,13 +272,13 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
272
272
|
}
|
|
273
273
|
})
|
|
274
274
|
}]
|
|
275
|
-
}
|
|
275
|
+
};
|
|
276
276
|
}
|
|
277
277
|
}
|
|
278
278
|
|
|
279
279
|
// Mode: tag-based search only (no model name)
|
|
280
280
|
if (!modelName && instanceSearch) {
|
|
281
|
-
const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit })
|
|
281
|
+
const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit });
|
|
282
282
|
return {
|
|
283
283
|
content: [{
|
|
284
284
|
type: 'text',
|
|
@@ -293,14 +293,14 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
293
293
|
}
|
|
294
294
|
})
|
|
295
295
|
}]
|
|
296
|
-
}
|
|
296
|
+
};
|
|
297
297
|
}
|
|
298
298
|
|
|
299
299
|
// Mode: no model name and no search — return all GPU instances
|
|
300
300
|
if (!modelName) {
|
|
301
301
|
const allGpuInstances = Object.keys(effectiveCatalog)
|
|
302
302
|
.filter(key => effectiveCatalog[key].category === 'gpu')
|
|
303
|
-
.slice(0, limit)
|
|
303
|
+
.slice(0, limit);
|
|
304
304
|
|
|
305
305
|
return {
|
|
306
306
|
content: [{
|
|
@@ -316,20 +316,20 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
316
316
|
}
|
|
317
317
|
})
|
|
318
318
|
}]
|
|
319
|
-
}
|
|
319
|
+
};
|
|
320
320
|
}
|
|
321
321
|
|
|
322
322
|
// Step 1: Resolve model metadata
|
|
323
323
|
const modelMetadata = await resolveModelMetadata(modelName, {
|
|
324
324
|
discover: DISCOVER_MODE
|
|
325
|
-
})
|
|
325
|
+
});
|
|
326
326
|
|
|
327
327
|
// If model metadata cannot be resolved, return all GPU instances unfiltered
|
|
328
328
|
if (!modelMetadata) {
|
|
329
|
-
log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`)
|
|
329
|
+
log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`);
|
|
330
330
|
const allGpuInstances = Object.keys(effectiveCatalog)
|
|
331
331
|
.filter(key => effectiveCatalog[key].category === 'gpu')
|
|
332
|
-
.slice(0, limit)
|
|
332
|
+
.slice(0, limit);
|
|
333
333
|
|
|
334
334
|
return {
|
|
335
335
|
content: [{
|
|
@@ -358,78 +358,78 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
358
358
|
}
|
|
359
359
|
})
|
|
360
360
|
}]
|
|
361
|
-
}
|
|
361
|
+
};
|
|
362
362
|
}
|
|
363
363
|
|
|
364
364
|
// Step 2: Estimate VRAM
|
|
365
365
|
// Use model's max_position_embeddings as the sequence length when no explicit value is provided.
|
|
366
366
|
// This ensures KV cache is sized for the model's actual context window, not the 4096 default.
|
|
367
|
-
const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
|
|
367
|
+
const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined;
|
|
368
368
|
const vramEstimate = estimateVram({
|
|
369
369
|
parameterCount: modelMetadata.parameterCount,
|
|
370
370
|
dtype: modelMetadata.dtype,
|
|
371
371
|
quantization: quantization || undefined,
|
|
372
372
|
maxSequenceLength: resolvedMaxSeqLen,
|
|
373
373
|
batchSize: effectiveBatchSize || undefined
|
|
374
|
-
})
|
|
374
|
+
});
|
|
375
375
|
|
|
376
376
|
// Step 3: Filter and rank instances
|
|
377
377
|
let recommendations = filterAndRankInstances(
|
|
378
378
|
vramEstimate.vramGb,
|
|
379
379
|
effectiveCatalog,
|
|
380
380
|
{ limit }
|
|
381
|
-
)
|
|
381
|
+
);
|
|
382
382
|
|
|
383
383
|
// Step 3a: Quota & availability filtering (discover mode only)
|
|
384
|
-
let preQuotaFilterCount = 0
|
|
385
|
-
let allFilteredByQuota = false
|
|
386
|
-
let preQuotaRecommendations = []
|
|
384
|
+
let preQuotaFilterCount = 0;
|
|
385
|
+
let allFilteredByQuota = false;
|
|
386
|
+
let preQuotaRecommendations = [];
|
|
387
387
|
if (DISCOVER_MODE && recommendations.length > 0) {
|
|
388
388
|
try {
|
|
389
|
-
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
|
|
390
|
-
const quotaResolver = new QuotaResolver(region)
|
|
389
|
+
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION;
|
|
390
|
+
const quotaResolver = new QuotaResolver(region);
|
|
391
391
|
|
|
392
|
-
const instanceTypes = recommendations.map(r => r.instanceType)
|
|
392
|
+
const instanceTypes = recommendations.map(r => r.instanceType);
|
|
393
393
|
const [quotas, reservations, ftps] = await Promise.allSettled([
|
|
394
394
|
quotaResolver.getQuotaHeadroom(instanceTypes),
|
|
395
395
|
quotaResolver.getCapacityReservations(),
|
|
396
396
|
quotaResolver.getTrainingPlans()
|
|
397
|
-
])
|
|
397
|
+
]);
|
|
398
398
|
|
|
399
|
-
preQuotaFilterCount = recommendations.length
|
|
400
|
-
preQuotaRecommendations = [...recommendations]
|
|
399
|
+
preQuotaFilterCount = recommendations.length;
|
|
400
|
+
preQuotaRecommendations = [...recommendations];
|
|
401
401
|
recommendations = applyAvailabilityRanking(
|
|
402
402
|
recommendations,
|
|
403
403
|
quotas.status === 'fulfilled' ? quotas.value : null,
|
|
404
404
|
reservations.status === 'fulfilled' ? reservations.value : null,
|
|
405
405
|
ftps.status === 'fulfilled' ? ftps.value : null
|
|
406
|
-
)
|
|
406
|
+
);
|
|
407
407
|
if (recommendations.length === 0 && preQuotaFilterCount > 0) {
|
|
408
|
-
allFilteredByQuota = true
|
|
408
|
+
allFilteredByQuota = true;
|
|
409
409
|
// Restore pre-filter recommendations so user can see compatible instances
|
|
410
410
|
// and request quota increases for the ones they want
|
|
411
|
-
recommendations = preQuotaRecommendations
|
|
412
|
-
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
|
|
411
|
+
recommendations = preQuotaRecommendations;
|
|
412
|
+
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`);
|
|
413
413
|
}
|
|
414
414
|
} catch (err) {
|
|
415
415
|
// Graceful degradation: if credentials are missing or any unexpected
|
|
416
416
|
// error occurs, skip quota filtering and continue with unfiltered results
|
|
417
|
-
log(`Quota resolution skipped: ${err.message}`)
|
|
417
|
+
log(`Quota resolution skipped: ${err.message}`);
|
|
418
418
|
}
|
|
419
419
|
}
|
|
420
420
|
|
|
421
421
|
// Step 3b: If instanceSearch is also provided, further filter by tags
|
|
422
422
|
if (instanceSearch && recommendations.length > 0) {
|
|
423
|
-
const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
|
|
424
|
-
recommendations = recommendations.filter(r => searchMatches.has(r.instanceType))
|
|
423
|
+
const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }));
|
|
424
|
+
recommendations = recommendations.filter(r => searchMatches.has(r.instanceType));
|
|
425
425
|
}
|
|
426
426
|
|
|
427
427
|
// Step 4: Smart mode — query Bedrock for edge-case reasoning
|
|
428
|
-
let finalRecommendations = recommendations
|
|
429
|
-
let smartModeUsed = false
|
|
428
|
+
let finalRecommendations = recommendations;
|
|
429
|
+
let smartModeUsed = false;
|
|
430
430
|
|
|
431
431
|
if (SMART_MODE && recommendations.length > 0) {
|
|
432
|
-
log('[smart] Smart mode enabled, querying Amazon Bedrock...')
|
|
432
|
+
log('[smart] Smart mode enabled, querying Amazon Bedrock...');
|
|
433
433
|
|
|
434
434
|
const bedrockContext = {
|
|
435
435
|
modelName,
|
|
@@ -446,38 +446,38 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
446
446
|
tensorParallelism: r.tensorParallelism
|
|
447
447
|
})),
|
|
448
448
|
...(context || {})
|
|
449
|
-
}
|
|
449
|
+
};
|
|
450
450
|
|
|
451
451
|
const bedrockResult = await queryBedrock(
|
|
452
452
|
SERVER_CONFIG,
|
|
453
453
|
['instanceType'],
|
|
454
454
|
limit,
|
|
455
455
|
bedrockContext
|
|
456
|
-
)
|
|
456
|
+
);
|
|
457
457
|
|
|
458
458
|
if (bedrockResult?.values?.instanceType) {
|
|
459
|
-
const bedrockInstance = bedrockResult.values.instanceType
|
|
460
|
-
log(`[smart] Bedrock recommendation: ${bedrockInstance}`)
|
|
459
|
+
const bedrockInstance = bedrockResult.values.instanceType;
|
|
460
|
+
log(`[smart] Bedrock recommendation: ${bedrockInstance}`);
|
|
461
461
|
|
|
462
462
|
// Check if Bedrock's suggestion is already in our list
|
|
463
463
|
const existingIndex = finalRecommendations.findIndex(
|
|
464
464
|
r => r.instanceType === bedrockInstance
|
|
465
|
-
)
|
|
465
|
+
);
|
|
466
466
|
|
|
467
467
|
if (existingIndex > 0) {
|
|
468
468
|
// Move Bedrock's pick to the top
|
|
469
|
-
const [picked] = finalRecommendations.splice(existingIndex, 1)
|
|
470
|
-
finalRecommendations = [picked, ...finalRecommendations]
|
|
471
|
-
smartModeUsed = true
|
|
469
|
+
const [picked] = finalRecommendations.splice(existingIndex, 1);
|
|
470
|
+
finalRecommendations = [picked, ...finalRecommendations];
|
|
471
|
+
smartModeUsed = true;
|
|
472
472
|
} else if (existingIndex === 0) {
|
|
473
473
|
// Already at the top — Bedrock agrees with static
|
|
474
|
-
smartModeUsed = true
|
|
475
|
-
log('[smart] Bedrock agrees with static top recommendation')
|
|
474
|
+
smartModeUsed = true;
|
|
475
|
+
log('[smart] Bedrock agrees with static top recommendation');
|
|
476
476
|
} else {
|
|
477
477
|
// Bedrock suggested an instance not in our filtered list;
|
|
478
478
|
// verify it exists in the catalog before prepending
|
|
479
479
|
if (INSTANCE_CATALOG[bedrockInstance]) {
|
|
480
|
-
const catalogEntry = INSTANCE_CATALOG[bedrockInstance]
|
|
480
|
+
const catalogEntry = INSTANCE_CATALOG[bedrockInstance];
|
|
481
481
|
const bedrockRec = {
|
|
482
482
|
instanceType: bedrockInstance,
|
|
483
483
|
gpuCount: catalogEntry.gpus || 0,
|
|
@@ -485,24 +485,24 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
485
485
|
utilizationPercent: null,
|
|
486
486
|
tensorParallelism: catalogEntry.gpus || 1,
|
|
487
487
|
costTier: catalogEntry.costTier || null
|
|
488
|
-
}
|
|
489
|
-
finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit)
|
|
490
|
-
smartModeUsed = true
|
|
488
|
+
};
|
|
489
|
+
finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit);
|
|
490
|
+
smartModeUsed = true;
|
|
491
491
|
} else {
|
|
492
|
-
log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`)
|
|
492
|
+
log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`);
|
|
493
493
|
}
|
|
494
494
|
}
|
|
495
495
|
} else {
|
|
496
|
-
log('[smart] Bedrock did not return usable results, falling back to static recommendations')
|
|
496
|
+
log('[smart] Bedrock did not return usable results, falling back to static recommendations');
|
|
497
497
|
}
|
|
498
498
|
}
|
|
499
499
|
|
|
500
500
|
// Build response
|
|
501
501
|
const topRecommendation = finalRecommendations.length > 0
|
|
502
502
|
? finalRecommendations[0].instanceType
|
|
503
|
-
: null
|
|
503
|
+
: null;
|
|
504
504
|
|
|
505
|
-
const rankedList = finalRecommendations.map(r => r.instanceType)
|
|
505
|
+
const rankedList = finalRecommendations.map(r => r.instanceType);
|
|
506
506
|
|
|
507
507
|
return {
|
|
508
508
|
content: [{
|
|
@@ -524,7 +524,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
524
524
|
}
|
|
525
525
|
})
|
|
526
526
|
}]
|
|
527
|
-
}
|
|
527
|
+
};
|
|
528
528
|
}
|
|
529
529
|
|
|
530
530
|
// ── MCP Server setup ─────────────────────────────────────────────────────────
|
|
@@ -532,7 +532,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
532
532
|
const server = new McpServer({
|
|
533
533
|
name: 'instance-sizer',
|
|
534
534
|
version: '1.0.0'
|
|
535
|
-
})
|
|
535
|
+
});
|
|
536
536
|
|
|
537
537
|
// Register the get_instance_recommendation tool
|
|
538
538
|
server.tool(
|
|
@@ -554,9 +554,9 @@ server.tool(
|
|
|
554
554
|
}).optional().describe('Additional deployment context')
|
|
555
555
|
},
|
|
556
556
|
async (params) => {
|
|
557
|
-
return handleGetInstanceRecommendation(params)
|
|
557
|
+
return handleGetInstanceRecommendation(params);
|
|
558
558
|
}
|
|
559
|
-
)
|
|
559
|
+
);
|
|
560
560
|
|
|
561
561
|
// Register alias tool name for backward compatibility
|
|
562
562
|
server.tool(
|
|
@@ -578,27 +578,27 @@ server.tool(
|
|
|
578
578
|
}).optional().describe('Additional deployment context')
|
|
579
579
|
},
|
|
580
580
|
async (params) => {
|
|
581
|
-
return handleGetInstanceRecommendation(params)
|
|
581
|
+
return handleGetInstanceRecommendation(params);
|
|
582
582
|
}
|
|
583
|
-
)
|
|
583
|
+
);
|
|
584
584
|
|
|
585
585
|
// ── Exports for testing ──────────────────────────────────────────────────────
|
|
586
586
|
|
|
587
|
-
export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion }
|
|
587
|
+
export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion };
|
|
588
588
|
|
|
589
589
|
// ── Transport connection (main module only) ──────────────────────────────────
|
|
590
590
|
|
|
591
|
-
const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
|
|
591
|
+
const isMain = process.argv[1] && resolve(process.argv[1]) === __filename;
|
|
592
592
|
|
|
593
593
|
if (isMain) {
|
|
594
594
|
if (SMART_MODE) {
|
|
595
|
-
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
|
|
595
|
+
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`);
|
|
596
596
|
} else if (!DISCOVER_MODE) {
|
|
597
|
-
log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
|
|
597
|
+
log('Static mode (catalog-only, no network calls) — use --no-discover to force this');
|
|
598
598
|
} else {
|
|
599
|
-
log('Discover mode (HuggingFace API + quota lookups active)')
|
|
599
|
+
log('Discover mode (HuggingFace API + quota lookups active)');
|
|
600
600
|
}
|
|
601
601
|
|
|
602
|
-
const transport = new StdioServerTransport()
|
|
603
|
-
await server.connect(transport)
|
|
602
|
+
const transport = new StdioServerTransport();
|
|
603
|
+
await server.connect(transport);
|
|
604
604
|
}
|