@aws/ml-container-creator 0.10.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE-THIRD-PARTY +9304 -0
- package/bin/cli.js +2 -0
- package/config/bootstrap-e2e-stack.json +341 -0
- package/config/bootstrap-stack.json +40 -3
- package/config/parameter-schema-v2.json +33 -22
- package/config/tune-catalog.json +1781 -0
- package/infra/ci-harness/buildspec.yml +1 -0
- package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
- package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
- package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
- package/package.json +53 -67
- package/servers/base-image-picker/index.js +121 -121
- package/servers/e2e-status/index.js +297 -0
- package/servers/e2e-status/manifest.json +14 -0
- package/servers/e2e-status/package.json +15 -0
- package/servers/endpoint-picker/LICENSE +202 -0
- package/servers/endpoint-picker/index.js +536 -0
- package/servers/endpoint-picker/manifest.json +14 -0
- package/servers/endpoint-picker/package.json +18 -0
- package/servers/hyperpod-cluster-picker/index.js +125 -125
- package/servers/instance-sizer/index.js +166 -153
- package/servers/instance-sizer/lib/instance-ranker.js +120 -76
- package/servers/instance-sizer/lib/model-resolver.js +61 -61
- package/servers/instance-sizer/lib/quota-resolver.js +113 -113
- package/servers/instance-sizer/lib/vram-estimator.js +31 -31
- package/servers/lib/bedrock-client.js +38 -38
- package/servers/lib/catalogs/instances.json +27 -0
- package/servers/lib/catalogs/model-servers.json +201 -3
- package/servers/lib/custom-validators.js +13 -13
- package/servers/lib/dynamic-resolver.js +4 -4
- package/servers/marketplace-picker/index.js +342 -0
- package/servers/marketplace-picker/manifest.json +14 -0
- package/servers/marketplace-picker/package.json +18 -0
- package/servers/model-picker/index.js +382 -382
- package/servers/region-picker/index.js +56 -56
- package/servers/workload-picker/LICENSE +202 -0
- package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
- package/servers/workload-picker/index.js +171 -0
- package/servers/workload-picker/manifest.json +16 -0
- package/servers/workload-picker/package.json +16 -0
- package/src/app.js +12 -3
- package/src/lib/bootstrap-command-handler.js +609 -15
- package/src/lib/bootstrap-config.js +36 -0
- package/src/lib/bootstrap-profile-manager.js +48 -41
- package/src/lib/ci-register-helpers.js +74 -0
- package/src/lib/config-loader.js +3 -0
- package/src/lib/config-manager.js +7 -0
- package/src/lib/config-validator.js +1 -1
- package/src/lib/cuda-resolver.js +17 -8
- package/src/lib/generated/cli-options.js +319 -314
- package/src/lib/generated/parameter-matrix.js +672 -661
- package/src/lib/generated/validation-rules.js +76 -72
- package/src/lib/path-prover-brain.js +664 -0
- package/src/lib/prompts/infrastructure-prompts.js +2 -2
- package/src/lib/prompts/model-prompts.js +6 -0
- package/src/lib/prompts/project-prompts.js +12 -0
- package/src/lib/secrets-prompt-runner.js +4 -0
- package/src/lib/template-manager.js +1 -1
- package/src/lib/template-variable-resolver.js +87 -1
- package/src/lib/tune-catalog-validator.js +37 -4
- package/templates/Dockerfile +9 -0
- package/templates/code/adapter_sidecar.py +444 -0
- package/templates/code/serve +6 -0
- package/templates/code/serve.d/vllm.ejs +1 -1
- package/templates/do/.benchmark_writer.py +1476 -0
- package/templates/do/.tune_helper.py +982 -57
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/adapter +154 -0
- package/templates/do/benchmark +639 -85
- package/templates/do/build +5 -0
- package/templates/do/clean.d/async-inference.ejs +5 -0
- package/templates/do/clean.d/batch-transform.ejs +5 -0
- package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
- package/templates/do/clean.d/managed-inference.ejs +5 -0
- package/templates/do/config +115 -45
- package/templates/do/deploy.d/async-inference.ejs +30 -3
- package/templates/do/deploy.d/batch-transform.ejs +29 -3
- package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
- package/templates/do/deploy.d/managed-inference.ejs +216 -14
- package/templates/do/lib/endpoint-config.sh +1 -1
- package/templates/do/lib/profile.sh +44 -0
- package/templates/do/optimize +106 -37
- package/templates/do/push +5 -0
- package/templates/do/register +94 -0
- package/templates/do/stage +567 -0
- package/templates/do/submit +7 -0
- package/templates/do/test +14 -0
- package/templates/do/tune +382 -59
- package/templates/do/validate +44 -4
|
@@ -18,43 +18,43 @@
|
|
|
18
18
|
* Returns: { values, choices, metadata }
|
|
19
19
|
*/
|
|
20
20
|
|
|
21
|
-
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
|
|
22
|
-
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
|
|
23
|
-
import { z } from 'zod'
|
|
24
|
-
import { readFileSync } from 'node:fs'
|
|
25
|
-
import { fileURLToPath } from 'node:url'
|
|
26
|
-
import { resolve, dirname } from 'node:path'
|
|
27
|
-
import { resolveModelMetadata } from './lib/model-resolver.js'
|
|
28
|
-
import { estimateVram } from './lib/vram-estimator.js'
|
|
29
|
-
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
|
|
30
|
-
import { QuotaResolver } from './lib/quota-resolver.js'
|
|
31
|
-
import { queryBedrock } from '../lib/bedrock-client.js'
|
|
21
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
22
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
23
|
+
import { z } from 'zod';
|
|
24
|
+
import { readFileSync } from 'node:fs';
|
|
25
|
+
import { fileURLToPath } from 'node:url';
|
|
26
|
+
import { resolve, dirname } from 'node:path';
|
|
27
|
+
import { resolveModelMetadata } from './lib/model-resolver.js';
|
|
28
|
+
import { estimateVram } from './lib/vram-estimator.js';
|
|
29
|
+
import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
|
|
30
|
+
import { QuotaResolver } from './lib/quota-resolver.js';
|
|
31
|
+
import { queryBedrock } from '../lib/bedrock-client.js';
|
|
32
32
|
|
|
33
33
|
// ── Path setup ───────────────────────────────────────────────────────────────
|
|
34
34
|
|
|
35
|
-
const __filename = fileURLToPath(import.meta.url)
|
|
36
|
-
const __dirname = dirname(__filename)
|
|
35
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
36
|
+
const __dirname = dirname(__filename);
|
|
37
37
|
|
|
38
38
|
// ── Load instance catalog from shared lib ────────────────────────────────────
|
|
39
39
|
|
|
40
|
-
let INSTANCE_CATALOG
|
|
40
|
+
let INSTANCE_CATALOG;
|
|
41
41
|
|
|
42
42
|
try {
|
|
43
|
-
const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json')
|
|
44
|
-
const raw = readFileSync(catalogPath, 'utf8')
|
|
45
|
-
const data = JSON.parse(raw)
|
|
46
|
-
INSTANCE_CATALOG = data.catalog
|
|
43
|
+
const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json');
|
|
44
|
+
const raw = readFileSync(catalogPath, 'utf8');
|
|
45
|
+
const data = JSON.parse(raw);
|
|
46
|
+
INSTANCE_CATALOG = data.catalog;
|
|
47
47
|
} catch (err) {
|
|
48
|
-
process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`)
|
|
49
|
-
process.exit(1)
|
|
48
|
+
process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`);
|
|
49
|
+
process.exit(1);
|
|
50
50
|
}
|
|
51
51
|
|
|
52
52
|
// ── Mode configuration ───────────────────────────────────────────────────────
|
|
53
53
|
|
|
54
|
-
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
|
|
55
|
-
const SMART_MODE = process.env.BEDROCK_SMART === 'true'
|
|
56
|
-
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
|
|
57
|
-
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
|
|
54
|
+
const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover');
|
|
55
|
+
const SMART_MODE = process.env.BEDROCK_SMART === 'true';
|
|
56
|
+
const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0';
|
|
57
|
+
const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1';
|
|
58
58
|
|
|
59
59
|
// ── Bedrock server config ─────────────────────────────────────────────────────
|
|
60
60
|
|
|
@@ -95,7 +95,7 @@ Rules:
|
|
|
95
95
|
maxTokens: 1024,
|
|
96
96
|
modelId: BEDROCK_MODEL,
|
|
97
97
|
region: BEDROCK_REGION
|
|
98
|
-
}
|
|
98
|
+
};
|
|
99
99
|
|
|
100
100
|
// ── Logging ──────────────────────────────────────────────────────────────────
|
|
101
101
|
|
|
@@ -103,7 +103,7 @@ Rules:
|
|
|
103
103
|
* Log to stderr so it doesn't interfere with MCP stdio protocol on stdout.
|
|
104
104
|
*/
|
|
105
105
|
function log(message) {
|
|
106
|
-
process.stderr.write(`[instance-sizer] ${message}\n`)
|
|
106
|
+
process.stderr.write(`[instance-sizer] ${message}\n`);
|
|
107
107
|
}
|
|
108
108
|
|
|
109
109
|
// ── Tag-based search filtering ───────────────────────────────────────────────
|
|
@@ -119,76 +119,76 @@ function log(message) {
|
|
|
119
119
|
* @returns {string[]} Matching instance type names, sorted by relevance
|
|
120
120
|
*/
|
|
121
121
|
function searchInstancesByTag(search, instanceCatalog, options = {}) {
|
|
122
|
-
const { limit = 10 } = options
|
|
123
|
-
const candidates = Object.entries(instanceCatalog)
|
|
122
|
+
const { limit = 10 } = options;
|
|
123
|
+
const candidates = Object.entries(instanceCatalog);
|
|
124
124
|
|
|
125
125
|
// Tokenize search into lowercase keywords
|
|
126
|
-
const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean)
|
|
126
|
+
const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean);
|
|
127
127
|
|
|
128
128
|
// Detect compound terms
|
|
129
|
-
const rawLower = search.toLowerCase()
|
|
130
|
-
const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu')
|
|
129
|
+
const rawLower = search.toLowerCase();
|
|
130
|
+
const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu');
|
|
131
131
|
|
|
132
132
|
// Detect CUDA version requests: "cuda 12", "cuda 11.8", "cuda-12.1"
|
|
133
|
-
const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/)
|
|
134
|
-
const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null
|
|
133
|
+
const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/);
|
|
134
|
+
const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null;
|
|
135
135
|
|
|
136
136
|
// Score each instance
|
|
137
137
|
const scored = candidates.map(([name, meta]) => {
|
|
138
|
-
let score = 0
|
|
139
|
-
const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : ''
|
|
140
|
-
const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ')
|
|
138
|
+
let score = 0;
|
|
139
|
+
const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : '';
|
|
140
|
+
const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ');
|
|
141
141
|
|
|
142
142
|
// Compound term: multi-gpu
|
|
143
143
|
if (wantsMultiGpu) {
|
|
144
144
|
if (meta.gpus > 1) {
|
|
145
|
-
score += 5
|
|
145
|
+
score += 5;
|
|
146
146
|
} else {
|
|
147
|
-
return { name, meta, score: 0 }
|
|
147
|
+
return { name, meta, score: 0 };
|
|
148
148
|
}
|
|
149
149
|
}
|
|
150
150
|
|
|
151
151
|
// Compound term: cuda version
|
|
152
152
|
if (wantsCudaVersion) {
|
|
153
|
-
if (!meta.cudaVersions) return { name, meta, score: 0 }
|
|
154
|
-
const hasExact = meta.cudaVersions.includes(wantsCudaVersion)
|
|
155
|
-
const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion))
|
|
153
|
+
if (!meta.cudaVersions) return { name, meta, score: 0 };
|
|
154
|
+
const hasExact = meta.cudaVersions.includes(wantsCudaVersion);
|
|
155
|
+
const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion));
|
|
156
156
|
if (hasExact) {
|
|
157
|
-
score += 4
|
|
157
|
+
score += 4;
|
|
158
158
|
} else if (hasMajor) {
|
|
159
|
-
score += 3
|
|
159
|
+
score += 3;
|
|
160
160
|
} else {
|
|
161
|
-
return { name, meta, score: 0 }
|
|
161
|
+
return { name, meta, score: 0 };
|
|
162
162
|
}
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
for (const token of tokens) {
|
|
166
|
-
if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue
|
|
167
|
-
if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue
|
|
166
|
+
if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue;
|
|
167
|
+
if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue;
|
|
168
168
|
|
|
169
|
-
if (haystack.includes(token)) score += 1
|
|
170
|
-
if (meta.gpus > 1 && token === 'parallel') score += 2
|
|
171
|
-
if (token === 'gpu' && meta.gpus > 0) score += 1
|
|
172
|
-
if (token === 'cpu' && meta.gpus === 0) score += 1
|
|
169
|
+
if (haystack.includes(token)) score += 1;
|
|
170
|
+
if (meta.gpus > 1 && token === 'parallel') score += 2;
|
|
171
|
+
if (token === 'gpu' && meta.gpus > 0) score += 1;
|
|
172
|
+
if (token === 'cpu' && meta.gpus === 0) score += 1;
|
|
173
173
|
if (token === 'cheap' || token === 'budget' || token === 'cost') {
|
|
174
|
-
if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1
|
|
174
|
+
if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1;
|
|
175
175
|
}
|
|
176
176
|
if (token === 'memory' || token === 'high-memory') {
|
|
177
|
-
if (meta.memGb >= 32) score += 1
|
|
177
|
+
if (meta.memGb >= 32) score += 1;
|
|
178
178
|
}
|
|
179
|
-
if (token === 'large' && meta.vcpus >= 16) score += 1
|
|
180
|
-
if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2
|
|
179
|
+
if (token === 'large' && meta.vcpus >= 16) score += 1;
|
|
180
|
+
if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2;
|
|
181
181
|
}
|
|
182
|
-
return { name, meta, score }
|
|
183
|
-
})
|
|
182
|
+
return { name, meta, score };
|
|
183
|
+
});
|
|
184
184
|
|
|
185
|
-
const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score)
|
|
185
|
+
const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score);
|
|
186
186
|
|
|
187
187
|
if (matched.length === 0) {
|
|
188
|
-
return []
|
|
188
|
+
return [];
|
|
189
189
|
}
|
|
190
190
|
|
|
191
|
-
return matched.slice(0, limit).map(s => s.name)
|
|
191
|
+
return matched.slice(0, limit).map(s => s.name);
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
// ── CUDA version filtering ───────────────────────────────────────────────────
|
|
@@ -201,22 +201,22 @@ function searchInstancesByTag(search, instanceCatalog, options = {}) {
|
|
|
201
201
|
* @returns {object} Filtered instance catalog
|
|
202
202
|
*/
|
|
203
203
|
function filterByCudaVersion(instanceCatalog, requiredCuda) {
|
|
204
|
-
const majorRequired = requiredCuda.split('.')[0]
|
|
205
|
-
const filtered = {}
|
|
204
|
+
const majorRequired = requiredCuda.split('.')[0];
|
|
205
|
+
const filtered = {};
|
|
206
206
|
|
|
207
207
|
for (const [name, meta] of Object.entries(instanceCatalog)) {
|
|
208
|
-
if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue
|
|
208
|
+
if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue;
|
|
209
209
|
const hasCompatible = meta.cudaVersions.some(v => {
|
|
210
|
-
if (v === requiredCuda) return true
|
|
211
|
-
if (v.startsWith(majorRequired
|
|
212
|
-
return false
|
|
213
|
-
})
|
|
210
|
+
if (v === requiredCuda) return true;
|
|
211
|
+
if (v.startsWith(`${majorRequired }.`)) return true;
|
|
212
|
+
return false;
|
|
213
|
+
});
|
|
214
214
|
if (hasCompatible) {
|
|
215
|
-
filtered[name] = meta
|
|
215
|
+
filtered[name] = meta;
|
|
216
216
|
}
|
|
217
217
|
}
|
|
218
218
|
|
|
219
|
-
return filtered
|
|
219
|
+
return filtered;
|
|
220
220
|
}
|
|
221
221
|
|
|
222
222
|
// ── Tool handler ─────────────────────────────────────────────────────────────
|
|
@@ -239,26 +239,26 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
239
239
|
cudaVersion,
|
|
240
240
|
limit = 10,
|
|
241
241
|
context
|
|
242
|
-
} = params
|
|
242
|
+
} = params;
|
|
243
243
|
|
|
244
244
|
// Apply profile ENV overrides to sequence length and batch size
|
|
245
|
-
let effectiveMaxSeqLen = maxSequenceLength
|
|
246
|
-
let effectiveBatchSize = batchSize
|
|
245
|
+
let effectiveMaxSeqLen = maxSequenceLength;
|
|
246
|
+
let effectiveBatchSize = batchSize;
|
|
247
247
|
if (context?.profileEnvVars) {
|
|
248
248
|
if (context.profileEnvVars.VLLM_MAX_MODEL_LEN) {
|
|
249
|
-
effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen
|
|
249
|
+
effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen;
|
|
250
250
|
}
|
|
251
251
|
if (context.profileEnvVars.VLLM_MAX_NUM_SEQS) {
|
|
252
|
-
effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize
|
|
252
|
+
effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize;
|
|
253
253
|
}
|
|
254
254
|
}
|
|
255
255
|
|
|
256
256
|
// Apply CUDA version filtering to instance catalog
|
|
257
|
-
let effectiveCatalog = INSTANCE_CATALOG
|
|
257
|
+
let effectiveCatalog = INSTANCE_CATALOG;
|
|
258
258
|
if (cudaVersion) {
|
|
259
|
-
effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion)
|
|
259
|
+
effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion);
|
|
260
260
|
if (Object.keys(effectiveCatalog).length === 0) {
|
|
261
|
-
log(`CUDA version ${cudaVersion} filter eliminated all instances`)
|
|
261
|
+
log(`CUDA version ${cudaVersion} filter eliminated all instances`);
|
|
262
262
|
return {
|
|
263
263
|
content: [{
|
|
264
264
|
type: 'text',
|
|
@@ -272,13 +272,13 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
272
272
|
}
|
|
273
273
|
})
|
|
274
274
|
}]
|
|
275
|
-
}
|
|
275
|
+
};
|
|
276
276
|
}
|
|
277
277
|
}
|
|
278
278
|
|
|
279
279
|
// Mode: tag-based search only (no model name)
|
|
280
280
|
if (!modelName && instanceSearch) {
|
|
281
|
-
const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit })
|
|
281
|
+
const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit });
|
|
282
282
|
return {
|
|
283
283
|
content: [{
|
|
284
284
|
type: 'text',
|
|
@@ -293,14 +293,14 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
293
293
|
}
|
|
294
294
|
})
|
|
295
295
|
}]
|
|
296
|
-
}
|
|
296
|
+
};
|
|
297
297
|
}
|
|
298
298
|
|
|
299
299
|
// Mode: no model name and no search — return all GPU instances
|
|
300
300
|
if (!modelName) {
|
|
301
301
|
const allGpuInstances = Object.keys(effectiveCatalog)
|
|
302
302
|
.filter(key => effectiveCatalog[key].category === 'gpu')
|
|
303
|
-
.slice(0, limit)
|
|
303
|
+
.slice(0, limit);
|
|
304
304
|
|
|
305
305
|
return {
|
|
306
306
|
content: [{
|
|
@@ -316,120 +316,133 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
316
316
|
}
|
|
317
317
|
})
|
|
318
318
|
}]
|
|
319
|
-
}
|
|
319
|
+
};
|
|
320
320
|
}
|
|
321
321
|
|
|
322
322
|
// Step 1: Resolve model metadata
|
|
323
323
|
const modelMetadata = await resolveModelMetadata(modelName, {
|
|
324
324
|
discover: DISCOVER_MODE
|
|
325
|
-
})
|
|
325
|
+
});
|
|
326
326
|
|
|
327
327
|
// If model metadata cannot be resolved, return all GPU instances unfiltered
|
|
328
328
|
if (!modelMetadata) {
|
|
329
|
-
log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`)
|
|
330
|
-
|
|
329
|
+
log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`);
|
|
330
|
+
let unfilteredRecs = Object.keys(effectiveCatalog)
|
|
331
331
|
.filter(key => effectiveCatalog[key].category === 'gpu')
|
|
332
332
|
.slice(0, limit)
|
|
333
|
+
.map(instanceType => ({
|
|
334
|
+
instanceType,
|
|
335
|
+
gpuCount: effectiveCatalog[instanceType]?.gpus || 0,
|
|
336
|
+
totalVramGb: null,
|
|
337
|
+
utilizationPercent: null,
|
|
338
|
+
tensorParallelism: null,
|
|
339
|
+
costTier: null
|
|
340
|
+
}));
|
|
341
|
+
|
|
342
|
+
// Still apply availability ranking so quota/FTP info is displayed
|
|
343
|
+
if (DISCOVER_MODE && unfilteredRecs.length > 0) {
|
|
344
|
+
try {
|
|
345
|
+
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION;
|
|
346
|
+
const quotaResolver = new QuotaResolver(region);
|
|
347
|
+
const instanceTypes = unfilteredRecs.map(r => r.instanceType);
|
|
348
|
+
const [quotas, reservations, ftps] = await Promise.allSettled([
|
|
349
|
+
quotaResolver.getQuotaHeadroom(instanceTypes),
|
|
350
|
+
quotaResolver.getCapacityReservations(),
|
|
351
|
+
quotaResolver.getTrainingPlans()
|
|
352
|
+
]);
|
|
353
|
+
unfilteredRecs = applyAvailabilityRanking(unfilteredRecs, quotas.status === 'fulfilled' ? quotas.value : null, reservations.status === 'fulfilled' ? reservations.value : null, ftps.status === 'fulfilled' ? ftps.value : null);
|
|
354
|
+
} catch (err) {
|
|
355
|
+
log(`Quota resolution skipped (unfiltered path): ${err.message}`);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
333
358
|
|
|
334
359
|
return {
|
|
335
360
|
content: [{
|
|
336
361
|
type: 'text',
|
|
337
362
|
text: JSON.stringify({
|
|
338
|
-
values: { instanceType:
|
|
339
|
-
choices: { instanceType:
|
|
363
|
+
values: { instanceType: unfilteredRecs[0]?.instanceType || null },
|
|
364
|
+
choices: { instanceType: unfilteredRecs.map(r => r.instanceType) },
|
|
340
365
|
metadata: {
|
|
341
366
|
modelName,
|
|
342
|
-
|
|
343
|
-
dtype: null,
|
|
344
|
-
quantization: quantization || null,
|
|
345
|
-
estimatedVramGb: null,
|
|
346
|
-
vramBreakdown: null,
|
|
347
|
-
recommendations: allGpuInstances.map(instanceType => ({
|
|
348
|
-
instanceType,
|
|
349
|
-
gpuCount: effectiveCatalog[instanceType]?.gpus || 0,
|
|
350
|
-
totalVramGb: null,
|
|
351
|
-
utilizationPercent: null,
|
|
352
|
-
tensorParallelism: null,
|
|
353
|
-
costTier: null
|
|
354
|
-
})),
|
|
367
|
+
recommendations: unfilteredRecs,
|
|
355
368
|
source: 'unfiltered',
|
|
356
369
|
cudaVersionFilter: cudaVersion || null,
|
|
357
370
|
warning: `Could not resolve model metadata for "${modelName}". Returning all GPU instances without filtering.`
|
|
358
371
|
}
|
|
359
372
|
})
|
|
360
373
|
}]
|
|
361
|
-
}
|
|
374
|
+
};
|
|
362
375
|
}
|
|
363
376
|
|
|
364
377
|
// Step 2: Estimate VRAM
|
|
365
378
|
// Use model's max_position_embeddings as the sequence length when no explicit value is provided.
|
|
366
379
|
// This ensures KV cache is sized for the model's actual context window, not the 4096 default.
|
|
367
|
-
const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
|
|
380
|
+
const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined;
|
|
368
381
|
const vramEstimate = estimateVram({
|
|
369
382
|
parameterCount: modelMetadata.parameterCount,
|
|
370
383
|
dtype: modelMetadata.dtype,
|
|
371
384
|
quantization: quantization || undefined,
|
|
372
385
|
maxSequenceLength: resolvedMaxSeqLen,
|
|
373
386
|
batchSize: effectiveBatchSize || undefined
|
|
374
|
-
})
|
|
387
|
+
});
|
|
375
388
|
|
|
376
389
|
// Step 3: Filter and rank instances
|
|
377
390
|
let recommendations = filterAndRankInstances(
|
|
378
391
|
vramEstimate.vramGb,
|
|
379
392
|
effectiveCatalog,
|
|
380
393
|
{ limit }
|
|
381
|
-
)
|
|
394
|
+
);
|
|
382
395
|
|
|
383
396
|
// Step 3a: Quota & availability filtering (discover mode only)
|
|
384
|
-
let preQuotaFilterCount = 0
|
|
385
|
-
let allFilteredByQuota = false
|
|
386
|
-
let preQuotaRecommendations = []
|
|
397
|
+
let preQuotaFilterCount = 0;
|
|
398
|
+
let allFilteredByQuota = false;
|
|
399
|
+
let preQuotaRecommendations = [];
|
|
387
400
|
if (DISCOVER_MODE && recommendations.length > 0) {
|
|
388
401
|
try {
|
|
389
|
-
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
|
|
390
|
-
const quotaResolver = new QuotaResolver(region)
|
|
402
|
+
const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION;
|
|
403
|
+
const quotaResolver = new QuotaResolver(region);
|
|
391
404
|
|
|
392
|
-
const instanceTypes = recommendations.map(r => r.instanceType)
|
|
405
|
+
const instanceTypes = recommendations.map(r => r.instanceType);
|
|
393
406
|
const [quotas, reservations, ftps] = await Promise.allSettled([
|
|
394
407
|
quotaResolver.getQuotaHeadroom(instanceTypes),
|
|
395
408
|
quotaResolver.getCapacityReservations(),
|
|
396
409
|
quotaResolver.getTrainingPlans()
|
|
397
|
-
])
|
|
410
|
+
]);
|
|
398
411
|
|
|
399
|
-
preQuotaFilterCount = recommendations.length
|
|
400
|
-
preQuotaRecommendations = [...recommendations]
|
|
412
|
+
preQuotaFilterCount = recommendations.length;
|
|
413
|
+
preQuotaRecommendations = [...recommendations];
|
|
401
414
|
recommendations = applyAvailabilityRanking(
|
|
402
415
|
recommendations,
|
|
403
416
|
quotas.status === 'fulfilled' ? quotas.value : null,
|
|
404
417
|
reservations.status === 'fulfilled' ? reservations.value : null,
|
|
405
418
|
ftps.status === 'fulfilled' ? ftps.value : null
|
|
406
|
-
)
|
|
419
|
+
);
|
|
407
420
|
if (recommendations.length === 0 && preQuotaFilterCount > 0) {
|
|
408
|
-
allFilteredByQuota = true
|
|
421
|
+
allFilteredByQuota = true;
|
|
409
422
|
// Restore pre-filter recommendations so user can see compatible instances
|
|
410
423
|
// and request quota increases for the ones they want
|
|
411
|
-
recommendations = preQuotaRecommendations
|
|
412
|
-
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
|
|
424
|
+
recommendations = preQuotaRecommendations;
|
|
425
|
+
log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`);
|
|
413
426
|
}
|
|
414
427
|
} catch (err) {
|
|
415
428
|
// Graceful degradation: if credentials are missing or any unexpected
|
|
416
429
|
// error occurs, skip quota filtering and continue with unfiltered results
|
|
417
|
-
log(`Quota resolution skipped: ${err.message}`)
|
|
430
|
+
log(`Quota resolution skipped: ${err.message}`);
|
|
418
431
|
}
|
|
419
432
|
}
|
|
420
433
|
|
|
421
434
|
// Step 3b: If instanceSearch is also provided, further filter by tags
|
|
422
435
|
if (instanceSearch && recommendations.length > 0) {
|
|
423
|
-
const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
|
|
424
|
-
recommendations = recommendations.filter(r => searchMatches.has(r.instanceType))
|
|
436
|
+
const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }));
|
|
437
|
+
recommendations = recommendations.filter(r => searchMatches.has(r.instanceType));
|
|
425
438
|
}
|
|
426
439
|
|
|
427
440
|
// Step 4: Smart mode — query Bedrock for edge-case reasoning
|
|
428
|
-
let finalRecommendations = recommendations
|
|
429
|
-
let smartModeUsed = false
|
|
441
|
+
let finalRecommendations = recommendations;
|
|
442
|
+
let smartModeUsed = false;
|
|
430
443
|
|
|
431
444
|
if (SMART_MODE && recommendations.length > 0) {
|
|
432
|
-
log('[smart] Smart mode enabled, querying Amazon Bedrock...')
|
|
445
|
+
log('[smart] Smart mode enabled, querying Amazon Bedrock...');
|
|
433
446
|
|
|
434
447
|
const bedrockContext = {
|
|
435
448
|
modelName,
|
|
@@ -446,38 +459,38 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
446
459
|
tensorParallelism: r.tensorParallelism
|
|
447
460
|
})),
|
|
448
461
|
...(context || {})
|
|
449
|
-
}
|
|
462
|
+
};
|
|
450
463
|
|
|
451
464
|
const bedrockResult = await queryBedrock(
|
|
452
465
|
SERVER_CONFIG,
|
|
453
466
|
['instanceType'],
|
|
454
467
|
limit,
|
|
455
468
|
bedrockContext
|
|
456
|
-
)
|
|
469
|
+
);
|
|
457
470
|
|
|
458
471
|
if (bedrockResult?.values?.instanceType) {
|
|
459
|
-
const bedrockInstance = bedrockResult.values.instanceType
|
|
460
|
-
log(`[smart] Bedrock recommendation: ${bedrockInstance}`)
|
|
472
|
+
const bedrockInstance = bedrockResult.values.instanceType;
|
|
473
|
+
log(`[smart] Bedrock recommendation: ${bedrockInstance}`);
|
|
461
474
|
|
|
462
475
|
// Check if Bedrock's suggestion is already in our list
|
|
463
476
|
const existingIndex = finalRecommendations.findIndex(
|
|
464
477
|
r => r.instanceType === bedrockInstance
|
|
465
|
-
)
|
|
478
|
+
);
|
|
466
479
|
|
|
467
480
|
if (existingIndex > 0) {
|
|
468
481
|
// Move Bedrock's pick to the top
|
|
469
|
-
const [picked] = finalRecommendations.splice(existingIndex, 1)
|
|
470
|
-
finalRecommendations = [picked, ...finalRecommendations]
|
|
471
|
-
smartModeUsed = true
|
|
482
|
+
const [picked] = finalRecommendations.splice(existingIndex, 1);
|
|
483
|
+
finalRecommendations = [picked, ...finalRecommendations];
|
|
484
|
+
smartModeUsed = true;
|
|
472
485
|
} else if (existingIndex === 0) {
|
|
473
486
|
// Already at the top — Bedrock agrees with static
|
|
474
|
-
smartModeUsed = true
|
|
475
|
-
log('[smart] Bedrock agrees with static top recommendation')
|
|
487
|
+
smartModeUsed = true;
|
|
488
|
+
log('[smart] Bedrock agrees with static top recommendation');
|
|
476
489
|
} else {
|
|
477
490
|
// Bedrock suggested an instance not in our filtered list;
|
|
478
491
|
// verify it exists in the catalog before prepending
|
|
479
492
|
if (INSTANCE_CATALOG[bedrockInstance]) {
|
|
480
|
-
const catalogEntry = INSTANCE_CATALOG[bedrockInstance]
|
|
493
|
+
const catalogEntry = INSTANCE_CATALOG[bedrockInstance];
|
|
481
494
|
const bedrockRec = {
|
|
482
495
|
instanceType: bedrockInstance,
|
|
483
496
|
gpuCount: catalogEntry.gpus || 0,
|
|
@@ -485,24 +498,24 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
485
498
|
utilizationPercent: null,
|
|
486
499
|
tensorParallelism: catalogEntry.gpus || 1,
|
|
487
500
|
costTier: catalogEntry.costTier || null
|
|
488
|
-
}
|
|
489
|
-
finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit)
|
|
490
|
-
smartModeUsed = true
|
|
501
|
+
};
|
|
502
|
+
finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit);
|
|
503
|
+
smartModeUsed = true;
|
|
491
504
|
} else {
|
|
492
|
-
log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`)
|
|
505
|
+
log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`);
|
|
493
506
|
}
|
|
494
507
|
}
|
|
495
508
|
} else {
|
|
496
|
-
log('[smart] Bedrock did not return usable results, falling back to static recommendations')
|
|
509
|
+
log('[smart] Bedrock did not return usable results, falling back to static recommendations');
|
|
497
510
|
}
|
|
498
511
|
}
|
|
499
512
|
|
|
500
513
|
// Build response
|
|
501
514
|
const topRecommendation = finalRecommendations.length > 0
|
|
502
515
|
? finalRecommendations[0].instanceType
|
|
503
|
-
: null
|
|
516
|
+
: null;
|
|
504
517
|
|
|
505
|
-
const rankedList = finalRecommendations.map(r => r.instanceType)
|
|
518
|
+
const rankedList = finalRecommendations.map(r => r.instanceType);
|
|
506
519
|
|
|
507
520
|
return {
|
|
508
521
|
content: [{
|
|
@@ -524,7 +537,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
524
537
|
}
|
|
525
538
|
})
|
|
526
539
|
}]
|
|
527
|
-
}
|
|
540
|
+
};
|
|
528
541
|
}
|
|
529
542
|
|
|
530
543
|
// ── MCP Server setup ─────────────────────────────────────────────────────────
|
|
@@ -532,7 +545,7 @@ async function handleGetInstanceRecommendation(params) {
|
|
|
532
545
|
const server = new McpServer({
|
|
533
546
|
name: 'instance-sizer',
|
|
534
547
|
version: '1.0.0'
|
|
535
|
-
})
|
|
548
|
+
});
|
|
536
549
|
|
|
537
550
|
// Register the get_instance_recommendation tool
|
|
538
551
|
server.tool(
|
|
@@ -554,9 +567,9 @@ server.tool(
|
|
|
554
567
|
}).optional().describe('Additional deployment context')
|
|
555
568
|
},
|
|
556
569
|
async (params) => {
|
|
557
|
-
return handleGetInstanceRecommendation(params)
|
|
570
|
+
return handleGetInstanceRecommendation(params);
|
|
558
571
|
}
|
|
559
|
-
)
|
|
572
|
+
);
|
|
560
573
|
|
|
561
574
|
// Register alias tool name for backward compatibility
|
|
562
575
|
server.tool(
|
|
@@ -578,27 +591,27 @@ server.tool(
|
|
|
578
591
|
}).optional().describe('Additional deployment context')
|
|
579
592
|
},
|
|
580
593
|
async (params) => {
|
|
581
|
-
return handleGetInstanceRecommendation(params)
|
|
594
|
+
return handleGetInstanceRecommendation(params);
|
|
582
595
|
}
|
|
583
|
-
)
|
|
596
|
+
);
|
|
584
597
|
|
|
585
598
|
// ── Exports for testing ──────────────────────────────────────────────────────
|
|
586
599
|
|
|
587
|
-
export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion }
|
|
600
|
+
export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion };
|
|
588
601
|
|
|
589
602
|
// ── Transport connection (main module only) ──────────────────────────────────
|
|
590
603
|
|
|
591
|
-
const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
|
|
604
|
+
const isMain = process.argv[1] && resolve(process.argv[1]) === __filename;
|
|
592
605
|
|
|
593
606
|
if (isMain) {
|
|
594
607
|
if (SMART_MODE) {
|
|
595
|
-
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
|
|
608
|
+
log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`);
|
|
596
609
|
} else if (!DISCOVER_MODE) {
|
|
597
|
-
log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
|
|
610
|
+
log('Static mode (catalog-only, no network calls) — use --no-discover to force this');
|
|
598
611
|
} else {
|
|
599
|
-
log('Discover mode (HuggingFace API + quota lookups active)')
|
|
612
|
+
log('Discover mode (HuggingFace API + quota lookups active)');
|
|
600
613
|
}
|
|
601
614
|
|
|
602
|
-
const transport = new StdioServerTransport()
|
|
603
|
-
await server.connect(transport)
|
|
615
|
+
const transport = new StdioServerTransport();
|
|
616
|
+
await server.connect(transport);
|
|
604
617
|
}
|