@aws/ml-container-creator 0.10.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/LICENSE-THIRD-PARTY +9304 -0
  2. package/bin/cli.js +2 -0
  3. package/config/bootstrap-e2e-stack.json +341 -0
  4. package/config/bootstrap-stack.json +40 -3
  5. package/config/parameter-schema-v2.json +33 -22
  6. package/config/tune-catalog.json +1781 -0
  7. package/infra/ci-harness/buildspec.yml +1 -0
  8. package/infra/ci-harness/lambda/path-prover/brain.ts +306 -0
  9. package/infra/ci-harness/lambda/path-prover/write-results.ts +152 -0
  10. package/infra/ci-harness/lib/ci-harness-stack.ts +851 -7
  11. package/infra/ci-harness/state-machines/path-prover.asl.json +496 -0
  12. package/package.json +53 -67
  13. package/servers/base-image-picker/index.js +121 -121
  14. package/servers/e2e-status/index.js +297 -0
  15. package/servers/e2e-status/manifest.json +14 -0
  16. package/servers/e2e-status/package.json +15 -0
  17. package/servers/endpoint-picker/LICENSE +202 -0
  18. package/servers/endpoint-picker/index.js +536 -0
  19. package/servers/endpoint-picker/manifest.json +14 -0
  20. package/servers/endpoint-picker/package.json +18 -0
  21. package/servers/hyperpod-cluster-picker/index.js +125 -125
  22. package/servers/instance-sizer/index.js +166 -153
  23. package/servers/instance-sizer/lib/instance-ranker.js +120 -76
  24. package/servers/instance-sizer/lib/model-resolver.js +61 -61
  25. package/servers/instance-sizer/lib/quota-resolver.js +113 -113
  26. package/servers/instance-sizer/lib/vram-estimator.js +31 -31
  27. package/servers/lib/bedrock-client.js +38 -38
  28. package/servers/lib/catalogs/instances.json +27 -0
  29. package/servers/lib/catalogs/model-servers.json +201 -3
  30. package/servers/lib/custom-validators.js +13 -13
  31. package/servers/lib/dynamic-resolver.js +4 -4
  32. package/servers/marketplace-picker/index.js +342 -0
  33. package/servers/marketplace-picker/manifest.json +14 -0
  34. package/servers/marketplace-picker/package.json +18 -0
  35. package/servers/model-picker/index.js +382 -382
  36. package/servers/region-picker/index.js +56 -56
  37. package/servers/workload-picker/LICENSE +202 -0
  38. package/servers/workload-picker/catalogs/workload-profiles.json +67 -0
  39. package/servers/workload-picker/index.js +171 -0
  40. package/servers/workload-picker/manifest.json +16 -0
  41. package/servers/workload-picker/package.json +16 -0
  42. package/src/app.js +12 -3
  43. package/src/lib/bootstrap-command-handler.js +609 -15
  44. package/src/lib/bootstrap-config.js +36 -0
  45. package/src/lib/bootstrap-profile-manager.js +48 -41
  46. package/src/lib/ci-register-helpers.js +74 -0
  47. package/src/lib/config-loader.js +3 -0
  48. package/src/lib/config-manager.js +7 -0
  49. package/src/lib/config-validator.js +1 -1
  50. package/src/lib/cuda-resolver.js +17 -8
  51. package/src/lib/generated/cli-options.js +319 -314
  52. package/src/lib/generated/parameter-matrix.js +672 -661
  53. package/src/lib/generated/validation-rules.js +76 -72
  54. package/src/lib/path-prover-brain.js +664 -0
  55. package/src/lib/prompts/infrastructure-prompts.js +2 -2
  56. package/src/lib/prompts/model-prompts.js +6 -0
  57. package/src/lib/prompts/project-prompts.js +12 -0
  58. package/src/lib/secrets-prompt-runner.js +4 -0
  59. package/src/lib/template-manager.js +1 -1
  60. package/src/lib/template-variable-resolver.js +87 -1
  61. package/src/lib/tune-catalog-validator.js +37 -4
  62. package/templates/Dockerfile +9 -0
  63. package/templates/code/adapter_sidecar.py +444 -0
  64. package/templates/code/serve +6 -0
  65. package/templates/code/serve.d/vllm.ejs +1 -1
  66. package/templates/do/.benchmark_writer.py +1476 -0
  67. package/templates/do/.tune_helper.py +982 -57
  68. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  69. package/templates/do/adapter +154 -0
  70. package/templates/do/benchmark +639 -85
  71. package/templates/do/build +5 -0
  72. package/templates/do/clean.d/async-inference.ejs +5 -0
  73. package/templates/do/clean.d/batch-transform.ejs +5 -0
  74. package/templates/do/clean.d/hyperpod-eks.ejs +5 -0
  75. package/templates/do/clean.d/managed-inference.ejs +5 -0
  76. package/templates/do/config +115 -45
  77. package/templates/do/deploy.d/async-inference.ejs +30 -3
  78. package/templates/do/deploy.d/batch-transform.ejs +29 -3
  79. package/templates/do/deploy.d/hyperpod-eks.ejs +4 -0
  80. package/templates/do/deploy.d/managed-inference.ejs +216 -14
  81. package/templates/do/lib/endpoint-config.sh +1 -1
  82. package/templates/do/lib/profile.sh +44 -0
  83. package/templates/do/optimize +106 -37
  84. package/templates/do/push +5 -0
  85. package/templates/do/register +94 -0
  86. package/templates/do/stage +567 -0
  87. package/templates/do/submit +7 -0
  88. package/templates/do/test +14 -0
  89. package/templates/do/tune +382 -59
  90. package/templates/do/validate +44 -4
@@ -18,43 +18,43 @@
18
18
  * Returns: { values, choices, metadata }
19
19
  */
20
20
 
21
- import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
22
- import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
23
- import { z } from 'zod'
24
- import { readFileSync } from 'node:fs'
25
- import { fileURLToPath } from 'node:url'
26
- import { resolve, dirname } from 'node:path'
27
- import { resolveModelMetadata } from './lib/model-resolver.js'
28
- import { estimateVram } from './lib/vram-estimator.js'
29
- import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js'
30
- import { QuotaResolver } from './lib/quota-resolver.js'
31
- import { queryBedrock } from '../lib/bedrock-client.js'
21
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
22
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
23
+ import { z } from 'zod';
24
+ import { readFileSync } from 'node:fs';
25
+ import { fileURLToPath } from 'node:url';
26
+ import { resolve, dirname } from 'node:path';
27
+ import { resolveModelMetadata } from './lib/model-resolver.js';
28
+ import { estimateVram } from './lib/vram-estimator.js';
29
+ import { filterAndRankInstances, applyAvailabilityRanking } from './lib/instance-ranker.js';
30
+ import { QuotaResolver } from './lib/quota-resolver.js';
31
+ import { queryBedrock } from '../lib/bedrock-client.js';
32
32
 
33
33
  // ── Path setup ───────────────────────────────────────────────────────────────
34
34
 
35
- const __filename = fileURLToPath(import.meta.url)
36
- const __dirname = dirname(__filename)
35
+ const __filename = fileURLToPath(import.meta.url);
36
+ const __dirname = dirname(__filename);
37
37
 
38
38
  // ── Load instance catalog from shared lib ────────────────────────────────────
39
39
 
40
- let INSTANCE_CATALOG
40
+ let INSTANCE_CATALOG;
41
41
 
42
42
  try {
43
- const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json')
44
- const raw = readFileSync(catalogPath, 'utf8')
45
- const data = JSON.parse(raw)
46
- INSTANCE_CATALOG = data.catalog
43
+ const catalogPath = resolve(__dirname, '../lib/catalogs/instances.json');
44
+ const raw = readFileSync(catalogPath, 'utf8');
45
+ const data = JSON.parse(raw);
46
+ INSTANCE_CATALOG = data.catalog;
47
47
  } catch (err) {
48
- process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`)
49
- process.exit(1)
48
+ process.stderr.write(`[instance-sizer] Fatal: Failed to load instance catalog: ${err.message}\n`);
49
+ process.exit(1);
50
50
  }
51
51
 
52
52
  // ── Mode configuration ───────────────────────────────────────────────────────
53
53
 
54
- const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover')
55
- const SMART_MODE = process.env.BEDROCK_SMART === 'true'
56
- const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0'
57
- const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1'
54
+ const DISCOVER_MODE = process.env.DISCOVER_MODE !== 'false' && !process.argv.includes('--no-discover');
55
+ const SMART_MODE = process.env.BEDROCK_SMART === 'true';
56
+ const BEDROCK_MODEL = process.env.BEDROCK_MODEL || 'global.anthropic.claude-sonnet-4-20250514-v1:0';
57
+ const BEDROCK_REGION = process.env.BEDROCK_REGION || process.env.AWS_REGION || 'us-east-1';
58
58
 
59
59
  // ── Bedrock server config ─────────────────────────────────────────────────────
60
60
 
@@ -95,7 +95,7 @@ Rules:
95
95
  maxTokens: 1024,
96
96
  modelId: BEDROCK_MODEL,
97
97
  region: BEDROCK_REGION
98
- }
98
+ };
99
99
 
100
100
  // ── Logging ──────────────────────────────────────────────────────────────────
101
101
 
@@ -103,7 +103,7 @@ Rules:
103
103
  * Log to stderr so it doesn't interfere with MCP stdio protocol on stdout.
104
104
  */
105
105
  function log(message) {
106
- process.stderr.write(`[instance-sizer] ${message}\n`)
106
+ process.stderr.write(`[instance-sizer] ${message}\n`);
107
107
  }
108
108
 
109
109
  // ── Tag-based search filtering ───────────────────────────────────────────────
@@ -119,76 +119,76 @@ function log(message) {
119
119
  * @returns {string[]} Matching instance type names, sorted by relevance
120
120
  */
121
121
  function searchInstancesByTag(search, instanceCatalog, options = {}) {
122
- const { limit = 10 } = options
123
- const candidates = Object.entries(instanceCatalog)
122
+ const { limit = 10 } = options;
123
+ const candidates = Object.entries(instanceCatalog);
124
124
 
125
125
  // Tokenize search into lowercase keywords
126
- const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean)
126
+ const tokens = search.toLowerCase().split(/[\s,\-_]+/).filter(Boolean);
127
127
 
128
128
  // Detect compound terms
129
- const rawLower = search.toLowerCase()
130
- const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu')
129
+ const rawLower = search.toLowerCase();
130
+ const wantsMultiGpu = rawLower.includes('multi gpu') || rawLower.includes('multi-gpu') || rawLower.includes('multigpu');
131
131
 
132
132
  // Detect CUDA version requests: "cuda 12", "cuda 11.8", "cuda-12.1"
133
- const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/)
134
- const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null
133
+ const cudaMatch = rawLower.match(/cuda[\s\-_]*(\d+(?:\.\d+)?)/);
134
+ const wantsCudaVersion = cudaMatch ? cudaMatch[1] : null;
135
135
 
136
136
  // Score each instance
137
137
  const scored = candidates.map(([name, meta]) => {
138
- let score = 0
139
- const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : ''
140
- const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ')
138
+ let score = 0;
139
+ const cudaStr = meta.cudaVersions ? meta.cudaVersions.join(' ') : '';
140
+ const haystack = [...(meta.tags || []), (meta.accelerator || '').toLowerCase(), name, meta.category || '', cudaStr].join(' ');
141
141
 
142
142
  // Compound term: multi-gpu
143
143
  if (wantsMultiGpu) {
144
144
  if (meta.gpus > 1) {
145
- score += 5
145
+ score += 5;
146
146
  } else {
147
- return { name, meta, score: 0 }
147
+ return { name, meta, score: 0 };
148
148
  }
149
149
  }
150
150
 
151
151
  // Compound term: cuda version
152
152
  if (wantsCudaVersion) {
153
- if (!meta.cudaVersions) return { name, meta, score: 0 }
154
- const hasExact = meta.cudaVersions.includes(wantsCudaVersion)
155
- const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion))
153
+ if (!meta.cudaVersions) return { name, meta, score: 0 };
154
+ const hasExact = meta.cudaVersions.includes(wantsCudaVersion);
155
+ const hasMajor = meta.cudaVersions.some(v => v.startsWith(wantsCudaVersion));
156
156
  if (hasExact) {
157
- score += 4
157
+ score += 4;
158
158
  } else if (hasMajor) {
159
- score += 3
159
+ score += 3;
160
160
  } else {
161
- return { name, meta, score: 0 }
161
+ return { name, meta, score: 0 };
162
162
  }
163
163
  }
164
164
 
165
165
  for (const token of tokens) {
166
- if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue
167
- if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue
166
+ if (wantsMultiGpu && (token === 'multi' || token === 'gpu')) continue;
167
+ if (wantsCudaVersion && (token === 'cuda' || token === wantsCudaVersion)) continue;
168
168
 
169
- if (haystack.includes(token)) score += 1
170
- if (meta.gpus > 1 && token === 'parallel') score += 2
171
- if (token === 'gpu' && meta.gpus > 0) score += 1
172
- if (token === 'cpu' && meta.gpus === 0) score += 1
169
+ if (haystack.includes(token)) score += 1;
170
+ if (meta.gpus > 1 && token === 'parallel') score += 2;
171
+ if (token === 'gpu' && meta.gpus > 0) score += 1;
172
+ if (token === 'cpu' && meta.gpus === 0) score += 1;
173
173
  if (token === 'cheap' || token === 'budget' || token === 'cost') {
174
- if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1
174
+ if ((meta.tags || []).includes('budget') || (meta.tags || []).includes('cost-effective')) score += 1;
175
175
  }
176
176
  if (token === 'memory' || token === 'high-memory') {
177
- if (meta.memGb >= 32) score += 1
177
+ if (meta.memGb >= 32) score += 1;
178
178
  }
179
- if (token === 'large' && meta.vcpus >= 16) score += 1
180
- if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2
179
+ if (token === 'large' && meta.vcpus >= 16) score += 1;
180
+ if (meta.cudaVersions && meta.cudaVersions.includes(token)) score += 2;
181
181
  }
182
- return { name, meta, score }
183
- })
182
+ return { name, meta, score };
183
+ });
184
184
 
185
- const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score)
185
+ const matched = scored.filter(s => s.score > 0).sort((a, b) => b.score - a.score);
186
186
 
187
187
  if (matched.length === 0) {
188
- return []
188
+ return [];
189
189
  }
190
190
 
191
- return matched.slice(0, limit).map(s => s.name)
191
+ return matched.slice(0, limit).map(s => s.name);
192
192
  }
193
193
 
194
194
  // ── CUDA version filtering ───────────────────────────────────────────────────
@@ -201,22 +201,22 @@ function searchInstancesByTag(search, instanceCatalog, options = {}) {
201
201
  * @returns {object} Filtered instance catalog
202
202
  */
203
203
  function filterByCudaVersion(instanceCatalog, requiredCuda) {
204
- const majorRequired = requiredCuda.split('.')[0]
205
- const filtered = {}
204
+ const majorRequired = requiredCuda.split('.')[0];
205
+ const filtered = {};
206
206
 
207
207
  for (const [name, meta] of Object.entries(instanceCatalog)) {
208
- if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue
208
+ if (!meta.cudaVersions || meta.cudaVersions.length === 0) continue;
209
209
  const hasCompatible = meta.cudaVersions.some(v => {
210
- if (v === requiredCuda) return true
211
- if (v.startsWith(majorRequired + '.')) return true
212
- return false
213
- })
210
+ if (v === requiredCuda) return true;
211
+ if (v.startsWith(`${majorRequired }.`)) return true;
212
+ return false;
213
+ });
214
214
  if (hasCompatible) {
215
- filtered[name] = meta
215
+ filtered[name] = meta;
216
216
  }
217
217
  }
218
218
 
219
- return filtered
219
+ return filtered;
220
220
  }
221
221
 
222
222
  // ── Tool handler ─────────────────────────────────────────────────────────────
@@ -239,26 +239,26 @@ async function handleGetInstanceRecommendation(params) {
239
239
  cudaVersion,
240
240
  limit = 10,
241
241
  context
242
- } = params
242
+ } = params;
243
243
 
244
244
  // Apply profile ENV overrides to sequence length and batch size
245
- let effectiveMaxSeqLen = maxSequenceLength
246
- let effectiveBatchSize = batchSize
245
+ let effectiveMaxSeqLen = maxSequenceLength;
246
+ let effectiveBatchSize = batchSize;
247
247
  if (context?.profileEnvVars) {
248
248
  if (context.profileEnvVars.VLLM_MAX_MODEL_LEN) {
249
- effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen
249
+ effectiveMaxSeqLen = parseInt(context.profileEnvVars.VLLM_MAX_MODEL_LEN, 10) || effectiveMaxSeqLen;
250
250
  }
251
251
  if (context.profileEnvVars.VLLM_MAX_NUM_SEQS) {
252
- effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize
252
+ effectiveBatchSize = parseInt(context.profileEnvVars.VLLM_MAX_NUM_SEQS, 10) || effectiveBatchSize;
253
253
  }
254
254
  }
255
255
 
256
256
  // Apply CUDA version filtering to instance catalog
257
- let effectiveCatalog = INSTANCE_CATALOG
257
+ let effectiveCatalog = INSTANCE_CATALOG;
258
258
  if (cudaVersion) {
259
- effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion)
259
+ effectiveCatalog = filterByCudaVersion(INSTANCE_CATALOG, cudaVersion);
260
260
  if (Object.keys(effectiveCatalog).length === 0) {
261
- log(`CUDA version ${cudaVersion} filter eliminated all instances`)
261
+ log(`CUDA version ${cudaVersion} filter eliminated all instances`);
262
262
  return {
263
263
  content: [{
264
264
  type: 'text',
@@ -272,13 +272,13 @@ async function handleGetInstanceRecommendation(params) {
272
272
  }
273
273
  })
274
274
  }]
275
- }
275
+ };
276
276
  }
277
277
  }
278
278
 
279
279
  // Mode: tag-based search only (no model name)
280
280
  if (!modelName && instanceSearch) {
281
- const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit })
281
+ const searchResults = searchInstancesByTag(instanceSearch, effectiveCatalog, { limit });
282
282
  return {
283
283
  content: [{
284
284
  type: 'text',
@@ -293,14 +293,14 @@ async function handleGetInstanceRecommendation(params) {
293
293
  }
294
294
  })
295
295
  }]
296
- }
296
+ };
297
297
  }
298
298
 
299
299
  // Mode: no model name and no search — return all GPU instances
300
300
  if (!modelName) {
301
301
  const allGpuInstances = Object.keys(effectiveCatalog)
302
302
  .filter(key => effectiveCatalog[key].category === 'gpu')
303
- .slice(0, limit)
303
+ .slice(0, limit);
304
304
 
305
305
  return {
306
306
  content: [{
@@ -316,120 +316,133 @@ async function handleGetInstanceRecommendation(params) {
316
316
  }
317
317
  })
318
318
  }]
319
- }
319
+ };
320
320
  }
321
321
 
322
322
  // Step 1: Resolve model metadata
323
323
  const modelMetadata = await resolveModelMetadata(modelName, {
324
324
  discover: DISCOVER_MODE
325
- })
325
+ });
326
326
 
327
327
  // If model metadata cannot be resolved, return all GPU instances unfiltered
328
328
  if (!modelMetadata) {
329
- log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`)
330
- const allGpuInstances = Object.keys(effectiveCatalog)
329
+ log(`Model metadata not found for "${modelName}", returning unfiltered GPU instances`);
330
+ let unfilteredRecs = Object.keys(effectiveCatalog)
331
331
  .filter(key => effectiveCatalog[key].category === 'gpu')
332
332
  .slice(0, limit)
333
+ .map(instanceType => ({
334
+ instanceType,
335
+ gpuCount: effectiveCatalog[instanceType]?.gpus || 0,
336
+ totalVramGb: null,
337
+ utilizationPercent: null,
338
+ tensorParallelism: null,
339
+ costTier: null
340
+ }));
341
+
342
+ // Still apply availability ranking so quota/FTP info is displayed
343
+ if (DISCOVER_MODE && unfilteredRecs.length > 0) {
344
+ try {
345
+ const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION;
346
+ const quotaResolver = new QuotaResolver(region);
347
+ const instanceTypes = unfilteredRecs.map(r => r.instanceType);
348
+ const [quotas, reservations, ftps] = await Promise.allSettled([
349
+ quotaResolver.getQuotaHeadroom(instanceTypes),
350
+ quotaResolver.getCapacityReservations(),
351
+ quotaResolver.getTrainingPlans()
352
+ ]);
353
+ unfilteredRecs = applyAvailabilityRanking(unfilteredRecs, quotas.status === 'fulfilled' ? quotas.value : null, reservations.status === 'fulfilled' ? reservations.value : null, ftps.status === 'fulfilled' ? ftps.value : null);
354
+ } catch (err) {
355
+ log(`Quota resolution skipped (unfiltered path): ${err.message}`);
356
+ }
357
+ }
333
358
 
334
359
  return {
335
360
  content: [{
336
361
  type: 'text',
337
362
  text: JSON.stringify({
338
- values: { instanceType: allGpuInstances[0] || null },
339
- choices: { instanceType: allGpuInstances },
363
+ values: { instanceType: unfilteredRecs[0]?.instanceType || null },
364
+ choices: { instanceType: unfilteredRecs.map(r => r.instanceType) },
340
365
  metadata: {
341
366
  modelName,
342
- parameterCount: null,
343
- dtype: null,
344
- quantization: quantization || null,
345
- estimatedVramGb: null,
346
- vramBreakdown: null,
347
- recommendations: allGpuInstances.map(instanceType => ({
348
- instanceType,
349
- gpuCount: effectiveCatalog[instanceType]?.gpus || 0,
350
- totalVramGb: null,
351
- utilizationPercent: null,
352
- tensorParallelism: null,
353
- costTier: null
354
- })),
367
+ recommendations: unfilteredRecs,
355
368
  source: 'unfiltered',
356
369
  cudaVersionFilter: cudaVersion || null,
357
370
  warning: `Could not resolve model metadata for "${modelName}". Returning all GPU instances without filtering.`
358
371
  }
359
372
  })
360
373
  }]
361
- }
374
+ };
362
375
  }
363
376
 
364
377
  // Step 2: Estimate VRAM
365
378
  // Use model's max_position_embeddings as the sequence length when no explicit value is provided.
366
379
  // This ensures KV cache is sized for the model's actual context window, not the 4096 default.
367
- const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
380
+ const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined;
368
381
  const vramEstimate = estimateVram({
369
382
  parameterCount: modelMetadata.parameterCount,
370
383
  dtype: modelMetadata.dtype,
371
384
  quantization: quantization || undefined,
372
385
  maxSequenceLength: resolvedMaxSeqLen,
373
386
  batchSize: effectiveBatchSize || undefined
374
- })
387
+ });
375
388
 
376
389
  // Step 3: Filter and rank instances
377
390
  let recommendations = filterAndRankInstances(
378
391
  vramEstimate.vramGb,
379
392
  effectiveCatalog,
380
393
  { limit }
381
- )
394
+ );
382
395
 
383
396
  // Step 3a: Quota & availability filtering (discover mode only)
384
- let preQuotaFilterCount = 0
385
- let allFilteredByQuota = false
386
- let preQuotaRecommendations = []
397
+ let preQuotaFilterCount = 0;
398
+ let allFilteredByQuota = false;
399
+ let preQuotaRecommendations = [];
387
400
  if (DISCOVER_MODE && recommendations.length > 0) {
388
401
  try {
389
- const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION
390
- const quotaResolver = new QuotaResolver(region)
402
+ const region = process.env.AWS_REGION || process.env.AWS_DEFAULT_REGION || BEDROCK_REGION;
403
+ const quotaResolver = new QuotaResolver(region);
391
404
 
392
- const instanceTypes = recommendations.map(r => r.instanceType)
405
+ const instanceTypes = recommendations.map(r => r.instanceType);
393
406
  const [quotas, reservations, ftps] = await Promise.allSettled([
394
407
  quotaResolver.getQuotaHeadroom(instanceTypes),
395
408
  quotaResolver.getCapacityReservations(),
396
409
  quotaResolver.getTrainingPlans()
397
- ])
410
+ ]);
398
411
 
399
- preQuotaFilterCount = recommendations.length
400
- preQuotaRecommendations = [...recommendations]
412
+ preQuotaFilterCount = recommendations.length;
413
+ preQuotaRecommendations = [...recommendations];
401
414
  recommendations = applyAvailabilityRanking(
402
415
  recommendations,
403
416
  quotas.status === 'fulfilled' ? quotas.value : null,
404
417
  reservations.status === 'fulfilled' ? reservations.value : null,
405
418
  ftps.status === 'fulfilled' ? ftps.value : null
406
- )
419
+ );
407
420
  if (recommendations.length === 0 && preQuotaFilterCount > 0) {
408
- allFilteredByQuota = true
421
+ allFilteredByQuota = true;
409
422
  // Restore pre-filter recommendations so user can see compatible instances
410
423
  // and request quota increases for the ones they want
411
- recommendations = preQuotaRecommendations
412
- log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`)
424
+ recommendations = preQuotaRecommendations;
425
+ log(`All ${preQuotaFilterCount} instances filtered by zero-quota — restoring unfiltered list`);
413
426
  }
414
427
  } catch (err) {
415
428
  // Graceful degradation: if credentials are missing or any unexpected
416
429
  // error occurs, skip quota filtering and continue with unfiltered results
417
- log(`Quota resolution skipped: ${err.message}`)
430
+ log(`Quota resolution skipped: ${err.message}`);
418
431
  }
419
432
  }
420
433
 
421
434
  // Step 3b: If instanceSearch is also provided, further filter by tags
422
435
  if (instanceSearch && recommendations.length > 0) {
423
- const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }))
424
- recommendations = recommendations.filter(r => searchMatches.has(r.instanceType))
436
+ const searchMatches = new Set(searchInstancesByTag(instanceSearch, effectiveCatalog, { limit: 100 }));
437
+ recommendations = recommendations.filter(r => searchMatches.has(r.instanceType));
425
438
  }
426
439
 
427
440
  // Step 4: Smart mode — query Bedrock for edge-case reasoning
428
- let finalRecommendations = recommendations
429
- let smartModeUsed = false
441
+ let finalRecommendations = recommendations;
442
+ let smartModeUsed = false;
430
443
 
431
444
  if (SMART_MODE && recommendations.length > 0) {
432
- log('[smart] Smart mode enabled, querying Amazon Bedrock...')
445
+ log('[smart] Smart mode enabled, querying Amazon Bedrock...');
433
446
 
434
447
  const bedrockContext = {
435
448
  modelName,
@@ -446,38 +459,38 @@ async function handleGetInstanceRecommendation(params) {
446
459
  tensorParallelism: r.tensorParallelism
447
460
  })),
448
461
  ...(context || {})
449
- }
462
+ };
450
463
 
451
464
  const bedrockResult = await queryBedrock(
452
465
  SERVER_CONFIG,
453
466
  ['instanceType'],
454
467
  limit,
455
468
  bedrockContext
456
- )
469
+ );
457
470
 
458
471
  if (bedrockResult?.values?.instanceType) {
459
- const bedrockInstance = bedrockResult.values.instanceType
460
- log(`[smart] Bedrock recommendation: ${bedrockInstance}`)
472
+ const bedrockInstance = bedrockResult.values.instanceType;
473
+ log(`[smart] Bedrock recommendation: ${bedrockInstance}`);
461
474
 
462
475
  // Check if Bedrock's suggestion is already in our list
463
476
  const existingIndex = finalRecommendations.findIndex(
464
477
  r => r.instanceType === bedrockInstance
465
- )
478
+ );
466
479
 
467
480
  if (existingIndex > 0) {
468
481
  // Move Bedrock's pick to the top
469
- const [picked] = finalRecommendations.splice(existingIndex, 1)
470
- finalRecommendations = [picked, ...finalRecommendations]
471
- smartModeUsed = true
482
+ const [picked] = finalRecommendations.splice(existingIndex, 1);
483
+ finalRecommendations = [picked, ...finalRecommendations];
484
+ smartModeUsed = true;
472
485
  } else if (existingIndex === 0) {
473
486
  // Already at the top — Bedrock agrees with static
474
- smartModeUsed = true
475
- log('[smart] Bedrock agrees with static top recommendation')
487
+ smartModeUsed = true;
488
+ log('[smart] Bedrock agrees with static top recommendation');
476
489
  } else {
477
490
  // Bedrock suggested an instance not in our filtered list;
478
491
  // verify it exists in the catalog before prepending
479
492
  if (INSTANCE_CATALOG[bedrockInstance]) {
480
- const catalogEntry = INSTANCE_CATALOG[bedrockInstance]
493
+ const catalogEntry = INSTANCE_CATALOG[bedrockInstance];
481
494
  const bedrockRec = {
482
495
  instanceType: bedrockInstance,
483
496
  gpuCount: catalogEntry.gpus || 0,
@@ -485,24 +498,24 @@ async function handleGetInstanceRecommendation(params) {
485
498
  utilizationPercent: null,
486
499
  tensorParallelism: catalogEntry.gpus || 1,
487
500
  costTier: catalogEntry.costTier || null
488
- }
489
- finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit)
490
- smartModeUsed = true
501
+ };
502
+ finalRecommendations = [bedrockRec, ...finalRecommendations].slice(0, limit);
503
+ smartModeUsed = true;
491
504
  } else {
492
- log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`)
505
+ log(`[smart] Bedrock suggested unknown instance "${bedrockInstance}", ignoring`);
493
506
  }
494
507
  }
495
508
  } else {
496
- log('[smart] Bedrock did not return usable results, falling back to static recommendations')
509
+ log('[smart] Bedrock did not return usable results, falling back to static recommendations');
497
510
  }
498
511
  }
499
512
 
500
513
  // Build response
501
514
  const topRecommendation = finalRecommendations.length > 0
502
515
  ? finalRecommendations[0].instanceType
503
- : null
516
+ : null;
504
517
 
505
- const rankedList = finalRecommendations.map(r => r.instanceType)
518
+ const rankedList = finalRecommendations.map(r => r.instanceType);
506
519
 
507
520
  return {
508
521
  content: [{
@@ -524,7 +537,7 @@ async function handleGetInstanceRecommendation(params) {
524
537
  }
525
538
  })
526
539
  }]
527
- }
540
+ };
528
541
  }
529
542
 
530
543
  // ── MCP Server setup ─────────────────────────────────────────────────────────
@@ -532,7 +545,7 @@ async function handleGetInstanceRecommendation(params) {
532
545
  const server = new McpServer({
533
546
  name: 'instance-sizer',
534
547
  version: '1.0.0'
535
- })
548
+ });
536
549
 
537
550
  // Register the get_instance_recommendation tool
538
551
  server.tool(
@@ -554,9 +567,9 @@ server.tool(
554
567
  }).optional().describe('Additional deployment context')
555
568
  },
556
569
  async (params) => {
557
- return handleGetInstanceRecommendation(params)
570
+ return handleGetInstanceRecommendation(params);
558
571
  }
559
- )
572
+ );
560
573
 
561
574
  // Register alias tool name for backward compatibility
562
575
  server.tool(
@@ -578,27 +591,27 @@ server.tool(
578
591
  }).optional().describe('Additional deployment context')
579
592
  },
580
593
  async (params) => {
581
- return handleGetInstanceRecommendation(params)
594
+ return handleGetInstanceRecommendation(params);
582
595
  }
583
- )
596
+ );
584
597
 
585
598
  // ── Exports for testing ──────────────────────────────────────────────────────
586
599
 
587
- export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion }
600
+ export { handleGetInstanceRecommendation, INSTANCE_CATALOG, SERVER_CONFIG, server, searchInstancesByTag, filterByCudaVersion };
588
601
 
589
602
  // ── Transport connection (main module only) ──────────────────────────────────
590
603
 
591
- const isMain = process.argv[1] && resolve(process.argv[1]) === __filename
604
+ const isMain = process.argv[1] && resolve(process.argv[1]) === __filename;
592
605
 
593
606
  if (isMain) {
594
607
  if (SMART_MODE) {
595
- log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`)
608
+ log(`Smart mode enabled (model: ${BEDROCK_MODEL}, region: ${BEDROCK_REGION})`);
596
609
  } else if (!DISCOVER_MODE) {
597
- log('Static mode (catalog-only, no network calls) — use --no-discover to force this')
610
+ log('Static mode (catalog-only, no network calls) — use --no-discover to force this');
598
611
  } else {
599
- log('Discover mode (HuggingFace API + quota lookups active)')
612
+ log('Discover mode (HuggingFace API + quota lookups active)');
600
613
  }
601
614
 
602
- const transport = new StdioServerTransport()
603
- await server.connect(transport)
615
+ const transport = new StdioServerTransport();
616
+ await server.connect(transport);
604
617
  }