@aws/ml-container-creator 0.2.6 → 0.3.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/bin/cli.js CHANGED
@@ -90,6 +90,9 @@ program
90
90
 
91
91
  // --- Authentication ---
92
92
  .addOption(new Option('--hf-token <token>', 'HuggingFace token (or "$HF_TOKEN" for env var reference)'))
93
+ .addOption(new Option('--hf-token-arn <arn>', 'HuggingFace token ARN from Secrets Manager'))
94
+ .addOption(new Option('--ngc-token <token>', 'NVIDIA NGC token (or "$NGC_API_KEY" for env var reference)'))
95
+ .addOption(new Option('--ngc-token-arn <arn>', 'NVIDIA NGC token ARN from Secrets Manager'))
93
96
 
94
97
  // --- Optional Features ---
95
98
  .addOption(new Option('--include-sample', 'Include sample model code'))
@@ -106,7 +109,18 @@ program
106
109
  .addOption(new Option('--validate-with-docker', 'Enable Docker introspection validation (opt-in)'))
107
110
  .addOption(new Option('--offline', 'Disable HuggingFace API lookups'))
108
111
 
109
- .action((projectNameArgs, options) => run(projectNameArgs?.[0] || null, options));
112
+ .action((projectNameArgs, options) => {
113
+ // Mutual exclusion validation: plaintext token and ARN flags cannot both be provided
114
+ if (options.hfToken && options.hfTokenArn) {
115
+ console.error('❌ Cannot specify both --hf-token and --hf-token-arn. Use one or the other.');
116
+ process.exit(1);
117
+ }
118
+ if (options.ngcToken && options.ngcTokenArn) {
119
+ console.error('❌ Cannot specify both --ngc-token and --ngc-token-arn. Use one or the other.');
120
+ process.exit(1);
121
+ }
122
+ return run(projectNameArgs?.[0] || null, options);
123
+ });
110
124
 
111
125
  // Custom help formatting — group options into logical sections (root command only)
112
126
  program.configureHelp({
@@ -174,7 +188,7 @@ program.configureHelp({
174
188
  groups.hyperpod.push(opt);
175
189
  } else if (['--model-env', '--server-env'].includes(long)) {
176
190
  groups.env.push(opt);
177
- } else if (['--hf-token'].includes(long)) {
191
+ } else if (['--hf-token', '--hf-token-arn', '--ngc-token', '--ngc-token-arn'].includes(long)) {
178
192
  groups.auth.push(opt);
179
193
  } else if (['--include-sample', '--include-testing', '--test-types'].includes(long)) {
180
194
  groups.features.push(opt);
@@ -255,6 +269,7 @@ program
255
269
  .option('--ci', 'Provision CI integration infrastructure')
256
270
  .option('--skip-ci', 'Skip CI infrastructure provisioning')
257
271
  .option('--skip-s3', 'Skip S3 bucket creation')
272
+ .option('--skip-post-setup', 'Skip post-setup chain (mcp init, sync-architectures, sync-schemas)')
258
273
  .action(async (action, args, options) => {
259
274
  const { default: BootstrapCommandHandler } = await import('../src/lib/bootstrap-command-handler.js');
260
275
  const handler = new BootstrapCommandHandler();
@@ -314,12 +329,33 @@ program
314
329
  .option('--project', 'Use project-level registry')
315
330
  .option('--parameters <json>', 'Parameters JSON string')
316
331
  .option('--generator-version <version>', 'Generator version')
332
+ // Options used by `registry list-architectures`
333
+ .option('--server <name>', 'Filter by server name (for list-architectures)')
334
+ .option('--verbose', 'Show full list of supported model types (for list-architectures)')
317
335
  .action(async (action, args, options) => {
318
336
  const { default: RegistryCommandHandler } = await import('../src/lib/registry-command-handler.js');
319
337
  const handler = new RegistryCommandHandler();
320
338
  await handler.handle([action, ...args], options);
321
339
  });
322
340
 
341
+ program
342
+ .command('secrets')
343
+ .description('Manage secrets in AWS Secrets Manager (create, list, describe)')
344
+ .argument('[action]', 'Secrets action (create, list, describe)')
345
+ .argument('[args...]', 'Additional arguments')
346
+ .option('--type <type>', 'Secret type (e.g., hf-token, ngc-token)')
347
+ .option('--name <label>', 'Secret label (used in naming convention)')
348
+ .option('--secret-value <value>', 'Secret value (masked in terminal)')
349
+ .option('--description <text>', 'Secret description')
350
+ .option('--kms-key-id <key>', 'KMS key for encryption')
351
+ .option('--json <json-or-path>', 'JSON input (inline or file://path)')
352
+ .action(async (action, args, options) => {
353
+ const { default: SecretsCommandHandler } = await import('../src/lib/secrets-command-handler.js');
354
+ const handler = new SecretsCommandHandler();
355
+ const allArgs = action ? [action, ...args] : [];
356
+ await handler.handle(allArgs, options);
357
+ });
358
+
323
359
  program
324
360
  .command('configure')
325
361
  .description('Interactive configuration setup (experimental)')
@@ -105,6 +105,20 @@
105
105
  "arn:aws:s3:::ml-container-creator-*",
106
106
  "arn:aws:s3:::ml-container-creator-*/*"
107
107
  ]
108
+ },
109
+ {
110
+ "Sid": "SecretsManagerRead",
111
+ "Effect": "Allow",
112
+ "Action": [
113
+ "secretsmanager:GetSecretValue",
114
+ "secretsmanager:DescribeSecret"
115
+ ],
116
+ "Resource": "arn:aws:secretsmanager:*:*:secret:mlcc/*",
117
+ "Condition": {
118
+ "StringEquals": {
119
+ "aws:ResourceTag/mlcc:managed-by": "ml-container-creator"
120
+ }
121
+ }
108
122
  }
109
123
  ]
110
124
  }
@@ -48,7 +48,6 @@
48
48
  "semver"
49
49
  ],
50
50
  "license": "Apache-2.0",
51
- "peer": true,
52
51
  "dependencies": {
53
52
  "jsonschema": "~1.4.1",
54
53
  "semver": "^7.7.4"
@@ -2151,7 +2150,6 @@
2151
2150
  "integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==",
2152
2151
  "dev": true,
2153
2152
  "license": "MIT",
2154
- "peer": true,
2155
2153
  "dependencies": {
2156
2154
  "undici-types": "~6.21.0"
2157
2155
  }
@@ -2791,8 +2789,7 @@
2791
2789
  "version": "10.6.0",
2792
2790
  "resolved": "https://registry.npmjs.org/constructs/-/constructs-10.6.0.tgz",
2793
2791
  "integrity": "sha512-TxHOnBO5zMo/G76ykzGF/wMpEHu257TbWiIxP9K0Yv/+t70UzgBQiTqjkAsWOPC6jW91DzJI0+ehQV6xDRNBuQ==",
2794
- "license": "Apache-2.0",
2795
- "peer": true
2792
+ "license": "Apache-2.0"
2796
2793
  },
2797
2794
  "node_modules/create-require": {
2798
2795
  "version": "1.1.1",
@@ -2937,9 +2934,9 @@
2937
2934
  }
2938
2935
  },
2939
2936
  "node_modules/fast-xml-builder": {
2940
- "version": "1.1.5",
2941
- "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.5.tgz",
2942
- "integrity": "sha512-4TJn/8FKLeslLAH3dnohXqE3QSoxkhvaMzepOIZytwJXZO69Bfz0HBdDHzOTOon6G59Zrk6VQ2bEiv1t61rfkA==",
2937
+ "version": "1.2.0",
2938
+ "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.2.0.tgz",
2939
+ "integrity": "sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==",
2943
2940
  "dev": true,
2944
2941
  "funding": [
2945
2942
  {
@@ -2949,7 +2946,8 @@
2949
2946
  ],
2950
2947
  "license": "MIT",
2951
2948
  "dependencies": {
2952
- "path-expression-matcher": "^1.1.3"
2949
+ "path-expression-matcher": "^1.5.0",
2950
+ "xml-naming": "^0.1.0"
2953
2951
  }
2954
2952
  },
2955
2953
  "node_modules/fast-xml-parser": {
@@ -3696,7 +3694,6 @@
3696
3694
  "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
3697
3695
  "dev": true,
3698
3696
  "license": "Apache-2.0",
3699
- "peer": true,
3700
3697
  "bin": {
3701
3698
  "tsc": "bin/tsc",
3702
3699
  "tsserver": "bin/tsserver"
@@ -3837,6 +3834,22 @@
3837
3834
  "url": "https://github.com/chalk/ansi-styles?sponsor=1"
3838
3835
  }
3839
3836
  },
3837
+ "node_modules/xml-naming": {
3838
+ "version": "0.1.0",
3839
+ "resolved": "https://registry.npmjs.org/xml-naming/-/xml-naming-0.1.0.tgz",
3840
+ "integrity": "sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==",
3841
+ "dev": true,
3842
+ "funding": [
3843
+ {
3844
+ "type": "github",
3845
+ "url": "https://github.com/sponsors/NaturalIntelligence"
3846
+ }
3847
+ ],
3848
+ "license": "MIT",
3849
+ "engines": {
3850
+ "node": ">=16.0.0"
3851
+ }
3852
+ },
3840
3853
  "node_modules/y18n": {
3841
3854
  "version": "5.0.8",
3842
3855
  "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aws/ml-container-creator",
3
- "version": "0.2.6",
3
+ "version": "0.3.0",
4
4
  "description": "Generator for SageMaker AI BYOC paradigm for predictive inference use-cases.",
5
5
  "type": "module",
6
6
  "main": "src/app.js",
@@ -114,11 +114,11 @@ function log(message) {
114
114
  * @param {string} search - Search query string
115
115
  * @param {object} instanceCatalog - Instance catalog object
116
116
  * @param {object} [options={}]
117
- * @param {number} [options.limit=8] - Max results
117
+ * @param {number} [options.limit=10] - Max results
118
118
  * @returns {string[]} Matching instance type names, sorted by relevance
119
119
  */
120
120
  function searchInstancesByTag(search, instanceCatalog, options = {}) {
121
- const { limit = 8 } = options
121
+ const { limit = 10 } = options
122
122
  const candidates = Object.entries(instanceCatalog)
123
123
 
124
124
  // Tokenize search into lowercase keywords
@@ -236,7 +236,7 @@ async function handleGetInstanceRecommendation(params) {
236
236
  maxSequenceLength,
237
237
  batchSize,
238
238
  cudaVersion,
239
- limit = 8,
239
+ limit = 10,
240
240
  context
241
241
  } = params
242
242
 
@@ -361,11 +361,14 @@ async function handleGetInstanceRecommendation(params) {
361
361
  }
362
362
 
363
363
  // Step 2: Estimate VRAM
364
+ // Use model's max_position_embeddings as the sequence length when no explicit value is provided.
365
+ // This ensures KV cache is sized for the model's actual context window, not the 4096 default.
366
+ const resolvedMaxSeqLen = effectiveMaxSeqLen || modelMetadata.maxPositionEmbeddings || undefined
364
367
  const vramEstimate = estimateVram({
365
368
  parameterCount: modelMetadata.parameterCount,
366
369
  dtype: modelMetadata.dtype,
367
370
  quantization: quantization || undefined,
368
- maxSequenceLength: effectiveMaxSeqLen || undefined,
371
+ maxSequenceLength: resolvedMaxSeqLen,
369
372
  batchSize: effectiveBatchSize || undefined
370
373
  })
371
374
 
@@ -502,7 +505,7 @@ server.tool(
502
505
  maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
503
506
  batchSize: z.number().optional().describe('Expected concurrent batch size'),
504
507
  cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
505
- limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
508
+ limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
506
509
  context: z.object({
507
510
  architecture: z.string().optional(),
508
511
  backend: z.string().optional(),
@@ -526,7 +529,7 @@ server.tool(
526
529
  maxSequenceLength: z.number().optional().describe('Max context/sequence length (affects KV cache estimate)'),
527
530
  batchSize: z.number().optional().describe('Expected concurrent batch size'),
528
531
  cudaVersion: z.string().optional().describe('Required CUDA version from base image (filters incompatible instances)'),
529
- limit: z.number().optional().default(8).describe('Maximum number of instance recommendations to return'),
532
+ limit: z.number().optional().default(10).describe('Maximum number of instance recommendations to return'),
530
533
  context: z.object({
531
534
  architecture: z.string().optional(),
532
535
  backend: z.string().optional(),
@@ -51,6 +51,22 @@ const COST_TIER_WEIGHT = {
51
51
  'high': 3
52
52
  }
53
53
 
54
+ /**
55
+ * Generation weight by instance family.
56
+ * Lower is newer (sorted first). Newer generations offer better perf/$.
57
+ */
58
+ const GENERATION_WEIGHT = {
59
+ 'g6': 1,
60
+ 'p5': 1,
61
+ 'trn1': 2,
62
+ 'inf2': 2,
63
+ 'g5': 3,
64
+ 'p4de': 4,
65
+ 'p4d': 4,
66
+ 'p3': 5,
67
+ 'g4dn': 6
68
+ }
69
+
54
70
  /**
55
71
  * TP overhead penalty: 10% per additional GPU beyond the first.
56
72
  * Effective VRAM = totalVram × (1 - 0.10 × (gpuCount - 1))
@@ -144,12 +160,12 @@ const effectiveVram = (totalVramGb, gpuCount) => {
144
160
  * @param {number} vramRequired - Required VRAM in GB
145
161
  * @param {object} instanceCatalog - Object keyed by instance type, values are metadata
146
162
  * @param {object} [options={}]
147
- * @param {number} [options.limit=8] - Max results to return
163
+ * @param {number} [options.limit=10] - Max results to return
148
164
  * @param {boolean} [options.allowTensorParallelism=true] - Consider multi-GPU splits
149
165
  * @returns {object[]} Ranked list of compatible instances
150
166
  */
151
167
  const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) => {
152
- const { limit = 8, allowTensorParallelism = true } = options
168
+ const { limit = 10, allowTensorParallelism = true } = options
153
169
 
154
170
  if (!vramRequired || vramRequired <= 0) {
155
171
  return []
@@ -182,7 +198,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
182
198
  totalVramGb,
183
199
  utilizationPercent,
184
200
  tensorParallelism: 1,
185
- costTier: getCostTier(meta)
201
+ costTier: getCostTier(meta),
202
+ family: meta.family || ''
186
203
  })
187
204
  }
188
205
  } else if (allowTensorParallelism) {
@@ -196,7 +213,8 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
196
213
  totalVramGb,
197
214
  utilizationPercent,
198
215
  tensorParallelism: gpuCount,
199
- costTier: getCostTier(meta)
216
+ costTier: getCostTier(meta),
217
+ family: meta.family || ''
200
218
  })
201
219
  }
202
220
  }
@@ -204,24 +222,30 @@ const filterAndRankInstances = (vramRequired, instanceCatalog, options = {}) =>
204
222
 
205
223
  // Sort candidates by ranking criteria:
206
224
  // 1. Single-GPU first (TP=1), then multi-GPU by lowest TP degree
207
- // 2. Within each TP tier, sort by cost-efficiency (lowest cost tier first,
208
- // then by lowest utilization — more headroom is better for the same cost)
225
+ // 2. Within each TP tier, newest generation first (g6 > g5 > g4dn)
226
+ // 3. Within same generation, sort by cost tier (lower is better)
227
+ // 4. Within same cost tier, prefer lower total VRAM (right-sized)
209
228
  candidates.sort((a, b) => {
210
229
  // Primary: TP degree (lower is better)
211
230
  if (a.tensorParallelism !== b.tensorParallelism) {
212
231
  return a.tensorParallelism - b.tensorParallelism
213
232
  }
214
233
 
215
- // Secondary: cost tier (lower is better)
234
+ // Secondary: generation (newer is better — lower weight)
235
+ const genA = GENERATION_WEIGHT[a.family] || 4
236
+ const genB = GENERATION_WEIGHT[b.family] || 4
237
+ if (genA !== genB) {
238
+ return genA - genB
239
+ }
240
+
241
+ // Tertiary: cost tier (lower is better)
216
242
  const costA = COST_TIER_WEIGHT[a.costTier] || 2
217
243
  const costB = COST_TIER_WEIGHT[b.costTier] || 2
218
244
  if (costA !== costB) {
219
245
  return costA - costB
220
246
  }
221
247
 
222
- // Tertiary: cost-efficiency — lower $/GB approximated by
223
- // lower cost tier with higher total VRAM (more GB per dollar)
224
- // Since cost tier is equal here, prefer higher total VRAM (better value)
248
+ // Quaternary: prefer lower total VRAM (right-sized, less waste)
225
249
  if (a.totalVramGb !== b.totalVramGb) {
226
250
  return a.totalVramGb - b.totalVramGb
227
251
  }
@@ -241,5 +265,6 @@ export {
241
265
  GPU_MEMORY_MAP,
242
266
  COST_TIER_MAP,
243
267
  COST_TIER_WEIGHT,
268
+ GENERATION_WEIGHT,
244
269
  TP_OVERHEAD_PER_GPU
245
270
  }
@@ -214,12 +214,16 @@ const resolveModelMetadata = async (modelName, options = {}) => {
214
214
  const catalogEntry = catalogLookup(modelName, catalog)
215
215
 
216
216
  if (catalogEntry) {
217
- return {
218
- parameterCount: catalogEntry.parameterCount,
219
- dtype: catalogEntry.defaultDtype,
220
- architecture: catalogEntry.architecture,
221
- maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
222
- source: 'catalog'
217
+ // Only use catalog entry if it has a usable parameterCount for VRAM estimation.
218
+ // If parameterCount is missing, fall through to HuggingFace API (tier 2).
219
+ if (catalogEntry.parameterCount) {
220
+ return {
221
+ parameterCount: catalogEntry.parameterCount,
222
+ dtype: catalogEntry.defaultDtype,
223
+ architecture: catalogEntry.architecture,
224
+ maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
225
+ source: 'catalog'
226
+ }
223
227
  }
224
228
  }
225
229