@aws/ml-container-creator 0.2.6 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +38 -2
- package/config/bootstrap-stack.json +94 -1
- package/config/defaults.json +1 -1
- package/infra/ci-harness/package-lock.json +22 -9
- package/package.json +3 -1
- package/servers/instance-sizer/index.js +45 -8
- package/servers/instance-sizer/lib/instance-ranker.js +140 -11
- package/servers/instance-sizer/lib/model-resolver.js +10 -6
- package/servers/instance-sizer/lib/quota-resolver.js +368 -0
- package/servers/instance-sizer/package.json +2 -0
- package/servers/lib/catalogs/instances.json +527 -12
- package/servers/lib/catalogs/model-servers.json +298 -20
- package/servers/lib/catalogs/model-sizes.json +27 -0
- package/servers/lib/catalogs/models.json +101 -0
- package/servers/lib/schemas/image-catalog.schema.json +15 -1
- package/servers/model-picker/index.js +2 -1
- package/src/app.js +96 -2
- package/src/lib/architecture-sync.js +171 -0
- package/src/lib/arn-detection.js +22 -0
- package/src/lib/bootstrap-command-handler.js +178 -3
- package/src/lib/cli-handler.js +2 -2
- package/src/lib/config-manager.js +121 -1
- package/src/lib/cross-cutting-checker.js +119 -0
- package/src/lib/deployment-entry-schema.js +1 -2
- package/src/lib/prompt-runner.js +514 -20
- package/src/lib/prompts.js +67 -5
- package/src/lib/registry-command-handler.js +236 -0
- package/src/lib/schema-sync.js +31 -0
- package/src/lib/secret-classification.js +56 -0
- package/src/lib/secrets-command-handler.js +550 -0
- package/src/lib/template-manager.js +49 -1
- package/src/lib/validate-runner.js +174 -2
- package/src/lib/validation-report.js +8 -1
- package/src/prompt-adapter.js +3 -2
- package/templates/Dockerfile +10 -2
- package/templates/code/cuda_compat.sh +22 -0
- package/templates/code/serve +3 -0
- package/templates/code/start_server.sh +3 -0
- package/templates/diffusors/Dockerfile +2 -1
- package/templates/diffusors/serve +3 -0
- package/templates/do/README.md +33 -0
- package/templates/do/benchmark +646 -0
- package/templates/do/build +22 -0
- package/templates/do/clean +86 -0
- package/templates/do/config +41 -6
- package/templates/do/deploy +66 -6
- package/templates/do/logs +18 -3
- package/templates/do/register +8 -1
- package/templates/do/run +10 -0
- package/templates/triton/Dockerfile +5 -0
|
@@ -214,12 +214,16 @@ const resolveModelMetadata = async (modelName, options = {}) => {
|
|
|
214
214
|
const catalogEntry = catalogLookup(modelName, catalog)
|
|
215
215
|
|
|
216
216
|
if (catalogEntry) {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
217
|
+
// Only use catalog entry if it has a usable parameterCount for VRAM estimation.
|
|
218
|
+
// If parameterCount is missing, fall through to HuggingFace API (tier 2).
|
|
219
|
+
if (catalogEntry.parameterCount) {
|
|
220
|
+
return {
|
|
221
|
+
parameterCount: catalogEntry.parameterCount,
|
|
222
|
+
dtype: catalogEntry.defaultDtype,
|
|
223
|
+
architecture: catalogEntry.architecture,
|
|
224
|
+
maxPositionEmbeddings: catalogEntry.maxPositionEmbeddings,
|
|
225
|
+
source: 'catalog'
|
|
226
|
+
}
|
|
223
227
|
}
|
|
224
228
|
}
|
|
225
229
|
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Quota Resolver
|
|
6
|
+
*
|
|
7
|
+
* Queries AWS APIs to determine account-level quota headroom, capacity
|
|
8
|
+
* reservations, and Flexible Training Plans for SageMaker instance types.
|
|
9
|
+
* Used in discover mode to filter and prioritize instance recommendations.
|
|
10
|
+
*
|
|
11
|
+
* All methods degrade gracefully — API failures return null and log to stderr.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { ServiceQuotasClient, ListServiceQuotasCommand } from '@aws-sdk/client-service-quotas'
|
|
15
|
+
import { SageMakerClient, ListEndpointsCommand, ListTrainingPlansCommand } from '@aws-sdk/client-sagemaker'
|
|
16
|
+
|
|
17
|
+
// ── Constants ────────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
// Service code passed to Service Quotas when listing SageMaker quotas.
const SAGEMAKER_SERVICE_CODE = 'sagemaker'

// Per-request timeout applied to both AWS clients unless overridden.
const DEFAULT_TIMEOUT_MS = 5 * 1000

// How long a fetched result is served from the in-memory cache (5 minutes).
const DEFAULT_CACHE_TTL_MS = 5 * 60 * 1000

// Matches quota names of the form "ml.<family>.<size> for endpoint usage";
// capture group 1 is the instance type.
const QUOTA_NAME_PATTERN = /^(ml\.[a-z0-9]+\.[a-z0-9]+) for endpoint usage$/
|
|
23
|
+
|
|
24
|
+
// ── Logging ──────────────────────────────────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
 * Write one diagnostic line to stderr, tagged with this module's name.
 *
 * @param {string} message - Text to emit after the "[quota-resolver] " tag
 */
function log(message) {
    const line = `[quota-resolver] ${message}\n`
    process.stderr.write(line)
}
|
|
29
|
+
|
|
30
|
+
// ── QuotaResolver Class ──────────────────────────────────────────────────────
|
|
31
|
+
|
|
32
|
+
class QuotaResolver {
    /**
     * @param {string} region - AWS region to query
     * @param {object} [options={}]
     * @param {number} [options.timeout=5000] - Timeout per API call in ms
     * @param {number} [options.cacheTtl=300000] - Cache TTL in ms (default 5 min)
     */
    constructor(region, options = {}) {
        this.region = region
        this.timeout = options.timeout || DEFAULT_TIMEOUT_MS
        this.cacheTtl = options.cacheTtl || DEFAULT_CACHE_TTL_MS
        this.cache = new Map()

        const clientConfig = {
            region: this.region,
            requestHandler: {
                requestTimeout: this.timeout
            }
        }

        this.quotasClient = new ServiceQuotasClient(clientConfig)
        this.sagemakerClient = new SageMakerClient(clientConfig)
    }

    /**
     * Check cache for a key. Returns cached value if within TTL, else null.
     * Expired entries are evicted as a side effect.
     *
     * @param {string} key - Cache key
     * @returns {*|null} Cached value or null
     */
    _getCached(key) {
        const entry = this.cache.get(key)
        if (!entry) return null
        if (Date.now() - entry.timestamp > this.cacheTtl) {
            this.cache.delete(key)
            return null
        }
        return entry.value
    }

    /**
     * Store a value in the cache with current timestamp.
     *
     * @param {string} key - Cache key
     * @param {*} value - Value to cache
     */
    _setCache(key, value) {
        this.cache.set(key, { value, timestamp: Date.now() })
    }

    /**
     * Parse a SageMaker quota name to extract the instance type.
     * Expected pattern: "ml.<family>.<size> for endpoint usage"
     *
     * @param {string} quotaName - Quota name from Service Quotas API
     * @returns {string|null} Instance type or null if pattern doesn't match
     */
    _parseQuotaName(quotaName) {
        const match = quotaName.match(QUOTA_NAME_PATTERN)
        return match ? match[1] : null
    }

    /**
     * Log a failed AWS call, choosing the message by error kind.
     *
     * Shared by every public method so that access-denied, throttling and
     * validation failures are reported the same way everywhere.
     *
     * @param {*} err - The thrown/rejected value (may be undefined)
     * @param {object} messages
     * @param {string} messages.accessDenied - Logged for AccessDeniedException
     * @param {string} messages.throttled - Logged for ThrottlingException
     * @param {string} [messages.validation] - Logged for ValidationException (skipped when omitted)
     * @param {string} messages.fallback - Prefix for any other error; the error message is appended
     */
    _reportFailure(err, messages) {
        const isKind = (code) => err?.name === code || err?.Code === code

        if (isKind('AccessDeniedException')) {
            log(messages.accessDenied)
        } else if (messages.validation && isKind('ValidationException')) {
            log(messages.validation)
        } else if (isKind('ThrottlingException')) {
            log(messages.throttled)
        } else {
            log(`${messages.fallback}: ${err?.message ?? err}`)
        }
    }

    /**
     * Get quota headroom for a list of instance types.
     *
     * Queries Service Quotas for SageMaker endpoint instance limits and
     * ListEndpoints to count currently deployed instances per type.
     * Headroom = quota limit - deployed count.
     *
     * The account-wide snapshot (all quotas + all deployed counts) is what gets
     * cached; the per-call result is always rebuilt from it, so calls with
     * different instance lists never see each other's filtered results.
     *
     * @param {string[]} instanceTypes - Instance types to check (e.g., ['ml.g5.xlarge'])
     * @returns {Promise<Map|null>} Map: instanceType → { quota, deployed, headroom }, or null on failure
     */
    async getQuotaHeadroom(instanceTypes) {
        const failureMessages = {
            accessDenied: `AccessDenied: insufficient permissions for quota queries — skipping`,
            throttled: `Throttled: Service Quotas API rate limit hit — skipping`,
            fallback: 'Failed to get quota headroom'
        }

        try {
            const cacheKey = 'quotaSnapshot'
            let snapshot = this._getCached(cacheKey)

            if (!snapshot) {
                // allSettled never rejects: each outcome must be inspected
                // explicitly, otherwise failures vanish without a log line.
                const [quotaOutcome, deployedOutcome] = await Promise.allSettled([
                    this._fetchServiceQuotas(),
                    this._fetchDeployedCounts()
                ])

                if (quotaOutcome.status !== 'fulfilled') {
                    this._reportFailure(quotaOutcome.reason, failureMessages)
                    return null
                }
                if (!quotaOutcome.value) {
                    return null
                }

                let deployed = null
                if (deployedOutcome.status === 'fulfilled') {
                    deployed = deployedOutcome.value
                } else {
                    // Quotas alone are still useful; treat every type as having 0 deployed.
                    const reason = deployedOutcome.reason
                    log(`Failed to count deployed endpoint instances: ${reason?.message ?? reason} — assuming 0 deployed`)
                }

                snapshot = { quotas: quotaOutcome.value, deployed: deployed || new Map() }
                this._setCache(cacheKey, snapshot)
            }

            const result = new Map()
            for (const instanceType of instanceTypes) {
                const quota = snapshot.quotas.get(instanceType)
                if (quota == null) continue

                const deployedCount = snapshot.deployed.get(instanceType) || 0
                result.set(instanceType, {
                    quota,
                    deployed: deployedCount,
                    headroom: quota - deployedCount
                })
            }

            return result
        } catch (err) {
            this._reportFailure(err, failureMessages)
            return null
        }
    }

    /**
     * Fetch all SageMaker service quotas for endpoint instance types.
     * Paginates through all results; quotas whose name does not match
     * QUOTA_NAME_PATTERN or that carry no Value are ignored.
     *
     * @returns {Promise<Map>} Map: instanceType → quota limit (number)
     */
    async _fetchServiceQuotas() {
        const quotaMap = new Map()
        let nextToken = undefined

        do {
            const command = new ListServiceQuotasCommand({
                ServiceCode: SAGEMAKER_SERVICE_CODE,
                ...(nextToken && { NextToken: nextToken })
            })

            const response = await this.quotasClient.send(command)

            for (const quota of (response.Quotas || [])) {
                const instanceType = this._parseQuotaName(quota.QuotaName || '')
                if (instanceType && quota.Value != null) {
                    quotaMap.set(instanceType, quota.Value)
                }
            }

            nextToken = response.NextToken
        } while (nextToken)

        return quotaMap
    }

    /**
     * Fetch currently deployed endpoint instances and count per type.
     * Paginates through all InService endpoints.
     *
     * Only endpoints whose list entry carries ProductionVariants contribute;
     * entries without it are skipped, so the counts are a lower bound.
     * NOTE(review): ListEndpoints summaries may not include ProductionVariants
     * at all — confirm against the API; accurate counts would need
     * DescribeEndpoint per endpoint, which was avoided as too many API calls.
     *
     * @returns {Promise<Map>} Map: instanceType → deployed count
     */
    async _fetchDeployedCounts() {
        const deployedMap = new Map()
        let nextToken = undefined

        do {
            const command = new ListEndpointsCommand({
                StatusEquals: 'InService',
                ...(nextToken && { NextToken: nextToken })
            })

            const response = await this.sagemakerClient.send(command)

            for (const endpoint of (response.Endpoints || [])) {
                for (const variant of (endpoint.ProductionVariants || [])) {
                    if (!variant.InstanceType) continue

                    const current = deployedMap.get(variant.InstanceType) || 0
                    // Default to 1 only when the count is absent; a reported 0
                    // must stay 0 rather than consume phantom quota.
                    const count = variant.CurrentInstanceCount ?? 1
                    deployedMap.set(variant.InstanceType, current + count)
                }
            }

            nextToken = response.NextToken
        } while (nextToken)

        return deployedMap
    }

    /**
     * List every training plan summary returned for the 'Active' status
     * request, following pagination to the end.
     *
     * NOTE(review): confirm ListTrainingPlans honours StatusEquals — the API
     * reference appears to document status filtering via a Filters parameter.
     *
     * @returns {Promise<object[]>} Plan summaries from all pages, in API order
     */
    async _listActiveTrainingPlans() {
        const plans = []
        let nextToken = undefined

        do {
            const command = new ListTrainingPlansCommand({
                StatusEquals: 'Active',
                ...(nextToken && { NextToken: nextToken })
            })

            const response = await this.sagemakerClient.send(command)
            plans.push(...(response.TrainingPlanSummaries || []))

            nextToken = response.NextToken
        } while (nextToken)

        return plans
    }

    /**
     * Work out which instance type a training plan summary reserves.
     *
     * Checks the top-level fields first, then falls back to the first
     * ReservedCapacitySummaries entry that names an instance type.
     *
     * @param {object} plan - One training plan summary
     * @returns {string|null} Instance type, or null when none can be found
     */
    _planInstanceType(plan) {
        const direct = plan.InstanceType || plan.ReservedCapacityInstanceType
        if (direct) return direct

        const summaries = plan.ReservedCapacitySummaries || []
        const withType = summaries.find((summary) => summary && summary.InstanceType)
        return withType ? withType.InstanceType : null
    }

    /**
     * Get active Training Plan reservations for inference endpoints.
     *
     * Queries ListTrainingPlans for active plans with TargetResources=endpoint.
     * These are SageMaker-managed capacity reservations that can be referenced
     * via MlReservationArn in CreateEndpointConfig.
     *
     * Plans are dropped when they do not target 'endpoint', have no resolvable
     * instance type, are outside their start/end window, or have no remaining
     * capacity. If several plans cover the same instance type, the last one
     * listed wins.
     *
     * ⚠️ EXPERIMENTAL: Training Plans for inference is a newer feature.
     *
     * @returns {Promise<Map|null>} Map: instanceType → { planName, planArn, type, count, startDate, endDate }, or null on failure
     */
    async getCapacityReservations() {
        const cacheKey = 'capacityReservations'
        const cached = this._getCached(cacheKey)
        if (cached) return cached

        // Dates may arrive as Date objects or strings; emit ISO strings for Dates.
        const toIso = (value) => {
            if (!value) return null
            return value instanceof Date ? value.toISOString() : value
        }

        try {
            const result = new Map()
            const plans = await this._listActiveTrainingPlans()
            const now = new Date()

            for (const plan of plans) {
                // Only include plans targeting inference endpoints
                const targetResources = plan.TargetResources || []
                if (!targetResources.includes('endpoint')) continue

                const instanceType = this._planInstanceType(plan)
                if (!instanceType) continue

                const remainingCapacity = plan.AvailableInstanceCount
                    ?? plan.RemainingCapacity
                    ?? plan.TotalInstanceCount
                    ?? 0
                const startDate = plan.StartTime || null
                const endDate = plan.EndTime || plan.ExpirationTime || null

                // Skip plans outside their time window
                if (startDate && new Date(startDate) > now) continue
                if (endDate && new Date(endDate) < now) continue

                // Only include if there's remaining capacity
                if (remainingCapacity <= 0) continue

                result.set(instanceType, {
                    planName: plan.TrainingPlanName || 'unknown',
                    planArn: plan.TrainingPlanArn,
                    type: 'training-plan',
                    count: remainingCapacity,
                    startDate: toIso(startDate),
                    endDate: toIso(endDate)
                })
            }

            this._setCache(cacheKey, result)
            return result
        } catch (err) {
            this._reportFailure(err, {
                accessDenied: `AccessDenied: insufficient permissions for training plan queries — skipping`,
                validation: `ListTrainingPlans not available in region ${this.region} — skipping`,
                throttled: `Throttled: ListTrainingPlans rate limit hit — skipping`,
                fallback: 'Failed to get capacity reservations'
            })
            return null
        }
    }

    /**
     * Get active Flexible Training Plans with remaining capacity.
     *
     * Calls ListTrainingPlans with status filter for active plans and
     * extracts instance types and remaining capacity from each plan.
     * If several plans cover the same instance type, the last one listed wins.
     *
     * @returns {Promise<Map|null>} Map: instanceType → { planName, remainingCapacity, expiresAt }, or null on failure
     */
    async getTrainingPlans() {
        const cacheKey = 'trainingPlans'
        const cached = this._getCached(cacheKey)
        if (cached) return cached

        try {
            const result = new Map()
            const plans = await this._listActiveTrainingPlans()

            for (const plan of plans) {
                const instanceType = this._planInstanceType(plan)
                const remainingCapacity = plan.AvailableInstanceCount
                    ?? plan.RemainingCapacity
                    ?? plan.TotalInstanceCount
                    ?? 0

                if (instanceType && remainingCapacity > 0) {
                    result.set(instanceType, {
                        planName: plan.TrainingPlanName || plan.TrainingPlanArn || 'unknown',
                        remainingCapacity,
                        expiresAt: plan.EndTime || plan.ExpirationTime || null
                    })
                }
            }

            this._setCache(cacheKey, result)
            return result
        } catch (err) {
            this._reportFailure(err, {
                accessDenied: `AccessDenied: insufficient permissions for training plan queries — skipping`,
                validation: `ListTrainingPlans not available in region ${this.region} — skipping`,
                throttled: `Throttled: ListTrainingPlans rate limit hit — skipping`,
                fallback: 'Failed to get training plans'
            })
            return null
        }
    }
}
|
|
367
|
+
|
|
368
|
+
export { QuotaResolver, QUOTA_NAME_PATTERN, SAGEMAKER_SERVICE_CODE, DEFAULT_TIMEOUT_MS, DEFAULT_CACHE_TTL_MS }
|