@aws/ml-container-creator 0.12.1 → 0.13.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +11 -3
- package/servers/instance-sizer/lib/model-resolver.js +127 -185
- package/servers/instance-sizer/lib/vram-estimator.js +86 -0
- package/servers/lib/catalogs/instances.json +0 -27
- package/src/app.js +14 -0
- package/src/lib/bootstrap-command-handler.js +2 -2
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/prompt-runner.js +14 -31
- package/src/lib/prove-pipeline-executor.js +294 -0
- package/templates/IAM_PERMISSIONS.md +64 -13
- package/templates/do/.tune_helper.py +5 -2
- package/templates/do/README.md +50 -567
- package/templates/do/adapter +1 -1
- package/templates/do/build +2 -2
- package/templates/do/clean.d/async-inference.ejs +2 -2
- package/templates/do/clean.d/batch-transform.ejs +2 -2
- package/templates/do/clean.d/hyperpod-eks.ejs +2 -2
- package/templates/do/clean.d/managed-inference.ejs +2 -2
- package/templates/do/deploy.d/async-inference.ejs +6 -6
- package/templates/do/deploy.d/batch-transform.ejs +4 -4
- package/templates/do/deploy.d/hyperpod-eks.ejs +1 -1
- package/templates/do/deploy.d/managed-inference.ejs +15 -3
- package/templates/do/lib/profile.sh +19 -15
- package/templates/do/lib/staged-assets.sh +217 -0
- package/templates/do/push +2 -2
- package/templates/do/register +2 -2
- package/templates/do/stage +38 -33
- package/templates/do/submit +1 -1
- package/templates/do/tune +1 -1
- package/templates/MIGRATION.md +0 -488
- package/templates/TEMPLATE_SYSTEM.md +0 -243
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
package/src/lib/prompt-runner.js
CHANGED
|
@@ -18,8 +18,6 @@ import {
|
|
|
18
18
|
modelLoadStrategyPrompts,
|
|
19
19
|
modelProfilePrompts,
|
|
20
20
|
modulePrompts,
|
|
21
|
-
loraPrompts,
|
|
22
|
-
benchmarkPrompts,
|
|
23
21
|
infraRegionAndTargetPrompts,
|
|
24
22
|
infraExistingEndpointPrompts,
|
|
25
23
|
infraInstancePrompts,
|
|
@@ -521,38 +519,23 @@ export default class PromptRunner {
|
|
|
521
519
|
const ngcApiKeyAnswers = { ngcApiKey: secretAnswers.ngcApiKey, ngcTokenArn: secretAnswers.ngcTokenArn };
|
|
522
520
|
|
|
523
521
|
// Module selection
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
522
|
+
// Only ask about sample model for non-transformers/diffusors (Triton etc.)
|
|
523
|
+
const moduleAnswers = {};
|
|
524
|
+
if (frameworkAnswers.architecture !== 'transformers' &&
|
|
525
|
+
frameworkAnswers.architecture !== 'diffusors') {
|
|
526
|
+
const sampleModelAnswers = await this._runPhase(
|
|
527
|
+
modulePrompts.filter(p => p.name === 'includeSampleModel'),
|
|
528
|
+
{ ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig
|
|
529
|
+
);
|
|
530
|
+
Object.assign(moduleAnswers, sampleModelAnswers);
|
|
531
|
+
} else {
|
|
531
532
|
moduleAnswers.includeSampleModel = false;
|
|
532
533
|
}
|
|
533
534
|
|
|
534
|
-
//
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
const testTypes = moduleAnswers.testTypes || [];
|
|
539
|
-
const includeBenchmark = testTypes.includes('sagemaker-ai-automated-benchmarking') ||
|
|
540
|
-
explicitConfig.includeBenchmark === true ||
|
|
541
|
-
explicitConfig.includeBenchmark === 'true';
|
|
542
|
-
benchmarkAnswers.includeBenchmark = includeBenchmark;
|
|
543
|
-
if (includeBenchmark) {
|
|
544
|
-
const subAnswers = await this._runPhase(benchmarkPrompts, { ...frameworkAnswers, ...moduleAnswers, includeBenchmark }, explicitConfig, existingConfig);
|
|
545
|
-
benchmarkAnswers = { ...benchmarkAnswers, ...subAnswers };
|
|
546
|
-
}
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
// LoRA adapter prompts — only for transformers with vllm/sglang/djl-lmi
|
|
550
|
-
// Requirements: 1.1, 1.2, 1.4
|
|
551
|
-
let loraAnswers = {};
|
|
552
|
-
const loraSubAnswers = await this._runPhase(loraPrompts, { ...frameworkAnswers, ...engineAnswers }, explicitConfig, existingConfig);
|
|
553
|
-
if (loraSubAnswers.enableLora !== undefined) {
|
|
554
|
-
loraAnswers = loraSubAnswers;
|
|
555
|
-
}
|
|
535
|
+
// Test types, benchmark, and LoRA are always-on (BL-122)
|
|
536
|
+
moduleAnswers.testTypes = ['hosted-model-endpoint', 'sagemaker-ai-automated-benchmarking'];
|
|
537
|
+
const benchmarkAnswers = { includeBenchmark: true };
|
|
538
|
+
const loraAnswers = { enableLora: true };
|
|
556
539
|
|
|
557
540
|
// Validate instance type against framework requirements (now that framework version is known)
|
|
558
541
|
const finalInstanceType = infraAnswers.customInstanceType || infraAnswers.instanceType;
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Prove Pipeline Executor
|
|
6
|
+
*
|
|
7
|
+
* Executes lifecycle stages for validation targets in the `mcc prove` workflow.
|
|
8
|
+
* Handles stage-specific logic including idempotency checks, status tracking,
|
|
9
|
+
* and fail-fast behavior.
|
|
10
|
+
*
|
|
11
|
+
* Feature: s3-model-loading
|
|
12
|
+
* Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { execFile } from 'node:child_process';
|
|
16
|
+
import { promisify } from 'node:util';
|
|
17
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
18
|
+
import path from 'node:path';
|
|
19
|
+
|
|
20
|
+
const execFileAsync = promisify(execFile);
|
|
21
|
+
|
|
22
|
+
// ── Valid Lifecycle Stages ────────────────────────────────────────────────────
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* All recognized lifecycle stages for the prove pipeline.
|
|
26
|
+
* The "stage" step pre-stages model weights from HuggingFace to S3.
|
|
27
|
+
*/
|
|
28
|
+
export const VALID_LIFECYCLE_STAGES = [
|
|
29
|
+
'generate',
|
|
30
|
+
'stage',
|
|
31
|
+
'build',
|
|
32
|
+
'push',
|
|
33
|
+
'deploy',
|
|
34
|
+
'test',
|
|
35
|
+
'tune',
|
|
36
|
+
'adapter',
|
|
37
|
+
'test-adapter',
|
|
38
|
+
'benchmark',
|
|
39
|
+
'register',
|
|
40
|
+
'clean'
|
|
41
|
+
];
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Possible staging states for status output.
|
|
45
|
+
*/
|
|
46
|
+
export const STAGING_STATES = {
|
|
47
|
+
STAGED: 'staged',
|
|
48
|
+
NOT_STAGED: 'not-staged',
|
|
49
|
+
STAGE_FAILED: 'stage-failed'
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
// ── Stage Lifecycle Step ─────────────────────────────────────────────────────
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* Check if a model has already been staged by looking for `.mlcc/staged-assets.json`.
|
|
56
|
+
*
|
|
57
|
+
* @param {string} projectDir - Path to the generated project directory
|
|
58
|
+
* @returns {boolean} True if the model has already been staged
|
|
59
|
+
*/
|
|
60
|
+
export function isAlreadyStaged(projectDir) {
|
|
61
|
+
const stagedAssetsPath = path.join(projectDir, '.mlcc', 'staged-assets.json');
|
|
62
|
+
if (!existsSync(stagedAssetsPath)) {
|
|
63
|
+
return false;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
try {
|
|
67
|
+
const content = readFileSync(stagedAssetsPath, 'utf8');
|
|
68
|
+
const data = JSON.parse(content);
|
|
69
|
+
// Check that there's a valid staged URI
|
|
70
|
+
return !!(data?.models?.default?.staged_uri);
|
|
71
|
+
} catch {
|
|
72
|
+
return false;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Get the current staging state for a project.
|
|
78
|
+
*
|
|
79
|
+
* @param {string} projectDir - Path to the generated project directory
|
|
80
|
+
* @param {object} [stepResults] - Previous step results (to check for stage-failed)
|
|
81
|
+
* @returns {string} One of: 'staged', 'not-staged', 'stage-failed'
|
|
82
|
+
*/
|
|
83
|
+
export function getStagingState(projectDir, stepResults = null) {
|
|
84
|
+
// Check if stage previously failed
|
|
85
|
+
if (stepResults?.stage?.status === 'fail') {
|
|
86
|
+
return STAGING_STATES.STAGE_FAILED;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
if (isAlreadyStaged(projectDir)) {
|
|
90
|
+
return STAGING_STATES.STAGED;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
return STAGING_STATES.NOT_STAGED;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Execute the stage lifecycle step with idempotency support.
|
|
98
|
+
*
|
|
99
|
+
* If the model is already staged (`.mlcc/staged-assets.json` exists with a valid URI),
|
|
100
|
+
* the step is skipped and marked as passed.
|
|
101
|
+
*
|
|
102
|
+
* If `do/stage` exits non-zero, the model is marked as stage-failed.
|
|
103
|
+
*
|
|
104
|
+
* @param {string} projectDir - Path to the generated project directory
|
|
105
|
+
* @param {object} [options] - Execution options
|
|
106
|
+
* @param {number} [options.timeout=1800] - Timeout in seconds (default: 30 minutes)
|
|
107
|
+
* @param {boolean} [options.verbose=false] - Stream stdout/stderr in real time
|
|
108
|
+
* @returns {Promise<object>} StepResult with name, status, duration, stagingState, and optional error
|
|
109
|
+
*/
|
|
110
|
+
export async function executeStageStep(projectDir, options = {}) {
|
|
111
|
+
const { timeout = 1800, verbose = false } = options;
|
|
112
|
+
const startTime = Date.now();
|
|
113
|
+
|
|
114
|
+
// Idempotency check: skip if already staged (Requirement 5.4)
|
|
115
|
+
if (isAlreadyStaged(projectDir)) {
|
|
116
|
+
return {
|
|
117
|
+
name: 'stage',
|
|
118
|
+
status: 'pass',
|
|
119
|
+
duration: Date.now() - startTime,
|
|
120
|
+
stagingState: STAGING_STATES.STAGED,
|
|
121
|
+
skipped: true,
|
|
122
|
+
message: '✓ Model already staged — skipping'
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
// Execute do/stage and verify exit code 0 (Requirement 5.2)
|
|
127
|
+
const command = './do/stage';
|
|
128
|
+
|
|
129
|
+
try {
|
|
130
|
+
if (verbose) {
|
|
131
|
+
// Verbose: stream output in real time
|
|
132
|
+
const { spawn } = await import('node:child_process');
|
|
133
|
+
const result = await new Promise((resolve) => {
|
|
134
|
+
const child = spawn('bash', ['-c', command], {
|
|
135
|
+
cwd: projectDir,
|
|
136
|
+
stdio: ['pipe', 'inherit', 'inherit']
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
let killed = false;
|
|
140
|
+
const timer = setTimeout(() => {
|
|
141
|
+
killed = true;
|
|
142
|
+
child.kill('SIGTERM');
|
|
143
|
+
}, timeout * 1000);
|
|
144
|
+
|
|
145
|
+
child.on('close', (code) => {
|
|
146
|
+
clearTimeout(timer);
|
|
147
|
+
if (code === 0) {
|
|
148
|
+
resolve({
|
|
149
|
+
name: 'stage',
|
|
150
|
+
status: 'pass',
|
|
151
|
+
duration: Date.now() - startTime,
|
|
152
|
+
stagingState: STAGING_STATES.STAGED
|
|
153
|
+
});
|
|
154
|
+
} else {
|
|
155
|
+
const error = killed
|
|
156
|
+
? `Timeout after ${timeout}s`
|
|
157
|
+
: `do/stage exited with code ${code}`;
|
|
158
|
+
resolve({
|
|
159
|
+
name: 'stage',
|
|
160
|
+
status: 'fail',
|
|
161
|
+
duration: Date.now() - startTime,
|
|
162
|
+
stagingState: STAGING_STATES.STAGE_FAILED,
|
|
163
|
+
error
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
child.on('error', (err) => {
|
|
169
|
+
clearTimeout(timer);
|
|
170
|
+
resolve({
|
|
171
|
+
name: 'stage',
|
|
172
|
+
status: 'fail',
|
|
173
|
+
duration: Date.now() - startTime,
|
|
174
|
+
stagingState: STAGING_STATES.STAGE_FAILED,
|
|
175
|
+
error: err.message.slice(-500)
|
|
176
|
+
});
|
|
177
|
+
});
|
|
178
|
+
});
|
|
179
|
+
return result;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Non-verbose: buffer output
|
|
183
|
+
await execFileAsync('bash', ['-c', command], {
|
|
184
|
+
cwd: projectDir,
|
|
185
|
+
timeout: timeout * 1000,
|
|
186
|
+
maxBuffer: 10 * 1024 * 1024
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
return {
|
|
190
|
+
name: 'stage',
|
|
191
|
+
status: 'pass',
|
|
192
|
+
duration: Date.now() - startTime,
|
|
193
|
+
stagingState: STAGING_STATES.STAGED
|
|
194
|
+
};
|
|
195
|
+
} catch (err) {
|
|
196
|
+
// Mark model as failed if staging fails (Requirement 5.3)
|
|
197
|
+
const error = err.killed
|
|
198
|
+
? `Timeout after ${timeout}s`
|
|
199
|
+
: (err.stderr || err.message).slice(-500);
|
|
200
|
+
|
|
201
|
+
return {
|
|
202
|
+
name: 'stage',
|
|
203
|
+
status: 'fail',
|
|
204
|
+
duration: Date.now() - startTime,
|
|
205
|
+
stagingState: STAGING_STATES.STAGE_FAILED,
|
|
206
|
+
error
|
|
207
|
+
};
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// ── Stage Validation ─────────────────────────────────────────────────────────
|
|
212
|
+
|
|
213
|
+
/**
|
|
214
|
+
* Validate that a lifecycle stage name is recognized by the prove pipeline.
|
|
215
|
+
*
|
|
216
|
+
* @param {string} stageName - The stage name to validate
|
|
217
|
+
* @returns {boolean} True if the stage is valid
|
|
218
|
+
*/
|
|
219
|
+
export function isValidLifecycleStage(stageName) {
|
|
220
|
+
return VALID_LIFECYCLE_STAGES.includes(stageName);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Validate a stages array from validation-targets configuration.
|
|
225
|
+
*
|
|
226
|
+
* @param {string[]} stages - Array of stage names
|
|
227
|
+
* @returns {object} Validation result: { valid: boolean, errors: string[] }
|
|
228
|
+
*/
|
|
229
|
+
export function validateStagesArray(stages) {
|
|
230
|
+
const errors = [];
|
|
231
|
+
|
|
232
|
+
if (!Array.isArray(stages)) {
|
|
233
|
+
return { valid: false, errors: ['stages must be an array'] };
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
if (stages.length === 0) {
|
|
237
|
+
return { valid: false, errors: ['stages array must not be empty'] };
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
for (const stage of stages) {
|
|
241
|
+
if (typeof stage !== 'string') {
|
|
242
|
+
errors.push(`Invalid stage type: expected string, got ${typeof stage}`);
|
|
243
|
+
continue;
|
|
244
|
+
}
|
|
245
|
+
if (!isValidLifecycleStage(stage)) {
|
|
246
|
+
errors.push(`Unrecognized lifecycle stage: "${stage}"`);
|
|
247
|
+
}
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
return { valid: errors.length === 0, errors };
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// ── Status Output ────────────────────────────────────────────────────────────
|
|
254
|
+
|
|
255
|
+
/**
|
|
256
|
+
* Format the staging state for status output display.
|
|
257
|
+
*
|
|
258
|
+
* @param {string} state - One of STAGING_STATES values
|
|
259
|
+
* @returns {string} Formatted status string with emoji
|
|
260
|
+
*/
|
|
261
|
+
export function formatStagingStatus(state) {
|
|
262
|
+
switch (state) {
|
|
263
|
+
case STAGING_STATES.STAGED:
|
|
264
|
+
return '✓ staged';
|
|
265
|
+
case STAGING_STATES.NOT_STAGED:
|
|
266
|
+
return '○ not-staged';
|
|
267
|
+
case STAGING_STATES.STAGE_FAILED:
|
|
268
|
+
return '✗ stage-failed';
|
|
269
|
+
default:
|
|
270
|
+
return '? unknown';
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
/**
|
|
275
|
+
* Build a status summary for a prove target including staging state.
|
|
276
|
+
*
|
|
277
|
+
* @param {object} target - The validation target
|
|
278
|
+
* @param {string} target.model_name - Model name
|
|
279
|
+
* @param {string} projectDir - Path to the project directory
|
|
280
|
+
* @param {object} [stepResults] - Results of executed steps
|
|
281
|
+
* @returns {object} Status summary including stagingState
|
|
282
|
+
*/
|
|
283
|
+
export function buildTargetStatus(target, projectDir, stepResults = null) {
|
|
284
|
+
const stagingState = getStagingState(projectDir, stepResults);
|
|
285
|
+
const stages = target.stages || [];
|
|
286
|
+
const includesStage = stages.includes('stage');
|
|
287
|
+
|
|
288
|
+
return {
|
|
289
|
+
model_name: target.model_name,
|
|
290
|
+
stagingState,
|
|
291
|
+
stagingStatus: formatStagingStatus(stagingState),
|
|
292
|
+
includesStageStep: includesStage
|
|
293
|
+
};
|
|
294
|
+
}
|
|
@@ -10,14 +10,47 @@ This project uses three sets of IAM permissions:
|
|
|
10
10
|
|
|
11
11
|
## SageMaker Execution Role
|
|
12
12
|
|
|
13
|
-
The bootstrap command creates an IAM role (`mlcc-sagemaker-execution-role`) with
|
|
13
|
+
The bootstrap command creates an IAM role (`mlcc-sagemaker-execution-role`) with these permission groups:
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
- **CloudWatch Logs**: Write container logs
|
|
18
|
-
- **S3**: Read model artifacts from `ml-container-creator-*` buckets
|
|
15
|
+
### Endpoint Management
|
|
16
|
+
Create, update, delete, describe, and invoke endpoints, endpoint configs, models, and inference components.
|
|
19
17
|
|
|
20
|
-
|
|
18
|
+
### AI Benchmarking
|
|
19
|
+
Create, describe, list, stop, and delete AI benchmark jobs, AI recommendation jobs, and AI workload configs.
|
|
20
|
+
|
|
21
|
+
### Training & Model Customization
|
|
22
|
+
Create/describe/stop training jobs, model packages, model package groups. Access SageMaker Hub contents. Manage training plans.
|
|
23
|
+
|
|
24
|
+
### MLflow Integration
|
|
25
|
+
List/describe MLflow tracking servers and apps. Create presigned URLs. Call MLflow app APIs.
|
|
26
|
+
|
|
27
|
+
### ECR
|
|
28
|
+
Pull container images (GetAuthorizationToken, BatchGetImage, GetDownloadUrlForLayer, BatchCheckLayerAvailability).
|
|
29
|
+
|
|
30
|
+
### S3
|
|
31
|
+
Read and write model artifacts, adapters, benchmark results:
|
|
32
|
+
- `s3:GetObject`, `s3:PutObject`, `s3:AbortMultipartUpload`, `s3:ListBucket`
|
|
33
|
+
- Scoped to `mlcc-*` and `ml-container-creator-*` buckets
|
|
34
|
+
|
|
35
|
+
### CloudWatch Logs
|
|
36
|
+
Create log groups/streams and put log events.
|
|
37
|
+
|
|
38
|
+
### Secrets Manager
|
|
39
|
+
Read and write secrets prefixed with `mlcc/` or `ml-container-creator/` (used for HF tokens, API keys).
|
|
40
|
+
|
|
41
|
+
### SNS
|
|
42
|
+
Publish notifications to `mlcc-*` and `ml-container-creator-*` topics (benchmark completion alerts).
|
|
43
|
+
|
|
44
|
+
### Service Quotas & Capacity
|
|
45
|
+
Query service quotas and training plan availability for instance selection.
|
|
46
|
+
|
|
47
|
+
### Lambda
|
|
48
|
+
Invoke functions (reward model evaluation during training/tuning).
|
|
49
|
+
|
|
50
|
+
### PassRole
|
|
51
|
+
Self-pass to SageMaker service, scoped to `mlcc-sagemaker-execution-role`.
|
|
52
|
+
|
|
53
|
+
The role is defined in `config/bootstrap-stack.json` and updated automatically when you re-run bootstrap after upgrading.
|
|
21
54
|
|
|
22
55
|
If you use a custom role (`--role-arn`), ensure it has at minimum:
|
|
23
56
|
|
|
@@ -25,12 +58,15 @@ If you use a custom role (`--role-arn`), ensure it has at minimum:
|
|
|
25
58
|
|-----------|---------|
|
|
26
59
|
| `sagemaker:CreateEndpoint`, `CreateEndpointConfig`, `CreateModel`, `CreateInferenceComponent` | Deploy |
|
|
27
60
|
| `sagemaker:DeleteEndpoint`, `DeleteEndpointConfig`, `DeleteModel`, `DeleteInferenceComponent` | Clean up |
|
|
28
|
-
| `sagemaker:DescribeEndpoint`, `DescribeEndpointConfig`, `DescribeModel`, `DescribeInferenceComponent` | Status
|
|
61
|
+
| `sagemaker:DescribeEndpoint`, `DescribeEndpointConfig`, `DescribeModel`, `DescribeInferenceComponent`, `ListInferenceComponents` | Status |
|
|
29
62
|
| `sagemaker:InvokeEndpoint`, `InvokeEndpointAsync` | Inference |
|
|
30
63
|
| `sagemaker:UpdateEndpoint`, `UpdateEndpointWeightsAndCapacities`, `UpdateInferenceComponent` | Updates |
|
|
31
|
-
| `
|
|
32
|
-
| `
|
|
33
|
-
| `
|
|
64
|
+
| `sagemaker:CreateAIBenchmarkJob`, `DescribeAIBenchmarkJob`, `ListAIBenchmarkJobs` | Benchmark |
|
|
65
|
+
| `sagemaker:CreateTrainingJob`, `DescribeTrainingJob`, `StopTrainingJob` | Training/tuning |
|
|
66
|
+
| `ecr:GetAuthorizationToken`, `BatchGetImage`, `GetDownloadUrlForLayer`, `BatchCheckLayerAvailability` | Pull image |
|
|
67
|
+
| `logs:CreateLogGroup`, `CreateLogStream`, `PutLogEvents` | Logging |
|
|
68
|
+
| `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` on project buckets | Artifacts |
|
|
69
|
+
| `iam:PassRole` (to sagemaker.amazonaws.com) | Role delegation |
|
|
34
70
|
|
|
35
71
|
Trust policy must allow `sagemaker.amazonaws.com` to assume the role.
|
|
36
72
|
|
|
@@ -48,12 +84,27 @@ Your AWS user or CI system needs these permissions to run the do-scripts:
|
|
|
48
84
|
|
|
49
85
|
| Script | Permissions Needed |
|
|
50
86
|
|--------|-------------------|
|
|
87
|
+
| `./do/build` | Local only — no AWS permissions |
|
|
88
|
+
| `./do/run` | Local only — no AWS permissions |
|
|
51
89
|
| `./do/push` | `ecr:GetAuthorizationToken`, `ecr:PutImage`, `ecr:InitiateLayerUpload`, `ecr:UploadLayerPart`, `ecr:CompleteLayerUpload`, `ecr:BatchCheckLayerAvailability` |
|
|
52
90
|
| `./do/submit` | `codebuild:CreateProject`, `codebuild:StartBuild`, `codebuild:BatchGetBuilds`, `iam:CreateRole`, `iam:PutRolePolicy`, `iam:PassRole`, `s3:PutObject`, `s3:CreateBucket` |
|
|
53
|
-
| `./do/
|
|
54
|
-
| `./do/
|
|
91
|
+
| `./do/stage` | `s3:PutObject`, `s3:GetObject`, `s3:ListBucket` on mlcc-* buckets |
|
|
92
|
+
| `./do/deploy` | `sagemaker:CreateEndpointConfig`, `sagemaker:CreateEndpoint`, `sagemaker:CreateModel`, `sagemaker:CreateInferenceComponent`, `sagemaker:DescribeEndpoint`, `iam:PassRole` |
|
|
93
|
+
| `./do/add-ic` | `sagemaker:CreateInferenceComponent`, `sagemaker:DescribeEndpoint`, `sagemaker:ListInferenceComponents`, `iam:PassRole` |
|
|
55
94
|
| `./do/test` | `sagemaker-runtime:InvokeEndpoint` |
|
|
56
|
-
| `
|
|
95
|
+
| `./do/benchmark` | `sagemaker:CreateAIBenchmarkJob`, `sagemaker:DescribeAIBenchmarkJob`, `sagemaker:ListAIBenchmarkJobs`, `sagemaker:CreateAIWorkloadConfig`, `iam:PassRole`, `s3:GetObject` |
|
|
96
|
+
| `./do/train` | `sagemaker:CreateTrainingJob`, `sagemaker:DescribeTrainingJob`, `iam:PassRole`, `s3:GetObject`, `s3:PutObject` |
|
|
97
|
+
| `./do/tune` | `sagemaker:CreateTrainingJob`, `sagemaker:DescribeTrainingJob`, `iam:PassRole`, `s3:GetObject`, `s3:PutObject` |
|
|
98
|
+
| `./do/adapter` | `sagemaker:CreateInferenceComponent`, `sagemaker:UpdateInferenceComponent`, `sagemaker:DescribeInferenceComponent`, `s3:GetObject` |
|
|
99
|
+
| `./do/optimize` | `sagemaker:CreateModel`, `sagemaker:DescribeModel`, `s3:GetObject`, `s3:PutObject` |
|
|
100
|
+
| `./do/register` | `sagemaker:CreateModelPackage`, `sagemaker:CreateModelPackageGroup`, `sagemaker:DescribeModelPackage` |
|
|
101
|
+
| `./do/logs` | `logs:GetLogEvents`, `logs:FilterLogEvents`, `logs:DescribeLogStreams` |
|
|
102
|
+
| `./do/status` | `sagemaker:DescribeEndpoint`, `sagemaker:DescribeInferenceComponent`, `sagemaker:ListInferenceComponents` |
|
|
103
|
+
| `./do/clean` | `sagemaker:DeleteEndpoint`, `sagemaker:DeleteEndpointConfig`, `sagemaker:DeleteModel`, `sagemaker:DeleteInferenceComponent`, `codebuild:DeleteProject`, `iam:DeleteRole`, `iam:DeleteRolePolicy` |
|
|
104
|
+
| `./do/export` | Local only — reads config files |
|
|
105
|
+
| `./do/validate` | Local only — validates project structure |
|
|
106
|
+
| `./do/manifest` | Local only — generates deployment manifest |
|
|
107
|
+
| `bootstrap` | `cloudformation:*`, `iam:CreateRole`, `iam:PutRolePolicy`, `iam:TagRole`, `ecr:CreateRepository`, `s3:CreateBucket`, `sts:GetCallerIdentity` |
|
|
57
108
|
|
|
58
109
|
<% if (framework === 'transformers' && hfToken) { %>
|
|
59
110
|
## HuggingFace Token Security
|
|
@@ -1510,8 +1510,6 @@ def cmd_discover(args):
|
|
|
1510
1510
|
|
|
1511
1511
|
Returns: {"models": [str], "count": int}
|
|
1512
1512
|
"""
|
|
1513
|
-
import boto3
|
|
1514
|
-
|
|
1515
1513
|
region = args.region or os.environ.get('AWS_REGION', 'us-east-1')
|
|
1516
1514
|
|
|
1517
1515
|
family = args.family or ""
|
|
@@ -1528,6 +1526,11 @@ def cmd_discover(args):
|
|
|
1528
1526
|
if not prefix:
|
|
1529
1527
|
_error_exit("No family or filter provided for discovery")
|
|
1530
1528
|
|
|
1529
|
+
try:
|
|
1530
|
+
import boto3
|
|
1531
|
+
except ImportError:
|
|
1532
|
+
_error_exit("Hub discovery failed: boto3 is not installed. Install with: pip install boto3")
|
|
1533
|
+
|
|
1531
1534
|
try:
|
|
1532
1535
|
client = boto3.client("sagemaker", region_name=region)
|
|
1533
1536
|
models = []
|