@aws/ml-container-creator 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/bin/cli.js +57 -0
- package/config/agent.json +16 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
- package/package.json +5 -2
- package/pyproject.toml +3 -0
- package/servers/agent-knowledge/index.js +592 -0
- package/servers/agent-knowledge/package.json +15 -0
- package/servers/base-image-picker/index.js +65 -18
- package/servers/instance-sizer/index.js +32 -0
- package/servers/lib/catalogs/fleet-drivers.json +38 -0
- package/servers/lib/catalogs/model-arch-support.json +51 -0
- package/servers/lib/catalogs/model-servers.json +2842 -1730
- package/servers/lib/schemas/image-catalog.schema.json +12 -0
- package/src/agent/__init__.py +2 -0
- package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
- package/src/agent/agent.py +513 -0
- package/src/agent/config_loader.py +215 -0
- package/src/agent/context.py +380 -0
- package/src/agent/data/capability-matrix.json +106 -0
- package/src/agent/health_check.py +341 -0
- package/src/agent/prompts/system.md +173 -0
- package/src/agent/requirements-agent.txt +3 -0
- package/src/app.js +6 -4
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-query-runner.js +110 -3
- package/src/lib/prompt-runner.js +66 -22
- package/src/lib/template-variable-resolver.js +8 -0
- package/src/lib/train-config-builder.js +339 -0
- package/src/lib/tune-config-state.js +89 -68
- package/templates/do/.benchmark_writer.py +3 -0
- package/templates/do/.eval_helper.py +409 -0
- package/templates/do/.register_helper.py +185 -11
- package/templates/do/.train_build_request.py +102 -113
- package/templates/do/.train_helper.py +433 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +157 -0
- package/templates/do/benchmark +60 -3
- package/templates/do/config +6 -1
- package/templates/do/deploy.d/managed-inference.ejs +83 -0
- package/templates/do/evaluate +272 -0
- package/templates/do/lib/resolve-instance.sh +155 -0
- package/templates/do/register +5 -0
- package/templates/do/test +1 -0
- package/templates/do/train +879 -126
- package/templates/do/training/config.yaml +83 -11
- package/templates/do/training/dpo/accelerate_config.yaml +24 -0
- package/templates/do/training/dpo/defaults.yaml +26 -0
- package/templates/do/training/dpo/prompts.json +8 -0
- package/templates/do/training/dpo/train.py +363 -0
- package/templates/do/training/sft/accelerate_config.yaml +22 -0
- package/templates/do/training/sft/defaults.yaml +18 -0
- package/templates/do/training/sft/prompts.json +7 -0
- package/templates/do/training/sft/train.py +310 -0
- package/templates/do/tune +11 -2
- package/src/lib/auto-prompt-builder.js +0 -172
- package/src/lib/cli-handler.js +0 -529
- package/src/lib/community-reports-validator.js +0 -91
- package/src/lib/configuration-exporter.js +0 -204
- package/src/lib/dataset-slug.js +0 -152
- package/src/lib/docker-introspection-validator.js +0 -51
- package/src/lib/known-flags-validator.js +0 -200
- package/src/lib/schema-validator.js +0 -157
- package/src/lib/train-config-parser.js +0 -136
- package/src/lib/train-config-persistence.js +0 -143
- package/src/lib/train-config-validator.js +0 -112
- package/src/lib/train-feedback.js +0 -46
- package/src/lib/train-idempotency.js +0 -97
- package/src/lib/train-request-builder.js +0 -120
- package/src/lib/tune-dataset-validator.js +0 -279
- package/src/lib/tune-output-resolver.js +0 -66
- package/templates/do/.train_poll_parser.py +0 -135
- package/templates/do/.train_status_parser.py +0 -187
- /package/templates/do/training/{train.py → custom/train.py} +0 -0
package/README.md
CHANGED
|
@@ -74,6 +74,14 @@ ml-container-creator my-model \
|
|
|
74
74
|
./do/test # Test the endpoint
|
|
75
75
|
```
|
|
76
76
|
|
|
77
|
+
### Get help from the advisor
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
ml-container-creator hey # Conversational AI advisor (powered by Bedrock)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Ask questions about your project, get optimization recommendations, troubleshoot issues, and plan workflows. See [Agent docs](https://awslabs.github.io/ml-container-creator/agent/) for details.
|
|
84
|
+
|
|
77
85
|
## Documentation
|
|
78
86
|
|
|
79
87
|
Full documentation is available at [awslabs.github.io/ml-container-creator](https://awslabs.github.io/ml-container-creator/).
|
|
@@ -83,6 +91,7 @@ Full documentation is available at [awslabs.github.io/ml-container-creator](http
|
|
|
83
91
|
- [Deployment Guide](https://awslabs.github.io/ml-container-creator/deployments/) — All deployment targets and lifecycle scripts
|
|
84
92
|
- [CI Integration](https://awslabs.github.io/ml-container-creator/ci-integration/) — Automated lifecycle testing for all deployment configurations
|
|
85
93
|
- [Examples](https://awslabs.github.io/ml-container-creator/EXAMPLES/) — Framework-specific walkthroughs
|
|
94
|
+
- [Advisory Agent](https://awslabs.github.io/ml-container-creator/agent/) — Conversational AI advisor (`ml-container-creator hey`)
|
|
86
95
|
- [Troubleshooting](https://awslabs.github.io/ml-container-creator/TROUBLESHOOTING/) — Common issues and solutions
|
|
87
96
|
|
|
88
97
|
## Prerequisites
|
|
@@ -97,7 +106,7 @@ Full documentation is available at [awslabs.github.io/ml-container-creator](http
|
|
|
97
106
|
|
|
98
107
|
### Python dependencies
|
|
99
108
|
|
|
100
|
-
The `do/` lifecycle scripts (`do/tune`, `do/stage`, `do/adapter`) require Python packages. Install them in your Python environment before first use:
|
|
109
|
+
The `do/` lifecycle scripts (`do/tune`, `do/train`, `do/stage`, `do/adapter`) require Python packages. Install them in your Python environment before first use:
|
|
101
110
|
|
|
102
111
|
```bash
|
|
103
112
|
# Recommended (fast):
|
package/bin/cli.js
CHANGED
|
@@ -4,10 +4,15 @@
|
|
|
4
4
|
|
|
5
5
|
import { createRequire } from 'module';
|
|
6
6
|
import path from 'path';
|
|
7
|
+
import { fileURLToPath } from 'url';
|
|
8
|
+
import { spawn, execSync } from 'child_process';
|
|
7
9
|
import { program, Option, Help } from 'commander';
|
|
8
10
|
import { run } from '../src/app.js';
|
|
9
11
|
import { cliOptions, helpGroups } from '../src/lib/generated/cli-options.js';
|
|
10
12
|
|
|
13
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
14
|
+
const __dirname = path.dirname(__filename);
|
|
15
|
+
|
|
11
16
|
const require = createRequire(import.meta.url);
|
|
12
17
|
const { version } = require('../package.json');
|
|
13
18
|
|
|
@@ -314,4 +319,56 @@ program
|
|
|
314
319
|
}
|
|
315
320
|
});
|
|
316
321
|
|
|
322
|
+
program
|
|
323
|
+
.command('hey')
|
|
324
|
+
.description('Chat with the ml-container-creator advisor')
|
|
325
|
+
.option('--project-dir <dir>', 'Project directory to analyze', process.cwd())
|
|
326
|
+
.option('-o, --offline', 'Static reference mode (no Bedrock calls)')
|
|
327
|
+
.action(async (options) => {
|
|
328
|
+
// 1. Check python3 is available
|
|
329
|
+
try {
|
|
330
|
+
execSync('python3 --version', { stdio: 'ignore' });
|
|
331
|
+
} catch {
|
|
332
|
+
console.error('❌ python3 not found. Install Python 3.10+ to use the advisor.');
|
|
333
|
+
console.error(' macOS: brew install python3');
|
|
334
|
+
console.error(' Ubuntu: sudo apt install python3');
|
|
335
|
+
process.exit(1);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
// 2. If not offline, check strands-agents is installed
|
|
339
|
+
if (!options.offline) {
|
|
340
|
+
try {
|
|
341
|
+
execSync('python3 -c "import strands"', { stdio: 'ignore' });
|
|
342
|
+
} catch {
|
|
343
|
+
console.error('❌ strands-agents not installed. Run:');
|
|
344
|
+
console.error(' pip install -r src/agent/requirements-agent.txt');
|
|
345
|
+
process.exit(1);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
// 3. Resolve agent script path
|
|
350
|
+
const agentScript = path.join(__dirname, '..', 'src', 'agent', 'agent.py');
|
|
351
|
+
|
|
352
|
+
// 4. Build args and spawn
|
|
353
|
+
const args = [agentScript, '--project-dir', options.projectDir];
|
|
354
|
+
if (options.offline) {
|
|
355
|
+
args.push('--offline');
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
const child = spawn('python3', args, {
|
|
359
|
+
stdio: 'inherit',
|
|
360
|
+
env: { ...process.env, PYTHONUNBUFFERED: '1' }
|
|
361
|
+
});
|
|
362
|
+
|
|
363
|
+
// 5. Forward exit code
|
|
364
|
+
child.on('close', (code) => {
|
|
365
|
+
process.exit(code ?? 0);
|
|
366
|
+
});
|
|
367
|
+
|
|
368
|
+
child.on('error', (err) => {
|
|
369
|
+
console.error(`❌ Failed to start agent: ${err.message}`);
|
|
370
|
+
process.exit(1);
|
|
371
|
+
});
|
|
372
|
+
});
|
|
373
|
+
|
|
317
374
|
program.parse();
|
package/config/agent.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"modelId": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
|
|
3
|
+
"mcpServers": [
|
|
4
|
+
"instance-sizer",
|
|
5
|
+
"base-image-picker",
|
|
6
|
+
"model-picker",
|
|
7
|
+
"workload-picker",
|
|
8
|
+
"e2e-status",
|
|
9
|
+
"agent-knowledge"
|
|
10
|
+
],
|
|
11
|
+
"inputCostPer1k": 0.003,
|
|
12
|
+
"outputCostPer1k": 0.015,
|
|
13
|
+
"exitCommands": ["exit", "quit", "bye", "q"],
|
|
14
|
+
"reloadCommands": ["reload"],
|
|
15
|
+
"mcpServerTimeout": 30
|
|
16
|
+
}
|
package/infra/ci-harness/lib/ci-harness-stack.ts
CHANGED
|
@@ -1057,6 +1057,49 @@ export class MlccCiHarnessStack extends cdk.Stack {
|
|
|
1057
1057
|
glueTable.addDependency(glueDatabase);
|
|
1058
1058
|
glueTable.cfnOptions.condition = benchmarkInfraCondition;
|
|
1059
1059
|
|
|
1060
|
+
// Glue Table: mlcc_evaluations — model quality evaluation results
|
|
1061
|
+
// Written by do/evaluate via .eval_helper.py eval-write subcommand.
|
|
1062
|
+
// Partitioned by model + adapter for efficient comparison queries.
|
|
1063
|
+
const evalGlueTable = new glue.CfnTable(this, 'EvaluationResultsTable', {
|
|
1064
|
+
catalogId: this.account,
|
|
1065
|
+
databaseName: 'mlcc_ci',
|
|
1066
|
+
tableInput: {
|
|
1067
|
+
name: 'mlcc_evaluations',
|
|
1068
|
+
tableType: 'EXTERNAL_TABLE',
|
|
1069
|
+
parameters: {
|
|
1070
|
+
'classification': 'json',
|
|
1071
|
+
},
|
|
1072
|
+
storageDescriptor: {
|
|
1073
|
+
columns: [
|
|
1074
|
+
{ name: 'project_name', type: 'string', comment: 'MCC project name' },
|
|
1075
|
+
{ name: 'model_name', type: 'string', comment: 'HuggingFace model ID' },
|
|
1076
|
+
{ name: 'adapter_name', type: 'string', comment: 'Adapter name or IC name' },
|
|
1077
|
+
{ name: 'technique', type: 'string', comment: 'Training technique (sft, dpo)' },
|
|
1078
|
+
{ name: 'eval_dataset', type: 'string', comment: 'Evaluation dataset URI or name' },
|
|
1079
|
+
{ name: 'samples_evaluated', type: 'int', comment: 'Number of samples evaluated' },
|
|
1080
|
+
{ name: 'metrics', type: 'string', comment: 'JSON blob of all computed metrics' },
|
|
1081
|
+
{ name: 'timestamp', type: 'string', comment: 'ISO 8601 UTC timestamp' },
|
|
1082
|
+
{ name: 'region', type: 'string', comment: 'AWS region' },
|
|
1083
|
+
],
|
|
1084
|
+
location: `s3://mlcc-benchmark-results-${this.account}-${this.region}/evaluations/`,
|
|
1085
|
+
inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
|
|
1086
|
+
outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
|
|
1087
|
+
serdeInfo: {
|
|
1088
|
+
serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
|
|
1089
|
+
parameters: {
|
|
1090
|
+
'serialization.format': '1',
|
|
1091
|
+
},
|
|
1092
|
+
},
|
|
1093
|
+
},
|
|
1094
|
+
partitionKeys: [
|
|
1095
|
+
{ name: 'model', type: 'string', comment: 'Model name (partition key)' },
|
|
1096
|
+
{ name: 'adapter', type: 'string', comment: 'Adapter name (partition key)' },
|
|
1097
|
+
],
|
|
1098
|
+
},
|
|
1099
|
+
});
|
|
1100
|
+
evalGlueTable.addDependency(glueDatabase);
|
|
1101
|
+
evalGlueTable.cfnOptions.condition = benchmarkInfraCondition;
|
|
1102
|
+
|
|
1060
1103
|
// Configurable lifecycle parameters for the benchmark results bucket
|
|
1061
1104
|
const benchmarkIaTransitionDays = new cdk.CfnParameter(this, 'BenchmarkIaTransitionDays', {
|
|
1062
1105
|
type: 'Number',
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@aws/ml-container-creator",
|
|
3
|
-
"version": "1.0.3",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Build and deploy custom ML containers on AWS SageMaker with minimal configuration.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -51,6 +51,8 @@
|
|
|
51
51
|
"servers/workload-picker/index.js",
|
|
52
52
|
"servers/workload-picker/manifest.json",
|
|
53
53
|
"servers/workload-picker/package.json",
|
|
54
|
+
"servers/agent-knowledge/index.js",
|
|
55
|
+
"servers/agent-knowledge/package.json",
|
|
54
56
|
"servers/lib/bedrock-client.js",
|
|
55
57
|
"servers/lib/custom-validators.js",
|
|
56
58
|
"servers/lib/dynamic-resolver.js",
|
|
@@ -61,6 +63,7 @@
|
|
|
61
63
|
"config/bootstrap-stack.json",
|
|
62
64
|
"config/bootstrap-e2e-stack.json",
|
|
63
65
|
"config/parameter-schema-v2.json",
|
|
66
|
+
"config/agent.json",
|
|
64
67
|
"config/tune-catalog.json",
|
|
65
68
|
"config/presets/",
|
|
66
69
|
"infra/ci-harness/bin/",
|
|
@@ -88,7 +91,7 @@
|
|
|
88
91
|
},
|
|
89
92
|
"scripts": {
|
|
90
93
|
"test": "mocha 'test/**/*.test.js' --ignore 'test/property/**' --recursive --timeout 30000 --parallel",
|
|
91
|
-
"test:property": "mocha 'test/property/**/*.test.js' --recursive --timeout 60000 --parallel",
|
|
94
|
+
"test:property": "NODE_OPTIONS='--max-old-space-size=8192' mocha 'test/property/**/*.test.js' --recursive --timeout 60000 --parallel --jobs 4",
|
|
92
95
|
"test:all": "npm run test && npm run test:property",
|
|
93
96
|
"test:fast": "mocha 'test/**/*.test.js' --recursive --timeout 15000 --parallel",
|
|
94
97
|
"test:unit": "mocha 'test/unit/**/*.test.js' --recursive --timeout 15000",
|