@aws/ml-container-creator 1.0.3 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +10 -1
  2. package/bin/cli.js +57 -0
  3. package/config/agent.json +16 -0
  4. package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
  5. package/package.json +5 -2
  6. package/pyproject.toml +3 -0
  7. package/servers/agent-knowledge/index.js +592 -0
  8. package/servers/agent-knowledge/package.json +15 -0
  9. package/servers/base-image-picker/index.js +65 -18
  10. package/servers/instance-sizer/index.js +32 -0
  11. package/servers/lib/catalogs/fleet-drivers.json +38 -0
  12. package/servers/lib/catalogs/model-arch-support.json +51 -0
  13. package/servers/lib/catalogs/model-servers.json +2842 -1730
  14. package/servers/lib/schemas/image-catalog.schema.json +12 -0
  15. package/src/agent/__init__.py +2 -0
  16. package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
  17. package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
  18. package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
  19. package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
  20. package/src/agent/agent.py +513 -0
  21. package/src/agent/config_loader.py +215 -0
  22. package/src/agent/context.py +380 -0
  23. package/src/agent/data/capability-matrix.json +106 -0
  24. package/src/agent/health_check.py +341 -0
  25. package/src/agent/prompts/system.md +173 -0
  26. package/src/agent/requirements-agent.txt +3 -0
  27. package/src/app.js +6 -4
  28. package/src/lib/generated/cli-options.js +1 -1
  29. package/src/lib/generated/parameter-matrix.js +1 -1
  30. package/src/lib/generated/validation-rules.js +1 -1
  31. package/src/lib/mcp-query-runner.js +110 -3
  32. package/src/lib/prompt-runner.js +66 -22
  33. package/src/lib/template-variable-resolver.js +8 -0
  34. package/src/lib/train-config-builder.js +339 -0
  35. package/src/lib/tune-config-state.js +89 -68
  36. package/templates/do/.benchmark_writer.py +3 -0
  37. package/templates/do/.eval_helper.py +409 -0
  38. package/templates/do/.register_helper.py +185 -11
  39. package/templates/do/.train_build_request.py +102 -113
  40. package/templates/do/.train_helper.py +433 -0
  41. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  42. package/templates/do/adapter +157 -0
  43. package/templates/do/benchmark +60 -3
  44. package/templates/do/config +6 -1
  45. package/templates/do/deploy.d/managed-inference.ejs +83 -0
  46. package/templates/do/evaluate +272 -0
  47. package/templates/do/lib/resolve-instance.sh +155 -0
  48. package/templates/do/register +5 -0
  49. package/templates/do/test +1 -0
  50. package/templates/do/train +879 -126
  51. package/templates/do/training/config.yaml +83 -11
  52. package/templates/do/training/dpo/accelerate_config.yaml +24 -0
  53. package/templates/do/training/dpo/defaults.yaml +26 -0
  54. package/templates/do/training/dpo/prompts.json +8 -0
  55. package/templates/do/training/dpo/train.py +363 -0
  56. package/templates/do/training/sft/accelerate_config.yaml +22 -0
  57. package/templates/do/training/sft/defaults.yaml +18 -0
  58. package/templates/do/training/sft/prompts.json +7 -0
  59. package/templates/do/training/sft/train.py +310 -0
  60. package/templates/do/tune +11 -2
  61. package/src/lib/auto-prompt-builder.js +0 -172
  62. package/src/lib/cli-handler.js +0 -529
  63. package/src/lib/community-reports-validator.js +0 -91
  64. package/src/lib/configuration-exporter.js +0 -204
  65. package/src/lib/dataset-slug.js +0 -152
  66. package/src/lib/docker-introspection-validator.js +0 -51
  67. package/src/lib/known-flags-validator.js +0 -200
  68. package/src/lib/schema-validator.js +0 -157
  69. package/src/lib/train-config-parser.js +0 -136
  70. package/src/lib/train-config-persistence.js +0 -143
  71. package/src/lib/train-config-validator.js +0 -112
  72. package/src/lib/train-feedback.js +0 -46
  73. package/src/lib/train-idempotency.js +0 -97
  74. package/src/lib/train-request-builder.js +0 -120
  75. package/src/lib/tune-dataset-validator.js +0 -279
  76. package/src/lib/tune-output-resolver.js +0 -66
  77. package/templates/do/.train_poll_parser.py +0 -135
  78. package/templates/do/.train_status_parser.py +0 -187
  79. /package/templates/do/training/{train.py → custom/train.py} +0 -0
package/README.md CHANGED
@@ -74,6 +74,14 @@ ml-container-creator my-model \
74
74
  ./do/test # Test the endpoint
75
75
  ```
76
76
 
77
+ ### Get help from the advisor
78
+
79
+ ```bash
80
+ ml-container-creator hey # Conversational AI advisor (powered by Bedrock)
81
+ ```
82
+
83
+ Ask questions about your project, get optimization recommendations, troubleshoot issues, and plan workflows. See [Agent docs](https://awslabs.github.io/ml-container-creator/agent/) for details.
84
+
77
85
  ## Documentation
78
86
 
79
87
  Full documentation is available at [awslabs.github.io/ml-container-creator](https://awslabs.github.io/ml-container-creator/).
@@ -83,6 +91,7 @@ Full documentation is available at [awslabs.github.io/ml-container-creator](http
83
91
  - [Deployment Guide](https://awslabs.github.io/ml-container-creator/deployments/) — All deployment targets and lifecycle scripts
84
92
  - [CI Integration](https://awslabs.github.io/ml-container-creator/ci-integration/) — Automated lifecycle testing for all deployment configurations
85
93
  - [Examples](https://awslabs.github.io/ml-container-creator/EXAMPLES/) — Framework-specific walkthroughs
94
+ - [Advisory Agent](https://awslabs.github.io/ml-container-creator/agent/) — Conversational AI advisor (`ml-container-creator hey`)
86
95
  - [Troubleshooting](https://awslabs.github.io/ml-container-creator/TROUBLESHOOTING/) — Common issues and solutions
87
96
 
88
97
  ## Prerequisites
@@ -97,7 +106,7 @@ Full documentation is available at [awslabs.github.io/ml-container-creator](http
97
106
 
98
107
  ### Python dependencies
99
108
 
100
- The `do/` lifecycle scripts (`do/tune`, `do/stage`, `do/adapter`) require Python packages. Install them in your Python environment before first use:
109
+ The `do/` lifecycle scripts (`do/tune`, `do/train`, `do/stage`, `do/adapter`) require Python packages. Install them in your Python environment before first use:
101
110
 
102
111
  ```bash
103
112
  # Recommended (fast):
package/bin/cli.js CHANGED
@@ -4,10 +4,15 @@
4
4
 
5
5
  import { createRequire } from 'module';
6
6
  import path from 'path';
7
+ import { fileURLToPath } from 'url';
8
+ import { spawn, execSync } from 'child_process';
7
9
  import { program, Option, Help } from 'commander';
8
10
  import { run } from '../src/app.js';
9
11
  import { cliOptions, helpGroups } from '../src/lib/generated/cli-options.js';
10
12
 
13
+ const __filename = fileURLToPath(import.meta.url);
14
+ const __dirname = path.dirname(__filename);
15
+
11
16
  const require = createRequire(import.meta.url);
12
17
  const { version } = require('../package.json');
13
18
 
@@ -314,4 +319,56 @@ program
314
319
  }
315
320
  });
316
321
 
322
+ program
323
+ .command('hey')
324
+ .description('Chat with the ml-container-creator advisor')
325
+ .option('--project-dir <dir>', 'Project directory to analyze', process.cwd())
326
+ .option('-o, --offline', 'Static reference mode (no Bedrock calls)')
327
+ .action(async (options) => {
328
+ // 1. Check python3 is available
329
+ try {
330
+ execSync('python3 --version', { stdio: 'ignore' });
331
+ } catch {
332
+ console.error('❌ python3 not found. Install Python 3.10+ to use the advisor.');
333
+ console.error(' macOS: brew install python3');
334
+ console.error(' Ubuntu: sudo apt install python3');
335
+ process.exit(1);
336
+ }
337
+
338
+ // 2. If not offline, check strands-agents is installed
339
+ if (!options.offline) {
340
+ try {
341
+ execSync('python3 -c "import strands"', { stdio: 'ignore' });
342
+ } catch {
343
+ console.error('❌ strands-agents not installed. Run:');
344
+ console.error(' pip install -r src/agent/requirements-agent.txt');
345
+ process.exit(1);
346
+ }
347
+ }
348
+
349
+ // 3. Resolve agent script path
350
+ const agentScript = path.join(__dirname, '..', 'src', 'agent', 'agent.py');
351
+
352
+ // 4. Build args and spawn
353
+ const args = [agentScript, '--project-dir', options.projectDir];
354
+ if (options.offline) {
355
+ args.push('--offline');
356
+ }
357
+
358
+ const child = spawn('python3', args, {
359
+ stdio: 'inherit',
360
+ env: { ...process.env, PYTHONUNBUFFERED: '1' }
361
+ });
362
+
363
+ // 5. Forward exit code
364
+ child.on('close', (code) => {
365
+ process.exit(code ?? 0);
366
+ });
367
+
368
+ child.on('error', (err) => {
369
+ console.error(`❌ Failed to start agent: ${err.message}`);
370
+ process.exit(1);
371
+ });
372
+ });
373
+
317
374
  program.parse();
package/config/agent.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "modelId": "global.anthropic.claude-sonnet-4-5-20250929-v1:0",
3
+ "mcpServers": [
4
+ "instance-sizer",
5
+ "base-image-picker",
6
+ "model-picker",
7
+ "workload-picker",
8
+ "e2e-status",
9
+ "agent-knowledge"
10
+ ],
11
+ "inputCostPer1k": 0.003,
12
+ "outputCostPer1k": 0.015,
13
+ "exitCommands": ["exit", "quit", "bye", "q"],
14
+ "reloadCommands": ["reload"],
15
+ "mcpServerTimeout": 30
16
+ }
package/infra/ci-harness/lib/ci-harness-stack.ts CHANGED
@@ -1057,6 +1057,49 @@ export class MlccCiHarnessStack extends cdk.Stack {
1057
1057
  glueTable.addDependency(glueDatabase);
1058
1058
  glueTable.cfnOptions.condition = benchmarkInfraCondition;
1059
1059
 
1060
+ // Glue Table: mlcc_evaluations — model quality evaluation results
1061
+ // Written by do/evaluate via .eval_helper.py eval-write subcommand.
1062
+ // Partitioned by model + adapter for efficient comparison queries.
1063
+ const evalGlueTable = new glue.CfnTable(this, 'EvaluationResultsTable', {
1064
+ catalogId: this.account,
1065
+ databaseName: 'mlcc_ci',
1066
+ tableInput: {
1067
+ name: 'mlcc_evaluations',
1068
+ tableType: 'EXTERNAL_TABLE',
1069
+ parameters: {
1070
+ 'classification': 'json',
1071
+ },
1072
+ storageDescriptor: {
1073
+ columns: [
1074
+ { name: 'project_name', type: 'string', comment: 'MCC project name' },
1075
+ { name: 'model_name', type: 'string', comment: 'HuggingFace model ID' },
1076
+ { name: 'adapter_name', type: 'string', comment: 'Adapter name or IC name' },
1077
+ { name: 'technique', type: 'string', comment: 'Training technique (sft, dpo)' },
1078
+ { name: 'eval_dataset', type: 'string', comment: 'Evaluation dataset URI or name' },
1079
+ { name: 'samples_evaluated', type: 'int', comment: 'Number of samples evaluated' },
1080
+ { name: 'metrics', type: 'string', comment: 'JSON blob of all computed metrics' },
1081
+ { name: 'timestamp', type: 'string', comment: 'ISO 8601 UTC timestamp' },
1082
+ { name: 'region', type: 'string', comment: 'AWS region' },
1083
+ ],
1084
+ location: `s3://mlcc-benchmark-results-${this.account}-${this.region}/evaluations/`,
1085
+ inputFormat: 'org.apache.hadoop.mapred.TextInputFormat',
1086
+ outputFormat: 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
1087
+ serdeInfo: {
1088
+ serializationLibrary: 'org.openx.data.jsonserde.JsonSerDe',
1089
+ parameters: {
1090
+ 'serialization.format': '1',
1091
+ },
1092
+ },
1093
+ },
1094
+ partitionKeys: [
1095
+ { name: 'model', type: 'string', comment: 'Model name (partition key)' },
1096
+ { name: 'adapter', type: 'string', comment: 'Adapter name (partition key)' },
1097
+ ],
1098
+ },
1099
+ });
1100
+ evalGlueTable.addDependency(glueDatabase);
1101
+ evalGlueTable.cfnOptions.condition = benchmarkInfraCondition;
1102
+
1060
1103
  // Configurable lifecycle parameters for the benchmark results bucket
1061
1104
  const benchmarkIaTransitionDays = new cdk.CfnParameter(this, 'BenchmarkIaTransitionDays', {
1062
1105
  type: 'Number',
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aws/ml-container-creator",
3
- "version": "1.0.3",
3
+ "version": "1.1.0",
4
4
  "description": "Build and deploy custom ML containers on AWS SageMaker with minimal configuration.",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -51,6 +51,8 @@
51
51
  "servers/workload-picker/index.js",
52
52
  "servers/workload-picker/manifest.json",
53
53
  "servers/workload-picker/package.json",
54
+ "servers/agent-knowledge/index.js",
55
+ "servers/agent-knowledge/package.json",
54
56
  "servers/lib/bedrock-client.js",
55
57
  "servers/lib/custom-validators.js",
56
58
  "servers/lib/dynamic-resolver.js",
@@ -61,6 +63,7 @@
61
63
  "config/bootstrap-stack.json",
62
64
  "config/bootstrap-e2e-stack.json",
63
65
  "config/parameter-schema-v2.json",
66
+ "config/agent.json",
64
67
  "config/tune-catalog.json",
65
68
  "config/presets/",
66
69
  "infra/ci-harness/bin/",
@@ -88,7 +91,7 @@
88
91
  },
89
92
  "scripts": {
90
93
  "test": "mocha 'test/**/*.test.js' --ignore 'test/property/**' --recursive --timeout 30000 --parallel",
91
- "test:property": "mocha 'test/property/**/*.test.js' --recursive --timeout 60000 --parallel",
94
+ "test:property": "NODE_OPTIONS='--max-old-space-size=8192' mocha 'test/property/**/*.test.js' --recursive --timeout 60000 --parallel --jobs 4",
92
95
  "test:all": "npm run test && npm run test:property",
93
96
  "test:fast": "mocha 'test/**/*.test.js' --recursive --timeout 15000 --parallel",
94
97
  "test:unit": "mocha 'test/unit/**/*.test.js' --recursive --timeout 15000",
package/pyproject.toml CHANGED
@@ -15,6 +15,9 @@ dependencies = [
15
15
  "pyyaml>=6.0",
16
16
  ]
17
17
 
18
+ [tool.pytest.ini_options]
19
+ addopts = "--import-mode=importlib"
20
+
18
21
  [dependency-groups]
19
22
  dev = [
20
23
  "pytest>=8.0",