@aws/ml-container-creator 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/bin/cli.js +57 -0
- package/config/agent.json +16 -0
- package/infra/ci-harness/lib/ci-harness-stack.ts +43 -0
- package/package.json +5 -2
- package/pyproject.toml +3 -0
- package/servers/agent-knowledge/index.js +592 -0
- package/servers/agent-knowledge/package.json +15 -0
- package/servers/base-image-picker/index.js +65 -18
- package/servers/instance-sizer/index.js +32 -0
- package/servers/lib/catalogs/fleet-drivers.json +38 -0
- package/servers/lib/catalogs/model-arch-support.json +51 -0
- package/servers/lib/catalogs/model-servers.json +2842 -1730
- package/servers/lib/schemas/image-catalog.schema.json +12 -0
- package/src/agent/__init__.py +2 -0
- package/src/agent/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/config_loader.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/context.cpython-312.pyc +0 -0
- package/src/agent/__pycache__/health_check.cpython-312.pyc +0 -0
- package/src/agent/agent.py +513 -0
- package/src/agent/config_loader.py +215 -0
- package/src/agent/context.py +380 -0
- package/src/agent/data/capability-matrix.json +106 -0
- package/src/agent/health_check.py +341 -0
- package/src/agent/prompts/system.md +173 -0
- package/src/agent/requirements-agent.txt +3 -0
- package/src/app.js +6 -4
- package/src/lib/generated/cli-options.js +1 -1
- package/src/lib/generated/parameter-matrix.js +1 -1
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-query-runner.js +110 -3
- package/src/lib/prompt-runner.js +66 -22
- package/src/lib/template-variable-resolver.js +8 -0
- package/src/lib/train-config-builder.js +339 -0
- package/src/lib/tune-config-state.js +89 -68
- package/templates/do/.benchmark_writer.py +3 -0
- package/templates/do/.eval_helper.py +409 -0
- package/templates/do/.register_helper.py +185 -11
- package/templates/do/.train_build_request.py +102 -113
- package/templates/do/.train_helper.py +433 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +157 -0
- package/templates/do/benchmark +60 -3
- package/templates/do/config +6 -1
- package/templates/do/deploy.d/managed-inference.ejs +83 -0
- package/templates/do/evaluate +272 -0
- package/templates/do/lib/resolve-instance.sh +155 -0
- package/templates/do/register +5 -0
- package/templates/do/test +1 -0
- package/templates/do/train +879 -126
- package/templates/do/training/config.yaml +83 -11
- package/templates/do/training/dpo/accelerate_config.yaml +24 -0
- package/templates/do/training/dpo/defaults.yaml +26 -0
- package/templates/do/training/dpo/prompts.json +8 -0
- package/templates/do/training/dpo/train.py +363 -0
- package/templates/do/training/sft/accelerate_config.yaml +22 -0
- package/templates/do/training/sft/defaults.yaml +18 -0
- package/templates/do/training/sft/prompts.json +7 -0
- package/templates/do/training/sft/train.py +310 -0
- package/templates/do/tune +11 -2
- package/src/lib/auto-prompt-builder.js +0 -172
- package/src/lib/cli-handler.js +0 -529
- package/src/lib/community-reports-validator.js +0 -91
- package/src/lib/configuration-exporter.js +0 -204
- package/src/lib/dataset-slug.js +0 -152
- package/src/lib/docker-introspection-validator.js +0 -51
- package/src/lib/known-flags-validator.js +0 -200
- package/src/lib/schema-validator.js +0 -157
- package/src/lib/train-config-parser.js +0 -136
- package/src/lib/train-config-persistence.js +0 -143
- package/src/lib/train-config-validator.js +0 -112
- package/src/lib/train-feedback.js +0 -46
- package/src/lib/train-idempotency.js +0 -97
- package/src/lib/train-request-builder.js +0 -120
- package/src/lib/tune-dataset-validator.js +0 -279
- package/src/lib/tune-output-resolver.js +0 -66
- package/templates/do/.train_poll_parser.py +0 -135
- package/templates/do/.train_status_parser.py +0 -187
- /package/templates/do/training/{train.py → custom/train.py} +0 -0
|
@@ -1,157 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Schema Validator
|
|
3
|
-
*
|
|
4
|
-
* Validates registry data against JSON schemas without external dependencies.
|
|
5
|
-
* Uses a simple validation approach suitable for the registry structure.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
export default class SchemaValidator {
    /**
     * Validate data against a schema.
     *
     * Supported keywords: `type` (a single name or an array of names,
     * including `integer`), `enum`, `required`, `properties`,
     * `patternProperties`, `minItems`, `maxItems`, `items`, `minLength`,
     * `maxLength`, `pattern`, `minimum` and `maximum`. Anything else in the
     * schema is ignored.
     *
     * Exceptions raised while validating (for example an invalid regular
     * expression in the schema, or a null schema) are not propagated; they
     * are reported as a `Validation error: ...` entry instead.
     *
     * @param {*} data - Data to validate
     * @param {Object} schema - JSON schema
     * @returns {{ valid: boolean, errors: string[] }} Validation outcome
     */
    validate(data, schema) {
        const errors = [];

        try {
            this._validateType(data, schema, '', errors);
        } catch (error) {
            errors.push(`Validation error: ${error.message}`);
        }

        return { valid: errors.length === 0, errors };
    }

    /**
     * Validate one value against one (sub)schema, appending messages to `errors`.
     *
     * A type or enum mismatch is reported once and stops further checks for
     * this value. Type-specific keywords are only applied when the schema
     * declares a type matching the value's kind.
     *
     * @param {*} data - Value being validated
     * @param {Object} schema - Schema for this value
     * @param {string} path - Location of the value ('' for the root)
     * @param {string[]} errors - Accumulator for error messages
     */
    _validateType(data, schema, path, errors) {
        const where = path || 'root';
        const dataType = this._getType(data);
        let declaredTypes = [];

        if (schema.type) {
            declaredTypes = Array.isArray(schema.type) ? schema.type : [schema.type];

            if (!declaredTypes.some((typeName) => this._matchesType(data, typeName))) {
                errors.push(`${where}: Expected type ${declaredTypes.join(' or ')}, got ${dataType}`);
                return;
            }
        }

        if (schema.enum && !schema.enum.includes(data)) {
            errors.push(`${where}: Value must be one of ${schema.enum.join(', ')}`);
            return;
        }

        // Dispatch on the value's kind. Checking membership in declaredTypes
        // (instead of comparing schema.type to a single string) keeps the
        // constraints active for union types such as ['string', 'null'].
        const declares = (typeName) => declaredTypes.includes(typeName);

        switch (dataType) {
            case 'object':
                if (declares('object')) {
                    this._validateObject(data, schema, path, errors);
                }
                break;
            case 'array':
                if (declares('array')) {
                    this._validateArray(data, schema, path, errors);
                }
                break;
            case 'string':
                if (declares('string')) {
                    this._validateString(data, schema, path, errors);
                }
                break;
            case 'number':
                // 'integer' values are numbers too, so range checks apply to both.
                if (declares('number') || declares('integer')) {
                    this._validateNumber(data, schema, path, errors);
                }
                break;
            default:
                break;
        }
    }

    /**
     * Check `required`, `properties` and `patternProperties` for an object value.
     *
     * @param {Object} data - Object being validated
     * @param {Object} schema - Schema for this object
     * @param {string} path - Location of the object ('' for the root)
     * @param {string[]} errors - Accumulator for error messages
     */
    _validateObject(data, schema, path, errors) {
        if (schema.required) {
            for (const requiredProp of schema.required) {
                if (!(requiredProp in data)) {
                    errors.push(`${path || 'root'}: Missing required property '${requiredProp}'`);
                }
            }
        }

        const entries = Object.entries(data);

        if (schema.properties) {
            for (const [key, value] of entries) {
                if (schema.properties[key]) {
                    this._validateType(value, schema.properties[key], `${path}.${key}`, errors);
                }
            }
        }

        // Compile each pattern once rather than once per (key, pattern) pair.
        // Skipped for empty objects so an unused pattern is never compiled.
        if (schema.patternProperties && entries.length > 0) {
            const compiled = Object.entries(schema.patternProperties).map(
                ([pattern, propSchema]) => [new RegExp(pattern), propSchema]
            );

            for (const [key, value] of entries) {
                for (const [regex, propSchema] of compiled) {
                    if (regex.test(key)) {
                        this._validateType(value, propSchema, `${path}.${key}`, errors);
                    }
                }
            }
        }
    }

    /**
     * Check `minItems`, `maxItems` and `items` for an array value.
     *
     * @param {Array} data - Array being validated
     * @param {Object} schema - Schema for this array
     * @param {string} path - Location of the array ('' for the root)
     * @param {string[]} errors - Accumulator for error messages
     */
    _validateArray(data, schema, path, errors) {
        if (schema.minItems !== undefined && data.length < schema.minItems) {
            errors.push(`${path || 'root'}: Array must have at least ${schema.minItems} items`);
        }

        if (schema.maxItems !== undefined && data.length > schema.maxItems) {
            errors.push(`${path || 'root'}: Array must have at most ${schema.maxItems} items`);
        }

        if (schema.items) {
            data.forEach((item, index) => {
                this._validateType(item, schema.items, `${path}[${index}]`, errors);
            });
        }
    }

    /**
     * Check `minLength`, `maxLength` and `pattern` for a string value.
     *
     * @param {string} data - String being validated
     * @param {Object} schema - Schema for this string
     * @param {string} path - Location of the string ('' for the root)
     * @param {string[]} errors - Accumulator for error messages
     */
    _validateString(data, schema, path, errors) {
        if (schema.minLength !== undefined && data.length < schema.minLength) {
            errors.push(`${path || 'root'}: String must be at least ${schema.minLength} characters`);
        }

        if (schema.maxLength !== undefined && data.length > schema.maxLength) {
            errors.push(`${path || 'root'}: String must be at most ${schema.maxLength} characters`);
        }

        if (schema.pattern) {
            const regex = new RegExp(schema.pattern);
            if (!regex.test(data)) {
                errors.push(`${path || 'root'}: String does not match pattern ${schema.pattern}`);
            }
        }
    }

    /**
     * Check `minimum` and `maximum` for a numeric value.
     *
     * @param {number} data - Number being validated
     * @param {Object} schema - Schema for this number
     * @param {string} path - Location of the number ('' for the root)
     * @param {string[]} errors - Accumulator for error messages
     */
    _validateNumber(data, schema, path, errors) {
        if (schema.minimum !== undefined && data < schema.minimum) {
            errors.push(`${path || 'root'}: Number must be at least ${schema.minimum}`);
        }

        if (schema.maximum !== undefined && data > schema.maximum) {
            errors.push(`${path || 'root'}: Number must be at most ${schema.maximum}`);
        }
    }

    /**
     * Tell whether a value satisfies a single schema type name.
     *
     * `integer` has no JavaScript runtime type of its own, so it is handled
     * here: any number with no fractional part counts as an integer.
     *
     * @param {*} value - Value to test
     * @param {string} typeName - Schema type name
     * @returns {boolean} True when the value is of that type
     */
    _matchesType(value, typeName) {
        if (typeName === 'integer') {
            return typeof value === 'number' && Number.isInteger(value);
        }
        return this._getType(value) === typeName;
    }

    /**
     * Name the kind of a value: 'null', 'array', or the `typeof` result.
     *
     * @param {*} value - Value to classify
     * @returns {string} Kind name
     */
    _getType(value) {
        if (value === null) return 'null';
        if (Array.isArray(value)) return 'array';
        return typeof value;
    }
}
|
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
-
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Train Config Parser
|
|
6
|
-
*
|
|
7
|
-
* JavaScript module that replicates the YAML config parsing logic from
|
|
8
|
-
* do/train's _parse_config_python() function. Parses do/training/config.yaml
|
|
9
|
-
* and extracts all supported fields into a structured object.
|
|
10
|
-
*
|
|
11
|
-
* This module mirrors the behavior of both the yq and Python fallback paths
|
|
12
|
-
* in the bash script, providing a testable implementation of the parsing logic.
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
import { readFileSync } from 'node:fs';
|
|
16
|
-
import yaml from 'js-yaml';
|
|
17
|
-
|
|
18
|
-
/**
 * Fallback values applied when an optional training-config field is absent.
 * Scalars are kept as strings; the four complex fields default to empty
 * containers.
 */
const DEFAULTS = {
    // Scalar defaults (string form)
    instance_count: '1',
    max_runtime_seconds: '86400',
    volume_size_gb: '50',
    enable_spot: 'false',
    max_wait_seconds: '172800',
    checkpoint_path: '',

    // Complex defaults
    hyperparameters: {},
    metric_definitions: [],
    environment: {},
    tags: {}
};
|
|
33
|
-
|
|
34
|
-
/**
 * Render a config value as a string.
 *
 * null/undefined yield the supplied default, booleans become the literal
 * words "true"/"false", and everything else goes through String().
 *
 * @param {*} val - The value to convert
 * @param {string} defaultVal - Returned when val is null or undefined
 * @returns {string} String representation
 */
function toStringValue(val, defaultVal = '') {
    const isMissing = val === undefined || val === null;
    if (isMissing) {
        return defaultVal;
    }

    if (typeof val !== 'boolean') {
        return String(val);
    }

    return val ? 'true' : 'false';
}
|
|
51
|
-
|
|
52
|
-
/**
 * Load a training config YAML file from disk and extract its supported fields.
 *
 * The file is read as UTF-8 and handed to parseTrainingConfigFromString(),
 * which performs the actual field extraction.
 *
 * @param {string} configPath - Path to the YAML config file
 * @returns {object} Parsed config with all supported fields
 * @throws {Error} If the file cannot be read or parsed
 */
export function parseTrainingConfig(configPath) {
    return parseTrainingConfigFromString(readFileSync(configPath, 'utf8'));
}
|
|
70
|
-
|
|
71
|
-
/**
 * Parse a training config from a YAML string.
 * Useful for testing without file I/O.
 *
 * Scalar fields are returned as strings (booleans as "true"/"false"), with
 * defaults filled in for missing optional fields. Complex fields
 * (hyperparameters, metric_definitions, environment, tags) keep their native
 * object/array form.
 *
 * When a complex field is missing, a fresh copy of the default container is
 * returned, so callers may mutate the result without affecting the defaults
 * used by later calls.
 *
 * @param {string} yamlContent - YAML content string
 * @returns {object} Parsed config with all supported fields
 * @throws {Error} If the YAML cannot be parsed
 */
export function parseTrainingConfigFromString(yamlContent) {
    // An empty document loads as undefined/null; treat it as an empty config.
    const cfg = yaml.load(yamlContent) || {};

    return {
        // Required fields (empty string if missing)
        image: toStringValue(cfg.image, ''),
        script: toStringValue(cfg.script, ''),
        instance_type: toStringValue(cfg.instance_type, ''),
        instance_count: toStringValue(cfg.instance_count, DEFAULTS.instance_count),
        dataset: toStringValue(cfg.dataset, ''),
        output_path: toStringValue(cfg.output_path, ''),

        // Optional scalar fields with defaults
        max_runtime_seconds: toStringValue(cfg.max_runtime_seconds, DEFAULTS.max_runtime_seconds),
        volume_size_gb: toStringValue(cfg.volume_size_gb, DEFAULTS.volume_size_gb),
        enable_spot: toStringValue(cfg.enable_spot, DEFAULTS.enable_spot),
        max_wait_seconds: toStringValue(cfg.max_wait_seconds, DEFAULTS.max_wait_seconds),
        checkpoint_path: toStringValue(cfg.checkpoint_path, DEFAULTS.checkpoint_path),

        // Complex fields (objects/arrays). Defaults are copied, never shared.
        hyperparameters: cfg.hyperparameters || { ...DEFAULTS.hyperparameters },
        metric_definitions: cfg.metric_definitions || [...DEFAULTS.metric_definitions],
        environment: cfg.environment || { ...DEFAULTS.environment },
        tags: cfg.tags || { ...DEFAULTS.tags }
    };
}
|
|
105
|
-
|
|
106
|
-
/**
 * Every field name recognised in the training config, in output order.
 */
export const SUPPORTED_FIELDS = [
    // Core job description
    'image', 'script', 'instance_type', 'instance_count', 'dataset', 'output_path',
    // Optional scalars
    'max_runtime_seconds', 'volume_size_gb', 'enable_spot', 'max_wait_seconds', 'checkpoint_path',
    // Complex values
    'hyperparameters', 'metric_definitions', 'environment', 'tags'
];

/**
 * Field names that must be present with a non-empty value.
 */
export const REQUIRED_FIELDS = ['image', 'script', 'instance_type', 'dataset', 'output_path'];
|
|
@@ -1,143 +0,0 @@
|
|
|
1
|
-
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
-
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Train Config Persistence
|
|
6
|
-
*
|
|
7
|
-
* JavaScript module that models the config persistence logic from the bash
|
|
8
|
-
* `_update_config_var` function in `templates/do/train`. This module provides
|
|
9
|
-
* a pure JavaScript implementation for property-based testing of the config
|
|
10
|
-
* persistence behavior after job submission and completion.
|
|
11
|
-
*
|
|
12
|
-
* The config file uses the format:
|
|
13
|
-
* export VAR_NAME="value"
|
|
14
|
-
*
|
|
15
|
-
* Behavior:
|
|
16
|
-
* - If the variable already exists: update its value in-place
|
|
17
|
-
* - If the variable doesn't exist: append it to the end
|
|
18
|
-
* - Existing variables in the config are preserved
|
|
19
|
-
*
|
|
20
|
-
* Requirements: 3.4, 5.1
|
|
21
|
-
*/
|
|
22
|
-
|
|
23
|
-
import { readFileSync, writeFileSync } from 'node:fs';
|
|
24
|
-
|
|
25
|
-
/**
 * Update or add a config variable in a do/config-style file.
 *
 * Lines have the form `export VAR_NAME="value"`. If a line for `varName`
 * already exists, the first such line is rewritten in place; otherwise a new
 * line is appended (adding a newline first when the content does not already
 * end with one). All other content is preserved.
 *
 * `varName` is matched literally, and `varValue` is inserted verbatim:
 * sequences such as `$&` or `$1` in the value are NOT treated as
 * replacement patterns.
 *
 * @param {string} configContent - Current content of the config file
 * @param {string} varName - Variable name (e.g., TRAIN_JOB_NAME)
 * @param {string} varValue - Variable value
 * @returns {string} Updated config content
 */
export function updateConfigVar(configContent, varName, varValue) {
    // Escape regex metacharacters so the name can only match itself.
    const escapedName = String(varName).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`^export ${escapedName}=.*$`, 'm');
    const exportLine = `export ${varName}="${varValue}"`;

    if (pattern.test(configContent)) {
        // A replacer function bypasses `$`-pattern expansion in the value.
        return configContent.replace(pattern, () => exportLine);
    }

    const needsNewline = configContent.length > 0 && !configContent.endsWith('\n');
    return `${configContent}${needsNewline ? '\n' : ''}${exportLine}\n`;
}
|
|
57
|
-
|
|
58
|
-
/**
 * Read a config variable value from a do/config-style file content.
 *
 * Looks for the first line of the form `export VAR_NAME="value"` and returns
 * the text between the double quotes. `varName` is matched literally, so
 * regex metacharacters in the name cannot match other variables.
 *
 * @param {string} configContent - Content of the config file
 * @param {string} varName - Variable name to read
 * @returns {string|null} The variable value, or null if not found
 */
export function readConfigVar(configContent, varName) {
    const escapedName = String(varName).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`^export ${escapedName}="([^"]*)"`, 'm');
    const match = configContent.match(pattern);
    return match ? match[1] : null;
}
|
|
70
|
-
|
|
71
|
-
/**
 * Record a submitted training job in the config content.
 *
 * Writes the job name under TRAIN_JOB_NAME via updateConfigVar(), updating
 * an existing entry or appending a new one.
 *
 * @param {string} configContent - Current content of the config file
 * @param {object} params - Submission parameters
 * @param {string} params.jobName - Generated job name
 * @returns {string} Updated config content
 */
export function persistTrainSubmission(configContent, { jobName }) {
    const withJobName = updateConfigVar(configContent, 'TRAIN_JOB_NAME', jobName);
    return withJobName;
}
|
|
84
|
-
|
|
85
|
-
/**
 * Record a completed training job's output location in the config content.
 *
 * Writes the output path under TRAIN_OUTPUT_PATH via updateConfigVar(),
 * updating an existing entry or appending a new one.
 *
 * @param {string} configContent - Current content of the config file
 * @param {object} params - Completion parameters
 * @param {string} params.outputPath - S3 path to the output artifacts
 * @returns {string} Updated config content
 */
export function persistTrainCompletion(configContent, { outputPath }) {
    const withOutputPath = updateConfigVar(configContent, 'TRAIN_OUTPUT_PATH', outputPath);
    return withOutputPath;
}
|
|
98
|
-
|
|
99
|
-
/**
 * Build a training job name of the form
 * `${projectName}-train-YYYYMMDD-HHMMSS`, using the timestamp's local-time
 * components.
 *
 * @param {string} projectName - Project name
 * @param {Date} [timestamp] - Optional timestamp (defaults to now)
 * @returns {string} Generated job name
 */
export function generateTrainJobName(projectName, timestamp = new Date()) {
    // Month, day and clock fields are zero-padded to two digits; the year is not padded.
    const twoDigits = (part) => part.toString().padStart(2, '0');

    const datePart = [
        timestamp.getFullYear().toString(),
        twoDigits(timestamp.getMonth() + 1),
        twoDigits(timestamp.getDate())
    ].join('');

    const timePart = [
        timestamp.getHours(),
        timestamp.getMinutes(),
        timestamp.getSeconds()
    ].map(twoDigits).join('');

    return `${projectName}-train-${datePart}-${timePart}`;
}
|
|
118
|
-
|
|
119
|
-
/**
 * Disk-backed variant of updateConfigVar: reads the file as UTF-8, applies
 * the update to its content, and writes the result back to the same path.
 *
 * @param {string} configPath - Path to the config file
 * @param {string} varName - Variable name
 * @param {string} varValue - Variable value
 */
export function updateConfigVarFile(configPath, varName, varValue) {
    const encoding = 'utf8';
    const before = readFileSync(configPath, encoding);
    writeFileSync(configPath, updateConfigVar(before, varName, varValue), encoding);
}
|
|
132
|
-
|
|
133
|
-
/**
 * Disk-backed variant of readConfigVar: reads the file as UTF-8 and looks
 * the variable up in its content.
 *
 * @param {string} configPath - Path to the config file
 * @param {string} varName - Variable name to read
 * @returns {string|null} The variable value, or null if not found
 */
export function readConfigVarFile(configPath, varName) {
    return readConfigVar(readFileSync(configPath, 'utf8'), varName);
}
|
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
-
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Train Config Validator
|
|
6
|
-
*
|
|
7
|
-
* Validates training configuration objects parsed from do/training/config.yaml.
|
|
8
|
-
* Checks that all required fields are present and provides descriptive error
|
|
9
|
-
* messages naming the specific missing field.
|
|
10
|
-
*
|
|
11
|
-
* This module mirrors the validation logic in the bash `_validate_config`
|
|
12
|
-
* function in templates/do/train, enabling property-based testing of the
|
|
13
|
-
* validation rules in isolation.
|
|
14
|
-
*
|
|
15
|
-
* Requirements: 2.12, 10.1
|
|
16
|
-
*/
|
|
17
|
-
|
|
18
|
-
/**
 * Fields every training configuration must provide.
 * Keyed by field name; each entry carries a human-readable description and
 * an example line showing the expected format. Key order determines the
 * order in which missing fields are reported.
 */
export const REQUIRED_FIELDS = {
    image: {
        description: 'The container image URI',
        format: 'image: "123456789012.dkr.ecr.us-east-1.amazonaws.com/my-training:latest"'
    },
    script: {
        description: 'The training script S3 path',
        format: 'script: "s3://my-bucket/scripts/train.py"'
    },
    instance_type: {
        description: 'The SageMaker instance type',
        format: 'instance_type: "ml.g5.xlarge"'
    },
    dataset: {
        description: 'The S3 dataset path',
        format: 'dataset: "s3://my-bucket/data/train/"'
    },
    output_path: {
        description: 'The S3 output path',
        format: 'output_path: "s3://my-bucket/output/"'
    }
};
|
|
44
|
-
|
|
45
|
-
/**
 * Check a training config for missing required fields.
 *
 * A field counts as missing when it is undefined, null or the empty string.
 * A falsy config is treated as having no fields at all.
 *
 * @param {Object} config - The parsed training configuration object
 * @returns {{ valid: boolean, errors: Array<{ field: string, message: string }> }}
 *   - valid: true if all required fields are present and non-empty
 *   - errors: one entry per missing field, in REQUIRED_FIELDS order
 */
export function validateRequiredFields(config) {
    const isBlank = (value) => value === undefined || value === null || value === '';
    const lookup = (field) => (config ? config[field] : undefined);

    const errors = Object.entries(REQUIRED_FIELDS)
        .filter(([field]) => isBlank(lookup(field)))
        .map(([field, meta]) => ({
            field,
            message: `Missing required field: ${field}\n ${meta.description} is required in do/training/config.yaml\n\n Expected format: ${meta.format}`
        }));

    return { valid: errors.length === 0, errors };
}
|
|
72
|
-
|
|
73
|
-
/**
 * Validate spot training checkpoint requirement.
 * When enable_spot is true, checkpoint_path must be specified.
 *
 * enable_spot is accepted both as the boolean `true` and as the string
 * "true", so the check works on raw objects as well as on configs whose
 * scalar fields have been normalised to strings.
 *
 * @param {Object} config - The parsed training configuration object
 * @returns {{ valid: boolean, errors: Array<{ field: string, message: string }> }}
 */
export function validateSpotConfig(config) {
    const errors = [];

    const spotEnabled = Boolean(config)
        && (config.enable_spot === true || config.enable_spot === 'true');

    // A missing, null or empty checkpoint_path all count as "not specified".
    if (spotEnabled && !config.checkpoint_path) {
        errors.push({
            field: 'checkpoint_path',
            message: 'Checkpoint path required for spot training\n When enable_spot is true, a checkpoint S3 path must be specified\n so training can resume after spot interruptions.'
        });
    }

    return {
        valid: errors.length === 0,
        errors
    };
}
|
|
95
|
-
|
|
96
|
-
/**
 * Run every training-config check and merge the findings.
 *
 * Required-field errors come first, followed by spot-configuration errors.
 *
 * @param {Object} config - The parsed training configuration object
 * @returns {{ valid: boolean, errors: Array<{ field: string, message: string }> }}
 */
export function validateTrainingConfig(config) {
    const { errors: missingFieldErrors } = validateRequiredFields(config);
    const { errors: spotErrors } = validateSpotConfig(config);

    const errors = missingFieldErrors.concat(spotErrors);

    return { valid: errors.length === 0, errors };
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
-
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Train Feedback Loop — JavaScript equivalent of do/lib/feedback.sh
|
|
6
|
-
*
|
|
7
|
-
* Generates post-completion feedback output with artifact locations
|
|
8
|
-
* and deployment suggestions based on artifact type.
|
|
9
|
-
*/
|
|
10
|
-
|
|
11
|
-
/**
 * Build the post-completion message for a training/tuning job.
 *
 * The message names the job, lists the artifact location (and the model
 * package ARN when one is given), then suggests deployment commands that
 * depend on the artifact type. Unrecognised artifact types produce a
 * "Next steps" heading with no suggestions.
 *
 * @param {object} params
 * @param {string} params.outputPath - S3 URI to the output artifacts
 * @param {string} params.outputType - "adapter" or "full-model"
 * @param {string} params.jobName - Job name for reference
 * @param {string} [params.modelPackageArn] - Optional model package ARN
 * @returns {string} The formatted feedback output
 */
export function generateCompletionFeedback({ outputPath, outputType, jobName, modelPackageArn = '' }) {
    let suggestions = [];
    if (outputType === 'adapter') {
        suggestions = [
            ` • Deploy as LoRA adapter: ./do/adapter add my-adapter --weights ${outputPath}`,
            ' • (Requires running endpoint with LoRA enabled)'
        ];
    } else if (outputType === 'full-model') {
        suggestions = [
            ` • Deploy as new IC: ./do/add-ic my-model --model-data ${outputPath}`,
            ` • Replace current base: ./do/deploy --force-ic --model-data ${outputPath}`
        ];
    }

    // The ARN line is only present when an ARN was supplied.
    const packageLines = modelPackageArn ? [` Model Package: ${modelPackageArn}`] : [];

    return [
        '',
        `✅ Training complete: ${jobName}`,
        '',
        ` Artifacts: ${outputPath}`,
        ...packageLines,
        '',
        ' Next steps:',
        ...suggestions,
        ''
    ].join('\n');
}
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
-
// SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* Train Idempotency Decision Logic
|
|
6
|
-
*
|
|
7
|
-
* Models the idempotency check logic from the bash `_check_idempotency` function
|
|
8
|
-
* in `templates/do/train` as a pure JavaScript function for property-based testing.
|
|
9
|
-
*
|
|
10
|
-
* The idempotency pattern:
|
|
11
|
-
* - If --force is set, always create a new job regardless of existing status
|
|
12
|
-
* - If no existing job, create a new job
|
|
13
|
-
* - If existing job is InProgress, poll it
|
|
14
|
-
* - If existing job is Completed, display results
|
|
15
|
-
* - If existing job is Failed or Stopped, display failure and suggest --force
|
|
16
|
-
*
|
|
17
|
-
* Requirements: 5.1–5.5
|
|
18
|
-
*/
|
|
19
|
-
|
|
20
|
-
/**
 * Job statuses the idempotency check knows how to handle.
 */
export const JOB_STATUSES = [
    'InProgress',
    'Completed',
    'Failed',
    'Stopped'
];

/**
 * Outcomes of the idempotency check: what the train script should do next.
 */
export const ACTIONS = {
    CREATE_NEW_JOB: 'create_new_job',     // submit a fresh job
    POLL_EXISTING: 'poll_existing',       // keep watching the running job
    DISPLAY_RESULTS: 'display_results',   // show the finished job's results
    DISPLAY_FAILURE: 'display_failure'    // show the failure and suggest --force
};
|
|
34
|
-
|
|
35
|
-
/**
 * Decide what the train script should do, given any existing job's status
 * and whether --force was passed.
 *
 * Decision order:
 *   1. forceFlag is exactly true  → create_new_job
 *   2. no status (null/undefined/empty) → create_new_job
 *   3. 'InProgress' → poll_existing
 *   4. 'Completed'  → display_results
 *   5. 'Failed' or 'Stopped' → display_failure
 *   6. anything else → display_failure (unexpected status)
 *
 * @param {string|null|undefined} existingStatus - The current job status
 * @param {boolean} forceFlag - Whether --force was specified
 * @returns {{ action: string, reason: string }}
 *   - action: one of ACTIONS values
 *   - reason: human-readable explanation of why this action was chosen
 */
export function determineAction(existingStatus, forceFlag) {
    const decide = (action, reason) => ({ action, reason });

    // --force wins over everything else.
    if (forceFlag === true) {
        return decide(
            ACTIONS.CREATE_NEW_JOB,
            '--force specified, creating new job regardless of existing status'
        );
    }

    // Nothing recorded yet, so there is nothing to resume or report.
    if (!existingStatus) {
        return decide(ACTIONS.CREATE_NEW_JOB, 'No existing job found, creating new job');
    }

    if (existingStatus === 'InProgress') {
        return decide(ACTIONS.POLL_EXISTING, `Existing job is ${existingStatus}, resuming polling`);
    }

    if (existingStatus === 'Completed') {
        return decide(ACTIONS.DISPLAY_RESULTS, `Existing job is ${existingStatus}, displaying results`);
    }

    if (existingStatus === 'Failed' || existingStatus === 'Stopped') {
        return decide(
            ACTIONS.DISPLAY_FAILURE,
            `Existing job is ${existingStatus}, suggest --force to create new job`
        );
    }

    // Unrecognised status: handled like a failure.
    return decide(ACTIONS.DISPLAY_FAILURE, `Unexpected job status: ${existingStatus}, suggest --force`);
}
|