@aws/ml-container-creator 0.13.5 → 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/parameter-schema-v2.json +33 -5
- package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
- package/infra/ci-harness/package-lock.json +121 -111
- package/infra/ci-harness/package.json +1 -1
- package/package.json +2 -2
- package/servers/endpoint-picker/index.js +23 -14
- package/servers/instance-sizer/index.js +72 -4
- package/servers/instance-sizer/lib/model-resolver.js +28 -2
- package/src/app.js +15 -0
- package/src/lib/config-loader.js +18 -0
- package/src/lib/config-manager.js +6 -1
- package/src/lib/dataset-slug.js +152 -0
- package/src/lib/generated/cli-options.js +9 -3
- package/src/lib/generated/parameter-matrix.js +15 -4
- package/src/lib/generated/validation-rules.js +1 -1
- package/src/lib/mcp-client.js +15 -1
- package/src/lib/mcp-query-runner.js +11 -1
- package/src/lib/prompt-runner.js +40 -20
- package/src/lib/prompts/feature-prompts.js +1 -1
- package/src/lib/template-manager.js +0 -7
- package/src/lib/template-variable-resolver.js +51 -1
- package/src/lib/tune-config-state.js +14 -1
- package/templates/do/.benchmark_writer.py +43 -0
- package/templates/do/.register_helper.py +1185 -0
- package/templates/do/.tune_helper.py +168 -2
- package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
- package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
- package/templates/do/adapter +319 -27
- package/templates/do/add-ic +85 -3
- package/templates/do/benchmark +28 -8
- package/templates/do/config +20 -0
- package/templates/do/lib/inference-component.sh +56 -3
- package/templates/do/register +557 -6
- package/templates/do/test +12 -2
- package/templates/do/tune +219 -6
|
@@ -105,6 +105,136 @@ def _sanitize_for_json(value):
|
|
|
105
105
|
return str(value) if value else None
|
|
106
106
|
|
|
107
107
|
|
|
108
|
+
# ── Registry resolution helpers ───────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _resolve_dataset_name(dataset_name):
    """Resolve a registered dataset name to an ARN or S3 URI via .register_helper.py.

    Runs the ``resolve-dataset`` subcommand of the ``.register_helper.py``
    script located next to this file and parses the JSON object it prints
    on stdout.

    Args:
        dataset_name: Name the dataset was registered under.

    Returns:
        The response's ``arn`` value when it is present and non-empty
        (Backlog #023, AI Registry mode; meant for
        ``SFTTrainer(training_dataset=arn)``), otherwise its ``s3_uri``
        value for backward compatibility.

    Every failure (helper script missing, timeout, launch error, non-zero
    exit, unparsable or non-object response, ``error`` key in the response,
    neither ARN nor S3 URI present) is reported through ``_error_exit``.
    NOTE(review): this function relies on ``_error_exit`` not returning;
    confirm against its definition.
    """
    import subprocess
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    helper_path = os.path.join(script_dir, ".register_helper.py")

    if not os.path.exists(helper_path):
        _error_exit(
            f"Cannot resolve dataset '{dataset_name}': .register_helper.py not found. "
            "Register datasets first with: ./do/register --dataset"
        )

    # Launch the helper with the interpreter running this script so it sees
    # the same installed packages; fall back to PATH lookup only when the
    # interpreter path is unavailable.
    python_exe = sys.executable or "python3"

    try:
        result = subprocess.run(
            [python_exe, helper_path, "resolve-dataset", "--name", dataset_name],
            capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        _error_exit(f"Timeout resolving dataset '{dataset_name}' from registry")
    except Exception as e:
        _error_exit(f"Failed to resolve dataset '{dataset_name}': {e}")

    if result.returncode != 0:
        # Surface what the helper actually reported: a non-zero exit is not
        # necessarily "not found" (it may be a credential or region problem).
        detail = (result.stderr or result.stdout or "").strip()
        suffix = f" Helper output: {detail[:200]}" if detail else ""
        _error_exit(
            f"Dataset '{dataset_name}' not found in registry. "
            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
            f"{suffix}"
        )

    # Parse JSON output from resolve-dataset. The response must be a JSON
    # object; a list/string/null would otherwise fail later on ``.get``.
    try:
        output = json.loads(result.stdout.strip())
        if not isinstance(output, dict):
            raise ValueError("registry response is not a JSON object")
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        _error_exit(
            f"Failed to parse registry response for dataset '{dataset_name}'. "
            f"Raw output: {result.stdout[:200]}"
        )

    if "error" in output:
        _error_exit(
            f"Dataset '{dataset_name}' not found in registry: {output['error']}. "
            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
        )

    # Prefer ARN if available (Backlog #023 — AI Registry mode).
    # When arn is present, use it directly with SFTTrainer(training_dataset=arn).
    arn = output.get("arn")
    if arn:
        return arn

    # Fallback: use S3 URI
    s3_uri = output.get("s3_uri", "")
    if not s3_uri:
        _error_exit(
            f"Dataset '{dataset_name}' resolved but has no S3 URI or ARN. "
            f"Re-register with: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
        )

    return s3_uri
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _resolve_evaluator_name(evaluator_name):
    """Resolve a registered evaluator name to its type and ARN/URI via .register_helper.py.

    Runs the ``resolve-evaluator`` subcommand of the ``.register_helper.py``
    script located next to this file and parses the JSON object it prints
    on stdout.

    Args:
        evaluator_name: Name the evaluator was registered under.

    Returns:
        An ``(evaluator_type, arn_or_uri)`` tuple taken from the response's
        ``type`` and ``arn_or_uri`` keys. evaluator_type is "lambda" for RLVR
        or "model" for RLAIF; it is passed through unvalidated and is an
        empty string when the response has no ``type`` key.

    Every failure (helper script missing, timeout, launch error, non-zero
    exit, unparsable or non-object response, ``error`` key in the response,
    empty ``arn_or_uri``) is reported through ``_error_exit``.
    NOTE(review): this function relies on ``_error_exit`` not returning;
    confirm against its definition.
    """
    import subprocess
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    helper_path = os.path.join(script_dir, ".register_helper.py")

    if not os.path.exists(helper_path):
        _error_exit(
            f"Cannot resolve evaluator '{evaluator_name}': .register_helper.py not found. "
            "Register evaluators first with: ./do/register --evaluator"
        )

    # Launch the helper with the interpreter running this script so it sees
    # the same installed packages; fall back to PATH lookup only when the
    # interpreter path is unavailable.
    python_exe = sys.executable or "python3"

    try:
        result = subprocess.run(
            [python_exe, helper_path, "resolve-evaluator", "--name", evaluator_name],
            capture_output=True, text=True, timeout=30
        )
    except subprocess.TimeoutExpired:
        _error_exit(f"Timeout resolving evaluator '{evaluator_name}' from registry")
    except Exception as e:
        _error_exit(f"Failed to resolve evaluator '{evaluator_name}': {e}")

    if result.returncode != 0:
        # Surface what the helper actually reported: a non-zero exit is not
        # necessarily "not found" (it may be a credential or region problem).
        detail = (result.stderr or result.stdout or "").strip()
        suffix = f" Helper output: {detail[:200]}" if detail else ""
        _error_exit(
            f"Evaluator '{evaluator_name}' not found in registry. "
            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
            f"{suffix}"
        )

    # Parse JSON output from resolve-evaluator. The response must be a JSON
    # object; a list/string/null would otherwise fail later on ``.get``.
    try:
        output = json.loads(result.stdout.strip())
        if not isinstance(output, dict):
            raise ValueError("registry response is not a JSON object")
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        _error_exit(
            f"Failed to parse registry response for evaluator '{evaluator_name}'. "
            f"Raw output: {result.stdout[:200]}"
        )

    if "error" in output:
        _error_exit(
            f"Evaluator '{evaluator_name}' not found in registry: {output['error']}. "
            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
        )

    ev_type = output.get("type", "")
    arn_or_uri = output.get("arn_or_uri", "")

    if not arn_or_uri:
        _error_exit(
            f"Evaluator '{evaluator_name}' resolved but has no ARN/URI. "
            f"Re-register with: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
        )

    return ev_type, arn_or_uri
|
|
236
|
+
|
|
237
|
+
|
|
108
238
|
# ── Subcommand: submit ────────────────────────────────────────────────────────
|
|
109
239
|
|
|
110
240
|
|
|
@@ -124,6 +254,26 @@ def cmd_submit(args):
|
|
|
124
254
|
os.environ["AWS_DEFAULT_REGION"] = region
|
|
125
255
|
os.environ.setdefault("AWS_REGION", region)
|
|
126
256
|
|
|
257
|
+
# ── Resolve --dataset-name from registry (AC-2b.4) ────────────────────────
|
|
258
|
+
# --dataset-s3-uri wins if both are provided (backward compatible override)
|
|
259
|
+
if not args.dataset_s3_uri and args.dataset_name:
|
|
260
|
+
resolved_uri = _resolve_dataset_name(args.dataset_name)
|
|
261
|
+
args.dataset_s3_uri = resolved_uri
|
|
262
|
+
elif not args.dataset_s3_uri and not args.dataset_name:
|
|
263
|
+
_error_exit(
|
|
264
|
+
"Either --dataset-s3-uri or --dataset-name is required. "
|
|
265
|
+
"Provide an S3 URI directly or a registered dataset name."
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
# ── Resolve --evaluator-name from registry (AC-2c.3, AC-2c.4) ────────────
|
|
269
|
+
# --reward-function / --reward-prompt win if provided (backward compatible override)
|
|
270
|
+
if args.evaluator_name and not args.reward_function and not args.reward_prompt:
|
|
271
|
+
ev_type, ev_arn_or_uri = _resolve_evaluator_name(args.evaluator_name)
|
|
272
|
+
if ev_type == "lambda":
|
|
273
|
+
args.reward_function = ev_arn_or_uri
|
|
274
|
+
else:
|
|
275
|
+
args.reward_prompt = ev_arn_or_uri
|
|
276
|
+
|
|
127
277
|
_check_sagemaker_sdk()
|
|
128
278
|
|
|
129
279
|
# SDK v3 moved trainers from sagemaker.modules.train → sagemaker.train
|
|
@@ -384,6 +534,12 @@ def cmd_status(args):
|
|
|
384
534
|
Returns: {"status": str, "failure_reason": str|None,
|
|
385
535
|
"metrics": dict|None, "elapsed_seconds": int}
|
|
386
536
|
"""
|
|
537
|
+
# Set region before any sagemaker import (creates boto3 clients at import time)
|
|
538
|
+
region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
|
|
539
|
+
if region:
|
|
540
|
+
os.environ['AWS_DEFAULT_REGION'] = region
|
|
541
|
+
os.environ.setdefault('AWS_REGION', region)
|
|
542
|
+
|
|
387
543
|
from sagemaker.core.resources import TrainingJob
|
|
388
544
|
from botocore.exceptions import ClientError
|
|
389
545
|
|
|
@@ -485,6 +641,12 @@ def cmd_resolve(args):
|
|
|
485
641
|
Returns: {"artifact_path": str, "model_package_arn": str|None,
|
|
486
642
|
"output_type": str}
|
|
487
643
|
"""
|
|
644
|
+
# Set region before any sagemaker import (creates boto3 clients at import time)
|
|
645
|
+
region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
|
|
646
|
+
if region:
|
|
647
|
+
os.environ['AWS_DEFAULT_REGION'] = region
|
|
648
|
+
os.environ.setdefault('AWS_REGION', region)
|
|
649
|
+
|
|
488
650
|
from sagemaker.core.resources import TrainingJob
|
|
489
651
|
|
|
490
652
|
try:
|
|
@@ -1719,8 +1881,10 @@ def main():
|
|
|
1719
1881
|
submit_parser.add_argument("--training-type", required=True,
|
|
1720
1882
|
choices=["lora", "full-rank"],
|
|
1721
1883
|
help="Training type (lora or full-rank)")
|
|
1722
|
-
submit_parser.add_argument("--dataset-s3-uri", required=
|
|
1723
|
-
help="S3 URI of the training dataset")
|
|
1884
|
+
submit_parser.add_argument("--dataset-s3-uri", required=False, default=None,
|
|
1885
|
+
help="S3 URI of the training dataset (direct override)")
|
|
1886
|
+
submit_parser.add_argument("--dataset-name", default=None,
|
|
1887
|
+
help="Registered dataset name to resolve from registry")
|
|
1724
1888
|
submit_parser.add_argument("--output-bucket", required=True,
|
|
1725
1889
|
help="S3 bucket for output artifacts")
|
|
1726
1890
|
submit_parser.add_argument("--role-arn", required=True,
|
|
@@ -1747,6 +1911,8 @@ def main():
|
|
|
1747
1911
|
help="Lambda ARN for reward function (RLVR)")
|
|
1748
1912
|
submit_parser.add_argument("--reward-prompt", default=None,
|
|
1749
1913
|
help="S3 URI for reward prompt (RLAIF)")
|
|
1914
|
+
submit_parser.add_argument("--evaluator-name", default=None,
|
|
1915
|
+
help="Registered evaluator name to resolve from registry")
|
|
1750
1916
|
submit_parser.add_argument("--accept-eula", action="store_true", default=False,
|
|
1751
1917
|
help="Accept model EULA for gated models (e.g., Llama)")
|
|
1752
1918
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|