@aws/ml-container-creator 0.13.5 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/config/parameter-schema-v2.json +33 -5
  2. package/infra/ci-harness/lib/ci-harness-stack.ts +13 -5
  3. package/infra/ci-harness/package-lock.json +121 -111
  4. package/infra/ci-harness/package.json +1 -1
  5. package/package.json +2 -2
  6. package/servers/endpoint-picker/index.js +23 -14
  7. package/servers/instance-sizer/index.js +72 -4
  8. package/servers/instance-sizer/lib/model-resolver.js +28 -2
  9. package/src/app.js +15 -0
  10. package/src/lib/config-loader.js +18 -0
  11. package/src/lib/config-manager.js +6 -1
  12. package/src/lib/dataset-slug.js +152 -0
  13. package/src/lib/generated/cli-options.js +9 -3
  14. package/src/lib/generated/parameter-matrix.js +15 -4
  15. package/src/lib/generated/validation-rules.js +1 -1
  16. package/src/lib/mcp-client.js +15 -1
  17. package/src/lib/mcp-query-runner.js +11 -1
  18. package/src/lib/prompt-runner.js +40 -20
  19. package/src/lib/prompts/feature-prompts.js +1 -1
  20. package/src/lib/template-manager.js +0 -7
  21. package/src/lib/template-variable-resolver.js +51 -1
  22. package/src/lib/tune-config-state.js +14 -1
  23. package/templates/do/.benchmark_writer.py +43 -0
  24. package/templates/do/.register_helper.py +1185 -0
  25. package/templates/do/.tune_helper.py +168 -2
  26. package/templates/do/__pycache__/.adapter_helper.cpython-312.pyc +0 -0
  27. package/templates/do/__pycache__/.benchmark_writer.cpython-312.pyc +0 -0
  28. package/templates/do/__pycache__/.register_helper.cpython-312.pyc +0 -0
  29. package/templates/do/__pycache__/.tune_helper.cpython-312.pyc +0 -0
  30. package/templates/do/adapter +319 -27
  31. package/templates/do/add-ic +85 -3
  32. package/templates/do/benchmark +28 -8
  33. package/templates/do/config +20 -0
  34. package/templates/do/lib/inference-component.sh +56 -3
  35. package/templates/do/register +557 -6
  36. package/templates/do/test +12 -2
  37. package/templates/do/tune +219 -6
@@ -105,6 +105,136 @@ def _sanitize_for_json(value):
105
105
  return str(value) if value else None
106
106
 
107
107
 
108
+ # ── Registry resolution helpers ───────────────────────────────────────────────
109
+
110
+
111
def _resolve_dataset_name(dataset_name):
    """Resolve a registered dataset name to an ARN or S3 URI via .register_helper.py.

    Runs the ``resolve-dataset`` subcommand of the ``.register_helper.py``
    script that sits next to this file and parses the JSON object it prints
    on stdout.

    Args:
        dataset_name: Name the dataset was registered under.

    Returns:
        The ``arn`` field of the response when it is present and non-empty
        (Backlog #023, AI Registry mode — for use with
        ``SFTTrainer(training_dataset=arn)``); otherwise the ``s3_uri`` field,
        for backward compatibility.

    Every failure (helper script missing, timeout, launch error, non-zero
    exit, unparsable or non-object response, ``error`` key in the response,
    neither ARN nor S3 URI present) is reported through ``_error_exit``.

    NOTE(review): the control flow assumes ``_error_exit`` terminates the
    process and never returns — confirm against its definition.
    """
    import subprocess
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    helper_path = os.path.join(script_dir, ".register_helper.py")

    if not os.path.exists(helper_path):
        _error_exit(
            f"Cannot resolve dataset '{dataset_name}': .register_helper.py not found. "
            f"Register datasets first with: ./do/register --dataset"
        )

    # Launch the helper with the interpreter that is running this script so it
    # sees the same installed packages; a bare "python3" may be absent from
    # PATH or point at a different environment. Fall back to "python3" only
    # when the interpreter path is unavailable.
    python_cmd = sys.executable or "python3"

    try:
        result = subprocess.run(
            [python_cmd, helper_path, "resolve-dataset", "--name", dataset_name],
            capture_output=True, text=True, timeout=30,
        )
    except subprocess.TimeoutExpired:
        _error_exit(f"Timeout resolving dataset '{dataset_name}' from registry")
    except Exception as e:
        # CLI boundary: any launch failure becomes a user-facing error exit.
        _error_exit(f"Failed to resolve dataset '{dataset_name}': {e}")

    if result.returncode != 0:
        # Surface the helper's own diagnostics: a non-zero exit is not always
        # "not found" (it may be a credentials, region or permission error).
        stderr_tail = (result.stderr or "").strip()[-500:]
        detail = f" Helper stderr: {stderr_tail}" if stderr_tail else ""
        _error_exit(
            f"Dataset '{dataset_name}' not found in registry. "
            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
            f"{detail}"
        )

    # Parse JSON output from resolve-dataset. json.JSONDecodeError is a
    # ValueError subclass, so one except clause covers both.
    try:
        output = json.loads(result.stdout.strip())
    except ValueError:
        output = None

    # Valid JSON that is not an object (null, list, string, number) cannot
    # carry the expected fields; treat it like an unparsable response instead
    # of failing later on `in` / `.get`.
    if not isinstance(output, dict):
        _error_exit(
            f"Failed to parse registry response for dataset '{dataset_name}'. "
            f"Raw output: {result.stdout[:200]}"
        )

    if "error" in output:
        _error_exit(
            f"Dataset '{dataset_name}' not found in registry: {output['error']}. "
            f"Register it first: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
        )

    # Prefer ARN if available (Backlog #023 — AI Registry mode)
    # When arn is present, use it directly with SFTTrainer(training_dataset=arn)
    arn = output.get("arn")
    if arn:
        return arn

    # Fallback: use S3 URI
    s3_uri = output.get("s3_uri", "")
    if not s3_uri:
        _error_exit(
            f"Dataset '{dataset_name}' resolved but has no S3 URI or ARN. "
            f"Re-register with: ./do/register --dataset --dataset-name {dataset_name} --dataset-s3-uri s3://..."
        )

    return s3_uri
176
+
177
+
178
def _resolve_evaluator_name(evaluator_name):
    """Resolve a registered evaluator name to its type and ARN/URI via .register_helper.py.

    Runs the ``resolve-evaluator`` subcommand of the ``.register_helper.py``
    script that sits next to this file and parses the JSON object it prints
    on stdout.

    Args:
        evaluator_name: Name the evaluator was registered under.

    Returns:
        An ``(evaluator_type, arn_or_uri)`` tuple taken from the response's
        ``type`` and ``arn_or_uri`` fields. ``evaluator_type`` is "lambda" for
        RLVR or "model" for RLAIF; it is passed through unvalidated and is an
        empty string when the response has no ``type`` field.

    Every failure (helper script missing, timeout, launch error, non-zero
    exit, unparsable or non-object response, ``error`` key in the response,
    missing ARN/URI) is reported through ``_error_exit``.

    NOTE(review): the control flow assumes ``_error_exit`` terminates the
    process and never returns — confirm against its definition.
    """
    import subprocess
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    helper_path = os.path.join(script_dir, ".register_helper.py")

    if not os.path.exists(helper_path):
        _error_exit(
            f"Cannot resolve evaluator '{evaluator_name}': .register_helper.py not found. "
            f"Register evaluators first with: ./do/register --evaluator"
        )

    # Launch the helper with the interpreter that is running this script so it
    # sees the same installed packages; a bare "python3" may be absent from
    # PATH or point at a different environment. Fall back to "python3" only
    # when the interpreter path is unavailable.
    python_cmd = sys.executable or "python3"

    try:
        result = subprocess.run(
            [python_cmd, helper_path, "resolve-evaluator", "--name", evaluator_name],
            capture_output=True, text=True, timeout=30,
        )
    except subprocess.TimeoutExpired:
        _error_exit(f"Timeout resolving evaluator '{evaluator_name}' from registry")
    except Exception as e:
        # CLI boundary: any launch failure becomes a user-facing error exit.
        _error_exit(f"Failed to resolve evaluator '{evaluator_name}': {e}")

    if result.returncode != 0:
        # Surface the helper's own diagnostics: a non-zero exit is not always
        # "not found" (it may be a credentials, region or permission error).
        stderr_tail = (result.stderr or "").strip()[-500:]
        detail = f" Helper stderr: {stderr_tail}" if stderr_tail else ""
        _error_exit(
            f"Evaluator '{evaluator_name}' not found in registry. "
            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
            f"{detail}"
        )

    # Parse JSON output from resolve-evaluator. json.JSONDecodeError is a
    # ValueError subclass, so one except clause covers both.
    try:
        output = json.loads(result.stdout.strip())
    except ValueError:
        output = None

    # Valid JSON that is not an object (null, list, string, number) cannot
    # carry the expected fields; treat it like an unparsable response instead
    # of failing later on `in` / `.get`.
    if not isinstance(output, dict):
        _error_exit(
            f"Failed to parse registry response for evaluator '{evaluator_name}'. "
            f"Raw output: {result.stdout[:200]}"
        )

    if "error" in output:
        _error_exit(
            f"Evaluator '{evaluator_name}' not found in registry: {output['error']}. "
            f"Register it first: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
        )

    ev_type = output.get("type", "")
    arn_or_uri = output.get("arn_or_uri", "")

    if not arn_or_uri:
        _error_exit(
            f"Evaluator '{evaluator_name}' resolved but has no ARN/URI. "
            f"Re-register with: ./do/register --evaluator --evaluator-name {evaluator_name} ..."
        )

    return ev_type, arn_or_uri
236
+
237
+
108
238
  # ── Subcommand: submit ────────────────────────────────────────────────────────
109
239
 
110
240
 
@@ -124,6 +254,26 @@ def cmd_submit(args):
124
254
  os.environ["AWS_DEFAULT_REGION"] = region
125
255
  os.environ.setdefault("AWS_REGION", region)
126
256
 
257
+ # ── Resolve --dataset-name from registry (AC-2b.4) ────────────────────────
258
+ # --dataset-s3-uri wins if both are provided (backward compatible override)
259
+ if not args.dataset_s3_uri and args.dataset_name:
260
+ resolved_uri = _resolve_dataset_name(args.dataset_name)
261
+ args.dataset_s3_uri = resolved_uri
262
+ elif not args.dataset_s3_uri and not args.dataset_name:
263
+ _error_exit(
264
+ "Either --dataset-s3-uri or --dataset-name is required. "
265
+ "Provide an S3 URI directly or a registered dataset name."
266
+ )
267
+
268
+ # ── Resolve --evaluator-name from registry (AC-2c.3, AC-2c.4) ────────────
269
+ # --reward-function / --reward-prompt win if provided (backward compatible override)
270
+ if args.evaluator_name and not args.reward_function and not args.reward_prompt:
271
+ ev_type, ev_arn_or_uri = _resolve_evaluator_name(args.evaluator_name)
272
+ if ev_type == "lambda":
273
+ args.reward_function = ev_arn_or_uri
274
+ else:
275
+ args.reward_prompt = ev_arn_or_uri
276
+
127
277
  _check_sagemaker_sdk()
128
278
 
129
279
  # SDK v3 moved trainers from sagemaker.modules.train → sagemaker.train
@@ -384,6 +534,12 @@ def cmd_status(args):
384
534
  Returns: {"status": str, "failure_reason": str|None,
385
535
  "metrics": dict|None, "elapsed_seconds": int}
386
536
  """
537
+ # Set region before any sagemaker import (creates boto3 clients at import time)
538
+ region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
539
+ if region:
540
+ os.environ['AWS_DEFAULT_REGION'] = region
541
+ os.environ.setdefault('AWS_REGION', region)
542
+
387
543
  from sagemaker.core.resources import TrainingJob
388
544
  from botocore.exceptions import ClientError
389
545
 
@@ -485,6 +641,12 @@ def cmd_resolve(args):
485
641
  Returns: {"artifact_path": str, "model_package_arn": str|None,
486
642
  "output_type": str}
487
643
  """
644
+ # Set region before any sagemaker import (creates boto3 clients at import time)
645
+ region = getattr(args, 'region', None) or os.environ.get('AWS_DEFAULT_REGION') or os.environ.get('AWS_REGION')
646
+ if region:
647
+ os.environ['AWS_DEFAULT_REGION'] = region
648
+ os.environ.setdefault('AWS_REGION', region)
649
+
488
650
  from sagemaker.core.resources import TrainingJob
489
651
 
490
652
  try:
@@ -1719,8 +1881,10 @@ def main():
1719
1881
  submit_parser.add_argument("--training-type", required=True,
1720
1882
  choices=["lora", "full-rank"],
1721
1883
  help="Training type (lora or full-rank)")
1722
- submit_parser.add_argument("--dataset-s3-uri", required=True,
1723
- help="S3 URI of the training dataset")
1884
+ submit_parser.add_argument("--dataset-s3-uri", required=False, default=None,
1885
+ help="S3 URI of the training dataset (direct override)")
1886
+ submit_parser.add_argument("--dataset-name", default=None,
1887
+ help="Registered dataset name to resolve from registry")
1724
1888
  submit_parser.add_argument("--output-bucket", required=True,
1725
1889
  help="S3 bucket for output artifacts")
1726
1890
  submit_parser.add_argument("--role-arn", required=True,
@@ -1747,6 +1911,8 @@ def main():
1747
1911
  help="Lambda ARN for reward function (RLVR)")
1748
1912
  submit_parser.add_argument("--reward-prompt", default=None,
1749
1913
  help="S3 URI for reward prompt (RLAIF)")
1914
+ submit_parser.add_argument("--evaluator-name", default=None,
1915
+ help="Registered evaluator name to resolve from registry")
1750
1916
  submit_parser.add_argument("--accept-eula", action="store_true", default=False,
1751
1917
  help="Accept model EULA for gated models (e.g., Llama)")
1752
1918