sagemaker-core 2.1.1__py3-none-any.whl → 2.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sagemaker/__init__.py +2 -0
- sagemaker/core/common_utils.py +119 -1
- sagemaker/core/experiments/experiment.py +3 -0
- sagemaker/core/fw_utils.py +56 -12
- sagemaker/core/git_utils.py +66 -0
- sagemaker/core/helper/session_helper.py +22 -10
- sagemaker/core/image_retriever/image_retriever_utils.py +1 -3
- sagemaker/core/image_uri_config/huggingface-llm-neuronx.json +111 -1
- sagemaker/core/image_uri_config/huggingface-llm.json +110 -1
- sagemaker/core/image_uri_config/huggingface-neuronx.json +182 -6
- sagemaker/core/image_uri_config/huggingface-vllm-neuronx.json +38 -0
- sagemaker/core/image_uri_config/huggingface.json +151 -2
- sagemaker/core/image_uri_config/sagemaker-tritonserver.json +40 -0
- sagemaker/core/image_uri_config/sklearn.json +48 -0
- sagemaker/core/image_uri_config/xgboost.json +84 -0
- sagemaker/core/image_uris.py +9 -3
- sagemaker/core/iterators.py +11 -0
- sagemaker/core/jumpstart/models.py +2 -0
- sagemaker/core/jumpstart/region_config.json +8 -0
- sagemaker/core/local/data.py +10 -0
- sagemaker/core/local/utils.py +6 -5
- sagemaker/core/model_monitor/clarify_model_monitoring.py +2 -0
- sagemaker/core/model_registry.py +1 -1
- sagemaker/core/modules/configs.py +14 -1
- sagemaker/core/modules/train/container_drivers/common/utils.py +2 -10
- sagemaker/core/modules/train/sm_recipes/utils.py +1 -1
- sagemaker/core/processing.py +2 -0
- sagemaker/core/remote_function/client.py +31 -6
- sagemaker/core/remote_function/core/pipeline_variables.py +0 -6
- sagemaker/core/remote_function/core/serialization.py +16 -28
- sagemaker/core/remote_function/core/stored_function.py +8 -11
- sagemaker/core/remote_function/errors.py +1 -3
- sagemaker/core/remote_function/invoke_function.py +1 -6
- sagemaker/core/remote_function/job.py +2 -21
- sagemaker/core/telemetry/constants.py +6 -8
- sagemaker/core/telemetry/telemetry_logging.py +6 -5
- sagemaker/core/training/configs.py +16 -4
- sagemaker/core/workflow/utilities.py +10 -3
- {sagemaker_core-2.1.1.dist-info → sagemaker_core-2.3.1.dist-info}/METADATA +1 -1
- {sagemaker_core-2.1.1.dist-info → sagemaker_core-2.3.1.dist-info}/RECORD +43 -47
- sagemaker/core/huggingface/__init__.py +0 -29
- sagemaker/core/huggingface/llm_utils.py +0 -150
- sagemaker/core/huggingface/processing.py +0 -139
- sagemaker/core/huggingface/training_compiler/__init__.py +0 -0
- sagemaker/core/huggingface/training_compiler/config.py +0 -167
- sagemaker/core/image_uri_config/__init__.py +0 -13
- {sagemaker_core-2.1.1.dist-info → sagemaker_core-2.3.1.dist-info}/WHEEL +0 -0
- {sagemaker_core-2.1.1.dist-info → sagemaker_core-2.3.1.dist-info}/licenses/LICENSE +0 -0
- {sagemaker_core-2.1.1.dist-info → sagemaker_core-2.3.1.dist-info}/top_level.txt +0 -0
|
@@ -388,6 +388,54 @@
|
|
|
388
388
|
"us-west-2": "246618743249"
|
|
389
389
|
},
|
|
390
390
|
"repository": "sagemaker-scikit-learn"
|
|
391
|
+
},
|
|
392
|
+
"1.4-2": {
|
|
393
|
+
"processors": [
|
|
394
|
+
"cpu"
|
|
395
|
+
],
|
|
396
|
+
"py_versions": [
|
|
397
|
+
"py3"
|
|
398
|
+
],
|
|
399
|
+
"registries": {
|
|
400
|
+
"af-south-1": "510948584623",
|
|
401
|
+
"ap-east-1": "651117190479",
|
|
402
|
+
"ap-northeast-1": "354813040037",
|
|
403
|
+
"ap-northeast-2": "366743142698",
|
|
404
|
+
"ap-northeast-3": "867004704886",
|
|
405
|
+
"ap-south-1": "720646828776",
|
|
406
|
+
"ap-south-2": "628508329040",
|
|
407
|
+
"ap-southeast-1": "121021644041",
|
|
408
|
+
"ap-southeast-2": "783357654285",
|
|
409
|
+
"ap-southeast-3": "951798379941",
|
|
410
|
+
"ap-southeast-4": "106583098589",
|
|
411
|
+
"ca-central-1": "341280168497",
|
|
412
|
+
"ca-west-1": "190319476487",
|
|
413
|
+
"cn-north-1": "450853457545",
|
|
414
|
+
"cn-northwest-1": "451049120500",
|
|
415
|
+
"eu-central-1": "492215442770",
|
|
416
|
+
"eu-central-2": "680994064768",
|
|
417
|
+
"eu-north-1": "662702820516",
|
|
418
|
+
"eu-south-1": "978288397137",
|
|
419
|
+
"eu-south-2": "104374241257",
|
|
420
|
+
"eu-west-1": "141502667606",
|
|
421
|
+
"eu-west-2": "764974769150",
|
|
422
|
+
"eu-west-3": "659782779980",
|
|
423
|
+
"il-central-1": "898809789911",
|
|
424
|
+
"me-central-1": "272398656194",
|
|
425
|
+
"me-south-1": "801668240914",
|
|
426
|
+
"sa-east-1": "737474898029",
|
|
427
|
+
"us-east-1": "683313688378",
|
|
428
|
+
"us-east-2": "257758044811",
|
|
429
|
+
"us-gov-east-1": "237065988967",
|
|
430
|
+
"us-gov-west-1": "414596584902",
|
|
431
|
+
"us-iso-east-1": "833128469047",
|
|
432
|
+
"us-isob-east-1": "281123927165",
|
|
433
|
+
"us-isof-east-1": "108575199400",
|
|
434
|
+
"us-isof-south-1": "124985052026",
|
|
435
|
+
"us-west-1": "746614075791",
|
|
436
|
+
"us-west-2": "246618743249"
|
|
437
|
+
},
|
|
438
|
+
"repository": "sagemaker-scikit-learn"
|
|
391
439
|
}
|
|
392
440
|
}
|
|
393
441
|
},
|
|
@@ -395,6 +395,48 @@
|
|
|
395
395
|
"us-west-2": "246618743249"
|
|
396
396
|
},
|
|
397
397
|
"repository": "sagemaker-xgboost"
|
|
398
|
+
},
|
|
399
|
+
"3.0-5": {
|
|
400
|
+
"registries": {
|
|
401
|
+
"af-south-1": "510948584623",
|
|
402
|
+
"ap-east-1": "651117190479",
|
|
403
|
+
"ap-northeast-1": "354813040037",
|
|
404
|
+
"ap-northeast-2": "366743142698",
|
|
405
|
+
"ap-northeast-3": "867004704886",
|
|
406
|
+
"ap-south-1": "720646828776",
|
|
407
|
+
"ap-south-2": "628508329040",
|
|
408
|
+
"ap-southeast-1": "121021644041",
|
|
409
|
+
"ap-southeast-2": "783357654285",
|
|
410
|
+
"ap-southeast-3": "951798379941",
|
|
411
|
+
"ap-southeast-4": "106583098589",
|
|
412
|
+
"ca-central-1": "341280168497",
|
|
413
|
+
"ca-west-1": "190319476487",
|
|
414
|
+
"cn-north-1": "450853457545",
|
|
415
|
+
"cn-northwest-1": "451049120500",
|
|
416
|
+
"eu-central-1": "492215442770",
|
|
417
|
+
"eu-central-2": "680994064768",
|
|
418
|
+
"eu-north-1": "662702820516",
|
|
419
|
+
"eu-south-1": "978288397137",
|
|
420
|
+
"eu-south-2": "104374241257",
|
|
421
|
+
"eu-west-1": "141502667606",
|
|
422
|
+
"eu-west-2": "764974769150",
|
|
423
|
+
"eu-west-3": "659782779980",
|
|
424
|
+
"il-central-1": "898809789911",
|
|
425
|
+
"me-central-1": "272398656194",
|
|
426
|
+
"me-south-1": "801668240914",
|
|
427
|
+
"sa-east-1": "737474898029",
|
|
428
|
+
"us-east-1": "683313688378",
|
|
429
|
+
"us-east-2": "257758044811",
|
|
430
|
+
"us-gov-east-1": "237065988967",
|
|
431
|
+
"us-gov-west-1": "414596584902",
|
|
432
|
+
"us-iso-east-1": "833128469047",
|
|
433
|
+
"us-isob-east-1": "281123927165",
|
|
434
|
+
"us-isof-east-1": "108575199400",
|
|
435
|
+
"us-isof-south-1": "124985052026",
|
|
436
|
+
"us-west-1": "746614075791",
|
|
437
|
+
"us-west-2": "246618743249"
|
|
438
|
+
},
|
|
439
|
+
"repository": "sagemaker-xgboost"
|
|
398
440
|
}
|
|
399
441
|
}
|
|
400
442
|
},
|
|
@@ -794,6 +836,48 @@
|
|
|
794
836
|
"us-west-2": "246618743249"
|
|
795
837
|
},
|
|
796
838
|
"repository": "sagemaker-xgboost"
|
|
839
|
+
},
|
|
840
|
+
"3.0-5": {
|
|
841
|
+
"registries": {
|
|
842
|
+
"af-south-1": "510948584623",
|
|
843
|
+
"ap-east-1": "651117190479",
|
|
844
|
+
"ap-northeast-1": "354813040037",
|
|
845
|
+
"ap-northeast-2": "366743142698",
|
|
846
|
+
"ap-northeast-3": "867004704886",
|
|
847
|
+
"ap-south-1": "720646828776",
|
|
848
|
+
"ap-south-2": "628508329040",
|
|
849
|
+
"ap-southeast-1": "121021644041",
|
|
850
|
+
"ap-southeast-2": "783357654285",
|
|
851
|
+
"ap-southeast-3": "951798379941",
|
|
852
|
+
"ap-southeast-4": "106583098589",
|
|
853
|
+
"ca-central-1": "341280168497",
|
|
854
|
+
"ca-west-1": "190319476487",
|
|
855
|
+
"cn-north-1": "450853457545",
|
|
856
|
+
"cn-northwest-1": "451049120500",
|
|
857
|
+
"eu-central-1": "492215442770",
|
|
858
|
+
"eu-central-2": "680994064768",
|
|
859
|
+
"eu-north-1": "662702820516",
|
|
860
|
+
"eu-south-1": "978288397137",
|
|
861
|
+
"eu-south-2": "104374241257",
|
|
862
|
+
"eu-west-1": "141502667606",
|
|
863
|
+
"eu-west-2": "764974769150",
|
|
864
|
+
"eu-west-3": "659782779980",
|
|
865
|
+
"il-central-1": "898809789911",
|
|
866
|
+
"me-central-1": "272398656194",
|
|
867
|
+
"me-south-1": "801668240914",
|
|
868
|
+
"sa-east-1": "737474898029",
|
|
869
|
+
"us-east-1": "683313688378",
|
|
870
|
+
"us-east-2": "257758044811",
|
|
871
|
+
"us-gov-east-1": "237065988967",
|
|
872
|
+
"us-gov-west-1": "414596584902",
|
|
873
|
+
"us-iso-east-1": "833128469047",
|
|
874
|
+
"us-isob-east-1": "281123927165",
|
|
875
|
+
"us-isof-east-1": "108575199400",
|
|
876
|
+
"us-isof-south-1": "124985052026",
|
|
877
|
+
"us-west-1": "746614075791",
|
|
878
|
+
"us-west-2": "246618743249"
|
|
879
|
+
},
|
|
880
|
+
"repository": "sagemaker-xgboost"
|
|
797
881
|
}
|
|
798
882
|
}
|
|
799
883
|
},
|
sagemaker/core/image_uris.py
CHANGED
|
@@ -41,6 +41,7 @@ HUGGING_FACE_LLM_FRAMEWORK = "huggingface-llm"
|
|
|
41
41
|
HUGGING_FACE_TEI_GPU_FRAMEWORK = "huggingface-tei"
|
|
42
42
|
HUGGING_FACE_TEI_CPU_FRAMEWORK = "huggingface-tei-cpu"
|
|
43
43
|
HUGGING_FACE_LLM_NEURONX_FRAMEWORK = "huggingface-llm-neuronx"
|
|
44
|
+
HUGGING_FACE_VLLM_NEURONX_FRAMEWORK = "huggingface-vllm-neuronx"
|
|
44
45
|
XGBOOST_FRAMEWORK = "xgboost"
|
|
45
46
|
SKLEARN_FRAMEWORK = "sklearn"
|
|
46
47
|
TRAINIUM_ALLOWED_FRAMEWORKS = "pytorch"
|
|
@@ -77,7 +78,8 @@ def retrieve(
|
|
|
77
78
|
) -> str:
|
|
78
79
|
"""Retrieves the ECR URI for the Docker image matching the given arguments.
|
|
79
80
|
|
|
80
|
-
Ideally this function should not be called directly
|
|
81
|
+
Ideally this function should not be called directly, rather it should be called from the
|
|
82
|
+
fit() function inside framework estimator.
|
|
81
83
|
|
|
82
84
|
Args:
|
|
83
85
|
framework (str): The name of the framework or algorithm.
|
|
@@ -126,7 +128,7 @@ def retrieve(
|
|
|
126
128
|
serverless_inference_config (sagemaker.serverless.ServerlessInferenceConfig):
|
|
127
129
|
Specifies configuration related to serverless endpoint. Instance type is
|
|
128
130
|
not provided in serverless inference. So this is used to determine processor type.
|
|
129
|
-
sagemaker_session (sagemaker.
|
|
131
|
+
sagemaker_session (sagemaker.session.Session): A SageMaker Session
|
|
130
132
|
object, used for SageMaker interactions. If not
|
|
131
133
|
specified, one is created using the default AWS configuration
|
|
132
134
|
chain. (Default: sagemaker.jumpstart.constants.DEFAULT_JUMPSTART_SAGEMAKER_SESSION).
|
|
@@ -229,7 +231,11 @@ def retrieve(
|
|
|
229
231
|
container_version = version_config["container_version"][processor]
|
|
230
232
|
|
|
231
233
|
# Append sdk version in case of trainium instances
|
|
232
|
-
if repo in [
|
|
234
|
+
if repo in [
|
|
235
|
+
"pytorch-training-neuron",
|
|
236
|
+
"pytorch-training-neuronx",
|
|
237
|
+
"huggingface-vllm-inference-neuronx",
|
|
238
|
+
]:
|
|
233
239
|
if not sdk_version:
|
|
234
240
|
sdk_version = _get_latest_versions(version_config["sdk_versions"])
|
|
235
241
|
container_version = sdk_version + "-" + container_version
|
sagemaker/core/iterators.py
CHANGED
|
@@ -17,6 +17,7 @@ from abc import ABC, abstractmethod
|
|
|
17
17
|
import io
|
|
18
18
|
|
|
19
19
|
from sagemaker.core.exceptions import ModelStreamError, InternalStreamFailure
|
|
20
|
+
from sagemaker.core.common_utils import _MAX_BUFFER_SIZE
|
|
20
21
|
|
|
21
22
|
|
|
22
23
|
def handle_stream_errors(chunk):
|
|
@@ -182,5 +183,15 @@ class LineIterator(BaseIterator):
|
|
|
182
183
|
# print and move on to next response byte
|
|
183
184
|
print("Unknown event type:" + chunk)
|
|
184
185
|
continue
|
|
186
|
+
|
|
187
|
+
# Check buffer size before writing to prevent unbounded memory consumption
|
|
188
|
+
chunk_size = len(chunk["PayloadPart"]["Bytes"])
|
|
189
|
+
current_size = self.buffer.getbuffer().nbytes
|
|
190
|
+
if current_size + chunk_size > _MAX_BUFFER_SIZE:
|
|
191
|
+
raise RuntimeError(
|
|
192
|
+
f"Line buffer exceeded maximum size of {_MAX_BUFFER_SIZE} bytes. "
|
|
193
|
+
f"No newline found in stream."
|
|
194
|
+
)
|
|
195
|
+
|
|
185
196
|
self.buffer.seek(0, io.SEEK_END)
|
|
186
197
|
self.buffer.write(chunk["PayloadPart"]["Bytes"])
|
|
@@ -129,6 +129,7 @@ class InferenceAmiVersionEnum(StrEnum):
|
|
|
129
129
|
"""InferenceAmiVersionEnum"""
|
|
130
130
|
|
|
131
131
|
AL2_AMI_SAGEMAKER_INFERENCE_GPU_2 = "al2-ami-sagemaker-inference-gpu-2"
|
|
132
|
+
AL2_AMI_SAGEMAKER_INFERENCE_GPU_3_1 = "al2-ami-sagemaker-inference-gpu-3-1"
|
|
132
133
|
|
|
133
134
|
|
|
134
135
|
class ScopeEnum(StrEnum):
|
|
@@ -398,6 +399,7 @@ class CapabilityEnum(StrEnum):
|
|
|
398
399
|
TRAINING = "TRAINING"
|
|
399
400
|
FINE_TUNING = "FINE_TUNING"
|
|
400
401
|
VALIDATION = "VALIDATION"
|
|
402
|
+
INCREMENTAL_TRAINING = "INCREMENTAL_TRAINING"
|
|
401
403
|
|
|
402
404
|
|
|
403
405
|
class DemoNotebookModel(BaseConfig):
|
|
@@ -7,6 +7,10 @@
|
|
|
7
7
|
"content_bucket": "jumpstart-cache-prod-ap-east-1",
|
|
8
8
|
"gated_content_bucket": "jumpstart-private-cache-prod-ap-east-1"
|
|
9
9
|
},
|
|
10
|
+
"ap-east-2": {
|
|
11
|
+
"content_bucket": "jumpstart-cache-prod-ap-east-2",
|
|
12
|
+
"gated_content_bucket": "jumpstart-private-cache-prod-ap-east-2"
|
|
13
|
+
},
|
|
10
14
|
"ap-northeast-1": {
|
|
11
15
|
"content_bucket": "jumpstart-cache-prod-ap-northeast-1",
|
|
12
16
|
"gated_content_bucket": "jumpstart-private-cache-prod-ap-northeast-1",
|
|
@@ -53,6 +57,10 @@
|
|
|
53
57
|
"content_bucket": "jumpstart-cache-prod-ap-southeast-5",
|
|
54
58
|
"gated_content_bucket": "jumpstart-private-cache-prod-ap-southeast-5"
|
|
55
59
|
},
|
|
60
|
+
"ap-southeast-6": {
|
|
61
|
+
"content_bucket": "jumpstart-cache-prod-ap-southeast-6",
|
|
62
|
+
"gated_content_bucket": "jumpstart-private-cache-prod-ap-southeast-6"
|
|
63
|
+
},
|
|
56
64
|
"ap-southeast-7": {
|
|
57
65
|
"content_bucket": "jumpstart-cache-prod-ap-southeast-7",
|
|
58
66
|
"gated_content_bucket": "jumpstart-private-cache-prod-ap-southeast-7"
|
sagemaker/core/local/data.py
CHANGED
|
@@ -24,6 +24,7 @@ from six import with_metaclass
|
|
|
24
24
|
from six.moves.urllib.parse import urlparse
|
|
25
25
|
|
|
26
26
|
import sagemaker.core
|
|
27
|
+
from sagemaker.core.common_utils import _SENSITIVE_SYSTEM_PATHS
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
def get_data_source_instance(data_source, sagemaker_session):
|
|
@@ -120,6 +121,15 @@ class LocalFileDataSource(DataSource):
|
|
|
120
121
|
super(LocalFileDataSource, self).__init__()
|
|
121
122
|
|
|
122
123
|
self.root_path = os.path.abspath(root_path)
|
|
124
|
+
|
|
125
|
+
# Validate that the path is not in restricted locations
|
|
126
|
+
for restricted_path in _SENSITIVE_SYSTEM_PATHS:
|
|
127
|
+
if self.root_path != "/" and self.root_path.startswith(restricted_path):
|
|
128
|
+
raise ValueError(
|
|
129
|
+
f"Local Mode does not support mounting from restricted system paths. "
|
|
130
|
+
f"Got: {root_path}"
|
|
131
|
+
)
|
|
132
|
+
|
|
123
133
|
if not os.path.exists(self.root_path):
|
|
124
134
|
raise RuntimeError("Invalid data source: %s does not exist." % self.root_path)
|
|
125
135
|
|
sagemaker/core/local/utils.py
CHANGED
|
@@ -48,10 +48,7 @@ def copy_directory_structure(destination_directory, relative_path):
|
|
|
48
48
|
destination_directory
|
|
49
49
|
"""
|
|
50
50
|
full_path = os.path.join(destination_directory, relative_path)
|
|
51
|
-
|
|
52
|
-
return
|
|
53
|
-
|
|
54
|
-
os.makedirs(destination_directory, relative_path)
|
|
51
|
+
os.makedirs(full_path, exist_ok=True)
|
|
55
52
|
|
|
56
53
|
|
|
57
54
|
def move_to_destination(source, destination, job_name, sagemaker_session, prefix=""):
|
|
@@ -137,7 +134,11 @@ def get_child_process_ids(pid):
|
|
|
137
134
|
Returns:
|
|
138
135
|
(List[int]): Child process ids
|
|
139
136
|
"""
|
|
140
|
-
|
|
137
|
+
if not str(pid).isdigit():
|
|
138
|
+
raise ValueError("Invalid PID")
|
|
139
|
+
|
|
140
|
+
cmd = ["pgrep", "-P", str(pid)]
|
|
141
|
+
|
|
141
142
|
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
142
143
|
output, err = process.communicate()
|
|
143
144
|
if err:
|
|
@@ -1117,6 +1117,8 @@ class ModelExplainabilityMonitor(ClarifyModelMonitor):
|
|
|
1117
1117
|
monitor_schedule_name=monitor_schedule_name,
|
|
1118
1118
|
job_definition_name=new_job_definition_name,
|
|
1119
1119
|
schedule_cron_expression=schedule_cron_expression,
|
|
1120
|
+
data_analysis_start_time=data_analysis_start_time,
|
|
1121
|
+
data_analysis_end_time=data_analysis_end_time,
|
|
1120
1122
|
)
|
|
1121
1123
|
self.job_definition_name = new_job_definition_name
|
|
1122
1124
|
self.monitoring_schedule_name = monitor_schedule_name
|
sagemaker/core/model_registry.py
CHANGED
|
@@ -98,7 +98,7 @@ def get_model_package_args(
|
|
|
98
98
|
if source_uri is not None:
|
|
99
99
|
model_package_args["source_uri"] = source_uri
|
|
100
100
|
if model_life_cycle is not None:
|
|
101
|
-
model_package_args["model_life_cycle"] = model_life_cycle
|
|
101
|
+
model_package_args["model_life_cycle"] = model_life_cycle._to_request_dict()
|
|
102
102
|
if model_card is not None:
|
|
103
103
|
original_req = model_card._create_request_args()
|
|
104
104
|
if original_req.get("ModelCardName") is not None:
|
|
@@ -45,8 +45,9 @@ from sagemaker.core.shapes import (
|
|
|
45
45
|
InstanceGroup,
|
|
46
46
|
TensorBoardOutputConfig,
|
|
47
47
|
CheckpointConfig,
|
|
48
|
+
MetricDefinition,
|
|
48
49
|
)
|
|
49
|
-
|
|
50
|
+
from typing import List
|
|
50
51
|
|
|
51
52
|
__all__ = [
|
|
52
53
|
"SourceCode",
|
|
@@ -70,6 +71,7 @@ __all__ = [
|
|
|
70
71
|
"Compute",
|
|
71
72
|
"Networking",
|
|
72
73
|
"InputData",
|
|
74
|
+
"MetricDefinition",
|
|
73
75
|
]
|
|
74
76
|
|
|
75
77
|
from sagemaker.core.modules.utils import convert_unassigned_to_none
|
|
@@ -99,12 +101,23 @@ class SourceCode(BaseConfig):
|
|
|
99
101
|
command (Optional[str]):
|
|
100
102
|
The command(s) to execute in the training job container. Example: "python my_script.py".
|
|
101
103
|
If not specified, entry_script must be provided.
|
|
104
|
+
ignore_patterns: (Optional[List[str]]) :
|
|
105
|
+
The ignore patterns to ignore specific files/folders when uploading to S3. If not specified,
|
|
106
|
+
default to: ['.env', '.git', '__pycache__', '.DS_Store', '.cache', '.ipynb_checkpoints'].
|
|
102
107
|
"""
|
|
103
108
|
|
|
104
109
|
source_dir: Optional[str] = None
|
|
105
110
|
requirements: Optional[str] = None
|
|
106
111
|
entry_script: Optional[str] = None
|
|
107
112
|
command: Optional[str] = None
|
|
113
|
+
ignore_patterns: Optional[List[str]] = [
|
|
114
|
+
".env",
|
|
115
|
+
".git",
|
|
116
|
+
"__pycache__",
|
|
117
|
+
".DS_Store",
|
|
118
|
+
".cache",
|
|
119
|
+
".ipynb_checkpoints",
|
|
120
|
+
]
|
|
108
121
|
|
|
109
122
|
|
|
110
123
|
class Compute(shapes.ResourceConfig):
|
|
@@ -124,10 +124,8 @@ def safe_deserialize(data: Any) -> Any:
|
|
|
124
124
|
|
|
125
125
|
This function handles the following cases:
|
|
126
126
|
1. If `data` is not a string, it returns the input as-is.
|
|
127
|
-
2. If `data` is a string
|
|
128
|
-
|
|
129
|
-
3. If `data` is a JSON-encoded string, it attempts to deserialize it using `json.loads()`.
|
|
130
|
-
4. If `data` is a string but cannot be decoded as JSON, it returns the original string.
|
|
127
|
+
2. If `data` is a JSON-encoded string, it attempts to deserialize it using `json.loads()`.
|
|
128
|
+
3. If `data` is a string but cannot be decoded as JSON, it returns the original string.
|
|
131
129
|
|
|
132
130
|
Returns:
|
|
133
131
|
Any: The deserialized data, or the original input if it cannot be JSON-decoded.
|
|
@@ -135,12 +133,6 @@ def safe_deserialize(data: Any) -> Any:
|
|
|
135
133
|
if not isinstance(data, str):
|
|
136
134
|
return data
|
|
137
135
|
|
|
138
|
-
lower_data = data.lower()
|
|
139
|
-
if lower_data in ["true"]:
|
|
140
|
-
return True
|
|
141
|
-
if lower_data in ["false"]:
|
|
142
|
-
return False
|
|
143
|
-
|
|
144
136
|
try:
|
|
145
137
|
return json.loads(data)
|
|
146
138
|
except json.JSONDecodeError:
|
|
@@ -129,7 +129,7 @@ def _get_trainining_recipe_gpu_model_name_and_script(model_type: str):
|
|
|
129
129
|
"""Get the model base name and script for the training recipe."""
|
|
130
130
|
|
|
131
131
|
model_type_to_script = {
|
|
132
|
-
"
|
|
132
|
+
"llama": ("llama", "llama_pretrain.py"),
|
|
133
133
|
"mistral": ("mistral", "mistral_pretrain.py"),
|
|
134
134
|
"mixtral": ("mixtral", "mixtral_pretrain.py"),
|
|
135
135
|
"deepseek": ("deepseek", "deepseek_pretrain.py"),
|
sagemaker/core/processing.py
CHANGED
|
@@ -625,6 +625,8 @@ class Processor(object):
|
|
|
625
625
|
from sagemaker.core.utils.code_injection.codec import transform
|
|
626
626
|
|
|
627
627
|
transformed = transform(serialized_request, "CreateProcessingJobRequest")
|
|
628
|
+
# Remove tags from transformed dict as ProcessingJob resource doesn't accept it
|
|
629
|
+
transformed.pop("tags", None)
|
|
628
630
|
return ProcessingJob(**transformed)
|
|
629
631
|
|
|
630
632
|
def _get_process_args(self, inputs, outputs, experiment_config):
|
|
@@ -303,6 +303,9 @@ def remote(
|
|
|
303
303
|
"""
|
|
304
304
|
|
|
305
305
|
def _remote(func):
|
|
306
|
+
|
|
307
|
+
if job_conda_env:
|
|
308
|
+
RemoteExecutor._validate_env_name(job_conda_env)
|
|
306
309
|
|
|
307
310
|
job_settings = _JobSettings(
|
|
308
311
|
dependencies=dependencies,
|
|
@@ -366,7 +369,7 @@ def remote(
|
|
|
366
369
|
s3_uri=s3_path_join(
|
|
367
370
|
job_settings.s3_root_uri, job.job_name, EXCEPTION_FOLDER
|
|
368
371
|
),
|
|
369
|
-
|
|
372
|
+
|
|
370
373
|
)
|
|
371
374
|
except ServiceError as serr:
|
|
372
375
|
chained_e = serr.__cause__
|
|
@@ -403,7 +406,7 @@ def remote(
|
|
|
403
406
|
return serialization.deserialize_obj_from_s3(
|
|
404
407
|
sagemaker_session=job_settings.sagemaker_session,
|
|
405
408
|
s3_uri=s3_path_join(job_settings.s3_root_uri, job.job_name, RESULTS_FOLDER),
|
|
406
|
-
|
|
409
|
+
|
|
407
410
|
)
|
|
408
411
|
|
|
409
412
|
if job.describe()["TrainingJobStatus"] == "Stopped":
|
|
@@ -774,6 +777,9 @@ class RemoteExecutor(object):
|
|
|
774
777
|
+ "without spark_config or use_torchrun or use_mpirun. "
|
|
775
778
|
+ "Please provide instance_count = 1"
|
|
776
779
|
)
|
|
780
|
+
|
|
781
|
+
if job_conda_env:
|
|
782
|
+
self._validate_env_name(job_conda_env)
|
|
777
783
|
|
|
778
784
|
self.job_settings = _JobSettings(
|
|
779
785
|
dependencies=dependencies,
|
|
@@ -951,6 +957,25 @@ class RemoteExecutor(object):
|
|
|
951
957
|
+ f"{'arguments' if len(missing_kwargs) > 1 else 'argument'}: "
|
|
952
958
|
+ f"{missing_kwargs_string}"
|
|
953
959
|
)
|
|
960
|
+
|
|
961
|
+
@staticmethod
|
|
962
|
+
def _validate_env_name(env_name: str) -> None:
|
|
963
|
+
"""Validate conda environment name to prevent command injection.
|
|
964
|
+
|
|
965
|
+
Args:
|
|
966
|
+
env_name (str): The environment name to validate
|
|
967
|
+
|
|
968
|
+
Raises:
|
|
969
|
+
ValueError: If the environment name contains invalid characters
|
|
970
|
+
"""
|
|
971
|
+
|
|
972
|
+
# Allow only alphanumeric, underscore, and hyphen
|
|
973
|
+
import re
|
|
974
|
+
if not re.match(r'^[a-zA-Z0-9_-]+$', env_name):
|
|
975
|
+
raise ValueError(
|
|
976
|
+
f"Invalid environment name '{env_name}'. "
|
|
977
|
+
"Only alphanumeric characters, underscores, and hyphens are allowed."
|
|
978
|
+
)
|
|
954
979
|
|
|
955
980
|
|
|
956
981
|
class Future(object):
|
|
@@ -983,7 +1008,7 @@ class Future(object):
|
|
|
983
1008
|
job_return = serialization.deserialize_obj_from_s3(
|
|
984
1009
|
sagemaker_session=sagemaker_session,
|
|
985
1010
|
s3_uri=s3_path_join(job.s3_uri, RESULTS_FOLDER),
|
|
986
|
-
|
|
1011
|
+
|
|
987
1012
|
)
|
|
988
1013
|
except DeserializationError as e:
|
|
989
1014
|
client_exception = e
|
|
@@ -995,7 +1020,7 @@ class Future(object):
|
|
|
995
1020
|
job_exception = serialization.deserialize_exception_from_s3(
|
|
996
1021
|
sagemaker_session=sagemaker_session,
|
|
997
1022
|
s3_uri=s3_path_join(job.s3_uri, EXCEPTION_FOLDER),
|
|
998
|
-
|
|
1023
|
+
|
|
999
1024
|
)
|
|
1000
1025
|
except ServiceError as serr:
|
|
1001
1026
|
chained_e = serr.__cause__
|
|
@@ -1085,7 +1110,7 @@ class Future(object):
|
|
|
1085
1110
|
self._return = serialization.deserialize_obj_from_s3(
|
|
1086
1111
|
sagemaker_session=self._job.sagemaker_session,
|
|
1087
1112
|
s3_uri=s3_path_join(self._job.s3_uri, RESULTS_FOLDER),
|
|
1088
|
-
|
|
1113
|
+
|
|
1089
1114
|
)
|
|
1090
1115
|
self._state = _FINISHED
|
|
1091
1116
|
return self._return
|
|
@@ -1094,7 +1119,7 @@ class Future(object):
|
|
|
1094
1119
|
self._exception = serialization.deserialize_exception_from_s3(
|
|
1095
1120
|
sagemaker_session=self._job.sagemaker_session,
|
|
1096
1121
|
s3_uri=s3_path_join(self._job.s3_uri, EXCEPTION_FOLDER),
|
|
1097
|
-
|
|
1122
|
+
|
|
1098
1123
|
)
|
|
1099
1124
|
except ServiceError as serr:
|
|
1100
1125
|
chained_e = serr.__cause__
|
|
@@ -164,7 +164,6 @@ class _DelayedReturnResolver:
|
|
|
164
164
|
def __init__(
|
|
165
165
|
self,
|
|
166
166
|
delayed_returns: List[_DelayedReturn],
|
|
167
|
-
hmac_key: str,
|
|
168
167
|
properties_resolver: _PropertiesResolver,
|
|
169
168
|
parameter_resolver: _ParameterResolver,
|
|
170
169
|
execution_variable_resolver: _ExecutionVariableResolver,
|
|
@@ -175,7 +174,6 @@ class _DelayedReturnResolver:
|
|
|
175
174
|
|
|
176
175
|
Args:
|
|
177
176
|
delayed_returns: list of delayed returns to resolve.
|
|
178
|
-
hmac_key: key used to encrypt serialized and deserialized function and arguments.
|
|
179
177
|
properties_resolver: resolver used to resolve step properties.
|
|
180
178
|
parameter_resolver: resolver used to pipeline parameters.
|
|
181
179
|
execution_variable_resolver: resolver used to resolve execution variables.
|
|
@@ -197,7 +195,6 @@ class _DelayedReturnResolver:
|
|
|
197
195
|
return uri, deserialize_obj_from_s3(
|
|
198
196
|
sagemaker_session=settings["sagemaker_session"],
|
|
199
197
|
s3_uri=uri,
|
|
200
|
-
hmac_key=hmac_key,
|
|
201
198
|
)
|
|
202
199
|
|
|
203
200
|
with ThreadPoolExecutor() as executor:
|
|
@@ -247,7 +244,6 @@ def resolve_pipeline_variables(
|
|
|
247
244
|
context: Context,
|
|
248
245
|
func_args: Tuple,
|
|
249
246
|
func_kwargs: Dict,
|
|
250
|
-
hmac_key: str,
|
|
251
247
|
s3_base_uri: str,
|
|
252
248
|
**settings,
|
|
253
249
|
):
|
|
@@ -257,7 +253,6 @@ def resolve_pipeline_variables(
|
|
|
257
253
|
context: context for the execution.
|
|
258
254
|
func_args: function args.
|
|
259
255
|
func_kwargs: function kwargs.
|
|
260
|
-
hmac_key: key used to encrypt serialized and deserialized function and arguments.
|
|
261
256
|
s3_base_uri: the s3 base uri of the function step that the serialized artifacts
|
|
262
257
|
will be uploaded to. The s3_base_uri = s3_root_uri + pipeline_name.
|
|
263
258
|
**settings: settings to pass to the deserialization function.
|
|
@@ -280,7 +275,6 @@ def resolve_pipeline_variables(
|
|
|
280
275
|
properties_resolver = _PropertiesResolver(context)
|
|
281
276
|
delayed_return_resolver = _DelayedReturnResolver(
|
|
282
277
|
delayed_returns=delayed_returns,
|
|
283
|
-
hmac_key=hmac_key,
|
|
284
278
|
properties_resolver=properties_resolver,
|
|
285
279
|
parameter_resolver=parameter_resolver,
|
|
286
280
|
execution_variable_resolver=execution_variable_resolver,
|