sagemaker-core 1.0.47__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sagemaker/core/__init__.py +16 -0
- sagemaker/core/_studio.py +116 -0
- sagemaker/core/_version.py +11 -0
- sagemaker/core/accept_types.py +131 -0
- sagemaker/core/analytics.py +744 -0
- sagemaker/core/apiutils/__init__.py +13 -0
- sagemaker/core/apiutils/_base_types.py +228 -0
- sagemaker/core/apiutils/_boto_functions.py +130 -0
- sagemaker/core/apiutils/_utils.py +34 -0
- sagemaker/core/base_deserializers.py +35 -0
- sagemaker/core/base_serializers.py +35 -0
- sagemaker/core/clarify/__init__.py +2898 -0
- sagemaker/core/collection.py +467 -0
- sagemaker/core/common_utils.py +2281 -0
- sagemaker/core/compute_resource_requirements/__init__.py +18 -0
- sagemaker/core/compute_resource_requirements/resource_requirements.py +94 -0
- sagemaker/core/config/__init__.py +181 -0
- sagemaker/core/config/config.py +238 -0
- sagemaker/core/config/config_manager.py +595 -0
- sagemaker/core/config/config_schema.py +1220 -0
- sagemaker/core/config/config_utils.py +297 -0
- {sagemaker_core/main → sagemaker/core}/config_schema.py +410 -4
- sagemaker/core/constants.py +73 -0
- sagemaker/core/content_types.py +137 -0
- sagemaker/core/debugger/__init__.py +39 -0
- sagemaker/core/debugger/debugger.py +945 -0
- sagemaker/core/debugger/framework_profile.py +292 -0
- sagemaker/core/debugger/metrics_config.py +468 -0
- sagemaker/core/debugger/profiler.py +42 -0
- sagemaker/core/debugger/profiler_config.py +190 -0
- sagemaker/core/debugger/profiler_constants.py +40 -0
- sagemaker/core/debugger/utils.py +148 -0
- sagemaker/core/deprecations.py +254 -0
- sagemaker/core/deserializers/__init__.py +10 -0
- sagemaker/core/deserializers/base.py +424 -0
- sagemaker/core/deserializers/implementations.py +157 -0
- sagemaker/core/drift_check_baselines.py +106 -0
- sagemaker/core/enums.py +51 -0
- sagemaker/core/environment_variables.py +101 -0
- sagemaker/core/exceptions.py +108 -0
- sagemaker/core/experiments/__init__.py +53 -0
- sagemaker/core/experiments/_api_types.py +251 -0
- sagemaker/core/experiments/_environment.py +124 -0
- sagemaker/core/experiments/_helper.py +294 -0
- sagemaker/core/experiments/_metrics.py +333 -0
- sagemaker/core/experiments/_run_context.py +58 -0
- sagemaker/core/experiments/_utils.py +216 -0
- sagemaker/core/experiments/experiment.py +244 -0
- sagemaker/core/experiments/run.py +970 -0
- sagemaker/core/experiments/trial.py +296 -0
- sagemaker/core/experiments/trial_component.py +387 -0
- sagemaker/core/explainer/__init__.py +24 -0
- sagemaker/core/explainer/clarify_explainer_config.py +298 -0
- sagemaker/core/explainer/explainer_config.py +44 -0
- sagemaker/core/fw_utils.py +1176 -0
- sagemaker/core/git_utils.py +349 -0
- sagemaker/core/helper/pipeline_variable.py +82 -0
- sagemaker/core/helper/session_helper.py +2965 -0
- sagemaker/core/huggingface/__init__.py +29 -0
- sagemaker/core/huggingface/llm_utils.py +150 -0
- sagemaker/core/huggingface/processing.py +139 -0
- sagemaker/core/huggingface/training_compiler/config.py +167 -0
- sagemaker/core/hyperparameters.py +172 -0
- sagemaker/core/image_retriever/__init__.py +3 -0
- sagemaker/core/image_retriever/image_retriever.py +640 -0
- sagemaker/core/image_retriever/image_retriever_utils.py +511 -0
- sagemaker/core/image_retriever/test.py +7 -0
- sagemaker/core/image_uri_config/__init__.py +13 -0
- sagemaker/core/image_uri_config/autogluon.json +1335 -0
- sagemaker/core/image_uri_config/blazingtext.json +50 -0
- sagemaker/core/image_uri_config/chainer.json +104 -0
- sagemaker/core/image_uri_config/clarify.json +39 -0
- sagemaker/core/image_uri_config/coach-mxnet.json +70 -0
- sagemaker/core/image_uri_config/coach-tensorflow.json +186 -0
- sagemaker/core/image_uri_config/data-wrangler.json +91 -0
- sagemaker/core/image_uri_config/debugger.json +34 -0
- sagemaker/core/image_uri_config/detailed-profiler.json +18 -0
- sagemaker/core/image_uri_config/djl-deepspeed.json +385 -0
- sagemaker/core/image_uri_config/djl-fastertransformer.json +167 -0
- sagemaker/core/image_uri_config/djl-lmi.json +136 -0
- sagemaker/core/image_uri_config/djl-neuronx.json +258 -0
- sagemaker/core/image_uri_config/djl-tensorrtllm.json +262 -0
- sagemaker/core/image_uri_config/factorization-machines.json +50 -0
- sagemaker/core/image_uri_config/forecasting-deepar.json +50 -0
- sagemaker/core/image_uri_config/huggingface-llm-neuronx.json +660 -0
- sagemaker/core/image_uri_config/huggingface-llm.json +1158 -0
- sagemaker/core/image_uri_config/huggingface-neuron.json +52 -0
- sagemaker/core/image_uri_config/huggingface-neuronx.json +510 -0
- sagemaker/core/image_uri_config/huggingface-tei-cpu.json +298 -0
- sagemaker/core/image_uri_config/huggingface-tei.json +298 -0
- sagemaker/core/image_uri_config/huggingface-training-compiler.json +195 -0
- sagemaker/core/image_uri_config/huggingface.json +2138 -0
- sagemaker/core/image_uri_config/hyperpod-recipes-neuron.json +52 -0
- sagemaker/core/image_uri_config/image-classification-neo.json +43 -0
- sagemaker/core/image_uri_config/image-classification.json +50 -0
- sagemaker/core/image_uri_config/inferentia-mxnet.json +88 -0
- sagemaker/core/image_uri_config/inferentia-pytorch.json +127 -0
- sagemaker/core/image_uri_config/inferentia-tensorflow.json +88 -0
- sagemaker/core/image_uri_config/instance_gpu_info.json +782 -0
- sagemaker/core/image_uri_config/ipinsights.json +50 -0
- sagemaker/core/image_uri_config/kmeans.json +50 -0
- sagemaker/core/image_uri_config/knn.json +50 -0
- sagemaker/core/image_uri_config/lda.json +26 -0
- sagemaker/core/image_uri_config/linear-learner.json +50 -0
- sagemaker/core/image_uri_config/model-monitor.json +42 -0
- sagemaker/core/image_uri_config/mxnet.json +1154 -0
- sagemaker/core/image_uri_config/neo-mxnet.json +64 -0
- sagemaker/core/image_uri_config/neo-pytorch.json +341 -0
- sagemaker/core/image_uri_config/neo-tensorflow.json +109 -0
- sagemaker/core/image_uri_config/ntm.json +50 -0
- sagemaker/core/image_uri_config/object-detection.json +50 -0
- sagemaker/core/image_uri_config/object2vec.json +50 -0
- sagemaker/core/image_uri_config/pca.json +50 -0
- sagemaker/core/image_uri_config/pytorch-neuron.json +43 -0
- sagemaker/core/image_uri_config/pytorch-smp.json +218 -0
- sagemaker/core/image_uri_config/pytorch-training-compiler.json +80 -0
- sagemaker/core/image_uri_config/pytorch.json +3101 -0
- sagemaker/core/image_uri_config/randomcutforest.json +50 -0
- sagemaker/core/image_uri_config/ray-pytorch.json +46 -0
- sagemaker/core/image_uri_config/ray-tensorflow.json +194 -0
- sagemaker/core/image_uri_config/sagemaker-base-python.json +46 -0
- sagemaker/core/image_uri_config/sagemaker-distribution.json +37 -0
- sagemaker/core/image_uri_config/sagemaker-geospatial.json +13 -0
- sagemaker/core/image_uri_config/sagemaker-tritonserver.json +212 -0
- sagemaker/core/image_uri_config/semantic-segmentation.json +50 -0
- sagemaker/core/image_uri_config/seq2seq.json +50 -0
- sagemaker/core/image_uri_config/sklearn.json +446 -0
- sagemaker/core/image_uri_config/spark.json +280 -0
- sagemaker/core/image_uri_config/sparkml-serving.json +97 -0
- sagemaker/core/image_uri_config/stabilityai.json +53 -0
- sagemaker/core/image_uri_config/tensorflow.json +5086 -0
- sagemaker/core/image_uri_config/vw.json +25 -0
- sagemaker/core/image_uri_config/xgboost-neo.json +43 -0
- sagemaker/core/image_uri_config/xgboost.json +888 -0
- sagemaker/core/image_uris.py +810 -0
- sagemaker/core/inference_config.py +144 -0
- sagemaker/core/inference_recommender/__init__.py +18 -0
- sagemaker/core/inference_recommender/inference_recommender_mixin.py +622 -0
- sagemaker/core/inputs.py +366 -0
- sagemaker/core/instance_group.py +61 -0
- sagemaker/core/instance_types.py +164 -0
- sagemaker/core/instance_types_gpu_info.py +43 -0
- sagemaker/core/interactive_apps/__init__.py +41 -0
- sagemaker/core/interactive_apps/base_interactive_app.py +204 -0
- sagemaker/core/interactive_apps/detail_profiler_app.py +139 -0
- sagemaker/core/interactive_apps/tensorboard.py +149 -0
- sagemaker/core/iterators.py +186 -0
- sagemaker/core/job.py +380 -0
- sagemaker/core/jumpstart/__init__.py +156 -0
- sagemaker/core/jumpstart/accessors.py +390 -0
- sagemaker/core/jumpstart/artifacts/__init__.py +69 -0
- sagemaker/core/jumpstart/artifacts/environment_variables.py +252 -0
- sagemaker/core/jumpstart/artifacts/hyperparameters.py +120 -0
- sagemaker/core/jumpstart/artifacts/image_uris.py +139 -0
- sagemaker/core/jumpstart/artifacts/incremental_training.py +87 -0
- sagemaker/core/jumpstart/artifacts/instance_types.py +223 -0
- sagemaker/core/jumpstart/artifacts/kwargs.py +289 -0
- sagemaker/core/jumpstart/artifacts/metric_definitions.py +117 -0
- sagemaker/core/jumpstart/artifacts/model_packages.py +202 -0
- sagemaker/core/jumpstart/artifacts/model_uris.py +252 -0
- sagemaker/core/jumpstart/artifacts/payloads.py +96 -0
- sagemaker/core/jumpstart/artifacts/predictors.py +540 -0
- sagemaker/core/jumpstart/artifacts/resource_names.py +86 -0
- sagemaker/core/jumpstart/artifacts/resource_requirements.py +162 -0
- sagemaker/core/jumpstart/artifacts/script_uris.py +172 -0
- sagemaker/core/jumpstart/cache.py +663 -0
- sagemaker/core/jumpstart/configs.py +50 -0
- sagemaker/core/jumpstart/constants.py +198 -0
- sagemaker/core/jumpstart/deserializers.py +81 -0
- sagemaker/core/jumpstart/document.py +76 -0
- sagemaker/core/jumpstart/enums.py +168 -0
- sagemaker/core/jumpstart/exceptions.py +236 -0
- sagemaker/core/jumpstart/factory/utils.py +833 -0
- sagemaker/core/jumpstart/filters.py +597 -0
- sagemaker/core/jumpstart/hub/__init__.py +0 -0
- sagemaker/core/jumpstart/hub/constants.py +16 -0
- sagemaker/core/jumpstart/hub/hub.py +291 -0
- sagemaker/core/jumpstart/hub/interfaces.py +936 -0
- sagemaker/core/jumpstart/hub/parser_utils.py +70 -0
- sagemaker/core/jumpstart/hub/parsers.py +288 -0
- sagemaker/core/jumpstart/hub/types.py +35 -0
- sagemaker/core/jumpstart/hub/utils.py +260 -0
- sagemaker/core/jumpstart/models.py +499 -0
- sagemaker/core/jumpstart/notebook_utils.py +575 -0
- sagemaker/core/jumpstart/parameters.py +20 -0
- sagemaker/core/jumpstart/payload_utils.py +239 -0
- sagemaker/core/jumpstart/region_config.json +163 -0
- sagemaker/core/jumpstart/search.py +171 -0
- sagemaker/core/jumpstart/serializers.py +81 -0
- sagemaker/core/jumpstart/session_utils.py +234 -0
- sagemaker/core/jumpstart/types.py +3044 -0
- sagemaker/core/jumpstart/utils.py +1731 -0
- sagemaker/core/jumpstart/validators.py +257 -0
- sagemaker/core/lambda_helper.py +312 -0
- sagemaker/core/lineage/__init__.py +42 -0
- sagemaker/core/lineage/_api_types.py +239 -0
- sagemaker/core/lineage/_utils.py +49 -0
- sagemaker/core/lineage/action.py +345 -0
- sagemaker/core/lineage/artifact.py +646 -0
- sagemaker/core/lineage/association.py +190 -0
- sagemaker/core/lineage/context.py +505 -0
- sagemaker/core/lineage/lineage_trial_component.py +191 -0
- sagemaker/core/lineage/query.py +732 -0
- sagemaker/core/lineage/visualizer.py +346 -0
- sagemaker/core/local/__init__.py +18 -0
- sagemaker/core/local/data.py +413 -0
- sagemaker/core/local/entities.py +678 -0
- sagemaker/core/local/exceptions.py +17 -0
- sagemaker/core/local/image.py +1243 -0
- sagemaker/core/local/local_session.py +739 -0
- sagemaker/core/local/utils.py +245 -0
- sagemaker/core/logs.py +181 -0
- sagemaker/core/metadata_properties.py +56 -0
- sagemaker/core/metric_definitions.py +91 -0
- sagemaker/core/mlflow/__init__.py +38 -0
- sagemaker/core/mlflow/forward_sagemaker_metrics.py +44 -0
- sagemaker/core/model_card/__init__.py +26 -0
- sagemaker/core/model_life_cycle.py +51 -0
- sagemaker/core/model_metrics.py +160 -0
- sagemaker/core/model_monitor/__init__.py +66 -0
- sagemaker/core/model_monitor/clarify_model_monitoring.py +1495 -0
- sagemaker/core/model_monitor/cron_expression_generator.py +82 -0
- sagemaker/core/model_monitor/data_capture_config.py +115 -0
- sagemaker/core/model_monitor/data_quality_monitoring_config.py +66 -0
- sagemaker/core/model_monitor/dataset_format.py +102 -0
- sagemaker/core/model_monitor/model_monitoring.py +4266 -0
- sagemaker/core/model_monitor/monitoring_alert.py +76 -0
- sagemaker/core/model_monitor/monitoring_files.py +506 -0
- sagemaker/core/model_monitor/utils.py +793 -0
- sagemaker/core/model_registry.py +480 -0
- sagemaker/core/model_uris.py +97 -0
- sagemaker/core/modules/__init__.py +19 -0
- sagemaker/core/modules/configs.py +226 -0
- sagemaker/core/modules/constants.py +37 -0
- sagemaker/core/modules/distributed.py +182 -0
- sagemaker/core/modules/local_core/__init__.py +0 -0
- sagemaker/core/modules/local_core/local_container.py +605 -0
- sagemaker/core/modules/templates.py +83 -0
- sagemaker/core/modules/train/__init__.py +14 -0
- sagemaker/core/modules/train/container_drivers/__init__.py +14 -0
- sagemaker/core/modules/train/container_drivers/common/__init__.py +14 -0
- sagemaker/core/modules/train/container_drivers/common/utils.py +213 -0
- sagemaker/core/modules/train/container_drivers/distributed_drivers/__init__.py +14 -0
- sagemaker/core/modules/train/container_drivers/distributed_drivers/basic_script_driver.py +81 -0
- sagemaker/core/modules/train/container_drivers/distributed_drivers/mpi_driver.py +123 -0
- sagemaker/core/modules/train/container_drivers/distributed_drivers/mpi_utils.py +302 -0
- sagemaker/core/modules/train/container_drivers/distributed_drivers/torchrun_driver.py +129 -0
- sagemaker/core/modules/train/container_drivers/scripts/__init__.py +14 -0
- sagemaker/core/modules/train/container_drivers/scripts/environment.py +305 -0
- sagemaker/core/modules/train/sm_recipes/__init__.py +0 -0
- sagemaker/core/modules/train/sm_recipes/utils.py +330 -0
- sagemaker/core/modules/types.py +19 -0
- sagemaker/core/modules/utils.py +194 -0
- sagemaker/core/network.py +185 -0
- sagemaker/core/parameter.py +173 -0
- sagemaker/core/payloads.py +185 -0
- sagemaker/core/processing.py +1597 -0
- sagemaker/core/remote_function/__init__.py +19 -0
- sagemaker/core/remote_function/checkpoint_location.py +47 -0
- sagemaker/core/remote_function/client.py +1285 -0
- sagemaker/core/remote_function/core/__init__.py +0 -0
- sagemaker/core/remote_function/core/_custom_dispatch_table.py +72 -0
- sagemaker/core/remote_function/core/pipeline_variables.py +353 -0
- sagemaker/core/remote_function/core/serialization.py +422 -0
- sagemaker/core/remote_function/core/stored_function.py +226 -0
- sagemaker/core/remote_function/custom_file_filter.py +128 -0
- sagemaker/core/remote_function/errors.py +104 -0
- sagemaker/core/remote_function/invoke_function.py +172 -0
- sagemaker/core/remote_function/job.py +2140 -0
- sagemaker/core/remote_function/logging_config.py +38 -0
- sagemaker/core/remote_function/runtime_environment/__init__.py +14 -0
- sagemaker/core/remote_function/runtime_environment/bootstrap_runtime_environment.py +605 -0
- sagemaker/core/remote_function/runtime_environment/mpi_utils_remote.py +252 -0
- sagemaker/core/remote_function/runtime_environment/runtime_environment_manager.py +554 -0
- sagemaker/core/remote_function/runtime_environment/spark_app.py +18 -0
- sagemaker/core/remote_function/spark_config.py +149 -0
- sagemaker/core/resource_requirements.py +168 -0
- {sagemaker_core/main → sagemaker/core}/resources.py +20121 -11728
- sagemaker/core/s3/__init__.py +41 -0
- sagemaker/core/s3/client.py +367 -0
- sagemaker/core/s3/utils.py +175 -0
- sagemaker/core/script_uris.py +93 -0
- sagemaker/core/serializers/__init__.py +11 -0
- sagemaker/core/serializers/base.py +510 -0
- sagemaker/core/serializers/implementations.py +159 -0
- sagemaker/core/serializers/utils.py +223 -0
- sagemaker/core/serverless_inference_config.py +63 -0
- sagemaker/core/session_settings.py +55 -0
- sagemaker/core/shapes/__init__.py +3 -0
- sagemaker/core/shapes/model_card_shapes.py +159 -0
- {sagemaker_core/main → sagemaker/core/shapes}/shapes.py +6384 -1865
- sagemaker/core/spark/__init__.py +16 -0
- sagemaker/core/spark/defaults.py +16 -0
- sagemaker/core/spark/processing.py +1380 -0
- sagemaker/core/telemetry/__init__.py +23 -0
- sagemaker/core/telemetry/constants.py +84 -0
- sagemaker/core/telemetry/telemetry_logging.py +284 -0
- sagemaker/core/tools/__init__.py +1 -0
- {sagemaker_core → sagemaker/core}/tools/codegen.py +4 -4
- {sagemaker_core → sagemaker/core}/tools/constants.py +23 -15
- {sagemaker_core → sagemaker/core}/tools/data_extractor.py +1 -1
- {sagemaker_core → sagemaker/core}/tools/method.py +1 -1
- sagemaker/core/tools/model_card/generate_model_card_from_schema.py +562 -0
- {sagemaker_core → sagemaker/core}/tools/resources_codegen.py +165 -98
- {sagemaker_core → sagemaker/core}/tools/resources_extractor.py +5 -13
- {sagemaker_core → sagemaker/core}/tools/shapes_codegen.py +16 -17
- {sagemaker_core → sagemaker/core}/tools/shapes_extractor.py +29 -67
- {sagemaker_core → sagemaker/core}/tools/templates.py +39 -17
- sagemaker/core/training/__init__.py +14 -0
- sagemaker/core/training/configs.py +333 -0
- sagemaker/core/training/constants.py +37 -0
- sagemaker/core/training/utils.py +77 -0
- sagemaker/core/training_compiler/__init__.py +16 -0
- sagemaker/core/training_compiler/config.py +197 -0
- sagemaker/core/training_compiler_config.py +197 -0
- sagemaker/core/transformer.py +793 -0
- sagemaker/core/user_agent.py +76 -0
- sagemaker/core/utilities/__init__.py +24 -0
- sagemaker/core/utilities/cache.py +169 -0
- sagemaker/core/utilities/search_expression.py +133 -0
- sagemaker/core/utils/__init__.py +48 -0
- sagemaker/core/utils/code_injection/__init__.py +0 -0
- {sagemaker_core/main → sagemaker/core/utils}/code_injection/codec.py +2 -2
- {sagemaker_core/main → sagemaker/core/utils}/code_injection/shape_dag.py +6479 -136
- {sagemaker_core/main → sagemaker/core/utils}/exceptions.py +8 -8
- sagemaker_core/main/default_configs_helper.py → sagemaker/core/utils/intelligent_defaults_helper.py +5 -6
- {sagemaker_core/main → sagemaker/core/utils}/logs.py +1 -2
- {sagemaker_core/main → sagemaker/core/utils}/utils.py +25 -20
- sagemaker/core/workflow/__init__.py +152 -0
- sagemaker/core/workflow/conditions.py +313 -0
- sagemaker/core/workflow/entities.py +58 -0
- sagemaker/core/workflow/execution_variables.py +89 -0
- sagemaker/core/workflow/functions.py +193 -0
- sagemaker/core/workflow/parameters.py +222 -0
- sagemaker/core/workflow/pipeline_context.py +394 -0
- sagemaker/core/workflow/pipeline_definition_config.py +31 -0
- sagemaker/core/workflow/properties.py +285 -0
- sagemaker/core/workflow/step_outputs.py +65 -0
- sagemaker/core/workflow/utilities.py +507 -0
- sagemaker/lineage/__init__.py +33 -0
- sagemaker/lineage/action.py +28 -0
- sagemaker/lineage/artifact.py +28 -0
- sagemaker/lineage/context.py +28 -0
- sagemaker/lineage/lineage_trial_component.py +28 -0
- {sagemaker_core-1.0.47.dist-info → sagemaker_core-2.1.1.dist-info}/METADATA +28 -9
- sagemaker_core-2.1.1.dist-info/RECORD +355 -0
- sagemaker_core-2.1.1.dist-info/top_level.txt +1 -0
- sagemaker_core/__init__.py +0 -4
- sagemaker_core/_version.py +0 -3
- sagemaker_core/helper/session_helper.py +0 -769
- sagemaker_core/resources/__init__.py +0 -1
- sagemaker_core/shapes/__init__.py +0 -1
- sagemaker_core/tools/__init__.py +0 -1
- sagemaker_core-1.0.47.dist-info/RECORD +0 -35
- sagemaker_core-1.0.47.dist-info/top_level.txt +0 -1
- {sagemaker_core → sagemaker/core}/helper/__init__.py +0 -0
- {sagemaker_core/main → sagemaker/core/huggingface/training_compiler}/__init__.py +0 -0
- {sagemaker_core/main/code_injection → sagemaker/core/jumpstart/factory}/__init__.py +0 -0
- {sagemaker_core/main → sagemaker/core/utils}/code_injection/base.py +0 -0
- {sagemaker_core/main → sagemaker/core/utils}/code_injection/constants.py +0 -0
- {sagemaker_core/main → sagemaker/core/utils}/user_agent.py +0 -0
- {sagemaker_core-1.0.47.dist-info → sagemaker_core-2.1.1.dist-info}/WHEEL +0 -0
- {sagemaker_core-1.0.47.dist-info → sagemaker_core-2.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You
|
|
4
|
+
# may not use this file except in compliance with the License. A copy of
|
|
5
|
+
# the License is located at
|
|
6
|
+
#
|
|
7
|
+
# http://aws.amazon.com/apache2.0/
|
|
8
|
+
#
|
|
9
|
+
# or in the "license" file accompanying this file. This file is
|
|
10
|
+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
|
11
|
+
# ANY KIND, either express or implied. See the License for the specific
|
|
12
|
+
# language governing permissions and limitations under the License.
|
|
13
|
+
"""The various types of metrics configurations that can be specified in FrameworkProfile."""
|
|
14
|
+
from __future__ import absolute_import
|
|
15
|
+
|
|
16
|
+
from sagemaker.core.debugger.profiler_constants import (
|
|
17
|
+
DATALOADER_PROFILING_CONFIG_NAME,
|
|
18
|
+
DATALOADER_PROFILING_START_STEP_DEFAULT,
|
|
19
|
+
DETAILED_PROFILING_CONFIG_NAME,
|
|
20
|
+
DETAILED_PROFILING_START_STEP_DEFAULT,
|
|
21
|
+
SMDATAPARALLEL_PROFILING_CONFIG_NAME,
|
|
22
|
+
SMDATAPARALLEL_PROFILING_START_STEP_DEFAULT,
|
|
23
|
+
HOROVOD_PROFILING_CONFIG_NAME,
|
|
24
|
+
HOROVOD_PROFILING_START_STEP_DEFAULT,
|
|
25
|
+
PROFILING_NUM_STEPS_DEFAULT,
|
|
26
|
+
PYTHON_PROFILING_CONFIG_NAME,
|
|
27
|
+
PYTHON_PROFILING_NUM_STEPS_DEFAULT,
|
|
28
|
+
PYTHON_PROFILING_START_STEP_DEFAULT,
|
|
29
|
+
START_STEP_DEFAULT,
|
|
30
|
+
)
|
|
31
|
+
from sagemaker.core.debugger.utils import (
|
|
32
|
+
convert_json_config_to_string,
|
|
33
|
+
cProfileTimer,
|
|
34
|
+
is_valid_regex,
|
|
35
|
+
is_valid_unix_time,
|
|
36
|
+
ErrorMessages,
|
|
37
|
+
PythonProfiler,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class StepRange:
    """Configuration for the range of steps to profile.

    It returns the target steps in dictionary format that you can pass to the
    :class:`~sagemaker.debugger.FrameworkProfile` class.

    """

    def __init__(self, start_step, num_steps):
        """Set the start step and the number of steps.

        Each argument is defaulted independently of the other: a missing
        ``start_step`` falls back to ``START_STEP_DEFAULT`` and a missing
        ``num_steps`` falls back to ``PROFILING_NUM_STEPS_DEFAULT``.

        Args:
            start_step (int): The step to start profiling.
            num_steps (int): The number of steps to profile.

        """
        # Two separate checks rather than if/elif: with elif, omitting both
        # values left num_steps as None and produced {"NumSteps": None}.
        if start_step is None:
            start_step = START_STEP_DEFAULT
        if num_steps is None:
            num_steps = PROFILING_NUM_STEPS_DEFAULT

        self.start_step = start_step
        self.num_steps = num_steps

    def to_json(self):
        """Convert the step range into a dictionary.

        Returns:
            dict: The step range with the keys ``StartStep`` and ``NumSteps``.

        """
        return {"StartStep": self.start_step, "NumSteps": self.num_steps}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class TimeRange:
    """Configuration for the window of Unix time to profile.

    It returns the target time duration in dictionary format that you can pass to the
    :class:`~sagemaker.debugger.FrameworkProfile` class.

    """

    def __init__(self, start_unix_time, duration):
        """Store the start Unix time and the duration.

        No defaults are applied here. A value left as ``None`` is kept as-is
        and is simply omitted from the dictionary built by :meth:`to_json`.

        Args:
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.

        """
        self.duration = duration
        self.start_unix_time = start_unix_time

    def to_json(self):
        """Convert the time range into a dictionary.

        Only the fields that were actually provided (not ``None``) are emitted.

        Returns:
            dict: The time range as a dictionary.

        """
        fields = (
            ("StartTimeInSecSinceEpoch", self.start_unix_time),
            ("Duration", self.duration),
        )
        return {key: value for key, value in fields if value is not None}
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class MetricsConfigBase:
    """The base class for the metrics configuration.

    It determines the step or time range that needs to be
    profiled and validates the input value pairs. Available profiling range parameter pairs are
    (**start_step** and **num_steps**) and (**start_unix_time** and **duration**).
    The two parameter pairs are mutually exclusive, and this class validates
    if one of the two pairs is used. If both pairs are specified, a
    FOUND_BOTH_STEP_AND_TIME_FIELDS error occurs.

    """

    def __init__(self, name, start_step, num_steps, start_unix_time, duration):
        """Validate the provided range fields and set the range to be profiled accordingly.

        A :class:`StepRange` is built when either step field is given;
        otherwise a :class:`TimeRange` is built (also when no field is given).

        Args:
            name (str): The name of the metrics config.
            start_step (int): The step to start profiling. Must be a non-negative int.
            num_steps (int): The number of steps to profile. Must be a positive int.
            start_unix_time (int): The Unix time to start profiling. Must be an int
                accepted by ``is_valid_unix_time``.
            duration (float): The duration in seconds to profile. Must be a positive
                float or int.

        Raises:
            AssertionError: If any field is invalid, or if step fields and time
                fields are both specified.

        """
        self.name = name

        # Validation raises AssertionError explicitly instead of using `assert`,
        # so the checks still run under `python -O` while callers that catch
        # AssertionError keep working.
        if not (start_step is None or (isinstance(start_step, int) and start_step >= 0)):
            raise AssertionError(ErrorMessages.INVALID_START_STEP.value)
        if not (num_steps is None or (isinstance(num_steps, int) and num_steps > 0)):
            raise AssertionError(ErrorMessages.INVALID_NUM_STEPS.value)
        if not (
            start_unix_time is None
            or (isinstance(start_unix_time, int) and is_valid_unix_time(start_unix_time))
        ):
            raise AssertionError(ErrorMessages.INVALID_START_UNIX_TIME.value)
        if not (duration is None or (isinstance(duration, (float, int)) and duration > 0)):
            raise AssertionError(ErrorMessages.INVALID_DURATION.value)

        has_step_range = start_step is not None or num_steps is not None
        has_time_range = start_unix_time is not None or duration is not None
        # The step pair and the time pair are mutually exclusive.
        if has_step_range and has_time_range:
            raise AssertionError(ErrorMessages.FOUND_BOTH_STEP_AND_TIME_FIELDS.value)

        if has_step_range:
            self.range = StepRange(start_step, num_steps)
        else:
            self.range = TimeRange(start_unix_time, duration)

    def _to_json(self):
        """Convert the metrics configuration to a dictionary.

        Delegates to the ``to_json`` method of the range object chosen in
        ``__init__``.

        Returns:
            dict: This metrics config as a dictionary.

        """
        return self.range.to_json()

    def to_json_string(self):
        """Convert this metrics configuration to dictionary formatted as a string.

        The dictionary from :meth:`_to_json` is passed through
        ``convert_json_config_to_string``. The result is intended to be such
        that calling eval on it is the same as calling ``_to_json`` directly.

        Returns:
            str: This metrics configuration as a dictionary and formatted as a string.

        """
        return convert_json_config_to_string(self._to_json())
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class DetailedProfilingConfig(MetricsConfigBase):
    """The configuration for framework metrics to be collected for detailed profiling."""

    def __init__(
        self,
        start_step=None,
        num_steps=None,
        start_unix_time=None,
        duration=None,
        profile_default_steps=False,
    ):
        """Specify target steps or a target duration to profile.

        By default, it profiles step 5 of the training job.

        If **profile_default_steps** is set to `True` or none of the
        range parameters is specified,
        the class uses the default configuration for detailed profiling
        (``DETAILED_PROFILING_START_STEP_DEFAULT`` and ``PROFILING_NUM_STEPS_DEFAULT``).
        Note that with **profile_default_steps** set to `True` the defaults
        replace any **start_step** / **num_steps** that were passed in.

        Args:
            start_step (int): The step to start profiling. The default is step 5.
            num_steps (int): The number of steps to profile. The default is for 1 step.
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.
            profile_default_steps (bool): Indicates whether the default config should be used.

        Raises:
            AssertionError: If **profile_default_steps** is not a bool, or if the
                base class rejects the range fields.

        .. tip::
            Available profiling range parameter pairs are
            (**start_step** and **num_steps**) and (**start_unix_time** and **duration**).
            The two parameter pairs are mutually exclusive, and this class validates
            if one of the two pairs is used. If both pairs are specified, a
            conflict error occurs.

        .. warning::
            This detailed framework profiling feature discontinues support for TensorFlow v2.11
            and later. To use the detailed profiling feature, use previous versions of
            TensorFlow between v2.3.1 and v2.10.0.

        """
        # Raised explicitly (not via `assert`) so the check survives `python -O`;
        # the exception type is unchanged for existing callers.
        if not isinstance(profile_default_steps, bool):
            raise AssertionError(ErrorMessages.INVALID_PROFILE_DEFAULT_STEPS.value)

        no_range_given = all(
            value is None for value in (start_step, num_steps, start_unix_time, duration)
        )
        if profile_default_steps or no_range_given:
            start_step = DETAILED_PROFILING_START_STEP_DEFAULT
            num_steps = PROFILING_NUM_STEPS_DEFAULT

        super().__init__(
            DETAILED_PROFILING_CONFIG_NAME, start_step, num_steps, start_unix_time, duration
        )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class DataloaderProfilingConfig(MetricsConfigBase):
    """The configuration for framework metrics to be collected for data loader profiling."""

    def __init__(
        self,
        start_step=None,
        num_steps=None,
        start_unix_time=None,
        duration=None,
        profile_default_steps=False,
        metrics_regex=".*",
    ):
        """Specify target steps or a target duration to profile.

        By default, it profiles step 7 of
        training. If **profile_default_steps** is set to `True` or none of the
        range parameters is specified,
        the class uses the default config for dataloader profiling
        (``DATALOADER_PROFILING_START_STEP_DEFAULT`` and ``PROFILING_NUM_STEPS_DEFAULT``).
        Note that with **profile_default_steps** set to `True` the defaults
        replace any **start_step** / **num_steps** that were passed in.

        Args:
            start_step (int): The step to start profiling. The default is step 7.
            num_steps (int): The number of steps to profile. The default is for 1 step.
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.
            profile_default_steps (bool): Indicates whether the default config should be used.
            metrics_regex (str): A regular expression stored on the config and emitted
                under the ``MetricsRegex`` key. It must pass ``is_valid_regex``.
                The default is ``".*"``.

        Raises:
            AssertionError: If **profile_default_steps** is not a bool, if the base
                class rejects the range fields, or if **metrics_regex** is invalid.

        """
        # Raised explicitly (not via `assert`) so the checks survive `python -O`;
        # the exception type is unchanged for existing callers.
        if not isinstance(profile_default_steps, bool):
            raise AssertionError(ErrorMessages.INVALID_PROFILE_DEFAULT_STEPS.value)

        no_range_given = all(
            value is None for value in (start_step, num_steps, start_unix_time, duration)
        )
        if profile_default_steps or no_range_given:
            start_step = DATALOADER_PROFILING_START_STEP_DEFAULT
            num_steps = PROFILING_NUM_STEPS_DEFAULT

        super().__init__(
            DATALOADER_PROFILING_CONFIG_NAME, start_step, num_steps, start_unix_time, duration
        )

        # The regex is validated after the range fields, matching the original order
        # in which errors are reported.
        if not is_valid_regex(metrics_regex):
            raise AssertionError(ErrorMessages.INVALID_METRICS_REGEX.value)
        self.metrics_regex = metrics_regex

    def _to_json(self):
        """Convert the dataloader profiling config to a dictionary.

        Build off of the base metrics
        configuration dictionary to add the metrics regex.

        Returns:
            dict: The dataloader profiling configuration as a dictionary, i.e. the
                range fields plus ``MetricsRegex``.

        """
        dataloader_profiling_config = super()._to_json()
        dataloader_profiling_config["MetricsRegex"] = self.metrics_regex
        return dataloader_profiling_config
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class PythonProfilingConfig(MetricsConfigBase):
    """Framework-metrics configuration used for Python profiling."""

    def __init__(
        self,
        start_step=None,
        num_steps=None,
        start_unix_time=None,
        duration=None,
        profile_default_steps=False,
        python_profiler=PythonProfiler.CPROFILE,
        cprofile_timer=cProfileTimer.TOTAL_TIME,
    ):
        """Choose a Python profiler (cProfile or Pyinstrument) and what to profile.

        Specify target steps or a target duration to profile. The default step range
        (``PYTHON_PROFILING_START_STEP_DEFAULT`` and
        ``PYTHON_PROFILING_NUM_STEPS_DEFAULT``, documented as starting at step 9 for
        3 steps) is applied when **profile_default_steps** is ``True`` or when none
        of the four range parameters is given. When **profile_default_steps** is
        ``True``, any ``start_step`` / ``num_steps`` passed by the caller is
        overwritten and ``cprofile_timer`` is replaced by ``cProfileTimer.DEFAULT``.

        Args:
            start_step (int): The step to start profiling.
            num_steps (int): The number of steps to profile.
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.
            profile_default_steps (bool): Indicates whether the default configuration
                should be used. Must be a ``bool``. The default is ``False``.
            python_profiler (PythonProfiler): The Python profiler to use to collect
                python profiling stats. Must be a member of
                :class:`~sagemaker.debugger.utils.PythonProfiler`.
                The default is ``PythonProfiler.CPROFILE``.
            cprofile_timer (cProfileTimer): The timer to be used by cProfile when
                collecting python profiling stats. Must be a member of
                :class:`~sagemaker.debugger.utils.cProfileTimer`.
                The default is ``cProfileTimer.TOTAL_TIME``.
                If you choose Pyinstrument, this parameter is ignored.

        """
        assert isinstance(profile_default_steps, bool), (
            ErrorMessages.INVALID_PROFILE_DEFAULT_STEPS.value
        )

        # Fall back to the default step range when asked to, or when no range was given.
        nothing_specified = all(
            value is None for value in (start_step, num_steps, start_unix_time, duration)
        )
        if profile_default_steps or nothing_specified:
            start_step = PYTHON_PROFILING_START_STEP_DEFAULT
            num_steps = PYTHON_PROFILING_NUM_STEPS_DEFAULT

        # Only an explicit request for the default config swaps in the default timer.
        if profile_default_steps:
            cprofile_timer = cProfileTimer.DEFAULT

        super().__init__(
            PYTHON_PROFILING_CONFIG_NAME,
            start_step,
            num_steps,
            start_unix_time,
            duration,
        )

        assert isinstance(python_profiler, PythonProfiler), (
            ErrorMessages.INVALID_PYTHON_PROFILER.value
        )
        assert isinstance(cprofile_timer, cProfileTimer), (
            ErrorMessages.INVALID_CPROFILE_TIMER.value
        )

        self.python_profiler = python_profiler

        # The cprofile timer can only be used when the python profiler is cProfile.
        uses_pyinstrument = python_profiler == PythonProfiler.PYINSTRUMENT
        self.cprofile_timer = None if uses_pyinstrument else cprofile_timer

    def _to_json(self):
        """Convert the Python profiling config to a dictionary.

        Starts from the dictionary produced by the base metrics configuration and
        adds the profiler name under ``ProfilerName``; the ``cProfileTimer`` entry is
        added only when a cProfile timer is set.

        Returns:
            dict: The python profiling config as a dictionary.

        """
        config = super()._to_json()
        config["ProfilerName"] = self.python_profiler.value
        if self.cprofile_timer is None:
            return config
        config["cProfileTimer"] = self.cprofile_timer.value
        return config
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
class HorovodProfilingConfig(MetricsConfigBase):
    """Framework-metrics configuration for Horovod distributed training."""

    def __init__(
        self,
        start_step=None,
        num_steps=None,
        start_unix_time=None,
        duration=None,
        profile_default_steps=False,
    ):
        """Select the steps or the time window to profile for Horovod.

        The default step range (``HOROVOD_PROFILING_START_STEP_DEFAULT`` and
        ``PROFILING_NUM_STEPS_DEFAULT``, documented as step 13 for 1 step) is applied
        when **profile_default_steps** is ``True`` or when none of the four range
        parameters is given. Note that with **profile_default_steps** set to ``True``
        any ``start_step`` / ``num_steps`` passed by the caller is overwritten.

        Args:
            start_step (int): The step to start profiling.
            num_steps (int): The number of steps to profile.
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.
            profile_default_steps (bool): Indicates whether the default config should
                be used. Must be a ``bool``. The default is ``False``.

        """
        assert isinstance(profile_default_steps, bool), (
            ErrorMessages.INVALID_PROFILE_DEFAULT_STEPS.value
        )

        # Fall back to the default step range when asked to, or when no range was given.
        nothing_specified = all(
            value is None for value in (start_step, num_steps, start_unix_time, duration)
        )
        if profile_default_steps or nothing_specified:
            start_step = HOROVOD_PROFILING_START_STEP_DEFAULT
            num_steps = PROFILING_NUM_STEPS_DEFAULT

        super().__init__(
            HOROVOD_PROFILING_CONFIG_NAME,
            start_step,
            num_steps,
            start_unix_time,
            duration,
        )
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
class SMDataParallelProfilingConfig(MetricsConfigBase):
    """Framework-metrics configuration for a SageMaker Distributed training job."""

    def __init__(
        self,
        start_step=None,
        num_steps=None,
        start_unix_time=None,
        duration=None,
        profile_default_steps=False,
    ):
        """Select the steps or the time window to profile for SageMaker Distributed.

        The default step range (``SMDATAPARALLEL_PROFILING_START_STEP_DEFAULT`` and
        ``PROFILING_NUM_STEPS_DEFAULT``, documented as step 15 for 1 step) is applied
        when **profile_default_steps** is ``True`` or when none of the four range
        parameters is given. Note that with **profile_default_steps** set to ``True``
        any ``start_step`` / ``num_steps`` passed by the caller is overwritten.

        Args:
            start_step (int): The step to start profiling.
            num_steps (int): The number of steps to profile.
            start_unix_time (int): The Unix time to start profiling.
            duration (float): The duration in seconds to profile.
            profile_default_steps (bool): Indicates whether the default configuration
                should be used. Must be a ``bool``. The default is ``False``.

        """
        assert isinstance(profile_default_steps, bool), (
            ErrorMessages.INVALID_PROFILE_DEFAULT_STEPS.value
        )

        # Fall back to the default step range when asked to, or when no range was given.
        nothing_specified = all(
            value is None for value in (start_step, num_steps, start_unix_time, duration)
        )
        if profile_default_steps or nothing_specified:
            start_step = SMDATAPARALLEL_PROFILING_START_STEP_DEFAULT
            num_steps = PROFILING_NUM_STEPS_DEFAULT

        super().__init__(
            SMDATAPARALLEL_PROFILING_CONFIG_NAME,
            start_step,
            num_steps,
            start_unix_time,
            duration,
        )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License"). You
|
|
4
|
+
# may not use this file except in compliance with the License. A copy of
|
|
5
|
+
# the License is located at
|
|
6
|
+
#
|
|
7
|
+
# http://aws.amazon.com/apache2.0/
|
|
8
|
+
#
|
|
9
|
+
# or in the "license" file accompanying this file. This file is
|
|
10
|
+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
|
|
11
|
+
# ANY KIND, either express or implied. See the License for the specific
|
|
12
|
+
# language governing permissions and limitations under the License.
|
|
13
|
+
|
|
14
|
+
"""Configuration for collecting profiler v2 metrics in SageMaker training jobs."""
|
|
15
|
+
from __future__ import absolute_import
|
|
16
|
+
|
|
17
|
+
from sagemaker.core.debugger.profiler_constants import (
|
|
18
|
+
FILE_ROTATION_INTERVAL_DEFAULT,
|
|
19
|
+
CPU_PROFILING_DURATION,
|
|
20
|
+
DETAIL_PROF_PROCESSING_DEFAULT_INSTANCE_TYPE,
|
|
21
|
+
DETAIL_PROF_PROCESSING_DEFAULT_VOLUME_SIZE,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Profiler:
    """Configuration object used to activate SageMaker Profiler."""

    def __init__(
        self,
        cpu_profiling_duration: str = str(CPU_PROFILING_DURATION),
        file_rotation_interval: str = str(FILE_ROTATION_INTERVAL_DEFAULT),
    ):
        """Build the Profiler parameters, optionally overriding the defaults.

        Both arguments are converted with ``str()`` before being stored in
        ``self.profiling_parameters``.

        :param cpu_profiling_duration: Specify the time duration in seconds for
            profiling CPU activities. Stored under ``CPUProfilingDuration``.
            The default is ``str(CPU_PROFILING_DURATION)`` (documented as 3600 seconds).
        :param file_rotation_interval: Value stored under ``SMPFileRotationSecs``;
            presumably the profiler file rotation interval in seconds (confirm
            against the service documentation).
            The default is ``str(FILE_ROTATION_INTERVAL_DEFAULT)``.
        """
        self.profiling_parameters = {
            "CPUProfilingDuration": str(cpu_profiling_duration),
            "SMPFileRotationSecs": str(file_rotation_interval),
        }
        # Not configurable through the constructor; always taken from the
        # DETAIL_PROF_PROCESSING_* module constants.
        self.instanceType = DETAIL_PROF_PROCESSING_DEFAULT_INSTANCE_TYPE
        self.volumeSizeInGB = DETAIL_PROF_PROCESSING_DEFAULT_VOLUME_SIZE
|