sagemaker-core 1.0.48__py3-none-any.whl → 1.0.50__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of sagemaker-core might be problematic.

@@ -3223,6 +3223,7 @@ class Cluster(Base):
  vpc_config:
  orchestrator: The type of orchestrator used for the SageMaker HyperPod cluster.
  node_recovery: The node recovery mode configured for the SageMaker HyperPod cluster.
+ node_provisioning_mode: The mode used for provisioning nodes in the cluster.

  """

@@ -3238,6 +3239,7 @@ class Cluster(Base):
  vpc_config: Optional[shapes.VpcConfig] = Unassigned()
  orchestrator: Optional[shapes.ClusterOrchestrator] = Unassigned()
  node_recovery: Optional[str] = Unassigned()
+ node_provisioning_mode: Optional[str] = Unassigned()

  def get_name(self) -> str:
  attributes = vars(self)
@@ -3287,6 +3289,7 @@ class Cluster(Base):
  tags: Optional[List[shapes.Tag]] = Unassigned(),
  orchestrator: Optional[shapes.ClusterOrchestrator] = Unassigned(),
  node_recovery: Optional[str] = Unassigned(),
+ node_provisioning_mode: Optional[str] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
  ) -> Optional["Cluster"]:
@@ -3299,8 +3302,9 @@ class Cluster(Base):
  restricted_instance_groups: The specialized instance groups for training models like Amazon Nova to be created in the SageMaker HyperPod cluster.
  vpc_config: Specifies the Amazon Virtual Private Cloud (VPC) that is associated with the Amazon SageMaker HyperPod cluster. You can control access to and from your resources by configuring your VPC. For more information, see Give SageMaker access to resources in your Amazon VPC. When your Amazon VPC and subnets support IPv6, network communications differ based on the cluster orchestration platform: Slurm-orchestrated clusters automatically configure nodes with dual IPv6 and IPv4 addresses, allowing immediate IPv6 network communications. In Amazon EKS-orchestrated clusters, nodes receive dual-stack addressing, but pods can only use IPv6 when the Amazon EKS cluster is explicitly IPv6-enabled. For information about deploying an IPv6 Amazon EKS cluster, see Amazon EKS IPv6 Cluster Deployment. Additional resources for IPv6 configuration: For information about adding IPv6 support to your VPC, see IPv6 Support for VPC. For information about creating a new IPv6-compatible VPC, see Amazon VPC Creation Guide. To configure SageMaker HyperPod with a custom Amazon VPC, see Custom Amazon VPC Setup for SageMaker HyperPod.
  tags: Custom tags for managing the SageMaker HyperPod cluster as an Amazon Web Services resource. You can add tags to your cluster in the same way you add them in other Amazon Web Services services that support tagging. To learn more about tagging Amazon Web Services resources in general, see Tagging Amazon Web Services Resources User Guide.
- orchestrator: The type of orchestrator to use for the SageMaker HyperPod cluster. Currently, the only supported value is "eks", which is to use an Amazon Elastic Kubernetes Service (EKS) cluster as the orchestrator.
+ orchestrator: The type of orchestrator to use for the SageMaker HyperPod cluster. Currently, the only supported value is "eks", which is to use an Amazon Elastic Kubernetes Service cluster as the orchestrator.
  node_recovery: The node recovery mode for the SageMaker HyperPod cluster. When set to Automatic, SageMaker HyperPod will automatically reboot or replace faulty nodes when issues are detected. When set to None, cluster administrators will need to manually manage any faulty cluster instances.
+ node_provisioning_mode: The mode for provisioning nodes in the cluster. You can specify the following modes: Continuous: Scaling behavior that enables 1) concurrent operation execution within instance groups, 2) continuous retry mechanisms for failed operations, 3) enhanced customer visibility into cluster events through detailed event streams, and 4) partial provisioning capabilities. Your clusters and instance groups remain InService while scaling. This mode is only supported for EKS-orchestrated clusters.
  session: Boto3 session.
  region: Region name.

@@ -3337,6 +3341,7 @@ class Cluster(Base):
  "Tags": tags,
  "Orchestrator": orchestrator,
  "NodeRecovery": node_recovery,
+ "NodeProvisioningMode": node_provisioning_mode,
  }

  operation_input_args = Base.populate_chained_attributes(
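
Taken together, the hunks above thread NodeProvisioningMode end to end: docstring, class attribute, create() signature, and request payload. Below is a minimal sketch of passing the new argument. It assumes the package's main.resources/main.shapes import paths and the snake_cased field names of the CreateCluster API model; cluster_name and instance_groups are not visible in this diff, and every name and ARN is a placeholder.

    from sagemaker_core.main.resources import Cluster
    from sagemaker_core.main.shapes import (
        ClusterInstanceGroupSpecification,
        ClusterLifeCycleConfig,
        ClusterOrchestrator,
        ClusterOrchestratorEksConfig,
    )

    # Placeholder identifiers -- substitute your own resources.
    eks_arn = "arn:aws:eks:us-west-2:111122223333:cluster/demo"
    role_arn = "arn:aws:iam::111122223333:role/HyperPodExecutionRole"

    cluster = Cluster.create(
        cluster_name="demo-hyperpod",          # assumed parameter, not shown in this diff
        instance_groups=[
            ClusterInstanceGroupSpecification(  # field names assumed from the API model
                instance_group_name="workers",
                instance_type="ml.g5.8xlarge",
                instance_count=2,
                life_cycle_config=ClusterLifeCycleConfig(
                    source_s3_uri="s3://my-bucket/lifecycle/",
                    on_create="on_create.sh",
                ),
                execution_role=role_arn,
            )
        ],
        orchestrator=ClusterOrchestrator(
            eks=ClusterOrchestratorEksConfig(cluster_arn=eks_arn)
        ),
        node_recovery="Automatic",
        node_provisioning_mode="Continuous",   # new in 1.0.50; EKS-orchestrated clusters only
    )
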
@@ -3731,6 +3736,7 @@ class Cluster(Base):
  def get_node(
  self,
  node_id: Optional[str] = Unassigned(),
+ node_logical_id: Optional[str] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
  ) -> Optional[shapes.ClusterNodeDetails]:
@@ -3739,6 +3745,7 @@ class Cluster(Base):

  Parameters:
  node_id: The ID of the SageMaker HyperPod cluster node.
+ node_logical_id: The logical identifier of the node to describe. You can specify either NodeLogicalId or InstanceId, but not both. NodeLogicalId can be used to describe nodes that are still being provisioned and don't yet have an InstanceId assigned.
  session: Boto3 session.
  region: Region name.

@@ -3761,6 +3768,7 @@ class Cluster(Base):
  operation_input_args = {
  "ClusterName": self.cluster_name,
  "NodeId": node_id,
+ "NodeLogicalId": node_logical_id,
  }
  # serialize the input request
  operation_input_args = serialize(operation_input_args)
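
With NodeLogicalId wired into the DescribeClusterNode call, a still-provisioning node can be described before it has an InstanceId. A minimal sketch, assuming the same import path and an existing Cluster.get helper; the IDs are placeholders, and node_id and node_logical_id must not both be passed.

    from sagemaker_core.main.resources import Cluster

    cluster = Cluster.get(cluster_name="demo-hyperpod")        # assumed existing getter
    node = cluster.get_node(node_logical_id="i-logical-0001")  # placeholder logical ID
    print(node)  # shapes.ClusterNodeDetails
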
@@ -3785,6 +3793,7 @@ class Cluster(Base):
  instance_group_name_contains: Optional[str] = Unassigned(),
  sort_by: Optional[str] = Unassigned(),
  sort_order: Optional[str] = Unassigned(),
+ include_node_logical_ids: Optional[bool] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
  ) -> ResourceIterator[shapes.ClusterNodeDetails]:
@@ -3799,6 +3808,7 @@ class Cluster(Base):
  next_token: If the result of the previous ListClusterNodes request was truncated, the response includes a NextToken. To retrieve the next set of cluster nodes, use the token in the next request.
  sort_by: The field by which to sort results. The default value is CREATION_TIME.
  sort_order: The sort order for results. The default value is Ascending.
+ include_node_logical_ids: Specifies whether to include nodes that are still being provisioned in the response. When set to true, the response includes all nodes regardless of their provisioning status. When set to false (the default), only nodes with assigned InstanceIds are returned.
  session: Boto3 session.
  region: Region name.

@@ -3825,6 +3835,7 @@ class Cluster(Base):
  "InstanceGroupNameContains": instance_group_name_contains,
  "SortBy": sort_by,
  "SortOrder": sort_order,
+ "IncludeNodeLogicalIds": include_node_logical_ids,
  }
  # serialize the input request
  operation_input_args = serialize(operation_input_args)
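
The same flag flows through the ListClusterNodes wrapper. A sketch, assuming that wrapper is named get_all_nodes (the method name is not visible in these hunks) and that ClusterNodeDetails exposes snake_cased fields from the API model:

    from sagemaker_core.main.resources import Cluster

    cluster = Cluster.get(cluster_name="demo-hyperpod")  # assumed existing getter
    # Include nodes still provisioning, i.e. without an InstanceId yet.
    for node in cluster.get_all_nodes(include_node_logical_ids=True):
        print(node.instance_id, node.instance_status)    # attribute names assumed
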
@@ -3847,6 +3858,7 @@ class Cluster(Base):
  def update_software(
  self,
  deployment_config: Optional[shapes.DeploymentConfiguration] = Unassigned(),
+ image_id: Optional[str] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
  ) -> None:
@@ -3855,6 +3867,7 @@ class Cluster(Base):

  Parameters:
  deployment_config: The configuration to use when updating the AMI versions.
+ image_id: When configuring your HyperPod cluster, you can specify an image ID using one of the following options: HyperPodPublicAmiId: Use a HyperPod public AMI. CustomAmiId: Use your custom AMI. default: Use the default latest system image. If you choose to use a custom AMI (CustomAmiId), ensure it meets the following requirements: Encryption: The custom AMI must be unencrypted. Ownership: The custom AMI must be owned by the same Amazon Web Services account that is creating the HyperPod cluster. Volume support: Only the primary AMI snapshot volume is supported; additional AMI volumes are not supported. When updating the instance group's AMI through the UpdateClusterSoftware operation, if an instance group uses a custom AMI, you must provide an ImageId or use the default as input.
  session: Boto3 session.
  region: Region name.

@@ -3876,6 +3889,7 @@ class Cluster(Base):
  "ClusterName": self.cluster_name,
  "InstanceGroups": self.instance_groups,
  "DeploymentConfig": deployment_config,
+ "ImageId": image_id,
  }
  # serialize the input request
  operation_input_args = serialize(operation_input_args)
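
A sketch of the new ImageId pass-through to UpdateClusterSoftware, using the "default" sentinel the docstring describes; the cluster handle and getter are assumed as before, and deployment_config is left at its default.

    from sagemaker_core.main.resources import Cluster

    cluster = Cluster.get(cluster_name="demo-hyperpod")  # assumed existing getter
    # "default" rolls instance groups onto the latest system image;
    # a HyperPodPublicAmiId or CustomAmiId string could be passed instead.
    cluster.update_software(image_id="default")
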
@@ -3893,6 +3907,7 @@ class Cluster(Base):
  def batch_delete_nodes(
  self,
  node_ids: Optional[List[str]] = Unassigned(),
+ node_logical_ids: Optional[List[str]] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
  ) -> Optional[shapes.BatchDeleteClusterNodesResponse]:
@@ -3901,6 +3916,7 @@ class Cluster(Base):

  Parameters:
  node_ids: A list of node IDs to be deleted from the specified cluster. For SageMaker HyperPod clusters using the Slurm workload manager, you cannot remove instances that are configured as Slurm controller nodes. If you need to delete more than 99 instances, contact Support for assistance.
+ node_logical_ids: A list of NodeLogicalIds identifying the nodes to be deleted. You can specify up to 50 NodeLogicalIds. You must specify either NodeLogicalIds, InstanceIds, or both, with a combined maximum of 50 identifiers.
  session: Boto3 session.
  region: Region name.

@@ -3923,6 +3939,7 @@ class Cluster(Base):
  operation_input_args = {
  "ClusterName": self.cluster_name,
  "NodeIds": node_ids,
+ "NodeLogicalIds": node_logical_ids,
  }
  # serialize the input request
  operation_input_args = serialize(operation_input_args)
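
A matching sketch for deletion by logical ID; per the docstring above, NodeIds and NodeLogicalIds may be combined up to 50 identifiers total. IDs here are placeholders.

    from sagemaker_core.main.resources import Cluster

    cluster = Cluster.get(cluster_name="demo-hyperpod")  # assumed existing getter
    response = cluster.batch_delete_nodes(
        node_logical_ids=["i-logical-0001", "i-logical-0002"]  # placeholder IDs
    )
    print(response)  # shapes.BatchDeleteClusterNodesResponse
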
@@ -28874,6 +28891,9 @@ class TrainingPlan(Base):
  total_instance_count: The total number of instances reserved in this training plan.
  available_instance_count: The number of instances currently available for use in this training plan.
  in_use_instance_count: The number of instances currently in use from this training plan.
+ unhealthy_instance_count: The number of instances in the training plan that are currently in an unhealthy state.
+ available_spare_instance_count: The number of available spare instances in the training plan.
+ total_ultra_server_count: The total number of UltraServers reserved for this training plan.
  target_resources: The target resources (e.g., SageMaker Training Jobs, SageMaker HyperPod) that can use this training plan. Training plans are specific to their target resource. A training plan designed for SageMaker training jobs can only be used to schedule and run training jobs. A training plan for HyperPod clusters can be used exclusively to provide compute resources to a cluster's instance group.
  reserved_capacity_summaries: The list of Reserved Capacity providing the underlying compute resources of the plan.

@@ -28892,6 +28912,9 @@ class TrainingPlan(Base):
  total_instance_count: Optional[int] = Unassigned()
  available_instance_count: Optional[int] = Unassigned()
  in_use_instance_count: Optional[int] = Unassigned()
+ unhealthy_instance_count: Optional[int] = Unassigned()
+ available_spare_instance_count: Optional[int] = Unassigned()
+ total_ultra_server_count: Optional[int] = Unassigned()
  target_resources: Optional[List[str]] = Unassigned()
  reserved_capacity_summaries: Optional[List[shapes.ReservedCapacitySummary]] = Unassigned()

@@ -28917,6 +28940,7 @@ class TrainingPlan(Base):
  cls,
  training_plan_name: str,
  training_plan_offering_id: str,
+ spare_instance_count_per_ultra_server: Optional[int] = Unassigned(),
  tags: Optional[List[shapes.Tag]] = Unassigned(),
  session: Optional[Session] = None,
  region: Optional[str] = None,
@@ -28927,6 +28951,7 @@ class TrainingPlan(Base):
  Parameters:
  training_plan_name: The name of the training plan to create.
  training_plan_offering_id: The unique identifier of the training plan offering to use for creating this plan.
+ spare_instance_count_per_ultra_server: Number of spare instances to reserve per UltraServer for enhanced resiliency. Default is 1.
  tags: An array of key-value pairs to apply to this training plan.
  session: Boto3 session.
  region: Region name.
@@ -28960,6 +28985,7 @@ class TrainingPlan(Base):
  operation_input_args = {
  "TrainingPlanName": training_plan_name,
  "TrainingPlanOfferingId": training_plan_offering_id,
+ "SpareInstanceCountPerUltraServer": spare_instance_count_per_ultra_server,
  "Tags": tags,
  }
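
Finally, a sketch of the new reservation parameter together with the new read-only counters, assuming the same import conventions. The plan name and offering ID are placeholders; an offering ID would normally come from searching training plan offerings first.

    from sagemaker_core.main.resources import TrainingPlan

    plan = TrainingPlan.create(
        training_plan_name="demo-plan",                # placeholder
        training_plan_offering_id="tpo-0123456789ab",  # placeholder offering ID
        spare_instance_count_per_ultra_server=1,       # new in 1.0.50; default is 1
    )
    plan.refresh()  # assumed helper to pull described state
    print(plan.total_ultra_server_count, plan.available_spare_instance_count)
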