service-capacity-modeling 0.3.67__tar.gz → 0.3.69__tar.gz
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/interface.py +16 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/common.py +42 -24
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/cassandra.py +146 -43
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/crdb.py +35 -9
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/elasticsearch.py +37 -7
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/evcache.py +22 -11
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/kafka.py +35 -36
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/utils.py +16 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_utils.py +39 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/LICENSE +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/README.md +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/capacity_planner.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_ec2.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_zz-overrides.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/profiles.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5d.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m4.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6idn.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6in.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r4.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6idn.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6in.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_drives.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_instances.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_services.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/aurora.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/counter.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/ddb.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/entity.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/graphkv.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/iso_date_math.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/key_value.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/postgres.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/rds.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/stateless_java.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series_config.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/wal.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/zookeeper.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/stats.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/auto_shape.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/fetch_pricing.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/generate_missing.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/instance_families.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/SOURCES.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/dependency_links.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/entry_points.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/requires.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/top_level.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/setup.cfg +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/setup.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_arguments.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_buffers.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_common.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_desire_merge.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_generate_scenarios.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_hardware.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_hardware_shapes.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_io2.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_model_dump.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_reproducible.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_simulation.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_working_set.py +0 -0
service_capacity_modeling/interface.py

@@ -876,7 +876,22 @@ class CapacityDesires(ExcludeUnsetModel):
 
     @property
     def reference_shape(self) -> Instance:
-
+        if not self.current_clusters:
+            return default_reference_shape
+
+        zonal, regional = (self.current_clusters.zonal, self.current_clusters.regional)
+        if zonal and regional:
+            raise ValueError(
+                "The current cluster should not have both "
+                "zonal and regional instances. They're mutually exclusive."
+            )
+
+        if zonal and zonal[0].cluster_instance:
+            return zonal[0].cluster_instance
+
+        if regional and regional[0].cluster_instance:
+            return regional[0].cluster_instance
+
         return default_reference_shape
 
     def merge_with(self, defaults: "CapacityDesires") -> "CapacityDesires":
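The new `reference_shape` resolution has a simple precedence: no current clusters falls back to the default shape, mixing zonal and regional raises, and otherwise the first zonal (then regional) cluster's instance wins. A standalone sketch of that precedence, separate from the diff above and using stand-in dataclasses rather than the package's real `CurrentClusters`/`Instance` types:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Cluster:  # stand-in for the package's CurrentClusterCapacity
    cluster_instance: Optional[str] = None

@dataclass
class Clusters:  # stand-in for the package's CurrentClusters
    zonal: List[Cluster] = field(default_factory=list)
    regional: List[Cluster] = field(default_factory=list)

DEFAULT_SHAPE = "default_reference_shape"

def reference_shape(current: Optional[Clusters]) -> str:
    if not current:
        return DEFAULT_SHAPE
    if current.zonal and current.regional:
        raise ValueError("zonal and regional are mutually exclusive")
    if current.zonal and current.zonal[0].cluster_instance:
        return current.zonal[0].cluster_instance
    if current.regional and current.regional[0].cluster_instance:
        return current.regional[0].cluster_instance
    return DEFAULT_SHAPE

assert reference_shape(None) == DEFAULT_SHAPE
assert reference_shape(Clusters(zonal=[Cluster("m5d.2xlarge")])) == "m5d.2xlarge"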
service_capacity_modeling/models/common.py

@@ -69,6 +69,43 @@ def _sqrt_staffed_cores(rps: float, latency_s: float, qos: float) -> int:
     return math.ceil((rps * latency_s) + qos * math.sqrt(rps * latency_s))
 
 
+def get_effective_disk_per_node_gib(
+    instance: Instance,
+    drive: Drive,
+    disk_buffer_ratio: float,
+    max_local_data_per_node_gib: float = float("inf"),
+    max_attached_data_per_node_gib: float = float("inf"),
+) -> float:
+    """Calculate usable disk for an instance while respecting per-node data limits
+    and desired disk buffer ratio
+
+    Prevents overloading nodes with too much data, which causes slow bootstrapping and
+    recovery times
+
+    Args:
+        instance: The compute instance configuration
+        drive: The drive configuration for the instance
+        disk_buffer_ratio: Buffer ratio for operational headroom
+        max_local_data_per_node_gib: Maximum data per node for local drives
+        max_attached_data_per_node_gib: Maximum data per node for attached drives
+
+    Returns:
+        float: Maximum usable disk capacity per node in GiB
+    """
+    # TODO: @homatthew / @vrayini: Incorporate disk headroom for attached / local drives
+    if instance.drive is None:
+        if max_attached_data_per_node_gib == float("inf"):
+            return drive.max_size_gib
+
+        attached_disk_limit_gib = max_attached_data_per_node_gib * disk_buffer_ratio
+        # Attached disks are provisioned in 100GB limits
+        rounded_size = utils.next_n(attached_disk_limit_gib, n=100)
+        return min(rounded_size, drive.max_size_gib)
+
+    local_disk_limit_gib = max_local_data_per_node_gib * disk_buffer_ratio
+    return min(local_disk_limit_gib, instance.drive.size_gib)
+
+
 def sqrt_staffed_cores(desires: CapacityDesires) -> int:
     """Computes cores given a sqrt staffing model
 
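A worked example of the helper's arithmetic, as a self-contained re-implementation (not the package's function itself); `next_n` is assumed to round up to the next multiple of `n`, matching the `utils.next_n(..., n=100)` call above:

import math

def next_n(x: float, n: int) -> int:
    # Assumed semantics: round x up to the next multiple of n.
    return int(math.ceil(x / n) * n)

def effective_disk_per_node_gib(local_drive_gib, ebs_max_gib, buffer_ratio,
                                max_local=float("inf"), max_attached=float("inf")):
    if local_drive_gib is None:  # attached (EBS-style) storage
        if max_attached == float("inf"):
            return ebs_max_gib
        return min(next_n(max_attached * buffer_ratio, 100), ebs_max_gib)
    # local (ephemeral) storage
    return min(max_local * buffer_ratio, local_drive_gib)

# Local drive: a 1280 GiB data cap with a 4.0x disk buffer allows up to
# 5120 GiB of physical disk, so a 2500 GiB local drive is the binding limit.
assert effective_disk_per_node_gib(2500, None, 4.0, max_local=1280) == 2500
# Attached drive: 2048 GiB cap * 4.0 = 8192 GiB, rounded up to 100 GiB -> 8200,
# still below a 16384 GiB maximum volume size.
assert effective_disk_per_node_gib(None, 16384, 4.0, max_attached=2048) == 8200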
@@ -357,11 +394,6 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     # (per_node_size_gib, node_count) -> (read_ios, write_ios)
     required_disk_ios: Callable[[float, int], Tuple[float, float]] = lambda size_gib,
     count: (0, 0),
-    required_disk_space: Callable[[float], float] = lambda size_gib: size_gib,
-    # The maximum amount of state we can hold per node in the database
-    # typically you don't want stateful systems going much higher than a
-    # few TiB so that recovery functions properly
-    max_local_disk_gib: float = 2048,
     # Some stateful clusters have sidecars that take memory
     reserve_memory: Callable[[float], float] = lambda x: 0,
     # How much write buffer we get per instance (usually a percentage of
@@ -373,14 +405,7 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     min_count: int = 0,
     adjusted_disk_io_needed: float = 0.0,
     read_write_ratio: float = 0.0,
-    # Max attached EBS volume size per node. Higher value here could allow
-    # for a lower instance count (allows more vertical scaling vs forcing horizontal)
-    max_attached_disk_gib: Optional[float] = None,
 ) -> ZoneClusterCapacity:
-    # Datastores often require disk headroom for e.g. compaction and such
-    if instance.drive is not None:
-        needed_disk_gib = math.ceil(required_disk_space(needed_disk_gib))
-
     # How many instances do we need for the CPU
     count = math.ceil(needed_cores / instance.cpu)
 
@@ -404,12 +429,8 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     count = max(count, math.ceil(needed_network_mbps / instance.net_mbps))
 
     # How many instances do we need for the disk
-    if (
-        instance.drive
-        and instance.drive.size_gib > 0
-        and max_local_disk_gib > 0
-    ):
-        disk_per_node = min(max_local_disk_gib, instance.drive.size_gib)
+    if instance.drive is not None and instance.drive.size_gib > 0:
+        disk_per_node = instance.drive.size_gib
         count = max(count, math.ceil(needed_disk_gib / disk_per_node))
     if adjusted_disk_io_needed != 0.0:
         instance_read_iops = (
@@ -441,13 +462,13 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     cost = count * instance.annual_cost
 
     attached_drives = []
-    if instance.drive is None and
+    if instance.drive is None and needed_disk_gib > 0:
         # If we don't have disks attach the cloud drive with enough
         # space and IO for the requirement
 
         # Note that cloud drivers are provisioned _per node_ and must be chosen for
        # the max of space and IOS.
-        space_gib = max(1, math.ceil(
+        space_gib = max(1, math.ceil(needed_disk_gib / count))
         read_io, write_io = required_disk_ios(space_gib, count)
         read_io, write_io = (
             utils.next_n(read_io, n=200),
@@ -463,9 +484,6 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
         # 1/3 the maximum volume size in one node (preferring more nodes
         # with smaller volumes)
         max_size = drive.max_size_gib / 3
-        if max_attached_disk_gib is not None:
-            max_size = max_attached_disk_gib
-
         if ebs_gib > max_size > 0:
             ratio = ebs_gib / max_size
             count = max(cluster_size(math.ceil(count * ratio)), min_count)
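With the per-call caps gone from `compute_stateful_zone`, the surviving attached-drive path simply splits the requirement per node and rounds the IOs; a small numeric sketch of that flow with illustrative values (not taken from the model):

import math

def next_n(x: float, n: int) -> int:
    return int(math.ceil(x / n) * n)  # round up to a multiple of n

needed_disk_gib, count = 6000, 7
space_gib = max(1, math.ceil(needed_disk_gib / count))  # 858 GiB per node
read_io, write_io = 3500.0, 900.0  # hypothetical per-node IO rates
read_io, write_io = next_n(read_io, n=200), next_n(write_io, n=200)  # 3600, 1000

# Volumes stay under 1/3 of the drive's maximum size (preferring more nodes
# with smaller volumes); a bigger requirement scales the node count instead.
max_size = 16384 / 3
assert space_gib < max_size and (read_io, write_io) == (3600, 1000)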
@@ -900,7 +918,7 @@ def zonal_requirements_from_current(
             mem_gib=certain_float(needed_memory_gib),
             disk_gib=certain_float(needed_disk_gib),
             network_mbps=certain_float(needed_network_mbps),
-            reference_shape=
+            reference_shape=reference_shape,
         )
     else:
         raise ValueError("Please check if current_cluster is populated correctly.")
service_capacity_modeling/models/org/netflix/cassandra.py

@@ -21,6 +21,7 @@ from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Clusters
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
 from service_capacity_modeling.interface import FixedInterval
@@ -35,12 +36,15 @@ from service_capacity_modeling.models import CapacityModel
 from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
 from service_capacity_modeling.models.common import derived_buffer_for_component
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import network_services
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
 from service_capacity_modeling.models.common import working_set_from_drive_and_slo
 from service_capacity_modeling.models.common import zonal_requirements_from_current
+from service_capacity_modeling.models.utils import is_power_of_2
+from service_capacity_modeling.models.utils import next_doubling
 from service_capacity_modeling.models.utils import next_power_of_2
 from service_capacity_modeling.stats import dist_for_interval
 
@@ -106,6 +110,53 @@ def _get_disk_from_desires(desires, copies_per_region):
     )
 
 
+def _get_min_count(
+    tier: int,
+    required_cluster_size: Optional[int],
+    needed_disk_gib: float,
+    disk_per_node_gib: float,
+    cluster_size_lambda: Callable[[int], int],
+):
+    """
+    Compute the minimum number of nodes required for a zone.
+
+    This function is used to prevent the planner from allocating clusters that
+    would exceed the max data per node or under the required cluster size for
+    a tier or existing cluster
+    """
+
+    # Cassandra clusters should aim to be at least 2 nodes per zone to start
+    # out with for tier 0 or tier 1. This gives us more room to "up-color"
+    # clusters.
+    min_nodes_for_tier = 2 if tier in CRITICAL_TIERS else 0
+
+    # Prevent allocating clusters that exceed the max data per node.
+    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
+
+    # Take the max of the following in order to avoid:
+    # (1) if `required_cluster_size` < `min_nodes_for_disk`, don't let the planner
+    # pick a shape that would exceed the max data per node
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # Regardless of the `required_cluster_size`, we cannot allocate less than 4
+    # nodes because that would exceed the max data per node.
+    #
+    # (2) if `required_cluster_size` > `min_nodes_for_disk`, don't let the
+    # node density requirement affect the min count because the required
+    # cluster size already meets the node density requirement.
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # and the upstream requires >= 8 nodes, we can allocate 8 nodes because
+    # each node would only have 500GB of data.
+    min_count = max(
+        min_nodes_for_tier,
+        required_cluster_size or 0,
+        min_nodes_for_disk,
+    )
+    # Ensure that the min count is an increment of the cluster size constraint (doubling)
+    return cluster_size_lambda(min_count)
+
+
 def _zonal_requirement_for_new_cluster(
     desires, instance, copies_per_region, zones_per_region
 ) -> CapacityRequirement:
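A worked example of the two cases spelled out in `_get_min_count`'s comments, as a self-contained re-implementation; `CRITICAL_TIERS` is assumed here to be `{0, 1}`, matching the "tier 0 or tier 1" comment:

import math

CRITICAL_TIERS = {0, 1}  # assumption based on the comment above

def get_min_count(tier, required_cluster_size, needed_disk_gib,
                  disk_per_node_gib, cluster_size_lambda):
    min_nodes_for_tier = 2 if tier in CRITICAL_TIERS else 0
    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
    return cluster_size_lambda(
        max(min_nodes_for_tier, required_cluster_size or 0, min_nodes_for_disk)
    )

def next_power_of_2(x: int) -> int:
    return 1 if x <= 1 else 2 ** math.ceil(math.log2(x))

# Case (1): 4 TiB at a 1 TiB/node cap forces >= 4 nodes, even though the
# caller only required a 2-node cluster.
assert get_min_count(1, 2, 4096, 1024, next_power_of_2) == 4
# Case (2): an 8-node requirement already satisfies the density cap
# (4096 / 8 = 512 GiB per node), so it passes through unchanged.
assert get_min_count(1, 8, 4096, 1024, next_power_of_2) == 8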
@@ -149,22 +200,13 @@ def _estimate_cassandra_requirement(  # pylint: disable=too-many-positional-arguments
     )
     memory_preserve = False
     reference_shape = desires.reference_shape
-    current_capacity = (
-        None
-        if desires.current_clusters is None
-        else (
-            desires.current_clusters.zonal[0]
-            if len(desires.current_clusters.zonal)
-            else desires.current_clusters.regional[0]
-        )
-    )
+    current_capacity = _get_current_capacity(desires)
 
     # If the cluster is already provisioned
     if current_capacity and desires.current_clusters is not None:
         capacity_requirement = zonal_requirements_from_current(
             desires.current_clusters, desires.buffers, instance, reference_shape
         )
-        reference_shape = capacity_requirement.reference_shape
         disk_scale, _ = derived_buffer_for_component(
             desires.buffers.derived, ["storage", "disk"]
         )
@@ -278,6 +320,26 @@ def _estimate_cassandra_requirement(  # pylint: disable=too-many-positional-arguments
     )
 
 
+def _get_current_cluster_size(desires) -> int:
+    current_capacity = _get_current_capacity(desires)
+    if current_capacity is None:
+        return 0
+    return math.ceil(current_capacity.cluster_instance_count.mid)
+
+
+def _get_current_capacity(desires) -> Optional[CurrentClusterCapacity]:
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else (
+            desires.current_clusters.zonal[0]
+            if len(desires.current_clusters.zonal)
+            else desires.current_clusters.regional[0]
+        )
+    )
+    return current_capacity
+
+
 def _upsert_params(cluster, params):
     if cluster.cluster_params:
         cluster.cluster_params.update(params)
@@ -285,6 +347,18 @@ def _upsert_params(cluster, params):
         cluster.cluster_params = params
 
 
+def _get_cluster_size_lambda(
+    current_cluster_size: int,
+    required_cluster_size: Optional[int],
+) -> Callable[[int], int]:
+    if required_cluster_size:
+        return lambda x: next_doubling(x, base=required_cluster_size)
+    elif current_cluster_size and not is_power_of_2(current_cluster_size):
+        return lambda x: next_doubling(x, base=current_cluster_size)
+    else:  # New provisionings
+        return next_power_of_2
+
+
 # pylint: disable=too-many-locals
 # pylint: disable=too-many-return-statements
 # flake8: noqa: C901
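`is_power_of_2` and `next_doubling` are the `models/utils.py` additions in this release (+16 lines, exercised by the new `tests/test_utils.py`); their bodies aren't expanded in this diff. A plausible sketch consistent with how the selector uses them, assuming `next_doubling(x, base)` returns the smallest `base * 2**k >= x`:

def is_power_of_2(x: int) -> bool:
    return x > 0 and (x & (x - 1)) == 0

def next_doubling(x: int, base: int) -> int:
    # Assumed semantics: smallest base * 2**k (k >= 0) that is >= x.
    result = base
    while result < x:
        result *= 2
    return result

# An existing 12-node (non power-of-2) cluster grows along 12 -> 24 -> 48,
# instead of being forced onto the power-of-2 ladder used for new provisionings.
assert next_doubling(13, base=12) == 24
assert is_power_of_2(16) and not is_power_of_2(12)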
@@ -299,7 +373,8 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-
+    max_local_data_per_node_gib: int = 1280,
+    max_attached_data_per_node_gib: int = 2048,
     max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
@@ -362,10 +437,31 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
         copies_per_region=copies_per_region,
     )
 
-    #
-
-
-
+    # Adjust the min count to adjust to prevent too much data on a single
+    needed_disk_gib = int(requirement.disk_gib.mid)
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    disk_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+        max_attached_data_per_node_gib=max_attached_data_per_node_gib,
+    )
+
+    current_cluster_size = _get_current_cluster_size(desires)
+    cluster_size_lambda = _get_cluster_size_lambda(
+        current_cluster_size, required_cluster_size
+    )
+    min_count = _get_min_count(
+        tier=desires.service_tier,
+        required_cluster_size=required_cluster_size,
+        needed_disk_gib=needed_disk_gib,
+        disk_per_node_gib=disk_per_node_gib,
+        cluster_size_lambda=cluster_size_lambda,
+    )
+
     base_mem = _get_base_memory(desires)
 
     heap_fn = _cass_heap_for_write_buffer(
@@ -379,7 +475,7 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=int(requirement.mem_gib.mid),
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
@@ -388,14 +484,9 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
             _cass_io_per_read(size) * math.ceil(read_io_per_sec / count),
             write_io_per_sec / count,
         ),
-        # Disk buffer is already added while computing C* estimates
-        required_disk_space=lambda x: x,
-        # C* clusters cannot recover data from neighbors quickly so we
-        # want to avoid clusters with more than 1 TiB of local state
-        max_local_disk_gib=max_local_disk_gib,
         # C* clusters provision in powers of 2 because doubling
-        cluster_size=
-        min_count=
+        cluster_size=cluster_size_lambda,
+        min_count=min_count,
         # TODO: Take reserve memory calculation into account during buffer calculation
         # C* heap usage takes away from OS page cache memory
         reserve_memory=lambda x: base_mem + heap_fn(x),
@@ -618,6 +709,11 @@ class NflxCassandraCapacityModel(CapacityModel):
         desires: CapacityDesires,
         extra_model_arguments: Dict[str, Any],
     ) -> Optional[CapacityPlan]:
+        # TODO: Standardize these extra model argument defaults in a single
+        # place. Many of them are defined here and as default values in the
+        # downstream method but only these ones are used which is confusing for
+        # readability
+
         # Use durabiliy and consistency to compute RF.
         copies_per_region = _target_rf(
             desires, extra_model_arguments.get("copies_per_region", None)
@@ -636,7 +732,11 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 1280),
+        )
+
         max_write_buffer_percent: float = min(
             0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
         )
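The nested `.get` keeps the legacy `max_local_disk_gib` argument working as a fallback spelling for the new `max_local_data_per_node_gib` (note the Cassandra default also drops from 2048 to 1280 GiB). The resolution order, demonstrated on plain dicts:

def resolve_max_local(extra_model_arguments: dict) -> int:
    return extra_model_arguments.get(
        "max_local_data_per_node_gib",
        extra_model_arguments.get("max_local_disk_gib", 1280),
    )

assert resolve_max_local({}) == 1280  # new, lower default
assert resolve_max_local({"max_local_disk_gib": 2048}) == 2048  # legacy name honored
assert resolve_max_local(
    {"max_local_disk_gib": 2048, "max_local_data_per_node_gib": 4096}
) == 4096  # new name wins when both are present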
@@ -664,7 +764,7 @@ class NflxCassandraCapacityModel(CapacityModel):
             required_cluster_size=required_cluster_size,
             max_rps_to_disk=max_rps_to_disk,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_write_buffer_percent=max_write_buffer_percent,
             max_table_buffer_percent=max_table_buffer_percent,
         )
@@ -677,6 +777,26 @@ class NflxCassandraCapacityModel(CapacityModel):
     def extra_model_arguments_schema() -> Dict[str, Any]:
         return NflxCassandraArguments.model_json_schema()
 
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.5),
+            desired={
+                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
+                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
+                # Cassandra reserves headroom in both cpu and network for background
+                # work and tasks
+                "background": Buffer(
+                    ratio=2.0,
+                    components=[
+                        BufferComponent.cpu,
+                        BufferComponent.network,
+                        BACKGROUND_BUFFER,
+                    ],
+                ),
+            },
+        )
+
     @staticmethod
     def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
         acceptable_consistency = {
@@ -704,24 +824,7 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         # By supplying these buffers we can deconstruct observed utilization into
         # load versus buffer.
-        buffers = Buffers(
-            default=Buffer(ratio=1.5),
-            desired={
-                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
-                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
-                # Cassandra reserves headroom in both cpu and network for background
-                # work and tasks
-                "background": Buffer(
-                    ratio=2.0,
-                    components=[
-                        BufferComponent.cpu,
-                        BufferComponent.network,
-                        BACKGROUND_BUFFER,
-                    ],
-                ),
-            },
-        )
-
+        buffers = NflxCassandraCapacityModel.default_buffers()
         if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
service_capacity_modeling/models/org/netflix/crdb.py

@@ -10,6 +10,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import QueryPattern
 from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -137,7 +142,7 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
     desires: CapacityDesires,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
-
+    max_local_data_per_node_gib: int = 2048,
     max_regional_size: int = 288,
     max_rps_to_disk: int = 500,
     min_vcpu_per_instance: int = 4,
@@ -184,11 +189,23 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
         + desires.data_shape.reserved_instance_system_mem_gib
     )
 
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    max_data_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+    )
+    needed_disk_gib = requirement.disk_gib.mid * disk_buffer_ratio
+    min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
     cluster = compute_stateful_zone(
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=requirement.mem_gib.mid,
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
@@ -199,13 +216,9 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
             # TODO: presumably there are some write IOs here
             0,
         ),
-        # CRDB requires ephemeral disks to be < 80% full because leveled
-        # compaction can make progress as long as there is some headroom
-        required_disk_space=lambda x: x * 1.2,
-        max_local_disk_gib=max_local_disk_gib,
         # cockroachdb clusters will autobalance across available nodes
         cluster_size=lambda x: x,
-        min_count=
+        min_count=min_count,
        # Sidecars/System takes away memory from cockroachdb
        # cockroachdb by default uses --max-sql-memory of 25% of system memory
        # that cannot be used for caching
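The removed `required_disk_space=lambda x: x * 1.2` headroom is not lost: the same 1.2x now arrives through the model's default disk buffer (see `default_buffers()` below) and is applied to `needed_disk_gib` before `compute_stateful_zone` runs. A quick arithmetic check with illustrative numbers:

import math

disk_mid_gib = 5000  # hypothetical requirement.disk_gib.mid
old_needed = math.ceil(disk_mid_gib * 1.2)  # old: multiplier inside compute_stateful_zone
new_needed = disk_mid_gib * 1.2             # new: Buffer(ratio=1.2) applied up front
assert old_needed == new_needed == 6000

# The same ratio also widens the per-node cap, so min_count follows directly:
max_data_per_node_gib = min(2048 * 1.2, 2500)  # cap * ratio vs. a 2500 GiB local drive
assert math.ceil(new_needed / max_data_per_node_gib) == 3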
@@ -268,6 +281,12 @@ class NflxCockroachDBArguments(BaseModel):
 
 
 class NflxCockroachDBCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.2),
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -282,7 +301,11 @@ class NflxCockroachDBCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 500)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 2048),
+        )
+
         # Cockroach Labs recommends a minimum of 8 vCPUs and strongly
         # recommends no fewer than 4 vCPUs per node.
         min_vcpu_per_instance: int = extra_model_arguments.get(
@@ -299,7 +322,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
             zones_per_region=context.zones_in_region,
             copies_per_region=copies_per_region,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_rps_to_disk=max_rps_to_disk,
             min_vcpu_per_instance=min_vcpu_per_instance,
             license_fee_per_core=license_fee_per_core,
@@ -330,6 +353,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     f"User asked for {key}={value}"
                 )
 
+        buffers = NflxCockroachDBCapacityModel.default_buffers()
         if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
@@ -396,6 +420,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
         else:
             return CapacityDesires(
@@ -465,6 +490,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
 
 
service_capacity_modeling/models/org/netflix/elasticsearch.py

@@ -11,6 +11,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.interface import ZoneClusterCapacity
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -176,6 +181,20 @@ class NflxElasticsearchArguments(BaseModel):
 
 
 class NflxElasticsearchDataCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.33),
+        )
+
+    @staticmethod
+    def default_desires(
+        user_desires, extra_model_arguments: Dict[str, Any]
+    ) -> CapacityDesires:
+        return CapacityDesires(
+            buffers=NflxElasticsearchDataCapacityModel.default_buffers()
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -190,7 +209,10 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 120)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 1000)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 8192),
+        )
 
         # the ratio of traffic that should be handled by search nodes.
         # 0.0 = no search nodes, all searches handled by data nodes
@@ -259,11 +281,23 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         # io2/gp2 so for now we're just hardcoding.
         data_write_io_per_sec = (1 + 10) * max(1, data_write_bytes_per_sec // 16384)
 
+        disk_buffer_ratio = buffer_for_components(
+            buffers=desires.buffers, components=[BufferComponent.disk]
+        ).ratio
+        needed_disk_gib = data_requirement.disk_gib.mid * disk_buffer_ratio
+        max_data_per_node_gib = get_effective_disk_per_node_gib(
+            instance,
+            drive,
+            disk_buffer_ratio,
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
+        )
+        min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
         data_cluster = compute_stateful_zone(
             instance=instance,
             drive=drive,
             needed_cores=int(data_requirement.cpu_cores.mid),
-            needed_disk_gib=
+            needed_disk_gib=needed_disk_gib,
             needed_memory_gib=int(data_requirement.mem_gib.mid),
             needed_network_mbps=data_requirement.network_mbps.mid,
             # Take into account the reads per read
@@ -272,13 +306,9 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
                 _es_io_per_read(size) * math.ceil(data_rps / count),
                 data_write_io_per_sec / count,
             ),
-            # Elasticsearch requires ephemeral disks to be % full because tiered
-            # merging can make progress as long as there is some headroom
-            required_disk_space=lambda x: x * 1.33,
-            max_local_disk_gib=max_local_disk_gib,
             # Elasticsearch clusters can auto-balance via shard placement
             cluster_size=lambda x: x,
-            min_count=
+            min_count=min_count,
             # Sidecars/System takes away memory from Elasticsearch
             # which uses half of available system max of 32 for compressed oops
             reserve_memory=lambda x: base_mem + max(32, x / 2),
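The Elasticsearch change mirrors the CockroachDB one: the removed `x * 1.33` multiplier becomes `Buffer(ratio=1.33)` in `default_buffers()`, and node density is now enforced via `min_count` against the new 8192 GiB default cap. A one-line sanity check with illustrative numbers (assuming the physical drive exceeds the cap):

import math

needed_disk_gib = 10_000 * 1.33      # disk_gib.mid * disk buffer ratio
max_data_per_node_gib = 8192 * 1.33  # default per-node cap * the same ratio
assert math.ceil(needed_disk_gib / max_data_per_node_gib) == 2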