service-capacity-modeling 0.3.67__tar.gz → 0.3.69__tar.gz
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/interface.py +16 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/common.py +42 -24
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/cassandra.py +146 -43
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/crdb.py +35 -9
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/elasticsearch.py +37 -7
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/evcache.py +22 -11
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/kafka.py +35 -36
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/utils.py +16 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_utils.py +39 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/LICENSE +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/README.md +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/capacity_planner.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_ec2.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_zz-overrides.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/profiles.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5d.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m4.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6idn.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6in.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r4.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5n.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6id.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6idn.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6in.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7a.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7i.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_drives.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_instances.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_services.json +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/aurora.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/counter.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/ddb.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/entity.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/graphkv.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/iso_date_math.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/key_value.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/postgres.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/rds.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/stateless_java.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series_config.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/wal.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/zookeeper.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/stats.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/__init__.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/auto_shape.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/fetch_pricing.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/generate_missing.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/instance_families.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/SOURCES.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/dependency_links.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/entry_points.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/requires.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/top_level.txt +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/setup.cfg +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/setup.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_arguments.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_buffers.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_common.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_desire_merge.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_generate_scenarios.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_hardware.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_hardware_shapes.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_io2.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_model_dump.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_reproducible.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_simulation.py +0 -0
- {service_capacity_modeling-0.3.67 → service_capacity_modeling-0.3.69}/tests/test_working_set.py +0 -0
service_capacity_modeling/interface.py

@@ -876,7 +876,22 @@ class CapacityDesires(ExcludeUnsetModel):
 
     @property
     def reference_shape(self) -> Instance:
-
+        if not self.current_clusters:
+            return default_reference_shape
+
+        zonal, regional = (self.current_clusters.zonal, self.current_clusters.regional)
+        if zonal and regional:
+            raise ValueError(
+                "The current cluster should not have both "
+                "zonal and regional instances. They're mutually exclusive."
+            )
+
+        if zonal and zonal[0].cluster_instance:
+            return zonal[0].cluster_instance
+
+        if regional and regional[0].cluster_instance:
+            return regional[0].cluster_instance
+
         return default_reference_shape
 
     def merge_with(self, defaults: "CapacityDesires") -> "CapacityDesires":
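The new `reference_shape` resolution has a simple precedence: no current clusters falls back to the default shape, mixing zonal and regional raises, and otherwise the first zonal (then regional) cluster's instance wins. A standalone sketch of that precedence, separate from the diff above and using stand-in dataclasses rather than the package's real `CurrentClusters`/`Instance` types:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Cluster:  # stand-in for the package's CurrentClusterCapacity
    cluster_instance: Optional[str] = None

@dataclass
class Clusters:  # stand-in for the package's CurrentClusters
    zonal: List[Cluster] = field(default_factory=list)
    regional: List[Cluster] = field(default_factory=list)

DEFAULT_SHAPE = "default_reference_shape"

def reference_shape(current: Optional[Clusters]) -> str:
    if not current:
        return DEFAULT_SHAPE
    if current.zonal and current.regional:
        raise ValueError("zonal and regional are mutually exclusive")
    if current.zonal and current.zonal[0].cluster_instance:
        return current.zonal[0].cluster_instance
    if current.regional and current.regional[0].cluster_instance:
        return current.regional[0].cluster_instance
    return DEFAULT_SHAPE

assert reference_shape(None) == DEFAULT_SHAPE
assert reference_shape(Clusters(zonal=[Cluster("m5d.2xlarge")])) == "m5d.2xlarge"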
service_capacity_modeling/models/common.py

@@ -69,6 +69,43 @@ def _sqrt_staffed_cores(rps: float, latency_s: float, qos: float) -> int:
     return math.ceil((rps * latency_s) + qos * math.sqrt(rps * latency_s))
 
 
+def get_effective_disk_per_node_gib(
+    instance: Instance,
+    drive: Drive,
+    disk_buffer_ratio: float,
+    max_local_data_per_node_gib: float = float("inf"),
+    max_attached_data_per_node_gib: float = float("inf"),
+) -> float:
+    """Calculate usable disk for an instance while respecting per-node data limits
+    and desired disk buffer ratio
+
+    Prevents overloading nodes with too much data, which causes slow bootstrapping and
+    recovery times
+
+    Args:
+        instance: The compute instance configuration
+        drive: The drive configuration for the instance
+        disk_buffer_ratio: Buffer ratio for operational headroom
+        max_local_data_per_node_gib: Maximum data per node for local drives
+        max_attached_data_per_node_gib: Maximum data per node for attached drives
+
+    Returns:
+        float: Maximum usable disk capacity per node in GiB
+    """
+    # TODO: @homatthew / @vrayini: Incorporate disk headroom for attached / local drives
+    if instance.drive is None:
+        if max_attached_data_per_node_gib == float("inf"):
+            return drive.max_size_gib
+
+        attached_disk_limit_gib = max_attached_data_per_node_gib * disk_buffer_ratio
+        # Attached disks are provisioned in 100GB limits
+        rounded_size = utils.next_n(attached_disk_limit_gib, n=100)
+        return min(rounded_size, drive.max_size_gib)
+
+    local_disk_limit_gib = max_local_data_per_node_gib * disk_buffer_ratio
+    return min(local_disk_limit_gib, instance.drive.size_gib)
+
+
 def sqrt_staffed_cores(desires: CapacityDesires) -> int:
     """Computes cores given a sqrt staffing model
 
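A worked example of the helper's arithmetic, as a self-contained re-implementation (not the package's function itself); `next_n` is assumed to round up to the next multiple of `n`, matching the `utils.next_n(..., n=100)` call above:

import math

def next_n(x: float, n: int) -> int:
    # Assumed semantics: round x up to the next multiple of n.
    return int(math.ceil(x / n) * n)

def effective_disk_per_node_gib(local_drive_gib, ebs_max_gib, buffer_ratio,
                                max_local=float("inf"), max_attached=float("inf")):
    if local_drive_gib is None:  # attached (EBS-style) storage
        if max_attached == float("inf"):
            return ebs_max_gib
        return min(next_n(max_attached * buffer_ratio, 100), ebs_max_gib)
    # local (ephemeral) storage
    return min(max_local * buffer_ratio, local_drive_gib)

# Local drive: a 1280 GiB data cap with a 4.0x disk buffer allows up to
# 5120 GiB of physical disk, so a 2500 GiB local drive is the binding limit.
assert effective_disk_per_node_gib(2500, None, 4.0, max_local=1280) == 2500
# Attached drive: 2048 GiB cap * 4.0 = 8192 GiB, rounded up to 100 GiB -> 8200,
# still below a 16384 GiB maximum volume size.
assert effective_disk_per_node_gib(None, 16384, 4.0, max_attached=2048) == 8200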
@@ -357,11 +394,6 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     # (per_node_size_gib, node_count) -> (read_ios, write_ios)
     required_disk_ios: Callable[[float, int], Tuple[float, float]] = lambda size_gib,
     count: (0, 0),
-    required_disk_space: Callable[[float], float] = lambda size_gib: size_gib,
-    # The maximum amount of state we can hold per node in the database
-    # typically you don't want stateful systems going much higher than a
-    # few TiB so that recovery functions properly
-    max_local_disk_gib: float = 2048,
     # Some stateful clusters have sidecars that take memory
     reserve_memory: Callable[[float], float] = lambda x: 0,
     # How much write buffer we get per instance (usually a percentage of
@@ -373,14 +405,7 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     min_count: int = 0,
     adjusted_disk_io_needed: float = 0.0,
     read_write_ratio: float = 0.0,
-    # Max attached EBS volume size per node. Higher value here could allow
-    # for a lower instance count (allows more vertical scaling vs forcing horizontal)
-    max_attached_disk_gib: Optional[float] = None,
 ) -> ZoneClusterCapacity:
-    # Datastores often require disk headroom for e.g. compaction and such
-    if instance.drive is not None:
-        needed_disk_gib = math.ceil(required_disk_space(needed_disk_gib))
-
     # How many instances do we need for the CPU
     count = math.ceil(needed_cores / instance.cpu)
 
@@ -404,12 +429,8 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     count = max(count, math.ceil(needed_network_mbps / instance.net_mbps))
 
     # How many instances do we need for the disk
-    if (
-        instance.drive
-        and instance.drive.size_gib > 0
-        and max_local_disk_gib > 0
-    ):
-        disk_per_node = min(max_local_disk_gib, instance.drive.size_gib)
+    if instance.drive is not None and instance.drive.size_gib > 0:
+        disk_per_node = instance.drive.size_gib
         count = max(count, math.ceil(needed_disk_gib / disk_per_node))
     if adjusted_disk_io_needed != 0.0:
         instance_read_iops = (
@@ -441,13 +462,13 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
     cost = count * instance.annual_cost
 
     attached_drives = []
-    if instance.drive is None and
+    if instance.drive is None and needed_disk_gib > 0:
         # If we don't have disks attach the cloud drive with enough
         # space and IO for the requirement
 
         # Note that cloud drivers are provisioned _per node_ and must be chosen for
        # the max of space and IOS.
-        space_gib = max(1, math.ceil(
+        space_gib = max(1, math.ceil(needed_disk_gib / count))
         read_io, write_io = required_disk_ios(space_gib, count)
         read_io, write_io = (
             utils.next_n(read_io, n=200),
@@ -463,9 +484,6 @@ def compute_stateful_zone(  # pylint: disable=too-many-positional-arguments
         # 1/3 the maximum volume size in one node (preferring more nodes
         # with smaller volumes)
         max_size = drive.max_size_gib / 3
-        if max_attached_disk_gib is not None:
-            max_size = max_attached_disk_gib
-
         if ebs_gib > max_size > 0:
             ratio = ebs_gib / max_size
             count = max(cluster_size(math.ceil(count * ratio)), min_count)
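With the per-call caps gone from `compute_stateful_zone`, the surviving attached-drive path simply splits the requirement per node and rounds the IOs; a small numeric sketch of that flow with illustrative values (not taken from the model):

import math

def next_n(x: float, n: int) -> int:
    return int(math.ceil(x / n) * n)  # round up to a multiple of n

needed_disk_gib, count = 6000, 7
space_gib = max(1, math.ceil(needed_disk_gib / count))  # 858 GiB per node
read_io, write_io = 3500.0, 900.0  # hypothetical per-node IO rates
read_io, write_io = next_n(read_io, n=200), next_n(write_io, n=200)  # 3600, 1000

# Volumes stay under 1/3 of the drive's maximum size (preferring more nodes
# with smaller volumes); a bigger requirement scales the node count instead.
max_size = 16384 / 3
assert space_gib < max_size and (read_io, write_io) == (3600, 1000)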
@@ -900,7 +918,7 @@ def zonal_requirements_from_current(
             mem_gib=certain_float(needed_memory_gib),
             disk_gib=certain_float(needed_disk_gib),
             network_mbps=certain_float(needed_network_mbps),
-            reference_shape=
+            reference_shape=reference_shape,
         )
     else:
         raise ValueError("Please check if current_cluster is populated correctly.")
service_capacity_modeling/models/org/netflix/cassandra.py

@@ -21,6 +21,7 @@ from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Clusters
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
 from service_capacity_modeling.interface import FixedInterval
@@ -35,12 +36,15 @@ from service_capacity_modeling.models import CapacityModel
 from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
 from service_capacity_modeling.models.common import derived_buffer_for_component
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import network_services
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
 from service_capacity_modeling.models.common import working_set_from_drive_and_slo
 from service_capacity_modeling.models.common import zonal_requirements_from_current
+from service_capacity_modeling.models.utils import is_power_of_2
+from service_capacity_modeling.models.utils import next_doubling
 from service_capacity_modeling.models.utils import next_power_of_2
 from service_capacity_modeling.stats import dist_for_interval
 
@@ -106,6 +110,53 @@ def _get_disk_from_desires(desires, copies_per_region):
     )
 
 
+def _get_min_count(
+    tier: int,
+    required_cluster_size: Optional[int],
+    needed_disk_gib: float,
+    disk_per_node_gib: float,
+    cluster_size_lambda: Callable[[int], int],
+):
+    """
+    Compute the minimum number of nodes required for a zone.
+
+    This function is used to prevent the planner from allocating clusters that
+    would exceed the max data per node or under the required cluster size for
+    a tier or existing cluster
+    """
+
+    # Cassandra clusters should aim to be at least 2 nodes per zone to start
+    # out with for tier 0 or tier 1. This gives us more room to "up-color"
+    # clusters.
+    min_nodes_for_tier = 2 if tier in CRITICAL_TIERS else 0
+
+    # Prevent allocating clusters that exceed the max data per node.
+    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
+
+    # Take the max of the following in order to avoid:
+    # (1) if `required_cluster_size` < `min_nodes_for_disk`, don't let the planner
+    # pick a shape that would exceed the max data per node
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # Regardless of the `required_cluster_size`, we cannot allocate less than 4
+    # nodes because that would exceed the max data per node.
+    #
+    # (2) if `required_cluster_size` > `min_nodes_for_disk`, don't let the
+    # node density requirement affect the min count because the required
+    # cluster size already meets the node density requirement.
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # and the upstream requires >= 8 nodes, we can allocate 8 nodes because
+    # each node would only have 500GB of data.
+    min_count = max(
+        min_nodes_for_tier,
+        required_cluster_size or 0,
+        min_nodes_for_disk,
+    )
+    # Ensure that the min count is an increment of the cluster size constraint (doubling)
+    return cluster_size_lambda(min_count)
+
+
 def _zonal_requirement_for_new_cluster(
     desires, instance, copies_per_region, zones_per_region
 ) -> CapacityRequirement:
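A worked example of the two cases spelled out in `_get_min_count`'s comments, as a self-contained re-implementation; `CRITICAL_TIERS` is assumed here to be `{0, 1}`, matching the "tier 0 or tier 1" comment:

import math

CRITICAL_TIERS = {0, 1}  # assumption based on the comment above

def get_min_count(tier, required_cluster_size, needed_disk_gib,
                  disk_per_node_gib, cluster_size_lambda):
    min_nodes_for_tier = 2 if tier in CRITICAL_TIERS else 0
    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
    return cluster_size_lambda(
        max(min_nodes_for_tier, required_cluster_size or 0, min_nodes_for_disk)
    )

def next_power_of_2(x: int) -> int:
    return 1 if x <= 1 else 2 ** math.ceil(math.log2(x))

# Case (1): 4 TiB at a 1 TiB/node cap forces >= 4 nodes, even though the
# caller only required a 2-node cluster.
assert get_min_count(1, 2, 4096, 1024, next_power_of_2) == 4
# Case (2): an 8-node requirement already satisfies the density cap
# (4096 / 8 = 512 GiB per node), so it passes through unchanged.
assert get_min_count(1, 8, 4096, 1024, next_power_of_2) == 8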
@@ -149,22 +200,13 @@ def _estimate_cassandra_requirement(  # pylint: disable=too-many-positional-arguments
     )
     memory_preserve = False
     reference_shape = desires.reference_shape
-    current_capacity = (
-        None
-        if desires.current_clusters is None
-        else (
-            desires.current_clusters.zonal[0]
-            if len(desires.current_clusters.zonal)
-            else desires.current_clusters.regional[0]
-        )
-    )
+    current_capacity = _get_current_capacity(desires)
 
     # If the cluster is already provisioned
     if current_capacity and desires.current_clusters is not None:
         capacity_requirement = zonal_requirements_from_current(
             desires.current_clusters, desires.buffers, instance, reference_shape
         )
-        reference_shape = capacity_requirement.reference_shape
         disk_scale, _ = derived_buffer_for_component(
             desires.buffers.derived, ["storage", "disk"]
         )
@@ -278,6 +320,26 @@ def _estimate_cassandra_requirement(  # pylint: disable=too-many-positional-arguments
     )
 
 
+def _get_current_cluster_size(desires) -> int:
+    current_capacity = _get_current_capacity(desires)
+    if current_capacity is None:
+        return 0
+    return math.ceil(current_capacity.cluster_instance_count.mid)
+
+
+def _get_current_capacity(desires) -> Optional[CurrentClusterCapacity]:
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else (
+            desires.current_clusters.zonal[0]
+            if len(desires.current_clusters.zonal)
+            else desires.current_clusters.regional[0]
+        )
+    )
+    return current_capacity
+
+
 def _upsert_params(cluster, params):
     if cluster.cluster_params:
         cluster.cluster_params.update(params)
@@ -285,6 +347,18 @@ def _upsert_params(cluster, params):
         cluster.cluster_params = params
 
 
+def _get_cluster_size_lambda(
+    current_cluster_size: int,
+    required_cluster_size: Optional[int],
+) -> Callable[[int], int]:
+    if required_cluster_size:
+        return lambda x: next_doubling(x, base=required_cluster_size)
+    elif current_cluster_size and not is_power_of_2(current_cluster_size):
+        return lambda x: next_doubling(x, base=current_cluster_size)
+    else:  # New provisionings
+        return next_power_of_2
+
+
 # pylint: disable=too-many-locals
 # pylint: disable=too-many-return-statements
 # flake8: noqa: C901
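`is_power_of_2` and `next_doubling` are the `models/utils.py` additions in this release (+16 lines, exercised by the new `tests/test_utils.py`); their bodies aren't expanded in this diff. A plausible sketch consistent with how the selector uses them, assuming `next_doubling(x, base)` returns the smallest `base * 2**k >= x`:

def is_power_of_2(x: int) -> bool:
    return x > 0 and (x & (x - 1)) == 0

def next_doubling(x: int, base: int) -> int:
    # Assumed semantics: smallest base * 2**k (k >= 0) that is >= x.
    result = base
    while result < x:
        result *= 2
    return result

# An existing 12-node (non power-of-2) cluster grows along 12 -> 24 -> 48,
# instead of being forced onto the power-of-2 ladder used for new provisionings.
assert next_doubling(13, base=12) == 24
assert is_power_of_2(16) and not is_power_of_2(12)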
@@ -299,7 +373,8 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-
+    max_local_data_per_node_gib: int = 1280,
+    max_attached_data_per_node_gib: int = 2048,
     max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
@@ -362,10 +437,31 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
         copies_per_region=copies_per_region,
     )
 
-    #
-
-
-
+    # Adjust the min count to adjust to prevent too much data on a single
+    needed_disk_gib = int(requirement.disk_gib.mid)
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    disk_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+        max_attached_data_per_node_gib=max_attached_data_per_node_gib,
+    )
+
+    current_cluster_size = _get_current_cluster_size(desires)
+    cluster_size_lambda = _get_cluster_size_lambda(
+        current_cluster_size, required_cluster_size
+    )
+    min_count = _get_min_count(
+        tier=desires.service_tier,
+        required_cluster_size=required_cluster_size,
+        needed_disk_gib=needed_disk_gib,
+        disk_per_node_gib=disk_per_node_gib,
+        cluster_size_lambda=cluster_size_lambda,
+    )
+
     base_mem = _get_base_memory(desires)
 
     heap_fn = _cass_heap_for_write_buffer(
@@ -379,7 +475,7 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=int(requirement.mem_gib.mid),
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
@@ -388,14 +484,9 @@ def _estimate_cassandra_cluster_zonal(  # pylint: disable=too-many-positional-arguments
             _cass_io_per_read(size) * math.ceil(read_io_per_sec / count),
             write_io_per_sec / count,
         ),
-        # Disk buffer is already added while computing C* estimates
-        required_disk_space=lambda x: x,
-        # C* clusters cannot recover data from neighbors quickly so we
-        # want to avoid clusters with more than 1 TiB of local state
-        max_local_disk_gib=max_local_disk_gib,
         # C* clusters provision in powers of 2 because doubling
-        cluster_size=
-        min_count=
+        cluster_size=cluster_size_lambda,
+        min_count=min_count,
         # TODO: Take reserve memory calculation into account during buffer calculation
         # C* heap usage takes away from OS page cache memory
         reserve_memory=lambda x: base_mem + heap_fn(x),
@@ -618,6 +709,11 @@ class NflxCassandraCapacityModel(CapacityModel):
         desires: CapacityDesires,
         extra_model_arguments: Dict[str, Any],
     ) -> Optional[CapacityPlan]:
+        # TODO: Standardize these extra model argument defaults in a single
+        # place. Many of them are defined here and as default values in the
+        # downstream method but only these ones are used which is confusing for
+        # readability
+
         # Use durabiliy and consistency to compute RF.
         copies_per_region = _target_rf(
             desires, extra_model_arguments.get("copies_per_region", None)
@@ -636,7 +732,11 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 1280),
+        )
+
         max_write_buffer_percent: float = min(
             0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
         )
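The nested `.get` keeps the legacy `max_local_disk_gib` argument working as a fallback spelling for the new `max_local_data_per_node_gib` (note the Cassandra default also drops from 2048 to 1280 GiB). The resolution order, demonstrated on plain dicts:

def resolve_max_local(extra_model_arguments: dict) -> int:
    return extra_model_arguments.get(
        "max_local_data_per_node_gib",
        extra_model_arguments.get("max_local_disk_gib", 1280),
    )

assert resolve_max_local({}) == 1280  # new, lower default
assert resolve_max_local({"max_local_disk_gib": 2048}) == 2048  # legacy name honored
assert resolve_max_local(
    {"max_local_disk_gib": 2048, "max_local_data_per_node_gib": 4096}
) == 4096  # new name wins when both are present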
@@ -664,7 +764,7 @@ class NflxCassandraCapacityModel(CapacityModel):
             required_cluster_size=required_cluster_size,
             max_rps_to_disk=max_rps_to_disk,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_write_buffer_percent=max_write_buffer_percent,
             max_table_buffer_percent=max_table_buffer_percent,
         )
@@ -677,6 +777,26 @@ class NflxCassandraCapacityModel(CapacityModel):
     def extra_model_arguments_schema() -> Dict[str, Any]:
         return NflxCassandraArguments.model_json_schema()
 
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.5),
+            desired={
+                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
+                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
+                # Cassandra reserves headroom in both cpu and network for background
+                # work and tasks
+                "background": Buffer(
+                    ratio=2.0,
+                    components=[
+                        BufferComponent.cpu,
+                        BufferComponent.network,
+                        BACKGROUND_BUFFER,
+                    ],
+                ),
+            },
+        )
+
     @staticmethod
     def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
         acceptable_consistency = {
@@ -704,24 +824,7 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         # By supplying these buffers we can deconstruct observed utilization into
         # load versus buffer.
-        buffers = Buffers(
-            default=Buffer(ratio=1.5),
-            desired={
-                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
-                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
-                # Cassandra reserves headroom in both cpu and network for background
-                # work and tasks
-                "background": Buffer(
-                    ratio=2.0,
-                    components=[
-                        BufferComponent.cpu,
-                        BufferComponent.network,
-                        BACKGROUND_BUFFER,
-                    ],
-                ),
-            },
-        )
-
+        buffers = NflxCassandraCapacityModel.default_buffers()
         if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
service_capacity_modeling/models/org/netflix/crdb.py

@@ -10,6 +10,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import QueryPattern
 from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -137,7 +142,7 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
     desires: CapacityDesires,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
-
+    max_local_data_per_node_gib: int = 2048,
     max_regional_size: int = 288,
     max_rps_to_disk: int = 500,
     min_vcpu_per_instance: int = 4,
@@ -184,11 +189,23 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
         + desires.data_shape.reserved_instance_system_mem_gib
     )
 
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    max_data_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+    )
+    needed_disk_gib = requirement.disk_gib.mid * disk_buffer_ratio
+    min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
     cluster = compute_stateful_zone(
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=requirement.mem_gib.mid,
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
@@ -199,13 +216,9 @@ def _estimate_cockroachdb_cluster_zonal(  # noqa=E501 pylint: disable=too-many-positional-arguments
             # TODO: presumably there are some write IOs here
             0,
         ),
-        # CRDB requires ephemeral disks to be < 80% full because leveled
-        # compaction can make progress as long as there is some headroom
-        required_disk_space=lambda x: x * 1.2,
-        max_local_disk_gib=max_local_disk_gib,
         # cockroachdb clusters will autobalance across available nodes
         cluster_size=lambda x: x,
-        min_count=
+        min_count=min_count,
        # Sidecars/System takes away memory from cockroachdb
        # cockroachdb by default uses --max-sql-memory of 25% of system memory
        # that cannot be used for caching
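The removed `required_disk_space=lambda x: x * 1.2` headroom is not lost: the same 1.2x now arrives through the model's default disk buffer (see `default_buffers()` below) and is applied to `needed_disk_gib` before `compute_stateful_zone` runs. A quick arithmetic check with illustrative numbers:

import math

disk_mid_gib = 5000  # hypothetical requirement.disk_gib.mid
old_needed = math.ceil(disk_mid_gib * 1.2)  # old: multiplier inside compute_stateful_zone
new_needed = disk_mid_gib * 1.2             # new: Buffer(ratio=1.2) applied up front
assert old_needed == new_needed == 6000

# The same ratio also widens the per-node cap, so min_count follows directly:
max_data_per_node_gib = min(2048 * 1.2, 2500)  # cap * ratio vs. a 2500 GiB local drive
assert math.ceil(new_needed / max_data_per_node_gib) == 3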
@@ -268,6 +281,12 @@ class NflxCockroachDBArguments(BaseModel):
 
 
 class NflxCockroachDBCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.2),
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -282,7 +301,11 @@ class NflxCockroachDBCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 500)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 2048),
+        )
+
         # Cockroach Labs recommends a minimum of 8 vCPUs and strongly
         # recommends no fewer than 4 vCPUs per node.
         min_vcpu_per_instance: int = extra_model_arguments.get(
@@ -299,7 +322,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
             zones_per_region=context.zones_in_region,
             copies_per_region=copies_per_region,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_rps_to_disk=max_rps_to_disk,
             min_vcpu_per_instance=min_vcpu_per_instance,
             license_fee_per_core=license_fee_per_core,
@@ -330,6 +353,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     f"User asked for {key}={value}"
                 )
 
+        buffers = NflxCockroachDBCapacityModel.default_buffers()
         if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
@@ -396,6 +420,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
         else:
             return CapacityDesires(
@@ -465,6 +490,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
 
 
service_capacity_modeling/models/org/netflix/elasticsearch.py

@@ -11,6 +11,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.interface import ZoneClusterCapacity
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -176,6 +181,20 @@ class NflxElasticsearchArguments(BaseModel):
 
 
 class NflxElasticsearchDataCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.33),
+        )
+
+    @staticmethod
+    def default_desires(
+        user_desires, extra_model_arguments: Dict[str, Any]
+    ) -> CapacityDesires:
+        return CapacityDesires(
+            buffers=NflxElasticsearchDataCapacityModel.default_buffers()
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -190,7 +209,10 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 120)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 1000)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 8192),
+        )
 
         # the ratio of traffic that should be handled by search nodes.
         # 0.0 = no search nodes, all searches handled by data nodes
@@ -259,11 +281,23 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         # io2/gp2 so for now we're just hardcoding.
         data_write_io_per_sec = (1 + 10) * max(1, data_write_bytes_per_sec // 16384)
 
+        disk_buffer_ratio = buffer_for_components(
+            buffers=desires.buffers, components=[BufferComponent.disk]
+        ).ratio
+        needed_disk_gib = data_requirement.disk_gib.mid * disk_buffer_ratio
+        max_data_per_node_gib = get_effective_disk_per_node_gib(
+            instance,
+            drive,
+            disk_buffer_ratio,
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
+        )
+        min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
         data_cluster = compute_stateful_zone(
             instance=instance,
             drive=drive,
             needed_cores=int(data_requirement.cpu_cores.mid),
-            needed_disk_gib=
+            needed_disk_gib=needed_disk_gib,
             needed_memory_gib=int(data_requirement.mem_gib.mid),
             needed_network_mbps=data_requirement.network_mbps.mid,
             # Take into account the reads per read
@@ -272,13 +306,9 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
                 _es_io_per_read(size) * math.ceil(data_rps / count),
                 data_write_io_per_sec / count,
             ),
-            # Elasticsearch requires ephemeral disks to be % full because tiered
-            # merging can make progress as long as there is some headroom
-            required_disk_space=lambda x: x * 1.33,
-            max_local_disk_gib=max_local_disk_gib,
             # Elasticsearch clusters can auto-balance via shard placement
             cluster_size=lambda x: x,
-            min_count=
+            min_count=min_count,
             # Sidecars/System takes away memory from Elasticsearch
             # which uses half of available system max of 32 for compressed oops
             reserve_memory=lambda x: base_mem + max(32, x / 2),
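The Elasticsearch change mirrors the CockroachDB one: the removed `x * 1.33` multiplier becomes `Buffer(ratio=1.33)` in `default_buffers()`, and node density is now enforced via `min_count` against the new 8192 GiB default cap. A one-line sanity check with illustrative numbers (assuming the physical drive exceeds the cap):

import math

needed_disk_gib = 10_000 * 1.33      # disk_gib.mid * disk buffer ratio
max_data_per_node_gib = 8192 * 1.33  # default per-node cap * the same ratio
assert math.ceil(needed_disk_gib / max_data_per_node_gib) == 2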