service-capacity-modeling 0.3.68__tar.gz → 0.3.69__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of service-capacity-modeling might be problematic.
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/common.py +41 -23
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/cassandra.py +146 -42
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/crdb.py +35 -9
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/elasticsearch.py +37 -7
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/evcache.py +22 -11
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/kafka.py +35 -36
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/utils.py +16 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/PKG-INFO +1 -1
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_utils.py +39 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/LICENSE +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/README.md +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/capacity_planner.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_ec2.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_zz-overrides.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/profiles.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5d.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5n.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6id.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m4.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5n.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6id.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6idn.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6in.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r4.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5n.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6id.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6idn.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6in.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7a.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7i.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_drives.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_instances.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_services.json +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/interface.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/aurora.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/counter.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/ddb.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/entity.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/graphkv.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/iso_date_math.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/key_value.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/postgres.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/rds.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/stateless_java.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/time_series_config.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/wal.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/models/org/netflix/zookeeper.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/stats.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/__init__.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/auto_shape.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/fetch_pricing.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/generate_missing.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling/tools/instance_families.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/SOURCES.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/dependency_links.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/entry_points.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/requires.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/service_capacity_modeling.egg-info/top_level.txt +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/setup.cfg +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/setup.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_arguments.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_buffers.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_common.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_desire_merge.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_generate_scenarios.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_hardware.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_hardware_shapes.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_headroom_strategy.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_io2.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_model_dump.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_reproducible.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_simulation.py +0 -0
- {service_capacity_modeling-0.3.68 → service_capacity_modeling-0.3.69}/tests/test_working_set.py +0 -0
service_capacity_modeling/models/common.py

```diff
@@ -69,6 +69,43 @@ def _sqrt_staffed_cores(rps: float, latency_s: float, qos: float) -> int:
     return math.ceil((rps * latency_s) + qos * math.sqrt(rps * latency_s))
 
 
+def get_effective_disk_per_node_gib(
+    instance: Instance,
+    drive: Drive,
+    disk_buffer_ratio: float,
+    max_local_data_per_node_gib: float = float("inf"),
+    max_attached_data_per_node_gib: float = float("inf"),
+) -> float:
+    """Calculate usable disk for an instance while respecting per-node data limits
+    and desired disk buffer ratio
+
+    Prevents overloading nodes with too much data, which causes slow bootstrapping and
+    recovery times
+
+    Args:
+        instance: The compute instance configuration
+        drive: The drive configuration for the instance
+        disk_buffer_ratio: Buffer ratio for operational headroom
+        max_local_data_per_node_gib: Maximum data per node for local drives
+        max_attached_data_per_node_gib: Maximum data per node for attached drives
+
+    Returns:
+        float: Maximum usable disk capacity per node in GiB
+    """
+    # TODO: @homatthew / @vrayini: Incorporate disk headroom for attached / local drives
+    if instance.drive is None:
+        if max_attached_data_per_node_gib == float("inf"):
+            return drive.max_size_gib
+
+        attached_disk_limit_gib = max_attached_data_per_node_gib * disk_buffer_ratio
+        # Attached disks are provisioned in 100GB limits
+        rounded_size = utils.next_n(attached_disk_limit_gib, n=100)
+        return min(rounded_size, drive.max_size_gib)
+
+    local_disk_limit_gib = max_local_data_per_node_gib * disk_buffer_ratio
+    return min(local_disk_limit_gib, instance.drive.size_gib)
+
+
 def sqrt_staffed_cores(desires: CapacityDesires) -> int:
     """Computes cores given a sqrt staffing model
 
```
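For orientation, here is a small standalone sketch of the decision the new helper makes, restated outside the package's `Instance`/`Drive` types. The numeric inputs are hypothetical, and `next_n` is assumed to round up to the next multiple of `n`, matching how `utils.next_n` is used above.

```python
import math


def next_n(x: float, n: int) -> float:
    # Assumed behavior of utils.next_n: round x up to the next multiple of n.
    return math.ceil(x / n) * n


def effective_disk_per_node_gib(
    local_drive_size_gib,      # None when the instance only has attached storage
    cloud_drive_max_size_gib,  # hypothetical maximum size of the attached volume type
    disk_buffer_ratio,
    max_local_data_per_node_gib=float("inf"),
    max_attached_data_per_node_gib=float("inf"),
):
    if local_drive_size_gib is None:
        # Attached storage: cap at the attached-data limit plus buffer,
        # rounded up to 100 GiB increments, never above the drive's max size.
        if max_attached_data_per_node_gib == float("inf"):
            return cloud_drive_max_size_gib
        limit = max_attached_data_per_node_gib * disk_buffer_ratio
        return min(next_n(limit, n=100), cloud_drive_max_size_gib)
    # Local (instance-store) drives: cap at the local-data limit plus buffer,
    # never above the physical drive size.
    return min(max_local_data_per_node_gib * disk_buffer_ratio, local_drive_size_gib)


# Hypothetical local-drive node: 1,900 GiB of instance storage, a 1.4x disk
# buffer, and a 1,280 GiB per-node data limit -> 1,280 * 1.4 = 1,792 GiB usable.
assert effective_disk_per_node_gib(1900.0, 16384.0, 1.4, 1280.0) == 1792.0

# Hypothetical attached-storage node with a 2,048 GiB per-node data limit:
# 2,048 * 1.4 = 2,867.2 GiB, rounded up to 2,900 GiB (well under the drive max).
assert effective_disk_per_node_gib(
    None, 16384.0, 1.4, max_attached_data_per_node_gib=2048.0
) == 2900.0
```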
service_capacity_modeling/models/common.py (continued)

```diff
@@ -357,11 +394,6 @@ def compute_stateful_zone( # pylint: disable=too-many-positional-arguments
     # (per_node_size_gib, node_count) -> (read_ios, write_ios)
     required_disk_ios: Callable[[float, int], Tuple[float, float]] = lambda size_gib,
     count: (0, 0),
-    required_disk_space: Callable[[float], float] = lambda size_gib: size_gib,
-    # The maximum amount of state we can hold per node in the database
-    # typically you don't want stateful systems going much higher than a
-    # few TiB so that recovery functions properly
-    max_local_disk_gib: float = 2048,
     # Some stateful clusters have sidecars that take memory
     reserve_memory: Callable[[float], float] = lambda x: 0,
     # How much write buffer we get per instance (usually a percentage of
@@ -373,14 +405,7 @@ def compute_stateful_zone( # pylint: disable=too-many-positional-arguments
     min_count: int = 0,
     adjusted_disk_io_needed: float = 0.0,
     read_write_ratio: float = 0.0,
-    # Max attached EBS volume size per node. Higher value here could allow
-    # for a lower instance count (allows more vertical scaling vs forcing horizontal)
-    max_attached_disk_gib: Optional[float] = None,
 ) -> ZoneClusterCapacity:
-    # Datastores often require disk headroom for e.g. compaction and such
-    if instance.drive is not None:
-        needed_disk_gib = math.ceil(required_disk_space(needed_disk_gib))
-
     # How many instances do we need for the CPU
     count = math.ceil(needed_cores / instance.cpu)
 
@@ -404,12 +429,8 @@ def compute_stateful_zone( # pylint: disable=too-many-positional-arguments
     count = max(count, math.ceil(needed_network_mbps / instance.net_mbps))
 
     # How many instances do we need for the disk
-    if (
-        instance.drive
-        and instance.drive.size_gib > 0
-        and max_local_disk_gib > 0
-    ):
-        disk_per_node = min(max_local_disk_gib, instance.drive.size_gib)
+    if instance.drive is not None and instance.drive.size_gib > 0:
+        disk_per_node = instance.drive.size_gib
         count = max(count, math.ceil(needed_disk_gib / disk_per_node))
     if adjusted_disk_io_needed != 0.0:
         instance_read_iops = (
@@ -441,13 +462,13 @@ def compute_stateful_zone( # pylint: disable=too-many-positional-arguments
     cost = count * instance.annual_cost
 
     attached_drives = []
-    if instance.drive is None and
+    if instance.drive is None and needed_disk_gib > 0:
         # If we don't have disks attach the cloud drive with enough
         # space and IO for the requirement
 
         # Note that cloud drivers are provisioned _per node_ and must be chosen for
         # the max of space and IOS.
-        space_gib = max(1, math.ceil(
+        space_gib = max(1, math.ceil(needed_disk_gib / count))
         read_io, write_io = required_disk_ios(space_gib, count)
         read_io, write_io = (
             utils.next_n(read_io, n=200),
@@ -463,9 +484,6 @@ def compute_stateful_zone( # pylint: disable=too-many-positional-arguments
         # 1/3 the maximum volume size in one node (preferring more nodes
         # with smaller volumes)
         max_size = drive.max_size_gib / 3
-        if max_attached_disk_gib is not None:
-            max_size = max_attached_disk_gib
-
         if ebs_gib > max_size > 0:
             ratio = ebs_gib / max_size
             count = max(cluster_size(math.ceil(count * ratio)), min_count)
```
service_capacity_modeling/models/org/netflix/cassandra.py

```diff
@@ -21,6 +21,7 @@ from service_capacity_modeling.interface import certain_float
 from service_capacity_modeling.interface import certain_int
 from service_capacity_modeling.interface import Clusters
 from service_capacity_modeling.interface import Consistency
+from service_capacity_modeling.interface import CurrentClusterCapacity
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Drive
 from service_capacity_modeling.interface import FixedInterval
@@ -35,12 +36,15 @@ from service_capacity_modeling.models import CapacityModel
 from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
 from service_capacity_modeling.models.common import derived_buffer_for_component
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import network_services
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
 from service_capacity_modeling.models.common import working_set_from_drive_and_slo
 from service_capacity_modeling.models.common import zonal_requirements_from_current
+from service_capacity_modeling.models.utils import is_power_of_2
+from service_capacity_modeling.models.utils import next_doubling
 from service_capacity_modeling.models.utils import next_power_of_2
 from service_capacity_modeling.stats import dist_for_interval
 
@@ -106,6 +110,53 @@ def _get_disk_from_desires(desires, copies_per_region):
     )
 
 
+def _get_min_count(
+    tier: int,
+    required_cluster_size: Optional[int],
+    needed_disk_gib: float,
+    disk_per_node_gib: float,
+    cluster_size_lambda: Callable[[int], int],
+):
+    """
+    Compute the minimum number of nodes required for a zone.
+
+    This function is used to prevent the planner from allocating clusters that
+    would exceed the max data per node or under the required cluster size for
+    a tier or existing cluster
+    """
+
+    # Cassandra clusters should aim to be at least 2 nodes per zone to start
+    # out with for tier 0 or tier 1. This gives us more room to "up-color"
+    # clusters.
+    min_nodes_for_tier = 2 if tier in CRITICAL_TIERS else 0
+
+    # Prevent allocating clusters that exceed the max data per node.
+    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
+
+    # Take the max of the following in order to avoid:
+    # (1) if `required_cluster_size` < `min_nodes_for_disk`, don't let the planner
+    # pick a shape that would exceed the max data per node
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # Regardless of the `required_cluster_size`, we cannot allocate less than 4
+    # nodes because that would exceed the max data per node.
+    #
+    # (2) if `required_cluster_size` > `min_nodes_for_disk`, don't let the
+    # node density requirement affect the min count because the required
+    # cluster size already meets the node density requirement.
+    #
+    # For example, if we need 4TiB of disk, and the max data per node is 1TiB,
+    # and the upstream requires >= 8 nodes, we can allocate 8 nodes because
+    # each node would only have 500GB of data.
+    min_count = max(
+        min_nodes_for_tier,
+        required_cluster_size or 0,
+        min_nodes_for_disk,
+    )
+    # Ensure that the min count is an increment of the cluster size constraint (doubling)
+    return cluster_size_lambda(min_count)
+
+
 def _zonal_requirement_for_new_cluster(
     desires, instance, copies_per_region, zones_per_region
 ) -> CapacityRequirement:
```
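The two scenarios walked through in the comments of `_get_min_count` work out as follows. This is a standalone restatement: a plain power-of-two round-up stands in for `cluster_size_lambda`, and tiers 0/1 are treated as critical per the comment above.

```python
import math


def next_power_of_2(x: int) -> int:
    # Stand-in for the cluster-size rounding: smallest power of two >= x.
    return 1 if x <= 1 else 2 ** math.ceil(math.log2(x))


def min_count(tier, required_cluster_size, needed_disk_gib, disk_per_node_gib):
    min_nodes_for_tier = 2 if tier in (0, 1) else 0
    min_nodes_for_disk = math.ceil(needed_disk_gib / disk_per_node_gib)
    return next_power_of_2(
        max(min_nodes_for_tier, required_cluster_size or 0, min_nodes_for_disk)
    )


# Scenario (1): 4 TiB of data, at most 1 TiB per node, no required size
# -> the data-density floor wins and we need at least 4 nodes per zone.
assert min_count(tier=2, required_cluster_size=None,
                 needed_disk_gib=4096, disk_per_node_gib=1024) == 4

# Scenario (2): same data, but the caller requires >= 8 nodes
# -> 8 nodes is fine, since each node holds only ~512 GiB.
assert min_count(tier=2, required_cluster_size=8,
                 needed_disk_gib=4096, disk_per_node_gib=1024) == 8
```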
service_capacity_modeling/models/org/netflix/cassandra.py (continued)

```diff
@@ -149,15 +200,7 @@ def _estimate_cassandra_requirement( # pylint: disable=too-many-positional-argu
     )
     memory_preserve = False
     reference_shape = desires.reference_shape
-    current_capacity = (
-        None
-        if desires.current_clusters is None
-        else (
-            desires.current_clusters.zonal[0]
-            if len(desires.current_clusters.zonal)
-            else desires.current_clusters.regional[0]
-        )
-    )
+    current_capacity = _get_current_capacity(desires)
 
     # If the cluster is already provisioned
     if current_capacity and desires.current_clusters is not None:
@@ -277,6 +320,26 @@ def _estimate_cassandra_requirement( # pylint: disable=too-many-positional-argu
     )
 
 
+def _get_current_cluster_size(desires) -> int:
+    current_capacity = _get_current_capacity(desires)
+    if current_capacity is None:
+        return 0
+    return math.ceil(current_capacity.cluster_instance_count.mid)
+
+
+def _get_current_capacity(desires) -> Optional[CurrentClusterCapacity]:
+    current_capacity = (
+        None
+        if desires.current_clusters is None
+        else (
+            desires.current_clusters.zonal[0]
+            if len(desires.current_clusters.zonal)
+            else desires.current_clusters.regional[0]
+        )
+    )
+    return current_capacity
+
+
 def _upsert_params(cluster, params):
     if cluster.cluster_params:
         cluster.cluster_params.update(params)
@@ -284,6 +347,18 @@ def _upsert_params(cluster, params):
         cluster.cluster_params = params
 
 
+def _get_cluster_size_lambda(
+    current_cluster_size: int,
+    required_cluster_size: Optional[int],
+) -> Callable[[int], int]:
+    if required_cluster_size:
+        return lambda x: next_doubling(x, base=required_cluster_size)
+    elif current_cluster_size and not is_power_of_2(current_cluster_size):
+        return lambda x: next_doubling(x, base=current_cluster_size)
+    else:  # New provisionings
+        return next_power_of_2
+
+
 # pylint: disable=too-many-locals
 # pylint: disable=too-many-return-statements
 # flake8: noqa: C901
```
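A note on the selector above: `is_power_of_2` and `next_doubling` come from `models/utils.py`, which gains 16 lines in this release (see the file list and the new imports). The sketch below assumes `next_doubling(x, base)` returns the smallest `base * 2**k` that is at least `x`; it illustrates the intended rounding and is not the library code.

```python
def next_doubling(x: int, base: int) -> int:
    # Assumed semantics: keep doubling from `base` until we cover `x`.
    result = base
    while result < x:
        result *= 2
    return result


# An existing 12-node zone (not a power of two) that now needs ~17 nodes is
# grown by doubling from 12, i.e. to 24, rather than jumping to 32 the way a
# pure power-of-two round-up would.
assert next_doubling(17, base=12) == 24
```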
service_capacity_modeling/models/org/netflix/cassandra.py (continued)

```diff
@@ -298,7 +373,8 @@ def _estimate_cassandra_cluster_zonal( # pylint: disable=too-many-positional-ar
     require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
-
+    max_local_data_per_node_gib: int = 1280,
+    max_attached_data_per_node_gib: int = 2048,
     max_regional_size: int = 192,
     max_write_buffer_percent: float = 0.25,
     max_table_buffer_percent: float = 0.11,
@@ -361,10 +437,31 @@ def _estimate_cassandra_cluster_zonal( # pylint: disable=too-many-positional-ar
         copies_per_region=copies_per_region,
     )
 
-    #
-
-
-
+    # Adjust the min count to adjust to prevent too much data on a single
+    needed_disk_gib = int(requirement.disk_gib.mid)
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    disk_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+        max_attached_data_per_node_gib=max_attached_data_per_node_gib,
+    )
+
+    current_cluster_size = _get_current_cluster_size(desires)
+    cluster_size_lambda = _get_cluster_size_lambda(
+        current_cluster_size, required_cluster_size
+    )
+    min_count = _get_min_count(
+        tier=desires.service_tier,
+        required_cluster_size=required_cluster_size,
+        needed_disk_gib=needed_disk_gib,
+        disk_per_node_gib=disk_per_node_gib,
+        cluster_size_lambda=cluster_size_lambda,
+    )
+
     base_mem = _get_base_memory(desires)
 
     heap_fn = _cass_heap_for_write_buffer(
@@ -378,7 +475,7 @@ def _estimate_cassandra_cluster_zonal( # pylint: disable=too-many-positional-ar
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=int(requirement.mem_gib.mid),
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
@@ -387,14 +484,9 @@ def _estimate_cassandra_cluster_zonal( # pylint: disable=too-many-positional-ar
             _cass_io_per_read(size) * math.ceil(read_io_per_sec / count),
             write_io_per_sec / count,
         ),
-        # Disk buffer is already added while computing C* estimates
-        required_disk_space=lambda x: x,
-        # C* clusters cannot recover data from neighbors quickly so we
-        # want to avoid clusters with more than 1 TiB of local state
-        max_local_disk_gib=max_local_disk_gib,
         # C* clusters provision in powers of 2 because doubling
-        cluster_size=
-        min_count=
+        cluster_size=cluster_size_lambda,
+        min_count=min_count,
         # TODO: Take reserve memory calculation into account during buffer calculation
         # C* heap usage takes away from OS page cache memory
         reserve_memory=lambda x: base_mem + heap_fn(x),
@@ -617,6 +709,11 @@ class NflxCassandraCapacityModel(CapacityModel):
         desires: CapacityDesires,
         extra_model_arguments: Dict[str, Any],
     ) -> Optional[CapacityPlan]:
+        # TODO: Standardize these extra model argument defaults in a single
+        # place. Many of them are defined here and as default values in the
+        # downstream method but only these ones are used which is confusing for
+        # readability
+
         # Use durabiliy and consistency to compute RF.
         copies_per_region = _target_rf(
             desires, extra_model_arguments.get("copies_per_region", None)
@@ -635,7 +732,11 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 192)
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 1280),
+        )
+
         max_write_buffer_percent: float = min(
             0.5, extra_model_arguments.get("max_write_buffer_percent", 0.25)
         )
@@ -663,7 +764,7 @@ class NflxCassandraCapacityModel(CapacityModel):
             required_cluster_size=required_cluster_size,
             max_rps_to_disk=max_rps_to_disk,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_write_buffer_percent=max_write_buffer_percent,
             max_table_buffer_percent=max_table_buffer_percent,
         )
@@ -676,6 +777,26 @@ class NflxCassandraCapacityModel(CapacityModel):
     def extra_model_arguments_schema() -> Dict[str, Any]:
         return NflxCassandraArguments.model_json_schema()
 
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.5),
+            desired={
+                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
+                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
+                # Cassandra reserves headroom in both cpu and network for background
+                # work and tasks
+                "background": Buffer(
+                    ratio=2.0,
+                    components=[
+                        BufferComponent.cpu,
+                        BufferComponent.network,
+                        BACKGROUND_BUFFER,
+                    ],
+                ),
+            },
+        )
+
     @staticmethod
     def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
         acceptable_consistency = {
@@ -703,24 +824,7 @@ class NflxCassandraCapacityModel(CapacityModel):
 
         # By supplying these buffers we can deconstruct observed utilization into
         # load versus buffer.
-        buffers =
-            default=Buffer(ratio=1.5),
-            desired={
-                "compute": Buffer(ratio=1.5, components=[BufferComponent.compute]),
-                "storage": Buffer(ratio=4.0, components=[BufferComponent.storage]),
-                # Cassandra reserves headroom in both cpu and network for background
-                # work and tasks
-                "background": Buffer(
-                    ratio=2.0,
-                    components=[
-                        BufferComponent.cpu,
-                        BufferComponent.network,
-                        BACKGROUND_BUFFER,
-                    ],
-                ),
-            },
-        )
-
+        buffers = NflxCassandraCapacityModel.default_buffers()
        if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
```
service_capacity_modeling/models/org/netflix/crdb.py

```diff
@@ -10,6 +10,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import QueryPattern
 from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -137,7 +142,7 @@ def _estimate_cockroachdb_cluster_zonal( # noqa=E501 pylint: disable=too-many-p
     desires: CapacityDesires,
     zones_per_region: int = 3,
     copies_per_region: int = 3,
-
+    max_local_data_per_node_gib: int = 2048,
     max_regional_size: int = 288,
     max_rps_to_disk: int = 500,
     min_vcpu_per_instance: int = 4,
@@ -184,11 +189,23 @@ def _estimate_cockroachdb_cluster_zonal( # noqa=E501 pylint: disable=too-many-p
         + desires.data_shape.reserved_instance_system_mem_gib
     )
 
+    disk_buffer_ratio = buffer_for_components(
+        buffers=desires.buffers, components=[BufferComponent.disk]
+    ).ratio
+    max_data_per_node_gib = get_effective_disk_per_node_gib(
+        instance,
+        drive,
+        disk_buffer_ratio,
+        max_local_data_per_node_gib=max_local_data_per_node_gib,
+    )
+    needed_disk_gib = requirement.disk_gib.mid * disk_buffer_ratio
+    min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
     cluster = compute_stateful_zone(
         instance=instance,
         drive=drive,
         needed_cores=int(requirement.cpu_cores.mid),
-        needed_disk_gib=
+        needed_disk_gib=needed_disk_gib,
         needed_memory_gib=requirement.mem_gib.mid,
         needed_network_mbps=requirement.network_mbps.mid,
         # Take into account the reads per read
```
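The per-node density floor computed above (and again in the Elasticsearch data model further below) reduces to one ceiling division over the buffered disk need. With hypothetical numbers:

```python
import math

# Hypothetical zone requirement: 3,000 GiB at the mean estimate, a 1.2x disk
# buffer ratio (matching the new CRDB-wide default buffer, assuming no
# disk-specific override), and an effective 2,048 GiB per-node cap from the
# helper above.
needed_disk_gib = 3000 * 1.2  # 3,600 GiB once the buffer is applied
max_data_per_node_gib = 2048
min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
assert min_count == 2  # at least two nodes per zone just to respect data density
```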
service_capacity_modeling/models/org/netflix/crdb.py (continued)

```diff
@@ -199,13 +216,9 @@ def _estimate_cockroachdb_cluster_zonal( # noqa=E501 pylint: disable=too-many-p
             # TODO: presumably there are some write IOs here
             0,
         ),
-        # CRDB requires ephemeral disks to be < 80% full because leveled
-        # compaction can make progress as long as there is some headroom
-        required_disk_space=lambda x: x * 1.2,
-        max_local_disk_gib=max_local_disk_gib,
         # cockroachdb clusters will autobalance across available nodes
         cluster_size=lambda x: x,
-        min_count=
+        min_count=min_count,
         # Sidecars/System takes away memory from cockroachdb
         # cockroachdb by default uses --max-sql-memory of 25% of system memory
         # that cannot be used for caching
@@ -268,6 +281,12 @@ class NflxCockroachDBArguments(BaseModel):
 
 
 class NflxCockroachDBCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.2),
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -282,7 +301,11 @@ class NflxCockroachDBCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 500)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 500)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 2048),
+        )
+
         # Cockroach Labs recommends a minimum of 8 vCPUs and strongly
         # recommends no fewer than 4 vCPUs per node.
         min_vcpu_per_instance: int = extra_model_arguments.get(
@@ -299,7 +322,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
             zones_per_region=context.zones_in_region,
             copies_per_region=copies_per_region,
             max_regional_size=max_regional_size,
-
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
             max_rps_to_disk=max_rps_to_disk,
             min_vcpu_per_instance=min_vcpu_per_instance,
             license_fee_per_core=license_fee_per_core,
@@ -330,6 +353,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     f"User asked for {key}={value}"
                 )
 
+        buffers = NflxCockroachDBCapacityModel.default_buffers()
         if user_desires.query_pattern.access_pattern == AccessPattern.latency:
             return CapacityDesires(
                 query_pattern=QueryPattern(
@@ -396,6 +420,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
         else:
             return CapacityDesires(
@@ -465,6 +490,7 @@ class NflxCockroachDBCapacityModel(CapacityModel):
                     # gateway taking about 1 MiB of memory
                     reserved_instance_app_mem_gib=0.001,
                 ),
+                buffers=buffers,
             )
 
 
```
service_capacity_modeling/models/org/netflix/elasticsearch.py

```diff
@@ -11,6 +11,9 @@ from pydantic import Field
 
 from service_capacity_modeling.interface import AccessConsistency
 from service_capacity_modeling.interface import AccessPattern
+from service_capacity_modeling.interface import Buffer
+from service_capacity_modeling.interface import BufferComponent
+from service_capacity_modeling.interface import Buffers
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import CapacityPlan
 from service_capacity_modeling.interface import CapacityRequirement
@@ -27,7 +30,9 @@ from service_capacity_modeling.interface import RegionContext
 from service_capacity_modeling.interface import Requirements
 from service_capacity_modeling.interface import ZoneClusterCapacity
 from service_capacity_modeling.models import CapacityModel
+from service_capacity_modeling.models.common import buffer_for_components
 from service_capacity_modeling.models.common import compute_stateful_zone
+from service_capacity_modeling.models.common import get_effective_disk_per_node_gib
 from service_capacity_modeling.models.common import normalize_cores
 from service_capacity_modeling.models.common import simple_network_mbps
 from service_capacity_modeling.models.common import sqrt_staffed_cores
@@ -176,6 +181,20 @@ class NflxElasticsearchArguments(BaseModel):
 
 
 class NflxElasticsearchDataCapacityModel(CapacityModel):
+    @staticmethod
+    def default_buffers() -> Buffers:
+        return Buffers(
+            default=Buffer(ratio=1.33),
+        )
+
+    @staticmethod
+    def default_desires(
+        user_desires, extra_model_arguments: Dict[str, Any]
+    ) -> CapacityDesires:
+        return CapacityDesires(
+            buffers=NflxElasticsearchDataCapacityModel.default_buffers()
+        )
+
     @staticmethod
     def capacity_plan(
         instance: Instance,
@@ -190,7 +209,10 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         max_regional_size: int = extra_model_arguments.get("max_regional_size", 120)
         max_rps_to_disk: int = extra_model_arguments.get("max_rps_to_disk", 1000)
         # Very large nodes are hard to recover
-
+        max_local_data_per_node_gib: int = extra_model_arguments.get(
+            "max_local_data_per_node_gib",
+            extra_model_arguments.get("max_local_disk_gib", 8192),
+        )
 
         # the ratio of traffic that should be handled by search nodes.
         # 0.0 = no search nodes, all searches handled by data nodes
@@ -259,11 +281,23 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
         # io2/gp2 so for now we're just hardcoding.
         data_write_io_per_sec = (1 + 10) * max(1, data_write_bytes_per_sec // 16384)
 
+        disk_buffer_ratio = buffer_for_components(
+            buffers=desires.buffers, components=[BufferComponent.disk]
+        ).ratio
+        needed_disk_gib = data_requirement.disk_gib.mid * disk_buffer_ratio
+        max_data_per_node_gib = get_effective_disk_per_node_gib(
+            instance,
+            drive,
+            disk_buffer_ratio,
+            max_local_data_per_node_gib=max_local_data_per_node_gib,
+        )
+        min_count = math.ceil(needed_disk_gib / max_data_per_node_gib)
+
         data_cluster = compute_stateful_zone(
             instance=instance,
             drive=drive,
             needed_cores=int(data_requirement.cpu_cores.mid),
-            needed_disk_gib=
+            needed_disk_gib=needed_disk_gib,
             needed_memory_gib=int(data_requirement.mem_gib.mid),
             needed_network_mbps=data_requirement.network_mbps.mid,
             # Take into account the reads per read
@@ -272,13 +306,9 @@ class NflxElasticsearchDataCapacityModel(CapacityModel):
                 _es_io_per_read(size) * math.ceil(data_rps / count),
                 data_write_io_per_sec / count,
             ),
-            # Elasticsearch requires ephemeral disks to be % full because tiered
-            # merging can make progress as long as there is some headroom
-            required_disk_space=lambda x: x * 1.33,
-            max_local_disk_gib=max_local_disk_gib,
             # Elasticsearch clusters can auto-balance via shard placement
             cluster_size=lambda x: x,
-            min_count=
+            min_count=min_count,
             # Sidecars/System takes away memory from Elasticsearch
             # which uses half of available system max of 32 for compressed oops
             reserve_memory=lambda x: base_mem + max(32, x / 2),
```