service-capacity-modeling 0.3.51__tar.gz → 0.3.53__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of service-capacity-modeling might be problematic. Click here for more details.

Files changed (95)
  1. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/PKG-INFO +1 -1
  2. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/cassandra.py +5 -1
  3. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/kafka.py +120 -76
  4. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/PKG-INFO +1 -1
  5. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/LICENSE +0 -0
  6. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/README.md +0 -0
  7. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/__init__.py +0 -0
  8. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/capacity_planner.py +0 -0
  9. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/__init__.py +0 -0
  10. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/__init__.py +0 -0
  11. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_ec2.json +0 -0
  12. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/pricing/aws/3yr-reserved_zz-overrides.json +0 -0
  13. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/profiles.txt +0 -0
  14. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5.json +0 -0
  15. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5a.json +0 -0
  16. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5d.json +0 -0
  17. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c5n.json +0 -0
  18. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6a.json +0 -0
  19. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6i.json +0 -0
  20. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c6id.json +0 -0
  21. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7a.json +0 -0
  22. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_c7i.json +0 -0
  23. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m4.json +0 -0
  24. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5.json +0 -0
  25. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m5n.json +0 -0
  26. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6a.json +0 -0
  27. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6i.json +0 -0
  28. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6id.json +0 -0
  29. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6idn.json +0 -0
  30. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m6in.json +0 -0
  31. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7a.json +0 -0
  32. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_m7i.json +0 -0
  33. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r4.json +0 -0
  34. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5.json +0 -0
  35. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r5n.json +0 -0
  36. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6a.json +0 -0
  37. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6i.json +0 -0
  38. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6id.json +0 -0
  39. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6idn.json +0 -0
  40. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r6in.json +0 -0
  41. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7a.json +0 -0
  42. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/auto_r7i.json +0 -0
  43. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_drives.json +0 -0
  44. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_instances.json +0 -0
  45. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/hardware/profiles/shapes/aws/manual_services.json +0 -0
  46. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/interface.py +0 -0
  47. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/__init__.py +0 -0
  48. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/common.py +0 -0
  49. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/headroom_strategy.py +0 -0
  50. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/__init__.py +0 -0
  51. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/__init__.py +0 -0
  52. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/aurora.py +0 -0
  53. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/counter.py +0 -0
  54. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/crdb.py +0 -0
  55. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/ddb.py +0 -0
  56. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/elasticsearch.py +0 -0
  57. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/entity.py +0 -0
  58. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/evcache.py +0 -0
  59. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/graphkv.py +0 -0
  60. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/iso_date_math.py +0 -0
  61. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/key_value.py +0 -0
  62. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/postgres.py +0 -0
  63. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/rds.py +0 -0
  64. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/stateless_java.py +0 -0
  65. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/time_series.py +0 -0
  66. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/time_series_config.py +0 -0
  67. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/wal.py +0 -0
  68. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/org/netflix/zookeeper.py +0 -0
  69. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/models/utils.py +0 -0
  70. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/stats.py +0 -0
  71. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/tools/__init__.py +0 -0
  72. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/tools/auto_shape.py +0 -0
  73. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/tools/fetch_pricing.py +0 -0
  74. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/tools/generate_missing.py +0 -0
  75. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling/tools/instance_families.py +0 -0
  76. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/SOURCES.txt +0 -0
  77. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/dependency_links.txt +0 -0
  78. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/entry_points.txt +0 -0
  79. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/requires.txt +0 -0
  80. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/service_capacity_modeling.egg-info/top_level.txt +0 -0
  81. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/setup.cfg +0 -0
  82. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/setup.py +0 -0
  83. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_arguments.py +0 -0
  84. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_buffers.py +0 -0
  85. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_common.py +0 -0
  86. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_desire_merge.py +0 -0
  87. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_generate_scenarios.py +0 -0
  88. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_hardware.py +0 -0
  89. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_hardware_shapes.py +0 -0
  90. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_headroom_strategy.py +0 -0
  91. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_io2.py +0 -0
  92. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_model_dump.py +0 -0
  93. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_reproducible.py +0 -0
  94. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_simulation.py +0 -0
  95. {service_capacity_modeling-0.3.51 → service_capacity_modeling-0.3.53}/tests/test_working_set.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: service-capacity-modeling
3
- Version: 0.3.51
3
+ Version: 0.3.53
4
4
  Summary: Contains utilities for modeling capacity for pluggable workloads
5
5
  Author: Joseph Lynch
6
6
  Author-email: josephl@netflix.com
@@ -164,7 +164,11 @@ def _estimate_cassandra_requirement( # pylint: disable=too-many-positional-argu
164
164
  disk_scale, _ = derived_buffer_for_component(
165
165
  desires.buffers.derived, ["storage", "disk"]
166
166
  )
167
- disk_used_gib = current_capacity.disk_utilization_gib.mid * (disk_scale or 1)
167
+ disk_used_gib = (
168
+ current_capacity.disk_utilization_gib.mid
169
+ * current_capacity.cluster_instance_count.mid
170
+ * (disk_scale or 1)
171
+ )
168
172
  else:
169
173
  # If the cluster is not yet provisioned
170
174
  capacity_requirement = _zonal_requirement_for_new_cluster(
@@ -11,6 +11,9 @@ from pydantic import Field
11
11
 
12
12
  from service_capacity_modeling.interface import AccessConsistency
13
13
  from service_capacity_modeling.interface import AccessPattern
14
+ from service_capacity_modeling.interface import Buffer
15
+ from service_capacity_modeling.interface import BufferComponent
16
+ from service_capacity_modeling.interface import Buffers
14
17
  from service_capacity_modeling.interface import CapacityDesires
15
18
  from service_capacity_modeling.interface import CapacityPlan
16
19
  from service_capacity_modeling.interface import CapacityRequirement
@@ -31,9 +34,11 @@ from service_capacity_modeling.interface import QueryPattern
31
34
  from service_capacity_modeling.interface import RegionContext
32
35
  from service_capacity_modeling.interface import Requirements
33
36
  from service_capacity_modeling.models import CapacityModel
37
+ from service_capacity_modeling.models import utils
34
38
  from service_capacity_modeling.models.common import compute_stateful_zone
35
39
  from service_capacity_modeling.models.common import normalize_cores
36
40
  from service_capacity_modeling.models.common import sqrt_staffed_cores
41
+ from service_capacity_modeling.models.common import zonal_requirements_from_current
37
42
  from service_capacity_modeling.models.org.netflix.iso_date_math import iso_to_seconds
38
43
 
39
44
  logger = logging.getLogger(__name__)
@@ -44,15 +49,6 @@ class ClusterType(str, Enum):
44
49
  ha = "high-availability"
45
50
 
46
51
 
47
- def target_cpu_utilization(tier: int) -> float:
48
- """
49
- Returns the target average cluster CPU utilization for a given tier
50
- """
51
- if tier in (0, 1):
52
- return 0.40
53
- return 0.50
54
-
55
-
56
52
  def _get_current_zonal_cluster(
57
53
  desires: CapacityDesires,
58
54
  ) -> Optional[CurrentZoneClusterCapacity]:
@@ -108,54 +104,8 @@ def _estimate_kafka_requirement( # pylint: disable=too-many-positional-argument
108
104
  write_mib_per_second
109
105
  )
110
106
  # use the current cluster capacity if available
111
- current_zonal_cluster = _get_current_zonal_cluster(desires)
112
-
113
- if (
114
- current_zonal_cluster
115
- and current_zonal_cluster.cluster_instance
116
- and required_zone_size is not None
117
- ):
118
- # For now, use the highest average CPU utilization seen on the cluster
119
- cpu_utilization = current_zonal_cluster.cpu_utilization.high
120
- # Validate with data if we should instead estimate 99th percentile here to get
121
- # rid of spikes in collected cpu usage data ?
122
- # Use the formula: 99th Percentile ≈ Average + (Z-score * SD).
123
- # https://en.wikipedia.org/wiki/Normal_curve_equivalent#:~:text=The%2099th%20percentile%20in%20a,49/2.3263%20=%2021.06.
124
- # curr_cpu_util = cpu_util.mid + (2.33 * (cpu_util.high - cpu_util.low) / 6)
125
- current_utilized_cores = (
126
- current_zonal_cluster.cluster_instance.cpu
127
- * required_zone_size
128
- * zones_per_region
129
- * cpu_utilization
130
- ) / 100
131
-
132
- # compute needed core capacity for cluster so avg cpu utilization for the
133
- # cluster stays under threshold for that tier
134
- needed_cores = int(
135
- current_utilized_cores / target_cpu_utilization(desires.service_tier)
136
- )
137
- logger.debug("kafka needed cores: %s", needed_cores)
138
- # Normalize those cores to the target shape
139
- reference_shape = current_zonal_cluster.cluster_instance
140
- needed_cores = normalize_cores(
141
- core_count=needed_cores,
142
- target_shape=instance,
143
- reference_shape=reference_shape,
144
- )
145
- logger.debug("kafka normalized needed cores: %s", needed_cores)
146
- else:
147
- # We have no existing utilization to go from
148
- needed_cores = normalize_cores(
149
- core_count=sqrt_staffed_cores(normalized_to_mib),
150
- target_shape=instance,
151
- reference_shape=desires.reference_shape,
152
- )
107
+ current_zonal_capacity = _get_current_zonal_cluster(desires)
153
108
 
154
- # (Nick): Keep 40% of available bandwidth for node recovery
155
- # (Joey): For kafka BW = BW_write + BW_reads
156
- # let X = input write BW
157
- # BW_in = X * RF
158
- # BW_out = X * (consumers) + X * (RF - 1)
159
109
  bw_in = (
160
110
  (write_mib_per_second * MIB_IN_BYTES) * copies_per_region
161
111
  ) / MEGABIT_IN_BYTES
@@ -165,21 +115,78 @@ def _estimate_kafka_requirement( # pylint: disable=too-many-positional-argument
165
115
  + ((write_mib_per_second * MIB_IN_BYTES) * (copies_per_region - 1))
166
116
  )
167
117
  ) / MEGABIT_IN_BYTES
168
- # BW = (in + out) because duplex then 40% headroom.
169
- needed_network_mbps = max(bw_in, bw_out) * 1.40
118
+ if (
119
+ current_zonal_capacity
120
+ and current_zonal_capacity.cluster_instance
121
+ and required_zone_size is not None
122
+ and desires.current_clusters is not None
123
+ ):
124
+ # zonal_requirements_from_current uses the midpoint utilization of the
125
+ # current cluster. For Kafka, we want to use the high value instead
126
+ # for cpu, disk, network, etc.
127
+ normalize_midpoint_desires = desires.model_copy(deep=True)
128
+ # This is checked in the if statement above. Assert here for mypy linter purpose
129
+ assert normalize_midpoint_desires.current_clusters is not None
130
+ curr_disk = desires.current_clusters.zonal[0].disk_utilization_gib
131
+ curr_cpu = desires.current_clusters.zonal[0].cpu_utilization
132
+ curr_network = desires.current_clusters.zonal[0].network_utilization_mbps
133
+ normalize_midpoint_desires.current_clusters.zonal[
134
+ 0
135
+ ].disk_utilization_gib = Interval(
136
+ low=curr_disk.high, mid=curr_disk.high, high=curr_disk.high
137
+ )
138
+ normalize_midpoint_desires.current_clusters.zonal[0].cpu_utilization = Interval(
139
+ low=curr_cpu.high, mid=curr_cpu.high, high=curr_cpu.high
140
+ )
141
+ normalize_midpoint_desires.current_clusters.zonal[
142
+ 0
143
+ ].network_utilization_mbps = Interval(
144
+ low=curr_network.high, mid=curr_network.high, high=curr_network.high
145
+ )
146
+ capacity_requirement = zonal_requirements_from_current(
147
+ current_cluster=normalize_midpoint_desires.current_clusters,
148
+ buffers=desires.buffers,
149
+ instance=instance,
150
+ reference_shape=current_zonal_capacity.cluster_instance,
151
+ )
152
+ needed_cores = int(capacity_requirement.cpu_cores.mid)
153
+ needed_disk = int(capacity_requirement.disk_gib.mid)
154
+ needed_network_mbps = int(capacity_requirement.network_mbps.mid)
155
+ logger.debug(
156
+ "kafka normalized needed cores: %s", capacity_requirement.cpu_cores
157
+ )
158
+ else:
159
+ # We have no existing utilization to go from
160
+ needed_cores = (
161
+ normalize_cores(
162
+ core_count=sqrt_staffed_cores(normalized_to_mib),
163
+ target_shape=instance,
164
+ reference_shape=desires.reference_shape,
165
+ )
166
+ // zones_per_region
167
+ )
170
168
 
171
- needed_disk = math.ceil(
172
- desires.data_shape.estimated_state_size_gib.mid,
173
- )
169
+ # (Nick): Keep 40% of available bandwidth for node recovery
170
+ # (Joey): For kafka BW = BW_write + BW_reads
171
+ # let X = input write BW
172
+ # BW_in = X * RF
173
+ # BW_out = X * (consumers) + X * (RF - 1)
174
+ # BW = (in + out) because duplex then 40% headroom.
175
+ needed_network_mbps = int((max(bw_in, bw_out) * 1.40) // zones_per_region)
176
+
177
+ # NOTE: data_shape is region, we need to convert it to zonal
178
+ # If we don't have an existing cluster, the estimated state size should be
179
+ # at most 40% of the total cluster's available disk.
180
+ # i.e. needed_disk = state_size * 2.5
181
+ needed_disk = math.ceil(
182
+ (desires.data_shape.estimated_state_size_gib.mid // zones_per_region) * 2.5,
183
+ )
174
184
 
175
185
  # Keep the last N seconds hot in cache
176
- needed_memory = (write_mib_per_second * hot_retention_seconds) // 1024
186
+ needed_memory = (
187
+ (write_mib_per_second * hot_retention_seconds) // 1024
188
+ ) // zones_per_region
177
189
 
178
- # Now convert to per zone
179
- needed_cores = max(1, needed_cores // zones_per_region)
180
- needed_disk = max(1, needed_disk // zones_per_region)
181
- needed_memory = max(1, int(needed_memory // zones_per_region))
182
- needed_network_mbps = max(1, int(needed_network_mbps // zones_per_region))
183
190
  logger.debug(
184
191
  "Need (cpu, mem, disk) = (%s, %s, %s)",
185
192
  needed_cores,
@@ -218,7 +225,8 @@ def _kafka_read_io(rps, io_size_kib, size_gib, recovery_seconds: int) -> float:
218
225
  # In practice we have cache reducing this by 99% or more
219
226
  read_ios = rps * 0.05
220
227
  # Recover the node in 60 minutes, to do that we need
221
- size_kib = size_gib * (1024 * 1024)
228
+ # Estimate that we are using ~ 50% of the disk prior to a recovery
229
+ size_kib = size_gib * 0.5 * (1024 * 1024)
222
230
  recovery_ios = max(1, size_kib / io_size_kib) / recovery_seconds
223
231
  # Leave 50% headroom for read IOs since generally we will hit cache
224
232
  return (read_ios + int(round(recovery_ios))) * 1.5
@@ -226,7 +234,8 @@ def _kafka_read_io(rps, io_size_kib, size_gib, recovery_seconds: int) -> float:
226
234
 
227
235
  # pylint: disable=too-many-locals
228
236
  # pylint: disable=too-many-return-statements
229
- def _estimate_kafka_cluster_zonal( # pylint: disable=too-many-positional-arguments
237
+ # pylint: disable=too-many-positional-arguments
238
+ def _estimate_kafka_cluster_zonal(
230
239
  instance: Instance,
231
240
  drive: Drive,
232
241
  desires: CapacityDesires,
@@ -306,6 +315,7 @@ def _estimate_kafka_cluster_zonal( # pylint: disable=too-many-positional-argume
306
315
  write_ios_per_second = max(
307
316
  1, (write_mib_per_second * 1024) // drive.seq_io_size_kib
308
317
  )
318
+ max_attached_disk_gib = 8 * 1024
309
319
 
310
320
  cluster = compute_stateful_zone(
311
321
  instance=instance,
@@ -329,8 +339,8 @@ def _estimate_kafka_cluster_zonal( # pylint: disable=too-many-positional-argume
329
339
  # Leave 100% IO headroom for writes
330
340
  copies_per_region * (write_ios_per_second / count) * 2,
331
341
  ),
332
- # Kafka can run up to 60% full on disk, let's stay safe at 40%
333
- required_disk_space=lambda x: x * 2.5,
342
+ # Disk buffer is already added when computing kafka disk requirements
343
+ required_disk_space=lambda x: x,
334
344
  max_local_disk_gib=max_local_disk_gib,
335
345
  cluster_size=lambda x: x,
336
346
  min_count=max(min_count, required_zone_size or 1),
@@ -338,17 +348,34 @@ def _estimate_kafka_cluster_zonal( # pylint: disable=too-many-positional-argume
338
348
  # Kafka currently uses 8GiB fixed, might want to change to min(30, x // 2)
339
349
  reserve_memory=lambda instance_mem_gib: base_mem + 8,
340
350
  # allow up to 8TiB of attached EBS
341
- max_attached_disk_gib=8 * 1024,
351
+ max_attached_disk_gib=max_attached_disk_gib,
342
352
  )
343
353
 
344
354
  # Communicate to the actual provision that if we want reduced RF
345
355
  params = {"kafka.copies": copies_per_region}
346
356
  _upsert_params(cluster, params)
347
357
 
348
- # Sometimes we don't want to modify cluster topology, so only allow
349
- # topologies that match the desired zone size
350
- if required_zone_size is not None and cluster.count != required_zone_size:
351
- return None
358
+ # This is roughly the disk we would have tried to provision with the current
359
+ # cluster's instance count (or required_zone_size)
360
+ if required_zone_size is not None:
361
+ space_gib = max(1, math.ceil(requirement.disk_gib.mid / required_zone_size))
362
+ ebs_gib = utils.next_n(space_gib, n=100)
363
+ max_size = (
364
+ max_attached_disk_gib
365
+ if max_attached_disk_gib is not None
366
+ else drive.max_size_gib / 3
367
+ ) # Max allowed disk in `compute_stateful_zone`
368
+
369
+ # Capacity planner only allows ~ 5TB disk (max_size) for gp3 drives
370
+ # or max_attached_disk_gib if provided.
371
+ # If ebs_gib > max_size, we do not have enough instances within the
372
+ # required_zone_size for the required disk. In these cases, it is
373
+ # not possible for cluster.count == required_zone_size. We should
374
+ # allow higher instance count for these cases so that we return some result
375
+ # If we did not exceed the max disk size with the required_zone_size, then
376
+ # we only allow topologies that match the desired zone size
377
+ if ebs_gib <= max_size and cluster.count != required_zone_size:
378
+ return None
352
379
 
353
380
  # Kafka clusters generally should try to stay under some total number
354
381
  # of nodes. Orgs do this for all kinds of reasons such as
@@ -542,6 +569,22 @@ class NflxKafkaCapacityModel(CapacityModel):
542
569
  write_bytes.mid * retention_secs * replication_factor
543
570
  ) / GIB_IN_BYTES
544
571
 
572
+ # By supplying these buffers we can deconstruct observed utilization into
573
+ # load versus buffer.
574
+ compute_buffer_ratio = 2.5 if user_desires.service_tier in (0, 1) else 2.0
575
+ buffers = Buffers(
576
+ default=Buffer(ratio=1.5),
577
+ desired={
578
+ # Amount of compute buffer that we need to reserve in addition to
579
+ # cpu_headroom_target that is reserved on a per instance basis
580
+ "compute": Buffer(
581
+ ratio=compute_buffer_ratio, components=[BufferComponent.compute]
582
+ ),
583
+ # This makes sure we use only 40% of the available storage
584
+ "storage": Buffer(ratio=2.5, components=[BufferComponent.storage]),
585
+ },
586
+ )
587
+
545
588
  return CapacityDesires(
546
589
  query_pattern=QueryPattern(
547
590
  access_pattern=AccessPattern.throughput,
@@ -581,6 +624,7 @@ class NflxKafkaCapacityModel(CapacityModel):
581
624
  # Connection overhead, kernel, etc ...
582
625
  reserved_instance_system_mem_gib=3,
583
626
  ),
627
+ buffers=buffers,
584
628
  )
585
629
 
586
630
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: service-capacity-modeling
3
- Version: 0.3.51
3
+ Version: 0.3.53
4
4
  Summary: Contains utilities for modeling capacity for pluggable workloads
5
5
  Author: Joseph Lynch
6
6
  Author-email: josephl@netflix.com