paasta-tools 1.30.9__py3-none-any.whl → 1.35.8__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as published to a supported public registry, and is provided for informational purposes only.

This version of paasta-tools has been flagged as a potentially problematic release.

Files changed (98)
  1. paasta_tools/__init__.py +1 -1
  2. paasta_tools/api/api_docs/swagger.json +5 -0
  3. paasta_tools/cli/cmds/autoscale.py +2 -0
  4. paasta_tools/cli/cmds/check.py +2 -0
  5. paasta_tools/cli/cmds/cook_image.py +2 -0
  6. paasta_tools/cli/cmds/get_docker_image.py +2 -0
  7. paasta_tools/cli/cmds/get_image_version.py +2 -0
  8. paasta_tools/cli/cmds/get_latest_deployment.py +2 -0
  9. paasta_tools/cli/cmds/info.py +5 -1
  10. paasta_tools/cli/cmds/itest.py +2 -0
  11. paasta_tools/cli/cmds/list_namespaces.py +2 -0
  12. paasta_tools/cli/cmds/local_run.py +116 -24
  13. paasta_tools/cli/cmds/logs.py +2 -0
  14. paasta_tools/cli/cmds/mark_for_deployment.py +12 -2
  15. paasta_tools/cli/cmds/mesh_status.py +2 -1
  16. paasta_tools/cli/cmds/push_to_registry.py +2 -0
  17. paasta_tools/cli/cmds/remote_run.py +10 -0
  18. paasta_tools/cli/cmds/rollback.py +5 -1
  19. paasta_tools/cli/cmds/secret.py +4 -2
  20. paasta_tools/cli/cmds/security_check.py +2 -0
  21. paasta_tools/cli/cmds/spark_run.py +4 -0
  22. paasta_tools/cli/cmds/status.py +35 -8
  23. paasta_tools/cli/cmds/validate.py +296 -19
  24. paasta_tools/cli/cmds/wait_for_deployment.py +2 -0
  25. paasta_tools/cli/schemas/autoscaling_schema.json +3 -2
  26. paasta_tools/cli/schemas/eks_schema.json +23 -1
  27. paasta_tools/cli/schemas/smartstack_schema.json +12 -0
  28. paasta_tools/cli/utils.py +2 -1
  29. paasta_tools/contrib/paasta_update_soa_memcpu.py +10 -14
  30. paasta_tools/generate_deployments_for_service.py +2 -0
  31. paasta_tools/instance/hpa_metrics_parser.py +3 -5
  32. paasta_tools/instance/kubernetes.py +58 -25
  33. paasta_tools/kubernetes/application/controller_wrappers.py +23 -2
  34. paasta_tools/kubernetes/remote_run.py +2 -2
  35. paasta_tools/kubernetes_tools.py +37 -66
  36. paasta_tools/long_running_service_tools.py +8 -1
  37. paasta_tools/paastaapi/model/kubernetes_version.py +3 -0
  38. paasta_tools/setup_prometheus_adapter_config.py +82 -0
  39. paasta_tools/tron_tools.py +3 -0
  40. paasta_tools/utils.py +26 -9
  41. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_deployments_for_service.py +2 -0
  42. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_update_soa_memcpu.py +10 -14
  43. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_prometheus_adapter_config.py +82 -0
  44. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/METADATA +4 -4
  45. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/RECORD +98 -98
  46. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/apply_external_resources.py +0 -0
  47. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/bounce_log_latency_parser.py +0 -0
  48. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_autoscaler_max_instances.py +0 -0
  49. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_cassandracluster_services_replication.py +0 -0
  50. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_flink_services_health.py +0 -0
  51. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_api.py +0 -0
  52. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_services_replication.py +0 -0
  53. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_manual_oapi_changes.sh +0 -0
  54. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_oom_events.py +0 -0
  55. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_orphans.py +0 -0
  56. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_spark_jobs.py +0 -0
  57. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_cr.py +0 -0
  58. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_crd.py +0 -0
  59. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_jobs.py +0 -0
  60. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_dynamodb_table.py +0 -0
  61. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_paasta_playground.py +0 -0
  62. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/delete_kubernetes_deployments.py +0 -0
  63. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/emit_allocated_cpu_metrics.py +0 -0
  64. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_all_deployments +0 -0
  65. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_authenticating_services.py +0 -0
  66. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_file.py +0 -0
  67. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_yaml.py +0 -0
  68. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/get_running_task_allocation.py +0 -0
  69. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/habitat_fixer.py +0 -0
  70. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/ide_helper.py +0 -0
  71. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_proxy.py +0 -0
  72. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_smartstack.py +0 -0
  73. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kill_bad_containers.py +0 -0
  74. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kubernetes_remove_evicted_pods.py +0 -0
  75. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mass-deploy-tag.sh +0 -0
  76. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mock_patch_checker.py +0 -0
  77. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_remote_run_resources.py +0 -0
  78. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_stale_nodes.py +0 -0
  79. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_deploy_tron_jobs +0 -0
  80. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_execute_docker_command.py +0 -0
  81. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_secrets_sync.py +0 -0
  82. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_tabcomplete.sh +0 -0
  83. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/render_template.py +0 -0
  84. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/rightsizer_soaconfigs_update.py +0 -0
  85. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_remove.py +0 -0
  86. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_update.py +0 -0
  87. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_istio_mesh.py +0 -0
  88. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_cr.py +0 -0
  89. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_crd.py +0 -0
  90. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_internal_crd.py +0 -0
  91. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_job.py +0 -0
  92. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/shared_ip_check.py +0 -0
  93. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/synapse_srv_namespaces_fact.py +0 -0
  94. {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/timeouts_metrics_prom.py +0 -0
  95. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/WHEEL +0 -0
  96. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/entry_points.txt +0 -0
  97. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/licenses/LICENSE +0 -0
  98. {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/top_level.txt +0 -0

paasta_tools/instance/kubernetes.py

@@ -1,4 +1,5 @@
  import asyncio
+ import logging
  from asyncio.tasks import Task
  from collections import defaultdict
  from enum import Enum
@@ -17,6 +18,7 @@ from typing import Union

  import a_sync
  import pytz
+ import requests.exceptions
  from kubernetes.client import V1Container
  from kubernetes.client import V1ControllerRevision
  from kubernetes.client import V1Pod
@@ -75,6 +77,8 @@ INSTANCE_TYPE_CR_ID = dict(
      monkrelaycluster=monkrelaycluster_tools.cr_id,
  )

+ logger = logging.getLogger(__name__)
+

  class ServiceMesh(Enum):
      SMARTSTACK = "smartstack"
@@ -100,6 +104,7 @@ class KubernetesVersionDict(TypedDict, total=False):
      config_sha: str
      pods: Sequence[Mapping[str, Any]]
      namespace: str
+     container_port: Optional[int]


  def cr_id(service: str, instance: str, instance_type: str) -> Mapping[str, str]:
@@ -347,31 +352,49 @@ async def mesh_status(

      pods = await pods_task
      for location, hosts in node_hostname_by_location.items():
-         host = replication_checker.get_hostname_in_pool(hosts, instance_pool)
-         if service_mesh == ServiceMesh.SMARTSTACK:
-             mesh_status["locations"].append(
-                 _build_smartstack_location_dict(
-                     synapse_host=host,
-                     synapse_port=settings.system_paasta_config.get_synapse_port(),
-                     synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
-                     registration=registration,
-                     pods=pods,
-                     location=location,
-                     should_return_individual_backends=should_return_individual_backends,
-                 )
-             )
-         elif service_mesh == ServiceMesh.ENVOY:
-             mesh_status["locations"].append(
-                 _build_envoy_location_dict(
-                     envoy_host=host,
-                     envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
-                     envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
-                     registration=registration,
-                     pods=pods,
-                     location=location,
-                     should_return_individual_backends=should_return_individual_backends,
-                 )
-             )
+         max_retries = 3
+
+         for attempt in range(max_retries):
+             host = replication_checker.get_hostname_in_pool(hosts, instance_pool)
+             try:
+                 if service_mesh == ServiceMesh.SMARTSTACK:
+                     location_dict = _build_smartstack_location_dict(
+                         synapse_host=host,
+                         synapse_port=settings.system_paasta_config.get_synapse_port(),
+                         synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
+                         registration=registration,
+                         pods=pods,
+                         location=location,
+                         should_return_individual_backends=should_return_individual_backends,
+                     )
+                 elif service_mesh == ServiceMesh.ENVOY:
+                     location_dict = _build_envoy_location_dict(
+                         envoy_host=host,
+                         envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
+                         envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
+                         registration=registration,
+                         pods=pods,
+                         location=location,
+                         should_return_individual_backends=should_return_individual_backends,
+                     )
+
+                 mesh_status["locations"].append(location_dict)
+                 return mesh_status
+
+             except requests.exceptions.ConnectTimeout:
+                 if attempt < max_retries - 1:
+                     logger.warning(
+                         "attempt %s/%s: Unable to connect to %s, retrying (on another host, hopefully)...",
+                         attempt,
+                         max_retries,
+                         host,
+                     )
+                     continue
+                 else:
+                     logger.critical(
+                         "Unable to connect to %s, not retrying again.", host
+                     )
+                     raise
      return mesh_status


@@ -698,6 +721,7 @@ async def kubernetes_status_v2(
                  instance=instance,
                  namespaces=relevant_namespaces,
                  pod_status_by_sha_and_readiness_task=pod_status_by_sha_and_readiness_task, # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                 container_port=job_config.get_container_port(),
              )
          )
          tasks.extend([pod_status_by_sha_and_readiness_task, versions_task]) # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -717,6 +741,7 @@ async def kubernetes_status_v2(
                  instance=instance,
                  namespaces=relevant_namespaces,
                  pod_status_by_replicaset_task=pod_status_by_replicaset_task, # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                 container_port=job_config.get_container_port(),
              )
          )
          tasks.extend([pod_status_by_replicaset_task, versions_task]) # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -788,6 +813,7 @@ async def get_versions_for_replicasets(
      instance: str,
      namespaces: Iterable[str],
      pod_status_by_replicaset_task: "asyncio.Future[Mapping[str, Sequence[asyncio.Future[Dict[str, Any]]]]]",
+     container_port: Optional[int],
  ) -> List[KubernetesVersionDict]:

      replicaset_list: List[V1ReplicaSet] = []
@@ -815,6 +841,7 @@ async def get_versions_for_replicasets(
              replicaset,
              kube_client,
              pod_status_by_replicaset.get(replicaset.metadata.name),
+             container_port,
          )
          for replicaset in actually_running_replicasets
      ]
@@ -826,6 +853,7 @@ async def get_replicaset_status(
      replicaset: V1ReplicaSet,
      client: kubernetes_tools.KubeClient,
      pod_status_tasks: Sequence["asyncio.Future[Dict[str, Any]]"],
+     container_port: Optional[int],
  ) -> KubernetesVersionDict:
      return {
          "name": replicaset.metadata.name,
@@ -840,6 +868,7 @@ async def get_replicaset_status(
          "config_sha": replicaset.metadata.labels.get("paasta.yelp.com/config_sha"),
          "pods": await asyncio.gather(*pod_status_tasks) if pod_status_tasks else [],
          "namespace": replicaset.metadata.namespace,
+         "container_port": container_port,
      }


@@ -1063,6 +1092,7 @@ async def get_versions_for_controller_revisions(
      instance: str,
      namespaces: Iterable[str],
      pod_status_by_sha_and_readiness_task: "asyncio.Future[Mapping[Tuple[str, str], Mapping[bool, Sequence[asyncio.Future[Mapping[str, Any]]]]]]",
+     container_port: Optional[int] = None,
  ) -> List[KubernetesVersionDict]:
      controller_revision_list: List[V1ControllerRevision] = []

@@ -1092,6 +1122,7 @@ async def get_versions_for_controller_revisions(
              cr,
              kube_client,
              pod_status_by_sha_and_readiness[(git_sha, config_sha)],
+             container_port=container_port,
          )
          for (git_sha, config_sha), cr in cr_by_shas.items()
      ]
@@ -1106,6 +1137,7 @@ async def get_version_for_controller_revision(
      pod_status_tasks_by_readiness: Mapping[
          bool, Sequence["asyncio.Future[Mapping[str, Any]]"]
      ],
+     container_port: Optional[int] = None,
  ) -> KubernetesVersionDict:
      all_pod_status_tasks = [
          task for tasks in pod_status_tasks_by_readiness.values() for task in tasks
@@ -1122,6 +1154,7 @@ async def get_version_for_controller_revision(
          "config_sha": cr.metadata.labels.get("paasta.yelp.com/config_sha"),
          "pods": [task.result() for task in all_pod_status_tasks],
          "namespace": cr.metadata.namespace,
+         "container_port": container_port,
      }


paasta_tools/kubernetes/application/controller_wrappers.py

@@ -173,19 +173,31 @@ class Application(ABC):
          self, kube_client: KubeClient, namespace: str
      ) -> V1PodDisruptionBudget:
          max_unavailable: Union[str, int]
+
+         system_paasta_config = load_system_paasta_config()
+
          if "bounce_margin_factor" in self.soa_config.config_dict:
              max_unavailable = (
                  f"{int((1 - self.soa_config.get_bounce_margin_factor()) * 100)}%"
              )
          else:
-             system_paasta_config = load_system_paasta_config()
              max_unavailable = system_paasta_config.get_pdb_max_unavailable()

+         if "unhealthy_pod_eviction_policy" in self.soa_config.config_dict:
+             unhealthy_pod_eviction_policy = (
+                 self.soa_config.get_unhealthy_pod_eviction_policy()
+             )
+         else:
+             unhealthy_pod_eviction_policy = (
+                 system_paasta_config.get_unhealthy_pod_eviction_policy()
+             )
+
          pdr = pod_disruption_budget_for_service_instance(
              service=self.kube_deployment.service,
              instance=self.kube_deployment.instance,
              max_unavailable=max_unavailable,
              namespace=namespace,
+             unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
          )
          try:
              existing_pdr = kube_client.policy.read_namespaced_pod_disruption_budget(
@@ -198,12 +210,21 @@ class Application(ABC):
                  raise

          if existing_pdr:
+             """
+             Update the pod disruption budget only if spec.max_unavailable
+             or spec.unhealthy_pod_eviction_policy have changed;
+             ignore changes to other fields
+             """
              if existing_pdr.spec.min_available is not None:
                  logging.info(
                      "Not updating poddisruptionbudget: can't have both "
                      "min_available and max_unavailable"
                  )
-             elif existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable:
+             elif (
+                 existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable
+                 or existing_pdr.spec.unhealthy_pod_eviction_policy
+                 != pdr.spec.unhealthy_pod_eviction_policy
+             ):
                  logging.info(f"Updating poddisruptionbudget {pdr.metadata.name}")
                  return kube_client.policy.patch_namespaced_pod_disruption_budget(
                      name=pdr.metadata.name, namespace=pdr.metadata.namespace, body=pdr
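
The hunk above resolves the new unhealthy_pod_eviction_policy setting the same way bounce_margin_factor is resolved: an instance-level soa-config value wins, otherwise the system-wide value from SystemPaastaConfig applies. A minimal sketch of that precedence follows; it is not part of the diff, the dicts are hypothetical stand-ins for soa_config.config_dict, and the "IfHealthyBudget" system default is an assumption based on the getter default shown later in this diff.

    # Sketch only: instance-level override first, then the (assumed) system default.
    def resolve_unhealthy_pod_eviction_policy(
        instance_config_dict: dict, system_default: str = "IfHealthyBudget"
    ) -> str:
        return instance_config_dict.get("unhealthy_pod_eviction_policy", system_default)


    print(resolve_unhealthy_pod_eviction_policy({"unhealthy_pod_eviction_policy": "AlwaysAllow"}))
    # -> AlwaysAllow
    print(resolve_unhealthy_pod_eviction_policy({}))
    # -> IfHealthyBudget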

paasta_tools/kubernetes/remote_run.py

@@ -20,6 +20,7 @@ from typing import Sequence
  from typing import TypedDict

  from kubernetes.client import AuthenticationV1TokenRequest
+ from kubernetes.client import RbacV1Subject
  from kubernetes.client import V1Job
  from kubernetes.client import V1ObjectMeta
  from kubernetes.client import V1Pod
@@ -28,7 +29,6 @@ from kubernetes.client import V1Role
  from kubernetes.client import V1RoleBinding
  from kubernetes.client import V1RoleRef
  from kubernetes.client import V1ServiceAccount
- from kubernetes.client import V1Subject
  from kubernetes.client import V1TokenRequestSpec
  from kubernetes.client.exceptions import ApiException

@@ -522,7 +522,7 @@ def bind_role_to_service_account(
              name=role,
          ),
          subjects=[
-             V1Subject(
+             RbacV1Subject(
                  kind="ServiceAccount",
                  name=service_account,
              ),

paasta_tools/kubernetes_tools.py

@@ -50,6 +50,7 @@ from kubernetes import client as kube_client
  from kubernetes import config as kube_config
  from kubernetes.client import CoreV1Event
  from kubernetes.client import models
+ from kubernetes.client import RbacV1Subject
  from kubernetes.client import V1Affinity
  from kubernetes.client import V1AWSElasticBlockStoreVolumeSource
  from kubernetes.client import V1Capabilities
@@ -113,7 +114,6 @@ from kubernetes.client import V1ServiceAccount
  from kubernetes.client import V1ServiceAccountTokenProjection
  from kubernetes.client import V1StatefulSet
  from kubernetes.client import V1StatefulSetSpec
- from kubernetes.client import V1Subject
  from kubernetes.client import V1TCPSocketAction
  from kubernetes.client import V1TopologySpreadConstraint
  from kubernetes.client import V1Volume
@@ -151,6 +151,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
+ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
  from paasta_tools.long_running_service_tools import ServiceNamespaceConfig
  from paasta_tools.secret_tools import get_secret_name_from_ref
  from paasta_tools.secret_tools import is_secret_ref
@@ -195,10 +196,8 @@ KUBE_DEPLOY_STATEGY_MAP = {
      "brutal": "RollingUpdate",
  }
  HACHECK_POD_NAME = "hacheck"
- GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter"
  SIDECAR_CONTAINER_NAMES = [
      HACHECK_POD_NAME,
-     GUNICORN_EXPORTER_POD_NAME,
  ]
  KUBERNETES_NAMESPACE = "paasta"
  PAASTA_WORKLOAD_OWNER = "compute_infra_platform_experience"
@@ -876,7 +875,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
                      ),
                  ),
              )
-         elif provider["type"] == METRICS_PROVIDER_UWSGI_V2:
+         elif provider["type"] in {
+             METRICS_PROVIDER_UWSGI_V2,
+             METRICS_PROVIDER_WORKER_LOAD,
+         }:
              return V2MetricSpec(
                  type="Object",
                  object=V2ObjectMetricSource(
@@ -1072,15 +1074,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
              service_namespace_config,
              hacheck_sidecar_volumes,
          )
-         gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container(
-             system_paasta_config
-         )

          sidecars = []
          if hacheck_container:
              sidecars.append(hacheck_container)
-         if gunicorn_exporter_container:
-             sidecars.append(gunicorn_exporter_container)
          return sidecars

      def get_readiness_check_prefix(
@@ -1168,37 +1165,6 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
              )
          return None

-     def get_gunicorn_exporter_sidecar_container(
-         self,
-         system_paasta_config: SystemPaastaConfig,
-     ) -> Optional[V1Container]:
-
-         if self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
-             return V1Container(
-                 image=system_paasta_config.get_gunicorn_exporter_sidecar_image_url(),
-                 resources=self.get_sidecar_resource_requirements(
-                     "gunicorn_exporter", system_paasta_config
-                 ),
-                 name=GUNICORN_EXPORTER_POD_NAME,
-                 env=self.get_kubernetes_environment(),
-                 ports=[V1ContainerPort(container_port=9117)],
-                 lifecycle=V1Lifecycle(
-                     pre_stop=V1LifecycleHandler(
-                         _exec=V1ExecAction(
-                             command=[
-                                 "/bin/sh",
-                                 "-c",
-                                 # we sleep for the same amount of time as we do after an hadown to ensure that we have accurate
-                                 # metrics up until our Pod dies
-                                 f"sleep {self.get_hacheck_prestop_sleep_seconds()}",
-                             ]
-                         )
-                     )
-                 ),
-             )
-
-         return None
-
      def get_env(
          self, system_paasta_config: Optional["SystemPaastaConfig"] = None
      ) -> Dict[str, str]:
@@ -1546,7 +1512,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
          and the service will be removed from smartstack, which is the same effect we get after running hadown.
          """

-         # Everywhere this value is currently used (hacheck sidecar or gunicorn sidecar), we can pretty safely
+         # Everywhere this value is currently used (hacheck sidecar), we can pretty safely
          # assume that the service is in smartstack.
          return self.get_prestop_sleep_seconds(is_in_smartstack=True) + 1

@@ -2306,6 +2272,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
              or self.get_prometheus_port() is not None
              or self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI)
              or self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN)
+             or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD)
          ):
              return "true"
          return "false"
@@ -2458,6 +2425,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
              "paasta.yelp.com/cluster": self.cluster,
              "yelp.com/owner": "compute_infra_platform_experience",
              "paasta.yelp.com/managed": "true",
+             # NOTE: this is mostly here for autoscaling purposes: we use information from the deploy group
+             # during Prometheus relabeling - but it's not a bad label to have around in general, thus its
+             # inclusion here
+             "paasta.yelp.com/deploy_group": self.get_deploy_group(),
          }
          if service_namespace_config.is_in_smartstack():
              labels["paasta.yelp.com/weight"] = str(self.get_weight())
@@ -2483,22 +2454,13 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):

          # not all services use autoscaling, so we label those that do in order to have
          # prometheus selectively discover/scrape them
-         if self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI):
-             # UWSGI no longer needs a label to indicate it needs to be scraped as all pods are checked for the uwsgi stats port by our centralized uwsgi-exporter
-             # But we do still need deploy_group for relabeling properly
-             # this should probably eventually be made into a default label,
-             # but for now we're fine with it being behind these feature toggles.
-             # ideally, we'd also have the docker image here for ease-of-use
-             # in Prometheus relabeling, but that information is over the
-             # character limit for k8s labels (63 chars)
-             labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
-
-         elif self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
-             labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
+         # NOTE: these are not mutually exclusive as a service could use multiple autoscaling types
+         if self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
              labels["paasta.yelp.com/scrape_piscina_prometheus"] = "true"

-         elif self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
-             labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
+         if self.should_use_metrics_provider(
+             METRICS_PROVIDER_GUNICORN
+         ) or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD):
              labels["paasta.yelp.com/scrape_gunicorn_prometheus"] = "true"

          # the default AWS LB Controller behavior is to enable this by-namespace
@@ -3030,7 +2992,7 @@ def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> No
              name="paasta-api-server-per-namespace",
          ),
          subjects=[
-             V1Subject(
+             RbacV1Subject(
                  kind="User",
                  name="yelp.com/paasta-api-server",
              ),
@@ -3412,21 +3374,26 @@ def pod_disruption_budget_for_service_instance(
      instance: str,
      max_unavailable: Union[str, int],
      namespace: str,
+     unhealthy_pod_eviction_policy: str,
  ) -> V1PodDisruptionBudget:
+     selector = V1LabelSelector(
+         match_labels={
+             "paasta.yelp.com/service": service,
+             "paasta.yelp.com/instance": instance,
+         }
+     )
+     spec = V1PodDisruptionBudgetSpec(
+         max_unavailable=max_unavailable,
+         unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
+         selector=selector,
+     )
+
      return V1PodDisruptionBudget(
          metadata=V1ObjectMeta(
              name=get_kubernetes_app_name(service, instance),
              namespace=namespace,
          ),
-         spec=V1PodDisruptionBudgetSpec(
-             max_unavailable=max_unavailable,
-             selector=V1LabelSelector(
-                 match_labels={
-                     "paasta.yelp.com/service": service,
-                     "paasta.yelp.com/instance": instance,
-                 }
-             ),
-         ),
+         spec=spec,
      )


@@ -4210,6 +4177,10 @@ def create_pod_topology_spread_constraints(
                  when_unsatisfiable=constraint.get(
                      "when_unsatisfiable", "ScheduleAnyway"
                  ),
+                 # we might want to default this to someting else in the future
+                 # but for now, make this opt-in
+                 # (null or empty list means only match against the labelSelector)
+                 match_label_keys=constraint.get("match_label_keys", None),
              )
          )

@@ -4413,7 +4384,7 @@ def ensure_service_account(
                      name=k8s_role,
                  ),
                  subjects=[
-                     V1Subject(
+                     RbacV1Subject(
                          kind="ServiceAccount",
                          namespace=namespace,
                          name=sa_name,
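
For reference, a sketch of calling pod_disruption_budget_for_service_instance() with the new keyword, using the signature shown above. It is not part of the diff; the service, instance, and namespace values are made up, and it assumes a kubernetes python client new enough for V1PodDisruptionBudgetSpec to expose unhealthy_pod_eviction_policy.

    # Sketch only: construct a PDB carrying the new eviction policy.
    from paasta_tools.kubernetes_tools import pod_disruption_budget_for_service_instance

    pdb = pod_disruption_budget_for_service_instance(
        service="example-service",  # hypothetical
        instance="main",  # hypothetical
        max_unavailable="5%",  # e.g. derived upstream from bounce_margin_factor=0.95
        namespace="paastasvc-example-service",  # hypothetical
        unhealthy_pod_eviction_policy="AlwaysAllow",  # or the "IfHealthyBudget" default
    )
    print(pdb.spec.max_unavailable, pdb.spec.unhealthy_pod_eviction_policy)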

paasta_tools/long_running_service_tools.py

@@ -41,6 +41,7 @@ DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
  DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
  DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
  DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
+ DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800

  METRICS_PROVIDER_CPU = "cpu"
  METRICS_PROVIDER_UWSGI = "uwsgi"
@@ -48,7 +49,8 @@ METRICS_PROVIDER_UWSGI_V2 = "uwsgi-v2"
  METRICS_PROVIDER_GUNICORN = "gunicorn"
  METRICS_PROVIDER_PISCINA = "piscina"
  METRICS_PROVIDER_ACTIVE_REQUESTS = "active-requests"
- METRICS_PROVIDER_PROMQL = "arbitrary_promql"
+ METRICS_PROVIDER_PROMQL = "arbitrary-promql"
+ METRICS_PROVIDER_WORKER_LOAD = "worker-load"

  ALL_METRICS_PROVIDERS = [
      METRICS_PROVIDER_CPU,
@@ -58,6 +60,7 @@ ALL_METRICS_PROVIDERS = [
      METRICS_PROVIDER_PISCINA,
      METRICS_PROVIDER_ACTIVE_REQUESTS,
      METRICS_PROVIDER_PROMQL,
+     METRICS_PROVIDER_WORKER_LOAD,
  ]


@@ -85,6 +88,7 @@ class LongRunningServiceConfigDict(InstanceConfigDict, total=False):
      bounce_margin_factor: float
      should_ping_for_unhealthy_pods: bool
      weight: int
+     unhealthy_pod_eviction_policy: str


  class ServiceNamespaceConfig(dict):
@@ -410,6 +414,9 @@ class LongRunningServiceConfig(InstanceConfig):
      def get_bounce_margin_factor(self) -> float:
          return self.config_dict.get("bounce_margin_factor", 0.95)

+     def get_unhealthy_pod_eviction_policy(self) -> str:
+         return self.config_dict.get("unhealthy_pod_eviction_policy", "IfHealthyBudget")
+
      def get_should_ping_for_unhealthy_pods(self, default: bool) -> bool:
          return self.config_dict.get("should_ping_for_unhealthy_pods", default)


paasta_tools/paastaapi/model/kubernetes_version.py

@@ -81,6 +81,7 @@ class KubernetesVersion(ModelNormal):
          """
          lazy_import()
          return {
+             'container_port': (int,), # noqa: E501
              'type': (str,), # noqa: E501
              'create_timestamp': (float,), # noqa: E501
              'git_sha': (str,), # noqa: E501
@@ -99,6 +100,7 @@ class KubernetesVersion(ModelNormal):


      attribute_map = {
+         'container_port': 'container_port', # noqa: E501
          'type': 'type', # noqa: E501
          'create_timestamp': 'create_timestamp', # noqa: E501
          'git_sha': 'git_sha', # noqa: E501
@@ -157,6 +159,7 @@ class KubernetesVersion(ModelNormal):
                  Animal class but this time we won't travel
                  through its discriminator because we passed in
                  _visited_composed_classes = (Animal,)
+             container_port (int): Port the container is expecting to receive traffic on. [optional] # noqa: E501
              type (str): Type of version (ReplicaSet or ControllerRevision). [optional] # noqa: E501
              create_timestamp (float): Unix timestamp when version was created. [optional] # noqa: E501
              git_sha (str): Git SHA of service code for this version of the instance. [optional] # noqa: E501

paasta_tools/setup_prometheus_adapter_config.py

@@ -53,6 +53,9 @@ from paasta_tools.long_running_service_tools import (
  from paasta_tools.long_running_service_tools import (
      DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW,
  )
+ from paasta_tools.long_running_service_tools import (
+     DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+ )
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_ACTIVE_REQUESTS
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_CPU
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_GUNICORN
@@ -60,6 +63,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
  from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
+ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
  from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
  from paasta_tools.utils import DEFAULT_SOA_DIR
  from paasta_tools.utils import get_services_for_cluster
@@ -214,6 +218,10 @@ def create_instance_scaling_rule(
          return create_instance_uwsgi_v2_scaling_rule(
              service, instance_config, metrics_provider_config, paasta_cluster
          )
+     if metrics_provider_config["type"] == METRICS_PROVIDER_WORKER_LOAD:
+         return create_instance_worker_load_scaling_rule(
+             service, instance_config, metrics_provider_config, paasta_cluster
+         )
      if metrics_provider_config["type"] == METRICS_PROVIDER_PISCINA:
          return create_instance_piscina_scaling_rule(
              service, instance_config, metrics_provider_config, paasta_cluster
@@ -523,6 +531,80 @@ def create_instance_uwsgi_v2_scaling_rule(
      }


+ def create_instance_worker_load_scaling_rule(
+     service: str,
+     instance_config: KubernetesDeploymentConfig,
+     metrics_provider_config: MetricsProviderDict,
+     paasta_cluster: str,
+ ) -> PrometheusAdapterRule:
+     """
+     Creates a Prometheus adapter rule config for a given service instance using generic worker_busy metric.
+     """
+     instance = instance_config.instance
+     moving_average_window = metrics_provider_config.get(
+         "moving_average_window_seconds",
+         DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+     )
+     deployment_name = get_kubernetes_app_name(service=service, instance=instance)
+
+     # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to
+     # make sure that the deployment in the new namespace is scaled up enough to handle _all_ the load.
+     # This is because once the new deployment is 100% healthy, cleanup_kubernetes_job will delete the deployment out of
+     # the old namespace all at once, suddenly putting all the load onto the deployment in the new namespace.
+     # To ensure this, we must NOT filter on namespace in worker_filter_terms (which is used when calculating total_load.
+     # This makes sure that desired_instances includes load from all namespaces.
+     worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
+
+     # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
+     # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
+     # deployment.
+     ready_pods = f"""
+         (sum(
+             k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
+             or
+             max_over_time(
+                 k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
+             )
+         ) by (kube_deployment))
+     """
+     load_per_instance = f"""
+         avg(
+             worker_busy{{{worker_filter_terms}}}
+         ) by (kube_pod, kube_deployment)
+     """
+     missing_instances = f"""
+         clamp_min(
+             {ready_pods} - count({load_per_instance}) by (kube_deployment),
+             0
+         )
+     """
+     total_load = f"""
+         (
+             sum(
+                 {load_per_instance}
+             ) by (kube_deployment)
+             +
+             {missing_instances}
+         )
+     """
+     total_load_smoothed = f"""
+         avg_over_time(
+             (
+                 {total_load}
+             )[{moving_average_window}s:]
+         )
+     """
+
+     metric_name = f"{deployment_name}-worker-load-prom"
+
+     return {
+         "name": {"as": metric_name},
+         "seriesQuery": f"worker_busy{{{worker_filter_terms}}}",
+         "resources": {"template": "kube_<<.Resource>>"},
+         "metricsQuery": _minify_promql(total_load_smoothed),
+     }
+
+
  def create_instance_piscina_scaling_rule(
      service: str,
      instance_config: KubernetesDeploymentConfig,
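
A usage sketch for the new worker-load rule generator shown above (not part of the diff): the service, cluster, and instance names are made up, and the KubernetesDeploymentConfig is built with an empty config_dict purely so that .instance is populated.

    # Sketch only: generate the Prometheus adapter rule for a worker-load-autoscaled instance.
    from paasta_tools.kubernetes_tools import KubernetesDeploymentConfig
    from paasta_tools.setup_prometheus_adapter_config import (
        create_instance_worker_load_scaling_rule,
    )

    instance_config = KubernetesDeploymentConfig(
        service="example-service",  # hypothetical
        cluster="example-cluster",  # hypothetical
        instance="main",
        config_dict={},
        branch_dict=None,
    )
    rule = create_instance_worker_load_scaling_rule(
        service="example-service",
        instance_config=instance_config,
        metrics_provider_config={
            "type": "worker-load",
            "moving_average_window_seconds": 1800,  # matches the default above
        },
        paasta_cluster="example-cluster",
    )
    # rule["seriesQuery"] selects worker_busy for this cluster/service/instance;
    # rule["metricsQuery"] is the minified, smoothed total-load expression built above.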

paasta_tools/tron_tools.py

@@ -506,6 +506,9 @@ class TronActionConfig(InstanceConfig):
          # XXX: update PAASTA_RESOURCE_* env vars to use the correct value from spark_args and set
          # these to the correct values for the executors as part of the driver commandline

+         # our internal Spark configuration service needs this to determine if any special behavior is required
+         env["SPARK_DRIVER_TYPE"] = "tron"
+
          return env

      def get_iam_role(self) -> str: