paasta-tools 1.30.9__py3-none-any.whl → 1.35.8__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- paasta_tools/__init__.py +1 -1
- paasta_tools/api/api_docs/swagger.json +5 -0
- paasta_tools/cli/cmds/autoscale.py +2 -0
- paasta_tools/cli/cmds/check.py +2 -0
- paasta_tools/cli/cmds/cook_image.py +2 -0
- paasta_tools/cli/cmds/get_docker_image.py +2 -0
- paasta_tools/cli/cmds/get_image_version.py +2 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +2 -0
- paasta_tools/cli/cmds/info.py +5 -1
- paasta_tools/cli/cmds/itest.py +2 -0
- paasta_tools/cli/cmds/list_namespaces.py +2 -0
- paasta_tools/cli/cmds/local_run.py +116 -24
- paasta_tools/cli/cmds/logs.py +2 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +12 -2
- paasta_tools/cli/cmds/mesh_status.py +2 -1
- paasta_tools/cli/cmds/push_to_registry.py +2 -0
- paasta_tools/cli/cmds/remote_run.py +10 -0
- paasta_tools/cli/cmds/rollback.py +5 -1
- paasta_tools/cli/cmds/secret.py +4 -2
- paasta_tools/cli/cmds/security_check.py +2 -0
- paasta_tools/cli/cmds/spark_run.py +4 -0
- paasta_tools/cli/cmds/status.py +35 -8
- paasta_tools/cli/cmds/validate.py +296 -19
- paasta_tools/cli/cmds/wait_for_deployment.py +2 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +3 -2
- paasta_tools/cli/schemas/eks_schema.json +23 -1
- paasta_tools/cli/schemas/smartstack_schema.json +12 -0
- paasta_tools/cli/utils.py +2 -1
- paasta_tools/contrib/paasta_update_soa_memcpu.py +10 -14
- paasta_tools/generate_deployments_for_service.py +2 -0
- paasta_tools/instance/hpa_metrics_parser.py +3 -5
- paasta_tools/instance/kubernetes.py +58 -25
- paasta_tools/kubernetes/application/controller_wrappers.py +23 -2
- paasta_tools/kubernetes/remote_run.py +2 -2
- paasta_tools/kubernetes_tools.py +37 -66
- paasta_tools/long_running_service_tools.py +8 -1
- paasta_tools/paastaapi/model/kubernetes_version.py +3 -0
- paasta_tools/setup_prometheus_adapter_config.py +82 -0
- paasta_tools/tron_tools.py +3 -0
- paasta_tools/utils.py +26 -9
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_deployments_for_service.py +2 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_update_soa_memcpu.py +10 -14
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_prometheus_adapter_config.py +82 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/METADATA +4 -4
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/RECORD +98 -98
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/apply_external_resources.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/bounce_log_latency_parser.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_autoscaler_max_instances.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_cassandracluster_services_replication.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_flink_services_health.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_api.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_services_replication.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_manual_oapi_changes.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_oom_events.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_orphans.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_spark_jobs.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_jobs.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_dynamodb_table.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_paasta_playground.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/delete_kubernetes_deployments.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/emit_allocated_cpu_metrics.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_all_deployments +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_authenticating_services.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_file.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_yaml.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/get_running_task_allocation.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/habitat_fixer.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/ide_helper.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_proxy.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_smartstack.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kill_bad_containers.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kubernetes_remove_evicted_pods.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mass-deploy-tag.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mock_patch_checker.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_remote_run_resources.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_stale_nodes.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_deploy_tron_jobs +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_execute_docker_command.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_secrets_sync.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_tabcomplete.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/render_template.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/rightsizer_soaconfigs_update.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_remove.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_update.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_istio_mesh.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_internal_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_job.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/shared_ip_check.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/synapse_srv_namespaces_fact.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/timeouts_metrics_prom.py +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/WHEEL +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/entry_points.txt +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/licenses/LICENSE +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/top_level.txt +0 -0

paasta_tools/instance/kubernetes.py CHANGED

@@ -1,4 +1,5 @@
 import asyncio
+import logging
 from asyncio.tasks import Task
 from collections import defaultdict
 from enum import Enum
@@ -17,6 +18,7 @@ from typing import Union

 import a_sync
 import pytz
+import requests.exceptions
 from kubernetes.client import V1Container
 from kubernetes.client import V1ControllerRevision
 from kubernetes.client import V1Pod
@@ -75,6 +77,8 @@ INSTANCE_TYPE_CR_ID = dict(
     monkrelaycluster=monkrelaycluster_tools.cr_id,
 )

+logger = logging.getLogger(__name__)
+

 class ServiceMesh(Enum):
     SMARTSTACK = "smartstack"
@@ -100,6 +104,7 @@ class KubernetesVersionDict(TypedDict, total=False):
     config_sha: str
     pods: Sequence[Mapping[str, Any]]
     namespace: str
+    container_port: Optional[int]


 def cr_id(service: str, instance: str, instance_type: str) -> Mapping[str, str]:
@@ -347,31 +352,49 @@ async def mesh_status(

     pods = await pods_task
     for location, hosts in node_hostname_by_location.items():
-        … (25 removed lines, collapsed in the source diff view)
+        max_retries = 3
+
+        for attempt in range(max_retries):
+            host = replication_checker.get_hostname_in_pool(hosts, instance_pool)
+            try:
+                if service_mesh == ServiceMesh.SMARTSTACK:
+                    location_dict = _build_smartstack_location_dict(
+                        synapse_host=host,
+                        synapse_port=settings.system_paasta_config.get_synapse_port(),
+                        synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
+                        registration=registration,
+                        pods=pods,
+                        location=location,
+                        should_return_individual_backends=should_return_individual_backends,
+                    )
+                elif service_mesh == ServiceMesh.ENVOY:
+                    location_dict = _build_envoy_location_dict(
+                        envoy_host=host,
+                        envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
+                        envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
+                        registration=registration,
+                        pods=pods,
+                        location=location,
+                        should_return_individual_backends=should_return_individual_backends,
+                    )
+
+                mesh_status["locations"].append(location_dict)
+                return mesh_status
+
+            except requests.exceptions.ConnectTimeout:
+                if attempt < max_retries - 1:
+                    logger.warning(
+                        "attempt %s/%s: Unable to connect to %s, retrying (on another host, hopefully)...",
+                        attempt,
+                        max_retries,
+                        host,
+                    )
+                    continue
+                else:
+                    logger.critical(
+                        "Unable to connect to %s, not retrying again.", host
+                    )
+                    raise
     return mesh_status


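The rewritten loop retries the lookup up to three times, asking replication_checker for a fresh host before each attempt and re-raising the timeout once retries are exhausted. A distilled, runnable sketch of the same idiom (names simplified; paasta's version logs between attempts and returns the assembled mesh_status on success):

    import requests

    def fetch_with_retries(hosts, fetch, max_retries=3):
        for attempt in range(max_retries):
            # rotate to a different host on each attempt
            host = hosts[attempt % len(hosts)]
            try:
                return fetch(host)
            except requests.exceptions.ConnectTimeout:
                if attempt == max_retries - 1:
                    raise  # out of retries: surface the timeout to the caller
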
@@ -698,6 +721,7 @@ async def kubernetes_status_v2(
                 instance=instance,
                 namespaces=relevant_namespaces,
                 pod_status_by_sha_and_readiness_task=pod_status_by_sha_and_readiness_task,  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                container_port=job_config.get_container_port(),
             )
         )
         tasks.extend([pod_status_by_sha_and_readiness_task, versions_task])  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -717,6 +741,7 @@ async def kubernetes_status_v2(
                 instance=instance,
                 namespaces=relevant_namespaces,
                 pod_status_by_replicaset_task=pod_status_by_replicaset_task,  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                container_port=job_config.get_container_port(),
             )
         )
         tasks.extend([pod_status_by_replicaset_task, versions_task])  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -788,6 +813,7 @@ async def get_versions_for_replicasets(
     instance: str,
     namespaces: Iterable[str],
     pod_status_by_replicaset_task: "asyncio.Future[Mapping[str, Sequence[asyncio.Future[Dict[str, Any]]]]]",
+    container_port: Optional[int],
 ) -> List[KubernetesVersionDict]:

     replicaset_list: List[V1ReplicaSet] = []
@@ -815,6 +841,7 @@ async def get_versions_for_replicasets(
             replicaset,
             kube_client,
             pod_status_by_replicaset.get(replicaset.metadata.name),
+            container_port,
         )
         for replicaset in actually_running_replicasets
     ]
@@ -826,6 +853,7 @@ async def get_replicaset_status(
     replicaset: V1ReplicaSet,
     client: kubernetes_tools.KubeClient,
     pod_status_tasks: Sequence["asyncio.Future[Dict[str, Any]]"],
+    container_port: Optional[int],
 ) -> KubernetesVersionDict:
     return {
         "name": replicaset.metadata.name,
@@ -840,6 +868,7 @@ async def get_replicaset_status(
         "config_sha": replicaset.metadata.labels.get("paasta.yelp.com/config_sha"),
         "pods": await asyncio.gather(*pod_status_tasks) if pod_status_tasks else [],
         "namespace": replicaset.metadata.namespace,
+        "container_port": container_port,
     }


@@ -1063,6 +1092,7 @@ async def get_versions_for_controller_revisions(
     instance: str,
     namespaces: Iterable[str],
     pod_status_by_sha_and_readiness_task: "asyncio.Future[Mapping[Tuple[str, str], Mapping[bool, Sequence[asyncio.Future[Mapping[str, Any]]]]]]",
+    container_port: Optional[int] = None,
 ) -> List[KubernetesVersionDict]:
     controller_revision_list: List[V1ControllerRevision] = []

@@ -1092,6 +1122,7 @@ async def get_versions_for_controller_revisions(
             cr,
             kube_client,
             pod_status_by_sha_and_readiness[(git_sha, config_sha)],
+            container_port=container_port,
         )
         for (git_sha, config_sha), cr in cr_by_shas.items()
     ]
@@ -1106,6 +1137,7 @@ async def get_version_for_controller_revision(
     pod_status_tasks_by_readiness: Mapping[
         bool, Sequence["asyncio.Future[Mapping[str, Any]]"]
     ],
+    container_port: Optional[int] = None,
 ) -> KubernetesVersionDict:
     all_pod_status_tasks = [
         task for tasks in pod_status_tasks_by_readiness.values() for task in tasks
@@ -1122,6 +1154,7 @@ async def get_version_for_controller_revision(
         "config_sha": cr.metadata.labels.get("paasta.yelp.com/config_sha"),
         "pods": [task.result() for task in all_pod_status_tasks],
         "namespace": cr.metadata.namespace,
+        "container_port": container_port,
     }



paasta_tools/kubernetes/application/controller_wrappers.py CHANGED

@@ -173,19 +173,31 @@ class Application(ABC):
         self, kube_client: KubeClient, namespace: str
     ) -> V1PodDisruptionBudget:
         max_unavailable: Union[str, int]
+
+        system_paasta_config = load_system_paasta_config()
+
         if "bounce_margin_factor" in self.soa_config.config_dict:
             max_unavailable = (
                 f"{int((1 - self.soa_config.get_bounce_margin_factor()) * 100)}%"
             )
         else:
-            system_paasta_config = load_system_paasta_config()
             max_unavailable = system_paasta_config.get_pdb_max_unavailable()

+        if "unhealthy_pod_eviction_policy" in self.soa_config.config_dict:
+            unhealthy_pod_eviction_policy = (
+                self.soa_config.get_unhealthy_pod_eviction_policy()
+            )
+        else:
+            unhealthy_pod_eviction_policy = (
+                system_paasta_config.get_unhealthy_pod_eviction_policy()
+            )
+
         pdr = pod_disruption_budget_for_service_instance(
             service=self.kube_deployment.service,
             instance=self.kube_deployment.instance,
             max_unavailable=max_unavailable,
             namespace=namespace,
+            unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
         )
         try:
             existing_pdr = kube_client.policy.read_namespaced_pod_disruption_budget(
@@ -198,12 +210,21 @@ class Application(ABC):
             raise

         if existing_pdr:
+            """
+            Update the pod disruption budget only if spec.max_unavailable
+            or spec.unhealthy_pod_eviction_policy have changed;
+            ignore changes to other fields
+            """
             if existing_pdr.spec.min_available is not None:
                 logging.info(
                     "Not updating poddisruptionbudget: can't have both "
                     "min_available and max_unavailable"
                 )
-            elif existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable:
+            elif (
+                existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable
+                or existing_pdr.spec.unhealthy_pod_eviction_policy
+                != pdr.spec.unhealthy_pod_eviction_policy
+            ):
                 logging.info(f"Updating poddisruptionbudget {pdr.metadata.name}")
                 return kube_client.policy.patch_namespaced_pod_disruption_budget(
                     name=pdr.metadata.name, namespace=pdr.metadata.namespace, body=pdr
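Together with the kubernetes_tools.py and long_running_service_tools.py hunks below, the net behavior is that an instance-level unhealthy_pod_eviction_policy in soa-configs takes precedence over the system-wide default. A minimal sketch of that precedence, with plain dicts standing in for the config objects (values illustrative; "IfHealthyBudget" is the default named in this diff):

    # per-instance soa-config override wins over the system default
    soa_config = {"unhealthy_pod_eviction_policy": "AlwaysAllow"}
    system_default = "IfHealthyBudget"

    if "unhealthy_pod_eviction_policy" in soa_config:
        policy = soa_config["unhealthy_pod_eviction_policy"]
    else:
        policy = system_default

    assert policy == "AlwaysAllow"
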
paasta_tools/kubernetes/remote_run.py CHANGED

@@ -20,6 +20,7 @@ from typing import Sequence
 from typing import TypedDict

 from kubernetes.client import AuthenticationV1TokenRequest
+from kubernetes.client import RbacV1Subject
 from kubernetes.client import V1Job
 from kubernetes.client import V1ObjectMeta
 from kubernetes.client import V1Pod
@@ -28,7 +29,6 @@ from kubernetes.client import V1Role
 from kubernetes.client import V1RoleBinding
 from kubernetes.client import V1RoleRef
 from kubernetes.client import V1ServiceAccount
-from kubernetes.client import V1Subject
 from kubernetes.client import V1TokenRequestSpec
 from kubernetes.client.exceptions import ApiException

@@ -522,7 +522,7 @@ def bind_role_to_service_account(
             name=role,
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="ServiceAccount",
                 name=service_account,
             ),

paasta_tools/kubernetes_tools.py CHANGED

@@ -50,6 +50,7 @@ from kubernetes import client as kube_client
 from kubernetes import config as kube_config
 from kubernetes.client import CoreV1Event
 from kubernetes.client import models
+from kubernetes.client import RbacV1Subject
 from kubernetes.client import V1Affinity
 from kubernetes.client import V1AWSElasticBlockStoreVolumeSource
 from kubernetes.client import V1Capabilities
@@ -113,7 +114,6 @@ from kubernetes.client import V1ServiceAccount
 from kubernetes.client import V1ServiceAccountTokenProjection
 from kubernetes.client import V1StatefulSet
 from kubernetes.client import V1StatefulSetSpec
-from kubernetes.client import V1Subject
 from kubernetes.client import V1TCPSocketAction
 from kubernetes.client import V1TopologySpreadConstraint
 from kubernetes.client import V1Volume
@@ -151,6 +151,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
+from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
 from paasta_tools.long_running_service_tools import ServiceNamespaceConfig
 from paasta_tools.secret_tools import get_secret_name_from_ref
 from paasta_tools.secret_tools import is_secret_ref
@@ -195,10 +196,8 @@ KUBE_DEPLOY_STATEGY_MAP = {
     "brutal": "RollingUpdate",
 }
 HACHECK_POD_NAME = "hacheck"
-GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter"
 SIDECAR_CONTAINER_NAMES = [
     HACHECK_POD_NAME,
-    GUNICORN_EXPORTER_POD_NAME,
 ]
 KUBERNETES_NAMESPACE = "paasta"
 PAASTA_WORKLOAD_OWNER = "compute_infra_platform_experience"
@@ -876,7 +875,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
                     ),
                 ),
             )
-        elif provider["type"] == METRICS_PROVIDER_UWSGI_V2:
+        elif provider["type"] in {
+            METRICS_PROVIDER_UWSGI_V2,
+            METRICS_PROVIDER_WORKER_LOAD,
+        }:
             return V2MetricSpec(
                 type="Object",
                 object=V2ObjectMetricSource(
@@ -1072,15 +1074,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             service_namespace_config,
             hacheck_sidecar_volumes,
         )
-        gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container(
-            system_paasta_config
-        )

         sidecars = []
         if hacheck_container:
             sidecars.append(hacheck_container)
-        if gunicorn_exporter_container:
-            sidecars.append(gunicorn_exporter_container)
         return sidecars

     def get_readiness_check_prefix(
@@ -1168,37 +1165,6 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
         )
         return None

-    def get_gunicorn_exporter_sidecar_container(
-        self,
-        system_paasta_config: SystemPaastaConfig,
-    ) -> Optional[V1Container]:
-
-        if self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
-            return V1Container(
-                image=system_paasta_config.get_gunicorn_exporter_sidecar_image_url(),
-                resources=self.get_sidecar_resource_requirements(
-                    "gunicorn_exporter", system_paasta_config
-                ),
-                name=GUNICORN_EXPORTER_POD_NAME,
-                env=self.get_kubernetes_environment(),
-                ports=[V1ContainerPort(container_port=9117)],
-                lifecycle=V1Lifecycle(
-                    pre_stop=V1LifecycleHandler(
-                        _exec=V1ExecAction(
-                            command=[
-                                "/bin/sh",
-                                "-c",
-                                # we sleep for the same amount of time as we do after an hadown to ensure that we have accurate
-                                # metrics up until our Pod dies
-                                f"sleep {self.get_hacheck_prestop_sleep_seconds()}",
-                            ]
-                        )
-                    )
-                ),
-            )
-
-        return None
-
     def get_env(
         self, system_paasta_config: Optional["SystemPaastaConfig"] = None
     ) -> Dict[str, str]:
@@ -1546,7 +1512,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
        and the service will be removed from smartstack, which is the same effect we get after running hadown.
        """

-        # Everywhere this value is currently used (hacheck sidecar
+        # Everywhere this value is currently used (hacheck sidecar), we can pretty safely
         # assume that the service is in smartstack.
         return self.get_prestop_sleep_seconds(is_in_smartstack=True) + 1

@@ -2306,6 +2272,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             or self.get_prometheus_port() is not None
             or self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI)
             or self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN)
+            or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD)
         ):
             return "true"
         return "false"
@@ -2458,6 +2425,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             "paasta.yelp.com/cluster": self.cluster,
             "yelp.com/owner": "compute_infra_platform_experience",
             "paasta.yelp.com/managed": "true",
+            # NOTE: this is mostly here for autoscaling purposes: we use information from the deploy group
+            # during Prometheus relabeling - but it's not a bad label to have around in general, thus its
+            # inclusion here
+            "paasta.yelp.com/deploy_group": self.get_deploy_group(),
         }
         if service_namespace_config.is_in_smartstack():
             labels["paasta.yelp.com/weight"] = str(self.get_weight())
@@ -2483,22 +2454,13 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):

         # not all services use autoscaling, so we label those that do in order to have
         # prometheus selectively discover/scrape them
-        … (2 removed lines, collapsed in the source diff view)
-            # But we do still need deploy_group for relabeling properly
-            # this should probably eventually be made into a default label,
-            # but for now we're fine with it being behind these feature toggles.
-            # ideally, we'd also have the docker image here for ease-of-use
-            # in Prometheus relabeling, but that information is over the
-            # character limit for k8s labels (63 chars)
-            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
-
-        elif self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
-            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
+        # NOTE: these are not mutually exclusive as a service could use multiple autoscaling types
+        if self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
             labels["paasta.yelp.com/scrape_piscina_prometheus"] = "true"

-        … (2 removed lines, collapsed in the source diff view)
+        if self.should_use_metrics_provider(
+            METRICS_PROVIDER_GUNICORN
+        ) or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD):
             labels["paasta.yelp.com/scrape_gunicorn_prometheus"] = "true"

         # the default AWS LB Controller behavior is to enable this by-namespace
@@ -3030,7 +2992,7 @@ def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> None:
             name="paasta-api-server-per-namespace",
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="User",
                 name="yelp.com/paasta-api-server",
             ),
@@ -3412,21 +3374,26 @@ def pod_disruption_budget_for_service_instance(
     instance: str,
     max_unavailable: Union[str, int],
     namespace: str,
+    unhealthy_pod_eviction_policy: str,
 ) -> V1PodDisruptionBudget:
+    selector = V1LabelSelector(
+        match_labels={
+            "paasta.yelp.com/service": service,
+            "paasta.yelp.com/instance": instance,
+        }
+    )
+    spec = V1PodDisruptionBudgetSpec(
+        max_unavailable=max_unavailable,
+        unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
+        selector=selector,
+    )
+
     return V1PodDisruptionBudget(
         metadata=V1ObjectMeta(
             name=get_kubernetes_app_name(service, instance),
             namespace=namespace,
         ),
-        spec=V1PodDisruptionBudgetSpec(
-            max_unavailable=max_unavailable,
-            selector=V1LabelSelector(
-                match_labels={
-                    "paasta.yelp.com/service": service,
-                    "paasta.yelp.com/instance": instance,
-                }
-            ),
-        ),
+        spec=spec,
     )


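A hedged usage sketch of the updated helper; the signature matches the hunk above, while the service, instance, and policy values are illustrative:

    from paasta_tools.kubernetes_tools import pod_disruption_budget_for_service_instance

    pdb = pod_disruption_budget_for_service_instance(
        service="example",
        instance="main",
        max_unavailable="5%",
        namespace="paasta",
        unhealthy_pod_eviction_policy="AlwaysAllow",
    )
    # the returned V1PodDisruptionBudget selects pods via the
    # paasta.yelp.com/service and paasta.yelp.com/instance labels
    print(pdb.spec.unhealthy_pod_eviction_policy)
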
@@ -4210,6 +4177,10 @@ def create_pod_topology_spread_constraints(
                     when_unsatisfiable=constraint.get(
                         "when_unsatisfiable", "ScheduleAnyway"
                     ),
+                    # we might want to default this to something else in the future
+                    # but for now, make this opt-in
+                    # (null or empty list means only match against the labelSelector)
+                    match_label_keys=constraint.get("match_label_keys", None),
                 )
             )

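For illustration, a constraint dict that create_pod_topology_spread_constraints could now consume; only when_unsatisfiable and match_label_keys appear in the hunk, so the remaining key names are assumptions about the config shape:

    # hypothetical soa-config style topology spread constraint
    constraint = {
        "topology_key": "kubernetes.io/hostname",  # assumed key name
        "max_skew": 1,                             # assumed key name
        "when_unsatisfiable": "ScheduleAnyway",
        # opt-in per the new code; None/empty means match on the labelSelector only
        "match_label_keys": ["pod-template-hash"],
    }
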
@@ -4413,7 +4384,7 @@ def ensure_service_account(
             name=k8s_role,
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="ServiceAccount",
                 namespace=namespace,
                 name=sa_name,

paasta_tools/long_running_service_tools.py CHANGED

@@ -41,6 +41,7 @@ DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
+DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800

 METRICS_PROVIDER_CPU = "cpu"
 METRICS_PROVIDER_UWSGI = "uwsgi"
@@ -48,7 +49,8 @@ METRICS_PROVIDER_UWSGI_V2 = "uwsgi-v2"
 METRICS_PROVIDER_GUNICORN = "gunicorn"
 METRICS_PROVIDER_PISCINA = "piscina"
 METRICS_PROVIDER_ACTIVE_REQUESTS = "active-requests"
-METRICS_PROVIDER_PROMQL = "
+METRICS_PROVIDER_PROMQL = "arbitrary-promql"
+METRICS_PROVIDER_WORKER_LOAD = "worker-load"

 ALL_METRICS_PROVIDERS = [
     METRICS_PROVIDER_CPU,
@@ -58,6 +60,7 @@ ALL_METRICS_PROVIDERS = [
     METRICS_PROVIDER_PISCINA,
     METRICS_PROVIDER_ACTIVE_REQUESTS,
     METRICS_PROVIDER_PROMQL,
+    METRICS_PROVIDER_WORKER_LOAD,
 ]


@@ -85,6 +88,7 @@ class LongRunningServiceConfigDict(InstanceConfigDict, total=False):
     bounce_margin_factor: float
     should_ping_for_unhealthy_pods: bool
     weight: int
+    unhealthy_pod_eviction_policy: str


 class ServiceNamespaceConfig(dict):
@@ -410,6 +414,9 @@ class LongRunningServiceConfig(InstanceConfig):
     def get_bounce_margin_factor(self) -> float:
         return self.config_dict.get("bounce_margin_factor", 0.95)

+    def get_unhealthy_pod_eviction_policy(self) -> str:
+        return self.config_dict.get("unhealthy_pod_eviction_policy", "IfHealthyBudget")
+
     def get_should_ping_for_unhealthy_pods(self, default: bool) -> bool:
         return self.config_dict.get("should_ping_for_unhealthy_pods", default)


paasta_tools/paastaapi/model/kubernetes_version.py CHANGED

@@ -81,6 +81,7 @@ class KubernetesVersion(ModelNormal):
        """
        lazy_import()
        return {
+           'container_port': (int,),  # noqa: E501
           'type': (str,),  # noqa: E501
           'create_timestamp': (float,),  # noqa: E501
           'git_sha': (str,),  # noqa: E501
@@ -99,6 +100,7 @@ class KubernetesVersion(ModelNormal):


    attribute_map = {
+       'container_port': 'container_port',  # noqa: E501
       'type': 'type',  # noqa: E501
       'create_timestamp': 'create_timestamp',  # noqa: E501
       'git_sha': 'git_sha',  # noqa: E501
@@ -157,6 +159,7 @@ class KubernetesVersion(ModelNormal):
                 Animal class but this time we won't travel
                 through its discriminator because we passed in
                 _visited_composed_classes = (Animal,)
+            container_port (int): Port the container is expecting to receive traffic on. [optional]  # noqa: E501
             type (str): Type of version (ReplicaSet or ControllerRevision). [optional]  # noqa: E501
             create_timestamp (float): Unix timestamp when version was created. [optional]  # noqa: E501
             git_sha (str): Git SHA of service code for this version of the instance. [optional]  # noqa: E501
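A usage sketch of the regenerated API model with the new attribute; KubernetesVersion is an OpenAPI-generated ModelNormal, which generally accepts attributes as keyword arguments, though the exact constructor behavior here is an assumption:

    from paasta_tools.paastaapi.model.kubernetes_version import KubernetesVersion

    version = KubernetesVersion(
        type="ReplicaSet",
        git_sha="abc1234",
        container_port=8888,  # the new optional attribute
    )
    print(version.container_port)
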
paasta_tools/setup_prometheus_adapter_config.py CHANGED

@@ -53,6 +53,9 @@ from paasta_tools.long_running_service_tools import (
 from paasta_tools.long_running_service_tools import (
     DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW,
 )
+from paasta_tools.long_running_service_tools import (
+    DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+)
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_ACTIVE_REQUESTS
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_CPU
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_GUNICORN
|
|
@@ -60,6 +63,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
|
|
|
60
63
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
|
|
61
64
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
|
|
62
65
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
|
|
66
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
|
|
63
67
|
from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
|
|
64
68
|
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
65
69
|
from paasta_tools.utils import get_services_for_cluster
|
|
@@ -214,6 +218,10 @@ def create_instance_scaling_rule(
         return create_instance_uwsgi_v2_scaling_rule(
             service, instance_config, metrics_provider_config, paasta_cluster
         )
+    if metrics_provider_config["type"] == METRICS_PROVIDER_WORKER_LOAD:
+        return create_instance_worker_load_scaling_rule(
+            service, instance_config, metrics_provider_config, paasta_cluster
+        )
     if metrics_provider_config["type"] == METRICS_PROVIDER_PISCINA:
         return create_instance_piscina_scaling_rule(
             service, instance_config, metrics_provider_config, paasta_cluster
@@ -523,6 +531,80 @@ def create_instance_uwsgi_v2_scaling_rule(
     }


+def create_instance_worker_load_scaling_rule(
+    service: str,
+    instance_config: KubernetesDeploymentConfig,
+    metrics_provider_config: MetricsProviderDict,
+    paasta_cluster: str,
+) -> PrometheusAdapterRule:
+    """
+    Creates a Prometheus adapter rule config for a given service instance using the generic worker_busy metric.
+    """
+    instance = instance_config.instance
+    moving_average_window = metrics_provider_config.get(
+        "moving_average_window_seconds",
+        DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+    )
+    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
+
+    # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to
+    # make sure that the deployment in the new namespace is scaled up enough to handle _all_ the load.
+    # This is because once the new deployment is 100% healthy, cleanup_kubernetes_job will delete the deployment out of
+    # the old namespace all at once, suddenly putting all the load onto the deployment in the new namespace.
+    # To ensure this, we must NOT filter on namespace in worker_filter_terms (which is used when calculating total_load).
+    # This makes sure that desired_instances includes load from all namespaces.
+    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
+
+    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
+    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
+    # deployment.
+    ready_pods = f"""
+        (sum(
+            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
+            or
+            max_over_time(
+                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
+            )
+        ) by (kube_deployment))
+    """
+    load_per_instance = f"""
+        avg(
+            worker_busy{{{worker_filter_terms}}}
+        ) by (kube_pod, kube_deployment)
+    """
+    missing_instances = f"""
+        clamp_min(
+            {ready_pods} - count({load_per_instance}) by (kube_deployment),
+            0
+        )
+    """
+    total_load = f"""
+        (
+            sum(
+                {load_per_instance}
+            ) by (kube_deployment)
+            +
+            {missing_instances}
+        )
+    """
+    total_load_smoothed = f"""
+        avg_over_time(
+            (
+                {total_load}
+            )[{moving_average_window}s:]
+        )
+    """
+
+    metric_name = f"{deployment_name}-worker-load-prom"
+
+    return {
+        "name": {"as": metric_name},
+        "seriesQuery": f"worker_busy{{{worker_filter_terms}}}",
+        "resources": {"template": "kube_<<.Resource>>"},
+        "metricsQuery": _minify_promql(total_load_smoothed),
+    }
+
+
 def create_instance_piscina_scaling_rule(
     service: str,
     instance_config: KubernetesDeploymentConfig,
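The missing_instances and total_load queries extrapolate load for ready pods that are not yet reporting worker_busy by counting each missing pod as fully busy. A worked example of that arithmetic with assumed numbers (10 ready pods, 8 reporting an average load of 0.6):

    ready_pods = 10        # k8s:deployment:pods_status_ready
    reporting_pods = 8     # pods with a worker_busy series
    avg_load = 0.6         # avg(worker_busy) per reporting pod

    # clamp_min(ready_pods - count(load_per_instance), 0)
    missing_instances = max(ready_pods - reporting_pods, 0)     # -> 2

    # sum(load_per_instance) + missing_instances: unreported pods count as load 1.0
    total_load = avg_load * reporting_pods + missing_instances  # -> 6.8
    print(total_load)
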
paasta_tools/tron_tools.py CHANGED

@@ -506,6 +506,9 @@ class TronActionConfig(InstanceConfig):
         # XXX: update PAASTA_RESOURCE_* env vars to use the correct value from spark_args and set
         # these to the correct values for the executors as part of the driver commandline

+        # our internal Spark configuration service needs this to determine if any special behavior is required
+        env["SPARK_DRIVER_TYPE"] = "tron"
+
         return env

     def get_iam_role(self) -> str: