paasta-tools 1.30.9__py3-none-any.whl → 1.35.8__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- paasta_tools/__init__.py +1 -1
- paasta_tools/api/api_docs/swagger.json +5 -0
- paasta_tools/cli/cmds/autoscale.py +2 -0
- paasta_tools/cli/cmds/check.py +2 -0
- paasta_tools/cli/cmds/cook_image.py +2 -0
- paasta_tools/cli/cmds/get_docker_image.py +2 -0
- paasta_tools/cli/cmds/get_image_version.py +2 -0
- paasta_tools/cli/cmds/get_latest_deployment.py +2 -0
- paasta_tools/cli/cmds/info.py +5 -1
- paasta_tools/cli/cmds/itest.py +2 -0
- paasta_tools/cli/cmds/list_namespaces.py +2 -0
- paasta_tools/cli/cmds/local_run.py +116 -24
- paasta_tools/cli/cmds/logs.py +2 -0
- paasta_tools/cli/cmds/mark_for_deployment.py +12 -2
- paasta_tools/cli/cmds/mesh_status.py +2 -1
- paasta_tools/cli/cmds/push_to_registry.py +2 -0
- paasta_tools/cli/cmds/remote_run.py +10 -0
- paasta_tools/cli/cmds/rollback.py +5 -1
- paasta_tools/cli/cmds/secret.py +4 -2
- paasta_tools/cli/cmds/security_check.py +2 -0
- paasta_tools/cli/cmds/spark_run.py +4 -0
- paasta_tools/cli/cmds/status.py +35 -8
- paasta_tools/cli/cmds/validate.py +296 -19
- paasta_tools/cli/cmds/wait_for_deployment.py +2 -0
- paasta_tools/cli/schemas/autoscaling_schema.json +3 -2
- paasta_tools/cli/schemas/eks_schema.json +23 -1
- paasta_tools/cli/schemas/smartstack_schema.json +12 -0
- paasta_tools/cli/utils.py +2 -1
- paasta_tools/contrib/paasta_update_soa_memcpu.py +10 -14
- paasta_tools/generate_deployments_for_service.py +2 -0
- paasta_tools/instance/hpa_metrics_parser.py +3 -5
- paasta_tools/instance/kubernetes.py +58 -25
- paasta_tools/kubernetes/application/controller_wrappers.py +23 -2
- paasta_tools/kubernetes/remote_run.py +2 -2
- paasta_tools/kubernetes_tools.py +37 -66
- paasta_tools/long_running_service_tools.py +8 -1
- paasta_tools/paastaapi/model/kubernetes_version.py +3 -0
- paasta_tools/setup_prometheus_adapter_config.py +82 -0
- paasta_tools/tron_tools.py +3 -0
- paasta_tools/utils.py +26 -9
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_deployments_for_service.py +2 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_update_soa_memcpu.py +10 -14
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_prometheus_adapter_config.py +82 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/METADATA +4 -4
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/RECORD +98 -98
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/apply_external_resources.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/bounce_log_latency_parser.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_autoscaler_max_instances.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_cassandracluster_services_replication.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_flink_services_health.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_api.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_kubernetes_services_replication.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_manual_oapi_changes.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_oom_events.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_orphans.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/check_spark_jobs.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/cleanup_kubernetes_jobs.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_dynamodb_table.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/create_paasta_playground.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/delete_kubernetes_deployments.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/emit_allocated_cpu_metrics.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_all_deployments +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_authenticating_services.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_file.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/generate_services_yaml.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/get_running_task_allocation.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/habitat_fixer.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/ide_helper.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_proxy.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/is_pod_healthy_in_smartstack.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kill_bad_containers.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/kubernetes_remove_evicted_pods.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mass-deploy-tag.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/mock_patch_checker.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_remote_run_resources.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_cleanup_stale_nodes.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_deploy_tron_jobs +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_execute_docker_command.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_secrets_sync.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/paasta_tabcomplete.sh +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/render_template.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/rightsizer_soaconfigs_update.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_remove.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/service_shard_update.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_istio_mesh.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_internal_crd.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/setup_kubernetes_job.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/shared_ip_check.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/synapse_srv_namespaces_fact.py +0 -0
- {paasta_tools-1.30.9.data → paasta_tools-1.35.8.data}/scripts/timeouts_metrics_prom.py +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/WHEEL +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/entry_points.txt +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/licenses/LICENSE +0 -0
- {paasta_tools-1.30.9.dist-info → paasta_tools-1.35.8.dist-info}/top_level.txt +0 -0

paasta_tools/instance/kubernetes.py CHANGED

@@ -1,4 +1,5 @@
 import asyncio
+import logging
 from asyncio.tasks import Task
 from collections import defaultdict
 from enum import Enum
@@ -17,6 +18,7 @@ from typing import Union

 import a_sync
 import pytz
+import requests.exceptions
 from kubernetes.client import V1Container
 from kubernetes.client import V1ControllerRevision
 from kubernetes.client import V1Pod
@@ -75,6 +77,8 @@ INSTANCE_TYPE_CR_ID = dict(
     monkrelaycluster=monkrelaycluster_tools.cr_id,
 )

+logger = logging.getLogger(__name__)
+

 class ServiceMesh(Enum):
     SMARTSTACK = "smartstack"
@@ -100,6 +104,7 @@ class KubernetesVersionDict(TypedDict, total=False):
     config_sha: str
     pods: Sequence[Mapping[str, Any]]
     namespace: str
+    container_port: Optional[int]


 def cr_id(service: str, instance: str, instance_type: str) -> Mapping[str, str]:
@@ -347,31 +352,49 @@ async def mesh_status(

     pods = await pods_task
     for location, hosts in node_hostname_by_location.items():
-        … (25 removed lines, collapsed in the source diff view)
+        max_retries = 3
+
+        for attempt in range(max_retries):
+            host = replication_checker.get_hostname_in_pool(hosts, instance_pool)
+            try:
+                if service_mesh == ServiceMesh.SMARTSTACK:
+                    location_dict = _build_smartstack_location_dict(
+                        synapse_host=host,
+                        synapse_port=settings.system_paasta_config.get_synapse_port(),
+                        synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
+                        registration=registration,
+                        pods=pods,
+                        location=location,
+                        should_return_individual_backends=should_return_individual_backends,
+                    )
+                elif service_mesh == ServiceMesh.ENVOY:
+                    location_dict = _build_envoy_location_dict(
+                        envoy_host=host,
+                        envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
+                        envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
+                        registration=registration,
+                        pods=pods,
+                        location=location,
+                        should_return_individual_backends=should_return_individual_backends,
+                    )
+
+                mesh_status["locations"].append(location_dict)
+                return mesh_status
+
+            except requests.exceptions.ConnectTimeout:
+                if attempt < max_retries - 1:
+                    logger.warning(
+                        "attempt %s/%s: Unable to connect to %s, retrying (on another host, hopefully)...",
+                        attempt,
+                        max_retries,
+                        host,
+                    )
+                    continue
+                else:
+                    logger.critical(
+                        "Unable to connect to %s, not retrying again.", host
+                    )
+                    raise
     return mesh_status


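The rewritten loop retries the lookup up to three times, asking replication_checker for a fresh host before each attempt and re-raising the timeout once retries are exhausted. A distilled, runnable sketch of the same idiom (names simplified; paasta's version logs between attempts and returns the assembled mesh_status on success):

    import requests

    def fetch_with_retries(hosts, fetch, max_retries=3):
        for attempt in range(max_retries):
            # rotate to a different host on each attempt
            host = hosts[attempt % len(hosts)]
            try:
                return fetch(host)
            except requests.exceptions.ConnectTimeout:
                if attempt == max_retries - 1:
                    raise  # out of retries: surface the timeout to the caller
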
@@ -698,6 +721,7 @@ async def kubernetes_status_v2(
                 instance=instance,
                 namespaces=relevant_namespaces,
                 pod_status_by_sha_and_readiness_task=pod_status_by_sha_and_readiness_task,  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                container_port=job_config.get_container_port(),
             )
         )
         tasks.extend([pod_status_by_sha_and_readiness_task, versions_task])  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -717,6 +741,7 @@ async def kubernetes_status_v2(
                 instance=instance,
                 namespaces=relevant_namespaces,
                 pod_status_by_replicaset_task=pod_status_by_replicaset_task,  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
+                container_port=job_config.get_container_port(),
             )
         )
         tasks.extend([pod_status_by_replicaset_task, versions_task])  # type: ignore # PAASTA-18698; ignoring due to unexpected type mismatch
@@ -788,6 +813,7 @@ async def get_versions_for_replicasets(
     instance: str,
     namespaces: Iterable[str],
     pod_status_by_replicaset_task: "asyncio.Future[Mapping[str, Sequence[asyncio.Future[Dict[str, Any]]]]]",
+    container_port: Optional[int],
 ) -> List[KubernetesVersionDict]:

     replicaset_list: List[V1ReplicaSet] = []
@@ -815,6 +841,7 @@ async def get_versions_for_replicasets(
             replicaset,
             kube_client,
             pod_status_by_replicaset.get(replicaset.metadata.name),
+            container_port,
         )
         for replicaset in actually_running_replicasets
     ]
@@ -826,6 +853,7 @@ async def get_replicaset_status(
     replicaset: V1ReplicaSet,
     client: kubernetes_tools.KubeClient,
     pod_status_tasks: Sequence["asyncio.Future[Dict[str, Any]]"],
+    container_port: Optional[int],
 ) -> KubernetesVersionDict:
     return {
         "name": replicaset.metadata.name,
@@ -840,6 +868,7 @@ async def get_replicaset_status(
         "config_sha": replicaset.metadata.labels.get("paasta.yelp.com/config_sha"),
         "pods": await asyncio.gather(*pod_status_tasks) if pod_status_tasks else [],
         "namespace": replicaset.metadata.namespace,
+        "container_port": container_port,
     }


@@ -1063,6 +1092,7 @@ async def get_versions_for_controller_revisions(
     instance: str,
     namespaces: Iterable[str],
     pod_status_by_sha_and_readiness_task: "asyncio.Future[Mapping[Tuple[str, str], Mapping[bool, Sequence[asyncio.Future[Mapping[str, Any]]]]]]",
+    container_port: Optional[int] = None,
 ) -> List[KubernetesVersionDict]:
     controller_revision_list: List[V1ControllerRevision] = []

@@ -1092,6 +1122,7 @@ async def get_versions_for_controller_revisions(
             cr,
             kube_client,
             pod_status_by_sha_and_readiness[(git_sha, config_sha)],
+            container_port=container_port,
         )
         for (git_sha, config_sha), cr in cr_by_shas.items()
     ]
@@ -1106,6 +1137,7 @@ async def get_version_for_controller_revision(
     pod_status_tasks_by_readiness: Mapping[
         bool, Sequence["asyncio.Future[Mapping[str, Any]]"]
     ],
+    container_port: Optional[int] = None,
 ) -> KubernetesVersionDict:
     all_pod_status_tasks = [
         task for tasks in pod_status_tasks_by_readiness.values() for task in tasks
@@ -1122,6 +1154,7 @@ async def get_version_for_controller_revision(
         "config_sha": cr.metadata.labels.get("paasta.yelp.com/config_sha"),
         "pods": [task.result() for task in all_pod_status_tasks],
         "namespace": cr.metadata.namespace,
+        "container_port": container_port,
     }



paasta_tools/kubernetes/application/controller_wrappers.py CHANGED

@@ -173,19 +173,31 @@ class Application(ABC):
         self, kube_client: KubeClient, namespace: str
     ) -> V1PodDisruptionBudget:
         max_unavailable: Union[str, int]
+
+        system_paasta_config = load_system_paasta_config()
+
         if "bounce_margin_factor" in self.soa_config.config_dict:
             max_unavailable = (
                 f"{int((1 - self.soa_config.get_bounce_margin_factor()) * 100)}%"
             )
         else:
-            system_paasta_config = load_system_paasta_config()
             max_unavailable = system_paasta_config.get_pdb_max_unavailable()

+        if "unhealthy_pod_eviction_policy" in self.soa_config.config_dict:
+            unhealthy_pod_eviction_policy = (
+                self.soa_config.get_unhealthy_pod_eviction_policy()
+            )
+        else:
+            unhealthy_pod_eviction_policy = (
+                system_paasta_config.get_unhealthy_pod_eviction_policy()
+            )
+
         pdr = pod_disruption_budget_for_service_instance(
             service=self.kube_deployment.service,
             instance=self.kube_deployment.instance,
             max_unavailable=max_unavailable,
             namespace=namespace,
+            unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
         )
         try:
             existing_pdr = kube_client.policy.read_namespaced_pod_disruption_budget(
@@ -198,12 +210,21 @@ class Application(ABC):
             raise

         if existing_pdr:
+            """
+            Update the pod disruption budget only if spec.max_unavailable
+            or spec.unhealthy_pod_eviction_policy have changed;
+            ignore changes to other fields
+            """
             if existing_pdr.spec.min_available is not None:
                 logging.info(
                     "Not updating poddisruptionbudget: can't have both "
                     "min_available and max_unavailable"
                 )
-            elif existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable:
+            elif (
+                existing_pdr.spec.max_unavailable != pdr.spec.max_unavailable
+                or existing_pdr.spec.unhealthy_pod_eviction_policy
+                != pdr.spec.unhealthy_pod_eviction_policy
+            ):
                 logging.info(f"Updating poddisruptionbudget {pdr.metadata.name}")
                 return kube_client.policy.patch_namespaced_pod_disruption_budget(
                     name=pdr.metadata.name, namespace=pdr.metadata.namespace, body=pdr
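Together with the kubernetes_tools.py and long_running_service_tools.py hunks below, the net behavior is that an instance-level unhealthy_pod_eviction_policy in soa-configs takes precedence over the system-wide default. A minimal sketch of that precedence, with plain dicts standing in for the config objects (values illustrative; "IfHealthyBudget" is the default named in this diff):

    # per-instance soa-config override wins over the system default
    soa_config = {"unhealthy_pod_eviction_policy": "AlwaysAllow"}
    system_default = "IfHealthyBudget"

    if "unhealthy_pod_eviction_policy" in soa_config:
        policy = soa_config["unhealthy_pod_eviction_policy"]
    else:
        policy = system_default

    assert policy == "AlwaysAllow"
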
paasta_tools/kubernetes/remote_run.py CHANGED

@@ -20,6 +20,7 @@ from typing import Sequence
 from typing import TypedDict

 from kubernetes.client import AuthenticationV1TokenRequest
+from kubernetes.client import RbacV1Subject
 from kubernetes.client import V1Job
 from kubernetes.client import V1ObjectMeta
 from kubernetes.client import V1Pod
@@ -28,7 +29,6 @@ from kubernetes.client import V1Role
 from kubernetes.client import V1RoleBinding
 from kubernetes.client import V1RoleRef
 from kubernetes.client import V1ServiceAccount
-from kubernetes.client import V1Subject
 from kubernetes.client import V1TokenRequestSpec
 from kubernetes.client.exceptions import ApiException

@@ -522,7 +522,7 @@ def bind_role_to_service_account(
             name=role,
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="ServiceAccount",
                 name=service_account,
             ),

paasta_tools/kubernetes_tools.py CHANGED

@@ -50,6 +50,7 @@ from kubernetes import client as kube_client
 from kubernetes import config as kube_config
 from kubernetes.client import CoreV1Event
 from kubernetes.client import models
+from kubernetes.client import RbacV1Subject
 from kubernetes.client import V1Affinity
 from kubernetes.client import V1AWSElasticBlockStoreVolumeSource
 from kubernetes.client import V1Capabilities
@@ -113,7 +114,6 @@ from kubernetes.client import V1ServiceAccount
 from kubernetes.client import V1ServiceAccountTokenProjection
 from kubernetes.client import V1StatefulSet
 from kubernetes.client import V1StatefulSetSpec
-from kubernetes.client import V1Subject
 from kubernetes.client import V1TCPSocketAction
 from kubernetes.client import V1TopologySpreadConstraint
 from kubernetes.client import V1Volume
@@ -151,6 +151,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
+from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
 from paasta_tools.long_running_service_tools import ServiceNamespaceConfig
 from paasta_tools.secret_tools import get_secret_name_from_ref
 from paasta_tools.secret_tools import is_secret_ref
@@ -195,10 +196,8 @@ KUBE_DEPLOY_STATEGY_MAP = {
     "brutal": "RollingUpdate",
 }
 HACHECK_POD_NAME = "hacheck"
-GUNICORN_EXPORTER_POD_NAME = "gunicorn--exporter"
 SIDECAR_CONTAINER_NAMES = [
     HACHECK_POD_NAME,
-    GUNICORN_EXPORTER_POD_NAME,
 ]
 KUBERNETES_NAMESPACE = "paasta"
 PAASTA_WORKLOAD_OWNER = "compute_infra_platform_experience"
@@ -876,7 +875,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
                     ),
                 ),
             )
-        elif provider["type"] == METRICS_PROVIDER_UWSGI_V2:
+        elif provider["type"] in {
+            METRICS_PROVIDER_UWSGI_V2,
+            METRICS_PROVIDER_WORKER_LOAD,
+        }:
             return V2MetricSpec(
                 type="Object",
                 object=V2ObjectMetricSource(
@@ -1072,15 +1074,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             service_namespace_config,
             hacheck_sidecar_volumes,
         )
-        gunicorn_exporter_container = self.get_gunicorn_exporter_sidecar_container(
-            system_paasta_config
-        )

         sidecars = []
         if hacheck_container:
             sidecars.append(hacheck_container)
-        if gunicorn_exporter_container:
-            sidecars.append(gunicorn_exporter_container)
         return sidecars

     def get_readiness_check_prefix(
@@ -1168,37 +1165,6 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
         )
         return None

-    def get_gunicorn_exporter_sidecar_container(
-        self,
-        system_paasta_config: SystemPaastaConfig,
-    ) -> Optional[V1Container]:
-
-        if self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN):
-            return V1Container(
-                image=system_paasta_config.get_gunicorn_exporter_sidecar_image_url(),
-                resources=self.get_sidecar_resource_requirements(
-                    "gunicorn_exporter", system_paasta_config
-                ),
-                name=GUNICORN_EXPORTER_POD_NAME,
-                env=self.get_kubernetes_environment(),
-                ports=[V1ContainerPort(container_port=9117)],
-                lifecycle=V1Lifecycle(
-                    pre_stop=V1LifecycleHandler(
-                        _exec=V1ExecAction(
-                            command=[
-                                "/bin/sh",
-                                "-c",
-                                # we sleep for the same amount of time as we do after an hadown to ensure that we have accurate
-                                # metrics up until our Pod dies
-                                f"sleep {self.get_hacheck_prestop_sleep_seconds()}",
-                            ]
-                        )
-                    )
-                ),
-            )
-
-        return None
-
     def get_env(
         self, system_paasta_config: Optional["SystemPaastaConfig"] = None
     ) -> Dict[str, str]:
@@ -1546,7 +1512,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
        and the service will be removed from smartstack, which is the same effect we get after running hadown.
        """

-        # Everywhere this value is currently used (hacheck sidecar
+        # Everywhere this value is currently used (hacheck sidecar), we can pretty safely
         # assume that the service is in smartstack.
         return self.get_prestop_sleep_seconds(is_in_smartstack=True) + 1

@@ -2306,6 +2272,7 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             or self.get_prometheus_port() is not None
             or self.should_use_metrics_provider(METRICS_PROVIDER_UWSGI)
             or self.should_use_metrics_provider(METRICS_PROVIDER_GUNICORN)
+            or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD)
         ):
             return "true"
         return "false"
@@ -2458,6 +2425,10 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):
             "paasta.yelp.com/cluster": self.cluster,
             "yelp.com/owner": "compute_infra_platform_experience",
             "paasta.yelp.com/managed": "true",
+            # NOTE: this is mostly here for autoscaling purposes: we use information from the deploy group
+            # during Prometheus relabeling - but it's not a bad label to have around in general, thus its
+            # inclusion here
+            "paasta.yelp.com/deploy_group": self.get_deploy_group(),
         }
         if service_namespace_config.is_in_smartstack():
             labels["paasta.yelp.com/weight"] = str(self.get_weight())
@@ -2483,22 +2454,13 @@ class KubernetesDeploymentConfig(LongRunningServiceConfig):

         # not all services use autoscaling, so we label those that do in order to have
         # prometheus selectively discover/scrape them
-        … (2 removed lines, collapsed in the source diff view)
-            # But we do still need deploy_group for relabeling properly
-            # this should probably eventually be made into a default label,
-            # but for now we're fine with it being behind these feature toggles.
-            # ideally, we'd also have the docker image here for ease-of-use
-            # in Prometheus relabeling, but that information is over the
-            # character limit for k8s labels (63 chars)
-            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
-
-        elif self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
-            labels["paasta.yelp.com/deploy_group"] = self.get_deploy_group()
+        # NOTE: these are not mutually exclusive as a service could use multiple autoscaling types
+        if self.should_use_metrics_provider(METRICS_PROVIDER_PISCINA):
             labels["paasta.yelp.com/scrape_piscina_prometheus"] = "true"

-        … (2 removed lines, collapsed in the source diff view)
+        if self.should_use_metrics_provider(
+            METRICS_PROVIDER_GUNICORN
+        ) or self.should_use_metrics_provider(METRICS_PROVIDER_WORKER_LOAD):
             labels["paasta.yelp.com/scrape_gunicorn_prometheus"] = "true"

         # the default AWS LB Controller behavior is to enable this by-namespace
@@ -3030,7 +2992,7 @@ def ensure_paasta_api_rolebinding(kube_client: KubeClient, namespace: str) -> None:
             name="paasta-api-server-per-namespace",
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="User",
                 name="yelp.com/paasta-api-server",
             ),
@@ -3412,21 +3374,26 @@ def pod_disruption_budget_for_service_instance(
     instance: str,
     max_unavailable: Union[str, int],
     namespace: str,
+    unhealthy_pod_eviction_policy: str,
 ) -> V1PodDisruptionBudget:
+    selector = V1LabelSelector(
+        match_labels={
+            "paasta.yelp.com/service": service,
+            "paasta.yelp.com/instance": instance,
+        }
+    )
+    spec = V1PodDisruptionBudgetSpec(
+        max_unavailable=max_unavailable,
+        unhealthy_pod_eviction_policy=unhealthy_pod_eviction_policy,
+        selector=selector,
+    )
+
     return V1PodDisruptionBudget(
         metadata=V1ObjectMeta(
             name=get_kubernetes_app_name(service, instance),
             namespace=namespace,
         ),
-        spec=V1PodDisruptionBudgetSpec(
-            max_unavailable=max_unavailable,
-            selector=V1LabelSelector(
-                match_labels={
-                    "paasta.yelp.com/service": service,
-                    "paasta.yelp.com/instance": instance,
-                }
-            ),
-        ),
+        spec=spec,
     )


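A hedged usage sketch of the updated helper; the signature matches the hunk above, while the service, instance, and policy values are illustrative:

    from paasta_tools.kubernetes_tools import pod_disruption_budget_for_service_instance

    pdb = pod_disruption_budget_for_service_instance(
        service="example",
        instance="main",
        max_unavailable="5%",
        namespace="paasta",
        unhealthy_pod_eviction_policy="AlwaysAllow",
    )
    # the returned V1PodDisruptionBudget selects pods via the
    # paasta.yelp.com/service and paasta.yelp.com/instance labels
    print(pdb.spec.unhealthy_pod_eviction_policy)
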
@@ -4210,6 +4177,10 @@ def create_pod_topology_spread_constraints(
                     when_unsatisfiable=constraint.get(
                         "when_unsatisfiable", "ScheduleAnyway"
                     ),
+                    # we might want to default this to something else in the future
+                    # but for now, make this opt-in
+                    # (null or empty list means only match against the labelSelector)
+                    match_label_keys=constraint.get("match_label_keys", None),
                 )
             )

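For illustration, a constraint dict that create_pod_topology_spread_constraints could now consume; only when_unsatisfiable and match_label_keys appear in the hunk, so the remaining key names are assumptions about the config shape:

    # hypothetical soa-config style topology spread constraint
    constraint = {
        "topology_key": "kubernetes.io/hostname",  # assumed key name
        "max_skew": 1,                             # assumed key name
        "when_unsatisfiable": "ScheduleAnyway",
        # opt-in per the new code; None/empty means match on the labelSelector only
        "match_label_keys": ["pod-template-hash"],
    }
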
@@ -4413,7 +4384,7 @@ def ensure_service_account(
             name=k8s_role,
         ),
         subjects=[
-            V1Subject(
+            RbacV1Subject(
                 kind="ServiceAccount",
                 namespace=namespace,
                 name=sa_name,

paasta_tools/long_running_service_tools.py CHANGED

@@ -41,6 +41,7 @@ DEFAULT_ACTIVE_REQUESTS_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
 DEFAULT_GUNICORN_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800
+DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW = 1800

 METRICS_PROVIDER_CPU = "cpu"
 METRICS_PROVIDER_UWSGI = "uwsgi"
@@ -48,7 +49,8 @@ METRICS_PROVIDER_UWSGI_V2 = "uwsgi-v2"
 METRICS_PROVIDER_GUNICORN = "gunicorn"
 METRICS_PROVIDER_PISCINA = "piscina"
 METRICS_PROVIDER_ACTIVE_REQUESTS = "active-requests"
-METRICS_PROVIDER_PROMQL = "
+METRICS_PROVIDER_PROMQL = "arbitrary-promql"
+METRICS_PROVIDER_WORKER_LOAD = "worker-load"

 ALL_METRICS_PROVIDERS = [
     METRICS_PROVIDER_CPU,
@@ -58,6 +60,7 @@ ALL_METRICS_PROVIDERS = [
     METRICS_PROVIDER_PISCINA,
     METRICS_PROVIDER_ACTIVE_REQUESTS,
     METRICS_PROVIDER_PROMQL,
+    METRICS_PROVIDER_WORKER_LOAD,
 ]


@@ -85,6 +88,7 @@ class LongRunningServiceConfigDict(InstanceConfigDict, total=False):
     bounce_margin_factor: float
     should_ping_for_unhealthy_pods: bool
     weight: int
+    unhealthy_pod_eviction_policy: str


 class ServiceNamespaceConfig(dict):
@@ -410,6 +414,9 @@ class LongRunningServiceConfig(InstanceConfig):
     def get_bounce_margin_factor(self) -> float:
         return self.config_dict.get("bounce_margin_factor", 0.95)

+    def get_unhealthy_pod_eviction_policy(self) -> str:
+        return self.config_dict.get("unhealthy_pod_eviction_policy", "IfHealthyBudget")
+
     def get_should_ping_for_unhealthy_pods(self, default: bool) -> bool:
         return self.config_dict.get("should_ping_for_unhealthy_pods", default)


paasta_tools/paastaapi/model/kubernetes_version.py CHANGED

@@ -81,6 +81,7 @@ class KubernetesVersion(ModelNormal):
        """
        lazy_import()
        return {
+           'container_port': (int,),  # noqa: E501
           'type': (str,),  # noqa: E501
           'create_timestamp': (float,),  # noqa: E501
           'git_sha': (str,),  # noqa: E501
@@ -99,6 +100,7 @@ class KubernetesVersion(ModelNormal):


    attribute_map = {
+       'container_port': 'container_port',  # noqa: E501
       'type': 'type',  # noqa: E501
       'create_timestamp': 'create_timestamp',  # noqa: E501
       'git_sha': 'git_sha',  # noqa: E501
@@ -157,6 +159,7 @@ class KubernetesVersion(ModelNormal):
                 Animal class but this time we won't travel
                 through its discriminator because we passed in
                 _visited_composed_classes = (Animal,)
+            container_port (int): Port the container is expecting to receive traffic on. [optional]  # noqa: E501
             type (str): Type of version (ReplicaSet or ControllerRevision). [optional]  # noqa: E501
             create_timestamp (float): Unix timestamp when version was created. [optional]  # noqa: E501
             git_sha (str): Git SHA of service code for this version of the instance. [optional]  # noqa: E501
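A usage sketch of the regenerated API model with the new attribute; KubernetesVersion is an OpenAPI-generated ModelNormal, which generally accepts attributes as keyword arguments, though the exact constructor behavior here is an assumption:

    from paasta_tools.paastaapi.model.kubernetes_version import KubernetesVersion

    version = KubernetesVersion(
        type="ReplicaSet",
        git_sha="abc1234",
        container_port=8888,  # the new optional attribute
    )
    print(version.container_port)
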
paasta_tools/setup_prometheus_adapter_config.py CHANGED

@@ -53,6 +53,9 @@ from paasta_tools.long_running_service_tools import (
 from paasta_tools.long_running_service_tools import (
     DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW,
 )
+from paasta_tools.long_running_service_tools import (
+    DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+)
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_ACTIVE_REQUESTS
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_CPU
 from paasta_tools.long_running_service_tools import METRICS_PROVIDER_GUNICORN
|
|
@@ -60,6 +63,7 @@ from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PISCINA
|
|
|
60
63
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_PROMQL
|
|
61
64
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI
|
|
62
65
|
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_UWSGI_V2
|
|
66
|
+
from paasta_tools.long_running_service_tools import METRICS_PROVIDER_WORKER_LOAD
|
|
63
67
|
from paasta_tools.paasta_service_config_loader import PaastaServiceConfigLoader
|
|
64
68
|
from paasta_tools.utils import DEFAULT_SOA_DIR
|
|
65
69
|
from paasta_tools.utils import get_services_for_cluster
|
|
@@ -214,6 +218,10 @@ def create_instance_scaling_rule(
         return create_instance_uwsgi_v2_scaling_rule(
             service, instance_config, metrics_provider_config, paasta_cluster
         )
+    if metrics_provider_config["type"] == METRICS_PROVIDER_WORKER_LOAD:
+        return create_instance_worker_load_scaling_rule(
+            service, instance_config, metrics_provider_config, paasta_cluster
+        )
     if metrics_provider_config["type"] == METRICS_PROVIDER_PISCINA:
         return create_instance_piscina_scaling_rule(
             service, instance_config, metrics_provider_config, paasta_cluster
@@ -523,6 +531,80 @@ def create_instance_uwsgi_v2_scaling_rule(
     }


+def create_instance_worker_load_scaling_rule(
+    service: str,
+    instance_config: KubernetesDeploymentConfig,
+    metrics_provider_config: MetricsProviderDict,
+    paasta_cluster: str,
+) -> PrometheusAdapterRule:
+    """
+    Creates a Prometheus adapter rule config for a given service instance using the generic worker_busy metric.
+    """
+    instance = instance_config.instance
+    moving_average_window = metrics_provider_config.get(
+        "moving_average_window_seconds",
+        DEFAULT_WORKER_LOAD_AUTOSCALING_MOVING_AVERAGE_WINDOW,
+    )
+    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
+
+    # In order for autoscaling to work safely while a service migrates from one namespace to another, the HPA needs to
+    # make sure that the deployment in the new namespace is scaled up enough to handle _all_ the load.
+    # This is because once the new deployment is 100% healthy, cleanup_kubernetes_job will delete the deployment out of
+    # the old namespace all at once, suddenly putting all the load onto the deployment in the new namespace.
+    # To ensure this, we must NOT filter on namespace in worker_filter_terms (which is used when calculating total_load).
+    # This makes sure that desired_instances includes load from all namespaces.
+    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
+
+    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
+    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
+    # deployment.
+    ready_pods = f"""
+        (sum(
+            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
+            or
+            max_over_time(
+                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
+            )
+        ) by (kube_deployment))
+    """
+    load_per_instance = f"""
+        avg(
+            worker_busy{{{worker_filter_terms}}}
+        ) by (kube_pod, kube_deployment)
+    """
+    missing_instances = f"""
+        clamp_min(
+            {ready_pods} - count({load_per_instance}) by (kube_deployment),
+            0
+        )
+    """
+    total_load = f"""
+        (
+            sum(
+                {load_per_instance}
+            ) by (kube_deployment)
+            +
+            {missing_instances}
+        )
+    """
+    total_load_smoothed = f"""
+        avg_over_time(
+            (
+                {total_load}
+            )[{moving_average_window}s:]
+        )
+    """
+
+    metric_name = f"{deployment_name}-worker-load-prom"
+
+    return {
+        "name": {"as": metric_name},
+        "seriesQuery": f"worker_busy{{{worker_filter_terms}}}",
+        "resources": {"template": "kube_<<.Resource>>"},
+        "metricsQuery": _minify_promql(total_load_smoothed),
+    }
+
+
 def create_instance_piscina_scaling_rule(
     service: str,
     instance_config: KubernetesDeploymentConfig,
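The missing_instances and total_load queries extrapolate load for ready pods that are not yet reporting worker_busy by counting each missing pod as fully busy. A worked example of that arithmetic with assumed numbers (10 ready pods, 8 reporting an average load of 0.6):

    ready_pods = 10        # k8s:deployment:pods_status_ready
    reporting_pods = 8     # pods with a worker_busy series
    avg_load = 0.6         # avg(worker_busy) per reporting pod

    # clamp_min(ready_pods - count(load_per_instance), 0)
    missing_instances = max(ready_pods - reporting_pods, 0)     # -> 2

    # sum(load_per_instance) + missing_instances: unreported pods count as load 1.0
    total_load = avg_load * reporting_pods + missing_instances  # -> 6.8
    print(total_load)
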
paasta_tools/tron_tools.py CHANGED

@@ -506,6 +506,9 @@ class TronActionConfig(InstanceConfig):
         # XXX: update PAASTA_RESOURCE_* env vars to use the correct value from spark_args and set
         # these to the correct values for the executors as part of the driver commandline

+        # our internal Spark configuration service needs this to determine if any special behavior is required
+        env["SPARK_DRIVER_TYPE"] = "tron"
+
         return env

     def get_iam_role(self) -> str: