paasta-tools 1.30.8__py3-none-any.whl → 1.30.10__py3-none-any.whl
This diff compares publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- paasta_tools/__init__.py +1 -1
- paasta_tools/api/views/instance.py +9 -2
- paasta_tools/async_utils.py +4 -1
- paasta_tools/bounce_lib.py +8 -5
- paasta_tools/check_services_replication_tools.py +10 -4
- paasta_tools/check_spark_jobs.py +1 -1
- paasta_tools/cli/cli.py +4 -4
- paasta_tools/cli/cmds/logs.py +29 -7
- paasta_tools/cli/cmds/mark_for_deployment.py +2 -2
- paasta_tools/cli/cmds/mesh_status.py +1 -1
- paasta_tools/cli/cmds/remote_run.py +1 -1
- paasta_tools/cli/cmds/rollback.py +1 -1
- paasta_tools/cli/cmds/spark_run.py +3 -3
- paasta_tools/cli/cmds/status.py +24 -21
- paasta_tools/cli/cmds/validate.py +3 -3
- paasta_tools/cli/utils.py +32 -19
- paasta_tools/contrib/check_orphans.py +1 -1
- paasta_tools/contrib/get_running_task_allocation.py +1 -1
- paasta_tools/instance/kubernetes.py +2 -1
- paasta_tools/kubernetes_tools.py +2 -40
- paasta_tools/metrics/metastatus_lib.py +0 -24
- paasta_tools/metrics/metrics_lib.py +12 -3
- paasta_tools/setup_kubernetes_job.py +1 -1
- paasta_tools/setup_tron_namespace.py +2 -2
- paasta_tools/tron_tools.py +1 -1
- paasta_tools/utils.py +2 -9
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_orphans.py +1 -1
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_spark_jobs.py +1 -1
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/get_running_task_allocation.py +1 -1
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_job.py +1 -1
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/METADATA +2 -2
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/RECORD +84 -89
- paasta_tools/frameworks/adhoc_scheduler.py +0 -71
- paasta_tools/frameworks/native_scheduler.py +0 -652
- paasta_tools/frameworks/task_store.py +0 -245
- paasta_tools/mesos_maintenance.py +0 -848
- paasta_tools/paasta_native_serviceinit.py +0 -21
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/apply_external_resources.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/bounce_log_latency_parser.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_autoscaler_max_instances.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_cassandracluster_services_replication.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_flink_services_health.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_kubernetes_api.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_kubernetes_services_replication.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_manual_oapi_changes.sh +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_oom_events.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_jobs.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/create_dynamodb_table.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/create_paasta_playground.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/delete_kubernetes_deployments.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/emit_allocated_cpu_metrics.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_all_deployments +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_authenticating_services.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_deployments_for_service.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_services_file.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_services_yaml.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/habitat_fixer.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/ide_helper.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/is_pod_healthy_in_proxy.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/is_pod_healthy_in_smartstack.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/kill_bad_containers.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/kubernetes_remove_evicted_pods.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/mass-deploy-tag.sh +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/mock_patch_checker.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_cleanup_remote_run_resources.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_cleanup_stale_nodes.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_deploy_tron_jobs +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_execute_docker_command.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_secrets_sync.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_tabcomplete.sh +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_update_soa_memcpu.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/render_template.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/rightsizer_soaconfigs_update.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/service_shard_remove.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/service_shard_update.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_istio_mesh.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_cr.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_crd.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_internal_crd.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_prometheus_adapter_config.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/shared_ip_check.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/synapse_srv_namespaces_fact.py +0 -0
- {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/timeouts_metrics_prom.py +0 -0
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/WHEEL +0 -0
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/entry_points.txt +0 -0
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/licenses/LICENSE +0 -0
- {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/top_level.txt +0 -0
paasta_tools/frameworks/native_scheduler.py
@@ -1,652 +0,0 @@
#!/usr/bin/env python
import asyncio
import copy
import getpass
import logging
import random
import threading
import time
import uuid
from typing import Collection
from typing import Dict
from typing import List
from typing import Mapping
from typing import Optional
from typing import Tuple

import a_sync
import service_configuration_lib
from pymesos import MesosSchedulerDriver
from pymesos.interface import Scheduler

from paasta_tools import bounce_lib
from paasta_tools import drain_lib
from paasta_tools import mesos_tools
from paasta_tools.frameworks.constraints import check_offer_constraints
from paasta_tools.frameworks.constraints import ConstraintState
from paasta_tools.frameworks.constraints import update_constraint_state
from paasta_tools.frameworks.native_service_config import load_paasta_native_job_config
from paasta_tools.frameworks.native_service_config import NativeServiceConfig
from paasta_tools.frameworks.native_service_config import NativeServiceConfigDict
from paasta_tools.frameworks.native_service_config import TaskInfo
from paasta_tools.frameworks.task_store import MesosTaskParameters
from paasta_tools.frameworks.task_store import TaskStore
from paasta_tools.frameworks.task_store import ZKTaskStore
from paasta_tools.utils import _log
from paasta_tools.utils import DEFAULT_LOGLEVEL
from paasta_tools.utils import DEFAULT_SOA_DIR
from paasta_tools.utils import get_services_for_cluster
from paasta_tools.utils import SystemPaastaConfig

log = logging.getLogger(__name__)

MESOS_TASK_SPACER = "."

# Bring these into local scope for shorter lines of code.
TASK_STAGING = "TASK_STAGING"
TASK_STARTING = "TASK_STARTING"
TASK_RUNNING = "TASK_RUNNING"

TASK_KILLING = "TASK_KILLING"
TASK_FINISHED = "TASK_FINISHED"
TASK_FAILED = "TASK_FAILED"
TASK_KILLED = "TASK_KILLED"
TASK_LOST = "TASK_LOST"
TASK_ERROR = "TASK_ERROR"

LIVE_TASK_STATES = (TASK_STAGING, TASK_STARTING, TASK_RUNNING)


class ConstraintFailAllTasksError(Exception):
    pass


class NativeScheduler(Scheduler):
    task_store: TaskStore

    def __init__(
        self,
        service_name: str,
        instance_name: str,
        cluster: str,
        system_paasta_config: SystemPaastaConfig,
        staging_timeout: float,
        soa_dir: str = DEFAULT_SOA_DIR,
        service_config: Optional[NativeServiceConfig] = None,
        reconcile_backoff: float = 30,
        instance_type: str = "paasta_native",
        service_config_overrides: Optional[NativeServiceConfigDict] = None,
        reconcile_start_time: float = float("inf"),
        task_store_type=ZKTaskStore,
    ) -> None:
        self.service_name = service_name
        self.instance_name = instance_name
        self.instance_type = instance_type
        self.cluster = cluster
        self.system_paasta_config = system_paasta_config
        self.soa_dir = soa_dir

        # This will be initialized in registered().
        self.task_store = None
        self.task_store_type = task_store_type

        self.service_config_overrides = service_config_overrides or {}
        self.constraint_state: ConstraintState = {}
        self.constraint_state_lock = threading.Lock()
        self.frozen = False

        # don't accept resources until we reconcile.
        self.reconcile_start_time = reconcile_start_time

        # wait this long after starting a reconcile before accepting offers.
        self.reconcile_backoff = reconcile_backoff

        # wait this long for a task to launch.
        self.staging_timeout = staging_timeout

        # Gets set when registered() is called
        self.framework_id = None

        # agent_id -> unix timestamp of when we blacklisted it
        self.blacklisted_slaves: Dict[str, float] = {}
        self.blacklist_timeout = 3600

        if service_config is not None:
            self.service_config = service_config
            self.service_config.config_dict.update(  # type: ignore
                self.service_config_overrides
            )
            self.recreate_drain_method()
            self.reload_constraints()
            self.validate_config()
        else:
            self.load_config()

    def log(self, line, level=DEFAULT_LOGLEVEL):
        _log(
            service=self.service_name,
            instance=self.instance_name,
            component="deploy",
            line=line,
            level=level,
        )

    def shutdown(self, driver: MesosSchedulerDriver):
        # TODO: this is naive, as it does nothing to stop on-going calls
        # to statusUpdate or resourceOffers.
        self.log(
            "Freezing the scheduler. Further status updates and resource offers are ignored."
        )
        self.frozen = True
        self.log("Killing any remaining live tasks.")
        for task, parameters in self.task_store.get_all_tasks().items():
            if parameters.mesos_task_state in LIVE_TASK_STATES:
                self.kill_task(driver, task)
        self.task_store.close()

    def registered(self, driver: MesosSchedulerDriver, frameworkId, masterInfo):
        self.framework_id = frameworkId["value"]
        self.log("Registered with framework ID %s" % frameworkId["value"])

        self.task_store = self.task_store_type(
            service_name=self.service_name,
            instance_name=self.instance_name,
            framework_id=self.framework_id,
            system_paasta_config=self.system_paasta_config,
        )

        self.reconcile_start_time = time.time()
        driver.reconcileTasks([])

    def reregistered(self, driver: MesosSchedulerDriver, masterInfo):
        self.registered(driver, {"value": driver.framework_id}, masterInfo)

    def resourceOffers(self, driver: MesosSchedulerDriver, offers):
        if self.frozen:
            return

        if self.within_reconcile_backoff():
            self.log(
                "Declining all offers since we started reconciliation too recently"
            )
            for offer in offers:
                driver.declineOffer(offer.id)
        else:
            for idx, offer in enumerate(offers):
                if offer.agent_id.value in self.blacklisted_slaves:
                    log.critical(
                        "Ignoring offer %s from blacklisted slave %s"
                        % (offer.id.value, offer.agent_id.value)
                    )
                    filters = {"refuse_seconds": self.blacklist_timeout}
                    driver.declineOffer(offer.id, filters)
                    del offers[idx]

            self.launch_tasks_for_offers(driver, offers)

    def launch_tasks_for_offers(
        self, driver: MesosSchedulerDriver, offers
    ) -> List[TaskInfo]:
        """For each offer tries to launch all tasks that can fit in there.
        Declines offer if no fitting tasks found."""
        launched_tasks: List[TaskInfo] = []

        for offer in offers:
            with self.constraint_state_lock:
                try:
                    tasks, new_state = self.tasks_and_state_for_offer(
                        driver, offer, self.constraint_state
                    )

                    if tasks is not None and len(tasks) > 0:
                        driver.launchTasks([offer.id], tasks)

                        for task in tasks:
                            self.task_store.add_task_if_doesnt_exist(
                                task["task_id"]["value"],
                                health=None,
                                mesos_task_state=TASK_STAGING,
                                offer=offer,
                                resources=task["resources"],
                            )
                        launched_tasks.extend(tasks)
                        self.constraint_state = new_state
                    else:
                        driver.declineOffer(offer.id)
                except ConstraintFailAllTasksError:
                    self.log("Offer failed constraints for every task, rejecting 60s")
                    filters = {"refuse_seconds": 60}
                    driver.declineOffer(offer.id, filters)
        return launched_tasks

    def task_fits(self, offer):
        """Checks whether the offer is big enough to fit the tasks"""
        needed_resources = {
            "cpus": self.service_config.get_cpus(),
            "mem": self.service_config.get_mem(),
            "disk": self.service_config.get_disk(),
        }
        for resource in offer.resources:
            try:
                if resource.scalar.value < needed_resources[resource.name]:
                    return False
            except KeyError:
                pass

        return True

    def need_more_tasks(self, name, existingTasks, scheduledTasks):
        """Returns whether we need to start more tasks."""
        num_have = 0
        for task, parameters in existingTasks.items():
            if self.is_task_new(name, task) and (
                parameters.mesos_task_state in LIVE_TASK_STATES
            ):
                num_have += 1

        for task in scheduledTasks:
            if task["name"] == name:
                num_have += 1

        return num_have < self.service_config.get_desired_instances()

    def get_new_tasks(self, name, tasks_with_params: Dict[str, MesosTaskParameters]):
        return {
            tid: params
            for tid, params in tasks_with_params.items()
            if (
                self.is_task_new(name, tid)
                and (params.mesos_task_state in LIVE_TASK_STATES)
            )
        }

    def get_old_tasks(self, name, tasks_with_params: Dict[str, MesosTaskParameters]):
        return {
            tid: params
            for tid, params in tasks_with_params.items()
            if (
                (not self.is_task_new(name, tid))
                and (params.mesos_task_state in LIVE_TASK_STATES)
            )
        }

    def is_task_new(self, name, tid):
        return tid.startswith("%s." % name)

    def log_and_kill(self, driver: MesosSchedulerDriver, task_id):
        log.critical(
            "Task stuck launching for %ss, assuming to have failed. Killing task."
            % self.staging_timeout
        )
        self.blacklist_slave(self.task_store.get_task(task_id).offer.agent_id.value)
        self.kill_task(driver, task_id)

    def tasks_and_state_for_offer(
        self, driver: MesosSchedulerDriver, offer, state: ConstraintState
    ) -> Tuple[List[TaskInfo], ConstraintState]:
        """Returns collection of tasks that can fit inside an offer."""
        tasks: List[TaskInfo] = []
        offerCpus = 0.0
        offerMem = 0.0
        offerPorts: List[int] = []
        for resource in offer.resources:
            if resource.name == "cpus":
                offerCpus += resource.scalar.value
            elif resource.name == "mem":
                offerMem += resource.scalar.value
            elif resource.name == "ports":
                for rg in resource.ranges.range:
                    # I believe mesos protobuf ranges are inclusive, but range() is exclusive
                    offerPorts += range(rg.begin, rg.end + 1)
        remainingCpus = offerCpus
        remainingMem = offerMem
        remainingPorts = set(offerPorts)

        base_task = self.service_config.base_task(self.system_paasta_config)
        base_task["agent_id"]["value"] = offer["agent_id"]["value"]

        task_mem = self.service_config.get_mem()
        task_cpus = self.service_config.get_cpus()

        # don't mutate existing state
        new_constraint_state = copy.deepcopy(state)
        total = 0
        failed_constraints = 0
        while self.need_more_tasks(
            base_task["name"], self.task_store.get_all_tasks(), tasks
        ):
            total += 1

            if not (
                remainingCpus >= task_cpus
                and remainingMem >= task_mem
                and self.offer_matches_pool(offer)
                and len(remainingPorts) >= 1
            ):
                break

            if not (
                check_offer_constraints(offer, self.constraints, new_constraint_state)
            ):
                failed_constraints += 1
                break

            task_port = random.choice(list(remainingPorts))

            task = copy.deepcopy(base_task)
            task["task_id"] = {"value": "{}.{}".format(task["name"], uuid.uuid4().hex)}

            task["container"]["docker"]["port_mappings"][0]["host_port"] = task_port
            for resource in task["resources"]:
                if resource["name"] == "ports":
                    resource["ranges"]["range"][0]["begin"] = task_port
                    resource["ranges"]["range"][0]["end"] = task_port

            tasks.append(task)

            remainingCpus -= task_cpus
            remainingMem -= task_mem
            remainingPorts -= {task_port}

            update_constraint_state(offer, self.constraints, new_constraint_state)

        # raise constraint error but only if no other tasks fit/fail the offer
        if total > 0 and failed_constraints == total:
            raise ConstraintFailAllTasksError

        return tasks, new_constraint_state

    def offer_matches_pool(self, offer):
        for attribute in offer.attributes:
            if attribute.name == "pool":
                return attribute.text.value == self.service_config.get_pool()
        # we didn't find a pool attribute on this slave, so assume it's not in our pool.
        return False

    def within_reconcile_backoff(self):
        return time.time() - self.reconcile_backoff < self.reconcile_start_time

    def periodic(self, driver: MesosSchedulerDriver):
        if self.frozen:
            return

        self.periodic_was_called = True  # Used for testing.
        if not self.within_reconcile_backoff():
            driver.reviveOffers()

        self.load_config()
        self.kill_tasks_if_necessary(driver)
        self.check_blacklisted_slaves_for_timeout()

    def statusUpdate(self, driver: MesosSchedulerDriver, update: Dict):
        if self.frozen:
            return

        # update tasks
        task_id = update["task_id"]["value"]
        self.log("Task {} is in state {}".format(task_id, update["state"]))

        task_params = self.task_store.update_task(
            task_id, mesos_task_state=update["state"]
        )

        if task_params.mesos_task_state not in LIVE_TASK_STATES:
            with self.constraint_state_lock:
                update_constraint_state(
                    task_params.offer, self.constraints, self.constraint_state, step=-1
                )

        driver.acknowledgeStatusUpdate(update)
        self.kill_tasks_if_necessary(driver)

    def make_healthiness_sorter(
        self, base_task_name: str, all_tasks_with_params: Dict[str, MesosTaskParameters]
    ):
        def healthiness_score(task_id):
            """Return a tuple that can be used as a key for sorting, that expresses our desire to keep this task around.
            Higher values (things that sort later) are more desirable."""
            params = all_tasks_with_params[task_id]

            state_score = {
                TASK_KILLING: 0,
                TASK_FINISHED: 0,
                TASK_FAILED: 0,
                TASK_KILLED: 0,
                TASK_LOST: 0,
                TASK_ERROR: 0,
                TASK_STAGING: 1,
                TASK_STARTING: 2,
                TASK_RUNNING: 3,
            }[params.mesos_task_state]

            # unhealthy tasks < healthy
            # staging < starting < running
            # old < new
            return (
                params.is_healthy,
                state_score,
                self.is_task_new(base_task_name, task_id),
            )

        return healthiness_score

    def kill_tasks_if_necessary(self, driver: MesosSchedulerDriver):
        base_task = self.service_config.base_task(self.system_paasta_config)

        all_tasks_with_params = self.task_store.get_all_tasks()

        new_tasks_with_params = self.get_new_tasks(
            base_task["name"], all_tasks_with_params
        )
        happy_new_tasks_with_params = self.get_happy_tasks(new_tasks_with_params)

        desired_instances = self.service_config.get_desired_instances()
        # this puts the most-desired tasks first. I would have left them in order of bad->good and used
        # new_tasks_by_desirability[:-desired_instances] instead, but list[:-0] is an empty list, rather than the full
        # list.
        new_task_ids_by_desirability = sorted(
            list(new_tasks_with_params.keys()),
            key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params),
            reverse=True,
        )
        new_task_ids_to_kill = new_task_ids_by_desirability[desired_instances:]

        old_tasks_with_params = self.get_old_tasks(
            base_task["name"], all_tasks_with_params
        )
        old_draining_tasks_with_params = self.get_draining_tasks(old_tasks_with_params)
        old_non_draining_tasks = sorted(
            list(
                set(old_tasks_with_params.keys()) - set(old_draining_tasks_with_params)
            ),
            key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params),
            reverse=True,
        )

        actions = bounce_lib.crossover_bounce(
            new_config={"instances": desired_instances},
            new_app_running=True,
            happy_new_tasks=happy_new_tasks_with_params.keys(),
            old_non_draining_tasks=new_task_ids_to_kill + old_non_draining_tasks,
        )

        with a_sync.idle_event_loop():
            futures = []
            for task in set(new_tasks_with_params.keys()) - set(
                actions["tasks_to_drain"]
            ):
                futures.append(asyncio.ensure_future(self.undrain_task(task)))
            for task in actions["tasks_to_drain"]:
                futures.append(asyncio.ensure_future(self.drain_task(task)))

            if futures:
                a_sync.block(asyncio.wait, futures)

            async def kill_if_safe_to_kill(task_id: str):
                if await self.drain_method.is_safe_to_kill(
                    self.make_drain_task(task_id)
                ):
                    self.kill_task(driver, task_id)

            futures = []
            for task, parameters in all_tasks_with_params.items():
                if (
                    parameters.is_draining
                    and parameters.mesos_task_state in LIVE_TASK_STATES
                ):
                    futures.append(asyncio.ensure_future(kill_if_safe_to_kill(task)))
            if futures:
                a_sync.block(asyncio.wait, futures)

    def get_happy_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]):
        """Filter a dictionary of tasks->params to those that are running and not draining."""
        happy_tasks = {}
        for tid, params in tasks_with_params.items():
            if params.mesos_task_state == TASK_RUNNING and not params.is_draining:
                happy_tasks[tid] = params
        return happy_tasks

    def get_draining_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]):
        """Filter a dictionary of tasks->params to those that are draining."""
        return {t: p for t, p in tasks_with_params.items() if p.is_draining}

    def make_drain_task(self, task_id: str):
        """Return a DrainTask object, which is suitable for passing to drain methods."""

        ports = []

        params = self.task_store.get_task(task_id)
        for resource in params.resources:
            if resource["name"] == "ports":
                for rg in resource["ranges"]["range"]:
                    for port in range(rg["begin"], rg["end"] + 1):
                        ports.append(port)

        return DrainTask(
            id=task_id, host=params.offer["agent_id"]["value"], ports=ports
        )

    async def undrain_task(self, task_id: str):
        self.log("Undraining task %s" % task_id)
        await self.drain_method.stop_draining(self.make_drain_task(task_id))
        self.task_store.update_task(task_id, is_draining=False)

    async def drain_task(self, task_id: str):
        self.log("Draining task %s" % task_id)
        await self.drain_method.drain(self.make_drain_task(task_id))
        self.task_store.update_task(task_id, is_draining=True)

    def kill_task(self, driver: MesosSchedulerDriver, task_id: str):
        self.log("Killing task %s" % task_id)
        driver.killTask({"value": task_id})
        self.task_store.update_task(task_id, mesos_task_state=TASK_KILLING)

    def group_tasks_by_version(
        self, task_ids: Collection[str]
    ) -> Mapping[str, Collection[str]]:
        d: Dict[str, List[str]] = {}
        for task_id in task_ids:
            version = task_id.rsplit(".", 1)[0]
            d.setdefault(version, []).append(task_id)
        return d

    def load_config(self) -> None:
        service_configuration_lib._yaml_cache = {}
        self.service_config = load_paasta_native_job_config(
            service=self.service_name,
            instance=self.instance_name,
            instance_type=self.instance_type,
            cluster=self.cluster,
            soa_dir=self.soa_dir,
            config_overrides=self.service_config_overrides,
        )
        self.recreate_drain_method()
        self.reload_constraints()
        self.validate_config()

    def validate_config(self) -> None:
        pass

    def recreate_drain_method(self) -> None:
        """Re-instantiate self.drain_method. Should be called after self.service_config changes."""
        self.drain_method = drain_lib.get_drain_method(
            name=self.service_config.get_drain_method(
                self.service_config.service_namespace_config
            ),
            service=self.service_name,
            instance=self.instance_name,
            registrations=self.service_config.get_registrations(),
            **self.service_config.get_drain_method_params(
                self.service_config.service_namespace_config
            ),
        )

    def reload_constraints(self):
        self.constraints = self.service_config.get_constraints() or []

    def blacklist_slave(self, agent_id: str):
        log.debug("Blacklisting slave: %s" % agent_id)
        self.blacklisted_slaves.setdefault(agent_id, time.time())

    def unblacklist_slave(self, agent_id: str):
        if agent_id not in self.blacklisted_slaves:
            return

        log.debug("Unblacklisting slave: %s" % agent_id)
        with self.blacklisted_slaves_lock:
            del self.blacklisted_slaves[agent_id]

    def check_blacklisted_slaves_for_timeout(self):
        for agent_id, blacklist_time in self.blacklisted_slaves.items():
            if (blacklist_time + self.blacklist_timeout) < time.time():
                self.unblacklist_slave(agent_id)


class DrainTask:
    def __init__(self, id, host, ports):
        self.id = id
        self.host = host
        self.ports = ports


def find_existing_id_if_exists_or_gen_new(name):
    for framework in mesos_tools.get_all_frameworks(active_only=True):
        if framework.name == name:
            return framework.id
    else:
        return uuid.uuid4().hex


def create_driver(framework_name, scheduler, system_paasta_config, implicit_acks=False):
    master_uri = "{}:{}".format(
        mesos_tools.get_mesos_leader(), mesos_tools.MESOS_MASTER_PORT
    )

    framework = {
        "user": getpass.getuser(),
        "name": framework_name,
        "failover_timeout": 604800,
        "id": {"value": find_existing_id_if_exists_or_gen_new(framework_name)},
        "checkpoint": True,
        "principal": system_paasta_config.get_paasta_native_config()["principal"],
    }

    driver = MesosSchedulerDriver(
        sched=scheduler,
        framework=framework,
        master_uri=master_uri,
        use_addict=True,
        implicit_acknowledgements=implicit_acks,
        principal=system_paasta_config.get_paasta_native_config()["principal"],
        secret=system_paasta_config.get_paasta_native_config()["secret"],
    )
    return driver


def get_paasta_native_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
    """A paasta_native-specific wrapper around utils.get_services_for_cluster

    :param cluster: The cluster to read the configuration for
    :param soa_dir: The SOA config directory to read from
    :returns: A list of tuples of (service, job_name)"""
    return get_services_for_cluster(cluster, "paasta_native", soa_dir)