paasta-tools 1.30.8__py3-none-any.whl → 1.30.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (89)
  1. paasta_tools/__init__.py +1 -1
  2. paasta_tools/api/views/instance.py +9 -2
  3. paasta_tools/async_utils.py +4 -1
  4. paasta_tools/bounce_lib.py +8 -5
  5. paasta_tools/check_services_replication_tools.py +10 -4
  6. paasta_tools/check_spark_jobs.py +1 -1
  7. paasta_tools/cli/cli.py +4 -4
  8. paasta_tools/cli/cmds/logs.py +29 -7
  9. paasta_tools/cli/cmds/mark_for_deployment.py +2 -2
  10. paasta_tools/cli/cmds/mesh_status.py +1 -1
  11. paasta_tools/cli/cmds/remote_run.py +1 -1
  12. paasta_tools/cli/cmds/rollback.py +1 -1
  13. paasta_tools/cli/cmds/spark_run.py +3 -3
  14. paasta_tools/cli/cmds/status.py +24 -21
  15. paasta_tools/cli/cmds/validate.py +3 -3
  16. paasta_tools/cli/utils.py +32 -19
  17. paasta_tools/contrib/check_orphans.py +1 -1
  18. paasta_tools/contrib/get_running_task_allocation.py +1 -1
  19. paasta_tools/instance/kubernetes.py +2 -1
  20. paasta_tools/kubernetes_tools.py +2 -40
  21. paasta_tools/metrics/metastatus_lib.py +0 -24
  22. paasta_tools/metrics/metrics_lib.py +12 -3
  23. paasta_tools/setup_kubernetes_job.py +1 -1
  24. paasta_tools/setup_tron_namespace.py +2 -2
  25. paasta_tools/tron_tools.py +1 -1
  26. paasta_tools/utils.py +2 -9
  27. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_orphans.py +1 -1
  28. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_spark_jobs.py +1 -1
  29. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/get_running_task_allocation.py +1 -1
  30. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_job.py +1 -1
  31. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/METADATA +2 -2
  32. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/RECORD +84 -89
  33. paasta_tools/frameworks/adhoc_scheduler.py +0 -71
  34. paasta_tools/frameworks/native_scheduler.py +0 -652
  35. paasta_tools/frameworks/task_store.py +0 -245
  36. paasta_tools/mesos_maintenance.py +0 -848
  37. paasta_tools/paasta_native_serviceinit.py +0 -21
  38. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/apply_external_resources.py +0 -0
  39. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/bounce_log_latency_parser.py +0 -0
  40. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_autoscaler_max_instances.py +0 -0
  41. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_cassandracluster_services_replication.py +0 -0
  42. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_flink_services_health.py +0 -0
  43. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_kubernetes_api.py +0 -0
  44. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_kubernetes_services_replication.py +0 -0
  45. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_manual_oapi_changes.sh +0 -0
  46. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/check_oom_events.py +0 -0
  47. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_cr.py +0 -0
  48. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_crd.py +0 -0
  49. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/cleanup_kubernetes_jobs.py +0 -0
  50. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/create_dynamodb_table.py +0 -0
  51. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/create_paasta_playground.py +0 -0
  52. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/delete_kubernetes_deployments.py +0 -0
  53. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/emit_allocated_cpu_metrics.py +0 -0
  54. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_all_deployments +0 -0
  55. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_authenticating_services.py +0 -0
  56. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_deployments_for_service.py +0 -0
  57. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_services_file.py +0 -0
  58. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/generate_services_yaml.py +0 -0
  59. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/habitat_fixer.py +0 -0
  60. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/ide_helper.py +0 -0
  61. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/is_pod_healthy_in_proxy.py +0 -0
  62. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/is_pod_healthy_in_smartstack.py +0 -0
  63. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/kill_bad_containers.py +0 -0
  64. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/kubernetes_remove_evicted_pods.py +0 -0
  65. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/mass-deploy-tag.sh +0 -0
  66. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/mock_patch_checker.py +0 -0
  67. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_cleanup_remote_run_resources.py +0 -0
  68. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_cleanup_stale_nodes.py +0 -0
  69. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_deploy_tron_jobs +0 -0
  70. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_execute_docker_command.py +0 -0
  71. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_secrets_sync.py +0 -0
  72. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_tabcomplete.sh +0 -0
  73. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/paasta_update_soa_memcpu.py +0 -0
  74. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/render_template.py +0 -0
  75. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/rightsizer_soaconfigs_update.py +0 -0
  76. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/service_shard_remove.py +0 -0
  77. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/service_shard_update.py +0 -0
  78. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_istio_mesh.py +0 -0
  79. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_cr.py +0 -0
  80. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_crd.py +0 -0
  81. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_kubernetes_internal_crd.py +0 -0
  82. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/setup_prometheus_adapter_config.py +0 -0
  83. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/shared_ip_check.py +0 -0
  84. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/synapse_srv_namespaces_fact.py +0 -0
  85. {paasta_tools-1.30.8.data → paasta_tools-1.30.10.data}/scripts/timeouts_metrics_prom.py +0 -0
  86. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/WHEEL +0 -0
  87. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/entry_points.txt +0 -0
  88. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/licenses/LICENSE +0 -0
  89. {paasta_tools-1.30.8.dist-info → paasta_tools-1.30.10.dist-info}/top_level.txt +0 -0
paasta_tools/frameworks/native_scheduler.py
@@ -1,652 +0,0 @@
-#!/usr/bin/env python
-import asyncio
-import copy
-import getpass
-import logging
-import random
-import threading
-import time
-import uuid
-from typing import Collection
-from typing import Dict
-from typing import List
-from typing import Mapping
-from typing import Optional
-from typing import Tuple
-
-import a_sync
-import service_configuration_lib
-from pymesos import MesosSchedulerDriver
-from pymesos.interface import Scheduler
-
-from paasta_tools import bounce_lib
-from paasta_tools import drain_lib
-from paasta_tools import mesos_tools
-from paasta_tools.frameworks.constraints import check_offer_constraints
-from paasta_tools.frameworks.constraints import ConstraintState
-from paasta_tools.frameworks.constraints import update_constraint_state
-from paasta_tools.frameworks.native_service_config import load_paasta_native_job_config
-from paasta_tools.frameworks.native_service_config import NativeServiceConfig
-from paasta_tools.frameworks.native_service_config import NativeServiceConfigDict
-from paasta_tools.frameworks.native_service_config import TaskInfo
-from paasta_tools.frameworks.task_store import MesosTaskParameters
-from paasta_tools.frameworks.task_store import TaskStore
-from paasta_tools.frameworks.task_store import ZKTaskStore
-from paasta_tools.utils import _log
-from paasta_tools.utils import DEFAULT_LOGLEVEL
-from paasta_tools.utils import DEFAULT_SOA_DIR
-from paasta_tools.utils import get_services_for_cluster
-from paasta_tools.utils import SystemPaastaConfig
-
-log = logging.getLogger(__name__)
-
-MESOS_TASK_SPACER = "."
-
-# Bring these into local scope for shorter lines of code.
-TASK_STAGING = "TASK_STAGING"
-TASK_STARTING = "TASK_STARTING"
-TASK_RUNNING = "TASK_RUNNING"
-
-TASK_KILLING = "TASK_KILLING"
-TASK_FINISHED = "TASK_FINISHED"
-TASK_FAILED = "TASK_FAILED"
-TASK_KILLED = "TASK_KILLED"
-TASK_LOST = "TASK_LOST"
-TASK_ERROR = "TASK_ERROR"
-
-LIVE_TASK_STATES = (TASK_STAGING, TASK_STARTING, TASK_RUNNING)
-
-
-class ConstraintFailAllTasksError(Exception):
-    pass
-
-
-class NativeScheduler(Scheduler):
-    task_store: TaskStore
-
-    def __init__(
-        self,
-        service_name: str,
-        instance_name: str,
-        cluster: str,
-        system_paasta_config: SystemPaastaConfig,
-        staging_timeout: float,
-        soa_dir: str = DEFAULT_SOA_DIR,
-        service_config: Optional[NativeServiceConfig] = None,
-        reconcile_backoff: float = 30,
-        instance_type: str = "paasta_native",
-        service_config_overrides: Optional[NativeServiceConfigDict] = None,
-        reconcile_start_time: float = float("inf"),
-        task_store_type=ZKTaskStore,
-    ) -> None:
-        self.service_name = service_name
-        self.instance_name = instance_name
-        self.instance_type = instance_type
-        self.cluster = cluster
-        self.system_paasta_config = system_paasta_config
-        self.soa_dir = soa_dir
-
-        # This will be initialized in registered().
-        self.task_store = None
-        self.task_store_type = task_store_type
-
-        self.service_config_overrides = service_config_overrides or {}
-        self.constraint_state: ConstraintState = {}
-        self.constraint_state_lock = threading.Lock()
-        self.frozen = False
-
-        # don't accept resources until we reconcile.
-        self.reconcile_start_time = reconcile_start_time
-
-        # wait this long after starting a reconcile before accepting offers.
-        self.reconcile_backoff = reconcile_backoff
-
-        # wait this long for a task to launch.
-        self.staging_timeout = staging_timeout
-
-        # Gets set when registered() is called
-        self.framework_id = None
-
-        # agent_id -> unix timestamp of when we blacklisted it
-        self.blacklisted_slaves: Dict[str, float] = {}
-        self.blacklist_timeout = 3600
-
-        if service_config is not None:
-            self.service_config = service_config
-            self.service_config.config_dict.update(  # type: ignore
-                self.service_config_overrides
-            )
-            self.recreate_drain_method()
-            self.reload_constraints()
-            self.validate_config()
-        else:
-            self.load_config()
-
-    def log(self, line, level=DEFAULT_LOGLEVEL):
-        _log(
-            service=self.service_name,
-            instance=self.instance_name,
-            component="deploy",
-            line=line,
-            level=level,
-        )
-
-    def shutdown(self, driver: MesosSchedulerDriver):
-        # TODO: this is naive, as it does nothing to stop on-going calls
-        # to statusUpdate or resourceOffers.
-        self.log(
-            "Freezing the scheduler. Further status updates and resource offers are ignored."
-        )
-        self.frozen = True
-        self.log("Killing any remaining live tasks.")
-        for task, parameters in self.task_store.get_all_tasks().items():
-            if parameters.mesos_task_state in LIVE_TASK_STATES:
-                self.kill_task(driver, task)
-        self.task_store.close()
-
-    def registered(self, driver: MesosSchedulerDriver, frameworkId, masterInfo):
-        self.framework_id = frameworkId["value"]
-        self.log("Registered with framework ID %s" % frameworkId["value"])
-
-        self.task_store = self.task_store_type(
-            service_name=self.service_name,
-            instance_name=self.instance_name,
-            framework_id=self.framework_id,
-            system_paasta_config=self.system_paasta_config,
-        )
-
-        self.reconcile_start_time = time.time()
-        driver.reconcileTasks([])
-
-    def reregistered(self, driver: MesosSchedulerDriver, masterInfo):
-        self.registered(driver, {"value": driver.framework_id}, masterInfo)
-
-    def resourceOffers(self, driver: MesosSchedulerDriver, offers):
-        if self.frozen:
-            return
-
-        if self.within_reconcile_backoff():
-            self.log(
-                "Declining all offers since we started reconciliation too recently"
-            )
-            for offer in offers:
-                driver.declineOffer(offer.id)
-        else:
-            for idx, offer in enumerate(offers):
-                if offer.agent_id.value in self.blacklisted_slaves:
-                    log.critical(
-                        "Ignoring offer %s from blacklisted slave %s"
-                        % (offer.id.value, offer.agent_id.value)
-                    )
-                    filters = {"refuse_seconds": self.blacklist_timeout}
-                    driver.declineOffer(offer.id, filters)
-                    del offers[idx]
-
-            self.launch_tasks_for_offers(driver, offers)
-
-    def launch_tasks_for_offers(
-        self, driver: MesosSchedulerDriver, offers
-    ) -> List[TaskInfo]:
-        """For each offer tries to launch all tasks that can fit in there.
-        Declines offer if no fitting tasks found."""
-        launched_tasks: List[TaskInfo] = []
-
-        for offer in offers:
-            with self.constraint_state_lock:
-                try:
-                    tasks, new_state = self.tasks_and_state_for_offer(
-                        driver, offer, self.constraint_state
-                    )
-
-                    if tasks is not None and len(tasks) > 0:
-                        driver.launchTasks([offer.id], tasks)
-
-                        for task in tasks:
-                            self.task_store.add_task_if_doesnt_exist(
-                                task["task_id"]["value"],
-                                health=None,
-                                mesos_task_state=TASK_STAGING,
-                                offer=offer,
-                                resources=task["resources"],
-                            )
-                        launched_tasks.extend(tasks)
-                        self.constraint_state = new_state
-                    else:
-                        driver.declineOffer(offer.id)
-                except ConstraintFailAllTasksError:
-                    self.log("Offer failed constraints for every task, rejecting 60s")
-                    filters = {"refuse_seconds": 60}
-                    driver.declineOffer(offer.id, filters)
-        return launched_tasks
-
-    def task_fits(self, offer):
-        """Checks whether the offer is big enough to fit the tasks"""
-        needed_resources = {
-            "cpus": self.service_config.get_cpus(),
-            "mem": self.service_config.get_mem(),
-            "disk": self.service_config.get_disk(),
-        }
-        for resource in offer.resources:
-            try:
-                if resource.scalar.value < needed_resources[resource.name]:
-                    return False
-            except KeyError:
-                pass
-
-        return True
-
-    def need_more_tasks(self, name, existingTasks, scheduledTasks):
-        """Returns whether we need to start more tasks."""
-        num_have = 0
-        for task, parameters in existingTasks.items():
-            if self.is_task_new(name, task) and (
-                parameters.mesos_task_state in LIVE_TASK_STATES
-            ):
-                num_have += 1
-
-        for task in scheduledTasks:
-            if task["name"] == name:
-                num_have += 1
-
-        return num_have < self.service_config.get_desired_instances()
-
-    def get_new_tasks(self, name, tasks_with_params: Dict[str, MesosTaskParameters]):
-        return {
-            tid: params
-            for tid, params in tasks_with_params.items()
-            if (
-                self.is_task_new(name, tid)
-                and (params.mesos_task_state in LIVE_TASK_STATES)
-            )
-        }
-
-    def get_old_tasks(self, name, tasks_with_params: Dict[str, MesosTaskParameters]):
-        return {
-            tid: params
-            for tid, params in tasks_with_params.items()
-            if (
-                (not self.is_task_new(name, tid))
-                and (params.mesos_task_state in LIVE_TASK_STATES)
-            )
-        }
-
-    def is_task_new(self, name, tid):
-        return tid.startswith("%s." % name)
-
-    def log_and_kill(self, driver: MesosSchedulerDriver, task_id):
-        log.critical(
-            "Task stuck launching for %ss, assuming to have failed. Killing task."
-            % self.staging_timeout
-        )
-        self.blacklist_slave(self.task_store.get_task(task_id).offer.agent_id.value)
-        self.kill_task(driver, task_id)
-
-    def tasks_and_state_for_offer(
-        self, driver: MesosSchedulerDriver, offer, state: ConstraintState
-    ) -> Tuple[List[TaskInfo], ConstraintState]:
-        """Returns collection of tasks that can fit inside an offer."""
-        tasks: List[TaskInfo] = []
-        offerCpus = 0.0
-        offerMem = 0.0
-        offerPorts: List[int] = []
-        for resource in offer.resources:
-            if resource.name == "cpus":
-                offerCpus += resource.scalar.value
-            elif resource.name == "mem":
-                offerMem += resource.scalar.value
-            elif resource.name == "ports":
-                for rg in resource.ranges.range:
-                    # I believe mesos protobuf ranges are inclusive, but range() is exclusive
-                    offerPorts += range(rg.begin, rg.end + 1)
-        remainingCpus = offerCpus
-        remainingMem = offerMem
-        remainingPorts = set(offerPorts)
-
-        base_task = self.service_config.base_task(self.system_paasta_config)
-        base_task["agent_id"]["value"] = offer["agent_id"]["value"]
-
-        task_mem = self.service_config.get_mem()
-        task_cpus = self.service_config.get_cpus()
-
-        # don't mutate existing state
-        new_constraint_state = copy.deepcopy(state)
-        total = 0
-        failed_constraints = 0
-        while self.need_more_tasks(
-            base_task["name"], self.task_store.get_all_tasks(), tasks
-        ):
-            total += 1
-
-            if not (
-                remainingCpus >= task_cpus
-                and remainingMem >= task_mem
-                and self.offer_matches_pool(offer)
-                and len(remainingPorts) >= 1
-            ):
-                break
-
-            if not (
-                check_offer_constraints(offer, self.constraints, new_constraint_state)
-            ):
-                failed_constraints += 1
-                break
-
-            task_port = random.choice(list(remainingPorts))
-
-            task = copy.deepcopy(base_task)
-            task["task_id"] = {"value": "{}.{}".format(task["name"], uuid.uuid4().hex)}
-
-            task["container"]["docker"]["port_mappings"][0]["host_port"] = task_port
-            for resource in task["resources"]:
-                if resource["name"] == "ports":
-                    resource["ranges"]["range"][0]["begin"] = task_port
-                    resource["ranges"]["range"][0]["end"] = task_port
-
-            tasks.append(task)
-
-            remainingCpus -= task_cpus
-            remainingMem -= task_mem
-            remainingPorts -= {task_port}
-
-            update_constraint_state(offer, self.constraints, new_constraint_state)
-
-        # raise constraint error but only if no other tasks fit/fail the offer
-        if total > 0 and failed_constraints == total:
-            raise ConstraintFailAllTasksError
-
-        return tasks, new_constraint_state
-
-    def offer_matches_pool(self, offer):
-        for attribute in offer.attributes:
-            if attribute.name == "pool":
-                return attribute.text.value == self.service_config.get_pool()
-        # we didn't find a pool attribute on this slave, so assume it's not in our pool.
-        return False
-
-    def within_reconcile_backoff(self):
-        return time.time() - self.reconcile_backoff < self.reconcile_start_time
-
-    def periodic(self, driver: MesosSchedulerDriver):
-        if self.frozen:
-            return
-
-        self.periodic_was_called = True  # Used for testing.
-        if not self.within_reconcile_backoff():
-            driver.reviveOffers()
-
-        self.load_config()
-        self.kill_tasks_if_necessary(driver)
-        self.check_blacklisted_slaves_for_timeout()
-
-    def statusUpdate(self, driver: MesosSchedulerDriver, update: Dict):
-        if self.frozen:
-            return
-
-        # update tasks
-        task_id = update["task_id"]["value"]
-        self.log("Task {} is in state {}".format(task_id, update["state"]))
-
-        task_params = self.task_store.update_task(
-            task_id, mesos_task_state=update["state"]
-        )
-
-        if task_params.mesos_task_state not in LIVE_TASK_STATES:
-            with self.constraint_state_lock:
-                update_constraint_state(
-                    task_params.offer, self.constraints, self.constraint_state, step=-1
-                )
-
-        driver.acknowledgeStatusUpdate(update)
-        self.kill_tasks_if_necessary(driver)
-
-    def make_healthiness_sorter(
-        self, base_task_name: str, all_tasks_with_params: Dict[str, MesosTaskParameters]
-    ):
-        def healthiness_score(task_id):
-            """Return a tuple that can be used as a key for sorting, that expresses our desire to keep this task around.
-            Higher values (things that sort later) are more desirable."""
-            params = all_tasks_with_params[task_id]
-
-            state_score = {
-                TASK_KILLING: 0,
-                TASK_FINISHED: 0,
-                TASK_FAILED: 0,
-                TASK_KILLED: 0,
-                TASK_LOST: 0,
-                TASK_ERROR: 0,
-                TASK_STAGING: 1,
-                TASK_STARTING: 2,
-                TASK_RUNNING: 3,
-            }[params.mesos_task_state]
-
-            # unhealthy tasks < healthy
-            # staging < starting < running
-            # old < new
-            return (
-                params.is_healthy,
-                state_score,
-                self.is_task_new(base_task_name, task_id),
-            )
-
-        return healthiness_score
-
-    def kill_tasks_if_necessary(self, driver: MesosSchedulerDriver):
-        base_task = self.service_config.base_task(self.system_paasta_config)
-
-        all_tasks_with_params = self.task_store.get_all_tasks()
-
-        new_tasks_with_params = self.get_new_tasks(
-            base_task["name"], all_tasks_with_params
-        )
-        happy_new_tasks_with_params = self.get_happy_tasks(new_tasks_with_params)
-
-        desired_instances = self.service_config.get_desired_instances()
-        # this puts the most-desired tasks first. I would have left them in order of bad->good and used
-        # new_tasks_by_desirability[:-desired_instances] instead, but list[:-0] is an empty list, rather than the full
-        # list.
-        new_task_ids_by_desirability = sorted(
-            list(new_tasks_with_params.keys()),
-            key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params),
-            reverse=True,
-        )
-        new_task_ids_to_kill = new_task_ids_by_desirability[desired_instances:]
-
-        old_tasks_with_params = self.get_old_tasks(
-            base_task["name"], all_tasks_with_params
-        )
-        old_draining_tasks_with_params = self.get_draining_tasks(old_tasks_with_params)
-        old_non_draining_tasks = sorted(
-            list(
-                set(old_tasks_with_params.keys()) - set(old_draining_tasks_with_params)
-            ),
-            key=self.make_healthiness_sorter(base_task["name"], all_tasks_with_params),
-            reverse=True,
-        )
-
-        actions = bounce_lib.crossover_bounce(
-            new_config={"instances": desired_instances},
-            new_app_running=True,
-            happy_new_tasks=happy_new_tasks_with_params.keys(),
-            old_non_draining_tasks=new_task_ids_to_kill + old_non_draining_tasks,
-        )
-
-        with a_sync.idle_event_loop():
-            futures = []
-            for task in set(new_tasks_with_params.keys()) - set(
-                actions["tasks_to_drain"]
-            ):
-                futures.append(asyncio.ensure_future(self.undrain_task(task)))
-            for task in actions["tasks_to_drain"]:
-                futures.append(asyncio.ensure_future(self.drain_task(task)))
-
-            if futures:
-                a_sync.block(asyncio.wait, futures)
-
-            async def kill_if_safe_to_kill(task_id: str):
-                if await self.drain_method.is_safe_to_kill(
-                    self.make_drain_task(task_id)
-                ):
-                    self.kill_task(driver, task_id)
-
-            futures = []
-            for task, parameters in all_tasks_with_params.items():
-                if (
-                    parameters.is_draining
-                    and parameters.mesos_task_state in LIVE_TASK_STATES
-                ):
-                    futures.append(asyncio.ensure_future(kill_if_safe_to_kill(task)))
-            if futures:
-                a_sync.block(asyncio.wait, futures)
-
-    def get_happy_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]):
-        """Filter a dictionary of tasks->params to those that are running and not draining."""
-        happy_tasks = {}
-        for tid, params in tasks_with_params.items():
-            if params.mesos_task_state == TASK_RUNNING and not params.is_draining:
-                happy_tasks[tid] = params
-        return happy_tasks
-
-    def get_draining_tasks(self, tasks_with_params: Dict[str, MesosTaskParameters]):
-        """Filter a dictionary of tasks->params to those that are draining."""
-        return {t: p for t, p in tasks_with_params.items() if p.is_draining}
-
-    def make_drain_task(self, task_id: str):
-        """Return a DrainTask object, which is suitable for passing to drain methods."""
-
-        ports = []
-
-        params = self.task_store.get_task(task_id)
-        for resource in params.resources:
-            if resource["name"] == "ports":
-                for rg in resource["ranges"]["range"]:
-                    for port in range(rg["begin"], rg["end"] + 1):
-                        ports.append(port)
-
-        return DrainTask(
-            id=task_id, host=params.offer["agent_id"]["value"], ports=ports
-        )
-
-    async def undrain_task(self, task_id: str):
-        self.log("Undraining task %s" % task_id)
-        await self.drain_method.stop_draining(self.make_drain_task(task_id))
-        self.task_store.update_task(task_id, is_draining=False)
-
-    async def drain_task(self, task_id: str):
-        self.log("Draining task %s" % task_id)
-        await self.drain_method.drain(self.make_drain_task(task_id))
-        self.task_store.update_task(task_id, is_draining=True)
-
-    def kill_task(self, driver: MesosSchedulerDriver, task_id: str):
-        self.log("Killing task %s" % task_id)
-        driver.killTask({"value": task_id})
-        self.task_store.update_task(task_id, mesos_task_state=TASK_KILLING)
-
-    def group_tasks_by_version(
-        self, task_ids: Collection[str]
-    ) -> Mapping[str, Collection[str]]:
-        d: Dict[str, List[str]] = {}
-        for task_id in task_ids:
-            version = task_id.rsplit(".", 1)[0]
-            d.setdefault(version, []).append(task_id)
-        return d
-
-    def load_config(self) -> None:
-        service_configuration_lib._yaml_cache = {}
-        self.service_config = load_paasta_native_job_config(
-            service=self.service_name,
-            instance=self.instance_name,
-            instance_type=self.instance_type,
-            cluster=self.cluster,
-            soa_dir=self.soa_dir,
-            config_overrides=self.service_config_overrides,
-        )
-        self.recreate_drain_method()
-        self.reload_constraints()
-        self.validate_config()
-
-    def validate_config(self) -> None:
-        pass
-
-    def recreate_drain_method(self) -> None:
-        """Re-instantiate self.drain_method. Should be called after self.service_config changes."""
-        self.drain_method = drain_lib.get_drain_method(
-            name=self.service_config.get_drain_method(
-                self.service_config.service_namespace_config
-            ),
-            service=self.service_name,
-            instance=self.instance_name,
-            registrations=self.service_config.get_registrations(),
-            **self.service_config.get_drain_method_params(
-                self.service_config.service_namespace_config
-            ),
-        )
-
-    def reload_constraints(self):
-        self.constraints = self.service_config.get_constraints() or []
-
-    def blacklist_slave(self, agent_id: str):
-        log.debug("Blacklisting slave: %s" % agent_id)
-        self.blacklisted_slaves.setdefault(agent_id, time.time())
-
-    def unblacklist_slave(self, agent_id: str):
-        if agent_id not in self.blacklisted_slaves:
-            return
-
-        log.debug("Unblacklisting slave: %s" % agent_id)
-        with self.blacklisted_slaves_lock:
-            del self.blacklisted_slaves[agent_id]
-
-    def check_blacklisted_slaves_for_timeout(self):
-        for agent_id, blacklist_time in self.blacklisted_slaves.items():
-            if (blacklist_time + self.blacklist_timeout) < time.time():
-                self.unblacklist_slave(agent_id)
-
-
-class DrainTask:
-    def __init__(self, id, host, ports):
-        self.id = id
-        self.host = host
-        self.ports = ports
-
-
-def find_existing_id_if_exists_or_gen_new(name):
-    for framework in mesos_tools.get_all_frameworks(active_only=True):
-        if framework.name == name:
-            return framework.id
-    else:
-        return uuid.uuid4().hex
-
-
-def create_driver(framework_name, scheduler, system_paasta_config, implicit_acks=False):
-    master_uri = "{}:{}".format(
-        mesos_tools.get_mesos_leader(), mesos_tools.MESOS_MASTER_PORT
-    )
-
-    framework = {
-        "user": getpass.getuser(),
-        "name": framework_name,
-        "failover_timeout": 604800,
-        "id": {"value": find_existing_id_if_exists_or_gen_new(framework_name)},
-        "checkpoint": True,
-        "principal": system_paasta_config.get_paasta_native_config()["principal"],
-    }
-
-    driver = MesosSchedulerDriver(
-        sched=scheduler,
-        framework=framework,
-        master_uri=master_uri,
-        use_addict=True,
-        implicit_acknowledgements=implicit_acks,
-        principal=system_paasta_config.get_paasta_native_config()["principal"],
-        secret=system_paasta_config.get_paasta_native_config()["secret"],
-    )
-    return driver
-
-
-def get_paasta_native_jobs_for_cluster(cluster=None, soa_dir=DEFAULT_SOA_DIR):
-    """A paasta_native-specific wrapper around utils.get_services_for_cluster
-
-    :param cluster: The cluster to read the configuration for
-    :param soa_dir: The SOA config directory to read from
-    :returns: A list of tuples of (service, job_name)"""
-    return get_services_for_cluster(cluster, "paasta_native", soa_dir)
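
For context on the deletion above: NativeScheduler.kill_tasks_if_necessary ranks task IDs with the healthiness sorter, sorts them in reverse so the most desirable tasks come first, and treats everything past desired_instances as a kill candidate. A minimal standalone sketch of that ordering, using hypothetical task data rather than real MesosTaskParameters objects:

# Sketch of the (is_healthy, state_score, is_new) ordering from the removed
# make_healthiness_sorter. The task dict below is hypothetical example data.
TASK_STAGING, TASK_STARTING, TASK_RUNNING = "TASK_STAGING", "TASK_STARTING", "TASK_RUNNING"
STATE_SCORE = {TASK_STAGING: 1, TASK_STARTING: 2, TASK_RUNNING: 3}  # terminal states score 0

tasks = {
    "svc.new1": {"is_healthy": True, "state": TASK_RUNNING, "is_new": True},
    "svc.old1": {"is_healthy": True, "state": TASK_RUNNING, "is_new": False},
    "svc.new2": {"is_healthy": False, "state": TASK_STAGING, "is_new": True},
}

def healthiness_score(task_id):
    # Higher tuples sort later: unhealthy < healthy, staging < starting < running, old < new.
    params = tasks[task_id]
    return (params["is_healthy"], STATE_SCORE.get(params["state"], 0), params["is_new"])

desired_instances = 1
by_desirability = sorted(tasks, key=healthiness_score, reverse=True)
print(by_desirability)                      # ['svc.new1', 'svc.old1', 'svc.new2']
print(by_desirability[desired_instances:])  # kill candidates: ['svc.old1', 'svc.new2']

Sorting in reverse and slicing [desired_instances:], rather than slicing [:-desired_instances] on an ascending sort, sidesteps the list[:-0] pitfall called out in the deleted comment.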