mongo-charms-single-kernel 1.8.6-py3-none-any.whl → 1.8.8-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry, and is provided for informational purposes only.

This release of mongo-charms-single-kernel has been flagged as potentially problematic.

Files changed (47)
  1. {mongo_charms_single_kernel-1.8.6.dist-info → mongo_charms_single_kernel-1.8.8.dist-info}/METADATA +2 -1
  2. {mongo_charms_single_kernel-1.8.6.dist-info → mongo_charms_single_kernel-1.8.8.dist-info}/RECORD +41 -40
  3. single_kernel_mongo/abstract_charm.py +8 -0
  4. single_kernel_mongo/config/literals.py +2 -23
  5. single_kernel_mongo/config/models.py +12 -0
  6. single_kernel_mongo/config/relations.py +0 -1
  7. single_kernel_mongo/config/statuses.py +10 -57
  8. single_kernel_mongo/core/abstract_upgrades_v3.py +149 -0
  9. single_kernel_mongo/core/k8s_workload.py +2 -2
  10. single_kernel_mongo/core/kubernetes_upgrades_v3.py +17 -0
  11. single_kernel_mongo/core/machine_upgrades_v3.py +54 -0
  12. single_kernel_mongo/core/operator.py +86 -5
  13. single_kernel_mongo/core/version_checker.py +7 -6
  14. single_kernel_mongo/core/vm_workload.py +30 -13
  15. single_kernel_mongo/core/workload.py +17 -19
  16. single_kernel_mongo/events/backups.py +3 -3
  17. single_kernel_mongo/events/cluster.py +1 -1
  18. single_kernel_mongo/events/database.py +1 -1
  19. single_kernel_mongo/events/lifecycle.py +5 -4
  20. single_kernel_mongo/events/tls.py +7 -4
  21. single_kernel_mongo/exceptions.py +4 -24
  22. single_kernel_mongo/lib/charms/operator_libs_linux/v1/systemd.py +288 -0
  23. single_kernel_mongo/managers/cluster.py +8 -8
  24. single_kernel_mongo/managers/config.py +5 -3
  25. single_kernel_mongo/managers/ldap.py +2 -1
  26. single_kernel_mongo/managers/mongo.py +48 -9
  27. single_kernel_mongo/managers/mongodb_operator.py +199 -96
  28. single_kernel_mongo/managers/mongos_operator.py +97 -35
  29. single_kernel_mongo/managers/sharding.py +4 -4
  30. single_kernel_mongo/managers/tls.py +54 -27
  31. single_kernel_mongo/managers/upgrade_v3.py +452 -0
  32. single_kernel_mongo/managers/upgrade_v3_status.py +133 -0
  33. single_kernel_mongo/state/app_peer_state.py +12 -2
  34. single_kernel_mongo/state/charm_state.py +31 -141
  35. single_kernel_mongo/state/config_server_state.py +0 -33
  36. single_kernel_mongo/state/unit_peer_state.py +10 -0
  37. single_kernel_mongo/templates/enable-transparent-huge-pages.service.j2 +14 -0
  38. single_kernel_mongo/utils/helpers.py +0 -6
  39. single_kernel_mongo/utils/mongo_config.py +32 -8
  40. single_kernel_mongo/core/abstract_upgrades.py +0 -890
  41. single_kernel_mongo/core/kubernetes_upgrades.py +0 -194
  42. single_kernel_mongo/core/machine_upgrades.py +0 -188
  43. single_kernel_mongo/events/upgrades.py +0 -157
  44. single_kernel_mongo/managers/upgrade.py +0 -334
  45. single_kernel_mongo/state/upgrade_state.py +0 -134
  46. {mongo_charms_single_kernel-1.8.6.dist-info → mongo_charms_single_kernel-1.8.8.dist-info}/WHEEL +0 -0
  47. {mongo_charms_single_kernel-1.8.6.dist-info → mongo_charms_single_kernel-1.8.8.dist-info}/licenses/LICENSE +0 -0
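Most of this release replaces the previous in-place upgrade framework (abstract_upgrades.py, kubernetes_upgrades.py, machine_upgrades.py, events/upgrades.py, managers/upgrade.py, state/upgrade_state.py) with new *_v3 modules. The deleted abstract_upgrades.py hunk below includes the version-compatibility gate that guarded refreshes; the following is a minimal standalone sketch of that comparison, using the same poetry-core import as the removed code (the version strings and the is_compatible helper name here are made-up examples, not part of the package):

# Sketch of the removed AbstractUpgrade.is_compatible logic (see the hunk below).
import poetry.core.constraints.version as poetry_version  # same import as the removed code

def is_compatible(previous: dict[str, str], current: dict[str, str]) -> bool:
    # Reject downgrades and major-version jumps for both the charm and the workload.
    for component in ("charm", "workload"):
        # The removed code strips the git hash ("+<hash>") before comparing.
        old = poetry_version.Version.parse(previous[component].split("+")[0])
        new = poetry_version.Version.parse(current[component].split("+")[0])
        if old > new or old.major != new.major:
            return False
    return True

# Hypothetical peer-databag values vs. the refreshed revision:
print(is_compatible({"charm": "1.7.3+abc123", "workload": "6.0.6"},
                    {"charm": "1.8.8", "workload": "6.0.9"}))  # True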
single_kernel_mongo/core/abstract_upgrades.py (deleted)
@@ -1,890 +0,0 @@
1
- #!/usr/bin/env python3
2
- # Copyright 2024 Canonical Ltd.
3
- # See LICENSE file for licensing details.
4
-
5
- """The substrate agnostic Upgrades manager.
6
-
7
- In this class, we manage upgrades and their lifecycle.
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import copy
13
- import logging
14
- import secrets
15
- import string
16
- from abc import ABC, abstractmethod
17
- from enum import Enum
18
- from typing import TYPE_CHECKING, Generic, TypeVar
19
-
20
- import poetry.core.constraints.version as poetry_version
21
- from data_platform_helpers.advanced_statuses.models import StatusObject, StatusObjectList
22
- from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol
23
- from data_platform_helpers.advanced_statuses.types import Scope
24
- from ops import Object
25
- from pymongo.errors import OperationFailure, PyMongoError, ServerSelectionTimeoutError
26
- from tenacity import RetryError, Retrying, retry, stop_after_attempt, wait_fixed
27
-
28
- from single_kernel_mongo.config.literals import (
29
- FEATURE_VERSION,
30
- SNAP,
31
- CharmKind,
32
- Substrates,
33
- UnitState,
34
- )
35
- from single_kernel_mongo.config.relations import RelationNames
36
- from single_kernel_mongo.config.statuses import UpgradeStatuses
37
- from single_kernel_mongo.core.operator import MainWorkloadType, OperatorProtocol
38
- from single_kernel_mongo.core.structured_config import MongoDBRoles
39
- from single_kernel_mongo.exceptions import (
40
- BalancerStillRunningError,
41
- ClusterNotHealthyError,
42
- FailedToElectNewPrimaryError,
43
- FailedToMovePrimaryError,
44
- PeerRelationNotReadyError,
45
- PrecheckFailedError,
46
- )
47
- from single_kernel_mongo.state.charm_state import CharmState
48
- from single_kernel_mongo.utils.helpers import mongodb_only
49
- from single_kernel_mongo.utils.mongo_config import MongoConfiguration
50
- from single_kernel_mongo.utils.mongo_connection import MongoConnection
51
- from single_kernel_mongo.utils.mongodb_users import OperatorUser
52
-
53
- if TYPE_CHECKING:
54
- from single_kernel_mongo.core.kubernetes_upgrades import KubernetesUpgrade
55
- from single_kernel_mongo.core.machine_upgrades import MachineUpgrade
56
- from single_kernel_mongo.managers.mongodb_operator import MongoDBOperator
57
- from single_kernel_mongo.managers.mongos_operator import MongosOperator
58
-
59
- T = TypeVar("T", covariant=True, bound=OperatorProtocol)
60
-
61
- logger = logging.getLogger(__name__)
62
-
63
- WRITE_KEY = "write_value"
64
- SHARD_NAME_INDEX = "_id"
65
-
66
-
67
- class UpgradeActions(str, Enum):
68
- """All upgrade actions."""
69
-
70
- RESUME_ACTION_NAME = "resume-refresh"
71
- PRECHECK_ACTION_NAME = "pre-refresh-check"
72
- FORCE_REFRESH_START = "force-refresh-start"
73
-
74
-
75
- # BEGIN: Useful classes
76
- class AbstractUpgrade(ABC):
77
- """In-place upgrades abstract class (typing).
78
-
79
- Based off specification: DA058 - In-Place Upgrades - Kubernetes v2
80
- (https://docs.google.com/document/d/1tLjknwHudjcHs42nzPVBNkHs98XxAOT2BXGGpP7NyEU/)
81
- """
82
-
83
- def __init__(
84
- self,
85
- dependent: OperatorProtocol,
86
- workload: MainWorkloadType,
87
- state: CharmState,
88
- substrate: Substrates,
89
- ) -> None:
90
- self.charm = dependent.charm
91
- self.dependent = dependent
92
- self.workload = workload
93
- self.state = state
94
- self.substrate = substrate
95
- self.relation_name = RelationNames.UPGRADE_VERSION.value
96
-
97
- if not self.state.upgrade_relation:
98
- raise PeerRelationNotReadyError
99
-
100
- self.app_name = self.charm.app.name
101
- self.unit_name = self.charm.unit.name
102
- self._current_versions = {
103
- "charm": self.workload.get_charm_revision(),
104
- "workload": self.workload.get_version(),
105
- }
106
-
107
- @property
108
- def unit_state(self) -> UnitState | None:
109
- """Unit upgrade state."""
110
- return self.state.unit_upgrade_peer_data.unit_state
111
-
112
- @unit_state.setter
113
- def unit_state(self, value: UnitState) -> None:
114
- self.state.unit_upgrade_peer_data.unit_state = value
115
-
116
- @property
117
- def is_compatible(self) -> bool:
118
- """Whether upgrade is supported from previous versions."""
119
- if not (previous_version_strs := self.state.app_upgrade_peer_data.versions):
120
- logger.debug("`versions` missing from peer relation")
121
- return False
122
-
123
- # TODO charm versioning: remove `.split("+")` (which removes git hash before comparing)
124
- previous_version_strs["charm"] = previous_version_strs["charm"].split("+")[0]
125
- previous_versions: dict[str, poetry_version.Version] = {
126
- key: poetry_version.Version.parse(value) for key, value in previous_version_strs.items()
127
- }
128
- current_version_strs = copy.copy(self._current_versions)
129
- current_version_strs["charm"] = current_version_strs["charm"].split("+")[0]
130
- current_versions = {
131
- key: poetry_version.Version.parse(value) for key, value in current_version_strs.items()
132
- }
133
- try:
134
- # TODO Future PR: change this > sign to support downgrades
135
- if (
136
- previous_versions["charm"] > current_versions["charm"]
137
- or previous_versions["charm"].major != current_versions["charm"].major
138
- ):
139
- logger.debug(
140
- f'{previous_versions["charm"]=} incompatible with {current_versions["charm"]=}'
141
- )
142
- return False
143
- if (
144
- previous_versions["workload"] > current_versions["workload"]
145
- or previous_versions["workload"].major != current_versions["workload"].major
146
- ):
147
- logger.debug(
148
- f'{previous_versions["workload"]=} incompatible with {current_versions["workload"]=}'
149
- )
150
- return False
151
- logger.debug(
152
- f"Versions before refresh compatible with versions after refresh {previous_version_strs=} {self._current_versions=}"
153
- )
154
- return True
155
- except KeyError as exception:
156
- logger.debug(f"Version missing from {previous_versions=}", exc_info=exception)
157
- return False
158
-
159
- @abstractmethod
160
- def _get_unit_healthy_status(self) -> StatusObject:
161
- """Status shown during upgrade if unit is healthy."""
162
- raise NotImplementedError()
163
-
164
- def get_upgrade_unit_status(self) -> StatusObject | None:
165
- """Unit upgrade status."""
166
- if self.state.upgrade_in_progress:
167
- if not self.is_compatible:
168
- return UpgradeStatuses.INCOMPATIBLE_UPGRADE.value
169
- return self._get_unit_healthy_status()
170
- return None
171
-
172
- @property
173
- def app_status(self) -> StatusObject | None:
174
- """App upgrade status."""
175
- if not self.state.upgrade_in_progress:
176
- return None
177
- if self.dependent.name == CharmKind.MONGOD and not self.upgrade_resumed:
178
- # User confirmation needed to resume upgrade (i.e. upgrade second unit)
179
- # Statuses over 120 characters are truncated in `juju status` as of juju 3.1.6 and
180
- # 2.9.45
181
- resume_string = ""
182
- if len(self.state.units_upgrade_peer_data) > 1:
183
- resume_string = f"Verify highest unit is healthy & run `{UpgradeActions.RESUME_ACTION_NAME.value}` action. "
184
- return UpgradeStatuses.refreshing_needs_resume(resume_string)
185
- return UpgradeStatuses.REFRESH_IN_PROGRESS.value
186
-
187
- def set_versions_in_app_databag(self) -> None:
188
- """Save current versions in app databag.
189
-
190
- Used after next upgrade to check compatibility (i.e. whether that upgrade should be
191
- allowed).
192
- """
193
- assert not self.state.upgrade_in_progress
194
- logger.debug(f"Setting {self._current_versions=} in upgrade peer relation app databag")
195
- self.state.app_upgrade_peer_data.versions = self._current_versions
196
- logger.debug(f"Set {self._current_versions=} in upgrade peer relation app databag")
197
-
198
- @property
199
- @abstractmethod
200
- def upgrade_resumed(self) -> bool:
201
- """Whether user has resumed upgrade with Juju action."""
202
- raise NotImplementedError()
203
-
204
- @abstractmethod
205
- def reconcile_partition(self, *, from_event: bool = False, force: bool = False) -> str | None:
206
- """If ready, allow next unit to upgrade."""
207
- raise NotImplementedError()
208
-
209
- def pre_upgrade_check(self) -> None:
210
- """Check if this app is ready to upgrade.
211
-
212
- Runs before any units are upgraded
213
-
214
- Does *not* run during rollback
215
-
216
- On machines, this runs before any units are upgraded (after `juju refresh`)
217
- On machines & Kubernetes, this also runs during pre-upgrade-check action
218
-
219
- Can run on leader or non-leader unit
220
-
221
- Raises:
222
- PrecheckFailedError: App is not ready to upgrade
223
-
224
- TODO Kubernetes: Run (some) checks after `juju refresh` (in case user forgets to run
225
- pre-upgrade-check action). Note: 1 unit will upgrade before we can run checks (checks may
226
- need to be modified).
227
- See https://chat.canonical.com/canonical/pl/cmf6uhm1rp8b7k8gkjkdsj4mya
228
- """
229
- logger.debug("Running pre-refresh checks")
230
-
231
- if self.dependent.name == CharmKind.MONGOS:
232
- if not self.state.db_initialised:
233
- return
234
- if not self.dependent.upgrade_manager.is_mongos_able_to_read_write():
235
- raise PrecheckFailedError("mongos is not able to read/write")
236
- return
237
-
238
- # TODO: if shard is getting upgraded but BOTH have same revision, then fail
239
- # https://warthogs.atlassian.net/browse/DPE-6397
240
- try:
241
- self.dependent.upgrade_manager.wait_for_cluster_healthy()
242
- except RetryError:
243
- logger.error("Cluster is not healthy")
244
- raise PrecheckFailedError("Cluster is not healthy")
245
-
246
- # On VM charms we can choose the order to upgrade, but not on K8s. In order to keep the
247
- # two charms in sync we decided to have the VM charm have the same upgrade order as the K8s
248
- # charm (i.e. highest to lowest.) Hence, we move the primary to the last unit to upgrade.
249
- # This prevents the primary from jumping around from unit to unit during the upgrade
250
- # procedure.
251
- try:
252
- self.dependent.upgrade_manager.move_primary_to_last_upgrade_unit()
253
- except FailedToMovePrimaryError:
254
- logger.error("Cluster failed to move primary before re-election.")
255
- raise PrecheckFailedError("Primary switchover failed")
256
-
257
- if not self.dependent.upgrade_manager.is_cluster_able_to_read_write():
258
- logger.error("Cluster cannot read/write to replicas")
259
- raise PrecheckFailedError("Cluster is not healthy")
260
-
261
- if self.state.is_role(MongoDBRoles.CONFIG_SERVER):
262
- if not self.dependent.upgrade_manager.are_pre_upgrade_operations_config_server_successful():
263
- raise PrecheckFailedError("Pre-refresh operations on config-server failed.")
264
-
265
- self.add_status_data_for_legacy_upgrades()
266
-
267
- def add_status_data_for_legacy_upgrades(self):
268
- """Add dummy data for legacy upgrades.
269
-
270
- Upgrades supported on revision 212 and lower require status information from shards;
271
- in later revisions this information was determined to be unnecessary and obsolete.
272
- Although later revisions do *not* need this information, we still populate it with
273
- ActiveStatus to facilitate upgrades from those earlier revisions.
274
- """
275
- if not self.state.is_role(MongoDBRoles.SHARD):
276
- return
277
-
278
- if not self.state.shard_relation:
279
- return
280
-
281
- self.state.unit_shard_state.status_ready_for_upgrade = True
282
-
283
-
284
- # END: Useful classes
285
-
286
-
287
- class GenericMongoDBUpgradeManager(ManagerStatusProtocol, Generic[T], Object, ABC):
288
- """Substrate-agnostic, abstract handler for upgrade events."""
289
-
290
- def __init__(
291
- self,
292
- dependent: T,
293
- upgrade_backend: type[KubernetesUpgrade | MachineUpgrade],
294
- *args,
295
- **kwargs,
296
- ):
297
- self.name = "upgrade"
298
- super(Generic, self).__init__(dependent, *args, **kwargs) # type: ignore[arg-type]
299
- self.dependent = dependent
300
- self.substrate = self.dependent.substrate
301
- self.upgrade_backend = upgrade_backend
302
- self.charm = dependent.charm
303
- self.state = dependent.state
304
-
305
- @property
306
- def _upgrade(self) -> KubernetesUpgrade | MachineUpgrade | None:
307
- """Gets the correct upgrade backend if it exists."""
308
- try:
309
- return self.upgrade_backend(
310
- self.dependent,
311
- self.dependent.workload,
312
- self.state,
313
- self.dependent.substrate,
314
- )
315
- except PeerRelationNotReadyError:
316
- return None
317
-
318
- def _set_upgrade_status(self) -> None:
319
- """Sets the upgrade status in the unit and app status."""
320
- assert self._upgrade
321
- if self.charm.unit.is_leader():
322
- status_object = self._upgrade.app_status or UpgradeStatuses.ACTIVE_IDLE.value
323
- self.state.statuses.add(status_object, scope="app", component=self.name)
324
- # Set/clear upgrade unit status if no other unit status - upgrade status for units should
325
- # have the lowest priority.
326
- statuses: StatusObjectList = self.state.statuses.get(scope="unit", component=self.name)
327
- if (
328
- not statuses.root
329
- or UpgradeStatuses.WAITING_POST_UPGRADE_STATUS in statuses
330
- or statuses[0] == UpgradeStatuses.ACTIVE_IDLE # Works because the list is sorted
331
- or any("is not up-to date with" in status.message for status in statuses)
332
- ):
333
- self.state.statuses.set(
334
- self._upgrade.get_upgrade_unit_status() or UpgradeStatuses.ACTIVE_IDLE.value,
335
- scope="unit",
336
- component=self.name,
337
- )
338
-
339
- def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]:
340
- """Gets statuses for upgrades statelessly."""
341
- if not self._upgrade:
342
- return []
343
-
344
- if not recompute:
345
- return self.state.statuses.get(scope=scope, component=self.name).root
346
-
347
- match scope:
348
- case "unit":
349
- return [
350
- self._upgrade.get_upgrade_unit_status() or UpgradeStatuses.ACTIVE_IDLE.value
351
- ]
352
- case "app":
353
- return [self._upgrade.app_status or UpgradeStatuses.ACTIVE_IDLE.value]
354
- case _:
355
- raise ValueError(f"Invalid scope {scope}")
356
-
357
- def store_initial_revisions(self) -> None:
358
- """Handle peer relation created event."""
359
- assert self._upgrade
360
- if self.substrate == Substrates.VM:
361
- self.state.unit_workload_container_version = SNAP.revision
362
- logger.debug(f"Saved {SNAP.revision=} in unit databag after first install")
363
- if self.dependent.name == CharmKind.MONGOD:
364
- self.state.unit_upgrade_peer_data.current_revision = (
365
- self.dependent.cross_app_version_checker.version # type: ignore
366
- )
367
- if self.charm.unit.is_leader():
368
- if not self.state.upgrade_in_progress:
369
- # Save versions on initial start
370
- self._upgrade.set_versions_in_app_databag()
371
-
372
- @abstractmethod
373
- def run_post_app_upgrade_task(self) -> None:
374
- """Runs the post upgrade check to verify that the deployment is healthy."""
375
- raise NotImplementedError()
376
-
377
- def run_post_cluster_upgrade_task(self) -> None:
378
- """Runs the post upgrade check to verify that the deployment is healthy."""
379
- raise NotImplementedError()
380
-
381
- @abstractmethod
382
- def run_post_upgrade_checks(self, finished_whole_cluster: bool = False) -> None:
383
- """Runs post-upgrade checks for after an application upgrade."""
384
- raise NotImplementedError()
385
-
386
- def _reconcile_upgrade(self, during_upgrade: bool = False) -> None:
387
- """Handle upgrade events."""
388
- if not self._upgrade:
389
- logger.debug("Peer relation not available")
390
- return
391
- if not self.state.app_upgrade_peer_data.versions:
392
- logger.debug("Peer relation not ready")
393
- return
394
- if self.charm.unit.is_leader() and not self.state.upgrade_in_progress:
395
- # Run before checking `self._upgrade.is_compatible` in case incompatible upgrade was
396
- # forced & completed on all units.
397
- self._upgrade.set_versions_in_app_databag()
398
-
399
- if self.substrate == Substrates.VM and not self._upgrade.is_compatible:
400
- self._set_upgrade_status()
401
- return
402
-
403
- if self._upgrade.unit_state is UnitState.OUTDATED:
404
- self._on_vm_outdated() # type: ignore
405
- return
406
-
407
- if self._upgrade.unit_state is UnitState.RESTARTING: # Kubernetes only
408
- if not self._upgrade.is_compatible:
409
- logger.info(
410
- f"Refresh incompatible. If you accept potential *data loss* and *downtime*, you can continue with `{UpgradeActions.RESUME_ACTION_NAME.value} force=true`"
411
- )
412
- self.state.statuses.add(
413
- UpgradeStatuses.INCOMPATIBLE_UPGRADE.value,
414
- scope="unit",
415
- component=self.name,
416
- )
417
- return
418
-
419
- if self.dependent.substrate == Substrates.K8S:
420
- self._on_kubernetes_always(during_upgrade) # type: ignore
421
- self._set_upgrade_status()
422
-
423
- def _on_kubernetes_always(self, during_upgrade: bool) -> None:
424
- """Always run this as part of the Kubernetes _reconcile_upgrade call."""
425
- if not self._upgrade:
426
- logger.debug("Peer relation not available")
427
- return
428
- if (
429
- not during_upgrade
430
- and self.state.db_initialised
431
- and self.dependent.mongo_manager.mongod_ready()
432
- ):
433
- self._upgrade.unit_state = UnitState.HEALTHY
434
- if self.charm.unit.is_leader():
435
- self._upgrade.reconcile_partition()
436
- self._set_upgrade_status()
437
-
438
- def _on_vm_outdated(self) -> None:
439
- """This is run on VMs if the current unit is outdated."""
440
- try:
441
- # This is the case only for VM which is OK
442
- authorized = self._upgrade.authorized # type: ignore
443
- except PrecheckFailedError as exception:
444
- self._set_upgrade_status()
445
- self.state.statuses.add(exception.status, scope="unit", component=self.name)
446
- logger.debug(f"Set unit status to {exception.status}")
447
- logger.error(exception.status.message)
448
- return
449
- if authorized:
450
- self._set_upgrade_status()
451
- # We can type ignore because this branch is VM only
452
- self._upgrade.upgrade_unit(dependent=self.dependent) # type: ignore
453
- # Refresh status after upgrade
454
- else:
455
- logger.debug("Waiting to upgrade")
456
- self._set_upgrade_status()
457
-
458
- # BEGIN: Helpers
459
- @mongodb_only
460
- def move_primary_to_last_upgrade_unit(self) -> None:
461
- """Moves the primary to the last unit that gets upgraded (the unit with the lowest id).
462
-
463
- Raises FailedToMovePrimaryError
464
- """
465
- # no need to move primary in the scenario of one unit
466
- if len(self.state.units_upgrade_peer_data) < 2:
467
- return
468
-
469
- with MongoConnection(self.state.mongo_config) as mongod:
470
- unit_with_lowest_id = self.state.units_upgrade_peer_data[-1].unit
471
- unit_host = self.state.peer_unit_data(unit_with_lowest_id).internal_address
472
- if mongod.primary() == unit_host:
473
- logger.debug(
474
- "Not moving Primary before refresh, primary is already on the last unit to refresh."
475
- )
476
- return
477
-
478
- logger.debug("Moving primary to unit: %s", unit_with_lowest_id)
479
- mongod.move_primary(new_primary_ip=unit_host)
480
-
481
- @mongodb_only
482
- def wait_for_cluster_healthy(
483
- self: GenericMongoDBUpgradeManager[MongoDBOperator],
484
- ) -> None:
485
- """Waits until the cluster is healthy after upgrading.
486
-
487
- After a unit restarts it can take some time for the cluster to settle.
488
-
489
- Raises:
490
- ClusterNotHealthyError.
491
- """
492
- for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(1)):
493
- with attempt:
494
- if not self.is_cluster_healthy():
495
- raise ClusterNotHealthyError()
496
-
497
- @mongodb_only
498
- def is_cluster_healthy(self: GenericMongoDBUpgradeManager[MongoDBOperator]) -> bool:
499
- """Returns True if all nodes in the cluster/replica set are healthy."""
500
- # TODO: check mongos
501
- if not self.dependent.mongo_manager.mongod_ready():
502
- logger.error("Cannot proceed with refresh. Service mongod is not running")
503
- return False
504
-
505
- if self.state.is_sharding_component and not self.state.has_sharding_integration:
506
- return True
507
-
508
- try:
509
- return self.are_nodes_healthy()
510
- except (PyMongoError, OperationFailure, ServerSelectionTimeoutError) as e:
511
- logger.error(
512
- "Cannot proceed with refresh. Failed to check cluster health, error: %s",
513
- e,
514
- )
515
- return False
516
-
517
- @mongodb_only
518
- def are_nodes_healthy(self) -> bool:
519
- """Returns true if all nodes in the MongoDB deployment are healthy."""
520
- if self.state.is_role(MongoDBRoles.REPLICATION):
521
- return self.are_replica_set_nodes_healthy(self.state.mongo_config)
522
-
523
- mongos_config = self.get_cluster_mongos()
524
- if not self.are_shards_healthy(mongos_config):
525
- logger.debug(
526
- "One or more individual shards are not healthy - do not proceed with refresh."
527
- )
528
- return False
529
-
530
- if not self.are_replicas_in_sharded_cluster_healthy(mongos_config):
531
- logger.debug("One or more nodes are not healthy - do not proceed with refresh.")
532
- return False
533
-
534
- return True
535
-
536
- def are_replicas_in_sharded_cluster_healthy(self, mongos_config: MongoConfiguration) -> bool:
537
- """Returns True if all replicas in the sharded cluster are healthy."""
538
- # check every replica set in the sharded cluster
539
- for mongodb_config in self.get_all_replica_set_configs_in_cluster():
540
- if not self.are_replica_set_nodes_healthy(mongodb_config):
541
- logger.debug(f"Replica set: {mongodb_config.replset} contains unhealthy nodes.")
542
- return False
543
-
544
- return True
545
-
546
- def are_shards_healthy(self, mongos_config: MongoConfiguration) -> bool:
547
- """Returns True if all shards in the cluster are healthy."""
548
- with MongoConnection(mongos_config) as mongos:
549
- if mongos.is_any_shard_draining():
550
- logger.debug("Cluster is draining a shard, do not proceed with refresh.")
551
- return False
552
-
553
- if not mongos.are_all_shards_aware():
554
- logger.debug("Not all shards are shard aware, do not proceed with refresh.")
555
- return False
556
-
557
- # Config-Server has access to all the related shard applications.
558
- if self.state.is_role(MongoDBRoles.CONFIG_SERVER):
559
- relation_shards = {
560
- relation.app.name for relation in self.state.config_server_relation
561
- }
562
- cluster_shards = mongos.get_shard_members()
563
- if len(relation_shards - cluster_shards):
564
- logger.debug(
565
- "Not all shards have been added/drained, do not proceed with refresh."
566
- )
567
- return False
568
-
569
- return True
570
-
571
- def get_all_replica_set_configs_in_cluster(self) -> list[MongoConfiguration]:
572
- """Returns a list of all the mongodb_configurations for each application in the cluster."""
573
- mongos_config = self.get_cluster_mongos()
574
- mongodb_configurations = []
575
- if self.state.is_role(MongoDBRoles.SHARD):
576
- # the hosts of the integrated mongos application are also the config-server hosts
577
- config_server_hosts = self.state.app_peer_data.mongos_hosts
578
- mongodb_configurations = [
579
- self.state.mongodb_config_for_user(
580
- OperatorUser,
581
- hosts=set(config_server_hosts),
582
- replset=self.state.config_server_name,
583
- )
584
- ]
585
- elif self.state.is_role(MongoDBRoles.CONFIG_SERVER):
586
- mongodb_configurations = [self.state.mongo_config]
587
-
588
- with MongoConnection(mongos_config) as mongos:
589
- sc_status = mongos.client.admin.command("listShards")
590
- for shard in sc_status["shards"]:
591
- mongodb_configurations.append(self.get_mongodb_config_from_shard_entry(shard))
592
-
593
- return mongodb_configurations
594
-
595
- def are_replica_set_nodes_healthy(self, mongodb_config: MongoConfiguration) -> bool:
596
- """Returns true if all nodes in the MongoDB replica set are healthy."""
597
- with MongoConnection(mongodb_config) as mongod:
598
- rs_status = mongod.client.admin.command("replSetGetStatus")
600
- return not mongod.is_any_sync(rs_status)
601
-
602
- def is_cluster_able_to_read_write(
603
- self: GenericMongoDBUpgradeManager[MongoDBOperator],
604
- ) -> bool:
605
- """Returns True if read and write is feasible for cluster."""
606
- try:
607
- if self.state.is_role(MongoDBRoles.REPLICATION):
608
- return self.is_replica_set_able_read_write()
609
- return self.is_sharded_cluster_able_to_read_write()
610
- except (ServerSelectionTimeoutError, OperationFailure):
611
- logger.warning("Impossible to select server, will try again later")
612
- return False
613
-
614
- def is_mongos_able_to_read_write(
615
- self: GenericMongoDBUpgradeManager[MongosOperator],
616
- ) -> bool:
617
- """Returns True if read and write is feasible from mongos."""
618
- _, collection_name, write_value = self.get_random_write_and_collection()
619
- config = self.state.mongos_config
620
- self.add_write_to_sharded_cluster(config, config.database, collection_name, write_value)
621
-
622
- write_replicated = self.confirm_excepted_write_cluster(
623
- config,
624
- collection_name,
625
- write_value,
626
- )
627
- self.clear_tmp_collection(config, collection_name)
628
-
629
- if not write_replicated:
630
- logger.debug("Test read/write to cluster failed.")
631
- return False
632
-
633
- return True
634
-
635
- @retry(
636
- stop=stop_after_attempt(10),
637
- wait=wait_fixed(1),
638
- reraise=True,
639
- )
640
- def confirm_excepted_write_cluster(
641
- self: GenericMongoDBUpgradeManager[MongosOperator],
642
- config: MongoConfiguration,
643
- collection_name: str,
644
- expected_write_value: str,
645
- ) -> bool:
646
- """Returns True if the replica contains the expected write in the provided collection."""
647
- with MongoConnection(config) as mongos:
648
- db = mongos.client[config.database]
649
- test_collection = db[collection_name]
650
- query = test_collection.find({}, {WRITE_KEY: 1})
651
- if query[0][WRITE_KEY] != expected_write_value:
652
- return False
653
-
654
- return True
655
-
656
- def is_sharded_cluster_able_to_read_write(
657
- self: GenericMongoDBUpgradeManager[MongoDBOperator],
658
- ) -> bool:
659
- """Returns True if it is possible to write to all cluster shards and read from all replicas."""
660
- mongos_config = self.get_cluster_mongos()
661
- with MongoConnection(mongos_config) as mongos:
662
- sc_status = mongos.client.admin.command("listShards")
663
- for shard in sc_status["shards"]:
664
- # force a write to a specific shard to ensure the primary on that shard can
665
- # receive writes
666
- db_name, collection_name, write_value = self.get_random_write_and_collection()
667
- self.add_write_to_sharded_cluster(
668
- mongos_config, db_name, collection_name, write_value
669
- )
670
- mongos.client.admin.command("movePrimary", db_name, to=shard[SHARD_NAME_INDEX])
671
-
672
- write_replicated = self.is_write_on_secondaries(
673
- self.get_mongodb_config_from_shard_entry(shard),
674
- collection_name,
675
- write_value,
676
- db_name,
677
- )
678
-
679
- self.clear_db_collection(mongos_config, db_name)
680
- if not write_replicated:
681
- logger.debug(f"Test read/write to shard {shard['_id']} failed.")
682
- return False
683
-
684
- return True
685
-
686
- def get_mongodb_config_from_shard_entry(self, shard_entry: dict) -> MongoConfiguration:
687
- """Returns a replica set MongoConfiguration based on a shard entry from ListShards."""
688
- # field hosts is of the form shard01/host1:27018,host2:27018,host3:27018
689
- shard_hosts = shard_entry["host"].split("/")[1]
690
- parsed_ips = {host.split(":")[0] for host in shard_hosts.split(",")}
691
- return self.state.mongodb_config_for_user(
692
- OperatorUser, parsed_ips, replset=shard_entry[SHARD_NAME_INDEX]
693
- )
694
-
695
- def get_cluster_mongos(self) -> MongoConfiguration:
696
- """Return a mongos configuration for the sharded cluster."""
697
- return (
698
- self.state.mongos_config
699
- if self.state.is_role(MongoDBRoles.CONFIG_SERVER)
700
- else self.state.mongos_config_for_user(
701
- OperatorUser, hosts=set(self.state.shard_state.mongos_hosts)
702
- )
703
- )
704
-
705
- def is_replica_set_able_read_write(self) -> bool:
706
- """Returns True if it is possible to write to the primary and read from replicas."""
707
- _, collection_name, write_value = self.get_random_write_and_collection()
708
- mongodb_config = self.state.mongo_config
709
- self.add_write_to_replica_set(mongodb_config, collection_name, write_value)
710
- write_replicated = self.is_write_on_secondaries(
711
- mongodb_config, collection_name, write_value
712
- )
713
- self.clear_tmp_collection(mongodb_config, collection_name)
714
- return write_replicated
715
-
716
- def clear_db_collection(self, mongos_config: MongoConfiguration, db_name: str) -> None:
717
- """Drops the temporary database."""
718
- with MongoConnection(mongos_config) as mongos:
719
- mongos.client.drop_database(db_name)
720
-
721
- def clear_tmp_collection(self, mongo_config: MongoConfiguration, collection_name: str) -> None:
722
- """Clears the temporary collection."""
723
- with MongoConnection(mongo_config) as mongo:
724
- db = mongo.client[mongo_config.database]
725
- db.drop_collection(collection_name)
726
-
727
- @retry(
728
- stop=stop_after_attempt(10),
729
- wait=wait_fixed(1),
730
- reraise=True,
731
- )
732
- def confirm_excepted_write_on_replica(
733
- self,
734
- host: str,
735
- db_name: str,
736
- collection: str,
737
- expected_write_value: str,
738
- secondary_config: MongoConfiguration,
739
- ) -> None:
740
- """Raises ClusterNotHealthyError if the replica does not contain the expected write in the provided collection."""
741
- secondary_config.hosts = {host}
742
- with MongoConnection(secondary_config, direct=True) as direct_seconary:
743
- db = direct_seconary.client[db_name]
744
- test_collection = db[collection]
745
- query = test_collection.find({}, {WRITE_KEY: 1})
746
- if query[0][WRITE_KEY] != expected_write_value:
747
- raise ClusterNotHealthyError
748
-
749
- def get_random_write_and_collection(self) -> tuple[str, str, str]:
750
- """Returns a tuple of a random database name, collection name, and a unique write value to add."""
751
- choices = string.ascii_letters + string.digits
752
- collection_name = "collection_" + "".join([secrets.choice(choices) for _ in range(32)])
753
- write_value = "unique_write_" + "".join([secrets.choice(choices) for _ in range(16)])
754
- db_name = "db_name_" + "".join([secrets.choice(choices) for _ in range(32)])
755
- return (db_name, collection_name, write_value)
756
-
757
- def add_write_to_sharded_cluster(
758
- self, mongos_config: MongoConfiguration, db_name, collection_name, write_value
759
- ) -> None:
760
- """Adds the provided write to the provided database and collection."""
761
- with MongoConnection(mongos_config) as mongod:
762
- db = mongod.client[db_name]
763
- test_collection = db[collection_name]
764
- write = {WRITE_KEY: write_value}
765
- test_collection.insert_one(write)
766
-
767
- def add_write_to_replica_set(
768
- self, mongodb_config: MongoConfiguration, collection_name, write_value
769
- ) -> None:
770
- """Adds the provided write to the admin database in the provided collection."""
771
- with MongoConnection(mongodb_config) as mongod:
772
- db = mongod.client["admin"]
773
- test_collection = db[collection_name]
774
- write = {WRITE_KEY: write_value}
775
- test_collection.insert_one(write)
776
-
777
- def is_write_on_secondaries(
778
- self,
779
- mongodb_config: MongoConfiguration,
780
- collection_name,
781
- expected_write_value,
782
- db_name: str = "admin",
783
- ) -> bool:
784
- """Returns True if the expected write is present on all secondaries."""
785
- for replica_ip in mongodb_config.hosts:
786
- try:
787
- self.confirm_excepted_write_on_replica(
788
- replica_ip,
789
- db_name,
790
- collection_name,
791
- expected_write_value,
792
- mongodb_config,
793
- )
794
- except ClusterNotHealthyError:
795
- # this secondary does not contain the expected write; fail the health check
796
- logger.debug("Secondary with IP %s does not contain the expected write.", replica_ip)
797
- return False
798
-
799
- return True
800
-
801
- def step_down_primary_and_wait_reelection(self) -> None:
802
- """Steps down the current primary and waits for a new one to be elected."""
803
- if len(self.state.internal_hosts) < 2:
804
- logger.warning(
805
- "No secondaries to become primary - upgrading primary without electing a new one, expect downtime."
806
- )
807
- return
808
-
809
- old_primary = self.dependent.primary_unit_name # type: ignore
810
- with MongoConnection(self.state.mongo_config) as mongod:
811
- mongod.step_down_primary()
812
-
813
- for attempt in Retrying(stop=stop_after_attempt(30), wait=wait_fixed(1), reraise=True):
814
- with attempt:
815
- new_primary = self.dependent.primary_unit_name # type: ignore
816
- if new_primary == old_primary:
817
- raise FailedToElectNewPrimaryError()
818
-
819
- def are_pre_upgrade_operations_config_server_successful(self) -> bool:
820
- """Runs pre-upgrade operations for config-server and returns True if successful."""
821
- if not self.state.is_role(MongoDBRoles.CONFIG_SERVER):
822
- return False
823
-
824
- if not self.is_feature_compatibility_version(FEATURE_VERSION):
825
- logger.debug(
826
- "Not all replicas have the expected feature compatibility: %s",
827
- FEATURE_VERSION,
828
- )
829
- return False
830
-
831
- self.set_mongos_feature_compatibilty_version(FEATURE_VERSION)
832
-
833
- # pre-upgrade sequence runs twice. Once when the user runs the pre-upgrade action and
834
- # again automatically on refresh (just in case the user forgot to). Disabling the balancer
835
- # can negatively impact the cluster, so we only disable it once the upgrade sequence has
836
- # begun.
837
- if self._upgrade and self.state.upgrade_in_progress:
838
- try:
839
- self.turn_off_and_wait_for_balancer()
840
- except BalancerStillRunningError:
841
- logger.debug("Balancer is still running. Please try the pre-refresh check later.")
842
- return False
843
-
844
- return True
845
-
846
- def is_feature_compatibility_version(self, expected_feature_version: str) -> bool:
847
- """Returns True if all nodes in the sharded cluster have the expected_feature_version.
848
-
849
- Note it is NOT sufficient to check only mongos or the individual shards. It is necessary to
850
- check each node according to MongoDB upgrade docs.
851
- """
852
- for replica_set_config in self.get_all_replica_set_configs_in_cluster():
853
- for single_host in replica_set_config.hosts:
854
- single_replica_config = self.state.mongodb_config_for_user(
855
- OperatorUser,
856
- hosts={single_host},
857
- replset=replica_set_config.replset,
858
- standalone=True,
859
- )
860
- with MongoConnection(single_replica_config) as mongod:
861
- version = mongod.client.admin.command(
862
- {"getParameter": 1, "featureCompatibilityVersion": 1}
863
- )
864
- if (
865
- version["featureCompatibilityVersion"]["version"]
866
- != expected_feature_version
867
- ):
868
- return False
869
-
870
- return True
871
-
872
- def set_mongos_feature_compatibilty_version(self, feature_version: str) -> None:
873
- """Sets the mongos feature compatibility version."""
874
- with MongoConnection(self.state.mongos_config) as mongos:
875
- mongos.client.admin.command("setFeatureCompatibilityVersion", feature_version)
876
-
877
- @retry(
878
- stop=stop_after_attempt(10),
879
- wait=wait_fixed(1),
880
- reraise=True,
881
- )
882
- def turn_off_and_wait_for_balancer(self) -> None:
883
- """Sends the stop command to the balancer and wait for it to stop running."""
884
- with MongoConnection(self.state.mongos_config) as mongos:
885
- mongos.client.admin.command("balancerStop")
886
- balancer_state = mongos.client.admin.command("balancerStatus")
887
- if balancer_state["mode"] != "off":
888
- raise BalancerStillRunningError("balancer is still Running.")
889
-
890
- # END: helpers
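The removed module also relied on tenacity-gated health checks such as turn_off_and_wait_for_balancer above. A self-contained sketch of that retry pattern, assuming a reachable mongos router (the connection URI below is a placeholder), looks like:

# Stop the balancer, then confirm it reports mode == "off"; retry up to 10 times,
# 1 second apart, re-raising the last error, mirroring the removed helper.
from pymongo import MongoClient
from tenacity import retry, stop_after_attempt, wait_fixed

class BalancerStillRunningError(Exception):
    """Raised while the balancer has not yet stopped."""

@retry(stop=stop_after_attempt(10), wait=wait_fixed(1), reraise=True)
def turn_off_and_wait_for_balancer(client: MongoClient) -> None:
    client.admin.command("balancerStop")
    if client.admin.command("balancerStatus")["mode"] != "off":
        raise BalancerStillRunningError("balancer is still running")

# Usage (placeholder URI pointing at a mongos router):
# turn_off_and_wait_for_balancer(MongoClient("mongodb://localhost:27017"))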