krkn-lib 5.1.11__py3-none-any.whl → 5.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- krkn_lib/k8s/krkn_kubernetes.py +376 -2
- krkn_lib/k8s/pod_monitor/pod_monitor.py +142 -54
- krkn_lib/k8s/templates/snapshot.j2 +10 -0
- krkn_lib/models/elastic/models.py +23 -0
- krkn_lib/models/telemetry/models.py +9 -0
- krkn_lib/tests/base_test.py +13 -0
- krkn_lib/tests/test_krkn_elastic_models.py +21 -0
- krkn_lib/tests/test_krkn_kubernetes_virt.py +790 -0
- krkn_lib/tests/test_krkn_openshift.py +570 -48
- krkn_lib/tests/test_krkn_telemetry_kubernetes.py +850 -0
- krkn_lib/tests/test_safe_logger.py +494 -0
- {krkn_lib-5.1.11.dist-info → krkn_lib-5.1.13.dist-info}/METADATA +2 -1
- {krkn_lib-5.1.11.dist-info → krkn_lib-5.1.13.dist-info}/RECORD +15 -11
- {krkn_lib-5.1.11.dist-info → krkn_lib-5.1.13.dist-info}/WHEEL +0 -0
- {krkn_lib-5.1.11.dist-info → krkn_lib-5.1.13.dist-info}/licenses/LICENSE +0 -0
krkn_lib/k8s/krkn_kubernetes.py
CHANGED
```diff
@@ -167,9 +167,9 @@ class KrknKubernetes:
         client.Configuration.set_default(self.client_config)
         self.watch_resource = watch.Watch()
         # Get the logger for the kubernetes client
-        kubernetes_logger = logging.getLogger(
+        kubernetes_logger = logging.getLogger("kubernetes")

-        # Set the logging level to a higher level than DEBUG,
+        # Set the logging level to a higher level than DEBUG,
         # such as INFO, WARNING, or ERROR
         # This will effectively disable DEBUG level messages.
         kubernetes_logger.setLevel(logging.INFO)
```
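The change collapses the `getLogger` call onto one line and names the `kubernetes` logger explicitly. A minimal standalone sketch (not from this diff) of why raising a named logger's level silences only that library's DEBUG output:

```python
# Standalone sketch: raising a named logger's level drops that library's
# DEBUG records without touching the root logger's configuration.
import logging

logging.basicConfig(level=logging.DEBUG)  # application-wide DEBUG stays on

kubernetes_logger = logging.getLogger("kubernetes")
kubernetes_logger.setLevel(logging.INFO)

logging.getLogger("kubernetes").debug("suppressed")    # filtered out
logging.getLogger("kubernetes").info("still emitted")  # passes through
```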
```diff
@@ -1388,6 +1388,304 @@ class KrknKubernetes:
                 str(e),
             )

+    def get_vm(self, name: str, namespace: str) -> Optional[Dict]:
+        """
+        Get a Virtual Machine by name and namespace.
+
+        :param name: Name of the VM to retrieve
+        :param namespace: Namespace of the VM
+        :return: The VM object if found, None otherwise
+        """
+        try:
+            vm = self.custom_object_client.get_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachines",
+                name=name,
+            )
+            return vm
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VM {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error getting VM {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error getting VM {name}: {e}")
+            raise
+
+    def get_vmi(self, name: str, namespace: str) -> Optional[Dict]:
+        """
+        Get a Virtual Machine Instance by name and namespace.
+
+        :param name: Name of the VMI to retrieve
+        :param namespace: Namespace of the VMI
+        :return: The VMI object if found, None otherwise
+        """
+        try:
+            vmi = self.custom_object_client.get_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachineinstances",
+                name=name,
+            )
+            return vmi
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VMI {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error getting VMI {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error getting VMI {name}: {e}")
+            raise
+
+    def get_vmis(self, regex_name: str, namespace: str) -> list[Dict]:
+        """
+        List Virtual Machine Instances whose names match a regex, across
+        all namespaces matching a namespace regex.
+
+        :param regex_name: Regex matched against VMI names
+        :param namespace: Regex matched against namespace names
+        :return: The list of matching VMI objects (empty if none found)
+        """
+        try:
+            vmis_list = []
+            namespaces = self.list_namespaces_by_regex(namespace)
+            for namespace in namespaces:
+                vmis = self.custom_object_client.list_namespaced_custom_object(
+                    group="kubevirt.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="virtualmachineinstances",
+                )
+
+                for vmi in vmis.get("items"):
+                    vmi_name = vmi.get("metadata", {}).get("name")
+                    match = re.match(regex_name, vmi_name)
+                    if match:
+                        vmis_list.append(vmi)
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VMI {regex_name} not found in namespace {namespace}"
+                )
+                return []
+            else:
+                logging.error(f"Error getting VMI {regex_name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error getting VMI {regex_name}: {e}")
+            raise
+        return vmis_list
+
+    def create_vmi(
+        self, name: str, namespace: str, vm_name: str, vmi_body: dict
+    ) -> Optional[Dict]:
+        """
+        Create a Virtual Machine Instance by name and namespace.
+
+        :param name: Name of the VMI to create
+        :param namespace: Namespace of the VMI
+        :param vm_name: Name of the Virtual Machine to create the VMI from
+        :param vmi_body: Body of the VMI to create
+        :return: The VMI object if created, None otherwise
+        """
+        try:
+            vmi = self.custom_object_client.create_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachineinstances",
+                body=vmi_body,
+            )
+            return vmi
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VMI {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error creating VMI {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error creating VMI {name}: {e}")
+            raise
+
+    def patch_vm(
+        self, name: str, namespace: str, vm_body: dict
+    ) -> Optional[Dict]:
+        """
+        Patch a Virtual Machine by name and namespace.
+
+        :param name: Name of the VM to patch
+        :param namespace: Namespace of the VM
+        :param vm_body: Body of the VM to patch
+        :return: The VM object if patched, None otherwise
+        """
+        try:
+            vm = self.custom_object_client.patch_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachines",
+                name=name,
+                body=vm_body,
+            )
+            return vm
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VM {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error patching VM {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error patching VM {name}: {e}")
+            raise
+
+    def patch_vmi(
+        self, name: str, namespace: str, vmi_body: dict
+    ) -> Optional[Dict]:
+        """
+        Patch a Virtual Machine Instance by name and namespace.
+
+        :param name: Name of the VMI to patch
+        :param namespace: Namespace of the VMI
+        :param vmi_body: Body of the VMI to patch
+        :return: The VMI object if patched, None otherwise
+        """
+        try:
+            vmi = self.custom_object_client.patch_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachineinstances",
+                name=name,
+                body=vmi_body,
+            )
+            return vmi
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VMI {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error patching VMI {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error patching VMI {name}: {e}")
+            raise
+
+    def get_vms(self, regex_name: str, namespace: str) -> list[Dict]:
+        """
+        List Virtual Machines whose names match a regex, across all
+        namespaces matching a namespace regex.
+
+        :param regex_name: Regex matched against VM names
+        :param namespace: Regex matched against namespace names
+        :return: The list of matching VM objects (empty if none found)
+        """
+        try:
+            vms_list = []
+            namespaces = self.list_namespaces_by_regex(namespace)
+            for namespace in namespaces:
+                vms = self.custom_object_client.list_namespaced_custom_object(
+                    group="kubevirt.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="virtualmachines",
+                )
+
+                for vm in vms.get("items"):
+                    vm_name = vm.get("metadata", {}).get("name")
+                    match = re.match(regex_name, vm_name)
+                    if match:
+                        vms_list.append(vm)
+            return vms_list
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VM {regex_name} not found in namespace {namespace}"
+                )
+                return []
+            else:
+                logging.error(f"Error getting VM {regex_name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error getting VM {regex_name}: {e}")
+            raise
+
+    def get_snapshot(self, name: str, namespace: str) -> Optional[Dict]:
+        """
+        Get a VirtualMachineSnapshot by name and namespace.
+
+        :param name: Name of the Snapshot to retrieve
+        :param namespace: Namespace of the Snapshot
+        :return: The Snapshot object if found, None otherwise
+        """
+        try:
+            snapshot = self.custom_object_client.get_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachinesnapshots",
+                name=name,
+            )
+            return snapshot
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"Snapshot {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error getting Snapshot {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error getting Snapshot {name}: {e}")
+            raise
+
+    def create_snapshot(
+        self, name: str, namespace: str, vm_name: str
+    ) -> Optional[Dict]:
+        """
+        Create a VirtualMachineSnapshot by name and namespace.
+
+        :param name: Name of the Snapshot to create
+        :param namespace: Namespace of the Snapshot
+        :param vm_name: Name of the Virtual Machine to create the
+            Snapshot from
+        :return: The Snapshot object if created, None otherwise
+        """
+        try:
+            file_loader = PackageLoader("krkn_lib.k8s", "templates")
+            env = Environment(loader=file_loader, autoescape=True)
+            snapshot_template = env.get_template("snapshot.j2")
+            ss_body = yaml.safe_load(
+                snapshot_template.render(
+                    name=name, namespace=namespace, vm_name=vm_name
+                )
+            )
+            snapshot = self.custom_object_client.create_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachinesnapshots",
+                body=ss_body,
+            )
+            return snapshot
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"Snapshot {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error creating Snapshot {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error creating Snapshot {name}: {e}")
+            raise
+
     def get_job_status(
         self, name: str, namespace: str = "default"
     ) -> client.V1Job:
```
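Together these helpers wrap the `CustomObjectsApi` for KubeVirt's `kubevirt.io/v1` resources. A hedged usage sketch; the `KrknKubernetes` constructor arguments and the `spec.running` patch body are assumptions, not taken from this diff:

```python
# Hypothetical driver for the new KubeVirt helpers; constructor arguments
# and patch bodies are illustrative assumptions.
from krkn_lib.k8s import KrknKubernetes

k8s = KrknKubernetes(kubeconfig_path="~/.kube/config")  # assumed signature

vm = k8s.get_vm("test-vm", "vm-namespace")
if vm:
    # KubeVirt convention: setting spec.running to False shuts the VM down
    k8s.patch_vm("test-vm", "vm-namespace", {"spec": {"running": False}})

# the regex variants scan every namespace matching the second pattern
for vmi in k8s.get_vmis(r"^test-vm-.*", r"^vm-.*"):
    print(vmi["metadata"]["namespace"], vmi["metadata"]["name"])
```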
```diff
@@ -1410,6 +1708,82 @@ class KrknKubernetes:
             )
             raise

+    def delete_vm(self, name: str, namespace: str) -> Optional[Dict]:
+        """
+        Delete a Virtual Machine by name and namespace.
+
+        :param name: Name of the VM to delete
+        :param namespace: Namespace of the VM
+        :return: The deletion status object if found, None otherwise
+        """
+        try:
+            return self.custom_object_client.delete_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachines",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(
+                    f"VM {name} not found in namespace {namespace}"
+                )
+                return None
+            else:
+                logging.error(f"Error deleting VM {name}: {e}")
+                raise
+        except Exception as e:
+            logging.error(f"Unexpected error deleting VM {name}: {e}")
+            raise
+
+    def delete_vmi(self, vm_name: str, namespace: str):
+        """
+        Delete a Virtual Machine Instance to simulate a VM outage.
+
+        :param vm_name: Name of the VMI to delete
+        :param namespace: Namespace of the VMI
+        :return: 0 for success, 1 for failure
+        """
+        logging.info(
+            f"Injecting chaos: Deleting VMI {vm_name} in namespace {namespace}"
+        )
+        try:
+            self.custom_object_client.delete_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachineinstances",
+                name=vm_name,
+            )
+            return 0
+        except ApiException as e:
+            if e.status == 404:
+                logging.warning(f"VMI {vm_name} not found during deletion")
+                return 1
+            else:
+                logging.error(f"API error during VMI deletion: {e}")
+                return 1
+
+    def delete_snapshot(self, snapshot_name: str, namespace: str):
+        """Helper method to delete any snapshot created by the scenario."""
+        logging.info(f"Deleting snapshot '{snapshot_name}'...")
+        try:
+            self.custom_object_client.delete_namespaced_custom_object(
+                group="kubevirt.io",
+                version="v1",
+                namespace=namespace,
+                plural="virtualmachinesnapshots",
+                name=snapshot_name,
+            )
+            logging.info(
+                f"Snapshot '{snapshot_name}' deleted successfully."
+            )
+        except Exception as e:
+            logging.warning(
+                "Failed to delete snapshot, "
+                f"might have been already deleted: {e}"
+            )
+
     def monitor_nodes(
         self,
     ) -> (bool, list[str]):
```
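The delete helpers complete a snapshot-then-disrupt flow. A sketch of how a VM-outage scenario might chain them; the client construction and all resource names are illustrative assumptions:

```python
# Hypothetical VM-outage flow chaining the helpers above.
from krkn_lib.k8s import KrknKubernetes

k8s = KrknKubernetes(kubeconfig_path="~/.kube/config")  # assumed signature

# capture state first, then disrupt
k8s.create_snapshot("pre-chaos-snap", "vm-namespace", "test-vm")
rc = k8s.delete_vmi("test-vm", "vm-namespace")
if rc == 1:  # per the docstring, 1 signals a failed or missing VMI
    print("VMI deletion failed")

# KubeVirt's controller respawns the VMI while spec.running is True,
# so poll get_vmi(...) to confirm recovery, then clean up the snapshot
k8s.delete_snapshot("pre-chaos-snap", "vm-namespace")
```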
krkn_lib/k8s/pod_monitor/pod_monitor.py
CHANGED

```diff
@@ -1,10 +1,14 @@
+import logging
 import re
+import time
+import traceback
 from concurrent.futures import Future
 from concurrent.futures.thread import ThreadPoolExecutor
 from functools import partial

 from kubernetes import watch
 from kubernetes.client import V1Pod, CoreV1Api
+from urllib3.exceptions import ProtocolError

 from krkn_lib.models.pod_monitor.models import (
     PodsSnapshot,
@@ -47,67 +51,151 @@ def _monitor_pods(
     max_timeout: int,
     name_pattern: str = None,
     namespace_pattern: str = None,
+    max_retries: int = 3,
 ) -> PodsSnapshot:
-
+    """
+    Monitor pods with automatic retry on watch stream disconnection.
+
+    :param monitor_partial: Partial function for monitoring pods
+    :param snapshot: Snapshot to populate with pod events
+    :param max_timeout: Maximum time to monitor (seconds)
+    :param name_pattern: Regex pattern for pod names
+    :param namespace_pattern: Regex pattern for namespaces
+    :param max_retries: Maximum number of retries on connection error
+        (default: 3)
+    :return: PodsSnapshot with collected pod events
+    """
+
+    start_time = time.time()
+    retry_count = 0
     deleted_parent_pods = []
     restored_pods = []
     cluster_restored = False
-    for event in w.stream(monitor_partial, timeout_seconds=max_timeout):
-        match_name = True
-        match_namespace = True
-        event_type = event["type"]
-        pod = event["object"]
-        ... (rest of the old watch-loop body, collapsed in the diff viewer) ...
+    while retry_count <= max_retries:
+        try:
+            # Calculate remaining timeout if retrying
+            if retry_count > 0:
+                elapsed = time.time() - start_time
+                remain_timeout = int(max_timeout - elapsed)
+                if remain_timeout <= 0:
+                    logging.info(
+                        "Maximum timeout reached, stopping monitoring"
+                    )
+                    break
+                logging.info(
+                    "Reconnecting watch stream "
+                    f"(attempt {retry_count}/{max_retries}), "
+                    f"remaining timeout: {remain_timeout}s"
+                )
+            else:
+                remain_timeout = max_timeout
+
+            w = watch.Watch(return_type=V1Pod)
+
+            for e in w.stream(
+                monitor_partial, timeout_seconds=remain_timeout
+            ):
+                match_name = True
+                match_namespace = True
+                event_type = e["type"]
+                pod = e["object"]
+
+                if namespace_pattern:
+                    match = re.match(
+                        namespace_pattern, pod.metadata.namespace
+                    )
+                    match_namespace = match is not None
+                if name_pattern:
+                    match = re.match(name_pattern, pod.metadata.name)
+                    match_name = match is not None
+
+                if match_name and match_namespace:
+                    pod_event = PodEvent()
+                    pod_name = pod.metadata.name
+                    if event_type == "MODIFIED":
+                        if pod.metadata.deletion_timestamp is not None:
+                            pod_event.status = PodStatus.DELETION_SCHEDULED
+                            if pod_name not in deleted_parent_pods:
+                                deleted_parent_pods.append(pod_name)
+                        elif _is_pod_ready(pod):
+                            pod_event.status = PodStatus.READY
+                            # if there are at least the same number of ready
+                            # pods as the snapshot.initial_pods set we assume
+                            # the cluster is restored to the initial condition
+                            if pod_name not in restored_pods:
+                                restored_pods.append(pod_name)
+                            initial_pod_len = len(snapshot.initial_pods)
+                            if len(restored_pods) >= initial_pod_len:
+                                cluster_restored = True
+                        else:
+                            pod_event.status = PodStatus.NOT_READY
+
+                    elif event_type == "DELETED":
+                        pod_event.status = PodStatus.DELETED
+                    elif event_type == "ADDED":
+                        pod_event.status = PodStatus.ADDED
+
+                    if pod_event.status == PodStatus.ADDED:
+                        if pod_name not in snapshot.added_pods:
+                            snapshot.added_pods.append(pod_name)
+                        # in case a pod is respawned with the same name
+                        # the dictionary must not be reinitialized
+                        if pod_name not in snapshot.pods:
+                            snapshot.pods[pod_name] = MonitoredPod()
+                            snapshot.pods[pod_name].name = pod_name
+                            snapshot.pods[pod_name].namespace = (
+                                pod.metadata.namespace
+                            )
+                    # skips events out of the snapshot
+                    if pod_name in snapshot.pods:
+                        snapshot.pods[pod_name].status_changes.append(
+                            pod_event
+                        )
+                    # this flag is set when all the pods that have been
+                    # deleted or marked not ready are restored; if True
+                    # the monitoring is stopped early
+                    if cluster_restored:
+                        logging.info(
+                            "Cluster restored, stopping monitoring"
+                        )
+                        w.stop()
+                        return snapshot
+
+            # If we exit the loop normally (timeout reached), we're done
+            logging.info("Watch stream completed normally")
+            break
+
+        except ProtocolError as e:
+            if retry_count >= max_retries:
+                logging.warning(
+                    f"Watch stream connection broken after {max_retries} "
+                    f"retries. ProtocolError: {e}. Returning snapshot "
+                    "with data collected so far."
+                )
+                break
+
+            # Log retry attempt
+            logging.info(
+                f"Watch stream connection broken (ProtocolError): {e}. "
+                f"Retry {retry_count}/{max_retries} in progress..."
+            )
+            backoff_time = 1
+
+            # Check if we have time for backoff
+            elapsed = time.time() - start_time
+            if elapsed + backoff_time >= max_timeout:
+                logging.info(
+                    "Not enough time remaining for backoff, "
+                    "returning snapshot with data collected."
+                )
+                break
+
+            logging.debug(f"Waiting {backoff_time}s before retry...")
+            time.sleep(backoff_time)
+
+        except Exception as e:
+            logging.error("Error in monitor pods: " + str(e))
+            logging.error("Stack trace:\n%s", traceback.format_exc())
+            raise Exception(e)
+
+        retry_count += 1

     return snapshot
```
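The rewrite wraps the watch stream in a retry loop: on a `ProtocolError` the stream is reopened with the timeout shrunk by the time already spent, with a one-second backoff in between. A self-contained sketch of that pattern; the names here are illustrative, not krkn-lib API:

```python
# Standalone sketch of the resume-with-remaining-timeout pattern used above.
import time


def run_with_retries(step, max_timeout: int, max_retries: int = 3) -> None:
    """Re-run 'step' after transient disconnects until time runs out."""
    start = time.time()
    retry_count = 0
    backoff_time = 1
    while retry_count <= max_retries:
        remaining = max(1, int(max_timeout - (time.time() - start)))
        try:
            step(timeout_seconds=remaining)  # e.g. consume a watch stream
            break  # finished (or timed out) normally
        except ConnectionError:  # stands in for urllib3's ProtocolError
            if time.time() - start + backoff_time >= max_timeout:
                break  # no room left for a backoff sleep
            time.sleep(backoff_time)
            retry_count += 1
```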
krkn_lib/models/elastic/models.py
CHANGED

```diff
@@ -144,6 +144,7 @@ class ElasticChaosRunTelemetry(Document):
     run_uuid = Text(fields={"keyword": Keyword()})
     health_checks = Nested(ElasticHealthChecks, multi=True)
     virt_checks = Nested(ElasticVirtChecks, multi=True)
+    post_virt_checks = Nested(ElasticVirtChecks, multi=True)

     class Index:
         name = "chaos_run_telemetry"
@@ -261,6 +262,28 @@ class ElasticChaosRunTelemetry(Document):
         else:
             self.virt_checks = None

+        if chaos_run_telemetry.post_virt_checks:
+            self.post_virt_checks = [
+                ElasticVirtChecks(
+                    vm_name=post_info.vm_name,
+                    ip_address=post_info.ip_address,
+                    new_ip_address=post_info.new_ip_address,
+                    namespace=post_info.namespace,
+                    node_name=post_info.node_name,
+                    status=post_info.status,
+                    start_timestamp=datetime.datetime.fromisoformat(
+                        str(post_info.start_timestamp)
+                    ),
+                    end_timestamp=datetime.datetime.fromisoformat(
+                        str(post_info.end_timestamp)
+                    ),
+                    duration=post_info.duration,
+                )
+                for post_info in chaos_run_telemetry.post_virt_checks
+            ]
+        else:
+            self.post_virt_checks = None
+
         self.timestamp = chaos_run_telemetry.timestamp
         self.total_node_count = chaos_run_telemetry.total_node_count
         self.cloud_infrastructure = chaos_run_telemetry.cloud_infrastructure
```
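`post_virt_checks` mirrors `virt_checks`: the same `ElasticVirtChecks` inner documents, stored under a second Nested field and populated from the telemetry object's `post_virt_checks` list. A minimal standalone sketch of the `elasticsearch-dsl` pattern involved; the class and field names below are stand-ins, not the krkn-lib models:

```python
# Minimal elasticsearch-dsl sketch of a multi-valued Nested field.
from elasticsearch_dsl import Document, InnerDoc, Nested, Text


class VirtCheck(InnerDoc):  # stand-in for ElasticVirtChecks
    vm_name = Text()
    status = Text()


class RunTelemetry(Document):  # stand-in for ElasticChaosRunTelemetry
    post_virt_checks = Nested(VirtCheck, multi=True)


doc = RunTelemetry()
doc.post_virt_checks = [VirtCheck(vm_name="test-vm", status="Running")]
print(doc.to_dict())
# {'post_virt_checks': [{'vm_name': 'test-vm', 'status': 'Running'}]}
```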