k8s-helper-cli 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_helper/__init__.py +1 -1
- k8s_helper/cli.py +503 -0
- k8s_helper/core.py +957 -0
- {k8s_helper_cli-0.3.0.dist-info → k8s_helper_cli-0.4.1.dist-info}/METADATA +106 -1
- k8s_helper_cli-0.4.1.dist-info/RECORD +11 -0
- k8s_helper_cli-0.3.0.dist-info/RECORD +0 -11
- {k8s_helper_cli-0.3.0.dist-info → k8s_helper_cli-0.4.1.dist-info}/WHEEL +0 -0
- {k8s_helper_cli-0.3.0.dist-info → k8s_helper_cli-0.4.1.dist-info}/entry_points.txt +0 -0
- {k8s_helper_cli-0.3.0.dist-info → k8s_helper_cli-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {k8s_helper_cli-0.3.0.dist-info → k8s_helper_cli-0.4.1.dist-info}/top_level.txt +0 -0
k8s_helper/core.py
CHANGED
@@ -1679,3 +1679,960 @@ class K8sClient:
|
|
1679
1679
|
|
1680
1680
|
print(f"❌ Timeout waiting for deployment '{name}' to be ready")
|
1681
1681
|
return False
|
1682
|
+
|
1683
|
+
# ======================
|
1684
|
+
# MONITORING OPERATIONS
|
1685
|
+
# ======================
|
1686
|
+
def setup_monitoring(self, namespace: str = "monitoring",
                     grafana_service_type: str = "NodePort",
                     import_dashboard: bool = True,
                     wait_for_ready: bool = True) -> Dict[str, Any]:
    """Setup complete monitoring stack with Prometheus and Grafana.

    Args:
        namespace: Namespace to deploy monitoring stack
        grafana_service_type: Service type for Grafana (NodePort, LoadBalancer, ClusterIP)
        import_dashboard: Whether to import default Kubernetes dashboard
        wait_for_ready: Whether to wait for deployments to be ready

    Returns:
        Dictionary with deployment info, URLs, and credentials
    """
    outcome: Dict[str, Any] = {
        'namespace': namespace,
        'prometheus': {},
        'grafana': {},
        'success': False,
        'error': None,
    }

    try:
        # Make sure the target namespace exists before deploying anything.
        self._create_monitoring_namespace(namespace)

        outcome['prometheus'] = self._deploy_prometheus(namespace)
        outcome['grafana'] = self._deploy_grafana(namespace, grafana_service_type)

        if not wait_for_ready:
            # Caller opted out of readiness checks; report success immediately.
            outcome['success'] = True
            return outcome

        if not self._wait_for_monitoring_ready(namespace):
            outcome['error'] = "Monitoring deployments failed to become ready"
            return outcome

        # Deployments are ready -> wire Grafana up to Prometheus.
        if self._configure_grafana(namespace, import_dashboard):
            outcome['success'] = True
        else:
            outcome['error'] = "Failed to configure Grafana"
        return outcome

    except Exception as e:
        outcome['error'] = str(e)
        return outcome
|
1739
|
+
|
1740
|
+
def _create_monitoring_namespace(self, namespace: str) -> bool:
    """Ensure the monitoring namespace exists, creating it when absent.

    Returns True in both the "already exists" and "created" cases; any API
    error other than 404 is propagated to the caller.
    """
    try:
        self.core_v1.read_namespace(name=namespace)
    except ApiException as exc:
        if exc.status != 404:
            raise exc
        # 404 -> the namespace is missing, so create it now.
        body = client.V1Namespace(
            metadata=client.V1ObjectMeta(name=namespace)
        )
        self.core_v1.create_namespace(body=body)
        print(f"✅ Created namespace '{namespace}'")
        return True
    print(f"✅ Namespace '{namespace}' already exists")
    return True
|
1757
|
+
|
1758
|
+
def _deploy_prometheus(self, namespace: str) -> Dict[str, Any]:
    """Deploy Prometheus (ConfigMap, RBAC, Deployment, Service) into *namespace*.

    Args:
        namespace: Target namespace for all Prometheus resources.

    Returns:
        Dict with a 'deployed' flag and the Prometheus service name.

    Raises:
        ApiException: for any Kubernetes API error other than 409 (conflict);
            409s are handled by replacing the existing resource in place.
    """
    result = {'deployed': False, 'service_name': 'prometheus-service'}

    # Scrape config: API servers, nodes, and pods annotated with
    # prometheus.io/scrape=true. (Indentation reconstructed — the rendered
    # diff stripped whitespace; verify against the shipped wheel.)
    prometheus_config = """
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
"""

    configmap = client.V1ConfigMap(
        metadata=client.V1ObjectMeta(name="prometheus-config", namespace=namespace),
        data={"prometheus.yml": prometheus_config}
    )

    try:
        self.core_v1.create_namespaced_config_map(namespace=namespace, body=configmap)
    except ApiException as e:
        if e.status == 409:  # Already exists -> update in place
            self.core_v1.replace_namespaced_config_map(
                name="prometheus-config", namespace=namespace, body=configmap
            )
        else:
            raise  # BUGFIX: non-409 errors were previously swallowed silently

    # ServiceAccount + cluster-wide read permissions for service discovery.
    self._create_prometheus_rbac(namespace)

    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "prometheus"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "prometheus"}),
                spec=client.V1PodSpec(
                    service_account_name="prometheus",
                    containers=[
                        client.V1Container(
                            name="prometheus",
                            image="prom/prometheus:latest",
                            ports=[client.V1ContainerPort(container_port=9090)],
                            args=[
                                "--config.file=/etc/prometheus/prometheus.yml",
                                "--storage.tsdb.path=/prometheus/",
                                "--web.console.libraries=/etc/prometheus/console_libraries",
                                "--web.console.templates=/etc/prometheus/consoles",
                                "--web.enable-lifecycle"
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="prometheus-config",
                                    mount_path="/etc/prometheus/"
                                ),
                                client.V1VolumeMount(
                                    name="prometheus-storage",
                                    mount_path="/prometheus/"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        client.V1Volume(
                            name="prometheus-config",
                            config_map=client.V1ConfigMapVolumeSource(
                                name="prometheus-config"
                            )
                        ),
                        # NOTE: emptyDir means collected metrics are lost on pod restart.
                        client.V1Volume(
                            name="prometheus-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.apps_v1.replace_namespaced_deployment(
                name="prometheus", namespace=namespace, body=deployment
            )
        else:
            raise  # BUGFIX: surface unexpected API errors

    # ClusterIP only: Prometheus is reached in-cluster (by Grafana).
    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="prometheus-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "prometheus"},
            ports=[client.V1ServicePort(port=9090, target_port=9090)],
            type="ClusterIP"
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_service(
                name="prometheus-service", namespace=namespace, body=service
            )
        else:
            raise  # BUGFIX: surface unexpected API errors

    result['deployed'] = True
    return result
|
1912
|
+
|
1913
|
+
def _create_prometheus_rbac(self, namespace: str) -> None:
    """Create the ServiceAccount, ClusterRole and ClusterRoleBinding Prometheus needs.

    Each create tolerates a 409 (resource already present); any other API
    error is re-raised.
    """
    # ServiceAccount the Prometheus pod runs as.
    sa = client.V1ServiceAccount(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace)
    )
    try:
        self.core_v1.create_namespaced_service_account(namespace=namespace, body=sa)
    except ApiException as exc:
        if exc.status != 409:  # 409 == already exists, which is fine
            raise exc

    rbac_api = client.RbacAuthorizationV1Api()

    # Read-only access to the discovery targets Prometheus scrapes.
    role = client.V1ClusterRole(
        metadata=client.V1ObjectMeta(name="prometheus"),
        rules=[
            client.V1PolicyRule(
                api_groups=[""],
                resources=["nodes", "nodes/proxy", "services", "endpoints", "pods"],
                verbs=["get", "list", "watch"]
            ),
            client.V1PolicyRule(
                api_groups=["extensions"],
                resources=["ingresses"],
                verbs=["get", "list", "watch"]
            )
        ]
    )

    # The kwarg name for non-resource URLs differs between kubernetes client
    # versions; try the newer spelling first and fall back on TypeError.
    try:
        metrics_rule = client.V1PolicyRule(
            non_resource_ur_ls=["/metrics"],
            verbs=["get"]
        )
    except TypeError:
        metrics_rule = client.V1PolicyRule(
            non_resource_urls=["/metrics"],
            verbs=["get"]
        )
    role.rules.append(metrics_rule)

    try:
        rbac_api.create_cluster_role(body=role)
    except ApiException as exc:
        if exc.status != 409:  # Ignore if already exists
            raise exc

    # Bind the ClusterRole to the ServiceAccount created above.
    binding = client.V1ClusterRoleBinding(
        metadata=client.V1ObjectMeta(name="prometheus"),
        subjects=[
            client.V1Subject(
                kind="ServiceAccount",
                name="prometheus",
                namespace=namespace
            )
        ],
        role_ref=client.V1RoleRef(
            kind="ClusterRole",
            name="prometheus",
            api_group="rbac.authorization.k8s.io"
        )
    )

    try:
        rbac_api.create_cluster_role_binding(body=binding)
    except ApiException as exc:
        if exc.status != 409:  # Ignore if already exists
            raise exc
|
1988
|
+
|
1989
|
+
def _deploy_grafana(self, namespace: str, service_type: str = "NodePort") -> Dict[str, Any]:
    """Deploy Grafana and expose it via a Service of *service_type*.

    Args:
        namespace: Target namespace.
        service_type: Kubernetes service type (NodePort, LoadBalancer, ClusterIP).

    Returns:
        Dict with deployment status, service name/type and admin credentials.

    Raises:
        ApiException: for any API error other than 409 (already exists).
    """
    # NOTE(security): admin credentials are hard-coded defaults shipped by this
    # helper; rotate them on any cluster that matters.
    result = {
        'deployed': False,
        'service_name': 'grafana-service',
        'service_type': service_type,
        'admin_user': 'admin',
        'admin_password': 'admin123'
    }

    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="grafana", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "grafana"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "grafana"}),
                spec=client.V1PodSpec(
                    containers=[
                        client.V1Container(
                            name="grafana",
                            image="grafana/grafana:latest",
                            ports=[client.V1ContainerPort(container_port=3000)],
                            env=[
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_USER", value="admin"),
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_PASSWORD", value="admin123"),
                                client.V1EnvVar(name="GF_INSTALL_PLUGINS", value="grafana-kubernetes-app")
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="grafana-storage",
                                    mount_path="/var/lib/grafana"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        # emptyDir: dashboards/settings do not survive pod restarts.
                        client.V1Volume(
                            name="grafana-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists -> update in place
            self.apps_v1.replace_namespaced_deployment(
                name="grafana", namespace=namespace, body=deployment
            )
        else:
            raise  # BUGFIX: non-409 errors were previously swallowed silently

    service_ports = [client.V1ServicePort(port=3000, target_port=3000)]
    if service_type == "NodePort":
        # Pin the node port so the URL is predictable across redeploys.
        service_ports[0].node_port = 30300

    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="grafana-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "grafana"},
            ports=service_ports,
            type=service_type
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
        result['deployed'] = True
    except ApiException as e:
        if e.status == 409:  # Already exists -> update in place
            self.core_v1.replace_namespaced_service(
                name="grafana-service", namespace=namespace, body=service
            )
            result['deployed'] = True
        else:
            # BUGFIX: previously a non-409 error fell through silently and the
            # caller got deployed=False with no explanation.
            raise

    return result
|
2070
|
+
|
2071
|
+
def _wait_for_monitoring_ready(self, namespace: str, timeout: int = 300) -> bool:
    """Block until both monitoring deployments report all replicas ready.

    The *timeout* budget is shared across both deployments (the clock starts
    once). Returns False on timeout or on any API error while polling.
    """
    start = time.time()

    for dep_name in ("prometheus", "grafana"):
        print(f"⏳ Waiting for {dep_name} to be ready...")
        is_ready = False

        while not is_ready and time.time() - start < timeout:
            try:
                dep = self.apps_v1.read_namespaced_deployment(
                    name=dep_name, namespace=namespace
                )

                ready_count = dep.status.ready_replicas
                if ready_count == dep.spec.replicas and ready_count > 0:
                    print(f"✅ {dep_name} is ready")
                    is_ready = True
                else:
                    # ready_replicas is None until the first pod is ready.
                    shown = ready_count or 0
                    print(f"⏳ {dep_name}: {shown}/{dep.spec.replicas} replicas ready...")
                    time.sleep(5)

            except ApiException as exc:
                print(f"❌ Error checking {dep_name} status: {exc}")
                return False

        if not is_ready:
            print(f"❌ Timeout waiting for {dep_name} to be ready")
            return False

    return True
|
2105
|
+
|
2106
|
+
def _configure_grafana(self, namespace: str, import_dashboard: bool = True) -> bool:
    """Configure Grafana with a Prometheus data source and optional dashboard.

    Args:
        namespace: Namespace where the monitoring stack runs.
        import_dashboard: Also import the default Kubernetes dashboard.

    Returns:
        bool: True when the data-source call was attempted successfully
        (Grafana URL resolved and the HTTP calls completed); False otherwise.
    """
    try:
        # Give Grafana a moment to finish booting before hitting its HTTP API.
        time.sleep(10)

        # Local import: requests is only needed for this optional step.
        # (BUGFIX: removed an unused `import json` that sat alongside it.)
        import requests

        grafana_url = self._get_grafana_url(namespace)
        if not grafana_url:
            print("❌ Could not determine Grafana URL")
            return False

        print(f"🔧 Configuring Grafana at {grafana_url}")

        # In-cluster DNS name of the service created by _deploy_prometheus().
        datasource_payload = {
            "name": "Prometheus",
            "type": "prometheus",
            "url": f"http://prometheus-service.{namespace}.svc.cluster.local:9090",
            "access": "proxy",
            "isDefault": True
        }

        auth = ('admin', 'admin123')

        response = requests.post(
            f"{grafana_url}/api/datasources",
            json=datasource_payload,
            auth=auth,
            timeout=30
        )

        if response.status_code in [200, 409]:  # Success or already exists
            print("✅ Prometheus data source configured")
        else:
            print(f"⚠️ Warning: Could not add Prometheus data source: {response.text}")

        if import_dashboard:
            self._import_kubernetes_dashboard(grafana_url, auth)

        return True

    except Exception as e:
        print(f"⚠️ Warning: Could not configure Grafana automatically: {e}")
        print("💡 You can manually add Prometheus as a data source in Grafana")
        return False
|
2157
|
+
|
2158
|
+
def _get_grafana_url(self, namespace: str) -> Optional[str]:
    """Best-effort resolution of an externally reachable Grafana URL.

    NodePort services resolve to <node-ip>:<node-port> (ExternalIP preferred,
    InternalIP fallback); LoadBalancer services use the first ingress host on
    port 3000. Returns None when no URL can be determined.
    """
    try:
        svc = self.core_v1.read_namespaced_service(
            name="grafana-service", namespace=namespace
        )

        if svc.spec.type == "NodePort":
            node_ip = None
            for node in self.core_v1.list_node().items:
                addrs = node.status.addresses
                node_ip = next(
                    (a.address for a in addrs if a.type == "ExternalIP"), None
                )
                if node_ip is None:
                    node_ip = next(
                        (a.address for a in addrs if a.type == "InternalIP"), None
                    )
                if node_ip:
                    break

            if node_ip:
                node_port = next(
                    (p.node_port for p in svc.spec.ports if p.node_port), None
                )
                if node_port:
                    return f"http://{node_ip}:{node_port}"

        elif svc.spec.type == "LoadBalancer":
            if svc.status.load_balancer.ingress:
                first = svc.status.load_balancer.ingress[0]
                host = first.ip or first.hostname
                if host:
                    return f"http://{host}:3000"

        # No resolvable address; caller may fall back to port-forwarding.
        return None

    except Exception as exc:
        print(f"Warning: Could not determine Grafana URL: {exc}")
        return None
|
2207
|
+
|
2208
|
+
def _import_kubernetes_dashboard(self, grafana_url: str, auth: tuple) -> None:
    """Import a minimal Kubernetes overview dashboard into Grafana.

    Failures are reported as warnings only — the dashboard is a convenience,
    not a requirement.
    """
    try:
        import requests

        def stat_panel(panel_id: int, title: str, expr: str, grid: dict) -> dict:
            # Single-stat panel backed by one Prometheus query.
            return {
                "id": panel_id,
                "title": title,
                "type": "stat",
                "targets": [{"expr": expr, "refId": "A"}],
                "gridPos": grid,
            }

        payload = {
            "dashboard": {
                "id": None,
                "title": "Kubernetes Cluster Overview",
                "tags": ["kubernetes"],
                "timezone": "browser",
                "panels": [
                    stat_panel(
                        1, "Cluster CPU Usage",
                        '100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)',
                        {"h": 8, "w": 12, "x": 0, "y": 0},
                    ),
                    stat_panel(
                        2, "Cluster Memory Usage",
                        '100 * (1 - ((avg_over_time(node_memory_MemFree_bytes[10m]) + avg_over_time(node_memory_Cached_bytes[10m]) + avg_over_time(node_memory_Buffers_bytes[10m])) / avg_over_time(node_memory_MemTotal_bytes[10m])))',
                        {"h": 8, "w": 12, "x": 12, "y": 0},
                    ),
                    stat_panel(
                        3, "Pod Count",
                        "sum(kube_pod_info)",
                        {"h": 8, "w": 12, "x": 0, "y": 8},
                    ),
                ],
                "time": {"from": "now-1h", "to": "now"},
                "refresh": "30s",
            },
            "overwrite": True,
        }

        resp = requests.post(
            f"{grafana_url}/api/dashboards/db",
            json=payload,
            auth=auth,
            timeout=30
        )

        if resp.status_code == 200:
            print("✅ Kubernetes dashboard imported")
        else:
            print(f"⚠️ Warning: Could not import dashboard: {resp.text}")

    except Exception as e:
        print(f"⚠️ Warning: Could not import dashboard: {e}")
|
2278
|
+
|
2279
|
+
def get_monitoring_info(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """Collect status, service details and URLs for the monitoring stack."""
    info: Dict[str, Any] = {
        'namespace': namespace,
        'prometheus': {'running': False, 'url': None},
        'grafana': {'running': False, 'url': None, 'credentials': None},
        'services': []
    }

    try:
        def _all_ready(dep) -> bool:
            # Running only when every desired replica reports ready.
            return (dep.status.ready_replicas == dep.spec.replicas
                    and dep.status.ready_replicas > 0)

        for dep in self.apps_v1.list_namespaced_deployment(namespace=namespace).items:
            if dep.metadata.name == "prometheus":
                info['prometheus']['running'] = _all_ready(dep)
            elif dep.metadata.name == "grafana":
                info['grafana']['running'] = _all_ready(dep)

        for svc in self.core_v1.list_namespaced_service(namespace=namespace).items:
            entry = {
                'name': svc.metadata.name,
                'type': svc.spec.type,
                'cluster_ip': svc.spec.cluster_ip,
                'ports': []
            }

            for p in svc.spec.ports:
                p_entry = {
                    'port': p.port,
                    'target_port': p.target_port,
                    'protocol': p.protocol
                }
                if hasattr(p, 'node_port') and p.node_port:
                    p_entry['node_port'] = p.node_port
                entry['ports'].append(p_entry)

            info['services'].append(entry)

            # Fill in externally reachable URLs where we can.
            if svc.metadata.name == "grafana-service":
                info['grafana']['credentials'] = {
                    'username': 'admin',
                    'password': 'admin123'
                }
                if svc.spec.type == "NodePort":
                    url_info = self.get_service_url("grafana-service", namespace)
                    if url_info and url_info.get('external_url'):
                        info['grafana']['url'] = url_info['external_url']

            elif svc.metadata.name == "prometheus-service":
                if svc.spec.type == "NodePort":
                    url_info = self.get_service_url("prometheus-service", namespace)
                    if url_info and url_info.get('external_url'):
                        info['prometheus']['url'] = url_info['external_url']

        return info

    except ApiException as e:
        info['error'] = str(e)
        return info
|
2351
|
+
|
2352
|
+
def add_prometheus_target(self, job_name: str, targets: List[str],
                          namespace: str = "monitoring",
                          metrics_path: str = "/metrics",
                          scrape_interval: str = "15s") -> bool:
    """
    Add a new scrape target to Prometheus configuration

    Args:
        job_name: Name for the job in Prometheus config
        targets: List of target addresses (e.g., ["service:port", "1.2.3.4:9090"])
        namespace: Monitoring namespace
        metrics_path: Path to metrics endpoint
        scrape_interval: How often to scrape the target

    Returns:
        bool: True if target was added successfully
    """
    try:
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )

        current_config = configmap.data.get("prometheus.yml", "")

        import yaml
        # BUGFIX: safe_load returns None for an empty/blank document, which
        # would crash the .get() below — fall back to an empty mapping.
        config_data = yaml.safe_load(current_config) or {}

        # BUGFIX: use .get() so a malformed job entry that lacks 'job_name'
        # cannot raise KeyError and abort the whole operation.
        existing_jobs = [job.get('job_name') for job in config_data.get('scrape_configs', [])]
        if job_name in existing_jobs:
            print(f"❌ Job '{job_name}' already exists. Use update or remove it first.")
            return False

        new_job = {
            'job_name': job_name,
            'scrape_interval': scrape_interval,
            'metrics_path': metrics_path,
            'static_configs': [
                {'targets': targets}
            ]
        }

        config_data.setdefault('scrape_configs', []).append(new_job)

        updated_config = yaml.dump(config_data, default_flow_style=False, sort_keys=False)

        configmap.data["prometheus.yml"] = updated_config
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=configmap
        )

        # Restart so Prometheus re-reads the mounted config.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Added Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error adding Prometheus target: {e}")
        return False
|
2424
|
+
|
2425
|
+
def remove_prometheus_target(self, job_name: str, namespace: str = "monitoring") -> bool:
    """Remove a scrape job from the Prometheus ConfigMap and restart Prometheus.

    Args:
        job_name: Name of the job to remove
        namespace: Monitoring namespace

    Returns:
        bool: True if target was removed successfully
    """
    try:
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config", namespace=namespace
        )

        import yaml
        parsed = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        jobs_before = parsed.get('scrape_configs', [])
        jobs_after = [j for j in jobs_before if j.get('job_name') != job_name]

        if len(jobs_after) == len(jobs_before):
            # Nothing was filtered out, so the job never existed.
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False

        parsed['scrape_configs'] = jobs_after

        cm.data["prometheus.yml"] = yaml.dump(
            parsed, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config", namespace=namespace, body=cm
        )

        # Restart so Prometheus re-reads the mounted config.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Removed Prometheus target job '{job_name}'")
        return True

    except Exception as e:
        print(f"❌ Error removing Prometheus target: {e}")
        return False
|
2479
|
+
|
2480
|
+
def list_prometheus_targets(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """Summarise every scrape job configured in the Prometheus ConfigMap.

    Args:
        namespace: Monitoring namespace

    Returns:
        Dict containing target information (or {'error': ...} on failure)
    """
    try:
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config", namespace=namespace
        )

        import yaml
        parsed = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        summaries = []
        for job in parsed.get('scrape_configs', []):
            summary = {
                'job_name': job.get('job_name', 'unknown'),
                'scrape_interval': job.get('scrape_interval', 'default'),
                'metrics_path': job.get('metrics_path', '/metrics'),
                'targets': []
            }

            # Static targets are listed explicitly...
            if 'static_configs' in job:
                for static_cfg in job['static_configs']:
                    summary['targets'].extend(static_cfg.get('targets', []))

            # ...while kubernetes-SD jobs get a placeholder marker.
            if 'kubernetes_sd_configs' in job:
                summary['type'] = 'kubernetes_discovery'
                summary['targets'] = ['<kubernetes_discovery>']
            else:
                summary['type'] = 'static'

            summaries.append(summary)

        return {
            'namespace': namespace,
            'targets': summaries,
            'total_jobs': len(summaries)
        }

    except Exception as e:
        return {'error': str(e)}
|
2534
|
+
|
2535
|
+
def update_prometheus_target(self, job_name: str, targets: List[str],
                             namespace: str = "monitoring",
                             metrics_path: str = None,
                             scrape_interval: str = None) -> bool:
    """Replace the targets (and optionally path/interval) of an existing job.

    Args:
        job_name: Name of the job to update
        targets: New list of target addresses
        namespace: Monitoring namespace
        metrics_path: Optional new metrics path
        scrape_interval: Optional new scrape interval

    Returns:
        bool: True if target was updated successfully
    """
    try:
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config", namespace=namespace
        )

        import yaml
        parsed = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        # Locate the job entry in place; None means it was never configured.
        target_job = next(
            (j for j in parsed.get('scrape_configs', [])
             if j.get('job_name') == job_name),
            None
        )
        if target_job is None:
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False

        if 'static_configs' not in target_job:
            target_job['static_configs'] = [{}]
        target_job['static_configs'][0]['targets'] = targets

        if metrics_path:
            target_job['metrics_path'] = metrics_path
        if scrape_interval:
            target_job['scrape_interval'] = scrape_interval

        cm.data["prometheus.yml"] = yaml.dump(
            parsed, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config", namespace=namespace, body=cm
        )

        # Restart so Prometheus re-reads the mounted config.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Updated Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error updating Prometheus target: {e}")
        return False
|
2609
|
+
|
2610
|
+
def _restart_prometheus_deployment(self, namespace: str) -> bool:
    """Trigger a rolling restart of the Prometheus deployment.

    Bumps a 'restartedAt' annotation on the pod template, which changes the
    template hash and makes the Deployment roll new pods that re-read the
    (updated) ConfigMap — the same trick `kubectl rollout restart` uses.

    Args:
        namespace: Namespace containing the 'prometheus' deployment.

    Returns:
        bool: True if the restart was triggered, False on any error.
    """
    try:
        deployment = self.apps_v1.read_namespaced_deployment(
            name="prometheus",
            namespace=namespace
        )

        if deployment.spec.template.metadata.annotations is None:
            deployment.spec.template.metadata.annotations = {}

        # BUGFIX(idiom): removed a redundant function-local `import time` —
        # `time` is already available at module level (used elsewhere in this
        # class without a local import).
        deployment.spec.template.metadata.annotations['kubectl.kubernetes.io/restartedAt'] = str(int(time.time()))

        self.apps_v1.patch_namespaced_deployment(
            name="prometheus",
            namespace=namespace,
            body=deployment
        )

        print("🔄 Restarting Prometheus deployment to reload configuration...")
        return True

    except Exception as e:
        print(f"⚠️ Could not restart Prometheus deployment: {e}")
        return False
|