k8s-helper-cli 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
k8s_helper/core.py CHANGED
@@ -1679,3 +1679,948 @@ class K8sClient:
1679
1679
 
1680
1680
  print(f"❌ Timeout waiting for deployment '{name}' to be ready")
1681
1681
  return False
1682
+
1683
# ======================
# MONITORING OPERATIONS
# ======================
def setup_monitoring(self, namespace: str = "monitoring",
                     grafana_service_type: str = "NodePort",
                     import_dashboard: bool = True,
                     wait_for_ready: bool = True) -> Dict[str, Any]:
    """Setup complete monitoring stack with Prometheus and Grafana.

    Args:
        namespace: Namespace to deploy monitoring stack
        grafana_service_type: Service type for Grafana (NodePort, LoadBalancer, ClusterIP)
        import_dashboard: Whether to import default Kubernetes dashboard
        wait_for_ready: Whether to wait for deployments to be ready

    Returns:
        Dictionary with deployment info, URLs, and credentials
    """
    outcome = {
        'namespace': namespace,
        'prometheus': {},
        'grafana': {},
        'success': False,
        'error': None,
    }

    try:
        # Ensure the target namespace exists before deploying anything.
        self._create_monitoring_namespace(namespace)

        outcome['prometheus'] = self._deploy_prometheus(namespace)
        outcome['grafana'] = self._deploy_grafana(namespace, grafana_service_type)

        if not wait_for_ready:
            # Caller opted out of readiness checks; report success right away.
            outcome['success'] = True
        elif not self._wait_for_monitoring_ready(namespace):
            outcome['error'] = "Monitoring deployments failed to become ready"
        elif self._configure_grafana(namespace, import_dashboard):
            outcome['success'] = True
        else:
            outcome['error'] = "Failed to configure Grafana"

    except Exception as e:
        # Any deployment/configuration failure is reported via the result dict.
        outcome['error'] = str(e)

    return outcome
1739
+
1740
+ def _create_monitoring_namespace(self, namespace: str) -> bool:
1741
+ """Create monitoring namespace if it doesn't exist"""
1742
+ try:
1743
+ self.core_v1.read_namespace(name=namespace)
1744
+ print(f"✅ Namespace '{namespace}' already exists")
1745
+ return True
1746
+ except ApiException as e:
1747
+ if e.status == 404:
1748
+ # Create namespace
1749
+ namespace_obj = client.V1Namespace(
1750
+ metadata=client.V1ObjectMeta(name=namespace)
1751
+ )
1752
+ self.core_v1.create_namespace(body=namespace_obj)
1753
+ print(f"✅ Created namespace '{namespace}'")
1754
+ return True
1755
+ else:
1756
+ raise e
1757
+
1758
def _deploy_prometheus(self, namespace: str) -> Dict[str, Any]:
    """Deploy Prometheus to the cluster.

    Creates (or, on HTTP 409, replaces) the scrape-config ConfigMap, the
    RBAC resources, a single-replica Deployment and a ClusterIP Service
    named 'prometheus-service' in the given namespace.

    Args:
        namespace: Namespace to deploy Prometheus into.

    Returns:
        Dict with a 'deployed' flag and the Service name.
    """
    result = {'deployed': False, 'service_name': 'prometheus-service'}

    # Prometheus ConfigMap: scrapes the API servers, the nodes, and any pod
    # annotated with prometheus.io/scrape=true.
    # NOTE(review): storage below is an emptyDir, so collected metrics are
    # lost whenever the Prometheus pod restarts.
    prometheus_config = """
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
"""

    # Create ConfigMap (replace on conflict so re-runs are idempotent).
    configmap = client.V1ConfigMap(
        metadata=client.V1ObjectMeta(name="prometheus-config", namespace=namespace),
        data={"prometheus.yml": prometheus_config}
    )

    try:
        self.core_v1.create_namespaced_config_map(namespace=namespace, body=configmap)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_config_map(
                name="prometheus-config", namespace=namespace, body=configmap
            )

    # Create ServiceAccount and cluster-wide RBAC so service discovery works.
    self._create_prometheus_rbac(namespace)

    # Prometheus Deployment: one replica, config mounted from the ConfigMap,
    # ephemeral emptyDir TSDB storage.
    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "prometheus"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "prometheus"}),
                spec=client.V1PodSpec(
                    service_account_name="prometheus",
                    containers=[
                        client.V1Container(
                            name="prometheus",
                            image="prom/prometheus:latest",
                            ports=[client.V1ContainerPort(container_port=9090)],
                            args=[
                                "--config.file=/etc/prometheus/prometheus.yml",
                                "--storage.tsdb.path=/prometheus/",
                                "--web.console.libraries=/etc/prometheus/console_libraries",
                                "--web.console.templates=/etc/prometheus/consoles",
                                # Enables the /-/reload endpoint and graceful restarts.
                                "--web.enable-lifecycle"
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="prometheus-config",
                                    mount_path="/etc/prometheus/"
                                ),
                                client.V1VolumeMount(
                                    name="prometheus-storage",
                                    mount_path="/prometheus/"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        client.V1Volume(
                            name="prometheus-config",
                            config_map=client.V1ConfigMapVolumeSource(
                                name="prometheus-config"
                            )
                        ),
                        client.V1Volume(
                            name="prometheus-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.apps_v1.replace_namespaced_deployment(
                name="prometheus", namespace=namespace, body=deployment
            )

    # Prometheus Service: cluster-internal only; Grafana reaches it via DNS.
    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="prometheus-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "prometheus"},
            ports=[client.V1ServicePort(port=9090, target_port=9090)],
            type="ClusterIP"
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_service(
                name="prometheus-service", namespace=namespace, body=service
            )

    result['deployed'] = True
    return result
1912
+
1913
def _create_prometheus_rbac(self, namespace: str) -> None:
    """Create RBAC resources for Prometheus.

    Creates a namespaced ServiceAccount plus a cluster-scoped ClusterRole
    and ClusterRoleBinding granting read access to the resources Prometheus
    needs for Kubernetes service discovery. All creates treat HTTP 409
    (already exists) as success so the call is idempotent.
    """
    # ServiceAccount the Prometheus pod runs under.
    service_account = client.V1ServiceAccount(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace)
    )

    try:
        self.core_v1.create_namespaced_service_account(namespace=namespace, body=service_account)
    except ApiException as e:
        if e.status != 409:  # Ignore if already exists
            raise e

    # ClusterRole: read-only discovery permissions.
    # NOTE(review): the 'extensions' API group for ingresses is deprecated
    # on modern clusters ('networking.k8s.io' superseded it) — confirm the
    # target cluster versions before relying on that rule.
    rbac_v1 = client.RbacAuthorizationV1Api()
    cluster_role = client.V1ClusterRole(
        metadata=client.V1ObjectMeta(name="prometheus"),
        rules=[
            client.V1PolicyRule(
                api_groups=[""],
                resources=["nodes", "nodes/proxy", "services", "endpoints", "pods"],
                verbs=["get", "list", "watch"]
            ),
            client.V1PolicyRule(
                api_groups=["extensions"],
                resources=["ingresses"],
                verbs=["get", "list", "watch"]
            ),
            client.V1PolicyRule(
                non_resource_urls=["/metrics"],
                verbs=["get"]
            )
        ]
    )

    try:
        rbac_v1.create_cluster_role(body=cluster_role)
    except ApiException as e:
        if e.status != 409:  # Ignore if already exists
            raise e

    # ClusterRoleBinding: bind the role to the ServiceAccount above.
    # NOTE(review): newer kubernetes client releases rename V1Subject to
    # RbacV1Subject — verify against the pinned client version.
    cluster_role_binding = client.V1ClusterRoleBinding(
        metadata=client.V1ObjectMeta(name="prometheus"),
        subjects=[
            client.V1Subject(
                kind="ServiceAccount",
                name="prometheus",
                namespace=namespace
            )
        ],
        role_ref=client.V1RoleRef(
            kind="ClusterRole",
            name="prometheus",
            api_group="rbac.authorization.k8s.io"
        )
    )

    try:
        rbac_v1.create_cluster_role_binding(body=cluster_role_binding)
    except ApiException as e:
        if e.status != 409:  # Ignore if already exists
            raise e
1976
+
1977
def _deploy_grafana(self, namespace: str, service_type: str = "NodePort") -> Dict[str, Any]:
    """Deploy Grafana to the cluster.

    Creates (or, on HTTP 409, replaces) a single-replica Grafana Deployment
    and a Service named 'grafana-service' of the requested type.

    Args:
        namespace: Namespace to deploy Grafana into.
        service_type: Kubernetes Service type (NodePort, LoadBalancer, ClusterIP).

    Returns:
        Dict with deployment status, service details and admin credentials.
    """
    # SECURITY: credentials are hard-coded here and must stay in sync with
    # the GF_SECURITY_ADMIN_* env vars below and with _configure_grafana.
    result = {
        'deployed': False,
        'service_name': 'grafana-service',
        'service_type': service_type,
        'admin_user': 'admin',
        'admin_password': 'admin123'
    }

    # Grafana Deployment: emptyDir storage, so dashboards/settings that are
    # not re-provisioned are lost on pod restart.
    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="grafana", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "grafana"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "grafana"}),
                spec=client.V1PodSpec(
                    containers=[
                        client.V1Container(
                            name="grafana",
                            image="grafana/grafana:latest",
                            ports=[client.V1ContainerPort(container_port=3000)],
                            env=[
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_USER", value="admin"),
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_PASSWORD", value="admin123"),
                                client.V1EnvVar(name="GF_INSTALL_PLUGINS", value="grafana-kubernetes-app")
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="grafana-storage",
                                    mount_path="/var/lib/grafana"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        client.V1Volume(
                            name="grafana-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.apps_v1.replace_namespaced_deployment(
                name="grafana", namespace=namespace, body=deployment
            )

    # Grafana Service: pin the NodePort to 30300 so the URL is predictable.
    service_ports = [client.V1ServicePort(port=3000, target_port=3000)]
    if service_type == "NodePort":
        service_ports[0].node_port = 30300

    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="grafana-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "grafana"},
            ports=service_ports,
            type=service_type
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
        result['deployed'] = True
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_service(
                name="grafana-service", namespace=namespace, body=service
            )
            result['deployed'] = True

    return result
2058
+
2059
+ def _wait_for_monitoring_ready(self, namespace: str, timeout: int = 300) -> bool:
2060
+ """Wait for Prometheus and Grafana to be ready"""
2061
+ deployments = ["prometheus", "grafana"]
2062
+ start_time = time.time()
2063
+
2064
+ for deployment_name in deployments:
2065
+ print(f"⏳ Waiting for {deployment_name} to be ready...")
2066
+ deployment_ready = False
2067
+
2068
+ while time.time() - start_time < timeout and not deployment_ready:
2069
+ try:
2070
+ deployment = self.apps_v1.read_namespaced_deployment(
2071
+ name=deployment_name, namespace=namespace
2072
+ )
2073
+
2074
+ if (deployment.status.ready_replicas == deployment.spec.replicas and
2075
+ deployment.status.ready_replicas > 0):
2076
+ print(f"✅ {deployment_name} is ready")
2077
+ deployment_ready = True
2078
+ else:
2079
+ ready = deployment.status.ready_replicas or 0
2080
+ total = deployment.spec.replicas
2081
+ print(f"⏳ {deployment_name}: {ready}/{total} replicas ready...")
2082
+ time.sleep(5)
2083
+
2084
+ except ApiException as e:
2085
+ print(f"❌ Error checking {deployment_name} status: {e}")
2086
+ return False
2087
+
2088
+ if not deployment_ready:
2089
+ print(f"❌ Timeout waiting for {deployment_name} to be ready")
2090
+ return False
2091
+
2092
+ return True
2093
+
2094
def _configure_grafana(self, namespace: str, import_dashboard: bool = True) -> bool:
    """Configure Grafana with Prometheus data source and dashboard.

    Talks to the Grafana HTTP API using the admin credentials provisioned
    by _deploy_grafana. Best-effort: data-source/dashboard failures only
    print warnings; returns False only when Grafana is unreachable or an
    unexpected error occurs.
    """
    try:
        # Wait a bit for Grafana to fully start.
        # NOTE(review): fixed 10s sleep — a retry/poll loop against the
        # Grafana health endpoint would be more robust.
        time.sleep(10)

        import requests
        import json  # NOTE(review): unused here; requests handles JSON encoding

        # Get Grafana service URL (None means no routable address found).
        grafana_url = self._get_grafana_url(namespace)
        if not grafana_url:
            print("❌ Could not determine Grafana URL")
            return False

        print(f"🔧 Configuring Grafana at {grafana_url}")

        # Prometheus data source, addressed via in-cluster service DNS.
        datasource_payload = {
            "name": "Prometheus",
            "type": "prometheus",
            "url": f"http://prometheus-service.{namespace}.svc.cluster.local:9090",
            "access": "proxy",
            "isDefault": True
        }

        # Must match the GF_SECURITY_ADMIN_* values set in _deploy_grafana.
        auth = ('admin', 'admin123')

        # Add data source (409 means it is already configured).
        response = requests.post(
            f"{grafana_url}/api/datasources",
            json=datasource_payload,
            auth=auth,
            timeout=30
        )

        if response.status_code in [200, 409]:  # Success or already exists
            print("✅ Prometheus data source configured")
        else:
            print(f"⚠️ Warning: Could not add Prometheus data source: {response.text}")

        # Import default dashboard if requested.
        if import_dashboard:
            self._import_kubernetes_dashboard(grafana_url, auth)

        return True

    except Exception as e:
        # Configuration is optional: surface the problem but keep going.
        print(f"⚠️ Warning: Could not configure Grafana automatically: {e}")
        print("💡 You can manually add Prometheus as a data source in Grafana")
        return False
2145
+
2146
+ def _get_grafana_url(self, namespace: str) -> Optional[str]:
2147
+ """Get Grafana service URL"""
2148
+ try:
2149
+ service = self.core_v1.read_namespaced_service(
2150
+ name="grafana-service", namespace=namespace
2151
+ )
2152
+
2153
+ if service.spec.type == "NodePort":
2154
+ # Try to get node IP
2155
+ nodes = self.core_v1.list_node()
2156
+ if nodes.items:
2157
+ # Get external IP or internal IP
2158
+ node_ip = None
2159
+ for node in nodes.items:
2160
+ for address in node.status.addresses:
2161
+ if address.type == "ExternalIP":
2162
+ node_ip = address.address
2163
+ break
2164
+ if not node_ip:
2165
+ for address in node.status.addresses:
2166
+ if address.type == "InternalIP":
2167
+ node_ip = address.address
2168
+ break
2169
+ if node_ip:
2170
+ break
2171
+
2172
+ if node_ip:
2173
+ node_port = None
2174
+ for port in service.spec.ports:
2175
+ if port.node_port:
2176
+ node_port = port.node_port
2177
+ break
2178
+
2179
+ if node_port:
2180
+ return f"http://{node_ip}:{node_port}"
2181
+
2182
+ elif service.spec.type == "LoadBalancer":
2183
+ if service.status.load_balancer.ingress:
2184
+ ingress = service.status.load_balancer.ingress[0]
2185
+ host = ingress.ip or ingress.hostname
2186
+ if host:
2187
+ return f"http://{host}:3000"
2188
+
2189
+ # Fallback to port-forward approach
2190
+ return None
2191
+
2192
+ except Exception as e:
2193
+ print(f"Warning: Could not determine Grafana URL: {e}")
2194
+ return None
2195
+
2196
def _import_kubernetes_dashboard(self, grafana_url: str, auth: tuple) -> None:
    """Import a default Kubernetes dashboard.

    Posts a small three-panel cluster-overview dashboard (CPU, memory, pod
    count) to the Grafana dashboards API. Best-effort: failures are printed
    as warnings and never raised.

    Args:
        grafana_url: Base URL of the Grafana instance.
        auth: (username, password) tuple for basic auth.
    """
    try:
        import requests

        # Simple Kubernetes cluster dashboard JSON.
        # NOTE(review): the PromQL below queries node_cpu_seconds_total /
        # node_memory_* (node-exporter) and kube_pod_info (kube-state-metrics);
        # neither exporter is deployed by this monitoring stack, so the panels
        # stay empty unless those are installed separately — confirm.
        dashboard_json = {
            "dashboard": {
                "id": None,  # None lets Grafana assign a new dashboard id
                "title": "Kubernetes Cluster Overview",
                "tags": ["kubernetes"],
                "timezone": "browser",
                "panels": [
                    {
                        "id": 1,
                        "title": "Cluster CPU Usage",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
                    },
                    {
                        "id": 2,
                        "title": "Cluster Memory Usage",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "100 * (1 - ((avg_over_time(node_memory_MemFree_bytes[10m]) + avg_over_time(node_memory_Cached_bytes[10m]) + avg_over_time(node_memory_Buffers_bytes[10m])) / avg_over_time(node_memory_MemTotal_bytes[10m])))",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
                    },
                    {
                        "id": 3,
                        "title": "Pod Count",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "sum(kube_pod_info)",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
                    }
                ],
                "time": {"from": "now-1h", "to": "now"},
                "refresh": "30s"
            },
            # Replace any existing dashboard with the same identity.
            "overwrite": True
        }

        response = requests.post(
            f"{grafana_url}/api/dashboards/db",
            json=dashboard_json,
            auth=auth,
            timeout=30
        )

        if response.status_code == 200:
            print("✅ Kubernetes dashboard imported")
        else:
            print(f"⚠️ Warning: Could not import dashboard: {response.text}")

    except Exception as e:
        print(f"⚠️ Warning: Could not import dashboard: {e}")
2266
+
2267
def get_monitoring_info(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """Collect monitoring-stack status, service details, URLs and the
    Grafana admin credentials for the given namespace."""
    info = {
        'namespace': namespace,
        'prometheus': {'running': False, 'url': None},
        'grafana': {'running': False, 'url': None, 'credentials': None},
        'services': []
    }

    def _all_ready(dep):
        # Ready only when every desired replica reports ready.
        return (dep.status.ready_replicas == dep.spec.replicas and
                dep.status.ready_replicas > 0)

    try:
        # Deployment readiness for the two stack components.
        for dep in self.apps_v1.list_namespaced_deployment(namespace=namespace).items:
            if dep.metadata.name == "prometheus":
                info['prometheus']['running'] = _all_ready(dep)
            elif dep.metadata.name == "grafana":
                info['grafana']['running'] = _all_ready(dep)

        # Service inventory plus externally reachable URLs where available.
        for svc in self.core_v1.list_namespaced_service(namespace=namespace).items:
            ports = []
            for port in svc.spec.ports:
                entry = {
                    'port': port.port,
                    'target_port': port.target_port,
                    'protocol': port.protocol
                }
                if getattr(port, 'node_port', None):
                    entry['node_port'] = port.node_port
                ports.append(entry)

            info['services'].append({
                'name': svc.metadata.name,
                'type': svc.spec.type,
                'cluster_ip': svc.spec.cluster_ip,
                'ports': ports
            })

            if svc.metadata.name == "grafana-service":
                info['grafana']['credentials'] = {
                    'username': 'admin',
                    'password': 'admin123'
                }
                if svc.spec.type == "NodePort":
                    url_info = self.get_service_url("grafana-service", namespace)
                    if url_info and url_info.get('external_url'):
                        info['grafana']['url'] = url_info['external_url']
            elif svc.metadata.name == "prometheus-service":
                if svc.spec.type == "NodePort":
                    url_info = self.get_service_url("prometheus-service", namespace)
                    if url_info and url_info.get('external_url'):
                        info['prometheus']['url'] = url_info['external_url']

        return info

    except ApiException as e:
        info['error'] = str(e)
        return info
2339
+
2340
def add_prometheus_target(self, job_name: str, targets: List[str],
                          namespace: str = "monitoring",
                          metrics_path: str = "/metrics",
                          scrape_interval: str = "15s") -> bool:
    """Append a static scrape job to the Prometheus configuration.

    Args:
        job_name: Name for the job in Prometheus config
        targets: List of target addresses (e.g., ["service:port", "1.2.3.4:9090"])
        namespace: Monitoring namespace
        metrics_path: Path to metrics endpoint
        scrape_interval: How often to scrape the target

    Returns:
        bool: True if target was added successfully
    """
    try:
        import yaml

        # Load the live scrape configuration from the ConfigMap.
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        config = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        # Refuse to create a duplicate job name.
        if any(job['job_name'] == job_name for job in config.get('scrape_configs', [])):
            print(f"❌ Job '{job_name}' already exists. Use update or remove it first.")
            return False

        config.setdefault('scrape_configs', []).append({
            'job_name': job_name,
            'scrape_interval': scrape_interval,
            'metrics_path': metrics_path,
            'static_configs': [
                {'targets': targets}
            ]
        })

        # Persist the updated configuration.
        cm.data["prometheus.yml"] = yaml.dump(
            config, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=cm
        )

        # Prometheus only loads its config at startup, so bounce the pod.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Added Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error adding Prometheus target: {e}")
        return False
2412
+
2413
def remove_prometheus_target(self, job_name: str, namespace: str = "monitoring") -> bool:
    """Delete a scrape job from the Prometheus configuration by name.

    Args:
        job_name: Name of the job to remove
        namespace: Monitoring namespace

    Returns:
        bool: True if target was removed successfully
    """
    try:
        import yaml

        # Load the live scrape configuration from the ConfigMap.
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        config = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        jobs = config.get('scrape_configs', [])
        kept = [job for job in jobs if job.get('job_name') != job_name]

        if len(kept) == len(jobs):
            # Nothing was filtered out, so the job name is unknown.
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False

        config['scrape_configs'] = kept

        # Persist the updated configuration.
        cm.data["prometheus.yml"] = yaml.dump(
            config, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=cm
        )

        # Prometheus only loads its config at startup, so bounce the pod.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Removed Prometheus target job '{job_name}'")
        return True

    except Exception as e:
        print(f"❌ Error removing Prometheus target: {e}")
        return False
2467
+
2468
def list_prometheus_targets(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """Summarize every scrape job configured in Prometheus.

    Args:
        namespace: Monitoring namespace

    Returns:
        Dict containing target information
    """
    try:
        import yaml

        # Load the live scrape configuration from the ConfigMap.
        cm = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        config = yaml.safe_load(cm.data.get("prometheus.yml", ""))

        summaries = []
        for job in config.get('scrape_configs', []):
            entry = {
                'job_name': job.get('job_name', 'unknown'),
                'scrape_interval': job.get('scrape_interval', 'default'),
                'metrics_path': job.get('metrics_path', '/metrics'),
                # Flatten all static target lists into one.
                'targets': [t for sc in job.get('static_configs', [])
                            for t in sc.get('targets', [])]
            }

            # Service-discovery jobs have no static target list to report.
            if 'kubernetes_sd_configs' in job:
                entry['type'] = 'kubernetes_discovery'
                entry['targets'] = ['<kubernetes_discovery>']
            else:
                entry['type'] = 'static'

            summaries.append(entry)

        return {
            'namespace': namespace,
            'targets': summaries,
            'total_jobs': len(summaries)
        }

    except Exception as e:
        return {'error': str(e)}
2522
+
2523
def update_prometheus_target(self, job_name: str, targets: List[str],
                             namespace: str = "monitoring",
                             metrics_path: Optional[str] = None,
                             scrape_interval: Optional[str] = None) -> bool:
    """Update an existing Prometheus target.

    Args:
        job_name: Name of the job to update
        targets: New list of target addresses
        namespace: Monitoring namespace
        metrics_path: Optional new metrics path (annotation fixed to
            Optional[str]; implicit Optional is deprecated per PEP 484)
        scrape_interval: Optional new scrape interval

    Returns:
        bool: True if target was updated successfully
    """
    try:
        import yaml

        # Load the live scrape configuration from the ConfigMap.
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        config_data = yaml.safe_load(configmap.data.get("prometheus.yml", ""))

        # Find and update the job.
        job_found = False
        for job in config_data.get('scrape_configs', []):
            if job.get('job_name') != job_name:
                continue
            job_found = True

            # Bug fix: the previous `'static_configs' not in job` check
            # missed an existing-but-empty list, in which case indexing
            # [0] below raised IndexError. Truthiness covers both cases.
            if not job.get('static_configs'):
                job['static_configs'] = [{}]
            job['static_configs'][0]['targets'] = targets

            # Optional overrides; leave current values when None/empty.
            if metrics_path:
                job['metrics_path'] = metrics_path
            if scrape_interval:
                job['scrape_interval'] = scrape_interval
            break

        if not job_found:
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False

        # Persist the updated configuration.
        configmap.data["prometheus.yml"] = yaml.dump(
            config_data, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=configmap
        )

        # Prometheus only loads its config at startup, so bounce the pod.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Updated Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error updating Prometheus target: {e}")
        return False
2597
+
2598
+ def _restart_prometheus_deployment(self, namespace: str) -> bool:
2599
+ """Restart Prometheus deployment to reload configuration"""
2600
+ try:
2601
+ # Get current deployment
2602
+ deployment = self.apps_v1.read_namespaced_deployment(
2603
+ name="prometheus",
2604
+ namespace=namespace
2605
+ )
2606
+
2607
+ # Update deployment to trigger restart
2608
+ if deployment.spec.template.metadata.annotations is None:
2609
+ deployment.spec.template.metadata.annotations = {}
2610
+
2611
+ import time
2612
+ deployment.spec.template.metadata.annotations['kubectl.kubernetes.io/restartedAt'] = str(int(time.time()))
2613
+
2614
+ # Apply the update
2615
+ self.apps_v1.patch_namespaced_deployment(
2616
+ name="prometheus",
2617
+ namespace=namespace,
2618
+ body=deployment
2619
+ )
2620
+
2621
+ print("🔄 Restarting Prometheus deployment to reload configuration...")
2622
+ return True
2623
+
2624
+ except Exception as e:
2625
+ print(f"⚠️ Could not restart Prometheus deployment: {e}")
2626
+ return False