k8s-helper-cli 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
k8s_helper/core.py CHANGED
@@ -1679,3 +1679,960 @@ class K8sClient:
1679
1679
 
1680
1680
  print(f"❌ Timeout waiting for deployment '{name}' to be ready")
1681
1681
  return False
1682
+
1683
+ # ======================
1684
+ # MONITORING OPERATIONS
1685
+ # ======================
1686
def setup_monitoring(self, namespace: str = "monitoring",
                     grafana_service_type: str = "NodePort",
                     import_dashboard: bool = True,
                     wait_for_ready: bool = True) -> Dict[str, Any]:
    """Deploy a complete Prometheus + Grafana monitoring stack.

    Args:
        namespace: Namespace to deploy the monitoring stack into
        grafana_service_type: Service type for Grafana (NodePort, LoadBalancer, ClusterIP)
        import_dashboard: Whether to import the default Kubernetes dashboard
        wait_for_ready: Whether to block until both deployments are ready

    Returns:
        Dictionary with deployment info, URLs, and credentials
    """
    outcome: Dict[str, Any] = {
        'namespace': namespace,
        'prometheus': {},
        'grafana': {},
        'success': False,
        'error': None,
    }

    try:
        # Namespace first, then the two workloads.
        self._create_monitoring_namespace(namespace)
        outcome['prometheus'] = self._deploy_prometheus(namespace)
        outcome['grafana'] = self._deploy_grafana(namespace, grafana_service_type)

        if not wait_for_ready:
            outcome['success'] = True
            return outcome

        # Grafana can only be configured once both deployments are up.
        if not self._wait_for_monitoring_ready(namespace):
            outcome['error'] = "Monitoring deployments failed to become ready"
        elif self._configure_grafana(namespace, import_dashboard):
            outcome['success'] = True
        else:
            outcome['error'] = "Failed to configure Grafana"
        return outcome

    except Exception as e:
        # Surface any failure through the result dict rather than raising.
        outcome['error'] = str(e)
        return outcome
1739
+
1740
def _create_monitoring_namespace(self, namespace: str) -> bool:
    """Ensure *namespace* exists, creating it when it is absent.

    Returns True in both cases; re-raises any API error other than 404.
    """
    try:
        self.core_v1.read_namespace(name=namespace)
    except ApiException as e:
        if e.status != 404:
            raise e
        # Namespace is missing — create it now.
        body = client.V1Namespace(
            metadata=client.V1ObjectMeta(name=namespace)
        )
        self.core_v1.create_namespace(body=body)
        print(f"✅ Created namespace '{namespace}'")
        return True
    print(f"✅ Namespace '{namespace}' already exists")
    return True
1757
+
1758
def _deploy_prometheus(self, namespace: str) -> Dict[str, Any]:
    """Deploy Prometheus (ConfigMap, RBAC, Deployment, Service) into *namespace*.

    Existing resources are replaced (create-or-replace semantics on HTTP 409).

    Args:
        namespace: Target namespace for all Prometheus resources.

    Returns:
        Dict with a 'deployed' flag and the Prometheus service name.

    Raises:
        ApiException: on any Kubernetes API error other than 409 (conflict).
            The original code silently swallowed non-409 errors; they are now
            re-raised, consistent with _create_prometheus_rbac.
    """
    result: Dict[str, Any] = {'deployed': False, 'service_name': 'prometheus-service'}

    # Scrape config: API servers, nodes, and pods annotated with
    # prometheus.io/scrape=true (path/port overridable via annotations).
    prometheus_config = """
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\\d+)?;(\\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
"""

    # ConfigMap holding prometheus.yml
    configmap = client.V1ConfigMap(
        metadata=client.V1ObjectMeta(name="prometheus-config", namespace=namespace),
        data={"prometheus.yml": prometheus_config}
    )

    try:
        self.core_v1.create_namespaced_config_map(namespace=namespace, body=configmap)
    except ApiException as e:
        if e.status == 409:  # Already exists — replace with current config
            self.core_v1.replace_namespaced_config_map(
                name="prometheus-config", namespace=namespace, body=configmap
            )
        else:
            raise e

    # ServiceAccount + ClusterRole(Binding) so Prometheus can use the k8s SD APIs.
    self._create_prometheus_rbac(namespace)

    # Prometheus Deployment: single replica, config from the ConfigMap,
    # ephemeral TSDB storage in an emptyDir.
    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "prometheus"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "prometheus"}),
                spec=client.V1PodSpec(
                    service_account_name="prometheus",
                    containers=[
                        client.V1Container(
                            name="prometheus",
                            image="prom/prometheus:latest",
                            ports=[client.V1ContainerPort(container_port=9090)],
                            args=[
                                "--config.file=/etc/prometheus/prometheus.yml",
                                "--storage.tsdb.path=/prometheus/",
                                "--web.console.libraries=/etc/prometheus/console_libraries",
                                "--web.console.templates=/etc/prometheus/consoles",
                                # Allows config reload via HTTP POST /-/reload
                                "--web.enable-lifecycle"
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="prometheus-config",
                                    mount_path="/etc/prometheus/"
                                ),
                                client.V1VolumeMount(
                                    name="prometheus-storage",
                                    mount_path="/prometheus/"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        client.V1Volume(
                            name="prometheus-config",
                            config_map=client.V1ConfigMapVolumeSource(
                                name="prometheus-config"
                            )
                        ),
                        client.V1Volume(
                            name="prometheus-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.apps_v1.replace_namespaced_deployment(
                name="prometheus", namespace=namespace, body=deployment
            )
        else:
            raise e

    # ClusterIP service; Grafana reaches it via in-cluster DNS.
    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="prometheus-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "prometheus"},
            ports=[client.V1ServicePort(port=9090, target_port=9090)],
            type="ClusterIP"
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_service(
                name="prometheus-service", namespace=namespace, body=service
            )
        else:
            raise e

    result['deployed'] = True
    return result
1912
+
1913
def _create_prometheus_rbac(self, namespace: str) -> None:
    """Create the ServiceAccount, ClusterRole and ClusterRoleBinding for Prometheus.

    Every create treats HTTP 409 (already exists) as success; any other API
    error is re-raised.
    """
    sa = client.V1ServiceAccount(
        metadata=client.V1ObjectMeta(name="prometheus", namespace=namespace)
    )
    try:
        self.core_v1.create_namespaced_service_account(namespace=namespace, body=sa)
    except ApiException as e:
        if e.status != 409:  # 409 == already exists
            raise e

    # Cluster-wide read access to the objects Prometheus discovers and scrapes.
    rbac_api = client.RbacAuthorizationV1Api()
    role = client.V1ClusterRole(
        metadata=client.V1ObjectMeta(name="prometheus"),
        rules=[
            client.V1PolicyRule(
                api_groups=[""],
                resources=["nodes", "nodes/proxy", "services", "endpoints", "pods"],
                verbs=["get", "list", "watch"],
            ),
            client.V1PolicyRule(
                api_groups=["extensions"],
                resources=["ingresses"],
                verbs=["get", "list", "watch"],
            ),
        ],
    )

    # The kwarg spelling for non-resource URLs differs across kubernetes
    # client versions, so probe the newer name and fall back to the older one.
    try:
        metrics_rule = client.V1PolicyRule(
            non_resource_ur_ls=["/metrics"],
            verbs=["get"],
        )
    except TypeError:
        metrics_rule = client.V1PolicyRule(
            non_resource_urls=["/metrics"],
            verbs=["get"],
        )
    role.rules.append(metrics_rule)

    try:
        rbac_api.create_cluster_role(body=role)
    except ApiException as e:
        if e.status != 409:
            raise e

    # Bind the ClusterRole to the ServiceAccount created above.
    binding = client.V1ClusterRoleBinding(
        metadata=client.V1ObjectMeta(name="prometheus"),
        subjects=[
            client.V1Subject(
                kind="ServiceAccount",
                name="prometheus",
                namespace=namespace,
            )
        ],
        role_ref=client.V1RoleRef(
            kind="ClusterRole",
            name="prometheus",
            api_group="rbac.authorization.k8s.io",
        ),
    )
    try:
        rbac_api.create_cluster_role_binding(body=binding)
    except ApiException as e:
        if e.status != 409:
            raise e
1988
+
1989
def _deploy_grafana(self, namespace: str, service_type: str = "NodePort") -> Dict[str, Any]:
    """Deploy Grafana (Deployment + Service) into *namespace*.

    Existing resources are replaced (create-or-replace semantics on HTTP 409).

    Args:
        namespace: Target namespace.
        service_type: Kubernetes service type for Grafana (NodePort,
            LoadBalancer, ClusterIP). NodePort pins port 30300.

    Returns:
        Dict with a 'deployed' flag, service name/type and the default
        admin credentials baked into the container env.

    Raises:
        ApiException: on any Kubernetes API error other than 409 (conflict).
            The original code silently swallowed non-409 errors (leaving
            'deployed' False with no signal); they are now re-raised,
            consistent with _create_prometheus_rbac.
    """
    result: Dict[str, Any] = {
        'deployed': False,
        'service_name': 'grafana-service',
        'service_type': service_type,
        # NOTE: hard-coded default credentials; fine for demos, rotate for real use.
        'admin_user': 'admin',
        'admin_password': 'admin123'
    }

    # Grafana Deployment: single replica, ephemeral storage in an emptyDir.
    deployment = client.V1Deployment(
        metadata=client.V1ObjectMeta(name="grafana", namespace=namespace),
        spec=client.V1DeploymentSpec(
            replicas=1,
            selector=client.V1LabelSelector(match_labels={"app": "grafana"}),
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels={"app": "grafana"}),
                spec=client.V1PodSpec(
                    containers=[
                        client.V1Container(
                            name="grafana",
                            image="grafana/grafana:latest",
                            ports=[client.V1ContainerPort(container_port=3000)],
                            env=[
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_USER", value="admin"),
                                client.V1EnvVar(name="GF_SECURITY_ADMIN_PASSWORD", value="admin123"),
                                client.V1EnvVar(name="GF_INSTALL_PLUGINS", value="grafana-kubernetes-app")
                            ],
                            volume_mounts=[
                                client.V1VolumeMount(
                                    name="grafana-storage",
                                    mount_path="/var/lib/grafana"
                                )
                            ]
                        )
                    ],
                    volumes=[
                        client.V1Volume(
                            name="grafana-storage",
                            empty_dir=client.V1EmptyDirVolumeSource()
                        )
                    ]
                )
            )
        )
    )

    try:
        self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.apps_v1.replace_namespaced_deployment(
                name="grafana", namespace=namespace, body=deployment
            )
        else:
            raise e

    # Grafana Service; NodePort gets a fixed, predictable port.
    service_ports = [client.V1ServicePort(port=3000, target_port=3000)]
    if service_type == "NodePort":
        service_ports[0].node_port = 30300

    service = client.V1Service(
        metadata=client.V1ObjectMeta(name="grafana-service", namespace=namespace),
        spec=client.V1ServiceSpec(
            selector={"app": "grafana"},
            ports=service_ports,
            type=service_type
        )
    )

    try:
        self.core_v1.create_namespaced_service(namespace=namespace, body=service)
    except ApiException as e:
        if e.status == 409:  # Already exists
            self.core_v1.replace_namespaced_service(
                name="grafana-service", namespace=namespace, body=service
            )
        else:
            raise e

    result['deployed'] = True
    return result
2070
+
2071
def _wait_for_monitoring_ready(self, namespace: str, timeout: int = 300) -> bool:
    """Block until the prometheus and grafana deployments are fully ready.

    The *timeout* budget (seconds) is shared across both deployments.
    Returns False on timeout or on any API error while polling.
    """
    deadline = time.time() + timeout

    for name in ("prometheus", "grafana"):
        print(f"⏳ Waiting for {name} to be ready...")
        became_ready = False

        while not became_ready and time.time() < deadline:
            try:
                dep = self.apps_v1.read_namespaced_deployment(
                    name=name, namespace=namespace
                )
            except ApiException as e:
                print(f"❌ Error checking {name} status: {e}")
                return False

            ready_count = dep.status.ready_replicas
            if ready_count == dep.spec.replicas and ready_count > 0:
                print(f"✅ {name} is ready")
                became_ready = True
            else:
                # Report progress and back off before the next poll.
                print(f"⏳ {name}: {ready_count or 0}/{dep.spec.replicas} replicas ready...")
                time.sleep(5)

        if not became_ready:
            print(f"❌ Timeout waiting for {name} to be ready")
            return False

    return True
2105
+
2106
def _configure_grafana(self, namespace: str, import_dashboard: bool = True) -> bool:
    """Configure Grafana with the Prometheus data source and optional dashboard.

    Args:
        namespace: Namespace the monitoring stack runs in.
        import_dashboard: Also import the default Kubernetes dashboard.

    Returns:
        True when configuration ran (data source / dashboard problems only
        produce warnings), False when Grafana could not be reached.
    """
    try:
        # Give Grafana a moment to finish booting before hitting its HTTP API.
        time.sleep(10)

        import requests  # local import keeps requests optional for non-monitoring use

        # Resolve an externally reachable URL for the Grafana service.
        grafana_url = self._get_grafana_url(namespace)
        if not grafana_url:
            print("❌ Could not determine Grafana URL")
            return False

        print(f"🔧 Configuring Grafana at {grafana_url}")

        # Register Prometheus via its in-cluster DNS name as the default source.
        datasource_payload = {
            "name": "Prometheus",
            "type": "prometheus",
            "url": f"http://prometheus-service.{namespace}.svc.cluster.local:9090",
            "access": "proxy",
            "isDefault": True
        }

        auth = ('admin', 'admin123')

        response = requests.post(
            f"{grafana_url}/api/datasources",
            json=datasource_payload,
            auth=auth,
            timeout=30
        )

        # 409 means the data source is already registered — treat as success.
        if response.status_code in [200, 409]:
            print("✅ Prometheus data source configured")
        else:
            print(f"⚠️ Warning: Could not add Prometheus data source: {response.text}")

        if import_dashboard:
            self._import_kubernetes_dashboard(grafana_url, auth)

        return True

    except Exception as e:
        print(f"⚠️ Warning: Could not configure Grafana automatically: {e}")
        print("💡 You can manually add Prometheus as a data source in Grafana")
        return False
2157
+
2158
def _get_grafana_url(self, namespace: str) -> Optional[str]:
    """Return a best-effort external URL for grafana-service, or None.

    NodePort: first node address (ExternalIP preferred, InternalIP fallback)
    plus the service's node port. LoadBalancer: first ingress IP/hostname on
    port 3000. Anything else (e.g. ClusterIP) yields None — the caller has to
    port-forward instead.
    """
    try:
        svc = self.core_v1.read_namespaced_service(
            name="grafana-service", namespace=namespace
        )

        if svc.spec.type == "NodePort":
            # Find a routable node address.
            chosen_ip = None
            for node in self.core_v1.list_node().items:
                addrs = node.status.addresses
                chosen_ip = next(
                    (a.address for a in addrs if a.type == "ExternalIP"), None
                )
                if chosen_ip is None:
                    chosen_ip = next(
                        (a.address for a in addrs if a.type == "InternalIP"), None
                    )
                if chosen_ip:
                    break

            if chosen_ip:
                chosen_port = next(
                    (p.node_port for p in svc.spec.ports if p.node_port), None
                )
                if chosen_port:
                    return f"http://{chosen_ip}:{chosen_port}"

        elif svc.spec.type == "LoadBalancer":
            if svc.status.load_balancer.ingress:
                entry = svc.status.load_balancer.ingress[0]
                host = entry.ip or entry.hostname
                if host:
                    return f"http://{host}:3000"

        # No externally reachable address — caller falls back to port-forward.
        return None

    except Exception as e:
        print(f"Warning: Could not determine Grafana URL: {e}")
        return None
2207
+
2208
def _import_kubernetes_dashboard(self, grafana_url: str, auth: tuple) -> None:
    """Import a default Kubernetes dashboard.

    Best-effort: any failure (HTTP error, unreachable Grafana) is reported as
    a warning and swallowed — callers do not depend on the dashboard existing.

    Args:
        grafana_url: Base URL of the Grafana instance (no trailing slash).
        auth: (username, password) tuple for Grafana basic auth.
    """
    try:
        import requests

        # Simple Kubernetes cluster dashboard JSON: three stat panels
        # (CPU %, memory %, pod count) driven by node-exporter /
        # kube-state-metrics series. "overwrite": True lets re-runs
        # replace an existing dashboard with the same title.
        dashboard_json = {
            "dashboard": {
                "id": None,  # None lets Grafana assign a new id
                "title": "Kubernetes Cluster Overview",
                "tags": ["kubernetes"],
                "timezone": "browser",
                "panels": [
                    {
                        "id": 1,
                        "title": "Cluster CPU Usage",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "100 - (avg(irate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
                    },
                    {
                        "id": 2,
                        "title": "Cluster Memory Usage",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "100 * (1 - ((avg_over_time(node_memory_MemFree_bytes[10m]) + avg_over_time(node_memory_Cached_bytes[10m]) + avg_over_time(node_memory_Buffers_bytes[10m])) / avg_over_time(node_memory_MemTotal_bytes[10m])))",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
                    },
                    {
                        "id": 3,
                        "title": "Pod Count",
                        "type": "stat",
                        "targets": [
                            {
                                "expr": "sum(kube_pod_info)",
                                "refId": "A"
                            }
                        ],
                        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
                    }
                ],
                "time": {"from": "now-1h", "to": "now"},
                "refresh": "30s"
            },
            "overwrite": True
        }

        response = requests.post(
            f"{grafana_url}/api/dashboards/db",
            json=dashboard_json,
            auth=auth,
            timeout=30
        )

        if response.status_code == 200:
            print("✅ Kubernetes dashboard imported")
        else:
            print(f"⚠️ Warning: Could not import dashboard: {response.text}")

    except Exception as e:
        print(f"⚠️ Warning: Could not import dashboard: {e}")
2278
+
2279
def get_monitoring_info(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """Get monitoring stack information and URLs.

    Inspects the deployments and services in *namespace* and reports whether
    Prometheus/Grafana are fully ready, their service details, and (for
    NodePort services) an external URL resolved via get_service_url.

    Args:
        namespace: Namespace the monitoring stack was deployed into.

    Returns:
        Dict with 'prometheus'/'grafana' status sub-dicts and a 'services'
        list; an 'error' key is added if the API listing fails.
    """
    result = {
        'namespace': namespace,
        'prometheus': {'running': False, 'url': None},
        'grafana': {'running': False, 'url': None, 'credentials': None},
        'services': []
    }

    try:
        # Check if deployments are running (all desired replicas ready).
        deployments = self.apps_v1.list_namespaced_deployment(namespace=namespace)

        for deployment in deployments.items:
            if deployment.metadata.name == "prometheus":
                result['prometheus']['running'] = (
                    deployment.status.ready_replicas == deployment.spec.replicas and
                    deployment.status.ready_replicas > 0
                )
            elif deployment.metadata.name == "grafana":
                result['grafana']['running'] = (
                    deployment.status.ready_replicas == deployment.spec.replicas and
                    deployment.status.ready_replicas > 0
                )

        # Get service information for every service in the namespace.
        services = self.core_v1.list_namespaced_service(namespace=namespace)

        for service in services.items:
            service_info = {
                'name': service.metadata.name,
                'type': service.spec.type,
                'cluster_ip': service.spec.cluster_ip,
                'ports': []
            }

            for port in service.spec.ports:
                port_info = {
                    'port': port.port,
                    'target_port': port.target_port,
                    'protocol': port.protocol
                }
                # node_port is only present/meaningful on NodePort services.
                if hasattr(port, 'node_port') and port.node_port:
                    port_info['node_port'] = port.node_port
                service_info['ports'].append(port_info)

            result['services'].append(service_info)

            # Set URLs (and, for Grafana, the default credentials baked in
            # by _deploy_grafana).
            if service.metadata.name == "grafana-service":
                result['grafana']['credentials'] = {
                    'username': 'admin',
                    'password': 'admin123'
                }

                if service.spec.type == "NodePort":
                    # Resolve node IP and port into an external URL.
                    url_info = self.get_service_url("grafana-service", namespace)
                    if url_info and url_info.get('external_url'):
                        result['grafana']['url'] = url_info['external_url']

            elif service.metadata.name == "prometheus-service":
                if service.spec.type == "NodePort":
                    url_info = self.get_service_url("prometheus-service", namespace)
                    if url_info and url_info.get('external_url'):
                        result['prometheus']['url'] = url_info['external_url']

        return result

    except ApiException as e:
        result['error'] = str(e)
        return result
2351
+
2352
def add_prometheus_target(self, job_name: str, targets: List[str],
                          namespace: str = "monitoring",
                          metrics_path: str = "/metrics",
                          scrape_interval: str = "15s") -> bool:
    """Add a new scrape job to the Prometheus configuration.

    Args:
        job_name: Name for the job in the Prometheus config.
        targets: Target addresses (e.g. ["service:port", "1.2.3.4:9090"]).
        namespace: Monitoring namespace holding the prometheus-config ConfigMap.
        metrics_path: Path to the metrics endpoint.
        scrape_interval: How often to scrape the target.

    Returns:
        bool: True if the job was added and Prometheus restarted.
    """
    try:
        import yaml

        # Load the live configuration out of the ConfigMap.
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        parsed = yaml.safe_load(configmap.data.get("prometheus.yml", ""))

        # Refuse duplicates — the caller must update or remove first.
        if any(job['job_name'] == job_name for job in parsed.get('scrape_configs', [])):
            print(f"❌ Job '{job_name}' already exists. Use update or remove it first.")
            return False

        # Append the new static-target job.
        parsed.setdefault('scrape_configs', []).append({
            'job_name': job_name,
            'scrape_interval': scrape_interval,
            'metrics_path': metrics_path,
            'static_configs': [
                {'targets': targets}
            ],
        })

        # Write the updated YAML back into the ConfigMap.
        configmap.data["prometheus.yml"] = yaml.dump(
            parsed, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=configmap
        )

        # Restart Prometheus so it picks up the new configuration.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Added Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error adding Prometheus target: {e}")
        return False
2424
+
2425
def remove_prometheus_target(self, job_name: str, namespace: str = "monitoring") -> bool:
    """Remove a scrape job from the Prometheus configuration.

    Args:
        job_name: Name of the job to remove.
        namespace: Monitoring namespace holding the prometheus-config ConfigMap.

    Returns:
        bool: True if the job was removed and Prometheus restarted.
    """
    try:
        import yaml

        # Load the live configuration out of the ConfigMap.
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        parsed = yaml.safe_load(configmap.data.get("prometheus.yml", ""))

        # Drop the matching job; if nothing changed, it wasn't there.
        before = parsed.get('scrape_configs', [])
        after = [job for job in before if job.get('job_name') != job_name]
        if len(after) == len(before):
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False
        parsed['scrape_configs'] = after

        # Write the updated YAML back into the ConfigMap.
        configmap.data["prometheus.yml"] = yaml.dump(
            parsed, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=configmap
        )

        # Restart Prometheus so it picks up the new configuration.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Removed Prometheus target job '{job_name}'")
        return True

    except Exception as e:
        print(f"❌ Error removing Prometheus target: {e}")
        return False
2479
+
2480
def list_prometheus_targets(self, namespace: str = "monitoring") -> Dict[str, Any]:
    """List all scrape jobs configured in Prometheus.

    Args:
        namespace: Monitoring namespace holding the prometheus-config ConfigMap.

    Returns:
        Dict with the namespace, a 'targets' list (one entry per job) and
        'total_jobs'; on failure, a dict with only an 'error' key.
    """
    try:
        import yaml

        # Load the live configuration out of the ConfigMap.
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        parsed = yaml.safe_load(configmap.data.get("prometheus.yml", ""))

        summaries = []
        for job in parsed.get('scrape_configs', []):
            entry = {
                'job_name': job.get('job_name', 'unknown'),
                'scrape_interval': job.get('scrape_interval', 'default'),
                'metrics_path': job.get('metrics_path', '/metrics'),
                'targets': []
            }

            # Collect static targets across all static_configs blocks.
            for static_config in job.get('static_configs', []):
                entry['targets'].extend(static_config.get('targets', []))

            # Kubernetes-SD jobs have dynamic targets; mark them as such
            # (overriding any static list).
            if 'kubernetes_sd_configs' in job:
                entry['type'] = 'kubernetes_discovery'
                entry['targets'] = ['<kubernetes_discovery>']
            else:
                entry['type'] = 'static'

            summaries.append(entry)

        return {
            'namespace': namespace,
            'targets': summaries,
            'total_jobs': len(summaries)
        }

    except Exception as e:
        return {'error': str(e)}
2534
+
2535
def update_prometheus_target(self, job_name: str, targets: List[str],
                             namespace: str = "monitoring",
                             metrics_path: Optional[str] = None,
                             scrape_interval: Optional[str] = None) -> bool:
    """Update an existing Prometheus scrape job in place.

    Fix: metrics_path/scrape_interval were annotated ``str`` while defaulting
    to None; they are now correctly ``Optional[str]`` (no behavior change).

    Args:
        job_name: Name of the job to update.
        targets: New list of target addresses.
        namespace: Monitoring namespace holding the prometheus-config ConfigMap.
        metrics_path: Optional new metrics path (unchanged when None).
        scrape_interval: Optional new scrape interval (unchanged when None).

    Returns:
        bool: True if the job was updated and Prometheus restarted.
    """
    try:
        import yaml

        # Load the live configuration out of the ConfigMap.
        configmap = self.core_v1.read_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace
        )
        config_data = yaml.safe_load(configmap.data.get("prometheus.yml", ""))

        # Locate the job and rewrite its targets / optional fields.
        job_found = False
        for job in config_data.get('scrape_configs', []):
            if job.get('job_name') == job_name:
                job_found = True

                # Replace the first static_configs block's target list,
                # creating the block if the job had none.
                if 'static_configs' not in job:
                    job['static_configs'] = [{}]
                job['static_configs'][0]['targets'] = targets

                if metrics_path:
                    job['metrics_path'] = metrics_path
                if scrape_interval:
                    job['scrape_interval'] = scrape_interval
                break

        if not job_found:
            print(f"❌ Job '{job_name}' not found in Prometheus configuration")
            return False

        # Write the updated YAML back into the ConfigMap.
        configmap.data["prometheus.yml"] = yaml.dump(
            config_data, default_flow_style=False, sort_keys=False
        )
        self.core_v1.replace_namespaced_config_map(
            name="prometheus-config",
            namespace=namespace,
            body=configmap
        )

        # Restart Prometheus so it picks up the new configuration.
        self._restart_prometheus_deployment(namespace)

        print(f"✅ Updated Prometheus target job '{job_name}' with targets: {targets}")
        return True

    except Exception as e:
        print(f"❌ Error updating Prometheus target: {e}")
        return False
2609
+
2610
def _restart_prometheus_deployment(self, namespace: str) -> bool:
    """Trigger a rolling restart of the Prometheus deployment.

    Bumps the kubectl restartedAt pod-template annotation so the Deployment
    controller rolls the pods, which reloads the mounted configuration.

    Fix: dropped the redundant function-local ``import time`` — the module
    already uses ``time`` at module scope (see _wait_for_monitoring_ready).

    Args:
        namespace: Namespace the prometheus deployment lives in.

    Returns:
        bool: True if the patch was applied, False on any error (best-effort).
    """
    try:
        deployment = self.apps_v1.read_namespaced_deployment(
            name="prometheus",
            namespace=namespace
        )

        # Changing a pod-template annotation forces a rollout.
        if deployment.spec.template.metadata.annotations is None:
            deployment.spec.template.metadata.annotations = {}
        deployment.spec.template.metadata.annotations['kubectl.kubernetes.io/restartedAt'] = str(int(time.time()))

        self.apps_v1.patch_namespaced_deployment(
            name="prometheus",
            namespace=namespace,
            body=deployment
        )

        print("🔄 Restarting Prometheus deployment to reload configuration...")
        return True

    except Exception as e:
        # Best-effort: callers treat a failed restart as a warning only.
        print(f"⚠️ Could not restart Prometheus deployment: {e}")
        return False