k8s-helper-cli 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- k8s_helper/__init__.py +1 -1
- k8s_helper/cli.py +236 -1
- k8s_helper/core.py +794 -3
- {k8s_helper_cli-0.4.3.dist-info → k8s_helper_cli-0.5.1.dist-info}/METADATA +1 -1
- k8s_helper_cli-0.5.1.dist-info/RECORD +11 -0
- k8s_helper_cli-0.4.3.dist-info/RECORD +0 -11
- {k8s_helper_cli-0.4.3.dist-info → k8s_helper_cli-0.5.1.dist-info}/WHEEL +0 -0
- {k8s_helper_cli-0.4.3.dist-info → k8s_helper_cli-0.5.1.dist-info}/entry_points.txt +0 -0
- {k8s_helper_cli-0.4.3.dist-info → k8s_helper_cli-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {k8s_helper_cli-0.4.3.dist-info → k8s_helper_cli-0.5.1.dist-info}/top_level.txt +0 -0
k8s_helper/__init__.py
CHANGED
k8s_helper/cli.py
CHANGED
@@ -1359,6 +1359,7 @@ def setup_monitoring(
     namespace: str = typer.Option("monitoring", "--namespace", "-n", help="Namespace for monitoring stack"),
     grafana_service_type: str = typer.Option("NodePort", "--service-type", "-t", help="Grafana service type: NodePort, LoadBalancer, ClusterIP"),
     import_dashboard: bool = typer.Option(True, "--import-dashboard/--no-dashboard", help="Import default Kubernetes dashboard"),
+    install_kube_state_metrics: bool = typer.Option(True, "--install-kube-state-metrics/--no-kube-state-metrics", help="Install kube-state-metrics for cluster metrics"),
     wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for deployments to be ready"),
     show_info: bool = typer.Option(True, "--show-info/--no-show-info", help="Show monitoring stack information after setup")
 ):
@@ -1377,12 +1378,17 @@ def setup_monitoring(
     if import_dashboard:
         console.print("📊 Will import default Kubernetes dashboard")
 
+    if install_kube_state_metrics:
+        console.print("📈 Will install kube-state-metrics for cluster metrics")
+
     # Show what will be deployed
     console.print("\n📋 Components to deploy:")
     console.print(" • Prometheus server with cluster monitoring configuration")
     console.print(" • Grafana with admin credentials (admin/admin123)")
     console.print(" • ServiceAccount and RBAC for Prometheus")
     console.print(" • ConfigMaps for Prometheus configuration")
+    if install_kube_state_metrics:
+        console.print(" • kube-state-metrics for cluster metrics (via Helm or manual)")
     if import_dashboard:
         console.print(" • Default Kubernetes metrics dashboard")
 
@@ -1394,7 +1400,8 @@ def setup_monitoring(
             namespace=namespace,
             grafana_service_type=grafana_service_type,
             import_dashboard=import_dashboard,
-            wait_for_ready=wait
+            wait_for_ready=wait,
+            install_kube_state_metrics=install_kube_state_metrics
         )
 
     if result['success']:
@@ -1429,6 +1436,22 @@ def setup_monitoring(
         else:
             console.print("❌ Grafana: Failed to deploy")
 
+        # Show kube-state-metrics status
+        if install_kube_state_metrics and 'kube_state_metrics' in result:
+            ksm = result['kube_state_metrics']
+            if ksm.get('installed'):
+                method = ksm.get('method', 'unknown')
+                if method == 'helm':
+                    console.print("✅ kube-state-metrics: Deployed via Helm")
+                elif method == 'manual':
+                    console.print("✅ kube-state-metrics: Deployed via manual YAML")
+                elif method == 'existing':
+                    console.print("✅ kube-state-metrics: Already deployed")
+            else:
+                console.print(f"❌ kube-state-metrics: Failed to deploy")
+                if ksm.get('error'):
+                    console.print(f" Error: {ksm['error']}")
+
         if show_info:
             # Get and display monitoring information
             with console.status("Retrieving monitoring stack information..."):
@@ -1863,5 +1886,217 @@ def update_prometheus_target(
         console.print(f"❌ Error updating Prometheus target: {e}")
 
 
+# ======================
+# HELM-BASED MONITORING COMMANDS
+# ======================
+@app.command()
+def setup_monitoring_stack(
+    namespace: str = typer.Option("monitoring", "--namespace", "-n", help="Namespace for monitoring stack"),
+    grafana_service_type: str = typer.Option("NodePort", "--service-type", "-t", help="Grafana service type: NodePort, LoadBalancer, ClusterIP"),
+    prometheus_storage_size: str = typer.Option("10Gi", "--prometheus-storage", help="Prometheus storage size"),
+    grafana_storage_size: str = typer.Option("5Gi", "--grafana-storage", help="Grafana storage size"),
+    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for deployments to be ready"),
+    install_ingress: bool = typer.Option(False, "--install-ingress", help="Install ingress for external access")
+):
+    """Deploy monitoring stack using official Helm charts (Prometheus + Grafana)"""
+
+    # Validate service type
+    valid_service_types = ["NodePort", "LoadBalancer", "ClusterIP"]
+    if grafana_service_type not in valid_service_types:
+        console.print(f"❌ Invalid service type: {grafana_service_type}")
+        console.print(f"💡 Valid options: {', '.join(valid_service_types)}")
+        return
+
+    console.print(f"🚀 Setting up Helm-based monitoring stack in namespace: {namespace}")
+    console.print(f"🔧 Grafana service type: {grafana_service_type}")
+    console.print(f"💾 Prometheus storage: {prometheus_storage_size}")
+    console.print(f"💾 Grafana storage: {grafana_storage_size}")
+
+    # Show what will be deployed
+    console.print("\n📋 Components to deploy via Helm:")
+    console.print(" • Prometheus Operator (kube-prometheus-stack)")
+    console.print(" • Grafana with persistent storage")
+    console.print(" • AlertManager for alerts")
+    console.print(" • Node Exporter for node metrics")
+    console.print(" • kube-state-metrics for cluster state")
+    console.print(" • Prometheus rules and dashboards")
+    if install_ingress:
+        console.print(" • Ingress for external access")
+
+    try:
+        client = K8sClient()
+
+        with console.status("Deploying Helm monitoring stack..."):
+            result = client.setup_helm_monitoring(
+                namespace=namespace,
+                grafana_service_type=grafana_service_type,
+                prometheus_storage_size=prometheus_storage_size,
+                grafana_storage_size=grafana_storage_size,
+                wait_for_ready=wait,
+                install_ingress=install_ingress
+            )
+
+        if result['success']:
+            console.print("✅ Helm monitoring stack deployed successfully!")
+
+            # Show deployment summary
+            console.print(f"\n📋 Deployment Summary:")
+            console.print(f"📍 Namespace: {result['namespace']}")
+            console.print(f"🎯 Helm Release: {result['release_name']}")
+
+            if result.get('prometheus', {}).get('deployed'):
+                console.print("✅ Prometheus Operator: Deployed")
+            else:
+                console.print("❌ Prometheus Operator: Failed to deploy")
+
+            if result.get('grafana', {}).get('deployed'):
+                console.print("✅ Grafana: Deployed")
+                if result['grafana'].get('admin_password'):
+                    console.print(f"🔑 Grafana admin password: {result['grafana']['admin_password']}")
+                else:
+                    console.print("🔑 Grafana admin password: admin")
+            else:
+                console.print("❌ Grafana: Failed to deploy")
+
+            # Show access information
+            console.print(f"\n🔗 Access Information:")
+
+            if result.get('grafana_url'):
+                console.print(f"🔗 Grafana URL: [blue]{result['grafana_url']}[/blue]")
+            else:
+                console.print(f"💡 Grafana: kubectl port-forward -n {namespace} svc/kube-prometheus-stack-grafana 3000:80")
+
+            if result.get('prometheus_url'):
+                console.print(f"🔗 Prometheus URL: [blue]{result['prometheus_url']}[/blue]")
+            else:
+                console.print(f"💡 Prometheus: kubectl port-forward -n {namespace} svc/kube-prometheus-stack-prometheus 9090:9090")
+
+            if result.get('alertmanager_url'):
+                console.print(f"🔗 AlertManager URL: [blue]{result['alertmanager_url']}[/blue]")
+            else:
+                console.print(f"💡 AlertManager: kubectl port-forward -n {namespace} svc/kube-prometheus-stack-alertmanager 9093:9093")
+
+            # Show next steps
+            console.print(f"\n🚀 Next Steps:")
+            console.print(f" 1. Access Grafana with admin/admin (or password shown above)")
+            console.print(f" 2. Explore pre-configured dashboards")
+            console.print(f" 3. Set up custom alerts in AlertManager")
+            console.print(f" 4. Add custom Prometheus targets if needed")
+            console.print(f"\n💡 Useful commands:")
+            console.print(f" • Check status: k8s-helper monitoring-stack-status -n {namespace}")
+            console.print(f" • List dashboards: kubectl get configmaps -n {namespace} | grep dashboard")
+            console.print(f" • View Helm release: helm list -n {namespace}")
+
+        else:
+            console.print(f"❌ Failed to deploy Helm monitoring stack: {result.get('error', 'Unknown error')}")
+
+            console.print("\n🛠️ Troubleshooting:")
+            console.print(" • Ensure Helm is installed: helm version")
+            console.print(" • Check cluster connectivity: kubectl get nodes")
+            console.print(" • Verify namespace permissions")
+            console.print(f" • View Helm status: helm status -n {namespace} kube-prometheus-stack")
+
+    except Exception as e:
+        console.print(f"❌ Error setting up Helm monitoring: {e}")
+        console.print("\n🛠️ Troubleshooting:")
+        console.print(" • Ensure Helm is installed and configured")
+        console.print(" • Check if kubectl is configured correctly")
+        console.print(" • Verify you have cluster admin permissions")
+
+
+@app.command()
+def monitoring_stack_status(
+    namespace: str = typer.Option("monitoring", "--namespace", "-n", help="Monitoring namespace"),
+    output: str = output_option
+):
+    """Show status of Helm-based monitoring stack"""
+    try:
+        client = K8sClient()
+
+        with console.status("Checking Helm monitoring stack status..."):
+            info = client.get_helm_monitoring_info(namespace)
+
+        if 'error' in info:
+            console.print(f"❌ Error getting monitoring status: {info['error']}")
+            return
+
+        if output == "table":
+            # Helm release info
+            console.print(f"🎯 Helm Release: {info.get('release_name', 'kube-prometheus-stack')}")
+            console.print(f"📊 Release Status: {info.get('release_status', 'Unknown')}")
+            console.print(f"📅 Last Deployed: {info.get('last_deployed', 'Unknown')}")
+
+            # Overview table
+            table = Table(title=f"Monitoring Stack Status - {namespace}")
+            table.add_column("Component", style="cyan")
+            table.add_column("Status", style="green")
+            table.add_column("URL", style="blue")
+
+            # Components status
+            components = ['prometheus', 'grafana', 'alertmanager']
+            for component in components:
+                if component in info:
+                    comp_info = info[component]
+                    status = "🟢 Running" if comp_info.get('running') else "🔴 Not Running"
+                    url = comp_info.get('url', 'Port-forward required')
+                    table.add_row(component.capitalize(), status, url)
+
+            console.print(table)
+
+            # Show pod status
+            if info.get('pods'):
+                pod_table = Table(title="Pod Status")
+                pod_table.add_column("Pod", style="cyan")
+                pod_table.add_column("Status", style="green")
+                pod_table.add_column("Ready", style="blue")
+
+                for pod in info['pods']:
+                    pod_table.add_row(
+                        pod['name'],
+                        pod['status'],
+                        f"{pod['ready']}/{pod['total']}"
+                    )
+
+                console.print(pod_table)
+
+        elif output == "json":
+            console.print(format_json_output(info))
+        elif output == "yaml":
+            console.print(format_yaml_output(info))
+
+    except Exception as e:
+        console.print(f"❌ Error checking Helm monitoring status: {e}")
+
+
+@app.command()
+def delete_monitoring_stack(
+    namespace: str = typer.Option("monitoring", "--namespace", "-n", help="Monitoring namespace"),
+    release_name: str = typer.Option("kube-prometheus-stack", "--release-name", help="Helm release name"),
+    force: bool = typer.Option(False, "--force", help="Skip confirmation prompt")
+):
+    """Delete Helm-based monitoring stack"""
+    if not force:
+        if not typer.confirm(f"Are you sure you want to delete the Helm monitoring stack '{release_name}' in namespace '{namespace}'?"):
+            console.print("❌ Operation cancelled")
+            return
+
+    try:
+        client = K8sClient()
+
+        console.print(f"🗑️ Deleting Helm monitoring stack: {release_name}")
+
+        with console.status("Uninstalling Helm release..."):
+            result = client.delete_helm_monitoring(namespace, release_name)
+
+        if result['success']:
+            console.print(f"✅ Helm monitoring stack '{release_name}' deleted successfully")
+            console.print(f"📋 Cleaned up {result.get('resources_deleted', 0)} resources")
+        else:
+            console.print(f"❌ Failed to delete Helm monitoring stack: {result.get('error', 'Unknown error')}")
+
+    except Exception as e:
+        console.print(f"❌ Error deleting Helm monitoring: {e}")
+
+
 if __name__ == "__main__":
     app()
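Taken together, the cli.py changes wire a new --install-kube-state-metrics/--no-kube-state-metrics flag into setup-monitoring and add three Helm-oriented commands (setup_monitoring_stack, monitoring_stack_status, delete_monitoring_stack). Below is a minimal sketch of the underlying client call and result handling that the CLI performs, mirroring the logic in the diff above; the `from k8s_helper.core import K8sClient` import path is an assumption based on the modules changed in this release, not something shown verbatim in the diff.

# Sketch only: mirrors the setup_monitoring() call shape introduced in 0.5.1.
from k8s_helper.core import K8sClient  # assumed import path

client = K8sClient()
result = client.setup_monitoring(
    namespace="monitoring",
    grafana_service_type="NodePort",
    import_dashboard=True,
    wait_for_ready=True,
    install_kube_state_metrics=True,  # new parameter in 0.5.1
)

if result["success"]:
    ksm = result.get("kube_state_metrics", {})
    if ksm.get("installed"):
        # 'method' is one of 'helm', 'manual', or 'existing' per the CLI handling above
        print(f"kube-state-metrics installed via {ksm.get('method', 'unknown')}")
    elif ksm.get("error"):
        print(f"kube-state-metrics failed: {ksm['error']}")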
k8s_helper/core.py
CHANGED
@@ -1683,10 +1683,346 @@ class K8sClient:
     # ======================
     # MONITORING OPERATIONS
     # ======================
+
+    def _check_helm_available(self) -> bool:
+        """Check if Helm is available in the system"""
+        import subprocess
+        try:
+            result = subprocess.run(['helm', 'version', '--short'],
+                                    capture_output=True, text=True, timeout=10)
+            return result.returncode == 0
+        except (subprocess.TimeoutExpired, FileNotFoundError):
+            return False
+
+    def _install_kube_state_metrics(self, namespace: str) -> Dict[str, Any]:
+        """Install kube-state-metrics using Helm if available, or manual YAML if not"""
+        import subprocess
+
+        result = {
+            'installed': False,
+            'method': None,
+            'error': None
+        }
+
+        # Check if kube-state-metrics is already running
+        try:
+            deployments = self.apps_v1.list_deployment_for_all_namespaces()
+            for deployment in deployments.items:
+                if 'kube-state-metrics' in deployment.metadata.name:
+                    print(f"✅ kube-state-metrics already deployed in namespace: {deployment.metadata.namespace}")
+                    result['installed'] = True
+                    result['method'] = 'existing'
+                    return result
+        except Exception as e:
+            print(f"⚠️ Warning: Could not check existing deployments: {e}")
+
+        # Try Helm installation first
+        if self._check_helm_available():
+            try:
+                print("📦 Installing kube-state-metrics using Helm...")
+
+                # Add prometheus-community repo if not exists
+                subprocess.run(['helm', 'repo', 'add', 'prometheus-community',
+                                'https://prometheus-community.github.io/helm-charts'],
+                               capture_output=True, text=True, timeout=30)
+
+                # Update repo
+                subprocess.run(['helm', 'repo', 'update'],
+                               capture_output=True, text=True, timeout=30)
+
+                # Install kube-state-metrics
+                helm_cmd = [
+                    'helm', 'install', 'kube-state-metrics',
+                    'prometheus-community/kube-state-metrics',
+                    '--namespace', namespace,
+                    '--create-namespace',
+                    '--set', 'service.port=8080',
+                    '--set', 'service.targetPort=8080'
+                ]
+
+                helm_result = subprocess.run(helm_cmd, capture_output=True, text=True, timeout=120)
+
+                if helm_result.returncode == 0:
+                    print("✅ kube-state-metrics installed successfully via Helm")
+                    result['installed'] = True
+                    result['method'] = 'helm'
+                    return result
+                else:
+                    print(f"⚠️ Helm installation failed: {helm_result.stderr}")
+
+            except subprocess.TimeoutExpired:
+                print("⚠️ Helm installation timed out, falling back to manual installation")
+            except Exception as e:
+                print(f"⚠️ Helm installation failed: {e}, falling back to manual installation")
+
+        # Fallback to manual YAML installation
+        try:
+            print("📦 Installing kube-state-metrics using manual YAML...")
+
+            # Create ServiceAccount
+            service_account = client.V1ServiceAccount(
+                metadata=client.V1ObjectMeta(
+                    name="kube-state-metrics",
+                    namespace=namespace
+                )
+            )
+
+            try:
+                self.core_v1.create_namespaced_service_account(namespace=namespace, body=service_account)
+                print("✅ Created ServiceAccount for kube-state-metrics")
+            except ApiException as e:
+                if e.status == 409:
+                    print("⚠️ ServiceAccount already exists")
+                else:
+                    raise e
+
+            # Create ClusterRole
+            cluster_role = client.V1ClusterRole(
+                metadata=client.V1ObjectMeta(name="kube-state-metrics"),
+                rules=[
+                    client.V1PolicyRule(
+                        api_groups=[""],
+                        resources=["configmaps", "secrets", "nodes", "pods", "services",
+                                   "resourcequotas", "replicationcontrollers", "limitranges",
+                                   "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["apps"],
+                        resources=["statefulsets", "daemonsets", "deployments", "replicasets"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["batch"],
+                        resources=["cronjobs", "jobs"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["autoscaling"],
+                        resources=["horizontalpodautoscalers"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["authentication.k8s.io"],
+                        resources=["tokenreviews"],
+                        verbs=["create"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["authorization.k8s.io"],
+                        resources=["subjectaccessreviews"],
+                        verbs=["create"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["policy"],
+                        resources=["poddisruptionbudgets"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["certificates.k8s.io"],
+                        resources=["certificatesigningrequests"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["storage.k8s.io"],
+                        resources=["storageclasses", "volumeattachments"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["admissionregistration.k8s.io"],
+                        resources=["mutatingwebhookconfigurations", "validatingwebhookconfigurations"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["networking.k8s.io"],
+                        resources=["networkpolicies", "ingresses"],
+                        verbs=["list", "watch"]
+                    ),
+                    client.V1PolicyRule(
+                        api_groups=["coordination.k8s.io"],
+                        resources=["leases"],
+                        verbs=["list", "watch"]
+                    )
+                ]
+            )
+
+            # Create ClusterRole
+            rbac_v1 = client.RbacAuthorizationV1Api()
+            try:
+                rbac_v1.create_cluster_role(body=cluster_role)
+                print("✅ Created ClusterRole for kube-state-metrics")
+            except ApiException as e:
+                if e.status == 409:
+                    print("⚠️ ClusterRole already exists")
+                else:
+                    raise e
+
+            # Create ClusterRoleBinding
+            # Create subject with version compatibility
+            try:
+                # Try V1Subject first (older versions)
+                subject = client.V1Subject(
+                    kind="ServiceAccount",
+                    name="kube-state-metrics",
+                    namespace=namespace
+                )
+            except AttributeError:
+                # Try RbacV1Subject (newer versions)
+                try:
+                    subject = client.RbacV1Subject(
+                        kind="ServiceAccount",
+                        name="kube-state-metrics",
+                        namespace=namespace
+                    )
+                except AttributeError:
+                    # Manual construction as fallback
+                    subject = {
+                        'kind': 'ServiceAccount',
+                        'name': 'kube-state-metrics',
+                        'namespace': namespace
+                    }
+
+            cluster_role_binding = client.V1ClusterRoleBinding(
+                metadata=client.V1ObjectMeta(name="kube-state-metrics"),
+                subjects=[subject],
+                role_ref=client.V1RoleRef(
+                    kind="ClusterRole",
+                    name="kube-state-metrics",
+                    api_group="rbac.authorization.k8s.io"
+                )
+            )
+
+            try:
+                rbac_v1.create_cluster_role_binding(body=cluster_role_binding)
+                print("✅ Created ClusterRoleBinding for kube-state-metrics")
+            except ApiException as e:
+                if e.status == 409:
+                    print("⚠️ ClusterRoleBinding already exists")
+                else:
+                    raise e
+
+            # Create Deployment
+            deployment = client.V1Deployment(
+                metadata=client.V1ObjectMeta(
+                    name="kube-state-metrics",
+                    namespace=namespace,
+                    labels={"app": "kube-state-metrics"}
+                ),
+                spec=client.V1DeploymentSpec(
+                    replicas=1,
+                    selector=client.V1LabelSelector(
+                        match_labels={"app": "kube-state-metrics"}
+                    ),
+                    template=client.V1PodTemplateSpec(
+                        metadata=client.V1ObjectMeta(
+                            labels={"app": "kube-state-metrics"}
+                        ),
+                        spec=client.V1PodSpec(
+                            service_account_name="kube-state-metrics",
+                            containers=[
+                                client.V1Container(
+                                    name="kube-state-metrics",
+                                    image="registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.10.1",
+                                    ports=[
+                                        client.V1ContainerPort(
+                                            name="http-metrics",
+                                            container_port=8080,
+                                            protocol="TCP"
+                                        ),
+                                        client.V1ContainerPort(
+                                            name="telemetry",
+                                            container_port=8081,
+                                            protocol="TCP"
+                                        )
+                                    ],
+                                    liveness_probe=client.V1Probe(
+                                        http_get=client.V1HTTPGetAction(
+                                            path="/healthz",
+                                            port=8080
+                                        ),
+                                        initial_delay_seconds=5,
+                                        timeout_seconds=5
+                                    ),
+                                    readiness_probe=client.V1Probe(
+                                        http_get=client.V1HTTPGetAction(
+                                            path="/",
+                                            port=8081
+                                        ),
+                                        initial_delay_seconds=5,
+                                        timeout_seconds=5
+                                    ),
+                                    security_context=client.V1SecurityContext(
+                                        allow_privilege_escalation=False,
+                                        read_only_root_filesystem=True,
+                                        run_as_non_root=True,
+                                        run_as_user=65534
+                                    )
+                                )
+                            ]
+                        )
+                    )
+                )
+            )
+
+            try:
+                self.apps_v1.create_namespaced_deployment(namespace=namespace, body=deployment)
+                print("✅ Created Deployment for kube-state-metrics")
+            except ApiException as e:
+                if e.status == 409:
+                    print("⚠️ Deployment already exists")
+                else:
+                    raise e
+
+            # Create Service
+            service = client.V1Service(
+                metadata=client.V1ObjectMeta(
+                    name="kube-state-metrics",
+                    namespace=namespace,
+                    labels={"app": "kube-state-metrics"}
+                ),
+                spec=client.V1ServiceSpec(
+                    selector={"app": "kube-state-metrics"},
+                    ports=[
+                        client.V1ServicePort(
+                            name="http-metrics",
+                            port=8080,
+                            target_port=8080,
+                            protocol="TCP"
+                        ),
+                        client.V1ServicePort(
+                            name="telemetry",
+                            port=8081,
+                            target_port=8081,
+                            protocol="TCP"
+                        )
+                    ],
+                    type="ClusterIP"
+                )
+            )
+
+            try:
+                self.core_v1.create_namespaced_service(namespace=namespace, body=service)
+                print("✅ Created Service for kube-state-metrics")
+            except ApiException as e:
+                if e.status == 409:
+                    print("⚠️ Service already exists")
+                else:
+                    raise e
+
+            result['installed'] = True
+            result['method'] = 'manual'
+            print("✅ kube-state-metrics installed successfully via manual YAML")
+
+        except Exception as e:
+            result['error'] = str(e)
+            print(f"❌ Failed to install kube-state-metrics: {e}")
+
+        return result
+
     def setup_monitoring(self, namespace: str = "monitoring",
                          grafana_service_type: str = "NodePort",
                          import_dashboard: bool = True,
-                         wait_for_ready: bool = True
+                         wait_for_ready: bool = True,
+                         install_kube_state_metrics: bool = True) -> Dict[str, Any]:
         """Setup complete monitoring stack with Prometheus and Grafana
 
         Args:
@@ -1694,6 +2030,7 @@ class K8sClient:
             grafana_service_type: Service type for Grafana (NodePort, LoadBalancer, ClusterIP)
             import_dashboard: Whether to import default Kubernetes dashboard
             wait_for_ready: Whether to wait for deployments to be ready
+            install_kube_state_metrics: Whether to install kube-state-metrics for cluster metrics
 
         Returns:
             Dictionary with deployment info, URLs, and credentials
@@ -1702,6 +2039,7 @@ class K8sClient:
            'namespace': namespace,
            'prometheus': {},
            'grafana': {},
+           'kube_state_metrics': {},
            'success': False,
            'error': None
        }
@@ -1710,6 +2048,12 @@ class K8sClient:
            # Create monitoring namespace
            self._create_monitoring_namespace(namespace)
 
+           # Install kube-state-metrics if requested
+           if install_kube_state_metrics:
+               print("📊 Installing kube-state-metrics for cluster metrics...")
+               ksm_result = self._install_kube_state_metrics(namespace)
+               result['kube_state_metrics'] = ksm_result
+
            # Deploy Prometheus
            prometheus_result = self._deploy_prometheus(namespace)
            result['prometheus'] = prometheus_result
@@ -1762,7 +2106,7 @@ class K8sClient:
         """Deploy Prometheus to the cluster"""
         result = {'deployed': False, 'service_name': 'prometheus-service'}
 
-        # Prometheus ConfigMap
+        # Prometheus ConfigMap with kube-state-metrics support
         prometheus_config = """
 global:
   scrape_interval: 15s
@@ -1790,6 +2134,29 @@ scrape_configs:
       - action: labelmap
         regex: __meta_kubernetes_node_label_(.+)
 
+  - job_name: 'kubernetes-cadvisor'
+    kubernetes_sd_configs:
+      - role: node
+    scheme: https
+    tls_config:
+      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - target_label: __address__
+        replacement: kubernetes.default.svc:443
+      - source_labels: [__meta_kubernetes_node_name]
+        regex: (.+)
+        target_label: __metrics_path__
+        replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
+
+  - job_name: 'kube-state-metrics'
+    static_configs:
+      - targets: ['kube-state-metrics.{}:8080']
+    metrics_path: /metrics
+    scrape_interval: 30s
+
   - job_name: 'kubernetes-pods'
     kubernetes_sd_configs:
       - role: pod
@@ -1814,7 +2181,36 @@ scrape_configs:
       - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
-"""
+
+  - job_name: 'kubernetes-service-endpoints'
+    kubernetes_sd_configs:
+      - role: endpoints
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+        action: keep
+        regex: true
+      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+        action: replace
+        target_label: __scheme__
+        regex: (https?)
+      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+        action: replace
+        target_label: __metrics_path__
+        regex: (.+)
+      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+        action: replace
+        target_label: __address__
+        regex: ([^:]+)(?::\\d+)?;(\\d+)
+        replacement: $1:$2
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - source_labels: [__meta_kubernetes_namespace]
+        action: replace
+        target_label: kubernetes_namespace
+      - source_labels: [__meta_kubernetes_service_name]
+        action: replace
+        target_label: kubernetes_name
+""".format(namespace)
 
         # Create ConfigMap
         configmap = client.V1ConfigMap(
@@ -2687,3 +3083,398 @@ scrape_configs:
         except Exception as e:
             print(f"⚠️ Could not restart Prometheus deployment: {e}")
             return False
+
+    # ======================
+    # HELM-BASED MONITORING METHODS
+    # ======================
+
+    def setup_helm_monitoring(self, namespace: str = "monitoring",
+                              grafana_service_type: str = "NodePort",
+                              prometheus_storage_size: str = "10Gi",
+                              grafana_storage_size: str = "5Gi",
+                              wait_for_ready: bool = True,
+                              install_ingress: bool = False) -> Dict:
+        """Deploy monitoring stack using official Helm charts"""
+        import subprocess
+        import tempfile
+        import os
+
+        try:
+            # Check if Helm is available
+            try:
+                result = subprocess.run(['helm', 'version'], capture_output=True, text=True, check=True)
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                return {
+                    'success': False,
+                    'error': 'Helm is not installed or not in PATH. Please install Helm first.'
+                }
+
+            # Create namespace if it doesn't exist
+            try:
+                self.core_v1.create_namespace(
+                    body=client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
+                )
+                print(f"✅ Created namespace: {namespace}")
+            except ApiException as e:
+                if e.status == 409:  # Already exists
+                    print(f"✅ Namespace {namespace} already exists")
+                else:
+                    print(f"⚠️ Could not create namespace: {e}")
+
+            # Add Prometheus community Helm repository
+            print("📦 Adding Prometheus community Helm repository...")
+            try:
+                subprocess.run([
+                    'helm', 'repo', 'add', 'prometheus-community',
+                    'https://prometheus-community.github.io/helm-charts'
+                ], check=True, capture_output=True)
+
+                subprocess.run(['helm', 'repo', 'update'], check=True, capture_output=True)
+                print("✅ Helm repository added and updated")
+            except subprocess.CalledProcessError as e:
+                return {
+                    'success': False,
+                    'error': f'Failed to add Helm repository: {e.stderr.decode() if e.stderr else str(e)}'
+                }
+
+            # Create Helm values file
+            helm_values = {
+                'grafana': {
+                    'enabled': True,
+                    'persistence': {
+                        'enabled': True,
+                        'size': grafana_storage_size
+                    },
+                    'service': {
+                        'type': grafana_service_type
+                    },
+                    'adminPassword': 'admin',
+                    'datasources': {
+                        'datasources.yaml': {
+                            'apiVersion': 1,
+                            'datasources': [{
+                                'name': 'Prometheus',
+                                'type': 'prometheus',
+                                'url': 'http://kube-prometheus-stack-prometheus:9090',
+                                'access': 'proxy',
+                                'isDefault': True
+                            }]
+                        }
+                    },
+                    'dashboardProviders': {
+                        'dashboardproviders.yaml': {
+                            'apiVersion': 1,
+                            'providers': [{
+                                'name': 'default',
+                                'orgId': 1,
+                                'folder': '',
+                                'type': 'file',
+                                'disableDeletion': False,
+                                'editable': True,
+                                'options': {
+                                    'path': '/var/lib/grafana/dashboards/default'
+                                }
+                            }]
+                        }
+                    },
+                    'dashboards': {
+                        'default': {
+                            'kubernetes-cluster-dashboard': {
+                                'gnetId': 7249,
+                                'revision': 1,
+                                'datasource': 'Prometheus'
+                            },
+                            'kubernetes-pod-dashboard': {
+                                'gnetId': 6417,
+                                'revision': 1,
+                                'datasource': 'Prometheus'
+                            },
+                            'node-exporter-dashboard': {
+                                'gnetId': 1860,
+                                'revision': 27,
+                                'datasource': 'Prometheus'
+                            }
+                        }
+                    }
+                },
+                'prometheus': {
+                    'enabled': True,
+                    'prometheusSpec': {
+                        'retention': '30d',
+                        'storageSpec': {
+                            'volumeClaimTemplate': {
+                                'spec': {
+                                    'accessModes': ['ReadWriteOnce'],
+                                    'resources': {
+                                        'requests': {
+                                            'storage': prometheus_storage_size
+                                        }
+                                    }
+                                }
+                            }
+                        },
+                        'serviceMonitorSelectorNilUsesHelmValues': False
+                    }
+                },
+                'alertmanager': {
+                    'enabled': True
+                },
+                'nodeExporter': {
+                    'enabled': True
+                },
+                'kubeStateMetrics': {
+                    'enabled': True
+                },
+                'defaultRules': {
+                    'create': True,
+                    'rules': {
+                        'alertmanager': True,
+                        'etcd': True,
+                        'general': True,
+                        'k8s': True,
+                        'kubeApiserver': True,
+                        'kubePrometheusNodeRecording': True,
+                        'kubernetesApps': True,
+                        'kubernetesResources': True,
+                        'kubernetesStorage': True,
+                        'kubernetesSystem': True,
+                        'network': True,
+                        'node': True,
+                        'prometheus': True,
+                        'prometheusOperator': True
+                    }
+                }
+            }
+
+            # Add ingress if requested
+            if install_ingress:
+                helm_values['grafana']['ingress'] = {
+                    'enabled': True,
+                    'hosts': [f'grafana.{namespace}.local'],
+                    'paths': ['/']
+                }
+
+            # Write values to temporary file
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
+                yaml.dump(helm_values, f, default_flow_style=False)
+                values_file = f.name
+
+            try:
+                # Install the Helm chart
+                print("🚀 Installing kube-prometheus-stack via Helm...")
+                helm_cmd = [
+                    'helm', 'install', 'kube-prometheus-stack',
+                    'prometheus-community/kube-prometheus-stack',
+                    '--namespace', namespace,
+                    '--values', values_file
+                ]
+
+                if wait_for_ready:
+                    helm_cmd.append('--wait')
+                    helm_cmd.extend(['--timeout', '10m'])
+
+                result = subprocess.run(helm_cmd, capture_output=True, text=True, check=True)
+                print("✅ Helm chart installed successfully")
+
+                # Wait a bit for pods to start
+                if wait_for_ready:
+                    print("⏳ Waiting for pods to be ready...")
+                    time.sleep(30)
+
+                # Get service information
+                services_info = self._get_helm_monitoring_services(namespace)
+
+                return {
+                    'success': True,
+                    'namespace': namespace,
+                    'release_name': 'kube-prometheus-stack',
+                    'prometheus': {'deployed': True},
+                    'grafana': {
+                        'deployed': True,
+                        'admin_password': 'admin'
+                    },
+                    'grafana_url': services_info.get('grafana_url'),
+                    'prometheus_url': services_info.get('prometheus_url'),
+                    'alertmanager_url': services_info.get('alertmanager_url')
+                }
+
+            finally:
+                # Clean up temporary file
+                os.unlink(values_file)
+
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.decode() if e.stderr else str(e)
+            return {
+                'success': False,
+                'error': f'Helm installation failed: {error_msg}'
+            }
+        except Exception as e:
+            return {
+                'success': False,
+                'error': f'Failed to setup Helm monitoring: {str(e)}'
+            }
+
+    def get_helm_monitoring_info(self, namespace: str = "monitoring") -> Dict:
+        """Get information about the Helm-based monitoring stack"""
+        import subprocess
+
+        try:
+            # Check if Helm release exists
+            try:
+                result = subprocess.run([
+                    'helm', 'status', 'kube-prometheus-stack',
+                    '--namespace', namespace
+                ], capture_output=True, text=True, check=True)
+
+                # Parse Helm status
+                lines = result.stdout.split('\n')
+                release_info = {}
+                for line in lines:
+                    if 'STATUS:' in line:
+                        release_info['release_status'] = line.split('STATUS:')[1].strip()
+                    elif 'LAST DEPLOYED:' in line:
+                        release_info['last_deployed'] = line.split('LAST DEPLOYED:')[1].strip()
+
+            except subprocess.CalledProcessError:
+                return {'error': 'Helm release not found. Use setup-helm-monitoring to deploy first.'}
+
+            # Get services information
+            services_info = self._get_helm_monitoring_services(namespace)
+
+            # Get pod status
+            pods_info = self._get_monitoring_pods_status(namespace)
+
+            return {
+                'release_name': 'kube-prometheus-stack',
+                'release_status': release_info.get('release_status', 'Unknown'),
+                'last_deployed': release_info.get('last_deployed', 'Unknown'),
+                'prometheus': {
+                    'running': any(pod['name'].startswith('prometheus-kube-prometheus-stack-prometheus')
+                                   for pod in pods_info if pod['status'] == 'Running'),
+                    'url': services_info.get('prometheus_url', 'Port-forward required')
+                },
+                'grafana': {
+                    'running': any(pod['name'].startswith('kube-prometheus-stack-grafana')
+                                   for pod in pods_info if pod['status'] == 'Running'),
+                    'url': services_info.get('grafana_url', 'Port-forward required')
+                },
+                'alertmanager': {
+                    'running': any(pod['name'].startswith('alertmanager-kube-prometheus-stack-alertmanager')
+                                   for pod in pods_info if pod['status'] == 'Running'),
+                    'url': services_info.get('alertmanager_url', 'Port-forward required')
+                },
+                'pods': pods_info
+            }
+
+        except Exception as e:
+            return {'error': f'Failed to get monitoring info: {str(e)}'}
+
+    def delete_helm_monitoring(self, namespace: str = "monitoring",
+                               release_name: str = "kube-prometheus-stack") -> Dict:
+        """Delete Helm-based monitoring stack"""
+        import subprocess
+
+        try:
+            # Uninstall Helm release
+            result = subprocess.run([
+                'helm', 'uninstall', release_name,
+                '--namespace', namespace
+            ], capture_output=True, text=True, check=True)
+
+            print(f"✅ Helm release '{release_name}' uninstalled")
+
+            # Count remaining resources (optional cleanup)
+            try:
+                # Delete PVCs that might remain
+                pvcs = self.core_v1.list_namespaced_persistent_volume_claim(namespace=namespace)
+                pvc_count = 0
+                for pvc in pvcs.items:
+                    if 'prometheus' in pvc.metadata.name or 'grafana' in pvc.metadata.name:
+                        self.core_v1.delete_namespaced_persistent_volume_claim(
+                            name=pvc.metadata.name,
+                            namespace=namespace
+                        )
+                        pvc_count += 1
+
+                if pvc_count > 0:
+                    print(f"✅ Cleaned up {pvc_count} persistent volume claims")
+
+            except Exception as cleanup_error:
+                print(f"⚠️ Could not clean up some resources: {cleanup_error}")
+
+            return {
+                'success': True,
+                'resources_deleted': pvc_count
+            }
+
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.decode() if e.stderr else str(e)
+            return {
+                'success': False,
+                'error': f'Failed to uninstall Helm release: {error_msg}'
+            }
+        except Exception as e:
+            return {
+                'success': False,
+                'error': f'Failed to delete monitoring stack: {str(e)}'
+            }
+
+    def _get_helm_monitoring_services(self, namespace: str) -> Dict:
+        """Get service URLs for Helm monitoring components"""
+        services_info = {}
+
+        try:
+            # Get services
+            services = self.core_v1.list_namespaced_service(namespace=namespace)
+
+            for service in services.items:
+                service_name = service.metadata.name
+
+                if 'grafana' in service_name:
+                    url = self._get_service_url(service, namespace, 80)
+                    if url:
+                        services_info['grafana_url'] = url
+
+                elif 'prometheus' in service_name and 'operated' not in service_name:
+                    url = self._get_service_url(service, namespace, 9090)
+                    if url:
+                        services_info['prometheus_url'] = url
+
+                elif 'alertmanager' in service_name and 'operated' not in service_name:
+                    url = self._get_service_url(service, namespace, 9093)
+                    if url:
+                        services_info['alertmanager_url'] = url
+
+        except Exception as e:
+            print(f"⚠️ Could not get service information: {e}")
+
+        return services_info
+
+    def _get_monitoring_pods_status(self, namespace: str) -> List[Dict]:
+        """Get status of monitoring pods"""
+        pods_info = []
+
+        try:
+            pods = self.core_v1.list_namespaced_pod(namespace=namespace)
+
+            for pod in pods.items:
+                if any(component in pod.metadata.name for component in
+                       ['prometheus', 'grafana', 'alertmanager', 'node-exporter', 'kube-state-metrics']):
+
+                    ready_containers = 0
+                    total_containers = len(pod.status.container_statuses) if pod.status.container_statuses else 0
+
+                    if pod.status.container_statuses:
+                        ready_containers = sum(1 for cs in pod.status.container_statuses if cs.ready)
+
+                    pods_info.append({
+                        'name': pod.metadata.name,
+                        'status': pod.status.phase,
+                        'ready': ready_containers,
+                        'total': total_containers
+                    })
+
+        except Exception as e:
+            print(f"⚠️ Could not get pod status: {e}")
+
+        return pods_info
k8s_helper_cli-0.5.1.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
+k8s_helper/__init__.py,sha256=Rm3LMlyreNv628XF3jVu7wszfxHHL0JWZKzmHFmq_D0,2666
+k8s_helper/cli.py,sha256=U4oPgjykRmkBvPli7jZLVvZrb13L6-wjV0hd-RQZN9g,92124
+k8s_helper/config.py,sha256=P7YdfyvCHprrNs2J9DRb3RrClylfTTh5hfTtDzLug0A,6867
+k8s_helper/core.py,sha256=P6nvVPuW44Jdvkm572__d4nycLin6cl8obZ-XqzuTY4,147614
+k8s_helper/utils.py,sha256=wYgTd5ktyuI-EiVcfW7FrxA7MzXY5odrEKQgmMVdueY,9496
+k8s_helper_cli-0.5.1.dist-info/licenses/LICENSE,sha256=tXPvVl3gLVc6e0qCEoLH9KjeA7z4JVL78UybpvGtBCw,1096
+k8s_helper_cli-0.5.1.dist-info/METADATA,sha256=Z2Il7mbkN4p29oc_83mIjxD3l13IPTihkzo5d0JPIT4,30789
+k8s_helper_cli-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+k8s_helper_cli-0.5.1.dist-info/entry_points.txt,sha256=IoCMWUZ6mn90LwzQzEy5YkWOwvogDdZ6ycqUWAzCFTQ,50
+k8s_helper_cli-0.5.1.dist-info/top_level.txt,sha256=x9A1jflyer-z2cFnkqk5B42juoH2q0fy5hkT9upsTG8,11
+k8s_helper_cli-0.5.1.dist-info/RECORD,,
k8s_helper_cli-0.4.3.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
-k8s_helper/__init__.py,sha256=8xOzKrONciTYSjLyhDKR4cQs5wVVtW0UZll7TnsTpqQ,2666
-k8s_helper/cli.py,sha256=g0hzBHaROOT0gbKN5xu5GeC-aqbIyXpl3U-xx7vNWEU,80259
-k8s_helper/config.py,sha256=P7YdfyvCHprrNs2J9DRb3RrClylfTTh5hfTtDzLug0A,6867
-k8s_helper/core.py,sha256=R0_EDqVGFWQCpu5YuWC4abLLWIqjFtpn6KHgGD_Wues,112490
-k8s_helper/utils.py,sha256=wYgTd5ktyuI-EiVcfW7FrxA7MzXY5odrEKQgmMVdueY,9496
-k8s_helper_cli-0.4.3.dist-info/licenses/LICENSE,sha256=tXPvVl3gLVc6e0qCEoLH9KjeA7z4JVL78UybpvGtBCw,1096
-k8s_helper_cli-0.4.3.dist-info/METADATA,sha256=-uygNSjY9k0yhuw6KPwft5Uu-77b0GBGCPRURlt4PEM,30789
-k8s_helper_cli-0.4.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-k8s_helper_cli-0.4.3.dist-info/entry_points.txt,sha256=IoCMWUZ6mn90LwzQzEy5YkWOwvogDdZ6ycqUWAzCFTQ,50
-k8s_helper_cli-0.4.3.dist-info/top_level.txt,sha256=x9A1jflyer-z2cFnkqk5B42juoH2q0fy5hkT9upsTG8,11
-k8s_helper_cli-0.4.3.dist-info/RECORD,,
File without changes
File without changes
File without changes
File without changes