konduktor-nightly 0.1.0.dev20251128104812__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- konduktor/__init__.py +49 -0
- konduktor/adaptors/__init__.py +0 -0
- konduktor/adaptors/aws.py +221 -0
- konduktor/adaptors/common.py +118 -0
- konduktor/adaptors/gcp.py +126 -0
- konduktor/authentication.py +124 -0
- konduktor/backends/__init__.py +6 -0
- konduktor/backends/backend.py +86 -0
- konduktor/backends/constants.py +21 -0
- konduktor/backends/deployment.py +204 -0
- konduktor/backends/deployment_utils.py +1351 -0
- konduktor/backends/jobset.py +225 -0
- konduktor/backends/jobset_utils.py +726 -0
- konduktor/backends/pod_utils.py +501 -0
- konduktor/check.py +184 -0
- konduktor/cli.py +1945 -0
- konduktor/config.py +420 -0
- konduktor/constants.py +36 -0
- konduktor/controller/__init__.py +0 -0
- konduktor/controller/constants.py +56 -0
- konduktor/controller/launch.py +44 -0
- konduktor/controller/node.py +116 -0
- konduktor/controller/parse.py +111 -0
- konduktor/dashboard/README.md +30 -0
- konduktor/dashboard/backend/main.py +169 -0
- konduktor/dashboard/backend/sockets.py +154 -0
- konduktor/dashboard/frontend/.eslintrc.json +3 -0
- konduktor/dashboard/frontend/.gitignore +36 -0
- konduktor/dashboard/frontend/app/api/jobs/route.js +71 -0
- konduktor/dashboard/frontend/app/api/namespaces/route.js +69 -0
- konduktor/dashboard/frontend/app/components/Grafana.jsx +66 -0
- konduktor/dashboard/frontend/app/components/JobsData.jsx +197 -0
- konduktor/dashboard/frontend/app/components/LogsData.jsx +139 -0
- konduktor/dashboard/frontend/app/components/NavMenu.jsx +39 -0
- konduktor/dashboard/frontend/app/components/NavTabs.jsx +73 -0
- konduktor/dashboard/frontend/app/components/NavTabs2.jsx +30 -0
- konduktor/dashboard/frontend/app/components/SelectBtn.jsx +27 -0
- konduktor/dashboard/frontend/app/components/lib/utils.js +6 -0
- konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +78 -0
- konduktor/dashboard/frontend/app/components/ui/input.jsx +19 -0
- konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +104 -0
- konduktor/dashboard/frontend/app/components/ui/select.jsx +120 -0
- konduktor/dashboard/frontend/app/favicon.ico +0 -0
- konduktor/dashboard/frontend/app/globals.css +120 -0
- konduktor/dashboard/frontend/app/jobs/page.js +10 -0
- konduktor/dashboard/frontend/app/layout.js +22 -0
- konduktor/dashboard/frontend/app/logs/page.js +11 -0
- konduktor/dashboard/frontend/app/page.js +12 -0
- konduktor/dashboard/frontend/jsconfig.json +7 -0
- konduktor/dashboard/frontend/next.config.mjs +4 -0
- konduktor/dashboard/frontend/package-lock.json +6687 -0
- konduktor/dashboard/frontend/package.json +37 -0
- konduktor/dashboard/frontend/postcss.config.mjs +8 -0
- konduktor/dashboard/frontend/server.js +64 -0
- konduktor/dashboard/frontend/tailwind.config.js +17 -0
- konduktor/data/__init__.py +9 -0
- konduktor/data/aws/__init__.py +15 -0
- konduktor/data/aws/s3.py +1138 -0
- konduktor/data/constants.py +7 -0
- konduktor/data/data_utils.py +268 -0
- konduktor/data/gcp/__init__.py +19 -0
- konduktor/data/gcp/constants.py +42 -0
- konduktor/data/gcp/gcs.py +994 -0
- konduktor/data/gcp/utils.py +9 -0
- konduktor/data/registry.py +19 -0
- konduktor/data/storage.py +812 -0
- konduktor/data/storage_utils.py +535 -0
- konduktor/execution.py +447 -0
- konduktor/kube_client.py +237 -0
- konduktor/logging.py +111 -0
- konduktor/manifests/aibrix-setup.yaml +430 -0
- konduktor/manifests/apoxy-setup.yaml +184 -0
- konduktor/manifests/apoxy-setup2.yaml +98 -0
- konduktor/manifests/controller_deployment.yaml +69 -0
- konduktor/manifests/dashboard_deployment.yaml +131 -0
- konduktor/manifests/dmesg_daemonset.yaml +57 -0
- konduktor/manifests/pod_cleanup_controller.yaml +129 -0
- konduktor/resource.py +546 -0
- konduktor/serving.py +153 -0
- konduktor/task.py +949 -0
- konduktor/templates/deployment.yaml.j2 +191 -0
- konduktor/templates/jobset.yaml.j2 +43 -0
- konduktor/templates/pod.yaml.j2 +563 -0
- konduktor/usage/__init__.py +0 -0
- konduktor/usage/constants.py +21 -0
- konduktor/utils/__init__.py +0 -0
- konduktor/utils/accelerator_registry.py +17 -0
- konduktor/utils/annotations.py +62 -0
- konduktor/utils/base64_utils.py +95 -0
- konduktor/utils/common_utils.py +426 -0
- konduktor/utils/constants.py +5 -0
- konduktor/utils/env_options.py +55 -0
- konduktor/utils/exceptions.py +234 -0
- konduktor/utils/kubernetes_enums.py +8 -0
- konduktor/utils/kubernetes_utils.py +763 -0
- konduktor/utils/log_utils.py +467 -0
- konduktor/utils/loki_utils.py +102 -0
- konduktor/utils/rich_utils.py +123 -0
- konduktor/utils/schemas.py +625 -0
- konduktor/utils/subprocess_utils.py +273 -0
- konduktor/utils/ux_utils.py +247 -0
- konduktor/utils/validator.py +461 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/LICENSE +91 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/METADATA +98 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/RECORD +107 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/WHEEL +4 -0
- konduktor_nightly-0.1.0.dev20251128104812.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, Dict, List, Set
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from konduktor import logging as konduktor_logging
|
|
8
|
+
from konduktor.controller import constants
|
|
9
|
+
|
|
10
|
+
# Comma-separated list of namespaces to watch for pod errors. Whitespace around
# each entry is stripped and empty entries are dropped, so a value such as
# 'default, training,' yields ['default', 'training'].
WATCHED_NAMESPACES: List[str] = [
    namespace.strip()
    for namespace in os.environ.get('WATCHED_NAMESPACES', 'default').split(',')
    if namespace.strip()
]
LOGS_SINCE: int = 10  # retrieves logs generated in the past 10 seconds
LOG_ENDPOINT: str = os.environ.get(
    'LOG_ENDPOINT',
    # this assumes you have access to this endpoint by
    # running as a deployment within the cluster
    # for local development use 'http://localhost:3100' and
    # kubectl port-forward svc/loki -n loki 3100:3100
    'http://loki.loki.svc.cluster.local:3100',
)
QUERY_URL: str = '/loki/api/v1/query_range'
|
|
24
|
+
|
|
25
|
+
logger = konduktor_logging.get_logger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _query_range(pattern: str, **label_filters) -> List[Dict[str, Any]]:
    """Send LogQL query_range to loki
    https://grafana.com/docs/loki/latest/reference/loki-http-api/#query-logs-within-a-range-of-time

    Args:
        pattern (str): regex pattern to match loglines against; it is appended
            verbatim after ``|~`` so any quoting must already be part of it
        **label_filters: label name/value pairs, rendered as ``name="value"``
            matchers inside the stream selector

    Returns:
        List[Dict[str, Any]]: List of loglines (the ``data.result`` entries of
            the response), or an empty list when the request fails
    """
    url = f'{LOG_ENDPOINT}{QUERY_URL}'
    formatted_filters = ', '.join(
        f'{key}="{value}"' for key, value in label_filters.items()
    )
    query = r'{' f'{formatted_filters}' r'}' f'|~ {pattern}'
    params = {'query': query, 'since': f'{LOGS_SINCE}s'}
    try:
        # Bound the request so an unreachable endpoint cannot hang the caller.
        response = requests.get(url, params=params, timeout=10)
    except requests.RequestException as e:
        logger.error(f'loki query failed {params}: {e}')
        return []
    if response.status_code == 200:
        data = response.json()
        return data['data']['result']
    elif response.status_code == 400:
        logger.error(f'Bad Request: {response.status_code}')
        # Log the raw body: an error response is not guaranteed to be JSON.
        logger.error(response.text)
    else:
        logger.error(f'loki query failed {params}')
    return []
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def pod_errors() -> Set[str]:
    """Return the names of nodes whose pod logs matched an error regex.

    Every regex in ``constants.POD_LOG_ERROR_REGEXES`` is queried against every
    namespace in ``WATCHED_NAMESPACES``; the ``k8s_node_name`` stream label of
    each returned result is collected.
    """
    logger.info('querying pod logs')
    return {
        entry['stream']['k8s_node_name']
        for regex in constants.POD_LOG_ERROR_REGEXES
        for namespace in WATCHED_NAMESPACES
        for entry in _query_range(regex, k8s_namespace_name=namespace)
    }
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def sxid_error(pattern: str, log_content: str) -> int:
    """Extract the numeric (S)Xid code captured by `pattern` from `log_content`.

    The first capture group of the first match is converted to an int; when the
    pattern does not match, 0 is returned.

    Example Xid error from dmesg
    [1235733.431527] NVRM: Xid (PCI:0000:4e:00): 79, pid='<unknown>', name=<unknown>, GPU has fallen off the bus.
    Example sxid error from dmesg
    [1235733.431527] nvidia-nvswitch3: SXid (PCI:0000:4e:00.0): 12028, Non-fatal, Link 32 egress non-posted PRIV error (First)
    """  # noqa: E501
    found = re.search(pattern, log_content)
    return int(found.group(1)) if found else 0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def is_sxid_error(log_content: str) -> bool:
    """Report whether the (S)Xid code in `log_content` is not allowlisted.

    The SXid pattern is tried first; the NVRM Xid pattern is consulted only
    when the first lookup yields 0. The resulting code is then checked against
    ``constants.ALLOWLISTED_NVSWITCH_SXID_ERRORS``.

    NOTE(review): when neither pattern matches the code is 0, so the result
    then depends on whether 0 is in the allowlist - confirm this is intended.
    """
    code = sxid_error(r'SXid.*?: (\d+),', log_content)
    if not code:
        code = sxid_error(r'NVRM: Xid.*?: (\d+),', log_content)
    return code not in constants.ALLOWLISTED_NVSWITCH_SXID_ERRORS
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def dmesg_errors() -> Set[str]:
    """Return the names of nodes whose dmesg logs matched an error regex.

    All ``constants.DMESG_ERROR_REGEXES`` are joined with ' or ' into a single
    line filter and queried against the ``dmesg`` daemonset streams. Each
    result is logged (as an (S)Xid error or a generic dmesg error) and its
    ``k8s_node_name`` label is collected.
    """
    logger.info('checking dmesg logs')
    combined = ' or '.join(constants.DMESG_ERROR_REGEXES)
    flagged: Set[str] = set()
    for entry in _query_range(combined, k8s_daemonset_name='dmesg'):
        log_node = entry['stream']['k8s_node_name']
        # Only the first value of each stream is inspected.
        log_content = entry['values'][0][1]
        if is_sxid_error(log_content):
            message = f'node `{log_node}` has (S)Xid error: {log_content}'
        else:
            message = f'dmesg error on node `{log_node}`: {log_content}'
        logger.info(message)
        flagged.add(log_node)
    return flagged
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Manual entry point: every 5 seconds, query the dmesg logs and print the set
# of node names returned by dmesg_errors(). Loops until interrupted.
if __name__ == '__main__':
    import time

    while True:
        time.sleep(5)
        print(dmesg_errors())
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
### Prerequisites: kubectl is configured to access the remote machine/cluster
|
|
2
|
+
|
|
3
|
+
# OPTION 1 (Automated Setup)
|
|
4
|
+
|
|
5
|
+
To open the dashboard, run this inside the root konduktor directory:
|
|
6
|
+
```
|
|
7
|
+
./start_dashboard.sh
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
# OPTION 2 (Manual Setup)
|
|
11
|
+
|
|
12
|
+
## 1. Apply kubernetes manifest
|
|
13
|
+
Inside the manifests directory (the one containing dashboard_deployment.yaml):
|
|
14
|
+
```
|
|
15
|
+
kubectl apply -f dashboard_deployment.yaml
|
|
16
|
+
```
|
|
17
|
+
Then, wait a minute or two for the pods to finish setting up.
|
|
18
|
+
|
|
19
|
+
## 2. Port forward frontend in a terminal
|
|
20
|
+
```
|
|
21
|
+
kubectl port-forward svc/frontend 5173:5173 -n konduktor-dashboard
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## 3. Port forward grafana in a terminal
|
|
25
|
+
```
|
|
26
|
+
kubectl port-forward svc/kube-prometheus-stack-grafana 3000:80 -n prometheus
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## 4. Open dashboard at http://localhost:5173/
|
|
30
|
+
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
import socketio
|
|
4
|
+
from fastapi import FastAPI, Request
|
|
5
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
6
|
+
from fastapi.responses import JSONResponse
|
|
7
|
+
from kubernetes import client
|
|
8
|
+
from kubernetes.client.exceptions import ApiException
|
|
9
|
+
|
|
10
|
+
from konduktor import logging as konduktor_logging
|
|
11
|
+
from konduktor.kube_client import batch_api, core_api, crd_api
|
|
12
|
+
|
|
13
|
+
from .sockets import socketio as sio
|
|
14
|
+
|
|
15
|
+
logger = konduktor_logging.get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
# FastAPI app
|
|
18
|
+
app = FastAPI()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# CORS Configuration
# NOTE(review): every origin, method and header is allowed while credentials
# are enabled as well. FastAPI's CORS documentation advises against combining
# wildcard values with allow_credentials=True - confirm whether credentials
# are needed and, if so, list the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],  # Allow all origins
    allow_credentials=True,
    allow_methods=['*'],  # Allow all methods
    allow_headers=['*'],  # Allow all headers
)
|
|
29
|
+
|
|
30
|
+
# Use Kubernetes API clients
|
|
31
|
+
# Initialize BatchV1 and CoreV1 API (native kubernetes)
|
|
32
|
+
batch_client = batch_api()
|
|
33
|
+
core_client = core_api()
|
|
34
|
+
# Initialize Kueue API
|
|
35
|
+
crd_client = crd_api()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@app.get('/')
async def home():
    """Root endpoint; responds with a fixed JSON payload."""
    payload = {'home': '/'}
    return JSONResponse(payload)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@app.delete('/deleteJob')
async def delete_job(request: Request):
    """Delete the Kueue Workload named in the JSON request body.

    Body keys:
        name: workload name (required).
        namespace: workload namespace, defaults to 'default'.

    Returns:
        JSONResponse: ``{'success': True, 'status': 200}`` on success, or
        ``{'error': ...}`` with status 400 when the name is missing, or with
        the Kubernetes API status (500 if unavailable) when the call fails.
    """
    data = await request.json()
    name = data.get('name', '')
    namespace = data.get('namespace', 'default')

    if not name:
        # An empty name cannot identify a workload; reject it before calling
        # the Kubernetes API.
        return JSONResponse({'error': 'Missing workload name'}, status_code=400)

    try:
        delete_options = client.V1DeleteOptions(propagation_policy='Background')

        crd_client.delete_namespaced_custom_object(
            group='kueue.x-k8s.io',
            version='v1beta1',
            namespace=namespace,
            plural='workloads',
            name=name,
            body=delete_options,
        )
        logger.debug(f"Kueue Workload '{name}' deleted successfully.")

        return JSONResponse({'success': True, 'status': 200})

    except ApiException as e:
        logger.debug(f'Exception: {e}')
        # e.status can be None or 0 when the failure did not come from an HTTP
        # response; neither is a valid status code for our reply.
        return JSONResponse({'error': str(e)}, status_code=e.status or 500)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@app.get('/getJobs')
async def get_jobs():
    """Return the formatted workload rows produced by fetch_jobs() as JSON.

    Kubernetes API failures are reported as ``{'error': ...}`` with the API
    status code (500 if unavailable), matching the other endpoints.
    """
    try:
        rows = fetch_jobs()
    except ApiException as e:
        logger.debug(f'Exception: {e}')
        # e.status can be None or 0 for non-HTTP failures.
        return JSONResponse({'error': str(e)}, status_code=e.status or 500)
    return JSONResponse(rows)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@app.get('/getNamespaces')
async def get_namespaces():
    """Return the names of all namespaces reported by the core API as JSON.

    On an ApiException the error text is returned with the exception's status.
    """
    try:
        # Pull the name out of every namespace object in the listing.
        names = [item.metadata.name for item in core_client.list_namespace().items]
    except ApiException as e:
        logger.debug(f'Exception: {e}')
        return JSONResponse({'error': str(e)}, status_code=e.status)
    else:
        return JSONResponse(names)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@app.put('/updatePriority')
async def update_priority(request: Request):
    """Set ``spec.priority`` on the Kueue Workload named in the request body.

    Body keys:
        name: workload name (required).
        namespace: workload namespace, defaults to 'default'.
        priority: new priority, must be convertible to int; defaults to 0.

    Returns:
        JSONResponse: ``{'success': True, 'status': 200}`` on success, or
        ``{'error': ...}`` with status 400 for invalid input, or with the
        Kubernetes API status (500 if unavailable) when the call fails.
    """
    data = await request.json()
    name = data.get('name', '')
    namespace = data.get('namespace', 'default')

    if not name:
        return JSONResponse({'error': 'Missing workload name'}, status_code=400)

    try:
        priority = int(data.get('priority', 0))
    except (TypeError, ValueError):
        return JSONResponse(
            {'error': 'priority must be an integer'}, status_code=400
        )

    try:
        # Patch only the field being changed. Sending back a previously
        # fetched copy of the whole object is unnecessary and can conflict
        # with concurrent updates to the workload.
        crd_client.patch_namespaced_custom_object(
            group='kueue.x-k8s.io',
            version='v1beta1',
            namespace=namespace,
            plural='workloads',
            name=name,
            body={'spec': {'priority': priority}},
        )
        return JSONResponse({'success': True, 'status': 200})

    except ApiException as e:
        logger.debug(f'Exception: {e}')
        # e.status can be None or 0 for non-HTTP failures.
        return JSONResponse({'error': str(e)}, status_code=e.status or 500)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Get a listing of workloads in kueue
|
|
122
|
+
def fetch_jobs(namespace: str = 'default') -> List[Dict[str, Any]]:
    """List the Kueue workloads of a namespace and format them for the UI.

    Args:
        namespace: namespace to list workloads from; defaults to 'default',
            which is what callers passing no argument get.

    Returns:
        The rows produced by format_workloads() for the listing.
    """
    listing = crd_client.list_namespaced_custom_object(
        group='kueue.x-k8s.io',
        version='v1beta1',
        namespace=namespace,
        plural='workloads',
    )

    return format_workloads(listing)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def format_workloads(listing: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Flatten a workload listing into one row dict per workload.

    Args:
        listing: mapping with an ``items`` list of workload objects, each
            carrying ``metadata`` and optionally ``spec`` and ``status``.

    Returns:
        One dict per workload with the keys id, name, namespace,
        localQueueName, priority, status, active, created_at and order.
        An empty or missing listing yields an empty list.
    """
    if not listing:
        return []

    res = []

    for job in listing.get('items') or []:
        metadata = job['metadata']
        spec = job.get('spec') or {}
        # A workload without an explicit priority is treated as priority 0.
        priority = spec.get('priority') or 0
        admitted = 'admission' in (job.get('status') or {})

        res.append(
            {
                'id': metadata['uid'],
                'name': metadata['name'],
                'namespace': metadata['namespace'],
                'localQueueName': spec.get('queueName', 'Unknown'),
                'priority': priority,
                'status': 'ADMITTED' if admitted else 'PENDING',
                'active': spec.get('active', 0),
                'created_at': metadata['creationTimestamp'],
                # Admitted workloads get a fixed bonus of 10 on top of priority.
                'order': (10 if admitted else 0) + priority,
            }
        )

    return res
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
app = socketio.ASGIApp(sio, app)
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime
|
|
3
|
+
import os
|
|
4
|
+
import time
|
|
5
|
+
from typing import Dict, List
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from socketio import AsyncServer # Import the AsyncServer for ASGI compatibility
|
|
9
|
+
|
|
10
|
+
from konduktor import logging as konduktor_logging
|
|
11
|
+
|
|
12
|
+
# SocketIO configuration
|
|
13
|
+
socketio = AsyncServer(
|
|
14
|
+
cors_allowed_origins='*', ping_interval=25, ping_timeout=60, async_mode='asgi'
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
logger = konduktor_logging.get_logger(__name__)
|
|
18
|
+
|
|
19
|
+
# Global variables
|
|
20
|
+
CLIENT_CONNECTED = False
|
|
21
|
+
FIRST_RUN = True
|
|
22
|
+
BACKGROUND_TASK_RUNNING = False
|
|
23
|
+
LOG_CHECKPOINT_TIME = None
|
|
24
|
+
SELECTED_NAMESPACES: list[str] = []
|
|
25
|
+
|
|
26
|
+
# "http://loki.loki.svc.cluster.local:3100/loki/api/v1/query_range" for prod
|
|
27
|
+
# "http://localhost:3100/loki/api/v1/query_range" for local
|
|
28
|
+
LOGS_URL = os.environ.get('LOGS_URL', 'http://localhost:3100/loki/api/v1/query_range')
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def format_log_entry(entry: List[str], namespace: str) -> Dict[str, str]:
    """
    Formats a log entry and its corresponding namespace

    Args:
        entry (List[str]): a two-item sequence: the timestamp in nanoseconds
            since the epoch (as a string) followed by the log message.
        namespace (str): The namespace to attach to the log entry.

    Returns:
        Dict[str, str]: an object with the following properties:
            timestamp (UTC, formatted '%Y-%m-%d %H:%M:%S'), log (message),
            and namespace
    """
    timestamp_ns, log_message = entry[0], entry[1]
    # Use integer division: nanosecond epochs exceed the range floats can
    # represent exactly, so dividing by 1e9 can round into the next second.
    timestamp_s = int(timestamp_ns) // 1_000_000_000
    dt = datetime.datetime.fromtimestamp(timestamp_s, tz=datetime.timezone.utc)
    return {
        'timestamp': dt.strftime('%Y-%m-%d %H:%M:%S'),
        'log': log_message,
        'namespace': namespace,
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def get_logs(FIRST_RUN: bool) -> List[Dict[str, str]]:
    """Fetch log lines for the selected namespaces from the logs endpoint.

    Args:
        FIRST_RUN (bool): when true, look back one hour; otherwise resume just
            after ``LOG_CHECKPOINT_TIME`` (the newest timestamp seen so far).

    Returns:
        List[Dict[str, str]]: entries produced by format_log_entry(), ordered
        by timestamp; empty when the request fails or nothing is returned.

    Side effects:
        Updates the global ``LOG_CHECKPOINT_TIME`` when entries are returned.
    """
    global LOG_CHECKPOINT_TIME

    logger.debug(f'Selected namespaces: {SELECTED_NAMESPACES}')

    # The namespaces come from the socket client and are spliced into a LogQL
    # regex matcher, so keep only values built from the characters a
    # Kubernetes namespace name may contain.
    allowed_chars = set('abcdefghijklmnopqrstuvwxyz0123456789-')
    safe_namespaces = [
        ns
        for ns in SELECTED_NAMESPACES
        if isinstance(ns, str) and ns and set(ns) <= allowed_chars
    ]
    namespace_filter = '|'.join(safe_namespaces) if safe_namespaces else 'default'
    query = f'{{k8s_namespace_name=~"{namespace_filter}"}}'

    logger.debug(f'Loki logs query: {query}')

    if FIRST_RUN:
        # Calculate how many nanoseconds to look back when first time looking
        # at logs (currently 1 hour)
        now = int(time.time() * 1e9)
        one_hour_ago = now - int(3600 * 1e9)
        start_time = str(one_hour_ago)
    else:
        # calculate new start_time based on newest, last message
        if LOG_CHECKPOINT_TIME is None:
            LOG_CHECKPOINT_TIME = 0
        start_time = str(int(LOG_CHECKPOINT_TIME) + 1)

    params = {'query': query, 'start': start_time, 'limit': '300'}

    try:
        # Bound the request so an unreachable endpoint cannot stall the caller.
        response = requests.get(LOGS_URL, params=params, timeout=10)
    except requests.RequestException as e:
        logger.warning(f'Log request failed: {e}')
        return []

    # (timestamp in ns, formatted entry) pairs
    entries = []

    if response.status_code == 200:
        data = response.json()
        for row in data['data']['result']:
            namespace = row['stream']['k8s_namespace_name']
            for value in row['values']:
                entries.append((int(value[0]), format_log_entry(value, namespace)))
    else:
        logger.warning(f'Log request returned status {response.status_code}')

    if entries:
        # Results can arrive out of order; sort on the raw nanosecond
        # timestamp, which is finer than the formatted one-second string.
        entries.sort(key=lambda pair: pair[0])
        LOG_CHECKPOINT_TIME = entries[-1][0]

    formatted_logs = [formatted for _, formatted in entries]

    logger.debug(f'Formatted logs length: {len(formatted_logs)}')

    return formatted_logs
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
async def send_logs():
    """Background task: poll for logs every 5 seconds while a client is connected.

    Each batch returned by get_logs() is emitted as a 'log_data' event. When
    the loop ends, for any reason, BACKGROUND_TASK_RUNNING is cleared so that
    a later connect can start the task again.
    """
    global CLIENT_CONNECTED, FIRST_RUN, BACKGROUND_TASK_RUNNING
    try:
        while CLIENT_CONNECTED:
            # Consume the flag before awaiting: a client connecting while the
            # fetch is in flight sets it again and gets a full first fetch on
            # the next iteration.
            first_run = FIRST_RUN
            FIRST_RUN = False

            # get_logs() performs a blocking HTTP request; run it in a worker
            # thread so the event loop keeps serving other socket traffic.
            logs = await asyncio.to_thread(get_logs, first_run)

            if logs:
                await socketio.emit('log_data', logs)

            await asyncio.sleep(5)
    finally:
        # Background task is no longer running after the loop, including when
        # it was ended by an exception.
        BACKGROUND_TASK_RUNNING = False
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@socketio.event
async def connect(sid, environ):
    """Mark a client as connected and make sure the log task is running."""
    global CLIENT_CONNECTED, FIRST_RUN, BACKGROUND_TASK_RUNNING
    CLIENT_CONNECTED, FIRST_RUN = True, True
    logger.debug('Client connected')

    # Nothing more to do when the log streaming task already exists.
    if BACKGROUND_TASK_RUNNING:
        return

    BACKGROUND_TASK_RUNNING = True
    socketio.start_background_task(send_logs)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@socketio.event
async def update_namespaces(sid, namespaces):
    """Replace the namespaces whose logs are streamed.

    Args:
        sid: socket session id of the sender (unused).
        namespaces: list of namespace names sent by the client. A single
            string is accepted as a one-item list; any other payload, and any
            non-string item, is discarded.
    """
    global SELECTED_NAMESPACES
    if isinstance(namespaces, str):
        namespaces = [namespaces]
    if isinstance(namespaces, (list, tuple)):
        # Non-string items would make the '|'.join in get_logs raise and end
        # the log streaming task.
        SELECTED_NAMESPACES = [ns for ns in namespaces if isinstance(ns, str)]
    else:
        SELECTED_NAMESPACES = []
    logger.debug('Updated namespaces')
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@socketio.event
async def disconnect(sid):
    """Reset the module-level connection state when a client disconnects."""
    global CLIENT_CONNECTED, FIRST_RUN, BACKGROUND_TASK_RUNNING
    # Clearing this flag ends the `while CLIENT_CONNECTED` loop in send_logs.
    CLIENT_CONNECTED = False
    FIRST_RUN = True
    # NOTE(review): send_logs may still be sleeping when this flag is cleared,
    # so a quick reconnect could start a second task - confirm.
    BACKGROUND_TASK_RUNNING = False
    logger.debug('Client disconnected')
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
|
2
|
+
|
|
3
|
+
# dependencies
|
|
4
|
+
/node_modules
|
|
5
|
+
/.pnp
|
|
6
|
+
.pnp.js
|
|
7
|
+
.yarn/install-state.gz
|
|
8
|
+
|
|
9
|
+
# testing
|
|
10
|
+
/coverage
|
|
11
|
+
|
|
12
|
+
# next.js
|
|
13
|
+
/.next/
|
|
14
|
+
/out/
|
|
15
|
+
|
|
16
|
+
# production
|
|
17
|
+
/build
|
|
18
|
+
|
|
19
|
+
# misc
|
|
20
|
+
.DS_Store
|
|
21
|
+
*.pem
|
|
22
|
+
|
|
23
|
+
# debug
|
|
24
|
+
npm-debug.log*
|
|
25
|
+
yarn-debug.log*
|
|
26
|
+
yarn-error.log*
|
|
27
|
+
|
|
28
|
+
# local env files
|
|
29
|
+
.env*.local
|
|
30
|
+
|
|
31
|
+
# vercel
|
|
32
|
+
.vercel
|
|
33
|
+
|
|
34
|
+
# typescript
|
|
35
|
+
*.tsbuildinfo
|
|
36
|
+
next-env.d.ts
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server';
|
|
2
|
+
|
|
3
|
+
const backendUrl = process.env.NODE_ENV === 'development'
|
|
4
|
+
? 'http://127.0.0.1:5001' // Development API
|
|
5
|
+
: 'http://backend:5001' // Production API
|
|
6
|
+
|
|
7
|
+
// GET request for jobs
|
|
8
|
+
export async function GET() {
  try {
    // Forward request to backend API. `cache: 'no-store'` asks Next.js not to
    // cache the fetch, so the job list is re-read on every request.
    const response = await fetch(`${backendUrl}/getJobs`, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json'
      },
      cache: 'no-store',
    })

    const data = await response.json()
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server get error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
25
|
+
|
|
26
|
+
// DELETE request for job deletion
|
|
27
|
+
export async function DELETE(req) {
  try {
    const { name, namespace } = await req.json(); // Parse the request body

    // Forward request to backend API
    const response = await fetch(`${backendUrl}/deleteJob`, {
      method: 'DELETE',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ name, namespace })
    })

    const data = await response.json()
    console.log(`Server Component deleteJob: ${JSON.stringify(data)}`)
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server delete error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
48
|
+
|
|
49
|
+
// PUT request for updating job priority
|
|
50
|
+
export async function PUT(req) {
  try {
    const { name, namespace, priority, priority_class_name } = await req.json(); // Parse the request body

    // Forward request to backend API
    const response = await fetch(`${backendUrl}/updatePriority`, {
      method: 'PUT',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ name, namespace, priority, priority_class_name })
    })

    const data = await response.json()
    console.log(`Server Component updatePriority: ${JSON.stringify(data)}`)
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server update error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
71
|
+
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { NextResponse } from 'next/server';
|
|
2
|
+
|
|
3
|
+
const backendUrl = process.env.NODE_ENV === 'development'
|
|
4
|
+
? 'http://127.0.0.1:5001' // Development API
|
|
5
|
+
: 'http://backend:5001' // Production API
|
|
6
|
+
|
|
7
|
+
// GET request for namespaces
export async function GET() {
  try {
    // Forward request to backend API. `cache: 'no-store'` asks Next.js not to
    // cache the fetch, so the namespace list is re-read on every request.
    const response = await fetch(`${backendUrl}/getNamespaces`, {
      method: 'GET',
      headers: {
        'Content-Type': 'application/json'
      },
      cache: 'no-store',
    })

    const data = await response.json()
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server get error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
25
|
+
|
|
26
|
+
// DELETE request for job deletion
|
|
27
|
+
export async function DELETE(req) {
  try {
    const { name, namespace } = await req.json(); // Parse the request body

    // Forward request to backend API
    const response = await fetch(`${backendUrl}/deleteJob`, {
      method: 'DELETE',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ name, namespace })
    })

    const data = await response.json()
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server delete error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
47
|
+
|
|
48
|
+
// PUT request for updating job priority
|
|
49
|
+
export async function PUT(req) {
  try {
    const { name, namespace, priority, priority_class_name } = await req.json(); // Parse the request body

    // Forward request to backend API
    const response = await fetch(`${backendUrl}/updatePriority`, {
      method: 'PUT',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ name, namespace, priority, priority_class_name })
    })

    const data = await response.json()
    // Relay the backend's status code instead of always answering 200.
    return NextResponse.json(data, { status: response.status })
  } catch (error) {
    console.error("Server update error:", error);
    // Answer with a JSON error body and a 500 status rather than passing the
    // Error object as a response body.
    const message = error instanceof Error ? error.message : String(error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}
|
|
69
|
+
|