dbca-utils 2.2.0__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/PKG-INFO +1 -1
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/pyproject.toml +1 -1
- dbca_utils-3.0.1/src/dbca_utils/apps.py +14 -0
- dbca_utils-3.0.1/src/dbca_utils/healthcheck/healthcheck.py +623 -0
- dbca_utils-3.0.1/src/dbca_utils/healthcheck/urls.py +16 -0
- dbca_utils-3.0.1/tests/migrations/__init__.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/LICENSE +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/README.md +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/src/dbca_utils/__init__.py +0 -0
- {dbca_utils-2.2.0/tests → dbca_utils-3.0.1/src/dbca_utils/healthcheck}/__init__.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/src/dbca_utils/middleware.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/src/dbca_utils/models.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/src/dbca_utils/utils.py +0 -0
- {dbca_utils-2.2.0/tests/migrations → dbca_utils-3.0.1/tests}/__init__.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/apps.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/migrations/0001_initial.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/models.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/settings.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/templates/tests/test_model_list.html +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/tests.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/urls.py +0 -0
- {dbca_utils-2.2.0 → dbca_utils-3.0.1}/tests/views.py +0 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
|
|
3
|
+
from django.apps import AppConfig
|
|
4
|
+
from .healthcheck import healthcheck
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DbcaUtilsConfig(AppConfig):
    """Django application configuration for the ``dbca_utils`` package."""

    name = 'dbca_utils'

    def ready(self):
        """Install the healthcheck URL patterns when the healthcheck is enabled.

        Does nothing when ``healthcheck.HEALTHCHECK_ENABLED`` is falsy.
        """
        if not healthcheck.HEALTHCHECK_ENABLED:
            return
        healthcheck.register_healtcheckurls()
|
|
13
|
+
|
|
14
|
+
|
|
@@ -0,0 +1,623 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import importlib
|
|
3
|
+
import logging
|
|
4
|
+
import subprocess
|
|
5
|
+
import random
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
import socket
|
|
9
|
+
import requests
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
|
|
12
|
+
from django.urls import reverse,path,include
|
|
13
|
+
from django.conf import settings
|
|
14
|
+
from django.http import HttpResponseForbidden, JsonResponse,HttpResponseServerError
|
|
15
|
+
from django.core.signals import request_started
|
|
16
|
+
from django.core.cache import cache
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Master switch for the healthcheck feature; any value other than "true"
# (case-insensitive) in the HEALTHCHECK_ENABLED environment variable disables it.
HEALTHCHECK_ENABLED = os.environ.get("HEALTHCHECK_ENABLED", "true").lower() == "true"
if HEALTHCHECK_ENABLED:
    # The workload registry is stored in Django's cache, so only stay enabled
    # when a cache object is available. The previous test was inverted
    # (`if not HEALTHCHECK_ENABLED`), which silently re-enabled a healthcheck
    # that had been explicitly disabled through the environment.
    HEALTHCHECK_ENABLED = bool(cache)

# Shell pipeline fragment appended to the `ps` command used to list the webapp processes.
PROCESS_FILTER = os.environ.get("WORKLOAD_PROCESS_FILTER", "| grep python")
# Prefix of every cache key written by this module.
CACHE_PREFIX = os.environ.get("CACHE_PREFIX", "")
# Port used when one workload requests health data from another workload.
PORT = int(os.environ.get("WORKLOAD_PORT", 8080))
# WORKLOADS is the number of workloads that should be started.
# If the number of workloads is dynamic, don't set it; negative values are treated as 0.
WORKLOADS = max(0, int(os.environ.get("WORKLOADS", 0)))
# True: the registered name is the host name; False: the registered name is
# derived from the numeric suffix of the host name.
WORKLOAD_DEPLOYMENT = os.environ.get("WORKLOAD_DEPLOYMENT", "true").lower() == "true"
# Failure count at which an unreachable workload is removed from the registry.
WORKLOAD_FAILED_THRESHOLD = int(os.environ.get("WORKLOAD_FAILED_THRESHOLD", 2))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Alphabet used for generated secrets (contains no whitespace, so a secret is
# always a single token). The upper-case run previously ended in "XYA", which
# dropped "Z" and duplicated "A".
RANDOM_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789~!@#$%^&*()-_+=`{}[];':\",./<>?"
RANDOM_CHARS_MAX_INDEX = len(RANDOM_CHARS) - 1

# OS-backed random source: the secret is used as a bearer token between
# workloads, so it must not come from the predictable default generator.
_secure_random = random.SystemRandom()


def generate_secret():
    """Return a new 32-character secret drawn from RANDOM_CHARS."""
    return "".join(_secure_random.choice(RANDOM_CHARS) for _ in range(32))
|
|
42
|
+
|
|
43
|
+
# Secret of this workload; assigned by the functions that declare `global secret`.
secret = None


def get_workloadname(index):
    """Return the workload name for *index*: the text ``workload`` followed by the index."""
    return "workload" + "{}".format(index)
|
|
47
|
+
|
|
48
|
+
def get_local_ip():
    """Return the local address the OS selects for outgoing traffic.

    A UDP socket is connected to a dummy address and the socket's own address
    is read back. Falls back to ``127.0.0.1`` when that fails.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
        try:
            # The target does not have to be reachable.
            probe.connect(('192.168.1.1', 1))
            return probe.getsockname()[0]
        except Exception:
            # Fall back to localhost, e.g. when the network is down.
            return '127.0.0.1'
|
|
61
|
+
|
|
62
|
+
# Host name of the machine/pod this process runs on.
hostname = socket.gethostname()
if WORKLOAD_DEPLOYMENT:
    # The host name itself is the key in the workload registry.
    registerhostname = hostname
else:
    # The registry key is derived from the numeric suffix of the host name ("<name>-<index>").
    statefulset_hostname_re = re.compile("-(?P<index>\\d+)$")
    _hostname_index = statefulset_hostname_re.search(hostname)
    if _hostname_index:
        registerhostname = get_workloadname(_hostname_index.group("index"))
    else:
        # Previously this raised AttributeError at import time; register under
        # the raw host name instead so the application can still start.
        logger.warning("The hostname '{}' has no numeric suffix; register the workload with its hostname.".format(hostname))
        registerhostname = hostname

# Address other workloads use to reach this one.
ip = get_local_ip()
|
|
70
|
+
|
|
71
|
+
# Folder that holds one register file per webapp process.
webapp_process_registerfolder = "/tmp/__webapp__/proc"


def get_processregisterfile(pid):
    """Return the path of the register file of the process *pid*."""
    filename = str(pid)
    return os.path.join(webapp_process_registerfolder, filename)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def register_webappprocess():
    """
    Register the current process as a webapp process.

    Writes the current timestamp to the register file of this process,
    creating the register folder if needed. Failures are logged and swallowed.
    """
    pid = os.getpid()
    logger.debug("Register the webapp process '{}({}).{}'.".format(hostname, ip, pid))
    try:
        # exist_ok avoids the race between several processes creating the folder at once
        os.makedirs(webapp_process_registerfolder, exist_ok=True)

        with open(get_processregisterfile(pid), "wt") as f:
            f.write(datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f"))
    except Exception as ex:
        # best effort: report the reason, but never break the caller
        logger.error("Failed to register the webapp process '{}({}).{}'. {}: {}".format(hostname, ip, pid, ex.__class__.__name__, str(ex)))
|
|
94
|
+
|
|
95
|
+
def unregister_webappprocess():
    """
    Remove the register file of the current process.

    A missing file is not an error. Any other failure is logged, only when
    the file still exists afterwards, and then swallowed.
    """
    pid = os.getpid()
    logger.debug("Unregister the webapp process '{}({}).{}'.".format(hostname, ip, pid))
    # computed outside the try block so the handler can always refer to it
    registerfile = get_processregisterfile(pid)
    try:
        os.remove(registerfile)
    except Exception as ex:
        if os.path.exists(registerfile):
            logger.error("Failed to unregister the webapp process '{}({}).{}'. {}: {}".format(hostname, ip, pid, ex.__class__.__name__, str(ex)))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# Key inside the cached dictionaries that holds their version counter.
item_version = "__version__"
# Cache key of the workload registry, and of the lock guarding its updates.
key_workloads = "{}__workloads__".format(CACHE_PREFIX)
key_workloads_lock = "{}lock__".format(key_workloads)


def register_webappserver(sender, environ, **kwargs):
    """
    Register this workload in the cache shared by all workloads.

    Receiver of Django's ``request_started`` signal. The registry entry has
    the form ``[[ip, port], secret, failed_times]``. A missing entry is
    created; an existing entry is corrected (address, port, failure counter)
    and given a new secret whenever something had to be corrected. After a
    successful registration the receiver disconnects itself; after a failure
    the error is logged and the registration is retried on the next request.
    """
    global secret
    pid = os.getpid()
    logger.debug("Register the webapp server '{}({}).{}'.".format(hostname, ip, pid))
    try:
        workloads_changed = False
        workloads = cache.get(key_workloads) or {item_version: 0}
        if registerhostname not in workloads:
            # not registered by other webservers running in the same workload
            secret = generate_secret()
            workloads[registerhostname] = [[ip, PORT], secret, 0]
            workloads_changed = True
        else:
            # already registered by another webserver; check whether the data is correct
            data = workloads[registerhostname]
            if not isinstance(data[0], list):
                data[0] = [ip, PORT]
                workloads_changed = True
            if data[0][0] != ip:
                data[0][0] = ip
                workloads_changed = True
            if data[0][1] != PORT:
                data[0][1] = PORT
                workloads_changed = True
            if data[2] != 0:
                data[2] = 0
                workloads_changed = True
            if workloads_changed:
                # the workload data changed: issue a new secret
                secret = generate_secret()
                data[1] = secret
            else:
                # the workload data is unchanged: reuse the registered secret
                secret = data[1]

        if workloads_changed:
            # save the workloads data to the cache
            save_workloads(workloads)

    except Exception as ex:
        logger.error("Failed to register the webapp webserver '{}({}).{}'. {}: {}".format(hostname, ip, pid, ex.__class__.__name__, str(ex)))
        # The handler used to try to delete a register file through an
        # undefined variable and a misspelt exception class, which raised
        # NameError out of the signal receiver. This function never creates
        # such a file, so there is nothing to clean up: swallow the error and
        # stay connected so the next request retries.
        return

    # registered successfully: disconnect the receiver, no need to register again
    request_started.disconnect(dispatch_uid="register_webappserver")
    logger.debug("Successfully register the webserver({}<{}>:{}.{}) to the cache.".format(hostname, ip, PORT, pid))


# Connect the signal receiver that registers the workload.
# The receiver disconnects itself after a successful registration.
if HEALTHCHECK_ENABLED:
    request_started.connect(register_webappserver, dispatch_uid="register_webappserver")
|
|
178
|
+
|
|
179
|
+
# Shell command listing %cpu, vsz, rss and the command line of every process,
# followed by the configured filter pipeline.
# NOTE(review): PROCESS_FILTER comes from the environment and is run through
# a shell; it must only ever be set by trusted deployment configuration.
GET_RESOURCEUSAGE_CMD = "ps ax -o %cpu=,vsz=,rss=,cmd= {}".format(PROCESS_FILTER).strip()
# The individual commands of the pipeline; used to drop the pipeline's own
# processes from the result. Empty segments are ignored because an empty
# string is a substring of every line and would filter everything out.
GET_RESOURCEUSAGE_PIPECMDS = [c.strip() for c in GET_RESOURCEUSAGE_CMD.split("|") if c.strip()]


def get_workload_healthcheckdata():
    """
    Collect the resource usage of the processes selected by GET_RESOURCEUSAGE_CMD.

    Returns a ``(statuscode, data)`` tuple: ``(200, dict)`` with the totals
    and the per-process minimum/maximum of cpu, virtual memory and physical
    memory, or ``(500, message)`` when the command fails. The min/max values
    stay ``None`` when no process matched. The memory columns are divided by
    1024 (presumably KiB to MiB; confirm against ps(1) on the target system).
    """
    completed = subprocess.run(GET_RESOURCEUSAGE_CMD, shell=True, capture_output=True, text=True)
    if completed.returncode != 0:
        return (500, "Failed to get the resource usage data for webapp processes.{}".format(completed.stderr))

    usage = {
        "total_cpu": 0,
        "total_vmemory": 0,
        "total_pmemory": 0,
        "processes": 0,
        "min_cpu": None,
        "max_cpu": None,
        "min_vmemory": None,
        "max_vmemory": None,
        "min_pmemory": None,
        "max_pmemory": None
    }
    for line in completed.stdout.split("\n"):
        line = line.strip()
        if not line:
            continue
        if any(c in line for c in GET_RESOURCEUSAGE_PIPECMDS):
            # the line describes the ps/filter pipeline itself
            continue
        fields = line.split(maxsplit=3)
        try:
            measures = (
                ("cpu", float(fields[0])),
                ("vmemory", float(fields[1]) / 1024),
                ("pmemory", float(fields[2]) / 1024),
            )
        except (IndexError, ValueError):
            # a short or non-numeric line used to abort the whole healthcheck; skip it instead
            logger.warning("Ignore the unrecognized resource usage line '{}'.".format(line))
            continue

        usage["processes"] += 1
        for name, value in measures:
            usage["total_" + name] += value
            min_key = "min_" + name
            max_key = "max_" + name
            if usage[min_key] is None or usage[min_key] > value:
                usage[min_key] = value
            if usage[max_key] is None or usage[max_key] < value:
                usage[max_key] = value

    return (200, usage)
|
|
238
|
+
|
|
239
|
+
# Matches "Bearer <token>", where the token is one run of non-whitespace characters.
bearer_token_re = re.compile("^Bearer\\s+(?P<token>\\S+)\\s*$")


def get_auth_bearer(request):
    """
    Extract the bearer token from the request's Authorization header.

    Returns the token string, or None when the header is missing or is not
    a well-formed bearer authorization.
    """
    if 'HTTP_AUTHORIZATION' not in request.META:
        header = ''
    else:
        header = request.META.get('HTTP_AUTHORIZATION').strip()

    matched = bearer_token_re.search(header)
    if not matched:
        return None
    return matched.group('token')
|
|
251
|
+
|
|
252
|
+
# Cache key of the workload-name -> server assignments, and of the lock guarding its updates.
key_assignedworkloads = "{}__assignedworkloads__".format(CACHE_PREFIX)
key_assignedworkloads_lock = "{}lock__".format(key_assignedworkloads)


def str_workloads(workloads):
    """
    Return a compact, comma separated description of the workload registry.

    Each workload is rendered as ``host=ip:port(failed_times)``; the version
    entry is rendered as ``key=value``. The secrets are left out.
    """
    parts = []
    for host, data in workloads.items():
        if host == item_version:
            parts.append("{}={}".format(host, data))
        else:
            address = data[0]
            parts.append("{}={}:{}({})".format(host, address[0], address[1], data[2]))
    return ",".join(parts)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def save_workloads(workloads, unreached_servers=None):
    """
    Save the updated workload registry to the cache.

    The write is guarded by a cache lock (`cache.add` with a 1 second
    timeout), polled every 0.01 seconds until acquired. If the cached
    registry carries a different version than *workloads*, cached entries
    that are missing from *workloads* and not listed in *unreached_servers*
    are copied in first. The version counter of *workloads* is then advanced
    and the dictionary is written back.
    """
    logger.debug("Begin to save the changed workloads data({}) to cache.".format(str_workloads(workloads)))
    # spin until the lock is ours
    while not cache.add(key_workloads_lock, 1, timeout=1):
        time.sleep(0.01)

    try:
        stored = cache.get(key_workloads)
        if stored and stored.get(item_version, 0) != workloads[item_version]:
            # The registry changed after we fetched it: pull in the entries added meanwhile.
            skipped = unreached_servers or ()
            for name, entry in stored.items():
                if name == item_version or name in workloads or name in skipped:
                    continue
                workloads[name] = entry

            stored_version = stored.get(item_version, 0)
            if stored_version == 0:
                workloads[item_version] += 1
            else:
                workloads[item_version] = stored_version + 1
        else:
            # The registry did not change in the meantime.
            workloads[item_version] += 1

        cache.set(key_workloads, workloads)
        logger.debug("Successfully save the workloads:{}".format(str_workloads(workloads)))
    finally:
        # release the lock
        cache.delete(key_workloads_lock)
|
|
297
|
+
|
|
298
|
+
def save_assignedworkloads(assignedworkloads):
    """
    Save the updated workload assignments to the cache.

    The write is guarded by a cache lock (`cache.add` with a 1 second
    timeout), polled every 0.01 seconds until acquired. If the cached
    assignments carry a different version, every cached assignment is copied
    over the corresponding entry of *assignedworkloads* (the cached value
    wins). The version counter is then advanced and the dictionary is
    written back.
    """
    logger.debug("Begin to save the changed assigned workloads data({}) to cache.".format(assignedworkloads))
    # spin until the lock is ours
    while not cache.add(key_assignedworkloads_lock, 1, timeout=1):
        time.sleep(0.01)

    try:
        stored = cache.get(key_assignedworkloads)
        if stored and stored.get(item_version, 0) != assignedworkloads[item_version]:
            # sync with the latest cached data
            for name, server in stored.items():
                if name == item_version:
                    continue
                if name not in assignedworkloads or assignedworkloads[name] != server:
                    assignedworkloads[name] = server

            stored_version = stored.get(item_version, 0)
            if stored_version == 0:
                assignedworkloads[item_version] += 1
            else:
                assignedworkloads[item_version] = stored_version + 1
        else:
            # The assignments did not change in the meantime.
            assignedworkloads[item_version] += 1

        cache.set(key_assignedworkloads, assignedworkloads)
        logger.debug("Successfully save the assigned workloads:{}".format(assignedworkloads))
    finally:
        # release the lock
        cache.delete(key_assignedworkloads_lock)
|
|
338
|
+
|
|
339
|
+
def populate_summary_data(datas):
    """
    Add a ``summary`` entry to *datas*, computed from the workloads' resource usage.

    *datas* maps a workload name to either a usage dictionary (counted as a
    running workload) or an error string (counted as a failed workload).
    The summary holds the totals over all running workloads and the smallest
    and largest per-process values. A workload whose min/max values are None
    (it reported no process) contributes to the totals only. *datas* is
    modified in place; nothing is returned.
    """
    summary = {
        "total_cpu": 0,
        "total_vmemory": 0,
        "total_pmemory": 0,
        "total_processes": 0,
        "running_workloads": 0,
        "failed_workloads": 0,
        "min_process_cpu": None,
        "max_process_cpu": None,
        "min_process_vmemory": None,
        "max_process_vmemory": None,
        "min_process_pmemory": None,
        "max_process_pmemory": None
    }
    for serverdata in datas.values():
        if isinstance(serverdata, str):
            # an error message instead of usage data
            summary["failed_workloads"] += 1
            continue

        summary["running_workloads"] += 1
        summary["total_processes"] += serverdata["processes"]
        for metric in ("cpu", "vmemory", "pmemory"):
            summary["total_" + metric] += serverdata["total_" + metric]

            lowest = serverdata["min_" + metric]
            highest = serverdata["max_" + metric]
            min_key = "min_process_" + metric
            max_key = "max_process_" + metric
            # None cannot be ordered against a number, so it must never be
            # compared with, or stored over, a real measurement.
            if lowest is not None and (summary[min_key] is None or summary[min_key] > lowest):
                summary[min_key] = lowest
            if highest is not None and (summary[max_key] is None or summary[max_key] < highest):
                summary[max_key] = highest

    datas["summary"] = summary
|
|
383
|
+
|
|
384
|
+
# Path of the per-workload health data view; resolved on first use.
workload_healthcheck_url = None
# Base headers of the requests sent to the other workloads.
headers = {"Authorization": None, "Accept": "application/json"}
# Seconds to wait for another workload to answer. Without a timeout a single
# hanging workload would block the healthcheck request forever.
WORKLOAD_REQUEST_TIMEOUT = float(os.environ.get("WORKLOAD_REQUEST_TIMEOUT", 10))


def harvest_healthdata(request):
    """
    Collect the health data of every registered workload.

    The local workload is measured directly; every other workload is asked
    over HTTP, authenticated with the secret stored in its registry entry.
    An exception or a 502/503/504 response increments the workload's failure
    counter, and the workload is removed from the registry once the counter
    reaches WORKLOAD_FAILED_THRESHOLD. Any other response decrements a
    positive counter. A changed registry is saved back to the cache.

    Returns ``(workloads, servers_res)``, where ``servers_res`` maps the
    workload name to ``(statuscode, data_or_message)``; the status code is -1
    when the request raised an exception.
    """
    global secret

    global workload_healthcheck_url
    if not workload_healthcheck_url:
        workload_healthcheck_url = reverse('healthcheck:workload_healthdata')

    workloads = cache.get(key_workloads) or {item_version: 0}
    workloads_changed = False
    logger.debug("Get the workloads from cache :{}".format(str_workloads(workloads)))

    if registerhostname not in workloads:
        secret = generate_secret()
        workloads[registerhostname] = [[ip, PORT], secret, 0]
        workloads_changed = True

    servers_res = {}
    unreached_servers = []
    # harvest the health data from all workloads
    for servername, serverdata in workloads.items():
        if servername == item_version:
            continue
        if servername == registerhostname:
            servers_res[servername] = get_workload_healthcheckdata()
            continue

        serverip, port = serverdata[0]
        # Build the headers per request; mutating the shared module-level
        # dictionary could mix up secrets between concurrent requests.
        request_headers = dict(headers)
        request_headers["Authorization"] = "Bearer {}".format(serverdata[1])
        request_headers["host"] = request.get_host()
        url = "http://{}:{}{}".format(serverip, port, workload_healthcheck_url)

        failure = None
        try:
            res = requests.get(url, headers=request_headers, timeout=WORKLOAD_REQUEST_TIMEOUT)
        except Exception as ex:
            failure = (-1, "{1}:{2},url={0}".format(url, ex.__class__.__name__, str(ex)))
        else:
            if res.status_code in (502, 503, 504):
                failure = (res.status_code, "{1}:{2},url={0}".format(url, res.status_code, res.text))

        if failure is not None:
            # the server is offline
            workloads_changed = True
            serverdata[2] += 1
            if serverdata[2] >= WORKLOAD_FAILED_THRESHOLD:
                # the failure counter reached WORKLOAD_FAILED_THRESHOLD
                unreached_servers.append(servername)
            servers_res[servername] = failure
            continue

        if res.status_code == 200:
            # the server is in good health
            servers_res[servername] = (200, res.json())
        else:
            # The server is online but returned an error. The format arguments
            # used to be passed in the wrong order, which scrambled this message.
            servers_res[servername] = (res.status_code, "{1}: {2}. url={0}".format(url, res.status_code, res.text))
        if serverdata[2] > 0:
            serverdata[2] -= 1
            workloads_changed = True

    for servername in unreached_servers:
        del workloads[servername]

    logger.debug("healthdata harvest result :{}".format(servers_res))

    if workloads_changed:
        save_workloads(workloads, unreached_servers)

    return (workloads, servers_res)
|
|
458
|
+
|
|
459
|
+
OFFLINE_STATUSCODE_LIST = (502,503,504,-1,-2)
|
|
460
|
+
if WORKLOADS > 0 and WORKLOAD_DEPLOYMENT:
|
|
461
|
+
#has a fixed number of workloads and it is a deployment
|
|
462
|
+
WORKLOADNAMES = [get_workloadname(index) for index in range(WORKLOADS)]
|
|
463
|
+
def healthdata_view(request):
    """
    Return the health data of a fixed number of deployment workloads as JSON.

    Each name in WORKLOADNAMES is mapped to a registered server through the
    cached assignments. A name without a server, or whose server is missing
    or offline, is given one of the unassigned servers, preferring one that
    answered with 200. Changed assignments are saved back to the cache. The
    response maps each workload name to its usage data, or to an error
    message, plus a ``summary`` entry.
    """
    workloads, servers_res = harvest_healthdata(request)
    assignedworkloads = cache.get(key_assignedworkloads) or {item_version: 1}
    logger.debug("Get assigned workloads:{}".format(assignedworkloads))

    # Move the result of every assigned server from servers_res to datas;
    # what remains in servers_res is the pool of unassigned servers.
    datas = {}
    reassign_workloads = 0
    for workloadname in WORKLOADNAMES:
        servername = assignedworkloads.get(workloadname)
        if not servername or servername not in servers_res:
            # the workload name is not assigned, or its server is not available
            reassign_workloads += 1
            continue

        harvested = servers_res.pop(servername)
        datas[servername] = harvested
        if harvested[0] in OFFLINE_STATUSCODE_LIST:
            # the related workload is offline; another one must take over
            reassign_workloads += 1

    assignedworkloads_changed = False
    # Pass 1 replaces the unavailable servers of names that already have an assignment.
    # Later passes also serve the names that never had one.
    step = 0
    while reassign_workloads > 0:
        step += 1
        progressed = False
        for workloadname in WORKLOADNAMES:
            servername = assignedworkloads.get(workloadname)
            if servername in datas and datas[servername][0] not in OFFLINE_STATUSCODE_LIST:
                # the related server is online; no need to reassign
                continue
            if step == 1 and workloadname not in assignedworkloads:
                continue

            replacedservername = None
            for name, res in servers_res.items():
                if res[0] == 200:
                    # found a good one: choose it
                    replacedservername = name
                    break
                if res[0] not in OFFLINE_STATUSCODE_LIST and not replacedservername:
                    # online but with issues: keep as fallback if no good one exists
                    replacedservername = name

            logger.debug("Replaced {1} with {2} for workload({0})".format(workloadname, servername, replacedservername))
            if replacedservername:
                datas[replacedservername] = servers_res.pop(replacedservername)
                assignedworkloads[workloadname] = replacedservername
                assignedworkloads_changed = True

            progressed = True
            reassign_workloads = reassign_workloads - 1 if servers_res else 0
            if reassign_workloads == 0:
                break

        if step > 1 and not progressed:
            # Nothing was eligible in a full pass (e.g. two workload names
            # share one online server), so another pass would be identical.
            # Stop instead of spinning forever.
            break

    if assignedworkloads_changed:
        # save the assignments
        logger.debug("Save the changed running workloads data({}).".format(assignedworkloads))
        save_assignedworkloads(assignedworkloads)

    # map the health data to the workload names and drop the status codes
    result = {}
    for workloadname in WORKLOADNAMES:
        servername = assignedworkloads.get(workloadname)
        if not servername:
            result[workloadname] = "Can't find an available host for this non-assigned host.registered workloads: {0}, assigned workloads:{1}".format(str_workloads(workloads), assignedworkloads)
        elif servername not in datas:
            result[workloadname] = "Can't find an available host for this assigned offline host({2}).registered workloads: {0}, assigned workloads:{1}".format(str_workloads(workloads), assignedworkloads, servername)
        elif datas[servername][0] == 200:
            result[workloadname] = datas[servername][1]
            result[workloadname]["hostname"] = servername
        else:
            result[workloadname] = "{}: {}".format(servername, datas[servername][1])

    datas.clear()

    populate_summary_data(result)

    return JsonResponse(result)
|
|
559
|
+
|
|
560
|
+
elif WORKLOADS > 0 and not WORKLOAD_DEPLOYMENT:
|
|
561
|
+
WORKLOADNAMES = [get_workloadname(index) for index in range(1,WORKLOADS + 1,1)]
|
|
562
|
+
def healthdata_view(request):
    """
    Return the health data of a fixed number of non-deployment workloads as JSON.

    Every name in WORKLOADNAMES maps to the harvested data of the workload
    registered under that name, or to an "offline" message when no such
    workload was harvested. A ``summary`` entry is added.
    """
    workloads, servers_res = harvest_healthdata(request)

    result = {}
    for servername in WORKLOADNAMES:
        # This used to test `result in servers_res`, i.e. a dictionary used
        # as a key, which raises TypeError on every request.
        if servername in servers_res:
            result[servername] = servers_res[servername][1]
        else:
            result[servername] = "Workload is offline.workloads={}".format(str_workloads(workloads))

    populate_summary_data(result)

    return JsonResponse(result)
|
|
575
|
+
else:
|
|
576
|
+
def healthdata_view(request):
    """
    Return the health data of all harvested workloads as JSON.

    Used when the number of workloads is not fixed: every harvested workload
    is reported under its registered name, followed by a ``summary`` entry.
    """
    workloads, servers_res = harvest_healthdata(request)

    # drop the status code and keep the data (or error message) only
    result = {name: harvested[1] for name, harvested in servers_res.items()}
    populate_summary_data(result)
    return JsonResponse(result)
|
|
586
|
+
|
|
587
|
+
def workload_healthdata_view(request):
    """
    Return the resource usage of this workload as JSON.

    The request must carry the workload's secret as a bearer token. When the
    token does not match the secret known to this process, the secret is
    reloaded from the workload registry before the request is rejected,
    because another process of the same workload may have replaced it.

    Returns 403 for a missing or wrong token, 500 when the usage data cannot
    be collected, and the usage data otherwise.
    """
    global secret
    token = get_auth_bearer(request)
    if not token:
        return HttpResponseForbidden("Missing access token")

    if not secret or secret != token:
        # The registry may be missing from the cache (expired or never
        # written); treat that as "not registered" instead of crashing.
        workloads = cache.get(key_workloads) or {}
        data = workloads.get(registerhostname)
        if data:
            secret = data[1]

        if secret != token:
            return HttpResponseForbidden("Access token doesn't match")

    statuscode, data = get_workload_healthcheckdata()
    if statuscode == 200:
        return JsonResponse(data)
    else:
        return HttpResponseServerError(data)
|
|
607
|
+
|
|
608
|
+
def register_healtcheckurls():
    """
    Append the healthcheck URL patterns to the project's root URL configuration.

    The patterns are included under the ``healthcheck`` namespace; the
    included list is empty when the healthcheck is disabled.
    """
    rootconf_module = importlib.import_module(settings.ROOT_URLCONF)
    if not rootconf_module:
        raise Exception("Failed to load module '{}'".format(settings.ROOT_URLCONF))

    urlpatterns = []
    if HEALTHCHECK_ENABLED:
        urlpatterns.append(path('healthcheck/healthdata', healthdata_view, name="healthdata"))
        urlpatterns.append(path('workload/healthcheck/healthdata', workload_healthdata_view, name="workload_healthdata"))

    healthcheck_urls = include((urlpatterns, 'healthcheck'), namespace="healthcheck")
    rootconf_module.urlpatterns.append(path('', healthcheck_urls))
|
|
623
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from django.urls import path
|
|
2
|
+
from django.conf import settings
|
|
3
|
+
|
|
4
|
+
from . import healthcheck
|
|
5
|
+
|
|
6
|
+
# Application namespace of the URL patterns below.
app_name = 'healthcheck'

# The healthcheck views are exposed only while the healthcheck is enabled.
urlpatterns = [
    path('healthcheck/healthdata', healthcheck.healthdata_view, name="healthdata"),
    path('healthcheck/workload_healthdata', healthcheck.workload_healthdata_view, name="workload_healthdata"),
] if healthcheck.HEALTHCHECK_ENABLED else []
|
|
15
|
+
|
|
16
|
+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|