rda-python-metrics 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of rda-python-metrics might be problematic. Click here for more details.
- rda_python_metrics/PgIPInfo.py +188 -0
- rda_python_metrics/PgView.py +782 -0
- rda_python_metrics/__init__.py +1 -0
- rda_python_metrics/fillawsusage.py +282 -0
- rda_python_metrics/fillawsusage.usg +17 -0
- rda_python_metrics/fillcodusage.py +247 -0
- rda_python_metrics/fillcodusage.usg +21 -0
- rda_python_metrics/fillcountry.py +79 -0
- rda_python_metrics/fillendtime.py +93 -0
- rda_python_metrics/fillglobususage.py +287 -0
- rda_python_metrics/fillglobususage.usg +17 -0
- rda_python_metrics/fillipinfo.py +185 -0
- rda_python_metrics/fillipinfo.usg +18 -0
- rda_python_metrics/filloneorder.py +155 -0
- rda_python_metrics/filloneorder.usg +41 -0
- rda_python_metrics/fillrdadb.py +151 -0
- rda_python_metrics/fillrdadb.usg +32 -0
- rda_python_metrics/filltdsusage.py +289 -0
- rda_python_metrics/filltdsusage.usg +17 -0
- rda_python_metrics/filluser.py +216 -0
- rda_python_metrics/filluser.usg +16 -0
- rda_python_metrics/logarch.py +359 -0
- rda_python_metrics/logarch.usg +27 -0
- rda_python_metrics/pgperson.py +72 -0
- rda_python_metrics/pgusername.py +50 -0
- rda_python_metrics/viewallusage.py +350 -0
- rda_python_metrics/viewallusage.usg +198 -0
- rda_python_metrics/viewcheckusage.py +289 -0
- rda_python_metrics/viewcheckusage.usg +185 -0
- rda_python_metrics/viewcodusage.py +314 -0
- rda_python_metrics/viewcodusage.usg +184 -0
- rda_python_metrics/viewordusage.py +340 -0
- rda_python_metrics/viewordusage.usg +224 -0
- rda_python_metrics/viewrqstusage.py +362 -0
- rda_python_metrics/viewrqstusage.usg +217 -0
- rda_python_metrics/viewtdsusage.py +323 -0
- rda_python_metrics/viewtdsusage.usg +191 -0
- rda_python_metrics/viewwebfile.py +294 -0
- rda_python_metrics/viewwebfile.usg +212 -0
- rda_python_metrics/viewwebusage.py +371 -0
- rda_python_metrics/viewwebusage.usg +211 -0
- rda_python_metrics-1.0.4.dist-info/METADATA +18 -0
- rda_python_metrics-1.0.4.dist-info/RECORD +47 -0
- rda_python_metrics-1.0.4.dist-info/WHEEL +5 -0
- rda_python_metrics-1.0.4.dist-info/entry_points.txt +22 -0
- rda_python_metrics-1.0.4.dist-info/licenses/LICENSE +21 -0
- rda_python_metrics-1.0.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
#
|
|
3
|
+
###############################################################################
|
|
4
|
+
#
|
|
5
|
+
# Title : fillawsusage
|
|
6
|
+
# Author : Zaihua Ji, zji@ucar.edu
|
|
7
|
+
# Date : 03/11/2022
|
|
8
|
+
# 2025-03-26 transferred to package rda_python_metrics from
|
|
9
|
+
# https://github.com/NCAR/rda-database.git
|
|
10
|
+
# Purpose : python program to retrieve info from AWS logs
|
|
11
|
+
# and fill table wusages in PgSQL database dssdb.
|
|
12
|
+
#
|
|
13
|
+
# Github : https://github.com/NCAR/rda-python-metrics.git
|
|
14
|
+
#
|
|
15
|
+
###############################################################################
|
|
16
|
+
#
|
|
17
|
+
import sys
|
|
18
|
+
import re
|
|
19
|
+
import glob
|
|
20
|
+
from os import path as op
|
|
21
|
+
from rda_python_common import PgLOG
|
|
22
|
+
from rda_python_common import PgUtil
|
|
23
|
+
from rda_python_common import PgFile
|
|
24
|
+
from rda_python_common import PgDBI
|
|
25
|
+
from . import PgIPInfo
|
|
26
|
+
|
|
27
|
+
USAGE = {
|
|
28
|
+
'PGTBL' : "wusage",
|
|
29
|
+
'AWSDIR' : PgLOG.PGLOG["TRANSFER"] + "/AWSera5log",
|
|
30
|
+
'AWSLOG' : "{}/{}-00-00-00-*",
|
|
31
|
+
'PFMT' : "YYYY/MM/DD"
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
DSIDS = {'nsf-ncar-era5' : PgUtil.format_dataset_id('d633000')}
|
|
35
|
+
|
|
36
|
+
#
|
|
37
|
+
# main function to run this program
|
|
38
|
+
#
|
|
39
|
+
def main():
    """Parse the command line and fill AWS usage from the selected log files.

    Exactly one of the options -d, -p or -N selects how the log files are
    chosen; the plain arguments following it are collected as its values.
    Option -b only sets PgLOG.PGLOG['BCKGRND'].  Problems are reported via
    PgLOG.pglog() with the LGWNEX flag (presumably log-and-exit; confirm in
    PgLOG).  The program always finishes through sys.exit(0).
    """
    cli_args = sys.argv[1:]
    values = []      # plain arguments collected for the chosen option
    option = None    # one of 'd', 'p', 'N' once seen

    for arg in cli_args:
        matched = re.match(r'^-(b|d|p|N)$', arg)
        if matched is None:
            # not a known flag: either a bad flag or a value for the option
            if re.match(r'^-', arg):
                PgLOG.pglog(arg + ": Invalid Option", PgLOG.LGWNEX)
            elif option:
                values.append(arg)
            else:
                PgLOG.pglog(arg + ": Invalid Parameter", PgLOG.LGWNEX)
            continue

        flag = matched.group(1)
        if flag == 'b':
            PgLOG.PGLOG['BCKGRND'] = 1
        elif option:
            PgLOG.pglog("{}: Option -{} is present already".format(arg, option), PgLOG.LGWNEX)
        else:
            option = flag

    if not option or not values:
        PgLOG.show_usage('fillawsusage')

    PgDBI.dssdb_dbname()
    cmdstr = "fillawsusage {}".format(' '.join(cli_args))
    PgLOG.cmdlog(cmdstr)
    PgFile.change_local_directory(USAGE['AWSDIR'])

    logfiles = get_log_file_names(option, values)
    if not logfiles:
        PgLOG.pglog("No log file found for given command: " + cmdstr, PgLOG.LOGWRN)
    else:
        fill_aws_usages(logfiles)

    PgLOG.pglog(None, PgLOG.LOGWRN)
    sys.exit(0)
|
|
76
|
+
|
|
77
|
+
#
|
|
78
|
+
# get the names of the log files matching the given dates, period, or number of recent days
|
|
79
|
+
#
|
|
80
|
+
def get_log_file_names(option, params):
    """Return the sorted-per-day list of AWS log file names to process.

    option -- 'd': every entry of params is a log date;
              'N': params[0] is a number of days counted back from today;
              anything else (i.e. 'p'): params[0] is the begin date and the
              optional params[1] the end date (defaults to the current date).
    params -- the values given on the command line for the option.
    """
    def logs_for(day):
        # glob the files of one day below its PFMT-formatted sub-directory
        subdir = PgUtil.format_date(day, USAGE['PFMT'])
        return sorted(glob.glob(USAGE['AWSLOG'].format(subdir, day)))

    found = []

    if option == 'd':
        for given in params:
            found.extend(logs_for(PgUtil.format_date(given)))
        return found

    if option == 'N':
        last_day = PgUtil.curdate()
        day = PgUtil.adddate(last_day, 0, 0, -int(params[0]))
    else:
        day = PgUtil.format_date(params[0])
        last_day = PgUtil.format_date(params[1]) if len(params) > 1 else PgUtil.curdate()

    # walk the period one day at a time, both ends included
    while day <= last_day:
        found.extend(logs_for(day))
        day = PgUtil.adddate(day, 0, 0, 1)

    return found
|
|
108
|
+
|
|
109
|
+
#
|
|
110
|
+
# Fill AWS usages into yearly wusage tables of DSS PgSQL database dssdb from AWS access logs
|
|
111
|
+
#
|
|
112
|
+
def _aws_access_method(engine):
    """Map the client/engine string of a log entry to a short method code."""
    if re.match(r'^aiobotocore', engine, re.I):
        return "AIOBT"
    if re.match(r'^rclone', engine, re.I):
        return "RCLON"
    if re.match(r'^python', engine, re.I):
        return "PYTHN"
    return "WEB"


def fill_aws_usages(fnames):
    """Read the given AWS access log files and add usage records.

    Only REST.GET.OBJECT entries with status 200 or 206 for a bucket listed
    in DSIDS are used; entries whose second byte count (fsize) is below 100
    are ignored.  Consecutive 206 (partial content) entries for the same
    ip/dataset/file are merged into one record whose size is the sum of the
    parts.  Each finished record is handed to add_file_usage().

    fnames -- list of log file names; names that are not files are skipped.
    """
    entry_pattern = re.compile(r'^\w+ ([\w-]+) \[(\S+).*\] ([\d\.]+) .+ REST\.GET\.OBJECT (\S+) "GET.+" (200|206) - (\d+) (\d+) .* ".+" "(.+)" ')
    cntall = addall = 0

    for logfile in fnames:
        if not op.isfile(logfile):
            PgLOG.pglog("{}: Not exists for Gathering AWS usage".format(logfile), PgLOG.LOGWRN)
            continue
        PgLOG.pglog("Gathering usage info from {} at {}".format(logfile, PgLOG.current_datetime()), PgLOG.LOGWRN)
        aws = PgFile.open_local_file(logfile)
        if not aws: continue

        record = None     # pending (possibly multi-part) usage record
        ryear = None      # year belonging to the pending record
        pkey = None       # merge key of the pending record, None if not mergeable
        cntadd = entcnt = 0
        try:
            while True:
                line = aws.readline()
                if not line: break
                entcnt += 1
                if entcnt % 10000 == 0:
                    PgLOG.pglog("{}: {}/{} AWS log entries processed/records added".format(logfile, entcnt, cntadd), PgLOG.WARNLG)

                ms = entry_pattern.match(line)
                if not ms: continue
                (bucket, ctime, ip, wfile, stat, size, fsize, engine) = ms.groups()
                if bucket not in DSIDS: continue
                dsid = DSIDS[bucket]
                size = int(size)
                if int(fsize) < 100: continue   # ignore small files
                (year, quarter, date, time) = get_record_date_time(ctime)
                method = _aws_access_method(engine)

                # only partial-content reads are candidates for merging
                key = "{}:{}:{}".format(ip, dsid, wfile) if stat == '206' else None

                if record:
                    if key == pkey:
                        record['size'] += size
                        continue
                    # flush the pending record with ITS OWN year, not the
                    # year of the entry that just ended the sequence
                    cntadd += add_file_usage(ryear, record)

                record = {'ip' : ip, 'dsid' : dsid, 'wfile' : wfile, 'date' : date,
                          'time' : time, 'quarter' : quarter, 'size' : size,
                          'locflag' : 'A', 'method' : method}
                ryear = year
                pkey = key
                if not pkey:
                    # a complete (200) read is stored right away
                    cntadd += add_file_usage(year, record)
                    record = None

            if record: cntadd += add_file_usage(ryear, record)
        finally:
            # release the log file even if a record could not be processed
            aws.close()

        cntall += entcnt
        addall += cntadd

    PgLOG.pglog("{} AWS usage records added for {} entries at {}".format(addall, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def get_record_date_time(ctime):
    """Split a log time stamp of the form DD/Mon/YYYY:HH:MM:SS.

    Returns the tuple (year, quarter, date, time) where year is the year
    string as found in ctime, quarter is an integer 1-4 derived from the
    month, date is formatted as YYYY-MM-DD and time is the HH:MM:SS part.
    If ctime does not match, the problem is reported through PgLOG.pglog()
    with the LGEREX flag and None is returned should that call come back.
    """
    ms = re.search(r'^(\d+)/(\w+)/(\d+):(\d+:\d+:\d+)$', ctime)
    if not ms:
        PgLOG.pglog(ctime + ": Invalid date/time format", PgLOG.LGEREX)
        return None

    day = int(ms.group(1))
    month = PgUtil.get_month(ms.group(2))
    year = ms.group(3)
    # floor division: '/' would yield a float quarter such as 1.333 in Python 3
    quarter = 1 + (month - 1) // 3

    return (year, quarter, "{}-{:02}-{:02}".format(year, month, day), ms.group(4))
|
|
191
|
+
|
|
192
|
+
#
|
|
193
|
+
# Fill usage of a single online data file into table dssdb.wusage of DSS PgSQL database
|
|
194
|
+
#
|
|
195
|
+
def add_file_usage(year, logrec):
    """Store the usage of one web data file read from an AWS log.

    year   -- year used for the yearly usage tables.
    logrec -- dict with keys ip, dsid, wfile, date, time, quarter, size,
              locflag and method describing the read.

    Returns 0 when the file is unknown, an identical usage row (same wid,
    method, date and time) already exists, no wuser record can be obtained,
    or add_to_allusage() reports failure; otherwise returns whatever
    PgDBI.add_yearly_wusage() returns.
    """
    wfrec = get_wfile_wid(logrec['dsid'], logrec['wfile'])
    if not wfrec:
        return 0

    # skip reads that are recorded already in the yearly table
    yearly_table = "{}_{}".format(USAGE['PGTBL'], year)
    duplicate = "wid = {} AND method = '{}' AND date_read = '{}' AND time_read = '{}'".format(
        wfrec['wid'], logrec['method'], logrec['date'], logrec['time'])
    if PgDBI.pgget(yearly_table, "", duplicate, PgLOG.LOGWRN):
        return 0

    wurec = get_wuser_record(logrec['ip'], logrec['date'])
    if not wurec:
        return 0

    usage = {
        'wid' : wfrec['wid'],
        'dsid' : wfrec['dsid'],
        'wuid_read' : wurec['wuid'],
        'date_read' : logrec['date'],
        'time_read' : logrec['time'],
        'size_read' : logrec['size'],
        'method' : logrec['method'],
        'locflag' : logrec['locflag'],
        'ip' : logrec['ip'],
        'quarter' : logrec['quarter'],
    }

    if not add_to_allusage(year, logrec, wurec):
        return 0
    return PgDBI.add_yearly_wusage(year, usage)
|
|
220
|
+
|
|
221
|
+
def add_to_allusage(year, logrec, wurec):
    """Add one AWS read to the yearly allusage table.

    Combines the user fields (email, org_type, country) of wurec with the
    read fields of logrec, marks the record with source 'A', and returns
    the result of PgDBI.add_yearly_allusage().
    """
    allrec = {
        'email' : wurec['email'],
        'org_type' : wurec['org_type'],
        'country' : wurec['country'],
    }
    # copy the read description over unchanged
    for field in ('dsid', 'date', 'quarter', 'time', 'size', 'method', 'ip'):
        allrec[field] = logrec[field]
    allrec['source'] = 'A'

    return PgDBI.add_yearly_allusage(year, allrec)
|
|
233
|
+
|
|
234
|
+
#
|
|
235
|
+
# return the wfile record (found directly or via table wmove) upon success, a false value otherwise
|
|
236
|
+
#
|
|
237
|
+
def get_wfile_wid(dsid, wfile):
    """Look up the wfile record of a web data file.

    The file is searched in table wfile by dataset id and file name first.
    If it is not found there, table wmove is searched with the same
    condition and, on a hit, the wfile record with the wid found is
    fetched and its dsid is set to the given dsid.

    Returns the record found (as returned by PgDBI.pgget), or the false
    value of the last unsuccessful lookup.
    """
    # wfile originates from a log line; double any single quote so it cannot
    # terminate the SQL string literal early
    dscond = "dsid = '{}' AND wfile = '{}'".format(dsid, wfile.replace("'", "''"))
    pgrec = PgDBI.pgget("wfile", "*", dscond)
    if pgrec:
        return pgrec

    # not found under this name: check the moved files
    pgrec = PgDBI.pgget("wmove", "wid, dsid", dscond)
    if pgrec:
        pgrec = PgDBI.pgget("wfile", "*", "wid = {}".format(pgrec['wid']))
        if pgrec: pgrec['dsid'] = dsid

    return pgrec
|
|
249
|
+
|
|
250
|
+
# return wuser record upon success, None otherwise
|
|
251
|
+
def get_wuser_record(ip, date):
    """Find or create the wuser record standing for an anonymous AWS client.

    The user is identified by the email 'unknown@<hostname>' built from the
    hostname PgIPInfo.set_ipinfo() reports for ip.  An existing record whose
    start_date lies after date (PgUtil.diffdate() > 0) is updated with the
    earlier date together with the current org_type and country.

    Returns the existing wuser record, the newly added record (including
    its wuid), or None if no IP information is available or adding fails.
    """
    ipinfo = PgIPInfo.set_ipinfo(ip)
    if not ipinfo:
        return None

    email = 'unknown@' + ipinfo['hostname']
    emcond = "email = '{}'".format(email)
    newrec = {'org_type' : ipinfo['org_type'], 'country' : ipinfo['country']}

    found = PgDBI.pgget("wuser", 'wuid, email, org_type, country, start_date', emcond, PgLOG.LOGERR)
    if found:
        if PgUtil.diffdate(found['start_date'], date) > 0:
            newrec['start_date'] = date
            found['start_date'] = date
            PgDBI.pgupdt('wuser', newrec, emcond)
        return found

    # user is not known yet, add one in
    newrec['email'] = email
    newrec['stat_flag'] = 'A'
    newrec['start_date'] = date
    wuid = PgDBI.pgadd("wuser", newrec, PgLOG.LOGERR|PgLOG.AUTOID)
    if not wuid:
        return None

    newrec['wuid'] = wuid
    PgLOG.pglog("{} Added as wuid({})".format(email, wuid), PgLOG.LGWNEM)
    return newrec
|
|
278
|
+
|
|
279
|
+
#
|
|
280
|
+
# call main() to start program
|
|
281
|
+
#
|
|
282
|
+
if __name__ == "__main__": main()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
|
|
2
|
+
Retrieves usage information from AWS Server logs under directory
|
|
3
|
+
/gpfs/fs1/collections/rda/transfer/AWSera5log/ to fill table 'wusage' in
|
|
4
|
+
database 'dssdb'.
|
|
5
|
+
|
|
6
|
+
Usage: fillawsusage [-b] [-d LogFileDates] [-N NumberDay] [-p BeginDate [EndDate]]
|
|
7
|
+
|
|
8
|
+
select option, -d, -N or -p to run this application.
|
|
9
|
+
|
|
10
|
+
- Option -b, log process information into logfile only;
|
|
11
|
+
|
|
12
|
+
- Option -d, retrieve usage info from given log file dates;
|
|
13
|
+
|
|
14
|
+
- Option -N, retrieve usage info in recent NumberDay days;
|
|
15
|
+
|
|
16
|
+
- Option -p, retrieve usage info between given period. For missing EndDate,
|
|
17
|
+
it defaults to the current date.
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
#
|
|
3
|
+
###############################################################################
|
|
4
|
+
#
|
|
5
|
+
# Title : fillcodusage
|
|
6
|
+
# Author : Zaihua Ji, zji@ucar.edu
|
|
7
|
+
# Date : 03/11/2022
|
|
8
|
+
# 2025-03-26 transferred to package rda_python_metrics from
|
|
9
|
+
# https://github.com/NCAR/rda-database.git
|
|
10
|
+
# Purpose : python program to retrieve info from web logs
|
|
11
|
+
# and fill table codusage in PgSQL database dssdb.
|
|
12
|
+
#
|
|
13
|
+
# Github : https://github.com/NCAR/rda-python-metrics.git
|
|
14
|
+
#
|
|
15
|
+
###############################################################################
|
|
16
|
+
#
|
|
17
|
+
import sys
|
|
18
|
+
import re
|
|
19
|
+
import glob
|
|
20
|
+
from os import path as op
|
|
21
|
+
from rda_python_common import PgLOG
|
|
22
|
+
from rda_python_common import PgUtil
|
|
23
|
+
from rda_python_common import PgFile
|
|
24
|
+
from rda_python_common import PgDBI
|
|
25
|
+
|
|
26
|
+
# the define options for gathering COD data usage, one at a time
|
|
27
|
+
MONTH = 0x02 # get COD data usages for given months
|
|
28
|
+
YEARS = 0x04 # get COD data usages for given years
|
|
29
|
+
NDAYS = 0x08 # get COD data usages in recent number of days
|
|
30
|
+
FILES = 0x10 # get given file names
|
|
31
|
+
GTALL = 0x20 # get all data files of read
|
|
32
|
+
MASKS = (MONTH|YEARS|NDAYS|FILES)
|
|
33
|
+
|
|
34
|
+
USAGE = {
|
|
35
|
+
'OPTION' : 0,
|
|
36
|
+
'PGTBL' : "codusage",
|
|
37
|
+
'WEBLOG' : "/var/log/httpd",
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
USERS = {} # cache user info for aid
|
|
41
|
+
|
|
42
|
+
#
|
|
43
|
+
# main function to run this program
|
|
44
|
+
#
|
|
45
|
+
def main():
    """Parse the command line and fill COD usage from the selected web logs.

    One of -a, -f, -m, -N or -y selects the logs; the first one seen is
    stored in USAGE['OPTION'] and the plain arguments after it are collected
    as its values (-a takes none).  Option -b only sets
    PgLOG.PGLOG['BCKGRND'].  For -N the day count is turned into a date
    limit plus the list of months (YYYY-MM) reaching back to that limit,
    and processing continues as for -m.  The program always finishes
    through sys.exit(0).
    """
    option_flags = {"-a" : GTALL, "-f" : FILES, "-m" : MONTH, "-y" : YEARS, "-N" : NDAYS}
    cli_args = sys.argv[1:]
    values = []       # plain arguments collected for the chosen option
    datelimit = ''

    for arg in cli_args:
        if arg == "-b":
            PgLOG.PGLOG['BCKGRND'] = 1
        elif re.match(r'^-[afmNy]$', arg) and USAGE['OPTION'] == 0:
            if arg in option_flags:
                USAGE['OPTION'] = option_flags[arg]
                if arg == "-a":
                    values = ['']   # single empty prefix: glob matches every log
        elif re.match(r'^-', arg):
            PgLOG.pglog(arg + ": Invalid Option", PgLOG.LGWNEX)
        elif USAGE['OPTION']&MASKS:
            values.append(arg)
        else:
            PgLOG.pglog(arg + ": Invalid Parameter", PgLOG.LGWNEX)

    if not USAGE['OPTION'] or not values:
        PgLOG.show_usage('fillcodusage')

    PgDBI.dssdb_dbname()
    PgLOG.cmdlog("fillcodusage {}".format(' '.join(cli_args)))

    if USAGE['OPTION']&NDAYS:
        day = PgUtil.curdate()
        datelimit = PgUtil.adddate(day, 0, 0, -int(values[0]))
        USAGE['OPTION'] = MONTH
        values = []
        while day >= datelimit:
            (year, month, dom) = day.split('-')
            values.append("{}-{}".format(year, month))
            # subtracting the day of month lands on the last day of the previous month
            day = PgUtil.adddate(day, 0, 0, -int(dom))

    fill_cod_usages(USAGE['OPTION'], values, datelimit)

    PgLOG.pglog(None, PgLOG.LOGWRN|PgLOG.SNDEML)   # send email out if any

    sys.exit(0)
|
|
95
|
+
|
|
96
|
+
#
|
|
97
|
+
# Fill COD usages into table dssdb.codusage of DSS PgSQL database from cod access logs
|
|
98
|
+
#
|
|
99
|
+
def fill_cod_usages(option, inputs, datelimit):
    """Read web access logs and add custom OPeNDAP (COD) usage records.

    option    -- bit flag selecting how inputs are interpreted: FILES (log
                 file names), MONTH ('YYYY-MM' values), otherwise a year
                 prefix used in a glob (GTALL | YEARS).
    inputs    -- list of values for the option.
    datelimit -- if not empty, log entries dated before it are ignored.

    Successful (status 200) 'GET /opendap/<aid>.dods' requests are summed
    up per access id and day; whenever the day changes, and at the end of
    each log file, the accumulated records are passed to
    add_usage_records().
    """
    request_pattern = re.compile(r'GET /opendap/(\w{10})\.dods.*\s200\s+(\d+).{6}([^"]+)')
    client_pattern = re.compile(r'^([\d\.]+).+\[(\d+)/(\w+)/(\d+):([\d:]+)')
    cntall = cntadd = 0

    for item in inputs:
        # get log file names
        if option&FILES:
            logfiles = [item]
        elif option&MONTH:
            tms = item.split('-')
            yrmn = "{}{:02}".format(tms[0], int(tms[1]))
            logfiles = ["{}/{}/access_log".format(USAGE['WEBLOG'], yrmn)]
        else:   # GTALL | YEARS
            yrmn = item + "*"
            logfiles = glob.glob("{}/{}/access_log".format(USAGE['WEBLOG'], yrmn))

        for logfile in logfiles:
            if not op.isfile(logfile):
                PgLOG.pglog("{}: Not exists for Gathering custom OPeNDAP usage".format(logfile), PgLOG.LOGWRN)
                continue
            PgLOG.pglog("Gathering custom OPeNDAP usage info from {} at {}".format(logfile, PgLOG.current_datetime()), PgLOG.LOGWRN)
            cod = PgFile.open_local_file(logfile)
            if not cod: continue

            pdate = ''      # date the accumulated records belong to
            records = {}    # usage per access id for pdate
            try:
                while True:
                    line = cod.readline()
                    if not line: break
                    cntall += 1
                    if cntall % 20000 == 0:
                        PgLOG.pglog("{}/{} COD log entries processed/records added".format(cntall, cntadd), PgLOG.WARNLG)

                    ms = request_pattern.search(line)
                    if not ms: continue
                    aid = ms.group(1)
                    size = int(ms.group(2))
                    engine = ms.group(3)
                    if not (aid in USERS or cache_users(aid)): continue
                    ms = client_pattern.match(line)
                    if not ms: continue
                    ip = ms.group(1)
                    ctime = ms.group(5)
                    cdate = "{}-{:02}-{:02}".format(ms.group(4), PgUtil.get_month(ms.group(3)), int(ms.group(2)))
                    if pdate != cdate:
                        if records:
                            # the accumulated records belong to the previous
                            # date, so they must be stored under pdate
                            cntadd += add_usage_records(records, pdate)
                            records = {}
                        pdate = cdate

                    if datelimit and cdate < datelimit: continue

                    if aid in records:
                        records[aid]['size'] += size
                        records[aid]['count'] += 1
                        USERS[aid]['etime'] = ctime
                    else:
                        records[aid] = {'ip' : ip, 'count' : 1, 'email' : USERS[aid]['email'],
                                        'dsid' : USERS[aid]['dsid'], 'size' : size, 'engine' : engine}
                        USERS[aid]['etime'] = ctime
                        USERS[aid]['btime'] = ctime
            finally:
                # release the log file even if a line could not be processed
                cod.close()
            if records: cntadd += add_usage_records(records, pdate)

    PgLOG.pglog("{} COD usage records added for {} entries at {}".format(cntadd, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)
|
|
172
|
+
|
|
173
|
+
def add_usage_records(records, date):
    """Add the COD usage accumulated for one day to the database.

    records -- dict keyed by access id; each value is a usage dict holding
               at least 'email' and is completed in place with org_type,
               country, date, time, quarter, aid and period.
    date    -- the day (YYYY-MM-DD) the records belong to.

    Access ids that already have a row for the date in table
    USAGE['PGTBL'], or whose user cannot be resolved, are skipped.
    Returns the sum of the PgDBI.pgadd() results; 0 if date is malformed.
    """
    ms = re.match(r'(\d+)-(\d+)-', date)
    if not ms: return 0
    year = ms.group(1)
    quarter = 1 + (int(ms.group(2)) - 1) // 3
    cnt = 0

    for aid in records:
        if PgDBI.pgget(USAGE['PGTBL'], '', "aid = '{}' AND date = '{}'".format(aid, date), PgLOG.LGEREX): continue
        record = records[aid]
        if record['email'] == '-':
            record['org_type'] = record['country'] = '-'
        else:
            wuid = PgDBI.check_wuser_wuid(record['email'], date)
            # was a bare 'next' (a no-op expression), which fell through and
            # queried wuser with an empty wuid
            if not wuid: continue
            pgrec = PgDBI.pgget("wuser", "org_type, country", "wuid = {}".format(wuid), PgLOG.LGWNEX)
            if not pgrec: continue
            record['org_type'] = pgrec['org_type']
            record['country'] = pgrec['country']

        record['date'] = date
        record['time'] = USERS[aid]['btime']
        record['quarter'] = quarter

        if add_to_allusage(record, year):
            record['aid'] = aid
            record['period'] = access_period(USERS[aid]['etime'], record['time'])
            cnt += PgDBI.pgadd(USAGE['PGTBL'], record, PgLOG.LOGWRN)

    return cnt
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def add_to_allusage(pgrec, year):
    """Add one COD usage record to the yearly allusage table.

    All fields of pgrec except 'engine' and 'count' are copied into a
    record preset with method 'COD' and source 'C'.  Returns the result of
    PgDBI.add_yearly_allusage().
    """
    excluded = re.compile(r'^(engine|count)$')
    allrec = {'method' : 'COD', 'source' : 'C'}
    allrec.update({fld: val for fld, val in pgrec.items() if not excluded.match(fld)})

    return PgDBI.add_yearly_allusage(year, allrec)
|
|
215
|
+
|
|
216
|
+
def cache_users(aid):
    """Cache dataset id and user email of a custom OPeNDAP access id.

    Reads the row with ID = aid from metautil.custom_dap_history, extracts
    the dataset number from its rinfo field ('dsnum=...;') and stores
    {'dsid', 'email'} under USERS[aid].

    Returns 1 if the access id was cached, 0 otherwise.
    """
    history = PgDBI.pgget("metautil.custom_dap_history", "*", "ID = '{}'".format(aid), PgLOG.LGEREX)
    if not history:
        return 0

    found = re.search(r'dsnum=(\d+\.\d|[a-z]\d{6});', history['rinfo'])
    if not found:
        return 0

    USERS[aid] = {'dsid' : PgUtil.format_dataset_id(found.group(1)), 'email' : history['duser']}
    return 1
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def access_period(etime, btime):
    """Return the number of seconds between two HH:MM:SS times of one day.

    etime -- end time; if it holds no H:M:S part, 86400 (a full day) is
             used as the end instead.
    btime -- begin time; if it holds no H:M:S part, nothing is subtracted.
    """
    def seconds_of(clock):
        # seconds since midnight, or None if no H:M:S pattern is present
        found = re.search(r'(\d+):(\d+):(\d+)', clock)
        if not found:
            return None
        hours, minutes, seconds = (int(part) for part in found.groups())
        return hours*3600 + minutes*60 + seconds

    end = seconds_of(etime)
    begin = seconds_of(btime)

    period = 86400 if end is None else end
    if begin is not None:
        period -= begin

    return period
|
|
243
|
+
|
|
244
|
+
#
|
|
245
|
+
# call main() to start program
|
|
246
|
+
#
|
|
247
|
+
if __name__ == "__main__": main()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
Retrieves usage information from RDA Web logs under /var/log/httpd to
|
|
3
|
+
fill table 'codusage' in PgSQL database 'dssdb'.
|
|
4
|
+
|
|
5
|
+
Usage: fillcodusage [-a] [-b] [-f LogFileNames] [-m MonthList] [-N NumberDay] [-y YearList]
|
|
6
|
+
|
|
7
|
+
select one of the options, -a, -f, -m, -N or -y each time to run
|
|
8
|
+
this application.
|
|
9
|
+
|
|
10
|
+
- Option -b, log process information into logfile only;
|
|
11
|
+
|
|
12
|
+
- Option -a, retrieve usages for all available logs;
|
|
13
|
+
|
|
14
|
+
- Option -f, retrieve usage info from given log file names;
|
|
15
|
+
|
|
16
|
+
- Option -m, retrieve usage info in given months;
|
|
17
|
+
|
|
18
|
+
- Option -N, retrieve usage info in recent NumberDay days;
|
|
19
|
+
|
|
20
|
+
- Option -y, retrieve usage info in given years.
|
|
21
|
+
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
#
|
|
3
|
+
###############################################################################
|
|
4
|
+
#
|
|
5
|
+
# Title : fillcountry
|
|
6
|
+
# Author : Zaihua Ji, zji@ucar.edu
|
|
7
|
+
# Date : 2022-03-11
|
|
8
|
+
# 2025-03-26 transferred to package rda_python_metrics from
|
|
9
|
+
# https://github.com/NCAR/rda-database.git
|
|
10
|
+
# Purpose : python program to fill missing country field from email info for
|
|
11
|
+
# given table name
|
|
12
|
+
#
|
|
13
|
+
# Github : https://github.com/NCAR/rda-python-metrics.git
|
|
14
|
+
#
|
|
15
|
+
###############################################################################
|
|
16
|
+
#
|
|
17
|
+
import sys
|
|
18
|
+
import re
|
|
19
|
+
from rda_python_common import PgLOG
|
|
20
|
+
from rda_python_common import PgIMMA
|
|
21
|
+
from rda_python_common import PgUtil
|
|
22
|
+
from rda_python_common import PgDBI
|
|
23
|
+
|
|
24
|
+
#
|
|
25
|
+
# main function to run this program
|
|
26
|
+
#
|
|
27
|
+
def main():
    """Parse the command line and fill missing countries of one table.

    Accepts the option -b (sets PgLOG.PGLOG['BCKGRND']) and exactly one
    table name, which must be one of allusage, user or wuser.  Without a
    table name a one-line usage is printed.  Problems are reported through
    PgLOG.pglog() with the LGEREX flag.  The program always finishes
    through sys.exit(0).
    """
    allowed = ['allusage', 'user', 'wuser']
    cli_args = sys.argv[1:]
    table = None

    # check command line
    for arg in cli_args:
        if arg == "-b":
            PgLOG.PGLOG['BCKGRND'] = 1
            continue
        if re.match(r'^-.*', arg):
            PgLOG.pglog(arg + ": Unknown Option", PgLOG.LGEREX)
        elif table:
            PgLOG.pglog(arg + ": one table name at a time", PgLOG.LGEREX)
        else:
            table = arg

    if not table:
        print("Usage: fillcountry TableName\n")
        sys.exit(0)
    if table not in allowed:
        PgLOG.pglog("{}: table name must be ({})".format(table, '|'.join(allowed)), PgLOG.LGEREX)

    PgDBI.dssdb_dbname()
    PgLOG.cmdlog("fillcountry {}".format(' '.join(cli_args)))

    process_countries(table)

    sys.exit(0)
|
|
56
|
+
|
|
57
|
+
def process_countries(table):
    """Set the country of every record in table whose country is NULL.

    The emails of the affected records are fetched with PgDBI.pgmget();
    for each one the country given by PgDBI.email_to_country() is written
    to the rows with that email and a NULL country.  Progress is logged
    after every 500 emails and a summary at the end.
    """
    pgrecs = PgDBI.pgmget(table, "email", "country IS NULL", PgLOG.LOGWRN)
    emails = pgrecs['email'] if pgrecs else []
    total = len(emails)

    PgLOG.pglog("Set {} record(s) for missing country in table {}".format(total, table), PgLOG.LOGWRN)
    if total == 0:
        return

    modified = 0
    for done, email in enumerate(emails):
        if done and done % 500 == 0:
            PgLOG.pglog("{}/{} Records modified/processed".format(modified, done), PgLOG.WARNLG)

        update = {'country' : PgDBI.email_to_country(email)}
        condition = "email = '{}' AND country IS NULL".format(email)
        modified += PgDBI.pgupdt(table, update, condition, PgLOG.LOGWRN)

    PgLOG.pglog("{} Record(s) modified in table '{}'".format(modified, table), PgLOG.LOGWRN)
|
|
75
|
+
|
|
76
|
+
#
|
|
77
|
+
# call main() to start program
|
|
78
|
+
#
|
|
79
|
+
if __name__ == "__main__": main()
|