rda-python-metrics 1.0.33__tar.gz → 1.0.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rda-python-metrics might be problematic.

Files changed (62)
  1. {rda_python_metrics-1.0.33/src/rda_python_metrics.egg-info → rda_python_metrics-1.0.34}/PKG-INFO +1 -1
  2. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/pyproject.toml +1 -1
  3. rda_python_metrics-1.0.34/src/rda_python_metrics/fillawsusage.py +209 -0
  4. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillawsusage.usg +1 -1
  5. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillosdfusage.py +59 -29
  6. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillosdfusage.usg +1 -1
  7. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34/src/rda_python_metrics.egg-info}/PKG-INFO +1 -1
  8. rda_python_metrics-1.0.33/src/rda_python_metrics/fillawsusage.py +0 -254
  9. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/LICENSE +0 -0
  10. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/MANIFEST.in +0 -0
  11. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/README.md +0 -0
  12. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/setup.cfg +0 -0
  13. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/PgIPInfo.py +0 -0
  14. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/PgView.py +0 -0
  15. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/__init__.py +0 -0
  16. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillcdgusage.py +0 -0
  17. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillcdgusage.usg +0 -0
  18. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillcodusage.py +0 -0
  19. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillcodusage.usg +0 -0
  20. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillcountry.py +0 -0
  21. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillendtime.py +0 -0
  22. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillgdexusage.py +0 -0
  23. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillgdexusage.usg +0 -0
  24. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillglobususage.py +0 -0
  25. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillglobususage.usg +0 -0
  26. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillipinfo.py +0 -0
  27. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillipinfo.usg +0 -0
  28. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filloneorder.py +0 -0
  29. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filloneorder.usg +0 -0
  30. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillrdadb.py +0 -0
  31. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/fillrdadb.usg +0 -0
  32. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filltdsusage.py +0 -0
  33. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filltdsusage.usg +0 -0
  34. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filluser.py +0 -0
  35. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/filluser.usg +0 -0
  36. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/logarch.py +0 -0
  37. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/logarch.usg +0 -0
  38. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/pgperson.py +0 -0
  39. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/pgsyspath.py +0 -0
  40. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/pgusername.py +0 -0
  41. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewallusage.py +0 -0
  42. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewallusage.usg +0 -0
  43. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewcheckusage.py +0 -0
  44. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewcheckusage.usg +0 -0
  45. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewcodusage.py +0 -0
  46. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewcodusage.usg +0 -0
  47. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewordusage.py +0 -0
  48. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewordusage.usg +0 -0
  49. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewrqstusage.py +0 -0
  50. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewrqstusage.usg +0 -0
  51. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewtdsusage.py +0 -0
  52. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewtdsusage.usg +0 -0
  53. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewwebfile.py +0 -0
  54. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewwebfile.usg +0 -0
  55. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewwebusage.py +0 -0
  56. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics/viewwebusage.usg +0 -0
  57. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics.egg-info/SOURCES.txt +0 -0
  58. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics.egg-info/dependency_links.txt +0 -0
  59. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics.egg-info/entry_points.txt +0 -0
  60. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics.egg-info/requires.txt +0 -0
  61. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/src/rda_python_metrics.egg-info/top_level.txt +0 -0
  62. {rda_python_metrics-1.0.33 → rda_python_metrics-1.0.34}/tests/test_metrics.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: rda_python_metrics
- Version: 1.0.33
+ Version: 1.0.34
  Summary: RDA Python Package to gather and view data usage metrics
  Author-email: Zaihua Ji <zji@ucar.edu>
  Project-URL: Homepage, https://github.com/NCAR/rda-python-metrics
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "rda_python_metrics"
- version = "1.0.33"
+ version = "1.0.34"
  authors = [
    { name="Zaihua Ji", email="zji@ucar.edu" },
  ]
@@ -0,0 +1,209 @@
+ #!/usr/bin/env python3
+ #
+ ###############################################################################
+ #
+ #     Title : fillawsusage
+ #    Author : Zaihua Ji, zji@ucar.edu
+ #      Date : 03/11/2022
+ #             2025-03-26 transferred to package rda_python_metrics from
+ #             https://github.com/NCAR/rda-database.git
+ #   Purpose : python program to retrieve info from AWS logs
+ #             and fill table wusages in PgSQL database dssdb.
+ #
+ #    Github : https://github.com/NCAR/rda-pythn-metrics.git
+ #
+ ###############################################################################
+ #
+ import sys
+ import re
+ import glob
+ from os import path as op
+ from rda_python_common import PgLOG
+ from rda_python_common import PgUtil
+ from rda_python_common import PgFile
+ from rda_python_common import PgDBI
+ from . import PgIPInfo
+
+ USAGE = {
+     'PGTBL' : "awsusage",
+     'AWSDIR' : PgLOG.PGLOG["TRANSFER"] + "/AWSera5log",
+     'AWSLOG' : "{}/{}-00-00-00-*",
+     'PFMT' : "YYYY/MM/DD"
+ }
+
+ DSIDS = {'nsf-ncar-era5' : 'd633000'}
+
+ #
+ # main function to run this program
+ #
+ def main():
+
+     params = [] # array of input values
+     argv = sys.argv[1:]
+     option = None
+
+     for arg in argv:
+         ms = re.match(r'^-(b|d|p|N)$', arg)
+         if ms:
+             opt = ms.group(1)
+             if opt == 'b':
+                 PgLOG.PGLOG['BCKGRND'] = 1
+             elif option:
+                 PgLOG.pglog("{}: Option -{} is present already".format(arg, option), PgLOG.LGWNEX)
+             else:
+                 option = opt
+         elif re.match(r'^-', arg):
+             PgLOG.pglog(arg + ": Invalid Option", PgLOG.LGWNEX)
+         elif option:
+             params.append(arg)
+         else:
+             PgLOG.pglog(arg + ": Invalid Parameter", PgLOG.LGWNEX)
+
+     if not (option and params): PgLOG.show_usage('fillawsusage')
+
+     PgDBI.dssdb_dbname()
+     cmdstr = "fillawsusage {}".format(' '.join(argv))
+     PgLOG.cmdlog(cmdstr)
+     PgFile.change_local_directory(USAGE['AWSDIR'])
+     filenames = get_log_file_names(option, params)
+     if filenames:
+         fill_aws_usages(filenames)
+     else:
+         PgLOG.pglog("No log file found for given command: " + cmdstr, PgLOG.LOGWRN)
+
+     PgLOG.pglog(None, PgLOG.LOGWRN)
+     sys.exit(0)
+
+ #
+ # get the log file dates
+ #
+ def get_log_file_names(option, params):
+
+     filenames = {}
+     if option == 'd':
+         for dt in params:
+             pdate = PgUtil.format_date(dt)
+             pd = PgUtil.format_date(pdate, USAGE['PFMT'])
+             fname = USAGE['AWSLOG'].format(pd, pdate)
+             fnames = glob.glob(fname)
+             if fnames: filenames[pdate] = sorted(fnames)
+     else:
+         if option == 'N':
+             edate = PgUtil.curdate()
+             pdate = PgUtil.adddate(edate, 0, 0, -int(params[0]))
+         else:
+             pdate = PgUtil.format_date(params[0])
+             if len(params) > 1:
+                 edate = PgUtil.format_date(params[1])
+             else:
+                 edate = PgUtil.curdate()
+         while pdate < edate:
+             pd = PgUtil.format_date(pdate, USAGE['PFMT'])
+             fname = USAGE['AWSLOG'].format(pd, pdate)
+             fnames = glob.glob(fname)
+             if fnames: filenames[pdate] = sorted(fnames)
+             pdate = PgUtil.adddate(pdate, 0, 0, 1)
+
+     return filenames
+
+ #
+ # Fill AWS usages into table dssdb.awsusage of DSS PgSQL database from aws access logs
+ #
+ def fill_aws_usages(filenames):
+
+     year = cntall = addall = 0
+     for pdate in filenames:
+         fnames = filenames[pdate]
+         records = {}
+         cntadd = entcnt = 0
+         for logfile in fnames:
+             if not op.isfile(logfile):
+                 PgLOG.pglog("{}: Not exists for Gathering AWS usage".format(logfile), PgLOG.LOGWRN)
+                 continue
+             PgLOG.pglog("Gathering AWS usage info from {} at {}".format(logfile, PgLOG.current_datetime()), PgLOG.LOGWRN)
+             aws = PgFile.open_local_file(logfile)
+             if not aws: continue
+             while True:
+                 line = aws.readline()
+                 if not line: break
+                 entcnt += 1
+                 if entcnt%20000 == 0:
+                     dcnt = len(records)
+                     PgLOG.pglog("{}: {}/{} AWS log entries processed/records to add".format(pdate, entcnt, dcnt), PgLOG.WARNLG)
+
+                 ms = re.match(r'^\w+ ([\w-]+) \[(\S+).*\] ([\d\.]+) .+ REST\.GET\.OBJECT \S+ "GET.+" \d+ - (\d+) \d+ .* ".+" "(.+)" ', line)
+                 if not ms: continue
+                 values = list(ms.groups())
+                 if values[0] not in DSIDS: continue
+                 dsid = DSIDS[values[0]]
+                 size = int(values[3])
+                 ip = values[2]
+                 engine = values[4]
+                 moff = engine.find('/')
+                 if moff > 0:
+                     if moff > 20: moff = 20
+                     method = engine[0:moff].upper()
+                 else:
+                     method = "AWS"
+                 key = "{}:{}:{}".format(ip, dsid, method)
+                 if key in records:
+                     records[key]['size'] += size
+                     records[key]['fcount'] += 1
+                 else:
+                     (year, quarter, date, time) = get_record_date_time(values[1])
+                     iprec = PgIPInfo.get_missing_ipinfo(ip)
+                     if not iprec: continue
+                     records[key] = {'ip' : ip, 'dsid' : dsid, 'date' : date, 'time' : time, 'quarter' : quarter,
+                                     'size' : size, 'fcount' : 1, 'method' : method, 'engine' : engine,
+                                     'org_type' : iprec['org_type'], 'country' : iprec['country'],
+                                     'region' : iprec['region'], 'email' : iprec['email']}
+             aws.close()
+         if records: cntadd = add_usage_records(records, year)
+         PgLOG.pglog("{}: {} AWS usage records added for {} entries at {}".format(pdate, cntadd, entcnt, PgLOG.current_datetime()), PgLOG.LOGWRN)
+         cntall += entcnt
+         if cntadd:
+             addall += cntadd
+     if addall > cntadd:
+         PgLOG.pglog("{} AWS usage records added for {} entries at {}".format(addall, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)
+
+ def get_record_date_time(ctime):
+
+     ms = re.search(r'^(\d+)/(\w+)/(\d+):(\d+:\d+:\d+)$', ctime)
+     if ms:
+         d = int(ms.group(1))
+         m = PgUtil.get_month(ms.group(2))
+         y = ms.group(3)
+         t = ms.group(4)
+         q = 1 + int((m-1)/3)
+         return (y, q, "{}-{:02}-{:02}".format(y, m, d), t)
+     else:
+         PgLOG.pglog(ctime + ": Invalid date/time format", PgLOG.LGEREX)
+
+ def add_usage_records(records, year):
+
+     cnt = 0
+     for key in records:
+         record = records[key]
+         cond = "date = '{}' AND time = '{}' AND ip = '{}' AND dsid = '{}'".format(record['date'], record['time'], record['ip'], record['dsid'])
+         if PgDBI.pgget(USAGE['PGTBL'], '', cond, PgLOG.LGEREX): continue
+         if add_to_allusage(year, record):
+             cnt += PgDBI.pgadd(USAGE['PGTBL'], record, PgLOG.LOGWRN)
+
+     return cnt
+
+
+ def add_to_allusage(year, pgrec):
+
+     record = {'source' : 'A'}
+     flds = ['ip', 'dsid', 'date', 'time', 'quarter', 'size', 'method',
+             'org_type', 'country', 'region', 'email']
+
+     for fld in flds:
+         record[fld] = pgrec[fld]
+
+     return PgDBI.add_yearly_allusage(year, record)
+
+ #
+ # call main() to start program
+ #
+ if __name__ == "__main__": main()
@@ -1,6 +1,6 @@

  Retrieves usage information from AWS Server logs under directory
- ../rda/transer/AWSera5log/ to fill table 'wusage' in database 'rdadb'.
+ ../rda/transer/AWSera5log/ to fill table 'awsusage' in database 'rdadb'.

  Usage: fillawsusage [-b] [-d LogFileDates] [-N NumberDay] [-p BeginDate [Enddate]]

@@ -22,7 +22,7 @@ from rda_python_common import PgSplit
  from . import PgIPInfo

  USAGE = {
-     'OSDFTBL' : "wusage",
+     'OSDFTBL' : "osdfusage",
      'OSDFDIR' : PgLOG.PGLOG["DSSDATA"] + "/work/zji/osdflogs/",
      'OSDFGET' : 'wget -m -nH -np -nd https://pelicanplatform.org/pelican-access-logs/ncar-access-log/',
      'OSDFLOG' : "{}-cache.log", # YYYY-MM-DD-cache.log
@@ -100,9 +100,7 @@ def get_log_file_names(option, params, datelimits):
  #
  def fill_osdf_usages(fnames):

-     cntall = addall = 0
-
-     fcnt = len(fnames)
+     year = cntall = addall = 0
      for logfile in fnames:
          linfo = PgFile.check_local_file(logfile)
          if not linfo:
@@ -119,46 +117,54 @@ def fill_osdf_usages(fnames):
          PgLOG.pglog("{}: Gathering OSDF usage at {}".format(logfile, PgLOG.current_datetime()), PgLOG.LOGWRN)
          osdf = PgFile.open_local_file(logfile)
          if not osdf: continue
+         records = {}
          cntadd = entcnt = 0
-         pkey = None
          while True:
              line = osdf.readline()
              if not line: break
              entcnt += 1
-             if entcnt%10000 == 0:
-                 PgLOG.pglog("{}: {}/{} OSDF log entries processed/records added".format(logfile, entcnt, cntadd), PgLOG.WARNLG)
+             if entcnt%20000 == 0:
+                 dcnt = len(records)
+                 PgLOG.pglog("{}: {}/{} OSDF log entries processed/records added".format(logfile, entcnt, dcnt), PgLOG.WARNLG)

-             ms = re.match(r'^\[(\S+)\] \[Objectname:\/ncar\/rda\/([a-z]\d{6})\/(\S+)\].* \[Host:(\S+)\].* \[AppInfo:(\S+)\].* \[Read:(\d+)\]', line)
+             ms = re.match(r'^\[(\S+)\] \[Objectname:\/ncar\/rda\/([a-z]\d{6})\/\S+\].* \[Site:(\S+)\].* \[Host:(\S+)\].* \[AppInfo:(\S+)\].* \[Read:(\d+)\]', line)
              if not ms: continue
              dt = ms.group(1)
              dsid = ms.group(2)
-             wfile = ms.group(3)
+             site = ms.group(3)
              ip = ms.group(4)
              if ip == 'N/A': ip = '0.0.0.0'
              engine = ms.group(5)
              size = int(ms.group(6))
-             (year, quarter, date, time) = get_record_date_time(dt)
-             locflag = 'C'
-             if re.match(r'^curl', engine, re.I):
-                 method = "CURL"
-             elif re.match(r'^wget', engine, re.I):
-                 method = "WGET"
-             elif re.match(r'^python', engine, re.I):
-                 method = "PYTHN"
-             elif re.match(r'^N/A', engine, re.I):
-                 method = "N/A"
+             if re.match(r'^N/A', engine, re.I):
+                 method = "OSDF"
              else:
-                 method = "WEB"
-             method = "OSDF"
-
-             record = {'ip' : ip, 'dsid' : dsid, 'wfile' : wfile, 'date' : date,
-                       'time' : time, 'quarter' : quarter, 'size' : size,
-                       'locflag' : locflag, 'method' : method}
-             cntadd += add_file_usage(year, record)
+                 moff = engine.find('/')
+                 if moff > 0:
+                     if moff > 20: moff = 20
+                     method = engine[0:moff].upper()
+                 else:
+                     method = "OSDF"
+             key = "{}:{}:{}".format(ip, dsid, method)
+             if key in records:
+                 records[key]['size'] += size
+                 records[key]['fcount'] += 1
+             else:
+                 (year, quarter, date, time) = get_record_date_time(dt)
+                 iprec = PgIPInfo.get_missing_ipinfo(ip)
+                 if not iprec: continue
+                 records[key] = {'ip' : ip, 'dsid' : dsid, 'date' : date, 'time' : time, 'quarter' : quarter,
+                                 'size' : size, 'fcount' : 1, 'method' : method, 'engine' : engine,
+                                 'org_type' : iprec['org_type'], 'country' : iprec['country'],
+                                 'region' : iprec['region'], 'email' : iprec['email'], 'site' : site}
          osdf.close()
+         if records: cntadd = add_usage_records(records, year)
+         PgLOG.pglog("{}: {} OSDF usage records added for {} entries at {}".format(logfile, cntadd, entcnt, PgLOG.current_datetime()), PgLOG.LOGWRN)
          cntall += entcnt
-         addall += cntadd
-     PgLOG.pglog("{} OSDF usage records added for {} entries at {}".format(addall, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)
+         if cntadd:
+             addall += cntadd
+     if addall > cntadd:
+         PgLOG.pglog("{} OSDF usage records added for {} entries at {}".format(addall, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)


  def get_record_date_time(ctime):
@@ -174,6 +180,30 @@ def get_record_date_time(ctime):
      else:
          PgLOG.pglog(ctime + ": Invalid date/time format", PgLOG.LGEREX)

+ def add_usage_records(records, year):
+
+     cnt = 0
+     for key in records:
+         record = records[key]
+         cond = "date = '{}' AND time = '{}' AND ip = '{}' AND dsid = '{}'".format(record['date'], record['time'], record['ip'], record['dsid'])
+         if PgDBI.pgget(USAGE['OSDFTBL'], '', cond, PgLOG.LGEREX): continue
+         if add_to_allusage(year, record):
+             cnt += PgDBI.pgadd(USAGE['OSDFTBL'], record, PgLOG.LOGWRN)
+
+     return cnt
+
+ def add_to_allusage(year, pgrec):
+
+     record = {'source' : 'P'}
+     flds = ['ip', 'dsid', 'date', 'time', 'quarter', 'size', 'method',
+             'org_type', 'country', 'region', 'email']
+
+     for fld in flds:
+         record[fld] = pgrec[fld]
+
+     return PgDBI.add_yearly_allusage(year, record)
+
+
  #
  # Fill usage of a single online data file into table dssdb.wusage of DSS PgSQL database
  #
@@ -184,7 +214,7 @@ def add_file_usage(year, logrec):

      table = "{}_{}".format(USAGE['OSDFTBL'], year)
      cond = "wid = {} AND method = '{}' AND date_read = '{}' AND time_read = '{}'".format(pgrec['wid'], logrec['method'], logrec['date'], logrec['time'])
-     if PgDBI.pgget(table, "", cond, PgLOG.LOGWRN): return 0
+     if PgDBI.pgget(USAGE['OSDFTBL'], "", cond, PgLOG.LOGWRN): return 0

      wurec = PgIPInfo.get_wuser_record(logrec['ip'], logrec['date'])
      if not wurec: return 0
@@ -1,6 +1,6 @@

  Retrieves usage information from OSDF Server logs under directory
- /gpfs/fs1/collections/rda/work/zji/osdflogs/ to fill table 'wusage' in
+ /gpfs/fs1/collections/rda/work/zji/osdflogs/ to fill table 'osdfusage' in
  database 'dssdb'.

  Usage: fillosdfusage [-b] [-d LogFileDates] [-N NumberDay] [-p BeginDate [Enddate]]
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: rda_python_metrics
- Version: 1.0.33
+ Version: 1.0.34
  Summary: RDA Python Package to gather and view data usage metrics
  Author-email: Zaihua Ji <zji@ucar.edu>
  Project-URL: Homepage, https://github.com/NCAR/rda-python-metrics
@@ -1,254 +0,0 @@
- #!/usr/bin/env python3
- #
- ###############################################################################
- #
- #     Title : fillawsusage
- #    Author : Zaihua Ji, zji@ucar.edu
- #      Date : 03/11/2022
- #             2025-03-26 transferred to package rda_python_metrics from
- #             https://github.com/NCAR/rda-database.git
- #   Purpose : python program to retrieve info from AWS logs
- #             and fill table wusages in PgSQL database dssdb.
- #
- #    Github : https://github.com/NCAR/rda-pythn-metrics.git
- #
- ###############################################################################
- #
- import sys
- import re
- import glob
- from os import path as op
- from rda_python_common import PgLOG
- from rda_python_common import PgUtil
- from rda_python_common import PgFile
- from rda_python_common import PgDBI
- from . import PgIPInfo
-
- USAGE = {
-     'PGTBL' : "wusage",
-     'AWSDIR' : PgLOG.PGLOG["TRANSFER"] + "/AWSera5log",
-     'AWSLOG' : "{}/{}-00-00-00-*",
-     'PFMT' : "YYYY/MM/DD"
- }
-
- DSIDS = {'nsf-ncar-era5' : PgUtil.format_dataset_id('d633000')}
-
- #
- # main function to run this program
- #
- def main():
-
-     params = [] # array of input values
-     argv = sys.argv[1:]
-     option = None
-
-     for arg in argv:
-         ms = re.match(r'^-(b|d|p|N)$', arg)
-         if ms:
-             opt = ms.group(1)
-             if opt == 'b':
-                 PgLOG.PGLOG['BCKGRND'] = 1
-             elif option:
-                 PgLOG.pglog("{}: Option -{} is present already".format(arg, option), PgLOG.LGWNEX)
-             else:
-                 option = opt
-         elif re.match(r'^-', arg):
-             PgLOG.pglog(arg + ": Invalid Option", PgLOG.LGWNEX)
-         elif option:
-             params.append(arg)
-         else:
-             PgLOG.pglog(arg + ": Invalid Parameter", PgLOG.LGWNEX)
-
-     if not (option and params): PgLOG.show_usage('fillawsusage')
-
-     PgDBI.dssdb_dbname()
-     cmdstr = "fillawsusage {}".format(' '.join(argv))
-     PgLOG.cmdlog(cmdstr)
-     PgFile.change_local_directory(USAGE['AWSDIR'])
-     filenames = get_log_file_names(option, params)
-     if filenames:
-         fill_aws_usages(filenames)
-     else:
-         PgLOG.pglog("No log file found for given command: " + cmdstr, PgLOG.LOGWRN)
-
-     PgLOG.pglog(None, PgLOG.LOGWRN)
-     sys.exit(0)
-
- #
- # get the log file dates
- #
- def get_log_file_names(option, params):
-
-     filenames = []
-     if option == 'd':
-         for dt in params:
-             pdate = PgUtil.format_date(dt)
-             pd = PgUtil.format_date(pdate, USAGE['PFMT'])
-             fname = USAGE['AWSLOG'].format(pd, pdate)
-             fnames = glob.glob(fname)
-             if fnames: filenames.extend(sorted(fnames))
-     else:
-         if option == 'N':
-             edate = PgUtil.curdate()
-             pdate = PgUtil.adddate(edate, 0, 0, -int(params[0]))
-         else:
-             pdate = PgUtil.format_date(params[0])
-             if len(params) > 1:
-                 edate = PgUtil.format_date(params[1])
-             else:
-                 edate = PgUtil.curdate()
-         while pdate <= edate:
-             pd = PgUtil.format_date(pdate, USAGE['PFMT'])
-             fname = USAGE['AWSLOG'].format(pd, pdate)
-             fnames = glob.glob(fname)
-             if fnames: filenames.extend(sorted(fnames))
-             pdate = PgUtil.adddate(pdate, 0, 0, 1)
-
-     return filenames
-
- #
- # Fill AWS usages into table dssdb.awsusage of DSS PgSQL database from aws access logs
- #
- def fill_aws_usages(fnames):
-
-     cntall = addall = 0
-     fcnt = len(fnames)
-     for logfile in fnames:
-         if not op.isfile(logfile):
-             PgLOG.pglog("{}: Not exists for Gathering AWS usage".format(logfile), PgLOG.LOGWRN)
-             continue
-         PgLOG.pglog("Gathering usage info from {} at {}".format(logfile, PgLOG.current_datetime()), PgLOG.LOGWRN)
-         aws = PgFile.open_local_file(logfile)
-         if not aws: continue
-         ptime = ''
-         record = {}
-         cntadd = entcnt = 0
-         pkey = None
-         while True:
-             line = aws.readline()
-             if not line: break
-             entcnt += 1
-             if entcnt%10000 == 0:
-                 PgLOG.pglog("{}: {}/{} AWS log entries processed/records added".format(logfile, entcnt, cntadd), PgLOG.WARNLG)
-
-             ms = re.match(r'^\w+ ([\w-]+) \[(\S+).*\] ([\d\.]+) .+ REST\.GET\.OBJECT (\S+) "GET.+" (200|206) - (\d+) (\d+) .* ".+" "(.+)" ', line)
-             if not ms: continue
-             values = list(ms.groups())
-             if values[0] not in DSIDS: continue
-             dsid = DSIDS[values[0]]
-             size = int(values[5])
-             fsize = int(values[6])
-             if fsize < 100: continue # ignore small files
-             ip = values[2]
-             wfile = values[3]
-             stat = values[4]
-             engine = values[7]
-             (year, quarter, date, time) = get_record_date_time(values[1])
-             locflag = 'A'
-
-             if re.match(r'^aiobotocore', engine, re.I):
-                 method = "AIOBT"
-             elif re.match(r'^rclone', engine, re.I):
-                 method = "RCLON"
-             elif re.match(r'^python', engine, re.I):
-                 method = "PYTHN"
-             else:
-                 method = "WEB"
-
-             key = "{}:{}:{}".format(ip, dsid, wfile) if stat == '206' else None
-
-             if record:
-                 if key == pkey:
-                     record['size'] += size
-                     continue
-                 cntadd += add_file_usage(year, record)
-             record = {'ip' : ip, 'dsid' : dsid, 'wfile' : wfile, 'date' : date,
-                       'time' : time, 'quarter' : quarter, 'size' : size,
-                       'locflag' : locflag, 'method' : method}
-             pkey = key
-             if not pkey:
-                 cntadd += add_file_usage(year, record)
-                 record = None
-         if record: cntadd += add_file_usage(year, record)
-         aws.close()
-         cntall += entcnt
-         addall += cntadd
-     PgLOG.pglog("{} AWS usage records added for {} entries at {}".format(addall, cntall, PgLOG.current_datetime()), PgLOG.LOGWRN)
-
-
- def get_record_date_time(ctime):
-
-     ms = re.search(r'^(\d+)/(\w+)/(\d+):(\d+:\d+:\d+)$', ctime)
-     if ms:
-         d = int(ms.group(1))
-         m = PgUtil.get_month(ms.group(2))
-         y = ms.group(3)
-         t = ms.group(4)
-         q = 1 + int((m-1)/3)
-         return (y, q, "{}-{:02}-{:02}".format(y, m, d), t)
-     else:
-         PgLOG.pglog(ctime + ": Invalid date/time format", PgLOG.LGEREX)
-
- #
- # Fill usage of a single online data file into table dssdb.wusage of DSS PgSQL database
- #
- def add_file_usage(year, logrec):
-
-     pgrec = get_wfile_wid(logrec['dsid'], logrec['wfile'])
-     if not pgrec: return 0
-
-     table = "{}_{}".format(USAGE['PGTBL'], year)
-     cond = "wid = {} AND method = '{}' AND date_read = '{}' AND time_read = '{}'".format(pgrec['wid'], logrec['method'], logrec['date'], logrec['time'])
-     if PgDBI.pgget(table, "", cond, PgLOG.LOGWRN): return 0
-
-     wurec = PgIPInfo.get_wuser_record(logrec['ip'], logrec['date'])
-     if not wurec: return 0
-     record = {'wid' : pgrec['wid'], 'dsid' : pgrec['dsid']}
-     record['wuid_read'] = wurec['wuid']
-     record['date_read'] = logrec['date']
-     record['time_read'] = logrec['time']
-     record['size_read'] = logrec['size']
-     record['method'] = logrec['method']
-     record['locflag'] = logrec['locflag']
-     record['ip'] = logrec['ip']
-     record['quarter'] = logrec['quarter']
-
-     if add_to_allusage(year, logrec, wurec):
-         return PgDBI.add_yearly_wusage(year, record)
-     else:
-         return 0
-
- def add_to_allusage(year, logrec, wurec):
-
-     pgrec = {'email' : wurec['email'], 'org_type' : wurec['org_type'],
-              'country' : wurec['country'], 'region' : wurec['region']}
-     pgrec['dsid'] = logrec['dsid']
-     pgrec['date'] = logrec['date']
-     pgrec['quarter'] = logrec['quarter']
-     pgrec['time'] = logrec['time']
-     pgrec['size'] = logrec['size']
-     pgrec['method'] = logrec['method']
-     pgrec['ip'] = logrec['ip']
-     pgrec['source'] = 'A'
-     return PgDBI.add_yearly_allusage(year, pgrec)
-
- #
- # return wfile.wid upon success, 0 otherwise
- #
- def get_wfile_wid(dsid, wfile):
-
-     dscond = "dsid = '{}' AND wfile = '{}'".format(dsid, wfile)
-     pgrec = PgDBI.pgget("wfile", "*", dscond)
-
-     if not pgrec:
-         pgrec = PgDBI.pgget("wmove", "wid, dsid", dscond)
-         if pgrec:
-             pgrec = PgDBI.pgget("wfile", "*", "wid = {}".format(pgrec['wid']))
-             if pgrec: pgrec['dsid'] = dsid
-
-     return pgrec
-
- #
- # call main() to start program
- #
- if __name__ == "__main__": main()