rda-python-dsquasar 2.0.3__tar.gz → 2.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/PKG-INFO +1 -1
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/pyproject.toml +1 -1
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/taccrec.py +4 -4
- rda_python_dsquasar-2.0.4/src/rda_python_dsquasar/tacctar.py +379 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/PKG-INFO +1 -1
- rda_python_dsquasar-2.0.3/src/rda_python_dsquasar/tacctar.py +0 -178
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/LICENSE +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/README.md +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/setup.cfg +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/__init__.py +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/ds_quasar.py +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/dsquasar.py +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/dstacc.py +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/SOURCES.txt +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/dependency_links.txt +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/entry_points.txt +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/requires.txt +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/top_level.txt +0 -0
- {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/test/test_dsquasar.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rda_python_dsquasar
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.0.4
|
|
4
4
|
Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
|
|
5
5
|
Author-email: Zaihua Ji <zji@ucar.edu>
|
|
6
6
|
Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
|
|
@@ -56,7 +56,7 @@ def get_tar_summary_and_details(tar_path):
|
|
|
56
56
|
}
|
|
57
57
|
|
|
58
58
|
def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
|
|
59
|
-
table_name = 'tfile'
|
|
59
|
+
table_name = 'dssdb.tfile'
|
|
60
60
|
conn = psycopg2.connect(**db_params)
|
|
61
61
|
cur = conn.cursor()
|
|
62
62
|
columns = [
|
|
@@ -99,21 +99,21 @@ def get_uid_from_logname(db_params):
|
|
|
99
99
|
conn = psycopg2.connect(**db_params)
|
|
100
100
|
cur = conn.cursor()
|
|
101
101
|
# Ensure logname is quoted as a string in the query
|
|
102
|
-
cur.execute("SELECT userno FROM dssgrp WHERE logname
|
|
102
|
+
cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
|
|
103
103
|
row = cur.fetchone()
|
|
104
104
|
cur.close()
|
|
105
105
|
conn.close()
|
|
106
106
|
if row:
|
|
107
107
|
return row[0]
|
|
108
108
|
else:
|
|
109
|
-
raise ValueError(f"User logname '{logname}' not found in dssgrp table.")
|
|
109
|
+
raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
|
|
110
110
|
|
|
111
111
|
def main():
|
|
112
112
|
parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
|
|
113
113
|
parser.add_argument('tarfile', help='Path to the tar file')
|
|
114
114
|
parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
|
|
115
115
|
parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
|
|
116
|
-
parser.add_argument('--db-name', default='
|
|
116
|
+
parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
|
|
117
117
|
parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
|
|
118
118
|
parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
|
|
119
119
|
parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tarfile
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import psycopg2
|
|
6
|
+
import hashlib
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
import getpass
|
|
9
|
+
|
|
10
|
+
# Size limits, in bytes, for the total payload of one tar batch.
# "TB" here is the binary unit: 1 TB = 2**40 bytes.
ONE_TB = 1_099_511_627_776  # passed as min_size to group_files_by_size
THREE_TB = 3 * ONE_TB  # 3_298_534_883_328; passed as max_size to group_files_by_size
|
|
13
|
+
|
|
14
|
+
def setup_logging(output_dir):
    """Send root-logger output to a dated log file in *output_dir* and to the console.

    The log file is named ``tarlog_YYYYMMDD.log`` (current local date).  Any
    handlers already attached to the root logger are removed and closed, so
    calling this function more than once neither duplicates messages nor leaks
    open log files.

    Args:
        output_dir: Existing directory in which the log file is created.
    """
    # Import the module under an alias so it cannot be confused with a
    # module-level ``from datetime import datetime``.
    import datetime as _datetime

    log_filename = f"tarlog_{_datetime.datetime.now().strftime('%Y%m%d')}.log"
    log_path = os.path.join(output_dir, log_filename)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Remove any existing handlers; close them so file handles are released.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
        handler.close()

    formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')

    # File handler
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)

    # Console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)

    logger.addHandler(fh)
    logger.addHandler(ch)
    logging.info(f"Logging to file: {log_path}")
|
|
34
|
+
|
|
35
|
+
def get_file_size(path):
    """Return the size of *path* in bytes, or 0 when it cannot be determined.

    Failures are logged as warnings instead of being raised, so one unreadable
    or vanished file does not abort a whole run.  Callers therefore cannot
    tell an empty file from an unreadable one.
    """
    try:
        return os.path.getsize(path)
    except Exception as e:  # deliberate best-effort: log and report size 0
        logging.warning(f"Could not get size for {path}: {e}")
        return 0
|
|
41
|
+
|
|
42
|
+
def group_files_by_size(files, min_size, max_size, size_func=None):
    """Group *files*, in order, into batches sized for one tar archive each.

    Files are appended to the current batch while its total stays within
    *max_size*.  When the next file would overflow the batch:

    * if the batch already holds at least *min_size* bytes, it is closed and
      the file starts a new batch;
    * otherwise the file is force-added and the batch is closed, so that
      batch exceeds *max_size*.

    The final batch is emitted whatever its size, so it may be smaller than
    *min_size*.  Files individually larger than *max_size* are skipped with a
    warning.

    Args:
        files: Iterable of file paths.
        min_size: Minimum total size in bytes before a batch may be closed.
        max_size: Target maximum total size in bytes of a batch.
        size_func: Optional callable mapping a path to its size in bytes.
            Defaults to ``get_file_size``.

    Returns:
        A list of lists of file paths.
    """
    if size_func is None:
        size_func = get_file_size

    batches = []
    current_batch = []
    current_size = 0
    for f in files:
        fsize = size_func(f)
        if fsize > max_size:
            logging.warning(f"File {f} is larger than max tar size ({max_size} bytes), skipping.")
            continue
        if current_size + fsize <= max_size:
            # Still room: keep filling the current batch.
            current_batch.append(f)
            current_size += fsize
        elif current_size >= min_size:
            # Batch is big enough: close it and start a new one with this file.
            batches.append(current_batch)
            current_batch = [f]
            current_size = fsize
        else:
            # If adding this file exceeds max_size but current batch is too small, force add
            current_batch.append(f)
            current_size += fsize
            batches.append(current_batch)
            current_batch = []
            current_size = 0
    if current_batch:
        batches.append(current_batch)
    return batches
|
|
73
|
+
|
|
74
|
+
def tar_batches(dirpath, batches, output_dir):
    """Write one uncompressed tar per batch, named after the source directory.

    Archive *idx* (1-based) is written to
    ``<output_dir>/<basename of dirpath>_part<idx>.tar``; members are stored
    under their path relative to *dirpath*.  A file that cannot be added is
    logged as a warning and skipped.

    Args:
        dirpath: Source directory the batch files live under.
        batches: List of lists of file paths.
        output_dir: Existing directory that receives the tar files.
    """
    src_dir_name = Path(dirpath).name
    for idx, batch in enumerate(batches, 1):
        tar_name = os.path.join(output_dir, f"{src_dir_name}_part{idx}.tar")
        logging.info(f"Creating tar: {tar_name} with {len(batch)} files.")
        with tarfile.open(tar_name, "w") as tar:
            for f in batch:
                try:
                    tar.add(f, arcname=os.path.relpath(f, dirpath))
                except Exception as e:  # best-effort: keep archiving the rest
                    logging.warning(f"Failed to add {f} to tar: {e}")
|
|
85
|
+
|
|
86
|
+
def process_directory_tree(root_dir, output_dir, db_params=None, update_on_conflict=False):
    """Tar the files of every directory under *root_dir*, one directory at a time.

    Each directory that directly contains files is batched on its own with
    ``group_files_by_size`` and archived through ``tar_batches_across_dirs``,
    using *root_dir* both as the common root for member names and as the only
    root directory.

    Args:
        root_dir: Top of the directory tree to walk.
        output_dir: Directory that receives the tar files.
        db_params: Optional connection parameters forwarded to
            ``tar_batches_across_dirs`` for database recording.
        update_on_conflict: Forwarded to ``tar_batches_across_dirs``.
    """
    # NOTE(review): tar_batches_across_dirs derives the tar name from the root
    # directory name, the per-call batch index and the file count.  Because it
    # is called once per directory here, two sub-directories with the same
    # number of files in a batch produce the same tar name, and the later one
    # overwrites the earlier one.  Confirm whether this deprecated code path
    # is still relied upon.
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        abs_files = [os.path.join(dirpath, f) for f in filenames]
        if not abs_files:
            continue
        batches = group_files_by_size(abs_files, ONE_TB, THREE_TB)
        # Use tar_batches_across_dirs for consistency and DB recording
        common_root = root_dir
        tar_batches_across_dirs(
            abs_files,
            batches,
            output_dir,
            common_root,
            [root_dir],
            db_params=db_params,
            update_on_conflict=update_on_conflict,
        )
|
|
95
|
+
|
|
96
|
+
def read_directories_from_file(input_file):
    """Read a list of directories from *input_file*, one per line.

    Blank lines and lines starting with ``#`` are ignored.  Each remaining
    line is stripped of surrounding whitespace; entries that are not existing
    directories are logged as warnings and skipped.

    Args:
        input_file: Path of the text file to read.

    Returns:
        A list of absolute directory paths, in file order.
    """
    dirs = []
    with open(input_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if os.path.isdir(line):
                dirs.append(os.path.abspath(line))
            else:
                logging.warning(f"Directory does not exist: {line}")
    return dirs
|
|
108
|
+
|
|
109
|
+
def collect_all_files(directories):
    """Return the paths of all files found by walking each of *directories*.

    Args:
        directories: Iterable of directory paths.

    Returns:
        A list of file paths (each directory path joined with the file name),
        in ``os.walk`` order.
    """
    files = []
    for d in directories:
        for dirpath, _dirnames, filenames in os.walk(d):
            files.extend(os.path.join(dirpath, fname) for fname in filenames)
    return files
|
|
117
|
+
|
|
118
|
+
def find_common_root(paths):
    """Return the longest common sub-path of *paths*, or ``''`` if *paths* is empty.

    Any ``ValueError`` raised by ``os.path.commonpath`` propagates to the
    caller.
    """
    if not paths:
        return ''
    return os.path.commonpath(paths)
|
|
120
|
+
|
|
121
|
+
# User number recorded when the real one cannot be looked up.
# NOTE(review): the meaning of user number 9 is not visible here — confirm.
_FALLBACK_UID = 9


def _containing_root_name(f_path, root_dir_paths):
    """Return the name of the first root in *root_dir_paths* that contains *f_path*, else None."""
    for root_path in root_dir_paths:
        # Compare path components, not string prefixes, so a root such as
        # /x/ds1 does not claim files that live under /x/ds10.
        if f_path == root_path or root_path in f_path.parents:
            return root_path.name
    return None


def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs, db_params=None, update_on_conflict=False):
    """Write one tar per batch and optionally record each tar in the database.

    Batch *idx* (1-based) is written to
    ``<output_dir>/<first_root>_dn<num_roots>_gn<idx>_fn<num_files>.tar``:
    ``first_root`` is the alphabetically first root-directory name represented
    in the batch (``'batch'`` if none), ``num_roots`` counts the distinct root
    names and ``num_files`` is the batch length.  Members are stored under
    their path relative to *common_root*.  Files that cannot be stat'ed or
    added are logged as warnings and skipped.

    When *db_params* is given, a summary of each tar (size, timestamps, MD5
    checksum, per-member note, dataset ids) is passed to ``insert_tfile_row``.
    A failure to record is logged as an error and does not stop the remaining
    batches.

    Args:
        files: Unused; kept for interface compatibility.
        batches: List of lists of file paths.
        output_dir: Existing directory that receives the tar files.
        common_root: Directory that member names are made relative to.
        root_dirs: Root directories, used to name the tar and to build dsids.
        db_params: Optional ``psycopg2.connect`` keyword arguments.
        update_on_conflict: Forwarded to ``insert_tfile_row``.
    """
    root_dir_paths = [Path(d).resolve() for d in root_dirs]

    uid = None
    if db_params:
        try:
            uid = get_uid_from_logname(db_params)
        except Exception as e:
            logging.error(f"Could not get uid from dssgrp: {e}")
            uid = _FALLBACK_UID

    for idx, batch in enumerate(batches, 1):
        batch_root_names = set()
        member_infos = []
        dsid = None
        for file_idx, f in enumerate(batch):
            root_name = _containing_root_name(Path(f).resolve(), root_dir_paths)
            if root_name is not None:
                batch_root_names.add(root_name)
            # Gather member file info for note/dsid/dsids
            try:
                stat = os.stat(f)
                arcname = os.path.relpath(f, common_root)
                if file_idx == 0:
                    # dsid is the leading path component of the first member.
                    dsid = arcname.split(os.sep)[0]
                member_infos.append({
                    'name': arcname,
                    'size': stat.st_size,
                    'mtime': int(stat.st_mtime),
                    'type': '0',  # regular file
                    'mode': stat.st_mode,
                    'uid': stat.st_uid,
                    'gid': stat.st_gid,
                    'uname': '',
                    'gname': '',
                })
            except Exception as e:
                logging.warning(f"Failed to stat {f} for tar member info: {e}")

        # Determine tar file name: first leading root dir and number of distinct root dirs
        batch_root_names_sorted = sorted(batch_root_names)
        first_root = batch_root_names_sorted[0] if batch_root_names_sorted else 'batch'
        num_roots = len(batch_root_names_sorted)
        num_files = len(batch)
        tar_name = os.path.join(output_dir, f"{first_root}_dn{num_roots}_gn{idx}_fn{num_files}.tar")
        logging.info(f"Creating tar: {tar_name} with {num_files} files.")
        with tarfile.open(tar_name, "w") as tar:
            for f in batch:
                try:
                    arcname = os.path.relpath(f, common_root)
                    tar.add(f, arcname=arcname)
                except Exception as e:  # best-effort: keep archiving the rest
                    logging.warning(f"Failed to add {f} to tar: {e}")

        # After tar is created, insert info into tfile if db_params is provided
        if not db_params:
            continue
        try:
            # Build note and dsids from cached member_infos
            note = '\n'.join([
                f"name={m['name']};size={m['size']};mtime={m['mtime']};type={m['type']};mode={m['mode']};uid={m['uid']};gid={m['gid']};uname={m['uname']};gname={m['gname']}"
                for m in member_infos
            ])
            dsids = ','.join(batch_root_names_sorted)
            tar_stat = os.stat(tar_name)
            ctime = datetime.fromtimestamp(tar_stat.st_ctime)
            mtime = datetime.fromtimestamp(tar_stat.st_mtime)
            summary = {
                'tfile': os.path.basename(tar_name),
                'data_size': tar_stat.st_size,
                'wcount': len(member_infos),
                'date_created': ctime.date(),
                'time_created': ctime.time(),
                'date_modified': mtime.date(),
                'time_modified': mtime.time(),
                'note': note,
                'dsids': dsids,
                'dsid': dsid,
            }
            checksum = compute_md5(tar_name)
            extra = {
                'uid': uid if uid is not None else _FALLBACK_UID,
                'dsid': summary['dsid'],
                'data_format': '',
                'disp_order': 0,
                'dsids': summary['dsids'],
                'note': summary['note'],
            }
            insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=update_on_conflict)
            logging.info(f"Inserted tar summary for {tar_name} into tfile.")
        except Exception as e:
            logging.error(f"Failed to insert tar info for {tar_name}: {e}")
|
|
215
|
+
|
|
216
|
+
def get_batch_size(batch):
    """Return the total size in bytes of the files in *batch*.

    Sizes come from ``get_file_size``, so a file whose size cannot be read
    contributes 0.
    """
    return sum(get_file_size(f) for f in batch)
|
|
218
|
+
|
|
219
|
+
def process_from_file(input_file, output_dir, db_params=None, update_on_conflict=False):
    """Tar every file under the directories listed in *input_file*.

    The directories are read with ``read_directories_from_file``, all their
    files are pooled and grouped into 1-3TB batches, and each batch is
    archived through ``tar_batches_across_dirs``.

    Args:
        input_file: Text file listing one directory per line.
        output_dir: Directory that receives the tar files.
        db_params: Optional connection parameters forwarded to
            ``tar_batches_across_dirs``; when omitted nothing is recorded in
            the database.
        update_on_conflict: Forwarded to ``tar_batches_across_dirs``.
    """
    directories = read_directories_from_file(input_file)
    files = collect_all_files(directories)
    if not files:
        logging.info("No files found in provided directories.")
        return

    batches = group_files_by_size(files, ONE_TB, THREE_TB)
    # If last batch is less than 1TB, append it to previous batch.
    # The merged batch may then exceed THREE_TB.
    if len(batches) > 1:
        last_size = get_batch_size(batches[-1])
        if last_size < ONE_TB:
            logging.info(f"Last batch size ({last_size} bytes) < 1TB, appending to previous batch.")
            batches[-2].extend(batches[-1])
            batches.pop()

    common_root = find_common_root(directories)
    tar_batches_across_dirs(
        files,
        batches,
        output_dir,
        common_root,
        directories,
        db_params=db_params,
        update_on_conflict=update_on_conflict,
    )
|
|
233
|
+
|
|
234
|
+
def compute_md5(file_path, chunk_size=8192):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is read in blocks of *chunk_size* bytes, so files of any size
    can be hashed without loading them into memory.
    """
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks end of file.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()
|
|
243
|
+
|
|
244
|
+
def get_tar_summary_and_details(tar_path):
    """Summarize an existing tar file in the shape expected by ``insert_tfile_row``.

    Only regular-file members are considered.  The first path component of
    every member name is collected into ``dsids``; the first component of the
    first member becomes ``dsid``.  For a member stored at the top level of
    the archive that component is the file name itself.

    Args:
        tar_path: Path of the tar file to inspect.

    Returns:
        A dict with keys ``tfile`` (base name), ``data_size`` (bytes on disk),
        ``wcount`` (number of regular-file members), ``date_created``,
        ``time_created``, ``date_modified``, ``time_modified`` (from the tar
        file's ``st_ctime`` / ``st_mtime``), ``note`` (one ``key=value;...``
        line per member), ``dsids`` (sorted, comma-separated) and ``dsid``
        (``None`` for an archive without regular files).
    """
    with tarfile.open(tar_path, 'r') as tar:
        member_files = [m for m in tar.getmembers() if m.isfile()]

    file_count = len(member_files)
    details = []
    root_dirs = set()
    dsid = None
    for idx, m in enumerate(member_files):
        # str.split always yields at least one element, so parts[0] exists.
        leading = m.name.split('/')[0]
        root_dirs.add(leading)
        if idx == 0:
            dsid = leading
        # TarInfo.type is a bytes constant (b'0' for a regular file); decode it
        # so the note reads "type=0" rather than "type=b'0'".
        mtype = m.type.decode('ascii') if isinstance(m.type, bytes) else m.type
        details.append(
            f"name={m.name};size={m.size};mtime={m.mtime};type={mtype};mode={m.mode};uid={m.uid};gid={m.gid};uname={m.uname};gname={m.gname}"
        )

    note = '\n'.join(details)
    dsids = ','.join(sorted(root_dirs))
    tar_stat = os.stat(tar_path)
    tar_size = tar_stat.st_size
    ctime = datetime.fromtimestamp(tar_stat.st_ctime)
    mtime = datetime.fromtimestamp(tar_stat.st_mtime)
    return {
        'tfile': os.path.basename(tar_path),
        'data_size': tar_size,
        'wcount': file_count,
        'date_created': ctime.date(),
        'time_created': ctime.time(),
        'date_modified': mtime.date(),
        'time_modified': mtime.time(),
        'note': note,
        'dsids': dsids,
        'dsid': dsid,
    }
|
|
280
|
+
|
|
281
|
+
def get_uid_from_logname(db_params):
    """Look up the ``userno`` of the current login name in ``dssdb.dssgrp``.

    The login name comes from ``os.getlogin()``; if that fails, the ``USER``
    environment variable or ``getpass.getuser()`` is used instead.

    Args:
        db_params: Keyword arguments for ``psycopg2.connect``.

    Returns:
        The ``userno`` value of the first matching row.

    Raises:
        ValueError: If no row matches the login name.
    """
    try:
        logname = os.getlogin()
    except OSError:
        # os.getlogin() fails when there is no controlling terminal.
        logname = os.environ.get('USER') or getpass.getuser()

    conn = psycopg2.connect(**db_params)
    try:
        cur = conn.cursor()
        try:
            # The login name is bound as a query parameter, never interpolated.
            cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query raises.
        conn.close()

    if row:
        return row[0]
    raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
|
|
297
|
+
|
|
298
|
+
def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
    """Insert one row describing a tar file into ``dssdb.tfile``.

    Args:
        summary: Dict providing ``tfile``, ``data_size``, ``wcount``,
            ``date_created``, ``time_created``, ``date_modified`` and
            ``time_modified``.
        checksum: Checksum string stored in the ``checksum`` column.
        db_params: Keyword arguments for ``psycopg2.connect``.
        extra: Dict optionally providing ``uid``, ``dsid``, ``data_format``,
            ``disp_order``, ``dsids`` and ``note``; missing keys are stored
            as NULL.
        update_on_conflict: When true, an existing row with the same ``tfile``
            is updated (``ON CONFLICT (tfile) DO UPDATE``) instead of the
            insert failing.

    Raises:
        Exception: Any database error is logged, the transaction is rolled
            back, and the error is re-raised.
    """
    table_name = 'dssdb.tfile'
    columns = [
        'tfile', 'data_size', 'wcount', 'date_created', 'time_created',
        'date_modified', 'time_modified', 'file_format', 'checksum', 'status',
        'uid', 'dsid', 'data_format', 'disp_order', 'dsids', 'note',
    ]
    values = [
        summary['tfile'], summary['data_size'], summary['wcount'],
        summary['date_created'], summary['time_created'],
        summary['date_modified'], summary['time_modified'],
        'tar', checksum, 'T',  # fixed file_format and status values
        extra.get('uid'), extra.get('dsid'), extra.get('data_format'),
        extra.get('disp_order'), extra.get('dsids'), extra.get('note'),
    ]
    # Only the constant table and column names are formatted into the
    # statement; every value is passed as a bound parameter.
    placeholders = ','.join(['%s'] * len(columns))
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
    if update_on_conflict:
        update_cols = [col for col in columns if col != 'tfile']
        set_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in update_cols])
        sql += f" ON CONFLICT (tfile) DO UPDATE SET {set_clause}"

    # Connect only once the statement is built, and always release the
    # cursor and the connection.
    conn = psycopg2.connect(**db_params)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
            conn.commit()
        except Exception as e:
            logging.error(f"Database error: {e}")
            conn.rollback()
            raise
        finally:
            cur.close()
    finally:
        conn.close()
|
|
331
|
+
|
|
332
|
+
def main():
    """Command-line entry point: tar directories into 1-3TB archives and record them.

    Either ``--input-file`` (a list of directories, one per line) or the
    deprecated positional ``root_dir`` must be given; otherwise the program
    exits with a usage error before creating any output.  Tar files and the
    run log are written to ``--output-dir`` (default: current directory).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files and record tar info in the database.")
    parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
    parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
    parser.add_argument("root_dir", nargs='?', help="(Deprecated) Root directory to process")
    parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
    parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
    parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
    parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
    parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
    parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
    args = parser.parse_args()

    # Validate before touching the filesystem; parser.error prints the usage
    # text to stderr and exits with status 2.
    if not args.input_file and not args.root_dir:
        parser.error("Must provide either --input-file or root_dir.")

    output_dir = args.output_dir if args.output_dir else os.getcwd()
    os.makedirs(output_dir, exist_ok=True)
    setup_logging(output_dir)

    # Prepare db_params for tar_batches_across_dirs.  With the defaults above
    # host, name and user are always set unless explicitly passed as empty.
    db_params = None
    if args.db_host and args.db_name and args.db_user:
        db_params = {
            'host': args.db_host,
            'port': args.db_port,
            'dbname': args.db_name,
            'user': args.db_user,
        }
        if args.db_password:
            db_params['password'] = args.db_password

    if args.input_file:
        # Pass db_params and update flag to tar_batches_across_dirs
        directories = read_directories_from_file(args.input_file)
        files = collect_all_files(directories)
        if not files:
            logging.info("No files found in provided directories.")
            return
        batches = group_files_by_size(files, ONE_TB, THREE_TB)
        # A trailing batch under 1TB is merged into the previous one.
        if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
            logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
            batches[-2].extend(batches[-1])
            batches.pop()
        common_root = find_common_root(directories)
        tar_batches_across_dirs(files, batches, output_dir, common_root, directories, db_params=db_params, update_on_conflict=args.update)
    else:
        process_directory_tree(args.root_dir, output_dir, db_params=db_params, update_on_conflict=args.update)
|
|
377
|
+
|
|
378
|
+
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
{rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rda_python_dsquasar
|
|
3
|
-
Version: 2.0.3
|
|
3
|
+
Version: 2.0.4
|
|
4
4
|
Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
|
|
5
5
|
Author-email: Zaihua Ji <zji@ucar.edu>
|
|
6
6
|
Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
|
|
@@ -1,178 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tarfile
|
|
3
|
-
import logging
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
# Constants for size limits
|
|
7
|
-
ONE_TB = 1_099_511_627_776 # 1TB in bytes
|
|
8
|
-
THREE_TB = 3_298_534_883_328 # 3TB in bytes
|
|
9
|
-
|
|
10
|
-
def setup_logging(output_dir):
|
|
11
|
-
import datetime
|
|
12
|
-
log_filename = f"tarlog_{datetime.datetime.now().strftime('%Y%m%d')}.log"
|
|
13
|
-
log_path = os.path.join(output_dir, log_filename)
|
|
14
|
-
logger = logging.getLogger()
|
|
15
|
-
logger.setLevel(logging.INFO)
|
|
16
|
-
# Remove any existing handlers
|
|
17
|
-
for handler in logger.handlers[:]:
|
|
18
|
-
logger.removeHandler(handler)
|
|
19
|
-
# File handler
|
|
20
|
-
fh = logging.FileHandler(log_path)
|
|
21
|
-
fh.setLevel(logging.INFO)
|
|
22
|
-
fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
|
|
23
|
-
# Console handler
|
|
24
|
-
ch = logging.StreamHandler()
|
|
25
|
-
ch.setLevel(logging.INFO)
|
|
26
|
-
ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
|
|
27
|
-
logger.addHandler(fh)
|
|
28
|
-
logger.addHandler(ch)
|
|
29
|
-
logging.info(f"Logging to file: {log_path}")
|
|
30
|
-
|
|
31
|
-
def get_file_size(path):
|
|
32
|
-
try:
|
|
33
|
-
return os.path.getsize(path)
|
|
34
|
-
except Exception as e:
|
|
35
|
-
logging.warning(f"Could not get size for {path}: {e}")
|
|
36
|
-
return 0
|
|
37
|
-
|
|
38
|
-
def group_files_by_size(files, min_size, max_size):
|
|
39
|
-
"""
|
|
40
|
-
Group files into batches where each batch's total size is between min_size and max_size.
|
|
41
|
-
Returns a list of lists of file paths.
|
|
42
|
-
"""
|
|
43
|
-
batches = []
|
|
44
|
-
current_batch = []
|
|
45
|
-
current_size = 0
|
|
46
|
-
for f in files:
|
|
47
|
-
fsize = get_file_size(f)
|
|
48
|
-
if fsize > max_size:
|
|
49
|
-
logging.warning(f"File {f} is larger than max tar size ({max_size} bytes), skipping.")
|
|
50
|
-
continue
|
|
51
|
-
if current_size + fsize > max_size:
|
|
52
|
-
if current_size >= min_size:
|
|
53
|
-
batches.append(current_batch)
|
|
54
|
-
current_batch = [f]
|
|
55
|
-
current_size = fsize
|
|
56
|
-
else:
|
|
57
|
-
# If adding this file exceeds max_size but current batch is too small, force add
|
|
58
|
-
current_batch.append(f)
|
|
59
|
-
current_size += fsize
|
|
60
|
-
batches.append(current_batch)
|
|
61
|
-
current_batch = []
|
|
62
|
-
current_size = 0
|
|
63
|
-
else:
|
|
64
|
-
current_batch.append(f)
|
|
65
|
-
current_size += fsize
|
|
66
|
-
if current_batch:
|
|
67
|
-
batches.append(current_batch)
|
|
68
|
-
return batches
|
|
69
|
-
|
|
70
|
-
def tar_batches(dirpath, batches, output_dir):
|
|
71
|
-
src_dir_name = Path(dirpath).name
|
|
72
|
-
for idx, batch in enumerate(batches, 1):
|
|
73
|
-
tar_name = os.path.join(output_dir, f"{src_dir_name}_part{idx}.tar")
|
|
74
|
-
logging.info(f"Creating tar: {tar_name} with {len(batch)} files.")
|
|
75
|
-
with tarfile.open(tar_name, "w") as tar:
|
|
76
|
-
for f in batch:
|
|
77
|
-
try:
|
|
78
|
-
tar.add(f, arcname=os.path.relpath(f, dirpath))
|
|
79
|
-
except Exception as e:
|
|
80
|
-
logging.warning(f"Failed to add {f} to tar: {e}")
|
|
81
|
-
|
|
82
|
-
def process_directory_tree(root_dir, output_dir):
|
|
83
|
-
for dirpath, dirnames, filenames in os.walk(root_dir):
|
|
84
|
-
abs_files = [os.path.join(dirpath, f) for f in filenames]
|
|
85
|
-
if not abs_files:
|
|
86
|
-
continue
|
|
87
|
-
batches = group_files_by_size(abs_files, ONE_TB, THREE_TB)
|
|
88
|
-
tar_batches(dirpath, batches, output_dir)
|
|
89
|
-
|
|
90
|
-
def read_directories_from_file(input_file):
|
|
91
|
-
dirs = []
|
|
92
|
-
with open(input_file, 'r') as f:
|
|
93
|
-
for line in f:
|
|
94
|
-
line = line.strip()
|
|
95
|
-
if not line or line.startswith('#'):
|
|
96
|
-
continue
|
|
97
|
-
if os.path.isdir(line):
|
|
98
|
-
dirs.append(os.path.abspath(line))
|
|
99
|
-
else:
|
|
100
|
-
logging.warning(f"Directory does not exist: {line}")
|
|
101
|
-
return dirs
|
|
102
|
-
|
|
103
|
-
def collect_all_files(directories):
|
|
104
|
-
files = []
|
|
105
|
-
for d in directories:
|
|
106
|
-
for dirpath, dirnames, filenames in os.walk(d):
|
|
107
|
-
for fname in filenames:
|
|
108
|
-
fpath = os.path.join(dirpath, fname)
|
|
109
|
-
files.append(fpath)
|
|
110
|
-
return files
|
|
111
|
-
|
|
112
|
-
def find_common_root(paths):
|
|
113
|
-
return os.path.commonpath(paths) if paths else ''
|
|
114
|
-
|
|
115
|
-
def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs):
|
|
116
|
-
root_dir_paths = [Path(d).resolve() for d in root_dirs]
|
|
117
|
-
for idx, batch in enumerate(batches, 1):
|
|
118
|
-
# Find which root directories are represented in this batch
|
|
119
|
-
batch_root_names = set()
|
|
120
|
-
for f in batch:
|
|
121
|
-
f_path = Path(f).resolve()
|
|
122
|
-
for root_path in root_dir_paths:
|
|
123
|
-
try:
|
|
124
|
-
if str(f_path).startswith(str(root_path)):
|
|
125
|
-
batch_root_names.add(root_path.name)
|
|
126
|
-
break
|
|
127
|
-
except Exception:
|
|
128
|
-
continue
|
|
129
|
-
prefix = "_".join(sorted(batch_root_names)) if batch_root_names else "batch"
|
|
130
|
-
num_files = len(batch)
|
|
131
|
-
tar_name = os.path.join(output_dir, f"{prefix}_batch_part{idx}_{num_files}files.tar")
|
|
132
|
-
logging.info(f"Creating tar: {tar_name} with {num_files} files.")
|
|
133
|
-
with tarfile.open(tar_name, "w") as tar:
|
|
134
|
-
for f in batch:
|
|
135
|
-
try:
|
|
136
|
-
# Preserve relative path from common root in tar
|
|
137
|
-
arcname = os.path.relpath(f, common_root)
|
|
138
|
-
tar.add(f, arcname=arcname)
|
|
139
|
-
except Exception as e:
|
|
140
|
-
logging.warning(f"Failed to add {f} to tar: {e}")
|
|
141
|
-
|
|
142
|
-
def get_batch_size(batch):
|
|
143
|
-
return sum(get_file_size(f) for f in batch)
|
|
144
|
-
|
|
145
|
-
def process_from_file(input_file, output_dir):
|
|
146
|
-
directories = read_directories_from_file(input_file)
|
|
147
|
-
files = collect_all_files(directories)
|
|
148
|
-
if not files:
|
|
149
|
-
logging.info("No files found in provided directories.")
|
|
150
|
-
return
|
|
151
|
-
batches = group_files_by_size(files, ONE_TB, THREE_TB)
|
|
152
|
-
# If last batch is less than 1TB, append it to previous batch
|
|
153
|
-
if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
|
|
154
|
-
logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
|
|
155
|
-
batches[-2].extend(batches[-1])
|
|
156
|
-
batches.pop()
|
|
157
|
-
common_root = find_common_root(directories)
|
|
158
|
-
tar_batches_across_dirs(files, batches, output_dir, common_root, directories)
|
|
159
|
-
|
|
160
|
-
def main():
|
|
161
|
-
import argparse
|
|
162
|
-
parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files.")
|
|
163
|
-
parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
|
|
164
|
-
parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
|
|
165
|
-
parser.add_argument("root_dir", nargs='?', help="(Deprecated) Root directory to process")
|
|
166
|
-
args = parser.parse_args()
|
|
167
|
-
output_dir = args.output_dir if args.output_dir else os.getcwd()
|
|
168
|
-
os.makedirs(output_dir, exist_ok=True)
|
|
169
|
-
setup_logging(output_dir)
|
|
170
|
-
if args.input_file:
|
|
171
|
-
process_from_file(args.input_file, output_dir)
|
|
172
|
-
elif args.root_dir:
|
|
173
|
-
process_directory_tree(args.root_dir, output_dir)
|
|
174
|
-
else:
|
|
175
|
-
print("Error: Must provide either --input-file or root_dir.")
|
|
176
|
-
|
|
177
|
-
if __name__ == "__main__":
|
|
178
|
-
main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/ds_quasar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|