rda-python-dsquasar 2.0.8__tar.gz → 2.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/PKG-INFO +1 -1
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/pyproject.toml +1 -1
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/taccrec.py +8 -8
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/tacctar.py +63 -41
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/PKG-INFO +1 -1
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/LICENSE +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/README.md +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/setup.cfg +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/__init__.py +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/ds_quasar.py +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/dsquasar.py +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/dstacc.py +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/SOURCES.txt +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/dependency_links.txt +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/entry_points.txt +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/requires.txt +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/top_level.txt +0 -0
- {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/test/test_dsquasar.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rda_python_dsquasar
|
|
3
|
-
Version: 2.0.8
|
|
3
|
+
Version: 2.0.10
|
|
4
4
|
Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
|
|
5
5
|
Author-email: Zaihua Ji <zji@ucar.edu>
|
|
6
6
|
Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
|
|
@@ -144,14 +144,14 @@ def get_uid_from_logname(db_params):
|
|
|
144
144
|
|
|
145
145
|
def main():
|
|
146
146
|
parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
|
|
147
|
-
parser.add_argument('--member-list', help='Path to tar member list file (from tar -tvf)')
|
|
148
|
-
parser.add_argument('--member-list-file', help='File containing list of tar member list files (one per line)')
|
|
149
|
-
parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
|
|
150
|
-
parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
|
|
151
|
-
parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
|
|
152
|
-
parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
|
|
153
|
-
parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
|
|
154
|
-
parser.add_argument('--no-update', action='store_true', help='If tfile exists, skip all updates including wfile tables (default: False)')
|
|
147
|
+
parser.add_argument('-ml', '--member-list', help='Path to tar member list file (from tar -tvf)')
|
|
148
|
+
parser.add_argument('-mf', '--member-list-file', help='File containing list of tar member list files (one per line)')
|
|
149
|
+
parser.add_argument('-ht', '--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
|
|
150
|
+
parser.add_argument('-pt', '--db-port', default=5432, type=int, help='Database port (default: 5432)')
|
|
151
|
+
parser.add_argument('-db', '--db-name', default='rdadb', help='Database name (default: rdadb)')
|
|
152
|
+
parser.add_argument('-us', '--db-user', default='dssdb', help='Database user (default: dssdb)')
|
|
153
|
+
parser.add_argument('-pw', '--db-password', help='Database password (optional, use .pgpass if omitted)')
|
|
154
|
+
parser.add_argument('-nu', '--no-update', action='store_true', help='If tfile exists, skip all updates including wfile tables (default: False)')
|
|
155
155
|
args = parser.parse_args()
|
|
156
156
|
if args.member_list_file:
|
|
157
157
|
with open(args.member_list_file, 'r') as f:
|
|
@@ -69,14 +69,44 @@ def group_files_by_size(files, min_size, max_size):
|
|
|
69
69
|
batches.append(current_batch)
|
|
70
70
|
return batches
|
|
71
71
|
|
|
72
|
-
def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=None, tar_batch=True):
|
|
72
|
+
def write_mbr_and_archive_tar(tar_file, mbr_file, archive_dir=None):
|
|
73
73
|
"""
|
|
74
|
-
|
|
74
|
+
Write tar member list file in tar -tvf format and move tar file to archive_dir if provided.
|
|
75
75
|
"""
|
|
76
|
+
import pwd, grp, time, shutil
|
|
77
|
+
# Write member list file
|
|
78
|
+
with tarfile.open(tar_file, "r") as tar:
|
|
79
|
+
with open(mbr_file, "w") as mf:
|
|
80
|
+
for member in tar.getmembers():
|
|
81
|
+
mode = oct(member.mode)[-4:]
|
|
82
|
+
typechar = '-' if member.isfile() else 'd' if member.isdir() else 'l' if member.issym() else '?'
|
|
83
|
+
uname = member.uname or (pwd.getpwuid(member.uid).pw_name if hasattr(member, 'uid') else '')
|
|
84
|
+
gname = member.gname or (grp.getgrgid(member.gid).gr_name if hasattr(member, 'gid') else '')
|
|
85
|
+
size = member.size
|
|
86
|
+
mtime = time.strftime("%Y-%m-%d %H:%M", time.localtime(member.mtime))
|
|
87
|
+
mf.write(f"{typechar}{mode} {uname}/{gname} {size:9d} {mtime} {member.name}\n")
|
|
88
|
+
# Move tar file to archive_dir if specified
|
|
89
|
+
if archive_dir:
|
|
90
|
+
archive_dir_abs = os.path.abspath(archive_dir)
|
|
91
|
+
os.makedirs(archive_dir_abs, exist_ok=True)
|
|
92
|
+
dest_tar_file = os.path.join(archive_dir_abs, os.path.basename(tar_file))
|
|
93
|
+
try:
|
|
94
|
+
shutil.move(tar_file, dest_tar_file)
|
|
95
|
+
logging.info(f"Moved tar file {tar_file} to archive directory {dest_tar_file}.")
|
|
96
|
+
except Exception as e:
|
|
97
|
+
logging.error(f"Failed to move tar file {tar_file} to archive directory {archive_dir_abs}: {e}")
|
|
98
|
+
|
|
99
|
+
def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=None, tar_batch=True, archive_dir=None):
|
|
100
|
+
"""
|
|
101
|
+
If tar_batch is True, create tar files for each batch and move to archive_dir if provided. Otherwise, dump file list to .batch files.
|
|
102
|
+
Always create a .mbr file for each tar file.
|
|
103
|
+
"""
|
|
104
|
+
import shutil
|
|
76
105
|
for idx, batch in enumerate(batches, 1):
|
|
77
106
|
num_files = len(batch)
|
|
78
107
|
tar_name = os.path.join(output_dir, f"{dataset_name}_part{idx}_{num_files}files.tar")
|
|
79
108
|
batch_name = tar_name.replace(".tar", ".batch")
|
|
109
|
+
mbr_file = tar_name + '.mbr'
|
|
80
110
|
if tar_batch:
|
|
81
111
|
logging.info(f"Creating tar: {tar_name} with {num_files} files.")
|
|
82
112
|
with tarfile.open(tar_name, "w") as tar:
|
|
@@ -87,6 +117,7 @@ def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=No
|
|
|
87
117
|
tar.add(f, arcname=arcname)
|
|
88
118
|
except Exception as e:
|
|
89
119
|
logging.warning(f"Failed to add {f} to tar: {e}")
|
|
120
|
+
write_mbr_and_archive_tar(tar_name, mbr_file, archive_dir)
|
|
90
121
|
else:
|
|
91
122
|
logging.info(f"Writing batch file list: {batch_name} with {num_files} files.")
|
|
92
123
|
with open(batch_name, "w") as bf:
|
|
@@ -95,7 +126,7 @@ def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=No
|
|
|
95
126
|
arcname = os.path.join(dataset_name, arcname)
|
|
96
127
|
bf.write(arcname + "\n")
|
|
97
128
|
|
|
98
|
-
def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=True, dsid=None):
|
|
129
|
+
def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=True, dsid=None, archive_dir=None):
|
|
99
130
|
dataset_name = dsid if dsid else os.path.basename(os.path.abspath(dataset_path))
|
|
100
131
|
wfile_table = f"dssdb.wfile_{dataset_name}"
|
|
101
132
|
all_files = []
|
|
@@ -130,7 +161,7 @@ def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=T
|
|
|
130
161
|
if not all_files:
|
|
131
162
|
return
|
|
132
163
|
batches = group_files_by_size(all_files, ONE_TB, THREE_TB)
|
|
133
|
-
tar_batches(dataset_path, batches, output_dir, dataset_path=dataset_path, dataset_name=dataset_name, tar_batch=tar_batch)
|
|
164
|
+
tar_batches(dataset_path, batches, output_dir, dataset_path=dataset_path, dataset_name=dataset_name, tar_batch=tar_batch, archive_dir=archive_dir)
|
|
134
165
|
|
|
135
166
|
def read_directories_from_file(input_file, tar_root=None):
|
|
136
167
|
dataset_ids = []
|
|
@@ -186,7 +217,8 @@ def collect_all_files(dataset_dirs, db_params=None):
|
|
|
186
217
|
def find_common_root(paths):
|
|
187
218
|
return os.path.commonpath(paths) if paths else ''
|
|
188
219
|
|
|
189
|
-
def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths, tar_batch=True, dataset_ids=None):
|
|
220
|
+
def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths, tar_batch=True, dataset_ids=None, archive_dir=None):
|
|
221
|
+
import shutil
|
|
190
222
|
dataset_dir_paths = [Path(d).resolve() for d in dataset_paths]
|
|
191
223
|
dsid_map = {str(Path(d).resolve()): dsid for dsid, d in zip(dataset_ids, dataset_paths)} if dataset_ids else {}
|
|
192
224
|
for idx, batch in enumerate(batches, 1):
|
|
@@ -208,6 +240,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
|
|
|
208
240
|
prefix = "_".join(sorted(batch_dataset_names)) if batch_dataset_names else "batch"
|
|
209
241
|
tar_name = os.path.join(output_dir, f"{prefix}_part{idx}_{num_files}files.tar")
|
|
210
242
|
batch_name = tar_name.replace(".tar", ".batch")
|
|
243
|
+
mbr_file = tar_name + '.mbr'
|
|
211
244
|
if tar_batch:
|
|
212
245
|
logging.info(f"Creating tar: {tar_name} with {num_files} files.")
|
|
213
246
|
with tarfile.open(tar_name, "w") as tar:
|
|
@@ -217,6 +250,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
|
|
|
217
250
|
tar.add(f, arcname=arcname)
|
|
218
251
|
except Exception as e:
|
|
219
252
|
logging.warning(f"Failed to add {f} to tar: {e}")
|
|
253
|
+
write_mbr_and_archive_tar(tar_name, mbr_file, archive_dir)
|
|
220
254
|
else:
|
|
221
255
|
logging.info(f"Writing batch file list: {batch_name} with {num_files} files.")
|
|
222
256
|
with open(batch_name, "w") as bf:
|
|
@@ -227,14 +261,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
|
|
|
227
261
|
def get_batch_size(batch):
|
|
228
262
|
return sum(get_file_size(f) for f in batch)
|
|
229
263
|
|
|
230
|
-
def tar_batch_file(batch_file, tar_root=None):
|
|
231
|
-
"""
|
|
232
|
-
Read a filelist in batch_file and tar the filelist into a tar file named like batch file name by replacing .batch with .tar.
|
|
233
|
-
Prepend path tar_root to the file names in each batch file for the full path.
|
|
234
|
-
After tarring, dump a member file detail list (like 'tar -tvf') into a .mbr file named as tarfilename+'.mbr'.
|
|
235
|
-
If the .mbr file already exists, skip the tar action.
|
|
236
|
-
"""
|
|
237
|
-
import pwd, grp, time
|
|
264
|
+
def tar_batch_file(batch_file, tar_root=None, archive_dir=None):
|
|
238
265
|
tar_file = batch_file.replace('.batch', '.tar')
|
|
239
266
|
mbr_file = tar_file + '.mbr'
|
|
240
267
|
if os.path.exists(mbr_file):
|
|
@@ -250,30 +277,25 @@ def tar_batch_file(batch_file, tar_root=None):
|
|
|
250
277
|
tar.add(abs_path, arcname=rel_path)
|
|
251
278
|
except Exception as e:
|
|
252
279
|
logging.warning(f"Failed to add {abs_path} as {rel_path} to tar: {e}")
|
|
253
|
-
|
|
254
|
-
with tarfile.open(tar_file, "r") as tar:
|
|
255
|
-
with open(mbr_file, "w") as mf:
|
|
256
|
-
for member in tar.getmembers():
|
|
257
|
-
mode = oct(member.mode)[-4:]
|
|
258
|
-
typechar = '-' if member.isfile() else 'd' if member.isdir() else 'l' if member.issym() else '?'
|
|
259
|
-
uname = member.uname or (pwd.getpwuid(member.uid).pw_name if hasattr(member, 'uid') else '')
|
|
260
|
-
gname = member.gname or (grp.getgrgid(member.gid).gr_name if hasattr(member, 'gid') else '')
|
|
261
|
-
size = member.size
|
|
262
|
-
mtime = time.strftime("%Y-%m-%d %H:%M", time.localtime(member.mtime))
|
|
263
|
-
# Format: -rw-r--r-- user/group size date name
|
|
264
|
-
mf.write(f"{typechar}{mode} {uname}/{gname} {size:9d} {mtime} {member.name}\n")
|
|
280
|
+
write_mbr_and_archive_tar(tar_file, mbr_file, archive_dir)
|
|
265
281
|
|
|
266
282
|
def main():
|
|
267
283
|
import argparse
|
|
268
284
|
parser = argparse.ArgumentParser(description="Tar files from a list of dataset IDs into 1-3TB tar files.")
|
|
269
|
-
parser.add_argument('--batch-input-file', type=str, default=None, help='A file containing a list of batch file names, one per line. Each batch file should contain relative file names to be tarred.')
|
|
270
|
-
parser.add_argument('--batch-files', nargs='*', default=None, help='List of batch files to tar. Each batch file should contain relative file names to be tarred.')
|
|
271
|
-
parser.add_argument('--tar-root', type=str, required=True, help='Root directory for relative tar member file names (arcname). REQUIRED for all modes.')
|
|
272
|
-
parser.add_argument(
|
|
273
|
-
parser.add_argument(
|
|
274
|
-
parser.add_argument('--check-tarred', action='store_true', default=False, help='Skip files already tarred (tid > 0 in wfile_<dataset_name>)')
|
|
275
|
-
parser.add_argument('--tar-batch', action='store_true', default=False, help='Tar files for each batch. If not set, dump file list to .batch files instead.')
|
|
276
|
-
parser.add_argument(
|
|
285
|
+
parser.add_argument('-bi', '--batch-input-file', type=str, default=None, help='A file containing a list of batch file names, one per line. Each batch file should contain relative file names to be tarred.')
|
|
286
|
+
parser.add_argument('-bf', '--batch-files', nargs='*', default=None, help='List of batch files to tar. Each batch file should contain relative file names to be tarred.')
|
|
287
|
+
parser.add_argument('-tr', '--tar-root', type=str, required=True, help='Root directory for relative tar member file names (arcname). REQUIRED for all modes.')
|
|
288
|
+
parser.add_argument('-if', '--input-file', help='File containing list of dataset IDs to process (one per line)')
|
|
289
|
+
parser.add_argument('-od', '--output-dir', help='Directory to store tar files (default: current directory)')
|
|
290
|
+
parser.add_argument('-ct', '--check-tarred', action='store_true', default=False, help='Skip files already tarred (tid > 0 in wfile_<dataset_name>)')
|
|
291
|
+
parser.add_argument('-tb', '--tar-batch', action='store_true', default=False, help='Tar files for each batch. If not set, dump file list to .batch files instead.')
|
|
292
|
+
parser.add_argument('-ds', '--dataset-ids', nargs='*', help='Dataset IDs to process')
|
|
293
|
+
parser.add_argument('-ht', '--db-host', type=str, default='rda-db.ucar.edu', help='Database host for tarred check')
|
|
294
|
+
parser.add_argument('-pt', '--db-port', type=int, default=5432, help='Database port for tarred check')
|
|
295
|
+
parser.add_argument('-db', '--db-name', type=str, default='rdadb', help='Database name for tarred check')
|
|
296
|
+
parser.add_argument('-us', '--db-user', type=str, default='dssdb', help='Database user for tarred check')
|
|
297
|
+
parser.add_argument('-pw', '--db-password', type=str, default=None, help='Database password for tarred check')
|
|
298
|
+
parser.add_argument('-ad', '--archive-dir', type=str, default=None, help='Directory to move tar files to after creation (optional)')
|
|
277
299
|
args = parser.parse_args()
|
|
278
300
|
output_dir = args.output_dir if args.output_dir else os.getcwd()
|
|
279
301
|
os.makedirs(output_dir, exist_ok=True)
|
|
@@ -281,11 +303,11 @@ def main():
|
|
|
281
303
|
db_params = None
|
|
282
304
|
if args.check_tarred:
|
|
283
305
|
db_params = {
|
|
284
|
-
'host': args.db_host
|
|
285
|
-
'port': args.db_port
|
|
286
|
-
'dbname': args.db_name
|
|
287
|
-
'user': args.db_user
|
|
288
|
-
'password': args.db_password
|
|
306
|
+
'host': args.db_host,
|
|
307
|
+
'port': args.db_port,
|
|
308
|
+
'dbname': args.db_name,
|
|
309
|
+
'user': args.db_user,
|
|
310
|
+
'password': args.db_password
|
|
289
311
|
}
|
|
290
312
|
# Batch tar mode
|
|
291
313
|
if args.batch_input_file or args.batch_files:
|
|
@@ -296,7 +318,7 @@ def main():
|
|
|
296
318
|
if args.batch_files:
|
|
297
319
|
batch_files.extend(args.batch_files)
|
|
298
320
|
for batch_file in batch_files:
|
|
299
|
-
tar_batch_file(batch_file, tar_root=args.tar_root)
|
|
321
|
+
tar_batch_file(batch_file, tar_root=args.tar_root, archive_dir=args.archive_dir)
|
|
300
322
|
return
|
|
301
323
|
# Directory tree processing mode
|
|
302
324
|
if args.input_file:
|
|
@@ -310,7 +332,7 @@ def main():
|
|
|
310
332
|
logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
|
|
311
333
|
batches[-2].extend(batches[-1])
|
|
312
334
|
batches.pop()
|
|
313
|
-
tar_batches_across_dirs(files, batches, output_dir, args.tar_root, dataset_paths, tar_batch=args.tar_batch, dataset_ids=None)
|
|
335
|
+
tar_batches_across_dirs(files, batches, output_dir, args.tar_root, dataset_paths, tar_batch=args.tar_batch, dataset_ids=None, archive_dir=args.archive_dir)
|
|
314
336
|
return
|
|
315
337
|
elif args.dataset_ids:
|
|
316
338
|
for dsid in args.dataset_ids:
|
|
@@ -318,7 +340,7 @@ def main():
|
|
|
318
340
|
if not os.path.isdir(dataset_path):
|
|
319
341
|
logging.warning(f"Dataset directory does not exist: {dataset_path}")
|
|
320
342
|
continue
|
|
321
|
-
process_directory_tree(dataset_path, output_dir, db_params=db_params, tar_batch=args.tar_batch, dsid=dsid)
|
|
343
|
+
process_directory_tree(dataset_path, output_dir, db_params=db_params, tar_batch=args.tar_batch, dsid=dsid, archive_dir=args.archive_dir)
|
|
322
344
|
return
|
|
323
345
|
else:
|
|
324
346
|
print("Error: Must provide either --input-file or dataset_ids or --batch-files or --batch-input-file.")
|
{rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rda_python_dsquasar
|
|
3
|
-
Version: 2.0.8
|
|
3
|
+
Version: 2.0.10
|
|
4
4
|
Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
|
|
5
5
|
Author-email: Zaihua Ji <zji@ucar.edu>
|
|
6
6
|
Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/__init__.py
RENAMED
|
File without changes
|
{rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/ds_quasar.py
RENAMED
|
File without changes
|
{rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/dsquasar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|