rda-python-dsquasar 2.0.8__tar.gz → 2.0.10__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (18)
  1. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/PKG-INFO +1 -1
  2. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/pyproject.toml +1 -1
  3. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/taccrec.py +8 -8
  4. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/tacctar.py +63 -41
  5. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/PKG-INFO +1 -1
  6. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/LICENSE +0 -0
  7. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/README.md +0 -0
  8. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/setup.cfg +0 -0
  9. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/__init__.py +0 -0
  10. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/ds_quasar.py +0 -0
  11. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/dsquasar.py +0 -0
  12. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar/dstacc.py +0 -0
  13. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/SOURCES.txt +0 -0
  14. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/dependency_links.txt +0 -0
  15. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/entry_points.txt +0 -0
  16. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/requires.txt +0 -0
  17. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/src/rda_python_dsquasar.egg-info/top_level.txt +0 -0
  18. {rda_python_dsquasar-2.0.8 → rda_python_dsquasar-2.0.10}/test/test_dsquasar.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.8
3
+ Version: 2.0.10
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rda_python_dsquasar"
7
- version = "2.0.8"
7
+ version = "2.0.10"
8
8
  authors = [
9
9
  { name="Zaihua Ji", email="zji@ucar.edu" },
10
10
  ]
@@ -144,14 +144,14 @@ def get_uid_from_logname(db_params):
144
144
 
145
145
  def main():
146
146
  parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
147
- parser.add_argument('--member-list', help='Path to tar member list file (from tar -tvf)')
148
- parser.add_argument('--member-list-file', help='File containing list of tar member list files (one per line)')
149
- parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
150
- parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
151
- parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
152
- parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
153
- parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
154
- parser.add_argument('--no-update', action='store_true', help='If tfile exists, skip all updates including wfile tables (default: False)')
147
+ parser.add_argument('-ml', '--member-list', help='Path to tar member list file (from tar -tvf)')
148
+ parser.add_argument('-mf', '--member-list-file', help='File containing list of tar member list files (one per line)')
149
+ parser.add_argument('-ht', '--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
150
+ parser.add_argument('-pt', '--db-port', default=5432, type=int, help='Database port (default: 5432)')
151
+ parser.add_argument('-db', '--db-name', default='rdadb', help='Database name (default: rdadb)')
152
+ parser.add_argument('-us', '--db-user', default='dssdb', help='Database user (default: dssdb)')
153
+ parser.add_argument('-pw', '--db-password', help='Database password (optional, use .pgpass if omitted)')
154
+ parser.add_argument('-nu', '--no-update', action='store_true', help='If tfile exists, skip all updates including wfile tables (default: False)')
155
155
  args = parser.parse_args()
156
156
  if args.member_list_file:
157
157
  with open(args.member_list_file, 'r') as f:
@@ -69,14 +69,44 @@ def group_files_by_size(files, min_size, max_size):
69
69
  batches.append(current_batch)
70
70
  return batches
71
71
 
72
- def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=None, tar_batch=True):
72
+ def write_mbr_and_archive_tar(tar_file, mbr_file, archive_dir=None):
73
73
  """
74
- If tar_batch is True, create tar files for each batch. Otherwise, dump file list to .batch files.
74
+ Write tar member list file in tar -tvf format and move tar file to archive_dir if provided.
75
75
  """
76
+ import pwd, grp, time, shutil
77
+ # Write member list file
78
+ with tarfile.open(tar_file, "r") as tar:
79
+ with open(mbr_file, "w") as mf:
80
+ for member in tar.getmembers():
81
+ mode = oct(member.mode)[-4:]
82
+ typechar = '-' if member.isfile() else 'd' if member.isdir() else 'l' if member.issym() else '?'
83
+ uname = member.uname or (pwd.getpwuid(member.uid).pw_name if hasattr(member, 'uid') else '')
84
+ gname = member.gname or (grp.getgrgid(member.gid).gr_name if hasattr(member, 'gid') else '')
85
+ size = member.size
86
+ mtime = time.strftime("%Y-%m-%d %H:%M", time.localtime(member.mtime))
87
+ mf.write(f"{typechar}{mode} {uname}/{gname} {size:9d} {mtime} {member.name}\n")
88
+ # Move tar file to archive_dir if specified
89
+ if archive_dir:
90
+ archive_dir_abs = os.path.abspath(archive_dir)
91
+ os.makedirs(archive_dir_abs, exist_ok=True)
92
+ dest_tar_file = os.path.join(archive_dir_abs, os.path.basename(tar_file))
93
+ try:
94
+ shutil.move(tar_file, dest_tar_file)
95
+ logging.info(f"Moved tar file {tar_file} to archive directory {dest_tar_file}.")
96
+ except Exception as e:
97
+ logging.error(f"Failed to move tar file {tar_file} to archive directory {archive_dir_abs}: {e}")
98
+
99
+ def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=None, tar_batch=True, archive_dir=None):
100
+ """
101
+ If tar_batch is True, create tar files for each batch and move to archive_dir if provided. Otherwise, dump file list to .batch files.
102
+ Always create a .mbr file for each tar file.
103
+ """
104
+ import shutil
76
105
  for idx, batch in enumerate(batches, 1):
77
106
  num_files = len(batch)
78
107
  tar_name = os.path.join(output_dir, f"{dataset_name}_part{idx}_{num_files}files.tar")
79
108
  batch_name = tar_name.replace(".tar", ".batch")
109
+ mbr_file = tar_name + '.mbr'
80
110
  if tar_batch:
81
111
  logging.info(f"Creating tar: {tar_name} with {num_files} files.")
82
112
  with tarfile.open(tar_name, "w") as tar:
@@ -87,6 +117,7 @@ def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=No
87
117
  tar.add(f, arcname=arcname)
88
118
  except Exception as e:
89
119
  logging.warning(f"Failed to add {f} to tar: {e}")
120
+ write_mbr_and_archive_tar(tar_name, mbr_file, archive_dir)
90
121
  else:
91
122
  logging.info(f"Writing batch file list: {batch_name} with {num_files} files.")
92
123
  with open(batch_name, "w") as bf:
@@ -95,7 +126,7 @@ def tar_batches(dirpath, batches, output_dir, dataset_path=None, dataset_name=No
95
126
  arcname = os.path.join(dataset_name, arcname)
96
127
  bf.write(arcname + "\n")
97
128
 
98
- def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=True, dsid=None):
129
+ def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=True, dsid=None, archive_dir=None):
99
130
  dataset_name = dsid if dsid else os.path.basename(os.path.abspath(dataset_path))
100
131
  wfile_table = f"dssdb.wfile_{dataset_name}"
101
132
  all_files = []
@@ -130,7 +161,7 @@ def process_directory_tree(dataset_path, output_dir, db_params=None, tar_batch=T
130
161
  if not all_files:
131
162
  return
132
163
  batches = group_files_by_size(all_files, ONE_TB, THREE_TB)
133
- tar_batches(dataset_path, batches, output_dir, dataset_path=dataset_path, dataset_name=dataset_name, tar_batch=tar_batch)
164
+ tar_batches(dataset_path, batches, output_dir, dataset_path=dataset_path, dataset_name=dataset_name, tar_batch=tar_batch, archive_dir=archive_dir)
134
165
 
135
166
  def read_directories_from_file(input_file, tar_root=None):
136
167
  dataset_ids = []
@@ -186,7 +217,8 @@ def collect_all_files(dataset_dirs, db_params=None):
186
217
  def find_common_root(paths):
187
218
  return os.path.commonpath(paths) if paths else ''
188
219
 
189
- def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths, tar_batch=True, dataset_ids=None):
220
+ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths, tar_batch=True, dataset_ids=None, archive_dir=None):
221
+ import shutil
190
222
  dataset_dir_paths = [Path(d).resolve() for d in dataset_paths]
191
223
  dsid_map = {str(Path(d).resolve()): dsid for dsid, d in zip(dataset_ids, dataset_paths)} if dataset_ids else {}
192
224
  for idx, batch in enumerate(batches, 1):
@@ -208,6 +240,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
208
240
  prefix = "_".join(sorted(batch_dataset_names)) if batch_dataset_names else "batch"
209
241
  tar_name = os.path.join(output_dir, f"{prefix}_part{idx}_{num_files}files.tar")
210
242
  batch_name = tar_name.replace(".tar", ".batch")
243
+ mbr_file = tar_name + '.mbr'
211
244
  if tar_batch:
212
245
  logging.info(f"Creating tar: {tar_name} with {num_files} files.")
213
246
  with tarfile.open(tar_name, "w") as tar:
@@ -217,6 +250,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
217
250
  tar.add(f, arcname=arcname)
218
251
  except Exception as e:
219
252
  logging.warning(f"Failed to add {f} to tar: {e}")
253
+ write_mbr_and_archive_tar(tar_name, mbr_file, archive_dir)
220
254
  else:
221
255
  logging.info(f"Writing batch file list: {batch_name} with {num_files} files.")
222
256
  with open(batch_name, "w") as bf:
@@ -227,14 +261,7 @@ def tar_batches_across_dirs(files, batches, output_dir, tar_root, dataset_paths,
227
261
  def get_batch_size(batch):
228
262
  return sum(get_file_size(f) for f in batch)
229
263
 
230
- def tar_batch_file(batch_file, tar_root=None):
231
- """
232
- Read a filelist in batch_file and tar the filelist into a tar file named like batch file name by replacing .batch with .tar.
233
- Prepend path tar_root to the file names in each batch file for the full path.
234
- After tarring, dump a member file detail list (like 'tar -tvf') into a .mbr file named as tarfilename+'.mbr'.
235
- If the .mbr file already exists, skip the tar action.
236
- """
237
- import pwd, grp, time
264
+ def tar_batch_file(batch_file, tar_root=None, archive_dir=None):
238
265
  tar_file = batch_file.replace('.batch', '.tar')
239
266
  mbr_file = tar_file + '.mbr'
240
267
  if os.path.exists(mbr_file):
@@ -250,30 +277,25 @@ def tar_batch_file(batch_file, tar_root=None):
250
277
  tar.add(abs_path, arcname=rel_path)
251
278
  except Exception as e:
252
279
  logging.warning(f"Failed to add {abs_path} as {rel_path} to tar: {e}")
253
- # Write member list file in tar -tvf format
254
- with tarfile.open(tar_file, "r") as tar:
255
- with open(mbr_file, "w") as mf:
256
- for member in tar.getmembers():
257
- mode = oct(member.mode)[-4:]
258
- typechar = '-' if member.isfile() else 'd' if member.isdir() else 'l' if member.issym() else '?'
259
- uname = member.uname or (pwd.getpwuid(member.uid).pw_name if hasattr(member, 'uid') else '')
260
- gname = member.gname or (grp.getgrgid(member.gid).gr_name if hasattr(member, 'gid') else '')
261
- size = member.size
262
- mtime = time.strftime("%Y-%m-%d %H:%M", time.localtime(member.mtime))
263
- # Format: -rw-r--r-- user/group size date name
264
- mf.write(f"{typechar}{mode} {uname}/{gname} {size:9d} {mtime} {member.name}\n")
280
+ write_mbr_and_archive_tar(tar_file, mbr_file, archive_dir)
265
281
 
266
282
  def main():
267
283
  import argparse
268
284
  parser = argparse.ArgumentParser(description="Tar files from a list of dataset IDs into 1-3TB tar files.")
269
- parser.add_argument('--batch-input-file', type=str, default=None, help='A file containing a list of batch file names, one per line. Each batch file should contain relative file names to be tarred.')
270
- parser.add_argument('--batch-files', nargs='*', default=None, help='List of batch files to tar. Each batch file should contain relative file names to be tarred.')
271
- parser.add_argument('--tar-root', type=str, required=True, help='Root directory for relative tar member file names (arcname). REQUIRED for all modes.')
272
- parser.add_argument("--input-file", help="File containing list of dataset IDs to process (one per line)")
273
- parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
274
- parser.add_argument('--check-tarred', action='store_true', default=False, help='Skip files already tarred (tid > 0 in wfile_<dataset_name>)')
275
- parser.add_argument('--tar-batch', action='store_true', default=False, help='Tar files for each batch. If not set, dump file list to .batch files instead.')
276
- parser.add_argument("dataset_ids", nargs='*', help="Dataset IDs to process")
285
+ parser.add_argument('-bi', '--batch-input-file', type=str, default=None, help='A file containing a list of batch file names, one per line. Each batch file should contain relative file names to be tarred.')
286
+ parser.add_argument('-bf', '--batch-files', nargs='*', default=None, help='List of batch files to tar. Each batch file should contain relative file names to be tarred.')
287
+ parser.add_argument('-tr', '--tar-root', type=str, required=True, help='Root directory for relative tar member file names (arcname). REQUIRED for all modes.')
288
+ parser.add_argument('-if', '--input-file', help='File containing list of dataset IDs to process (one per line)')
289
+ parser.add_argument('-od', '--output-dir', help='Directory to store tar files (default: current directory)')
290
+ parser.add_argument('-ct', '--check-tarred', action='store_true', default=False, help='Skip files already tarred (tid > 0 in wfile_<dataset_name>)')
291
+ parser.add_argument('-tb', '--tar-batch', action='store_true', default=False, help='Tar files for each batch. If not set, dump file list to .batch files instead.')
292
+ parser.add_argument('-ds', '--dataset-ids', nargs='*', help='Dataset IDs to process')
293
+ parser.add_argument('-ht', '--db-host', type=str, default='rda-db.ucar.edu', help='Database host for tarred check')
294
+ parser.add_argument('-pt', '--db-port', type=int, default=5432, help='Database port for tarred check')
295
+ parser.add_argument('-db', '--db-name', type=str, default='rdadb', help='Database name for tarred check')
296
+ parser.add_argument('-us', '--db-user', type=str, default='dssdb', help='Database user for tarred check')
297
+ parser.add_argument('-pw', '--db-password', type=str, default=None, help='Database password for tarred check')
298
+ parser.add_argument('-ad', '--archive-dir', type=str, default=None, help='Directory to move tar files to after creation (optional)')
277
299
  args = parser.parse_args()
278
300
  output_dir = args.output_dir if args.output_dir else os.getcwd()
279
301
  os.makedirs(output_dir, exist_ok=True)
@@ -281,11 +303,11 @@ def main():
281
303
  db_params = None
282
304
  if args.check_tarred:
283
305
  db_params = {
284
- 'host': args.db_host if hasattr(args, 'db_host') else 'rda-db.ucar.edu',
285
- 'port': args.db_port if hasattr(args, 'db_port') else 5432,
286
- 'dbname': args.db_name if hasattr(args, 'db_name') else 'rdadb',
287
- 'user': args.db_user if hasattr(args, 'db_user') else 'dssdb',
288
- 'password': args.db_password if hasattr(args, 'db_password') else None
306
+ 'host': args.db_host,
307
+ 'port': args.db_port,
308
+ 'dbname': args.db_name,
309
+ 'user': args.db_user,
310
+ 'password': args.db_password
289
311
  }
290
312
  # Batch tar mode
291
313
  if args.batch_input_file or args.batch_files:
@@ -296,7 +318,7 @@ def main():
296
318
  if args.batch_files:
297
319
  batch_files.extend(args.batch_files)
298
320
  for batch_file in batch_files:
299
- tar_batch_file(batch_file, tar_root=args.tar_root)
321
+ tar_batch_file(batch_file, tar_root=args.tar_root, archive_dir=args.archive_dir)
300
322
  return
301
323
  # Directory tree processing mode
302
324
  if args.input_file:
@@ -310,7 +332,7 @@ def main():
310
332
  logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
311
333
  batches[-2].extend(batches[-1])
312
334
  batches.pop()
313
- tar_batches_across_dirs(files, batches, output_dir, args.tar_root, dataset_paths, tar_batch=args.tar_batch, dataset_ids=None)
335
+ tar_batches_across_dirs(files, batches, output_dir, args.tar_root, dataset_paths, tar_batch=args.tar_batch, dataset_ids=None, archive_dir=args.archive_dir)
314
336
  return
315
337
  elif args.dataset_ids:
316
338
  for dsid in args.dataset_ids:
@@ -318,7 +340,7 @@ def main():
318
340
  if not os.path.isdir(dataset_path):
319
341
  logging.warning(f"Dataset directory does not exist: {dataset_path}")
320
342
  continue
321
- process_directory_tree(dataset_path, output_dir, db_params=db_params, tar_batch=args.tar_batch, dsid=dsid)
343
+ process_directory_tree(dataset_path, output_dir, db_params=db_params, tar_batch=args.tar_batch, dsid=dsid, archive_dir=args.archive_dir)
322
344
  return
323
345
  else:
324
346
  print("Error: Must provide either --input-file or dataset_ids or --batch-files or --batch-input-file.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.8
3
+ Version: 2.0.10
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar