rda-python-dsquasar 2.0.4__tar.gz → 2.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20) hide show
  1. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/PKG-INFO +1 -1
  2. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/pyproject.toml +1 -1
  3. rda_python_dsquasar-2.0.6/src/rda_python_dsquasar/taccrec.py +251 -0
  4. rda_python_dsquasar-2.0.6/src/rda_python_dsquasar/tacctar.py +191 -0
  5. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/PKG-INFO +1 -1
  6. rda_python_dsquasar-2.0.4/src/rda_python_dsquasar/taccrec.py +0 -152
  7. rda_python_dsquasar-2.0.4/src/rda_python_dsquasar/tacctar.py +0 -379
  8. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/LICENSE +0 -0
  9. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/README.md +0 -0
  10. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/setup.cfg +0 -0
  11. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar/__init__.py +0 -0
  12. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar/ds_quasar.py +0 -0
  13. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar/dsquasar.py +0 -0
  14. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar/dstacc.py +0 -0
  15. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/SOURCES.txt +0 -0
  16. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/dependency_links.txt +0 -0
  17. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/entry_points.txt +0 -0
  18. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/requires.txt +0 -0
  19. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/src/rda_python_dsquasar.egg-info/top_level.txt +0 -0
  20. {rda_python_dsquasar-2.0.4 → rda_python_dsquasar-2.0.6}/test/test_dsquasar.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.4
3
+ Version: 2.0.6
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rda_python_dsquasar"
7
- version = "2.0.4"
7
+ version = "2.0.6"
8
8
  authors = [
9
9
  { name="Zaihua Ji", email="zji@ucar.edu" },
10
10
  ]
@@ -0,0 +1,251 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import tarfile
4
+ import argparse
5
+ import psycopg2
6
+ import hashlib
7
+ from datetime import datetime
8
+
9
def get_tar_summary_and_details(member_list_path):
    """Parse a tar member listing (output of ``tar -tvf``) into a tfile summary.

    Args:
        member_list_path: Path to a text file holding one ``tar -tvf`` line
            per tar member.  The tar file name is inferred from this path by
            stripping a trailing ``.mbr`` suffix.

    Returns:
        A ``(summary, member_details)`` tuple.  ``summary`` is a dict with the
        dssdb.tfile column values (tfile, data_size, wcount, created/modified
        date and time, note, dsids, dsid); ``member_details`` is a list of
        ``{'name': <member path>}`` dicts.  Returns ``(None, None)`` when
        *member_list_path* is missing or not a regular file.
    """
    member_details = []
    root_dirs = set()
    dsid = None
    file_count = 0
    tar_size = 0
    if not (member_list_path and os.path.isfile(member_list_path)):
        print('Error: member_list_path is required and must point to a valid file.')
        return None, None
    with open(member_list_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Only count regular, owner-writable file entries; headers,
            # directories and other entry types are skipped.
            # (Previously written as "startswith('-rw') is False".)
            if not line or not line.startswith('-rw'):
                continue
            parts = line.split()
            if len(parts) < 6:
                continue
            size = int(parts[2])
            # Member names may contain blanks: rejoin everything after the
            # permission/owner/size/date/time columns.
            name = ' '.join(parts[5:])
            # Ignore member directory names ending with '/'
            if name.endswith('/'):
                continue
            tar_size += size
            file_count += 1
            root = name.split('/')[0] if '/' in name else name
            root_dirs.add(root)
            if dsid is None:
                # dsid is the root directory of the FIRST counted member.
                # The old code keyed this off the raw line index, which left
                # dsid unset whenever line 0 was not a member entry.  It also
                # shadowed the module-level `datetime` with a late local
                # import, so the per-member strptime call always raised
                # UnboundLocalError (silently swallowed) — that dead mtime
                # computation has been removed along with the unused
                # `details` list.
                dsid = root
            member_details.append({'name': name})
    now = datetime.now()
    note = '\n'.join(m['name'] for m in member_details)
    dsids = ','.join(sorted(root_dirs))
    # Infer the tar file name from the member list path; strip only a
    # trailing '.mbr' (replace() would also remove '.mbr' mid-name).
    tar_file_name = os.path.basename(member_list_path)
    if tar_file_name.endswith('.mbr'):
        tar_file_name = tar_file_name[:-len('.mbr')]
    return {
        'tfile': tar_file_name,
        'data_size': tar_size,
        'wcount': file_count,
        'date_created': now.date(),
        'time_created': now.time(),
        'date_modified': now.date(),
        'time_modified': now.time(),
        'note': note,
        'dsids': dsids,
        'dsid': dsid
    }, member_details
70
+
71
def insert_tfile_row(summary, db_params, extra, update_on_conflict=False, member_details=None):
    """Insert (or upsert) one tar-file summary row into dssdb.tfile.

    Args:
        summary: Dict of column values from get_tar_summary_and_details
            (tfile, data_size, wcount, created/modified date and time).
        db_params: Keyword arguments passed straight to psycopg2.connect().
        extra: Dict supplying uid, dsid, data_format, disp_order, dsids, note.
        update_on_conflict: When True, an existing row with the same tfile is
            updated in place via ON CONFLICT (tfile) DO UPDATE.
        member_details: Optional list of {'name': ...} member dicts; when
            given, each member's per-dataset wfile table gets its tid column
            pointed at the inserted tfile row.

    Raises:
        Exception: any database error is printed, rolled back, and re-raised.
    """
    table_name = 'dssdb.tfile'
    conn = psycopg2.connect(**db_params)
    cur = conn.cursor()
    # Column order here must match the values list below one-for-one.
    columns = [
        'tfile', 'data_size', 'wcount', 'date_created', 'time_created',
        'date_modified', 'time_modified', 'file_format', 'status',
        'uid', 'dsid', 'data_format', 'disp_order', 'dsids', 'note'
    ]
    values = [
        summary['tfile'], summary['data_size'], summary['wcount'],
        summary['date_created'], summary['time_created'],
        summary['date_modified'], summary['time_modified'],
        'tar', 'T',
        extra.get('uid'), extra.get('dsid'), extra.get('data_format'),
        extra.get('disp_order'), extra.get('dsids'), extra.get('note')
    ]
    placeholders = ','.join(['%s'] * len(columns))
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
    if update_on_conflict:
        # Upsert: overwrite every column except the conflict key itself.
        update_cols = [col for col in columns if col != 'tfile']
        set_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in update_cols])
        sql += f" ON CONFLICT (tfile) DO UPDATE SET {set_clause}"
    try:
        cur.execute(sql, values)
        conn.commit()
        # Retrieve tid for the just-inserted tfile row
        cur.execute(f"SELECT tid FROM {table_name} WHERE tfile=%s", (summary['tfile'],))
        row = cur.fetchone()
        tid = row[0] if row else None
        # Update wfile tables if member_details provided
        if member_details and tid is not None:
            for m in member_details:
                name = m['name']
                # Member paths look like '<dsid>/<relative wfile path>'.
                if '/' in name:
                    cdsid, wfile = name.split('/', 1)
                else:
                    cdsid, wfile = name, ''
                # NOTE(review): cdsid comes from the member-list file contents
                # and is interpolated into the table name unescaped — assumes
                # member lists are trusted, dsid-like names; consider
                # psycopg2.sql.Identifier if that ever changes.
                wfile_table = f"dssdb.wfile_{cdsid}"
                # Check if table exists
                cur.execute("SELECT to_regclass(%s)", (wfile_table,))
                exists = cur.fetchone()[0]
                if not exists:
                    continue
                # Update tid if record exists
                cur.execute(f"UPDATE {wfile_table} SET tid=%s WHERE wfile=%s", (tid, wfile))
            conn.commit()
    except Exception as e:
        print(f"Database error: {e}")
        conn.rollback()
        raise
    finally:
        # Always release cursor and connection, even after a raise.
        cur.close()
        conn.close()
125
+
126
def get_uid_from_logname(db_params):
    """Look up the dssdb user number (userno) for the current login name.

    The login name is taken from os.getlogin() when a controlling terminal
    exists, otherwise from $USER or getpass.

    Raises:
        ValueError: if the login name has no row in dssdb.dssgrp.
    """
    import getpass
    try:
        login = os.getlogin()
    except Exception:
        # No controlling terminal (cron, daemon, etc.) — fall back to the
        # environment, then to getpass.
        login = os.environ.get('USER') or getpass.getuser()
    conn = psycopg2.connect(**db_params)
    cur = conn.cursor()
    # Ensure logname is quoted as a string in the query
    cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(login),))
    record = cur.fetchone()
    cur.close()
    conn.close()
    if not record:
        raise ValueError(f"User logname '{login}' not found in dssdb.dssgrp table.")
    return record[0]
144
+
145
def _check_tfile_exists(tfile, db_params):
    """Return True if *tfile* already has a row in dssdb.tfile."""
    conn = psycopg2.connect(**db_params)
    try:
        cur = conn.cursor()
        cur.execute("SELECT 1 FROM dssdb.tfile WHERE tfile=%s LIMIT 1", (tfile,))
        exists = cur.fetchone() is not None
        cur.close()
    finally:
        conn.close()
    return exists


def _process_member_list(member_list_path, db_params, no_update):
    """Summarize one member list and insert/update its dssdb.tfile row.

    Prints a diagnostic and returns early on any failure so a batch caller
    can continue with the next member list.  When *no_update* is set and the
    tfile row already exists, nothing is touched (wfile tables included).
    """
    summary, member_details = get_tar_summary_and_details(member_list_path)
    if summary is None:
        return
    # Check if tfile exists if --no-update is set
    if no_update:
        try:
            if _check_tfile_exists(summary['tfile'], db_params):
                print(f"tfile '{summary['tfile']}' already exists in dssdb.tfile. Skipping all updates.")
                return
        except Exception as e:
            print(f"Database error during tfile existence check: {e}")
            return
    try:
        uid = get_uid_from_logname(db_params)
    except Exception as e:
        print(f"Error getting uid from dssgrp: {e}")
        return
    extra = {
        'uid': uid,
        'dsid': summary['dsid'],
        'data_format': '',
        'disp_order': 0,
        'dsids': summary['dsids'],
        'note': summary['note']
    }
    insert_tfile_row(summary, db_params, extra, update_on_conflict=True,
                     member_details=member_details if not no_update else None)
    print(f"Inserted tar summary for {summary['tfile']} into tfile.")


def main():
    """CLI entry point: record tar member-list summaries in dssdb.tfile.

    The ~40-line per-member-list body was previously duplicated verbatim for
    the --member-list and --member-list-file paths; it now lives once in
    _process_member_list, and the loop-invariant db_params dict is built once
    instead of per iteration.
    """
    parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
    parser.add_argument('--member-list', help='Path to tar member list file (from tar -tvf)')
    parser.add_argument('--member-list-file', help='File containing list of tar member list files (one per line)')
    parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
    parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
    parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
    parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
    parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
    parser.add_argument('--no-update', action='store_true', help='If tfile exists, skip all updates including wfile tables (default: False)')
    args = parser.parse_args()
    db_params = {
        'host': args.db_host,
        'port': args.db_port,
        'dbname': args.db_name,
        'user': args.db_user
    }
    if args.db_password:
        db_params['password'] = args.db_password
    if args.member_list_file:
        with open(args.member_list_file, 'r') as f:
            for line in f:
                member_list_path = line.strip()
                if not member_list_path or not os.path.isfile(member_list_path):
                    print(f"Error: member list file '{member_list_path}' is invalid or does not exist.")
                    continue
                _process_member_list(member_list_path, db_params, args.no_update)
        return
    if not args.member_list or not os.path.isfile(args.member_list):
        print('Error: --member-list argument is required and must point to a valid file.')
        return
    _process_member_list(args.member_list, db_params, args.no_update)


if __name__ == '__main__':
    main()
@@ -0,0 +1,191 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import tarfile
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ # Constants for size limits
8
+ ONE_TB = 1_099_511_627_776 # 1TB in bytes
9
+ THREE_TB = 3_298_534_883_328 # 3TB in bytes
10
+
11
def setup_logging(output_dir):
    """Configure the root logger to write to the console and to a dated
    ``tarlog_YYYYMMDD.log`` file inside *output_dir*."""
    import datetime
    stamp = datetime.datetime.now().strftime('%Y%m%d')
    log_path = os.path.join(output_dir, f"tarlog_{stamp}.log")
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    # Drop handlers left over from earlier calls so records are not duplicated.
    while root.handlers:
        root.removeHandler(root.handlers[0])
    fmt = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
    # File handler first, then console — both at INFO with the same format.
    for handler in (logging.FileHandler(log_path), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(fmt)
        root.addHandler(handler)
    logging.info(f"Logging to file: {log_path}")
31
+
32
def get_file_size(path):
    """Return the size of *path* in bytes; log a warning and return 0 if it
    cannot be stat'ed (missing file, permission error, ...)."""
    try:
        size = os.path.getsize(path)
    except Exception as exc:
        logging.warning(f"Could not get size for {path}: {exc}")
        return 0
    return size
38
+
39
def group_files_by_size(files, min_size, max_size):
    """
    Group files into batches where each batch's total size is between min_size and max_size.
    Returns a list of lists of file paths.
    """
    grouped = []
    pending = []
    pending_size = 0
    for path in files:
        size = get_file_size(path)
        if size > max_size:
            logging.warning(f"File {path} is larger than max tar size ({max_size} bytes), skipping.")
            continue
        if pending_size + size <= max_size:
            # Still room in the current batch.
            pending.append(path)
            pending_size += size
            continue
        if pending_size >= min_size:
            # Current batch is already big enough: close it, start fresh.
            grouped.append(pending)
            pending = [path]
            pending_size = size
        else:
            # If adding this file exceeds max_size but current batch is too small, force add
            pending.append(path)
            grouped.append(pending)
            pending = []
            pending_size = 0
    if pending:
        grouped.append(pending)
    return grouped
70
+
71
def tar_batches(dirpath, batches, output_dir, root_path=None, root_dirname=None):
    """Write each batch of files to its own tar file under *output_dir*.

    Tar file names and member arcnames are both anchored at *root_dirname*,
    so extracting an archive recreates the original top-level directory.
    """
    for batch_no, file_group in enumerate(batches, 1):
        count = len(file_group)
        tar_name = os.path.join(output_dir, f"{root_dirname}_part{batch_no}_{count}files.tar")
        logging.info(f"Creating tar: {tar_name} with {count} files.")
        with tarfile.open(tar_name, "w") as archive:
            for path in file_group:
                try:
                    # arcname should be relative to root_path, and always start with root_dirname
                    member = os.path.join(root_dirname, os.path.relpath(path, root_path))
                    archive.add(path, arcname=member)
                except Exception as exc:
                    logging.warning(f"Failed to add {path} to tar: {exc}")
86
+
87
def process_directory_tree(root_path, output_dir):
    """Collect every file under *root_path* and tar them in 1-3TB batches,
    naming the archives after the tree's top-level directory."""
    top_name = os.path.basename(os.path.abspath(root_path))
    # Walk the whole tree up front so batching sees every file at once.
    collected = []
    for folder, _subdirs, names in os.walk(root_path):
        collected.extend(os.path.join(folder, n) for n in names)
    if not collected:
        return
    batches = group_files_by_size(collected, ONE_TB, THREE_TB)
    tar_batches(root_path, batches, output_dir, root_path=root_path, root_dirname=top_name)
98
+
99
def read_directories_from_file(input_file):
    """Read directory paths from *input_file*, one per line.

    Blank lines and '#' comment lines are ignored; paths that are not
    existing directories are logged and skipped.  Returns absolute paths.
    """
    found = []
    with open(input_file, 'r') as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry or entry.startswith('#'):
                continue
            if not os.path.isdir(entry):
                logging.warning(f"Directory does not exist: {entry}")
                continue
            found.append(os.path.abspath(entry))
    return found
111
+
112
def collect_all_files(directories):
    """Return every file path found by walking each directory in *directories*."""
    found = []
    for top in directories:
        for folder, _subdirs, names in os.walk(top):
            found.extend(os.path.join(folder, n) for n in names)
    return found
120
+
121
def find_common_root(paths):
    """Return the deepest common path of *paths*, or '' for an empty list."""
    if not paths:
        return ''
    return os.path.commonpath(paths)
123
+
124
def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs):
    """Tar batches of files that may come from several root directories.

    Each tar is named after the sorted set of root-directory names whose
    files appear in that batch; member arcnames are kept relative to
    *common_root* so the directory layout is preserved on extraction.

    Args:
        files: Unused; kept for interface compatibility with existing callers.
        batches: List of lists of file paths (see group_files_by_size).
        output_dir: Directory that receives the .tar files.
        common_root: Path member arcnames are made relative to.
        root_dirs: The top-level directories the files were collected from.
    """
    root_dir_paths = [Path(d).resolve() for d in root_dirs]
    for idx, batch in enumerate(batches, 1):
        # Find which root directories are represented in this batch
        batch_root_names = set()
        for f in batch:
            f_path = Path(f).resolve()
            for root_path in root_dir_paths:
                try:
                    # Compare with a trailing separator: a bare startswith
                    # would wrongly match file '/data/ds1x/f' against root
                    # '/data/ds1'.
                    if f_path == root_path or str(f_path).startswith(str(root_path) + os.sep):
                        batch_root_names.add(root_path.name)
                        break
                except Exception:
                    continue
        prefix = "_".join(sorted(batch_root_names)) if batch_root_names else "batch"
        num_files = len(batch)
        tar_name = os.path.join(output_dir, f"{prefix}_batch_part{idx}_{num_files}files.tar")
        logging.info(f"Creating tar: {tar_name} with {num_files} files.")
        with tarfile.open(tar_name, "w") as tar:
            for f in batch:
                try:
                    # Preserve relative path from common root in tar
                    arcname = os.path.relpath(f, common_root)
                    tar.add(f, arcname=arcname)
                except Exception as e:
                    logging.warning(f"Failed to add {f} to tar: {e}")
150
+
151
def get_batch_size(batch):
    """Return the total on-disk size in bytes of all files in *batch*."""
    total = 0
    for path in batch:
        total += get_file_size(path)
    return total
153
+
154
def process_from_file(input_file, output_dir):
    """Tar all files found under the directories listed in *input_file*.

    Files are grouped into 1-3TB batches; an undersized trailing batch is
    merged into the one before it so no tiny archive is produced.
    """
    directories = read_directories_from_file(input_file)
    files = collect_all_files(directories)
    if not files:
        logging.info("No files found in provided directories.")
        return
    batches = group_files_by_size(files, ONE_TB, THREE_TB)
    # If last batch is less than 1TB, append it to previous batch
    if len(batches) > 1:
        tail_size = get_batch_size(batches[-1])
        if tail_size < ONE_TB:
            logging.info(f"Last batch size ({tail_size} bytes) < 1TB, appending to previous batch.")
            batches[-2].extend(batches[-1])
            batches.pop()
    tar_batches_across_dirs(files, batches, output_dir, find_common_root(directories), directories)
168
+
169
def main():
    """CLI entry point: build 1-3TB tar files from directory trees, either
    from an --input-file listing or from positional root directories."""
    import argparse
    parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files.")
    parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
    parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
    parser.add_argument("root_dir", nargs='*', help="Root directory or directories to process")
    args = parser.parse_args()
    output_dir = args.output_dir or os.getcwd()
    os.makedirs(output_dir, exist_ok=True)
    setup_logging(output_dir)
    if args.input_file:
        process_from_file(args.input_file, output_dir)
        return
    if not args.root_dir:
        print("Error: Must provide either --input-file or root_dir.")
        return
    for root in args.root_dir:
        if os.path.isdir(root):
            process_directory_tree(root, output_dir)
        else:
            logging.warning(f"Root directory does not exist: {root}")


if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.4
3
+ Version: 2.0.6
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
@@ -1,152 +0,0 @@
1
- import os
2
- import tarfile
3
- import argparse
4
- import psycopg2
5
- import hashlib
6
- from datetime import datetime
7
-
8
- def compute_md5(file_path, chunk_size=8192):
9
- md5 = hashlib.md5()
10
- with open(file_path, 'rb') as f:
11
- while True:
12
- chunk = f.read(chunk_size)
13
- if not chunk:
14
- break
15
- md5.update(chunk)
16
- return md5.hexdigest()
17
-
18
- def get_tar_summary_and_details(tar_path):
19
- with tarfile.open(tar_path, 'r') as tar:
20
- member_files = [m for m in tar.getmembers() if m.isfile()]
21
- file_count = len(member_files)
22
- # Collect details for note
23
- details = []
24
- root_dirs = set()
25
- dsid = None
26
- for idx, m in enumerate(member_files):
27
- # Extract root directory (first part of the path)
28
- parts = m.name.split('/')
29
- if len(parts) > 1:
30
- root_dirs.add(parts[0])
31
- if idx == 0:
32
- dsid = parts[0]
33
- elif len(parts) == 1:
34
- root_dirs.add(parts[0])
35
- if idx == 0:
36
- dsid = parts[0]
37
- # Collect file details
38
- details.append(f"name={m.name};size={m.size};mtime={m.mtime};type={m.type};mode={m.mode};uid={m.uid};gid={m.gid};uname={m.uname};gname={m.gname}")
39
- note = '\n'.join(details)
40
- dsids = ','.join(sorted(root_dirs))
41
- tar_stat = os.stat(tar_path)
42
- tar_size = tar_stat.st_size
43
- ctime = datetime.fromtimestamp(tar_stat.st_ctime)
44
- mtime = datetime.fromtimestamp(tar_stat.st_mtime)
45
- return {
46
- 'tfile': os.path.basename(tar_path),
47
- 'data_size': tar_size,
48
- 'wcount': file_count,
49
- 'date_created': ctime.date(),
50
- 'time_created': ctime.time(),
51
- 'date_modified': mtime.date(),
52
- 'time_modified': mtime.time(),
53
- 'note': note,
54
- 'dsids': dsids,
55
- 'dsid': dsid
56
- }
57
-
58
- def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
59
- table_name = 'dssdb.tfile'
60
- conn = psycopg2.connect(**db_params)
61
- cur = conn.cursor()
62
- columns = [
63
- 'tfile', 'data_size', 'wcount', 'date_created', 'time_created',
64
- 'date_modified', 'time_modified', 'file_format', 'checksum', 'status',
65
- 'uid', 'dsid', 'data_format', 'disp_order', 'dsids', 'note'
66
- ]
67
- values = [
68
- summary['tfile'], summary['data_size'], summary['wcount'],
69
- summary['date_created'], summary['time_created'],
70
- summary['date_modified'], summary['time_modified'],
71
- 'tar', checksum, 'T',
72
- extra.get('uid'), extra.get('dsid'), extra.get('data_format'),
73
- extra.get('disp_order'), extra.get('dsids'), extra.get('note')
74
- ]
75
- placeholders = ','.join(['%s'] * len(columns))
76
- sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
77
- if update_on_conflict:
78
- update_cols = [col for col in columns if col != 'tfile']
79
- set_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in update_cols])
80
- sql += f" ON CONFLICT (tfile) DO UPDATE SET {set_clause}"
81
- try:
82
- cur.execute(sql, values)
83
- conn.commit()
84
- except Exception as e:
85
- print(f"Database error: {e}")
86
- conn.rollback()
87
- raise
88
- finally:
89
- cur.close()
90
- conn.close()
91
-
92
- def get_uid_from_logname(db_params):
93
- import getpass
94
- logname = None
95
- try:
96
- logname = os.getlogin()
97
- except Exception:
98
- logname = os.environ.get('USER') or getpass.getuser()
99
- conn = psycopg2.connect(**db_params)
100
- cur = conn.cursor()
101
- # Ensure logname is quoted as a string in the query
102
- cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
103
- row = cur.fetchone()
104
- cur.close()
105
- conn.close()
106
- if row:
107
- return row[0]
108
- else:
109
- raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
110
-
111
- def main():
112
- parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
113
- parser.add_argument('tarfile', help='Path to the tar file')
114
- parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
115
- parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
116
- parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
117
- parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
118
- parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
119
- parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
120
- args = parser.parse_args()
121
- tar_path = args.tarfile
122
- if not os.path.isfile(tar_path):
123
- print(f"Tar file not found: {tar_path}")
124
- return
125
- summary = get_tar_summary_and_details(tar_path)
126
- checksum = compute_md5(tar_path)
127
- db_params = {
128
- 'host': args.db_host,
129
- 'port': args.db_port,
130
- 'dbname': args.db_name,
131
- 'user': args.db_user
132
- }
133
- if args.db_password:
134
- db_params['password'] = args.db_password
135
- try:
136
- uid = get_uid_from_logname(db_params)
137
- except Exception as e:
138
- print(f"Error getting uid from dssgrp: {e}")
139
- return
140
- extra = {
141
- 'uid': uid,
142
- 'dsid': summary['dsid'],
143
- 'data_format': '',
144
- 'disp_order': 0,
145
- 'dsids': summary['dsids'],
146
- 'note': summary['note']
147
- }
148
- insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=args.update)
149
- print(f"Inserted tar summary for {tar_path} into tfile.")
150
-
151
- if __name__ == '__main__':
152
- main()
@@ -1,379 +0,0 @@
1
- import os
2
- import tarfile
3
- import logging
4
- from pathlib import Path
5
- import psycopg2
6
- import hashlib
7
- from datetime import datetime
8
- import getpass
9
-
10
- # Constants for size limits
11
- ONE_TB = 1_099_511_627_776 # 1TB in bytes
12
- THREE_TB = 3_298_534_883_328 # 3TB in bytes
13
-
14
- def setup_logging(output_dir):
15
- import datetime
16
- log_filename = f"tarlog_{datetime.datetime.now().strftime('%Y%m%d')}.log"
17
- log_path = os.path.join(output_dir, log_filename)
18
- logger = logging.getLogger()
19
- logger.setLevel(logging.INFO)
20
- # Remove any existing handlers
21
- for handler in logger.handlers[:]:
22
- logger.removeHandler(handler)
23
- # File handler
24
- fh = logging.FileHandler(log_path)
25
- fh.setLevel(logging.INFO)
26
- fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
27
- # Console handler
28
- ch = logging.StreamHandler()
29
- ch.setLevel(logging.INFO)
30
- ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
31
- logger.addHandler(fh)
32
- logger.addHandler(ch)
33
- logging.info(f"Logging to file: {log_path}")
34
-
35
- def get_file_size(path):
36
- try:
37
- return os.path.getsize(path)
38
- except Exception as e:
39
- logging.warning(f"Could not get size for {path}: {e}")
40
- return 0
41
-
42
- def group_files_by_size(files, min_size, max_size):
43
- """
44
- Group files into batches where each batch's total size is between min_size and max_size.
45
- Returns a list of lists of file paths.
46
- """
47
- batches = []
48
- current_batch = []
49
- current_size = 0
50
- for f in files:
51
- fsize = get_file_size(f)
52
- if fsize > max_size:
53
- logging.warning(f"File {f} is larger than max tar size ({max_size} bytes), skipping.")
54
- continue
55
- if current_size + fsize > max_size:
56
- if current_size >= min_size:
57
- batches.append(current_batch)
58
- current_batch = [f]
59
- current_size = fsize
60
- else:
61
- # If adding this file exceeds max_size but current batch is too small, force add
62
- current_batch.append(f)
63
- current_size += fsize
64
- batches.append(current_batch)
65
- current_batch = []
66
- current_size = 0
67
- else:
68
- current_batch.append(f)
69
- current_size += fsize
70
- if current_batch:
71
- batches.append(current_batch)
72
- return batches
73
-
74
- def tar_batches(dirpath, batches, output_dir):
75
- src_dir_name = Path(dirpath).name
76
- for idx, batch in enumerate(batches, 1):
77
- tar_name = os.path.join(output_dir, f"{src_dir_name}_part{idx}.tar")
78
- logging.info(f"Creating tar: {tar_name} with {len(batch)} files.")
79
- with tarfile.open(tar_name, "w") as tar:
80
- for f in batch:
81
- try:
82
- tar.add(f, arcname=os.path.relpath(f, dirpath))
83
- except Exception as e:
84
- logging.warning(f"Failed to add {f} to tar: {e}")
85
-
86
- def process_directory_tree(root_dir, output_dir, db_params=None, update_on_conflict=False):
87
- for dirpath, dirnames, filenames in os.walk(root_dir):
88
- abs_files = [os.path.join(dirpath, f) for f in filenames]
89
- if not abs_files:
90
- continue
91
- batches = group_files_by_size(abs_files, ONE_TB, THREE_TB)
92
- # Use tar_batches_across_dirs for consistency and DB recording
93
- common_root = root_dir
94
- tar_batches_across_dirs(abs_files, batches, output_dir, common_root, [root_dir], db_params=db_params, update_on_conflict=update_on_conflict)
95
-
96
- def read_directories_from_file(input_file):
97
- dirs = []
98
- with open(input_file, 'r') as f:
99
- for line in f:
100
- line = line.strip()
101
- if not line or line.startswith('#'):
102
- continue
103
- if os.path.isdir(line):
104
- dirs.append(os.path.abspath(line))
105
- else:
106
- logging.warning(f"Directory does not exist: {line}")
107
- return dirs
108
-
109
- def collect_all_files(directories):
110
- files = []
111
- for d in directories:
112
- for dirpath, dirnames, filenames in os.walk(d):
113
- for fname in filenames:
114
- fpath = os.path.join(dirpath, fname)
115
- files.append(fpath)
116
- return files
117
-
118
- def find_common_root(paths):
119
- return os.path.commonpath(paths) if paths else ''
120
-
121
- def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs, db_params=None, update_on_conflict=False):
122
- root_dir_paths = [Path(d).resolve() for d in root_dirs]
123
- uid = None
124
- if db_params:
125
- try:
126
- uid = get_uid_from_logname(db_params)
127
- except Exception as e:
128
- logging.error(f"Could not get uid from dssgrp: {e}")
129
- uid = 9
130
- for idx, batch in enumerate(batches, 1):
131
- batch_root_names = set()
132
- member_infos = []
133
- dsid = None
134
- for file_idx, f in enumerate(batch):
135
- f_path = Path(f).resolve()
136
- for root_path in root_dir_paths:
137
- try:
138
- if str(f_path).startswith(str(root_path)):
139
- batch_root_names.add(root_path.name)
140
- break
141
- except Exception:
142
- continue
143
- # Gather member file info for note/dsid/dsids
144
- try:
145
- stat = os.stat(f)
146
- arcname = os.path.relpath(f, common_root)
147
- parts = arcname.split('/')
148
- root_dir = parts[0] if parts else ''
149
- if file_idx == 0:
150
- dsid = root_dir
151
- member_infos.append({
152
- 'name': arcname,
153
- 'size': stat.st_size,
154
- 'mtime': int(stat.st_mtime),
155
- 'type': '0', # regular file
156
- 'mode': stat.st_mode,
157
- 'uid': stat.st_uid,
158
- 'gid': stat.st_gid,
159
- 'uname': '',
160
- 'gname': ''
161
- })
162
- except Exception as e:
163
- logging.warning(f"Failed to stat {f} for tar member info: {e}")
164
- # Determine tar file name: first leading root dir and number of distinct root dirs
165
- batch_root_names_sorted = sorted(batch_root_names)
166
- first_root = batch_root_names_sorted[0] if batch_root_names_sorted else 'batch'
167
- num_roots = len(batch_root_names_sorted)
168
- num_files = len(batch)
169
- tar_name = os.path.join(output_dir, f"{first_root}_dn{num_roots}_gn{idx}_fn{num_files}.tar")
170
- logging.info(f"Creating tar: {tar_name} with {num_files} files.")
171
- with tarfile.open(tar_name, "w") as tar:
172
- for f in batch:
173
- try:
174
- arcname = os.path.relpath(f, common_root)
175
- tar.add(f, arcname=arcname)
176
- except Exception as e:
177
- logging.warning(f"Failed to add {f} to tar: {e}")
178
- # After tar is created, insert info into tfile if db_params is provided
179
- if db_params:
180
- try:
181
- # Build note and dsids from cached member_infos
182
- note = '\n'.join([
183
- f"name={m['name']};size={m['size']};mtime={m['mtime']};type={m['type']};mode={m['mode']};uid={m['uid']};gid={m['gid']};uname={m['uname']};gname={m['gname']}"
184
- for m in member_infos
185
- ])
186
- dsids = ','.join(batch_root_names_sorted)
187
- tar_stat = os.stat(tar_name)
188
- ctime = datetime.fromtimestamp(tar_stat.st_ctime)
189
- mtime = datetime.fromtimestamp(tar_stat.st_mtime)
190
- summary = {
191
- 'tfile': os.path.basename(tar_name),
192
- 'data_size': tar_stat.st_size,
193
- 'wcount': len(member_infos),
194
- 'date_created': ctime.date(),
195
- 'time_created': ctime.time(),
196
- 'date_modified': mtime.date(),
197
- 'time_modified': mtime.time(),
198
- 'note': note,
199
- 'dsids': dsids,
200
- 'dsid': dsid
201
- }
202
- checksum = compute_md5(tar_name)
203
- extra = {
204
- 'uid': uid if uid is not None else 9,
205
- 'dsid': summary['dsid'],
206
- 'data_format': '',
207
- 'disp_order': 0,
208
- 'dsids': summary['dsids'],
209
- 'note': summary['note']
210
- }
211
- insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=update_on_conflict)
212
- logging.info(f"Inserted tar summary for {tar_name} into tfile.")
213
- except Exception as e:
214
- logging.error(f"Failed to insert tar info for {tar_name}: {e}")
215
-
216
def get_batch_size(batch):
    """Return the combined size in bytes of every file in *batch*."""
    total = 0
    for path in batch:
        total += get_file_size(path)
    return total
218
-
219
def process_from_file(input_file, output_dir):
    """Tar all files under the directories listed in *input_file*.

    Reads one directory per line, collects every file beneath them,
    groups the files into 1-3TB batches (folding an undersized final
    batch into its predecessor), and writes one tar per batch into
    *output_dir*, with archive names relative to the common root.
    """
    directories = read_directories_from_file(input_file)
    files = collect_all_files(directories)
    if not files:
        logging.info("No files found in provided directories.")
        return
    batches = group_files_by_size(files, ONE_TB, THREE_TB)
    # A trailing batch smaller than 1TB is merged into the one before it.
    if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
        logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
        tail = batches.pop()
        batches[-1].extend(tail)
    common_root = find_common_root(directories)
    tar_batches_across_dirs(files, batches, output_dir, common_root, directories)
233
-
234
def compute_md5(file_path, chunk_size=8192):
    """Return the MD5 hex digest of *file_path*, read in *chunk_size* chunks."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        # iter() with a b'' sentinel streams the file without loading it whole.
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
243
-
244
def get_tar_summary_and_details(tar_path):
    """Summarize a tar archive for recording in the dssdb.tfile table.

    Scans the archive's regular-file members and returns a dict with:
    tfile (basename), data_size (tar size in bytes), wcount (member file
    count), date/time created and modified (from the tar's stat), note
    (one metadata line per member), dsids (comma-joined sorted set of
    leading path components, i.e. dataset directories), and dsid (the
    first member's leading path component, or None for an empty tar).
    """
    with tarfile.open(tar_path, 'r') as tar:
        member_files = [m for m in tar.getmembers() if m.isfile()]
        details = []
        root_dirs = set()
        dsid = None
        for idx, m in enumerate(member_files):
            # The leading path component is the dataset directory whether
            # or not the member sits below a subdirectory.  str.split()
            # always yields at least one part, so the original identical
            # len(parts) > 1 / == 1 branches collapse to one statement.
            root = m.name.split('/')[0]
            root_dirs.add(root)
            if idx == 0:
                dsid = root
            details.append(f"name={m.name};size={m.size};mtime={m.mtime};type={m.type};mode={m.mode};uid={m.uid};gid={m.gid};uname={m.uname};gname={m.gname}")
    note = '\n'.join(details)
    dsids = ','.join(sorted(root_dirs))
    tar_stat = os.stat(tar_path)
    ctime = datetime.fromtimestamp(tar_stat.st_ctime)
    mtime = datetime.fromtimestamp(tar_stat.st_mtime)
    return {
        'tfile': os.path.basename(tar_path),
        'data_size': tar_stat.st_size,
        'wcount': len(member_files),
        'date_created': ctime.date(),
        'time_created': ctime.time(),
        'date_modified': mtime.date(),
        'time_modified': mtime.time(),
        'note': note,
        'dsids': dsids,
        'dsid': dsid
    }
280
-
281
def get_uid_from_logname(db_params):
    """Look up the dssdb user number (userno) for the current login name.

    db_params: dict of psycopg2 connection keyword arguments.
    Raises ValueError if the login name has no row in dssdb.dssgrp.
    """
    try:
        logname = os.getlogin()
    except OSError:
        # os.getlogin() raises OSError when there is no controlling
        # terminal (e.g. cron/batch jobs); fall back to the environment.
        logname = os.environ.get('USER') or getpass.getuser()
    conn = psycopg2.connect(**db_params)
    try:
        # try/finally ensures the connection is closed even if the
        # query raises (the original leaked it on error).
        with conn.cursor() as cur:
            cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
            row = cur.fetchone()
    finally:
        conn.close()
    if row:
        return row[0]
    raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
297
-
298
def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
    """Insert (or upsert) one tar-file record into dssdb.tfile.

    summary: dict with tfile, data_size, wcount and the created/modified
        date and time fields.
    checksum: MD5 hex digest of the tar file.
    db_params: dict of psycopg2 connection keyword arguments.
    extra: dict with uid, dsid, data_format, disp_order, dsids, note.
    update_on_conflict: when True, a conflict on the tfile key updates
        the existing row instead of failing.
    Raises on database errors after rolling back the transaction.
    """
    table_name = 'dssdb.tfile'
    columns = [
        'tfile', 'data_size', 'wcount', 'date_created', 'time_created',
        'date_modified', 'time_modified', 'file_format', 'checksum', 'status',
        'uid', 'dsid', 'data_format', 'disp_order', 'dsids', 'note'
    ]
    values = [
        summary['tfile'], summary['data_size'], summary['wcount'],
        summary['date_created'], summary['time_created'],
        summary['date_modified'], summary['time_modified'],
        'tar', checksum, 'T',
        extra.get('uid'), extra.get('dsid'), extra.get('data_format'),
        extra.get('disp_order'), extra.get('dsids'), extra.get('note')
    ]
    # Build the statement before connecting so a connection is never held
    # longer than needed; values are passed as parameters, never inlined.
    placeholders = ','.join(['%s'] * len(columns))
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
    if update_on_conflict:
        update_cols = [col for col in columns if col != 'tfile']
        set_clause = ', '.join(f"{col}=EXCLUDED.{col}" for col in update_cols)
        sql += f" ON CONFLICT (tfile) DO UPDATE SET {set_clause}"
    conn = psycopg2.connect(**db_params)
    try:
        # Cursor is created inside try so the connection is closed even if
        # cursor() itself raises (the original leaked it in that case).
        cur = conn.cursor()
        try:
            cur.execute(sql, values)
            conn.commit()
        except Exception as e:
            logging.error(f"Database error: {e}")
            conn.rollback()
            raise
        finally:
            cur.close()
    finally:
        conn.close()
331
-
332
def main():
    """CLI entry point: batch files from listed directories into 1-3TB
    tars and record each tar's summary in the database."""
    import argparse
    parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files and record tar info in the database.")
    parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
    parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
    parser.add_argument("root_dir", nargs='?', help="(Deprecated) Root directory to process")
    parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
    parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
    parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
    parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
    parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
    parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
    args = parser.parse_args()

    work_dir = args.output_dir if args.output_dir else os.getcwd()
    os.makedirs(work_dir, exist_ok=True)
    setup_logging(work_dir)

    # Connection parameters used to record tar summaries in dssdb.tfile.
    db_params = None
    if args.db_host and args.db_name and args.db_user:
        db_params = {
            'host': args.db_host,
            'port': args.db_port,
            'dbname': args.db_name,
            'user': args.db_user,
        }
        if args.db_password:
            db_params['password'] = args.db_password

    if args.input_file:
        directories = read_directories_from_file(args.input_file)
        files = collect_all_files(directories)
        if not files:
            logging.info("No files found in provided directories.")
            return
        batches = group_files_by_size(files, ONE_TB, THREE_TB)
        # Fold an undersized (< 1TB) trailing batch into the previous one.
        if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
            logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
            batches[-2].extend(batches[-1])
            batches.pop()
        common_root = find_common_root(directories)
        tar_batches_across_dirs(files, batches, work_dir, common_root, directories, db_params=db_params, update_on_conflict=args.update)
    elif args.root_dir:
        process_directory_tree(args.root_dir, work_dir, db_params=db_params, update_on_conflict=args.update)
    else:
        print("Error: Must provide either --input-file or root_dir.")
377
-
378
# Script entry point: run the CLI when this module is executed directly.
if __name__ == "__main__":
    main()