rda-python-dsquasar 2.0.3__tar.gz → 2.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19) hide show
  1. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/PKG-INFO +1 -1
  2. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/pyproject.toml +1 -1
  3. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/taccrec.py +4 -4
  4. rda_python_dsquasar-2.0.4/src/rda_python_dsquasar/tacctar.py +379 -0
  5. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/PKG-INFO +1 -1
  6. rda_python_dsquasar-2.0.3/src/rda_python_dsquasar/tacctar.py +0 -178
  7. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/LICENSE +0 -0
  8. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/README.md +0 -0
  9. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/setup.cfg +0 -0
  10. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/__init__.py +0 -0
  11. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/ds_quasar.py +0 -0
  12. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/dsquasar.py +0 -0
  13. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar/dstacc.py +0 -0
  14. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/SOURCES.txt +0 -0
  15. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/dependency_links.txt +0 -0
  16. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/entry_points.txt +0 -0
  17. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/requires.txt +0 -0
  18. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/src/rda_python_dsquasar.egg-info/top_level.txt +0 -0
  19. {rda_python_dsquasar-2.0.3 → rda_python_dsquasar-2.0.4}/test/test_dsquasar.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.3
3
+ Version: 2.0.4
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rda_python_dsquasar"
7
- version = "2.0.3"
7
+ version = "2.0.4"
8
8
  authors = [
9
9
  { name="Zaihua Ji", email="zji@ucar.edu" },
10
10
  ]
@@ -56,7 +56,7 @@ def get_tar_summary_and_details(tar_path):
56
56
  }
57
57
 
58
58
  def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
59
- table_name = 'tfile'
59
+ table_name = 'dssdb.tfile'
60
60
  conn = psycopg2.connect(**db_params)
61
61
  cur = conn.cursor()
62
62
  columns = [
@@ -99,21 +99,21 @@ def get_uid_from_logname(db_params):
99
99
  conn = psycopg2.connect(**db_params)
100
100
  cur = conn.cursor()
101
101
  # Ensure logname is quoted as a string in the query
102
- cur.execute("SELECT userno FROM dssgrp WHERE logname='%s' LIMIT 1", (str(logname),))
102
+ cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
103
103
  row = cur.fetchone()
104
104
  cur.close()
105
105
  conn.close()
106
106
  if row:
107
107
  return row[0]
108
108
  else:
109
- raise ValueError(f"User logname '{logname}' not found in dssgrp table.")
109
+ raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
110
110
 
111
111
  def main():
112
112
  parser = argparse.ArgumentParser(description='Insert tar file summary into tfile table.')
113
113
  parser.add_argument('tarfile', help='Path to the tar file')
114
114
  parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
115
115
  parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
116
- parser.add_argument('--db-name', default='dssdb', help='Database name (default: dssdb)')
116
+ parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
117
117
  parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
118
118
  parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
119
119
  parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
@@ -0,0 +1,379 @@
1
+ import os
2
+ import tarfile
3
+ import logging
4
+ from pathlib import Path
5
+ import psycopg2
6
+ import hashlib
7
+ from datetime import datetime
8
+ import getpass
9
+
10
+ # Constants for size limits
11
+ ONE_TB = 1_099_511_627_776 # 1TB in bytes
12
+ THREE_TB = 3_298_534_883_328 # 3TB in bytes
13
+
14
def setup_logging(output_dir):
    """Configure the root logger to write to a dated log file in *output_dir*
    and to the console, replacing any previously installed handlers."""
    import datetime
    log_path = os.path.join(
        output_dir,
        f"tarlog_{datetime.datetime.now().strftime('%Y%m%d')}.log")
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    # Drop handlers left over from an earlier call so messages are not duplicated.
    for old_handler in list(root.handlers):
        root.removeHandler(old_handler)
    fmt = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
    for handler in (logging.FileHandler(log_path), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(fmt)
        root.addHandler(handler)
    logging.info(f"Logging to file: {log_path}")
34
+
35
def get_file_size(path):
    """Return the size of *path* in bytes; on any stat failure log a warning
    and return 0 so callers can keep batching."""
    try:
        size = os.path.getsize(path)
    except Exception as exc:
        logging.warning(f"Could not get size for {path}: {exc}")
        return 0
    return size
41
+
42
def group_files_by_size(files, min_size, max_size):
    """
    Group files into batches where each batch's total size is between min_size and max_size.
    Returns a list of lists of file paths.

    Files individually larger than max_size are skipped with a warning.
    NOTE(review): when adding a file would push the batch past max_size while
    the batch is still below min_size, the file is force-added, so that batch
    may exceed max_size — confirm this overshoot is acceptable downstream.
    """
    batches = []          # completed batches (each a list of file paths)
    current_batch = []    # batch currently being filled
    current_size = 0      # running byte total of current_batch
    for f in files:
        fsize = get_file_size(f)
        if fsize > max_size:
            logging.warning(f"File {f} is larger than max tar size ({max_size} bytes), skipping.")
            continue
        if current_size + fsize > max_size:
            if current_size >= min_size:
                # Current batch already meets the minimum: close it and
                # start a new batch with this file.
                batches.append(current_batch)
                current_batch = [f]
                current_size = fsize
            else:
                # If adding this file exceeds max_size but current batch is too small, force add
                current_batch.append(f)
                current_size += fsize
                batches.append(current_batch)
                current_batch = []
                current_size = 0
        else:
            current_batch.append(f)
            current_size += fsize
    # Flush the trailing partial batch (may be smaller than min_size).
    if current_batch:
        batches.append(current_batch)
    return batches
73
+
74
def tar_batches(dirpath, batches, output_dir):
    """Write each batch of files to an uncompressed tar in *output_dir*.

    Archives are named ``<dirname>_part<N>.tar`` (N starting at 1); members
    are stored relative to *dirpath*.  Files that cannot be added are logged
    and skipped rather than aborting the archive.
    """
    base_name = Path(dirpath).name
    for part_no, batch in enumerate(batches, 1):
        tar_name = os.path.join(output_dir, f"{base_name}_part{part_no}.tar")
        logging.info(f"Creating tar: {tar_name} with {len(batch)} files.")
        with tarfile.open(tar_name, "w") as archive:
            for member in batch:
                try:
                    archive.add(member, arcname=os.path.relpath(member, dirpath))
                except Exception as err:
                    logging.warning(f"Failed to add {member} to tar: {err}")
85
+
86
def process_directory_tree(root_dir, output_dir, db_params=None, update_on_conflict=False):
    """Walk *root_dir* and tar the files of each visited directory in
    ONE_TB..THREE_TB batches, optionally recording each tar in the database.

    Batching happens per directory level (one os.walk step at a time), not
    across the whole tree.  Tar creation and the optional dssdb.tfile insert
    are delegated to tar_batches_across_dirs().
    """
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Absolute paths of the regular files directly inside this directory.
        abs_files = [os.path.join(dirpath, f) for f in filenames]
        if not abs_files:
            continue
        batches = group_files_by_size(abs_files, ONE_TB, THREE_TB)
        # Use tar_batches_across_dirs for consistency and DB recording
        common_root = root_dir
        tar_batches_across_dirs(abs_files, batches, output_dir, common_root, [root_dir], db_params=db_params, update_on_conflict=update_on_conflict)
95
+
96
def read_directories_from_file(input_file):
    """Read directory paths from *input_file*, one per line.

    Blank lines and lines starting with '#' are ignored.  Entries that are
    not existing directories are logged and dropped.  Returns the surviving
    entries as absolute paths, in file order.
    """
    dirs = []
    with open(input_file, 'r') as fh:
        for raw_line in fh:
            entry = raw_line.strip()
            if not entry or entry.startswith('#'):
                continue
            if not os.path.isdir(entry):
                logging.warning(f"Directory does not exist: {entry}")
                continue
            dirs.append(os.path.abspath(entry))
    return dirs
108
+
109
def collect_all_files(directories):
    """Recursively collect every file path under each directory in
    *directories* and return them as a single flat list."""
    collected = []
    for root in directories:
        for dirpath, _dirnames, filenames in os.walk(root):
            collected.extend(os.path.join(dirpath, name) for name in filenames)
    return collected
117
+
118
def find_common_root(paths):
    """Return the longest common sub-path of *paths*, or '' for an empty list
    (os.path.commonpath raises on an empty sequence)."""
    if not paths:
        return ''
    return os.path.commonpath(paths)
120
+
121
def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs, db_params=None, update_on_conflict=False):
    """Tar each batch (spanning multiple root directories) into *output_dir*
    and, when *db_params* is given, record each tar in dssdb.tfile.

    files: full file list (NOTE(review): unused here — batching is driven
        entirely by *batches*; kept for interface compatibility).
    batches: list of lists of file paths, as produced by group_files_by_size().
    common_root: path prefix stripped from members to form tar arcnames.
    root_dirs: the original input directories; a batch's tar name is derived
        from which of these its files fall under.
    db_params: psycopg2.connect() keyword arguments, or None to skip the DB.
    update_on_conflict: forwarded to insert_tfile_row() to upsert rows.
    """
    root_dir_paths = [Path(d).resolve() for d in root_dirs]
    uid = None
    if db_params:
        try:
            uid = get_uid_from_logname(db_params)
        except Exception as e:
            # Fall back to uid 9 when the login has no dssgrp row or the DB
            # lookup fails; tar creation proceeds regardless.
            logging.error(f"Could not get uid from dssgrp: {e}")
            uid = 9
    for idx, batch in enumerate(batches, 1):
        batch_root_names = set()   # basenames of root_dirs represented in this batch
        member_infos = []          # per-file metadata cached for the DB 'note'
        dsid = None                # top-level dir of the batch's first file
        for file_idx, f in enumerate(batch):
            f_path = Path(f).resolve()
            # Map the file back to one of the input root directories by
            # string-prefix match (NOTE(review): a pure prefix test, so
            # /data/ds1 would also match /data/ds10 — confirm roots are distinct).
            for root_path in root_dir_paths:
                try:
                    if str(f_path).startswith(str(root_path)):
                        batch_root_names.add(root_path.name)
                        break
                except Exception:
                    continue
            # Gather member file info for note/dsid/dsids
            try:
                stat = os.stat(f)
                arcname = os.path.relpath(f, common_root)
                parts = arcname.split('/')
                root_dir = parts[0] if parts else ''
                if file_idx == 0:
                    dsid = root_dir
                member_infos.append({
                    'name': arcname,
                    'size': stat.st_size,
                    'mtime': int(stat.st_mtime),
                    'type': '0', # regular file
                    'mode': stat.st_mode,
                    'uid': stat.st_uid,
                    'gid': stat.st_gid,
                    'uname': '',
                    'gname': ''
                })
            except Exception as e:
                logging.warning(f"Failed to stat {f} for tar member info: {e}")
        # Determine tar file name: first leading root dir and number of distinct root dirs
        batch_root_names_sorted = sorted(batch_root_names)
        first_root = batch_root_names_sorted[0] if batch_root_names_sorted else 'batch'
        num_roots = len(batch_root_names_sorted)
        num_files = len(batch)
        # Name encodes: first root dir, dn=#distinct roots, gn=batch index, fn=#files.
        tar_name = os.path.join(output_dir, f"{first_root}_dn{num_roots}_gn{idx}_fn{num_files}.tar")
        logging.info(f"Creating tar: {tar_name} with {num_files} files.")
        with tarfile.open(tar_name, "w") as tar:
            for f in batch:
                try:
                    arcname = os.path.relpath(f, common_root)
                    tar.add(f, arcname=arcname)
                except Exception as e:
                    logging.warning(f"Failed to add {f} to tar: {e}")
        # After tar is created, insert info into tfile if db_params is provided
        if db_params:
            try:
                # Build note and dsids from cached member_infos
                note = '\n'.join([
                    f"name={m['name']};size={m['size']};mtime={m['mtime']};type={m['type']};mode={m['mode']};uid={m['uid']};gid={m['gid']};uname={m['uname']};gname={m['gname']}"
                    for m in member_infos
                ])
                dsids = ','.join(batch_root_names_sorted)
                tar_stat = os.stat(tar_name)
                ctime = datetime.fromtimestamp(tar_stat.st_ctime)
                mtime = datetime.fromtimestamp(tar_stat.st_mtime)
                summary = {
                    'tfile': os.path.basename(tar_name),
                    'data_size': tar_stat.st_size,
                    'wcount': len(member_infos),
                    'date_created': ctime.date(),
                    'time_created': ctime.time(),
                    'date_modified': mtime.date(),
                    'time_modified': mtime.time(),
                    'note': note,
                    'dsids': dsids,
                    'dsid': dsid
                }
                checksum = compute_md5(tar_name)
                extra = {
                    'uid': uid if uid is not None else 9,
                    'dsid': summary['dsid'],
                    'data_format': '',
                    'disp_order': 0,
                    'dsids': summary['dsids'],
                    'note': summary['note']
                }
                insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=update_on_conflict)
                logging.info(f"Inserted tar summary for {tar_name} into tfile.")
            except Exception as e:
                # DB failure is logged but does not undo the tar already written.
                logging.error(f"Failed to insert tar info for {tar_name}: {e}")
215
+
216
def get_batch_size(batch):
    """Return the total size in bytes of all files in *batch* (unreadable
    files count as 0, per get_file_size)."""
    total = 0
    for path in batch:
        total += get_file_size(path)
    return total
218
+
219
def process_from_file(input_file, output_dir, db_params=None, update_on_conflict=False):
    """Tar all files under the directories listed in *input_file* into
    ONE_TB..THREE_TB batches written to *output_dir*.

    input_file: text file of directory paths, one per line ('#' = comment).
    db_params: optional psycopg2.connect() kwargs; when given, each tar is
        recorded in dssdb.tfile via tar_batches_across_dirs().
    update_on_conflict: forwarded to the DB insert to upsert existing rows.

    The new keyword parameters default to the old behavior (no DB recording),
    matching process_directory_tree()'s signature so main() no longer needs
    to duplicate this function inline just to pass db_params through.
    """
    directories = read_directories_from_file(input_file)
    files = collect_all_files(directories)
    if not files:
        logging.info("No files found in provided directories.")
        return
    batches = group_files_by_size(files, ONE_TB, THREE_TB)
    # If last batch is less than 1TB, append it to previous batch
    if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
        logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
        batches[-2].extend(batches[-1])
        batches.pop()
    common_root = find_common_root(directories)
    tar_batches_across_dirs(files, batches, output_dir, common_root, directories,
                            db_params=db_params, update_on_conflict=update_on_conflict)
233
+
234
def compute_md5(file_path, chunk_size=8192):
    """Return the hex MD5 digest of *file_path*, reading it in
    *chunk_size*-byte pieces to bound memory use."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        while chunk := fh.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
243
+
244
def get_tar_summary_and_details(tar_path):
    """Scan an existing tar file and build the summary dict for a dssdb.tfile row.

    Returns a dict with the tar's basename ('tfile'), byte size, regular-file
    member count ('wcount'), created/modified date and time, a newline-joined
    per-member metadata string ('note'), the sorted comma-joined set of
    top-level member directories ('dsids'), and the first member's top-level
    directory ('dsid').
    """
    with tarfile.open(tar_path, 'r') as tar:
        member_files = [m for m in tar.getmembers() if m.isfile()]
        file_count = len(member_files)
        details = []
        root_dirs = set()
        dsid = None
        for idx, m in enumerate(member_files):
            # First path component is the dataset directory.  The original
            # len(parts) > 1 / len(parts) == 1 branches were byte-identical,
            # so they are merged: split() on a member name always yields at
            # least one part.
            root = m.name.split('/')[0]
            root_dirs.add(root)
            if idx == 0:
                dsid = root
            details.append(f"name={m.name};size={m.size};mtime={m.mtime};type={m.type};mode={m.mode};uid={m.uid};gid={m.gid};uname={m.uname};gname={m.gname}")
        note = '\n'.join(details)
        dsids = ','.join(sorted(root_dirs))
    tar_stat = os.stat(tar_path)
    tar_size = tar_stat.st_size
    ctime = datetime.fromtimestamp(tar_stat.st_ctime)
    mtime = datetime.fromtimestamp(tar_stat.st_mtime)
    return {
        'tfile': os.path.basename(tar_path),
        'data_size': tar_size,
        'wcount': file_count,
        'date_created': ctime.date(),
        'time_created': ctime.time(),
        'date_modified': mtime.date(),
        'time_modified': mtime.time(),
        'note': note,
        'dsids': dsids,
        'dsid': dsid
    }
280
+
281
def get_uid_from_logname(db_params):
    """Look up the numeric userno for the current OS login in dssdb.dssgrp.

    db_params: keyword arguments for psycopg2.connect().
    Raises ValueError when the login name has no dssgrp row.
    """
    logname = None
    try:
        logname = os.getlogin()
    except Exception:
        # os.getlogin() fails without a controlling terminal (e.g. under
        # cron/batch); fall back to the environment, then getpass.
        logname = os.environ.get('USER') or getpass.getuser()
    conn = psycopg2.connect(**db_params)
    cur = conn.cursor()
    # Parameterized query: the driver quotes logname safely (no manual '%s' quoting).
    cur.execute("SELECT userno FROM dssdb.dssgrp WHERE logname=%s LIMIT 1", (str(logname),))
    row = cur.fetchone()
    cur.close()
    conn.close()
    if row:
        return row[0]
    else:
        raise ValueError(f"User logname '{logname}' not found in dssdb.dssgrp table.")
297
+
298
def insert_tfile_row(summary, checksum, db_params, extra, update_on_conflict=False):
    """Insert one tar-summary row into dssdb.tfile (optionally upserting).

    summary: dict from get_tar_summary_and_details() (tfile, sizes, dates, note, dsids, dsid).
    checksum: MD5 hex digest of the tar file.
    db_params: keyword arguments for psycopg2.connect().
    extra: dict supplying uid, dsid, data_format, disp_order, dsids, note.
    update_on_conflict: when True, appends ON CONFLICT (tfile) DO UPDATE.
        NOTE(review): assumes a unique constraint/index on tfile — confirm
        against the schema, otherwise the upsert form will error.
    Raises the underlying psycopg2 exception after logging and rollback.
    """
    table_name = 'dssdb.tfile'
    conn = psycopg2.connect(**db_params)
    cur = conn.cursor()
    columns = [
        'tfile', 'data_size', 'wcount', 'date_created', 'time_created',
        'date_modified', 'time_modified', 'file_format', 'checksum', 'status',
        'uid', 'dsid', 'data_format', 'disp_order', 'dsids', 'note'
    ]
    # file_format is fixed to 'tar' and status to 'T' for rows written here.
    values = [
        summary['tfile'], summary['data_size'], summary['wcount'],
        summary['date_created'], summary['time_created'],
        summary['date_modified'], summary['time_modified'],
        'tar', checksum, 'T',
        extra.get('uid'), extra.get('dsid'), extra.get('data_format'),
        extra.get('disp_order'), extra.get('dsids'), extra.get('note')
    ]
    # Only identifiers are interpolated into the SQL text; all values go
    # through %s placeholders.
    placeholders = ','.join(['%s'] * len(columns))
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES ({placeholders})"
    if update_on_conflict:
        update_cols = [col for col in columns if col != 'tfile']
        set_clause = ', '.join([f"{col}=EXCLUDED.{col}" for col in update_cols])
        sql += f" ON CONFLICT (tfile) DO UPDATE SET {set_clause}"
    try:
        cur.execute(sql, values)
        conn.commit()
    except Exception as e:
        logging.error(f"Database error: {e}")
        conn.rollback()
        raise
    finally:
        # Always release the cursor and connection, even on failure.
        cur.close()
        conn.close()
331
+
332
def main():
    """Command-line entry point: tar directories into 1-3TB archives and
    optionally record each tar in the dssdb.tfile table."""
    import argparse
    parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files and record tar info in the database.")
    parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
    parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
    parser.add_argument("root_dir", nargs='?', help="(Deprecated) Root directory to process")
    parser.add_argument('--db-host', default='rda-db.ucar.edu', help='Database host (default: rda-db.ucar.edu)')
    parser.add_argument('--db-port', default=5432, type=int, help='Database port (default: 5432)')
    parser.add_argument('--db-name', default='rdadb', help='Database name (default: rdadb)')
    parser.add_argument('--db-user', default='dssdb', help='Database user (default: dssdb)')
    parser.add_argument('--db-password', help='Database password (optional, use .pgpass if omitted)')
    parser.add_argument('--update', action='store_true', help='Update row if tfile already exists')
    args = parser.parse_args()
    output_dir = args.output_dir if args.output_dir else os.getcwd()
    os.makedirs(output_dir, exist_ok=True)
    setup_logging(output_dir)
    # Prepare db_params for tar_batches_across_dirs
    db_params = None
    if args.db_host and args.db_name and args.db_user:
        db_params = {
            'host': args.db_host,
            'port': args.db_port,
            'dbname': args.db_name,
            'user': args.db_user
        }
        # Password is optional: psycopg2 falls back to ~/.pgpass when omitted.
        if args.db_password:
            db_params['password'] = args.db_password
    if args.input_file:
        # Pass db_params and update flag to tar_batches_across_dirs
        # NOTE(review): this inline pipeline duplicates process_from_file()
        # except for the db_params forwarding — candidate for consolidation.
        directories = read_directories_from_file(args.input_file)
        files = collect_all_files(directories)
        if not files:
            logging.info("No files found in provided directories.")
            return
        batches = group_files_by_size(files, ONE_TB, THREE_TB)
        # Merge an undersized trailing batch into its predecessor.
        if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
            logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
            batches[-2].extend(batches[-1])
            batches.pop()
        common_root = find_common_root(directories)
        tar_batches_across_dirs(files, batches, output_dir, common_root, directories, db_params=db_params, update_on_conflict=args.update)
    elif args.root_dir:
        # Deprecated positional form: per-directory batching under one root.
        process_directory_tree(args.root_dir, output_dir, db_params=db_params, update_on_conflict=args.update)
    else:
        print("Error: Must provide either --input-file or root_dir.")

# Script entry point: delegate to main() when executed directly.
if __name__ == "__main__":
    main()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rda_python_dsquasar
3
- Version: 2.0.3
3
+ Version: 2.0.4
4
4
  Summary: RDA Python package to backup and recover RDA data archives to and from GLOBUS Quasar backup server
5
5
  Author-email: Zaihua Ji <zji@ucar.edu>
6
6
  Project-URL: Homepage, https://github.com/NCAR/rda-python-dsquasar
@@ -1,178 +0,0 @@
1
- import os
2
- import tarfile
3
- import logging
4
- from pathlib import Path
5
-
6
- # Constants for size limits
7
- ONE_TB = 1_099_511_627_776 # 1TB in bytes
8
- THREE_TB = 3_298_534_883_328 # 3TB in bytes
9
-
10
- def setup_logging(output_dir):
11
- import datetime
12
- log_filename = f"tarlog_{datetime.datetime.now().strftime('%Y%m%d')}.log"
13
- log_path = os.path.join(output_dir, log_filename)
14
- logger = logging.getLogger()
15
- logger.setLevel(logging.INFO)
16
- # Remove any existing handlers
17
- for handler in logger.handlers[:]:
18
- logger.removeHandler(handler)
19
- # File handler
20
- fh = logging.FileHandler(log_path)
21
- fh.setLevel(logging.INFO)
22
- fh.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
23
- # Console handler
24
- ch = logging.StreamHandler()
25
- ch.setLevel(logging.INFO)
26
- ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))
27
- logger.addHandler(fh)
28
- logger.addHandler(ch)
29
- logging.info(f"Logging to file: {log_path}")
30
-
31
- def get_file_size(path):
32
- try:
33
- return os.path.getsize(path)
34
- except Exception as e:
35
- logging.warning(f"Could not get size for {path}: {e}")
36
- return 0
37
-
38
- def group_files_by_size(files, min_size, max_size):
39
- """
40
- Group files into batches where each batch's total size is between min_size and max_size.
41
- Returns a list of lists of file paths.
42
- """
43
- batches = []
44
- current_batch = []
45
- current_size = 0
46
- for f in files:
47
- fsize = get_file_size(f)
48
- if fsize > max_size:
49
- logging.warning(f"File {f} is larger than max tar size ({max_size} bytes), skipping.")
50
- continue
51
- if current_size + fsize > max_size:
52
- if current_size >= min_size:
53
- batches.append(current_batch)
54
- current_batch = [f]
55
- current_size = fsize
56
- else:
57
- # If adding this file exceeds max_size but current batch is too small, force add
58
- current_batch.append(f)
59
- current_size += fsize
60
- batches.append(current_batch)
61
- current_batch = []
62
- current_size = 0
63
- else:
64
- current_batch.append(f)
65
- current_size += fsize
66
- if current_batch:
67
- batches.append(current_batch)
68
- return batches
69
-
70
- def tar_batches(dirpath, batches, output_dir):
71
- src_dir_name = Path(dirpath).name
72
- for idx, batch in enumerate(batches, 1):
73
- tar_name = os.path.join(output_dir, f"{src_dir_name}_part{idx}.tar")
74
- logging.info(f"Creating tar: {tar_name} with {len(batch)} files.")
75
- with tarfile.open(tar_name, "w") as tar:
76
- for f in batch:
77
- try:
78
- tar.add(f, arcname=os.path.relpath(f, dirpath))
79
- except Exception as e:
80
- logging.warning(f"Failed to add {f} to tar: {e}")
81
-
82
- def process_directory_tree(root_dir, output_dir):
83
- for dirpath, dirnames, filenames in os.walk(root_dir):
84
- abs_files = [os.path.join(dirpath, f) for f in filenames]
85
- if not abs_files:
86
- continue
87
- batches = group_files_by_size(abs_files, ONE_TB, THREE_TB)
88
- tar_batches(dirpath, batches, output_dir)
89
-
90
- def read_directories_from_file(input_file):
91
- dirs = []
92
- with open(input_file, 'r') as f:
93
- for line in f:
94
- line = line.strip()
95
- if not line or line.startswith('#'):
96
- continue
97
- if os.path.isdir(line):
98
- dirs.append(os.path.abspath(line))
99
- else:
100
- logging.warning(f"Directory does not exist: {line}")
101
- return dirs
102
-
103
- def collect_all_files(directories):
104
- files = []
105
- for d in directories:
106
- for dirpath, dirnames, filenames in os.walk(d):
107
- for fname in filenames:
108
- fpath = os.path.join(dirpath, fname)
109
- files.append(fpath)
110
- return files
111
-
112
- def find_common_root(paths):
113
- return os.path.commonpath(paths) if paths else ''
114
-
115
- def tar_batches_across_dirs(files, batches, output_dir, common_root, root_dirs):
116
- root_dir_paths = [Path(d).resolve() for d in root_dirs]
117
- for idx, batch in enumerate(batches, 1):
118
- # Find which root directories are represented in this batch
119
- batch_root_names = set()
120
- for f in batch:
121
- f_path = Path(f).resolve()
122
- for root_path in root_dir_paths:
123
- try:
124
- if str(f_path).startswith(str(root_path)):
125
- batch_root_names.add(root_path.name)
126
- break
127
- except Exception:
128
- continue
129
- prefix = "_".join(sorted(batch_root_names)) if batch_root_names else "batch"
130
- num_files = len(batch)
131
- tar_name = os.path.join(output_dir, f"{prefix}_batch_part{idx}_{num_files}files.tar")
132
- logging.info(f"Creating tar: {tar_name} with {num_files} files.")
133
- with tarfile.open(tar_name, "w") as tar:
134
- for f in batch:
135
- try:
136
- # Preserve relative path from common root in tar
137
- arcname = os.path.relpath(f, common_root)
138
- tar.add(f, arcname=arcname)
139
- except Exception as e:
140
- logging.warning(f"Failed to add {f} to tar: {e}")
141
-
142
- def get_batch_size(batch):
143
- return sum(get_file_size(f) for f in batch)
144
-
145
- def process_from_file(input_file, output_dir):
146
- directories = read_directories_from_file(input_file)
147
- files = collect_all_files(directories)
148
- if not files:
149
- logging.info("No files found in provided directories.")
150
- return
151
- batches = group_files_by_size(files, ONE_TB, THREE_TB)
152
- # If last batch is less than 1TB, append it to previous batch
153
- if len(batches) > 1 and get_batch_size(batches[-1]) < ONE_TB:
154
- logging.info(f"Last batch size ({get_batch_size(batches[-1])} bytes) < 1TB, appending to previous batch.")
155
- batches[-2].extend(batches[-1])
156
- batches.pop()
157
- common_root = find_common_root(directories)
158
- tar_batches_across_dirs(files, batches, output_dir, common_root, directories)
159
-
160
- def main():
161
- import argparse
162
- parser = argparse.ArgumentParser(description="Tar files from a list of directories into 1-3TB tar files.")
163
- parser.add_argument("--input-file", help="File containing list of directories to process (one per line)")
164
- parser.add_argument("--output-dir", help="Directory to store tar files (default: current directory)")
165
- parser.add_argument("root_dir", nargs='?', help="(Deprecated) Root directory to process")
166
- args = parser.parse_args()
167
- output_dir = args.output_dir if args.output_dir else os.getcwd()
168
- os.makedirs(output_dir, exist_ok=True)
169
- setup_logging(output_dir)
170
- if args.input_file:
171
- process_from_file(args.input_file, output_dir)
172
- elif args.root_dir:
173
- process_directory_tree(args.root_dir, output_dir)
174
- else:
175
- print("Error: Must provide either --input-file or root_dir.")
176
-
177
- if __name__ == "__main__":
178
- main()