nebu 0.1.23__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nebu/data.py ADDED
@@ -0,0 +1,952 @@
+ import os
+ import subprocess
+ from datetime import datetime, timedelta, timezone
+ from typing import Any, Dict, List, Optional, Tuple
+ from urllib.parse import urlparse
+
+ import boto3
+ from botocore.exceptions import ClientError
+
+
+ def rclone_copy(
+     source_dir: str,
+     destination: str,
+     dry_run: bool = False,
+     transfers: int = 4,
+     extra_args: Optional[List[str]] = None,
+     verbose: bool = True,
+ ) -> bool:
+     """
+     Upload a directory to a remote bucket using `rclone copy`.
+
+     Args:
+         source_dir (str): Path to local directory to upload.
+         destination (str): Remote destination, e.g., 's3:my-bucket/path'.
+         dry_run (bool): If True, performs a dry run without uploading.
+         transfers (int): Number of parallel transfers.
+         extra_args (Optional[List[str]]): Additional rclone flags.
+         verbose (bool): If True, prints command and output live.
+
+     Returns:
+         bool: True if upload succeeded, False otherwise.
+     """
+     command = [
+         "rclone",
+         "copy",
+         source_dir,
+         destination,
+         f"--transfers={transfers}",
+         "--progress",
+     ]
+
+     if dry_run:
+         command.append("--dry-run")
+     if extra_args:
+         command.extend(extra_args)
+
+     if verbose:
+         print("Running command:", " ".join(command))
+
+     try:
+         process = subprocess.Popen(
+             command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+         )
+
+         if not process.stdout:
+             raise Exception("No output from rclone")
+
+         for line in process.stdout:
+             if verbose:
+                 print(line.strip())
+
+         return process.wait() == 0
+
+     except Exception as e:
+         print(f"Error during rclone copy: {e}")
+         return False
+
+
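A minimal usage sketch (annotation, not part of the diff: the paths and the `s3` remote name are hypothetical, and an installed, configured rclone is assumed):

    # Preview the transfer first, then run it with more parallelism.
    if rclone_copy("/tmp/checkpoints", "s3:my-bucket/checkpoints", dry_run=True):
        rclone_copy("/tmp/checkpoints", "s3:my-bucket/checkpoints", transfers=8)
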
+ def find_latest_checkpoint(training_dir: str) -> Optional[str]:
+     """
+     Finds the checkpoint directory with the highest step number in a Hugging Face
+     training output directory.
+
+     Args:
+         training_dir (str): The path to the training output directory.
+
+     Returns:
+         Optional[str]: The path to the latest checkpoint directory, or None if
+                        no checkpoint directories are found or the directory
+                        doesn't exist.
+     """
+     latest_step = -1
+     latest_checkpoint_dir = None
+
+     if not os.path.isdir(training_dir):
+         print(f"Error: Directory not found: {training_dir}")
+         return None
+
+     for item in os.listdir(training_dir):
+         item_path = os.path.join(training_dir, item)
+         if os.path.isdir(item_path) and item.startswith("checkpoint-"):
+             try:
+                 step_str = item.split("-")[-1]
+                 if step_str.isdigit():
+                     step = int(step_str)
+                     if step > latest_step:
+                         latest_step = step
+                         latest_checkpoint_dir = item_path
+             except (ValueError, IndexError):
+                 # Ignore items that don't match the expected pattern
+                 continue
+
+     return latest_checkpoint_dir
+
+
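For example (hypothetical layout), a training directory containing `checkpoint-500/` and `checkpoint-1000/` resolves to the latter:

    latest = find_latest_checkpoint("/tmp/train-output")  # hypothetical path
    # -> "/tmp/train-output/checkpoint-1000", or None if no checkpoint-* dirs exist
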
+ def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
+     """Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+     parsed = urlparse(path)
+     if parsed.scheme != "s3":
+         return None, None
+     bucket = parsed.netloc
+     prefix = parsed.path.lstrip("/")
+     return bucket, prefix
+
+
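The helper's contract, as a quick sketch (bucket name hypothetical):

    _parse_s3_path("s3://my-bucket/models/latest")  # -> ("my-bucket", "models/latest")
    _parse_s3_path("/local/path")                   # -> (None, None); not an s3:// URI
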
+ def _list_s3_objects(
+     s3_client: Any, bucket: str, prefix: Optional[str], verbose: bool = True
+ ) -> Dict[str, Dict[str, Any]]:
+     """Lists objects in an S3 prefix."""
+     objects: Dict[str, Dict[str, Any]] = {}
+     paginator = s3_client.get_paginator("list_objects_v2")
+     list_prefix = (
+         prefix if prefix else ""
+     )  # Use empty string if prefix is None for listing
+     if verbose:
+         print(f"Listing objects in s3://{bucket}/{list_prefix}...")
+
+     operation_parameters = {"Bucket": bucket}
+     # Only add Prefix parameter if it's non-empty
+     if list_prefix:
+         operation_parameters["Prefix"] = list_prefix
+
+     try:
+         page_iterator = paginator.paginate(**operation_parameters)
+         for page in page_iterator:
+             if "Contents" in page:
+                 for obj in page["Contents"]:
+                     # Ignore zero-byte objects ending in '/' (S3 console folders)
+                     if obj["Key"].endswith("/") and obj["Size"] == 0:
+                         continue
+
+                     # Determine key relative to the *prefix* for comparison.
+                     # If prefix is None or empty, relative key is the full key.
+                     relative_key: Optional[str] = None
+                     if prefix and obj["Key"].startswith(prefix):
+                         # Ensure trailing slash consistency if prefix has one
+                         prefix_adjusted = (
+                             prefix if prefix.endswith("/") else prefix + "/"
+                         )
+                         # Handle exact match of prefix as a file
+                         if obj["Key"] == prefix:
+                             relative_key = os.path.basename(obj["Key"])
+                         # Handle keys within the prefix "directory"
+                         elif obj["Key"].startswith(prefix_adjusted):
+                             relative_key = obj["Key"][len(prefix_adjusted) :]
+                         # This case should technically not be needed if prefix is used correctly in listing,
+                         # but handle defensively if object key *is* the prefix itself (without trailing slash)
+                         elif obj["Key"] == prefix.rstrip("/"):
+                             relative_key = os.path.basename(obj["Key"])
+                         # else: # Should not happen if prefix filter works correctly
+                         #     print(f"Warning: Unexpected key {obj['Key']} found for prefix {prefix}")
+                         #     relative_key = obj["Key"]  # Fallback
+                     elif not prefix:
+                         # If no prefix specified, the relative key is the full key
+                         relative_key = obj["Key"]
+                     # else: obj["Key"] does not start with prefix - ignore (shouldn't happen with Prefix param)
+
+                     # Skip if relative key is empty or None (e.g., prefix itself listed, or unexpected case)
+                     if not relative_key:
+                         continue
+
+                     # Ensure LastModified is timezone-aware
+                     last_modified = obj["LastModified"]
+                     if last_modified.tzinfo is None:
+                         last_modified = last_modified.replace(tzinfo=timezone.utc)
+
+                     objects[relative_key] = {
+                         "path": f"s3://{bucket}/{obj['Key']}",  # Store full path for reference
+                         "key": obj["Key"],  # Store full S3 key
+                         "size": obj["Size"],
+                         "mtime": last_modified,
+                         "type": "s3",
+                     }
+     except ClientError as e:
+         if e.response["Error"]["Code"] == "NoSuchBucket":
+             print(f"Error: Bucket '{bucket}' not found.")
+         # Allow sync *to* a non-existent prefix (will just upload all)
+         elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
+             if verbose:
+                 print(f"Prefix s3://{bucket}/{prefix} not found (treating as empty).")
+         else:
+             print(f"Error listing S3 objects: {e}")
+         # Return empty dict on error that prevents listing (like NoSuchBucket)
+         if e.response["Error"]["Code"] == "NoSuchBucket":
+             return {}
+     except Exception as e:
+         print(f"An unexpected error occurred listing S3 objects: {e}")
+         return {}  # Return empty on unexpected error
+
+     if verbose:
+         print(f"Found {len(objects)} objects in S3.")
+     return objects
+
+
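The mapping returned above is keyed by the prefix-relative key, with each value carrying the full URI, raw key, size, and a timezone-aware mtime; roughly (bucket and key hypothetical):

    # {"weights/model.bin": {
    #     "path": "s3://my-bucket/ckpt/weights/model.bin",
    #     "key": "ckpt/weights/model.bin",
    #     "size": 1024,
    #     "mtime": datetime(2024, 1, 1, tzinfo=timezone.utc),
    #     "type": "s3",
    # }}

`_list_local_files` below returns the same shape for local files, which is what lets `s3_sync` compare the two sides key by key.
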
+ def _list_local_files(
+     local_dir: str, verbose: bool = True
+ ) -> Dict[str, Dict[str, Any]]:
+     """Lists files in a local directory."""
+     if not os.path.isdir(local_dir):
+         # Check if it's a file path instead of a dir
+         if os.path.isfile(local_dir):
+             print(
+                 f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
+             )
+             try:
+                 local_size = os.path.getsize(local_dir)
+                 local_mtime_ts = os.path.getmtime(local_dir)
+                 local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
+                 file_name = os.path.basename(local_dir)
+                 return {
+                     file_name: {
+                         "path": local_dir,
+                         "size": local_size,
+                         "mtime": local_mtime,
+                         "type": "local",
+                     }
+                 }
+             except OSError as e:
+                 print(f"Error accessing source file {local_dir}: {e}")
+                 return {}
+         else:
+             print(f"Warning: Local path not found: {local_dir} (treating as empty).")
+             return {}
+
+     files: Dict[str, Dict[str, Any]] = {}
+     if verbose:
+         print(f"Scanning local directory: {local_dir}...")
+     for root, _, file_list in os.walk(local_dir):
+         for file_name in file_list:
+             local_path = os.path.join(root, file_name)
+             try:
+                 # Use '/' for relative key consistency
+                 relative_path = os.path.relpath(local_path, local_dir).replace(
+                     "\\", "/"
+                 )
+                 # relative_path will be '.' if local_dir points to a file, handled above.
+
+                 local_size = os.path.getsize(local_path)
+                 local_mtime_ts = os.path.getmtime(local_path)
+                 local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
+
+                 files[relative_path] = {
+                     "path": local_path,
+                     "size": local_size,
+                     "mtime": local_mtime,
+                     "type": "local",
+                 }
+             except OSError as e:
+                 print(f"Warning: Could not get metadata for {local_path}: {e}")
+             except Exception as e:
+                 print(f"Warning: Unexpected error processing {local_path}: {e}")
+
+     if verbose:
+         print(f"Found {len(files)} files locally.")
+     return files
+
+
+ def s3_sync(
+     source: str,
+     destination: str,
+     delete: bool = False,
+     dry_run: bool = False,
+     verbose: bool = True,
+ ) -> None:
+     """
+     Synchronizes files between a source and a destination, which can be
+     local paths or S3 paths (e.g., 's3://my-bucket/my-prefix').
+
+     Compares file sizes and modification times. Copies files from source
+     to destination if they are missing, differ in size, or are newer in the source.
+     Optionally deletes files from the destination if they are not present
+     in the source.
+
+     Args:
+         source (str): The source path (local directory/file or s3://...).
+         destination (str): The destination path (local directory or s3://...).
+         delete (bool): If True, delete extraneous files from the destination.
+         dry_run (bool): If True, print actions without performing them.
+         verbose (bool): If True, print actions being taken.
+     """
+     s3_client = boto3.client("s3")
+     mtime_tolerance = timedelta(
+         seconds=2
+     )  # S3 mtime might not have sub-second precision
+
+     src_bucket, src_prefix = _parse_s3_path(source)
+     dest_bucket, dest_prefix = _parse_s3_path(destination)
+
+     source_items: Dict[str, Dict[str, Any]] = {}
+     dest_items: Dict[str, Dict[str, Any]] = {}
+     sync_direction = ""
+     is_single_file_sync = False
+
+     # Determine sync direction and list items
+     if src_bucket is None and dest_bucket is not None:
+         sync_direction = "upload"
+         source_items = _list_local_files(source, verbose)
+         dest_items = _list_s3_objects(s3_client, dest_bucket, dest_prefix, verbose)
+         # Check if source exists (either dir or file)
+         if not os.path.exists(source):
+             print(f"Error: Source path {source} not found.")
+             return
+         is_single_file_sync = os.path.isfile(source)
+         # Destination prefix defaults to empty if not specified
+         if dest_prefix is None:
+             dest_prefix = ""
+
+     elif src_bucket is not None and dest_bucket is None:
+         sync_direction = "download"
+         source_items = _list_s3_objects(s3_client, src_bucket, src_prefix, verbose)
+         # For download, destination MUST be a directory (or created as one).
+         # If destination exists and is a file, it's an error.
+         if os.path.exists(destination) and not os.path.isdir(destination):
+             print(
+                 f"Error: Local destination '{destination}' exists but is not a directory."
+             )
+             return
+
+         dest_items = _list_local_files(destination, verbose)
+
+         # Ensure destination directory exists for downloads
+         if not dry_run:
+             os.makedirs(destination, exist_ok=True)
+         elif not os.path.isdir(destination) and verbose:
+             print(f"Dry run: Would create local directory {destination}")
+
+     elif src_bucket is None and dest_bucket is None:
+         print(
+             "Error: Both source and destination are local paths. Use standard file copy tools."
+         )
+         return
+     elif src_bucket is not None and dest_bucket is not None:
+         print(
+             "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
+         )
+         return
+     else:
+         # This case should not be reachable given the above checks
+         print("Error: Invalid source or destination path combination.")
+         return
+
+     actions_to_perform: List[Dict[str, Any]] = []
+
+     # --- Compare items ---
+     # Use source keys as the primary loop iterator
+     source_keys = set(source_items.keys())
+     dest_keys = set(dest_items.keys())
+
+     for rel_key in source_keys:
+         src_item = source_items[rel_key]
+         dest_item = dest_items.get(rel_key)
+         reason = ""
+
+         if dest_item is None:
+             reason = "does not exist in destination"
+         else:
+             # Compare metadata (size and mtime)
+             if src_item["size"] != dest_item["size"]:
+                 reason = (
+                     f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
+                 )
+             # Sync if source is newer (outside tolerance)
+             elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
+                 reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
+
+         if reason:
+             action_type = "upload" if sync_direction == "upload" else "download"
+             # Determine the final destination key/path
+             dest_full_path_or_key: Optional[str] = None
+             if sync_direction == "upload":
+                 # If uploading single file, dest key is prefix + filename
+                 # If uploading dir, dest key is prefix + relative key
+                 # Ensure dest_prefix is treated as empty string if None
+                 current_dest_prefix = dest_prefix or ""
+                 final_dest_key = (
+                     rel_key
+                     if is_single_file_sync
+                     else os.path.join(current_dest_prefix, rel_key).replace("\\", "/")
+                 )
+                 # Ensure we don't create keys like 's3://bucket//key' if prefix was empty
+                 if not current_dest_prefix and final_dest_key.startswith("/"):
+                     final_dest_key = final_dest_key.lstrip("/")
+                 dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
+             else:  # download
+                 dest_full_path_or_key = os.path.join(
+                     destination, rel_key.replace("/", os.sep)
+                 )
+
+             actions_to_perform.append(
+                 {
+                     "action": action_type,
+                     "relative_key": rel_key,
+                     "source_path": src_item["path"],  # Local path or S3 URI
+                     "source_mtime": src_item.get("mtime"),
+                     "dest_full_path_or_key": dest_full_path_or_key,
+                     # Store details needed for specific actions
+                     "dest_bucket": dest_bucket,
+                     "dest_prefix": dest_prefix,
+                     "s3_key_full_src": src_item.get("key")
+                     if sync_direction == "download"
+                     else None,
+                     "source_bucket": src_bucket,
+                     "reason": reason,
+                 }
+             )
+
+     # Identify items for deletion in destination
+     if delete:
+         keys_to_delete = dest_keys - source_keys
+         for rel_key in keys_to_delete:
+             dest_item = dest_items[rel_key]
+             action_type = "delete_s3" if sync_direction == "upload" else "delete_local"
+             actions_to_perform.append(
+                 {
+                     "action": action_type,
+                     "relative_key": rel_key,
+                     "path_to_delete": dest_item["path"],  # Full S3 URI or local path
+                     "s3_key_full_dest": dest_item.get("key")
+                     if sync_direction == "upload"
+                     else None,  # Needed for delete_s3
+                     "dest_bucket": dest_bucket,  # Needed for delete_s3
+                     "reason": "does not exist in source",
+                 }
+             )
+
+     # --- Execute Actions ---
+     uploads_done = downloads_done = deletions_done = 0
+     s3_deletions_batch: List[Dict[str, str]] = []
+
+     if not actions_to_perform:
+         print("Source and destination are already synchronized.")
+         # Still check if source/dest actually exist if nothing to do
+         if sync_direction == "upload" and not os.path.exists(source):
+             print(f"Note: Source path {source} does not exist.")
+         # Add check for S3 source existence if needed via head_bucket or similar
+         return
+
+     for action in actions_to_perform:
+         rel_key = action["relative_key"]
+         reason = action["reason"]
+         dest_full_path_or_key = action.get("dest_full_path_or_key")  # delete actions don't carry this key
+
+         if action["action"] == "upload":
+             local_path = action["source_path"]
+             # Ensure dest_full_path_or_key is valid before parsing
+             if not isinstance(dest_full_path_or_key, str):
+                 print(
+                     f"ERROR: Invalid destination path calculated for upload: {dest_full_path_or_key}"
+                 )
+                 continue
+             # Extract final key from the pre-calculated dest_full_path_or_key
+             _, upload_key = _parse_s3_path(dest_full_path_or_key)
+             target_bucket = action["dest_bucket"]
+
+             if verbose:
+                 print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
+             if not dry_run:
+                 if target_bucket and upload_key is not None:
+                     try:
+                         s3_client.upload_file(local_path, target_bucket, upload_key)
+                         uploads_done += 1
+                     except ClientError as e:
+                         print(f"ERROR uploading {local_path}: {e}")
+                     except Exception as e:
+                         print(f"ERROR uploading {local_path}: {e}")
+                 else:
+                     print(
+                         f"ERROR: Invalid S3 target for upload: bucket={target_bucket}, key={upload_key}"
+                     )
+
+         elif action["action"] == "download":
+             s3_key_full = action["s3_key_full_src"]
+             local_path = dest_full_path_or_key  # This is the local destination path
+             source_bucket_dl = action["source_bucket"]
+
+             if verbose:
+                 print(f"Download: {action['source_path']} to {local_path} ({reason})")
+             # Ensure local_path is valid before proceeding
+             if not isinstance(local_path, str):
+                 print(
+                     f"ERROR: Invalid local destination path calculated for download: {local_path}"
+                 )
+                 continue
+             if not dry_run:
+                 if source_bucket_dl and s3_key_full and local_path:
+                     try:
+                         local_file_dir = os.path.dirname(local_path)
+                         os.makedirs(local_file_dir, exist_ok=True)
+                         s3_client.download_file(
+                             source_bucket_dl, s3_key_full, local_path
+                         )
+                         downloads_done += 1
+                     except ClientError as e:
+                         print(f"ERROR downloading {s3_key_full}: {e}")
+                     except OSError as e:
+                         print(
+                             f"ERROR creating directory or writing file {local_path}: {e}"
+                         )
+                     except Exception as e:
+                         print(f"ERROR downloading {s3_key_full}: {e}")
+                 else:
+                     print(
+                         f"ERROR: Invalid parameters for download: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
+                     )
+
+         elif action["action"] == "delete_s3":
+             s3_key_to_delete = action["s3_key_full_dest"]
+             target_bucket_del = action["dest_bucket"]
+             if target_bucket_del and s3_key_to_delete:
+                 if verbose:
+                     print(f"Delete S3: {action['path_to_delete']} ({reason})")
+                 # Check type before appending to batch
+                 if isinstance(s3_key_to_delete, str):
+                     s3_deletions_batch.append({"Key": s3_key_to_delete})
+                 else:
+                     print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
+             else:
+                 print(
+                     f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
+                 )
+
+         elif action["action"] == "delete_local":
+             local_path_to_delete = action["path_to_delete"]
+             if verbose:
+                 print(f"Delete Local: {local_path_to_delete} ({reason})")
+             if not dry_run:
+                 try:
+                     os.remove(local_path_to_delete)
+                     deletions_done += 1
+                     # TODO: Optionally clean up empty directories?
+                 except OSError as e:
+                     print(f"ERROR deleting local file {local_path_to_delete}: {e}")
+
+     # Process S3 deletions in batches
+     if s3_deletions_batch:
+         # Get the target bucket from the first deletion action (should be consistent)
+         target_bucket_del_batch = next(
+             (
+                 a["dest_bucket"]
+                 for a in actions_to_perform
+                 if a["action"] == "delete_s3"
+             ),
+             None,
+         )
+         if not dry_run and target_bucket_del_batch:
+             deleted_count_batch = 0
+             for i in range(0, len(s3_deletions_batch), 1000):
+                 batch = s3_deletions_batch[i : i + 1000]
+                 delete_payload = {"Objects": batch, "Quiet": False}  # Get errors back
+                 try:
+                     response = s3_client.delete_objects(
+                         Bucket=target_bucket_del_batch, Delete=delete_payload
+                     )
+                     # Count the batch optimistically, then subtract any reported errors
+                     deleted_count_batch += len(
+                         batch
+                     )  # Assume success unless errors reported
+                     if "Deleted" in response:
+                         pass  # Counted optimistically above
+                         # deleted_count_batch += len(response['Deleted'])
+                     if "Errors" in response and response["Errors"]:
+                         deleted_count_batch -= len(
+                             response["Errors"]
+                         )  # Adjust count for errors
+                         for error in response["Errors"]:
+                             print(
+                                 f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
+                             )
+                 except ClientError as e:
+                     print(f"ERROR deleting S3 objects batch: {e}")
+                     deleted_count_batch = 0  # Assume batch failed
+                 except Exception as e:
+                     print(f"ERROR deleting S3 objects batch: {e}")
+                     deleted_count_batch = 0  # Assume batch failed
+             deletions_done += deleted_count_batch
+         elif target_bucket_del_batch:  # dry_run is True
+             deletions_done = len(
+                 s3_deletions_batch
+             )  # Report planned deletions for dry run
+         else:
+             print("Warning: Could not determine target bucket for S3 deletion batch.")
+
+     # --- Summary ---
+     if dry_run:
+         upload_count = sum(1 for a in actions_to_perform if a["action"] == "upload")
+         download_count = sum(1 for a in actions_to_perform if a["action"] == "download")
+         # Deletion count for dry run is based on the batch prepared
+         delete_s3_count = len(s3_deletions_batch)
+         delete_local_count = sum(
+             1 for a in actions_to_perform if a["action"] == "delete_local"
+         )
+         print("\n--- DRY RUN SUMMARY ---")
+         if sync_direction == "upload":
+             print(f"Would upload: {upload_count} file(s)")
+             if delete:
+                 print(f"Would delete from S3: {delete_s3_count} object(s)")
+         elif sync_direction == "download":
+             print(f"Would download: {download_count} file(s)")
+             if delete:
+                 print(f"Would delete locally: {delete_local_count} file(s)")
+         print("--- END DRY RUN ---")
+     else:
+         if sync_direction == "upload":
+             print(
+                 f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
+             )
+         elif sync_direction == "download":
+             print(
+                 f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
+             )
+
+
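An end-to-end sketch (bucket and directories hypothetical; boto3 credentials are assumed to be configured):

    # Preview an upload sync, then apply it, pruning S3 objects missing locally.
    s3_sync("/tmp/checkpoints", "s3://my-bucket/ckpt", delete=True, dry_run=True)
    s3_sync("/tmp/checkpoints", "s3://my-bucket/ckpt", delete=True)

    # The reverse direction pulls the prefix down into a local directory.
    s3_sync("s3://my-bucket/ckpt", "/tmp/restore")
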
+ def s3_check(s3_uri: str) -> bool:
+     """
+     Check if an object or prefix exists in an S3 bucket using an S3 URI.
+
+     Args:
+         s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
+                       Use a trailing '/' to check for a prefix/directory.
+
+     Returns:
+         bool: True if the object or prefix exists, False otherwise.
+     """
+     s3 = boto3.client("s3")
+     bucket_name, s3_key = _parse_s3_path(s3_uri)
+
+     if bucket_name is None or s3_key is None:
+         # _parse_s3_path returns None, None if scheme is not 's3'
+         print(f"Error: Invalid S3 URI format: {s3_uri}")
+         return False
+
+     is_prefix = s3_key.endswith("/")
+
+     try:
+         if is_prefix:
+             # Check for prefix existence by listing objects.
+             # Handle the case where s3_key might be empty if URI is just s3://bucket/
+             list_prefix = s3_key if s3_key else ""
+             response = s3.list_objects_v2(
+                 Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
+             )
+             # Check if any objects OR common prefixes (folders) are returned for the prefix
+             return "Contents" in response or "CommonPrefixes" in response
+         else:
+             # Check for object existence
+             s3.head_object(Bucket=bucket_name, Key=s3_key)
+             return True
+     except ClientError as e:  # Catch boto3 ClientError first
+         # If head_object returns 404 (NoSuchKey), the object doesn't exist;
+         # list_objects_v2 does not raise NoSuchKey for prefixes
+         if e.response["Error"]["Code"] == "404":
+             return False
+         elif e.response["Error"]["Code"] == "NoSuchBucket":
+             print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
+             return False
+         # Handle other potential errors like AccessDenied differently if needed
+         print(f"Error checking {s3_uri}: {e}")
+         return False
+     # except s3.exceptions.NoSuchBucket:  # This specific exception is less common with boto3 client
+     #     print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
+     #     return False
+     except Exception as e:
+         print(f"An unexpected error occurred checking {s3_uri}: {e}")
+         return False
+
+
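Usage sketch (URIs hypothetical); the trailing slash selects the prefix branch above:

    s3_check("s3://my-bucket/ckpt/model.bin")  # object check via HeadObject
    s3_check("s3://my-bucket/ckpt/")           # prefix check via ListObjectsV2, MaxKeys=1
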
+ def s3_copy(
+     source: str,
+     destination: str,
+     verbose: bool = True,
+ ) -> None:
+     """
+     Copies files or directories between local paths and S3 URIs.
+
+     Handles:
+     - Local file to S3 object
+     - Local directory to S3 prefix (recursive)
+     - S3 object to local file
+     - S3 prefix to local directory (recursive)
+
+     Does NOT handle:
+     - Local to Local (use shutil)
+     - S3 to S3 (use AWS CLI or boto3 object copy)
+
+     Args:
+         source (str): The source path (local file/dir or s3://...).
+         destination (str): The destination path (local file/dir or s3://...).
+         verbose (bool): If True, print actions being taken.
+     """
+     s3_client = boto3.client("s3")
+     src_bucket, src_prefix = _parse_s3_path(source)
+     dest_bucket, dest_prefix = _parse_s3_path(destination)
+
+     # --- Reject unsupported operations ---
+     if src_bucket is None and dest_bucket is None:
+         print(
+             "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
+         )
+         return
+     if src_bucket is not None and dest_bucket is not None:
+         print(
+             "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
+         )
+         return
+
+     # --- Upload: Local to S3 ---
+     if src_bucket is None and dest_bucket is not None:
+         if not os.path.exists(source):
+             print(f"Error: Local source path not found: {source}")
+             return
+         # Ensure dest_prefix is usable, default to empty string if None
+         dest_prefix = dest_prefix or ""
+
+         # Case 1: Source is a local file
+         if os.path.isfile(source):
+             # Determine final S3 key:
+             # if dest looks like a dir (ends with '/') or is empty, append the filename
+             if not dest_prefix or destination.endswith("/"):
+                 s3_key = os.path.join(dest_prefix, os.path.basename(source)).replace(
+                     "\\", "/"
+                 )
+             else:  # Treat dest as the exact key name
+                 s3_key = dest_prefix
+
+             if verbose:
+                 print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
+             try:
+                 s3_client.upload_file(source, dest_bucket, s3_key)
+                 print("Upload complete.")
+             except ClientError as e:
+                 print(f"ERROR uploading {source}: {e}")
+             except Exception as e:
+                 print(f"ERROR uploading {source}: {e}")
+
+         # Case 2: Source is a local directory
+         elif os.path.isdir(source):
+             if verbose:
+                 print(
+                     f"Uploading directory {source}/* to s3://{dest_bucket}/{dest_prefix}/"
+                 )
+             files_uploaded = 0
+             files_failed = 0
+             for root, _, files in os.walk(source):
+                 for file in files:
+                     local_path = os.path.join(root, file)
+                     relative_path = os.path.relpath(local_path, source)
+                     s3_key = os.path.join(dest_prefix, relative_path).replace("\\", "/")
+                     if verbose:
+                         print(
+                             f"  Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
+                         )
+                     try:
+                         s3_client.upload_file(local_path, dest_bucket, s3_key)
+                         files_uploaded += 1
+                     except ClientError as e:
+                         print(f"  ERROR uploading {local_path}: {e}")
+                         files_failed += 1
+                     except Exception as e:
+                         print(f"  ERROR uploading {local_path}: {e}")
+                         files_failed += 1
+             print(
+                 f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
+             )
+         else:
+             print(f"Error: Source {source} is neither a file nor a directory.")
+
+     # --- Download: S3 to Local ---
+     elif src_bucket is not None and dest_bucket is None:
+         # Determine if source is likely a single object or a prefix
+         is_prefix_download = False
+         single_object_key = None
+
+         # If source ends with '/', treat it as a prefix explicitly
+         if source.endswith("/"):
+             is_prefix_download = True
+             src_prefix = src_prefix or ""  # Ensure not None
+         else:
+             # Try checking if the source key exists as a single object
+             try:
+                 s3_client.head_object(Bucket=src_bucket, Key=src_prefix)
+                 single_object_key = src_prefix  # It exists as a single object
+             except ClientError as e:
+                 if e.response["Error"]["Code"] == "404":
+                     # Object doesn't exist, assume it's a prefix for recursive download
+                     is_prefix_download = True
+                     src_prefix = src_prefix or ""  # Ensure not None
+                 elif e.response["Error"]["Code"] == "NoSuchBucket":
+                     print(f"Error: Source bucket '{src_bucket}' not found.")
+                     return
+                 else:
+                     # Other error (e.g., permissions)
+                     print(
+                         f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
+                     )
+                     return
+             except Exception as e:
+                 print(
+                     f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
+                 )
+                 return
+
+         # Case 1: Download single S3 object
+         if single_object_key is not None:
+             # Determine local destination path
+             if os.path.isdir(destination) or destination.endswith(os.sep):
+                 # Download into the directory
+                 local_dest_path = os.path.join(
+                     destination, os.path.basename(single_object_key)
+                 )
+                 # Create local directory if downloading into it and it doesn't exist
+                 os.makedirs(destination, exist_ok=True)
+             else:
+                 # Download to the exact file path
+                 local_dest_path = destination
+                 # Ensure parent directory exists
+                 parent_dir = os.path.dirname(local_dest_path)
+                 if parent_dir:
+                     os.makedirs(parent_dir, exist_ok=True)
+
+             if verbose:
+                 print(
+                     f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
+                 )
+             try:
+                 s3_client.download_file(src_bucket, single_object_key, local_dest_path)
+                 print("Download complete.")
+             except ClientError as e:
+                 print(f"ERROR downloading {single_object_key}: {e}")
+             except OSError as e:
+                 print(
+                     f"ERROR creating directory or writing file {local_dest_path}: {e}"
+                 )
+             except Exception as e:
+                 print(f"ERROR downloading {single_object_key}: {e}")
+
+         # Case 2: Download S3 prefix (recursive)
+         elif is_prefix_download:
+             # Ensure local destination is a directory
+             if os.path.exists(destination) and not os.path.isdir(destination):
+                 print(
+                     f"Error: Local destination '{destination}' exists but is not a directory for prefix download."
+                 )
+                 return
+             os.makedirs(destination, exist_ok=True)
+
+             if verbose:
+                 print(
+                     f"Downloading prefix s3://{src_bucket}/{src_prefix}/* to {destination}/"
+                 )
+
+             paginator = s3_client.get_paginator("list_objects_v2")
+             files_downloaded = 0
+             files_failed = 0
+             operation_parameters = {"Bucket": src_bucket}
+             if src_prefix:
+                 operation_parameters["Prefix"] = src_prefix
+
+             try:
+                 page_iterator = paginator.paginate(**operation_parameters)
+                 found_objects = False
+                 for page in page_iterator:
+                     if "Contents" in page:
+                         found_objects = True
+                         for obj in page["Contents"]:
+                             s3_key = obj["Key"]
+                             # Skip zero-byte directory markers if downloading a prefix
+                             if s3_key.endswith("/") and obj["Size"] == 0:
+                                 continue
+
+                             # Calculate relative path from the source prefix
+                             if src_prefix and s3_key.startswith(src_prefix):
+                                 # Handle potential trailing slash inconsistency
+                                 prefix_adjusted = (
+                                     src_prefix
+                                     if src_prefix.endswith("/")
+                                     else src_prefix + "/"
+                                 )
+                                 if s3_key.startswith(prefix_adjusted):
+                                     relative_key = s3_key[len(prefix_adjusted) :]
+                                 # Handle the prefix itself if listed as an object (unlikely for prefix download)
+                                 elif s3_key == src_prefix.rstrip("/"):
+                                     relative_key = os.path.basename(s3_key)
+                                 else:  # Should not happen
+                                     relative_key = s3_key
+                             elif not src_prefix:  # Downloading whole bucket essentially
+                                 relative_key = s3_key
+                             else:  # Key doesn't start with prefix, should not happen
+                                 continue
+
+                             # Skip if relative key is empty (e.g. prefix marker was somehow processed)
+                             if not relative_key:
+                                 continue
+
+                             local_dest_path = os.path.join(
+                                 destination, relative_key.replace("/", os.sep)
+                             )
+                             local_dest_dir = os.path.dirname(local_dest_path)
+
+                             if verbose:
+                                 print(
+                                     f"  Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
+                                 )
+                             try:
+                                 if local_dest_dir:
+                                     os.makedirs(local_dest_dir, exist_ok=True)
+                                 s3_client.download_file(
+                                     src_bucket, s3_key, local_dest_path
+                                 )
+                                 files_downloaded += 1
+                             except ClientError as e:
+                                 print(f"  ERROR downloading {s3_key}: {e}")
+                                 files_failed += 1
+                             except OSError as e:
+                                 print(
+                                     f"  ERROR creating directory or writing file {local_dest_path}: {e}"
+                                 )
+                                 files_failed += 1
+                             except Exception as e:
+                                 print(f"  ERROR downloading {s3_key}: {e}")
+                                 files_failed += 1
+
+                 if not found_objects:
+                     print(
+                         f"Warning: No objects found at source prefix s3://{src_bucket}/{src_prefix}"
+                     )
+
+                 print(
+                     f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
+                 )
+
+             except ClientError as e:
+                 if e.response["Error"]["Code"] == "NoSuchBucket":
+                     print(f"Error: Source bucket '{src_bucket}' not found.")
+                 else:
+                     print(
+                         f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}"
+                     )
+             except Exception as e:
+                 print(f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}")
+
+     else:  # Should not be reachable
+         print("Error: Unknown copy operation type.")