nebu 0.1.24__py3-none-any.whl → 0.1.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nebu/data.py ADDED
@@ -0,0 +1,855 @@
+ import os
+ import subprocess
+ from datetime import datetime, timedelta, timezone
+ from typing import Any, Dict, List, Optional, Tuple
+ from urllib.parse import urlparse
+
+ import boto3
+ from botocore.exceptions import ClientError
+
+
+ def rclone_copy(
+     source_dir: str,
+     destination: str,
+     dry_run: bool = False,
+     transfers: int = 4,
+     extra_args: Optional[List[str]] = None,
+     verbose: bool = True,
+ ) -> bool:
+     """
+     Upload a directory to a remote bucket using `rclone copy`.
+
+     Args:
+         source_dir (str): Path to local directory to upload.
+         destination (str): Remote destination, e.g., 's3:my-bucket/path'.
+         dry_run (bool): If True, performs a dry run without uploading.
+         transfers (int): Number of parallel transfers.
+         extra_args (Optional[List[str]]): Additional rclone flags.
+         verbose (bool): If True, prints command and output live.
+
+     Returns:
+         bool: True if upload succeeded, False otherwise.
+     """
+     command = [
+         "rclone",
+         "copy",
+         source_dir,
+         destination,
+         f"--transfers={transfers}",
+         "--progress",
+     ]
+
+     if dry_run:
+         command.append("--dry-run")
+     if extra_args:
+         command.extend(extra_args)
+
+     if verbose:
+         print("Running command:", " ".join(command))
+
+     try:
+         process = subprocess.Popen(
+             command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+         )
+
+         if not process.stdout:
+             raise Exception("No output from rclone")
+
+         for line in process.stdout:
+             if verbose:
+                 print(line.strip())
+
+         return process.wait() == 0
+
+     except Exception as e:
+         print(f"Error during rclone copy: {e}")
+         return False
+
+
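For illustration, a minimal call to this helper might look as follows (the `s3:my-bucket/checkpoints` remote path is a placeholder; rclone must be installed and configured with an `s3` remote):

    ok = rclone_copy(
        source_dir="./outputs",
        destination="s3:my-bucket/checkpoints",
        dry_run=True,  # preview only; drop this to actually upload
    )
    print("upload succeeded:", ok)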
+ def find_latest_checkpoint(training_dir: str) -> Optional[str]:
+     """
+     Finds the checkpoint directory with the highest step number in a Hugging Face
+     training output directory.
+
+     Args:
+         training_dir (str): The path to the training output directory.
+
+     Returns:
+         Optional[str]: The path to the latest checkpoint directory, or None if
+                        no checkpoint directories are found or the directory
+                        doesn't exist.
+     """
+     latest_step = -1
+     latest_checkpoint_dir = None
+
+     if not os.path.isdir(training_dir):
+         print(f"Error: Directory not found: {training_dir}")
+         return None
+
+     for item in os.listdir(training_dir):
+         item_path = os.path.join(training_dir, item)
+         if os.path.isdir(item_path) and item.startswith("checkpoint-"):
+             try:
+                 step_str = item.split("-")[-1]
+                 if step_str.isdigit():
+                     step = int(step_str)
+                     if step > latest_step:
+                         latest_step = step
+                         latest_checkpoint_dir = item_path
+             except (ValueError, IndexError):
+                 # Ignore items that don't match the expected pattern
+                 continue
+
+     return latest_checkpoint_dir
+
+
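As a usage sketch (the `./runs/my-job` path and `checkpoint-500` layout are hypothetical, mirroring Hugging Face Trainer output such as `checkpoint-100`, `checkpoint-500`):

    latest = find_latest_checkpoint("./runs/my-job")
    if latest:
        print(f"Resuming from {latest}")  # e.g. ./runs/my-job/checkpoint-500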
+ def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
+     """Standalone helper: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+     parsed = urlparse(path)
+     if parsed.scheme != "s3":
+         return None, None
+     bucket = parsed.netloc
+     prefix = parsed.path.lstrip("/")
+     return bucket, prefix
+
+
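Concretely, given the urlparse-based logic above (bucket name hypothetical):

    assert _parse_s3_path("s3://my-bucket/models/adapter") == ("my-bucket", "models/adapter")
    assert _parse_s3_path("/tmp/local/path") == (None, None)  # non-s3 schemes yield (None, None)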
+ class Bucket:
+     """Handles interactions with AWS S3."""
+
+     def __init__(self, verbose: bool = True):
+         """
+         Initializes the S3 handler.
+
+         Args:
+             verbose (bool): If True, prints status messages. Defaults to True.
+         """
+         self.client = boto3.client("s3")
+         self.verbose = verbose
+
+     def _parse_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
+         """Class method: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+         # Reusing the standalone logic here for consistency
+         return _parse_s3_path(path)
+
+     def _list_objects(
+         self, bucket: str, prefix: Optional[str]
+     ) -> Dict[str, Dict[str, Any]]:
+         """Class method: Lists objects in an S3 prefix."""
+         objects: Dict[str, Dict[str, Any]] = {}
+         paginator = self.client.get_paginator("list_objects_v2")
+         list_prefix = prefix or ""
+         if self.verbose:
+             print(f"Listing objects in s3://{bucket}/{list_prefix}...")
+
+         operation_parameters = {"Bucket": bucket}
+         if list_prefix:
+             operation_parameters["Prefix"] = list_prefix
+
+         try:
+             page_iterator = paginator.paginate(**operation_parameters)
+             for page in page_iterator:
+                 if "Contents" in page:
+                     for obj in page["Contents"]:
+                         if obj["Key"].endswith("/") and obj["Size"] == 0:
+                             continue
+                         relative_key: Optional[str] = None
+                         current_prefix = prefix or ""
+                         if current_prefix and obj["Key"].startswith(current_prefix):
+                             prefix_adjusted = current_prefix + (
+                                 "" if current_prefix.endswith("/") else "/"
+                             )
+                             if obj["Key"] == current_prefix.rstrip("/"):
+                                 relative_key = os.path.basename(obj["Key"])
+                             elif obj["Key"].startswith(prefix_adjusted):
+                                 relative_key = obj["Key"][len(prefix_adjusted) :]
+                             else:
+                                 potential_rel_key = obj["Key"][len(current_prefix) :]
+                                 relative_key = potential_rel_key.lstrip("/")
+                         elif not current_prefix:
+                             relative_key = obj["Key"]
+                         if not relative_key:
+                             continue
+                         last_modified = obj["LastModified"]
+                         if last_modified.tzinfo is None:
+                             last_modified = last_modified.replace(tzinfo=timezone.utc)
+                         objects[relative_key] = {
+                             "path": f"s3://{bucket}/{obj['Key']}",
+                             "key": obj["Key"],
+                             "size": obj["Size"],
+                             "mtime": last_modified,
+                             "type": "s3",
+                         }
+         except ClientError as e:
+             if e.response["Error"]["Code"] == "NoSuchBucket":
+                 if self.verbose:
+                     print(f"Error: Bucket '{bucket}' not found.")
+             elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
+                 if self.verbose:
+                     print(
+                         f"Prefix s3://{bucket}/{prefix} not found (treating as empty)."
+                     )
+             else:
+                 print(f"Error listing S3 objects: {e}")
+             if e.response["Error"]["Code"] == "NoSuchBucket":
+                 return {}
+         except Exception as e:
+             print(f"An unexpected error occurred listing S3 objects: {e}")
+             return {}
+         if self.verbose:
+             print(f"Found {len(objects)} objects in S3.")
+         return objects
+
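To make the relative-key handling above concrete, a sketch of the mapping it returns (bucket and keys hypothetical):

    b = Bucket(verbose=False)
    objs = b._list_objects("my-bucket", "data/")
    # Keys are relative to the prefix, e.g.:
    # objs["train.csv"] == {
    #     "path": "s3://my-bucket/data/train.csv", "key": "data/train.csv",
    #     "size": 1234, "mtime": <tz-aware datetime>, "type": "s3",
    # }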
+     def _list_local(self, local_dir: str) -> Dict[str, Dict[str, Any]]:
+         """Class method: Lists files in a local directory."""
+         files: Dict[str, Dict[str, Any]] = {}
+         if not os.path.exists(local_dir):
+             if self.verbose:
+                 print(
+                     f"Warning: Local path not found: {local_dir} (treating as empty)."
+                 )
+             return files
+         if os.path.isfile(local_dir):
+             if self.verbose:
+                 print(
+                     f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
+                 )
+             try:
+                 file_name = os.path.basename(local_dir)
+                 files[file_name] = {
+                     "path": local_dir,
+                     "size": os.path.getsize(local_dir),
+                     "mtime": datetime.fromtimestamp(
+                         os.path.getmtime(local_dir), tz=timezone.utc
+                     ),
+                     "type": "local",
+                 }
+             except OSError as e:
+                 print(f"Error accessing source file {local_dir}: {e}")
+             return files
+         if self.verbose:
+             print(f"Scanning local directory: {local_dir}...")
+         for root, _, file_list in os.walk(local_dir):
+             for file_name in file_list:
+                 local_path = os.path.join(root, file_name)
+                 try:
+                     relative_path = os.path.relpath(local_path, local_dir).replace(
+                         "\\", "/"
+                     )
+                     files[relative_path] = {
+                         "path": local_path,
+                         "size": os.path.getsize(local_path),
+                         "mtime": datetime.fromtimestamp(
+                             os.path.getmtime(local_path), tz=timezone.utc
+                         ),
+                         "type": "local",
+                     }
+                 except OSError as e:
+                     print(f"Warning: Could not get metadata for {local_path}: {e}")
+                 except Exception as e:
+                     print(f"Warning: Unexpected error processing {local_path}: {e}")
+         if self.verbose:
+             print(f"Found {len(files)} files locally.")
+         return files
+
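Local listings use the same shape, keyed by forward-slash relative paths (directory hypothetical):

    files = Bucket(verbose=False)._list_local("./outputs")
    # e.g. files["logs/run.txt"]["mtime"] is a timezone-aware UTC datetime,
    # so it compares cleanly against the S3 "mtime" values above.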
+     def sync(
+         self,
+         source: str,
+         destination: str,
+         delete: bool = False,
+         dry_run: bool = False,
+     ) -> None:
+         """
+         Synchronizes files between a source and a destination (local or S3).
+         Compares file sizes and modification times. Copies files that are missing, differ in size, or are newer in the source.
+         Optionally deletes extraneous files from the destination.
+         Args:
+             source (str): The source path (local directory/file or s3://...).
+             destination (str): The destination path (local directory or s3://...).
+             delete (bool): If True, delete extraneous files from the destination.
+             dry_run (bool): If True, print actions without performing them.
+         """
+         mtime_tolerance = timedelta(seconds=2)
+         src_bucket, src_prefix = self._parse_path(source)
+         dest_bucket, dest_prefix = self._parse_path(destination)
+         source_items: Dict[str, Dict[str, Any]] = {}
+         dest_items: Dict[str, Dict[str, Any]] = {}
+         sync_direction = ""
+         is_single_file_sync = False
+
+         if src_bucket is None and dest_bucket is not None:
+             sync_direction = "upload"
+             source_items = self._list_local(source)
+             dest_items = self._list_objects(dest_bucket, dest_prefix)
+             if not source_items and not os.path.exists(source):
+                 print(
+                     f"Error: Source path {source} not found."
+                 )  # Check needed? list_local handles it.
+                 # return # Let it proceed if source is just empty
+             if os.path.isfile(source):
+                 is_single_file_sync = True
+             # current_dest_prefix = dest_prefix or "" # Moved closer to usage
+
+         elif src_bucket is not None and dest_bucket is None:
+             sync_direction = "download"
+             source_items = self._list_objects(src_bucket, src_prefix)
+             if os.path.exists(destination) and not os.path.isdir(destination):
+                 print(
+                     f"Error: Local destination '{destination}' exists but is not a directory."
+                 )
+                 return
+             dest_items = self._list_local(destination)
+             if not dry_run:
+                 os.makedirs(destination, exist_ok=True)
+             elif not os.path.isdir(destination) and self.verbose:
+                 print(f"Dry run: Would create local directory {destination}")
+
+         elif src_bucket is None and dest_bucket is None:
+             print(
+                 "Error: Both source and destination are local paths. Use standard file copy tools."
+             )
+             return
+         elif src_bucket is not None and dest_bucket is not None:
+             print(
+                 "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
+             )
+             return
+         else:
+             print("Error: Invalid source or destination path combination.")
+             return
+
+         actions_to_perform: List[Dict[str, Any]] = []
+         source_keys = set(source_items.keys())
+         dest_keys = set(dest_items.keys())
+
+         for rel_key in source_keys:
+             src_item = source_items[rel_key]
+             dest_item = dest_items.get(rel_key)
+             reason = ""
+             if dest_item is None:
+                 reason = "does not exist in destination"
+             else:
+                 if src_item["size"] != dest_item["size"]:
+                     reason = f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
+                 elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
+                     reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
+             if reason:
+                 action_type = "upload" if sync_direction == "upload" else "download"
+                 dest_full_path_or_key: Optional[str] = None
+                 if sync_direction == "upload":
+                     # Define current_dest_prefix here, just before use
+                     current_dest_prefix = dest_prefix or ""
+                     final_dest_key = (
+                         rel_key
+                         if is_single_file_sync
+                         else os.path.join(current_dest_prefix, rel_key).replace(
+                             "\\", "/"
+                         )
+                     )
+                     if not current_dest_prefix and final_dest_key.startswith("/"):
+                         final_dest_key = final_dest_key.lstrip("/")
+                     dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
+                 else:
+                     dest_full_path_or_key = os.path.join(
+                         destination, rel_key.replace("/", os.sep)
+                     )
+                 actions_to_perform.append(
+                     {
+                         "action": action_type,
+                         "relative_key": rel_key,
+                         "source_path": src_item["path"],
+                         "source_mtime": src_item.get("mtime"),
+                         "dest_full_path_or_key": dest_full_path_or_key,
+                         "dest_bucket": dest_bucket,
+                         "dest_prefix": dest_prefix,
+                         "s3_key_full_src": src_item.get("key")
+                         if sync_direction == "download"
+                         else None,
+                         "source_bucket": src_bucket,
+                         "reason": reason,
+                     }
+                 )
+
+         if delete:
+             keys_to_delete = dest_keys - source_keys
+             for rel_key in keys_to_delete:
+                 dest_item = dest_items[rel_key]
+                 action_type = (
+                     "delete_s3" if sync_direction == "upload" else "delete_local"
+                 )
+                 actions_to_perform.append(
+                     {
+                         "action": action_type,
+                         "relative_key": rel_key,
+                         "path_to_delete": dest_item["path"],
+                         "s3_key_full_dest": dest_item.get("key")
+                         if sync_direction == "upload"
+                         else None,
+                         "dest_bucket": dest_bucket,
+                         "reason": "does not exist in source",
+                     }
+                 )
+
+         uploads_done = downloads_done = deletions_done = 0
+         s3_deletions_batch: List[Dict[str, str]] = []
+         if not actions_to_perform:
+             if self.verbose:
+                 print("Source and destination are already synchronized.")
+             # Optional: Add check if source exists if sync_direction == "upload" and not os.path.exists(source):
+             return
+
+         for action in actions_to_perform:
+             reason = action["reason"]
+             dest_full_path_or_key = action["dest_full_path_or_key"]
+             if action["action"] == "upload":
+                 local_path = action["source_path"]
+                 if not isinstance(dest_full_path_or_key, str):
+                     print(f"ERROR: Invalid dest path: {dest_full_path_or_key}")
+                     continue
+                 _, upload_key = self._parse_path(dest_full_path_or_key)
+                 target_bucket = action["dest_bucket"]
+                 if self.verbose:
+                     print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
+                 if not dry_run:
+                     if target_bucket and upload_key is not None:
+                         try:
+                             self.client.upload_file(
+                                 local_path, target_bucket, upload_key
+                             )
+                             uploads_done += 1
+                         except ClientError as e:
+                             print(f"ERROR uploading {local_path}: {e}")
+                         except Exception as e:
+                             print(f"ERROR uploading {local_path}: {e}")
+                     else:
+                         print(
+                             f"ERROR: Invalid S3 target: bucket={target_bucket}, key={upload_key}"
+                         )
+             elif action["action"] == "download":
+                 s3_key_full = action["s3_key_full_src"]
+                 local_path = dest_full_path_or_key
+                 source_bucket_dl = action["source_bucket"]
+                 if self.verbose:
+                     print(
+                         f"Download: {action['source_path']} to {local_path} ({reason})"
+                     )
+                 if not isinstance(local_path, str):
+                     print(f"ERROR: Invalid local dest path: {local_path}")
+                     continue
+                 if not dry_run:
+                     if source_bucket_dl and s3_key_full and local_path:
+                         try:
+                             local_file_dir = os.path.dirname(local_path)
+                             os.makedirs(local_file_dir, exist_ok=True)
+                             self.client.download_file(
+                                 source_bucket_dl, s3_key_full, local_path
+                             )
+                             downloads_done += 1
+                         except ClientError as e:
+                             print(f"ERROR downloading {s3_key_full}: {e}")
+                         except OSError as e:
+                             print(f"ERROR creating/writing {local_path}: {e}")
+                         except Exception as e:
+                             print(f"ERROR downloading {s3_key_full}: {e}")
+                     else:
+                         print(
+                             f"ERROR: Invalid download params: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
+                         )
+             elif action["action"] == "delete_s3":
+                 s3_key_to_delete = action["s3_key_full_dest"]
+                 target_bucket_del = action["dest_bucket"]
+                 if target_bucket_del and s3_key_to_delete:
+                     if self.verbose:
+                         print(f"Delete S3: {action['path_to_delete']} ({reason})")
+                     if isinstance(s3_key_to_delete, str):
+                         s3_deletions_batch.append({"Key": s3_key_to_delete})
+                     else:
+                         print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
+                 else:
+                     print(
+                         f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
+                     )
+             elif action["action"] == "delete_local":
+                 local_path_to_delete = action["path_to_delete"]
+                 if self.verbose:
+                     print(f"Delete Local: {local_path_to_delete} ({reason})")
+                 if not dry_run:
+                     try:
+                         os.remove(local_path_to_delete)
+                         deletions_done += 1
+                     except OSError as e:
+                         print(f"ERROR deleting local file {local_path_to_delete}: {e}")
+
+         if s3_deletions_batch:
+             target_bucket_del_batch = next(
+                 (
+                     a["dest_bucket"]
+                     for a in actions_to_perform
+                     if a["action"] == "delete_s3"
+                 ),
+                 None,
+             )
+             if not dry_run and target_bucket_del_batch:
+                 deleted_count_batch = 0
+                 for i in range(0, len(s3_deletions_batch), 1000):
+                     batch = s3_deletions_batch[i : i + 1000]
+                     delete_payload = {"Objects": batch, "Quiet": False}
+                     try:
+                         response = self.client.delete_objects(
+                             Bucket=target_bucket_del_batch, Delete=delete_payload
+                         )
+                         deleted_count_batch += len(batch)
+                         if "Errors" in response and response["Errors"]:
+                             deleted_count_batch -= len(response["Errors"])
+                             for error in response["Errors"]:
+                                 print(
+                                     f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
+                                 )
+                     except ClientError as e:
+                         print(f"ERROR deleting S3 objects batch: {e}")
+                         deleted_count_batch = 0
+                     except Exception as e:
+                         print(f"ERROR deleting S3 objects batch: {e}")
+                         deleted_count_batch = 0
+                 deletions_done += deleted_count_batch
+             elif target_bucket_del_batch:
+                 deletions_done = len(s3_deletions_batch)
+             else:
+                 print(
+                     "Warning: Could not determine target bucket for S3 deletion batch."
+                 )
+
+         if dry_run:
+             if self.verbose:
+                 upload_count = sum(
+                     1 for a in actions_to_perform if a["action"] == "upload"
+                 )
+                 download_count = sum(
+                     1 for a in actions_to_perform if a["action"] == "download"
+                 )
+                 delete_s3_count = len(s3_deletions_batch)
+                 delete_local_count = sum(
+                     1 for a in actions_to_perform if a["action"] == "delete_local"
+                 )
+                 print("\n--- DRY RUN SUMMARY ---")
+                 if sync_direction == "upload":
+                     print(f"Would upload: {upload_count} file(s)")
+                     if delete:
+                         print(f"Would delete from S3: {delete_s3_count} object(s)")
+                 elif sync_direction == "download":
+                     print(f"Would download: {download_count} file(s)")
+                     if delete:
+                         print(f"Would delete locally: {delete_local_count} file(s)")
+                 print("--- END DRY RUN ---")
+         else:
+             if self.verbose:
+                 if sync_direction == "upload":
+                     print(
+                         f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
+                     )
+                 elif sync_direction == "download":
+                     print(
+                         f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
+                     )
+
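A minimal usage sketch of sync (bucket name and paths are placeholders; boto3 credentials must already be configured):

    bucket = Bucket()
    # Preview the plan without writing anything
    bucket.sync("./checkpoints", "s3://my-bucket/checkpoints", delete=True, dry_run=True)
    # Then perform it
    bucket.sync("./checkpoints", "s3://my-bucket/checkpoints", delete=True)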
+     def copy(
+         self,
+         source: str,
+         destination: str,
+     ) -> None:
+         """
+         Copies files or directories between local paths and S3 URIs.
+         Handles:
+         - Local file to S3 object
+         - Local directory to S3 prefix (recursive)
+         - S3 object to local file
+         - S3 prefix to local directory (recursive)
+         Does NOT handle:
+         - Local to Local (use shutil)
+         - S3 to S3 (use AWS CLI or boto3 object copy)
+         Args:
+             source (str): The source path (local file/dir or s3://...).
+             destination (str): The destination path (local file/dir or s3://...).
+         """
+         src_bucket, src_prefix = self._parse_path(source)
+         dest_bucket, dest_prefix = self._parse_path(destination)
+
+         if src_bucket is None and dest_bucket is None:
+             print(
+                 "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
+             )
+             return
+         if src_bucket is not None and dest_bucket is not None:
+             print(
+                 "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
+             )
+             return
+
+         # Upload: Local to S3
+         if src_bucket is None and dest_bucket is not None:
+             if not os.path.exists(source):
+                 print(f"Error: Local source path not found: {source}")
+                 return
+             current_dest_prefix = dest_prefix or ""
+
+             if os.path.isfile(source):
+                 if not current_dest_prefix or destination.endswith("/"):
+                     s3_key = os.path.join(
+                         current_dest_prefix, os.path.basename(source)
+                     ).replace("\\", "/")
+                 else:
+                     s3_key = current_dest_prefix
+                 if self.verbose:
+                     print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
+                 try:
+                     self.client.upload_file(source, dest_bucket, s3_key)
+                     if self.verbose:
+                         print("Upload complete.")
+                 except ClientError as e:
+                     print(f"ERROR uploading {source}: {e}")
+                 except Exception as e:
+                     print(f"ERROR uploading {source}: {e}")
+
+             elif os.path.isdir(source):
+                 if self.verbose:
+                     print(
+                         f"Uploading directory {source}/* to s3://{dest_bucket}/{current_dest_prefix}/"
+                     )
+                 files_uploaded = files_failed = 0
+                 for root, _, files in os.walk(source):
+                     for file in files:
+                         local_path = os.path.join(root, file)
+                         relative_path = os.path.relpath(local_path, source)
+                         s3_key = os.path.join(
+                             current_dest_prefix, relative_path
+                         ).replace("\\", "/")
+                         if self.verbose:
+                             print(
+                                 f" Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
+                             )
+                         try:
+                             self.client.upload_file(local_path, dest_bucket, s3_key)
+                             files_uploaded += 1
+                         except ClientError as e:
+                             print(f" ERROR uploading {local_path}: {e}")
+                             files_failed += 1
+                         except Exception as e:
+                             print(f" ERROR uploading {local_path}: {e}")
+                             files_failed += 1
+                 if self.verbose:
+                     print(
+                         f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
+                     )
+             else:
+                 print(f"Error: Source {source} is neither a file nor a directory.")
+
+         # Download: S3 to Local
+         elif src_bucket is not None and dest_bucket is None:
+             is_prefix_download = False
+             single_object_key = None
+             current_src_prefix = src_prefix or ""  # Ensure not None
+
+             if source.endswith("/"):
+                 is_prefix_download = True
+             else:
+                 try:
+                     if current_src_prefix:
+                         self.client.head_object(
+                             Bucket=src_bucket, Key=current_src_prefix
+                         )
+                         single_object_key = current_src_prefix
+                     else:
+                         # Path like s3://bucket, treat as prefix download
+                         is_prefix_download = True
+                 except ClientError as e:
+                     if e.response["Error"]["Code"] == "404":
+                         is_prefix_download = True  # Assume prefix if object not found
+                     elif e.response["Error"]["Code"] == "NoSuchBucket":
+                         print(f"Error: Source bucket '{src_bucket}' not found.")
+                         return
+                     else:
+                         print(
+                             f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
+                         )
+                         return
+                 except Exception as e:
+                     print(
+                         f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
+                     )
+                     return
+
+             if single_object_key is not None:
+                 if os.path.isdir(destination) or destination.endswith(os.sep):
+                     local_dest_path = os.path.join(
+                         destination, os.path.basename(single_object_key)
+                     )
+                     os.makedirs(destination, exist_ok=True)
+                 else:
+                     local_dest_path = destination
+                     parent_dir = os.path.dirname(local_dest_path)
+                     if parent_dir:
+                         os.makedirs(parent_dir, exist_ok=True)
+                 if self.verbose:
+                     print(
+                         f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
+                     )
+                 try:
+                     self.client.download_file(
+                         src_bucket, single_object_key, local_dest_path
+                     )
+                     if self.verbose:
+                         print("Download complete.")
+                 except ClientError as e:
+                     print(f"ERROR downloading {single_object_key}: {e}")
+                 except OSError as e:
+                     print(f"ERROR creating/writing {local_dest_path}: {e}")
+                 except Exception as e:
+                     print(f"ERROR downloading {single_object_key}: {e}")
+
+             elif is_prefix_download:
+                 if os.path.exists(destination) and not os.path.isdir(destination):
+                     print(
+                         f"Error: Local destination '{destination}' exists but is not a directory."
+                     )
+                     return
+                 os.makedirs(destination, exist_ok=True)
+                 if self.verbose:
+                     print(
+                         f"Downloading prefix s3://{src_bucket}/{current_src_prefix}/* to {destination}/"
+                     )
+                 paginator = self.client.get_paginator("list_objects_v2")
+                 files_downloaded = files_failed = 0
+                 operation_parameters = {"Bucket": src_bucket}
+                 # The problematic line for the linter, re-adding type ignore
+                 if current_src_prefix:  # type: ignore
+                     operation_parameters["Prefix"] = current_src_prefix
+                 try:
+                     page_iterator = paginator.paginate(**operation_parameters)
+                     found_objects = False
+                     for page in page_iterator:
+                         if "Contents" in page:
+                             found_objects = True
+                             for obj in page["Contents"]:
+                                 s3_key = obj["Key"]
+                                 if s3_key.endswith("/") and obj["Size"] == 0:
+                                     continue
+                                 relative_key = s3_key
+                                 if current_src_prefix:
+                                     if s3_key.startswith(current_src_prefix):
+                                         if s3_key == current_src_prefix.rstrip("/"):
+                                             relative_key = os.path.basename(s3_key)
+                                         else:
+                                             prefix_adjusted = current_src_prefix + (
+                                                 ""
+                                                 if current_src_prefix.endswith("/")
+                                                 else "/"
+                                             )
+                                             if s3_key.startswith(prefix_adjusted):
+                                                 relative_key = s3_key[
+                                                     len(prefix_adjusted) :
+                                                 ]
+                                             elif not current_src_prefix.endswith("/"):
+                                                 relative_key = s3_key[
+                                                     len(current_src_prefix) :
+                                                 ].lstrip("/")
+                                 if not relative_key:
+                                     continue
+                                 local_dest_path = os.path.join(
+                                     destination, relative_key.replace("/", os.sep)
+                                 )
+                                 local_dest_dir = os.path.dirname(local_dest_path)
+                                 if self.verbose:
+                                     print(
+                                         f" Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
+                                     )
+                                 try:
+                                     if local_dest_dir:
+                                         os.makedirs(local_dest_dir, exist_ok=True)
+                                     self.client.download_file(
+                                         src_bucket, s3_key, local_dest_path
+                                     )
+                                     files_downloaded += 1
+                                 except ClientError as e:
+                                     print(f" ERROR downloading {s3_key}: {e}")
+                                     files_failed += 1
+                                 except OSError as e:
+                                     print(
+                                         f" ERROR creating/writing {local_dest_path}: {e}"
+                                     )
+                                     files_failed += 1
+                                 except Exception as e:
+                                     print(f" ERROR downloading {s3_key}: {e}")
+                                     files_failed += 1
+                     if not found_objects and self.verbose:
+                         print(
+                             f"Warning: No objects found at source prefix s3://{src_bucket}/{current_src_prefix}"
+                         )
+                     if self.verbose:
+                         print(
+                             f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
+                         )
+                 except ClientError as e:
+                     if e.response["Error"]["Code"] == "NoSuchBucket":
+                         print(f"Error: Source bucket '{src_bucket}' not found.")
+                     else:
+                         print(
+                             f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+                         )
+                 except Exception as e:
+                     print(
+                         f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+                     )
+         else:
+             print("Error: Unknown copy operation type.")
+
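For illustration (paths hypothetical), copy is a one-shot transfer whose direction is inferred from which argument is an s3:// URI:

    bucket = Bucket()
    bucket.copy("./model.safetensors", "s3://my-bucket/models/")  # local file -> S3 prefix
    bucket.copy("s3://my-bucket/models/", "./models")  # S3 prefix -> local dir, recursive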
+     def check(self, s3_uri: str) -> bool:
+         """
+         Check if an object or prefix exists in an S3 bucket using an S3 URI.
+
+         Args:
+             s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
+                           Use a trailing '/' to check for a prefix/directory.
+
+         Returns:
+             bool: True if the object or prefix exists, False otherwise.
+         """
+         # Use the class client and parse method
+         bucket_name, s3_key = self._parse_path(s3_uri)
+
+         if bucket_name is None or s3_key is None:
+             # _parse_path returns None, None if scheme is not 's3'
+             print(f"Error: Invalid S3 URI format: {s3_uri}")
+             return False
+
+         is_prefix = s3_key.endswith("/")
+
+         try:
+             if is_prefix:
+                 # Check for prefix existence by listing objects
+                 # Handle the case where s3_key might be empty if URI is just s3://bucket/
+                 list_prefix = s3_key if s3_key else ""
+                 response = self.client.list_objects_v2(
+                     Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
+                 )
+                 # Check if any objects OR common prefixes (folders) are returned for the prefix
+                 return "Contents" in response or "CommonPrefixes" in response
+             else:
+                 # Check for object existence
+                 self.client.head_object(Bucket=bucket_name, Key=s3_key)
+                 return True
+         except ClientError as e:  # Catch boto3 ClientError first
+             # If head_object returns 404 (NoSuchKey), the object doesn't exist
+             # list_objects_v2 does not raise NoSuchKey for prefixes
+             if e.response["Error"]["Code"] == "404":
+                 return False
+             elif e.response["Error"]["Code"] == "NoSuchBucket":
+                 if self.verbose:
+                     print(
+                         f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri})."
+                     )
+                 return False
+             # Handle other potential errors like AccessDenied differently if needed
+             print(f"Error checking {s3_uri}: {e}")
+             return False
+         except Exception as e:
+             print(f"An unexpected error occurred checking {s3_uri}: {e}")
+             return False
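Finally, a sketch of check (URIs hypothetical): a trailing slash selects prefix semantics (list_objects_v2 with MaxKeys=1) instead of object semantics (head_object):

    bucket = Bucket(verbose=False)
    exists_obj = bucket.check("s3://my-bucket/models/adapter.bin")  # object lookup
    exists_dir = bucket.check("s3://my-bucket/models/")  # prefix lookup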