nebu 0.1.23__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nebu/__init__.py +2 -1
- nebu/config.py +38 -2
- nebu/convert.py +130 -0
- nebu/data.py +952 -0
- nebu/processors/consumer.py +6 -9
- nebu/processors/decorate.py +16 -22
- nebu/processors/models.py +12 -0
- {nebu-0.1.23.dist-info → nebu-0.1.27.dist-info}/METADATA +3 -1
- {nebu-0.1.23.dist-info → nebu-0.1.27.dist-info}/RECORD +12 -10
- {nebu-0.1.23.dist-info → nebu-0.1.27.dist-info}/WHEEL +0 -0
- {nebu-0.1.23.dist-info → nebu-0.1.27.dist-info}/licenses/LICENSE +0 -0
- {nebu-0.1.23.dist-info → nebu-0.1.27.dist-info}/top_level.txt +0 -0
nebu/data.py
ADDED
@@ -0,0 +1,952 @@
import os
import subprocess
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError


def rclone_copy(
    source_dir: str,
    destination: str,
    dry_run: bool = False,
    transfers: int = 4,
    extra_args: Optional[List[str]] = None,
    verbose: bool = True,
) -> bool:
    """
    Upload a directory to a remote bucket using `rclone copy`.

    Args:
        source_dir (str): Path to local directory to upload.
        destination (str): Remote destination, e.g., 's3:my-bucket/path'.
        dry_run (bool): If True, performs a dry run without uploading.
        transfers (int): Number of parallel transfers.
        extra_args (Optional[List[str]]): Additional rclone flags.
        verbose (bool): If True, prints command and output live.

    Returns:
        bool: True if upload succeeded, False otherwise.
    """
    command = [
        "rclone",
        "copy",
        source_dir,
        destination,
        f"--transfers={transfers}",
        "--progress",
    ]

    if dry_run:
        command.append("--dry-run")
    if extra_args:
        command.extend(extra_args)

    if verbose:
        print("Running command:", " ".join(command))

    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        )

        if not process.stdout:
            raise Exception("No output from rclone")

        for line in process.stdout:
            if verbose:
                print(line.strip())

        return process.wait() == 0

    except Exception as e:
        print(f"Error during rclone copy: {e}")
        return False
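
A minimal usage sketch of `rclone_copy` (editor's addition, not part of the diff). It assumes rclone is installed and a remote named `s3` is configured; the paths and bucket are hypothetical.

from nebu.data import rclone_copy

# Preview the transfer first, then run it with more parallel streams.
if rclone_copy("./outputs", "s3:my-bucket/outputs", dry_run=True):
    rclone_copy("./outputs", "s3:my-bucket/outputs", transfers=8)
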
def find_latest_checkpoint(training_dir: str) -> Optional[str]:
    """
    Finds the checkpoint directory with the highest step number in a Hugging Face
    training output directory.

    Args:
        training_dir (str): The path to the training output directory.

    Returns:
        Optional[str]: The path to the latest checkpoint directory, or None if
                       no checkpoint directories are found or the directory
                       doesn't exist.
    """
    latest_step = -1
    latest_checkpoint_dir = None

    if not os.path.isdir(training_dir):
        print(f"Error: Directory not found: {training_dir}")
        return None

    for item in os.listdir(training_dir):
        item_path = os.path.join(training_dir, item)
        if os.path.isdir(item_path) and item.startswith("checkpoint-"):
            try:
                step_str = item.split("-")[-1]
                if step_str.isdigit():
                    step = int(step_str)
                    if step > latest_step:
                        latest_step = step
                        latest_checkpoint_dir = item_path
            except (ValueError, IndexError):
                # Ignore items that don't match the expected pattern
                continue

    return latest_checkpoint_dir
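
Usage sketch (editor's addition; the directory layout is hypothetical):

from nebu.data import find_latest_checkpoint

# With ./runs/my-model/checkpoint-500 and ./runs/my-model/checkpoint-1000 on disk:
latest = find_latest_checkpoint("./runs/my-model")
if latest:
    print(f"Resuming from {latest}")  # ./runs/my-model/checkpoint-1000
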
def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
    """Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
    parsed = urlparse(path)
    if parsed.scheme != "s3":
        return None, None
    bucket = parsed.netloc
    prefix = parsed.path.lstrip("/")
    return bucket, prefix
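
Given the parsing above, the helper splits URIs like so (hypothetical values). A non-s3 scheme yields (None, None), which the callers below use to detect local paths:

_parse_s3_path("s3://my-bucket/models/weights.bin")  # -> ("my-bucket", "models/weights.bin")
_parse_s3_path("/tmp/local/path")                    # -> (None, None)
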
def _list_s3_objects(
    s3_client: Any, bucket: str, prefix: Optional[str], verbose: bool = True
) -> Dict[str, Dict[str, Any]]:
    """Lists objects in an S3 prefix."""
    objects: Dict[str, Dict[str, Any]] = {}
    paginator = s3_client.get_paginator("list_objects_v2")
    list_prefix = (
        prefix if prefix else ""
    )  # Use empty string if prefix is None for listing
    if verbose:
        print(f"Listing objects in s3://{bucket}/{list_prefix}...")

    operation_parameters = {"Bucket": bucket}
    # Only add Prefix parameter if it's non-empty
    if list_prefix:
        operation_parameters["Prefix"] = list_prefix

    try:
        page_iterator = paginator.paginate(**operation_parameters)
        for page in page_iterator:
            if "Contents" in page:
                for obj in page["Contents"]:
                    # Ignore zero-byte objects ending in '/' (S3 console folders)
                    if obj["Key"].endswith("/") and obj["Size"] == 0:
                        continue

                    # Determine key relative to the *prefix* for comparison
                    # If prefix is None or empty, relative key is the full key.
                    relative_key: Optional[str] = None
                    if prefix and obj["Key"].startswith(prefix):
                        # Ensure trailing slash consistency if prefix has one
                        prefix_adjusted = (
                            prefix if prefix.endswith("/") else prefix + "/"
                        )
                        # Handle exact match of prefix as a file
                        if obj["Key"] == prefix:
                            relative_key = os.path.basename(obj["Key"])
                        # Handle keys within the prefix "directory"
                        elif obj["Key"].startswith(prefix_adjusted):
                            relative_key = obj["Key"][len(prefix_adjusted) :]
                        # This case should technically not be needed if prefix is used correctly in listing
                        # but handle defensively if object key *is* the prefix itself (without trailing slash)
                        elif obj["Key"] == prefix.rstrip("/"):
                            relative_key = os.path.basename(obj["Key"])
                        # else: # Should not happen if prefix filter works correctly
                        #     print(f"Warning: Unexpected key {obj['Key']} found for prefix {prefix}")
                        #     relative_key = obj["Key"]  # Fallback
                    elif not prefix:
                        # If no prefix specified, the relative key is the full key
                        relative_key = obj["Key"]
                    # else: obj["Key"] does not start with prefix - ignore (shouldn't happen with Prefix param)

                    # Skip if relative key is empty or None (e.g., prefix itself listed, or unexpected case)
                    if not relative_key:
                        continue

                    # Ensure LastModified is timezone-aware
                    last_modified = obj["LastModified"]
                    if last_modified.tzinfo is None:
                        last_modified = last_modified.replace(tzinfo=timezone.utc)

                    objects[relative_key] = {
                        "path": f"s3://{bucket}/{obj['Key']}",  # Store full path for reference
                        "key": obj["Key"],  # Store full S3 key
                        "size": obj["Size"],
                        "mtime": last_modified,
                        "type": "s3",
                    }
    except ClientError as e:
        if e.response["Error"]["Code"] == "NoSuchBucket":
            print(f"Error: Bucket '{bucket}' not found.")
        # Allow sync *to* a non-existent prefix (will just upload all)
        elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
            if verbose:
                print(f"Prefix s3://{bucket}/{prefix} not found (treating as empty).")
        else:
            print(f"Error listing S3 objects: {e}")
        # Return empty dict on error that prevents listing (like NoSuchBucket)
        if e.response["Error"]["Code"] == "NoSuchBucket":
            return {}
    except Exception as e:
        print(f"An unexpected error occurred listing S3 objects: {e}")
        return {}  # Return empty on unexpected error

    if verbose:
        print(f"Found {len(objects)} objects in S3.")
    return objects


def _list_local_files(
    local_dir: str, verbose: bool = True
) -> Dict[str, Dict[str, Any]]:
    """Lists files in a local directory."""
    if not os.path.isdir(local_dir):
        # Check if it's a file path instead of a dir
        if os.path.isfile(local_dir):
            print(
                f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
            )
            try:
                local_size = os.path.getsize(local_dir)
                local_mtime_ts = os.path.getmtime(local_dir)
                local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
                file_name = os.path.basename(local_dir)
                return {
                    file_name: {
                        "path": local_dir,
                        "size": local_size,
                        "mtime": local_mtime,
                        "type": "local",
                    }
                }
            except OSError as e:
                print(f"Error accessing source file {local_dir}: {e}")
                return {}
        else:
            print(f"Warning: Local path not found: {local_dir} (treating as empty).")
            return {}

    files: Dict[str, Dict[str, Any]] = {}
    if verbose:
        print(f"Scanning local directory: {local_dir}...")
    for root, _, file_list in os.walk(local_dir):
        for file_name in file_list:
            local_path = os.path.join(root, file_name)
            try:
                # Use '/' for relative key consistency
                relative_path = os.path.relpath(local_path, local_dir).replace(
                    "\\", "/"
                )
                # relative_path will be '.' if local_dir points to a file, handled above.

                local_size = os.path.getsize(local_path)
                local_mtime_ts = os.path.getmtime(local_path)
                local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)

                files[relative_path] = {
                    "path": local_path,
                    "size": local_size,
                    "mtime": local_mtime,
                    "type": "local",
                }
            except OSError as e:
                print(f"Warning: Could not get metadata for {local_path}: {e}")
            except Exception as e:
                print(f"Warning: Unexpected error processing {local_path}: {e}")

    if verbose:
        print(f"Found {len(files)} files locally.")
    return files
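
Both listers return the same shape, keyed by the path relative to the prefix or directory; this shared shape is what lets `s3_sync` below compare the two sides key-by-key. A hypothetical entry (editor's illustration):

# {
#     "configs/train.yaml": {
#         "path": "s3://my-bucket/run1/configs/train.yaml",  # full S3 URI or local path
#         "key": "run1/configs/train.yaml",                  # present for S3 entries only
#         "size": 1024,
#         "mtime": datetime(2024, 1, 1, tzinfo=timezone.utc),
#         "type": "s3",                                      # or "local"
#     },
# }
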
def s3_sync(
    source: str,
    destination: str,
    delete: bool = False,
    dry_run: bool = False,
    verbose: bool = True,
) -> None:
    """
    Synchronizes files between a source and a destination, which can be
    local paths or S3 paths (e.g., 's3://my-bucket/my-prefix').

    Compares file sizes and modification times. Copies files from source
    to destination if they are missing, larger, or newer in the source.
    Optionally deletes files from the destination if they are not present
    in the source.

    Args:
        source (str): The source path (local directory/file or s3://...).
        destination (str): The destination path (local directory or s3://...).
        delete (bool): If True, delete extraneous files from the destination.
        dry_run (bool): If True, print actions without performing them.
        verbose (bool): If True, print actions being taken.
    """
    s3_client = boto3.client("s3")
    mtime_tolerance = timedelta(
        seconds=2
    )  # S3 mtime might not have sub-second precision

    src_bucket, src_prefix = _parse_s3_path(source)
    dest_bucket, dest_prefix = _parse_s3_path(destination)

    source_items: Dict[str, Dict[str, Any]] = {}
    dest_items: Dict[str, Dict[str, Any]] = {}
    sync_direction = ""
    is_single_file_sync = False

    # Determine sync direction and list items
    if src_bucket is None and dest_bucket is not None:
        sync_direction = "upload"
        source_items = _list_local_files(source, verbose)
        dest_items = _list_s3_objects(s3_client, dest_bucket, dest_prefix, verbose)
        # Check if source exists (either dir or file)
        if not os.path.exists(source):
            print(f"Error: Source path {source} not found.")
            return
        is_single_file_sync = os.path.isfile(source)
        # Destination prefix defaults to empty if not specified
        if dest_prefix is None:
            dest_prefix = ""

    elif src_bucket is not None and dest_bucket is None:
        sync_direction = "download"
        source_items = _list_s3_objects(s3_client, src_bucket, src_prefix, verbose)
        # For download, destination MUST be a directory (or created as one)
        # If destination exists and is a file, it's an error.
        if os.path.exists(destination) and not os.path.isdir(destination):
            print(
                f"Error: Local destination '{destination}' exists but is not a directory."
            )
            return

        dest_items = _list_local_files(destination, verbose)

        # Ensure destination directory exists for downloads
        if not dry_run:
            os.makedirs(destination, exist_ok=True)
        elif not os.path.isdir(destination) and verbose:
            print(f"Dry run: Would create local directory {destination}")

    elif src_bucket is None and dest_bucket is None:
        print(
            "Error: Both source and destination are local paths. Use standard file copy tools."
        )
        return
    elif src_bucket is not None and dest_bucket is not None:
        print(
            "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
        )
        return
    else:
        # This case should not be reachable given the above checks
        print("Error: Invalid source or destination path combination.")
        return

    actions_to_perform: List[Dict[str, Any]] = []

    # --- Compare items ---
    # Use source keys as the primary loop iterator
    source_keys = set(source_items.keys())
    dest_keys = set(dest_items.keys())

    for rel_key in source_keys:
        src_item = source_items[rel_key]
        dest_item = dest_items.get(rel_key)
        reason = ""

        if dest_item is None:
            reason = "does not exist in destination"
        else:
            # Compare metadata (size and mtime)
            if src_item["size"] != dest_item["size"]:
                reason = (
                    f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
                )
            # Sync if source is newer (outside tolerance)
            elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
                reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"

        if reason:
            action_type = "upload" if sync_direction == "upload" else "download"
            # Determine the final destination key/path
            dest_full_path_or_key: Optional[str] = None
            if sync_direction == "upload":
                # If uploading single file, dest key is prefix + filename
                # If uploading dir, dest key is prefix + relative key
                # Ensure dest_prefix is treated as empty string if None
                current_dest_prefix = dest_prefix or ""
                final_dest_key = (
                    rel_key
                    if is_single_file_sync
                    else os.path.join(current_dest_prefix, rel_key).replace("\\", "/")
                )
                # Ensure we don't create keys like 's3://bucket//key' if prefix was empty
                if not current_dest_prefix and final_dest_key.startswith("/"):
                    final_dest_key = final_dest_key.lstrip("/")
                dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
            else:  # download
                dest_full_path_or_key = os.path.join(
                    destination, rel_key.replace("/", os.sep)
                )

            actions_to_perform.append(
                {
                    "action": action_type,
                    "relative_key": rel_key,
                    "source_path": src_item["path"],  # Local path or S3 URI
                    "source_mtime": src_item.get("mtime"),
                    "dest_full_path_or_key": dest_full_path_or_key,
                    # Store details needed for specific actions
                    "dest_bucket": dest_bucket,
                    "dest_prefix": dest_prefix,
                    "s3_key_full_src": src_item.get("key")
                    if sync_direction == "download"
                    else None,
                    "source_bucket": src_bucket,
                    "reason": reason,
                }
            )

    # Identify items for deletion in destination
    if delete:
        keys_to_delete = dest_keys - source_keys
        for rel_key in keys_to_delete:
            dest_item = dest_items[rel_key]
            action_type = "delete_s3" if sync_direction == "upload" else "delete_local"
            actions_to_perform.append(
                {
                    "action": action_type,
                    "relative_key": rel_key,
                    "path_to_delete": dest_item["path"],  # Full S3 URI or local path
                    "s3_key_full_dest": dest_item.get("key")
                    if sync_direction == "upload"
                    else None,  # Needed for delete_s3
                    "dest_bucket": dest_bucket,  # Needed for delete_s3
                    "reason": "does not exist in source",
                }
            )

    # --- Execute Actions ---
    uploads_done = downloads_done = deletions_done = 0
    s3_deletions_batch: List[Dict[str, str]] = []

    if not actions_to_perform:
        print("Source and destination are already synchronized.")
        # Still check if source/dest actually exist if nothing to do
        if sync_direction == "upload" and not os.path.exists(source):
            print(f"Note: Source path {source} does not exist.")
        # Add check for S3 source existence if needed via head_bucket or similar
        return

    for action in actions_to_perform:
        rel_key = action["relative_key"]
        reason = action["reason"]
        dest_full_path_or_key = action["dest_full_path_or_key"]

        if action["action"] == "upload":
            local_path = action["source_path"]
            # Ensure dest_full_path_or_key is valid before parsing
            if not isinstance(dest_full_path_or_key, str):
                print(
                    f"ERROR: Invalid destination path calculated for upload: {dest_full_path_or_key}"
                )
                continue
            # Extract final key from the pre-calculated dest_full_path_or_key
            _, upload_key = _parse_s3_path(dest_full_path_or_key)
            target_bucket = action["dest_bucket"]

            if verbose:
                print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
            if not dry_run:
                if target_bucket and upload_key is not None:
                    try:
                        s3_client.upload_file(local_path, target_bucket, upload_key)
                        uploads_done += 1
                    except ClientError as e:
                        print(f"ERROR uploading {local_path}: {e}")
                    except Exception as e:
                        print(f"ERROR uploading {local_path}: {e}")
                else:
                    print(
                        f"ERROR: Invalid S3 target for upload: bucket={target_bucket}, key={upload_key}"
                    )

        elif action["action"] == "download":
            s3_key_full = action["s3_key_full_src"]
            local_path = dest_full_path_or_key  # This is the local destination path
            source_bucket_dl = action["source_bucket"]

            if verbose:
                print(f"Download: {action['source_path']} to {local_path} ({reason})")
            # Ensure local_path is valid before proceeding
            if not isinstance(local_path, str):
                print(
                    f"ERROR: Invalid local destination path calculated for download: {local_path}"
                )
                continue
            if not dry_run:
                if source_bucket_dl and s3_key_full and local_path:
                    try:
                        local_file_dir = os.path.dirname(local_path)
                        os.makedirs(local_file_dir, exist_ok=True)
                        s3_client.download_file(
                            source_bucket_dl, s3_key_full, local_path
                        )
                        downloads_done += 1
                    except ClientError as e:
                        print(f"ERROR downloading {s3_key_full}: {e}")
                    except OSError as e:
                        print(
                            f"ERROR creating directory or writing file {local_path}: {e}"
                        )
                    except Exception as e:
                        print(f"ERROR downloading {s3_key_full}: {e}")
                else:
                    print(
                        f"ERROR: Invalid parameters for download: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
                    )

        elif action["action"] == "delete_s3":
            s3_key_to_delete = action["s3_key_full_dest"]
            target_bucket_del = action["dest_bucket"]
            if target_bucket_del and s3_key_to_delete:
                if verbose:
                    print(f"Delete S3: {action['path_to_delete']} ({reason})")
                # Check type before appending to batch
                if isinstance(s3_key_to_delete, str):
                    s3_deletions_batch.append({"Key": s3_key_to_delete})
                else:
                    print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
            else:
                print(
                    f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
                )

        elif action["action"] == "delete_local":
            local_path_to_delete = action["path_to_delete"]
            if verbose:
                print(f"Delete Local: {local_path_to_delete} ({reason})")
            if not dry_run:
                try:
                    os.remove(local_path_to_delete)
                    deletions_done += 1
                    # TODO: Optionally clean up empty directories?
                except OSError as e:
                    print(f"ERROR deleting local file {local_path_to_delete}: {e}")

    # Process S3 deletions in batches
    if s3_deletions_batch:
        # Get the target bucket from the first deletion action (should be consistent)
        target_bucket_del_batch = next(
            (
                a["dest_bucket"]
                for a in actions_to_perform
                if a["action"] == "delete_s3"
            ),
            None,
        )
        if not dry_run and target_bucket_del_batch:
            deleted_count_batch = 0
            for i in range(0, len(s3_deletions_batch), 1000):
                batch = s3_deletions_batch[i : i + 1000]
                delete_payload = {"Objects": batch, "Quiet": False}  # Get errors back
                try:
                    response = s3_client.delete_objects(
                        Bucket=target_bucket_del_batch, Delete=delete_payload
                    )
                    # Increment count based on successful deletions reported (if not Quiet) or assume success if Quiet
                    deleted_count_batch += len(
                        batch
                    )  # Assume success unless errors reported
                    if "Deleted" in response:
                        pass  # Counted optimistically above
                        # deleted_count_batch += len(response['Deleted'])
                    if "Errors" in response and response["Errors"]:
                        deleted_count_batch -= len(
                            response["Errors"]
                        )  # Adjust count for errors
                        for error in response["Errors"]:
                            print(
                                f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
                            )
                except ClientError as e:
                    print(f"ERROR deleting S3 objects batch: {e}")
                    deleted_count_batch = 0  # Assume batch failed
                except Exception as e:
                    print(f"ERROR deleting S3 objects batch: {e}")
                    deleted_count_batch = 0  # Assume batch failed
            deletions_done += deleted_count_batch
        elif target_bucket_del_batch:  # dry_run is True
            deletions_done = len(
                s3_deletions_batch
            )  # Report planned deletions for dry run
        else:
            print("Warning: Could not determine target bucket for S3 deletion batch.")

    # --- Summary ---
    if dry_run:
        upload_count = sum(1 for a in actions_to_perform if a["action"] == "upload")
        download_count = sum(1 for a in actions_to_perform if a["action"] == "download")
        # Deletion count for dry run is based on the batch prepared
        delete_s3_count = len(s3_deletions_batch)
        delete_local_count = sum(
            1 for a in actions_to_perform if a["action"] == "delete_local"
        )
        print("\n--- DRY RUN SUMMARY ---")
        if sync_direction == "upload":
            print(f"Would upload: {upload_count} file(s)")
            if delete:
                print(f"Would delete from S3: {delete_s3_count} object(s)")
        elif sync_direction == "download":
            print(f"Would download: {download_count} file(s)")
            if delete:
                print(f"Would delete locally: {delete_local_count} file(s)")
        print("--- END DRY RUN ---")
    else:
        if sync_direction == "upload":
            print(
                f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
            )
        elif sync_direction == "download":
            print(
                f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
            )
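
Usage sketch (editor's addition; the bucket and paths are hypothetical):

from nebu.data import s3_sync

# Preview an upload, then mirror the directory, removing extraneous S3 objects.
s3_sync("./outputs", "s3://my-bucket/outputs", delete=True, dry_run=True)
s3_sync("./outputs", "s3://my-bucket/outputs", delete=True)

# Direction is inferred from which side is an s3:// URI.
s3_sync("s3://my-bucket/outputs", "./outputs-copy")
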
def s3_check(s3_uri: str) -> bool:
    """
    Check if an object or prefix exists in an S3 bucket using an S3 URI.

    Args:
        s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
                      Use a trailing '/' to check for a prefix/directory.

    Returns:
        bool: True if the object or prefix exists, False otherwise.
    """
    s3 = boto3.client("s3")
    bucket_name, s3_key = _parse_s3_path(s3_uri)

    if bucket_name is None or s3_key is None:
        # _parse_s3_path returns None, None if scheme is not 's3'
        print(f"Error: Invalid S3 URI format: {s3_uri}")
        return False

    is_prefix = s3_key.endswith("/")

    try:
        if is_prefix:
            # Check for prefix existence by listing objects
            # Handle the case where s3_key might be empty if URI is just s3://bucket/
            list_prefix = s3_key if s3_key else ""
            response = s3.list_objects_v2(
                Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
            )
            # Check if any objects OR common prefixes (folders) are returned for the prefix
            return "Contents" in response or "CommonPrefixes" in response
        else:
            # Check for object existence
            s3.head_object(Bucket=bucket_name, Key=s3_key)
            return True
    except ClientError as e:  # Catch boto3 ClientError first
        # If head_object returns 404 (NoSuchKey), the object doesn't exist
        # list_objects_v2 does not raise NoSuchKey for prefixes
        if e.response["Error"]["Code"] == "404":
            return False
        elif e.response["Error"]["Code"] == "NoSuchBucket":
            print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
            return False
        # Handle other potential errors like AccessDenied differently if needed
        print(f"Error checking {s3_uri}: {e}")
        return False
    # except s3.exceptions.NoSuchBucket:  # This specific exception is less common with boto3 client
    #     print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
    #     return False
    except Exception as e:
        print(f"An unexpected error occurred checking {s3_uri}: {e}")
        return False
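
Usage sketch (editor's addition; URIs are hypothetical). Per the docstring, a trailing '/' switches the check from `head_object` to a one-key `list_objects_v2` listing:

from nebu.data import s3_check

s3_check("s3://my-bucket/models/weights.bin")  # object existence
s3_check("s3://my-bucket/models/")             # prefix existence
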
def s3_copy(
    source: str,
    destination: str,
    verbose: bool = True,
) -> None:
    """
    Copies files or directories between local paths and S3 URIs.

    Handles:
    - Local file to S3 object
    - Local directory to S3 prefix (recursive)
    - S3 object to local file
    - S3 prefix to local directory (recursive)

    Does NOT handle:
    - Local to Local (use shutil)
    - S3 to S3 (use AWS CLI or boto3 object copy)

    Args:
        source (str): The source path (local file/dir or s3://...).
        destination (str): The destination path (local file/dir or s3://...).
        verbose (bool): If True, print actions being taken.
    """
    s3_client = boto3.client("s3")
    src_bucket, src_prefix = _parse_s3_path(source)
    dest_bucket, dest_prefix = _parse_s3_path(destination)

    # --- Reject unsupported operations ---
    if src_bucket is None and dest_bucket is None:
        print(
            "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
        )
        return
    if src_bucket is not None and dest_bucket is not None:
        print(
            "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
        )
        return

    # --- Upload: Local to S3 ---
    if src_bucket is None and dest_bucket is not None:
        if not os.path.exists(source):
            print(f"Error: Local source path not found: {source}")
            return
        # Ensure dest_prefix is usable, default to empty string if None
        dest_prefix = dest_prefix or ""

        # Case 1: Source is a local file
        if os.path.isfile(source):
            # Determine final S3 key
            # If dest looks like a dir (ends /) or is empty, append filename
            if not dest_prefix or destination.endswith("/"):
                s3_key = os.path.join(dest_prefix, os.path.basename(source)).replace(
                    "\\", "/"
                )
            else:  # Treat dest as the exact key name
                s3_key = dest_prefix

            if verbose:
                print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
            try:
                s3_client.upload_file(source, dest_bucket, s3_key)
                print("Upload complete.")
            except ClientError as e:
                print(f"ERROR uploading {source}: {e}")
            except Exception as e:
                print(f"ERROR uploading {source}: {e}")

        # Case 2: Source is a local directory
        elif os.path.isdir(source):
            if verbose:
                print(
                    f"Uploading directory {source}/* to s3://{dest_bucket}/{dest_prefix}/"
                )
            files_uploaded = 0
            files_failed = 0
            for root, _, files in os.walk(source):
                for file in files:
                    local_path = os.path.join(root, file)
                    relative_path = os.path.relpath(local_path, source)
                    s3_key = os.path.join(dest_prefix, relative_path).replace("\\", "/")
                    if verbose:
                        print(
                            f"  Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
                        )
                    try:
                        s3_client.upload_file(local_path, dest_bucket, s3_key)
                        files_uploaded += 1
                    except ClientError as e:
                        print(f"  ERROR uploading {local_path}: {e}")
                        files_failed += 1
                    except Exception as e:
                        print(f"  ERROR uploading {local_path}: {e}")
                        files_failed += 1
            print(
                f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
            )
        else:
            print(f"Error: Source {source} is neither a file nor a directory.")

    # --- Download: S3 to Local ---
    elif src_bucket is not None and dest_bucket is None:
        # Determine if source is likely a single object or a prefix
        is_prefix_download = False
        single_object_key = None

        # If source ends with '/', treat it as a prefix explicitly
        if source.endswith("/"):
            is_prefix_download = True
            src_prefix = src_prefix or ""  # Ensure not None
        else:
            # Try checking if the source key exists as a single object
            try:
                s3_client.head_object(Bucket=src_bucket, Key=src_prefix)
                single_object_key = src_prefix  # It exists as a single object
            except ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    # Object doesn't exist, assume it's a prefix for recursive download
                    is_prefix_download = True
                    src_prefix = src_prefix or ""  # Ensure not None
                elif e.response["Error"]["Code"] == "NoSuchBucket":
                    print(f"Error: Source bucket '{src_bucket}' not found.")
                    return
                else:
                    # Other error (e.g., permissions)
                    print(
                        f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
                    )
                    return
            except Exception as e:
                print(
                    f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
                )
                return

        # Case 1: Download single S3 object
        if single_object_key is not None:
            # Determine local destination path
            if os.path.isdir(destination) or destination.endswith(os.sep):
                # Download into the directory
                local_dest_path = os.path.join(
                    destination, os.path.basename(single_object_key)
                )
                # Create local directory if downloading into it and it doesn't exist
                os.makedirs(destination, exist_ok=True)
            else:
                # Download to the exact file path
                local_dest_path = destination
                # Ensure parent directory exists
                parent_dir = os.path.dirname(local_dest_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)

            if verbose:
                print(
                    f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
                )
            try:
                s3_client.download_file(src_bucket, single_object_key, local_dest_path)
                print("Download complete.")
            except ClientError as e:
                print(f"ERROR downloading {single_object_key}: {e}")
            except OSError as e:
                print(
                    f"ERROR creating directory or writing file {local_dest_path}: {e}"
                )
            except Exception as e:
                print(f"ERROR downloading {single_object_key}: {e}")

        # Case 2: Download S3 prefix (recursive)
        elif is_prefix_download:
            # Ensure local destination is a directory
            if os.path.exists(destination) and not os.path.isdir(destination):
                print(
                    f"Error: Local destination '{destination}' exists but is not a directory for prefix download."
                )
                return
            os.makedirs(destination, exist_ok=True)

            if verbose:
                print(
                    f"Downloading prefix s3://{src_bucket}/{src_prefix}/* to {destination}/"
                )

            paginator = s3_client.get_paginator("list_objects_v2")
            files_downloaded = 0
            files_failed = 0
            operation_parameters = {"Bucket": src_bucket}
            if src_prefix:
                operation_parameters["Prefix"] = src_prefix

            try:
                page_iterator = paginator.paginate(**operation_parameters)
                found_objects = False
                for page in page_iterator:
                    if "Contents" in page:
                        found_objects = True
                        for obj in page["Contents"]:
                            s3_key = obj["Key"]
                            # Skip zero-byte directory markers if downloading a prefix
                            if s3_key.endswith("/") and obj["Size"] == 0:
                                continue

                            # Calculate relative path from the source prefix
                            if src_prefix and s3_key.startswith(src_prefix):
                                # Handle potential trailing slash inconsistency
                                prefix_adjusted = (
                                    src_prefix
                                    if src_prefix.endswith("/")
                                    else src_prefix + "/"
                                )
                                if s3_key.startswith(prefix_adjusted):
                                    relative_key = s3_key[len(prefix_adjusted) :]
                                # Handle the prefix itself if listed as an object (unlikely for prefix download)
                                elif s3_key == src_prefix.rstrip("/"):
                                    relative_key = os.path.basename(s3_key)
                                else:  # Should not happen
                                    relative_key = s3_key
                            elif not src_prefix:  # Downloading whole bucket essentially
                                relative_key = s3_key
                            else:  # Key doesn't start with prefix, should not happen
                                continue

                            # Skip if relative key is empty (e.g. prefix marker was somehow processed)
                            if not relative_key:
                                continue

                            local_dest_path = os.path.join(
                                destination, relative_key.replace("/", os.sep)
                            )
                            local_dest_dir = os.path.dirname(local_dest_path)

                            if verbose:
                                print(
                                    f"  Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
                                )
                            try:
                                if local_dest_dir:
                                    os.makedirs(local_dest_dir, exist_ok=True)
                                s3_client.download_file(
                                    src_bucket, s3_key, local_dest_path
                                )
                                files_downloaded += 1
                            except ClientError as e:
                                print(f"  ERROR downloading {s3_key}: {e}")
                                files_failed += 1
                            except OSError as e:
                                print(
                                    f"  ERROR creating directory or writing file {local_dest_path}: {e}"
                                )
                                files_failed += 1
                            except Exception as e:
                                print(f"  ERROR downloading {s3_key}: {e}")
                                files_failed += 1

                if not found_objects:
                    print(
                        f"Warning: No objects found at source prefix s3://{src_bucket}/{src_prefix}"
                    )

                print(
                    f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
                )

            except ClientError as e:
                if e.response["Error"]["Code"] == "NoSuchBucket":
                    print(f"Error: Source bucket '{src_bucket}' not found.")
                else:
                    print(
                        f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}"
                    )
            except Exception as e:
                print(f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}")

    else:  # Should not be reachable
        print("Error: Unknown copy operation type.")
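
Usage sketch (editor's addition; paths are hypothetical). As with `s3_sync`, the copy direction is inferred from which side is an s3:// URI:

from nebu.data import s3_copy

s3_copy("./model.safetensors", "s3://my-bucket/models/")  # local file -> S3 object
s3_copy("s3://my-bucket/models/", "./models")             # S3 prefix -> local dir (recursive)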