nebu 0.1.24__py3-none-any.whl → 0.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nebu/__init__.py +6 -0
- nebu/adapter.py +11 -0
- nebu/auth.py +15 -0
- nebu/cache.py +90 -0
- nebu/chatx/convert.py +206 -0
- nebu/chatx/openai.py +976 -0
- nebu/config.py +38 -2
- nebu/data.py +855 -0
- nebu/processors/consumer.py +1 -4
- nebu/processors/decorate.py +1 -1
- nebu/processors/processor.py +3 -7
- nebu/processors/remote.py +47 -0
- {nebu-0.1.24.dist-info → nebu-0.1.29.dist-info}/METADATA +4 -1
- nebu-0.1.29.dist-info/RECORD +26 -0
- nebu-0.1.24.dist-info/RECORD +0 -20
- {nebu-0.1.24.dist-info → nebu-0.1.29.dist-info}/WHEEL +0 -0
- {nebu-0.1.24.dist-info → nebu-0.1.29.dist-info}/licenses/LICENSE +0 -0
- {nebu-0.1.24.dist-info → nebu-0.1.29.dist-info}/top_level.txt +0 -0
nebu/data.py
ADDED
@@ -0,0 +1,855 @@
import os
import subprocess
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import boto3
from botocore.exceptions import ClientError

def rclone_copy(
    source_dir: str,
    destination: str,
    dry_run: bool = False,
    transfers: int = 4,
    extra_args: Optional[List[str]] = None,
    verbose: bool = True,
) -> bool:
    """
    Upload a directory to a remote bucket using `rclone copy`.

    Args:
        source_dir (str): Path to local directory to upload.
        destination (str): Remote destination, e.g., 's3:my-bucket/path'.
        dry_run (bool): If True, performs a dry run without uploading.
        transfers (int): Number of parallel transfers.
        extra_args (Optional[List[str]]): Additional rclone flags.
        verbose (bool): If True, prints command and output live.

    Returns:
        bool: True if upload succeeded, False otherwise.
    """
    command = [
        "rclone",
        "copy",
        source_dir,
        destination,
        f"--transfers={transfers}",
        "--progress",
    ]

    if dry_run:
        command.append("--dry-run")
    if extra_args:
        command.extend(extra_args)

    if verbose:
        print("Running command:", " ".join(command))

    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        )

        if not process.stdout:
            raise Exception("No output from rclone")

        for line in process.stdout:
            if verbose:
                print(line.strip())

        return process.wait() == 0

    except Exception as e:
        print(f"Error during rclone copy: {e}")
        return False

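A minimal usage sketch for rclone_copy, assuming rclone is installed and an `s3` remote is configured; the directory and bucket names below are placeholders:

from nebu.data import rclone_copy

# Preview the transfer first (placeholder paths).
ok = rclone_copy(
    source_dir="./outputs",
    destination="s3:my-bucket/runs",
    dry_run=True,
    extra_args=["--checksum"],  # real rclone flag: compare by checksum, not size/mtime
)
print("ok" if ok else "failed")
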
def find_latest_checkpoint(training_dir: str) -> Optional[str]:
    """
    Finds the checkpoint directory with the highest step number in a Hugging Face
    training output directory.

    Args:
        training_dir (str): The path to the training output directory.

    Returns:
        Optional[str]: The path to the latest checkpoint directory, or None if
                       no checkpoint directories are found or the directory
                       doesn't exist.
    """
    latest_step = -1
    latest_checkpoint_dir = None

    if not os.path.isdir(training_dir):
        print(f"Error: Directory not found: {training_dir}")
        return None

    for item in os.listdir(training_dir):
        item_path = os.path.join(training_dir, item)
        if os.path.isdir(item_path) and item.startswith("checkpoint-"):
            try:
                step_str = item.split("-")[-1]
                if step_str.isdigit():
                    step = int(step_str)
                    if step > latest_step:
                        latest_step = step
                        latest_checkpoint_dir = item_path
            except (ValueError, IndexError):
                # Ignore items that don't match the expected pattern
                continue

    return latest_checkpoint_dir

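For example, given an output directory containing `checkpoint-500` and `checkpoint-1000`, the helper returns the path to `checkpoint-1000`. A short sketch with a placeholder path:

from nebu.data import find_latest_checkpoint

latest = find_latest_checkpoint("./outputs/my-run")  # placeholder training dir
if latest:
    print(f"Resuming from {latest}")
else:
    print("No checkpoints found")
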
def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
    """Standalone helper: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
    parsed = urlparse(path)
    if parsed.scheme != "s3":
        return None, None
    bucket = parsed.netloc
    prefix = parsed.path.lstrip("/")
    return bucket, prefix

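The split happens at the first path separator after the bucket, so, for illustration:

from nebu.data import _parse_s3_path

assert _parse_s3_path("s3://my-bucket/models/run1") == ("my-bucket", "models/run1")
assert _parse_s3_path("/tmp/models") == (None, None)  # non-s3 schemes are rejected
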
class Bucket:
    """Handles interactions with AWS S3."""

    def __init__(self, verbose: bool = True):
        """
        Initializes the S3 handler.

        Args:
            verbose (bool): If True, prints status messages. Defaults to True.
        """
        self.client = boto3.client("s3")
        self.verbose = verbose

    def _parse_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
        """Class method: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
        # Reusing the standalone logic here for consistency
        return _parse_s3_path(path)

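The handler takes no credentials of its own; boto3 resolves them through its default chain (environment variables, shared config files, or an attached IAM role), so a construction sketch needs no explicit keys:

from nebu.data import Bucket

bucket = Bucket(verbose=False)  # silence status prints; client is boto3.client("s3")
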
    def _list_objects(
        self, bucket: str, prefix: Optional[str]
    ) -> Dict[str, Dict[str, Any]]:
        """Class method: Lists objects in an S3 prefix."""
        objects: Dict[str, Dict[str, Any]] = {}
        paginator = self.client.get_paginator("list_objects_v2")
        list_prefix = prefix or ""
        if self.verbose:
            print(f"Listing objects in s3://{bucket}/{list_prefix}...")

        operation_parameters = {"Bucket": bucket}
        if list_prefix:
            operation_parameters["Prefix"] = list_prefix

        try:
            page_iterator = paginator.paginate(**operation_parameters)
            for page in page_iterator:
                if "Contents" in page:
                    for obj in page["Contents"]:
                        if obj["Key"].endswith("/") and obj["Size"] == 0:
                            continue
                        relative_key: Optional[str] = None
                        current_prefix = prefix or ""
                        if current_prefix and obj["Key"].startswith(current_prefix):
                            prefix_adjusted = current_prefix + (
                                "" if current_prefix.endswith("/") else "/"
                            )
                            if obj["Key"] == current_prefix.rstrip("/"):
                                relative_key = os.path.basename(obj["Key"])
                            elif obj["Key"].startswith(prefix_adjusted):
                                relative_key = obj["Key"][len(prefix_adjusted) :]
                            else:
                                potential_rel_key = obj["Key"][len(current_prefix) :]
                                relative_key = potential_rel_key.lstrip("/")
                        elif not current_prefix:
                            relative_key = obj["Key"]
                        if not relative_key:
                            continue
                        last_modified = obj["LastModified"]
                        if last_modified.tzinfo is None:
                            last_modified = last_modified.replace(tzinfo=timezone.utc)
                        objects[relative_key] = {
                            "path": f"s3://{bucket}/{obj['Key']}",
                            "key": obj["Key"],
                            "size": obj["Size"],
                            "mtime": last_modified,
                            "type": "s3",
                        }
        except ClientError as e:
            if e.response["Error"]["Code"] == "NoSuchBucket":
                if self.verbose:
                    print(f"Error: Bucket '{bucket}' not found.")
            elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
                if self.verbose:
                    print(
                        f"Prefix s3://{bucket}/{prefix} not found (treating as empty)."
                    )
            else:
                print(f"Error listing S3 objects: {e}")
            if e.response["Error"]["Code"] == "NoSuchBucket":
                return {}
        except Exception as e:
            print(f"An unexpected error occurred listing S3 objects: {e}")
            return {}
        if self.verbose:
            print(f"Found {len(objects)} objects in S3.")
        return objects

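The returned mapping is keyed by the object path relative to the prefix, which is what sync later compares across the two sides. An illustrative entry (all values made up):

# "weights/model.safetensors": {
#     "path": "s3://my-bucket/runs/weights/model.safetensors",
#     "key": "runs/weights/model.safetensors",
#     "size": 123456,
#     "mtime": datetime(2024, 1, 1, tzinfo=timezone.utc),
#     "type": "s3",
# }
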
    def _list_local(self, local_dir: str) -> Dict[str, Dict[str, Any]]:
        """Class method: Lists files in a local directory."""
        files: Dict[str, Dict[str, Any]] = {}
        if not os.path.exists(local_dir):
            if self.verbose:
                print(
                    f"Warning: Local path not found: {local_dir} (treating as empty)."
                )
            return files
        if os.path.isfile(local_dir):
            if self.verbose:
                print(
                    f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
                )
            try:
                file_name = os.path.basename(local_dir)
                files[file_name] = {
                    "path": local_dir,
                    "size": os.path.getsize(local_dir),
                    "mtime": datetime.fromtimestamp(
                        os.path.getmtime(local_dir), tz=timezone.utc
                    ),
                    "type": "local",
                }
            except OSError as e:
                print(f"Error accessing source file {local_dir}: {e}")
            return files
        if self.verbose:
            print(f"Scanning local directory: {local_dir}...")
        for root, _, file_list in os.walk(local_dir):
            for file_name in file_list:
                local_path = os.path.join(root, file_name)
                try:
                    relative_path = os.path.relpath(local_path, local_dir).replace(
                        "\\", "/"
                    )
                    files[relative_path] = {
                        "path": local_path,
                        "size": os.path.getsize(local_path),
                        "mtime": datetime.fromtimestamp(
                            os.path.getmtime(local_path), tz=timezone.utc
                        ),
                        "type": "local",
                    }
                except OSError as e:
                    print(f"Warning: Could not get metadata for {local_path}: {e}")
                except Exception as e:
                    print(f"Warning: Unexpected error processing {local_path}: {e}")
        if self.verbose:
            print(f"Found {len(files)} files locally.")
        return files

    def sync(
        self,
        source: str,
        destination: str,
        delete: bool = False,
        dry_run: bool = False,
    ) -> None:
        """
        Synchronizes files between a source and a destination (local or S3).
        Compares file sizes and modification times. Copies if missing, larger, or newer.
        Optionally deletes extraneous files from the destination.

        Args:
            source (str): The source path (local directory/file or s3://...).
            destination (str): The destination path (local directory or s3://...).
            delete (bool): If True, delete extraneous files from the destination.
            dry_run (bool): If True, print actions without performing them.
        """
        mtime_tolerance = timedelta(seconds=2)
        src_bucket, src_prefix = self._parse_path(source)
        dest_bucket, dest_prefix = self._parse_path(destination)
        source_items: Dict[str, Dict[str, Any]] = {}
        dest_items: Dict[str, Dict[str, Any]] = {}
        sync_direction = ""
        is_single_file_sync = False

        if src_bucket is None and dest_bucket is not None:
            sync_direction = "upload"
            source_items = self._list_local(source)
            dest_items = self._list_objects(dest_bucket, dest_prefix)
            if not source_items and not os.path.exists(source):
                print(
                    f"Error: Source path {source} not found and is not empty."
                )  # Check needed? list_local handles it.
                # return  # Let it proceed if source is just empty
            if os.path.isfile(source):
                is_single_file_sync = True
            # current_dest_prefix = dest_prefix or ""  # Moved closer to usage

        elif src_bucket is not None and dest_bucket is None:
            sync_direction = "download"
            source_items = self._list_objects(src_bucket, src_prefix)
            if os.path.exists(destination) and not os.path.isdir(destination):
                print(
                    f"Error: Local destination '{destination}' exists but is not a directory."
                )
                return
            dest_items = self._list_local(destination)
            if not dry_run:
                os.makedirs(destination, exist_ok=True)
            elif not os.path.isdir(destination) and self.verbose:
                print(f"Dry run: Would create local directory {destination}")

        elif src_bucket is None and dest_bucket is None:
            print(
                "Error: Both source and destination are local paths. Use standard file copy tools."
            )
            return
        elif src_bucket is not None and dest_bucket is not None:
            print(
                "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
            )
            return
        else:
            print("Error: Invalid source or destination path combination.")
            return

        actions_to_perform: List[Dict[str, Any]] = []
        source_keys = set(source_items.keys())
        dest_keys = set(dest_items.keys())

        for rel_key in source_keys:
            src_item = source_items[rel_key]
            dest_item = dest_items.get(rel_key)
            reason = ""
            if dest_item is None:
                reason = "does not exist in destination"
            else:
                if src_item["size"] != dest_item["size"]:
                    reason = f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
                elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
                    reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
            if reason:
                action_type = "upload" if sync_direction == "upload" else "download"
                dest_full_path_or_key: Optional[str] = None
                if sync_direction == "upload":
                    # Define current_dest_prefix here, just before use
                    current_dest_prefix = dest_prefix or ""
                    final_dest_key = (
                        rel_key
                        if is_single_file_sync
                        else os.path.join(current_dest_prefix, rel_key).replace(
                            "\\", "/"
                        )
                    )
                    if not current_dest_prefix and final_dest_key.startswith("/"):
                        final_dest_key = final_dest_key.lstrip("/")
                    dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
                else:
                    dest_full_path_or_key = os.path.join(
                        destination, rel_key.replace("/", os.sep)
                    )
                actions_to_perform.append(
                    {
                        "action": action_type,
                        "relative_key": rel_key,
                        "source_path": src_item["path"],
                        "source_mtime": src_item.get("mtime"),
                        "dest_full_path_or_key": dest_full_path_or_key,
                        "dest_bucket": dest_bucket,
                        "dest_prefix": dest_prefix,
                        "s3_key_full_src": src_item.get("key")
                        if sync_direction == "download"
                        else None,
                        "source_bucket": src_bucket,
                        "reason": reason,
                    }
                )

        if delete:
            keys_to_delete = dest_keys - source_keys
            for rel_key in keys_to_delete:
                dest_item = dest_items[rel_key]
                action_type = (
                    "delete_s3" if sync_direction == "upload" else "delete_local"
                )
                actions_to_perform.append(
                    {
                        "action": action_type,
                        "relative_key": rel_key,
                        "path_to_delete": dest_item["path"],
                        "s3_key_full_dest": dest_item.get("key")
                        if sync_direction == "upload"
                        else None,
                        "dest_bucket": dest_bucket,
                        "reason": "does not exist in source",
                    }
                )

        uploads_done = downloads_done = deletions_done = 0
        s3_deletions_batch: List[Dict[str, str]] = []
        if not actions_to_perform:
            if self.verbose:
                print("Source and destination are already synchronized.")
            # Optional: Add check if source exists if sync_direction == "upload" and not os.path.exists(source):
            return

        for action in actions_to_perform:
            reason = action["reason"]
            dest_full_path_or_key = action["dest_full_path_or_key"]
            if action["action"] == "upload":
                local_path = action["source_path"]
                if not isinstance(dest_full_path_or_key, str):
                    print(f"ERROR: Invalid dest path: {dest_full_path_or_key}")
                    continue
                _, upload_key = self._parse_path(dest_full_path_or_key)
                target_bucket = action["dest_bucket"]
                if self.verbose:
                    print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
                if not dry_run:
                    if target_bucket and upload_key is not None:
                        try:
                            self.client.upload_file(
                                local_path, target_bucket, upload_key
                            )
                            uploads_done += 1
                        except ClientError as e:
                            print(f"ERROR uploading {local_path}: {e}")
                        except Exception as e:
                            print(f"ERROR uploading {local_path}: {e}")
                    else:
                        print(
                            f"ERROR: Invalid S3 target: bucket={target_bucket}, key={upload_key}"
                        )
            elif action["action"] == "download":
                s3_key_full = action["s3_key_full_src"]
                local_path = dest_full_path_or_key
                source_bucket_dl = action["source_bucket"]
                if self.verbose:
                    print(
                        f"Download: {action['source_path']} to {local_path} ({reason})"
                    )
                if not isinstance(local_path, str):
                    print(f"ERROR: Invalid local dest path: {local_path}")
                    continue
                if not dry_run:
                    if source_bucket_dl and s3_key_full and local_path:
                        try:
                            local_file_dir = os.path.dirname(local_path)
                            os.makedirs(local_file_dir, exist_ok=True)
                            self.client.download_file(
                                source_bucket_dl, s3_key_full, local_path
                            )
                            downloads_done += 1
                        except ClientError as e:
                            print(f"ERROR downloading {s3_key_full}: {e}")
                        except OSError as e:
                            print(f"ERROR creating/writing {local_path}: {e}")
                        except Exception as e:
                            print(f"ERROR downloading {s3_key_full}: {e}")
                    else:
                        print(
                            f"ERROR: Invalid download params: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
                        )
            elif action["action"] == "delete_s3":
                s3_key_to_delete = action["s3_key_full_dest"]
                target_bucket_del = action["dest_bucket"]
                if target_bucket_del and s3_key_to_delete:
                    if self.verbose:
                        print(f"Delete S3: {action['path_to_delete']} ({reason})")
                    if isinstance(s3_key_to_delete, str):
                        s3_deletions_batch.append({"Key": s3_key_to_delete})
                    else:
                        print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
                else:
                    print(
                        f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
                    )
            elif action["action"] == "delete_local":
                local_path_to_delete = action["path_to_delete"]
                if self.verbose:
                    print(f"Delete Local: {local_path_to_delete} ({reason})")
                if not dry_run:
                    try:
                        os.remove(local_path_to_delete)
                        deletions_done += 1
                    except OSError as e:
                        print(f"ERROR deleting local file {local_path_to_delete}: {e}")

        if s3_deletions_batch:
            target_bucket_del_batch = next(
                (
                    a["dest_bucket"]
                    for a in actions_to_perform
                    if a["action"] == "delete_s3"
                ),
                None,
            )
            if not dry_run and target_bucket_del_batch:
                deleted_count_batch = 0
                for i in range(0, len(s3_deletions_batch), 1000):
                    batch = s3_deletions_batch[i : i + 1000]
                    delete_payload = {"Objects": batch, "Quiet": False}
                    try:
                        response = self.client.delete_objects(
                            Bucket=target_bucket_del_batch, Delete=delete_payload
                        )
                        deleted_count_batch += len(batch)
                        if "Errors" in response and response["Errors"]:
                            deleted_count_batch -= len(response["Errors"])
                            for error in response["Errors"]:
                                print(
                                    f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
                                )
                    except ClientError as e:
                        print(f"ERROR deleting S3 objects batch: {e}")
                        deleted_count_batch = 0
                    except Exception as e:
                        print(f"ERROR deleting S3 objects batch: {e}")
                        deleted_count_batch = 0
                deletions_done += deleted_count_batch
            elif target_bucket_del_batch:
                deletions_done = len(s3_deletions_batch)
            else:
                print(
                    "Warning: Could not determine target bucket for S3 deletion batch."
                )

        if dry_run:
            if self.verbose:
                upload_count = sum(
                    1 for a in actions_to_perform if a["action"] == "upload"
                )
                download_count = sum(
                    1 for a in actions_to_perform if a["action"] == "download"
                )
                delete_s3_count = len(s3_deletions_batch)
                delete_local_count = sum(
                    1 for a in actions_to_perform if a["action"] == "delete_local"
                )
                print("\n--- DRY RUN SUMMARY ---")
                if sync_direction == "upload":
                    print(f"Would upload: {upload_count} file(s)")
                    if delete:
                        print(f"Would delete from S3: {delete_s3_count} object(s)")
                elif sync_direction == "download":
                    print(f"Would download: {download_count} file(s)")
                    if delete:
                        print(f"Would delete locally: {delete_local_count} file(s)")
                print("--- END DRY RUN ---")
        else:
            if self.verbose:
                if sync_direction == "upload":
                    print(
                        f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
                    )
                elif sync_direction == "download":
                    print(
                        f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
                    )

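A usage sketch covering both sync directions; the paths and bucket are placeholders, and `dry_run=True` prints the planned actions plus a summary instead of transferring anything:

from nebu.data import Bucket

bucket = Bucket()
# Local -> S3: upload missing/changed files, delete remote files absent locally.
bucket.sync("./outputs", "s3://my-bucket/runs", delete=True, dry_run=True)
# S3 -> local: pull the prefix down into a directory (created if needed).
bucket.sync("s3://my-bucket/runs", "./outputs", dry_run=True)
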
    def copy(
        self,
        source: str,
        destination: str,
    ) -> None:
        """
        Copies files or directories between local paths and S3 URIs.
        Handles:
        - Local file to S3 object
        - Local directory to S3 prefix (recursive)
        - S3 object to local file
        - S3 prefix to local directory (recursive)
        Does NOT handle:
        - Local to Local (use shutil)
        - S3 to S3 (use AWS CLI or boto3 object copy)

        Args:
            source (str): The source path (local file/dir or s3://...).
            destination (str): The destination path (local file/dir or s3://...).
        """
        src_bucket, src_prefix = self._parse_path(source)
        dest_bucket, dest_prefix = self._parse_path(destination)

        if src_bucket is None and dest_bucket is None:
            print(
                "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
            )
            return
        if src_bucket is not None and dest_bucket is not None:
            print(
                "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
            )
            return

        # Upload: Local to S3
        if src_bucket is None and dest_bucket is not None:
            if not os.path.exists(source):
                print(f"Error: Local source path not found: {source}")
                return
            current_dest_prefix = dest_prefix or ""

            if os.path.isfile(source):
                if not current_dest_prefix or destination.endswith("/"):
                    s3_key = os.path.join(
                        current_dest_prefix, os.path.basename(source)
                    ).replace("\\", "/")
                else:
                    s3_key = current_dest_prefix
                if self.verbose:
                    print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
                try:
                    self.client.upload_file(source, dest_bucket, s3_key)
                    if self.verbose:
                        print("Upload complete.")
                except ClientError as e:
                    print(f"ERROR uploading {source}: {e}")
                except Exception as e:
                    print(f"ERROR uploading {source}: {e}")

            elif os.path.isdir(source):
                if self.verbose:
                    print(
                        f"Uploading directory {source}/* to s3://{dest_bucket}/{current_dest_prefix}/"
                    )
                files_uploaded = files_failed = 0
                for root, _, files in os.walk(source):
                    for file in files:
                        local_path = os.path.join(root, file)
                        relative_path = os.path.relpath(local_path, source)
                        s3_key = os.path.join(
                            current_dest_prefix, relative_path
                        ).replace("\\", "/")
                        if self.verbose:
                            print(
                                f"  Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
                            )
                        try:
                            self.client.upload_file(local_path, dest_bucket, s3_key)
                            files_uploaded += 1
                        except ClientError as e:
                            print(f"  ERROR uploading {local_path}: {e}")
                            files_failed += 1
                        except Exception as e:
                            print(f"  ERROR uploading {local_path}: {e}")
                            files_failed += 1
                if self.verbose:
                    print(
                        f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
                    )
            else:
                print(f"Error: Source {source} is neither a file nor a directory.")

        # Download: S3 to Local
        elif src_bucket is not None and dest_bucket is None:
            is_prefix_download = False
            single_object_key = None
            current_src_prefix = src_prefix or ""  # Ensure not None

            if source.endswith("/"):
                is_prefix_download = True
            else:
                try:
                    if current_src_prefix:
                        self.client.head_object(
                            Bucket=src_bucket, Key=current_src_prefix
                        )
                        single_object_key = current_src_prefix
                    else:
                        # Path like s3://bucket, treat as prefix download
                        is_prefix_download = True
                except ClientError as e:
                    if e.response["Error"]["Code"] == "404":
                        is_prefix_download = True  # Assume prefix if object not found
                    elif e.response["Error"]["Code"] == "NoSuchBucket":
                        print(f"Error: Source bucket '{src_bucket}' not found.")
                        return
                    else:
                        print(
                            f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
                        )
                        return
                except Exception as e:
                    print(
                        f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
                    )
                    return

            if single_object_key is not None:
                if os.path.isdir(destination) or destination.endswith(os.sep):
                    local_dest_path = os.path.join(
                        destination, os.path.basename(single_object_key)
                    )
                    os.makedirs(destination, exist_ok=True)
                else:
                    local_dest_path = destination
                    parent_dir = os.path.dirname(local_dest_path)
                    if parent_dir:
                        os.makedirs(parent_dir, exist_ok=True)
                if self.verbose:
                    print(
                        f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
                    )
                try:
                    self.client.download_file(
                        src_bucket, single_object_key, local_dest_path
                    )
                    if self.verbose:
                        print("Download complete.")
                except ClientError as e:
                    print(f"ERROR downloading {single_object_key}: {e}")
                except OSError as e:
                    print(f"ERROR creating/writing {local_dest_path}: {e}")
                except Exception as e:
                    print(f"ERROR downloading {single_object_key}: {e}")

            elif is_prefix_download:
                if os.path.exists(destination) and not os.path.isdir(destination):
                    print(
                        f"Error: Local destination '{destination}' exists but is not a directory."
                    )
                    return
                os.makedirs(destination, exist_ok=True)
                if self.verbose:
                    print(
                        f"Downloading prefix s3://{src_bucket}/{current_src_prefix}/* to {destination}/"
                    )
                paginator = self.client.get_paginator("list_objects_v2")
                files_downloaded = files_failed = 0
                operation_parameters = {"Bucket": src_bucket}
                # The problematic line for the linter, re-adding type ignore
                if current_src_prefix:  # type: ignore
                    operation_parameters["Prefix"] = current_src_prefix
                try:
                    page_iterator = paginator.paginate(**operation_parameters)
                    found_objects = False
                    for page in page_iterator:
                        if "Contents" in page:
                            found_objects = True
                            for obj in page["Contents"]:
                                s3_key = obj["Key"]
                                if s3_key.endswith("/") and obj["Size"] == 0:
                                    continue
                                relative_key = s3_key
                                if current_src_prefix:
                                    if s3_key.startswith(current_src_prefix):
                                        if s3_key == current_src_prefix.rstrip("/"):
                                            relative_key = os.path.basename(s3_key)
                                        else:
                                            prefix_adjusted = current_src_prefix + (
                                                ""
                                                if current_src_prefix.endswith("/")
                                                else "/"
                                            )
                                            if s3_key.startswith(prefix_adjusted):
                                                relative_key = s3_key[
                                                    len(prefix_adjusted) :
                                                ]
                                            elif not current_src_prefix.endswith("/"):
                                                relative_key = s3_key[
                                                    len(current_src_prefix) :
                                                ].lstrip("/")
                                if not relative_key:
                                    continue
                                local_dest_path = os.path.join(
                                    destination, relative_key.replace("/", os.sep)
                                )
                                local_dest_dir = os.path.dirname(local_dest_path)
                                if self.verbose:
                                    print(
                                        f"  Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
                                    )
                                try:
                                    if local_dest_dir:
                                        os.makedirs(local_dest_dir, exist_ok=True)
                                    self.client.download_file(
                                        src_bucket, s3_key, local_dest_path
                                    )
                                    files_downloaded += 1
                                except ClientError as e:
                                    print(f"  ERROR downloading {s3_key}: {e}")
                                    files_failed += 1
                                except OSError as e:
                                    print(
                                        f"  ERROR creating/writing {local_dest_path}: {e}"
                                    )
                                    files_failed += 1
                                except Exception as e:
                                    print(f"  ERROR downloading {s3_key}: {e}")
                                    files_failed += 1
                    if not found_objects and self.verbose:
                        print(
                            f"Warning: No objects found at source prefix s3://{src_bucket}/{current_src_prefix}"
                        )
                    if self.verbose:
                        print(
                            f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
                        )
                except ClientError as e:
                    if e.response["Error"]["Code"] == "NoSuchBucket":
                        print(f"Error: Source bucket '{src_bucket}' not found.")
                    else:
                        print(
                            f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
                        )
                except Exception as e:
                    print(
                        f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
                    )
        else:
            print("Error: Unknown copy operation type.")

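A sketch of the four supported copy shapes (all names are placeholders); local-to-local and S3-to-S3 are rejected with a pointer to shutil and the AWS CLI respectively:

from nebu.data import Bucket

bucket = Bucket()
bucket.copy("./model.bin", "s3://my-bucket/models/")  # file -> under prefix (trailing slash)
bucket.copy("./outputs", "s3://my-bucket/runs")       # directory -> prefix, recursive
bucket.copy("s3://my-bucket/models/model.bin", "./model.bin")  # object -> file
bucket.copy("s3://my-bucket/runs/", "./outputs")      # prefix -> directory, recursive
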
    def check(self, s3_uri: str) -> bool:
        """
        Check if an object or prefix exists in an S3 bucket using an S3 URI.

        Args:
            s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
                          Use a trailing '/' to check for a prefix/directory.

        Returns:
            bool: True if the object or prefix exists, False otherwise.
        """
        # Use the class client and parse method
        bucket_name, s3_key = self._parse_path(s3_uri)

        if bucket_name is None or s3_key is None:
            # _parse_path returns None, None if scheme is not 's3'
            print(f"Error: Invalid S3 URI format: {s3_uri}")
            return False

        is_prefix = s3_key.endswith("/")

        try:
            if is_prefix:
                # Check for prefix existence by listing objects
                # Handle the case where s3_key might be empty if URI is just s3://bucket/
                list_prefix = s3_key if s3_key else ""
                response = self.client.list_objects_v2(
                    Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
                )
                # Check if any objects OR common prefixes (folders) are returned for the prefix
                return "Contents" in response or "CommonPrefixes" in response
            else:
                # Check for object existence
                self.client.head_object(Bucket=bucket_name, Key=s3_key)
                return True
        except ClientError as e:  # Catch boto3 ClientError first
            # If head_object returns 404 (NoSuchKey), the object doesn't exist
            # list_objects_v2 does not raise NoSuchKey for prefixes
            if e.response["Error"]["Code"] == "404":
                return False
            elif e.response["Error"]["Code"] == "NoSuchBucket":
                if self.verbose:
                    print(
                        f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri})."
                    )
                return False
            # Handle other potential errors like AccessDenied differently if needed
            print(f"Error checking {s3_uri}: {e}")
            return False
        except Exception as e:
            print(f"An unexpected error occurred checking {s3_uri}: {e}")
            return False
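
Finally, a sketch for check; a trailing slash switches from a head_object call to a one-key prefix listing (bucket and keys are placeholders):

from nebu.data import Bucket

bucket = Bucket()
bucket.check("s3://my-bucket/models/model.bin")  # object existence via head_object
bucket.check("s3://my-bucket/models/")           # prefix existence via list_objects_v2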