nebu 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nebu/__init__.py +5 -1
- nebu/adapter.py +11 -0
- nebu/auth.py +15 -0
- nebu/cache.py +103 -0
- nebu/{convert.py → chatx/convert.py} +111 -15
- nebu/chatx/openai.py +976 -0
- nebu/data.py +682 -779
- nebu/processors/consumer.py +4 -2
- nebu/processors/decorate.py +1 -1
- nebu/processors/processor.py +10 -8
- nebu/processors/remote.py +47 -0
- {nebu-0.1.27.dist-info → nebu-0.1.30.dist-info}/METADATA +2 -1
- nebu-0.1.30.dist-info/RECORD +26 -0
- nebu-0.1.27.dist-info/RECORD +0 -22
- {nebu-0.1.27.dist-info → nebu-0.1.30.dist-info}/WHEEL +0 -0
- {nebu-0.1.27.dist-info → nebu-0.1.30.dist-info}/licenses/LICENSE +0 -0
- {nebu-0.1.27.dist-info → nebu-0.1.30.dist-info}/top_level.txt +0 -0
nebu/data.py
CHANGED
```diff
@@ -104,7 +104,7 @@ def find_latest_checkpoint(training_dir: str) -> Optional[str]:
 
 
 def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
-    """Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+    """Standalone helper: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
     parsed = urlparse(path)
     if parsed.scheme != "s3":
         return None, None
```
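For orientation, here is a minimal sketch of the helper's contract. Only the scheme check is visible in the hunk, so the bucket/prefix derivation below (the usual `netloc`/`path` split from `urlparse`) is an assumption:

```python
from typing import Optional, Tuple
from urllib.parse import urlparse


def parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
    """Sketch of _parse_s3_path; the netloc/path split is assumed."""
    parsed = urlparse(path)
    if parsed.scheme != "s3":
        return None, None  # matches the guard shown in the hunk
    return parsed.netloc, parsed.path.lstrip("/")  # assumed derivation


assert parse_s3_path("s3://my-bucket/ckpts/run1") == ("my-bucket", "ckpts/run1")
assert parse_s3_path("/tmp/data") == (None, None)
```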
```diff
@@ -113,840 +113,743 @@ def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
     return bucket, prefix
 
 
-# … 79 lines removed here (old standalone S3 listing helper; content not captured in this rendering) …
+class Bucket:
+    """Handles interactions with AWS S3."""
+
+    def __init__(self, verbose: bool = True):
+        """
+        Initializes the S3 handler.
+
+        Args:
+            verbose (bool): If True, prints status messages. Defaults to True.
+        """
+        self.client = boto3.client("s3")
+        self.verbose = verbose
+
+    def _parse_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
+        """Class method: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+        # Reusing the standalone logic here for consistency
+        return _parse_s3_path(path)
+
+    def _list_objects(
+        self, bucket: str, prefix: Optional[str]
+    ) -> Dict[str, Dict[str, Any]]:
+        """Class method: Lists objects in an S3 prefix."""
+        objects: Dict[str, Dict[str, Any]] = {}
+        paginator = self.client.get_paginator("list_objects_v2")
+        list_prefix = prefix or ""
+        if self.verbose:
+            print(f"Listing objects in s3://{bucket}/{list_prefix}...")
+
+        operation_parameters = {"Bucket": bucket}
+        if list_prefix:
+            operation_parameters["Prefix"] = list_prefix
+
+        try:
+            page_iterator = paginator.paginate(**operation_parameters)
+            for page in page_iterator:
+                if "Contents" in page:
+                    for obj in page["Contents"]:
+                        if obj["Key"].endswith("/") and obj["Size"] == 0:
+                            continue
+                        relative_key: Optional[str] = None
+                        current_prefix = prefix or ""
+                        if current_prefix and obj["Key"].startswith(current_prefix):
+                            prefix_adjusted = current_prefix + (
+                                "" if current_prefix.endswith("/") else "/"
+                            )
+                            if obj["Key"] == current_prefix.rstrip("/"):
+                                relative_key = os.path.basename(obj["Key"])
+                            elif obj["Key"].startswith(prefix_adjusted):
+                                relative_key = obj["Key"][len(prefix_adjusted) :]
+                            else:
+                                potential_rel_key = obj["Key"][len(current_prefix) :]
+                                relative_key = potential_rel_key.lstrip("/")
+                        elif not current_prefix:
+                            relative_key = obj["Key"]
+                        if not relative_key:
+                            continue
+                        last_modified = obj["LastModified"]
+                        if last_modified.tzinfo is None:
+                            last_modified = last_modified.replace(tzinfo=timezone.utc)
+                        objects[relative_key] = {
+                            "path": f"s3://{bucket}/{obj['Key']}",
+                            "key": obj["Key"],
+                            "size": obj["Size"],
+                            "mtime": last_modified,
+                            "type": "s3",
+                        }
+        except ClientError as e:
+            if e.response["Error"]["Code"] == "NoSuchBucket":
+                if self.verbose:
+                    print(f"Error: Bucket '{bucket}' not found.")
+            elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
+                if self.verbose:
+                    print(
+                        f"Prefix s3://{bucket}/{prefix} not found (treating as empty)."
+                    )
+            else:
+                print(f"Error listing S3 objects: {e}")
+            if e.response["Error"]["Code"] == "NoSuchBucket":
+                return {}
+        except Exception as e:
+            print(f"An unexpected error occurred listing S3 objects: {e}")
             return {}
-# … removed: head of the old _list_local_files helper (only fragments captured in this rendering) …
+        if self.verbose:
+            print(f"Found {len(objects)} objects in S3.")
+        return objects
+
+    def _list_local(self, local_dir: str) -> Dict[str, Dict[str, Any]]:
+        """Class method: Lists files in a local directory."""
+        files: Dict[str, Dict[str, Any]] = {}
+        if not os.path.exists(local_dir):
+            if self.verbose:
+                print(
+                    f"Warning: Local path not found: {local_dir} (treating as empty)."
+                )
+            return files
         if os.path.isfile(local_dir):
+            if self.verbose:
+                print(
+                    f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
+                )
             try:
-                local_size = os.path.getsize(local_dir)
-                local_mtime_ts = os.path.getmtime(local_dir)
-                local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
                 file_name = os.path.basename(local_dir)
+                files[file_name] = {
+                    "path": local_dir,
+                    "size": os.path.getsize(local_dir),
+                    "mtime": datetime.fromtimestamp(
+                        os.path.getmtime(local_dir), tz=timezone.utc
+                    ),
+                    "type": "local",
                 }
             except OSError as e:
                 print(f"Error accessing source file {local_dir}: {e}")
+            return files
+        if self.verbose:
+            print(f"Scanning local directory: {local_dir}...")
+        for root, _, file_list in os.walk(local_dir):
+            for file_name in file_list:
+                local_path = os.path.join(root, file_name)
+                try:
+                    relative_path = os.path.relpath(local_path, local_dir).replace(
+                        "\\", "/"
+                    )
+                    files[relative_path] = {
+                        "path": local_path,
+                        "size": os.path.getsize(local_path),
+                        "mtime": datetime.fromtimestamp(
+                            os.path.getmtime(local_path), tz=timezone.utc
+                        ),
+                        "type": "local",
+                    }
+                except OSError as e:
+                    print(f"Warning: Could not get metadata for {local_path}: {e}")
+                except Exception as e:
+                    print(f"Warning: Unexpected error processing {local_path}: {e}")
+        if self.verbose:
+            print(f"Found {len(files)} files locally.")
+        return files
+
```
|
254
|
+
def sync(
|
255
|
+
self,
|
256
|
+
source: str,
|
257
|
+
destination: str,
|
258
|
+
delete: bool = False,
|
259
|
+
dry_run: bool = False,
|
260
|
+
) -> None:
|
261
|
+
"""
|
262
|
+
Synchronizes files between a source and a destination (local or S3).
|
263
|
+
Compares file sizes and modification times. Copies if missing, larger, or newer.
|
264
|
+
Optionally deletes extraneous files from the destination.
|
265
|
+
Args:
|
266
|
+
source (str): The source path (local directory/file or s3://...).
|
267
|
+
destination (str): The destination path (local directory or s3://...).
|
268
|
+
delete (bool): If True, delete extraneous files from the destination.
|
269
|
+
dry_run (bool): If True, print actions without performing them.
|
270
|
+
"""
|
271
|
+
mtime_tolerance = timedelta(seconds=2)
|
272
|
+
src_bucket, src_prefix = self._parse_path(source)
|
273
|
+
dest_bucket, dest_prefix = self._parse_path(destination)
|
274
|
+
source_items: Dict[str, Dict[str, Any]] = {}
|
275
|
+
dest_items: Dict[str, Dict[str, Any]] = {}
|
276
|
+
sync_direction = ""
|
277
|
+
is_single_file_sync = False
|
278
|
+
|
279
|
+
if src_bucket is None and dest_bucket is not None:
|
280
|
+
sync_direction = "upload"
|
281
|
+
source_items = self._list_local(source)
|
282
|
+
dest_items = self._list_objects(dest_bucket, dest_prefix)
|
283
|
+
if not source_items and not os.path.exists(source):
|
284
|
+
print(
|
285
|
+
f"Error: Source path {source} not found and is not empty."
|
286
|
+
) # Check needed? list_local handles it.
|
287
|
+
# return # Let it proceed if source is just empty
|
288
|
+
if os.path.isfile(source):
|
289
|
+
is_single_file_sync = True
|
290
|
+
# current_dest_prefix = dest_prefix or "" # Moved closer to usage
|
291
|
+
|
292
|
+
elif src_bucket is not None and dest_bucket is None:
|
293
|
+
sync_direction = "download"
|
294
|
+
source_items = self._list_objects(src_bucket, src_prefix)
|
295
|
+
if os.path.exists(destination) and not os.path.isdir(destination):
|
296
|
+
print(
|
297
|
+
f"Error: Local destination '{destination}' exists but is not a directory."
|
245
298
|
)
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
files[relative_path] = {
|
253
|
-
"path": local_path,
|
254
|
-
"size": local_size,
|
255
|
-
"mtime": local_mtime,
|
256
|
-
"type": "local",
|
257
|
-
}
|
258
|
-
except OSError as e:
|
259
|
-
print(f"Warning: Could not get metadata for {local_path}: {e}")
|
260
|
-
except Exception as e:
|
261
|
-
print(f"Warning: Unexpected error processing {local_path}: {e}")
|
262
|
-
|
263
|
-
if verbose:
|
264
|
-
print(f"Found {len(files)} files locally.")
|
265
|
-
return files
|
266
|
-
|
267
|
-
|
268
|
-
def s3_sync(
|
269
|
-
source: str,
|
270
|
-
destination: str,
|
271
|
-
delete: bool = False,
|
272
|
-
dry_run: bool = False,
|
273
|
-
verbose: bool = True,
|
274
|
-
) -> None:
|
275
|
-
"""
|
276
|
-
Synchronizes files between a source and a destination, which can be
|
277
|
-
local paths or S3 paths (e.g., 's3://my-bucket/my-prefix').
|
278
|
-
|
279
|
-
Compares file sizes and modification times. Copies files from source
|
280
|
-
to destination if they are missing, larger, or newer in the source.
|
281
|
-
Optionally deletes files from the destination if they are not present
|
282
|
-
in the source.
|
299
|
+
return
|
300
|
+
dest_items = self._list_local(destination)
|
301
|
+
if not dry_run:
|
302
|
+
os.makedirs(destination, exist_ok=True)
|
303
|
+
elif not os.path.isdir(destination) and self.verbose:
|
304
|
+
print(f"Dry run: Would create local directory {destination}")
|
283
305
|
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
dry_run (bool): If True, print actions without performing them.
|
289
|
-
verbose (bool): If True, print actions being taken.
|
290
|
-
"""
|
291
|
-
s3_client = boto3.client("s3")
|
292
|
-
mtime_tolerance = timedelta(
|
293
|
-
seconds=2
|
294
|
-
) # S3 mtime might not have sub-second precision
|
295
|
-
|
296
|
-
src_bucket, src_prefix = _parse_s3_path(source)
|
297
|
-
dest_bucket, dest_prefix = _parse_s3_path(destination)
|
298
|
-
|
299
|
-
source_items: Dict[str, Dict[str, Any]] = {}
|
300
|
-
dest_items: Dict[str, Dict[str, Any]] = {}
|
301
|
-
sync_direction = ""
|
302
|
-
is_single_file_sync = False
|
303
|
-
|
304
|
-
# Determine sync direction and list items
|
305
|
-
if src_bucket is None and dest_bucket is not None:
|
306
|
-
sync_direction = "upload"
|
307
|
-
source_items = _list_local_files(source, verbose)
|
308
|
-
dest_items = _list_s3_objects(s3_client, dest_bucket, dest_prefix, verbose)
|
309
|
-
# Check if source exists (either dir or file)
|
310
|
-
if not os.path.exists(source):
|
311
|
-
print(f"Error: Source path {source} not found.")
|
306
|
+
elif src_bucket is None and dest_bucket is None:
|
307
|
+
print(
|
308
|
+
"Error: Both source and destination are local paths. Use standard file copy tools."
|
309
|
+
)
|
312
310
|
return
|
313
|
-
|
314
|
-
# Destination prefix defaults to empty if not specified
|
315
|
-
if dest_prefix is None:
|
316
|
-
dest_prefix = ""
|
317
|
-
|
318
|
-
elif src_bucket is not None and dest_bucket is None:
|
319
|
-
sync_direction = "download"
|
320
|
-
source_items = _list_s3_objects(s3_client, src_bucket, src_prefix, verbose)
|
321
|
-
# For download, destination MUST be a directory (or created as one)
|
322
|
-
# If destination exists and is a file, it's an error.
|
323
|
-
if os.path.exists(destination) and not os.path.isdir(destination):
|
311
|
+
elif src_bucket is not None and dest_bucket is not None:
|
324
312
|
print(
|
325
|
-
|
313
|
+
"Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
|
326
314
|
)
|
327
315
|
return
|
316
|
+
else:
|
317
|
+
print("Error: Invalid source or destination path combination.")
|
318
|
+
return
|
328
319
|
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
if not dry_run:
|
333
|
-
os.makedirs(destination, exist_ok=True)
|
334
|
-
elif not os.path.isdir(destination) and verbose:
|
335
|
-
print(f"Dry run: Would create local directory {destination}")
|
320
|
+
actions_to_perform: List[Dict[str, Any]] = []
|
321
|
+
source_keys = set(source_items.keys())
|
322
|
+
dest_keys = set(dest_items.keys())
|
336
323
|
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
324
|
+
for rel_key in source_keys:
|
325
|
+
src_item = source_items[rel_key]
|
326
|
+
dest_item = dest_items.get(rel_key)
|
327
|
+
reason = ""
|
328
|
+
if dest_item is None:
|
329
|
+
reason = "does not exist in destination"
|
330
|
+
else:
|
331
|
+
if src_item["size"] != dest_item["size"]:
|
332
|
+
reason = f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
|
333
|
+
elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
|
334
|
+
reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
|
335
|
+
if reason:
|
336
|
+
action_type = "upload" if sync_direction == "upload" else "download"
|
337
|
+
dest_full_path_or_key: Optional[str] = None
|
338
|
+
if sync_direction == "upload":
|
339
|
+
# Define current_dest_prefix here, just before use
|
340
|
+
current_dest_prefix = dest_prefix or ""
|
341
|
+
final_dest_key = (
|
342
|
+
rel_key
|
343
|
+
if is_single_file_sync
|
344
|
+
else os.path.join(current_dest_prefix, rel_key).replace(
|
345
|
+
"\\", "/"
|
346
|
+
)
|
347
|
+
)
|
348
|
+
if not current_dest_prefix and final_dest_key.startswith("/"):
|
349
|
+
final_dest_key = final_dest_key.lstrip("/")
|
350
|
+
dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
|
351
|
+
else:
|
352
|
+
dest_full_path_or_key = os.path.join(
|
353
|
+
destination, rel_key.replace("/", os.sep)
|
354
|
+
)
|
355
|
+
actions_to_perform.append(
|
356
|
+
{
|
357
|
+
"action": action_type,
|
358
|
+
"relative_key": rel_key,
|
359
|
+
"source_path": src_item["path"],
|
360
|
+
"source_mtime": src_item.get("mtime"),
|
361
|
+
"dest_full_path_or_key": dest_full_path_or_key,
|
362
|
+
"dest_bucket": dest_bucket,
|
363
|
+
"dest_prefix": dest_prefix,
|
364
|
+
"s3_key_full_src": src_item.get("key")
|
365
|
+
if sync_direction == "download"
|
366
|
+
else None,
|
367
|
+
"source_bucket": src_bucket,
|
368
|
+
"reason": reason,
|
369
|
+
}
|
371
370
|
)
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
dest_full_path_or_key: Optional[str] = None
|
380
|
-
if sync_direction == "upload":
|
381
|
-
# If uploading single file, dest key is prefix + filename
|
382
|
-
# If uploading dir, dest key is prefix + relative key
|
383
|
-
# Ensure dest_prefix is treated as empty string if None
|
384
|
-
current_dest_prefix = dest_prefix or ""
|
385
|
-
final_dest_key = (
|
386
|
-
rel_key
|
387
|
-
if is_single_file_sync
|
388
|
-
else os.path.join(current_dest_prefix, rel_key).replace("\\", "/")
|
371
|
+
|
372
|
+
if delete:
|
373
|
+
keys_to_delete = dest_keys - source_keys
|
374
|
+
for rel_key in keys_to_delete:
|
375
|
+
dest_item = dest_items[rel_key]
|
376
|
+
action_type = (
|
377
|
+
"delete_s3" if sync_direction == "upload" else "delete_local"
|
389
378
|
)
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
379
|
+
actions_to_perform.append(
|
380
|
+
{
|
381
|
+
"action": action_type,
|
382
|
+
"relative_key": rel_key,
|
383
|
+
"path_to_delete": dest_item["path"],
|
384
|
+
"s3_key_full_dest": dest_item.get("key")
|
385
|
+
if sync_direction == "upload"
|
386
|
+
else None,
|
387
|
+
"dest_bucket": dest_bucket,
|
388
|
+
"reason": "does not exist in source",
|
389
|
+
}
|
397
390
|
)
|
398
391
|
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
# Store details needed for specific actions
|
407
|
-
"dest_bucket": dest_bucket,
|
408
|
-
"dest_prefix": dest_prefix,
|
409
|
-
"s3_key_full_src": src_item.get("key")
|
410
|
-
if sync_direction == "download"
|
411
|
-
else None,
|
412
|
-
"source_bucket": src_bucket,
|
413
|
-
"reason": reason,
|
414
|
-
}
|
415
|
-
)
|
416
|
-
|
417
|
-
# Identify items for deletion in destination
|
418
|
-
if delete:
|
419
|
-
keys_to_delete = dest_keys - source_keys
|
420
|
-
for rel_key in keys_to_delete:
|
421
|
-
dest_item = dest_items[rel_key]
|
422
|
-
action_type = "delete_s3" if sync_direction == "upload" else "delete_local"
|
423
|
-
actions_to_perform.append(
|
424
|
-
{
|
425
|
-
"action": action_type,
|
426
|
-
"relative_key": rel_key,
|
427
|
-
"path_to_delete": dest_item["path"], # Full S3 URI or local path
|
428
|
-
"s3_key_full_dest": dest_item.get("key")
|
429
|
-
if sync_direction == "upload"
|
430
|
-
else None, # Needed for delete_s3
|
431
|
-
"dest_bucket": dest_bucket, # Needed for delete_s3
|
432
|
-
"reason": "does not exist in source",
|
433
|
-
}
|
434
|
-
)
|
435
|
-
|
436
|
-
# --- Execute Actions ---
|
437
|
-
uploads_done = downloads_done = deletions_done = 0
|
438
|
-
s3_deletions_batch: List[Dict[str, str]] = []
|
439
|
-
|
440
|
-
if not actions_to_perform:
|
441
|
-
print("Source and destination are already synchronized.")
|
442
|
-
# Still check if source/dest actually exist if nothing to do
|
443
|
-
if sync_direction == "upload" and not os.path.exists(source):
|
444
|
-
print(f"Note: Source path {source} does not exist.")
|
445
|
-
# Add check for S3 source existence if needed via head_bucket or similar
|
446
|
-
return
|
447
|
-
|
448
|
-
for action in actions_to_perform:
|
449
|
-
rel_key = action["relative_key"]
|
450
|
-
reason = action["reason"]
|
451
|
-
dest_full_path_or_key = action["dest_full_path_or_key"]
|
452
|
-
|
453
|
-
if action["action"] == "upload":
|
454
|
-
local_path = action["source_path"]
|
455
|
-
# Ensure dest_full_path_or_key is valid before parsing
|
456
|
-
if not isinstance(dest_full_path_or_key, str):
|
457
|
-
print(
|
458
|
-
f"ERROR: Invalid destination path calculated for upload: {dest_full_path_or_key}"
|
459
|
-
)
|
460
|
-
continue
|
461
|
-
# Extract final key from the pre-calculated dest_full_path_or_key
|
462
|
-
_, upload_key = _parse_s3_path(dest_full_path_or_key)
|
463
|
-
target_bucket = action["dest_bucket"]
|
392
|
+
uploads_done = downloads_done = deletions_done = 0
|
393
|
+
s3_deletions_batch: List[Dict[str, str]] = []
|
394
|
+
if not actions_to_perform:
|
395
|
+
if self.verbose:
|
396
|
+
print("Source and destination are already synchronized.")
|
397
|
+
# Optional: Add check if source exists if sync_direction == "upload" and not os.path.exists(source):
|
398
|
+
return
|
464
399
|
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
400
|
+
for action in actions_to_perform:
|
401
|
+
reason = action["reason"]
|
402
|
+
dest_full_path_or_key = action["dest_full_path_or_key"]
|
403
|
+
if action["action"] == "upload":
|
404
|
+
local_path = action["source_path"]
|
405
|
+
if not isinstance(dest_full_path_or_key, str):
|
406
|
+
print(f"ERROR: Invalid dest path: {dest_full_path_or_key}")
|
407
|
+
continue
|
408
|
+
_, upload_key = self._parse_path(dest_full_path_or_key)
|
409
|
+
target_bucket = action["dest_bucket"]
|
410
|
+
if self.verbose:
|
411
|
+
print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
|
412
|
+
if not dry_run:
|
413
|
+
if target_bucket and upload_key is not None:
|
414
|
+
try:
|
415
|
+
self.client.upload_file(
|
416
|
+
local_path, target_bucket, upload_key
|
417
|
+
)
|
418
|
+
uploads_done += 1
|
419
|
+
except ClientError as e:
|
420
|
+
print(f"ERROR uploading {local_path}: {e}")
|
421
|
+
except Exception as e:
|
422
|
+
print(f"ERROR uploading {local_path}: {e}")
|
423
|
+
else:
|
424
|
+
print(
|
425
|
+
f"ERROR: Invalid S3 target: bucket={target_bucket}, key={upload_key}"
|
426
|
+
)
|
427
|
+
elif action["action"] == "download":
|
428
|
+
s3_key_full = action["s3_key_full_src"]
|
429
|
+
local_path = dest_full_path_or_key
|
430
|
+
source_bucket_dl = action["source_bucket"]
|
431
|
+
if self.verbose:
|
477
432
|
print(
|
478
|
-
f"
|
433
|
+
f"Download: {action['source_path']} to {local_path} ({reason})"
|
479
434
|
)
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
s3_client.download_file(
|
500
|
-
source_bucket_dl, s3_key_full, local_path
|
501
|
-
)
|
502
|
-
downloads_done += 1
|
503
|
-
except ClientError as e:
|
504
|
-
print(f"ERROR downloading {s3_key_full}: {e}")
|
505
|
-
except OSError as e:
|
435
|
+
if not isinstance(local_path, str):
|
436
|
+
print(f"ERROR: Invalid local dest path: {local_path}")
|
437
|
+
continue
|
438
|
+
if not dry_run:
|
439
|
+
if source_bucket_dl and s3_key_full and local_path:
|
440
|
+
try:
|
441
|
+
local_file_dir = os.path.dirname(local_path)
|
442
|
+
os.makedirs(local_file_dir, exist_ok=True)
|
443
|
+
self.client.download_file(
|
444
|
+
source_bucket_dl, s3_key_full, local_path
|
445
|
+
)
|
446
|
+
downloads_done += 1
|
447
|
+
except ClientError as e:
|
448
|
+
print(f"ERROR downloading {s3_key_full}: {e}")
|
449
|
+
except OSError as e:
|
450
|
+
print(f"ERROR creating/writing {local_path}: {e}")
|
451
|
+
except Exception as e:
|
452
|
+
print(f"ERROR downloading {s3_key_full}: {e}")
|
453
|
+
else:
|
506
454
|
print(
|
507
|
-
f"ERROR
|
455
|
+
f"ERROR: Invalid download params: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
|
508
456
|
)
|
509
|
-
|
510
|
-
|
457
|
+
elif action["action"] == "delete_s3":
|
458
|
+
s3_key_to_delete = action["s3_key_full_dest"]
|
459
|
+
target_bucket_del = action["dest_bucket"]
|
460
|
+
if target_bucket_del and s3_key_to_delete:
|
461
|
+
if self.verbose:
|
462
|
+
print(f"Delete S3: {action['path_to_delete']} ({reason})")
|
463
|
+
if isinstance(s3_key_to_delete, str):
|
464
|
+
s3_deletions_batch.append({"Key": s3_key_to_delete})
|
465
|
+
else:
|
466
|
+
print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
|
511
467
|
else:
|
512
468
|
print(
|
513
|
-
f"ERROR: Invalid
|
469
|
+
f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
|
514
470
|
)
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
471
|
+
elif action["action"] == "delete_local":
|
472
|
+
local_path_to_delete = action["path_to_delete"]
|
473
|
+
if self.verbose:
|
474
|
+
print(f"Delete Local: {local_path_to_delete} ({reason})")
|
475
|
+
if not dry_run:
|
476
|
+
try:
|
477
|
+
os.remove(local_path_to_delete)
|
478
|
+
deletions_done += 1
|
479
|
+
except OSError as e:
|
480
|
+
print(f"ERROR deleting local file {local_path_to_delete}: {e}")
|
481
|
+
|
482
|
+
if s3_deletions_batch:
|
483
|
+
target_bucket_del_batch = next(
|
484
|
+
(
|
485
|
+
a["dest_bucket"]
|
486
|
+
for a in actions_to_perform
|
487
|
+
if a["action"] == "delete_s3"
|
488
|
+
),
|
489
|
+
None,
|
490
|
+
)
|
491
|
+
if not dry_run and target_bucket_del_batch:
|
492
|
+
deleted_count_batch = 0
|
493
|
+
for i in range(0, len(s3_deletions_batch), 1000):
|
494
|
+
batch = s3_deletions_batch[i : i + 1000]
|
495
|
+
delete_payload = {"Objects": batch, "Quiet": False}
|
496
|
+
try:
|
497
|
+
response = self.client.delete_objects(
|
498
|
+
Bucket=target_bucket_del_batch, Delete=delete_payload
|
499
|
+
)
|
500
|
+
deleted_count_batch += len(batch)
|
501
|
+
if "Errors" in response and response["Errors"]:
|
502
|
+
deleted_count_batch -= len(response["Errors"])
|
503
|
+
for error in response["Errors"]:
|
504
|
+
print(
|
505
|
+
f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
|
506
|
+
)
|
507
|
+
except ClientError as e:
|
508
|
+
print(f"ERROR deleting S3 objects batch: {e}")
|
509
|
+
deleted_count_batch = 0
|
510
|
+
except Exception as e:
|
511
|
+
print(f"ERROR deleting S3 objects batch: {e}")
|
512
|
+
deleted_count_batch = 0
|
513
|
+
deletions_done += deleted_count_batch
|
514
|
+
elif target_bucket_del_batch:
|
515
|
+
deletions_done = len(s3_deletions_batch)
|
527
516
|
else:
|
528
517
|
print(
|
529
|
-
|
518
|
+
"Warning: Could not determine target bucket for S3 deletion batch."
|
530
519
|
)
|
531
520
|
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
)
|
555
|
-
if not dry_run and target_bucket_del_batch:
|
556
|
-
deleted_count_batch = 0
|
557
|
-
for i in range(0, len(s3_deletions_batch), 1000):
|
558
|
-
batch = s3_deletions_batch[i : i + 1000]
|
559
|
-
delete_payload = {"Objects": batch, "Quiet": False} # Get errors back
|
560
|
-
try:
|
561
|
-
response = s3_client.delete_objects(
|
562
|
-
Bucket=target_bucket_del_batch, Delete=delete_payload
|
563
|
-
)
|
564
|
-
# Increment count based on successful deletions reported (if not Quiet) or assume success if Quiet
|
565
|
-
deleted_count_batch += len(
|
566
|
-
batch
|
567
|
-
) # Assume success unless errors reported
|
568
|
-
if "Deleted" in response:
|
569
|
-
pass # Counted optimistically above
|
570
|
-
# deleted_count_batch += len(response['Deleted'])
|
571
|
-
if "Errors" in response and response["Errors"]:
|
572
|
-
deleted_count_batch -= len(
|
573
|
-
response["Errors"]
|
574
|
-
) # Adjust count for errors
|
575
|
-
for error in response["Errors"]:
|
576
|
-
print(
|
577
|
-
f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
|
578
|
-
)
|
579
|
-
except ClientError as e:
|
580
|
-
print(f"ERROR deleting S3 objects batch: {e}")
|
581
|
-
deleted_count_batch = 0 # Assume batch failed
|
582
|
-
except Exception as e:
|
583
|
-
print(f"ERROR deleting S3 objects batch: {e}")
|
584
|
-
deleted_count_batch = 0 # Assume batch failed
|
585
|
-
deletions_done += deleted_count_batch
|
586
|
-
elif target_bucket_del_batch: # dry_run is True
|
587
|
-
deletions_done = len(
|
588
|
-
s3_deletions_batch
|
589
|
-
) # Report planned deletions for dry run
|
521
|
+
if dry_run:
|
522
|
+
if self.verbose:
|
523
|
+
upload_count = sum(
|
524
|
+
1 for a in actions_to_perform if a["action"] == "upload"
|
525
|
+
)
|
526
|
+
download_count = sum(
|
527
|
+
1 for a in actions_to_perform if a["action"] == "download"
|
528
|
+
)
|
529
|
+
delete_s3_count = len(s3_deletions_batch)
|
530
|
+
delete_local_count = sum(
|
531
|
+
1 for a in actions_to_perform if a["action"] == "delete_local"
|
532
|
+
)
|
533
|
+
print("\n--- DRY RUN SUMMARY ---")
|
534
|
+
if sync_direction == "upload":
|
535
|
+
print(f"Would upload: {upload_count} file(s)")
|
536
|
+
if delete:
|
537
|
+
print(f"Would delete from S3: {delete_s3_count} object(s)")
|
538
|
+
elif sync_direction == "download":
|
539
|
+
print(f"Would download: {download_count} file(s)")
|
540
|
+
if delete:
|
541
|
+
print(f"Would delete locally: {delete_local_count} file(s)")
|
542
|
+
print("--- END DRY RUN ---")
|
590
543
|
else:
|
591
|
-
|
544
|
+
if self.verbose:
|
545
|
+
if sync_direction == "upload":
|
546
|
+
print(
|
547
|
+
f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
|
548
|
+
)
|
549
|
+
elif sync_direction == "download":
|
550
|
+
print(
|
551
|
+
f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
|
552
|
+
)
|
592
553
|
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
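A short usage sketch of the new `sync` method defined above; the bucket name and paths are placeholders, and the import path is assumed from this package layout. With `dry_run=True` the method only prints a plan, and the 2-second mtime tolerance avoids spurious re-uploads from S3's coarser timestamps:

```python
from nebu.data import Bucket  # import path assumed

bucket = Bucket(verbose=True)

# Preview an upload: local directory -> S3 prefix, pruning extraneous objects.
bucket.sync("/data/run1", "s3://my-bucket/checkpoints/run1", delete=True, dry_run=True)

# Then perform it, or go the other way: S3 prefix -> local directory.
bucket.sync("s3://my-bucket/checkpoints/run1", "/data/restore")
```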
```diff
+    def copy(
+        self,
+        source: str,
+        destination: str,
+    ) -> None:
+        """
+        Copies files or directories between local paths and S3 URIs.
+        Handles:
+        - Local file to S3 object
+        - Local directory to S3 prefix (recursive)
+        - S3 object to local file
+        - S3 prefix to local directory (recursive)
+        Does NOT handle:
+        - Local to Local (use shutil)
+        - S3 to S3 (use AWS CLI or boto3 object copy)
+        Args:
+            source (str): The source path (local file/dir or s3://...).
+            destination (str): The destination path (local file/dir or s3://...).
+        """
+        src_bucket, src_prefix = self._parse_path(source)
+        dest_bucket, dest_prefix = self._parse_path(destination)
+
+        if src_bucket is None and dest_bucket is None:
             print(
+                "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
             )
+            return
+        if src_bucket is not None and dest_bucket is not None:
             print(
+                "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
             )
+            return
-def s3_check(s3_uri: str) -> bool:
-    """
-    Check if an object or prefix exists in an S3 bucket using an S3 URI.
-
-    Args:
-        s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
-        Use a trailing '/' to check for a prefix/directory.
-
-    Returns:
-        bool: True if the object or prefix exists, False otherwise.
-    """
-    s3 = boto3.client("s3")
-    bucket_name, s3_key = _parse_s3_path(s3_uri)
-
-    if bucket_name is None or s3_key is None:
-        # _parse_s3_path returns None, None if scheme is not 's3'
-        print(f"Error: Invalid S3 URI format: {s3_uri}")
-        return False
-
-    is_prefix = s3_key.endswith("/")
-
-    try:
-        if is_prefix:
-            # Check for prefix existence by listing objects
-            # Handle the case where s3_key might be empty if URI is just s3://bucket/
-            list_prefix = s3_key if s3_key else ""
-            response = s3.list_objects_v2(
-                Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
-            )
-            return "Contents" in response or "CommonPrefixes" in response
-        else:
-            # Check for object existence
-            s3.head_object(Bucket=bucket_name, Key=s3_key)
-            return True
-    except ClientError as e:  # Catch boto3 ClientError first
-        # If head_object returns 404 (NoSuchKey), the object doesn't exist
-        # list_objects_v2 does not raise NoSuchKey for prefixes
-        if e.response["Error"]["Code"] == "404":
-            return False
-        elif e.response["Error"]["Code"] == "NoSuchBucket":
-            print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
-            return False
-        # Handle other potential errors like AccessDenied differently if needed
-        print(f"Error checking {s3_uri}: {e}")
-        return False
-    # except s3.exceptions.NoSuchBucket: # This specific exception is less common with boto3 client
-    #     print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
-    #     return False
-    except Exception as e:
-        print(f"An unexpected error occurred checking {s3_uri}: {e}")
-        return False
-
-
-def s3_copy(
-    source: str,
-    destination: str,
-    verbose: bool = True,
-) -> None:
-    """
-    Copies files or directories between local paths and S3 URIs.
-# … remainder of the old module-level s3_copy implementation removed (its upload and
-#  download logic is carried over into the copy method below; only fragments were
-#  captured in this rendering) …
+
+        # Upload: Local to S3
+        if src_bucket is None and dest_bucket is not None:
+            if not os.path.exists(source):
+                print(f"Error: Local source path not found: {source}")
+                return
+            current_dest_prefix = dest_prefix or ""
+
+            if os.path.isfile(source):
+                if not current_dest_prefix or destination.endswith("/"):
+                    s3_key = os.path.join(
+                        current_dest_prefix, os.path.basename(source)
+                    ).replace("\\", "/")
+                else:
+                    s3_key = current_dest_prefix
+                if self.verbose:
+                    print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
+                try:
+                    self.client.upload_file(source, dest_bucket, s3_key)
+                    if self.verbose:
+                        print("Upload complete.")
+                except ClientError as e:
+                    print(f"ERROR uploading {source}: {e}")
+                except Exception as e:
+                    print(f"ERROR uploading {source}: {e}")
+
+            elif os.path.isdir(source):
+                if self.verbose:
+                    print(
+                        f"Uploading directory {source}/* to s3://{dest_bucket}/{current_dest_prefix}/"
+                    )
+                files_uploaded = files_failed = 0
+                for root, _, files in os.walk(source):
+                    for file in files:
+                        local_path = os.path.join(root, file)
+                        relative_path = os.path.relpath(local_path, source)
+                        s3_key = os.path.join(
+                            current_dest_prefix, relative_path
+                        ).replace("\\", "/")
+                        if self.verbose:
+                            print(
+                                f"  Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
+                            )
+                        try:
+                            self.client.upload_file(local_path, dest_bucket, s3_key)
+                            files_uploaded += 1
+                        except ClientError as e:
+                            print(f"  ERROR uploading {local_path}: {e}")
+                            files_failed += 1
+                        except Exception as e:
+                            print(f"  ERROR uploading {local_path}: {e}")
+                            files_failed += 1
+                if self.verbose:
+                    print(
+                        f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
+                    )
+            else:
+                print(f"Error: Source {source} is neither a file nor a directory.")
+
+        # Download: S3 to Local
+        elif src_bucket is not None and dest_bucket is None:
+            is_prefix_download = False
+            single_object_key = None
+            current_src_prefix = src_prefix or ""  # Ensure not None
+
+            if source.endswith("/"):
+                is_prefix_download = True
+            else:
+                try:
+                    if current_src_prefix:
+                        self.client.head_object(
+                            Bucket=src_bucket, Key=current_src_prefix
+                        )
+                        single_object_key = current_src_prefix
+                    else:
+                        # Path like s3://bucket, treat as prefix download
+                        is_prefix_download = True
+                except ClientError as e:
+                    if e.response["Error"]["Code"] == "404":
+                        is_prefix_download = True  # Assume prefix if object not found
+                    elif e.response["Error"]["Code"] == "NoSuchBucket":
+                        print(f"Error: Source bucket '{src_bucket}' not found.")
+                        return
+                    else:
                         print(
+                            f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
                         )
+                        return
+                except Exception as e:
+                    print(
+                        f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
+                    )
                     return
+
+            if single_object_key is not None:
+                if os.path.isdir(destination) or destination.endswith(os.sep):
+                    local_dest_path = os.path.join(
+                        destination, os.path.basename(single_object_key)
+                    )
+                    os.makedirs(destination, exist_ok=True)
                 else:
+                    local_dest_path = destination
+                    parent_dir = os.path.dirname(local_dest_path)
+                    if parent_dir:
+                        os.makedirs(parent_dir, exist_ok=True)
+                if self.verbose:
                     print(
+                        f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
                     )
+                try:
+                    self.client.download_file(
+                        src_bucket, single_object_key, local_dest_path
+                    )
+                    if self.verbose:
+                        print("Download complete.")
+                except ClientError as e:
+                    print(f"ERROR downloading {single_object_key}: {e}")
+                except OSError as e:
+                    print(f"ERROR creating/writing {local_dest_path}: {e}")
+                except Exception as e:
+                    print(f"ERROR downloading {single_object_key}: {e}")
+
+            elif is_prefix_download:
+                if os.path.exists(destination) and not os.path.isdir(destination):
+                    print(
+                        f"Error: Local destination '{destination}' exists but is not a directory."
+                    )
+                    return
                 os.makedirs(destination, exist_ok=True)
+                if self.verbose:
+                    print(
+                        f"Downloading prefix s3://{src_bucket}/{current_src_prefix}/* to {destination}/"
+                    )
+                paginator = self.client.get_paginator("list_objects_v2")
+                files_downloaded = files_failed = 0
+                operation_parameters = {"Bucket": src_bucket}
+                # The problematic line for the linter, re-adding type ignore
+                if current_src_prefix:  # type: ignore
+                    operation_parameters["Prefix"] = current_src_prefix
+                try:
+                    page_iterator = paginator.paginate(**operation_parameters)
+                    found_objects = False
+                    for page in page_iterator:
+                        if "Contents" in page:
+                            found_objects = True
+                            for obj in page["Contents"]:
+                                s3_key = obj["Key"]
+                                if s3_key.endswith("/") and obj["Size"] == 0:
+                                    continue
                                 relative_key = s3_key
+                                if current_src_prefix:
+                                    if s3_key.startswith(current_src_prefix):
+                                        if s3_key == current_src_prefix.rstrip("/"):
+                                            relative_key = os.path.basename(s3_key)
+                                        else:
+                                            prefix_adjusted = current_src_prefix + (
+                                                ""
+                                                if current_src_prefix.endswith("/")
+                                                else "/"
+                                            )
+                                            if s3_key.startswith(prefix_adjusted):
+                                                relative_key = s3_key[
+                                                    len(prefix_adjusted) :
+                                                ]
+                                            elif not current_src_prefix.endswith("/"):
+                                                relative_key = s3_key[
+                                                    len(current_src_prefix) :
+                                                ].lstrip("/")
+                                if not relative_key:
+                                    continue
+                                local_dest_path = os.path.join(
+                                    destination, relative_key.replace("/", os.sep)
+                                )
+                                local_dest_dir = os.path.dirname(local_dest_path)
+                                if self.verbose:
+                                    print(
+                                        f"  Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
+                                    )
+                                try:
+                                    if local_dest_dir:
+                                        os.makedirs(local_dest_dir, exist_ok=True)
+                                    self.client.download_file(
+                                        src_bucket, s3_key, local_dest_path
+                                    )
+                                    files_downloaded += 1
+                                except ClientError as e:
+                                    print(f"  ERROR downloading {s3_key}: {e}")
+                                    files_failed += 1
+                                except OSError as e:
+                                    print(
+                                        f"  ERROR creating/writing {local_dest_path}: {e}"
+                                    )
+                                    files_failed += 1
+                                except Exception as e:
+                                    print(f"  ERROR downloading {s3_key}: {e}")
+                                    files_failed += 1
+                    if not found_objects and self.verbose:
+                        print(
+                            f"Warning: No objects found at source prefix s3://{src_bucket}/{current_src_prefix}"
+                        )
+                    if self.verbose:
+                        print(
+                            f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
+                        )
+                except ClientError as e:
+                    if e.response["Error"]["Code"] == "NoSuchBucket":
+                        print(f"Error: Source bucket '{src_bucket}' not found.")
+                    else:
+                        print(
+                            f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+                        )
+                except Exception as e:
+                    print(
+                        f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+                    )
+        else:
+            print("Error: Unknown copy operation type.")
 
```
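A corresponding sketch for `copy`, continuing the example above: it handles single objects and recursive prefixes but deliberately refuses local-to-local and S3-to-S3 transfers. Paths are hypothetical:

```python
# A destination ending in "/" keeps the source basename as the key/filename.
bucket.copy("/data/model.pt", "s3://my-bucket/models/")          # file -> S3 prefix
bucket.copy("s3://my-bucket/models/model.pt", "/data/restore/")  # object -> local dir
bucket.copy("s3://my-bucket/models/", "/data/restore")           # trailing "/" forces a recursive prefix download
```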
```diff
-# … removed: tail of the old s3_copy prefix-download loop (error handling and summary
-#  prints); only fragments were captured in this rendering …
+    def check(self, s3_uri: str) -> bool:
+        """
+        Check if an object or prefix exists in an S3 bucket using an S3 URI.
+
+        Args:
+            s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
+                Use a trailing '/' to check for a prefix/directory.
+
+        Returns:
+            bool: True if the object or prefix exists, False otherwise.
+        """
+        # Use the class client and parse method
+        bucket_name, s3_key = self._parse_path(s3_uri)
+
+        if bucket_name is None or s3_key is None:
+            # _parse_path returns None, None if scheme is not 's3'
+            print(f"Error: Invalid S3 URI format: {s3_uri}")
+            return False
+
+        is_prefix = s3_key.endswith("/")
+
+        try:
+            if is_prefix:
+                # Check for prefix existence by listing objects
+                # Handle the case where s3_key might be empty if URI is just s3://bucket/
+                list_prefix = s3_key if s3_key else ""
+                response = self.client.list_objects_v2(
+                    Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
+                )
+                # Check if any objects OR common prefixes (folders) are returned for the prefix
+                return "Contents" in response or "CommonPrefixes" in response
+            else:
+                # Check for object existence
+                self.client.head_object(Bucket=bucket_name, Key=s3_key)
+                return True
+        except ClientError as e:  # Catch boto3 ClientError first
+            # If head_object returns 404 (NoSuchKey), the object doesn't exist
+            # list_objects_v2 does not raise NoSuchKey for prefixes
+            if e.response["Error"]["Code"] == "404":
+                return False
+            elif e.response["Error"]["Code"] == "NoSuchBucket":
+                if self.verbose:
                     print(
+                        f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri})."
                     )
+                return False
+            # Handle other potential errors like AccessDenied differently if needed
+            print(f"Error checking {s3_uri}: {e}")
+            return False
+        except Exception as e:
+            print(f"An unexpected error occurred checking {s3_uri}: {e}")
+            return False
```