nebu 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nebu/data.py CHANGED
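Summary: the module-level helpers `_list_s3_objects`, `_list_local_files`, `s3_sync`, `s3_check`, and `s3_copy` are consolidated into a new `Bucket` class (`_list_objects`, `_list_local`, `sync`, `check`, `copy`), while `_parse_s3_path` remains a standalone helper that the class reuses. A usage sketch follows the diff below.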
@@ -104,7 +104,7 @@ def find_latest_checkpoint(training_dir: str) -> Optional[str]:


  def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
- """Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+ """Standalone helper: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
  parsed = urlparse(path)
  if parsed.scheme != "s3":
  return None, None
@@ -113,840 +113,743 @@ def _parse_s3_path(path: str) -> Tuple[Optional[str], Optional[str]]:
  return bucket, prefix


- def _list_s3_objects(
- s3_client: Any, bucket: str, prefix: Optional[str], verbose: bool = True
- ) -> Dict[str, Dict[str, Any]]:
- """Lists objects in an S3 prefix."""
- objects: Dict[str, Dict[str, Any]] = {}
- paginator = s3_client.get_paginator("list_objects_v2")
- list_prefix = (
- prefix if prefix else ""
- ) # Use empty string if prefix is None for listing
- if verbose:
- print(f"Listing objects in s3://{bucket}/{list_prefix}...")
-
- operation_parameters = {"Bucket": bucket}
- # Only add Prefix parameter if it's non-empty
- if list_prefix:
- operation_parameters["Prefix"] = list_prefix
-
- try:
- page_iterator = paginator.paginate(**operation_parameters)
- for page in page_iterator:
- if "Contents" in page:
- for obj in page["Contents"]:
- # Ignore zero-byte objects ending in '/' (S3 console folders)
- if obj["Key"].endswith("/") and obj["Size"] == 0:
- continue
-
- # Determine key relative to the *prefix* for comparison
- # If prefix is None or empty, relative key is the full key.
- relative_key: Optional[str] = None
- if prefix and obj["Key"].startswith(prefix):
- # Ensure trailing slash consistency if prefix has one
- prefix_adjusted = (
- prefix if prefix.endswith("/") else prefix + "/"
- )
- # Handle exact match of prefix as a file
- if obj["Key"] == prefix:
- relative_key = os.path.basename(obj["Key"])
- # Handle keys within the prefix "directory"
- elif obj["Key"].startswith(prefix_adjusted):
- relative_key = obj["Key"][len(prefix_adjusted) :]
- # This case should technically not be needed if prefix is used correctly in listing
- # but handle defensively if object key *is* the prefix itself (without trailing slash)
- elif obj["Key"] == prefix.rstrip("/"):
- relative_key = os.path.basename(obj["Key"])
- # else: # Should not happen if prefix filter works correctly
- # print(f"Warning: Unexpected key {obj['Key']} found for prefix {prefix}")
- # relative_key = obj["Key"] # Fallback
- elif not prefix:
- # If no prefix specified, the relative key is the full key
- relative_key = obj["Key"]
- # else: obj["Key"] does not start with prefix - ignore (shouldn't happen with Prefix param)
-
- # Skip if relative key is empty or None (e.g., prefix itself listed, or unexpected case)
- if not relative_key:
- continue
-
- # Ensure LastModified is timezone-aware
- last_modified = obj["LastModified"]
- if last_modified.tzinfo is None:
- last_modified = last_modified.replace(tzinfo=timezone.utc)
-
- objects[relative_key] = {
- "path": f"s3://{bucket}/{obj['Key']}", # Store full path for reference
- "key": obj["Key"], # Store full S3 key
- "size": obj["Size"],
- "mtime": last_modified,
- "type": "s3",
- }
- except ClientError as e:
- if e.response["Error"]["Code"] == "NoSuchBucket":
- print(f"Error: Bucket '{bucket}' not found.")
- # Allow sync *to* a non-existent prefix (will just upload all)
- elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
- if verbose:
- print(f"Prefix s3://{bucket}/{prefix} not found (treating as empty).")
- else:
- print(f"Error listing S3 objects: {e}")
- # Return empty dict on error that prevents listing (like NoSuchBucket)
- if e.response["Error"]["Code"] == "NoSuchBucket":
+ class Bucket:
+ """Handles interactions with AWS S3."""
+
+ def __init__(self, verbose: bool = True):
+ """
+ Initializes the S3 handler.
+
+ Args:
+ verbose (bool): If True, prints status messages. Defaults to True.
+ """
+ self.client = boto3.client("s3")
+ self.verbose = verbose
+
+ def _parse_path(self, path: str) -> Tuple[Optional[str], Optional[str]]:
+ """Class method: Parses an S3 path (s3://bucket/prefix) into bucket and prefix."""
+ # Reusing the standalone logic here for consistency
+ return _parse_s3_path(path)
+
+ def _list_objects(
+ self, bucket: str, prefix: Optional[str]
+ ) -> Dict[str, Dict[str, Any]]:
+ """Class method: Lists objects in an S3 prefix."""
+ objects: Dict[str, Dict[str, Any]] = {}
+ paginator = self.client.get_paginator("list_objects_v2")
+ list_prefix = prefix or ""
+ if self.verbose:
+ print(f"Listing objects in s3://{bucket}/{list_prefix}...")
+
+ operation_parameters = {"Bucket": bucket}
+ if list_prefix:
+ operation_parameters["Prefix"] = list_prefix
+
+ try:
+ page_iterator = paginator.paginate(**operation_parameters)
+ for page in page_iterator:
+ if "Contents" in page:
+ for obj in page["Contents"]:
+ if obj["Key"].endswith("/") and obj["Size"] == 0:
+ continue
+ relative_key: Optional[str] = None
+ current_prefix = prefix or ""
+ if current_prefix and obj["Key"].startswith(current_prefix):
+ prefix_adjusted = current_prefix + (
+ "" if current_prefix.endswith("/") else "/"
+ )
+ if obj["Key"] == current_prefix.rstrip("/"):
+ relative_key = os.path.basename(obj["Key"])
+ elif obj["Key"].startswith(prefix_adjusted):
+ relative_key = obj["Key"][len(prefix_adjusted) :]
+ else:
+ potential_rel_key = obj["Key"][len(current_prefix) :]
+ relative_key = potential_rel_key.lstrip("/")
+ elif not current_prefix:
+ relative_key = obj["Key"]
+ if not relative_key:
+ continue
+ last_modified = obj["LastModified"]
+ if last_modified.tzinfo is None:
+ last_modified = last_modified.replace(tzinfo=timezone.utc)
+ objects[relative_key] = {
+ "path": f"s3://{bucket}/{obj['Key']}",
+ "key": obj["Key"],
+ "size": obj["Size"],
+ "mtime": last_modified,
+ "type": "s3",
+ }
+ except ClientError as e:
+ if e.response["Error"]["Code"] == "NoSuchBucket":
+ if self.verbose:
+ print(f"Error: Bucket '{bucket}' not found.")
+ elif e.response["Error"]["Code"] == "NoSuchKey" and prefix:
+ if self.verbose:
+ print(
+ f"Prefix s3://{bucket}/{prefix} not found (treating as empty)."
+ )
+ else:
+ print(f"Error listing S3 objects: {e}")
+ if e.response["Error"]["Code"] == "NoSuchBucket":
+ return {}
+ except Exception as e:
+ print(f"An unexpected error occurred listing S3 objects: {e}")
  return {}
- except Exception as e:
- print(f"An unexpected error occurred listing S3 objects: {e}")
- return {} # Return empty on unexpected error
-
- if verbose:
- print(f"Found {len(objects)} objects in S3.")
- return objects
-
-
- def _list_local_files(
- local_dir: str, verbose: bool = True
- ) -> Dict[str, Dict[str, Any]]:
- """Lists files in a local directory."""
- if not os.path.isdir(local_dir):
- # Check if it's a file path instead of a dir
+ if self.verbose:
+ print(f"Found {len(objects)} objects in S3.")
+ return objects
+
+ def _list_local(self, local_dir: str) -> Dict[str, Dict[str, Any]]:
+ """Class method: Lists files in a local directory."""
+ files: Dict[str, Dict[str, Any]] = {}
+ if not os.path.exists(local_dir):
+ if self.verbose:
+ print(
+ f"Warning: Local path not found: {local_dir} (treating as empty)."
+ )
+ return files
  if os.path.isfile(local_dir):
- print(
- f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
- )
+ if self.verbose:
+ print(
+ f"Warning: Source {local_dir} is a file, not a directory. Syncing single file."
+ )
  try:
- local_size = os.path.getsize(local_dir)
- local_mtime_ts = os.path.getmtime(local_dir)
- local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
  file_name = os.path.basename(local_dir)
- return {
- file_name: {
- "path": local_dir,
- "size": local_size,
- "mtime": local_mtime,
- "type": "local",
- }
+ files[file_name] = {
+ "path": local_dir,
+ "size": os.path.getsize(local_dir),
+ "mtime": datetime.fromtimestamp(
+ os.path.getmtime(local_dir), tz=timezone.utc
+ ),
+ "type": "local",
  }
  except OSError as e:
  print(f"Error accessing source file {local_dir}: {e}")
- return {}
- else:
- print(f"Warning: Local path not found: {local_dir} (treating as empty).")
- return {}
-
- files: Dict[str, Dict[str, Any]] = {}
- if verbose:
- print(f"Scanning local directory: {local_dir}...")
- for root, _, file_list in os.walk(local_dir):
- for file_name in file_list:
- local_path = os.path.join(root, file_name)
- try:
- # Use '/' for relative key consistency
- relative_path = os.path.relpath(local_path, local_dir).replace(
- "\\", "/"
+ return files
+ if self.verbose:
+ print(f"Scanning local directory: {local_dir}...")
+ for root, _, file_list in os.walk(local_dir):
+ for file_name in file_list:
+ local_path = os.path.join(root, file_name)
+ try:
+ relative_path = os.path.relpath(local_path, local_dir).replace(
+ "\\", "/"
+ )
+ files[relative_path] = {
+ "path": local_path,
+ "size": os.path.getsize(local_path),
+ "mtime": datetime.fromtimestamp(
+ os.path.getmtime(local_path), tz=timezone.utc
+ ),
+ "type": "local",
+ }
+ except OSError as e:
+ print(f"Warning: Could not get metadata for {local_path}: {e}")
+ except Exception as e:
+ print(f"Warning: Unexpected error processing {local_path}: {e}")
+ if self.verbose:
+ print(f"Found {len(files)} files locally.")
+ return files
+
+ def sync(
+ self,
+ source: str,
+ destination: str,
+ delete: bool = False,
+ dry_run: bool = False,
+ ) -> None:
+ """
+ Synchronizes files between a source and a destination (local or S3).
+ Compares file sizes and modification times. Copies if missing, larger, or newer.
+ Optionally deletes extraneous files from the destination.
+ Args:
+ source (str): The source path (local directory/file or s3://...).
+ destination (str): The destination path (local directory or s3://...).
+ delete (bool): If True, delete extraneous files from the destination.
+ dry_run (bool): If True, print actions without performing them.
+ """
+ mtime_tolerance = timedelta(seconds=2)
+ src_bucket, src_prefix = self._parse_path(source)
+ dest_bucket, dest_prefix = self._parse_path(destination)
+ source_items: Dict[str, Dict[str, Any]] = {}
+ dest_items: Dict[str, Dict[str, Any]] = {}
+ sync_direction = ""
+ is_single_file_sync = False
+
+ if src_bucket is None and dest_bucket is not None:
+ sync_direction = "upload"
+ source_items = self._list_local(source)
+ dest_items = self._list_objects(dest_bucket, dest_prefix)
+ if not source_items and not os.path.exists(source):
+ print(
+ f"Error: Source path {source} not found and is not empty."
+ ) # Check needed? list_local handles it.
+ # return # Let it proceed if source is just empty
+ if os.path.isfile(source):
+ is_single_file_sync = True
+ # current_dest_prefix = dest_prefix or "" # Moved closer to usage
+
+ elif src_bucket is not None and dest_bucket is None:
+ sync_direction = "download"
+ source_items = self._list_objects(src_bucket, src_prefix)
+ if os.path.exists(destination) and not os.path.isdir(destination):
+ print(
+ f"Error: Local destination '{destination}' exists but is not a directory."
  )
- # relative_path will be '.' if local_dir points to a file, handled above.
-
- local_size = os.path.getsize(local_path)
- local_mtime_ts = os.path.getmtime(local_path)
- local_mtime = datetime.fromtimestamp(local_mtime_ts, tz=timezone.utc)
-
- files[relative_path] = {
- "path": local_path,
- "size": local_size,
- "mtime": local_mtime,
- "type": "local",
- }
- except OSError as e:
- print(f"Warning: Could not get metadata for {local_path}: {e}")
- except Exception as e:
- print(f"Warning: Unexpected error processing {local_path}: {e}")
-
- if verbose:
- print(f"Found {len(files)} files locally.")
- return files
-
-
- def s3_sync(
- source: str,
- destination: str,
- delete: bool = False,
- dry_run: bool = False,
- verbose: bool = True,
- ) -> None:
- """
- Synchronizes files between a source and a destination, which can be
- local paths or S3 paths (e.g., 's3://my-bucket/my-prefix').
-
- Compares file sizes and modification times. Copies files from source
- to destination if they are missing, larger, or newer in the source.
- Optionally deletes files from the destination if they are not present
- in the source.
+ return
+ dest_items = self._list_local(destination)
+ if not dry_run:
+ os.makedirs(destination, exist_ok=True)
+ elif not os.path.isdir(destination) and self.verbose:
+ print(f"Dry run: Would create local directory {destination}")

- Args:
- source (str): The source path (local directory/file or s3://...).
- destination (str): The destination path (local directory or s3://...).
- delete (bool): If True, delete extraneous files from the destination.
- dry_run (bool): If True, print actions without performing them.
- verbose (bool): If True, print actions being taken.
- """
- s3_client = boto3.client("s3")
- mtime_tolerance = timedelta(
- seconds=2
- ) # S3 mtime might not have sub-second precision
-
- src_bucket, src_prefix = _parse_s3_path(source)
- dest_bucket, dest_prefix = _parse_s3_path(destination)
-
- source_items: Dict[str, Dict[str, Any]] = {}
- dest_items: Dict[str, Dict[str, Any]] = {}
- sync_direction = ""
- is_single_file_sync = False
-
- # Determine sync direction and list items
- if src_bucket is None and dest_bucket is not None:
- sync_direction = "upload"
- source_items = _list_local_files(source, verbose)
- dest_items = _list_s3_objects(s3_client, dest_bucket, dest_prefix, verbose)
- # Check if source exists (either dir or file)
- if not os.path.exists(source):
- print(f"Error: Source path {source} not found.")
+ elif src_bucket is None and dest_bucket is None:
+ print(
+ "Error: Both source and destination are local paths. Use standard file copy tools."
+ )
  return
- is_single_file_sync = os.path.isfile(source)
- # Destination prefix defaults to empty if not specified
- if dest_prefix is None:
- dest_prefix = ""
-
- elif src_bucket is not None and dest_bucket is None:
- sync_direction = "download"
- source_items = _list_s3_objects(s3_client, src_bucket, src_prefix, verbose)
- # For download, destination MUST be a directory (or created as one)
- # If destination exists and is a file, it's an error.
- if os.path.exists(destination) and not os.path.isdir(destination):
+ elif src_bucket is not None and dest_bucket is not None:
  print(
- f"Error: Local destination '{destination}' exists but is not a directory."
+ "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
  )
  return
+ else:
+ print("Error: Invalid source or destination path combination.")
+ return

- dest_items = _list_local_files(destination, verbose)
-
- # Ensure destination directory exists for downloads
- if not dry_run:
- os.makedirs(destination, exist_ok=True)
- elif not os.path.isdir(destination) and verbose:
- print(f"Dry run: Would create local directory {destination}")
+ actions_to_perform: List[Dict[str, Any]] = []
+ source_keys = set(source_items.keys())
+ dest_keys = set(dest_items.keys())

- elif src_bucket is None and dest_bucket is None:
- print(
- "Error: Both source and destination are local paths. Use standard file copy tools."
- )
- return
- elif src_bucket is not None and dest_bucket is not None:
- print(
- "Error: S3 to S3 sync not implemented. Use AWS CLI or S3 Batch Operations."
- )
- return
- else:
- # This case should not be reachable given the above checks
- print("Error: Invalid source or destination path combination.")
- return
-
- actions_to_perform: List[Dict[str, Any]] = []
-
- # --- Compare items ---
- # Use source keys as the primary loop iterator
- source_keys = set(source_items.keys())
- dest_keys = set(dest_items.keys())
-
- for rel_key in source_keys:
- src_item = source_items[rel_key]
- dest_item = dest_items.get(rel_key)
- reason = ""
-
- if dest_item is None:
- reason = "does not exist in destination"
- else:
- # Compare metadata (size and mtime)
- if src_item["size"] != dest_item["size"]:
- reason = (
- f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
+ for rel_key in source_keys:
+ src_item = source_items[rel_key]
+ dest_item = dest_items.get(rel_key)
+ reason = ""
+ if dest_item is None:
+ reason = "does not exist in destination"
+ else:
+ if src_item["size"] != dest_item["size"]:
+ reason = f"size differs (src: {src_item['size']}, dest: {dest_item['size']})"
+ elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
+ reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
+ if reason:
+ action_type = "upload" if sync_direction == "upload" else "download"
+ dest_full_path_or_key: Optional[str] = None
+ if sync_direction == "upload":
+ # Define current_dest_prefix here, just before use
+ current_dest_prefix = dest_prefix or ""
+ final_dest_key = (
+ rel_key
+ if is_single_file_sync
+ else os.path.join(current_dest_prefix, rel_key).replace(
+ "\\", "/"
+ )
+ )
+ if not current_dest_prefix and final_dest_key.startswith("/"):
+ final_dest_key = final_dest_key.lstrip("/")
+ dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
+ else:
+ dest_full_path_or_key = os.path.join(
+ destination, rel_key.replace("/", os.sep)
+ )
+ actions_to_perform.append(
+ {
+ "action": action_type,
+ "relative_key": rel_key,
+ "source_path": src_item["path"],
+ "source_mtime": src_item.get("mtime"),
+ "dest_full_path_or_key": dest_full_path_or_key,
+ "dest_bucket": dest_bucket,
+ "dest_prefix": dest_prefix,
+ "s3_key_full_src": src_item.get("key")
+ if sync_direction == "download"
+ else None,
+ "source_bucket": src_bucket,
+ "reason": reason,
+ }
  )
- # Sync if source is newer (outside tolerance)
- elif src_item["mtime"] > (dest_item["mtime"] + mtime_tolerance):
- reason = f"is newer in source (src: {src_item['mtime']}, dest: {dest_item['mtime']})"
-
- if reason:
- action_type = "upload" if sync_direction == "upload" else "download"
- # Determine the final destination key/path
- dest_full_path_or_key: Optional[str] = None
- if sync_direction == "upload":
- # If uploading single file, dest key is prefix + filename
- # If uploading dir, dest key is prefix + relative key
- # Ensure dest_prefix is treated as empty string if None
- current_dest_prefix = dest_prefix or ""
- final_dest_key = (
- rel_key
- if is_single_file_sync
- else os.path.join(current_dest_prefix, rel_key).replace("\\", "/")
+
+ if delete:
+ keys_to_delete = dest_keys - source_keys
+ for rel_key in keys_to_delete:
+ dest_item = dest_items[rel_key]
+ action_type = (
+ "delete_s3" if sync_direction == "upload" else "delete_local"
  )
- # Ensure we don't create keys like 's3://bucket//key' if prefix was empty
- if not current_dest_prefix and final_dest_key.startswith("/"):
- final_dest_key = final_dest_key.lstrip("/")
- dest_full_path_or_key = f"s3://{dest_bucket}/{final_dest_key}"
- else: # download
- dest_full_path_or_key = os.path.join(
- destination, rel_key.replace("/", os.sep)
+ actions_to_perform.append(
+ {
+ "action": action_type,
+ "relative_key": rel_key,
+ "path_to_delete": dest_item["path"],
+ "s3_key_full_dest": dest_item.get("key")
+ if sync_direction == "upload"
+ else None,
+ "dest_bucket": dest_bucket,
+ "reason": "does not exist in source",
+ }
  )

- actions_to_perform.append(
- {
- "action": action_type,
- "relative_key": rel_key,
- "source_path": src_item["path"], # Local path or S3 URI
- "source_mtime": src_item.get("mtime"),
- "dest_full_path_or_key": dest_full_path_or_key,
- # Store details needed for specific actions
- "dest_bucket": dest_bucket,
- "dest_prefix": dest_prefix,
- "s3_key_full_src": src_item.get("key")
- if sync_direction == "download"
- else None,
- "source_bucket": src_bucket,
- "reason": reason,
- }
- )
-
- # Identify items for deletion in destination
- if delete:
- keys_to_delete = dest_keys - source_keys
- for rel_key in keys_to_delete:
- dest_item = dest_items[rel_key]
- action_type = "delete_s3" if sync_direction == "upload" else "delete_local"
- actions_to_perform.append(
- {
- "action": action_type,
- "relative_key": rel_key,
- "path_to_delete": dest_item["path"], # Full S3 URI or local path
- "s3_key_full_dest": dest_item.get("key")
- if sync_direction == "upload"
- else None, # Needed for delete_s3
- "dest_bucket": dest_bucket, # Needed for delete_s3
- "reason": "does not exist in source",
- }
- )
-
- # --- Execute Actions ---
- uploads_done = downloads_done = deletions_done = 0
- s3_deletions_batch: List[Dict[str, str]] = []
-
- if not actions_to_perform:
- print("Source and destination are already synchronized.")
- # Still check if source/dest actually exist if nothing to do
- if sync_direction == "upload" and not os.path.exists(source):
- print(f"Note: Source path {source} does not exist.")
- # Add check for S3 source existence if needed via head_bucket or similar
- return
-
- for action in actions_to_perform:
- rel_key = action["relative_key"]
- reason = action["reason"]
- dest_full_path_or_key = action["dest_full_path_or_key"]
-
- if action["action"] == "upload":
- local_path = action["source_path"]
- # Ensure dest_full_path_or_key is valid before parsing
- if not isinstance(dest_full_path_or_key, str):
- print(
- f"ERROR: Invalid destination path calculated for upload: {dest_full_path_or_key}"
- )
- continue
- # Extract final key from the pre-calculated dest_full_path_or_key
- _, upload_key = _parse_s3_path(dest_full_path_or_key)
- target_bucket = action["dest_bucket"]
+ uploads_done = downloads_done = deletions_done = 0
+ s3_deletions_batch: List[Dict[str, str]] = []
+ if not actions_to_perform:
+ if self.verbose:
+ print("Source and destination are already synchronized.")
+ # Optional: Add check if source exists if sync_direction == "upload" and not os.path.exists(source):
+ return

- if verbose:
- print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
- if not dry_run:
- if target_bucket and upload_key is not None:
- try:
- s3_client.upload_file(local_path, target_bucket, upload_key)
- uploads_done += 1
- except ClientError as e:
- print(f"ERROR uploading {local_path}: {e}")
- except Exception as e:
- print(f"ERROR uploading {local_path}: {e}")
- else:
+ for action in actions_to_perform:
+ reason = action["reason"]
+ dest_full_path_or_key = action["dest_full_path_or_key"]
+ if action["action"] == "upload":
+ local_path = action["source_path"]
+ if not isinstance(dest_full_path_or_key, str):
+ print(f"ERROR: Invalid dest path: {dest_full_path_or_key}")
+ continue
+ _, upload_key = self._parse_path(dest_full_path_or_key)
+ target_bucket = action["dest_bucket"]
+ if self.verbose:
+ print(f"Upload: {local_path} to {dest_full_path_or_key} ({reason})")
+ if not dry_run:
+ if target_bucket and upload_key is not None:
+ try:
+ self.client.upload_file(
+ local_path, target_bucket, upload_key
+ )
+ uploads_done += 1
+ except ClientError as e:
+ print(f"ERROR uploading {local_path}: {e}")
+ except Exception as e:
+ print(f"ERROR uploading {local_path}: {e}")
+ else:
+ print(
+ f"ERROR: Invalid S3 target: bucket={target_bucket}, key={upload_key}"
+ )
+ elif action["action"] == "download":
+ s3_key_full = action["s3_key_full_src"]
+ local_path = dest_full_path_or_key
+ source_bucket_dl = action["source_bucket"]
+ if self.verbose:
  print(
- f"ERROR: Invalid S3 target for upload: bucket={target_bucket}, key={upload_key}"
+ f"Download: {action['source_path']} to {local_path} ({reason})"
  )
-
- elif action["action"] == "download":
- s3_key_full = action["s3_key_full_src"]
- local_path = dest_full_path_or_key # This is the local destination path
- source_bucket_dl = action["source_bucket"]
-
- if verbose:
- print(f"Download: {action['source_path']} to {local_path} ({reason})")
- # Ensure local_path is valid before proceeding
- if not isinstance(local_path, str):
- print(
- f"ERROR: Invalid local destination path calculated for download: {local_path}"
- )
- continue
- if not dry_run:
- if source_bucket_dl and s3_key_full and local_path:
- try:
- local_file_dir = os.path.dirname(local_path)
- os.makedirs(local_file_dir, exist_ok=True)
- s3_client.download_file(
- source_bucket_dl, s3_key_full, local_path
- )
- downloads_done += 1
- except ClientError as e:
- print(f"ERROR downloading {s3_key_full}: {e}")
- except OSError as e:
+ if not isinstance(local_path, str):
+ print(f"ERROR: Invalid local dest path: {local_path}")
+ continue
+ if not dry_run:
+ if source_bucket_dl and s3_key_full and local_path:
+ try:
+ local_file_dir = os.path.dirname(local_path)
+ os.makedirs(local_file_dir, exist_ok=True)
+ self.client.download_file(
+ source_bucket_dl, s3_key_full, local_path
+ )
+ downloads_done += 1
+ except ClientError as e:
+ print(f"ERROR downloading {s3_key_full}: {e}")
+ except OSError as e:
+ print(f"ERROR creating/writing {local_path}: {e}")
+ except Exception as e:
+ print(f"ERROR downloading {s3_key_full}: {e}")
+ else:
  print(
- f"ERROR creating directory or writing file {local_path}: {e}"
+ f"ERROR: Invalid download params: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
  )
- except Exception as e:
- print(f"ERROR downloading {s3_key_full}: {e}")
+ elif action["action"] == "delete_s3":
+ s3_key_to_delete = action["s3_key_full_dest"]
+ target_bucket_del = action["dest_bucket"]
+ if target_bucket_del and s3_key_to_delete:
+ if self.verbose:
+ print(f"Delete S3: {action['path_to_delete']} ({reason})")
+ if isinstance(s3_key_to_delete, str):
+ s3_deletions_batch.append({"Key": s3_key_to_delete})
+ else:
+ print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
  else:
  print(
- f"ERROR: Invalid parameters for download: bucket={source_bucket_dl}, key={s3_key_full}, local={local_path}"
+ f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
  )
-
- elif action["action"] == "delete_s3":
- s3_key_to_delete = action["s3_key_full_dest"]
- target_bucket_del = action["dest_bucket"]
- if target_bucket_del and s3_key_to_delete:
- if verbose:
- print(f"Delete S3: {action['path_to_delete']} ({reason})")
- # Check type before appending to batch
- if isinstance(s3_key_to_delete, str):
- s3_deletions_batch.append({"Key": s3_key_to_delete})
- else:
- print(f"ERROR: Invalid S3 key for deletion: {s3_key_to_delete}")
+ elif action["action"] == "delete_local":
+ local_path_to_delete = action["path_to_delete"]
+ if self.verbose:
+ print(f"Delete Local: {local_path_to_delete} ({reason})")
+ if not dry_run:
+ try:
+ os.remove(local_path_to_delete)
+ deletions_done += 1
+ except OSError as e:
+ print(f"ERROR deleting local file {local_path_to_delete}: {e}")
+
+ if s3_deletions_batch:
+ target_bucket_del_batch = next(
+ (
+ a["dest_bucket"]
+ for a in actions_to_perform
+ if a["action"] == "delete_s3"
+ ),
+ None,
+ )
+ if not dry_run and target_bucket_del_batch:
+ deleted_count_batch = 0
+ for i in range(0, len(s3_deletions_batch), 1000):
+ batch = s3_deletions_batch[i : i + 1000]
+ delete_payload = {"Objects": batch, "Quiet": False}
+ try:
+ response = self.client.delete_objects(
+ Bucket=target_bucket_del_batch, Delete=delete_payload
+ )
+ deleted_count_batch += len(batch)
+ if "Errors" in response and response["Errors"]:
+ deleted_count_batch -= len(response["Errors"])
+ for error in response["Errors"]:
+ print(
+ f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
+ )
+ except ClientError as e:
+ print(f"ERROR deleting S3 objects batch: {e}")
+ deleted_count_batch = 0
+ except Exception as e:
+ print(f"ERROR deleting S3 objects batch: {e}")
+ deleted_count_batch = 0
+ deletions_done += deleted_count_batch
+ elif target_bucket_del_batch:
+ deletions_done = len(s3_deletions_batch)
  else:
  print(
- f"ERROR: Invalid S3 target for deletion: bucket={target_bucket_del}, key={s3_key_to_delete}"
+ "Warning: Could not determine target bucket for S3 deletion batch."
  )

- elif action["action"] == "delete_local":
- local_path_to_delete = action["path_to_delete"]
- if verbose:
- print(f"Delete Local: {local_path_to_delete} ({reason})")
- if not dry_run:
- try:
- os.remove(local_path_to_delete)
- deletions_done += 1
- # TODO: Optionally clean up empty directories?
- except OSError as e:
- print(f"ERROR deleting local file {local_path_to_delete}: {e}")
-
- # Process S3 deletions in batches
- if s3_deletions_batch:
- # Get the target bucket from the first deletion action (should be consistent)
- target_bucket_del_batch = next(
- (
- a["dest_bucket"]
- for a in actions_to_perform
- if a["action"] == "delete_s3"
- ),
- None,
- )
- if not dry_run and target_bucket_del_batch:
- deleted_count_batch = 0
- for i in range(0, len(s3_deletions_batch), 1000):
- batch = s3_deletions_batch[i : i + 1000]
- delete_payload = {"Objects": batch, "Quiet": False} # Get errors back
- try:
- response = s3_client.delete_objects(
- Bucket=target_bucket_del_batch, Delete=delete_payload
- )
- # Increment count based on successful deletions reported (if not Quiet) or assume success if Quiet
- deleted_count_batch += len(
- batch
- ) # Assume success unless errors reported
- if "Deleted" in response:
- pass # Counted optimistically above
- # deleted_count_batch += len(response['Deleted'])
- if "Errors" in response and response["Errors"]:
- deleted_count_batch -= len(
- response["Errors"]
- ) # Adjust count for errors
- for error in response["Errors"]:
- print(
- f"ERROR deleting S3 object {error['Key']}: {error['Code']} - {error['Message']}"
- )
- except ClientError as e:
- print(f"ERROR deleting S3 objects batch: {e}")
- deleted_count_batch = 0 # Assume batch failed
- except Exception as e:
- print(f"ERROR deleting S3 objects batch: {e}")
- deleted_count_batch = 0 # Assume batch failed
- deletions_done += deleted_count_batch
- elif target_bucket_del_batch: # dry_run is True
- deletions_done = len(
- s3_deletions_batch
- ) # Report planned deletions for dry run
+ if dry_run:
+ if self.verbose:
+ upload_count = sum(
+ 1 for a in actions_to_perform if a["action"] == "upload"
+ )
+ download_count = sum(
+ 1 for a in actions_to_perform if a["action"] == "download"
+ )
+ delete_s3_count = len(s3_deletions_batch)
+ delete_local_count = sum(
+ 1 for a in actions_to_perform if a["action"] == "delete_local"
+ )
+ print("\n--- DRY RUN SUMMARY ---")
+ if sync_direction == "upload":
+ print(f"Would upload: {upload_count} file(s)")
+ if delete:
+ print(f"Would delete from S3: {delete_s3_count} object(s)")
+ elif sync_direction == "download":
+ print(f"Would download: {download_count} file(s)")
+ if delete:
+ print(f"Would delete locally: {delete_local_count} file(s)")
+ print("--- END DRY RUN ---")
  else:
- print("Warning: Could not determine target bucket for S3 deletion batch.")
+ if self.verbose:
+ if sync_direction == "upload":
+ print(
+ f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
+ )
+ elif sync_direction == "download":
+ print(
+ f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
+ )

- # --- Summary ---
- if dry_run:
- upload_count = sum(1 for a in actions_to_perform if a["action"] == "upload")
- download_count = sum(1 for a in actions_to_perform if a["action"] == "download")
- # Deletion count for dry run is based on the batch prepared
- delete_s3_count = len(s3_deletions_batch)
- delete_local_count = sum(
- 1 for a in actions_to_perform if a["action"] == "delete_local"
- )
- print("\n--- DRY RUN SUMMARY ---")
- if sync_direction == "upload":
- print(f"Would upload: {upload_count} file(s)")
- if delete:
- print(f"Would delete from S3: {delete_s3_count} object(s)")
- elif sync_direction == "download":
- print(f"Would download: {download_count} file(s)")
- if delete:
- print(f"Would delete locally: {delete_local_count} file(s)")
- print("--- END DRY RUN ---")
- else:
- if sync_direction == "upload":
+ def copy(
+ self,
+ source: str,
+ destination: str,
+ ) -> None:
+ """
+ Copies files or directories between local paths and S3 URIs.
+ Handles:
+ - Local file to S3 object
+ - Local directory to S3 prefix (recursive)
+ - S3 object to local file
+ - S3 prefix to local directory (recursive)
+ Does NOT handle:
+ - Local to Local (use shutil)
+ - S3 to S3 (use AWS CLI or boto3 object copy)
+ Args:
+ source (str): The source path (local file/dir or s3://...).
+ destination (str): The destination path (local file/dir or s3://...).
+ """
+ src_bucket, src_prefix = self._parse_path(source)
+ dest_bucket, dest_prefix = self._parse_path(destination)
+
+ if src_bucket is None and dest_bucket is None:
  print(
- f"Sync completed. Uploaded: {uploads_done} file(s). Deleted from S3: {deletions_done if delete else 0} object(s)."
+ "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
  )
- elif sync_direction == "download":
+ return
+ if src_bucket is not None and dest_bucket is not None:
  print(
- f"Sync completed. Downloaded: {downloads_done} file(s). Deleted locally: {deletions_done if delete else 0} file(s)."
- )
-
-
- def s3_check(s3_uri: str) -> bool:
- """
- Check if an object or prefix exists in an S3 bucket using an S3 URI.
-
- Args:
- s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
- Use a trailing '/' to check for a prefix/directory.
-
- Returns:
- bool: True if the object or prefix exists, False otherwise.
- """
- s3 = boto3.client("s3")
- bucket_name, s3_key = _parse_s3_path(s3_uri)
-
- if bucket_name is None or s3_key is None:
- # _parse_s3_path returns None, None if scheme is not 's3'
- print(f"Error: Invalid S3 URI format: {s3_uri}")
- return False
-
- is_prefix = s3_key.endswith("/")
-
- try:
- if is_prefix:
- # Check for prefix existence by listing objects
- # Handle the case where s3_key might be empty if URI is just s3://bucket/
- list_prefix = s3_key if s3_key else ""
- response = s3.list_objects_v2(
- Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
+ "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
  )
- # Check if any objects OR common prefixes (folders) are returned for the prefix
- return "Contents" in response or "CommonPrefixes" in response
- else:
- # Check for object existence
- s3.head_object(Bucket=bucket_name, Key=s3_key)
- return True
- except ClientError as e: # Catch boto3 ClientError first
- # If head_object returns 404 (NoSuchKey), the object doesn't exist
- # list_objects_v2 does not raise NoSuchKey for prefixes
- if e.response["Error"]["Code"] == "404":
- return False
- elif e.response["Error"]["Code"] == "NoSuchBucket":
- print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
- return False
- # Handle other potential errors like AccessDenied differently if needed
- print(f"Error checking {s3_uri}: {e}")
- return False
- # except s3.exceptions.NoSuchBucket: # This specific exception is less common with boto3 client
- # print(f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri}).")
- # return False
- except Exception as e:
- print(f"An unexpected error occurred checking {s3_uri}: {e}")
- return False
-
-
- def s3_copy(
- source: str,
- destination: str,
- verbose: bool = True,
- ) -> None:
- """
- Copies files or directories between local paths and S3 URIs.
+ return

- Handles:
- - Local file to S3 object
- - Local directory to S3 prefix (recursive)
- - S3 object to local file
- - S3 prefix to local directory (recursive)
+ # Upload: Local to S3
+ if src_bucket is None and dest_bucket is not None:
+ if not os.path.exists(source):
+ print(f"Error: Local source path not found: {source}")
+ return
+ current_dest_prefix = dest_prefix or ""

- Does NOT handle:
- - Local to Local (use shutil)
- - S3 to S3 (use AWS CLI or boto3 object copy)
+ if os.path.isfile(source):
+ if not current_dest_prefix or destination.endswith("/"):
+ s3_key = os.path.join(
+ current_dest_prefix, os.path.basename(source)
+ ).replace("\\", "/")
+ else:
+ s3_key = current_dest_prefix
+ if self.verbose:
+ print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
+ try:
+ self.client.upload_file(source, dest_bucket, s3_key)
+ if self.verbose:
+ print("Upload complete.")
+ except ClientError as e:
+ print(f"ERROR uploading {source}: {e}")
+ except Exception as e:
+ print(f"ERROR uploading {source}: {e}")

- Args:
- source (str): The source path (local file/dir or s3://...).
- destination (str): The destination path (local file/dir or s3://...).
- verbose (bool): If True, print actions being taken.
- """
- s3_client = boto3.client("s3")
- src_bucket, src_prefix = _parse_s3_path(source)
- dest_bucket, dest_prefix = _parse_s3_path(destination)
-
- # --- Reject unsupported operations ---
- if src_bucket is None and dest_bucket is None:
- print(
- "Error: Both source and destination are local. Use 'shutil.copy' or 'shutil.copytree'."
- )
- return
- if src_bucket is not None and dest_bucket is not None:
- print(
- "Error: S3 to S3 copy not implemented. Use 'aws s3 cp' or boto3 'copy_object'."
- )
- return
+ elif os.path.isdir(source):
+ if self.verbose:
+ print(
+ f"Uploading directory {source}/* to s3://{dest_bucket}/{current_dest_prefix}/"
+ )
+ files_uploaded = files_failed = 0
+ for root, _, files in os.walk(source):
+ for file in files:
+ local_path = os.path.join(root, file)
+ relative_path = os.path.relpath(local_path, source)
+ s3_key = os.path.join(
+ current_dest_prefix, relative_path
+ ).replace("\\", "/")
+ if self.verbose:
+ print(
+ f" Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
+ )
+ try:
+ self.client.upload_file(local_path, dest_bucket, s3_key)
+ files_uploaded += 1
+ except ClientError as e:
+ print(f" ERROR uploading {local_path}: {e}")
+ files_failed += 1
+ except Exception as e:
+ print(f" ERROR uploading {local_path}: {e}")
+ files_failed += 1
+ if self.verbose:
+ print(
+ f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
+ )
+ else:
+ print(f"Error: Source {source} is neither a file nor a directory.")

- # --- Upload: Local to S3 ---
- if src_bucket is None and dest_bucket is not None:
- if not os.path.exists(source):
- print(f"Error: Local source path not found: {source}")
- return
- # Ensure dest_prefix is usable, default to empty string if None
- dest_prefix = dest_prefix or ""
-
- # Case 1: Source is a local file
- if os.path.isfile(source):
- # Determine final S3 key
- # If dest looks like a dir (ends /) or is empty, append filename
- if not dest_prefix or destination.endswith("/"):
- s3_key = os.path.join(dest_prefix, os.path.basename(source)).replace(
- "\\", "/"
- )
- else: # Treat dest as the exact key name
- s3_key = dest_prefix
+ # Download: S3 to Local
+ elif src_bucket is not None and dest_bucket is None:
+ is_prefix_download = False
+ single_object_key = None
+ current_src_prefix = src_prefix or "" # Ensure not None

- if verbose:
- print(f"Uploading {source} to s3://{dest_bucket}/{s3_key}")
- try:
- s3_client.upload_file(source, dest_bucket, s3_key)
- print("Upload complete.")
- except ClientError as e:
- print(f"ERROR uploading {source}: {e}")
- except Exception as e:
- print(f"ERROR uploading {source}: {e}")
-
- # Case 2: Source is a local directory
- elif os.path.isdir(source):
- if verbose:
- print(
- f"Uploading directory {source}/* to s3://{dest_bucket}/{dest_prefix}/"
- )
- files_uploaded = 0
- files_failed = 0
- for root, _, files in os.walk(source):
- for file in files:
- local_path = os.path.join(root, file)
- relative_path = os.path.relpath(local_path, source)
- s3_key = os.path.join(dest_prefix, relative_path).replace("\\", "/")
- if verbose:
+ if source.endswith("/"):
+ is_prefix_download = True
+ else:
+ try:
+ if current_src_prefix:
+ self.client.head_object(
+ Bucket=src_bucket, Key=current_src_prefix
+ )
+ single_object_key = current_src_prefix
+ else:
+ # Path like s3://bucket, treat as prefix download
+ is_prefix_download = True
+ except ClientError as e:
+ if e.response["Error"]["Code"] == "404":
+ is_prefix_download = True # Assume prefix if object not found
+ elif e.response["Error"]["Code"] == "NoSuchBucket":
+ print(f"Error: Source bucket '{src_bucket}' not found.")
+ return
+ else:
  print(
- f" Uploading {local_path} to s3://{dest_bucket}/{s3_key}"
+ f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
  )
- try:
- s3_client.upload_file(local_path, dest_bucket, s3_key)
- files_uploaded += 1
- except ClientError as e:
- print(f" ERROR uploading {local_path}: {e}")
- files_failed += 1
- except Exception as e:
- print(f" ERROR uploading {local_path}: {e}")
- files_failed += 1
- print(
- f"Directory upload complete. Files uploaded: {files_uploaded}, Failed: {files_failed}"
- )
- else:
- print(f"Error: Source {source} is neither a file nor a directory.")
-
- # --- Download: S3 to Local ---
- elif src_bucket is not None and dest_bucket is None:
- # Determine if source is likely a single object or a prefix
- is_prefix_download = False
- single_object_key = None
-
- # If source ends with '/', treat it as a prefix explicitly
- if source.endswith("/"):
- is_prefix_download = True
- src_prefix = src_prefix or "" # Ensure not None
- else:
- # Try checking if the source key exists as a single object
- try:
- s3_client.head_object(Bucket=src_bucket, Key=src_prefix)
- single_object_key = src_prefix # It exists as a single object
- except ClientError as e:
- if e.response["Error"]["Code"] == "404":
- # Object doesn't exist, assume it's a prefix for recursive download
- is_prefix_download = True
- src_prefix = src_prefix or "" # Ensure not None
- elif e.response["Error"]["Code"] == "NoSuchBucket":
- print(f"Error: Source bucket '{src_bucket}' not found.")
+ return
+ except Exception as e:
+ print(
+ f"Error checking S3 source s3://{src_bucket}/{current_src_prefix}: {e}"
+ )
  return
+
+ if single_object_key is not None:
+ if os.path.isdir(destination) or destination.endswith(os.sep):
+ local_dest_path = os.path.join(
+ destination, os.path.basename(single_object_key)
+ )
+ os.makedirs(destination, exist_ok=True)
  else:
- # Other error (e.g., permissions)
+ local_dest_path = destination
+ parent_dir = os.path.dirname(local_dest_path)
+ if parent_dir:
+ os.makedirs(parent_dir, exist_ok=True)
+ if self.verbose:
  print(
- f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
+ f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
  )
- return
- except Exception as e:
- print(
- f"Error checking S3 source object s3://{src_bucket}/{src_prefix}: {e}"
- )
- return
+ try:
+ self.client.download_file(
+ src_bucket, single_object_key, local_dest_path
+ )
+ if self.verbose:
+ print("Download complete.")
+ except ClientError as e:
+ print(f"ERROR downloading {single_object_key}: {e}")
+ except OSError as e:
+ print(f"ERROR creating/writing {local_dest_path}: {e}")
+ except Exception as e:
+ print(f"ERROR downloading {single_object_key}: {e}")

- # Case 1: Download single S3 object
- if single_object_key is not None:
- # Determine local destination path
- if os.path.isdir(destination) or destination.endswith(os.sep):
- # Download into the directory
- local_dest_path = os.path.join(
- destination, os.path.basename(single_object_key)
- )
- # Create local directory if downloading into it and it doesn't exist
+ elif is_prefix_download:
+ if os.path.exists(destination) and not os.path.isdir(destination):
+ print(
+ f"Error: Local destination '{destination}' exists but is not a directory."
+ )
+ return
  os.makedirs(destination, exist_ok=True)
- else:
- # Download to the exact file path
- local_dest_path = destination
- # Ensure parent directory exists
- parent_dir = os.path.dirname(local_dest_path)
- if parent_dir:
- os.makedirs(parent_dir, exist_ok=True)
-
- if verbose:
- print(
- f"Downloading s3://{src_bucket}/{single_object_key} to {local_dest_path}"
- )
- try:
- s3_client.download_file(src_bucket, single_object_key, local_dest_path)
- print("Download complete.")
- except ClientError as e:
- print(f"ERROR downloading {single_object_key}: {e}")
- except OSError as e:
- print(
- f"ERROR creating directory or writing file {local_dest_path}: {e}"
- )
- except Exception as e:
- print(f"ERROR downloading {single_object_key}: {e}")
-
- # Case 2: Download S3 prefix (recursive)
- elif is_prefix_download:
- # Ensure local destination is a directory
- if os.path.exists(destination) and not os.path.isdir(destination):
- print(
- f"Error: Local destination '{destination}' exists but is not a directory for prefix download."
- )
- return
- os.makedirs(destination, exist_ok=True)
-
- if verbose:
- print(
- f"Downloading prefix s3://{src_bucket}/{src_prefix}/* to {destination}/"
- )
-
- paginator = s3_client.get_paginator("list_objects_v2")
- files_downloaded = 0
- files_failed = 0
- operation_parameters = {"Bucket": src_bucket}
- if src_prefix:
- operation_parameters["Prefix"] = src_prefix
-
- try:
- page_iterator = paginator.paginate(**operation_parameters)
- found_objects = False
- for page in page_iterator:
- if "Contents" in page:
- found_objects = True
- for obj in page["Contents"]:
- s3_key = obj["Key"]
- # Skip zero-byte directory markers if downloading a prefix
- if s3_key.endswith("/") and obj["Size"] == 0:
- continue
-
- # Calculate relative path from the source prefix
- if src_prefix and s3_key.startswith(src_prefix):
- # Handle potential trailing slash inconsistency
- prefix_adjusted = (
- src_prefix
- if src_prefix.endswith("/")
- else src_prefix + "/"
- )
- if s3_key.startswith(prefix_adjusted):
- relative_key = s3_key[len(prefix_adjusted) :]
- # Handle the prefix itself if listed as an object (unlikely for prefix download)
- elif s3_key == src_prefix.rstrip("/"):
- relative_key = os.path.basename(s3_key)
- else: # Should not happen
- relative_key = s3_key
- elif not src_prefix: # Downloading whole bucket essentially
+ if self.verbose:
+ print(
+ f"Downloading prefix s3://{src_bucket}/{current_src_prefix}/* to {destination}/"
+ )
+ paginator = self.client.get_paginator("list_objects_v2")
+ files_downloaded = files_failed = 0
+ operation_parameters = {"Bucket": src_bucket}
+ # The problematic line for the linter, re-adding type ignore
+ if current_src_prefix: # type: ignore
+ operation_parameters["Prefix"] = current_src_prefix
+ try:
+ page_iterator = paginator.paginate(**operation_parameters)
+ found_objects = False
+ for page in page_iterator:
+ if "Contents" in page:
+ found_objects = True
+ for obj in page["Contents"]:
+ s3_key = obj["Key"]
+ if s3_key.endswith("/") and obj["Size"] == 0:
+ continue
  relative_key = s3_key
- else: # Key doesn't start with prefix, should not happen
- continue
+ if current_src_prefix:
+ if s3_key.startswith(current_src_prefix):
+ if s3_key == current_src_prefix.rstrip("/"):
+ relative_key = os.path.basename(s3_key)
+ else:
+ prefix_adjusted = current_src_prefix + (
+ ""
+ if current_src_prefix.endswith("/")
+ else "/"
+ )
+ if s3_key.startswith(prefix_adjusted):
+ relative_key = s3_key[
+ len(prefix_adjusted) :
+ ]
+ elif not current_src_prefix.endswith("/"):
+ relative_key = s3_key[
+ len(current_src_prefix) :
+ ].lstrip("/")
+ if not relative_key:
+ continue
+ local_dest_path = os.path.join(
+ destination, relative_key.replace("/", os.sep)
+ )
+ local_dest_dir = os.path.dirname(local_dest_path)
+ if self.verbose:
+ print(
+ f" Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
+ )
+ try:
+ if local_dest_dir:
+ os.makedirs(local_dest_dir, exist_ok=True)
+ self.client.download_file(
+ src_bucket, s3_key, local_dest_path
+ )
+ files_downloaded += 1
+ except ClientError as e:
+ print(f" ERROR downloading {s3_key}: {e}")
+ files_failed += 1
+ except OSError as e:
+ print(
+ f" ERROR creating/writing {local_dest_path}: {e}"
+ )
+ files_failed += 1
+ except Exception as e:
+ print(f" ERROR downloading {s3_key}: {e}")
+ files_failed += 1
+ if not found_objects and self.verbose:
+ print(
+ f"Warning: No objects found at source prefix s3://{src_bucket}/{current_src_prefix}"
+ )
+ if self.verbose:
+ print(
+ f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
+ )
+ except ClientError as e:
+ if e.response["Error"]["Code"] == "NoSuchBucket":
+ print(f"Error: Source bucket '{src_bucket}' not found.")
+ else:
+ print(
+ f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+ )
+ except Exception as e:
+ print(
+ f"Error listing objects in s3://{src_bucket}/{current_src_prefix}: {e}"
+ )
+ else:
+ print("Error: Unknown copy operation type.")

- # Skip if relative key is empty (e.g. prefix marker was somehow processed)
- if not relative_key:
- continue
+ def check(self, s3_uri: str) -> bool:
+ """
+ Check if an object or prefix exists in an S3 bucket using an S3 URI.

- local_dest_path = os.path.join(
- destination, relative_key.replace("/", os.sep)
- )
- local_dest_dir = os.path.dirname(local_dest_path)
+ Args:
+ s3_uri (str): The S3 URI (e.g., 's3://my-bucket/my-key' or 's3://my-bucket/my-prefix/').
+ Use a trailing '/' to check for a prefix/directory.

- if verbose:
- print(
- f" Downloading s3://{src_bucket}/{s3_key} to {local_dest_path}"
- )
- try:
- if local_dest_dir:
- os.makedirs(local_dest_dir, exist_ok=True)
- s3_client.download_file(
- src_bucket, s3_key, local_dest_path
- )
- files_downloaded += 1
- except ClientError as e:
- print(f" ERROR downloading {s3_key}: {e}")
- files_failed += 1
- except OSError as e:
- print(
- f" ERROR creating directory or writing file {local_dest_path}: {e}"
- )
- files_failed += 1
- except Exception as e:
- print(f" ERROR downloading {s3_key}: {e}")
- files_failed += 1
+ Returns:
+ bool: True if the object or prefix exists, False otherwise.
+ """
+ # Use the class client and parse method
+ bucket_name, s3_key = self._parse_path(s3_uri)

- if not found_objects:
- print(
- f"Warning: No objects found at source prefix s3://{src_bucket}/{src_prefix}"
- )
+ if bucket_name is None or s3_key is None:
+ # _parse_path returns None, None if scheme is not 's3'
+ print(f"Error: Invalid S3 URI format: {s3_uri}")
+ return False

- print(
- f"Prefix download complete. Files downloaded: {files_downloaded}, Failed: {files_failed}"
- )
+ is_prefix = s3_key.endswith("/")

- except ClientError as e:
- if e.response["Error"]["Code"] == "NoSuchBucket":
- print(f"Error: Source bucket '{src_bucket}' not found.")
- else:
+ try:
+ if is_prefix:
+ # Check for prefix existence by listing objects
+ # Handle the case where s3_key might be empty if URI is just s3://bucket/
+ list_prefix = s3_key if s3_key else ""
+ response = self.client.list_objects_v2(
+ Bucket=bucket_name, Prefix=list_prefix, MaxKeys=1
+ )
+ # Check if any objects OR common prefixes (folders) are returned for the prefix
+ return "Contents" in response or "CommonPrefixes" in response
+ else:
+ # Check for object existence
+ self.client.head_object(Bucket=bucket_name, Key=s3_key)
+ return True
+ except ClientError as e: # Catch boto3 ClientError first
+ # If head_object returns 404 (NoSuchKey), the object doesn't exist
+ # list_objects_v2 does not raise NoSuchKey for prefixes
+ if e.response["Error"]["Code"] == "404":
+ return False
+ elif e.response["Error"]["Code"] == "NoSuchBucket":
+ if self.verbose:
  print(
- f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}"
+ f"Error: Bucket '{bucket_name}' not found (from URI: {s3_uri})."
  )
- except Exception as e:
- print(f"Error listing objects in s3://{src_bucket}/{src_prefix}: {e}")
-
- else: # Should not be reachable
- print("Error: Unknown copy operation type.")
+ return False
+ # Handle other potential errors like AccessDenied differently if needed
+ print(f"Error checking {s3_uri}: {e}")
+ return False
+ except Exception as e:
+ print(f"An unexpected error occurred checking {s3_uri}: {e}")
+ return False
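
For orientation, a minimal usage sketch of the Bucket class introduced above. This is not an official example: it assumes the import path matches the file location (nebu/data.py), that boto3 credentials are configured in the environment, and the bucket and path names are illustrative only.

# Minimal sketch exercising the Bucket API added in 0.1.30 (hypothetical paths).
from nebu.data import Bucket

bucket = Bucket(verbose=True)  # wraps boto3.client("s3")

# check(): a trailing "/" tests a prefix via list_objects_v2; otherwise head_object.
if bucket.check("s3://my-bucket/checkpoints/"):
    # sync() compares size and mtime (2-second tolerance) and only copies what
    # changed; dry_run=True prints a summary without transferring or deleting.
    bucket.sync(
        source="s3://my-bucket/checkpoints/",
        destination="./checkpoints",
        delete=False,
        dry_run=True,
    )

# copy() is an unconditional transfer: local file/dir to or from S3 object/prefix.
bucket.copy("./outputs", "s3://my-bucket/outputs/")

As in the diff above, local-to-local and S3-to-S3 transfers are rejected by both sync() and copy(), which point callers to shutil and the AWS CLI respectively.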