pos3-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pos3/__init__.py ADDED
@@ -0,0 +1,833 @@
1
+ """Global S3 mirror API with dynamic registration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import shutil
7
+ import threading
8
+ import time
9
+ from collections.abc import Iterable, Iterator
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from contextlib import contextmanager, nullcontext
12
+ from contextvars import ContextVar
13
+ from dataclasses import dataclass, field
14
+ from functools import wraps
15
+ from pathlib import Path, PurePosixPath
16
+ from typing import Any
17
+ from urllib.parse import urlparse
18
+
19
+ import boto3
20
+ from botocore.exceptions import ClientError
21
+ from tqdm import tqdm
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class _NullTqdm(nullcontext):
27
+ def update(self, *_args: Any, **_kwargs: Any) -> None: # pragma: no cover - trivial
28
+ pass
29
+
30
+ def __enter__(self) -> _NullTqdm:
31
+ return self
32
+
33
+
34
+ def _parse_s3_url(s3_url: str) -> tuple[str, str]:
35
+ parsed = urlparse(s3_url)
36
+ if parsed.scheme != "s3":
37
+ raise ValueError(f"Not an S3 URL: {s3_url}")
38
+ return parsed.netloc, parsed.path.lstrip("/")
39
+
40
+
41
+ def _normalize_s3_url(s3_url: str) -> str:
42
+ bucket, key = _parse_s3_url(s3_url)
43
+ key = key.strip("/")
44
+ return f"s3://{bucket}/{key}" if key else f"s3://{bucket}"
45
+
46
+
47
+ def _is_s3_path(path: str) -> bool:
48
+ return path.startswith("s3://")
49
+
50
+
51
+ def _s3_paths_conflict(left: str, right: str) -> bool:
52
+ left_norm = left.rstrip("/")
53
+ right_norm = right.rstrip("/")
54
+ if left_norm == right_norm:
55
+ return True
56
+ return left_norm.startswith(right_norm + "/") or right_norm.startswith(left_norm + "/")
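+ # Illustrative behaviour (example URLs are hypothetical):
+ #   _s3_paths_conflict('s3://b/a', 's3://b/a/x') -> True  (one path prefixes the other)
+ #   _s3_paths_conflict('s3://b/a', 's3://b/ab')  -> False (no segment-level overlap)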
57
+
58
+
59
+ def _process_futures(futures: Iterable[Any], operation: str) -> None:
60
+ for future in futures:
61
+ try:
62
+ future.result()
63
+ except Exception as exc:
64
+ logger.error("%s failed: %s", operation, exc)
65
+
66
+
67
+ @dataclass(frozen=True)
68
+ class FileInfo:
69
+ """Represents a file or directory with metadata for sync operations."""
70
+
71
+ relative_path: str # Relative path from root (empty string for root file/dir)
72
+ size: int # File size in bytes, 0 for directories
73
+ is_dir: bool # True if this represents a directory
74
+
75
+
76
+ def _scan_local(path: Path) -> Iterator[FileInfo]:
77
+ if not path.exists():
78
+ return
79
+
80
+ base = path
81
+ stack = [path]
82
+ # Iterative depth-first walk; yield each entry with a path relative to the base
83
+ while stack:
84
+ p = stack.pop()
85
+ if p.is_symlink():
86
+ continue
87
+
88
+ relative = p.relative_to(base).as_posix() if p != base else ""
89
+ if p.is_dir():
90
+ # Always yield directories, including the root (relative_path='')
91
+ yield FileInfo(relative_path=relative, size=0, is_dir=True)
92
+ stack.extend(p.iterdir())
93
+ else:
94
+ yield FileInfo(relative_path=relative, size=p.stat().st_size, is_dir=False)
95
+
96
+
97
+ def _filter_fileinfo(fileinfo_iter: Iterator[FileInfo], exclude: list[str] | None) -> Iterator[FileInfo]:
98
+ """
99
+ Filter FileInfo objects based on glob patterns.
100
+
101
+ Args:
102
+ fileinfo_iter: Iterator of FileInfo objects to filter
103
+ exclude: List of glob patterns to exclude (supports *, **, ?, [...])
104
+
105
+ Yields:
106
+ FileInfo objects that don't match any exclude patterns
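+
+ Example (illustrative patterns): with exclude=['*.tmp', 'logs'], every
+ '*.tmp' file anywhere in the tree is dropped, any directory named 'logs'
+ is dropped, and everything beneath an excluded directory is skipped
+ without further pattern matching.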
107
+ """
108
+ if not exclude:
109
+ yield from fileinfo_iter
110
+ return
111
+ excluded_dirs: set[str] = set()
112
+
113
+ for info in fileinfo_iter:
114
+ # Skip if any parent directory was excluded
115
+ if any(info.relative_path == ed or info.relative_path.startswith(ed + "/") for ed in excluded_dirs):
116
+ continue
117
+
118
+ # Check if this path matches any exclude pattern
119
+ path = PurePosixPath(info.relative_path) if info.relative_path else PurePosixPath(".")
120
+ if any(path.match(pattern) for pattern in exclude):
121
+ if info.is_dir:
122
+ excluded_dirs.add(info.relative_path)
123
+ else:
124
+ yield info
125
+
126
+
127
+ def _compute_sync_diff(source: Iterator[FileInfo], target: Iterator[FileInfo]) -> tuple[list[FileInfo], list[FileInfo]]:
128
+ source_map: dict[str, FileInfo] = {info.relative_path: info for info in source}
129
+ target_map: dict[str, FileInfo] = {info.relative_path: info for info in target}
130
+
131
+ to_copy, to_delete = [], []
132
+
133
+ for relative_path, source_info in source_map.items():
134
+ target_info = target_map.get(relative_path)
135
+
136
+ if target_info is None:
137
+ to_copy.append(source_info)
138
+ elif source_info.is_dir != target_info.is_dir:
139
+ to_delete.append(target_info)
140
+ to_copy.append(source_info)
141
+ elif not source_info.is_dir and source_info.size != target_info.size:
142
+ to_copy.append(source_info)
143
+
144
+ for relative_path, target_info in target_map.items():
145
+ if relative_path not in source_map:
146
+ to_delete.append(target_info)
147
+
148
+ return to_copy, to_delete
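+ # Illustrative diff (hypothetical entries):
+ #   source: {'a.txt': file(3B), 'sub': dir}   target: {'a.txt': file(5B), 'old.txt': file}
+ #   -> to_copy   = [a.txt (size differs), sub (missing on target)]
+ #   -> to_delete = [old.txt (absent from source)]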
149
+
150
+
151
+ @dataclass
152
+ class _Options:
153
+ cache_root: str = "~/.cache/positronic/s3/"
154
+ show_progress: bool = True
155
+ max_workers: int = 10
156
+
157
+ def cache_path_for(self, remote: str) -> Path:
158
+ bucket, key = _parse_s3_url(remote)
159
+ cache_root = Path(self.cache_root).expanduser().resolve()
160
+ return cache_root / bucket / key
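+ # Illustrative mapping: 's3://bucket/data' -> <cache_root>/bucket/data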
161
+
162
+
163
+ @dataclass
164
+ class _DownloadRegistration:
165
+ remote: str
166
+ local_path: Path
167
+ delete: bool
168
+ exclude: list[str] | None
169
+ ready: threading.Event = field(default_factory=threading.Event)
170
+ error: Exception | None = None
171
+
172
+ def __eq__(self, other):
173
+ if not isinstance(other, _DownloadRegistration):
174
+ return False
175
+ return (
176
+ self.remote == other.remote
177
+ and self.local_path == other.local_path
178
+ and self.delete == other.delete
179
+ and self.exclude == other.exclude
180
+ )
181
+
182
+
183
+ @dataclass
184
+ class _UploadRegistration:
185
+ remote: str
186
+ local_path: Path
187
+ interval: int | None
188
+ delete: bool
189
+ sync_on_error: bool
190
+ exclude: list[str] | None
191
+ last_sync: float = 0.0
192
+
193
+ def __eq__(self, other):
194
+ if not isinstance(other, _UploadRegistration):
195
+ return False
196
+ return (
197
+ self.remote == other.remote
198
+ and self.local_path == other.local_path
199
+ and self.interval == other.interval
200
+ and self.delete == other.delete
201
+ and self.sync_on_error == other.sync_on_error
202
+ and self.exclude == other.exclude
203
+ )
204
+
205
+
206
+ _ACTIVE_MIRROR: ContextVar[_Mirror | None] = ContextVar("_ACTIVE_MIRROR", default=None)
207
+ _GLOBAL_MIRROR_LOCK = threading.RLock()
208
+ _GLOBAL_ACTIVE_MIRROR: _Mirror | None = None
209
+
210
+
211
+ class _Mirror:
212
+ def __init__(self, options: _Options):
213
+ self.options = options
214
+ self.cache_root = Path(self.options.cache_root).expanduser().resolve()
215
+ self.cache_root.mkdir(parents=True, exist_ok=True)
216
+
217
+ self.s3_client = boto3.client("s3")
218
+
219
+ self._downloads: dict[str, _DownloadRegistration] = {}
220
+ self._uploads: dict[str, _UploadRegistration] = {}
221
+ self._lock = threading.RLock()
222
+
223
+ self._stop_event: threading.Event | None = None
224
+ self._sync_thread: threading.Thread | None = None
225
+
226
+ @property
227
+ def running(self) -> bool:
228
+ return self._stop_event is not None
229
+
230
+ def start(self) -> None:
231
+ if not self.running:
232
+ self._stop_event = threading.Event()
233
+
234
+ def stop(self, had_error: bool = False) -> None:
235
+ if self.running:
236
+ self._stop_event.set()
237
+
238
+ if self._sync_thread:
239
+ self._sync_thread.join(timeout=60)
240
+ self._sync_thread = None
241
+ self._final_sync(had_error=had_error)
242
+ self._stop_event = None
243
+
244
+ def download(
245
+ self,
246
+ remote: str,
247
+ local: str | Path | None,
248
+ delete: bool,
249
+ exclude: list[str] | None = None,
250
+ ) -> Path:
251
+ """
252
+ Register (and perform if needed) a download from a remote S3 bucket path to a local directory or file.
253
+
254
+ Args:
255
+ remote (str): Source S3 URL (e.g., "s3://bucket/key/prefix") or local path.
256
+ If a local path is provided, it is validated and returned directly.
257
+ local (str | Path | None): Local directory or file destination. If None, uses cache path from options.
258
+ delete (bool): If True, deletes local files not present in S3.
259
+ exclude (list[str] | None): List of glob patterns to exclude from download.
260
+
261
+ Returns:
262
+ Path: The canonical local path associated with this download registration.
263
+
264
+ Raises:
265
+ FileNotFoundError: If remote is a local path that does not exist.
266
+ ValueError: If download registration conflicts with an existing download or upload or parameters differ.
267
+ """
268
+ if not _is_s3_path(remote):
269
+ path = Path(remote).expanduser().resolve()
+ if not path.exists():
+ raise FileNotFoundError(f"Local path does not exist: {path}")
270
+ return path
271
+
272
+ normalized = _normalize_s3_url(remote)
273
+ local_path = self.options.cache_path_for(remote) if local is None else Path(local).expanduser().resolve()
274
+ new_registration = _DownloadRegistration(
275
+ remote=normalized, local_path=local_path, delete=delete, exclude=exclude
276
+ )
277
+
278
+ with self._lock:
279
+ existing = self._downloads.get(normalized)
280
+ if existing:
281
+ if existing != new_registration:
282
+ raise ValueError(f"Download for '{normalized}' already registered with different parameters")
283
+ registration = existing
284
+ need_download = False
285
+ else:
286
+ self._check_download_conflicts(normalized)
287
+ self._downloads[normalized] = new_registration
288
+ registration = new_registration
289
+ need_download = True
290
+
291
+ if need_download:
292
+ try:
293
+ self._perform_download(normalized, local_path, delete, exclude)
294
+ except Exception as exc:
295
+ registration.error = exc
296
+ registration.ready.set()
297
+ with self._lock:
298
+ self._downloads.pop(normalized, None)
299
+ raise
300
+ else:
301
+ registration.ready.set()
302
+ else:
303
+ registration.ready.wait()
304
+ if registration.error is not None:
305
+ raise registration.error
306
+
307
+ return local_path
308
+
309
+ def upload(
310
+ self,
311
+ remote: str,
312
+ local: str | Path | None,
313
+ interval: int | None,
314
+ delete: bool,
315
+ sync_on_error: bool,
316
+ exclude: list[str] | None = None,
317
+ ) -> Path:
318
+ """
319
+ Register (and perform if needed) an upload from a local directory or file to a remote S3 bucket path.
320
+
321
+ Args:
322
+ remote (str): Destination S3 URL (e.g., "s3://bucket/key/prefix")
323
+ local (str | Path | None): Local directory or file to upload. If None, uses cache path from options.
324
+ interval (int | None): If set, enables periodic background uploads (seconds between syncs).
325
+ delete (bool): If True, deletes remote files not present locally.
326
+ sync_on_error (bool): If True, performs the final sync even if the mirror context exits with an exception.
327
+ exclude (list[str] | None): List of glob patterns to exclude from upload.
328
+
329
+ Returns:
330
+ Path: The canonical local path associated with this upload registration.
331
+
332
+ Raises:
333
+ ValueError: If upload registration conflicts with an existing download or upload or parameters differ.
334
+ """
335
+ if not _is_s3_path(remote):
336
+ path = Path(remote).expanduser().resolve()
337
+ path.mkdir(parents=True, exist_ok=True)
338
+ return path
339
+
340
+ normalized = _normalize_s3_url(remote)
341
+ local_path = self.options.cache_path_for(remote) if local is None else Path(local).expanduser().resolve()
+ # Create the local source directory if nothing exists yet (documented write-only behavior)
+ if not local_path.exists():
+ local_path.mkdir(parents=True, exist_ok=True)
342
+
343
+ new_registration = _UploadRegistration(
344
+ remote=normalized,
345
+ local_path=local_path,
346
+ interval=interval,
347
+ delete=delete,
348
+ sync_on_error=sync_on_error,
349
+ exclude=exclude,
350
+ last_sync=0,
351
+ )
352
+
353
+ with self._lock:
354
+ existing = self._uploads.get(normalized)
355
+ if existing:
356
+ if existing != new_registration:
357
+ raise ValueError(f"Upload for '{normalized}' already registered with different parameters")
358
+ return existing.local_path
359
+
360
+ self._check_upload_conflicts(new_registration)
361
+ self._uploads[normalized] = new_registration
362
+ if interval is not None:
363
+ self._ensure_background_thread_unlocked()
364
+
365
+ return local_path
366
+
367
+ def sync(
368
+ self,
369
+ remote: str,
370
+ local: str | Path | None,
371
+ interval: int | None,
372
+ delete_local: bool,
373
+ delete_remote: bool,
374
+ sync_on_error: bool,
375
+ exclude: list[str] | None = None,
376
+ ) -> Path:
377
+ local_path = self.download(remote, local, delete_local, exclude)
378
+ if not _is_s3_path(remote):
379
+ return local_path
380
+
381
+ normalized = _normalize_s3_url(remote)
382
+ # Unregister the download under the lock so an upload for the same remote can be registered
383
+ with self._lock:
+ self._downloads.pop(normalized, None)
384
+ return self.upload(remote, local_path, interval, delete_remote, sync_on_error, exclude)
385
+
386
+ def ls(self, prefix: str, recursive: bool = False) -> list[str]:
387
+ """Lists objects under the given prefix, working for both local directories and S3 prefixes."""
388
+ if _is_s3_path(prefix):
389
+ normalized = _normalize_s3_url(prefix)
390
+ bucket, key = _parse_s3_url(normalized)
391
+ # Ensure directory-like listing by appending '/' to avoid spurious prefix matches
392
+ if key:
393
+ key = key + "/"
394
+ items = []
395
+ for info in self._scan_s3(bucket, key):
396
+ if info.relative_path:
397
+ # Skip nested items if not recursive
398
+ if not recursive and "/" in info.relative_path:
399
+ continue
400
+ # Reconstruct the full S3 key
401
+ if key:
402
+ s3_key = key.rstrip("/") + "/" + info.relative_path
403
+ else:
404
+ s3_key = info.relative_path
405
+ items.append(f"s3://{bucket}/{s3_key}")
406
+ return items
407
+ else:
408
+ display_path = Path(prefix).expanduser()
409
+ scan_path = display_path.resolve()
410
+ items = []
411
+ for info in _scan_local(scan_path):
412
+ if info.relative_path:
413
+ # Skip nested items if not recursive
414
+ if not recursive and "/" in info.relative_path:
415
+ continue
416
+ items.append(str(display_path.joinpath(Path(info.relative_path))))
417
+ return items
418
+
419
+ def _check_download_conflicts(self, candidate: str) -> None:
420
+ for upload_remote in self._uploads:
421
+ if _s3_paths_conflict(candidate, upload_remote):
422
+ raise ValueError(f"Conflict: download '{candidate}' overlaps with upload '{upload_remote}'")
423
+
424
+ def _check_upload_conflicts(self, new_registration: _UploadRegistration) -> None:
425
+ candidate = new_registration.remote
426
+ for download_remote in self._downloads:
427
+ if _s3_paths_conflict(candidate, download_remote):
428
+ raise ValueError(f"Conflict: upload '{candidate}' overlaps with download '{download_remote}'")
429
+ for upload_remote, reg in self._uploads.items():
430
+ if _s3_paths_conflict(candidate, upload_remote):
431
+ same_remote = candidate == upload_remote
432
+ if not same_remote or reg != new_registration:
433
+ raise ValueError(f"Conflict: upload '{candidate}' overlaps with upload '{upload_remote}'")
434
+
435
+ def _ensure_background_thread_unlocked(self) -> None:
436
+ assert self.running, "The mirror must be started before performing any uploads"
437
+ if not self._sync_thread or not self._sync_thread.is_alive():
438
+ thread = threading.Thread(target=self._background_worker, name="positronic-s3-sync", daemon=True)
439
+ thread.start()
440
+ self._sync_thread = thread
441
+
442
+ def _background_worker(self) -> None:
443
+ while not self._stop_event.wait(1):
444
+ now = time.monotonic()
445
+ due: list[_UploadRegistration] = []
446
+ with self._lock:
447
+ for registration in self._uploads.values():
448
+ if registration.interval is not None and now - registration.last_sync >= registration.interval:
449
+ registration.last_sync = now
450
+ due.append(registration)
451
+
452
+ self._sync_uploads(due)
453
+
454
+ def _final_sync(self, had_error: bool = False) -> None:
455
+ with self._lock:
456
+ uploads = list(self._uploads.values())
457
+ if had_error:
458
+ uploads = [u for u in uploads if u.sync_on_error]
459
+ self._sync_uploads(uploads)
460
+
461
+ def _sync_uploads(self, registrations: Iterable[_UploadRegistration]) -> None:
462
+ tasks: list[tuple[str, Path, bool, list[str] | None]] = []
463
+ for registration in registrations:
464
+ if registration.local_path.exists():
465
+ tasks.append(
466
+ (
467
+ registration.remote,
468
+ registration.local_path,
469
+ registration.delete,
470
+ registration.exclude,
471
+ )
472
+ )
473
+
474
+ if not tasks:
475
+ return
476
+
477
+ to_put: list[tuple[FileInfo, Path, str, str]] = []
478
+ to_remove: list[tuple[str, str]] = []
479
+ total_bytes = 0
480
+
481
+ for remote, local_path, delete, exclude in tasks:
482
+ logger.debug("Syncing upload: %s from %s (delete=%s)", remote, local_path, delete)
483
+ bucket, prefix = _parse_s3_url(remote)
484
+ to_copy, to_delete = _compute_sync_diff(
485
+ _filter_fileinfo(_scan_local(local_path), exclude),
486
+ _filter_fileinfo(self._scan_s3(bucket, prefix), exclude),
487
+ )
488
+
489
+ for info in to_copy:
490
+ s3_key = prefix + ("/" + info.relative_path if info.relative_path else "")
491
+ to_put.append((info, local_path, bucket, s3_key))
492
+ total_bytes += info.size
493
+
494
+ for info in to_delete if delete else []:
495
+ s3_key = prefix + ("/" + info.relative_path if info.relative_path else "")
496
+ to_remove.append((bucket, s3_key))
497
+
498
+ if to_put:
499
+ with (
500
+ self._progress_bar(total_bytes, f"Uploading {remote}") as pbar,
501
+ ThreadPoolExecutor(max_workers=self.options.max_workers) as executor,
502
+ ):
503
+ futures = [
504
+ executor.submit(self._put_to_s3, info, local_path, bucket, key, pbar)
505
+ for info, local_path, bucket, key in to_put
506
+ ]
507
+ _process_futures(as_completed(futures), "Upload")
508
+
509
+ if to_remove:
510
+ to_remove_sorted = sorted(to_remove, key=lambda x: x[1].count("/"), reverse=True)
511
+ with ThreadPoolExecutor(max_workers=self.options.max_workers) as executor:
512
+ futures = [executor.submit(self._remove_from_s3, bucket, key) for bucket, key in to_remove_sorted]
513
+ iterator = as_completed(futures)
514
+ if self.options.show_progress:
515
+ iterator = tqdm(
516
+ iterator,
517
+ total=len(to_remove_sorted),
518
+ desc=f"Deleting in {remote}",
519
+ )
520
+ _process_futures(iterator, "Delete")
521
+
522
+ def _perform_download(self, remote: str, local_path: Path, delete: bool, exclude: list[str] | None) -> None:
523
+ bucket, prefix = _parse_s3_url(remote)
524
+ logger.debug(
525
+ "Performing download: s3://%s/%s to %s (delete=%s)",
526
+ bucket,
527
+ prefix,
528
+ local_path,
529
+ delete,
530
+ )
531
+ to_copy, to_delete = _compute_sync_diff(
532
+ _filter_fileinfo(self._scan_s3(bucket, prefix), exclude),
533
+ _filter_fileinfo(_scan_local(local_path), exclude),
534
+ )
535
+
536
+ to_put: list[tuple[FileInfo, str, str, Path]] = []
537
+ to_remove: list[Path] = []
538
+ total_bytes = 0
539
+
540
+ for info in to_copy:
541
+ s3_key = prefix + ("/" + info.relative_path if info.relative_path else "")
542
+ to_put.append((info, bucket, s3_key, local_path))
543
+ total_bytes += info.size
544
+
545
+ if delete:
546
+ logger.debug("Will delete %d local items not in S3", len(to_delete))
547
+ for info in to_delete if delete else []:
548
+ target = local_path / info.relative_path if info.relative_path else local_path
549
+ logger.debug("Marking for local deletion: %s", target)
550
+ to_remove.append(target)
551
+
552
+ if to_put:
553
+ with (
554
+ self._progress_bar(total_bytes, f"Downloading {remote}") as pbar,
555
+ ThreadPoolExecutor(max_workers=self.options.max_workers) as executor,
556
+ ):
557
+ futures = [executor.submit(self._put_locally, *args, pbar) for args in to_put]
558
+ _process_futures(as_completed(futures), "Download")
559
+
560
+ if to_remove:
561
+ to_remove_sorted = sorted(to_remove, key=lambda x: len(x.parts), reverse=True)
562
+ iterator = to_remove_sorted
563
+ if self.options.show_progress:
564
+ iterator = tqdm(iterator, desc=f"Deleting in {remote}")
565
+ for path in iterator:
566
+ self._remove_locally(path)
567
+
568
+ def _list_s3_objects(self, bucket: str, key: str) -> Iterator[dict]:
569
+ logger.debug("Listing S3 objects: bucket=%s, key=%s", bucket, key)
570
+ # Skip head_object for directory-like keys ending with '/'
571
+ # as we want to list contents, not check if the directory marker exists
572
+ if not key.endswith("/"):
573
+ try:
574
+ obj = self.s3_client.head_object(Bucket=bucket, Key=key)
575
+ except ClientError as exc:
576
+ error_code = exc.response["Error"]["Code"]
577
+ if error_code != "404":
578
+ raise
579
+ else:
580
+ logger.debug("Found single object via head_object: %s", key)
581
+ if "ContentLength" in obj and "Size" not in obj:
582
+ obj["Size"] = obj["ContentLength"]
583
+ yield {**obj, "Key": key}
584
+ return
585
+
586
+ paginator = self.s3_client.get_paginator("list_objects_v2")
587
+ for page in paginator.paginate(Bucket=bucket, Prefix=key):
588
+ objects = page.get("Contents", [])
589
+ logger.debug("Listed %d objects with prefix %s", len(objects), key)
590
+ yield from objects
591
+
592
+ def _scan_s3(self, bucket: str, prefix: str) -> Iterator[FileInfo]:
593
+ logger.debug("Scanning S3: s3://%s/%s", bucket, prefix)
594
+ seen_dirs: set[str] = set()
595
+ has_content = False
+ root_is_file = False
596
+
597
+ for obj in self._list_s3_objects(bucket, prefix):
598
+ has_content = True
599
+ key = obj["Key"]
600
+ relative = key[len(prefix) :].lstrip("/")
601
+
602
+ if key.endswith("/"):
603
+ relative = relative.rstrip("/")
604
+ if relative:
605
+ yield FileInfo(relative_path=relative, size=0, is_dir=True)
606
+ seen_dirs.add(relative)
607
+ else:
608
+ if not relative:
+ # The prefix itself names a single object; suppress the root directory marker below
+ root_is_file = True
+ yield FileInfo(relative_path=relative, size=obj["Size"], is_dir=False)
609
+
610
+ if "/" in relative:
611
+ parts = relative.split("/")
612
+ for i in range(len(parts) - 1):
613
+ dir_path = "/".join(parts[: i + 1])
614
+ if dir_path and dir_path not in seen_dirs:
615
+ yield FileInfo(relative_path=dir_path, size=0, is_dir=True)
616
+ seen_dirs.add(dir_path)
617
+
618
+ if has_content and not root_is_file:
619
+ yield FileInfo(
620
+ relative_path="", size=0, is_dir=True
621
+ ) # Yield root directory marker for symmetry with _scan_local
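+ # Illustrative walk (hypothetical key): scanning prefix 'data' when only
+ # 'data/a/b.txt' exists yields FileInfo('a/b.txt', file), a synthesized
+ # FileInfo('a', dir), and finally the FileInfo('', dir) root marker.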
622
+
623
+ def _progress_bar(self, total_bytes: int, desc: str):
624
+ if not self.options.show_progress:
625
+ return _NullTqdm()
626
+ return tqdm(total=total_bytes, unit="B", unit_scale=True, unit_divisor=1024, desc=desc)
627
+
628
+ def _put_to_s3(self, info: FileInfo, local_path: Path, bucket: str, key: str, pbar) -> None:
629
+ try:
630
+ if info.is_dir:
631
+ key += "/" if not key.endswith("/") else ""
632
+ self.s3_client.put_object(Bucket=bucket, Key=key, Body=b"")
633
+ else:
634
+ file_path = local_path / info.relative_path if info.relative_path else local_path
635
+ self.s3_client.upload_file(str(file_path), bucket, key, Callback=pbar.update)
636
+ except Exception as exc:
637
+ logger.error("Failed to put %s to %s/%s: %s", local_path, bucket, key, exc)
638
+ raise
639
+
640
+ def _remove_from_s3(self, bucket: str, key: str) -> None:
641
+ try:
642
+ self.s3_client.delete_object(Bucket=bucket, Key=key)
643
+ except Exception as exc:
644
+ logger.error("Failed to remove %s/%s: %s", bucket, key, exc)
645
+ raise
646
+
647
+ def _put_locally(self, info: FileInfo, bucket: str, key: str, local_path: Path, pbar) -> None:
648
+ try:
649
+ target = local_path / info.relative_path if info.relative_path else local_path
650
+ if info.is_dir:
651
+ target.mkdir(parents=True, exist_ok=True)
652
+ else:
653
+ target.parent.mkdir(parents=True, exist_ok=True)
654
+ self.s3_client.download_file(bucket, key, str(target), Callback=pbar.update)
655
+ except Exception as exc:
656
+ logger.error("Failed to put %s locally: %s", key, exc)
657
+ raise
658
+
659
+ def _remove_locally(self, path: Path) -> None:
660
+ try:
661
+ if path.is_dir():
662
+ shutil.rmtree(path)
663
+ else:
664
+ path.unlink()
665
+ except Exception as exc:
666
+ logger.error("Failed to remove %s: %s", path, exc)
667
+ raise
668
+
669
+
670
+ @contextmanager
671
+ def mirror(
672
+ cache_root: str = "~/.cache/positronic/s3/",
673
+ show_progress: bool = True,
674
+ max_workers: int = 10,
675
+ ):
676
+ """
677
+ Context manager that activates the sync environment.
678
+
679
+ Args:
680
+ cache_root: Base directory for caching downloaded files.
681
+ show_progress: Display tqdm progress bars.
682
+ max_workers: Threads for parallel S3 operations.
683
+ """
684
+ global _GLOBAL_ACTIVE_MIRROR
685
+ options = _Options(cache_root=cache_root, show_progress=show_progress, max_workers=max_workers)
686
+
687
+ with _GLOBAL_MIRROR_LOCK:
688
+ if _GLOBAL_ACTIVE_MIRROR is not None:
689
+ raise RuntimeError("Mirror already active")
690
+
691
+ mirror_obj = _Mirror(options)
692
+ mirror_obj.start()
693
+ _GLOBAL_ACTIVE_MIRROR = mirror_obj
694
+
695
+ token = _ACTIVE_MIRROR.set(mirror_obj)
696
+ had_error = False
697
+ try:
698
+ yield
699
+ except Exception:
700
+ had_error = True
701
+ raise
702
+ finally:
703
+ try:
704
+ mirror_obj.stop(had_error=had_error)
705
+ finally:
706
+ with _GLOBAL_MIRROR_LOCK:
707
+ _GLOBAL_ACTIVE_MIRROR = None
708
+ _ACTIVE_MIRROR.reset(token)
709
+
710
+
711
+ def with_mirror(
712
+ cache_root: str = "~/.cache/positronic/s3/",
713
+ show_progress: bool = True,
714
+ max_workers: int = 10,
715
+ ):
716
+ """
717
+ Decorator equivalent of mirror() for wrapping functions.
718
+ See mirror() for argument details.
719
+ """
720
+
721
+ def decorator(func):
722
+ @wraps(func)
723
+ def wrapper(*args, **kwargs):
724
+ with mirror(
725
+ cache_root=cache_root,
726
+ show_progress=show_progress,
727
+ max_workers=max_workers,
728
+ ):
729
+ return func(*args, **kwargs)
730
+
731
+ return wrapper
732
+
733
+ return decorator
734
+
735
+
736
+ def _require_active_mirror() -> _Mirror:
737
+ mirror_obj = _ACTIVE_MIRROR.get()
738
+ if mirror_obj is not None:
739
+ return mirror_obj
740
+
741
+ global _GLOBAL_ACTIVE_MIRROR
742
+ if _GLOBAL_ACTIVE_MIRROR is not None:
743
+ return _GLOBAL_ACTIVE_MIRROR
744
+
745
+ raise RuntimeError("No active mirror context")
746
+
747
+
748
+ def download(
749
+ remote: str,
750
+ local: str | Path | None = None,
751
+ delete: bool = True,
752
+ exclude: list[str] | None = None,
753
+ ) -> Path:
754
+ """
755
+ Register a path for download. Ensures local copy matches S3 immediately.
756
+
757
+ Args:
758
+ remote: S3 URL or local path.
759
+ local: Explicit local destination. Defaults to standard cache path.
760
+ delete: If True (default), deletes local files NOT in S3 ("mirror" behavior).
761
+ exclude: List of glob patterns to skip.
762
+
763
+ Returns:
764
+ Path to the local directory/file.
765
+ """
766
+ mirror_obj = _require_active_mirror()
767
+ return mirror_obj.download(remote, local, delete, exclude)
768
+
769
+
770
+ def upload(
771
+ remote: str,
772
+ local: str | Path | None = None,
773
+ interval: int | None = 300,
774
+ delete: bool = True,
775
+ sync_on_error: bool = False,
776
+ exclude: list[str] | None = None,
777
+ ) -> Path:
778
+ """
779
+ Register a local path for upload. Uploads on exit and optionally in background.
780
+
781
+ Args:
782
+ remote: Destination S3 URL.
783
+ local: Local source path. Auto-resolved from cache path if None.
784
+ interval: Seconds between background syncs. None for exit-only.
785
+ delete: If True (default), deletes S3 files NOT present locally.
786
+ sync_on_error: If True, syncs even if the context exits with an exception.
787
+
788
+ Returns:
789
+ Path to the local directory/file.
790
+ """
791
+ mirror_obj = _require_active_mirror()
792
+ return mirror_obj.upload(remote, local, interval, delete, sync_on_error, exclude)
793
+
794
+
795
+ def sync(
796
+ remote: str,
797
+ local: str | Path | None = None,
798
+ interval: int | None = 300,
799
+ delete_local: bool = True,
800
+ delete_remote: bool = True,
801
+ sync_on_error: bool = False,
802
+ exclude: list[str] | None = None,
803
+ ) -> Path:
804
+ """
805
+ Bi-directional helper. Performs download() then registers upload().
806
+
807
+ Args:
808
+ delete_local: Clean up local files during download.
809
+ delete_remote: Clean up remote files during upload.
810
+
811
+ Returns:
812
+ Path to the local directory/file.
813
+ """
814
+ mirror_obj = _require_active_mirror()
815
+ return mirror_obj.sync(remote, local, interval, delete_local, delete_remote, sync_on_error, exclude)
816
+
817
+
818
+ def ls(prefix: str, recursive: bool = False) -> list[str]:
819
+ """
820
+ Lists files/objects in a directory or S3 prefix.
821
+
822
+ Args:
823
+ prefix: S3 URL or local path.
824
+ recursive: List subdirectories if True.
825
+
826
+ Returns:
827
+ List of full S3 URLs or local paths.
828
+ """
829
+ mirror_obj = _require_active_mirror()
830
+ return mirror_obj.ls(prefix, recursive)
831
+
832
+
833
+ __all__ = ["mirror", "download", "upload", "sync", "ls", "_parse_s3_url"]
pos3-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,152 @@
1
+ Metadata-Version: 2.4
2
+ Name: pos3
3
+ Version: 0.1.0
4
+ Summary: S3 Simple Sync - Make using S3 as simple as using local files
5
+ Author-email: Positronic Robotics <hi@positronic.ro>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/Positronic-Robotics/pos3
8
+ Project-URL: Repository, https://github.com/Positronic-Robotics/pos3
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: >=3.9
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: boto3>=1.26.0
16
+ Requires-Dist: tqdm>=4.65.0
17
+ Provides-Extra: dev
18
+ Requires-Dist: pytest>=7.0; extra == "dev"
19
+ Requires-Dist: pytest-cov; extra == "dev"
20
+ Requires-Dist: ruff; extra == "dev"
21
+ Requires-Dist: pre-commit; extra == "dev"
22
+
23
+ # pos3
24
+
25
+ **PO**sitronic **S3** — Make using S3 as simple as using local files.
26
+
27
+ `pos3` provides a Pythonic context manager for syncing directories and files with S3. It is designed for data processing pipelines and machine learning workflows where you need to integrate S3 with code that **only understands local files**.
28
+
29
+ > The main value of `pos3` is enabling you to pass S3 data to **third-party libraries or legacy scripts** that expect local file paths (e.g., `opencv`, `pandas.read_csv`, or model training scripts). Instead of rewriting their I/O logic to support S3, `pos3` transparently bridges the gap.
30
+
31
+ ## Core Concepts
32
+
33
+ - **Context Manager**: All operations run within a `with pos3.mirror():` block.
34
+ - **Enter**: Initializes the sync environment (threads, cache).
35
+ - **Body**: You explicitly call `pos3.download()` to fetch files and `pos3.upload()` to register outputs.
36
+ - **Exit**: Uploads registered output paths (mirroring local to S3).
37
+ - **Lazy & Efficient**: Only transfers files that have changed (based on size/presence).
38
+ - **Local Paths**: `download`, `upload`, and `sync` return a `pathlib.Path` to the local file/directory. If you pass a local path instead of an S3 URL, it is passed through unchanged (no copy).
39
+ - **Background Sync**: Can optionally upload changes in the background (e.g., every 60s) for long-running jobs.
40
+
41
+ ## Quick Start
42
+
43
+ The primary API is the `pos3.mirror()` context manager.
44
+
45
+ ```python
46
+ import pos3
47
+
48
+ # 1. Start the context
49
+ with pos3.mirror(cache_root='~/.cache/positronic/s3'):
50
+
51
+ # 2. Download Input
52
+ # - Downloads s3://bucket/data to cache
53
+ # - Deletes local files that don't exist in S3 (mirroring)
54
+ # - Returns local Path object
55
+ dataset_path = pos3.download('s3://bucket/data')
56
+
57
+ # 3. Sync Output (Resume & Upload)
58
+ # - Downloads existing checkpoints (to resume)
59
+ # - Registers path for background uploads
60
+ checkpoints_path = pos3.sync('s3://bucket/ckpt', interval=60, delete_remote=False)
61
+
62
+ # 4. Upload Logs (Write-only)
63
+ # - Creates local directory
64
+ # - Uploads new files to S3 on exit/interval
65
+ logs_path = pos3.upload('s3://bucket/logs', interval=30)
66
+
67
+ # 5. Use standard local file paths
68
+ print(f"Reading from {dataset_path}") # -> ~/.cache/positronic/s3/bucket/data
69
+ print(f"Writing to {checkpoints_path}") # -> ~/.cache/positronic/s3/bucket/ckpt
70
+ print(f"Logging to {logs_path}") # -> ~/.cache/positronic/s3/bucket/logs
71
+
72
+ train(dataset_path, checkpoints_path, logs_path)
73
+ ```
74
+
75
+ ## API Guide
76
+
77
+ > **Note**: All operational methods (`download`, `upload`, `sync`, `ls`) must be called within an active `pos3.mirror()` context. Calling them outside will raise a `RuntimeError`.
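+
+ For example (a sketch; the bucket name is made up):
+
+ ```python
+ import pos3
+
+ pos3.ls('s3://bucket/')  # RuntimeError: No active mirror context
+ ```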
78
+
79
+ ### `pos3.mirror(...)` / `@pos3.with_mirror(...)`
80
+
81
+ Context manager (or decorator) that activates the sync environment.
82
+
83
+ **Parameters:**
84
+ - `cache_root` (default: `'~/.cache/positronic/s3/'`): Base directory for caching downloaded files.
85
+ - `show_progress` (default: `True`): Display tqdm progress bars.
86
+ - `max_workers` (default: `10`): Threads for parallel S3 operations.
87
+
88
+ **Decorator Example:**
89
+
90
+ ```python
91
+ @pos3.with_mirror(cache_root='/tmp/cache')
92
+ def main():
93
+ # The mirror is active only while main() runs
94
+ data_path = pos3.download('s3://bucket/data')
95
+ train(data_path)
96
+
97
+ if __name__ == "__main__":
98
+ main()
99
+ ```
100
+
101
+ ### `pos3.download(remote, local=None, delete=True, exclude=None)`
102
+
103
+ Registers a path for download. Ensures local copy matches S3 immediately.
104
+ - `remote`: S3 URL (e.g., `s3://bucket/key`) or local path.
105
+ - `local`: Explicit local destination. Defaults to standard cache path.
106
+ - `delete`: If `True` (default), deletes local files NOT in S3 ("mirror" behavior).
107
+ - `exclude`: List of glob patterns to skip.
108
+
109
+ **Returns**: `pathlib.Path` to the local directory/file.
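+
+ A minimal sketch (the bucket, the `*.tmp` pattern, and `process()` are illustrative):
+
+ ```python
+ import pos3
+
+ with pos3.mirror():
+     raw_dir = pos3.download('s3://bucket/raw', exclude=['*.tmp'])
+     for path in raw_dir.rglob('*.csv'):  # plain local-file APIs from here on
+         process(path)
+ ```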
110
+
111
+ ### `pos3.upload(remote, local=None, interval=300, delete=True, sync_on_error=False, exclude=None)`
112
+
113
+ Registers a local path for upload. Uploads on exit and optionally in background.
114
+ - `remote`: Destination S3 URL.
115
+ - `local`: Local source path. Auto-resolved from cache path if `None`.
116
+ - `interval`: Seconds between background syncs. `None` for exit-only.
117
+ - `delete`: If `True` (default), deletes S3 files NOT present locally.
118
+ - `sync_on_error`: If `True`, syncs even if the context exits with an exception.
119
+
120
+ **Returns**: `pathlib.Path` to the local directory/file.
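+
+ A minimal sketch (bucket and file names are made up):
+
+ ```python
+ import json
+ import pos3
+
+ with pos3.mirror():
+     out_dir = pos3.upload('s3://bucket/results', interval=None)  # upload on exit only
+     (out_dir / 'metrics.json').write_text(json.dumps({'loss': 0.1}))
+ # On exit, metrics.json is mirrored to s3://bucket/results/metrics.json
+ ```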
121
+
122
+ ### `pos3.sync(remote, local=None, interval=300, delete_local=True, delete_remote=True, sync_on_error=False, exclude=None)`
123
+
124
+ Bi-directional helper. Performs `download()` then registers `upload()`. Useful for jobs that work on existing files, like when you resume training from a checkpoint.
125
+ - `delete_local`: Clean up local files during download.
126
+ - `delete_remote`: Clean up remote files during upload. Consider setting this to `False` when resuming jobs to avoid deleting history.
127
+
128
+ **Returns**: `pathlib.Path` to the local directory/file.
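+
+ A resume-style sketch (the bucket and `train()` are hypothetical):
+
+ ```python
+ import pos3
+
+ with pos3.mirror():
+     # Pull existing checkpoints, then push new ones every 60s;
+     # delete_remote=False keeps remote history intact.
+     ckpt = pos3.sync('s3://bucket/ckpt', interval=60, delete_remote=False)
+     train(resume_from=ckpt)
+ ```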
129
+
130
+ ### `pos3.ls(prefix, recursive=False)`
131
+
132
+ Lists files/objects in a directory or S3 prefix.
133
+ - `prefix`: S3 URL or local path.
134
+ - `recursive`: List subdirectories if `True`.
135
+
136
+ **Returns**: List of full S3 URLs or local paths.
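+
+ A quick sketch (the prefix and object names are illustrative):
+
+ ```python
+ import pos3
+
+ with pos3.mirror():
+     for url in pos3.ls('s3://bucket/data'):
+         print(url)  # e.g. s3://bucket/data/part-0001.parquet
+ ```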
137
+
138
+ ## Comparison with Libraries
139
+
140
+ Why use `pos3` instead of other Python libraries?
141
+
142
+ | Feature | `pos3` | `boto3` | `s3fs` / `fsspec` |
143
+ | :--- | :--- | :--- | :--- |
144
+ | **Abstraction Level** | **High** (Context Manager) | **Low** (API Client) | **Medium** (File System) |
145
+ | **Sync Logic** | **Built-in** (Differential) | Manual Implementation | `put`/`get` (Recursive) |
146
+ | **Lifecycle** | **Automated** (Open/Close) | Manual | Manual |
147
+ | **Background Upload** | **Yes** (Non-blocking) | Manual Threading | No (Blocking) |
148
+ | **Local I/O Speed** | **Native** (SSD) | Native | Network Bound (Virtual FS) |
149
+ | **Use Case** | **ML / Pipelines / 3rd Party Code** | App Development | DataFrames / Interactive |
150
+
151
+ - **vs `boto3`**: `boto3` is the raw AWS SDK. `pos3` wraps it to provide "mirroring" logic, threading, and diffing out of the box.
152
+ - **vs `s3fs`**: `s3fs` treats S3 as a filesystem. `pos3` treats S3 as a persistence layer for your high-speed local storage, ensuring you always get native I/O performance.
pos3-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,5 @@
1
+ pos3/__init__.py,sha256=rxcmD1K5M9zvBqDhIX1guSFxgcm6XOSBhLTbY5wNzkk,30453
2
+ pos3-0.1.0.dist-info/METADATA,sha256=9XUDlEQjdkmJt7WX0PDeThq5X1ZplpEnJnNRrj03V7M,6940
3
+ pos3-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
4
+ pos3-0.1.0.dist-info/top_level.txt,sha256=JWOpXHz1F6cbH0nfanGWLaozt8RJFRmv5H3eKkxz7e8,5
5
+ pos3-0.1.0.dist-info/RECORD,,
pos3-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
pos3-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ pos3