PyPI - s3duct - Versions diffs - 0.3.0__py3-none-any.whl - Mend

s3duct 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

s3duct/__init__.py +3 -0
s3duct/backends/__init__.py +0 -0
s3duct/backends/base.py +62 -0
s3duct/backends/local.py +72 -0
s3duct/backends/s3.py +150 -0
s3duct/backpressure.py +86 -0
s3duct/chunker.py +114 -0
s3duct/cli.py +256 -0
s3duct/config.py +20 -0
s3duct/downloader.py +273 -0
s3duct/encryption.py +153 -0
s3duct/integrity.py +86 -0
s3duct/manifest.py +82 -0
s3duct/py.typed +0 -0
s3duct/resume.py +148 -0
s3duct/thaw.py +13 -0
s3duct/uploader.py +560 -0
s3duct-0.3.0.dist-info/METADATA +411 -0
s3duct-0.3.0.dist-info/RECORD +23 -0
s3duct-0.3.0.dist-info/WHEEL +5 -0
s3duct-0.3.0.dist-info/entry_points.txt +2 -0
s3duct-0.3.0.dist-info/licenses/LICENSE +93 -0
s3duct-0.3.0.dist-info/top_level.txt +1 -0

s3duct/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""s3duct - Chunked, resumable, encrypted pipe to object storage."""
+# Version string of the s3duct package.
+__version__ = "0.3.0"

s3duct/backends/__init__.py ADDED Viewed

File without changes

s3duct/backends/base.py ADDED Viewed

@@ -0,0 +1,62 @@
+"""Abstract storage backend interface."""
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class ObjectInfo:
+    """Metadata describing a single stored object, as returned by a StorageBackend."""
+    key: str  # object key, relative to the backend's configured prefix
+    size: int  # object size in bytes
+    etag: str  # backend-reported ETag (S3 "ETag" field, or an MD5 hex digest for LocalBackend)
+    storage_class: str | None = None  # S3 "StorageClass" value when reported; LocalBackend leaves it None
+    restore_status: str | None = None  # raw S3 "Restore" value from head_object; None when absent
+class StorageBackend(ABC):
+    """Abstract interface every object-storage backend must implement.
+    Keys passed to and returned from these methods are backend-relative:
+    the concrete backends in this package prepend their own configured
+    prefix before touching storage and strip it again when listing.
+    """
+    @abstractmethod
+    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
+        """Upload the file at ``file_path`` under ``key``. Returns ETag.
+        ``storage_class`` is an optional backend-specific storage class
+        name; backends without such a concept may ignore it.
+        """
+        ...
+    @abstractmethod
+    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
+        """Upload raw bytes under ``key``. Returns ETag.
+        ``storage_class`` has the same meaning as in ``upload``.
+        """
+        ...
+    @abstractmethod
+    def download(self, key: str, dest_path: Path) -> None:
+        """Download the object stored under ``key`` to the local file ``dest_path``."""
+        ...
+    @abstractmethod
+    def download_bytes(self, key: str) -> bytes:
+        """Download the object stored under ``key`` and return its content as bytes."""
+        ...
+    @abstractmethod
+    def list_objects(self, prefix: str) -> list[ObjectInfo]:
+        """List objects whose key starts with ``prefix``."""
+        ...
+    @abstractmethod
+    def head_object(self, key: str) -> ObjectInfo:
+        """Get metadata for a single object without fetching its content to the caller."""
+        ...
+    @abstractmethod
+    def delete_object(self, key: str) -> None:
+        """Delete a single object."""
+        ...
+    @abstractmethod
+    def initiate_restore(self, key: str, days: int, tier: str) -> None:
+        """Request restore of a Glacier/GDA object.
+        ``days`` and ``tier`` are forwarded to the backend's restore
+        request; backends with no archival tier may treat this as a no-op.
+        """
+        ...
+    @abstractmethod
+    def is_restore_complete(self, key: str) -> bool:
+        """Check if a Glacier restore is complete."""
+        ...

s3duct/backends/local.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Local filesystem storage backend for development and testing."""
+import hashlib
+import shutil
+from pathlib import Path
+from s3duct.backends.base import ObjectInfo, StorageBackend
+class LocalBackend(StorageBackend):
+    """Stores objects as files in a local directory tree.
+    Every key is mapped to ``root / (prefix + key)``. ETags are the MD5
+    hex digest of the file content.
+    """
+    # Block size used when hashing files, so large chunks are never
+    # loaded into memory in one piece.
+    _HASH_BLOCK_SIZE = 1024 * 1024
+    def __init__(self, root: Path, prefix: str = "") -> None:
+        """Create the backend and make sure ``root`` exists.
+        Args:
+            root: Directory under which all objects are stored.
+            prefix: Optional key prefix; surrounding slashes are
+                normalised so exactly one trailing "/" is kept.
+        """
+        self._root = root
+        # Strip leading slashes as well: joining ``root`` with a segment
+        # that starts with "/" would discard ``root`` and write outside it.
+        stripped = prefix.strip("/")
+        self._prefix = f"{stripped}/" if stripped else ""
+        self._root.mkdir(parents=True, exist_ok=True)
+    def _full_path(self, key: str) -> Path:
+        """Return the filesystem path that backs ``key``."""
+        return self._root / f"{self._prefix}{key}"
+    @staticmethod
+    def _etag(data: bytes) -> str:
+        """Return the MD5 hex digest of ``data``."""
+        return hashlib.md5(data).hexdigest()
+    @classmethod
+    def _file_etag(cls, path: Path) -> str:
+        """Return the MD5 hex digest of the file at ``path``, read in blocks."""
+        digest = hashlib.md5()
+        with open(path, "rb") as f:
+            while block := f.read(cls._HASH_BLOCK_SIZE):
+                digest.update(block)
+        return digest.hexdigest()
+    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
+        """Copy ``file_path`` into the store under ``key``. Returns ETag.
+        ``storage_class`` is accepted for interface compatibility and ignored.
+        """
+        dest = self._full_path(key)
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(file_path, dest)
+        return self._file_etag(dest)
+    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
+        """Write ``data`` into the store under ``key``. Returns ETag.
+        ``storage_class`` is accepted for interface compatibility and ignored.
+        """
+        dest = self._full_path(key)
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        dest.write_bytes(data)
+        return self._etag(data)
+    def download(self, key: str, dest_path: Path) -> None:
+        """Copy the object stored under ``key`` to ``dest_path``."""
+        src = self._full_path(key)
+        shutil.copy2(src, dest_path)
+    def download_bytes(self, key: str) -> bytes:
+        """Return the full content of the object stored under ``key``."""
+        return self._full_path(key).read_bytes()
+    def list_objects(self, prefix: str) -> list[ObjectInfo]:
+        """List stored objects whose key starts with ``prefix``.
+        Returned keys have the backend prefix removed and are sorted so
+        the result does not depend on directory traversal order.
+        """
+        full_prefix = f"{self._prefix}{prefix}"
+        results = []
+        for p in self._root.rglob("*"):
+            if not p.is_file():
+                continue
+            # Keys always use "/" separators, so compare against the POSIX
+            # form of the path rather than the OS-native one.
+            rel = p.relative_to(self._root).as_posix()
+            if not rel.startswith(full_prefix):
+                continue
+            results.append(ObjectInfo(
+                key=rel.removeprefix(self._prefix),
+                size=p.stat().st_size,
+                etag=self._file_etag(p),
+            ))
+        results.sort(key=lambda info: info.key)
+        return results
+    def head_object(self, key: str) -> ObjectInfo:
+        """Return size and ETag for ``key``.
+        Raises:
+            FileNotFoundError: if no object is stored under ``key``.
+        """
+        p = self._full_path(key)
+        return ObjectInfo(key=key, size=p.stat().st_size, etag=self._file_etag(p))
+    def delete_object(self, key: str) -> None:
+        """Remove the object stored under ``key``; a missing object is not an error."""
+        self._full_path(key).unlink(missing_ok=True)
+    def initiate_restore(self, key: str, days: int, tier: str) -> None:
+        """No-op: local files never need restoring."""
+        pass  # no-op for local storage
+    def is_restore_complete(self, key: str) -> bool:
+        """Always True: local files are immediately readable."""
+        return True  # always available locally

s3duct/backends/s3.py ADDED Viewed

@@ -0,0 +1,150 @@
+"""AWS S3 storage backend."""
+import time
+from pathlib import Path
+import boto3
+from botocore.exceptions import (
+    ClientError,
+    ConnectionClosedError,
+    ConnectTimeoutError,
+    EndpointConnectionError,
+    ReadTimeoutError,
+)
+from s3duct.backends.base import ObjectInfo, StorageBackend
+from s3duct.config import MAX_RETRY_ATTEMPTS, RETRY_BASE_DELAY, RETRY_MAX_DELAY
+# Errors worth retrying: API errors + connection-level failures.
+# NOTE(review): ClientError is listed as a whole, so presumably permanent API
+# errors (missing key, access denied, ...) are retried as well — confirm that
+# this is intended. ConnectionError is a subclass of OSError in Python 3, so
+# listing both is redundant but harmless.
+_RETRYABLE = (ClientError, ConnectionClosedError, ConnectTimeoutError,
+              EndpointConnectionError, ReadTimeoutError, ConnectionError, OSError)
+class S3Backend(StorageBackend):
+    """StorageBackend implementation on top of a boto3 S3 client.
+    Every key is prefixed with the configured ``prefix`` before it is sent
+    to S3. Uploads and downloads are retried with exponential backoff on
+    the exception types listed in ``_RETRYABLE``.
+    """
+    def __init__(self, bucket: str, region: str | None = None, prefix: str = "",
+                 endpoint_url: str | None = None,
+                 max_retries: int = MAX_RETRY_ATTEMPTS,
+                 retry_base_delay: float = RETRY_BASE_DELAY,
+                 retry_max_delay: float = RETRY_MAX_DELAY) -> None:
+        """Create the S3 client.
+        Args:
+            bucket: Name of the bucket all operations target.
+            region: Region name passed to ``boto3.Session``.
+            prefix: Key prefix; normalised to end with exactly one "/".
+            endpoint_url: Optional custom endpoint passed to the client.
+            max_retries: Total number of attempts per retried operation.
+            retry_base_delay: Delay in seconds before the first retry;
+                doubled on every further retry.
+            retry_max_delay: Upper bound in seconds for a single delay.
+        """
+        self._bucket = bucket
+        self._prefix = prefix.rstrip("/") + "/" if prefix else ""
+        self._max_retries = max_retries
+        self._retry_base_delay = retry_base_delay
+        self._retry_max_delay = retry_max_delay
+        session = boto3.Session(region_name=region)
+        self._client = session.client("s3", endpoint_url=endpoint_url)
+    def _full_key(self, key: str) -> str:
+        """Return ``key`` with the backend prefix prepended."""
+        return f"{self._prefix}{key}"
+    def _with_retries(self, operation):
+        """Run ``operation()`` and retry it on ``_RETRYABLE`` failures.
+        ``operation`` is a zero-argument callable; its return value is
+        passed through. At least one attempt is always made, even when
+        ``max_retries`` is zero or negative, so an operation can never be
+        skipped silently. The last failure is re-raised unchanged.
+        """
+        attempts = max(1, self._max_retries)
+        for attempt in range(attempts):
+            try:
+                return operation()
+            except _RETRYABLE:
+                if attempt == attempts - 1:
+                    raise
+                # Exponential backoff, capped at the configured maximum.
+                delay = min(self._retry_base_delay * (2 ** attempt), self._retry_max_delay)
+                time.sleep(delay)
+        raise RuntimeError("unreachable")
+    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
+        """Upload the file at ``file_path`` under ``key``. Returns ETag.
+        The ETag is read back with a HEAD request after the transfer; both
+        steps together form one retried attempt.
+        """
+        full_key = self._full_key(key)
+        extra_args = {}
+        if storage_class:
+            extra_args["StorageClass"] = storage_class
+        def _upload_and_head() -> str:
+            self._client.upload_file(
+                str(file_path),
+                self._bucket,
+                full_key,
+                ExtraArgs=extra_args or None,
+            )
+            resp = self._client.head_object(Bucket=self._bucket, Key=full_key)
+            return resp["ETag"]
+        return self._with_retries(_upload_and_head)
+    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
+        """Upload ``data`` under ``key`` with a single PUT. Returns ETag."""
+        full_key = self._full_key(key)
+        kwargs: dict = {"Bucket": self._bucket, "Key": full_key, "Body": data}
+        if storage_class:
+            kwargs["StorageClass"] = storage_class
+        return self._with_retries(lambda: self._client.put_object(**kwargs)["ETag"])
+    def download(self, key: str, dest_path: Path) -> None:
+        """Download the object stored under ``key`` to ``dest_path``."""
+        full_key = self._full_key(key)
+        self._with_retries(
+            lambda: self._client.download_file(self._bucket, full_key, str(dest_path))
+        )
+    def download_bytes(self, key: str) -> bytes:
+        """Download the object stored under ``key`` and return its content."""
+        full_key = self._full_key(key)
+        def _get_and_read() -> bytes:
+            resp = self._client.get_object(Bucket=self._bucket, Key=full_key)
+            # Reading the body is part of the attempt so a dropped
+            # connection mid-transfer is retried as well.
+            return resp["Body"].read()
+        return self._with_retries(_get_and_read)
+    def list_objects(self, prefix: str) -> list[ObjectInfo]:
+        """List objects whose key starts with ``prefix`` (all result pages).
+        Returned keys have the backend prefix removed. This call is not retried.
+        """
+        full_prefix = self._full_key(prefix)
+        objects = []
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(Bucket=self._bucket, Prefix=full_prefix):
+            for obj in page.get("Contents", []):
+                key = obj["Key"]
+                if self._prefix and key.startswith(self._prefix):
+                    key = key[len(self._prefix):]
+                objects.append(ObjectInfo(
+                    key=key,
+                    size=obj["Size"],
+                    etag=obj["ETag"],
+                    storage_class=obj.get("StorageClass"),
+                ))
+        return objects
+    def head_object(self, key: str) -> ObjectInfo:
+        """Return metadata for ``key`` from a HEAD request (not retried).
+        ``restore_status`` carries the raw "Restore" value of the response,
+        or None when the response has none.
+        """
+        full_key = self._full_key(key)
+        resp = self._client.head_object(Bucket=self._bucket, Key=full_key)
+        return ObjectInfo(
+            key=key,
+            size=resp["ContentLength"],
+            etag=resp["ETag"],
+            storage_class=resp.get("StorageClass"),
+            restore_status=resp.get("Restore"),
+        )
+    def delete_object(self, key: str) -> None:
+        """Delete the object stored under ``key`` (not retried)."""
+        full_key = self._full_key(key)
+        self._client.delete_object(Bucket=self._bucket, Key=full_key)
+    def initiate_restore(self, key: str, days: int, tier: str) -> None:
+        """Send a restore request for ``key`` (not retried).
+        Args:
+            key: Backend-relative object key.
+            days: Sent as ``Days`` in the restore request.
+            tier: Sent as the ``GlacierJobParameters`` ``Tier``.
+        """
+        full_key = self._full_key(key)
+        self._client.restore_object(
+            Bucket=self._bucket,
+            Key=full_key,
+            RestoreRequest={"Days": days, "GlacierJobParameters": {"Tier": tier}},
+        )
+    def is_restore_complete(self, key: str) -> bool:
+        """Return True once the object's restore status reports completion.
+        Returns False when the object has no restore status at all, and
+        otherwise True only if the status contains ``ongoing-request="false"``.
+        """
+        info = self.head_object(key)
+        if info.restore_status is None:
+            return False
+        return 'ongoing-request="false"' in info.restore_status

s3duct/backpressure.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Disk space backpressure for chunked uploads."""
+import shutil
+import time
+from dataclasses import dataclass
+from pathlib import Path
+# Minimum safety margin of free disk to preserve (100 MB).
+# BackpressureMonitor.can_write_chunk uses the larger of this value and
+# 1.5x the chunk size as headroom on top of the chunk itself.
+_DISK_SAFETY_MARGIN = 100 * 1024 * 1024
+@dataclass
+class BackpressureConfig:
+    """Settings that bound how much scratch disk chunk buffering may use."""
+    chunk_size: int
+    scratch_dir: Path
+    # Number of chunks to buffer; None lets the monitor pick a value itself.
+    max_buffer_chunks: int | None = None
+    # Hard cap in bytes on scratch usage; None means no explicit cap.
+    diskspace_limit: int | None = None
+    # Lower bound applied to the buffer chunk count (floor for parallel uploads).
+    min_buffer_chunks: int = 2
+    def __post_init__(self) -> None:
+        """Reject an explicit disk limit that cannot hold even one chunk."""
+        limit = self.diskspace_limit
+        if limit is None or limit >= self.chunk_size:
+            return
+        message = (
+            f"--diskspace-limit ({limit:,} bytes) must be "
+            f">= chunk size ({self.chunk_size:,} bytes)"
+        )
+        raise ValueError(message)
+def compute_adaptive_buffer(chunk_size: int, scratch_dir: Path) -> int:
+    """Pick how many chunks to buffer from the free space at ``scratch_dir``.
+    The count is the number of chunks that fit into 80% of the free space,
+    limited to the range 2..10. A non-positive ``chunk_size`` yields 10.
+    """
+    free_bytes = shutil.disk_usage(scratch_dir).free
+    if chunk_size > 0:
+        chunks_that_fit = int((free_bytes * 0.8) / chunk_size)
+    else:
+        chunks_that_fit = 10
+    capped = min(chunks_that_fit, 10)
+    return max(2, capped)
+class BackpressureMonitor:
+    """Monitors scratch disk usage and gates chunk writes.
+    The byte limit is computed once at construction time; usage and free
+    space are re-measured on every check.
+    """
+    def __init__(self, config: BackpressureConfig) -> None:
+        """Store ``config`` and compute the effective scratch byte limit."""
+        self._config = config
+        self._effective_limit = self._compute_limit()
+    def _compute_limit(self) -> int:
+        """Return the maximum number of bytes the scratch dir may hold.
+        An explicit ``diskspace_limit`` wins. Otherwise the limit is the
+        buffer chunk count (configured or adaptive, never below
+        ``min_buffer_chunks``) multiplied by the chunk size.
+        """
+        if self._config.diskspace_limit is not None:
+            return self._config.diskspace_limit
+        buf = self._config.max_buffer_chunks
+        if buf is None:
+            buf = compute_adaptive_buffer(
+                self._config.chunk_size, self._config.scratch_dir
+            )
+        buf = max(buf, self._config.min_buffer_chunks)
+        return buf * self._config.chunk_size
+    @property
+    def effective_limit(self) -> int:
+        """Byte limit for scratch usage computed at construction time."""
+        return self._effective_limit
+    def scratch_usage(self) -> int:
+        """Current bytes used by regular files directly inside scratch dir.
+        Files that disappear while being measured are skipped: chunk files
+        are expected to be deleted concurrently, which is exactly what
+        ``wait_for_space`` waits for.
+        """
+        total = 0
+        for entry in self._config.scratch_dir.iterdir():
+            try:
+                if entry.is_file():
+                    total += entry.stat().st_size
+            except FileNotFoundError:
+                # Removed between listing and stat; it no longer uses space.
+                continue
+        return total
+    def free_disk_space(self) -> int:
+        """Free space on the filesystem containing scratch_dir."""
+        return shutil.disk_usage(self._config.scratch_dir).free
+    def can_write_chunk(self) -> bool:
+        """Check if there is room to write another chunk.
+        Both conditions must hold: scratch usage plus one chunk stays within
+        the effective limit, and the filesystem has room for the chunk plus
+        a safety margin of max(_DISK_SAFETY_MARGIN, 1.5 x chunk size).
+        """
+        usage = self.scratch_usage()
+        if usage + self._config.chunk_size > self._effective_limit:
+            return False
+        safety = max(_DISK_SAFETY_MARGIN, int(1.5 * self._config.chunk_size))
+        if self.free_disk_space() < self._config.chunk_size + safety:
+            return False
+        return True
+    def wait_for_space(self, poll_interval: float = 0.5) -> None:
+        """Block until space is available for the next chunk.
+        Polls ``can_write_chunk`` every ``poll_interval`` seconds; there is
+        no timeout.
+        """
+        while not self.can_write_chunk():
+            time.sleep(poll_interval)

s3duct/chunker.py ADDED Viewed

@@ -0,0 +1,114 @@
+"""Stream chunking from stdin to disk files."""
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from collections.abc import Callable
+from typing import BinaryIO, Generator
+from s3duct.config import DEFAULT_CHUNK_SIZE, READ_BUFFER_SIZE, SCRATCH_DIR
+from s3duct.integrity import IntegrityHasher, StreamHasher, DualHash
+@dataclass
+class ChunkInfo:
+    """Metadata about a written chunk."""
+    index: int  # zero-based position of the chunk within the stream
+    path: Path  # chunk file on disk (scratch_dir / "chunk-NNNNNN")
+    size: int  # number of bytes written to the chunk file
+    dual_hash: DualHash  # IntegrityHasher.finalize() result over the chunk's bytes
+def chunk_stream(
+    stream: BinaryIO,
+    chunk_size: int = DEFAULT_CHUNK_SIZE,
+    scratch_dir: Path | None = None,
+    stream_hasher: StreamHasher | None = None,
+    pre_chunk_hook: Callable[[], None] | None = None,
+) -> Generator[ChunkInfo, None, None]:
+    """Split a byte stream into chunk files and yield one ChunkInfo per file.
+    Chunk files are named ``chunk-NNNNNN`` inside the scratch directory.
+    Removing them after use is the caller's job.
+    Args:
+        stream: Input byte stream (typically sys.stdin.buffer).
+        chunk_size: Maximum number of bytes per chunk.
+        scratch_dir: Directory for the chunk files; SCRATCH_DIR when None.
+            It is created if missing.
+        stream_hasher: Optional hasher fed with every byte read, so the
+            caller can obtain a hash of the whole stream.
+        pre_chunk_hook: Optional callable run before each chunk is started.
+            Used for backpressure (blocks until disk space is available).
+    Yields:
+        ChunkInfo for each non-empty chunk, in stream order.
+    """
+    target_dir = SCRATCH_DIR if scratch_dir is None else scratch_dir
+    target_dir.mkdir(parents=True, exist_ok=True)
+    index = 0
+    exhausted = False
+    while not exhausted:
+        if pre_chunk_hook is not None:
+            pre_chunk_hook()
+        path = target_dir / f"chunk-{index:06d}"
+        chunk_hasher = IntegrityHasher()
+        written = 0
+        with open(path, "wb") as out:
+            while written < chunk_size:
+                block = stream.read(min(READ_BUFFER_SIZE, chunk_size - written))
+                if not block:
+                    # End of input: finish this (possibly short) chunk.
+                    exhausted = True
+                    break
+                out.write(block)
+                chunk_hasher.update(block)
+                if stream_hasher:
+                    stream_hasher.update(block)
+                written += len(block)
+        if written == 0:
+            # The stream ended exactly on a chunk boundary; drop the empty file.
+            path.unlink(missing_ok=True)
+            break
+        yield ChunkInfo(
+            index=index,
+            path=path,
+            size=written,
+            dual_hash=chunk_hasher.finalize(),
+        )
+        index += 1
+def fast_forward_stream(
+    stream: BinaryIO,
+    chunk_size: int,
+    count: int,
+    stream_hasher: StreamHasher | None = None,
+) -> Generator[tuple[int, DualHash, int], None, None]:
+    """Consume up to ``count`` chunks from the stream, hashing but not storing them.
+    Used during resume to verify the stream matches the resume log.
+    If the stream ends early, a final short chunk is yielded when it
+    contains at least one byte, and the generator then stops.
+    Yields:
+        (chunk_index, dual_hash, size) for each chunk read.
+    """
+    for chunk_index in range(count):
+        chunk_hasher = IntegrityHasher()
+        consumed = 0
+        while consumed < chunk_size:
+            block = stream.read(min(READ_BUFFER_SIZE, chunk_size - consumed))
+            if block:
+                chunk_hasher.update(block)
+                if stream_hasher:
+                    stream_hasher.update(block)
+                consumed += len(block)
+                continue
+            # End of input before the chunk was complete.
+            if consumed > 0:
+                yield chunk_index, chunk_hasher.finalize(), consumed
+            return
+        yield chunk_index, chunk_hasher.finalize(), consumed