s3duct 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
s3duct/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """s3duct - Chunked, resumable, encrypted pipe to object storage."""
2
+
3
+ __version__ = "0.3.0"
File without changes
@@ -0,0 +1,62 @@
1
+ """Abstract storage backend interface."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+
8
+ @dataclass
9
+ class ObjectInfo:
10
+ key: str
11
+ size: int
12
+ etag: str
13
+ storage_class: str | None = None
14
+ restore_status: str | None = None
15
+
16
+
17
class StorageBackend(ABC):
    """Interface every object-storage backend must implement.

    All methods are abstract; a concrete backend has to override each of
    them before it can be instantiated.
    """

    @abstractmethod
    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
        """Store the file at *file_path* under *key* and return its ETag."""

    @abstractmethod
    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
        """Store the in-memory *data* under *key* and return its ETag."""

    @abstractmethod
    def download(self, key: str, dest_path: Path) -> None:
        """Fetch the object stored under *key* into the local file *dest_path*."""

    @abstractmethod
    def download_bytes(self, key: str) -> bytes:
        """Fetch the object stored under *key* and return its content as bytes."""

    @abstractmethod
    def list_objects(self, prefix: str) -> list[ObjectInfo]:
        """Return metadata for every object whose key starts with *prefix*."""

    @abstractmethod
    def head_object(self, key: str) -> ObjectInfo:
        """Return metadata for the single object stored under *key*."""

    @abstractmethod
    def delete_object(self, key: str) -> None:
        """Remove the single object stored under *key*."""

    @abstractmethod
    def initiate_restore(self, key: str, days: int, tier: str) -> None:
        """Ask the backend to restore a Glacier/GDA object."""

    @abstractmethod
    def is_restore_complete(self, key: str) -> bool:
        """Report whether a previously requested Glacier restore has finished."""
@@ -0,0 +1,72 @@
1
+ """Local filesystem storage backend for development and testing."""
2
+
3
+ import hashlib
4
+ import shutil
5
+ from pathlib import Path
6
+
7
+ from s3duct.backends.base import ObjectInfo, StorageBackend
8
+
9
+
10
class LocalBackend(StorageBackend):
    """Stores objects as files in a local directory tree.

    Keys map to paths below ``root`` (optionally under ``prefix``). ETags are
    the hex MD5 digest of the object content.
    """

    # Block size used when hashing files, so that large objects are never
    # loaded into memory in one piece.
    _HASH_BLOCK_SIZE = 1024 * 1024

    def __init__(self, root: Path, prefix: str = "") -> None:
        """Create the backend rooted at *root*, creating the directory if needed.

        Args:
            root: Directory under which all objects are stored.
            prefix: Optional key prefix; normalized to end with one ``/``.
        """
        self._root = root
        self._prefix = prefix.rstrip("/") + "/" if prefix else ""
        self._root.mkdir(parents=True, exist_ok=True)

    def _full_path(self, key: str) -> Path:
        """Return the on-disk path for *key*, including the configured prefix."""
        return self._root / f"{self._prefix}{key}"

    @staticmethod
    def _etag(data: bytes) -> str:
        """Return the hex MD5 digest of *data*."""
        return hashlib.md5(data).hexdigest()

    @classmethod
    def _file_etag(cls, path: Path) -> str:
        """Return the hex MD5 digest of the file at *path*, read in blocks."""
        digest = hashlib.md5()
        with open(path, "rb") as fh:
            while block := fh.read(cls._HASH_BLOCK_SIZE):
                digest.update(block)
        return digest.hexdigest()

    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
        """Copy *file_path* into the store under *key* and return its ETag.

        ``storage_class`` is accepted for interface compatibility and ignored.
        """
        dest = self._full_path(key)
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, dest)
        # Hash the stored copy in blocks instead of reading it whole.
        return self._file_etag(dest)

    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
        """Write *data* under *key* and return its ETag.

        ``storage_class`` is accepted for interface compatibility and ignored.
        """
        dest = self._full_path(key)
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_bytes(data)
        return self._etag(data)

    def download(self, key: str, dest_path: Path) -> None:
        """Copy the object stored under *key* to *dest_path*."""
        src = self._full_path(key)
        shutil.copy2(src, dest_path)

    def download_bytes(self, key: str) -> bytes:
        """Return the content of the object stored under *key*."""
        return self._full_path(key).read_bytes()

    def list_objects(self, prefix: str) -> list[ObjectInfo]:
        """Return metadata for every stored object whose key starts with *prefix*.

        Returned keys have the backend's own prefix stripped and always use
        ``/`` as separator. Results are sorted by key so the order is stable.
        """
        full_prefix = f"{self._prefix}{prefix}"
        results = []
        for p in self._root.rglob("*"):
            if not p.is_file():
                continue
            # as_posix() keeps keys '/'-separated on every platform, so the
            # prefix comparison below also works where the OS uses '\\'.
            rel = p.relative_to(self._root).as_posix()
            if not rel.startswith(full_prefix):
                continue
            # full_prefix itself starts with self._prefix, so the slice
            # removes exactly the backend prefix (or nothing when it is empty).
            results.append(ObjectInfo(
                key=rel[len(self._prefix):],
                size=p.stat().st_size,
                etag=self._file_etag(p),
            ))
        results.sort(key=lambda info: info.key)
        return results

    def head_object(self, key: str) -> ObjectInfo:
        """Return metadata for the object stored under *key*.

        Raises:
            FileNotFoundError: if no object is stored under *key*.
        """
        p = self._full_path(key)
        size = p.stat().st_size
        return ObjectInfo(key=key, size=size, etag=self._file_etag(p))

    def delete_object(self, key: str) -> None:
        """Delete the object stored under *key*; a missing object is not an error."""
        self._full_path(key).unlink(missing_ok=True)

    def initiate_restore(self, key: str, days: int, tier: str) -> None:
        """Do nothing: local files never need restoring."""
        pass  # no-op for local storage

    def is_restore_complete(self, key: str) -> bool:
        """Return True: local files are always available."""
        return True  # always available locally
s3duct/backends/s3.py ADDED
@@ -0,0 +1,150 @@
1
+ """AWS S3 storage backend."""
2
+
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import boto3
7
+ from botocore.exceptions import (
8
+ ClientError,
9
+ ConnectionClosedError,
10
+ ConnectTimeoutError,
11
+ EndpointConnectionError,
12
+ ReadTimeoutError,
13
+ )
14
+
15
+ from s3duct.backends.base import ObjectInfo, StorageBackend
16
+ from s3duct.config import MAX_RETRY_ATTEMPTS, RETRY_BASE_DELAY, RETRY_MAX_DELAY
17
+
18
# Exception types the retry loops catch: botocore API and connection errors,
# plus the built-in connection/OS error classes.
_RETRYABLE = (
    ClientError,
    ConnectionClosedError,
    ConnectTimeoutError,
    EndpointConnectionError,
    ReadTimeoutError,
    ConnectionError,
    OSError,
)
21
+
22
+
23
class S3Backend(StorageBackend):
    """Storage backend that keeps objects in an S3 (or S3-compatible) bucket.

    Every call to the S3 client goes through one shared retry helper that
    backs off exponentially and gives up immediately on errors that a retry
    cannot fix (for example a 404 or 403 response).
    """

    # ClientError codes treated as transient even when the HTTP status is 4xx.
    _TRANSIENT_ERROR_CODES = frozenset({
        "RequestTimeout",
        "RequestTimeoutException",
        "PriorRequestNotComplete",
        "Throttling",
        "ThrottlingException",
        "ThrottledException",
        "RequestThrottled",
        "RequestThrottledException",
        "RequestLimitExceeded",
        "BandwidthLimitExceeded",
        "TooManyRequestsException",
        "SlowDown",
        "InternalError",
        "ServiceUnavailable",
    })

    def __init__(self, bucket: str, region: str | None = None, prefix: str = "",
                 endpoint_url: str | None = None,
                 max_retries: int = MAX_RETRY_ATTEMPTS,
                 retry_base_delay: float = RETRY_BASE_DELAY,
                 retry_max_delay: float = RETRY_MAX_DELAY) -> None:
        """Create a client for *bucket*.

        Args:
            bucket: Name of the bucket all objects live in.
            region: Region passed to the boto3 session, or None.
            prefix: Optional key prefix; normalized to end with one ``/``.
            endpoint_url: Custom endpoint passed to the S3 client, or None.
            max_retries: Total number of attempts per operation. Values
                below 1 are treated as 1 so an operation is always tried.
            retry_base_delay: Delay in seconds before the second attempt;
                doubled for each further attempt.
            retry_max_delay: Upper bound in seconds for a single delay.
        """
        self._bucket = bucket
        self._prefix = prefix.rstrip("/") + "/" if prefix else ""
        # With zero attempts the retry loop would never run and the operation
        # would be skipped, so always allow at least one attempt.
        self._max_retries = max(1, max_retries)
        self._retry_base_delay = retry_base_delay
        self._retry_max_delay = retry_max_delay
        session = boto3.Session(region_name=region)
        self._client = session.client("s3", endpoint_url=endpoint_url)

    def _full_key(self, key: str) -> str:
        """Return *key* with the configured prefix prepended."""
        return f"{self._prefix}{key}"

    @classmethod
    def _is_transient(cls, exc: BaseException) -> bool:
        """Decide whether retrying after *exc* can plausibly succeed.

        Non-ClientError exceptions are always considered transient. A
        ClientError is transient when its error code is a known
        throttling/timeout code, when its HTTP status is 5xx, 408 or 429,
        or when no status is available at all.
        """
        if not isinstance(exc, ClientError):
            return True
        response = exc.response or {}
        if response.get("Error", {}).get("Code") in cls._TRANSIENT_ERROR_CODES:
            return True
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status is None:
            # Nothing to judge by; keep retrying rather than fail early.
            return True
        return status >= 500 or status in (408, 429)

    def _with_retries(self, operation):
        """Call *operation* until it succeeds or the attempts are used up.

        Args:
            operation: Zero-argument callable performing one attempt.

        Returns:
            Whatever *operation* returns.

        Raises:
            The last exception raised by *operation* when the final attempt
            fails or the error is not transient.
        """
        for attempt in range(self._max_retries):
            try:
                return operation()
            except _RETRYABLE as exc:
                out_of_attempts = attempt == self._max_retries - 1
                if out_of_attempts or not self._is_transient(exc):
                    raise
                delay = min(self._retry_base_delay * (2 ** attempt), self._retry_max_delay)
                time.sleep(delay)
        raise RuntimeError("unreachable")

    def upload(self, key: str, file_path: Path, storage_class: str | None = None) -> str:
        """Upload the file at *file_path* under *key* and return its ETag.

        The ETag is read back with a HEAD request after the upload. The two
        steps are retried independently, so a failed HEAD does not trigger a
        second upload of the file.
        """
        full_key = self._full_key(key)
        extra_args = {"StorageClass": storage_class} if storage_class else None

        # NOTE(review): boto3's upload_file may surface API failures as
        # boto3.exceptions.S3UploadFailedError, which _RETRYABLE does not
        # include - confirm whether those should be retried here as well.
        self._with_retries(lambda: self._client.upload_file(
            str(file_path),
            self._bucket,
            full_key,
            ExtraArgs=extra_args,
        ))
        resp = self._with_retries(
            lambda: self._client.head_object(Bucket=self._bucket, Key=full_key)
        )
        return resp["ETag"]

    def upload_bytes(self, key: str, data: bytes, storage_class: str | None = None) -> str:
        """Upload the in-memory *data* under *key* and return its ETag."""
        kwargs: dict = {"Bucket": self._bucket, "Key": self._full_key(key), "Body": data}
        if storage_class:
            kwargs["StorageClass"] = storage_class
        resp = self._with_retries(lambda: self._client.put_object(**kwargs))
        return resp["ETag"]

    def download(self, key: str, dest_path: Path) -> None:
        """Download the object stored under *key* into the file *dest_path*."""
        full_key = self._full_key(key)
        self._with_retries(
            lambda: self._client.download_file(self._bucket, full_key, str(dest_path))
        )

    def download_bytes(self, key: str) -> bytes:
        """Download the object stored under *key* and return its content."""
        full_key = self._full_key(key)

        def _fetch() -> bytes:
            resp = self._client.get_object(Bucket=self._bucket, Key=full_key)
            return resp["Body"].read()

        return self._with_retries(_fetch)

    def list_objects(self, prefix: str) -> list[ObjectInfo]:
        """Return metadata for every object whose key starts with *prefix*.

        Returned keys have the backend's own prefix stripped.
        """
        full_prefix = self._full_key(prefix)

        def _list() -> list[ObjectInfo]:
            # Built from scratch on every attempt so a retry after a failed
            # page never returns duplicate entries.
            found = []
            paginator = self._client.get_paginator("list_objects_v2")
            for page in paginator.paginate(Bucket=self._bucket, Prefix=full_prefix):
                for obj in page.get("Contents", []):
                    key = obj["Key"]
                    if self._prefix and key.startswith(self._prefix):
                        key = key[len(self._prefix):]
                    found.append(ObjectInfo(
                        key=key,
                        size=obj["Size"],
                        etag=obj["ETag"],
                        storage_class=obj.get("StorageClass"),
                    ))
            return found

        return self._with_retries(_list)

    def head_object(self, key: str) -> ObjectInfo:
        """Return metadata for the object stored under *key*."""
        full_key = self._full_key(key)
        resp = self._with_retries(
            lambda: self._client.head_object(Bucket=self._bucket, Key=full_key)
        )
        return ObjectInfo(
            key=key,
            size=resp["ContentLength"],
            etag=resp["ETag"],
            storage_class=resp.get("StorageClass"),
            restore_status=resp.get("Restore"),
        )

    def delete_object(self, key: str) -> None:
        """Delete the object stored under *key*."""
        full_key = self._full_key(key)
        self._with_retries(
            lambda: self._client.delete_object(Bucket=self._bucket, Key=full_key)
        )

    def initiate_restore(self, key: str, days: int, tier: str) -> None:
        """Request a restore of the archived object stored under *key*.

        Args:
            key: Object key (without the backend prefix).
            days: Value sent as ``Days`` in the restore request.
            tier: Value sent as the Glacier job ``Tier``.
        """
        full_key = self._full_key(key)
        self._with_retries(lambda: self._client.restore_object(
            Bucket=self._bucket,
            Key=full_key,
            RestoreRequest={"Days": days, "GlacierJobParameters": {"Tier": tier}},
        ))

    def is_restore_complete(self, key: str) -> bool:
        """Return True when the object's restore status reports completion.

        An object without any restore status is reported as not complete.
        """
        info = self.head_object(key)
        if info.restore_status is None:
            return False
        return 'ongoing-request="false"' in info.restore_status
s3duct/backpressure.py ADDED
@@ -0,0 +1,86 @@
1
+ """Disk space backpressure for chunked uploads."""
2
+
3
+ import shutil
4
+ import time
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ # Minimum safety margin of free disk to preserve (100 MB)
9
+ _DISK_SAFETY_MARGIN = 100 * 1024 * 1024
10
+
11
+
12
+ @dataclass
13
+ class BackpressureConfig:
14
+ chunk_size: int
15
+ scratch_dir: Path
16
+ max_buffer_chunks: int | None = None # None = auto
17
+ diskspace_limit: int | None = None # explicit byte limit, or None
18
+ min_buffer_chunks: int = 2 # floor for parallel uploads
19
+
20
+ def __post_init__(self) -> None:
21
+ if self.diskspace_limit is not None and self.diskspace_limit < self.chunk_size:
22
+ raise ValueError(
23
+ f"--diskspace-limit ({self.diskspace_limit:,} bytes) must be "
24
+ f">= chunk size ({self.chunk_size:,} bytes)"
25
+ )
26
+
27
+
28
def compute_adaptive_buffer(chunk_size: int, scratch_dir: Path) -> int:
    """Pick how many chunks to buffer from the free space under *scratch_dir*.

    The count is the number of chunks that fit into 80% of the free space,
    clamped to the range [2, 10]. A non-positive *chunk_size* yields 10.
    """
    floor, ceiling = 2, 10
    free_bytes = shutil.disk_usage(scratch_dir).free

    if chunk_size > 0:
        fitting = int((free_bytes * 0.8) / chunk_size)
    else:
        fitting = ceiling

    if fitting > ceiling:
        fitting = ceiling
    return fitting if fitting > floor else floor
36
+
37
+
38
class BackpressureMonitor:
    """Monitors scratch disk usage and gates chunk writes."""

    def __init__(self, config: BackpressureConfig) -> None:
        """Store *config* and compute the byte limit for the scratch dir once."""
        self._config = config
        self._effective_limit = self._compute_limit()

    def _compute_limit(self) -> int:
        """Return the maximum number of bytes the scratch dir may hold.

        An explicit ``diskspace_limit`` wins. Otherwise the limit is a chunk
        count (given, or derived from free disk space) raised to at least
        ``min_buffer_chunks`` and multiplied by the chunk size.
        """
        if self._config.diskspace_limit is not None:
            return self._config.diskspace_limit

        buf = self._config.max_buffer_chunks
        if buf is None:
            buf = compute_adaptive_buffer(
                self._config.chunk_size, self._config.scratch_dir
            )
        buf = max(buf, self._config.min_buffer_chunks)
        return buf * self._config.chunk_size

    @property
    def effective_limit(self) -> int:
        """Byte limit computed at construction time."""
        return self._effective_limit

    def scratch_usage(self) -> int:
        """Current bytes used by regular files directly inside the scratch dir.

        Files that disappear while being inspected are skipped: chunk files
        may be deleted by whoever consumes them while this method runs.
        """
        total = 0
        for entry in self._config.scratch_dir.iterdir():
            try:
                if entry.is_file():
                    total += entry.stat().st_size
            except FileNotFoundError:
                # Removed between listing and stat; it no longer uses space.
                continue
        return total

    def free_disk_space(self) -> int:
        """Free space on the filesystem containing scratch_dir."""
        return shutil.disk_usage(self._config.scratch_dir).free

    def can_write_chunk(self) -> bool:
        """Check if there is room to write another chunk.

        Two conditions must hold: the scratch dir plus one more chunk stays
        within the effective limit, and the filesystem has room for the
        chunk plus a safety margin (the larger of the fixed margin and
        1.5 chunk sizes).
        """
        usage = self.scratch_usage()
        if usage + self._config.chunk_size > self._effective_limit:
            return False
        safety = max(_DISK_SAFETY_MARGIN, int(1.5 * self._config.chunk_size))
        if self.free_disk_space() < self._config.chunk_size + safety:
            return False
        return True

    def wait_for_space(self, poll_interval: float = 0.5) -> None:
        """Block until space is available for the next chunk.

        Args:
            poll_interval: Seconds to sleep between checks.
        """
        while not self.can_write_chunk():
            time.sleep(poll_interval)
s3duct/chunker.py ADDED
@@ -0,0 +1,114 @@
1
+ """Stream chunking from stdin to disk files."""
2
+
3
+ import sys
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from collections.abc import Callable
7
+ from typing import BinaryIO, Generator
8
+
9
+ from s3duct.config import DEFAULT_CHUNK_SIZE, READ_BUFFER_SIZE, SCRATCH_DIR
10
+ from s3duct.integrity import IntegrityHasher, StreamHasher, DualHash
11
+
12
+
13
@dataclass
class ChunkInfo:
    """Describes one chunk that has been written to disk."""

    # Zero-based position of the chunk within the stream.
    index: int
    # Location of the chunk file.
    path: Path
    # Number of bytes written to the chunk file.
    size: int
    # Integrity hashes computed over the chunk's content.
    dual_hash: DualHash
20
+
21
+
22
def chunk_stream(
    stream: BinaryIO,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    scratch_dir: Path | None = None,
    stream_hasher: StreamHasher | None = None,
    pre_chunk_hook: Callable[[], None] | None = None,
) -> Generator[ChunkInfo, None, None]:
    """Read from a stream and yield chunk files on disk.

    Each chunk is written to scratch_dir and yielded. The caller is
    responsible for deleting chunk files after use. A chunk file that could
    not be completed because reading or writing failed is removed before the
    error propagates, so only yielded chunks are ever left behind.

    Args:
        stream: Input byte stream (typically sys.stdin.buffer).
        chunk_size: Target size per chunk in bytes. Must be positive.
        scratch_dir: Directory for temporary chunk files.
        stream_hasher: Optional hasher to track the full stream hash.
        pre_chunk_hook: Optional callable invoked before reading each chunk.
            Used for backpressure (blocks until disk space is available).

    Yields:
        ChunkInfo for each completed chunk.

    Raises:
        ValueError: if chunk_size is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive size would read nothing and end the stream silently,
    # dropping all input; fail loudly instead.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    if scratch_dir is None:
        scratch_dir = SCRATCH_DIR
    scratch_dir.mkdir(parents=True, exist_ok=True)

    chunk_index = 0
    eof = False

    while not eof:
        if pre_chunk_hook is not None:
            pre_chunk_hook()
        chunk_path = scratch_dir / f"chunk-{chunk_index:06d}"
        hasher = IntegrityHasher()
        bytes_written = 0

        try:
            with open(chunk_path, "wb") as f:
                while bytes_written < chunk_size:
                    # Never read past the chunk boundary.
                    to_read = min(READ_BUFFER_SIZE, chunk_size - bytes_written)
                    data = stream.read(to_read)
                    if not data:
                        eof = True
                        break
                    f.write(data)
                    hasher.update(data)
                    if stream_hasher:
                        stream_hasher.update(data)
                    bytes_written += len(data)
        except BaseException:
            # The partial chunk was never handed to the caller, so nobody
            # else would clean it up.
            chunk_path.unlink(missing_ok=True)
            raise

        if bytes_written == 0:
            # The stream ended exactly on a chunk boundary; drop the empty file.
            chunk_path.unlink(missing_ok=True)
            break

        yield ChunkInfo(
            index=chunk_index,
            path=chunk_path,
            size=bytes_written,
            dual_hash=hasher.finalize(),
        )
        chunk_index += 1
83
+
84
+
85
def fast_forward_stream(
    stream: BinaryIO,
    chunk_size: int,
    count: int,
    stream_hasher: StreamHasher | None = None,
) -> Generator[tuple[int, DualHash, int], None, None]:
    """Consume up to *count* chunks from *stream*, hashing them in memory.

    Nothing is written to disk. Used during resume to verify the stream
    matches the resume log. Iteration stops early when the stream ends; a
    trailing partial chunk is still reported, an empty one is not.

    Yields:
        (chunk_index, dual_hash, size) for each chunk read.
    """
    for index in range(count):
        digest = IntegrityHasher()
        consumed = 0
        exhausted = False

        while consumed < chunk_size:
            block = stream.read(min(READ_BUFFER_SIZE, chunk_size - consumed))
            if not block:
                exhausted = True
                break
            digest.update(block)
            if stream_hasher:
                stream_hasher.update(block)
            consumed += len(block)

        # End of stream with nothing read for this chunk: nothing to report.
        if exhausted and consumed == 0:
            return

        yield index, digest.finalize(), consumed

        # A short final chunk was just reported; the stream has no more data.
        if exhausted:
            return