firecloud-devnet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fc_mlops/__init__.py +3 -0
- fc_mlops/__main__.py +5 -0
- fc_mlops/anomaly.py +112 -0
- fc_mlops/artifact_store.py +111 -0
- fc_mlops/cli.py +190 -0
- fc_mlops/simulate_failure.py +100 -0
- fc_mlops/telemetry.py +72 -0
- fc_rag/__init__.py +3 -0
- fc_rag/cli.py +51 -0
- fc_rag/config.py +24 -0
- fc_rag/embedder.py +62 -0
- fc_rag/indexer.py +121 -0
- fc_rag/query_engine.py +79 -0
- fc_rag/requirements.txt +6 -0
- fc_rag/retriever.py +46 -0
- firecloud/__init__.py +17 -0
- firecloud/chunker.py +122 -0
- firecloud/cli.py +540 -0
- firecloud/crypto.py +269 -0
- firecloud/discovery.py +164 -0
- firecloud/distributor.py +269 -0
- firecloud/exceptions.py +41 -0
- firecloud/fec.py +87 -0
- firecloud/manifest.py +263 -0
- firecloud/network.py +90 -0
- firecloud/node.py +562 -0
- firecloud/storage.py +146 -0
- firecloud/sync.py +277 -0
- firecloud/transport.py +387 -0
- firecloud_devnet-0.1.0.dist-info/METADATA +158 -0
- firecloud_devnet-0.1.0.dist-info/RECORD +34 -0
- firecloud_devnet-0.1.0.dist-info/WHEEL +4 -0
- firecloud_devnet-0.1.0.dist-info/entry_points.txt +4 -0
- firecloud_devnet-0.1.0.dist-info/licenses/LICENSE +21 -0
firecloud/storage.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Local filesystem chunk storage with sharded directories and quota enforcement."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import threading
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from firecloud.exceptions import ChunkNotFoundError, StorageFullError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChunkStore:
|
|
11
|
+
"""Thread-safe, sharded local storage for encrypted chunks.
|
|
12
|
+
|
|
13
|
+
Chunks are stored in a two-level directory tree sharded by the first two
|
|
14
|
+
hex characters of the chunk ID::
|
|
15
|
+
|
|
16
|
+
base_path/ab/abcdef0123456789...
|
|
17
|
+
|
|
18
|
+
A storage quota is enforced on every ``store()`` call. When *max_storage*
|
|
19
|
+
is ``None`` the quota defaults to 80 % of the free space reported by the OS
|
|
20
|
+
at construction time.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(self, base_path: Path | str, max_storage: int | None = None) -> None:
|
|
24
|
+
"""Initialise the chunk store.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
base_path: Root directory for chunk storage.
|
|
28
|
+
max_storage: Maximum bytes allowed. ``None`` means 80 % of the
|
|
29
|
+
available disk space at *base_path*.
|
|
30
|
+
"""
|
|
31
|
+
self._base = Path(base_path)
|
|
32
|
+
self._base.mkdir(parents=True, exist_ok=True)
|
|
33
|
+
self._lock = threading.Lock()
|
|
34
|
+
|
|
35
|
+
if max_storage is not None:
|
|
36
|
+
self._max_storage = max_storage
|
|
37
|
+
else:
|
|
38
|
+
self._max_storage = int(shutil.disk_usage(self._base).free * 0.8)
|
|
39
|
+
|
|
40
|
+
# ------------------------------------------------------------------
|
|
41
|
+
# Public API
|
|
42
|
+
# ------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
def store(self, chunk_id: str, data: bytes) -> None:
|
|
45
|
+
"""Store an encrypted chunk on disk.
|
|
46
|
+
|
|
47
|
+
Args:
|
|
48
|
+
chunk_id: Hex string identifying the chunk.
|
|
49
|
+
data: Raw (already-encrypted) chunk bytes.
|
|
50
|
+
|
|
51
|
+
Raises:
|
|
52
|
+
StorageFullError: If storing *data* would exceed the quota.
|
|
53
|
+
"""
|
|
54
|
+
with self._lock:
|
|
55
|
+
if self.used_bytes() + len(data) > self._max_storage:
|
|
56
|
+
raise StorageFullError(
|
|
57
|
+
f"Storing chunk {chunk_id} ({len(data)} bytes) would exceed "
|
|
58
|
+
f"the quota of {self._max_storage} bytes"
|
|
59
|
+
)
|
|
60
|
+
path = self._chunk_path(chunk_id)
|
|
61
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
62
|
+
path.write_bytes(data)
|
|
63
|
+
|
|
64
|
+
def retrieve(self, chunk_id: str) -> bytes:
|
|
65
|
+
"""Retrieve a stored chunk by its ID.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
chunk_id: Hex string identifying the chunk.
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
The raw bytes of the chunk.
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
ChunkNotFoundError: If the chunk is not in the store.
|
|
75
|
+
"""
|
|
76
|
+
with self._lock:
|
|
77
|
+
path = self._chunk_path(chunk_id)
|
|
78
|
+
if not path.is_file():
|
|
79
|
+
raise ChunkNotFoundError(
|
|
80
|
+
f"Chunk {chunk_id} not found in store"
|
|
81
|
+
)
|
|
82
|
+
return path.read_bytes()
|
|
83
|
+
|
|
84
|
+
def delete(self, chunk_id: str) -> None:
|
|
85
|
+
"""Delete a chunk from the store.
|
|
86
|
+
|
|
87
|
+
This is a no-op if the chunk does not exist.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
chunk_id: Hex string identifying the chunk.
|
|
91
|
+
"""
|
|
92
|
+
with self._lock:
|
|
93
|
+
path = self._chunk_path(chunk_id)
|
|
94
|
+
if path.is_file():
|
|
95
|
+
path.unlink()
|
|
96
|
+
# Clean up empty shard directory.
|
|
97
|
+
try:
|
|
98
|
+
path.parent.rmdir()
|
|
99
|
+
except OSError:
|
|
100
|
+
pass # Directory not empty — that's fine.
|
|
101
|
+
|
|
102
|
+
def has(self, chunk_id: str) -> bool:
|
|
103
|
+
"""Check whether a chunk exists in the store.
|
|
104
|
+
|
|
105
|
+
Args:
|
|
106
|
+
chunk_id: Hex string identifying the chunk.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
``True`` if the chunk is stored, ``False`` otherwise.
|
|
110
|
+
"""
|
|
111
|
+
with self._lock:
|
|
112
|
+
return self._chunk_path(chunk_id).is_file()
|
|
113
|
+
|
|
114
|
+
def used_bytes(self) -> int:
|
|
115
|
+
"""Return the total number of bytes consumed by stored chunks."""
|
|
116
|
+
total = 0
|
|
117
|
+
for path in self._base.rglob("*"):
|
|
118
|
+
if path.is_file():
|
|
119
|
+
total += path.stat().st_size
|
|
120
|
+
return total
|
|
121
|
+
|
|
122
|
+
def available_bytes(self) -> int:
|
|
123
|
+
"""Return the number of bytes remaining before the quota is hit."""
|
|
124
|
+
return max(0, self._max_storage - self.used_bytes())
|
|
125
|
+
|
|
126
|
+
def list_chunks(self) -> list[str]:
|
|
127
|
+
"""Return a list of all stored chunk IDs."""
|
|
128
|
+
chunks: list[str] = []
|
|
129
|
+
for shard_dir in sorted(self._base.iterdir()):
|
|
130
|
+
if not shard_dir.is_dir():
|
|
131
|
+
continue
|
|
132
|
+
for chunk_file in sorted(shard_dir.iterdir()):
|
|
133
|
+
if chunk_file.is_file():
|
|
134
|
+
chunks.append(chunk_file.name)
|
|
135
|
+
return chunks
|
|
136
|
+
|
|
137
|
+
# ------------------------------------------------------------------
|
|
138
|
+
# Internal helpers
|
|
139
|
+
# ------------------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
def _chunk_path(self, chunk_id: str) -> Path:
|
|
142
|
+
"""Return the sharded filesystem path for *chunk_id*.
|
|
143
|
+
|
|
144
|
+
Layout: ``base_path / chunk_id[:2] / chunk_id``
|
|
145
|
+
"""
|
|
146
|
+
return self._base / chunk_id[:2] / chunk_id
|
firecloud/sync.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""FireCloud Folder Sync — watchdog-based bi-directional folder synchronization.
|
|
2
|
+
|
|
3
|
+
Uses :pypi:`watchdog` to monitor a local folder for file changes and
|
|
4
|
+
automatically uploads / deletes files through the :class:`~firecloud.node.Node`.
|
|
5
|
+
Incoming files from remote peers are downloaded periodically by comparing the
|
|
6
|
+
manifest against the sync folder contents.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import logging
|
|
11
|
+
import threading
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
from watchdog.events import FileSystemEvent, FileSystemEventHandler
|
|
17
|
+
from watchdog.observers import Observer
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from firecloud.node import Node
|
|
21
|
+
|
|
22
|
+
# Module-level logger for folder-sync diagnostics.
logger = logging.getLogger("firecloud.sync")
|
|
23
|
+
|
|
24
|
+
# Debounce window — how long (in seconds) to wait after the last filesystem
|
|
25
|
+
# event before processing the change. This handles editors that do
|
|
26
|
+
# write-to-temp-then-rename.
|
|
27
|
+
# Value is in seconds; the debounce loop also sleeps this long between polls.
_DEBOUNCE_SECONDS = 0.5
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _SyncEventHandler(FileSystemEventHandler):
    """Watchdog handler that relays file events to a :class:`FolderSync`.

    Directory events are dropped.  Every file event is handed to
    ``FolderSync._schedule_event`` as an ``(event_type, path)`` pair; a move
    is reported as a delete of the source followed by a create of the
    destination.
    """

    def __init__(self, sync: "FolderSync") -> None:
        super().__init__()
        self.sync = sync

    def on_created(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self.sync._schedule_event("created", event.src_path)

    def on_modified(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self.sync._schedule_event("modified", event.src_path)

    def on_deleted(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self.sync._schedule_event("deleted", event.src_path)

    def on_moved(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        # The old name disappears first ...
        self.sync._schedule_event("deleted", event.src_path)
        # ... and the new name shows up, when the event carries one.
        if hasattr(event, "dest_path"):
            self.sync._schedule_event("created", event.dest_path)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class FolderSync:
|
|
57
|
+
"""Watches a folder and syncs its contents through a FireCloud node.
|
|
58
|
+
|
|
59
|
+
- **Outbound:** local file changes (create / modify / delete) are uploaded
|
|
60
|
+
or tombstoned via the node.
|
|
61
|
+
- **Inbound:** new files in the manifest that are missing from the local
|
|
62
|
+
folder are downloaded periodically.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def __init__(self, node: "Node", folder: Path | str) -> None:
|
|
66
|
+
self.node = node
|
|
67
|
+
self.folder = Path(folder)
|
|
68
|
+
self.folder.mkdir(parents=True, exist_ok=True)
|
|
69
|
+
|
|
70
|
+
self._observer: Observer | None = None
|
|
71
|
+
self._running = False
|
|
72
|
+
self._incoming_task: asyncio.Task | None = None
|
|
73
|
+
self._debounce_task: asyncio.Task | None = None
|
|
74
|
+
|
|
75
|
+
# Pending events: path → (event_type, timestamp)
|
|
76
|
+
self._pending: dict[str, tuple[str, float]] = {}
|
|
77
|
+
self._pending_lock = threading.Lock()
|
|
78
|
+
|
|
79
|
+
# Track file_id ↔ filename mappings for delete propagation
|
|
80
|
+
self._name_to_id: dict[str, str] = {}
|
|
81
|
+
self._id_to_name: dict[str, str] = {}
|
|
82
|
+
|
|
83
|
+
# Files we are currently downloading — skip outbound re-upload
|
|
84
|
+
self._downloading: set[str] = set()
|
|
85
|
+
|
|
86
|
+
# ------------------------------------------------------------------
|
|
87
|
+
# Lifecycle
|
|
88
|
+
# ------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
async def start(self) -> None:
|
|
91
|
+
"""Start watching the folder for changes."""
|
|
92
|
+
if self._running:
|
|
93
|
+
return
|
|
94
|
+
|
|
95
|
+
# Seed name ↔ id mapping from existing manifest
|
|
96
|
+
self._rebuild_name_map()
|
|
97
|
+
|
|
98
|
+
self._running = True
|
|
99
|
+
self._observer = Observer()
|
|
100
|
+
handler = _SyncEventHandler(self)
|
|
101
|
+
self._observer.schedule(handler, str(self.folder), recursive=False)
|
|
102
|
+
self._observer.start()
|
|
103
|
+
|
|
104
|
+
# Background task to check for incoming files every 5 seconds
|
|
105
|
+
self._incoming_task = asyncio.create_task(self._incoming_loop())
|
|
106
|
+
|
|
107
|
+
# Background task to process debounced events
|
|
108
|
+
self._debounce_task = asyncio.create_task(self._debounce_loop())
|
|
109
|
+
|
|
110
|
+
logger.info(f"Folder sync started for {self.folder}")
|
|
111
|
+
|
|
112
|
+
async def stop(self) -> None:
|
|
113
|
+
"""Stop watching the folder."""
|
|
114
|
+
if not self._running:
|
|
115
|
+
return
|
|
116
|
+
self._running = False
|
|
117
|
+
|
|
118
|
+
if self._observer:
|
|
119
|
+
self._observer.stop()
|
|
120
|
+
self._observer.join(timeout=2)
|
|
121
|
+
self._observer = None
|
|
122
|
+
|
|
123
|
+
for task in (self._incoming_task, self._debounce_task):
|
|
124
|
+
if task and not task.done():
|
|
125
|
+
task.cancel()
|
|
126
|
+
try:
|
|
127
|
+
await task
|
|
128
|
+
except asyncio.CancelledError:
|
|
129
|
+
pass
|
|
130
|
+
|
|
131
|
+
logger.info(f"Folder sync stopped for {self.folder}")
|
|
132
|
+
|
|
133
|
+
# ------------------------------------------------------------------
|
|
134
|
+
# Event scheduling (called from watchdog thread)
|
|
135
|
+
# ------------------------------------------------------------------
|
|
136
|
+
|
|
137
|
+
def _schedule_event(self, event_type: str, path: str) -> None:
|
|
138
|
+
"""Record a filesystem event for debounced processing."""
|
|
139
|
+
# Skip files we are downloading ourselves
|
|
140
|
+
filename = Path(path).name
|
|
141
|
+
if filename in self._downloading:
|
|
142
|
+
return
|
|
143
|
+
|
|
144
|
+
with self._pending_lock:
|
|
145
|
+
self._pending[path] = (event_type, time.monotonic())
|
|
146
|
+
|
|
147
|
+
# ------------------------------------------------------------------
|
|
148
|
+
# Debounce loop (runs on asyncio loop)
|
|
149
|
+
# ------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
async def _debounce_loop(self) -> None:
|
|
152
|
+
"""Process filesystem events after the debounce window elapses."""
|
|
153
|
+
try:
|
|
154
|
+
while self._running:
|
|
155
|
+
await asyncio.sleep(_DEBOUNCE_SECONDS)
|
|
156
|
+
now = time.monotonic()
|
|
157
|
+
|
|
158
|
+
# Collect events that have settled
|
|
159
|
+
ready: list[tuple[str, str]] = []
|
|
160
|
+
with self._pending_lock:
|
|
161
|
+
for path, (event_type, ts) in list(self._pending.items()):
|
|
162
|
+
if now - ts >= _DEBOUNCE_SECONDS:
|
|
163
|
+
ready.append((path, event_type))
|
|
164
|
+
del self._pending[path]
|
|
165
|
+
|
|
166
|
+
for path, event_type in ready:
|
|
167
|
+
try:
|
|
168
|
+
await self._handle_event(event_type, path)
|
|
169
|
+
except Exception as exc:
|
|
170
|
+
logger.error(f"Sync event error for {path}: {exc}")
|
|
171
|
+
except asyncio.CancelledError:
|
|
172
|
+
pass
|
|
173
|
+
|
|
174
|
+
async def _handle_event(self, event_type: str, path: str) -> None:
|
|
175
|
+
"""Process a single filesystem event."""
|
|
176
|
+
filepath = Path(path)
|
|
177
|
+
filename = filepath.name
|
|
178
|
+
|
|
179
|
+
if event_type in ("created", "modified"):
|
|
180
|
+
if filepath.is_file():
|
|
181
|
+
file_id = await self.node.upload(filepath)
|
|
182
|
+
self._name_to_id[filename] = file_id
|
|
183
|
+
self._id_to_name[file_id] = filename
|
|
184
|
+
logger.debug(f"Sync uploaded {filename} → {file_id}")
|
|
185
|
+
|
|
186
|
+
elif event_type == "deleted":
|
|
187
|
+
file_id = self._name_to_id.get(filename)
|
|
188
|
+
if file_id:
|
|
189
|
+
try:
|
|
190
|
+
await self.node.delete(file_id)
|
|
191
|
+
del self._name_to_id[filename]
|
|
192
|
+
del self._id_to_name[file_id]
|
|
193
|
+
logger.debug(f"Sync deleted {filename} ({file_id})")
|
|
194
|
+
except Exception:
|
|
195
|
+
pass
|
|
196
|
+
|
|
197
|
+
# ------------------------------------------------------------------
|
|
198
|
+
# Incoming file download loop
|
|
199
|
+
# ------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
async def _incoming_loop(self) -> None:
|
|
202
|
+
"""Periodically check the manifest for files missing from the folder."""
|
|
203
|
+
try:
|
|
204
|
+
while self._running:
|
|
205
|
+
await asyncio.sleep(5)
|
|
206
|
+
try:
|
|
207
|
+
await self._pull_incoming()
|
|
208
|
+
except Exception as exc:
|
|
209
|
+
logger.error(f"Incoming sync error: {exc}")
|
|
210
|
+
except asyncio.CancelledError:
|
|
211
|
+
pass
|
|
212
|
+
|
|
213
|
+
async def _pull_incoming(self) -> None:
|
|
214
|
+
"""Download any manifest files that are not present locally or are newer on remote."""
|
|
215
|
+
# Group entries by filename and find the one with the highest Lamport timestamp
|
|
216
|
+
latest_entries = {}
|
|
217
|
+
for entry in self.node.manifest.list_files():
|
|
218
|
+
filename = entry.name
|
|
219
|
+
current = latest_entries.get(filename)
|
|
220
|
+
if current is None or entry.lamport_ts > current.lamport_ts:
|
|
221
|
+
latest_entries[filename] = entry
|
|
222
|
+
|
|
223
|
+
for filename, entry in latest_entries.items():
|
|
224
|
+
local_path = self.folder / filename
|
|
225
|
+
|
|
226
|
+
# If the file exists locally and we already have this file_id mapped, it is up to date
|
|
227
|
+
mapped_id = self._name_to_id.get(filename)
|
|
228
|
+
if local_path.exists() and mapped_id == entry.file_id:
|
|
229
|
+
continue
|
|
230
|
+
|
|
231
|
+
# If the file exists locally but corresponds to a different file_id
|
|
232
|
+
if local_path.exists() and mapped_id is not None:
|
|
233
|
+
# If the remote version is not newer than our locally mapped version, skip it
|
|
234
|
+
try:
|
|
235
|
+
local_entry = self.node.manifest.get_file(mapped_id)
|
|
236
|
+
if entry.lamport_ts <= local_entry.lamport_ts:
|
|
237
|
+
continue
|
|
238
|
+
except Exception:
|
|
239
|
+
pass
|
|
240
|
+
|
|
241
|
+
# Skip if we are currently downloading this file
|
|
242
|
+
if filename in self._downloading:
|
|
243
|
+
continue
|
|
244
|
+
|
|
245
|
+
# Already tracked by us (skip if it has been tombstoned / deleted)
|
|
246
|
+
if entry.file_id in self._id_to_name and entry.deleted:
|
|
247
|
+
continue
|
|
248
|
+
|
|
249
|
+
# Download from the network
|
|
250
|
+
try:
|
|
251
|
+
self._downloading.add(filename)
|
|
252
|
+
await self.node.download(entry.file_id, local_path)
|
|
253
|
+
self._name_to_id[filename] = entry.file_id
|
|
254
|
+
self._id_to_name[entry.file_id] = filename
|
|
255
|
+
logger.debug(f"Sync downloaded {filename} from network (latest file_id: {entry.file_id})")
|
|
256
|
+
except Exception as exc:
|
|
257
|
+
logger.error(f"Failed to download {filename}: {exc}")
|
|
258
|
+
finally:
|
|
259
|
+
self._downloading.discard(filename)
|
|
260
|
+
|
|
261
|
+
# ------------------------------------------------------------------
|
|
262
|
+
# Helpers
|
|
263
|
+
# ------------------------------------------------------------------
|
|
264
|
+
|
|
265
|
+
def _rebuild_name_map(self) -> None:
|
|
266
|
+
"""Populate the name ↔ id mapping from the current manifest.
|
|
267
|
+
|
|
268
|
+
Only includes files that actually exist in the sync folder, so that
|
|
269
|
+
files uploaded by remote peers can be downloaded on first sync start.
|
|
270
|
+
"""
|
|
271
|
+
self._name_to_id.clear()
|
|
272
|
+
self._id_to_name.clear()
|
|
273
|
+
for entry in self.node.manifest.list_files():
|
|
274
|
+
local_path = self.folder / entry.name
|
|
275
|
+
if local_path.exists():
|
|
276
|
+
self._name_to_id[entry.name] = entry.file_id
|
|
277
|
+
self._id_to_name[entry.file_id] = entry.name
|