firecloud-devnet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fc_mlops/__init__.py +3 -0
- fc_mlops/__main__.py +5 -0
- fc_mlops/anomaly.py +112 -0
- fc_mlops/artifact_store.py +111 -0
- fc_mlops/cli.py +190 -0
- fc_mlops/simulate_failure.py +100 -0
- fc_mlops/telemetry.py +72 -0
- fc_rag/__init__.py +3 -0
- fc_rag/cli.py +51 -0
- fc_rag/config.py +24 -0
- fc_rag/embedder.py +62 -0
- fc_rag/indexer.py +121 -0
- fc_rag/query_engine.py +79 -0
- fc_rag/requirements.txt +6 -0
- fc_rag/retriever.py +46 -0
- firecloud/__init__.py +17 -0
- firecloud/chunker.py +122 -0
- firecloud/cli.py +540 -0
- firecloud/crypto.py +269 -0
- firecloud/discovery.py +164 -0
- firecloud/distributor.py +269 -0
- firecloud/exceptions.py +41 -0
- firecloud/fec.py +87 -0
- firecloud/manifest.py +263 -0
- firecloud/network.py +90 -0
- firecloud/node.py +562 -0
- firecloud/storage.py +146 -0
- firecloud/sync.py +277 -0
- firecloud/transport.py +387 -0
- firecloud_devnet-0.1.0.dist-info/METADATA +158 -0
- firecloud_devnet-0.1.0.dist-info/RECORD +34 -0
- firecloud_devnet-0.1.0.dist-info/WHEEL +4 -0
- firecloud_devnet-0.1.0.dist-info/entry_points.txt +4 -0
- firecloud_devnet-0.1.0.dist-info/licenses/LICENSE +21 -0
firecloud/node.py
ADDED
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
"""FireCloud Node — orchestrates storage, transport, discovery, and sync.
|
|
2
|
+
|
|
3
|
+
The :class:`Node` is the primary user-facing object. It wires together
|
|
4
|
+
the chunk store, manifest, transport layer, mDNS discovery, and
|
|
5
|
+
distributor so that files can be uploaded, downloaded, deleted, and
|
|
6
|
+
synced across the LAN with a single method call.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import builtins
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import uuid
|
|
14
|
+
from dataclasses import asdict
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from firecloud.chunker import Chunk, chunk_file, reassemble_chunks, compute_file_id
|
|
19
|
+
from firecloud.crypto import encrypt_chunk, decrypt_chunk, compute_integrity_hash
|
|
20
|
+
from firecloud.discovery import LANDiscovery
|
|
21
|
+
from firecloud.distributor import Distributor
|
|
22
|
+
from firecloud.exceptions import (
|
|
23
|
+
ChunkCorruptError,
|
|
24
|
+
)
|
|
25
|
+
from firecloud.manifest import FileEntry, Manifest
|
|
26
|
+
from firecloud.network import Network
|
|
27
|
+
from firecloud.storage import ChunkStore
|
|
28
|
+
from firecloud.transport import NodeClient, NodeServer, PeerConnection, MSG_SYNC_MANIFEST
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger("firecloud.node")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Node:
|
|
34
|
+
"""Main FireCloud node that orchestrates all operations.
|
|
35
|
+
|
|
36
|
+
Ties together:
|
|
37
|
+
- :class:`~firecloud.storage.ChunkStore` for local chunk persistence
|
|
38
|
+
- :class:`~firecloud.manifest.Manifest` for file metadata
|
|
39
|
+
- :class:`~firecloud.transport.NodeServer` / :class:`~firecloud.transport.NodeClient`
|
|
40
|
+
for peer-to-peer communication
|
|
41
|
+
- :class:`~firecloud.discovery.LANDiscovery` for mDNS peer discovery
|
|
42
|
+
- :class:`~firecloud.distributor.Distributor` for chunk placement
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
def __init__(
    self,
    network: Network,
    storage_path: Path | str,
    port: int = 7474,
    max_storage: int | None = None,
    host: str = "0.0.0.0",
    node_id: str | None = None,
    enable_discovery: bool = True,
) -> None:
    """Set up node state and the local storage subsystems.

    Nothing network-related is started here; the server, client and
    discovery slots are only initialised to ``None``.

    Args:
        network: The :class:`~firecloud.network.Network` this node belongs to.
        storage_path: Root directory for chunks and metadata; created
            (including parents) if it does not exist.
        port: TCP port to listen on.
        max_storage: Byte limit forwarded to the chunk store. The
            original documentation states ``None`` means 80 % of disk;
            that policy lives in ``ChunkStore`` and is not visible here.
        host: Interface address to bind the server to.
        node_id: Identifier for this node. When falsy, the first 16 hex
            characters of a random UUID4 are used.
        enable_discovery: Whether ``start()`` should attempt mDNS discovery.
    """
    # Identity and listen configuration.
    self.network = network
    self.host = host
    self.port = port
    self.enable_discovery = enable_discovery
    self.node_id = node_id if node_id else uuid.uuid4().hex[:16]

    # The storage root must exist before the stores below are created.
    root = Path(storage_path)
    root.mkdir(parents=True, exist_ok=True)
    self.storage_path = root

    # Core subsystems are built eagerly so tests can swap in mocks.
    self.chunk_store = ChunkStore(root / "chunks", max_storage=max_storage)
    self.manifest = Manifest(root)

    # Transport and discovery objects are created by start().
    self._server: NodeServer | None = None
    self._client: NodeClient | None = None
    self._discovery: LANDiscovery | None = None

    # Live connections keyed by peer node_id, plus every peer address
    # seen so far as {node_id: (host, port)}.
    self.connections: dict[str, PeerConnection] = {}
    self._known_peers: dict[str, tuple[str, int]] = {}

    # Handles for the periodic background loops.
    self._heartbeat_task: asyncio.Task | None = None
    self._manifest_sync_task: asyncio.Task | None = None

    # Flipped by start() / stop().
    self._running = False
|
|
100
|
+
|
|
101
|
+
# ------------------------------------------------------------------
|
|
102
|
+
# Lifecycle
|
|
103
|
+
# ------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
async def start(self) -> None:
    """Bring the node online.

    Starts the TCP server, creates the client, optionally starts mDNS
    discovery, and launches the heartbeat and manifest-sync loops.
    Calling it on a node that is already running is a no-op.
    """
    if self._running:
        return

    # TCP server first, so peers can reach us.
    server = NodeServer(self, self.host, self.port)
    self._server = server
    await server.start()

    # Outbound client.
    self._client = NodeClient(self)

    # mDNS discovery is best-effort: a failure is logged and the node
    # carries on without it.
    if self.enable_discovery:
        try:
            self._discovery = LANDiscovery(
                self.node_id, self.network.network_id, self.port
            )
            self._discovery.on_peer_found(self._on_peer_discovered)
            self._discovery.on_peer_removed(self._on_peer_removed)
            await self._discovery.start()
        except Exception as exc:
            logger.warning(f"mDNS discovery failed to start: {exc}")
            self._discovery = None

    # Periodic loops; they only begin executing at the next await, by
    # which time _running is already True.
    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
    self._manifest_sync_task = asyncio.create_task(self._manifest_sync_loop())
    self._running = True

    logger.info(
        f"Node {self.node_id} started on {self.host}:{self.port} (network {self.network.network_id})"
    )
|
|
141
|
+
|
|
142
|
+
async def stop(self) -> None:
    """Gracefully shut down the node.

    Cancels the periodic tasks, closes every peer connection, stops the
    server and stops discovery. Each teardown step is best-effort: a
    failure in one step is logged at debug level and does not prevent
    the remaining steps from running. Calling it on a node that is not
    running is a no-op.
    """
    if not self._running:
        return
    # Cleared first so on_connection_closed() does not schedule
    # re-replication for the connections closed below.
    self._running = False

    # Cancel periodic tasks and wait for them to unwind.
    for task in (self._heartbeat_task, self._manifest_sync_task):
        if task and not task.done():
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
            except Exception as exc:
                # A loop that died with an error must not abort shutdown.
                logger.debug("Background task failed during shutdown: %s", exc)
    self._heartbeat_task = None
    self._manifest_sync_task = None

    # Close all peer connections (iterate a snapshot: close() may call
    # back into on_connection_closed and mutate the dict).
    for conn in list(self.connections.values()):
        try:
            await conn.close()
        except Exception as exc:
            logger.debug("Error closing peer connection during shutdown: %s", exc)
    self.connections.clear()

    # Stop server
    if self._server:
        await self._server.stop()
        self._server = None

    # Stop discovery
    if self._discovery:
        try:
            await self._discovery.stop()
        except Exception as exc:
            logger.debug("Error stopping mDNS discovery: %s", exc)
        self._discovery = None

    logger.info(f"Node {self.node_id} stopped")
|
|
179
|
+
|
|
180
|
+
# ------------------------------------------------------------------
|
|
181
|
+
# File operations
|
|
182
|
+
# ------------------------------------------------------------------
|
|
183
|
+
|
|
184
|
+
async def upload(self, filepath: str | Path) -> str:
    """Upload a local file to the network.

    Steps: compute the file id, chunk the file, encrypt every chunk,
    hand the encrypted chunks to a :class:`Distributor`, record a
    :class:`FileEntry` in the manifest, then push the manifest to peers.

    Args:
        filepath: Path to the local file to upload.

    Returns:
        The file id produced by ``compute_file_id`` for this file and
        the network's HMAC key.

    Raises:
        builtins.FileNotFoundError: If *filepath* is not an existing file.

    Errors raised by chunking, encryption, distribution or the manifest
    propagate unchanged (the original documentation mentions
    ``StorageFullError`` when the quota is exceeded).
    """
    source = Path(filepath)
    if not source.is_file():
        raise builtins.FileNotFoundError(f"File not found: {source}")

    hmac_key = self.network.hmac_key
    enc_key = self.network.encryption_key

    # 1. File-level identifier, then 2. chunking.
    file_id = compute_file_id(source, hmac_key)
    plain_chunks = chunk_file(source, hmac_key)

    # 3. Encrypt: same metadata as the plaintext chunk, only the
    #    payload is replaced by its ciphertext.
    encrypted_chunks = [
        Chunk(
            index=piece.index,
            offset=piece.offset,
            length=piece.length,
            data=encrypt_chunk(piece.data, enc_key),
            chunk_id=piece.chunk_id,
            integrity_hash=piece.integrity_hash,
        )
        for piece in plain_chunks
    ]

    # 4. Distribute. FEC is requested once the network (peers plus this
    #    node) has at least five members.
    peer_ids = list(self.connections.keys())
    node_count = len(peer_ids) + 1
    distributor = Distributor(
        peers=peer_ids,
        local_node_id=self.node_id,
        fec_enabled=node_count >= 5,
    )
    chunk_infos = await distributor.distribute(encrypted_chunks, self._client)

    # 5. Manifest entry describing where everything ended up.
    strategy = distributor.get_strategy()
    uses_replication = strategy == "replication"
    entry = FileEntry(
        file_id=file_id,
        name=source.name,
        size=source.stat().st_size,
        chunk_count=len(plain_chunks),
        uploaded_at=datetime.now(timezone.utc).isoformat(),
        uploaded_by=self.node_id,
        chunks=chunk_infos,
        fec_enabled=(strategy == "erasure_coding"),
        replication_factor=2 if uses_replication else 1,
    )
    self.manifest.add_file(entry)

    # 6. Let the peers know about the new file.
    await self._sync_manifest_to_peers()

    logger.info(f"Uploaded {source.name} → {file_id}")
    return file_id
|
|
256
|
+
|
|
257
|
+
async def download(self, file_id: str, output: str | Path) -> None:
    """Fetch a file from the network and write it to *output*.

    Steps: look up the manifest entry, retrieve the encrypted chunks
    through a :class:`Distributor`, decrypt each one, verify it against
    the manifest hash (non-FEC files only), reassemble, and write.

    Args:
        file_id: The unique file identifier.
        output: Local path for the reassembled file; missing parent
            directories are created.

    Raises:
        ChunkCorruptError: If a decrypted chunk's integrity hash does
            not match the manifest.

    Lookup and retrieval errors from the manifest and distributor
    propagate unchanged (the original documentation names
    ``firecloud.exceptions.FileNotFoundError`` and ``ChunkNotFoundError``).
    """
    destination = Path(output)
    entry = self.manifest.get_file(file_id)
    enc_key = self.network.encryption_key

    # Pull the encrypted payloads back from wherever they were placed.
    distributor = Distributor(
        peers=list(self.connections.keys()),
        local_node_id=self.node_id,
        fec_enabled=entry.fec_enabled,
    )
    encrypted_blobs = await distributor.retrieve(entry.chunks, self._client)

    pieces: list[Chunk] = []
    for position, blob in enumerate(encrypted_blobs):
        plaintext = decrypt_chunk(blob, enc_key)

        # Integrity is checked against the manifest only for non-FEC
        # files, and only for positions the manifest knows about.
        if not entry.fec_enabled and position < len(entry.chunks):
            info = entry.chunks[position]
            expected_hash = info.integrity_hash
            actual_hash = compute_integrity_hash(plaintext)
            if actual_hash != expected_hash:
                raise ChunkCorruptError(
                    f"Integrity check failed for chunk {info.chunk_id}"
                )

        # Only index and data are filled in meaningfully; the remaining
        # fields are placeholders.
        pieces.append(
            Chunk(
                index=position,
                offset=0,
                length=len(plaintext),
                data=plaintext,
                chunk_id="",
                integrity_hash="",
            )
        )

    # Stitch the plaintext back together and persist it.
    content = reassemble_chunks(pieces)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(content)

    logger.info(f"Downloaded {entry.name} → {destination}")
|
|
317
|
+
|
|
318
|
+
async def delete(self, file_id: str) -> None:
    """Delete a file via the manifest and propagate the change to peers.

    The deletion itself is delegated to ``Manifest.delete_file`` (the
    original documentation describes it as tombstoning the entry).

    Args:
        file_id: Identifier of the file to delete.

    Errors from the manifest propagate unchanged (the original
    documentation names ``firecloud.exceptions.FileNotFoundError``
    for an unknown file).
    """
    manifest = self.manifest
    manifest.delete_file(file_id)
    # Push the updated manifest so peers learn about the deletion.
    await self._sync_manifest_to_peers()
    logger.info(f"Deleted file {file_id}")
|
|
330
|
+
|
|
331
|
+
def list_files(self) -> list[dict]:
    """Describe every file returned by ``Manifest.list_files`` as a dict.

    Returns:
        One dict per manifest entry with the keys ``file_id``, ``name``,
        ``size``, ``chunk_count``, ``uploaded_at``, ``uploaded_by``,
        ``fec_enabled`` and ``replication_factor``, in that order.
        Per-chunk details are not included.
    """
    exported_fields = (
        "file_id",
        "name",
        "size",
        "chunk_count",
        "uploaded_at",
        "uploaded_by",
        "fec_enabled",
        "replication_factor",
    )
    summaries = []
    for item in self.manifest.list_files():
        summaries.append({field: getattr(item, field) for field in exported_fields})
    return summaries
|
|
352
|
+
|
|
353
|
+
# ------------------------------------------------------------------
|
|
354
|
+
# Networking
|
|
355
|
+
# ------------------------------------------------------------------
|
|
356
|
+
|
|
357
|
+
async def connect(self, address: str) -> None:
    """Connect to a peer given as a ``host:port`` string.

    The string is split on its last ``:`` so the host part may itself
    contain colons. On success the peer's address is recorded in the
    known-peers table under the node id returned by the client.

    Args:
        address: Peer address in ``host:port`` format.

    Raises:
        ValueError: If *address* has no ``:`` separator or the port is
            not an integer.
        RuntimeError: If the node has not been started, so there is no
            client to connect with.
    """
    host, separator, port_str = address.rpartition(":")
    if not separator:
        raise ValueError(
            f"Invalid peer address {address!r}: expected 'host:port'"
        )
    try:
        port = int(port_str)
    except ValueError:
        raise ValueError(
            f"Invalid peer address {address!r}: port must be an integer"
        ) from None

    # The client only exists between start() and the next construction;
    # fail with a clear message instead of an AttributeError on None.
    if self._client is None:
        raise RuntimeError("Node is not started; call start() before connect()")

    peer_node_id = await self._client.connect(host, port)
    self._known_peers[peer_node_id] = (host, port)
    logger.info(f"Connected to peer {peer_node_id} at {host}:{port}")
|
|
368
|
+
|
|
369
|
+
def status(self) -> dict:
    """Build a snapshot of this node's identity, state and storage usage.

    Returns:
        A dict with the keys ``node_id``, ``network_id``, ``host``,
        ``port``, ``running``, ``peers_connected``, ``files_stored``,
        ``chunks_stored``, ``storage_used`` and ``storage_available``.
    """
    # Identity and listen address.
    report: dict = {}
    report["node_id"] = self.node_id
    report["network_id"] = self.network.network_id
    report["host"] = self.host
    report["port"] = self.port

    # Runtime state.
    report["running"] = self._running
    report["peers_connected"] = len(self.connections)

    # Counts and usage as reported by the manifest and chunk store.
    report["files_stored"] = len(self.manifest.list_files())
    store = self.chunk_store
    report["chunks_stored"] = len(store.list_chunks())
    report["storage_used"] = store.used_bytes()
    report["storage_available"] = store.available_bytes()
    return report
|
|
383
|
+
|
|
384
|
+
def peers(self) -> list[dict]:
    """List every peer that is connected or has a recorded address.

    Returns:
        One dict per peer with ``node_id``, ``host``, ``port`` and
        ``connected``. A peer with no recorded address is reported with
        host ``"unknown"`` and port ``0``. The order of the list is
        that of a set and therefore unspecified.
    """
    placeholder = ("unknown", 0)
    peer_ids = set(self.connections) | set(self._known_peers)

    def describe(peer_id: str) -> dict:
        # Fall back to the placeholder when no address was recorded.
        address = self._known_peers.get(peer_id) or placeholder
        return {
            "node_id": peer_id,
            "host": address[0],
            "port": address[1],
            "connected": peer_id in self.connections,
        }

    return [describe(peer_id) for peer_id in peer_ids]
|
|
397
|
+
|
|
398
|
+
# ------------------------------------------------------------------
|
|
399
|
+
# Connection management (called by transport layer)
|
|
400
|
+
# ------------------------------------------------------------------
|
|
401
|
+
|
|
402
|
+
def register_connection(self, peer_node_id: str, conn: PeerConnection) -> None:
    """Record *conn* as the active connection for *peer_node_id*.

    Intended to be called by the transport layer. Any connection
    previously stored under the same id is replaced.
    """
    self.connections.update({peer_node_id: conn})
    logger.debug(f"Registered connection with peer {peer_node_id}")
|
|
406
|
+
|
|
407
|
+
def on_connection_closed(self, peer_node_id: str) -> None:
    """Handle a closed connection (called by PeerConnection).

    Drops the peer from the active-connection table and, while the node
    is running, schedules re-replication of the chunks that peer held.

    Args:
        peer_node_id: Node id of the peer whose connection closed.
    """
    self.connections.pop(peer_node_id, None)
    logger.debug(f"Connection with peer {peer_node_id} closed")
    if not self._running:
        return

    # The event loop keeps only weak references to tasks, so an
    # unreferenced fire-and-forget task may be garbage-collected before
    # it finishes. Hold a strong reference until the task completes.
    # The set is created lazily so this method does not rely on
    # __init__ having defined it.
    pending = getattr(self, "_background_tasks", None)
    if pending is None:
        pending = self._background_tasks = set()
    task = asyncio.create_task(self._rereplicate_peer_chunks(peer_node_id))
    pending.add(task)
    task.add_done_callback(pending.discard)
|
|
413
|
+
|
|
414
|
+
async def remove_node(self, node_id: str) -> None:
    """Drop a node from this node's view and re-replicate its chunks.

    Closes the connection to the node if there is one (close errors are
    ignored), forgets its recorded address, and then awaits
    re-replication of the chunks it was storing.

    Args:
        node_id: Identifier of the node to remove.
    """
    stale_conn = self.connections.pop(node_id, None)
    if stale_conn:
        # Best-effort close: the peer is being removed regardless.
        try:
            await stale_conn.close()
        except Exception:
            pass

    self._known_peers.pop(node_id, None)
    await self._rereplicate_peer_chunks(node_id)
|
|
428
|
+
|
|
429
|
+
async def _rereplicate_peer_chunks(self, offline_node_id: str) -> None:
    """Re-replicate chunks that were stored on a node that went away.

    For every replicated (non-FEC, ``replication_factor >= 2``) file in
    the manifest, removes *offline_node_id* from each chunk's
    ``stored_on`` list and copies the chunk to other nodes until the
    replication factor is met again or no usable candidate remains.
    Entries that gained a replica are written back to the manifest and
    the manifest is pushed to peers.

    Args:
        offline_node_id: Node id of the peer that is no longer available.
    """
    from firecloud.transport import MSG_STORE_CHUNK

    active_peers = [pid for pid in self.connections if pid != offline_node_id]
    if not active_peers:
        logger.info("No active peers available for re-replication.")
        return

    # Candidate order: this node first, then the connected peers.
    all_nodes = [self.node_id] + active_peers

    for entry in self.manifest.list_files():
        # zfec shares are handled separately; only plain replication is
        # repaired here.
        if entry.fec_enabled or entry.replication_factor < 2:
            continue

        updated = False
        for chunk_info in entry.chunks:
            if offline_node_id not in chunk_info.stored_on:
                continue
            chunk_info.stored_on = [
                nid for nid in chunk_info.stored_on if nid != offline_node_id
            ]

            # Candidates that already failed for this chunk. Without
            # this, a failing store/send would be retried against the
            # same candidate forever.
            failed: set[str] = set()

            while len(chunk_info.stored_on) < entry.replication_factor:
                candidate = next(
                    (
                        nid
                        for nid in all_nodes
                        if nid not in chunk_info.stored_on and nid not in failed
                    ),
                    None,
                )
                if not candidate:
                    break

                # Source the chunk locally if possible, otherwise from
                # one of the peers still listed as holding it.
                chunk_data = None
                if self.chunk_store.has(chunk_info.chunk_id):
                    chunk_data = self.chunk_store.retrieve(chunk_info.chunk_id)
                else:
                    for nid in chunk_info.stored_on:
                        conn = self.connections.get(nid)
                        if conn:
                            chunk_data = await conn.retrieve_chunk(chunk_info.chunk_id)
                            if chunk_data:
                                break

                if chunk_data is None:
                    # Nothing to copy from; give up on this chunk.
                    break

                try:
                    if candidate == self.node_id:
                        self.chunk_store.store(chunk_info.chunk_id, chunk_data)
                    else:
                        conn = self.connections.get(candidate)
                        if not conn:
                            # The peer disconnected since the candidate
                            # list was built; it must not be recorded
                            # as a holder.
                            raise ConnectionError("peer is no longer connected")
                        payload = chunk_info.chunk_id.encode("utf-8") + chunk_data
                        await conn.send_message(MSG_STORE_CHUNK, payload)
                except Exception as exc:
                    failed.add(candidate)
                    logger.warning(f"Failed to re-replicate chunk {chunk_info.chunk_id} to {candidate}: {exc}")
                else:
                    # Record the replica only after the store or send
                    # actually went through.
                    chunk_info.stored_on.append(candidate)
                    updated = True
                    logger.info(f"Re-replicated chunk {chunk_info.chunk_id[:16]}... to {candidate}")

        if updated:
            self.manifest.add_file(entry)
            await self._sync_manifest_to_peers()
|
|
489
|
+
|
|
490
|
+
def add_peer_discovered(self, node_id: str, host: str, port: int) -> None:
    """Remember the address of a peer reported by transport or discovery.

    The node's own id is ignored; for any other id the ``(host, port)``
    pair is stored, replacing an earlier address for the same peer.
    """
    if node_id == self.node_id:
        return
    address = (host, port)
    self._known_peers[node_id] = address
|
|
494
|
+
|
|
495
|
+
# ------------------------------------------------------------------
|
|
496
|
+
# Discovery callbacks
|
|
497
|
+
# ------------------------------------------------------------------
|
|
498
|
+
|
|
499
|
+
def _on_peer_discovered(self, node_id: str, host: str, port: int) -> None:
|
|
500
|
+
"""Callback when mDNS discovers a peer — schedule auto-connect."""
|
|
501
|
+
if node_id == self.node_id or node_id in self.connections:
|
|
502
|
+
return
|
|
503
|
+
self._known_peers[node_id] = (host, port)
|
|
504
|
+
asyncio.ensure_future(self._try_connect(node_id, host, port))
|
|
505
|
+
|
|
506
|
+
def _on_peer_removed(self, node_id: str) -> None:
    """Callback when mDNS detects a peer departure.

    Removes the peer's active connection, if any, and schedules it to
    be closed in the background. The peer's recorded address is left
    in place.

    Args:
        node_id: Identifier of the peer that departed.
    """
    conn = self.connections.pop(node_id, None)
    if conn:
        # Keep a strong reference to the close task until it finishes;
        # the event loop only holds weak references, so an unreferenced
        # task can be garbage-collected mid-execution. The set is
        # created lazily so this method does not rely on __init__.
        pending = getattr(self, "_background_tasks", None)
        if pending is None:
            pending = self._background_tasks = set()
        task = asyncio.ensure_future(conn.close())
        pending.add(task)
        task.add_done_callback(pending.discard)
    logger.debug(f"Peer {node_id} removed via mDNS")
|
|
512
|
+
|
|
513
|
+
async def _try_connect(self, node_id: str, host: str, port: int) -> None:
|
|
514
|
+
"""Try to connect to a discovered peer, silently ignoring failures."""
|
|
515
|
+
try:
|
|
516
|
+
if node_id not in self.connections:
|
|
517
|
+
await self._client.connect(host, port)
|
|
518
|
+
except Exception as exc:
|
|
519
|
+
logger.debug(f"Auto-connect to {node_id} at {host}:{port} failed: {exc}")
|
|
520
|
+
|
|
521
|
+
# ------------------------------------------------------------------
|
|
522
|
+
# Periodic tasks
|
|
523
|
+
# ------------------------------------------------------------------
|
|
524
|
+
|
|
525
|
+
async def _heartbeat_loop(self) -> None:
    """Broadcast a heartbeat to every connected peer every 30 seconds.

    The payload is ``<node_id>|<UTC ISO-8601 timestamp>`` encoded as
    UTF-8. A failed send to one peer is ignored so the others still
    receive theirs. The loop ends when the node stops running or the
    task is cancelled.
    """
    from firecloud.transport import MSG_HEARTBEAT

    try:
        while self._running:
            await asyncio.sleep(30)

            stamp = datetime.now(timezone.utc).isoformat()
            payload = b"|".join(
                (self.node_id.encode("utf-8"), stamp.encode("utf-8"))
            )

            # Iterate a snapshot: connections may change while awaiting.
            for peer_conn in tuple(self.connections.values()):
                try:
                    await peer_conn.send_message(MSG_HEARTBEAT, payload)
                except Exception:
                    # Best-effort: one unreachable peer must not stop
                    # the heartbeat for the rest.
                    continue
    except asyncio.CancelledError:
        return
|
|
541
|
+
|
|
542
|
+
async def _manifest_sync_loop(self) -> None:
|
|
543
|
+
"""Periodically sync the manifest to all peers every 60 seconds."""
|
|
544
|
+
try:
|
|
545
|
+
while self._running:
|
|
546
|
+
await asyncio.sleep(60)
|
|
547
|
+
await self._sync_manifest_to_peers()
|
|
548
|
+
except asyncio.CancelledError:
|
|
549
|
+
pass
|
|
550
|
+
|
|
551
|
+
async def _sync_manifest_to_peers(self) -> None:
|
|
552
|
+
"""Push the local manifest entries to all connected peers."""
|
|
553
|
+
entries = self.manifest.to_entries()
|
|
554
|
+
if not entries:
|
|
555
|
+
return
|
|
556
|
+
entries_dicts = [asdict(e) for e in entries]
|
|
557
|
+
payload = json.dumps(entries_dicts).encode("utf-8")
|
|
558
|
+
for conn in list(self.connections.values()):
|
|
559
|
+
try:
|
|
560
|
+
await conn.send_message(MSG_SYNC_MANIFEST, payload)
|
|
561
|
+
except Exception as exc:
|
|
562
|
+
logger.debug(f"Manifest sync failed for peer: {exc}")
|