firecloud-devnet 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fc_mlops/__init__.py +3 -0
- fc_mlops/__main__.py +5 -0
- fc_mlops/anomaly.py +112 -0
- fc_mlops/artifact_store.py +111 -0
- fc_mlops/cli.py +190 -0
- fc_mlops/simulate_failure.py +100 -0
- fc_mlops/telemetry.py +72 -0
- fc_rag/__init__.py +3 -0
- fc_rag/cli.py +51 -0
- fc_rag/config.py +24 -0
- fc_rag/embedder.py +62 -0
- fc_rag/indexer.py +121 -0
- fc_rag/query_engine.py +79 -0
- fc_rag/requirements.txt +6 -0
- fc_rag/retriever.py +46 -0
- firecloud/__init__.py +17 -0
- firecloud/chunker.py +122 -0
- firecloud/cli.py +540 -0
- firecloud/crypto.py +269 -0
- firecloud/discovery.py +164 -0
- firecloud/distributor.py +269 -0
- firecloud/exceptions.py +41 -0
- firecloud/fec.py +87 -0
- firecloud/manifest.py +263 -0
- firecloud/network.py +90 -0
- firecloud/node.py +562 -0
- firecloud/storage.py +146 -0
- firecloud/sync.py +277 -0
- firecloud/transport.py +387 -0
- firecloud_devnet-0.1.0.dist-info/METADATA +158 -0
- firecloud_devnet-0.1.0.dist-info/RECORD +34 -0
- firecloud_devnet-0.1.0.dist-info/WHEEL +4 -0
- firecloud_devnet-0.1.0.dist-info/entry_points.txt +4 -0
- firecloud_devnet-0.1.0.dist-info/licenses/LICENSE +21 -0
fc_rag/embedder.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Local text chunking and embedding via fastembed.
|
|
2
|
+
|
|
3
|
+
Default model: BAAI/bge-small-en-v1.5 (384-dim, CPU-only).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from fastembed import TextEmbedding
|
|
8
|
+
from fc_rag.config import get_settings
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@lru_cache(maxsize=1)
def _get_model() -> TextEmbedding:
    """Build the embedding model named in settings.

    The ``lru_cache`` wrapper means the model is constructed on the first
    call only; later calls return the same instance.
    """
    model_name = get_settings().embedding_model
    return TextEmbedding(model_name=model_name)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping chunks on whitespace boundaries.

    Words are packed greedily into chunks whose space-joined length stays
    within *chunk_size* characters. When a chunk is closed, as many of its
    trailing words as fit in *overlap* characters are carried over to start
    the next chunk.

    Args:
        text: Input text; empty or whitespace-only text yields ``[]``.
        chunk_size: Target maximum chunk length in characters. A single word
            longer than this is kept whole, and carried-over overlap words
            plus the next word may also exceed it.
        overlap: Maximum number of characters of trailing words repeated at
            the start of the following chunk.

    Returns:
        The chunks in document order.
    """
    if not text or not text.strip():
        return []

    chunks: list[str] = []
    current_words: list[str] = []
    current_len = 0

    for word in text.split():
        # A separating space is only needed when the chunk already has words.
        addition = len(word) + (1 if current_words else 0)

        if current_words and current_len + addition > chunk_size:
            chunks.append(" ".join(current_words))

            # Walk back from the tail to find how many words fit in *overlap*.
            keep = 0
            overlap_len = 0
            for tail_word in reversed(current_words):
                candidate = len(tail_word) + (1 if keep else 0)
                if overlap_len + candidate > overlap:
                    break
                keep += 1
                overlap_len += candidate

            current_words = current_words[-keep:] if keep else []
            current_len = overlap_len

            # Recompute the cost of *word*: if no overlap words were kept it
            # starts the new chunk and must not be charged for a separator.
            addition = len(word) + (1 if current_words else 0)

        current_words.append(word)
        current_len += addition

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def embed_chunks(chunks: list[str]) -> list[list[float]]:
    """Return the model's embeddings for *chunks* as plain Python lists.

    An empty input returns ``[]`` without loading the model.
    """
    if not chunks:
        return []

    vectors = _get_model().embed(chunks)
    # each embedding exposes ``tolist()``; convert so callers get plain lists
    return [vector.tolist() for vector in vectors]
|
fc_rag/indexer.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Index local files into a Qdrant vector store for RAG retrieval."""
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from qdrant_client import QdrantClient
|
|
8
|
+
from qdrant_client.models import Distance, PointStruct, VectorParams
|
|
9
|
+
from rich.progress import Progress
|
|
10
|
+
|
|
11
|
+
from fc_rag.config import get_settings
|
|
12
|
+
from fc_rag.embedder import chunk_text, embed_chunks
|
|
13
|
+
|
|
14
|
+
_SUPPORTED_EXTENSIONS = {".txt", ".md", ".py", ".json"}
|
|
15
|
+
_VECTOR_DIM = 384 # BAAI/bge-small-en-v1.5
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _ensure_collection(client: QdrantClient, name: str) -> None:
    """Create collection *name* (cosine distance, ``_VECTOR_DIM`` wide) if absent."""
    known = {collection.name for collection in client.get_collections().collections}
    if name in known:
        return

    client.create_collection(
        collection_name=name,
        vectors_config=VectorParams(size=_VECTOR_DIM, distance=Distance.COSINE),
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _safety_check(path: Path) -> None:
|
|
29
|
+
# block indexing of firecloud's encrypted chunk storage — those are
|
|
30
|
+
# ciphertext and would just pollute the vector store with garbage
|
|
31
|
+
resolved = path.resolve()
|
|
32
|
+
parts = resolved.parts
|
|
33
|
+
|
|
34
|
+
for i, part in enumerate(parts):
|
|
35
|
+
if part == "chunks" and i > 0 and "firecloud" in parts[i - 1].lower():
|
|
36
|
+
raise ValueError(
|
|
37
|
+
"Do not index encrypted chunk storage. "
|
|
38
|
+
"Pass your original files before encryption."
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
resolved_str = str(resolved).lower()
|
|
42
|
+
if "/firecloud/" in resolved_str and "chunks" in resolved_str:
|
|
43
|
+
raise ValueError(
|
|
44
|
+
"Do not index encrypted chunk storage. "
|
|
45
|
+
"Pass your original files before encryption."
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
if resolved.name == "chunks":
|
|
49
|
+
parent_name = resolved.parent.name.lower() if resolved.parent else ""
|
|
50
|
+
if "firecloud" in parent_name or "storage" in parent_name:
|
|
51
|
+
raise ValueError(
|
|
52
|
+
"Do not index encrypted chunk storage. "
|
|
53
|
+
"Pass your original files before encryption."
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def index_path(path: Path) -> int:
    """Index files at *path* into the local Qdrant collection.

    Only .txt, .md, .py, and .json files are processed. *path* may be a
    single file or a directory, which is searched recursively. Each file is
    chunked, embedded, and upserted as one batch of points whose payload
    holds the filename, file path, chunk index, chunk text, and a UTC
    timestamp.

    Args:
        path: File or directory to index.

    Returns:
        The total number of chunks indexed (0 if *path* is neither a file
        nor a directory, or nothing matched).

    Raises:
        ValueError: If *path* looks like encrypted chunk storage
            (see ``_safety_check``).
    """
    path = Path(path).resolve()
    _safety_check(path)

    settings = get_settings()
    settings.qdrant_path.mkdir(parents=True, exist_ok=True)

    client = QdrantClient(path=str(settings.qdrant_path))
    try:
        _ensure_collection(client, settings.collection_name)

        if path.is_file():
            files = [path] if path.suffix in _SUPPORTED_EXTENSIONS else []
        elif path.is_dir():
            files = sorted(
                f for f in path.rglob("*")
                if f.is_file() and f.suffix in _SUPPORTED_EXTENSIONS
            )
        else:
            return 0

        total_chunks = 0

        with Progress() as progress:
            task = progress.add_task("[cyan]Indexing files…", total=len(files))

            for filepath in files:
                try:
                    content = filepath.read_text(encoding="utf-8", errors="replace")
                except OSError:
                    # Best effort: an unreadable file (permissions, vanished
                    # mid-scan, ...) is skipped rather than aborting the run.
                    progress.advance(task)
                    continue

                chunks = chunk_text(content)
                if not chunks:
                    progress.advance(task)
                    continue

                vectors = embed_chunks(chunks)
                # one timestamp shared by every chunk of this file
                now = datetime.now(timezone.utc).isoformat()

                points = [
                    PointStruct(
                        id=str(uuid.uuid4()),
                        vector=vec,
                        payload={
                            "filename": filepath.name,
                            "filepath": str(filepath),
                            "chunk_index": i,
                            "content": chunk,
                            "indexed_at": now,
                        },
                    )
                    for i, (chunk, vec) in enumerate(zip(chunks, vectors))
                ]

                client.upsert(collection_name=settings.collection_name, points=points)
                total_chunks += len(points)
                progress.advance(task)

        return total_chunks
    finally:
        # Always release the client so the on-disk store is not left held
        # open for a later client on the same path in this process.
        client.close()
|
fc_rag/query_engine.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""RAG pipeline: retrieve context → build prompt → query Ollama."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
|
|
8
|
+
import ollama
|
|
9
|
+
from fc_rag.config import get_settings
|
|
10
|
+
from fc_rag.retriever import retrieve
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def query(user_question: str) -> str:
    """Answer *user_question* via retrieval, prompt assembly, and Ollama.

    Retrieved chunks are numbered and placed in the user message as context.
    The chat call is attempted up to ``settings.max_retries`` times with a
    two-second pause between failures; if every attempt fails a fixed
    "unavailable" message is returned instead. One JSON line describing the
    call is appended to ``settings.log_path`` before returning.
    """
    settings = get_settings()
    start = time.monotonic()

    results = retrieve(user_question)

    # number each retrieved chunk and tag it with its source file
    if results:
        context = "\n\n".join(
            f"[{rank}] (source: {hit.filename})\n{hit.content}"
            for rank, hit in enumerate(results, 1)
        )
    else:
        context = "(No relevant context found.)"

    system_message = {
        "role": "system",
        "content": (
            "Answer only from the provided context. "
            "If the context does not contain the answer, say so."
        ),
    }
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {user_question}",
    }
    messages = [system_message, user_message]

    # start from the fallback; a successful attempt overwrites it
    success = False
    answer: str | None = "Local LLM unavailable. Start Ollama with: ollama serve"

    for attempt in range(1, settings.max_retries + 1):
        try:
            response = ollama.chat(
                model=settings.ollama_model,
                messages=messages,
            )
            answer = response["message"]["content"]
        except (ollama.ResponseError, ConnectionError) as exc:
            print(
                f"[attempt {attempt}/{settings.max_retries}] Ollama error: {exc}",
                file=sys.stderr,
            )
            # no point sleeping after the final attempt
            if attempt < settings.max_retries:
                time.sleep(2)
            continue
        success = True
        break

    elapsed_ms = (time.monotonic() - start) * 1000

    # append one JSON line per query to the log
    settings.log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(settings.log_path, "a", encoding="utf-8") as fh:
        record = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "question_length": len(user_question),
            "chunks_retrieved": len(results),
            "latency_ms": round(elapsed_ms, 2),
            "success": success,
        }
        fh.write(json.dumps(record, default=str) + "\n")

    return answer
|
fc_rag/requirements.txt
ADDED
fc_rag/retriever.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Search the local Qdrant collection for relevant chunks."""
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, ConfigDict
|
|
4
|
+
from qdrant_client import QdrantClient
|
|
5
|
+
|
|
6
|
+
from fc_rag.config import get_settings
|
|
7
|
+
from fc_rag.embedder import embed_chunks
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RetrievalResult(BaseModel):
    """Immutable record for one chunk returned by a vector search."""

    # chunk text
    content: str
    # name of the file the chunk came from
    filename: str
    # score reported for the hit
    score: float
    # position of the chunk within its file
    chunk_index: int

    # frozen: instances cannot be mutated after construction
    model_config = ConfigDict(frozen=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def retrieve(query: str, top_k: int | None = None) -> list[RetrievalResult]:
    """Embed *query* and return the closest chunks from Qdrant.

    Args:
        query: Natural-language query text.
        top_k: Maximum number of hits; falls back to ``settings.top_k`` when
            ``None``.

    Returns:
        One ``RetrievalResult`` per hit, in the order Qdrant returned them.
        Empty if the query produced no embedding.
    """
    settings = get_settings()
    k = top_k if top_k is not None else settings.top_k

    vectors = embed_chunks([query])
    if not vectors:
        return []
    query_vector = vectors[0]

    client = QdrantClient(path=str(settings.qdrant_path))
    try:
        response = client.query_points(
            collection_name=settings.collection_name,
            query=query_vector,
            limit=k,
        )
    finally:
        # Release the client even if the search fails, so the on-disk store
        # is not left held open for the next client on the same path.
        client.close()

    hits: list[RetrievalResult] = []
    for hit in response.points:
        # a point may carry no payload at all; treat that as an empty one
        payload = hit.payload or {}
        hits.append(
            RetrievalResult(
                content=payload.get("content", ""),
                filename=payload.get("filename", "unknown"),
                score=hit.score,
                chunk_index=payload.get("chunk_index", 0),
            )
        )
    return hits
|
firecloud/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
FireCloud — Private, encrypted, distributed storage across your own machines.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from firecloud import Node, Network
|
|
6
|
+
|
|
7
|
+
net = Network.create(passphrase="your-passphrase")
|
|
8
|
+
node = Node(network=net, storage_path="~/.firecloud/storage")
|
|
9
|
+
node.start()
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
|
|
14
|
+
from firecloud.network import Network
|
|
15
|
+
from firecloud.node import Node
|
|
16
|
+
|
|
17
|
+
__all__ = ["Node", "Network", "__version__"]
|
firecloud/chunker.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""FireCloud Chunker Engine.
|
|
2
|
+
|
|
3
|
+
Wraps FastCDC for content-defined chunking with keyed chunk addressing
|
|
4
|
+
and integrity verification.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
import hashlib
|
|
9
|
+
import hmac
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from fastcdc import fastcdc
|
|
12
|
+
|
|
13
|
+
from firecloud.crypto import derive_chunk_id, compute_integrity_hash
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class Chunk:
    """One content-defined slice of a file together with its identifiers."""

    # position of the chunk in the chunk sequence
    index: int
    # offset and length as reported by the chunker
    offset: int
    length: int
    # raw chunk bytes
    data: bytes
    # HMAC-SHA-256 (keyed address)
    chunk_id: str
    # SHA-256 (for verification)
    integrity_hash: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_chunks(cdc_chunks, hmac_key: bytes) -> list[Chunk]:
    """Wrap raw FastCDC results in ``Chunk`` records with both identifiers."""
    return [
        Chunk(
            index=index,
            offset=c.offset,
            length=c.length,
            data=c.data,
            chunk_id=derive_chunk_id(c.data, hmac_key),
            integrity_hash=compute_integrity_hash(c.data),
        )
        for index, c in enumerate(cdc_chunks)
    ]


def chunk_file(
    filepath: Path | str,
    hmac_key: bytes,
    min_size: int = 4096,
    avg_size: int = 16384,
    max_size: int = 65536,
) -> list[Chunk]:
    """Read a file and chunk it with FastCDC.

    Computes the keyed chunk_id and integrity_hash for each chunk.

    Args:
        filepath: File to chunk.
        hmac_key: Key passed to ``derive_chunk_id`` for the chunk address.
        min_size: Minimum chunk size handed to FastCDC.
        avg_size: Average chunk size handed to FastCDC.
        max_size: Maximum chunk size handed to FastCDC.

    Returns:
        Chunks in file order; ``[]`` if the file is missing or empty.
    """
    path = Path(filepath)
    if not path.exists() or path.stat().st_size == 0:
        return []

    # fastcdc expects string filepath or bytes.
    # We use fat=True so that c.data contains the actual chunk bytes.
    cdc_chunks = fastcdc(
        str(path),
        min_size=min_size,
        avg_size=avg_size,
        max_size=max_size,
        fat=True,
    )
    return _build_chunks(cdc_chunks, hmac_key)


def chunk_bytes(
    data: bytes,
    hmac_key: bytes,
    min_size: int = 4096,
    avg_size: int = 16384,
    max_size: int = 65536,
) -> list[Chunk]:
    """Chunk in-memory bytes using FastCDC.

    Same contract as ``chunk_file`` but for data already in memory.

    Args:
        data: Bytes to chunk.
        hmac_key: Key passed to ``derive_chunk_id`` for the chunk address.
        min_size: Minimum chunk size handed to FastCDC.
        avg_size: Average chunk size handed to FastCDC.
        max_size: Maximum chunk size handed to FastCDC.

    Returns:
        Chunks in order; ``[]`` for empty input.
    """
    if not data:
        return []

    # fat=True so that c.data contains the actual chunk bytes.
    cdc_chunks = fastcdc(
        data,
        min_size=min_size,
        avg_size=avg_size,
        max_size=max_size,
        fat=True,
    )
    return _build_chunks(cdc_chunks, hmac_key)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def reassemble_chunks(chunks: list[Chunk]) -> bytes:
    """Concatenate the chunks' data in ascending ``index`` order.

    The input list is left untouched; ordering is done on a copy.
    """
    ordered = list(chunks)
    ordered.sort(key=lambda chunk: chunk.index)
    return b"".join(chunk.data for chunk in ordered)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def compute_file_id(filepath: Path | str, hmac_key: bytes) -> str:
|
|
114
|
+
"""Compute the HMAC-SHA-256 of the entire file content."""
|
|
115
|
+
h = hmac.new(hmac_key, digestmod=hashlib.sha256)
|
|
116
|
+
with open(filepath, "rb") as f:
|
|
117
|
+
while True:
|
|
118
|
+
block = f.read(65536)
|
|
119
|
+
if not block:
|
|
120
|
+
break
|
|
121
|
+
h.update(block)
|
|
122
|
+
return h.hexdigest()
|