firecloud-devnet 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fc_rag/embedder.py ADDED
@@ -0,0 +1,62 @@
1
+ """Local text chunking and embedding via fastembed.
2
+
3
+ Default model: BAAI/bge-small-en-v1.5 (384-dim, CPU-only).
4
+ """
5
+
6
+ from functools import lru_cache
7
+ from fastembed import TextEmbedding
8
+ from fc_rag.config import get_settings
9
+
10
+
11
@lru_cache(maxsize=1)
def _get_model() -> TextEmbedding:
    """Return the shared fastembed model, constructing it on first use.

    The model name is read from ``get_settings().embedding_model``. The
    ``lru_cache(maxsize=1)`` wrapper keeps the constructed instance so that
    later calls reuse it instead of building a new one.
    """
    model_name = get_settings().embedding_model
    return TextEmbedding(model_name=model_name)
15
+
16
+
17
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping chunks on whitespace boundaries.

    Words are never cut. Lengths are measured in characters of the
    space-joined chunk. Consecutive chunks share a tail of whole words
    whose joined length is at most *overlap* characters.

    A chunk only exceeds *chunk_size* when a single word is itself longer
    than *chunk_size*; such a word is emitted as a chunk of its own.

    Args:
        text: Input text; any run of whitespace is treated as one separator.
        chunk_size: Maximum chunk length in characters.
        overlap: Maximum number of characters repeated from the end of the
            previous chunk. Values that would not leave room for the next
            word are reduced automatically; zero or negative disables overlap.

    Returns:
        The chunks in document order, or an empty list for blank input.
    """
    if not text or not text.strip():
        return []

    chunks: list[str] = []
    current_words: list[str] = []
    current_len = 0

    for word in text.split():
        word_len = len(word)
        addition = word_len if not current_words else word_len + 1

        if current_words and current_len + addition > chunk_size:
            chunks.append(" ".join(current_words))

            # The carried-over tail must leave room for the incoming word
            # and its separator. Because the flush condition guarantees
            # current_len > chunk_size - word_len - 1, the tail is always a
            # proper suffix of the flushed chunk, so every chunk adds text.
            budget = min(overlap, chunk_size - word_len - 1)
            tail: list[str] = []
            tail_len = 0
            for previous in reversed(current_words):
                candidate = len(previous) if not tail else len(previous) + 1
                if tail_len + candidate > budget:
                    break
                tail.append(previous)
                tail_len += candidate
            tail.reverse()  # collected back-to-front; restore reading order

            current_words = tail
            current_len = tail_len
            # Recompute: with an empty tail the word starts the chunk and
            # needs no leading separator.
            addition = word_len if not current_words else word_len + 1

        current_words.append(word)
        current_len += addition

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
54
+
55
+
56
def embed_chunks(chunks: list[str]) -> list[list[float]]:
    """Embed *chunks* in one batch with the local fastembed model.

    Returns one vector (as a plain list, via ``tolist()``) per input chunk,
    in input order. An empty input returns an empty list without loading
    the model.
    """
    if not chunks:
        return []
    return [vector.tolist() for vector in _get_model().embed(chunks)]
fc_rag/indexer.py ADDED
@@ -0,0 +1,121 @@
1
+ """Index local files into a Qdrant vector store for RAG retrieval."""
2
+
3
+ import uuid
4
+ from datetime import datetime, timezone
5
+ from pathlib import Path
6
+
7
+ from qdrant_client import QdrantClient
8
+ from qdrant_client.models import Distance, PointStruct, VectorParams
9
+ from rich.progress import Progress
10
+
11
+ from fc_rag.config import get_settings
12
+ from fc_rag.embedder import chunk_text, embed_chunks
13
+
14
+ _SUPPORTED_EXTENSIONS = {".txt", ".md", ".py", ".json"}
15
+ _VECTOR_DIM = 384 # BAAI/bge-small-en-v1.5
16
+
17
+
18
def _ensure_collection(client: QdrantClient, name: str) -> None:
    """Create collection *name* on *client* unless it is already listed.

    A new collection is configured for ``_VECTOR_DIM``-sized vectors
    compared with cosine distance.
    """
    known_names = {
        collection.name for collection in client.get_collections().collections
    }
    if name in known_names:
        return

    vector_settings = VectorParams(size=_VECTOR_DIM, distance=Distance.COSINE)
    client.create_collection(collection_name=name, vectors_config=vector_settings)
26
+
27
+
28
+ def _safety_check(path: Path) -> None:
29
+ # block indexing of firecloud's encrypted chunk storage — those are
30
+ # ciphertext and would just pollute the vector store with garbage
31
+ resolved = path.resolve()
32
+ parts = resolved.parts
33
+
34
+ for i, part in enumerate(parts):
35
+ if part == "chunks" and i > 0 and "firecloud" in parts[i - 1].lower():
36
+ raise ValueError(
37
+ "Do not index encrypted chunk storage. "
38
+ "Pass your original files before encryption."
39
+ )
40
+
41
+ resolved_str = str(resolved).lower()
42
+ if "/firecloud/" in resolved_str and "chunks" in resolved_str:
43
+ raise ValueError(
44
+ "Do not index encrypted chunk storage. "
45
+ "Pass your original files before encryption."
46
+ )
47
+
48
+ if resolved.name == "chunks":
49
+ parent_name = resolved.parent.name.lower() if resolved.parent else ""
50
+ if "firecloud" in parent_name or "storage" in parent_name:
51
+ raise ValueError(
52
+ "Do not index encrypted chunk storage. "
53
+ "Pass your original files before encryption."
54
+ )
55
+
56
+
57
def _collect_files(path: Path) -> list[Path]:
    """Return the indexable files at *path* (a file or a directory), sorted.

    A file qualifies when its suffix, compared case-insensitively, is in
    ``_SUPPORTED_EXTENSIONS`` and ``_safety_check`` accepts it. Files that
    fail the safety check are skipped, so walking a parent directory cannot
    pull encrypted chunk storage into the index.
    """
    if path.is_file():
        candidates = [path]
    elif path.is_dir():
        candidates = sorted(f for f in path.rglob("*") if f.is_file())
    else:
        return []

    selected: list[Path] = []
    for candidate in candidates:
        if candidate.suffix.lower() not in _SUPPORTED_EXTENSIONS:
            continue
        try:
            _safety_check(candidate)
        except ValueError:
            continue
        selected.append(candidate)
    return selected


def _upsert_file_chunks(client, collection_name: str, filepath: Path, content: str) -> int:
    """Chunk, embed and upsert *content* of *filepath*; return the chunk count."""
    chunks = chunk_text(content)
    if not chunks:
        return 0

    vectors = embed_chunks(chunks)
    now = datetime.now(timezone.utc).isoformat()

    points = [
        PointStruct(
            # Deterministic id per (file, chunk position): re-indexing the
            # same file overwrites its points instead of duplicating them.
            id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{filepath}#{i}")),
            vector=vec,
            payload={
                "filename": filepath.name,
                "filepath": str(filepath),
                "chunk_index": i,
                "content": chunk,
                "indexed_at": now,
            },
        )
        for i, (chunk, vec) in enumerate(zip(chunks, vectors))
    ]

    client.upsert(collection_name=collection_name, points=points)
    return len(points)


def index_path(path: Path) -> int:
    """Index files at *path* into the local Qdrant collection.

    Only .txt, .md, .py, and .json files are processed (suffix compared
    case-insensitively). Files that cannot be read are reported on the
    progress console and skipped.

    Args:
        path: A single file or a directory that is walked recursively.

    Returns:
        The total number of chunks indexed; 0 when *path* does not exist or
        contains no indexable files.

    Raises:
        ValueError: If *path* itself points at encrypted chunk storage.
    """
    path = Path(path).resolve()
    _safety_check(path)

    settings = get_settings()
    settings.qdrant_path.mkdir(parents=True, exist_ok=True)

    client = QdrantClient(path=str(settings.qdrant_path))
    try:
        _ensure_collection(client, settings.collection_name)

        files = _collect_files(path)
        total_chunks = 0

        with Progress() as progress:
            task = progress.add_task("[cyan]Indexing files…", total=len(files))

            for filepath in files:
                try:
                    content = filepath.read_text(encoding="utf-8", errors="replace")
                except OSError as exc:
                    # Best effort: one unreadable file must not abort the run.
                    progress.console.print(
                        f"Skipping {filepath}: {exc}", markup=False
                    )
                else:
                    total_chunks += _upsert_file_chunks(
                        client, settings.collection_name, filepath, content
                    )
                progress.advance(task)
    finally:
        # Close the client explicitly instead of leaving its storage handle
        # to garbage collection.
        client.close()

    return total_chunks
fc_rag/query_engine.py ADDED
@@ -0,0 +1,79 @@
1
+ """RAG pipeline: retrieve context → build prompt → query Ollama."""
2
+
3
+ import json
4
+ import sys
5
+ import time
6
+ from datetime import datetime, timezone
7
+
8
+ import ollama
9
+ from fc_rag.config import get_settings
10
+ from fc_rag.retriever import retrieve
11
+
12
+
13
def _format_context(results) -> str:
    """Render retrieved chunks as a numbered context block for the prompt."""
    if not results:
        return "(No relevant context found.)"
    return "\n\n".join(
        f"[{i}] (source: {r.filename})\n{r.content}"
        for i, r in enumerate(results, 1)
    )


def _append_query_log(log_path, record: dict) -> None:
    """Append *record* as one JSON line to *log_path*, best effort.

    A logging failure is reported on stderr instead of being raised, so it
    can never discard an answer that was already produced.
    """
    try:
        log_path.parent.mkdir(parents=True, exist_ok=True)
        with open(log_path, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(record, default=str) + "\n")
    except OSError as exc:
        print(f"Could not write query log {log_path}: {exc}", file=sys.stderr)


def query(user_question: str) -> str:
    """Run the full RAG pipeline and return the LLM's answer.

    Steps: retrieve context chunks for the question, build a system + user
    prompt around them, ask Ollama (up to ``settings.max_retries`` attempts,
    pausing 2 seconds between attempts), then append a metrics record to the
    query log. The log stores only lengths, counts, latency and the success
    flag -- not the question or answer text.

    Args:
        user_question: The question to answer from the indexed documents.

    Returns:
        The model's answer, or a fixed hint telling the user to start Ollama
        when every attempt failed.
    """
    settings = get_settings()
    start = time.monotonic()

    results = retrieve(user_question)
    context = _format_context(results)

    messages = [
        {
            "role": "system",
            "content": (
                "Answer only from the provided context. "
                "If the context does not contain the answer, say so."
            ),
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {user_question}",
        },
    ]

    # Start from the fallback so the return value is always a string.
    answer = "Local LLM unavailable. Start Ollama with: ollama serve"
    success = False

    for attempt in range(1, settings.max_retries + 1):
        try:
            response = ollama.chat(
                model=settings.ollama_model,
                messages=messages,
            )
            answer = response["message"]["content"]
            success = True
            break
        except (ollama.ResponseError, ConnectionError) as exc:
            print(
                f"[attempt {attempt}/{settings.max_retries}] Ollama error: {exc}",
                file=sys.stderr,
            )
            if attempt < settings.max_retries:
                time.sleep(2)

    elapsed_ms = (time.monotonic() - start) * 1000

    _append_query_log(
        settings.log_path,
        {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "question_length": len(user_question),
            "chunks_retrieved": len(results),
            "latency_ms": round(elapsed_ms, 2),
            "success": success,
        },
    )

    return answer
@@ -0,0 +1,6 @@
1
+ qdrant-client
2
+ fastembed
3
+ ollama
4
+ pydantic>=2.0
5
+ click
6
+ rich
fc_rag/retriever.py ADDED
@@ -0,0 +1,46 @@
1
+ """Search the local Qdrant collection for relevant chunks."""
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+ from qdrant_client import QdrantClient
5
+
6
+ from fc_rag.config import get_settings
7
+ from fc_rag.embedder import embed_chunks
8
+
9
+
10
class RetrievalResult(BaseModel):
    """One chunk returned by a vector search, with its similarity score."""

    # frozen=True: instances reject attribute assignment after creation.
    model_config = ConfigDict(frozen=True)

    content: str  # chunk text taken from the point payload
    filename: str  # name of the file the chunk came from
    score: float  # score reported by Qdrant for this hit
    chunk_index: int  # position of the chunk within its file
17
+
18
+
19
def retrieve(query: str, top_k: int | None = None) -> list[RetrievalResult]:
    """Embed *query* and return the closest chunks from Qdrant.

    Args:
        query: Free-text search query.
        top_k: Maximum number of hits; falls back to ``settings.top_k``
            when ``None``.

    Returns:
        The hits in the order Qdrant returned them. Hits with a missing
        payload or missing payload keys get the defaults ``""`` /
        ``"unknown"`` / ``0`` instead of raising.
    """
    settings = get_settings()
    k = top_k if top_k is not None else settings.top_k

    vectors = embed_chunks([query])
    if not vectors:
        return []

    client = QdrantClient(path=str(settings.qdrant_path))
    try:
        response = client.query_points(
            collection_name=settings.collection_name,
            query=vectors[0],
            limit=k,
        )
    finally:
        # Close the client explicitly instead of leaving its storage handle
        # to garbage collection.
        client.close()

    hits: list[RetrievalResult] = []
    for hit in response.points:
        payload = hit.payload or {}  # payload may be absent on a point
        hits.append(
            RetrievalResult(
                content=payload.get("content", ""),
                filename=payload.get("filename", "unknown"),
                score=hit.score,
                chunk_index=payload.get("chunk_index", 0),
            )
        )
    return hits
firecloud/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """
2
+ FireCloud — Private, encrypted, distributed storage across your own machines.
3
+
4
+ Usage:
5
+ from firecloud import Node, Network
6
+
7
+ net = Network.create(passphrase="your-passphrase")
8
+ node = Node(network=net, storage_path="~/.firecloud/storage")
9
+ node.start()
10
+ """
11
+
12
+ __version__ = "0.1.0"
13
+
14
+ from firecloud.network import Network
15
+ from firecloud.node import Node
16
+
17
+ __all__ = ["Node", "Network", "__version__"]
firecloud/chunker.py ADDED
@@ -0,0 +1,122 @@
1
+ """FireCloud Chunker Engine.
2
+
3
+ Wraps FastCDC for content-defined chunking with keyed chunk addressing
4
+ and integrity verification.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ import hashlib
9
+ import hmac
10
+ from pathlib import Path
11
+ from fastcdc import fastcdc
12
+
13
+ from firecloud.crypto import derive_chunk_id, compute_integrity_hash
14
+
15
+
16
@dataclass
class Chunk:
    """One content-defined slice of a file plus its address and checksum."""

    index: int  # position of the chunk in chunking order
    offset: int  # offset reported by FastCDC for this chunk
    length: int  # length reported by FastCDC for this chunk
    data: bytes  # raw chunk bytes
    chunk_id: str  # HMAC-SHA-256 (keyed address)
    integrity_hash: str  # SHA-256 (for verification)
25
+
26
+
27
def _build_chunks(cdc_chunks, hmac_key: bytes) -> list[Chunk]:
    """Turn FastCDC results into ``Chunk`` records with id and integrity hash.

    Shared by ``chunk_file`` and ``chunk_bytes`` so both derive addresses
    and hashes in exactly the same way.
    """
    chunks: list[Chunk] = []
    for index, cdc_chunk in enumerate(cdc_chunks):
        chunk_data = cdc_chunk.data
        chunks.append(
            Chunk(
                index=index,
                offset=cdc_chunk.offset,
                length=cdc_chunk.length,
                data=chunk_data,
                chunk_id=derive_chunk_id(chunk_data, hmac_key),
                integrity_hash=compute_integrity_hash(chunk_data),
            )
        )
    return chunks


def chunk_file(
    filepath: Path | str,
    hmac_key: bytes,
    min_size: int = 4096,
    avg_size: int = 16384,
    max_size: int = 65536,
) -> list[Chunk]:
    """Read a file and chunk it with FastCDC.

    Computes the keyed chunk_id and integrity_hash for each chunk.

    Args:
        filepath: File to chunk.
        hmac_key: Key passed to ``derive_chunk_id`` for chunk addressing.
        min_size: Minimum chunk size handed to FastCDC.
        avg_size: Target average chunk size handed to FastCDC.
        max_size: Maximum chunk size handed to FastCDC.

    Returns:
        The chunks in file order; an empty list when the file is missing
        or empty.
    """
    path = Path(filepath)
    if not path.exists() or path.stat().st_size == 0:
        return []

    # fastcdc is given the path as a string here; fat=True makes each
    # result carry the actual chunk bytes in ``.data``.
    cdc_chunks = fastcdc(
        str(path),
        min_size=min_size,
        avg_size=avg_size,
        max_size=max_size,
        fat=True,
    )
    return _build_chunks(cdc_chunks, hmac_key)


def chunk_bytes(
    data: bytes,
    hmac_key: bytes,
    min_size: int = 4096,
    avg_size: int = 16384,
    max_size: int = 65536,
) -> list[Chunk]:
    """Chunk in-memory bytes using FastCDC.

    Takes the same key and size parameters as ``chunk_file``.

    Returns:
        The chunks in order; an empty list for empty *data*.
    """
    if not data:
        return []

    cdc_chunks = fastcdc(
        data,
        min_size=min_size,
        avg_size=avg_size,
        max_size=max_size,
        fat=True,
    )
    return _build_chunks(cdc_chunks, hmac_key)
105
+
106
+
107
def reassemble_chunks(chunks: list[Chunk]) -> bytes:
    """Concatenate chunk payloads in ascending ``index`` order.

    The input list is not modified; chunks may be passed in any order.
    """
    buffer = bytearray()
    for piece in sorted(chunks, key=lambda item: item.index):
        buffer += piece.data
    return bytes(buffer)
111
+
112
+
113
+ def compute_file_id(filepath: Path | str, hmac_key: bytes) -> str:
114
+ """Compute the HMAC-SHA-256 of the entire file content."""
115
+ h = hmac.new(hmac_key, digestmod=hashlib.sha256)
116
+ with open(filepath, "rb") as f:
117
+ while True:
118
+ block = f.read(65536)
119
+ if not block:
120
+ break
121
+ h.update(block)
122
+ return h.hexdigest()