rapid-rag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ # Secrets & env
2
+ .env
3
+ *.env
4
+ *.secret
5
+
6
+ # Keys & certs
7
+ *.key
8
+ *.pem
9
+ certs/
10
+ secrets/
11
+
12
+ # Databases & dumps
13
+ *.db
14
+ *.sqlite
15
+ *.sql
16
+ dump_*/
17
+
18
+ # EXCEPT: Allow database schemas (needed for server rebuild)
19
+ !database-schemas/*.sql
20
+
21
+ # Logs & runtime data
22
+ logs/
23
+ *.log
24
+ __pycache__/
25
+ *.pyc
26
+ venv/
27
+ .venv/
28
+ **/venv/
29
+ **/.venv/
30
+
31
+ # Configs containing secrets (templates will be used later)
32
+ config/
33
+ brain_api/provisioning.local.json
34
+ brain_api/provisioning.json
35
+
36
+ # Landing pages (private - not open source)
37
+ landing-pages/
38
+ humotica.com/
39
+ jtel.nl/
40
+
41
+ # Social media posts (strategy - not open source)
42
+ SOCIAL-MEDIA-POSTS.md
43
+ HN-POST-UNDER-4000.md
44
+ STRATO-DEPLOY-HUMOTICA.md
45
+
46
+ # Endorsement outreach (private contact)
47
+ ARXIV-ENDORSEMENT-OUTREACH.md
48
+
49
+ # Deployment secrets
50
+ DEPLOYMENT-GUIDE.md
51
+
52
+ # R Project files (Dirty Data Challenge)
53
+ .Rproj.user
54
+ .Rhistory
55
+ .RData
56
+ .Ruserdata
57
+ *.zip
58
+ .mural_tokens.json
59
+ auth.json
60
+ gen-lang-client*.json
61
+ *.credentials.json
62
+
63
+ # Rust build artifacts
64
+ **/target/
65
+ *.whl
66
+
67
+ # Compiled binaries (build locally)
68
+ jis-router/jis-router
69
+ sentinel-rs/sentinel-rs
70
+
71
+ # Build distribution
72
+ sandbox/ai/codex/dist/
73
+ sandbox_backup/
74
+ did-jis-core
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.4
2
+ Name: rapid-rag
3
+ Version: 0.1.0
4
+ Summary: Fast local RAG - search your documents with AI, no cloud needed
5
+ Project-URL: Homepage, https://humotica.com
6
+ Project-URL: Repository, https://github.com/humotica/rapid-rag
7
+ Project-URL: Documentation, https://humotica.com/docs/rapid-rag
8
+ Author-email: "J. van de Meent" <jasper@humotica.com>, "R. AI" <info@humotica.com>
9
+ License: MIT
10
+ Keywords: ai,augmented,chromadb,documents,embeddings,generation,llm,local,offline,rag,retrieval,search,semantic-search,vector-search
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Requires-Python: >=3.10
24
+ Requires-Dist: chromadb>=0.4.0
25
+ Requires-Dist: httpx>=0.24.0
26
+ Requires-Dist: sentence-transformers>=2.2.0
27
+ Provides-Extra: all
28
+ Requires-Dist: fastapi>=0.100.0; extra == 'all'
29
+ Requires-Dist: ollama>=0.1.0; extra == 'all'
30
+ Requires-Dist: pdfplumber>=0.9.0; extra == 'all'
31
+ Requires-Dist: pypdf>=3.0.0; extra == 'all'
32
+ Requires-Dist: uvicorn>=0.22.0; extra == 'all'
33
+ Provides-Extra: api
34
+ Requires-Dist: fastapi>=0.100.0; extra == 'api'
35
+ Requires-Dist: uvicorn>=0.22.0; extra == 'api'
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest>=7.0; extra == 'dev'
38
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
39
+ Provides-Extra: ollama
40
+ Requires-Dist: ollama>=0.1.0; extra == 'ollama'
41
+ Provides-Extra: pdf
42
+ Requires-Dist: pdfplumber>=0.9.0; extra == 'pdf'
43
+ Requires-Dist: pypdf>=3.0.0; extra == 'pdf'
44
+ Description-Content-Type: text/markdown
45
+
46
+ # rapid-rag
47
+
48
+ Fast local RAG - search your documents with AI, no cloud needed.
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install rapid-rag
54
+ ```
55
+
56
+ For PDF support:
57
+ ```bash
58
+ pip install "rapid-rag[pdf]"
59
+ ```
60
+
61
+ ## Quick Start
62
+
63
+ ```python
64
+ from rapid_rag import RapidRAG
65
+
66
+ # Create a RAG instance
67
+ rag = RapidRAG("my_documents")
68
+
69
+ # Add documents
70
+ rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
71
+ rag.add_file("report.pdf")
72
+ rag.add_directory("./docs/")
73
+
74
+ # Semantic search
75
+ results = rag.search("fox jumping")
76
+ for r in results:
77
+ print(f"{r['score']:.3f}: {r['content'][:100]}")
78
+
79
+ # RAG query with LLM (requires Ollama)
80
+ answer = rag.query("What does the fox do?", model="qwen2.5:7b")
81
+ print(answer["answer"])
82
+ ```
83
+
84
+ ## CLI Usage
85
+
86
+ ```bash
87
+ # Initialize a collection
88
+ rapid-rag init my_docs
89
+
90
+ # Add documents
91
+ rapid-rag add ./documents/ -c my_docs -r
92
+
93
+ # Search
94
+ rapid-rag search "query here" -c my_docs
95
+
96
+ # RAG query (requires Ollama)
97
+ rapid-rag query "What is X?" -c my_docs -m qwen2.5:7b
98
+
99
+ # Info
100
+ rapid-rag info -c my_docs
101
+ ```
102
+
103
+ ## Features
104
+
105
+ - **Local-first**: Everything runs on your machine
106
+ - **Fast**: ChromaDB + sentence-transformers
107
+ - **Simple API**: Add, search, query in 3 lines
108
+ - **File support**: .txt, .md, .pdf
109
+ - **Chunking**: Automatic with overlap
110
+ - **LLM integration**: Works with Ollama
111
+
112
+ ## Requirements
113
+
114
+ - Python 3.10+
115
+ - For LLM queries: [Ollama](https://ollama.ai) running locally
116
+
117
+ ## License
118
+
119
+ MIT - Humotica
@@ -0,0 +1,74 @@
1
+ # rapid-rag
2
+
3
+ Fast local RAG - search your documents with AI, no cloud needed.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install rapid-rag
9
+ ```
10
+
11
+ For PDF support:
12
+ ```bash
13
+ pip install "rapid-rag[pdf]"
14
+ ```
15
+
16
+ ## Quick Start
17
+
18
+ ```python
19
+ from rapid_rag import RapidRAG
20
+
21
+ # Create a RAG instance
22
+ rag = RapidRAG("my_documents")
23
+
24
+ # Add documents
25
+ rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
26
+ rag.add_file("report.pdf")
27
+ rag.add_directory("./docs/")
28
+
29
+ # Semantic search
30
+ results = rag.search("fox jumping")
31
+ for r in results:
32
+ print(f"{r['score']:.3f}: {r['content'][:100]}")
33
+
34
+ # RAG query with LLM (requires Ollama)
35
+ answer = rag.query("What does the fox do?", model="qwen2.5:7b")
36
+ print(answer["answer"])
37
+ ```
38
+
39
+ ## CLI Usage
40
+
41
+ ```bash
42
+ # Initialize a collection
43
+ rapid-rag init my_docs
44
+
45
+ # Add documents
46
+ rapid-rag add ./documents/ -c my_docs -r
47
+
48
+ # Search
49
+ rapid-rag search "query here" -c my_docs
50
+
51
+ # RAG query (requires Ollama)
52
+ rapid-rag query "What is X?" -c my_docs -m qwen2.5:7b
53
+
54
+ # Info
55
+ rapid-rag info -c my_docs
56
+ ```
57
+
58
+ ## Features
59
+
60
+ - **Local-first**: Everything runs on your machine
61
+ - **Fast**: ChromaDB + sentence-transformers
62
+ - **Simple API**: Add, search, query in 3 lines
63
+ - **File support**: .txt, .md, .pdf
64
+ - **Chunking**: Automatic with overlap
65
+ - **LLM integration**: Works with Ollama
66
+
67
+ ## Requirements
68
+
69
+ - Python 3.10+
70
+ - For LLM queries: [Ollama](https://ollama.ai) running locally
71
+
72
+ ## License
73
+
74
+ MIT - Humotica
@@ -0,0 +1,60 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "rapid-rag"
7
+ version = "0.1.0"
8
+ description = "Fast local RAG - search your documents with AI, no cloud needed"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "J. van de Meent", email = "jasper@humotica.com"},
14
+ {name = "R. AI", email = "info@humotica.com"},
15
+ ]
16
+ keywords = [
17
+ "rag", "retrieval", "augmented", "generation", "local",
18
+ "chromadb", "embeddings", "search", "documents", "ai",
19
+ "vector-search", "semantic-search", "llm", "offline"
20
+ ]
21
+ classifiers = [
22
+ "Development Status :: 4 - Beta",
23
+ "Intended Audience :: Developers",
24
+ "Intended Audience :: Science/Research",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Operating System :: OS Independent",
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.10",
29
+ "Programming Language :: Python :: 3.11",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Programming Language :: Python :: 3.13",
32
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
33
+ "Topic :: Text Processing :: Indexing",
34
+ ]
35
+ dependencies = [
36
+ "chromadb>=0.4.0",
37
+ "sentence-transformers>=2.2.0",
38
+ "httpx>=0.24.0",
39
+ ]
40
+
41
+ [project.optional-dependencies]
42
+ api = ["fastapi>=0.100.0", "uvicorn>=0.22.0"]
43
+ pdf = ["pypdf>=3.0.0", "pdfplumber>=0.9.0"]
44
+ ollama = ["ollama>=0.1.0"]
45
+ all = ["rapid-rag[api,pdf,ollama]"]
46
+ dev = ["pytest>=7.0", "ruff>=0.1.0"]
47
+
48
+ [project.urls]
49
+ Homepage = "https://humotica.com"
50
+ Repository = "https://github.com/humotica/rapid-rag"
51
+ Documentation = "https://humotica.com/docs/rapid-rag"
52
+
53
+ [project.scripts]
54
+ rapid-rag = "rapid_rag.cli:main"
55
+
56
+ [tool.hatch.build.targets.sdist]
57
+ include = ["/src"]
58
+
59
+ [tool.hatch.build.targets.wheel]
60
+ packages = ["src/rapid_rag"]
@@ -0,0 +1,27 @@
1
+ """
2
+ rapid-rag: Fast local RAG - search your documents with AI, no cloud needed.
3
+
4
+ Usage:
5
+ from rapid_rag import RapidRAG
6
+
7
+ # Create a RAG instance
8
+ rag = RapidRAG("my_documents")
9
+
10
+ # Add documents
11
+ rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
12
+ rag.add_file("report.pdf")
13
+ rag.add_directory("./docs/")
14
+
15
+ # Search
16
+ results = rag.search("fox jumping")
17
+
18
+ # RAG query (with LLM)
19
+ answer = rag.query("What does the fox do?", model="qwen2.5:7b")
20
+ """
21
+
22
+ from .core import RapidRAG
23
+ from .ingest import DocumentIngester
24
+ from .search import SemanticSearch
25
+
26
+ __version__ = "0.1.0"
27
+ __all__ = ["RapidRAG", "DocumentIngester", "SemanticSearch"]
@@ -0,0 +1,150 @@
1
+ """
2
+ rapid-rag CLI - Command line interface for local RAG.
3
+ """
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+
10
def _add_collection_options(subparser):
    """Attach the options shared by every command that opens a collection."""
    subparser.add_argument("-c", "--collection", default="default", help="Collection name")
    # Without --dir a collection created via `init --dir ...` would be
    # unreachable from every other command.
    subparser.add_argument(
        "--dir",
        default=None,
        help="Persist directory (default: ./rapid_rag_data/<collection>)",
    )


def _build_parser():
    """Build the argument parser with one sub-parser per CLI command."""
    parser = argparse.ArgumentParser(
        prog="rapid-rag",
        description="Fast local RAG - search your documents with AI"
    )
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Init command
    init_parser = subparsers.add_parser("init", help="Initialize a new RAG collection")
    init_parser.add_argument("name", help="Collection name")
    init_parser.add_argument("--dir", help="Persist directory", default=None)

    # Add command
    add_parser = subparsers.add_parser("add", help="Add documents")
    add_parser.add_argument("path", help="File or directory to add")
    _add_collection_options(add_parser)
    add_parser.add_argument("-r", "--recursive", action="store_true", help="Recursive directory scan")
    add_parser.add_argument("--chunk-size", type=int, default=1000, help="Chunk size")
    add_parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap")

    # Search command
    search_parser = subparsers.add_parser("search", help="Semantic search")
    search_parser.add_argument("query", help="Search query")
    _add_collection_options(search_parser)
    search_parser.add_argument("-n", "--num", type=int, default=5, help="Number of results")

    # Query command (RAG with LLM)
    query_parser = subparsers.add_parser("query", help="RAG query with LLM")
    query_parser.add_argument("question", help="Question to answer")
    _add_collection_options(query_parser)
    query_parser.add_argument("-m", "--model", default="qwen2.5:7b", help="Ollama model")
    query_parser.add_argument("-n", "--num", type=int, default=5, help="Context documents")

    # Info command
    info_parser = subparsers.add_parser("info", help="Collection info")
    _add_collection_options(info_parser)

    # Clear command
    clear_parser = subparsers.add_parser("clear", help="Clear collection")
    _add_collection_options(clear_parser)
    clear_parser.add_argument("-y", "--yes", action="store_true", help="Skip confirmation")

    return parser


def main(argv=None):
    """
    Main CLI entry point.

    Args:
        argv: Argument list to parse (without the program name). When None,
            argparse falls back to ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 1 when the path given to ``add``
        is neither a file nor a directory.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        return 0

    # Cheap validations run before the heavy import below, so a bad path or
    # a cancelled confirmation neither loads the database stack nor creates
    # a data directory as a side effect.
    if args.command == "add":
        path = Path(args.path)
        if not (path.is_file() or path.is_dir()):
            print(f"Path not found: {path}", file=sys.stderr)
            return 1

    if args.command == "clear" and not args.yes:
        confirm = input(f"Clear collection '{args.collection}'? [y/N] ")
        if confirm.strip().lower() not in ("y", "yes"):
            print("Cancelled.")
            return 0

    # Import here to avoid slow startup
    from .core import RapidRAG

    if args.command == "init":
        rag = RapidRAG(args.name, persist_dir=args.dir)
        print(f"Initialized collection '{args.name}'")
        print(f"Persist dir: {rag.persist_dir}")
        return 0

    # Every remaining command operates on an existing (or default) collection.
    rag = RapidRAG(args.collection, persist_dir=args.dir)

    if args.command == "add":
        if path.is_file():
            ids = rag.add_file(
                path,
                chunk_size=args.chunk_size,
                chunk_overlap=args.chunk_overlap
            )
            print(f"Added {len(ids)} chunks from {path.name}")
        else:
            ids = rag.add_directory(
                path,
                recursive=args.recursive,
                chunk_size=args.chunk_size,
                chunk_overlap=args.chunk_overlap
            )
            print(f"Added {len(ids)} chunks total")
        return 0

    if args.command == "search":
        results = rag.search(args.query, n_results=args.num)

        if not results:
            print("No results found.")
            return 0

        for i, r in enumerate(results, 1):
            score = r["score"]
            source = r["metadata"].get("source", r["id"])
            content = r["content"][:200].replace("\n", " ")
            print(f"\n[{i}] {source} (score: {score:.3f})")
            print(f"    {content}...")

        return 0

    if args.command == "query":
        result = rag.query(args.question, n_results=args.num, model=args.model)

        print("\n" + "=" * 60)
        print("ANSWER:")
        print("=" * 60)
        print(result["answer"])
        print("\n" + "-" * 60)
        print("SOURCES:")
        for s in result["sources"]:
            source = s["metadata"].get("source", s["id"])
            print(f"  - {source} (score: {s['score']:.3f})")

        return 0

    if args.command == "info":
        print(f"Collection: {args.collection}")
        print(f"Documents: {rag.count()}")
        print(f"Persist dir: {rag.persist_dir}")
        print(f"Embedding model: {rag.embedding_model}")
        return 0

    if args.command == "clear":
        rag.clear()
        print(f"Cleared collection '{args.collection}'")
        return 0

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,417 @@
1
+ """
2
+ Core RapidRAG class - the main interface for local RAG.
3
+ """
4
+
5
+ import os
6
+ import hashlib
7
+ from pathlib import Path
8
+ from typing import Optional, List, Dict, Any, Union
9
+ from datetime import datetime
10
+
11
+ import chromadb
12
+ from chromadb.config import Settings
13
+
14
+ # Optional imports
15
+ try:
16
+ import httpx
17
+ HTTPX_AVAILABLE = True
18
+ except ImportError:
19
+ HTTPX_AVAILABLE = False
20
+
21
+
22
class RapidRAG:
    """
    Fast local RAG - search your documents with AI, no cloud needed.

    Documents are embedded with a sentence-transformers model and stored in
    a persistent ChromaDB collection that uses cosine distance.

    Example:
        rag = RapidRAG("my_project")
        rag.add("doc1", "Some text content")
        results = rag.search("query")
        answer = rag.query("What is...?", model="qwen2.5:7b")
    """

    def __init__(
        self,
        collection_name: str = "default",
        persist_dir: Optional[str] = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        ollama_url: str = "http://localhost:11434",
    ):
        """
        Initialize RapidRAG.

        Args:
            collection_name: Name for the document collection
            persist_dir: Directory to persist the database
                (default: ./rapid_rag_data/<collection_name>)
            embedding_model: Sentence-transformers model for embeddings
            ollama_url: URL for Ollama API (for LLM queries)
        """
        self.collection_name = collection_name
        self.persist_dir = persist_dir or f"./rapid_rag_data/{collection_name}"
        self.embedding_model = embedding_model
        self.ollama_url = ollama_url

        # Create persist directory
        os.makedirs(self.persist_dir, exist_ok=True)

        # Initialize ChromaDB with persistence
        self.client = chromadb.PersistentClient(
            path=self.persist_dir,
            settings=Settings(anonymized_telemetry=False)
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )

        # Lazy load embedding model
        self._embedder = None

    @property
    def embedder(self):
        """Lazy load sentence-transformers model."""
        if self._embedder is None:
            from sentence_transformers import SentenceTransformer
            self._embedder = SentenceTransformer(self.embedding_model)
        return self._embedder

    def _generate_id(self, content: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of ``content``."""
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def add(
        self,
        doc_id: str,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Add a document to the collection.

        Args:
            doc_id: Unique document identifier
            content: Text content to index
            metadata: Optional metadata dict (not modified)

        Returns:
            Document ID
        """
        # Work on a copy so the caller's dict is never mutated.
        metadata = dict(metadata or {})
        metadata["added_at"] = datetime.now().isoformat()
        metadata["content_hash"] = self._generate_id(content)

        # Generate embedding
        embedding = self.embedder.encode(content).tolist()

        # Add to collection
        self.collection.add(
            ids=[doc_id],
            embeddings=[embedding],
            documents=[content],
            metadatas=[metadata]
        )

        return doc_id

    def add_texts(
        self,
        texts: List[str],
        ids: Optional[List[str]] = None,
        metadatas: Optional[List[Dict]] = None
    ) -> List[str]:
        """
        Add multiple documents at once (faster).

        Args:
            texts: List of text contents
            ids: Optional list of IDs (auto-generated from the content hash
                if not provided)
            metadatas: Optional list of metadata dicts (not modified)

        Returns:
            List of document IDs (empty when ``texts`` is empty)

        Raises:
            ValueError: If ``ids`` or ``metadatas`` differ in length from
                ``texts``.
        """
        # Nothing to embed or store, e.g. an empty file produced no chunks.
        if not texts:
            return []

        if ids is None:
            ids = [self._generate_id(t) for t in texts]

        if metadatas is None:
            metadatas = [{} for _ in texts]

        if len(ids) != len(texts) or len(metadatas) != len(texts):
            raise ValueError("texts, ids and metadatas must have the same length")

        # Add timestamps on copies so the caller's dicts stay untouched.
        now = datetime.now().isoformat()
        metadatas = [{**(meta or {}), "added_at": now} for meta in metadatas]

        # Generate embeddings (batch)
        embeddings = self.embedder.encode(texts).tolist()

        # Add to collection
        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas
        )

        return ids

    def add_file(
        self,
        file_path: Union[str, Path],
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ) -> List[str]:
        """
        Add a file to the collection.

        Supports: .txt, .md, .pdf (with pdf extra). Any other extension is
        read as UTF-8 text with undecodable bytes ignored.

        Args:
            file_path: Path to file
            chunk_size: Characters per chunk
            chunk_overlap: Overlap between chunks

        Returns:
            List of chunk IDs

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the chunking parameters are invalid.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Only PDFs need a dedicated reader; everything else is read as text.
        if file_path.suffix.lower() == ".pdf":
            content = self._read_pdf(file_path)
        else:
            content = file_path.read_text(encoding="utf-8", errors="ignore")

        # Chunk the content
        chunks = self._chunk_text(content, chunk_size, chunk_overlap)
        total = len(chunks)

        # NOTE(review): IDs are derived from the file stem only, so two files
        # with the same stem (docs/a/index.md and docs/b/index.md, or
        # report.txt and report.md) produce identical chunk IDs. The format is
        # kept for compatibility with already stored collections.
        ids = [f"{file_path.stem}_{i}" for i in range(total)]
        metadatas = [
            {
                "source": str(file_path),
                "chunk_index": i,
                "total_chunks": total
            }
            for i in range(total)
        ]

        return self.add_texts(chunks, ids, metadatas)

    def add_directory(
        self,
        dir_path: Union[str, Path],
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Add all files in a directory.

        Files that fail to ingest are reported on stdout and skipped, so one
        bad file does not abort the whole run.

        Args:
            dir_path: Path to directory
            extensions: File extensions to include (default: [".txt", ".md", ".pdf"])
            recursive: Search subdirectories
            **kwargs: Passed to add_file()

        Returns:
            List of all chunk IDs
        """
        dir_path = Path(dir_path)
        extensions = extensions or [".txt", ".md", ".pdf"]

        all_ids = []
        pattern = "**/*" if recursive else "*"

        for ext in extensions:
            for file_path in dir_path.glob(f"{pattern}{ext}"):
                if not file_path.is_file():
                    continue
                # Deliberately broad: this is a best-effort bulk import.
                try:
                    ids = self.add_file(file_path, **kwargs)
                except Exception as e:
                    print(f"✗ {file_path.name}: {e}")
                else:
                    all_ids.extend(ids)
                    print(f"✓ {file_path.name}: {len(ids)} chunks")

        return all_ids

    def search(
        self,
        query: str,
        n_results: int = 5,
        where: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Semantic search in the collection.

        Args:
            query: Search query
            n_results: Number of results to return
            where: Optional filter dict

        Returns:
            List of dicts with "id", "content", "metadata" and "score",
            where score is ``1 - distance`` as reported by the collection.
        """
        # Generate query embedding
        query_embedding = self.embedder.encode(query).tolist()

        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            where=where,
            include=["documents", "metadatas", "distances"]
        )

        # A single query was sent, so only the first row of each field matters.
        rows = zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        return [
            {
                "id": doc_id,
                "content": document,
                # Callers use .get() on the metadata, so never hand out None.
                "metadata": metadata or {},
                "score": 1 - distance  # Convert distance to similarity
            }
            for doc_id, document, metadata, distance in rows
        ]

    def query(
        self,
        question: str,
        n_results: int = 5,
        model: str = "qwen2.5:7b",
        system_prompt: Optional[str] = None,
        timeout: float = 60.0
    ) -> Dict[str, Any]:
        """
        RAG query - search + LLM analysis.

        Args:
            question: Question to answer
            n_results: Number of documents to retrieve
            model: Ollama model to use
            system_prompt: Optional system prompt
            timeout: Seconds to wait for the Ollama response

        Returns:
            Dict with "answer", "sources", "query" and "model". If the LLM
            call fails, "answer" holds a text starting with "LLM error:".

        Raises:
            ImportError: If httpx is not installed.
        """
        if not HTTPX_AVAILABLE:
            raise ImportError("httpx required for LLM queries: pip install httpx")

        # Search for relevant documents
        results = self.search(question, n_results=n_results)

        if not results:
            # Same keys as the regular return so callers can rely on the shape.
            return {
                "answer": "No relevant documents found.",
                "sources": [],
                "query": question,
                "model": model
            }

        # Build context from results
        context = "\n\n---\n\n".join(
            f"[Source: {r['metadata'].get('source', r['id'])}]\n{r['content']}"
            for r in results
        )

        # Default system prompt
        if system_prompt is None:
            system_prompt = (
                "You are a helpful assistant. Answer questions based on the provided context. "
                "If the answer is not in the context, say so. Cite your sources."
            )

        # Build prompt
        prompt = f"""Context:
{context}

Question: {question}

Answer based on the context above:"""

        # Call Ollama. Failures are reported in the answer rather than raised
        # so the retrieved sources are still returned to the caller.
        try:
            with httpx.Client(timeout=timeout) as client:
                response = client.post(
                    # Tolerate a configured base URL with a trailing slash.
                    f"{self.ollama_url.rstrip('/')}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "system": system_prompt,
                        "stream": False
                    }
                )
                response.raise_for_status()
                answer = response.json().get("response", "")
        except Exception as e:
            answer = f"LLM error: {e}"

        return {
            "answer": answer,
            "sources": results,
            "query": question,
            "model": model
        }

    def count(self) -> int:
        """Return number of documents in collection."""
        return self.collection.count()

    def delete(self, ids: List[str]) -> None:
        """Delete documents by ID."""
        self.collection.delete(ids=ids)

    def clear(self) -> None:
        """Clear all documents by dropping and re-creating the collection."""
        self.client.delete_collection(self.collection_name)
        self.collection = self.client.create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )

    def _chunk_text(
        self,
        text: str,
        chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """
        Split text into overlapping chunks.

        A chunk is cut at the last ". " in its second half when one exists,
        otherwise at ``chunk_size`` characters.

        Args:
            text: Text to split
            chunk_size: Maximum characters per chunk (must be positive)
            chunk_overlap: Characters shared between consecutive chunks
                (must be smaller than ``chunk_size``)

        Returns:
            List of non-empty, whitespace-stripped chunks

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size``; such values would
                otherwise prevent the window from ever advancing.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        chunks = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = start + chunk_size
            chunk = text[start:end]

            # Try to break at sentence boundary
            if end < text_length:
                last_period = chunk.rfind(". ")
                # Only accept the boundary when the next window still starts
                # after the current one; otherwise the overlap would move
                # the window backwards.
                if last_period > chunk_size // 2 and last_period + 1 > chunk_overlap:
                    chunk = chunk[:last_period + 1]
                    end = start + last_period + 1

            chunk = chunk.strip()
            if chunk:
                chunks.append(chunk)

            # The text is exhausted; stepping back by the overlap would only
            # yield a tail that the last chunk already contains.
            if end >= text_length:
                break

            start = end - chunk_overlap

        return chunks

    def _read_pdf(self, file_path: Path) -> str:
        """
        Read text from PDF file.

        Raises:
            ImportError: If pypdf is not installed.
        """
        # Keep the try narrow: only a missing pypdf should become the install
        # hint, not an ImportError raised while parsing.
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError("PDF support requires: pip install rapid-rag[pdf]") from err

        reader = PdfReader(str(file_path))
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    def __repr__(self) -> str:
        return f"RapidRAG(collection='{self.collection_name}', docs={self.count()})"
@@ -0,0 +1,134 @@
1
+ """
2
+ Document ingestion utilities for RapidRAG.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from typing import Optional, List, Dict, Any, Generator
7
+ import hashlib
8
+
9
+
10
class DocumentIngester:
    """
    Ingest documents from various sources.

    Example:
        ingester = DocumentIngester(rag)
        ingester.from_directory("./docs/")
        ingester.from_jsonl("docs.jsonl", content_field="text")
    """

    def __init__(self, rag: "RapidRAG"):
        """
        Initialize ingester.

        Args:
            rag: RapidRAG instance to add documents to
        """
        self.rag = rag
        self._stats = {"files": 0, "chunks": 0, "errors": 0}

    def from_directory(
        self,
        path: str,
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ) -> Dict[str, int]:
        """
        Ingest all documents from a directory.

        Each matching file is passed to ``rag.add_file`` individually so
        that successes and failures can be counted. Per-file progress is
        printed to stdout; a failing file is counted and skipped.

        Args:
            path: Directory path
            extensions: File extensions to process (default: [".txt", ".md", ".pdf"])
            recursive: Include subdirectories
            chunk_size: Characters per chunk
            chunk_overlap: Overlap between chunks

        Returns:
            Stats dict: "files" is the number of files ingested without
            error, "chunks" the chunks they produced and "errors" the
            number of files that raised.
        """
        stats = {"files": 0, "chunks": 0, "errors": 0}
        self._stats = stats

        suffixes = extensions or [".txt", ".md", ".pdf"]
        pattern = "**/*" if recursive else "*"
        root = Path(path)

        for ext in suffixes:
            for file_path in root.glob(f"{pattern}{ext}"):
                if not file_path.is_file():
                    continue
                # Deliberately broad: one unreadable file must not abort a
                # bulk import; it is reported and counted instead.
                try:
                    ids = self.rag.add_file(
                        file_path,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap
                    )
                except Exception as e:
                    stats["errors"] += 1
                    print(f"✗ {file_path.name}: {e}")
                else:
                    stats["files"] += 1
                    stats["chunks"] += len(ids)
                    print(f"✓ {file_path.name}: {len(ids)} chunks")

        return stats

    def from_texts(
        self,
        texts: List[str],
        ids: Optional[List[str]] = None,
        metadatas: Optional[List[Dict]] = None
    ) -> int:
        """
        Ingest a list of texts.

        Args:
            texts: List of text strings
            ids: Optional document IDs
            metadatas: Optional metadata dicts

        Returns:
            Number of documents added
        """
        added_ids = self.rag.add_texts(texts, ids, metadatas)
        return len(added_ids)

    def from_jsonl(
        self,
        path: str,
        content_field: str = "text",
        id_field: Optional[str] = None,
        metadata_fields: Optional[List[str]] = None
    ) -> int:
        """
        Ingest documents from a JSONL file.

        Blank lines and records whose content field is missing or empty are
        skipped.

        Args:
            path: Path to JSONL file
            content_field: Field containing text content
            id_field: Field containing document ID; records without it get
                an ID derived from the SHA-256 of their content
            metadata_fields: Fields to include as metadata (default: every
                field except the content field)

        Returns:
            Number of documents added
        """
        import json

        texts = []
        ids = []
        metadatas = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank separator lines are common in hand-edited JSONL and
                # would make json.loads raise.
                if not line.strip():
                    continue

                doc = json.loads(line)
                content = doc.get(content_field, "")
                if not content:
                    continue

                texts.append(content)

                if id_field and id_field in doc:
                    ids.append(str(doc[id_field]))
                else:
                    ids.append(hashlib.sha256(content.encode()).hexdigest()[:16])

                if metadata_fields:
                    meta = {k: doc[k] for k in metadata_fields if k in doc}
                else:
                    meta = {k: v for k, v in doc.items() if k != content_field}
                metadatas.append(meta)

        # Nothing usable in the file: do not forward an empty batch.
        if not texts:
            return 0

        return self.from_texts(texts, ids, metadatas)

    @property
    def stats(self) -> Dict[str, int]:
        """Get the stats of the most recent from_directory() run."""
        return self._stats
@@ -0,0 +1,115 @@
1
+ """
2
+ Semantic search utilities for RapidRAG.
3
+ """
4
+
5
+ from typing import Optional, List, Dict, Any
6
+
7
+
8
class SemanticSearch:
    """
    Semantic search utilities.

    Wraps RapidRAG.search() with additional functionality.
    """

    def __init__(self, rag: "RapidRAG"):
        """Initialize with a RapidRAG instance."""
        self.rag = rag

    def search(
        self,
        query: str,
        n_results: int = 5,
        min_score: float = 0.0,
        source_filter: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Search with additional filters.

        Twice as many candidates as requested are fetched so that some
        remain after filtering; fewer than ``n_results`` may still be
        returned.

        Args:
            query: Search query
            n_results: Max results
            min_score: Minimum similarity score (0-1)
            source_filter: Substring that must occur in the result's
                "source" metadata

        Returns:
            Filtered search results
        """
        candidates = self.rag.search(query, n_results=n_results * 2)

        # Filter by score
        kept = [r for r in candidates if r["score"] >= min_score]

        # Filter by source
        if source_filter:
            kept = [
                r for r in kept
                if source_filter in r["metadata"].get("source", "")
            ]

        return kept[:n_results]

    def find_similar(
        self,
        doc_id: str,
        n_results: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Find documents similar to a given document.

        Args:
            doc_id: ID of the reference document
            n_results: Number of similar documents to find

        Returns:
            List of similar documents, or an empty list when ``doc_id`` is
            not in the collection
        """
        # Get the document
        result = self.rag.collection.get(ids=[doc_id], include=["documents"])

        if not result["documents"]:
            return []

        content = result["documents"][0]

        # Ask for one extra hit because the reference document is expected
        # to match itself.
        results = self.rag.search(content, n_results=n_results + 1)

        # Remove the original document
        return [r for r in results if r["id"] != doc_id][:n_results]

    def hybrid_search(
        self,
        query: str,
        keywords: List[str],
        n_results: int = 5,
        keyword_boost: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Hybrid search combining semantic + keyword matching.

        Every keyword found in a result (case-insensitive substring match)
        raises its score by ``keyword_boost``, capped at three keywords and
        at a total score of 1.0. Boosted results gain a "keyword_matches"
        entry.

        Args:
            query: Semantic search query
            keywords: Keywords to boost; empty strings are ignored
            n_results: Number of results
            keyword_boost: Score boost for keyword matches (0-1)

        Returns:
            Reranked results
        """
        # Semantic search
        results = self.rag.search(query, n_results=n_results * 2)

        # Normalise once instead of per result. Empty keywords are dropped
        # because "" is a substring of every document and would boost all.
        needles = [kw.lower() for kw in keywords if kw]

        # Boost scores for keyword matches
        for r in results:
            content_lower = r["content"].lower()
            matches = sum(1 for needle in needles if needle in content_lower)
            if matches > 0:
                boost = min(keyword_boost * matches, keyword_boost * 3)
                r["score"] = min(1.0, r["score"] + boost)
                r["keyword_matches"] = matches

        # Sort by score
        results.sort(key=lambda x: x["score"], reverse=True)

        return results[:n_results]