rapid-rag 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rapid_rag-0.1.0/.gitignore +74 -0
- rapid_rag-0.1.0/PKG-INFO +119 -0
- rapid_rag-0.1.0/README.md +74 -0
- rapid_rag-0.1.0/pyproject.toml +60 -0
- rapid_rag-0.1.0/src/rapid_rag/__init__.py +27 -0
- rapid_rag-0.1.0/src/rapid_rag/cli.py +150 -0
- rapid_rag-0.1.0/src/rapid_rag/core.py +417 -0
- rapid_rag-0.1.0/src/rapid_rag/ingest.py +134 -0
- rapid_rag-0.1.0/src/rapid_rag/search.py +115 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Secrets & env
|
|
2
|
+
.env
|
|
3
|
+
*.env
|
|
4
|
+
*.secret
|
|
5
|
+
|
|
6
|
+
# Keys & certs
|
|
7
|
+
*.key
|
|
8
|
+
*.pem
|
|
9
|
+
certs/
|
|
10
|
+
secrets/
|
|
11
|
+
|
|
12
|
+
# Databases & dumps
|
|
13
|
+
*.db
|
|
14
|
+
*.sqlite
|
|
15
|
+
*.sql
|
|
16
|
+
dump_*/
|
|
17
|
+
|
|
18
|
+
# EXCEPT: Allow database schemas (needed for server rebuild)
|
|
19
|
+
!database-schemas/*.sql
|
|
20
|
+
|
|
21
|
+
# Logs & runtime data
|
|
22
|
+
logs/
|
|
23
|
+
*.log
|
|
24
|
+
__pycache__/
|
|
25
|
+
*.pyc
|
|
26
|
+
venv/
|
|
27
|
+
.venv/
|
|
28
|
+
**/venv/
|
|
29
|
+
**/.venv/
|
|
30
|
+
|
|
31
|
+
# Configs with secrets (templates will be used later)
|
|
32
|
+
config/
|
|
33
|
+
brain_api/provisioning.local.json
|
|
34
|
+
brain_api/provisioning.json
|
|
35
|
+
|
|
36
|
+
# Landing pages (private - not open source)
|
|
37
|
+
landing-pages/
|
|
38
|
+
humotica.com/
|
|
39
|
+
jtel.nl/
|
|
40
|
+
|
|
41
|
+
# Social media posts (strategy - not open source)
|
|
42
|
+
SOCIAL-MEDIA-POSTS.md
|
|
43
|
+
HN-POST-UNDER-4000.md
|
|
44
|
+
STRATO-DEPLOY-HUMOTICA.md
|
|
45
|
+
|
|
46
|
+
# Endorsement outreach (private contact)
|
|
47
|
+
ARXIV-ENDORSEMENT-OUTREACH.md
|
|
48
|
+
|
|
49
|
+
# Deployment secrets
|
|
50
|
+
DEPLOYMENT-GUIDE.md
|
|
51
|
+
|
|
52
|
+
# R Project files (Dirty Data Challenge)
|
|
53
|
+
.Rproj.user
|
|
54
|
+
.Rhistory
|
|
55
|
+
.RData
|
|
56
|
+
.Ruserdata
|
|
57
|
+
*.zip
|
|
58
|
+
.mural_tokens.json
|
|
59
|
+
auth.json
|
|
60
|
+
gen-lang-client*.json
|
|
61
|
+
*.credentials.json
|
|
62
|
+
|
|
63
|
+
# Rust build artifacts
|
|
64
|
+
**/target/
|
|
65
|
+
*.whl
|
|
66
|
+
|
|
67
|
+
# Compiled binaries (build locally)
|
|
68
|
+
jis-router/jis-router
|
|
69
|
+
sentinel-rs/sentinel-rs
|
|
70
|
+
|
|
71
|
+
# Build distribution
|
|
72
|
+
sandbox/ai/codex/dist/
|
|
73
|
+
sandbox_backup/
|
|
74
|
+
did-jis-core
|
rapid_rag-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rapid-rag
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast local RAG - search your documents with AI, no cloud needed
|
|
5
|
+
Project-URL: Homepage, https://humotica.com
|
|
6
|
+
Project-URL: Repository, https://github.com/humotica/rapid-rag
|
|
7
|
+
Project-URL: Documentation, https://humotica.com/docs/rapid-rag
|
|
8
|
+
Author-email: "J. van de Meent" <jasper@humotica.com>, "R. AI" <info@humotica.com>
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: ai,augmented,chromadb,documents,embeddings,generation,llm,local,offline,rag,retrieval,search,semantic-search,vector-search
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: chromadb>=0.4.0
|
|
25
|
+
Requires-Dist: httpx>=0.24.0
|
|
26
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: fastapi>=0.100.0; extra == 'all'
|
|
29
|
+
Requires-Dist: ollama>=0.1.0; extra == 'all'
|
|
30
|
+
Requires-Dist: pdfplumber>=0.9.0; extra == 'all'
|
|
31
|
+
Requires-Dist: pypdf>=3.0.0; extra == 'all'
|
|
32
|
+
Requires-Dist: uvicorn>=0.22.0; extra == 'all'
|
|
33
|
+
Provides-Extra: api
|
|
34
|
+
Requires-Dist: fastapi>=0.100.0; extra == 'api'
|
|
35
|
+
Requires-Dist: uvicorn>=0.22.0; extra == 'api'
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
39
|
+
Provides-Extra: ollama
|
|
40
|
+
Requires-Dist: ollama>=0.1.0; extra == 'ollama'
|
|
41
|
+
Provides-Extra: pdf
|
|
42
|
+
Requires-Dist: pdfplumber>=0.9.0; extra == 'pdf'
|
|
43
|
+
Requires-Dist: pypdf>=3.0.0; extra == 'pdf'
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# rapid-rag
|
|
47
|
+
|
|
48
|
+
Fast local RAG - search your documents with AI, no cloud needed.
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install rapid-rag
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
For PDF support:
|
|
57
|
+
```bash
|
|
58
|
+
pip install "rapid-rag[pdf]"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from rapid_rag import RapidRAG
|
|
65
|
+
|
|
66
|
+
# Create a RAG instance
|
|
67
|
+
rag = RapidRAG("my_documents")
|
|
68
|
+
|
|
69
|
+
# Add documents
|
|
70
|
+
rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
|
|
71
|
+
rag.add_file("report.pdf")
|
|
72
|
+
rag.add_directory("./docs/")
|
|
73
|
+
|
|
74
|
+
# Semantic search
|
|
75
|
+
results = rag.search("fox jumping")
|
|
76
|
+
for r in results:
|
|
77
|
+
print(f"{r['score']:.3f}: {r['content'][:100]}")
|
|
78
|
+
|
|
79
|
+
# RAG query with LLM (requires Ollama)
|
|
80
|
+
answer = rag.query("What does the fox do?", model="qwen2.5:7b")
|
|
81
|
+
print(answer["answer"])
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## CLI Usage
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Initialize a collection
|
|
88
|
+
rapid-rag init my_docs
|
|
89
|
+
|
|
90
|
+
# Add documents
|
|
91
|
+
rapid-rag add ./documents/ -c my_docs -r
|
|
92
|
+
|
|
93
|
+
# Search
|
|
94
|
+
rapid-rag search "query here" -c my_docs
|
|
95
|
+
|
|
96
|
+
# RAG query (requires Ollama)
|
|
97
|
+
rapid-rag query "What is X?" -c my_docs -m qwen2.5:7b
|
|
98
|
+
|
|
99
|
+
# Info
|
|
100
|
+
rapid-rag info -c my_docs
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Features
|
|
104
|
+
|
|
105
|
+
- **Local-first**: Everything runs on your machine
|
|
106
|
+
- **Fast**: ChromaDB + sentence-transformers
|
|
107
|
+
- **Simple API**: Add, search, query in 3 lines
|
|
108
|
+
- **File support**: .txt, .md, .pdf
|
|
109
|
+
- **Chunking**: Automatic with overlap
|
|
110
|
+
- **LLM integration**: Works with Ollama
|
|
111
|
+
|
|
112
|
+
## Requirements
|
|
113
|
+
|
|
114
|
+
- Python 3.10+
|
|
115
|
+
- For LLM queries: [Ollama](https://ollama.ai) running locally
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT - Humotica
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# rapid-rag
|
|
2
|
+
|
|
3
|
+
Fast local RAG - search your documents with AI, no cloud needed.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install rapid-rag
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For PDF support:
|
|
12
|
+
```bash
|
|
13
|
+
pip install "rapid-rag[pdf]"
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
from rapid_rag import RapidRAG
|
|
20
|
+
|
|
21
|
+
# Create a RAG instance
|
|
22
|
+
rag = RapidRAG("my_documents")
|
|
23
|
+
|
|
24
|
+
# Add documents
|
|
25
|
+
rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
|
|
26
|
+
rag.add_file("report.pdf")
|
|
27
|
+
rag.add_directory("./docs/")
|
|
28
|
+
|
|
29
|
+
# Semantic search
|
|
30
|
+
results = rag.search("fox jumping")
|
|
31
|
+
for r in results:
|
|
32
|
+
print(f"{r['score']:.3f}: {r['content'][:100]}")
|
|
33
|
+
|
|
34
|
+
# RAG query with LLM (requires Ollama)
|
|
35
|
+
answer = rag.query("What does the fox do?", model="qwen2.5:7b")
|
|
36
|
+
print(answer["answer"])
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## CLI Usage
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Initialize a collection
|
|
43
|
+
rapid-rag init my_docs
|
|
44
|
+
|
|
45
|
+
# Add documents
|
|
46
|
+
rapid-rag add ./documents/ -c my_docs -r
|
|
47
|
+
|
|
48
|
+
# Search
|
|
49
|
+
rapid-rag search "query here" -c my_docs
|
|
50
|
+
|
|
51
|
+
# RAG query (requires Ollama)
|
|
52
|
+
rapid-rag query "What is X?" -c my_docs -m qwen2.5:7b
|
|
53
|
+
|
|
54
|
+
# Info
|
|
55
|
+
rapid-rag info -c my_docs
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Features
|
|
59
|
+
|
|
60
|
+
- **Local-first**: Everything runs on your machine
|
|
61
|
+
- **Fast**: ChromaDB + sentence-transformers
|
|
62
|
+
- **Simple API**: Add, search, query in 3 lines
|
|
63
|
+
- **File support**: .txt, .md, .pdf
|
|
64
|
+
- **Chunking**: Automatic with overlap
|
|
65
|
+
- **LLM integration**: Works with Ollama
|
|
66
|
+
|
|
67
|
+
## Requirements
|
|
68
|
+
|
|
69
|
+
- Python 3.10+
|
|
70
|
+
- For LLM queries: [Ollama](https://ollama.ai) running locally
|
|
71
|
+
|
|
72
|
+
## License
|
|
73
|
+
|
|
74
|
+
MIT - Humotica
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rapid-rag"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fast local RAG - search your documents with AI, no cloud needed"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "J. van de Meent", email = "jasper@humotica.com"},
|
|
14
|
+
{name = "R. AI", email = "info@humotica.com"},
|
|
15
|
+
]
|
|
16
|
+
keywords = [
|
|
17
|
+
"rag", "retrieval", "augmented", "generation", "local",
|
|
18
|
+
"chromadb", "embeddings", "search", "documents", "ai",
|
|
19
|
+
"vector-search", "semantic-search", "llm", "offline"
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 4 - Beta",
|
|
23
|
+
"Intended Audience :: Developers",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.10",
|
|
29
|
+
"Programming Language :: Python :: 3.11",
|
|
30
|
+
"Programming Language :: Python :: 3.12",
|
|
31
|
+
"Programming Language :: Python :: 3.13",
|
|
32
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
33
|
+
"Topic :: Text Processing :: Indexing",
|
|
34
|
+
]
|
|
35
|
+
dependencies = [
|
|
36
|
+
"chromadb>=0.4.0",
|
|
37
|
+
"sentence-transformers>=2.2.0",
|
|
38
|
+
"httpx>=0.24.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[project.optional-dependencies]
|
|
42
|
+
api = ["fastapi>=0.100.0", "uvicorn>=0.22.0"]
|
|
43
|
+
pdf = ["pypdf>=3.0.0", "pdfplumber>=0.9.0"]
|
|
44
|
+
ollama = ["ollama>=0.1.0"]
|
|
45
|
+
all = ["rapid-rag[api,pdf,ollama]"]
|
|
46
|
+
dev = ["pytest>=7.0", "ruff>=0.1.0"]
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://humotica.com"
|
|
50
|
+
Repository = "https://github.com/humotica/rapid-rag"
|
|
51
|
+
Documentation = "https://humotica.com/docs/rapid-rag"
|
|
52
|
+
|
|
53
|
+
[project.scripts]
|
|
54
|
+
rapid-rag = "rapid_rag.cli:main"
|
|
55
|
+
|
|
56
|
+
[tool.hatch.build.targets.sdist]
|
|
57
|
+
include = ["/src"]
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = ["src/rapid_rag"]
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
rapid-rag: Fast local RAG - search your documents with AI, no cloud needed.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from rapid_rag import RapidRAG
|
|
6
|
+
|
|
7
|
+
# Create a RAG instance
|
|
8
|
+
rag = RapidRAG("my_documents")
|
|
9
|
+
|
|
10
|
+
# Add documents
|
|
11
|
+
rag.add("doc1", "The quick brown fox jumps over the lazy dog.")
|
|
12
|
+
rag.add_file("report.pdf")
|
|
13
|
+
rag.add_directory("./docs/")
|
|
14
|
+
|
|
15
|
+
# Search
|
|
16
|
+
results = rag.search("fox jumping")
|
|
17
|
+
|
|
18
|
+
# RAG query (with LLM)
|
|
19
|
+
answer = rag.query("What does the fox do?", model="qwen2.5:7b")
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from .core import RapidRAG
|
|
23
|
+
from .ingest import DocumentIngester
|
|
24
|
+
from .search import SemanticSearch
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
__all__ = ["RapidRAG", "DocumentIngester", "SemanticSearch"]
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
rapid-rag CLI - Command line interface for local RAG.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
|
|
11
|
+
"""Main CLI entry point."""
|
|
12
|
+
parser = argparse.ArgumentParser(
|
|
13
|
+
prog="rapid-rag",
|
|
14
|
+
description="Fast local RAG - search your documents with AI"
|
|
15
|
+
)
|
|
16
|
+
subparsers = parser.add_subparsers(dest="command", help="Commands")
|
|
17
|
+
|
|
18
|
+
# Init command
|
|
19
|
+
init_parser = subparsers.add_parser("init", help="Initialize a new RAG collection")
|
|
20
|
+
init_parser.add_argument("name", help="Collection name")
|
|
21
|
+
init_parser.add_argument("--dir", help="Persist directory", default=None)
|
|
22
|
+
|
|
23
|
+
# Add command
|
|
24
|
+
add_parser = subparsers.add_parser("add", help="Add documents")
|
|
25
|
+
add_parser.add_argument("path", help="File or directory to add")
|
|
26
|
+
add_parser.add_argument("-c", "--collection", default="default", help="Collection name")
|
|
27
|
+
add_parser.add_argument("-r", "--recursive", action="store_true", help="Recursive directory scan")
|
|
28
|
+
add_parser.add_argument("--chunk-size", type=int, default=1000, help="Chunk size")
|
|
29
|
+
add_parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap")
|
|
30
|
+
|
|
31
|
+
# Search command
|
|
32
|
+
search_parser = subparsers.add_parser("search", help="Semantic search")
|
|
33
|
+
search_parser.add_argument("query", help="Search query")
|
|
34
|
+
search_parser.add_argument("-c", "--collection", default="default", help="Collection name")
|
|
35
|
+
search_parser.add_argument("-n", "--num", type=int, default=5, help="Number of results")
|
|
36
|
+
|
|
37
|
+
# Query command (RAG with LLM)
|
|
38
|
+
query_parser = subparsers.add_parser("query", help="RAG query with LLM")
|
|
39
|
+
query_parser.add_argument("question", help="Question to answer")
|
|
40
|
+
query_parser.add_argument("-c", "--collection", default="default", help="Collection name")
|
|
41
|
+
query_parser.add_argument("-m", "--model", default="qwen2.5:7b", help="Ollama model")
|
|
42
|
+
query_parser.add_argument("-n", "--num", type=int, default=5, help="Context documents")
|
|
43
|
+
|
|
44
|
+
# Info command
|
|
45
|
+
info_parser = subparsers.add_parser("info", help="Collection info")
|
|
46
|
+
info_parser.add_argument("-c", "--collection", default="default", help="Collection name")
|
|
47
|
+
|
|
48
|
+
# Clear command
|
|
49
|
+
clear_parser = subparsers.add_parser("clear", help="Clear collection")
|
|
50
|
+
clear_parser.add_argument("-c", "--collection", default="default", help="Collection name")
|
|
51
|
+
clear_parser.add_argument("-y", "--yes", action="store_true", help="Skip confirmation")
|
|
52
|
+
|
|
53
|
+
args = parser.parse_args()
|
|
54
|
+
|
|
55
|
+
if args.command is None:
|
|
56
|
+
parser.print_help()
|
|
57
|
+
return 0
|
|
58
|
+
|
|
59
|
+
# Import here to avoid slow startup
|
|
60
|
+
from .core import RapidRAG
|
|
61
|
+
|
|
62
|
+
if args.command == "init":
|
|
63
|
+
rag = RapidRAG(args.name, persist_dir=args.dir)
|
|
64
|
+
print(f"Initialized collection '{args.name}'")
|
|
65
|
+
print(f"Persist dir: {rag.persist_dir}")
|
|
66
|
+
return 0
|
|
67
|
+
|
|
68
|
+
if args.command == "add":
|
|
69
|
+
rag = RapidRAG(args.collection)
|
|
70
|
+
path = Path(args.path)
|
|
71
|
+
|
|
72
|
+
if path.is_file():
|
|
73
|
+
ids = rag.add_file(
|
|
74
|
+
path,
|
|
75
|
+
chunk_size=args.chunk_size,
|
|
76
|
+
chunk_overlap=args.chunk_overlap
|
|
77
|
+
)
|
|
78
|
+
print(f"Added {len(ids)} chunks from {path.name}")
|
|
79
|
+
elif path.is_dir():
|
|
80
|
+
ids = rag.add_directory(
|
|
81
|
+
path,
|
|
82
|
+
recursive=args.recursive,
|
|
83
|
+
chunk_size=args.chunk_size,
|
|
84
|
+
chunk_overlap=args.chunk_overlap
|
|
85
|
+
)
|
|
86
|
+
print(f"Added {len(ids)} chunks total")
|
|
87
|
+
else:
|
|
88
|
+
print(f"Path not found: {path}", file=sys.stderr)
|
|
89
|
+
return 1
|
|
90
|
+
|
|
91
|
+
return 0
|
|
92
|
+
|
|
93
|
+
if args.command == "search":
|
|
94
|
+
rag = RapidRAG(args.collection)
|
|
95
|
+
results = rag.search(args.query, n_results=args.num)
|
|
96
|
+
|
|
97
|
+
if not results:
|
|
98
|
+
print("No results found.")
|
|
99
|
+
return 0
|
|
100
|
+
|
|
101
|
+
for i, r in enumerate(results, 1):
|
|
102
|
+
score = r["score"]
|
|
103
|
+
source = r["metadata"].get("source", r["id"])
|
|
104
|
+
content = r["content"][:200].replace("\n", " ")
|
|
105
|
+
print(f"\n[{i}] {source} (score: {score:.3f})")
|
|
106
|
+
print(f" {content}...")
|
|
107
|
+
|
|
108
|
+
return 0
|
|
109
|
+
|
|
110
|
+
if args.command == "query":
|
|
111
|
+
rag = RapidRAG(args.collection)
|
|
112
|
+
result = rag.query(args.question, n_results=args.num, model=args.model)
|
|
113
|
+
|
|
114
|
+
print("\n" + "=" * 60)
|
|
115
|
+
print("ANSWER:")
|
|
116
|
+
print("=" * 60)
|
|
117
|
+
print(result["answer"])
|
|
118
|
+
print("\n" + "-" * 60)
|
|
119
|
+
print("SOURCES:")
|
|
120
|
+
for s in result["sources"]:
|
|
121
|
+
source = s["metadata"].get("source", s["id"])
|
|
122
|
+
print(f" - {source} (score: {s['score']:.3f})")
|
|
123
|
+
|
|
124
|
+
return 0
|
|
125
|
+
|
|
126
|
+
if args.command == "info":
|
|
127
|
+
rag = RapidRAG(args.collection)
|
|
128
|
+
print(f"Collection: {args.collection}")
|
|
129
|
+
print(f"Documents: {rag.count()}")
|
|
130
|
+
print(f"Persist dir: {rag.persist_dir}")
|
|
131
|
+
print(f"Embedding model: {rag.embedding_model}")
|
|
132
|
+
return 0
|
|
133
|
+
|
|
134
|
+
if args.command == "clear":
|
|
135
|
+
if not args.yes:
|
|
136
|
+
confirm = input(f"Clear collection '{args.collection}'? [y/N] ")
|
|
137
|
+
if confirm.lower() != "y":
|
|
138
|
+
print("Cancelled.")
|
|
139
|
+
return 0
|
|
140
|
+
|
|
141
|
+
rag = RapidRAG(args.collection)
|
|
142
|
+
rag.clear()
|
|
143
|
+
print(f"Cleared collection '{args.collection}'")
|
|
144
|
+
return 0
|
|
145
|
+
|
|
146
|
+
return 0
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
if __name__ == "__main__":
|
|
150
|
+
sys.exit(main())
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core RapidRAG class - the main interface for local RAG.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import hashlib
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, List, Dict, Any, Union
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
import chromadb
|
|
12
|
+
from chromadb.config import Settings
|
|
13
|
+
|
|
14
|
+
# Optional imports
|
|
15
|
+
try:
|
|
16
|
+
import httpx
|
|
17
|
+
HTTPX_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HTTPX_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RapidRAG:
    """
    Fast local RAG - search your documents with AI, no cloud needed.

    Documents are embedded with a sentence-transformers model and stored in a
    persistent ChromaDB collection created with cosine distance. ``query``
    additionally posts the retrieved context to an Ollama server over HTTP.

    Example:
        rag = RapidRAG("my_project")
        rag.add("doc1", "Some text content")
        results = rag.search("query")
        answer = rag.query("What is...?", model="qwen2.5:7b")
    """

    def __init__(
        self,
        collection_name: str = "default",
        persist_dir: Optional[str] = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        ollama_url: str = "http://localhost:11434",
    ):
        """
        Initialize RapidRAG.

        Creates the persist directory if needed and opens (or creates) the
        ChromaDB collection. The embedding model is not loaded here; it is
        loaded on first use via the ``embedder`` property.

        Args:
            collection_name: Name for the document collection
            persist_dir: Directory to persist the database
                (default: ./rapid_rag_data/<collection_name>)
            embedding_model: Sentence-transformers model for embeddings
            ollama_url: URL for Ollama API (for LLM queries)
        """
        self.collection_name = collection_name
        self.persist_dir = persist_dir or f"./rapid_rag_data/{collection_name}"
        self.embedding_model = embedding_model
        self.ollama_url = ollama_url

        # Create persist directory
        os.makedirs(self.persist_dir, exist_ok=True)

        # Initialize ChromaDB with persistence
        self.client = chromadb.PersistentClient(
            path=self.persist_dir,
            settings=Settings(anonymized_telemetry=False)
        )

        # Get or create collection
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )

        # Lazy load embedding model
        self._embedder = None

    @property
    def embedder(self):
        """Lazy load sentence-transformers model."""
        if self._embedder is None:
            # Imported here so constructing RapidRAG does not pay the import cost.
            from sentence_transformers import SentenceTransformer
            self._embedder = SentenceTransformer(self.embedding_model)
        return self._embedder

    def _generate_id(self, content: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of ``content``."""
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    def add(
        self,
        doc_id: str,
        content: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Add a document to the collection.

        The stored metadata is a copy of ``metadata`` extended with
        ``added_at`` (ISO timestamp) and ``content_hash``; the caller's dict
        is left untouched.

        Args:
            doc_id: Unique document identifier
            content: Text content to index
            metadata: Optional metadata dict

        Returns:
            Document ID
        """
        # Copy so the caller's dict is never mutated.
        metadata = dict(metadata) if metadata else {}
        metadata["added_at"] = datetime.now().isoformat()
        metadata["content_hash"] = self._generate_id(content)

        # Generate embedding
        embedding = self.embedder.encode(content).tolist()

        # Add to collection
        self.collection.add(
            ids=[doc_id],
            embeddings=[embedding],
            documents=[content],
            metadatas=[metadata]
        )

        return doc_id

    def add_texts(
        self,
        texts: List[str],
        ids: Optional[List[str]] = None,
        metadatas: Optional[List[Dict]] = None
    ) -> List[str]:
        """
        Add multiple documents at once (faster).

        Args:
            texts: List of text contents
            ids: Optional list of IDs (derived from a content hash if not provided)
            metadatas: Optional list of metadata dicts; they are copied, not mutated

        Returns:
            List of document IDs (empty when ``texts`` is empty)

        Raises:
            ValueError: If ``ids`` or ``metadatas`` do not have one entry per text
        """
        # Nothing to embed or store; skip the model and the database entirely.
        if not texts:
            return []

        if ids is None:
            ids = [self._generate_id(t) for t in texts]

        if metadatas is None:
            metadatas = [{} for _ in texts]

        if len(ids) != len(texts) or len(metadatas) != len(texts):
            raise ValueError(
                "texts, ids and metadatas must all have the same length"
            )

        # Add timestamps on fresh dicts so the caller's metadata is not mutated.
        now = datetime.now().isoformat()
        metadatas = [{**(meta or {}), "added_at": now} for meta in metadatas]

        # Generate embeddings (batch)
        embeddings = self.embedder.encode(texts).tolist()

        # Add to collection
        self.collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas
        )

        return ids

    def add_file(
        self,
        file_path: Union[str, Path],
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ) -> List[str]:
        """
        Add a file to the collection.

        Supports: .txt, .md, .pdf (with pdf extra). Any other extension is
        read as UTF-8 text with undecodable bytes ignored.

        Args:
            file_path: Path to file
            chunk_size: Characters per chunk
            chunk_overlap: Overlap between chunks

        Returns:
            List of chunk IDs (empty when the file yields no text)

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
            ValueError: If the chunking parameters are invalid
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read content based on file type
        if file_path.suffix.lower() == ".pdf":
            content = self._read_pdf(file_path)
        else:
            # .txt, .md and unknown extensions are all treated as plain text.
            content = file_path.read_text(encoding="utf-8", errors="ignore")

        # Chunk the content
        chunks = self._chunk_text(content, chunk_size, chunk_overlap)

        # Create IDs and metadata
        # NOTE(review): IDs are built from the file stem only, so two files with
        # the same stem (e.g. a/readme.md and b/readme.md) get identical IDs.
        # Kept as-is to stay compatible with existing collections - confirm.
        ids = [f"{file_path.stem}_{i}" for i in range(len(chunks))]
        metadatas = [
            {
                "source": str(file_path),
                "chunk_index": i,
                "total_chunks": len(chunks)
            }
            for i in range(len(chunks))
        ]

        return self.add_texts(chunks, ids, metadatas)

    def add_directory(
        self,
        dir_path: Union[str, Path],
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Add all files in a directory.

        Each file is added independently: a failure is printed and the file is
        skipped, so one bad file does not abort the whole run.

        Args:
            dir_path: Path to directory
            extensions: File extensions to include (default: [".txt", ".md", ".pdf"])
            recursive: Search subdirectories
            **kwargs: Passed to add_file()

        Returns:
            List of all chunk IDs
        """
        dir_path = Path(dir_path)
        extensions = extensions or [".txt", ".md", ".pdf"]

        all_ids = []
        pattern = "**/*" if recursive else "*"

        for ext in extensions:
            for file_path in dir_path.glob(f"{pattern}{ext}"):
                if not file_path.is_file():
                    continue
                try:
                    ids = self.add_file(file_path, **kwargs)
                except Exception as e:
                    # Deliberate best-effort: report and continue with the next file.
                    print(f"✗ {file_path.name}: {e}")
                else:
                    all_ids.extend(ids)
                    print(f"✓ {file_path.name}: {len(ids)} chunks")

        return all_ids

    def search(
        self,
        query: str,
        n_results: int = 5,
        where: Optional[Dict] = None
    ) -> List[Dict[str, Any]]:
        """
        Semantic search in the collection.

        Args:
            query: Search query
            n_results: Number of results to return
            where: Optional filter dict

        Returns:
            List of results with id, content, metadata, and score
            (score is ``1 - distance``). Empty when the collection is empty.
        """
        # Guard the empty collection and never ask for more results than exist.
        # NOTE(review): some ChromaDB versions appear to raise in these cases
        # while others clamp with a warning - confirm against the pinned version.
        total = self.collection.count()
        if total == 0:
            return []
        n_results = min(n_results, total)

        # Generate query embedding
        query_embedding = self.embedder.encode(query).tolist()

        # Search
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            where=where,
            include=["documents", "metadatas", "distances"]
        )

        # Format results; index 0 selects the single query that was submitted.
        hits = zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )
        return [
            {
                "id": hit_id,
                "content": document,
                "metadata": meta,
                "score": 1 - distance  # Convert distance to similarity
            }
            for hit_id, document, meta, distance in hits
        ]

    def query(
        self,
        question: str,
        n_results: int = 5,
        model: str = "qwen2.5:7b",
        system_prompt: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        RAG query - search + LLM analysis.

        Args:
            question: Question to answer
            n_results: Number of documents to retrieve
            model: Ollama model to use
            system_prompt: Optional system prompt

        Returns:
            Dict with ``answer``, ``sources``, ``query`` and ``model``. When the
            LLM call fails, ``answer`` holds an ``"LLM error: ..."`` message
            instead of an exception being raised.

        Raises:
            ImportError: If httpx is not installed
        """
        if not HTTPX_AVAILABLE:
            raise ImportError("httpx required for LLM queries: pip install httpx")

        # Search for relevant documents
        results = self.search(question, n_results=n_results)

        if not results:
            return {
                "answer": "No relevant documents found.",
                "sources": [],
                "query": question,
                "model": model
            }

        # Build context from results
        context = "\n\n---\n\n".join(
            f"[Source: {r['metadata'].get('source', r['id'])}]\n{r['content']}"
            for r in results
        )

        # Default system prompt
        if system_prompt is None:
            system_prompt = (
                "You are a helpful assistant. Answer questions based on the provided context. "
                "If the answer is not in the context, say so. Cite your sources."
            )

        # Build prompt
        prompt = (
            f"Context:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Answer based on the context above:"
        )

        # Call Ollama
        try:
            with httpx.Client(timeout=60.0) as client:
                response = client.post(
                    # Tolerate a trailing slash in the configured base URL.
                    f"{self.ollama_url.rstrip('/')}/api/generate",
                    json={
                        "model": model,
                        "prompt": prompt,
                        "system": system_prompt,
                        "stream": False
                    }
                )
                response.raise_for_status()
                answer = response.json().get("response", "")
        except Exception as e:
            # Deliberate boundary: surface any LLM/transport failure as the
            # answer text so the retrieved sources are still returned.
            answer = f"LLM error: {e}"

        return {
            "answer": answer,
            "sources": results,
            "query": question,
            "model": model
        }

    def count(self) -> int:
        """Return number of documents in collection."""
        return self.collection.count()

    def delete(self, ids: List[str]) -> None:
        """Delete documents by ID."""
        self.collection.delete(ids=ids)

    def clear(self) -> None:
        """Clear all documents by dropping and re-creating the collection."""
        self.client.delete_collection(self.collection_name)
        self.collection = self.client.create_collection(
            name=self.collection_name,
            metadata={"hnsw:space": "cosine"}
        )

    def _chunk_text(
        self,
        text: str,
        chunk_size: int,
        chunk_overlap: int
    ) -> List[str]:
        """
        Split text into overlapping chunks.

        A chunk is cut at the last ". " found in the second half of its window
        when one exists; otherwise it is cut at ``chunk_size`` characters.
        Chunks are stripped and empty ones are dropped.

        Args:
            text: Text to split
            chunk_size: Maximum characters per chunk (must be > 0)
            chunk_overlap: Characters shared between consecutive chunks
                (must satisfy 0 <= chunk_overlap < chunk_size)

        Returns:
            List of non-empty chunks

        Raises:
            ValueError: If the size/overlap combination could not make progress
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size"
            )

        text_len = len(text)
        chunks = []
        start = 0

        while start < text_len:
            end = start + chunk_size
            chunk = text[start:end]

            # The window reaches the end of the text: this is the last chunk.
            # Stopping here avoids emitting a tail that only repeats the overlap.
            if end >= text_len:
                chunks.append(chunk.strip())
                break

            # Try to break at sentence boundary, but only if the shortened
            # chunk is still longer than the overlap so ``start`` advances.
            last_period = chunk.rfind(". ")
            if last_period > chunk_size // 2 and last_period + 1 > chunk_overlap:
                chunk = chunk[:last_period + 1]
                end = start + last_period + 1

            chunks.append(chunk.strip())
            start = end - chunk_overlap

        return [c for c in chunks if c]  # Remove empty chunks

    def _read_pdf(self, file_path: Path) -> str:
        """
        Read text from PDF file.

        Pages are joined with newlines; a page with no extractable text
        contributes an empty string.

        Raises:
            ImportError: If pypdf is not installed
        """
        # Only the import is guarded, so errors raised while reading the PDF
        # are not mistaken for a missing dependency.
        try:
            from pypdf import PdfReader
        except ImportError as err:
            raise ImportError("PDF support requires: pip install rapid-rag[pdf]") from err

        reader = PdfReader(str(file_path))
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    def __repr__(self) -> str:
        return f"RapidRAG(collection='{self.collection_name}', docs={self.count()})"
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document ingestion utilities for RapidRAG.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional, List, Dict, Any, Generator
|
|
7
|
+
import hashlib
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DocumentIngester:
    """
    Ingest documents from various sources.

    Example:
        ingester = DocumentIngester(rag)
        ingester.from_directory("./docs/")
        ingester.from_jsonl("./corpus.jsonl", content_field="text")
    """

    def __init__(self, rag: "RapidRAG"):
        """
        Initialize ingester.

        Args:
            rag: RapidRAG instance to add documents to
        """
        self.rag = rag
        self._stats = {"files": 0, "chunks": 0, "errors": 0}

    def from_directory(
        self,
        path: str,
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ) -> Dict[str, int]:
        """
        Ingest all documents from a directory.

        Args:
            path: Directory path
            extensions: File extensions to process
            recursive: Include subdirectories
            chunk_size: Characters per chunk
            chunk_overlap: Overlap between chunks

        Returns:
            Stats dict with files, chunks, errors. Only "chunks" is filled
            in (number of IDs returned by rag.add_directory); "files" and
            "errors" are reset to 0 and not updated by this method.
        """
        self._stats = {"files": 0, "chunks": 0, "errors": 0}

        ids = self.rag.add_directory(
            path,
            extensions=extensions,
            recursive=recursive,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        self._stats["chunks"] = len(ids)
        return self._stats

    def from_texts(
        self,
        texts: List[str],
        ids: Optional[List[str]] = None,
        metadatas: Optional[List[Dict]] = None
    ) -> int:
        """
        Ingest a list of texts.

        Args:
            texts: List of text strings
            ids: Optional document IDs
            metadatas: Optional metadata dicts

        Returns:
            Number of documents added
        """
        added_ids = self.rag.add_texts(texts, ids, metadatas)
        return len(added_ids)

    def from_jsonl(
        self,
        path: str,
        content_field: str = "text",
        id_field: Optional[str] = None,
        metadata_fields: Optional[List[str]] = None
    ) -> int:
        """
        Ingest documents from a JSONL file.

        Blank lines are ignored. Records whose content field is missing or
        empty are skipped. Malformed JSON still raises.

        Args:
            path: Path to JSONL file
            content_field: Field containing text content
            id_field: Field containing document ID; records without it get
                an ID derived from a SHA-256 hash of their content
            metadata_fields: Fields to include as metadata; when omitted,
                every field except the content field is used

        Returns:
            Number of documents added
        """
        import json

        texts = []
        ids = []
        metadatas = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (typically a trailing newline at the end of
                # the file) are not records and would make json.loads fail.
                if not line.strip():
                    continue

                doc = json.loads(line)
                content = doc.get(content_field, "")
                if not content:
                    continue

                texts.append(content)

                if id_field and id_field in doc:
                    ids.append(str(doc[id_field]))
                else:
                    # Content-derived fallback ID (first 16 hex characters)
                    ids.append(hashlib.sha256(content.encode()).hexdigest()[:16])

                if metadata_fields:
                    meta = {k: doc.get(k) for k in metadata_fields if k in doc}
                else:
                    meta = {k: v for k, v in doc.items() if k != content_field}
                metadatas.append(meta)

        return self.from_texts(texts, ids, metadatas)

    @property
    def stats(self) -> Dict[str, int]:
        """Get ingestion stats."""
        return self._stats
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic search utilities for RapidRAG.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Optional, List, Dict, Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SemanticSearch:
    """
    Semantic search utilities.

    Wraps RapidRAG.search() with additional functionality.
    """

    def __init__(self, rag: "RapidRAG"):
        """Initialize with a RapidRAG instance."""
        self.rag = rag

    def search(
        self,
        query: str,
        n_results: int = 5,
        min_score: float = 0.0,
        source_filter: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Search with additional filters.

        Args:
            query: Search query
            n_results: Max results
            min_score: Minimum similarity score (0-1)
            source_filter: Filter by source path (contains)

        Returns:
            Filtered search results
        """
        # Over-fetch so that filtering still leaves up to n_results hits
        results = self.rag.search(query, n_results=n_results * 2)

        # Filter by score
        results = [r for r in results if r["score"] >= min_score]

        # Filter by source. Results with no metadata or no source simply
        # do not match instead of raising.
        if source_filter:
            results = [
                r for r in results
                if source_filter in str((r.get("metadata") or {}).get("source") or "")
            ]

        return results[:n_results]

    def find_similar(
        self,
        doc_id: str,
        n_results: int = 5
    ) -> List[Dict[str, Any]]:
        """
        Find documents similar to a given document.

        Args:
            doc_id: ID of the reference document
            n_results: Number of similar documents to find

        Returns:
            List of similar documents; empty if doc_id is not found
        """
        # Get the document
        result = self.rag.collection.get(ids=[doc_id], include=["documents"])

        if not result["documents"]:
            return []

        content = result["documents"][0]

        # Search for similar; ask for one extra because the reference
        # document itself is expected among the hits
        results = self.rag.search(content, n_results=n_results + 1)

        # Remove the original document
        return [r for r in results if r["id"] != doc_id][:n_results]

    def hybrid_search(
        self,
        query: str,
        keywords: List[str],
        n_results: int = 5,
        keyword_boost: float = 0.3
    ) -> List[Dict[str, Any]]:
        """
        Hybrid search combining semantic + keyword matching.

        Each result containing at least one keyword (case-insensitive
        substring match) has its score raised and gains a
        "keyword_matches" entry with the number of matching keywords.

        Args:
            query: Semantic search query
            keywords: Keywords to boost; empty strings are ignored
            n_results: Number of results
            keyword_boost: Score boost for keyword matches (0-1)

        Returns:
            Reranked results
        """
        # Semantic search
        results = self.rag.search(query, n_results=n_results * 2)

        # Lowercase the keywords once rather than once per result. Empty
        # keywords are dropped: "" is a substring of every document and
        # would boost everything.
        lowered = [kw.lower() for kw in keywords if kw]

        # Boost scores for keyword matches
        for r in results:
            content_lower = r["content"].lower()
            matches = sum(1 for kw in lowered if kw in content_lower)
            if matches > 0:
                # The boost is capped at three matches' worth and the
                # final score at 1.0
                boost = min(keyword_boost * matches, keyword_boost * 3)
                r["score"] = min(1.0, r["score"] + boost)
                r["keyword_matches"] = matches

        # Sort by score
        results.sort(key=lambda x: x["score"], reverse=True)

        return results[:n_results]
|