rag-python 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rag_python-0.1.0/src/rag_python.egg-info → rag_python-0.3.0}/PKG-INFO +26 -4
- {rag_python-0.1.0 → rag_python-0.3.0}/README.md +19 -2
- {rag_python-0.1.0 → rag_python-0.3.0}/pyproject.toml +5 -3
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/__init__.py +1 -1
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/cli.py +55 -5
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/client.py +3 -0
- rag_python-0.3.0/src/rag_python/document_loaders.py +146 -0
- rag_python-0.3.0/src/rag_python/hybrid_search.py +51 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/options.py +3 -2
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/factory.py +4 -1
- rag_python-0.3.0/src/rag_python/providers/local_provider.py +34 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/rag_pipeline.py +8 -2
- rag_python-0.3.0/src/rag_python/retrieval.py +101 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/vector_store.py +13 -0
- {rag_python-0.1.0 → rag_python-0.3.0/src/rag_python.egg-info}/PKG-INFO +26 -4
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python.egg-info/SOURCES.txt +9 -1
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python.egg-info/requires.txt +8 -1
- rag_python-0.3.0/tests/test_chunking.py +25 -0
- rag_python-0.3.0/tests/test_hybrid_search.py +35 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/tests/test_import.py +1 -1
- rag_python-0.3.0/tests/test_loaders.py +54 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/tests/test_package.py +1 -1
- rag_python-0.3.0/tests/test_pipeline.py +55 -0
- rag_python-0.3.0/tests/test_providers.py +6 -0
- rag_python-0.3.0/tests/test_retrieval.py +52 -0
- rag_python-0.1.0/src/rag_python/document_loaders.py +0 -74
- rag_python-0.1.0/src/rag_python/retrieval.py +0 -61
- {rag_python-0.1.0 → rag_python-0.3.0}/LICENSE +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/setup.cfg +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/chunking.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/cleaning.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/config.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/evaluation.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/generation.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/guardrails.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/__init__.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/anthropic_provider.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/azure_openai_provider.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/base.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/gemini_provider.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/ollama_provider.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/providers/openai_provider.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/py.typed +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/query_rewriting.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python/reranker.py +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python.egg-info/dependency_links.txt +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python.egg-info/entry_points.txt +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/src/rag_python.egg-info/top_level.txt +0 -0
- {rag_python-0.1.0 → rag_python-0.3.0}/tests/test_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: rag-python
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
|
|
5
5
|
Author-email: Raghav Singla <04raghavsingla28@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -33,6 +33,10 @@ Requires-Dist: requests>=2.31.0
|
|
|
33
33
|
Provides-Extra: rerank
|
|
34
34
|
Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
|
|
35
35
|
Requires-Dist: torch>=2.0.0; extra == "rerank"
|
|
36
|
+
Provides-Extra: local
|
|
37
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
|
|
38
|
+
Provides-Extra: hybrid
|
|
39
|
+
Requires-Dist: rank-bm25>=0.2.2; extra == "hybrid"
|
|
36
40
|
Provides-Extra: anthropic
|
|
37
41
|
Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
|
|
38
42
|
Provides-Extra: gemini
|
|
@@ -42,11 +46,14 @@ Requires-Dist: pytest>=7.0; extra == "dev"
|
|
|
42
46
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
43
47
|
Requires-Dist: build; extra == "dev"
|
|
44
48
|
Requires-Dist: twine; extra == "dev"
|
|
49
|
+
Requires-Dist: rank-bm25>=0.2.2; extra == "dev"
|
|
45
50
|
Provides-Extra: all
|
|
46
|
-
Requires-Dist: rag-python[anthropic,gemini,rerank]; extra == "all"
|
|
51
|
+
Requires-Dist: rag-python[anthropic,gemini,hybrid,local,rerank]; extra == "all"
|
|
47
52
|
|
|
48
53
|
# rag-python
|
|
49
54
|
|
|
55
|
+
[](https://pypi.org/project/rag-python/)
|
|
56
|
+
[](https://pypi.org/project/rag-python/)
|
|
50
57
|
[](https://www.python.org/downloads/)
|
|
51
58
|
[](LICENSE)
|
|
52
59
|
[](https://github.com/RaghavOG/rag-python)
|
|
@@ -63,10 +70,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
|
|
|
63
70
|
## Features
|
|
64
71
|
|
|
65
72
|
- Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
|
|
66
|
-
- Query pipeline: rewriting → multi-query retrieval → reranking
|
|
73
|
+
- Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
|
|
67
74
|
- Generation with guardrails (prompt injection + hallucination checks)
|
|
68
75
|
- Evaluation scores + self-correction retry loop
|
|
69
76
|
- **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
|
|
77
|
+
- **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
|
|
70
78
|
|
|
71
79
|
---
|
|
72
80
|
|
|
@@ -77,7 +85,7 @@ pip install rag-python
|
|
|
77
85
|
# or from source
|
|
78
86
|
pip install -e .
|
|
79
87
|
# with reranking + extra providers
|
|
80
|
-
pip install -e ".[rerank,anthropic,gemini,all]"
|
|
88
|
+
pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
|
|
81
89
|
```
|
|
82
90
|
|
|
83
91
|
---
|
|
@@ -99,12 +107,26 @@ answer = rag.query("How many days of annual leave?")
|
|
|
99
107
|
print(answer.text)
|
|
100
108
|
```
|
|
101
109
|
|
|
110
|
+
### Hybrid search + metadata filter
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from rag_python import RAG, SearchConfig
|
|
114
|
+
|
|
115
|
+
rag = RAG(
|
|
116
|
+
retriever="hybrid", # pip install rag-python[hybrid]
|
|
117
|
+
metadata_filter={"filename": "leave-policy.pdf"},
|
|
118
|
+
)
|
|
119
|
+
rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
|
|
120
|
+
answer = rag.query("How many days of annual leave?")
|
|
121
|
+
```
|
|
122
|
+
|
|
102
123
|
### CLI
|
|
103
124
|
|
|
104
125
|
```bash
|
|
105
126
|
export OPENAI_API_KEY=sk-...
|
|
106
127
|
rag-python ingest ./data --reindex
|
|
107
128
|
rag-python query "How many days of annual leave?" -v
|
|
129
|
+
rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
|
|
108
130
|
```
|
|
109
131
|
|
|
110
132
|
---
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# rag-python
|
|
2
2
|
|
|
3
|
+
[](https://pypi.org/project/rag-python/)
|
|
4
|
+
[](https://pypi.org/project/rag-python/)
|
|
3
5
|
[](https://www.python.org/downloads/)
|
|
4
6
|
[](LICENSE)
|
|
5
7
|
[](https://github.com/RaghavOG/rag-python)
|
|
@@ -16,10 +18,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
|
|
|
16
18
|
## Features
|
|
17
19
|
|
|
18
20
|
- Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
|
|
19
|
-
- Query pipeline: rewriting → multi-query retrieval → reranking
|
|
21
|
+
- Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
|
|
20
22
|
- Generation with guardrails (prompt injection + hallucination checks)
|
|
21
23
|
- Evaluation scores + self-correction retry loop
|
|
22
24
|
- **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
|
|
25
|
+
- **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
|
|
23
26
|
|
|
24
27
|
---
|
|
25
28
|
|
|
@@ -30,7 +33,7 @@ pip install rag-python
|
|
|
30
33
|
# or from source
|
|
31
34
|
pip install -e .
|
|
32
35
|
# with reranking + extra providers
|
|
33
|
-
pip install -e ".[rerank,anthropic,gemini,all]"
|
|
36
|
+
pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
|
|
34
37
|
```
|
|
35
38
|
|
|
36
39
|
---
|
|
@@ -52,12 +55,26 @@ answer = rag.query("How many days of annual leave?")
|
|
|
52
55
|
print(answer.text)
|
|
53
56
|
```
|
|
54
57
|
|
|
58
|
+
### Hybrid search + metadata filter
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from rag_python import RAG, SearchConfig
|
|
62
|
+
|
|
63
|
+
rag = RAG(
|
|
64
|
+
retriever="hybrid", # pip install rag-python[hybrid]
|
|
65
|
+
metadata_filter={"filename": "leave-policy.pdf"},
|
|
66
|
+
)
|
|
67
|
+
rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
|
|
68
|
+
answer = rag.query("How many days of annual leave?")
|
|
69
|
+
```
|
|
70
|
+
|
|
55
71
|
### CLI
|
|
56
72
|
|
|
57
73
|
```bash
|
|
58
74
|
export OPENAI_API_KEY=sk-...
|
|
59
75
|
rag-python ingest ./data --reindex
|
|
60
76
|
rag-python query "How many days of annual leave?" -v
|
|
77
|
+
rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
|
|
61
78
|
```
|
|
62
79
|
|
|
63
80
|
---
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "rag-python"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { text = "MIT" }
|
|
@@ -38,10 +38,12 @@ dependencies = [
|
|
|
38
38
|
|
|
39
39
|
[project.optional-dependencies]
|
|
40
40
|
rerank = ["sentence-transformers>=2.2.0", "torch>=2.0.0"]
|
|
41
|
+
local = ["sentence-transformers>=2.2.0"]
|
|
42
|
+
hybrid = ["rank-bm25>=0.2.2"]
|
|
41
43
|
anthropic = ["anthropic>=0.20.0"]
|
|
42
44
|
gemini = ["google-genai>=0.3.0"]
|
|
43
|
-
dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine"]
|
|
44
|
-
all = ["rag-python[rerank,anthropic,gemini]"]
|
|
45
|
+
dev = ["pytest>=7.0", "ruff>=0.1.0", "build", "twine", "rank-bm25>=0.2.2"]
|
|
46
|
+
all = ["rag-python[rerank,local,hybrid,anthropic,gemini]"]
|
|
45
47
|
|
|
46
48
|
[project.scripts]
|
|
47
49
|
rag-python = "rag_python.cli:main"
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
"""rag-python command-line interface."""
|
|
2
2
|
import argparse
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import replace
|
|
3
5
|
|
|
6
|
+
from . import __version__
|
|
4
7
|
from .client import RAG
|
|
5
8
|
|
|
6
9
|
|
|
7
10
|
def _build_rag(args: argparse.Namespace) -> RAG:
|
|
8
|
-
|
|
11
|
+
kwargs: dict = dict(
|
|
9
12
|
llm_provider=args.llm_provider,
|
|
10
13
|
llm_model=args.llm_model,
|
|
11
14
|
embedding_provider=args.embedding_provider,
|
|
@@ -18,12 +21,34 @@ def _build_rag(args: argparse.Namespace) -> RAG:
|
|
|
18
21
|
gemini_api_key=args.gemini_api_key,
|
|
19
22
|
ollama_base_url=args.ollama_base_url,
|
|
20
23
|
)
|
|
24
|
+
if getattr(args, "retriever", None):
|
|
25
|
+
kwargs["retriever"] = args.retriever
|
|
26
|
+
if getattr(args, "metadata_filter", None):
|
|
27
|
+
kwargs["metadata_filter"] = args.metadata_filter
|
|
28
|
+
return RAG(**kwargs)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _parse_metadata_filter(raw: str | None) -> dict | None:
|
|
32
|
+
if not raw:
|
|
33
|
+
return None
|
|
34
|
+
try:
|
|
35
|
+
return json.loads(raw)
|
|
36
|
+
except json.JSONDecodeError as e:
|
|
37
|
+
raise argparse.ArgumentTypeError(f"Invalid JSON for metadata filter: {e}") from e
|
|
21
38
|
|
|
22
39
|
|
|
23
40
|
def _add_provider_args(parser: argparse.ArgumentParser) -> None:
|
|
24
|
-
parser.add_argument(
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--llm-provider",
|
|
43
|
+
default="openai",
|
|
44
|
+
choices=["openai", "azure_openai", "anthropic", "gemini", "ollama"],
|
|
45
|
+
)
|
|
25
46
|
parser.add_argument("--llm-model", default=None)
|
|
26
|
-
parser.add_argument(
|
|
47
|
+
parser.add_argument(
|
|
48
|
+
"--embedding-provider",
|
|
49
|
+
default="openai",
|
|
50
|
+
choices=["openai", "azure_openai", "ollama", "local"],
|
|
51
|
+
)
|
|
27
52
|
parser.add_argument("--embedding-model", default=None)
|
|
28
53
|
parser.add_argument("--ollama-base-url", default=None)
|
|
29
54
|
parser.add_argument("--azure-endpoint", default=None)
|
|
@@ -34,11 +59,27 @@ def _add_provider_args(parser: argparse.ArgumentParser) -> None:
|
|
|
34
59
|
parser.add_argument("--gemini-api-key", default=None)
|
|
35
60
|
|
|
36
61
|
|
|
62
|
+
def _add_search_args(parser: argparse.ArgumentParser) -> None:
    """Register the retrieval options (``--retriever``, ``--metadata-filter``) on ``parser``."""
    retriever_options = {
        "choices": ["vector", "multi_query", "hybrid"],
        "default": None,
        "help": "Retrieval strategy (default: multi_query; hybrid needs pip install rag-python[hybrid])",
    }
    filter_options = {
        # The raw string is converted by _parse_metadata_filter during parsing.
        "type": _parse_metadata_filter,
        "default": None,
        "help": 'Chroma metadata filter as JSON, e.g. \'{"filename": "policy.pdf"}\'',
    }
    for flag, options in (
        ("--retriever", retriever_options),
        ("--metadata-filter", filter_options),
    ):
        parser.add_argument(flag, **options)
|
|
75
|
+
|
|
76
|
+
|
|
37
77
|
def main() -> None:
|
|
38
78
|
parser = argparse.ArgumentParser(
|
|
39
79
|
prog="rag-python",
|
|
40
80
|
description="rag-python — modular RAG with query rewriting, reranking, guardrails, and multi-LLM support.",
|
|
41
81
|
)
|
|
82
|
+
parser.add_argument("--version", action="version", version=f"rag-python {__version__}")
|
|
42
83
|
sub = parser.add_subparsers(dest="command", required=True)
|
|
43
84
|
|
|
44
85
|
ing = sub.add_parser("ingest", help="Ingest files/folders into the vector store")
|
|
@@ -48,9 +89,10 @@ def main() -> None:
|
|
|
48
89
|
|
|
49
90
|
q = sub.add_parser("query", help="Ask a question against ingested documents")
|
|
50
91
|
q.add_argument("question", nargs="+", help="Question text")
|
|
51
|
-
q.add_argument("--no-multi-query", action="store_true")
|
|
92
|
+
q.add_argument("--no-multi-query", action="store_true", help="Use vector retriever only")
|
|
52
93
|
q.add_argument("-v", "--verbose", action="store_true")
|
|
53
94
|
_add_provider_args(q)
|
|
95
|
+
_add_search_args(q)
|
|
54
96
|
|
|
55
97
|
args = parser.parse_args()
|
|
56
98
|
|
|
@@ -63,7 +105,15 @@ def main() -> None:
|
|
|
63
105
|
if args.command == "query":
|
|
64
106
|
rag = _build_rag(args)
|
|
65
107
|
question = " ".join(args.question)
|
|
66
|
-
|
|
108
|
+
retriever = args.retriever
|
|
109
|
+
if retriever is None and args.no_multi_query:
|
|
110
|
+
retriever = "vector"
|
|
111
|
+
search = replace(
|
|
112
|
+
rag.config.search,
|
|
113
|
+
retriever=retriever or rag.config.search.retriever,
|
|
114
|
+
metadata_filter=args.metadata_filter or rag.config.search.metadata_filter,
|
|
115
|
+
)
|
|
116
|
+
ans = rag.query(question, search=search)
|
|
67
117
|
print(ans.text)
|
|
68
118
|
if args.verbose:
|
|
69
119
|
print("\n--- evaluation ---")
|
|
@@ -60,6 +60,7 @@ class RAG:
|
|
|
60
60
|
chunk_size: int | None = None,
|
|
61
61
|
chunk_overlap: int | None = None,
|
|
62
62
|
retriever: str | None = None,
|
|
63
|
+
metadata_filter: dict | None = None,
|
|
63
64
|
top_k_retrieve: int | None = None,
|
|
64
65
|
top_k_rerank: int | None = None,
|
|
65
66
|
multi_query_n: int | None = None,
|
|
@@ -104,6 +105,8 @@ class RAG:
|
|
|
104
105
|
self.config.search = replace(self.config.search, rerank_enabled=rerank_enabled)
|
|
105
106
|
if document_extensions is not None:
|
|
106
107
|
self.config.documents = replace(self.config.documents, extensions=document_extensions)
|
|
108
|
+
if metadata_filter is not None:
|
|
109
|
+
self.config.search = replace(self.config.search, metadata_filter=metadata_filter)
|
|
107
110
|
|
|
108
111
|
self.llm = make_llm_provider(
|
|
109
112
|
llm_provider, # type: ignore[arg-type]
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Document loaders: raw data → structured text + metadata."""
|
|
2
|
+
import csv
|
|
3
|
+
import json
|
|
4
|
+
from html.parser import HTMLParser
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Iterator
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from pypdf import PdfReader
|
|
11
|
+
except ImportError:
|
|
12
|
+
PdfReader = None
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
from docx import Document as DocxDocument
|
|
16
|
+
except ImportError:
|
|
17
|
+
DocxDocument = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class LoadedDocument:
    """Text extracted from one source file, plus descriptive metadata."""

    # Extracted plain text.
    content: str
    # Where the text came from; the loaders in this module pass str(path).
    source: str
    # Free-form details recorded by the loader that produced the document.
    metadata: dict
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _HTMLTextExtractor(HTMLParser):
|
|
29
|
+
def __init__(self) -> None:
|
|
30
|
+
super().__init__()
|
|
31
|
+
self.parts: list[str] = []
|
|
32
|
+
|
|
33
|
+
def handle_data(self, data: str) -> None:
|
|
34
|
+
text = data.strip()
|
|
35
|
+
if text:
|
|
36
|
+
self.parts.append(text)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _html_to_text(html: str) -> str:
|
|
40
|
+
parser = _HTMLTextExtractor()
|
|
41
|
+
parser.feed(html)
|
|
42
|
+
return "\n".join(parser.parts)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _load_csv(path: Path, metadata: dict) -> LoadedDocument | None:
    """Load a CSV file as text, one line per data row.

    When a header row is present each row is rendered as
    ``"column: value, column: value"``, omitting empty cells. Otherwise rows
    are rendered as their comma-joined cells.

    Args:
        path: CSV file to read.
        metadata: Base metadata for the document; it is copied, not mutated.

    Returns:
        A document whose metadata also carries ``"rows"`` (the number of
        rendered rows), or ``None`` when the file yields no text.
    """
    rows: list[str] = []
    # utf-8-sig strips a leading byte-order mark so it does not end up glued
    # to the first column name.
    with path.open(encoding="utf-8-sig", errors="replace", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames:
            for record in reader:
                cells: list[str] = []
                for column, value in record.items():
                    if column is None:
                        # DictReader stores cells beyond the header width as a
                        # list under the key None; keep them as bare values.
                        cells.extend(extra for extra in value if extra)
                    elif value:
                        cells.append(f"{column}: {value}")
                if cells:
                    rows.append(", ".join(cells))
        else:
            # No usable header (e.g. the first line is blank): re-read the
            # file as plain rows.
            f.seek(0)
            rows.extend(", ".join(row) for row in csv.reader(f) if any(row))
    content = "\n".join(rows)
    if not content.strip():
        return None
    return LoadedDocument(
        content=content,
        source=str(path),
        metadata={**metadata, "rows": len(rows)},
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _load_json(path: Path, metadata: dict) -> LoadedDocument | None:
    """Load a JSON file as text.

    An object (or a list item that is an object) with a ``"text"`` key
    contributes that value; anything else is serialised back to JSON. List
    items are separated by blank lines. Returns ``None`` when the resulting
    text is blank. ``json.JSONDecodeError`` propagates to the caller.
    """
    payload = json.loads(path.read_text(encoding="utf-8", errors="replace"))

    if isinstance(payload, dict):
        if "text" in payload:
            content = str(payload["text"])
        else:
            content = json.dumps(payload, ensure_ascii=False, indent=2)
    elif isinstance(payload, list):
        content = "\n\n".join(
            str(entry["text"])
            if isinstance(entry, dict) and "text" in entry
            else json.dumps(entry, ensure_ascii=False)
            for entry in payload
        )
    else:
        content = str(payload)

    if not content.strip():
        return None
    return LoadedDocument(content=content, source=str(path), metadata=metadata)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def load_file(path: Path) -> LoadedDocument | None:
    """Load a single file (PDF, TXT, DOCX, MD, CSV, JSON, HTML) into text + metadata.

    Args:
        path: File to load; the loader is chosen from its lower-cased suffix.

    Returns:
        The loaded document, or ``None`` when ``path`` is not a regular file,
        the suffix is unsupported, the optional ``pypdf``/``docx`` module
        needed for it is not importable, the file cannot be parsed, or (for
        HTML, CSV and JSON) no text was extracted.
    """
    path = Path(path)
    # is_file() rather than exists(): a directory named e.g. "notes.txt" must
    # not reach read_text(), which would raise instead of returning None.
    if not path.is_file():
        return None
    suffix = path.suffix.lower()
    metadata = {"source": str(path), "filename": path.name}

    if suffix in (".txt", ".md"):
        content = path.read_text(encoding="utf-8", errors="replace")
        return LoadedDocument(content=content, source=str(path), metadata=metadata)

    if suffix == ".html":
        html = path.read_text(encoding="utf-8", errors="replace")
        content = _html_to_text(html)
        return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None

    if suffix == ".csv":
        return _load_csv(path, metadata)

    if suffix == ".json":
        try:
            return _load_json(path, metadata)
        except json.JSONDecodeError:
            return None

    if suffix == ".pdf" and PdfReader:
        # Best effort: any parser failure skips the file instead of aborting
        # the caller.
        try:
            reader = PdfReader(path)
            parts = []
            for i, page in enumerate(reader.pages):
                text = page.extract_text() or ""
                parts.append(text)
                metadata.setdefault("page_numbers", []).append(i + 1)
            content = "\n\n".join(parts)
            metadata["pages"] = len(parts)
            return LoadedDocument(content=content, source=str(path), metadata=metadata)
        except Exception:
            return None

    if suffix in (".docx", ".doc") and DocxDocument:
        # Best effort, as for PDF above.
        try:
            doc = DocxDocument(path)
            parts = [p.text for p in doc.paragraphs]
            content = "\n\n".join(parts)
            metadata["paragraphs"] = len(parts)
            return LoadedDocument(content=content, source=str(path), metadata=metadata)
        except Exception:
            return None

    return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def load_directory(
    dir_path: Path,
    extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
) -> Iterator[LoadedDocument]:
    """Yield a LoadedDocument for each supported file under ``dir_path``.

    Args:
        dir_path: Directory to walk recursively. Nothing is yielded when it
            is not a directory.
        extensions: File suffixes (with leading dot) to load; compared
            case-insensitively.

    Yields:
        Documents with non-blank content, in sorted path order.
    """
    dir_path = Path(dir_path)
    if not dir_path.is_dir():
        return
    # File suffixes are lower-cased below, so normalise the wanted set the
    # same way; otherwise an entry such as ".PDF" could never match.
    wanted = {ext.lower() for ext in extensions}
    # rglob order depends on the filesystem; sorting keeps results reproducible.
    for candidate in sorted(dir_path.rglob("*")):
        if candidate.is_file() and candidate.suffix.lower() in wanted:
            doc = load_file(candidate)
            if doc and doc.content.strip():
                yield doc
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""BM25 + vector fusion via reciprocal rank fusion (RRF)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def reciprocal_rank_fusion(
    rankings: list[list[tuple[str, dict[str, Any], float]]],
    *,
    rrf_k: int = 60,
) -> list[tuple[str, dict[str, Any], float]]:
    """Merge ranked lists with reciprocal rank fusion (RRF). Higher score is better.

    Each occurrence of a hit at zero-based position ``rank`` adds
    ``1 / (rrf_k + rank + 1)`` to that hit's fused score. Hits are treated as
    the same when their text and ``metadata["source"]`` are both equal.

    Args:
        rankings: Ranked lists of ``(text, metadata, score)``, best first.
            The incoming scores are ignored; only positions matter.
        rrf_k: Smoothing constant added to every rank.

    Returns:
        One ``(text, metadata, fused_score)`` per distinct hit, sorted by
        fused score descending. Ties keep first-seen order.
    """
    scores: dict[tuple[str, str], float] = {}
    doc_map: dict[tuple[str, str], tuple[str, dict[str, Any]]] = {}

    for ranking in rankings:
        for rank, (doc, meta, _score) in enumerate(ranking):
            # Key on the full text: a truncated prefix would merge distinct
            # chunks of one source that merely start the same way.
            key = (doc, str(meta.get("source", "")))
            doc_map[key] = (doc, meta)
            scores[key] = scores.get(key, 0.0) + 1.0 / (rrf_k + rank + 1)

    merged = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [(doc_map[key][0], doc_map[key][1], score) for key, score in merged]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def bm25_retrieve(
    query: str,
    documents: list[str],
    metadatas: list[dict[str, Any]],
    *,
    top_k: int = 20,
) -> list[tuple[str, dict[str, Any], float]]:
    """Keyword retrieval with BM25. Requires ``pip install rag-python[hybrid]``.

    Query and documents are lower-cased and split on whitespace before scoring.

    Args:
        query: Search text.
        documents: Corpus texts to score.
        metadatas: Metadata for each document, parallel to ``documents``.
        top_k: Maximum number of hits to return.

    Returns:
        Up to ``top_k`` ``(text, metadata, score)`` tuples, best score first.
        Empty when there are no documents or ``top_k`` is not positive.

    Raises:
        ValueError: If ``documents`` and ``metadatas`` differ in length.
        ImportError: If ``rank_bm25`` is not installed.
    """
    if not documents:
        return []
    if len(documents) != len(metadatas):
        raise ValueError(
            f"documents and metadatas must have the same length "
            f"({len(documents)} != {len(metadatas)})"
        )
    # A negative top_k would otherwise slice from the wrong end of the ranking.
    if top_k <= 0:
        return []
    try:
        from rank_bm25 import BM25Okapi
    except ImportError as e:
        raise ImportError(
            "Hybrid search requires optional dependencies. Install with: pip install rag-python[hybrid]"
        ) from e

    tokenized_corpus = [doc.lower().split() for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    scores = bm25.get_scores(query.lower().split())
    ranked = sorted(
        ((doc, meta, float(score)) for doc, meta, score in zip(documents, metadatas, scores)),
        key=lambda item: item[2],
        reverse=True,
    )
    return ranked[:top_k]
|
|
@@ -16,7 +16,7 @@ from .config import (
|
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
ChunkStrategy = Literal["recursive", "structure_aware", "semantic"]
|
|
19
|
-
RetrieverStrategy = Literal["vector", "multi_query"]
|
|
19
|
+
RetrieverStrategy = Literal["vector", "multi_query", "hybrid"]
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
@dataclass
|
|
@@ -37,13 +37,14 @@ class SearchConfig:
|
|
|
37
37
|
top_k_rerank: int = TOP_K_RERANK
|
|
38
38
|
multi_query_n: int = MULTI_QUERY_N
|
|
39
39
|
rerank_enabled: bool = RERANK_ENABLED
|
|
40
|
+
metadata_filter: dict | None = None
|
|
40
41
|
|
|
41
42
|
|
|
42
43
|
@dataclass
|
|
43
44
|
class DocumentConfig:
|
|
44
45
|
"""Which files to load and how to preprocess them."""
|
|
45
46
|
|
|
46
|
-
extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx")
|
|
47
|
+
extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
|
|
47
48
|
clean: bool = True
|
|
48
49
|
copy_to_data_dir: bool = True
|
|
49
50
|
|
|
@@ -9,10 +9,11 @@ from .azure_openai_provider import AzureOpenAIProvider
|
|
|
9
9
|
from .anthropic_provider import AnthropicProvider
|
|
10
10
|
from .gemini_provider import GeminiProvider
|
|
11
11
|
from .ollama_provider import OllamaProvider
|
|
12
|
+
from .local_provider import LocalEmbeddingProvider
|
|
12
13
|
|
|
13
14
|
|
|
14
15
|
LLMProviderName = Literal["openai", "azure_openai", "anthropic", "gemini", "ollama"]
|
|
15
|
-
EmbeddingProviderName = Literal["openai", "azure_openai", "ollama"]
|
|
16
|
+
EmbeddingProviderName = Literal["openai", "azure_openai", "ollama", "local"]
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
def make_llm_provider(name: LLMProviderName, **kwargs) -> LLMProvider:
|
|
@@ -49,5 +50,7 @@ def make_embedding_provider(name: EmbeddingProviderName, **kwargs) -> EmbeddingP
|
|
|
49
50
|
)
|
|
50
51
|
if name == "ollama":
|
|
51
52
|
return OllamaProvider(base_url=kwargs.get("base_url") or os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"))
|
|
53
|
+
if name == "local":
|
|
54
|
+
return LocalEmbeddingProvider(model_name=kwargs.get("model") or os.getenv("LOCAL_EMBEDDING_MODEL"))
|
|
52
55
|
raise ValueError(f"Unknown embedding provider: {name}")
|
|
53
56
|
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Local sentence-transformers embeddings (no API key required)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
_DEFAULT_MODEL = "all-MiniLM-L6-v2"


class LocalEmbeddingProvider:
    """Offline embeddings via sentence-transformers."""

    def __init__(self, model_name: str | None = None) -> None:
        """Choose the default model.

        Precedence: ``model_name``, then the ``LOCAL_EMBEDDING_MODEL``
        environment variable, then the built-in default.
        """
        # Chain with `or` so a set-but-empty environment variable falls back
        # to the built-in default instead of yielding "" as the model name.
        self.default_model = model_name or os.getenv("LOCAL_EMBEDDING_MODEL") or _DEFAULT_MODEL
        # Loaded encoders, keyed by model name, so each is constructed once.
        self._models: dict[str, object] = {}

    def _get_model(self, model_name: str):
        """Return the cached encoder for ``model_name``, loading it on first use.

        Raises:
            ImportError: If ``sentence_transformers`` is not installed.
        """
        if model_name not in self._models:
            # Imported lazily so the package works without the optional extra
            # until local embeddings are actually requested.
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as e:
                raise ImportError(
                    "Local embeddings require optional dependencies. "
                    "Install with: pip install rag-python[local]"
                ) from e
            self._models[model_name] = SentenceTransformer(model_name)
        return self._models[model_name]

    def embed(self, texts: list[str], *, model: str | None = None) -> list[list[float]]:
        """Embed ``texts`` and return one vector (list of floats) per text.

        Args:
            texts: Strings to embed; an empty list returns ``[]`` without
                loading any model.
            model: Model name overriding the provider default for this call.
        """
        if not texts:
            return []
        model_name = model or self.default_model
        encoder = self._get_model(model_name)
        vectors = encoder.encode(texts, convert_to_numpy=True)
        return [v.tolist() for v in vectors]
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Full RAG pipeline: Query → Understanding/Rewrite → Retrieval (multi-query) → Rerank → LLM → Guardrails → Eval/Retry."""
|
|
2
|
+
import logging
|
|
2
3
|
from dataclasses import dataclass
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
|
|
@@ -14,6 +15,8 @@ from .providers import LLMProvider, EmbeddingProvider, make_llm_provider, make_e
|
|
|
14
15
|
from .config import DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, CHUNK_STRATEGY
|
|
15
16
|
from .options import QueryConfig, SearchConfig
|
|
16
17
|
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
17
20
|
|
|
18
21
|
@dataclass
|
|
19
22
|
class RAGResponse:
|
|
@@ -34,7 +37,7 @@ def _load_documents(
|
|
|
34
37
|
paths: list[Path] | None = None,
|
|
35
38
|
data_path: Path | None = None,
|
|
36
39
|
*,
|
|
37
|
-
extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx"),
|
|
40
|
+
extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
|
|
38
41
|
) -> list[LoadedDocument]:
|
|
39
42
|
"""Load documents from explicit paths and/or a data directory."""
|
|
40
43
|
docs: list[LoadedDocument] = []
|
|
@@ -136,12 +139,13 @@ def ingest(
|
|
|
136
139
|
strategy = chunk_strategy or CHUNK_STRATEGY
|
|
137
140
|
size = chunk_size or CHUNK_SIZE
|
|
138
141
|
overlap = chunk_overlap or CHUNK_OVERLAP
|
|
139
|
-
ext = extensions or (".txt", ".md", ".pdf", ".docx")
|
|
142
|
+
ext = extensions or (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
|
|
140
143
|
embedder = embedder or make_embedding_provider("openai")
|
|
141
144
|
|
|
142
145
|
path_list = [Path(p) for p in paths] if paths else None
|
|
143
146
|
root = Path(data_path) if data_path else (None if path_list else Path(DATA_DIR))
|
|
144
147
|
docs = _load_documents(path_list, root, extensions=ext)
|
|
148
|
+
logger.info("Loaded %s documents for ingest", len(docs))
|
|
145
149
|
return _ingest_documents(
|
|
146
150
|
docs,
|
|
147
151
|
clean=clean,
|
|
@@ -202,11 +206,13 @@ def query(
|
|
|
202
206
|
top_k_retrieve=search_cfg.top_k_retrieve,
|
|
203
207
|
top_k_rerank=search_cfg.top_k_rerank,
|
|
204
208
|
rerank_enabled=search_cfg.rerank_enabled,
|
|
209
|
+
metadata_filter=search_cfg.metadata_filter,
|
|
205
210
|
embedder=embedder,
|
|
206
211
|
embedding_model=embedding_model,
|
|
207
212
|
llm=llm,
|
|
208
213
|
llm_model=llm_model,
|
|
209
214
|
)
|
|
215
|
+
logger.info("Retrieved %s chunks (retriever=%s)", len(hits), search_cfg.retriever)
|
|
210
216
|
context_chunks = [h[0] for h in hits]
|
|
211
217
|
sources = [{"text": h[0][:200], "metadata": h[1], "score": h[2]} for h in hits]
|
|
212
218
|
context_str = "\n\n".join(context_chunks)
|