rag-python 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rag_python/__init__.py CHANGED
@@ -9,7 +9,7 @@ Quick start::
9
9
  print(rag.query("What is our leave policy?").text)
10
10
  """
11
11
 
12
- __version__ = "0.2.0"
12
+ __version__ = "0.3.0"
13
13
 
14
14
  from .client import RAG, RAGAnswer
15
15
  from .rag_pipeline import ingest, query, RAGResponse
rag_python/cli.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """rag-python command-line interface."""
2
2
  import argparse
3
+ import json
3
4
  from dataclasses import replace
4
5
 
5
6
  from . import __version__
@@ -7,7 +8,7 @@ from .client import RAG
7
8
 
8
9
 
9
10
  def _build_rag(args: argparse.Namespace) -> RAG:
10
- return RAG(
11
+ kwargs: dict = dict(
11
12
  llm_provider=args.llm_provider,
12
13
  llm_model=args.llm_model,
13
14
  embedding_provider=args.embedding_provider,
@@ -20,6 +21,20 @@ def _build_rag(args: argparse.Namespace) -> RAG:
20
21
  gemini_api_key=args.gemini_api_key,
21
22
  ollama_base_url=args.ollama_base_url,
22
23
  )
24
+ if getattr(args, "retriever", None):
25
+ kwargs["retriever"] = args.retriever
26
+ if getattr(args, "metadata_filter", None):
27
+ kwargs["metadata_filter"] = args.metadata_filter
28
+ return RAG(**kwargs)
29
+
30
+
31
+ def _parse_metadata_filter(raw: str | None) -> dict | None:
32
+ if not raw:
33
+ return None
34
+ try:
35
+ return json.loads(raw)
36
+ except json.JSONDecodeError as e:
37
+ raise argparse.ArgumentTypeError(f"Invalid JSON for metadata filter: {e}") from e
23
38
 
24
39
 
25
40
  def _add_provider_args(parser: argparse.ArgumentParser) -> None:
@@ -44,6 +59,21 @@ def _add_provider_args(parser: argparse.ArgumentParser) -> None:
44
59
  parser.add_argument("--gemini-api-key", default=None)
45
60
 
46
61
 
62
def _add_search_args(parser: argparse.ArgumentParser) -> None:
    """Register the retrieval options (``--retriever``, ``--metadata-filter``) on *parser*."""
    # Each entry is (flag, keyword arguments for ``add_argument``).
    option_specs = (
        (
            "--retriever",
            {
                "choices": ["vector", "multi_query", "hybrid"],
                "default": None,
                "help": "Retrieval strategy (default: multi_query; hybrid needs pip install rag-python[hybrid])",
            },
        ),
        (
            "--metadata-filter",
            {
                # The raw string is converted by the module's JSON parser helper.
                "type": _parse_metadata_filter,
                "default": None,
                "help": 'Chroma metadata filter as JSON, e.g. \'{"filename": "policy.pdf"}\'',
            },
        ),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
75
+
76
+
47
77
  def main() -> None:
48
78
  parser = argparse.ArgumentParser(
49
79
  prog="rag-python",
@@ -59,9 +89,10 @@ def main() -> None:
59
89
 
60
90
  q = sub.add_parser("query", help="Ask a question against ingested documents")
61
91
  q.add_argument("question", nargs="+", help="Question text")
62
- q.add_argument("--no-multi-query", action="store_true")
92
+ q.add_argument("--no-multi-query", action="store_true", help="Use vector retriever only")
63
93
  q.add_argument("-v", "--verbose", action="store_true")
64
94
  _add_provider_args(q)
95
+ _add_search_args(q)
65
96
 
66
97
  args = parser.parse_args()
67
98
 
@@ -74,9 +105,13 @@ def main() -> None:
74
105
  if args.command == "query":
75
106
  rag = _build_rag(args)
76
107
  question = " ".join(args.question)
108
+ retriever = args.retriever
109
+ if retriever is None and args.no_multi_query:
110
+ retriever = "vector"
77
111
  search = replace(
78
112
  rag.config.search,
79
- retriever="vector" if args.no_multi_query else "multi_query",
113
+ retriever=retriever or rag.config.search.retriever,
114
+ metadata_filter=args.metadata_filter or rag.config.search.metadata_filter,
80
115
  )
81
116
  ans = rag.query(question, search=search)
82
117
  print(ans.text)
rag_python/client.py CHANGED
@@ -60,6 +60,7 @@ class RAG:
60
60
  chunk_size: int | None = None,
61
61
  chunk_overlap: int | None = None,
62
62
  retriever: str | None = None,
63
+ metadata_filter: dict | None = None,
63
64
  top_k_retrieve: int | None = None,
64
65
  top_k_rerank: int | None = None,
65
66
  multi_query_n: int | None = None,
@@ -104,6 +105,8 @@ class RAG:
104
105
  self.config.search = replace(self.config.search, rerank_enabled=rerank_enabled)
105
106
  if document_extensions is not None:
106
107
  self.config.documents = replace(self.config.documents, extensions=document_extensions)
108
+ if metadata_filter is not None:
109
+ self.config.search = replace(self.config.search, metadata_filter=metadata_filter)
107
110
 
108
111
  self.llm = make_llm_provider(
109
112
  llm_provider, # type: ignore[arg-type]
@@ -1,4 +1,7 @@
1
1
  """Document loaders: raw data → structured text + metadata."""
2
+ import csv
3
+ import json
4
+ from html.parser import HTMLParser
2
5
  from pathlib import Path
3
6
  from dataclasses import dataclass
4
7
  from typing import Iterator
@@ -22,18 +25,85 @@ class LoadedDocument:
22
25
  metadata: dict
23
26
 
24
27
 
28
+ class _HTMLTextExtractor(HTMLParser):
29
+ def __init__(self) -> None:
30
+ super().__init__()
31
+ self.parts: list[str] = []
32
+
33
+ def handle_data(self, data: str) -> None:
34
+ text = data.strip()
35
+ if text:
36
+ self.parts.append(text)
37
+
38
+
39
+ def _html_to_text(html: str) -> str:
40
+ parser = _HTMLTextExtractor()
41
+ parser.feed(html)
42
+ return "\n".join(parser.parts)
43
+
44
+
45
def _load_csv(path: Path, metadata: dict) -> LoadedDocument | None:
    """Load a CSV file as text, one line per data row.

    When the file has a header row, each data row is rendered as
    ``"column: value, column: value"`` with empty cells omitted. When the
    first row yields no field names, rows are rendered as their cells joined
    with ``", "``.

    Args:
        path: CSV file to read (decoded as UTF-8, undecodable bytes replaced).
        metadata: Metadata dict for the document; a ``"rows"`` entry with the
            number of rendered rows is added to it in place.

    Returns:
        A ``LoadedDocument``, or ``None`` when the rendered content is blank.
    """
    rows: list[str] = []
    with path.open(encoding="utf-8", errors="replace", newline="") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames:
            for row in reader:
                cells: list[str] = []
                for key, value in row.items():
                    if not value:
                        continue
                    if key is None:
                        # DictReader stores cells beyond the header width as a
                        # list under the ``None`` key; emit them as bare values
                        # instead of the literal text "None: [...]".
                        cells.extend(extra for extra in value if extra)
                    else:
                        cells.append(f"{key}: {value}")
                rows.append(", ".join(cells))
        else:
            # No usable header: rewind and treat every line as plain cells.
            f.seek(0)
            for row in csv.reader(f):
                rows.append(", ".join(row))
    content = "\n".join(rows)
    metadata["rows"] = len(rows)
    return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None
59
+
60
+
61
def _load_json(path: Path, metadata: dict) -> LoadedDocument | None:
    """Load a JSON file as text.

    A top-level list becomes one paragraph per item; a top-level object is
    reduced to its ``"text"`` value when present, otherwise pretty-printed;
    any other value is converted with ``str``. Returns ``None`` when the
    resulting text is blank. ``json.JSONDecodeError`` propagates to the caller.
    """
    payload = json.loads(path.read_text(encoding="utf-8", errors="replace"))

    if isinstance(payload, list):
        # Items carrying a "text" key contribute that value; everything else
        # is serialized back to compact JSON.
        content = "\n\n".join(
            str(entry["text"])
            if isinstance(entry, dict) and "text" in entry
            else json.dumps(entry, ensure_ascii=False)
            for entry in payload
        )
    elif isinstance(payload, dict):
        content = (
            str(payload["text"])
            if "text" in payload
            else json.dumps(payload, ensure_ascii=False, indent=2)
        )
    else:
        content = str(payload)

    if not content.strip():
        return None
    return LoadedDocument(content=content, source=str(path), metadata=metadata)
79
+
80
+
25
81
  def load_file(path: Path) -> LoadedDocument | None:
26
- """Load a single file (PDF, TXT, DOCX, MD) into text + metadata."""
82
+ """Load a single file (PDF, TXT, DOCX, MD, CSV, JSON, HTML) into text + metadata."""
27
83
  path = Path(path)
28
84
  if not path.exists():
29
85
  return None
30
86
  suffix = path.suffix.lower()
31
87
  metadata = {"source": str(path), "filename": path.name}
32
88
 
33
- if suffix == ".txt" or suffix == ".md":
89
+ if suffix in (".txt", ".md"):
34
90
  content = path.read_text(encoding="utf-8", errors="replace")
35
91
  return LoadedDocument(content=content, source=str(path), metadata=metadata)
36
92
 
93
+ if suffix == ".html":
94
+ html = path.read_text(encoding="utf-8", errors="replace")
95
+ content = _html_to_text(html)
96
+ return LoadedDocument(content=content, source=str(path), metadata=metadata) if content.strip() else None
97
+
98
+ if suffix == ".csv":
99
+ return _load_csv(path, metadata)
100
+
101
+ if suffix == ".json":
102
+ try:
103
+ return _load_json(path, metadata)
104
+ except json.JSONDecodeError:
105
+ return None
106
+
37
107
  if suffix == ".pdf" and PdfReader:
38
108
  try:
39
109
  reader = PdfReader(path)
@@ -61,7 +131,10 @@ def load_file(path: Path) -> LoadedDocument | None:
61
131
  return None
62
132
 
63
133
 
64
- def load_directory(dir_path: Path, extensions: tuple = (".txt", ".md", ".pdf", ".docx")) -> Iterator[LoadedDocument]:
134
+ def load_directory(
135
+ dir_path: Path,
136
+ extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
137
+ ) -> Iterator[LoadedDocument]:
65
138
  """Yield LoadedDocument for each supported file under dir_path."""
66
139
  dir_path = Path(dir_path)
67
140
  if not dir_path.is_dir():
@@ -71,4 +144,3 @@ def load_directory(dir_path: Path, extensions: tuple = (".txt", ".md", ".pdf", "
71
144
  doc = load_file(f)
72
145
  if doc and doc.content.strip():
73
146
  yield doc
74
-
@@ -0,0 +1,51 @@
1
+ """BM25 + vector fusion via reciprocal rank fusion (RRF)."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any
5
+
6
+
7
def reciprocal_rank_fusion(
    rankings: list[list[tuple[str, dict[str, Any], float]]],
    *,
    rrf_k: int = 60,
) -> list[tuple[str, dict[str, Any], float]]:
    """Fuse several ranked hit lists into one using reciprocal rank fusion.

    Every hit contributes ``1 / (rrf_k + position)`` (position counted from 1)
    to its entry; the incoming per-list scores are ignored. Hits are treated
    as the same entry when the first 200 characters of their text and their
    ``"source"`` metadata match; the text/metadata of the last such hit seen
    is the one returned.

    Returns:
        ``(text, metadata, fused_score)`` tuples, highest fused score first.
    """
    fused: dict[tuple[str, str], float] = {}
    latest: dict[tuple[str, str], tuple[str, dict[str, Any]]] = {}

    for ranking in rankings:
        for position, hit in enumerate(ranking, start=1):
            text, meta, _ = hit
            ident = (text[:200], str(meta.get("source", "")))
            latest[ident] = (text, meta)
            fused[ident] = fused.get(ident, 0.0) + 1.0 / (rrf_k + position)

    # Stable sort: entries with equal scores keep first-seen order.
    ordered = sorted(fused, key=fused.__getitem__, reverse=True)
    return [(latest[ident][0], latest[ident][1], fused[ident]) for ident in ordered]
24
+
25
+
26
def bm25_retrieve(
    query: str,
    documents: list[str],
    metadatas: list[dict[str, Any]],
    *,
    top_k: int = 20,
) -> list[tuple[str, dict[str, Any], float]]:
    """Rank *documents* against *query* with BM25 (``rank_bm25.BM25Okapi``).

    Text is lower-cased and split on whitespace. ``metadatas[i]`` is paired
    with ``documents[i]``. Requires ``pip install rag-python[hybrid]``.

    Returns:
        Up to *top_k* ``(text, metadata, score)`` tuples, best score first;
        an empty list when *documents* is empty.

    Raises:
        ImportError: If ``rank_bm25`` is not installed.
    """
    if not documents:
        return []

    # Imported lazily so the optional dependency is only needed for hybrid search.
    try:
        from rank_bm25 import BM25Okapi
    except ImportError as e:
        raise ImportError(
            "Hybrid search requires optional dependencies. Install with: pip install rag-python[hybrid]"
        ) from e

    corpus_tokens = [text.lower().split() for text in documents]
    index = BM25Okapi(corpus_tokens)
    doc_scores = index.get_scores(query.lower().split())

    scored = [
        (text, metadatas[pos], float(doc_scores[pos]))
        for pos, text in enumerate(documents)
    ]
    scored.sort(key=lambda hit: hit[2], reverse=True)
    return scored[:top_k]
rag_python/options.py CHANGED
@@ -16,7 +16,7 @@ from .config import (
16
16
  )
17
17
 
18
18
  ChunkStrategy = Literal["recursive", "structure_aware", "semantic"]
19
- RetrieverStrategy = Literal["vector", "multi_query"]
19
+ RetrieverStrategy = Literal["vector", "multi_query", "hybrid"]
20
20
 
21
21
 
22
22
  @dataclass
@@ -37,13 +37,14 @@ class SearchConfig:
37
37
  top_k_rerank: int = TOP_K_RERANK
38
38
  multi_query_n: int = MULTI_QUERY_N
39
39
  rerank_enabled: bool = RERANK_ENABLED
40
+ metadata_filter: dict | None = None
40
41
 
41
42
 
42
43
  @dataclass
43
44
  class DocumentConfig:
44
45
  """Which files to load and how to preprocess them."""
45
46
 
46
- extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx")
47
+ extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
47
48
  clean: bool = True
48
49
  copy_to_data_dir: bool = True
49
50
 
@@ -1,4 +1,5 @@
1
1
  """Full RAG pipeline: Query → Understanding/Rewrite → Retrieval (multi-query) → Rerank → LLM → Guardrails → Eval/Retry."""
2
+ import logging
2
3
  from dataclasses import dataclass
3
4
  from pathlib import Path
4
5
 
@@ -14,6 +15,8 @@ from .providers import LLMProvider, EmbeddingProvider, make_llm_provider, make_e
14
15
  from .config import DATA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, CHUNK_STRATEGY
15
16
  from .options import QueryConfig, SearchConfig
16
17
 
18
+ logger = logging.getLogger(__name__)
19
+
17
20
 
18
21
  @dataclass
19
22
  class RAGResponse:
@@ -34,7 +37,7 @@ def _load_documents(
34
37
  paths: list[Path] | None = None,
35
38
  data_path: Path | None = None,
36
39
  *,
37
- extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx"),
40
+ extensions: tuple[str, ...] = (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html"),
38
41
  ) -> list[LoadedDocument]:
39
42
  """Load documents from explicit paths and/or a data directory."""
40
43
  docs: list[LoadedDocument] = []
@@ -136,12 +139,13 @@ def ingest(
136
139
  strategy = chunk_strategy or CHUNK_STRATEGY
137
140
  size = chunk_size or CHUNK_SIZE
138
141
  overlap = chunk_overlap or CHUNK_OVERLAP
139
- ext = extensions or (".txt", ".md", ".pdf", ".docx")
142
+ ext = extensions or (".txt", ".md", ".pdf", ".docx", ".csv", ".json", ".html")
140
143
  embedder = embedder or make_embedding_provider("openai")
141
144
 
142
145
  path_list = [Path(p) for p in paths] if paths else None
143
146
  root = Path(data_path) if data_path else (None if path_list else Path(DATA_DIR))
144
147
  docs = _load_documents(path_list, root, extensions=ext)
148
+ logger.info("Loaded %s documents for ingest", len(docs))
145
149
  return _ingest_documents(
146
150
  docs,
147
151
  clean=clean,
@@ -202,11 +206,13 @@ def query(
202
206
  top_k_retrieve=search_cfg.top_k_retrieve,
203
207
  top_k_rerank=search_cfg.top_k_rerank,
204
208
  rerank_enabled=search_cfg.rerank_enabled,
209
+ metadata_filter=search_cfg.metadata_filter,
205
210
  embedder=embedder,
206
211
  embedding_model=embedding_model,
207
212
  llm=llm,
208
213
  llm_model=llm_model,
209
214
  )
215
+ logger.info("Retrieved %s chunks (retriever=%s)", len(hits), search_cfg.retriever)
210
216
  context_chunks = [h[0] for h in hits]
211
217
  sources = [{"text": h[0][:200], "metadata": h[1], "score": h[2]} for h in hits]
212
218
  context_str = "\n\n".join(context_chunks)
rag_python/retrieval.py CHANGED
@@ -1,14 +1,49 @@
1
- """Retrieval: multi-query retrieval + reranking."""
1
+ """Retrieval: vector, multi-query, hybrid (BM25+vector), and reranking."""
2
2
  from typing import Any
3
3
 
4
- from .vector_store import retrieve as chroma_retrieve
4
+ from .vector_store import retrieve as chroma_retrieve, list_documents
5
5
  from .query_rewriting import rewrite_for_retrieval
6
6
  from .reranker import rerank_with_metadata
7
+ from .hybrid_search import bm25_retrieve, reciprocal_rank_fusion
7
8
  from .providers import EmbeddingProvider, LLMProvider
8
9
  from .options import RetrieverStrategy
9
10
  from .config import TOP_K_RETRIEVE, TOP_K_RERANK, MULTI_QUERY_N
10
11
 
11
12
 
13
+ def _dedupe_candidates(candidates: list[tuple[str, dict, float]]) -> list[tuple[str, dict, float]]:
14
+ seen: set[tuple[str, str]] = set()
15
+ out: list[tuple[str, dict, float]] = []
16
+ for doc, meta, score in candidates:
17
+ key = (doc[:200], str(meta.get("source", "")))
18
+ if key in seen:
19
+ continue
20
+ seen.add(key)
21
+ out.append((doc, meta, score))
22
+ return out
23
+
24
+
25
def _vector_candidates(
    queries: list[str],
    *,
    embedder: EmbeddingProvider,
    embedding_model: str | None,
    top_k_retrieve: int,
    where: dict | None,
) -> list[tuple[str, dict, float]]:
    """Run a vector search for each query and merge the unique hits.

    Each query is embedded on its own and looked up with ``chroma_retrieve``
    (``top_k_retrieve`` hits, filtered by ``where``). A hit is kept only the
    first time its (first 200 characters of text, ``"source"`` metadata) pair
    appears.

    Returns:
        ``(text, metadata, -distance)`` tuples in first-seen order.
    """
    unique: dict[tuple[str, str], tuple[str, dict, float]] = {}
    for text in queries:
        vector = embedder.embed([text], model=embedding_model)[0]
        for doc, meta, dist in chroma_retrieve(vector, top_k=top_k_retrieve, where=where):
            ident = (doc[:200], str(meta.get("source", "")))
            if ident not in unique:
                # Distance is negated — presumably so a larger value means a
                # closer match; confirm against the vector store's metric.
                unique[ident] = (doc, meta, -dist)
    return list(unique.values())
45
+
46
+
12
47
  def retrieve(
13
48
  query: str,
14
49
  *,
@@ -20,42 +55,47 @@ def retrieve(
20
55
  top_k_retrieve: int | None = None,
21
56
  top_k_rerank: int | None = None,
22
57
  rerank_enabled: bool | None = None,
58
+ metadata_filter: dict | None = None,
23
59
  llm: LLMProvider | None = None,
24
60
  llm_model: str | None = None,
25
61
  ) -> list[tuple[str, dict[str, Any], float]]:
26
62
  """
27
- Retrieve relevant chunks using vector or multi-query search, then rerank.
63
+ Retrieve relevant chunks using vector, multi-query, or hybrid search, then rerank.
28
64
  Returns list of (document_text, metadata, rerank_score).
29
65
  """
30
66
  top_k_retrieve = top_k_retrieve or TOP_K_RETRIEVE
31
67
  top_k_rerank = top_k_rerank or TOP_K_RERANK
32
68
  n_queries = n_queries or MULTI_QUERY_N
33
- use_multi_query = retriever == "multi_query" if multi_query is None else multi_query
34
69
 
35
- queries = [query]
36
- if use_multi_query and n_queries > 1:
37
- rewritten = rewrite_for_retrieval(query, n_queries=n_queries, llm=llm, llm_model=llm_model)
38
- if rewritten:
39
- queries = rewritten
70
+ if retriever == "hybrid":
71
+ emb = embedder.embed([query], model=embedding_model)[0]
72
+ vector_hits = chroma_retrieve(emb, top_k=top_k_retrieve, where=metadata_filter)
73
+ vector_ranked = [(d, m, -dist) for d, m, dist in vector_hits]
40
74
 
41
- seen_docs: set[str] = set()
42
- all_candidates: list[tuple[str, dict, float]] = []
43
- for q in queries:
44
- emb = embedder.embed([q], model=embedding_model)[0]
45
- hits = chroma_retrieve(emb, top_k=top_k_retrieve)
46
- for doc, meta, dist in hits:
47
- key = (doc[:200], meta.get("source", ""))
48
- if key in seen_docs:
49
- continue
50
- seen_docs.add(key)
51
- all_candidates.append((doc, meta, -dist))
75
+ docs, metas = list_documents(where=metadata_filter)
76
+ bm25_ranked = bm25_retrieve(query, docs, metas, top_k=top_k_retrieve)
77
+ fused = reciprocal_rank_fusion([vector_ranked, bm25_ranked])[:top_k_retrieve]
78
+ all_candidates = _dedupe_candidates(fused)
79
+ else:
80
+ use_multi_query = retriever == "multi_query" if multi_query is None else multi_query
81
+ queries = [query]
82
+ if use_multi_query and n_queries > 1:
83
+ rewritten = rewrite_for_retrieval(query, n_queries=n_queries, llm=llm, llm_model=llm_model)
84
+ if rewritten:
85
+ queries = rewritten
86
+ all_candidates = _vector_candidates(
87
+ queries,
88
+ embedder=embedder,
89
+ embedding_model=embedding_model,
90
+ top_k_retrieve=top_k_retrieve,
91
+ where=metadata_filter,
92
+ )
52
93
 
53
94
  if not all_candidates:
54
95
  return []
96
+
55
97
  docs = [c[0] for c in all_candidates]
56
98
  metas = [c[1] for c in all_candidates]
57
- reranked = rerank_with_metadata(
99
+ return rerank_with_metadata(
58
100
  query, list(zip(docs, metas)), top_k=top_k_rerank, rerank_enabled=rerank_enabled
59
101
  )
60
- return reranked
61
-
@@ -85,6 +85,19 @@ def retrieve(
85
85
  return list(zip(docs, metas, dists))
86
86
 
87
87
 
88
def list_documents(
    *,
    where: dict | None = None,
    limit: int | None = None,
) -> tuple[list[str], list[dict[str, Any]]]:
    """Fetch stored chunk texts and their metadata from the collection (for BM25 indexing).

    ``where`` and ``limit`` are passed straight through to the collection's
    ``get`` call. Missing or empty result fields come back as empty lists.

    Returns:
        A ``(documents, metadatas)`` pair.
    """
    result = get_collection().get(
        where=where,
        include=["documents", "metadatas"],
        limit=limit,
    )
    return (result.get("documents") or [], result.get("metadatas") or [])
99
+
100
+
88
101
  def delete_all() -> None:
89
102
  """Remove all documents from the collection (for re-ingestion)."""
90
103
  _get_client().delete_collection(COLLECTION_NAME)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rag-python
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Production-grade RAG for Python: multi-LLM, query rewriting, reranking, guardrails, and evaluation.
5
5
  Author-email: Raghav Singla <04raghavsingla28@gmail.com>
6
6
  License: MIT
@@ -35,6 +35,8 @@ Requires-Dist: sentence-transformers>=2.2.0; extra == "rerank"
35
35
  Requires-Dist: torch>=2.0.0; extra == "rerank"
36
36
  Provides-Extra: local
37
37
  Requires-Dist: sentence-transformers>=2.2.0; extra == "local"
38
+ Provides-Extra: hybrid
39
+ Requires-Dist: rank-bm25>=0.2.2; extra == "hybrid"
38
40
  Provides-Extra: anthropic
39
41
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
40
42
  Provides-Extra: gemini
@@ -44,8 +46,9 @@ Requires-Dist: pytest>=7.0; extra == "dev"
44
46
  Requires-Dist: ruff>=0.1.0; extra == "dev"
45
47
  Requires-Dist: build; extra == "dev"
46
48
  Requires-Dist: twine; extra == "dev"
49
+ Requires-Dist: rank-bm25>=0.2.2; extra == "dev"
47
50
  Provides-Extra: all
48
- Requires-Dist: rag-python[anthropic,gemini,local,rerank]; extra == "all"
51
+ Requires-Dist: rag-python[anthropic,gemini,hybrid,local,rerank]; extra == "all"
49
52
 
50
53
  # rag-python
51
54
 
@@ -67,10 +70,11 @@ Ingest your documents, ask questions, get grounded answers — with query rewrit
67
70
  ## Features
68
71
 
69
72
  - Document pipeline: loaders → cleaning → chunking → embeddings → ChromaDB
70
- - Query pipeline: rewriting → multi-query retrieval → reranking
73
+ - Query pipeline: rewriting → multi-query / **hybrid** retrieval → reranking
71
74
  - Generation with guardrails (prompt injection + hallucination checks)
72
75
  - Evaluation scores + self-correction retry loop
73
76
  - **LLM providers:** OpenAI, Azure OpenAI, Anthropic, Gemini, Ollama
77
+ - **Loaders:** TXT, MD, PDF, DOCX, CSV, JSON, HTML
74
78
 
75
79
  ---
76
80
 
@@ -81,7 +85,7 @@ pip install rag-python
81
85
  # or from source
82
86
  pip install -e .
83
87
  # with reranking + extra providers
84
- pip install -e ".[rerank,local,anthropic,gemini,all]"
88
+ pip install -e ".[rerank,local,hybrid,anthropic,gemini,all]"
85
89
  ```
86
90
 
87
91
  ---
@@ -103,12 +107,26 @@ answer = rag.query("How many days of annual leave?")
103
107
  print(answer.text)
104
108
  ```
105
109
 
110
+ ### Hybrid search + metadata filter
111
+
112
+ ```python
113
+ from rag_python import RAG, SearchConfig
114
+
115
+ rag = RAG(
116
+ retriever="hybrid", # pip install rag-python[hybrid]
117
+ metadata_filter={"filename": "leave-policy.pdf"},
118
+ )
119
+ rag.ingest(["./policies/leave-policy.pdf", "./policies/handbook.pdf"])
120
+ answer = rag.query("How many days of annual leave?")
121
+ ```
122
+
106
123
  ### CLI
107
124
 
108
125
  ```bash
109
126
  export OPENAI_API_KEY=sk-...
110
127
  rag-python ingest ./data --reindex
111
128
  rag-python query "How many days of annual leave?" -v
129
+ rag-python query "leave policy" --retriever hybrid --metadata-filter '{"filename": "leave-policy.pdf"}'
112
130
  ```
113
131
 
114
132
  ---
@@ -1,20 +1,21 @@
1
- rag_python/__init__.py,sha256=4XDhojztA62P_jxiZ0maaVr8UtcVqbcQLjA_MaIZ8X8,834
1
+ rag_python/__init__.py,sha256=TzZxXzRdKszqqbq7KynrO-Cc0JMzZc1UcIxtNSLhvqQ,834
2
2
  rag_python/chunking.py,sha256=P1dbZ8ZY7487MxrWe2cypCiKhzIJ8zBPCTVz20vt8fo,6204
3
3
  rag_python/cleaning.py,sha256=fSux4T0pg7Xe_8NUP2pgzuForyRk1i2VPYIXSzRajzs,3193
4
- rag_python/cli.py,sha256=Cm7P-ryNrb2m2VEp293KbL2z2U_KSooP50wY044fmh4,3481
5
- rag_python/client.py,sha256=MhWAm92Ic2FQ1DTej4EhAlT9UoPN-GjxA0xrHIvwNA8,7656
4
+ rag_python/cli.py,sha256=z22LLX6dWnMlaI9yIU2tf4HpcLbG2zRz66RQWsFxGNY,4775
5
+ rag_python/client.py,sha256=RyWLBvj4bAJW1Vb529me7Eo608e9Wwq-OeImAAKjyIY,7838
6
6
  rag_python/config.py,sha256=Zw8TjQFKRvOUHpIb7kjEb7DtPFoYPzdQyOPzSXTqDcc,1389
7
- rag_python/document_loaders.py,sha256=izguVJjPq8v4hDWC8wGP2-LwiYUJbVe-DOsIX6n9J9E,2429
7
+ rag_python/document_loaders.py,sha256=blI-rMqzmHSHzcX9RmFBQZ_MYiM_uKLvesCDTPyoQbo,4866
8
8
  rag_python/evaluation.py,sha256=gTiXMaAtTUIsV6Ffhywz829BhfR8YhfJFkYZYrD9WYI,3561
9
9
  rag_python/generation.py,sha256=t6aSct2vZELIf20JDwRVt8UTwPnTXx0bU3TKoliiwVg,1108
10
10
  rag_python/guardrails.py,sha256=hJLXvpPNI9o8emyipSy5PpePofGzktlDLyMAXfAxUXs,2520
11
- rag_python/options.py,sha256=QvangjsYbct204_p_avraAuw_Ry2mcjhxby2Sx96dE0,1858
11
+ rag_python/hybrid_search.py,sha256=71kZyJ9obZBZGzhrl1DQjK32X4AtFppk_wvmpkUVzwo,1814
12
+ rag_python/options.py,sha256=P_nLMk7vQdRM11HCoR9AMUk2D0NmEVA5B5_ufhRiAmE,1935
12
13
  rag_python/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
14
  rag_python/query_rewriting.py,sha256=og_XWai2-08C7W67mFndA3k-aTxMdqGnu70qHi1Ohgc,2293
14
- rag_python/rag_pipeline.py,sha256=R8roEc-OSFPqrjZmvJXp7IhYaOdqd6kZ63otN1PUKTI,8747
15
+ rag_python/rag_pipeline.py,sha256=qth2LDVi6QxpqJVskjLxaxnpwpV5dKwh515334fc8DY,9058
15
16
  rag_python/reranker.py,sha256=8RxCPfgp80c-KSKojllGzbpZ7iSku-i7VLgPHa1a3rk,2181
16
- rag_python/retrieval.py,sha256=A9ZMIkrifyDF1rhtgNt_uR09o1_hCkywIuODjvLaY6Q,2261
17
- rag_python/vector_store.py,sha256=16I9g7Q8bMjwpxhTNv0nCq2WXQwftjZdSaxFNteVsH4,2909
17
+ rag_python/retrieval.py,sha256=iTlkaCs79iDDa_K9gktjJC9bAE0bHzy302CFGwmwEk0,3887
18
+ rag_python/vector_store.py,sha256=iAjGRXtzvh9F3aQJVRZ7abUfvwR5YM-qQ0N52qwJGmw,3340
18
19
  rag_python/providers/__init__.py,sha256=SjhMvYoA30EY5VUYVXhEGwcmQnIU2tUomcNE0_0NFho,215
19
20
  rag_python/providers/anthropic_provider.py,sha256=dSiCdM4F90jI9w7z_wS10XuVsX-pR733-cAgJHtVV2Q,1493
20
21
  rag_python/providers/azure_openai_provider.py,sha256=8SbI7rDzQgvC4ZXP89Q8kjfqeWuBfX1KKgExGLFkmx0,1940
@@ -24,9 +25,9 @@ rag_python/providers/gemini_provider.py,sha256=OZzs1YJQSZituoxS5Gk8yv3jYNIFY1SVo
24
25
  rag_python/providers/local_provider.py,sha256=tgYBNUrs7pKpPebA0tpNhJmtZLwwINuZFqKMyHlymTQ,1332
25
26
  rag_python/providers/ollama_provider.py,sha256=DDhDriB6-Ob0r2-M-P3SvIFG37ruDAErtU7LWDK8xh0,1958
26
27
  rag_python/providers/openai_provider.py,sha256=oR7rCCaxCtirAVetJrR4oC3UrWySuqLc9kbosydoQAQ,1585
27
- rag_python-0.2.0.dist-info/LICENSE,sha256=PZ61Z6ve0hBHgztaC1rPgnxQTRXRkeHKASlnKkX2pvc,1079
28
- rag_python-0.2.0.dist-info/METADATA,sha256=2_E_U0z3lYdjXhHAYRrqqDQaetoIsnUI2BrWL-GErfs,5506
29
- rag_python-0.2.0.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
30
- rag_python-0.2.0.dist-info/entry_points.txt,sha256=558Rd4GWV_6mIyqdRSVNE4ZZi0-KdblTZhcMbIn3ryY,51
31
- rag_python-0.2.0.dist-info/top_level.txt,sha256=SrgudPwkJWfJ3gUn2n-dhrt9vN2XbQcaZ3wLQZed4Z4,11
32
- rag_python-0.2.0.dist-info/RECORD,,
28
+ rag_python-0.3.0.dist-info/LICENSE,sha256=PZ61Z6ve0hBHgztaC1rPgnxQTRXRkeHKASlnKkX2pvc,1079
29
+ rag_python-0.3.0.dist-info/METADATA,sha256=iIp2OG2jfo7xVYYQCQf264ZAFBeIhecfs5lIy-XTLZo,6171
30
+ rag_python-0.3.0.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
31
+ rag_python-0.3.0.dist-info/entry_points.txt,sha256=558Rd4GWV_6mIyqdRSVNE4ZZi0-KdblTZhcMbIn3ryY,51
32
+ rag_python-0.3.0.dist-info/top_level.txt,sha256=SrgudPwkJWfJ3gUn2n-dhrt9vN2XbQcaZ3wLQZed4Z4,11
33
+ rag_python-0.3.0.dist-info/RECORD,,