multi-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
multi_rag/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .main import embed,embed_from_doc,embed_from_image,embed_from_pdf,embed_from_txt,embed_from_xlsx,retrieve,query
multi_rag/main.py ADDED
@@ -0,0 +1,325 @@
1
+ import pymupdf
2
+ from PIL import Image
3
+ import io
4
+ import os
5
+ import re
6
+ from sentence_transformers import SentenceTransformer
7
+ from chromadb import PersistentClient
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ import json
10
+ import base64
11
+ from langchain_core.messages import HumanMessage
12
+ from langchain.chat_models import init_chat_model
13
+ import zipfile
14
+ import docx
15
+ import pandas as pd
16
+
17
# Chat model that query() invokes with the retrieved context.
# NOTE(review): this is constructed at import time -- confirm whether it
# needs provider credentials even when only embed()/retrieve() are used.
model = init_chat_model(model="gemini-2.5-flash", model_provider="google-genai")

# Separate encoders: one for images, one for text chunks and table markdown.
image_embedder = SentenceTransformer("clip-ViT-B-32")
text_embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")  # "BAAI/bge-m3"

# Persistent Chroma client with one collection per kind of content.
client = PersistentClient(path="./chroma_db")
text_store = client.get_or_create_collection(
    "rag-text",
    configuration={"hnsw": {"space": "cosine"}},
)
image_store = client.get_or_create_collection(
    "rag-image",
    configuration={"hnsw": {"space": "ip"}},
)
table_store = client.get_or_create_collection(
    "rag-table",
    configuration={"hnsw": {"space": "cosine"}},
)

# Splitter shared by every text-producing embed_* function.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Scratch area holding the JSON chunk lookups and the extracted images.
os.makedirs("./temp", exist_ok=True)
os.makedirs("./temp/images", exist_ok=True)
28
+
29
def docnamer(docname: str):
    """Return a reference name derived from *docname* for use in IDs and paths.

    The characters ``<``, ``>``, ``:``, ``/``, backslash, ``|``, ``?`` and
    ``*`` are each replaced with ``.``, spaces are replaced with ``-``, and
    any leading dots left over (e.g. from ``./``) are stripped.
    """
    sanitized = re.sub(r"[<>:/\\|?*]", '.', docname).replace(' ', '-')
    return sanitized.lstrip('.')
35
+
36
def embed(docname: str):
    """Embed the file at *docname* with the handler that matches its extension.

    The extension is compared case-insensitively, so ``report.PDF`` is
    handled the same way as ``report.pdf``.

    Args:
        docname: Path of the file to embed.

    Raises:
        ValueError: If the file has no extension or the extension is not one
            of docx, doc, pdf, txt, png, jpeg, jpg or xlsx.
    """
    # splitext only looks at the final path component, so a dot in a parent
    # directory name is never mistaken for the file extension.
    ext = os.path.splitext(docname)[1][1:].lower()

    if ext in ("docx", "doc"):
        # NOTE(review): embed_from_doc opens the file as a zip archive, so a
        # legacy binary .doc will presumably fail there -- confirm.
        embed_from_doc(docname)
    elif ext == "pdf":
        embed_from_pdf(docname)
    elif ext == "txt":
        embed_from_txt(docname)
    elif ext in ("png", "jpeg", "jpg"):
        embed_from_image(docname)
    elif ext == "xlsx":
        embed_from_xlsx(docname)
    else:
        # Failing loudly beats silently embedding nothing.
        raise ValueError(f"Unsupported file type for embedding: {docname!r}")
48
+
49
def embed_from_pdf(docname: str):
    """Extract text chunks, images and tables from a PDF and embed them.

    For every page the text is split into chunks, each table is converted to
    markdown, and each embedded image is decoded. Images are written to
    ``./temp/images/<ref_docname>/<id>.png``; chunks and tables are merged
    into ``./temp/text.json`` and ``./temp/tables.json`` under the document's
    reference name; embeddings are upserted into the text, image and table
    collections.

    Args:
        docname: Path of the PDF file.
    """
    ref_docname = docnamer(docname)
    image_dir = f"./temp/images/{ref_docname}"
    # Loop-invariant, so create it once rather than once per page.
    os.makedirs(image_dir, exist_ok=True)

    tmd, imd, tamd = [], [], []        # metadata: text / image / table
    tids, iids, taids = [], [], []     # ids:      text / image / table
    splits, images, tables = [], [], []

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(docname) as doc:
        for page_no, page in enumerate(doc):
            for table_no, table in enumerate(page.find_tables().tables):
                tables.append(table.to_markdown())
                taids.append(f"{ref_docname}_p{page_no}_t{table_no}")
                tamd.append({"docname": f"{ref_docname}", "page": f"{page_no}", "table": f"{table_no}", "type": "table"})

            for image_no, image_info in enumerate(page.get_images()):
                # The first field of each entry is passed to extract_image,
                # exactly as before.
                extracted = doc.extract_image(image_info[0])
                image = Image.open(io.BytesIO(extracted['image']))
                # Decode now so the image no longer depends on lazy reads.
                image.load()
                if image.mode == "CMYK":
                    # PNG cannot store CMYK; convert so the save below works.
                    image = image.convert("RGB")
                images.append(image)
                iids.append(f"{ref_docname}_p{page_no}_i{image_no}")
                imd.append({"docname": f"{ref_docname}", "page": f"{page_no}", "imno": f"{image_no}", "type": "image/png"})

            page_chunks = text_splitter.split_text(page.get_text())
            for chunk_no in range(len(page_chunks)):
                tids.append(f"{ref_docname}_p{page_no}_c{chunk_no}")
                tmd.append({"docname": f"{ref_docname}", "page": f"{page_no}", "chunk": f"{chunk_no}", "type": "text"})
            splits.extend(page_chunks)

    for iid, image in zip(iids, images):
        image.save(f"{image_dir}/{iid}.png")

    # Merge this document's chunks/tables into the on-disk lookups that
    # retrieve() reads back by document name and id.
    for lookup_path, ids, values in (
        ("./temp/text.json", tids, splits),
        ("./temp/tables.json", taids, tables),
    ):
        if os.path.exists(lookup_path):
            with open(lookup_path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname, {}).update(zip(ids, values))
        with open(lookup_path, "w", encoding="utf-8") as f:
            json.dump(lookup, f, indent=4)

    # Each upsert is skipped when there is nothing of that kind to store.
    if splits:
        t_emb = text_embedder.encode(splits)
        text_store.upsert(ids=tids, embeddings=t_emb, metadatas=tmd)
    if images:
        i_emb = image_embedder.encode(images)
        image_store.upsert(ids=iids, embeddings=i_emb, metadatas=imd)
    if tables:
        ta_emb = text_embedder.encode(tables)
        table_store.upsert(ids=taids, embeddings=ta_emb, metadatas=tamd)
117
+
118
def embed_from_doc(docname: str):
    """Extract text chunks, images and tables from a .docx file and embed them.

    Images are read from the ``word/media/`` entries of the archive and saved
    to ``./temp/images/<ref_docname>/``. Non-empty paragraphs are split into
    chunks and tables are rendered as markdown; both are merged into
    ``./temp/text.json`` and ``./temp/tables.json``. Embeddings are upserted
    into the text, image and table collections.

    A media entry that cannot be decoded or re-saved is skipped with a
    warning instead of aborting the whole document.

    Args:
        docname: Path of the .docx file.
    """
    import warnings  # local import: only used to report skipped media

    ref_docname = docnamer(docname)
    image_dir = f"./temp/images/{ref_docname}"
    os.makedirs(image_dir, exist_ok=True)

    tmd, imd, tamd = [], [], []        # metadata: text / image / table
    tids, iids, taids = [], [], []     # ids:      text / image / table
    splits, images, tables = [], [], []

    # 1. Extract and save images from the zip archive.
    with zipfile.ZipFile(docname, "r") as archive:
        for file in archive.namelist():
            if not file.startswith("word/media/"):
                continue
            ext = file.split(".")[-1]
            # Number images by how many were stored so far, so ids stay
            # contiguous when an entry is skipped.
            image_no = len(images)
            img_id = f"{ref_docname}_i{image_no}"
            try:
                img = Image.open(io.BytesIO(archive.read(file)))
                # Force the pixel data into memory immediately.
                img.load()
                img.save(f"{image_dir}/{img_id}.{ext}")
            except (OSError, ValueError) as exc:
                # Unsupported or corrupt media should not discard the
                # document's text and tables.
                warnings.warn(f"Skipping unreadable embedded image {file!r}: {exc}")
                continue
            images.append(img)
            iids.append(img_id)
            imd.append({"docname": f"{ref_docname}", "imno": f"{image_no}", "type": f"image/{ext}"})

    # 2. Extract text and convert tables to markdown strings.
    doc = docx.Document(docname)

    paragraph_index = 0  # counts only non-empty paragraphs
    for p in doc.paragraphs:
        text_content = p.text.strip()
        if not text_content:
            continue
        split = text_splitter.split_text(text_content)
        splits.extend(split)
        for chunk_index in range(len(split)):
            tids.append(f"{ref_docname}_p{paragraph_index}_c{chunk_index}")
            tmd.append({"docname": f"{ref_docname}", "pno": f"{paragraph_index}", "chunk": f"{chunk_index}", "type": "text"})
        paragraph_index += 1

    for table_idx, table in enumerate(doc.tables):
        markdown_rows = []
        for row_idx, row in enumerate(table.rows):
            row_cells = [cell.text.strip() for cell in row.cells]
            markdown_rows.append("| " + " | ".join(row_cells) + " |")
            # The first row is treated as the header, so the markdown
            # separator line goes directly beneath it.
            if row_idx == 0:
                markdown_rows.append("|" + "|".join(["---"] * len(row_cells)) + "|")
        tables.append("\n".join(markdown_rows))
        taids.append(f"{ref_docname}_t{table_idx}")
        tamd.append({"docname": f"{ref_docname}", "table": f"{table_idx}", "type": "table"})

    # Merge this document's chunks/tables into the on-disk lookups that
    # retrieve() reads back by document name and id.
    for lookup_path, ids, values in (
        ("./temp/text.json", tids, splits),
        ("./temp/tables.json", taids, tables),
    ):
        if os.path.exists(lookup_path):
            with open(lookup_path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname, {}).update(zip(ids, values))
        with open(lookup_path, "w", encoding="utf-8") as f:
            json.dump(lookup, f, indent=4)

    # Each upsert is skipped when there is nothing of that kind to store.
    if splits:
        t_emb = text_embedder.encode(splits)
        text_store.upsert(ids=tids, embeddings=t_emb, metadatas=tmd)
    if images:
        i_emb = image_embedder.encode(images)
        image_store.upsert(ids=iids, embeddings=i_emb, metadatas=imd)
    if tables:
        ta_emb = text_embedder.encode(tables)
        table_store.upsert(ids=taids, embeddings=ta_emb, metadatas=tamd)
211
+
212
def embed_from_txt(docname: str):
    """Split a UTF-8 text file into chunks, embed them and record them.

    Chunk embeddings are upserted into the text collection and the chunk
    texts are merged into ``./temp/text.json`` under the document's
    reference name.

    Args:
        docname: Path of the text file.
    """
    ref_docname = docnamer(docname)
    with open(docname, "r", encoding="utf-8") as f:
        text = f.read()

    splits = text_splitter.split_text(text)
    ids = [f"{ref_docname}_c{i}" for i in range(len(splits))]
    mds = [{"docname": ref_docname, "chunk": i, "type": "text"} for i in range(len(splits))]

    # An empty file yields no chunks; skip the embedding step, matching the
    # `if splits:` guard used by the PDF and DOCX handlers.
    if splits:
        emb = text_embedder.encode(splits)
        text_store.upsert(ids=ids, metadatas=mds, embeddings=emb)

    if os.path.exists("./temp/text.json"):
        with open("./temp/text.json", "r", encoding="utf-8") as f:
            text_lookup = json.load(f)
    else:
        text_lookup = {}
    text_lookup.setdefault(ref_docname, {}).update(zip(ids, splits))
    with open("./temp/text.json", "w", encoding="utf-8") as f:
        json.dump(text_lookup, f, indent=4)
230
+
231
def embed_from_image(docname: str):
    """Embed a standalone image file and keep a copy for retrieval.

    The copy is written to ``./temp/images/<ref_docname>/<id>.<ext>``, which
    is the path retrieve() rebuilds from the stored id and the ``type``
    metadata. (Previously the copy was saved without the id and extension,
    so retrieval could not find it.)

    Args:
        docname: Path of the image file.
    """
    ref_docname = docnamer(docname)
    ext = docname[docname.rfind('.') + 1:]
    image_id = f"{ref_docname}_0"
    image_dir = f"./temp/images/{ref_docname}"
    os.makedirs(image_dir, exist_ok=True)

    # The context manager releases the underlying file handle when done.
    with Image.open(docname) as img:
        emb = image_embedder.encode([img])
        # Save before upserting so the store never points at a missing file.
        img.save(f"{image_dir}/{image_id}.{ext}")

    md = [{"docname": f"{ref_docname}", "imno": 0, "type": f"image/{ext}"}]
    image_store.upsert(embeddings=emb, ids=[image_id], metadatas=md)
240
+
241
def embed_from_xlsx(docname: str):
    """Embed an Excel workbook as a single markdown table.

    The sheet returned by ``pd.read_excel(docname)`` is rendered to markdown,
    embedded with the text encoder and upserted into the table collection.
    The markdown is recorded in ``./temp/tables.json``, the lookup that
    retrieve() uses to resolve table hits. (It was previously written to
    ``./temp/text.json``, so table lookups returned "NOT FOUND".)

    Args:
        docname: Path of the .xlsx file.
    """
    ref_docname = docnamer(docname)
    df = pd.read_excel(docname)

    table_markdown = [df.to_markdown()]
    table_ids = [f"{ref_docname}_0"]
    md = [{"docname": ref_docname, "type": "table", "table": "table"}]

    emb = text_embedder.encode(table_markdown)
    table_store.upsert(embeddings=emb, ids=table_ids, metadatas=md)

    if os.path.exists("./temp/tables.json"):
        with open("./temp/tables.json", "r", encoding="utf-8") as f:
            table_lookup = json.load(f)
    else:
        table_lookup = {}
    table_lookup.setdefault(ref_docname, {}).update(zip(table_ids, table_markdown))
    with open("./temp/tables.json", "w", encoding="utf-8") as f:
        json.dump(table_lookup, f, indent=4)
258
+
259
def retrieve(query: str):
    """Return the stored text chunks, tables and images closest to *query*.

    The query is encoded with the text encoder (for the text and table
    collections) and with the image encoder (for the image collection). Hits
    are resolved to their content through ``./temp/text.json``,
    ``./temp/tables.json`` and the files under ``./temp/images/``.

    A lookup file that does not exist yet is treated as empty, so retrieval
    works even when only some kinds of document have been embedded.

    Args:
        query: The natural-language query.

    Returns:
        A dict with the keys ``"images"`` (base64-encoded file contents),
        ``"text"`` (chunk strings) and ``"tables"`` (table strings). A text
        or table hit missing from its lookup is reported as ``"NOT FOUND"``.
    """
    lookups = {}
    for kind, lookup_path in (("text", "./temp/text.json"), ("tables", "./temp/tables.json")):
        if os.path.exists(lookup_path):
            with open(lookup_path, "r", encoding="utf-8") as f:
                lookups[kind] = json.load(f)
        else:
            # e.g. tables.json is absent when only .txt files were embedded.
            lookups[kind] = {}
    text_lookup = lookups["text"]
    table_lookup = lookups["tables"]

    qt_emb = text_embedder.encode(query)
    qi_emb = image_embedder.encode(query)

    text_results = text_store.query(query_embeddings=[qt_emb], n_results=5)
    image_results = image_store.query(query_embeddings=[qi_emb], n_results=2)
    table_results = table_store.query(query_embeddings=[qt_emb], n_results=3)

    # Each result field holds one list per query; a single query was sent,
    # so index 0 selects its hits.
    chunks = [
        text_lookup.get(meta["docname"], {}).get(text_id, "NOT FOUND")
        for text_id, meta in zip(text_results["ids"][0], text_results["metadatas"][0])
    ]

    images = []
    for image_id, meta in zip(image_results["ids"][0], image_results["metadatas"][0]):
        # "type" is stored as "image/<ext>"; dropping the first six
        # characters leaves the file extension.
        image_path = f"./temp/images/{meta['docname']}/{image_id}.{meta['type'][6:]}"
        with open(image_path, "rb") as f:
            images.append(base64.b64encode(f.read()).decode("utf-8"))

    tables = [
        table_lookup.get(meta["docname"], {}).get(table_id, "NOT FOUND")
        for table_id, meta in zip(table_results["ids"][0], table_results["metadatas"][0])
    ]

    return {
        "images": images,
        "text": chunks,
        "tables": tables
    }
294
+
295
def _image_data_url(b64_image: str) -> str:
    """Build a data URL for a base64 image, detecting its MIME type."""
    # 16 base64 characters decode to 12 bytes, enough for these signatures.
    header = base64.b64decode(b64_image[:16])
    if header.startswith(b"\xff\xd8\xff"):
        mime = "image/jpeg"
    elif header.startswith(b"GIF8"):
        mime = "image/gif"
    else:
        # PNG, and the historical default for anything unrecognised.
        mime = "image/png"
    return f"data:{mime};base64,{b64_image}"


def query(query: str):
    """Answer *query* with the chat model, grounded on retrieved context.

    The retrieved text chunks, tables and images are sent to the model in a
    single human message, after an instruction that asks it to answer
    'Out of Context' when the context does not cover the question.

    Args:
        query: The question to answer.

    Returns:
        A two-element list: the dict returned by retrieve(), followed by the
        content of the model's response.
    """
    result = retrieve(query)
    parts = [
        {
            "type": "text",
            "text": f"""
Answer the Question based on the Context given.
If the given Context doesn't contain any information regarding the Question, Answer 'Out of Context'

Question:{query}"""
        }
    ]
    parts.extend({"type": "text", "text": text} for text in result["text"])
    parts.extend({"type": "text", "text": f"TABLE:\n{table}"} for table in result["tables"])
    # Stored images may be JPEG or GIF as well as PNG, so label each one
    # with its real MIME type instead of always claiming PNG.
    parts.extend(
        {"type": "image_url", "image_url": _image_data_url(image)}
        for image in result["images"]
    )

    message = HumanMessage(content=parts)
    response = model.invoke([message])
    return [result, response.content]
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: multi_rag
3
+ Version: 0.1.0
4
+ Summary: A module to facilitate local testing of RAG pipeline for multiple datatypes
5
+ Project-URL: Homepage, https://github.com
6
+ Author-email: Phani Srimadhav Mula <phanisrimadhav.mula@gmail.com>
7
+ License-File: LICENSE
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.8
12
+ Requires-Dist: chromadb>=0.5.0
13
+ Requires-Dist: langchain-core>=0.2.0
14
+ Requires-Dist: langchain-text-splitters>=0.2.0
15
+ Requires-Dist: langchain>=0.2.0
16
+ Requires-Dist: openpyxl>=3.1.0
17
+ Requires-Dist: pandas>=2.0.0
18
+ Requires-Dist: pillow>=10.0.0
19
+ Requires-Dist: pymupdf>=1.24.0
20
+ Requires-Dist: python-docx>=1.1.0
21
+ Requires-Dist: sentence-transformers>=2.5.0
22
+ Description-Content-Type: text/markdown
23
+
24
+ # multi_rag
25
+ ### This module facilitates testing RAG pipelines on local machines using ChromaDB, with text embeddings from 'bge-base-en-v1.5' and image embeddings from 'clip-ViT-B-32'.
26
+
27
+ ### Currently this module supports pdf, docx, xlsx, png, jpg, jpeg and txt file formats
28
+
29
+ #### Pass the path of a file to the `embed` function. It stores the embeddings in the chroma_db/ folder and the data chunks in the temp/ folder, which mimics the database that would hold them in a real deployment.
30
+
31
+ #### The `retrieve` function takes a query as input and returns a dictionary with the keys 'text', 'tables' and 'images'.
32
+
33
+ #### The `query` function takes a query as input and returns a list containing the retrieved data followed by the model's answer.
34
+
35
+ #### A Gemini API key is required to query, but the embedding and retrieval parts are completely local.
@@ -0,0 +1,6 @@
1
+ multi_rag/__init__.py,sha256=uy5AJ_g_I8wiAiokdNkupF_tpd8_5aY8v904aoG_rTo,116
2
+ multi_rag/main.py,sha256=YCabjfIZjZXhK9XsBESu_9X_6ag7mjZEjNKiBmSboE4,12355
3
+ multi_rag-0.1.0.dist-info/METADATA,sha256=69l99WJgLZ5FDrMRpd6-LrgO72KqER7kap7XYSIdnAc,1596
4
+ multi_rag-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
5
+ multi_rag-0.1.0.dist-info/licenses/LICENSE,sha256=TC00b67fDKxkxm5iZ-0u-0-Zb_OQpirO5U7x9tFJtBk,1070
6
+ multi_rag-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Srimadhav2007
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.