multi-rag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ # This workflow will upload a Python Package to PyPI when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ release-build:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+
25
+ - uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.x"
28
+
29
+ - name: Build release distributions
30
+ run: |
31
+ # NOTE: put your own distribution build steps here.
32
+ python -m pip install build
33
+ python -m build
34
+
35
+ - name: Upload distributions
36
+ uses: actions/upload-artifact@v4
37
+ with:
38
+ name: release-dists
39
+ path: dist/
40
+
41
+ pypi-publish:
42
+ runs-on: ubuntu-latest
43
+ needs:
44
+ - release-build
45
+ permissions:
46
+ # IMPORTANT: this permission is mandatory for trusted publishing
47
+ id-token: write
48
+
49
+ # Dedicated environments with protections for publishing are strongly recommended.
50
+ # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
51
+ environment:
52
+ name: pypi
53
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
54
+ # url: https://pypi.org/p/YOURPROJECT
55
+ #
56
+ # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
57
+ # ALTERNATIVE: exactly, uncomment the following line instead:
58
+ # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
59
+
60
+ steps:
61
+ - name: Retrieve release distributions
62
+ uses: actions/download-artifact@v4
63
+ with:
64
+ name: release-dists
65
+ path: dist/
66
+
67
+ - name: Publish release distributions to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
69
+ with:
70
+ packages-dir: dist/
File without changes
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Srimadhav2007
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,35 @@
1
+ Metadata-Version: 2.4
2
+ Name: multi_rag
3
+ Version: 0.1.0
4
+ Summary: A module to facilitate local testing of RAG pipeline for multiple datatypes
5
+ Project-URL: Homepage, https://github.com
6
+ Author-email: Phani Srimadhav Mula <phanisrimadhav.mula@gmail.com>
7
+ License-File: LICENSE
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Requires-Python: >=3.8
12
+ Requires-Dist: chromadb>=0.5.0
13
+ Requires-Dist: langchain-core>=0.2.0
14
+ Requires-Dist: langchain-text-splitters>=0.2.0
15
+ Requires-Dist: langchain>=0.2.0
16
+ Requires-Dist: openpyxl>=3.1.0
17
+ Requires-Dist: pandas>=2.0.0
18
+ Requires-Dist: pillow>=10.0.0
19
+ Requires-Dist: pymupdf>=1.24.0
20
+ Requires-Dist: python-docx>=1.1.0
21
+ Requires-Dist: sentence-transformers>=2.5.0
22
+ Description-Content-Type: text/markdown
23
+
24
+ # multi_rag
25
+ ### This module facilitates testing RAG pipelines on Local machines with chroma_db, with text embeddings from 'bge-base-en-v1.5' and image embeddings from 'clip-vit-b-32'
26
+
27
+ ### Currently this module supports pdf, docx, xlsx, png, jpg, jpeg and txt file formats
28
+
29
+ #### User can give the path of the file to the embed function, which sets up the chroma_db/ folder for the embeddings while temp/ folder gets set up to mimic the actual database to store the data chunks.
30
+
31
+ #### The `retrieve` function takes a query as input and returns a dictionary with the keys 'text', 'tables' and 'images'
32
+
33
+ #### query function takes query as input and returns the answer and retrieved data as output
34
+
35
+ #### A Gemini API key is required to query, but the embedding and retrieval parts are completely local
@@ -0,0 +1,12 @@
1
+ # multi_rag
2
+ ### This module facilitates testing RAG pipelines on Local machines with chroma_db, with text embeddings from 'bge-base-en-v1.5' and image embeddings from 'clip-vit-b-32'
3
+
4
+ ### Currently this module supports pdf, docx, xlsx, png, jpg, jpeg and txt file formats
5
+
6
+ #### User can give the path of the file to the embed function, which sets up the chroma_db/ folder for the embeddings while temp/ folder gets set up to mimic the actual database to store the data chunks.
7
+
8
+ #### The `retrieve` function takes a query as input and returns a dictionary with the keys 'text', 'tables' and 'images'
9
+
10
+ #### query function takes query as input and returns the answer and retrieved data as output
11
+
12
+ #### A Gemini API key is required to query, but the embedding and retrieval parts are completely local
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "multi_rag"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name="Phani Srimadhav Mula", email="phanisrimadhav.mula@gmail.com" },
10
+ ]
11
+ description = "A module to facilitate local testing of RAG pipeline for multiple datatypes"
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ dependencies = [
15
+ "pymupdf>=1.24.0",
16
+ "pillow>=10.0.0",
17
+ "sentence-transformers>=2.5.0",
18
+ "chromadb>=0.5.0",
19
+ "langchain-text-splitters>=0.2.0",
20
+ "langchain-core>=0.2.0",
21
+ "langchain>=0.2.0",
22
+ "python-docx>=1.1.0",
23
+ "pandas>=2.0.0",
24
+ "openpyxl>=3.1.0",
25
+ ]
26
+ classifiers = [
27
+ "Programming Language :: Python :: 3",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Operating System :: OS Independent",
30
+ ]
31
+
32
+ [project.urls]
33
+ "Homepage" = "https://github.com"
@@ -0,0 +1 @@
1
+ from .main import embed,embed_from_doc,embed_from_image,embed_from_pdf,embed_from_txt,embed_from_xlsx,retrieve,query
@@ -0,0 +1,325 @@
1
+ import pymupdf
2
+ from PIL import Image
3
+ import io
4
+ import os
5
+ import re
6
+ from sentence_transformers import SentenceTransformer
7
+ from chromadb import PersistentClient
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ import json
10
+ import base64
11
+ from langchain_core.messages import HumanMessage
12
+ from langchain.chat_models import init_chat_model
13
+ import zipfile
14
+ import docx
15
+ import pandas as pd
16
+
17
# Chat model used by query() to answer questions over the retrieved context.
# NOTE(review): this is constructed at import time, so importing the package
# presumably needs the Google GenAI LangChain integration and credentials to be
# available even when only embedding/retrieval is used -- confirm.
model=init_chat_model(model="gemini-2.5-flash",model_provider="google-genai")
# Embedding models: CLIP for images (and for text queries against the image
# collection), BGE for text chunks and tables.
image_embedder=SentenceTransformer("clip-ViT-B-32")
text_embedder=SentenceTransformer("BAAI/bge-base-en-v1.5") #"BAAI/bge-m3"

# Persistent Chroma database stored relative to the current working directory.
client=PersistentClient(path="./chroma_db")
# One collection per modality; text and tables use cosine space, images use
# inner product.
text_store = client.get_or_create_collection("rag-text",configuration={"hnsw":{"space":"cosine"}})
image_store = client.get_or_create_collection("rag-image",configuration={"hnsw":{"space":"ip"}})
table_store=client.get_or_create_collection("rag-table",configuration={"hnsw":{"space":"cosine"}})
# Splitter shared by every text-producing embed_* function.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
# ./temp holds the JSON lookups (id -> chunk/table) and the cached image files
# that retrieve() reads back.
os.makedirs("./temp", exist_ok=True)
os.makedirs("./temp/images", exist_ok=True)
28
+
29
def docnamer(docname: str):
    """Turn a document path into a flat reference name.

    Each of the characters ``<``, ``>``, ``:``, ``/``, backslash, ``|``, ``?``
    and ``*`` becomes ``.``, every space becomes ``-``, and leading dots are
    stripped from the result.
    """
    # One translation table covers both substitutions in a single pass; the
    # two source character sets do not overlap with either replacement.
    substitutions = dict.fromkeys(map(ord, "<>:/\\|?*"), ".")
    substitutions[ord(" ")] = "-"
    return docname.translate(substitutions).lstrip(".")
35
+
36
def embed(docname:str):
    """Embed a file by dispatching on its extension.

    Supported extensions (matched case-insensitively): ``docx``/``doc``,
    ``pdf``, ``txt``, ``png``/``jpeg``/``jpg`` and ``xlsx``. Files with any
    other extension are ignored and nothing is embedded.

    Args:
        docname: Path of the file to embed.

    Note:
        ``doc`` is routed to ``embed_from_doc``, which opens the file as a
        zip archive; legacy binary ``.doc`` files are therefore expected to
        fail there.
    """
    # Everything after the last dot, lower-cased so that "REPORT.PDF" or
    # "photo.JPG" are handled the same way as their lower-case spellings.
    ext = docname.rpartition(".")[2].lower()
    if ext in ("docx", "doc"):
        embed_from_doc(docname)
    elif ext == "pdf":
        embed_from_pdf(docname)
    elif ext == "txt":
        embed_from_txt(docname)
    elif ext in ("png", "jpeg", "jpg"):
        embed_from_image(docname)
    elif ext == "xlsx":
        embed_from_xlsx(docname)
48
+
49
def embed_from_pdf(docname:str):
    """Extract text, tables and images from a PDF and index them.

    For every page the text is split into chunks, tables are converted to
    markdown and embedded images are decoded. Chunks and tables are recorded
    in ``./temp/text.json`` and ``./temp/tables.json``, images are written as
    PNG files under ``./temp/images/<ref_docname>/``, and the embeddings are
    upserted into the text, table and image collections.

    Args:
        docname: Path of the PDF file.
    """
    ref_docname = docnamer(docname)
    image_dir = f"./temp/images/{ref_docname}"
    # Created once up front instead of once per page.
    os.makedirs(image_dir, exist_ok=True)

    def _merge_lookup(path, ids, values):
        """Merge ``ids -> values`` into this document's section of a JSON lookup."""
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname, {}).update(zip(ids, values))
        with open(path, "w", encoding="utf-8") as f:
            json.dump(lookup, f, indent=4)

    splits, tids, tmd = [], [], []
    images, iids, imd = [], [], []
    tables, taids, tamd = [], [], []

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(docname) as doc:
        for page_no, page in enumerate(doc):
            for table_no, table in enumerate(page.find_tables().tables):
                tables.append(table.to_markdown())
                taids.append(f"{ref_docname}_p{page_no}_t{table_no}")
                tamd.append({"docname": ref_docname, "page": str(page_no), "table": str(table_no), "type": "table"})

            for image_no, image_info in enumerate(page.get_images()):
                # The first item of each image entry is the xref to extract.
                extracted = doc.extract_image(image_info[0])
                images.append(Image.open(io.BytesIO(extracted["image"])))
                iids.append(f"{ref_docname}_p{page_no}_i{image_no}")
                imd.append({"docname": ref_docname, "page": str(page_no), "imno": str(image_no), "type": "image/png"})

            for chunk_no, chunk in enumerate(text_splitter.split_text(page.get_text())):
                splits.append(chunk)
                tids.append(f"{ref_docname}_p{page_no}_c{chunk_no}")
                tmd.append({"docname": ref_docname, "page": str(page_no), "chunk": str(chunk_no), "type": "text"})

    # PNG cannot store every Pillow mode (CMYK images are common in PDFs), so
    # convert those to RGB for the cached copy instead of failing on save.
    png_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")
    for iid, image in zip(iids, images):
        if image.mode not in png_modes:
            image = image.convert("RGB")
        image.save(f"{image_dir}/{iid}.png")

    _merge_lookup("./temp/text.json", tids, splits)
    _merge_lookup("./temp/tables.json", taids, tables)

    # Only touch a collection when there is something to add to it.
    if splits:
        t_emb = text_embedder.encode(splits)
        text_store.upsert(ids=tids, embeddings=t_emb, metadatas=tmd)
    if images:
        i_emb = image_embedder.encode(images)
        image_store.upsert(ids=iids, embeddings=i_emb, metadatas=imd)
    if tables:
        ta_emb = text_embedder.encode(tables)
        table_store.upsert(ids=taids, embeddings=ta_emb, metadatas=tamd)
117
+
118
def embed_from_doc(docname: str):
    """Extract images, text and tables from a .docx file and index them.

    Images are read from the ``word/media/`` entries of the archive and cached
    under ``./temp/images/<ref_docname>/``; paragraphs are split into chunks;
    tables are rendered as markdown. Chunks and tables are recorded in
    ``./temp/text.json`` and ``./temp/tables.json`` and all embeddings are
    upserted into their collections.

    Media entries that Pillow cannot decode or re-save are skipped so that one
    unsupported item does not prevent the document's text and tables from
    being embedded.

    Args:
        docname: Path of the .docx file.
    """
    ref_docname = docnamer(docname)
    image_dir = f"./temp/images/{ref_docname}"
    os.makedirs(image_dir, exist_ok=True)

    def _merge_lookup(path, ids, values):
        """Merge ``ids -> values`` into this document's section of a JSON lookup."""
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname, {}).update(zip(ids, values))
        with open(path, "w", encoding="utf-8") as f:
            json.dump(lookup, f, indent=4)

    splits, tids, tmd = [], [], []
    images, iids, imd = [], [], []
    tables, taids, tamd = [], [], []

    # 1. Extract and cache the images stored in the archive.
    with zipfile.ZipFile(docname, "r") as archive:
        for member in archive.namelist():
            if not member.startswith("word/media/"):
                continue
            ext = member.split(".")[-1]
            image_no = len(images)
            img_id = f"{ref_docname}_i{image_no}"
            try:
                img = Image.open(io.BytesIO(archive.read(member)))
                # Force the pixel data into memory while the bytes are at hand.
                img.load()
                img.save(f"{image_dir}/{img_id}.{ext}")
            except (OSError, ValueError):
                # Undecodable media or an extension Pillow cannot write:
                # skip this item and keep processing the document.
                continue
            images.append(img)
            iids.append(img_id)
            imd.append({"docname": ref_docname, "imno": str(image_no), "type": f"image/{ext}"})

    # 2. Extract paragraph text and convert tables to markdown strings.
    doc = docx.Document(docname)

    for paragraph_index, paragraph in enumerate(doc.paragraphs):
        text_content = paragraph.text.strip()
        if not text_content:
            continue
        for chunk_index, chunk in enumerate(text_splitter.split_text(text_content)):
            splits.append(chunk)
            tids.append(f"{ref_docname}_p{paragraph_index}_c{chunk_index}")
            tmd.append({"docname": ref_docname, "pno": str(paragraph_index), "chunk": str(chunk_index), "type": "text"})

    for table_idx, table in enumerate(doc.tables):
        markdown_rows = []
        for row_idx, row in enumerate(table.rows):
            row_cells = [cell.text.strip() for cell in row.cells]
            markdown_rows.append("| " + " | ".join(row_cells) + " |")
            # The first row is treated as the header, so the markdown
            # separator line goes directly beneath it.
            if row_idx == 0:
                markdown_rows.append("|" + "|".join(["---"] * len(row_cells)) + "|")
        tables.append("\n".join(markdown_rows))
        taids.append(f"{ref_docname}_t{table_idx}")
        tamd.append({"docname": ref_docname, "table": str(table_idx), "type": "table"})

    _merge_lookup("./temp/text.json", tids, splits)
    _merge_lookup("./temp/tables.json", taids, tables)

    # Only touch a collection when there is something to add to it.
    if splits:
        t_emb = text_embedder.encode(splits)
        text_store.upsert(ids=tids, embeddings=t_emb, metadatas=tmd)
    if images:
        i_emb = image_embedder.encode(images)
        image_store.upsert(ids=iids, embeddings=i_emb, metadatas=imd)
    if tables:
        ta_emb = text_embedder.encode(tables)
        table_store.upsert(ids=taids, embeddings=ta_emb, metadatas=tamd)
211
+
212
def embed_from_txt(docname:str):
    """Split a UTF-8 text file into chunks and index them.

    The chunk embeddings are upserted into the text collection and the chunks
    themselves are recorded in ``./temp/text.json`` under the document's
    reference name.

    Args:
        docname: Path of the text file.
    """
    ref_docname = docnamer(docname)
    with open(docname, "r", encoding="utf-8") as f:
        text = f.read()

    splits = text_splitter.split_text(text)
    ids = [f"{ref_docname}_c{i}" for i in range(len(splits))]
    mds = [{"docname": ref_docname, "chunk": i, "type": "text"} for i in range(len(splits))]

    # An empty file yields no chunks; skip the upsert in that case, exactly as
    # the PDF and DOCX paths do, instead of sending empty id lists to the store.
    if splits:
        emb = text_embedder.encode(splits)
        text_store.upsert(ids=ids, metadatas=mds, embeddings=emb)

    if os.path.exists("./temp/text.json"):
        with open("./temp/text.json", "r", encoding="utf-8") as f:
            text_lookup = json.load(f)
    else:
        text_lookup = {}
    text_lookup.setdefault(ref_docname, {}).update(zip(ids, splits))
    with open("./temp/text.json", "w", encoding="utf-8") as f:
        json.dump(text_lookup, f, indent=4)
230
+
231
def embed_from_image(docname:str):
    """Embed a single image file and cache a copy for retrieval.

    The copy is stored as ``./temp/images/<ref_docname>/<id>.<ext>``, which is
    the path ``retrieve`` builds from the stored id and the ``type`` metadata.
    (Previously the copy was written as ``<ref_docname>/<ref_docname>``, a
    path ``retrieve`` never opens.)

    Args:
        docname: Path of the image file.
    """
    ref_docname = docnamer(docname)
    ext = docname[docname.rfind('.')+1:]
    image_id = f"{ref_docname}_0"
    metadata = [{"docname": ref_docname, "imno": 0, "type": f"image/{ext}"}]

    os.makedirs(f"./temp/images/{ref_docname}", exist_ok=True)
    # The context manager releases the source file handle when done.
    with Image.open(docname) as img:
        emb = image_embedder.encode([img])
        # Write the cached copy before registering the vector so the store
        # never references a file that was not written.
        img.save(f"./temp/images/{ref_docname}/{image_id}.{ext}")
    image_store.upsert(embeddings=emb, ids=[image_id], metadatas=metadata)
240
+
241
def embed_from_xlsx(docname:str):
    """Embed a spreadsheet as a single markdown table.

    The workbook is read with ``pandas.read_excel`` using its defaults,
    rendered with ``DataFrame.to_markdown`` and upserted into the table
    collection. The markdown is recorded in ``./temp/tables.json``, the file
    ``retrieve`` consults for table ids. (It was previously written to
    ``./temp/text.json``, so retrieved spreadsheet tables came back as
    "NOT FOUND".)

    Args:
        docname: Path of the .xlsx file.

    Note:
        ``DataFrame.to_markdown`` relies on the optional ``tabulate`` package.
    """
    ref_docname = docnamer(docname)
    frame = pd.read_excel(docname)
    table_markdown = [frame.to_markdown()]
    metadata = [{"docname": ref_docname, "type": "table", "table": "table"}]
    table_ids = [f"{ref_docname}_0"]

    emb = text_embedder.encode(table_markdown)
    table_store.upsert(embeddings=emb, ids=table_ids, metadatas=metadata)

    if os.path.exists("./temp/tables.json"):
        with open("./temp/tables.json", "r", encoding="utf-8") as f:
            table_lookup = json.load(f)
    else:
        table_lookup = {}
    table_lookup.setdefault(ref_docname, {}).update(zip(table_ids, table_markdown))
    with open("./temp/tables.json", "w", encoding="utf-8") as f:
        json.dump(table_lookup, f, indent=4)
258
+
259
def retrieve(query:str):
    """Return the stored items that best match a query.

    The query is embedded with both the text and the image embedder and run
    against the text (5 results), image (2 results) and table (3 results)
    collections. Matching ids are then resolved through the files kept under
    ``./temp``.

    Args:
        query: Natural-language query string.

    Returns:
        A dict with keys ``"images"`` (base64-encoded file contents),
        ``"text"`` (chunks) and ``"tables"`` (markdown strings). A chunk or
        table whose id is missing from the lookups is reported as
        ``"NOT FOUND"``; an image whose cached file is missing is omitted.
    """
    def _load_lookup(path):
        """Load a JSON lookup file, or an empty mapping if it does not exist yet."""
        if not os.path.exists(path):
            return {}
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    # Either file may be absent, e.g. tables.json when only .txt files have
    # been embedded so far.
    text_lookup = _load_lookup("./temp/text.json")
    table_lookup = _load_lookup("./temp/tables.json")

    qt_emb = text_embedder.encode(query)
    qi_emb = image_embedder.encode(query)

    text_results = text_store.query(query_embeddings=[qt_emb], n_results=5)
    image_results = image_store.query(query_embeddings=[qi_emb], n_results=2)
    table_results = table_store.query(query_embeddings=[qt_emb], n_results=3)

    chunks = [
        text_lookup.get(meta["docname"], {}).get(chunk_id, "NOT FOUND")
        for chunk_id, meta in zip(text_results["ids"][0], text_results["metadatas"][0])
    ]

    images = []
    for image_id, meta in zip(image_results["ids"][0], image_results["metadatas"][0]):
        folder = f"./temp/images/{meta['docname']}"
        candidates = (
            # "type" is stored as "image/<ext>"; dropping the first six
            # characters leaves the file extension.
            f"{folder}/{image_id}.{meta['type'][6:]}",
            # Location used by earlier single-image embeds.
            f"{folder}/{meta['docname']}",
        )
        path = next((p for p in candidates if os.path.isfile(p)), None)
        if path is None:
            # The cached file is gone; there is nothing to return for this hit.
            continue
        with open(path, "rb") as f:
            images.append(base64.b64encode(f.read()).decode("utf-8"))

    tables = []
    for table_id, meta in zip(table_results["ids"][0], table_results["metadatas"][0]):
        doc_tables = table_lookup.get(meta["docname"], {})
        if table_id in doc_tables:
            tables.append(doc_tables[table_id])
        else:
            # Spreadsheet tables embedded earlier were recorded in text.json.
            tables.append(text_lookup.get(meta["docname"], {}).get(table_id, "NOT FOUND"))

    return {
        "images": images,
        "text": chunks,
        "tables": tables
    }
294
+
295
def query(query:str):
    """Answer a question using the retrieved text, tables and images.

    The instruction and question, the retrieved chunks, the tables and the
    images are sent to the chat model as one multimodal human message.

    Args:
        query: The question to answer.

    Returns:
        A two-item list: the dict returned by ``retrieve`` and the content of
        the model's response.
    """
    def _data_url(encoded):
        """Build a data URL whose MIME type matches the image's magic bytes."""
        # 16 base64 characters decode to the first 12 bytes of the file.
        header = base64.b64decode(encoded[:16])
        if header.startswith(b"\xff\xd8\xff"):
            mime = "image/jpeg"
        elif header.startswith(b"GIF8"):
            mime = "image/gif"
        elif header[:4] == b"RIFF" and header[8:12] == b"WEBP":
            mime = "image/webp"
        elif header.startswith(b"BM"):
            mime = "image/bmp"
        else:
            # PNG and anything unrecognised keep the previous label.
            mime = "image/png"
        return f"data:{mime};base64,{encoded}"

    result = retrieve(query)
    prompt = (
        "\nAnswer the Question based on the Context given.\n"
        "If the given Context doesn't contain any information regarding the Question, Answer 'Out of Context'\n"
        "\n"
        f"Question:{query}"
    )
    parts = [{"type": "text", "text": prompt}]
    parts.extend({"type": "text", "text": text} for text in result["text"])
    parts.extend({"type": "text", "text": f"TABLE:\n{table}"} for table in result["tables"])
    # Cached images are not always PNG (e.g. JPEG media from documents), so
    # the data URL is labelled from the actual bytes.
    parts.extend({"type": "image_url", "image_url": _data_url(image)} for image in result["images"])

    message = HumanMessage(content=parts)
    response = model.invoke([message])
    return [result, response.content]