multi-rag 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multi_rag-0.1.0/.github/workflows/python-publish.yml +70 -0
- multi_rag-0.1.0/.gitignore +0 -0
- multi_rag-0.1.0/LICENSE +21 -0
- multi_rag-0.1.0/PKG-INFO +35 -0
- multi_rag-0.1.0/README.md +12 -0
- multi_rag-0.1.0/pyproject.toml +33 -0
- multi_rag-0.1.0/src/multi_rag/__init__.py +1 -0
- multi_rag-0.1.0/src/multi_rag/main.py +325 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# This workflow will upload a Python Package to PyPI when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

# Runs only when a GitHub Release is published.
on:
  release:
    types: [published]

# Default token scope for all jobs: read-only access to repository contents.
permissions:
  contents: read

jobs:
  # Job 1: build the distributions and hand them over as a workflow artifact.
  release-build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - name: Build release distributions
        run: |
          # NOTE: put your own distribution build steps here.
          python -m pip install build
          python -m build

      - name: Upload distributions
        uses: actions/upload-artifact@v4
        with:
          name: release-dists
          path: dist/

  # Job 2: download the artifact produced by release-build and publish it.
  pypi-publish:
    runs-on: ubuntu-latest
    needs:
      - release-build
    permissions:
      # IMPORTANT: this permission is mandatory for trusted publishing
      id-token: write

    # Dedicated environments with protections for publishing are strongly recommended.
    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
    environment:
      name: pypi
      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
      # url: https://pypi.org/p/YOURPROJECT
      #
      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
      # ALTERNATIVE: exactly, uncomment the following line instead:
      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}

    steps:
      - name: Retrieve release distributions
        uses: actions/download-artifact@v4
        with:
          # Must match the artifact name used by the "Upload distributions" step.
          name: release-dists
          path: dist/

      - name: Publish release distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          packages-dir: dist/
|
|
File without changes
|
multi_rag-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Srimadhav2007
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
multi_rag-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: multi_rag
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A module to facilitate local testing of RAG pipeline for multiple datatypes
|
|
5
|
+
Project-URL: Homepage, https://github.com
|
|
6
|
+
Author-email: Phani Srimadhav Mula <phanisrimadhav.mula@gmail.com>
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Requires-Dist: chromadb>=0.5.0
|
|
13
|
+
Requires-Dist: langchain-core>=0.2.0
|
|
14
|
+
Requires-Dist: langchain-text-splitters>=0.2.0
|
|
15
|
+
Requires-Dist: langchain>=0.2.0
|
|
16
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
17
|
+
Requires-Dist: pandas>=2.0.0
|
|
18
|
+
Requires-Dist: pillow>=10.0.0
|
|
19
|
+
Requires-Dist: pymupdf>=1.24.0
|
|
20
|
+
Requires-Dist: python-docx>=1.1.0
|
|
21
|
+
Requires-Dist: sentence-transformers>=2.5.0
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# multi_rag
|
|
25
|
+
### This module facilitates testing RAG pipelines on Local machines with chroma_db, with text embeddings from 'bge-base-en-v1.5' and image embeddings from 'clip-vit-b-32'
|
|
26
|
+
|
|
27
|
+
### Currently this module supports pdf, docx, xlsx, png, jpg, jpeg and txt file formats
|
|
28
|
+
|
|
29
|
+
#### User can give the path of the file to the embed function, which sets up the chroma_db/ folder for the embeddings while temp/ folder gets set up to mimic the actual database to store the data chunks.
|
|
30
|
+
|
|
31
|
+
#### The retrieve function takes a query as input and returns a dictionary with the keys 'text', 'tables' and 'images'
|
|
32
|
+
|
|
33
|
+
#### query function takes query as input and returns the answer and retrieved data as output
|
|
34
|
+
|
|
35
|
+
#### One needs to have gemini api key to query, but embedding and retrieval part is completely local
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# multi_rag
|
|
2
|
+
### This module facilitates testing RAG pipelines on Local machines with chroma_db, with text embeddings from 'bge-base-en-v1.5' and image embeddings from 'clip-vit-b-32'
|
|
3
|
+
|
|
4
|
+
### Currently this module supports pdf, docx, xlsx, png, jpg, jpeg and txt file formats
|
|
5
|
+
|
|
6
|
+
#### User can give the path of the file to the embed function, which sets up the chroma_db/ folder for the embeddings while temp/ folder gets set up to mimic the actual database to store the data chunks.
|
|
7
|
+
|
|
8
|
+
#### The retrieve function takes a query as input and returns a dictionary with the keys 'text', 'tables' and 'images'
|
|
9
|
+
|
|
10
|
+
#### query function takes query as input and returns the answer and retrieved data as output
|
|
11
|
+
|
|
12
|
+
#### One needs to have gemini api key to query, but embedding and retrieval part is completely local
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "multi_rag"
version = "0.1.0"
authors = [
  { name="Phani Srimadhav Mula", email="phanisrimadhav.mula@gmail.com" },
]
description = "A module to facilitate local testing of RAG pipeline for multiple datatypes"
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
    "pymupdf>=1.24.0",
    "pillow>=10.0.0",
    "sentence-transformers>=2.5.0",
    "chromadb>=0.5.0",
    "langchain-text-splitters>=0.2.0",
    "langchain-core>=0.2.0",
    "langchain>=0.2.0",
    # Provider package behind init_chat_model(model_provider="google-genai") in main.py.
    "langchain-google-genai>=1.0.0",
    "python-docx>=1.1.0",
    "pandas>=2.0.0",
    # Optional pandas dependency needed by DataFrame.to_markdown() in embed_from_xlsx.
    "tabulate>=0.9.0",
    "openpyxl>=3.1.0",
]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

[project.urls]
"Homepage" = "https://github.com"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .main import embed,embed_from_doc,embed_from_image,embed_from_pdf,embed_from_txt,embed_from_xlsx,retrieve,query
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import pymupdf
|
|
2
|
+
from PIL import Image
|
|
3
|
+
import io
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from sentence_transformers import SentenceTransformer
|
|
7
|
+
from chromadb import PersistentClient
|
|
8
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
9
|
+
import json
|
|
10
|
+
import base64
|
|
11
|
+
from langchain_core.messages import HumanMessage
|
|
12
|
+
from langchain.chat_models import init_chat_model
|
|
13
|
+
import zipfile
|
|
14
|
+
import docx
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
# Module-level setup: everything below runs once, when the module is imported.
# ---------------------------------------------------------------------------

# Chat model used by query() to answer from the retrieved context.
# NOTE(review): this is constructed at import time, so embed/retrieve-only
# users also trigger it; presumably it needs the Google GenAI integration and
# credentials to be available -- confirm, and consider creating it lazily.
model=init_chat_model(model="gemini-2.5-flash",model_provider="google-genai")
# Image embedder; retrieve() also uses it to embed the query text for image search.
image_embedder=SentenceTransformer("clip-ViT-B-32")
# Text embedder used for both text chunks and Markdown tables.
text_embedder=SentenceTransformer("BAAI/bge-base-en-v1.5") #"BAAI/bge-m3"

# On-disk vector store; the path is relative to the current working directory.
# NOTE(review): the dict-style `configuration=` argument may need a newer
# chromadb than the declared `chromadb>=0.5.0` floor -- confirm.
client=PersistentClient(path="./chroma_db")
text_store = client.get_or_create_collection("rag-text",configuration={"hnsw":{"space":"cosine"}})
image_store = client.get_or_create_collection("rag-image",configuration={"hnsw":{"space":"ip"}})
table_store=client.get_or_create_collection("rag-table",configuration={"hnsw":{"space":"cosine"}})
# Splits extracted text into chunks of up to 1000 characters with 100 overlap.
text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)
# Working directories (relative to the CWD) for the JSON lookups and saved images.
os.makedirs("./temp", exist_ok=True)
os.makedirs("./temp/images", exist_ok=True)
|
|
28
|
+
|
|
29
|
+
def docnamer(docname: str):
    """Turn a document path into a flat, filesystem-safe reference name.

    Each of the characters ``< > : / \\ | ? *`` becomes ``.``, each space
    becomes ``-``, and any leading dots are stripped from the result.
    """
    # One translation table handles both substitutions in a single pass.
    substitutions = str.maketrans({**dict.fromkeys('<>:/\\|?*', '.'), ' ': '-'})
    return docname.translate(substitutions).lstrip('.')
|
|
35
|
+
|
|
36
|
+
def embed(docname:str):
    """Embed a single file, choosing the extractor from its extension.

    The extension is everything after the last ``.`` in ``docname`` and is
    matched case-insensitively, so ``REPORT.PDF`` is handled like
    ``report.pdf``.

    Args:
        docname: Path of the file to embed (docx/doc, pdf, txt, png, jpeg,
            jpg or xlsx).

    Returns:
        None. Files with an unsupported extension are not embedded; a
        ``UserWarning`` is emitted instead of silently ignoring them.
    """
    import warnings

    ext=docname.rpartition('.')[2].lower()
    if ext in ("docx","doc"):
        # NOTE: embed_from_doc reads the file as a zip archive, which is the
        # .docx layout; legacy binary .doc files are routed here as before.
        embed_from_doc(docname)
    elif ext=="pdf":
        embed_from_pdf(docname)
    elif ext=="txt":
        embed_from_txt(docname)
    elif ext in ("png","jpeg","jpg"):
        embed_from_image(docname)
    elif ext=="xlsx":
        embed_from_xlsx(docname)
    else:
        warnings.warn(
            f"multi_rag.embed: unsupported file type '{ext}' for '{docname}'; nothing was embedded",
            stacklevel=2,
        )
|
|
48
|
+
|
|
49
|
+
def embed_from_pdf(docname:str):
    """Index the text chunks, tables and embedded images of a PDF.

    For every page the text is split with ``text_splitter``, the tables found
    by ``page.find_tables()`` are rendered to Markdown and the embedded images
    are extracted. Images are written as PNG files under
    ``./temp/images/<ref>/``, chunks and tables are merged into
    ``./temp/text.json`` and ``./temp/tables.json`` (keyed by reference name,
    then by id), and the embeddings are upserted into the text, image and
    table collections.

    Args:
        docname: Path of the PDF file to index.
    """
    ref_docname=docnamer(docname)
    image_dir=f"./temp/images/{ref_docname}"

    tmd,imd,tamd=[],[],[]
    tids,iids,taids=[],[],[]
    splits,images,tables=[],[],[]

    # The context manager closes the PDF handle even if extraction fails.
    with pymupdf.open(docname) as doc:
        os.makedirs(image_dir, exist_ok=True)
        for pages,page in enumerate(doc):
            for i,table in enumerate(page.find_tables().tables):
                tables.append(table.to_markdown())
                taids.append(f"{ref_docname}_p{pages}_t{i}")
                tamd.append({"docname": f"{ref_docname}","page": f"{pages}","table": f"{i}","type": "table"})

            for i,img in enumerate(page.get_images()):
                try:
                    picture=Image.open(io.BytesIO(doc.extract_image(img[0])['image']))
                    picture.load()
                except OSError:
                    # Pillow cannot decode this stream; skip it rather than
                    # abort the document. `i` still advances, so the ids of
                    # the remaining images are unaffected.
                    continue
                if picture.mode=="CMYK":
                    # PNG cannot store CMYK, and every image is saved as PNG below.
                    picture=picture.convert("RGB")
                images.append(picture)
                iids.append(f"{ref_docname}_p{pages}_i{i}")
                imd.append({"docname": f"{ref_docname}","page": f"{pages}","imno": f"{i}","type": "image/png"})

            split=text_splitter.split_text(page.get_text())
            tids.extend(f"{ref_docname}_p{pages}_c{i}" for i in range(len(split)))
            tmd.extend({"docname": f"{ref_docname}","page": f"{pages}","chunk": f"{i}","type": "text",} for i in range(len(split)))
            splits.extend(split)

    for iid,image in zip(iids,images):
        image.save(f"{image_dir}/{iid}.png")

    # Merge this document's chunks and tables into the shared JSON lookups.
    for path,ids,values in (("./temp/text.json",tids,splits),("./temp/tables.json",taids,tables)):
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname,{}).update(zip(ids,values))
        with open(path,"w",encoding="utf-8") as f:
            json.dump(lookup,f,indent=4)

    # Upsert only non-empty batches into the collections.
    if splits:
        t_emb=text_embedder.encode(splits)
        text_store.upsert(ids=tids,embeddings=t_emb,metadatas=tmd)
    if images:
        i_emb=image_embedder.encode(images)
        image_store.upsert(ids=iids,embeddings=i_emb,metadatas=imd)
    if tables:
        ta_emb=text_embedder.encode(tables)
        table_store.upsert(ids=taids,embeddings=ta_emb,metadatas=tamd)
|
|
117
|
+
|
|
118
|
+
def embed_from_doc(docname: str):
    """Index the paragraphs, tables and embedded media of a .docx file.

    Images are read from the ``word/media/`` entries of the document's zip
    archive and saved under ``./temp/images/<ref>/``; paragraphs are split
    with ``text_splitter``; tables are rendered as Markdown grids. Chunks and
    tables are merged into ``./temp/text.json`` and ``./temp/tables.json``,
    and the embeddings are upserted into the text, image and table
    collections.

    Args:
        docname: Path of the document to index. It is opened with
            ``zipfile.ZipFile`` and ``docx.Document``.
    """
    ref_docname = docnamer(docname)
    os.makedirs(f"./temp/images/{ref_docname}", exist_ok=True)

    tmd, imd, tamd = [], [], []
    tids, iids, taids = [], [], []
    splits, images, tables = [], [], []

    # 1. EXTRACT AND SAVE IMAGES FROM ZIP ARCHIVE
    i = 0
    with zipfile.ZipFile(docname, "r") as archive:
        for file in archive.namelist():
            if not file.startswith("word/media/"):
                continue
            ext = file.split(".")[-1]
            img_id = f"{ref_docname}_i{i}"
            try:
                # Decode eagerly and save right away, so every indexed image
                # is guaranteed to have a file that retrieve() can open.
                img = Image.open(io.BytesIO(archive.read(file)))
                img.load()
                img.save(f"./temp/images/{ref_docname}/{img_id}.{ext}")
            except (OSError, ValueError):
                # Media Pillow cannot read or write (e.g. vector formats) is
                # skipped instead of aborting the whole document.
                continue
            images.append(img)
            iids.append(img_id)
            imd.append({"docname":f"{ref_docname}","imno":f"{i}","type":f"image/{ext}"})
            i += 1

    # 2. EXTRACT TEXT AND CONVERT TABLES TO MARKDOWN STRINGS
    doc = docx.Document(docname)

    # Only non-empty paragraphs are numbered.
    paragraph_index = 0
    for p in doc.paragraphs:
        text_content = p.text.strip()
        if not text_content:
            continue
        split = text_splitter.split_text(text_content)
        splits.extend(split)
        tids.extend(f"{ref_docname}_p{paragraph_index}_c{chunk_index}" for chunk_index in range(len(split)))
        tmd.extend({"docname":f"{ref_docname}","pno":f"{paragraph_index}","chunk":f"{chunk_index}","type":"text"} for chunk_index in range(len(split)))
        paragraph_index += 1

    # Process Document Tables (and convert to Markdown strings)
    for table_idx, table in enumerate(doc.tables):
        markdown_rows = []
        for row_idx, row in enumerate(table.rows):
            row_cells = [cell.text.strip() for cell in row.cells]
            markdown_rows.append("| " + " | ".join(row_cells) + " |")
            # The first row is treated as the header: add the separator under it.
            if row_idx == 0:
                markdown_rows.append("|" + "|".join(["---"] * len(row_cells)) + "|")

        tables.append("\n".join(markdown_rows))
        taids.append(f"{ref_docname}_t{table_idx}")
        tamd.append({"docname":f"{ref_docname}","table":f"{table_idx}","type":"table"})

    # Merge this document's chunks and tables into the shared JSON lookups.
    for path, ids, values in (("./temp/text.json", tids, splits), ("./temp/tables.json", taids, tables)):
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                lookup = json.load(f)
        else:
            lookup = {}
        lookup.setdefault(ref_docname, {}).update(zip(ids, values))
        with open(path, "w", encoding="utf-8") as f:
            json.dump(lookup, f, indent=4)

    # Upsert only non-empty batches into the collections.
    if splits:
        t_emb = text_embedder.encode(splits)
        text_store.upsert(ids=tids, embeddings=t_emb,metadatas=tmd)
    if images:
        i_emb = image_embedder.encode(images)
        image_store.upsert(ids=iids, embeddings=i_emb,metadatas=imd)
    if tables:
        ta_emb = text_embedder.encode(tables)
        table_store.upsert(ids=taids, embeddings=ta_emb,metadatas=tamd)
|
|
211
|
+
|
|
212
|
+
def embed_from_txt(docname:str):
    """Index a UTF-8 text file as text chunks.

    The file is split with ``text_splitter``; the chunks are merged into
    ``./temp/text.json`` under the document's reference name and their
    embeddings are upserted into the text collection.

    Args:
        docname: Path of the text file to index.

    Returns:
        None. A file that yields no chunks (e.g. an empty file) is a no-op.
    """
    ref_docname=docnamer(docname)
    with open(docname,"r",encoding="utf-8") as f:
        text=f.read()

    splits=text_splitter.split_text(text)
    if not splits:
        # Same guard as the PDF/DOCX embedders: never upsert an empty batch.
        return

    ids=[f"{ref_docname}_c{i}" for i in range(len(splits))]
    mds=[{"docname":ref_docname,"chunk":i,"type":"text"} for i in range(len(splits))]

    if os.path.exists("./temp/text.json"):
        with open("./temp/text.json", "r", encoding="utf-8") as f:
            text_lookup = json.load(f)
    else:
        text_lookup = {}
    text_lookup.setdefault(ref_docname, {}).update(zip(ids, splits))
    with open("./temp/text.json", "w", encoding="utf-8") as f:
        json.dump(text_lookup, f, indent=4)

    emb=text_embedder.encode(splits)
    text_store.upsert(ids=ids,metadatas=mds,embeddings=emb)
|
|
230
|
+
|
|
231
|
+
def embed_from_image(docname:str):
    """Index a standalone image file.

    The image is embedded with ``image_embedder``, copied to
    ``./temp/images/<ref>/<ref>_0.<ext>`` and upserted into the image
    collection with id ``<ref>_0``.

    The saved file name is ``<id>.<ext>`` because that is the path
    ``retrieve`` rebuilds from the stored id and ``type`` metadata.

    Args:
        docname: Path of the image file (png, jpg or jpeg).
    """
    ref_docname=docnamer(docname)
    ext=docname[docname.rfind('.')+1:]
    image_id=f"{ref_docname}_0"
    target_dir=f"./temp/images/{ref_docname}"

    # Close the source file handle once the image is embedded and copied.
    with Image.open(docname) as img:
        emb=image_embedder.encode([img])
        os.makedirs(target_dir,exist_ok=True)
        img.save(f"{target_dir}/{image_id}.{ext}")

    # Upsert after saving, so an indexed id always has a file behind it.
    md=[{"docname":f"{ref_docname}","imno":0,"type":f"image/{ext}"}]
    image_store.upsert(embeddings=emb,ids=[image_id],metadatas=md)
|
|
240
|
+
|
|
241
|
+
def embed_from_xlsx(docname:str):
    """Index an Excel workbook as a single Markdown table.

    ``pd.read_excel`` is called with its defaults, the resulting frame is
    rendered with ``DataFrame.to_markdown`` (which relies on the optional
    ``tabulate`` package), embedded with ``text_embedder`` and upserted into
    the table collection with id ``<ref>_0``.

    The Markdown is stored in ``./temp/tables.json``, the lookup that
    ``retrieve`` consults for table ids.

    Args:
        docname: Path of the .xlsx file to index.
    """
    ref_docname=docnamer(docname)
    df=pd.read_excel(docname)
    table_markdown=[df.to_markdown()]
    table_ids=[f"{ref_docname}_0"]
    md=[{"docname":ref_docname,"type":"table","table":"table"}]

    emb=text_embedder.encode(table_markdown)
    table_store.upsert(embeddings=emb,ids=table_ids,metadatas=md)

    if os.path.exists("./temp/tables.json"):
        with open("./temp/tables.json", "r", encoding="utf-8") as f:
            table_lookup = json.load(f)
    else:
        table_lookup = {}
    table_lookup.setdefault(ref_docname, {}).update(zip(table_ids, table_markdown))
    with open("./temp/tables.json", "w", encoding="utf-8") as f:
        json.dump(table_lookup, f, indent=4)
|
|
258
|
+
|
|
259
|
+
def retrieve(query:str, *, n_text:int=5, n_images:int=2, n_tables:int=3):
    """Return the stored text chunks, tables and images closest to a query.

    The query is embedded with ``text_embedder`` for the text and table
    collections and with ``image_embedder`` for the image collection. Ids
    returned by the collections are resolved through ``./temp/text.json``,
    ``./temp/tables.json`` and the files under ``./temp/images/``.

    Args:
        query: Natural-language query.
        n_text: Maximum number of text chunks to return.
        n_images: Maximum number of images to return.
        n_tables: Maximum number of tables to return.

    Returns:
        A dict with keys ``"images"`` (base64-encoded file contents),
        ``"text"`` and ``"tables"`` (lists of strings). A text or table id
        with no stored content yields ``"NOT FOUND"``; an image whose file is
        missing is left out.
    """
    def _load_lookup(path):
        # A lookup file only exists once something of that kind was embedded.
        if not os.path.exists(path):
            return {}
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    text_lookup = _load_lookup("./temp/text.json")
    table_lookup = _load_lookup("./temp/tables.json")

    qt_emb = text_embedder.encode(query)
    qi_emb = image_embedder.encode(query)

    text_results = text_store.query(query_embeddings=[qt_emb], n_results=n_text)
    image_results = image_store.query(query_embeddings=[qi_emb], n_results=n_images)
    table_results = table_store.query(query_embeddings=[qt_emb], n_results=n_tables)

    chunks = [
        text_lookup.get(meta["docname"], {}).get(text_id, "NOT FOUND")
        for text_id, meta in zip(text_results["ids"][0], text_results["metadatas"][0])
    ]

    images = []
    for image_id, meta in zip(image_results["ids"][0], image_results["metadatas"][0]):
        folder = f"./temp/images/{meta['docname']}"
        candidates = (
            # Canonical layout: <id>.<ext>, with <ext> taken from "image/<ext>".
            f"{folder}/{image_id}.{meta['type'][6:]}",
            # Layout written by embed_from_image in 0.1.0: <ref>/<ref>.
            f"{folder}/{meta['docname']}",
        )
        for path in candidates:
            if os.path.isfile(path):
                with open(path, "rb") as f:
                    images.append(base64.b64encode(f.read()).decode("utf-8"))
                break

    tables = [
        table_lookup.get(meta["docname"], {}).get(table_id, "NOT FOUND")
        for table_id, meta in zip(table_results["ids"][0], table_results["metadatas"][0])
    ]

    return {
        "images":images,
        "text":chunks,
        "tables":tables
    }
|
|
294
|
+
|
|
295
|
+
def query(query:str):
    """Answer a question with the chat model, using retrieved context.

    Calls ``retrieve(query)`` and sends one multimodal message made of the
    instruction prompt, each retrieved text chunk, each table (prefixed with
    ``TABLE:``) and each image as a base64 data URL.

    Args:
        query: The question to answer.

    Returns:
        A two-item list ``[retrieved, answer]``: the dict returned by
        ``retrieve`` and the content of the model's response.
    """
    def _sniff_mime(encoded:str)->str:
        # Recognise the format from the base64 form of its magic bytes, so
        # the data URL is labelled with the real type. Unknown payloads fall
        # back to image/png.
        signatures=(
            ("/9j/","image/jpeg"),
            ("iVBORw0KGgo","image/png"),
            ("R0lGOD","image/gif"),
            ("UklGR","image/webp"),
        )
        for prefix,mime in signatures:
            if encoded.startswith(prefix):
                return mime
        return "image/png"

    result=retrieve(query)

    prompt=(
        "\n"
        "Answer the Question based on the Context given.\n"
        "If the given Context doesn't contain any information regarding the Question, Answer 'Out of Context'\n"
        "\n"
        f"Question:{query}"
    )
    parts=[{"type":"text","text":prompt}]
    parts.extend({"type":"text","text":text} for text in result["text"])
    parts.extend({"type":"text","text":f"TABLE:\n{table}"} for table in result["tables"])
    parts.extend(
        {"type":"image_url","image_url":f"data:{_sniff_mime(image)};base64,{image}"}
        for image in result["images"]
    )

    message=HumanMessage(content=parts)
    response=model.invoke([message])
    return [result,response.content]
|