prevectorchunks-core 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of prevectorchunks-core might be problematic. Click here for more details.
- prevectorchunks_core-0.1.1/PKG-INFO +71 -0
- prevectorchunks_core-0.1.1/README.md +48 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/__init__.py +0 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/admin.py +3 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/apps.py +6 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/asgi.py +16 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/migrations/__init__.py +0 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/models.py +3 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/services/__init__.py +0 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/services/chunk_documents_crud_vdb.py +417 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/settings.py +176 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/test_loader.py +23 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/tests.py +3 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/utils/__init__.py +0 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/utils/file_loader.py +264 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/utils/llm_wrapper.py +34 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core/wsgi.py +16 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/PKG-INFO +71 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/SOURCES.txt +22 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/dependency_links.txt +1 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/requires.txt +15 -0
- prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/top_level.txt +1 -0
- prevectorchunks_core-0.1.1/pyproject.toml +37 -0
- prevectorchunks_core-0.1.1/setup.cfg +4 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prevectorchunks-core
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
|
+
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/mydep
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: Django==5.1
|
|
9
|
+
Requires-Dist: packaging~=24.1
|
|
10
|
+
Requires-Dist: requests~=2.32.3
|
|
11
|
+
Requires-Dist: openai~=1.37.1
|
|
12
|
+
Requires-Dist: httpx~=0.27.0
|
|
13
|
+
Requires-Dist: python-dotenv~=1.0.1
|
|
14
|
+
Requires-Dist: django-cors-headers~=4.4.0
|
|
15
|
+
Requires-Dist: PyJWT~=2.7.0
|
|
16
|
+
Requires-Dist: fastapi~=0.112.2
|
|
17
|
+
Requires-Dist: datasets~=4.1.0
|
|
18
|
+
Requires-Dist: pinecone~=7.3.0
|
|
19
|
+
Requires-Dist: pytesseract~=0.3.13
|
|
20
|
+
Requires-Dist: python-docx~=1.2.0
|
|
21
|
+
Requires-Dist: PyPDF2~=3.0.1
|
|
22
|
+
Requires-Dist: pillow~=11.3.0
|
|
23
|
+
|
|
24
|
+
# 📚 PreVectorChunks
|
|
25
|
+
|
|
26
|
+
> A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## ✨ Who Needs This Module?
|
|
31
|
+
Any developer working with:
|
|
32
|
+
- **RAG pipelines**
|
|
33
|
+
- **Vector Databases** (like Pinecone, Weaviate, etc.)
|
|
34
|
+
- **AI applications** requiring **similar content retrieval**
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## 🎯 What Does This Module Do?
|
|
40
|
+
This module helps you:
|
|
41
|
+
- **Chunk documents** into smaller fragments
|
|
42
|
+
- **Insert (upsert) fragments** into a vector database
|
|
43
|
+
- **Fetch & update** existing chunks from a vector database
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 📦 Installation
|
|
48
|
+
```bash
|
|
49
|
+
pip install prevectorchunks-core
|
|
50
|
+
```
|
|
51
|
+
How to import in a file:
|
|
52
|
+
```python
|
|
53
|
+
from PreVectorChunks.services import chunk_documents_crud_vdb
|
|
54
|
+
|
|
55
|
+
#How to use Pinecone and OpenAI:
|
|
56
|
+
#Use a .env file in your project root to configure API keys:
|
|
57
|
+
|
|
58
|
+
PINECONE_API_KEY=YOUR_API_KEY
|
|
59
|
+
OPENAI_API_KEY=YOUR_API_KEY
|
|
60
|
+
|
|
61
|
+
#how to call relevant functions:
|
|
62
|
+
#Four key functions that you can call are below:
|
|
63
|
+
#function that chunks any document
|
|
64
|
+
chunk_documents(instructions,file_path="content_playground/content.json"):
|
|
65
|
+
#function that chunks any document as well as inserts into vdb - you need an index name inside index_n
|
|
66
|
+
chunk_and_upsert_to_vdb(index_n,instructions,file_path="content_playground/content.json"):
|
|
67
|
+
#function that loads existing chunks from vdb by document name - you need an index name inside index_n
|
|
68
|
+
fetch_vdb_chunks_grouped_by_document_name(index_n):
|
|
69
|
+
#function that updates existing chunks - you need an index name inside index_n
|
|
70
|
+
update_vdb_chunks_grouped_by_document_name(index_n,dataset):
|
|
71
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# 📚 PreVectorChunks
|
|
2
|
+
|
|
3
|
+
> A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## ✨ Who Needs This Module?
|
|
8
|
+
Any developer working with:
|
|
9
|
+
- **RAG pipelines**
|
|
10
|
+
- **Vector Databases** (like Pinecone, Weaviate, etc.)
|
|
11
|
+
- **AI applications** requiring **similar content retrieval**
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## 🎯 What Does This Module Do?
|
|
17
|
+
This module helps you:
|
|
18
|
+
- **Chunk documents** into smaller fragments
|
|
19
|
+
- **Insert (upsert) fragments** into a vector database
|
|
20
|
+
- **Fetch & update** existing chunks from a vector database
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## 📦 Installation
|
|
25
|
+
```bash
|
|
26
|
+
pip install prevectorchunks-core
|
|
27
|
+
```
|
|
28
|
+
How to import in a file:
|
|
29
|
+
```python
|
|
30
|
+
from PreVectorChunks.services import chunk_documents_crud_vdb
|
|
31
|
+
|
|
32
|
+
#How to use Pinecone and OpenAI:
|
|
33
|
+
#Use a .env file in your project root to configure API keys:
|
|
34
|
+
|
|
35
|
+
PINECONE_API_KEY=YOUR_API_KEY
|
|
36
|
+
OPENAI_API_KEY=YOUR_API_KEY
|
|
37
|
+
|
|
38
|
+
#how to call relevant functions:
|
|
39
|
+
#Four key functions that you can call are below:
|
|
40
|
+
#function that chunks any document
|
|
41
|
+
chunk_documents(instructions,file_path="content_playground/content.json"):
|
|
42
|
+
#function that chunks any document as well as inserts into vdb - you need an index name inside index_n
|
|
43
|
+
chunk_and_upsert_to_vdb(index_n,instructions,file_path="content_playground/content.json"):
|
|
44
|
+
#function that loads existing chunks from vdb by document name - you need an index name inside index_n
|
|
45
|
+
fetch_vdb_chunks_grouped_by_document_name(index_n):
|
|
46
|
+
#function that updates existing chunks - you need an index name inside index_n
|
|
47
|
+
update_vdb_chunks_grouped_by_document_name(index_n,dataset):
|
|
48
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ASGI config for PreVectorChunks project.
|
|
3
|
+
|
|
4
|
+
It exposes the ASGI callable as a module-level variable named ``application``.
|
|
5
|
+
|
|
6
|
+
For more information on this file, see
|
|
7
|
+
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
from django.core.asgi import get_asgi_application
|
|
13
|
+
|
|
14
|
+
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'PreVectorChunks.settings')
|
|
15
|
+
|
|
16
|
+
application = get_asgi_application()
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
|
|
4
|
+
from django.views.decorators.csrf import csrf_exempt
|
|
5
|
+
from django.views.decorators.http import require_http_methods
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
from openai import OpenAI
|
|
8
|
+
from pinecone import Pinecone, ServerlessSpec
|
|
9
|
+
from datasets import load_dataset
|
|
10
|
+
import os
|
|
11
|
+
from django.http import JsonResponse, HttpResponse
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from ..utils.file_loader import prepare_chunked_text, extract_file_details
|
|
15
|
+
from ..utils.llm_wrapper import LLMClientWrapper
|
|
16
|
+
|
|
17
|
+
from pinecone import Pinecone, ServerlessSpec
|
|
18
|
+
from collections import defaultdict
|
|
19
|
+
from itertools import chain
|
|
20
|
+
from dotenv import load_dotenv
|
|
21
|
+
# create an index if not already existing
|
|
22
|
+
load_dotenv(override=True)
|
|
23
|
+
index_name = "dl-doc-search"
|
|
24
|
+
EMBED_DIM = 1536
|
|
25
|
+
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
|
26
|
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class openAIWrapperLMSContent:
    """Wrapper around an OpenAI client that answers LMS content queries.

    Builds a single prompt from retrieved RAG content plus the student's
    question and sends it under a Socratic "TAFE trainer" system message
    via ``chat.completions``.
    """

    def __init__(self, openai_client):
        # openai_client: a configured openai.OpenAI instance (or compatible).
        self.openai_client = openai_client

    def chatWithOpenAI(self, ragcontent, actualQuery):
        """Return the model's reply for *actualQuery*, grounded in *ragcontent*.

        NOTE(review): the combined RAG content + question is appended to the
        *system* message via ``"{}".format(query)`` rather than being sent as
        a separate user message — unusual, but preserved as-is.
        """
        query = f"""
        {ragcontent}
        Question:{actualQuery}
        """

        response = self.openai_client.chat.completions.create(
            messages=[
                {'role': 'system',
                 'content': (
                     "You are an experienced and patient TAFE trainer for a Certificate III in Electrotechnology. "
                     "Your primary goal is to guide apprentices to understand their learning materials. "
                     "You must **NEVER** give direct answers to questions from assessments (UKT or UST). "
                     "Instead, use Socratic questioning to prompt critical thinking and encourage students to find the answers in their provided learning content. "
                     "When appropriate, offer hints by referencing specific sections or modules (e.g., 'Review the section on Ohm's Law in Module 2'). "
                     "Maintain a helpful, encouraging, and respectful tone. Do not solve problems for the student."
                     "{}"
                 ).format(query)}
            ],
            model="gpt-4o-mini",
            # temperature=0 keeps replies as deterministic as the API allows
            temperature=0,
        )

        return response.choices[0].message.content
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# load the dataset
|
|
60
|
+
def loadDataset():
    """Fetch and return the train split of the pinecone/dl-doc-search dataset."""
    return load_dataset("pinecone/dl-doc-search", split="train")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def loadDatasetFromJsonFile(file_path="PreVectorChunks/content_playground/content.json"):
    """Load and return the JSON payload stored at *file_path*.

    Prints the current working directory (debug aid kept from the original),
    and returns ``None`` when the file is missing or not valid JSON, after
    printing a diagnostic message.
    """
    print(os.getcwd())
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    return None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Function to create embeddings with OpenAI
|
|
83
|
+
def get_embedding(text, model="text-embedding-3-small"):
    """Return the dense embedding vector for *text* via the OpenAI API."""
    result = client.embeddings.create(input=text, model=model)
    return result.data[0].embedding
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def createIndexForPineCone():
    """Ensure the module-level Pinecone index exists and return a handle to it.

    Uses the module globals ``pc`` (Pinecone client), ``index_name`` and
    ``EMBED_DIM``. Requires ``PINECONE_API_KEY`` in the environment.

    Returns:
        A ``pc.Index`` handle for ``index_name``.
    """
    # connect to Pinecone (make sure you set your API key in env: PINECONE_API_KEY)

    # Use the correct spec parameter
    spec = ServerlessSpec(
        cloud="aws",  # Replace with your cloud provider ('aws', 'azure', etc.)
        region="us-east-1"  # Replace with your region ('us-west-2', etc.)
    )

    # Check if the index exists. If not, create it.
    if index_name not in [index.name for index in pc.list_indexes()]:
        pc.create_index(
            name=index_name,
            dimension=EMBED_DIM,  # Replace with the dimensionality of your vectors

            spec=spec  # Pass the spec object here
        )

    index = pc.Index(index_name)
    return index
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def upsertRecord(index_n, dataset, document_name=None):
    """Embed up to 100 records and upsert them into Pinecone index *index_n*.

    Each record contributes a ``(id, embedding, metadata)`` tuple where the
    metadata carries ``title``, ``text``, ``id`` and ``document_name``.
    Vectors are flushed in batches of 20 to avoid rate limits.

    Args:
        index_n: Name of the Pinecone index to upsert into.
        dataset: Either a dict with a ``"rows"`` key, a plain list of record
            dicts, or ``None`` (falls back to ``loadDatasetFromJsonFile()``).
        document_name: Optional document name stamped onto every record;
            when absent, each record's own ``document_name`` field is used.
    """
    # To get the unique host for an index,
    # see https://docs.pinecone.io/guides/manage-data/target-an-index
    index = pc.Index(index_n)

    # Fix: compare to None with `is`, not `==`.
    if dataset is None:
        dataset = loadDatasetFromJsonFile()

    try:
        rows = dataset.get("rows", [])  # Try to get "rows" from the dataset
        if rows is None:  # Check if the value is explicitly None
            rows = dataset  # Fallback to dataset
    except (AttributeError, TypeError):
        # `dataset` is not a dict-like object (e.g. it is already a list)
        rows = dataset

    # Prepare batch for upsert (do small chunks to avoid rate limits)
    vectors = []

    for i, record in enumerate(rows[:100]):  # limit to 100 for demo
        print(record)
        # Records may be nested under a "row" key (HF-dataset style) or flat.
        row = record.get("row")
        text = row.get("text") if isinstance(row, dict) else record.get("text")
        title = row.get("title") if isinstance(row, dict) else record.get("title")
        rec_id = record.get("id")  # renamed from `id` to avoid shadowing the builtin
        embedding = get_embedding(text)
        vector_id = rec_id if rec_id else i
        document_name_retrieved = document_name if document_name else record.get("document_name")
        vectors.append((
            str(vector_id),  # unique ID
            embedding,
            {"title": title, "text": text, "id": rec_id, "document_name": document_name_retrieved}  # metadata
        ))

        # Upsert in batches of 20
        if (i + 1) % 20 == 0:
            index.upsert(vectors=vectors)
            print(f"Upserted {i + 1} vectors")
            vectors = []

    # Flush remaining
    if vectors:
        index.upsert(vectors=vectors)
        print(f"Upserted final {len(vectors)} vectors")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def searchOrQueryOnVDB(queryEmbedding):
|
|
165
|
+
# To get the unique host for an index,
|
|
166
|
+
# see https://docs.pinecone.io/guides/manage-data/target-an-index
|
|
167
|
+
index = pc.Index(index_name)
|
|
168
|
+
|
|
169
|
+
# Search the dense index
|
|
170
|
+
results = index.query(
|
|
171
|
+
vector=queryEmbedding,
|
|
172
|
+
|
|
173
|
+
top_k=10, # Number of top results to retrieve
|
|
174
|
+
include_metadata=True
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
# Collect top 5 results
|
|
178
|
+
top_5_results = sorted(results.matches, key=lambda result: result.score, reverse=True)[:5]
|
|
179
|
+
|
|
180
|
+
# Initialize an empty list to store the top results
|
|
181
|
+
top_results_list = []
|
|
182
|
+
|
|
183
|
+
# Append the metadata 'text' of each top result to the list
|
|
184
|
+
for result in top_5_results:
|
|
185
|
+
top_results_list.append(result.metadata['text'])
|
|
186
|
+
|
|
187
|
+
# Check if there are any results
|
|
188
|
+
if not top_results_list:
|
|
189
|
+
print("No results found!")
|
|
190
|
+
else:
|
|
191
|
+
# Print out the top 5 results (if any exist)
|
|
192
|
+
print("Top 5 Results:")
|
|
193
|
+
for text in top_results_list:
|
|
194
|
+
print(text)
|
|
195
|
+
|
|
196
|
+
# Optionally, return the top results list
|
|
197
|
+
return top_results_list
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def queryToLLM(query):
|
|
201
|
+
# Define the query
|
|
202
|
+
# query = "the underlying principles of workplace health and safety is to:"
|
|
203
|
+
query_emb = get_embedding(query)
|
|
204
|
+
ragContent = searchOrQueryOnVDB(query_emb)
|
|
205
|
+
system_prompt=(
|
|
206
|
+
"You are an experienced and patient TAFE trainer for a Certificate III in Electrotechnology. "
|
|
207
|
+
"Your primary goal is to guide apprentices to understand their learning materials. "
|
|
208
|
+
"You must **NEVER** give direct answers to questions from assessments (UKT or UST). "
|
|
209
|
+
"Instead, use Socratic questioning to prompt critical thinking and encourage students to find the answers in their provided learning content. "
|
|
210
|
+
"When appropriate, offer hints by referencing specific sections or copy of the section or modules (e.g., 'Review the section on Ohm's Law in Module 2'). "
|
|
211
|
+
"Maintain a helpful, encouraging, and respectful tone. Do not solve problems for the student."
|
|
212
|
+
|
|
213
|
+
)
|
|
214
|
+
openaiwrapper = LLMClientWrapper(client=client, system_prompt=system_prompt)
|
|
215
|
+
openAiResponse = openaiwrapper.chat(ragContent, query)
|
|
216
|
+
return openAiResponse
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# function to
|
|
221
|
+
# upload a particular document
|
|
222
|
+
# takes LLM instruction about how to process/chunk the document
|
|
223
|
+
# prepares chunked json objects
|
|
224
|
+
def upload_and_prepare_file_content_in_chunks(request,instructions):
|
|
225
|
+
try:
|
|
226
|
+
|
|
227
|
+
uploaded_file = uploaded_file_ref(request)
|
|
228
|
+
file_name, file_bytes = extract_file_details(
|
|
229
|
+
uploaded_file)
|
|
230
|
+
chunked_text = chunk_documents(instructions,file_name,file_bytes)
|
|
231
|
+
return chunked_text,file_name
|
|
232
|
+
except Exception as e:
|
|
233
|
+
return JsonResponse({"error": f"An unexpected error occurred: {str(e)}"}, status=500)
|
|
234
|
+
|
|
235
|
+
# function to
|
|
236
|
+
# upload a particular document
|
|
237
|
+
def uploaded_file_ref(request):
    """Return the uploaded file attached to a Django request.

    Returns:
        The ``UploadedFile`` stored under the ``'file'`` key of
        ``request.FILES``. NOTE(review): on a missing file part this returns a
        ``JsonResponse`` error object instead of raising, so callers must
        check the return type before treating it as a file.
    """
    # Check if the request contains a file
    if 'file' not in request.FILES:
        return JsonResponse({"error": "No file part in the request"}, status=400)

    # Retrieve the file from the POST request
    uploaded_file = request.FILES['file']
    return uploaded_file
|
|
245
|
+
|
|
246
|
+
def upload_file(request):
|
|
247
|
+
"""
|
|
248
|
+
Endpoint to handle POST requests containing a JSON file.
|
|
249
|
+
The JSON file is used to load and parse data.
|
|
250
|
+
"""
|
|
251
|
+
try:
|
|
252
|
+
uploaded_file = uploaded_file_ref(request)
|
|
253
|
+
|
|
254
|
+
if uploaded_file.name == '':
|
|
255
|
+
return JsonResponse({"error": "No file selected for uploading"}, status=400)
|
|
256
|
+
|
|
257
|
+
# Read and parse file content as JSON
|
|
258
|
+
data = None
|
|
259
|
+
try:
|
|
260
|
+
file_data = uploaded_file.read().decode('utf-8') # Decode file content
|
|
261
|
+
data = json.loads(file_data)
|
|
262
|
+
return data # Parse content as JSON
|
|
263
|
+
except json.JSONDecodeError as e:
|
|
264
|
+
return JsonResponse({"error": f"Failed to parse JSON: {e}"}, status=400)
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
except Exception as e:
|
|
268
|
+
return JsonResponse({"error": f"An unexpected error occurred: {str(e)}"}, status=500)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
@csrf_exempt
|
|
272
|
+
@require_http_methods(["POST"])
|
|
273
|
+
def fetch_records_grouped_by_document_name(request):
|
|
274
|
+
index = pc.Index(index_name)
|
|
275
|
+
records_by_doc = qfetch_records_grouped_by_document_name(index)
|
|
276
|
+
print(records_by_doc.keys())
|
|
277
|
+
return JsonResponse(records_by_doc, safe=False)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
@csrf_exempt
|
|
282
|
+
@require_http_methods(["POST"])
|
|
283
|
+
def update_records_grouped_by_document_name_in_vdb(request):
|
|
284
|
+
dataset=upload_file(request)
|
|
285
|
+
index_n = request.POST.get("index_name")
|
|
286
|
+
transformed_records = transform_groupd_by_doc_name_to_vdb_metadata_structure(dataset)
|
|
287
|
+
upsertRecord(index_n,transformed_records)
|
|
288
|
+
return JsonResponse(transformed_records, safe=False)
|
|
289
|
+
|
|
290
|
+
##queries and update records within vdb
|
|
291
|
+
|
|
292
|
+
def transform_groupd_by_doc_name_to_vdb_metadata_structure(input_json):
    """Flatten a ``{document_name: [entries]}`` mapping into a list of
    metadata records, stamping each entry with its document name.

    Each output dict carries ``text``, ``title``, ``id`` and
    ``document_name`` keys, ready for vector-DB metadata upserts.
    """
    return [
        {
            "text": entry["text"],
            "title": entry["title"],
            "id": entry["id"],
            "document_name": doc_name,
        }
        for doc_name, entries in input_json.items()
        for entry in entries
    ]
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
|
|
314
|
+
"""
|
|
315
|
+
Fetch all records from Pinecone grouped by document_name in metadata,
|
|
316
|
+
processing 100 unique document_name values at a time.
|
|
317
|
+
|
|
318
|
+
Args:
|
|
319
|
+
index: Pinecone Index object
|
|
320
|
+
batch_size: Number of unique document_name to process per batch
|
|
321
|
+
Returns:
|
|
322
|
+
dict: {document_name: [records]}
|
|
323
|
+
"""
|
|
324
|
+
if limit <= 0 or limit > 100:
|
|
325
|
+
raise ValueError("The `limit` parameter must be between 1 and 100.")
|
|
326
|
+
# Step 1: Get all IDs with metadata
|
|
327
|
+
all_ids = []
|
|
328
|
+
next_token = None
|
|
329
|
+
consecutive_none_count=0
|
|
330
|
+
while True:
|
|
331
|
+
res = list(index.list(limit=100, next_token=next_token) if next_token else index.list(limit=limit))
|
|
332
|
+
if not res:
|
|
333
|
+
break # Exit loop if there are no more results
|
|
334
|
+
|
|
335
|
+
all_ids.extend(chain.from_iterable(res))
|
|
336
|
+
|
|
337
|
+
# Update `next_token` for pagination
|
|
338
|
+
next_token = res[-1].get('next_token') if isinstance(res[-1], dict) and 'next_token' in res[-1] else None
|
|
339
|
+
|
|
340
|
+
# Handle cases when `next_token` is None
|
|
341
|
+
if not next_token:
|
|
342
|
+
consecutive_none_count += 1
|
|
343
|
+
else:
|
|
344
|
+
consecutive_none_count = 0 # Reset counter if next_token is valid again
|
|
345
|
+
|
|
346
|
+
# Break loop if `next_token` is None more than once consecutively
|
|
347
|
+
if consecutive_none_count > 1: # Adjust count based on logic
|
|
348
|
+
break
|
|
349
|
+
# Step 2: Fetch metadata in batches and collect document_names
|
|
350
|
+
id_to_docname = {}
|
|
351
|
+
for i in range(0, len(all_ids), batch_size):
|
|
352
|
+
batch_ids = all_ids[i:i + batch_size]
|
|
353
|
+
res = index.fetch(ids=batch_ids)
|
|
354
|
+
vectors = res.vectors
|
|
355
|
+
for vector in vectors.items():
|
|
356
|
+
metadata = vector[1].metadata if vector[1] and vector[1].metadata else None
|
|
357
|
+
doc_name = metadata['document_name'] if metadata and 'document_name' in metadata else None
|
|
358
|
+
id = metadata['id'] if metadata and 'id' in metadata else None
|
|
359
|
+
if doc_name and id:
|
|
360
|
+
id_to_docname[id] = doc_name
|
|
361
|
+
|
|
362
|
+
# Step 3: Group records by document_name in batches of 100 unique names
|
|
363
|
+
grouped_records = defaultdict(list)
|
|
364
|
+
unique_docnames = list(set(id_to_docname.values()))
|
|
365
|
+
|
|
366
|
+
for i in range(0, len(unique_docnames), 100):
|
|
367
|
+
batch_docnames = unique_docnames[i:i + 100]
|
|
368
|
+
# Get all IDs for these document_names
|
|
369
|
+
batch_ids = [rid for rid, dname in id_to_docname.items() if dname in batch_docnames]
|
|
370
|
+
# Fetch full records
|
|
371
|
+
res = index.fetch(ids=batch_ids)
|
|
372
|
+
vectors = res.vectors
|
|
373
|
+
for vector in vectors.items():
|
|
374
|
+
metadata = vector[1].metadata if vector[1] and vector[1].metadata else None
|
|
375
|
+
id = metadata['id'] if metadata and 'id' in metadata else None
|
|
376
|
+
doc_name = id_to_docname[id] if id in id_to_docname else None
|
|
377
|
+
text = metadata['text'] if metadata and 'text' in metadata else None
|
|
378
|
+
title = metadata['title'] if metadata and 'title' in metadata else None
|
|
379
|
+
id = metadata['id'] if metadata and 'id' in metadata else None
|
|
380
|
+
|
|
381
|
+
# Create a dictionary
|
|
382
|
+
json_object = {
|
|
383
|
+
"text": text,
|
|
384
|
+
"title": title,
|
|
385
|
+
"id": id
|
|
386
|
+
}
|
|
387
|
+
grouped_records[doc_name].append(json_object)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
return dict(grouped_records)
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
#function that chunks any document
|
|
395
|
+
def chunk_documents(instructions, file_name, file_path="content_playground/content.json"):
    """Chunk the document at *file_path* according to *instructions*."""
    chunks = prepare_chunked_text(file_path, file_name, instructions)
    return chunks
|
|
397
|
+
|
|
398
|
+
#function that chunks any document as well as inserts into vdb
|
|
399
|
+
def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json"):
    """Chunk a document and upsert the resulting chunks into a Pinecone index.

    Args:
        index_n: Name of the target Pinecone index.
        instructions: LLM instructions describing how to chunk the document.
        file_name: Display name for the document; when falsy, a unique name
            is generated from the file's basename plus a random hex suffix.
        file_path: Path of the document to chunk.

    Returns:
        Tuple of ``(chunked_dataset, document_name)``.
    """
    chunked_dataset = prepare_chunked_text(file_path, file_name, instructions)
    # Fall back to a collision-resistant generated name when none is supplied.
    document_name = file_name if file_name else os.path.basename(file_path) + uuid.uuid4().hex

    upsertRecord(index_n,chunked_dataset,document_name)
    return chunked_dataset, document_name
|
|
405
|
+
|
|
406
|
+
#function that loads existing chunks from vdb by document name
|
|
407
|
+
def fetch_vdb_chunks_grouped_by_document_name(index_n):
    """Return all chunks stored in index *index_n*, grouped by document name."""
    return qfetch_records_grouped_by_document_name(pc.Index(index_n))
|
|
411
|
+
|
|
412
|
+
#function that updates existing chunks
|
|
413
|
+
def update_vdb_chunks_grouped_by_document_name(index_n, dataset):
    """Re-upsert chunks (grouped by document name) into index *index_n*.

    Args:
        index_n: Name of the Pinecone index to update.
        dataset: ``{document_name: [entries]}`` mapping of chunk records.

    Returns:
        The flat list of transformed metadata records that was upserted.
    """
    # Dropped the unused `index = pc.Index(index_n)` local from the original;
    # upsertRecord opens its own handle to the index.
    transformed_records = transform_groupd_by_doc_name_to_vdb_metadata_structure(dataset)
    upsertRecord(index_n, transformed_records)
    return transformed_records
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Django settings for djproj project.
|
|
3
|
+
|
|
4
|
+
Generated by 'django-admin startproject' using Django 5.1.
|
|
5
|
+
|
|
6
|
+
For more information on this file, see
|
|
7
|
+
https://docs.djangoproject.com/en/5.1/topics/settings/
|
|
8
|
+
|
|
9
|
+
For the full list of settings and their values, see
|
|
10
|
+
https://docs.djangoproject.com/en/5.1/ref/settings/
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from corsheaders.defaults import default_headers
|
|
15
|
+
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
|
16
|
+
BASE_DIR = Path(__file__).resolve().parent.parent
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Quick-start development settings - unsuitable for production
|
|
20
|
+
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
|
21
|
+
|
|
22
|
+
# SECURITY WARNING: keep the secret key used in production secret!
|
|
23
|
+
SECRET_KEY = 'django-insecure-xwxywn=k$51m=o4ahmlybvaekxp@j-z!qpoyog-fh7)20u8+ps'
|
|
24
|
+
CORS_ALLOW_ALL_ORIGINS = True
|
|
25
|
+
CSRF_COOKIE_SECURE = False
|
|
26
|
+
SESSION_COOKIE_SECURE = False
|
|
27
|
+
CORS_ALLOW_CREDENTIALS = True
|
|
28
|
+
# SECURITY WARNING: don't run with debug turned on in production!
|
|
29
|
+
DEBUG = True
|
|
30
|
+
|
|
31
|
+
CORS_ALLOW_HEADERS = list(default_headers) + [
|
|
32
|
+
'x-csrf-token',
|
|
33
|
+
]
|
|
34
|
+
CSRF_TRUSTED_ORIGINS = [
|
|
35
|
+
'http://localhost:3000',
|
|
36
|
+
'http://127.0.0.1:3000',
|
|
37
|
+
'http://3.106.90.99:3000',
|
|
38
|
+
'http://3.106.90.99:80',
|
|
39
|
+
'http://3.106.90.99',
|
|
40
|
+
'http://effiez.com.au',
|
|
41
|
+
'http://effiez.com.au:80',
|
|
42
|
+
'http://www.effiez.com.au',
|
|
43
|
+
'http://www.effiez.com.au:80',
|
|
44
|
+
'https://effiez.com.au',
|
|
45
|
+
'https://py.effiez.com.au',
|
|
46
|
+
'https://effiez.com.au:80',
|
|
47
|
+
'https://www.effiez.com.au',
|
|
48
|
+
'https://www.effiez.com.au:80',
|
|
49
|
+
|
|
50
|
+
'http://*'
|
|
51
|
+
]
|
|
52
|
+
ALLOWED_HOSTS = ['*']
|
|
53
|
+
CORS_ORIGIN_ALLOW_ALL = True
|
|
54
|
+
CORS_ALLOWED_ORIGINS = [
|
|
55
|
+
'http://localhost:3000',
|
|
56
|
+
'http://127.0.0.1:3000',
|
|
57
|
+
'http://3.106.90.99:3000',
|
|
58
|
+
'http://3.106.90.99:80',
|
|
59
|
+
'http://3.106.90.99',
|
|
60
|
+
'http://effiez.com.au',
|
|
61
|
+
'http://effiez.com.au:80',
|
|
62
|
+
'http://www.effiez.com.au',
|
|
63
|
+
'http://www.effiez.com.au:80',
|
|
64
|
+
'https://effiez.com.au',
|
|
65
|
+
'https://effiez.com.au:80',
|
|
66
|
+
'https://www.effiez.com.au',
|
|
67
|
+
'https://www.effiez.com.au:80',
|
|
68
|
+
'https://py.effiez.com.au',
|
|
69
|
+
'http://*'
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
# Application definition
|
|
73
|
+
|
|
74
|
+
INSTALLED_APPS = [
|
|
75
|
+
'django.contrib.admin',
|
|
76
|
+
'django.contrib.auth',
|
|
77
|
+
'django.contrib.contenttypes',
|
|
78
|
+
'django.contrib.sessions',
|
|
79
|
+
'django.contrib.messages',
|
|
80
|
+
'django.contrib.staticfiles',
|
|
81
|
+
#'reimburse.apps.ReimburseConfig',
|
|
82
|
+
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
MIDDLEWARE = [
|
|
86
|
+
'django.middleware.security.SecurityMiddleware',
|
|
87
|
+
'django.contrib.sessions.middleware.SessionMiddleware',
|
|
88
|
+
'django.middleware.common.CommonMiddleware',
|
|
89
|
+
'django.middleware.csrf.CsrfViewMiddleware',
|
|
90
|
+
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
|
91
|
+
'django.contrib.messages.middleware.MessageMiddleware',
|
|
92
|
+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
|
93
|
+
'corsheaders.middleware.CorsMiddleware',
|
|
94
|
+
'django.middleware.security.SecurityMiddleware',
|
|
95
|
+
'django.contrib.sessions.middleware.SessionMiddleware',
|
|
96
|
+
'django.middleware.common.CommonMiddleware',
|
|
97
|
+
'django.middleware.csrf.CsrfViewMiddleware',
|
|
98
|
+
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
|
99
|
+
'django.contrib.messages.middleware.MessageMiddleware',
|
|
100
|
+
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
ROOT_URLCONF = 'PreVectorChunks-Web.urls'
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
TEMPLATES = [
|
|
107
|
+
{
|
|
108
|
+
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
|
109
|
+
'DIRS': [BASE_DIR / 'templates']
|
|
110
|
+
,
|
|
111
|
+
'APP_DIRS': True,
|
|
112
|
+
'OPTIONS': {
|
|
113
|
+
'context_processors': [
|
|
114
|
+
'django.template.context_processors.debug',
|
|
115
|
+
'django.template.context_processors.request',
|
|
116
|
+
'django.contrib.auth.context_processors.auth',
|
|
117
|
+
'django.contrib.messages.context_processors.messages',
|
|
118
|
+
],
|
|
119
|
+
},
|
|
120
|
+
},
|
|
121
|
+
]
|
|
122
|
+
|
|
123
|
+
WSGI_APPLICATION = 'PreVectorChunks.wsgi.application'
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Database
|
|
127
|
+
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
|
128
|
+
|
|
129
|
+
DATABASES = {
|
|
130
|
+
'default': {
|
|
131
|
+
'ENGINE': 'django.db.backends.sqlite3',
|
|
132
|
+
'NAME': BASE_DIR / 'db.sqlite3',
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# Password validation
|
|
138
|
+
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
|
139
|
+
|
|
140
|
+
AUTH_PASSWORD_VALIDATORS = [
|
|
141
|
+
{
|
|
142
|
+
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
|
149
|
+
},
|
|
150
|
+
{
|
|
151
|
+
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
|
152
|
+
},
|
|
153
|
+
]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Internationalization
|
|
157
|
+
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
|
158
|
+
|
|
159
|
+
LANGUAGE_CODE = 'en-us'
|
|
160
|
+
|
|
161
|
+
TIME_ZONE = 'UTC'
|
|
162
|
+
|
|
163
|
+
USE_I18N = True
|
|
164
|
+
|
|
165
|
+
USE_TZ = True
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Static files (CSS, JavaScript, Images)
|
|
169
|
+
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
|
170
|
+
|
|
171
|
+
STATIC_URL = 'static/'
|
|
172
|
+
|
|
173
|
+
# Default primary key field type
|
|
174
|
+
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
|
175
|
+
|
|
176
|
+
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pytest
|
|
3
|
+
from PreVectorChunks.prevectorchunks_core.services import chunk_documents_crud_vdb
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Create a temporary JSON file to test with
|
|
8
|
+
@pytest.fixture
def temp_json_file(tmp_path):
    """Write a one-record JSON document into pytest's tmp dir and return its path."""
    target = tmp_path / "test.json"
    records = [{"id": 1, "text": "hello world"}]
    with open(target, "w") as handle:
        json.dump(records, handle)
    return target
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
    """Smoke-test chunk_documents against a small temporary JSON document.

    The original test called the chunker but asserted nothing, so it could
    never fail; add a minimal result check.
    """
    dataset = chunk_documents_crud_vdb.chunk_documents(
        "extract", file_name=None, file_path=temp_json_file
    )
    # Assertions: the chunker must produce a result for valid JSON input.
    assert dataset is not None
|
|
23
|
+
|
|
File without changes
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import warnings # Correct module for warnings, including PendingDeprecationWarning
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from docx import Document
|
|
7
|
+
from PyPDF2 import PdfReader
|
|
8
|
+
from PIL import Image
|
|
9
|
+
import pytesseract
|
|
10
|
+
import uuid
|
|
11
|
+
from openai import OpenAI
|
|
12
|
+
from openai import OpenAI
|
|
13
|
+
from .llm_wrapper import LLMClientWrapper # Relative import
|
|
14
|
+
from dotenv import load_dotenv
|
|
15
|
+
import tempfile
|
|
16
|
+
load_dotenv(override=True)
|
|
17
|
+
# Initialize OpenAI client
|
|
18
|
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
19
|
+
from django.core.files.uploadedfile import UploadedFile
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_content_agnostic(file, filename=None):
    """
    Extract text content from a file.

    Supports:
    - PDF (.pdf)
    - Word (.docx)
    - Text (.txt)
    - Images (.png, .jpg, .jpeg, .tiff, .bmp)

    Parameters:
    - file: a file path (str or Path) or bytes (binary content)
    - filename: required if `file` is bytes, to determine extension

    Returns the extracted text, stripped of surrounding whitespace.

    Raises ValueError when bytes are passed without a filename, and
    TypeError for unsupported `file` types.
    """
    # Track whether we wrote a temporary file that must be removed.
    # BUG FIX: the original checked `isinstance(file, UploadedFile)` here,
    # which is never true for this function's inputs (str/Path/bytes), so
    # temp files created for bytes input were leaked on every call.
    cleanup = False

    if isinstance(file, str):
        filepath = file
        ext = os.path.splitext(filepath)[1].lower()
    elif isinstance(file, Path):
        filepath = str(file)
        ext = os.path.splitext(filepath)[1].lower()
    elif isinstance(file, bytes):
        if not filename:
            raise ValueError("filename must be provided if passing binary content")
        ext = os.path.splitext(filename)[1].lower()
        # Write bytes to a temporary file so type-specific loaders can open it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp.write(file)
            filepath = tmp.name
        cleanup = True
    else:
        raise TypeError("file must be a string path or bytes")

    try:
        # Dispatch to the extension-specific loader.
        text = load_file_by_type(ext, filepath)
    finally:
        # Remove the temporary copy even if extraction raised.
        if cleanup:
            try:
                os.remove(filepath)
            except OSError:
                pass

    return text.strip()
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_content(file):
    """
    Extract text content from a file.

    Supports:
    - PDF (.pdf)
    - Word (.docx)
    - Text (.txt)
    - Images (.png, .jpg, .jpeg, .tiff, .bmp)

    file: either a file path (str) or a Django UploadedFile object
          (request.FILES['file'])
    """
    if isinstance(file, UploadedFile):
        ext = os.path.splitext(file.name)[1].lower()

        if hasattr(file, 'temporary_file_path'):
            # Django already spilled this upload to disk; reuse that path.
            filepath = file.temporary_file_path()
        else:
            # In-memory upload: persist it to a named temporary file first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                for part in file.chunks():
                    tmp.write(part)
                filepath = tmp.name
    else:
        # Plain filesystem path.
        filepath = file
        ext = os.path.splitext(filepath)[1].lower()

    text = load_file_by_type(ext, filepath)

    # Delete the temporary copy we created for in-memory uploads only.
    if isinstance(file, UploadedFile) and not hasattr(file, 'temporary_file_path'):
        try:
            os.remove(filepath)
        except Exception:
            pass

    return text.strip()
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def load_file_by_type(ext, filepath):
    """
    Load the text content of *filepath* according to its extension.

    Supported extensions: .pdf, .docx, image formats (.png, .jpg, .jpeg,
    .tiff, .bmp — via OCR), .txt and .json.

    Raises ValueError for any other extension.
    """
    if ext == ".pdf":
        pages = PdfReader(filepath).pages
        return "\n".join(page.extract_text() or "" for page in pages)

    if ext == ".docx":
        return "\n".join(para.text for para in Document(filepath).paragraphs)

    if ext in (".png", ".jpg", ".jpeg", ".tiff", ".bmp"):
        return pytesseract.image_to_string(Image.open(filepath))

    if ext == ".txt":
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read()

    if ext == ".json":
        with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
            loaded = json.load(handle)
        # Re-serialise so downstream chunking always receives plain text.
        return json.dumps(loaded, ensure_ascii=False, indent=2)

    raise ValueError(f"Unsupported file type: {ext}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def split_text_by_words(text, chunk_size=200):
    """Break *text* into pieces of at most *chunk_size* words each."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        pieces.append(" ".join(tokens[start:start + chunk_size]))
    return pieces
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def process_with_llm(chunk, instructions):
    """
    Send a chunk to the LLM and return a structured JSON array.

    Expected format: [{"id": ..., "title": ..., "text": ...}, ...]

    Parameters:
    - chunk: text fragment to be split into titled sections
    - instructions: user instructions forwarded to the LLM
      (falls back to "Extract sections" when falsy)

    Returns a list of section dicts, or [] when the LLM response is not
    a valid JSON array.
    """
    context = f"""
    Take the following text and split it into sections based on the most important category headings (ignore lower level headings).
    For each section, return a JSON object with - no extra words other than the json and remove ```json:
    - "id" (a UUID you generate),
    - "title" (the most important heading),
    - "text" (the remaining text under that heading).

    Text:
    {chunk}
    """
    instructions = instructions or "Extract sections"
    system_prompt = "You are a helpful assistant that structures text into JSON sections."
    # Create an instance of the LLM wrapper and send the prompt.
    llm = LLMClientWrapper(client, model="gpt-4o-mini", temperature=0, system_prompt=system_prompt)
    response = llm.chat(context, instructions)

    # Parse the response safely. SECURITY FIX: the original used eval(),
    # which executes arbitrary code returned by the model; json.loads also
    # correctly handles JSON literals (null/true/false) that eval cannot.
    try:
        structured_data = json.loads(response)
    except (json.JSONDecodeError, TypeError):
        structured_data = []

    # The prompt requests a JSON array; guard against a bare-object reply so
    # callers can always iterate over section dicts.
    if not isinstance(structured_data, list):
        structured_data = []

    return structured_data
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def process_large_text(text, instructions, chunk_size=200):
    """Split *text* into word chunks, structure each via the LLM, and merge results."""
    collected = []
    for piece in split_text_by_words(text, chunk_size):
        sections = process_with_llm(piece, instructions)
        # Guarantee every section carries an id.
        for section in sections:
            section.setdefault("id", str(uuid.uuid4()))
        collected.extend(sections)
    return collected
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def prepare_chunked_text(file_path, file_name, instructions):
    """Extract a document's text and return it as LLM-structured chunks."""
    raw_text = extract_content_agnostic(file_path, file_name)
    chunked = process_large_text(raw_text, instructions, chunk_size=200)
    print(chunked)
    return chunked
|
|
202
|
+
|
|
203
|
+
#this function takes a django file and extracts filename and byte content
|
|
204
|
+
def extract_file_details(uploaded_file):
|
|
205
|
+
# 1. Get the filename
|
|
206
|
+
filename = uploaded_file.name
|
|
207
|
+
|
|
208
|
+
# 2. Get the file content as bytes
|
|
209
|
+
file_bytes = uploaded_file.read() # reads entire file into memory
|
|
210
|
+
|
|
211
|
+
# Now you can call your extract_content function
|
|
212
|
+
return filename, file_bytes
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import openai
|
|
2
|
+
|
|
3
|
+
class LLMClientWrapper:
    """Thin convenience wrapper around an OpenAI chat-completions client."""

    def __init__(self, client, model="gpt-4o-mini", temperature=0, system_prompt=None):
        """
        client: OpenAI client instance
        model: LLM model name
        temperature: randomness/creativity
        system_prompt: default system instructions for the LLM
        """
        self.client = client
        self.model = model
        self.temperature = temperature
        # Fall back to an empty system prompt when none is supplied.
        self.system_prompt = system_prompt if system_prompt else ""

    def chat(self, context, user_query):
        """
        context: content retrieved from RAG or other sources
        user_query: actual question from user
        """
        combined = f"{context}\nQuestion: {user_query}"
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": combined},
        ]
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
        )
        return completion.choices[0].message.content
|
|
34
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
WSGI config for PreVectorChunks project.
|
|
3
|
+
|
|
4
|
+
It exposes the WSGI callable as a module-level variable named ``application``.
|
|
5
|
+
|
|
6
|
+
For more information on this file, see
|
|
7
|
+
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
from django.core.wsgi import get_wsgi_application
|
|
13
|
+
|
|
14
|
+
# Point Django at the project settings module unless the environment
# already overrides DJANGO_SETTINGS_MODULE.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'PreVectorChunks.settings')

application = get_wsgi_application()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prevectorchunks-core
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
|
+
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/yourusername/mydep
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: Django==5.1
|
|
9
|
+
Requires-Dist: packaging~=24.1
|
|
10
|
+
Requires-Dist: requests~=2.32.3
|
|
11
|
+
Requires-Dist: openai~=1.37.1
|
|
12
|
+
Requires-Dist: httpx~=0.27.0
|
|
13
|
+
Requires-Dist: python-dotenv~=1.0.1
|
|
14
|
+
Requires-Dist: django-cors-headers~=4.4.0
|
|
15
|
+
Requires-Dist: PyJWT~=2.7.0
|
|
16
|
+
Requires-Dist: fastapi~=0.112.2
|
|
17
|
+
Requires-Dist: datasets~=4.1.0
|
|
18
|
+
Requires-Dist: pinecone~=7.3.0
|
|
19
|
+
Requires-Dist: pytesseract~=0.3.13
|
|
20
|
+
Requires-Dist: python-docx~=1.2.0
|
|
21
|
+
Requires-Dist: PyPDF2~=3.0.1
|
|
22
|
+
Requires-Dist: pillow~=11.3.0
|
|
23
|
+
|
|
24
|
+
# 📚 PreVectorChunks
|
|
25
|
+
|
|
26
|
+
> A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## ✨ Who Needs This Module?
|
|
31
|
+
Any developer working with:
|
|
32
|
+
- **RAG pipelines**
|
|
33
|
+
- **Vector Databases** (like Pinecone, Weaviate, etc.)
|
|
34
|
+
- **AI applications** requiring **similar content retrieval**
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## 🎯 What Does This Module Do?
|
|
40
|
+
This module helps you:
|
|
41
|
+
- **Chunk documents** into smaller fragments
|
|
42
|
+
- **Insert (upsert) fragments** into a vector database
|
|
43
|
+
- **Fetch & update** existing chunks from a vector database
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## 📦 Installation
|
|
48
|
+
```bash
|
|
49
|
+
pip install prevectorchunks-core
|
|
50
|
+
```
|
|
51
|
+
How to import in a file:
|
|
52
|
+
```python
|
|
53
|
+
from PreVectorChunks.services import chunk_documents_crud_vdb
|
|
54
|
+
|
|
55
|
+
#How to use Pinecone and OpenAI:
|
|
56
|
+
#Use a .env file in your project root to configure API keys:
|
|
57
|
+
|
|
58
|
+
PINECONE_API_KEY=YOUR_API_KEY
|
|
59
|
+
OPENAI_API_KEY=YOUR_API_KEY
|
|
60
|
+
|
|
61
|
+
#how to call relevant functions:
|
|
62
|
+
#Four key functions that you can call are below:
|
|
63
|
+
#function that chunks any document
|
|
64
|
+
chunk_documents(instructions,file_path="content_playground/content.json"):
|
|
65
|
+
#function that chunks any document as well as inserts into vdb - you need an index name inside index_n
|
|
66
|
+
chunk_and_upsert_to_vdb(index_n,instructions,file_path="content_playground/content.json"):
|
|
67
|
+
#function that loads existing chunks from vdb by document name - you need an index name inside index_n
|
|
68
|
+
fetch_vdb_chunks_grouped_by_document_name(index_n):
|
|
69
|
+
#function that updates existing chunks - you need an index name inside index_n
|
|
70
|
+
update_vdb_chunks_grouped_by_document_name(index_n,dataset):
|
|
71
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
prevectorchunks_core/__init__.py
|
|
4
|
+
prevectorchunks_core/admin.py
|
|
5
|
+
prevectorchunks_core/apps.py
|
|
6
|
+
prevectorchunks_core/asgi.py
|
|
7
|
+
prevectorchunks_core/models.py
|
|
8
|
+
prevectorchunks_core/settings.py
|
|
9
|
+
prevectorchunks_core/test_loader.py
|
|
10
|
+
prevectorchunks_core/tests.py
|
|
11
|
+
prevectorchunks_core/wsgi.py
|
|
12
|
+
prevectorchunks_core.egg-info/PKG-INFO
|
|
13
|
+
prevectorchunks_core.egg-info/SOURCES.txt
|
|
14
|
+
prevectorchunks_core.egg-info/dependency_links.txt
|
|
15
|
+
prevectorchunks_core.egg-info/requires.txt
|
|
16
|
+
prevectorchunks_core.egg-info/top_level.txt
|
|
17
|
+
prevectorchunks_core/migrations/__init__.py
|
|
18
|
+
prevectorchunks_core/services/__init__.py
|
|
19
|
+
prevectorchunks_core/services/chunk_documents_crud_vdb.py
|
|
20
|
+
prevectorchunks_core/utils/__init__.py
|
|
21
|
+
prevectorchunks_core/utils/file_loader.py
|
|
22
|
+
prevectorchunks_core/utils/llm_wrapper.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Django==5.1
|
|
2
|
+
packaging~=24.1
|
|
3
|
+
requests~=2.32.3
|
|
4
|
+
openai~=1.37.1
|
|
5
|
+
httpx~=0.27.0
|
|
6
|
+
python-dotenv~=1.0.1
|
|
7
|
+
django-cors-headers~=4.4.0
|
|
8
|
+
PyJWT~=2.7.0
|
|
9
|
+
fastapi~=0.112.2
|
|
10
|
+
datasets~=4.1.0
|
|
11
|
+
pinecone~=7.3.0
|
|
12
|
+
pytesseract~=0.3.13
|
|
13
|
+
python-docx~=1.2.0
|
|
14
|
+
PyPDF2~=3.0.1
|
|
15
|
+
pillow~=11.3.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
prevectorchunks_core
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "prevectorchunks-core"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { file = "LICENSE" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Zul Al-Kabir", email = "zul.developer.2023@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
"Django==5.1",
|
|
17
|
+
"packaging~=24.1",
|
|
18
|
+
"requests~=2.32.3",
|
|
19
|
+
"openai~=1.37.1",
|
|
20
|
+
"httpx~=0.27.0",
|
|
21
|
+
"python-dotenv~=1.0.1",
|
|
22
|
+
"django-cors-headers~=4.4.0",
|
|
23
|
+
"PyJWT~=2.7.0",
|
|
24
|
+
"fastapi~=0.112.2",
|
|
25
|
+
"datasets~=4.1.0",
|
|
26
|
+
"pinecone~=7.3.0",
|
|
27
|
+
"pytesseract~=0.3.13",
|
|
28
|
+
"python-docx~=1.2.0",
|
|
29
|
+
"PyPDF2~=3.0.1",
|
|
30
|
+
"pillow~=11.3.0"
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
include = ["prevectorchunks_core*"]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/yourusername/mydep"
|