prevectorchunks-core 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of prevectorchunks-core might be problematic.

Files changed (24)
  1. prevectorchunks_core-0.1.1/PKG-INFO +71 -0
  2. prevectorchunks_core-0.1.1/README.md +48 -0
  3. prevectorchunks_core-0.1.1/prevectorchunks_core/__init__.py +0 -0
  4. prevectorchunks_core-0.1.1/prevectorchunks_core/admin.py +3 -0
  5. prevectorchunks_core-0.1.1/prevectorchunks_core/apps.py +6 -0
  6. prevectorchunks_core-0.1.1/prevectorchunks_core/asgi.py +16 -0
  7. prevectorchunks_core-0.1.1/prevectorchunks_core/migrations/__init__.py +0 -0
  8. prevectorchunks_core-0.1.1/prevectorchunks_core/models.py +3 -0
  9. prevectorchunks_core-0.1.1/prevectorchunks_core/services/__init__.py +0 -0
  10. prevectorchunks_core-0.1.1/prevectorchunks_core/services/chunk_documents_crud_vdb.py +417 -0
  11. prevectorchunks_core-0.1.1/prevectorchunks_core/settings.py +176 -0
  12. prevectorchunks_core-0.1.1/prevectorchunks_core/test_loader.py +23 -0
  13. prevectorchunks_core-0.1.1/prevectorchunks_core/tests.py +3 -0
  14. prevectorchunks_core-0.1.1/prevectorchunks_core/utils/__init__.py +0 -0
  15. prevectorchunks_core-0.1.1/prevectorchunks_core/utils/file_loader.py +264 -0
  16. prevectorchunks_core-0.1.1/prevectorchunks_core/utils/llm_wrapper.py +34 -0
  17. prevectorchunks_core-0.1.1/prevectorchunks_core/wsgi.py +16 -0
  18. prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/PKG-INFO +71 -0
  19. prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/SOURCES.txt +22 -0
  20. prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/dependency_links.txt +1 -0
  21. prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/requires.txt +15 -0
  22. prevectorchunks_core-0.1.1/prevectorchunks_core.egg-info/top_level.txt +1 -0
  23. prevectorchunks_core-0.1.1/pyproject.toml +37 -0
  24. prevectorchunks_core-0.1.1/setup.cfg +4 -0
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: prevectorchunks-core
+ Version: 0.1.1
+ Summary: A Python module that converts a document into chunks for insertion into a Pinecone vector database
+ Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+ Project-URL: Homepage, https://github.com/yourusername/mydep
+ Description-Content-Type: text/markdown
+ Requires-Dist: Django==5.1
+ Requires-Dist: packaging~=24.1
+ Requires-Dist: requests~=2.32.3
+ Requires-Dist: openai~=1.37.1
+ Requires-Dist: httpx~=0.27.0
+ Requires-Dist: python-dotenv~=1.0.1
+ Requires-Dist: django-cors-headers~=4.4.0
+ Requires-Dist: PyJWT~=2.7.0
+ Requires-Dist: fastapi~=0.112.2
+ Requires-Dist: datasets~=4.1.0
+ Requires-Dist: pinecone~=7.3.0
+ Requires-Dist: pytesseract~=0.3.13
+ Requires-Dist: python-docx~=1.2.0
+ Requires-Dist: PyPDF2~=3.0.1
+ Requires-Dist: pillow~=11.3.0
+
+ # 📚 PreVectorChunks
+
+ > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
+
+ ---
+
+ ## ✨ Who Needs This Module?
+ Any developer working with:
+ - **RAG pipelines**
+ - **Vector databases** (like Pinecone, Weaviate, etc.)
+ - **AI applications** requiring **similar-content retrieval**
+
+ ---
+
+ ## 🎯 What Does This Module Do?
+ This module helps you:
+ - **Chunk documents** into smaller fragments
+ - **Insert (upsert) fragments** into a vector database
+ - **Fetch & update** existing chunks in a vector database
+
+ ---
+
+ ## 📦 Installation
+ ```bash
+ pip install prevectorchunks-core
+ ```
+ How to import in a file:
+ ```python
+ from prevectorchunks_core.services import chunk_documents_crud_vdb
+ ```
+ Pinecone and OpenAI are configured through a `.env` file in your project root:
+ ```
+ PINECONE_API_KEY=YOUR_API_KEY
+ OPENAI_API_KEY=YOUR_API_KEY
+ ```
+ Four key functions you can call:
+ ```python
+ # Chunk any document
+ chunk_documents(instructions, file_name, file_path="content_playground/content.json")
+ # Chunk a document and upsert the chunks into the vector database (index_n is the index name)
+ chunk_and_upsert_to_vdb(index_n, instructions, file_name, file_path="content_playground/content.json")
+ # Load existing chunks from the vector database, grouped by document name (index_n is the index name)
+ fetch_vdb_chunks_grouped_by_document_name(index_n)
+ # Update existing chunks (index_n is the index name)
+ update_vdb_chunks_grouped_by_document_name(index_n, dataset)
+ ```
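Putting the pieces together, a minimal end-to-end usage sketch (assuming the `.env` keys above are set; the index name `my-index` and file `handbook.pdf` are hypothetical, not part of the package):

```python
# Sketch only: chunk a local document and upsert the chunks into Pinecone.
# "my-index" must be an existing index whose dimension matches the embeddings (1536).
from prevectorchunks_core.services import chunk_documents_crud_vdb

chunks, doc_name = chunk_documents_crud_vdb.chunk_and_upsert_to_vdb(
    "my-index",            # index_n: name of an existing Pinecone index
    "Extract sections",    # instructions passed to the LLM chunker
    file_name="handbook.pdf",
    file_path="handbook.pdf",
)
print(f"Upserted {len(chunks)} chunks for document {doc_name}")
```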
@@ -0,0 +1,48 @@
+ # 📚 PreVectorChunks
+
+ > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
+
+ ---
+
+ ## ✨ Who Needs This Module?
+ Any developer working with:
+ - **RAG pipelines**
+ - **Vector databases** (like Pinecone, Weaviate, etc.)
+ - **AI applications** requiring **similar-content retrieval**
+
+ ---
+
+ ## 🎯 What Does This Module Do?
+ This module helps you:
+ - **Chunk documents** into smaller fragments
+ - **Insert (upsert) fragments** into a vector database
+ - **Fetch & update** existing chunks in a vector database
+
+ ---
+
+ ## 📦 Installation
+ ```bash
+ pip install prevectorchunks-core
+ ```
+ How to import in a file:
+ ```python
+ from prevectorchunks_core.services import chunk_documents_crud_vdb
+ ```
+ Pinecone and OpenAI are configured through a `.env` file in your project root:
+ ```
+ PINECONE_API_KEY=YOUR_API_KEY
+ OPENAI_API_KEY=YOUR_API_KEY
+ ```
+ Four key functions you can call:
+ ```python
+ # Chunk any document
+ chunk_documents(instructions, file_name, file_path="content_playground/content.json")
+ # Chunk a document and upsert the chunks into the vector database (index_n is the index name)
+ chunk_and_upsert_to_vdb(index_n, instructions, file_name, file_path="content_playground/content.json")
+ # Load existing chunks from the vector database, grouped by document name (index_n is the index name)
+ fetch_vdb_chunks_grouped_by_document_name(index_n)
+ # Update existing chunks (index_n is the index name)
+ update_vdb_chunks_grouped_by_document_name(index_n, dataset)
+ ```
@@ -0,0 +1,3 @@
+ from django.contrib import admin
+
+ # Register your models here.
@@ -0,0 +1,6 @@
+ from django.apps import AppConfig
+
+
+ class PrevectorchunksCoreConfig(AppConfig):
+     default_auto_field = 'django.db.models.BigAutoField'
+     name = 'prevectorchunks_core'
@@ -0,0 +1,16 @@
+ """
+ ASGI config for PreVectorChunks project.
+
+ It exposes the ASGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
+ """
+
+ import os
+
+ from django.core.asgi import get_asgi_application
+
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'PreVectorChunks.settings')
+
+ application = get_asgi_application()
@@ -0,0 +1,3 @@
+ from django.db import models
+
+ # Create your models here.
@@ -0,0 +1,417 @@
+ import json
+ import os
+ import uuid
+ from collections import defaultdict
+ from itertools import chain
+
+ from django.http import JsonResponse
+ from django.views.decorators.csrf import csrf_exempt
+ from django.views.decorators.http import require_http_methods
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from pinecone import Pinecone, ServerlessSpec
+ from datasets import load_dataset
+
+ from ..utils.file_loader import prepare_chunked_text, extract_file_details
+ from ..utils.llm_wrapper import LLMClientWrapper
+
+ load_dotenv(override=True)
+ index_name = "dl-doc-search"
+ EMBED_DIM = 1536
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+
+ class openAIWrapperLMSContent:
+     def __init__(self, openai_client):
+         self.openai_client = openai_client
+
+     def chatWithOpenAI(self, ragcontent, actualQuery):
+         query = f"""
+         {ragcontent}
+         Question: {actualQuery}
+         """
+
+         response = self.openai_client.chat.completions.create(
+             messages=[
+                 {'role': 'system',
+                  'content': (
+                      "You are an experienced and patient TAFE trainer for a Certificate III in Electrotechnology. "
+                      "Your primary goal is to guide apprentices to understand their learning materials. "
+                      "You must **NEVER** give direct answers to questions from assessments (UKT or UST). "
+                      "Instead, use Socratic questioning to prompt critical thinking and encourage students to find the answers in their provided learning content. "
+                      "When appropriate, offer hints by referencing specific sections or modules (e.g., 'Review the section on Ohm's Law in Module 2'). "
+                      "Maintain a helpful, encouraging, and respectful tone. Do not solve problems for the student. "
+                      "{}"
+                  ).format(query)}
+             ],
+             model="gpt-4o-mini",
+             temperature=0,
+         )
+
+         return response.choices[0].message.content
+
+
+ # Load the demo dataset from the Hugging Face hub
+ def loadDataset():
+     dataset = load_dataset("pinecone/dl-doc-search", split="train")
+     return dataset
+
+
+ def loadDatasetFromJsonFile(file_path="PreVectorChunks/content_playground/content.json"):
+     # Load the contents of the JSON file into a Python variable
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             data = json.load(file)
+         return data
+     except FileNotFoundError:
+         print(f"Error: The file at {file_path} was not found.")
+     except json.JSONDecodeError as e:
+         print(f"Error decoding JSON: {e}")
+     return None
+
+
+ # Create an embedding for a piece of text with OpenAI
+ def get_embedding(text, model="text-embedding-3-small"):
+     response = client.embeddings.create(
+         input=text,
+         model=model
+     )
+     return response.data[0].embedding
+
+
+ def createIndexForPineCone():
+     # Connect to Pinecone (make sure PINECONE_API_KEY is set in the environment)
+     spec = ServerlessSpec(
+         cloud="aws",        # replace with your cloud provider ('aws', 'gcp', 'azure')
+         region="us-east-1"  # replace with your region ('us-west-2', etc.)
+     )
+
+     # Create the index only if it does not already exist
+     if index_name not in [index.name for index in pc.list_indexes()]:
+         pc.create_index(
+             name=index_name,
+             dimension=EMBED_DIM,  # dimensionality of the embedding vectors
+             spec=spec
+         )
+
+     index = pc.Index(index_name)
+     return index
+
+
+ def upsertRecord(index_n, dataset, document_name=None):
+     # To get the unique host for an index,
+     # see https://docs.pinecone.io/guides/manage-data/target-an-index
+     index = pc.Index(index_n)
+
+     # Upsert records into the index:
+     # `text` fields are converted to dense vectors, the other fields are stored as metadata.
+     if dataset is None:
+         dataset = loadDatasetFromJsonFile()
+
+     try:
+         rows = dataset.get("rows")  # dict-shaped datasets keep records under "rows"
+         if rows is None:
+             rows = dataset  # fall back to treating the dataset itself as the record list
+     except (AttributeError, TypeError):
+         # `dataset` is not a dictionary (e.g. a plain list), so use it directly
+         rows = dataset
+
+     # Prepare batches for upsert (small chunks avoid rate limits)
+     vectors = []
+
+     for i, record in enumerate(rows[:100]):  # limit to 100 for demo
+         # Use `record["row"]["text"]` if "row" exists and is a dictionary
+         text = record["row"].get("text") if isinstance(record.get("row"), dict) else record.get("text")
+         title = record["row"].get("title") if isinstance(record.get("row"), dict) else record.get("title")
+         record_id = record.get("id")
+         embedding = get_embedding(text)
+         value = record_id if record_id else i
+         document_name_retrieved = document_name if document_name else record.get("document_name")
+         vectors.append((
+             str(value),  # unique ID
+             embedding,
+             {"title": title, "text": text, "id": record_id, "document_name": document_name_retrieved}  # metadata
+         ))
+
+         # Upsert in batches of 20
+         if (i + 1) % 20 == 0:
+             index.upsert(vectors=vectors)
+             print(f"Upserted {i + 1} vectors")
+             vectors = []
+
+     # Flush the remainder
+     if vectors:
+         index.upsert(vectors=vectors)
+         print(f"Upserted final {len(vectors)} vectors")
+
+
+ def searchOrQueryOnVDB(queryEmbedding):
+     # To get the unique host for an index,
+     # see https://docs.pinecone.io/guides/manage-data/target-an-index
+     index = pc.Index(index_name)
+
+     # Search the dense index
+     results = index.query(
+         vector=queryEmbedding,
+         top_k=10,  # number of top results to retrieve
+         include_metadata=True
+     )
+
+     # Keep the five highest-scoring matches
+     top_5_results = sorted(results.matches, key=lambda result: result.score, reverse=True)[:5]
+
+     # Collect the metadata 'text' of each top result
+     top_results_list = [result.metadata['text'] for result in top_5_results]
+
+     if not top_results_list:
+         print("No results found!")
+     else:
+         print("Top 5 Results:")
+         for text in top_results_list:
+             print(text)
+
+     return top_results_list
+
+
+ def queryToLLM(query):
+     # Example query: "the underlying principles of workplace health and safety are to:"
+     query_emb = get_embedding(query)
+     ragContent = searchOrQueryOnVDB(query_emb)
+     system_prompt = (
+         "You are an experienced and patient TAFE trainer for a Certificate III in Electrotechnology. "
+         "Your primary goal is to guide apprentices to understand their learning materials. "
+         "You must **NEVER** give direct answers to questions from assessments (UKT or UST). "
+         "Instead, use Socratic questioning to prompt critical thinking and encourage students to find the answers in their provided learning content. "
+         "When appropriate, offer hints by referencing specific sections or modules (e.g., 'Review the section on Ohm's Law in Module 2'). "
+         "Maintain a helpful, encouraging, and respectful tone. Do not solve problems for the student."
+     )
+     openaiwrapper = LLMClientWrapper(client=client, system_prompt=system_prompt)
+     openAiResponse = openaiwrapper.chat(ragContent, query)
+     return openAiResponse
+
+
+ # Uploads a document, takes LLM instructions about how to process/chunk it,
+ # and prepares chunked JSON objects
+ def upload_and_prepare_file_content_in_chunks(request, instructions):
+     try:
+         uploaded_file = uploaded_file_ref(request)
+         file_name, file_bytes = extract_file_details(uploaded_file)
+         chunked_text = chunk_documents(instructions, file_name, file_bytes)
+         return chunked_text, file_name
+     except Exception as e:
+         return JsonResponse({"error": f"An unexpected error occurred: {str(e)}"}, status=500)
+
+
+ # Retrieves the uploaded file from a POST request
+ def uploaded_file_ref(request):
+     # Check that the request contains a file
+     if 'file' not in request.FILES:
+         return JsonResponse({"error": "No file part in the request"}, status=400)
+
+     uploaded_file = request.FILES['file']
+     return uploaded_file
+
+
+ def upload_file(request):
+     """
+     Endpoint to handle POST requests containing a JSON file.
+     The JSON file is used to load and parse data.
+     """
+     try:
+         uploaded_file = uploaded_file_ref(request)
+
+         if uploaded_file.name == '':
+             return JsonResponse({"error": "No file selected for uploading"}, status=400)
+
+         # Read and parse the file content as JSON
+         try:
+             file_data = uploaded_file.read().decode('utf-8')  # decode file content
+             return json.loads(file_data)  # parse content as JSON
+         except json.JSONDecodeError as e:
+             return JsonResponse({"error": f"Failed to parse JSON: {e}"}, status=400)
+
+     except Exception as e:
+         return JsonResponse({"error": f"An unexpected error occurred: {str(e)}"}, status=500)
+
+
+ @csrf_exempt
+ @require_http_methods(["POST"])
+ def fetch_records_grouped_by_document_name(request):
+     index = pc.Index(index_name)
+     records_by_doc = qfetch_records_grouped_by_document_name(index)
+     return JsonResponse(records_by_doc, safe=False)
+
+
+ @csrf_exempt
+ @require_http_methods(["POST"])
+ def update_records_grouped_by_document_name_in_vdb(request):
+     dataset = upload_file(request)
+     index_n = request.POST.get("index_name")
+     transformed_records = transform_grouped_by_doc_name_to_vdb_metadata_structure(dataset)
+     upsertRecord(index_n, transformed_records)
+     return JsonResponse(transformed_records, safe=False)
+
+
+ # Queries and updates records within the vector database
+
+ def transform_grouped_by_doc_name_to_vdb_metadata_structure(input_json):
+     # Flatten {document_name: [entries]} into a list of metadata records
+     output_list = []
+
+     for document_name, entries in input_json.items():
+         for entry in entries:
+             # Add the document name to each item and append to the new list
+             output_list.append({
+                 "text": entry["text"],
+                 "title": entry["title"],
+                 "id": entry["id"],
+                 "document_name": document_name
+             })
+
+     return output_list
+
+
+ def qfetch_records_grouped_by_document_name(index, batch_size=100, limit=100):
+     """
+     Fetch all records from Pinecone grouped by the `document_name` metadata field,
+     processing up to 100 unique document_name values at a time.
+
+     Args:
+         index: Pinecone Index object
+         batch_size: number of IDs to fetch per metadata batch
+         limit: page size when listing vector IDs (1-100)
+     Returns:
+         dict: {document_name: [records]}
+     """
+     if limit <= 0 or limit > 100:
+         raise ValueError("The `limit` parameter must be between 1 and 100.")
+
+     # Step 1: collect all vector IDs. `index.list` is a generator that yields
+     # pages of IDs and handles pagination internally.
+     all_ids = list(chain.from_iterable(index.list(limit=limit)))
+
+     # Step 2: fetch metadata in batches and map each record ID to its document_name
+     id_to_docname = {}
+     for i in range(0, len(all_ids), batch_size):
+         batch_ids = all_ids[i:i + batch_size]
+         res = index.fetch(ids=batch_ids)
+         for vector_id, vector in res.vectors.items():
+             metadata = vector.metadata if vector and vector.metadata else None
+             doc_name = metadata.get('document_name') if metadata else None
+             record_id = metadata.get('id') if metadata else None
+             if doc_name and record_id:
+                 id_to_docname[record_id] = doc_name
+
+     # Step 3: group records by document_name in batches of 100 unique names
+     grouped_records = defaultdict(list)
+     unique_docnames = list(set(id_to_docname.values()))
+
+     for i in range(0, len(unique_docnames), 100):
+         batch_docnames = unique_docnames[i:i + 100]
+         # Get all IDs belonging to these document_names
+         batch_ids = [rid for rid, dname in id_to_docname.items() if dname in batch_docnames]
+         # Fetch the full records
+         res = index.fetch(ids=batch_ids)
+         for vector_id, vector in res.vectors.items():
+             metadata = vector.metadata if vector and vector.metadata else None
+             record_id = metadata.get('id') if metadata else None
+             doc_name = id_to_docname.get(record_id)
+             grouped_records[doc_name].append({
+                 "text": metadata.get('text') if metadata else None,
+                 "title": metadata.get('title') if metadata else None,
+                 "id": record_id
+             })
+
+     return dict(grouped_records)
+
+
+ # Chunks any document
+ def chunk_documents(instructions, file_name, file_path="content_playground/content.json"):
+     return prepare_chunked_text(file_path, file_name, instructions)
+
+
+ # Chunks a document and upserts the chunks into the vector database
+ def chunk_and_upsert_to_vdb(index_n, instructions, file_name, file_path="content_playground/content.json"):
+     chunked_dataset = prepare_chunked_text(file_path, file_name, instructions)
+     document_name = file_name if file_name else os.path.basename(file_path) + uuid.uuid4().hex
+
+     upsertRecord(index_n, chunked_dataset, document_name)
+     return chunked_dataset, document_name
+
+
+ # Loads existing chunks from the vector database, grouped by document name
+ def fetch_vdb_chunks_grouped_by_document_name(index_n):
+     index = pc.Index(index_n)
+     records_by_doc = qfetch_records_grouped_by_document_name(index)
+     return records_by_doc
+
+
+ # Updates existing chunks
+ def update_vdb_chunks_grouped_by_document_name(index_n, dataset):
+     index = pc.Index(index_n)
+     transformed_records = transform_grouped_by_doc_name_to_vdb_metadata_structure(dataset)
+     upsertRecord(index_n, transformed_records)
+     return transformed_records
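Taken together, the four public helpers at the bottom of this module support a read-modify-write cycle over the index. A minimal sketch of that cycle, assuming a hypothetical index named `my-index` that already holds chunks carrying `document_name` metadata:

```python
# Sketch of the fetch -> edit -> update cycle; "my-index" is a hypothetical name.
from prevectorchunks_core.services import chunk_documents_crud_vdb as svc

grouped = svc.fetch_vdb_chunks_grouped_by_document_name("my-index")
# grouped looks like: {"handbook.pdf": [{"text": ..., "title": ..., "id": ...}, ...]}

for doc_name, records in grouped.items():
    for record in records:
        record["text"] = record["text"].strip()  # example in-place edit

# Flattens the grouping, re-embeds each chunk, and upserts under the original IDs
svc.update_vdb_chunks_grouped_by_document_name("my-index", grouped)
```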
@@ -0,0 +1,176 @@
+ """
+ Django settings for djproj project.
+
+ Generated by 'django-admin startproject' using Django 5.1.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/5.1/topics/settings/
+
+ For the full list of settings and their values, see
+ https://docs.djangoproject.com/en/5.1/ref/settings/
+ """
+
+ from pathlib import Path
+ from corsheaders.defaults import default_headers
+
+ # Build paths inside the project like this: BASE_DIR / 'subdir'.
+ BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+ # Quick-start development settings - unsuitable for production
+ # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
+
+ # SECURITY WARNING: keep the secret key used in production secret!
+ SECRET_KEY = 'django-insecure-xwxywn=k$51m=o4ahmlybvaekxp@j-z!qpoyog-fh7)20u8+ps'
+
+ CORS_ALLOW_ALL_ORIGINS = True
+ CSRF_COOKIE_SECURE = False
+ SESSION_COOKIE_SECURE = False
+ CORS_ALLOW_CREDENTIALS = True
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = True
+
+ CORS_ALLOW_HEADERS = list(default_headers) + [
+     'x-csrf-token',
+ ]
+ CSRF_TRUSTED_ORIGINS = [
+     'http://localhost:3000',
+     'http://127.0.0.1:3000',
+     'http://3.106.90.99:3000',
+     'http://3.106.90.99:80',
+     'http://3.106.90.99',
+     'http://effiez.com.au',
+     'http://effiez.com.au:80',
+     'http://www.effiez.com.au',
+     'http://www.effiez.com.au:80',
+     'https://effiez.com.au',
+     'https://py.effiez.com.au',
+     'https://effiez.com.au:80',
+     'https://www.effiez.com.au',
+     'https://www.effiez.com.au:80',
+     'http://*',
+ ]
+ ALLOWED_HOSTS = ['*']
+ CORS_ORIGIN_ALLOW_ALL = True
+ CORS_ALLOWED_ORIGINS = [
+     'http://localhost:3000',
+     'http://127.0.0.1:3000',
+     'http://3.106.90.99:3000',
+     'http://3.106.90.99:80',
+     'http://3.106.90.99',
+     'http://effiez.com.au',
+     'http://effiez.com.au:80',
+     'http://www.effiez.com.au',
+     'http://www.effiez.com.au:80',
+     'https://effiez.com.au',
+     'https://effiez.com.au:80',
+     'https://www.effiez.com.au',
+     'https://www.effiez.com.au:80',
+     'https://py.effiez.com.au',
+     'http://*',
+ ]
+
+ # Application definition
+
+ INSTALLED_APPS = [
+     'django.contrib.admin',
+     'django.contrib.auth',
+     'django.contrib.contenttypes',
+     'django.contrib.sessions',
+     'django.contrib.messages',
+     'django.contrib.staticfiles',
+     # 'reimburse.apps.ReimburseConfig',
+ ]
+
+ # CorsMiddleware sits above CommonMiddleware, as django-cors-headers requires;
+ # the stack is listed once (the duplicate entries have been removed).
+ MIDDLEWARE = [
+     'corsheaders.middleware.CorsMiddleware',
+     'django.middleware.security.SecurityMiddleware',
+     'django.contrib.sessions.middleware.SessionMiddleware',
+     'django.middleware.common.CommonMiddleware',
+     'django.middleware.csrf.CsrfViewMiddleware',
+     'django.contrib.auth.middleware.AuthenticationMiddleware',
+     'django.contrib.messages.middleware.MessageMiddleware',
+     'django.middleware.clickjacking.XFrameOptionsMiddleware',
+ ]
+
+ ROOT_URLCONF = 'PreVectorChunks-Web.urls'
+
+
+ TEMPLATES = [
+     {
+         'BACKEND': 'django.template.backends.django.DjangoTemplates',
+         'DIRS': [BASE_DIR / 'templates'],
+         'APP_DIRS': True,
+         'OPTIONS': {
+             'context_processors': [
+                 'django.template.context_processors.debug',
+                 'django.template.context_processors.request',
+                 'django.contrib.auth.context_processors.auth',
+                 'django.contrib.messages.context_processors.messages',
+             ],
+         },
+     },
+ ]
+
+ WSGI_APPLICATION = 'PreVectorChunks.wsgi.application'
+
+
+ # Database
+ # https://docs.djangoproject.com/en/5.1/ref/settings/#databases
+
+ DATABASES = {
+     'default': {
+         'ENGINE': 'django.db.backends.sqlite3',
+         'NAME': BASE_DIR / 'db.sqlite3',
+     }
+ }
+
+
+ # Password validation
+ # https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
+
+ AUTH_PASSWORD_VALIDATORS = [
+     {
+         'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+     },
+     {
+         'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+     },
+ ]
+
+
+ # Internationalization
+ # https://docs.djangoproject.com/en/5.1/topics/i18n/
+
+ LANGUAGE_CODE = 'en-us'
+
+ TIME_ZONE = 'UTC'
+
+ USE_I18N = True
+
+ USE_TZ = True
+
+
+ # Static files (CSS, JavaScript, Images)
+ # https://docs.djangoproject.com/en/5.1/howto/static-files/
+
+ STATIC_URL = 'static/'
+
+ # Default primary key field type
+ # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
+
+ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
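A hedged hardening sketch for these development defaults: source the secret key and debug flag from the environment rather than hard-coding them. The variable names below are assumptions, not part of this package:

```python
# Sketch only: read secrets from the environment instead of hard-coding them.
# DJANGO_SECRET_KEY and DJANGO_DEBUG are hypothetical variable names.
import os

SECRET_KEY = os.getenv('DJANGO_SECRET_KEY', 'django-insecure-dev-only-key')
DEBUG = os.getenv('DJANGO_DEBUG', 'false').lower() == 'true'
```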
@@ -0,0 +1,23 @@
+ import json
+ import pytest
+ from prevectorchunks_core.services import chunk_documents_crud_vdb
+
+
+ # Create a temporary JSON file to test with
+ @pytest.fixture
+ def temp_json_file(tmp_path):
+     file_path = tmp_path / "test.json"
+     content = [{"id": 1, "text": "hello world"}]
+     with open(file_path, "w") as f:
+         json.dump(content, f)
+     return file_path
+
+
+ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
+     dataset = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path=temp_json_file)
+     # Assertions
+     assert dataset is not None
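A test like this runs under pytest in the usual way, e.g.:

```bash
pytest prevectorchunks_core/test_loader.py -q
```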
@@ -0,0 +1,3 @@
+ from django.test import TestCase
+
+ # Create your tests here.
@@ -0,0 +1,264 @@
+ import json
+ import os
+ import tempfile
+ import uuid
+ from pathlib import Path
+
+ from docx import Document
+ from PyPDF2 import PdfReader
+ from PIL import Image
+ import pytesseract
+ from openai import OpenAI
+ from dotenv import load_dotenv
+ from django.core.files.uploadedfile import UploadedFile
+
+ from .llm_wrapper import LLMClientWrapper  # relative import
+
+ load_dotenv(override=True)
+ # Initialize the OpenAI client
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+
+ def extract_content_agnostic(file, filename=None):
+     """
+     Extract text content from a file.
+
+     Supports:
+     - PDF (.pdf)
+     - Word (.docx)
+     - Text (.txt)
+     - JSON (.json)
+     - Images (.png, .jpg, .jpeg, .tiff, .bmp)
+
+     Parameters:
+     - file: either a file path (str or Path) or bytes (binary content)
+     - filename: required if `file` is bytes, to determine the extension
+     """
+     # Determine whether `file` is a path or binary content
+     if isinstance(file, str):
+         filepath = file
+         ext = os.path.splitext(filepath)[1].lower()
+     elif isinstance(file, Path):
+         filepath = str(file)
+         ext = os.path.splitext(filepath)[1].lower()
+     elif isinstance(file, bytes):
+         if not filename:
+             raise ValueError("filename must be provided if passing binary content")
+         ext = os.path.splitext(filename)[1].lower()
+         # Write the bytes to a temporary file
+         with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+             tmp.write(file)
+             filepath = tmp.name
+     else:
+         raise TypeError("file must be a string path, Path, or bytes")
+
+     # At this point, `filepath` is a valid file path on disk
+     text = load_file_by_type(ext, filepath)
+
+     # If we created a temporary file for binary input, delete it
+     if isinstance(file, bytes):
+         try:
+             os.remove(filepath)
+         except Exception:
+             pass
+
+     return text.strip()
+
+
+ def extract_content(file):
+     """
+     Extract text content from a file.
+
+     Supports:
+     - PDF (.pdf)
+     - Word (.docx)
+     - Text (.txt)
+     - JSON (.json)
+     - Images (.png, .jpg, .jpeg, .tiff, .bmp)
+
+     file: either a file path (str) or a Django UploadedFile object (request.FILES['file'])
+     """
+     # Determine whether the input is a file path or an UploadedFile
+     if isinstance(file, UploadedFile):
+         filename = file.name
+         ext = os.path.splitext(filename)[1].lower()
+
+         # Check whether the file is already on disk
+         if hasattr(file, 'temporary_file_path'):
+             filepath = file.temporary_file_path()
+         else:
+             # Save the in-memory file to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
+                 for chunk in file.chunks():
+                     tmp.write(chunk)
+                 filepath = tmp.name
+     else:
+         # It's a file path
+         filepath = file
+         ext = os.path.splitext(filepath)[1].lower()
+
+     text = load_file_by_type(ext, filepath)
+
+     # If we created a temporary file, delete it
+     if isinstance(file, UploadedFile) and not hasattr(file, 'temporary_file_path'):
+         try:
+             os.remove(filepath)
+         except Exception:
+             pass
+
+     return text.strip()
+
+
+ def load_file_by_type(ext, filepath):
+     text = ""
+     if ext == ".pdf":
+         reader = PdfReader(filepath)
+         text = "\n".join([p.extract_text() or "" for p in reader.pages])
+
+     elif ext == ".docx":
+         doc = Document(filepath)
+         text = "\n".join([p.text for p in doc.paragraphs])
+
+     elif ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]:
+         img = Image.open(filepath)
+         text = pytesseract.image_to_string(img)
+
+     elif ext == ".txt":
+         with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+             text = f.read()
+     elif ext == ".json":
+         with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+             data = json.load(f)
+         # Convert the JSON to text (pretty-printed)
+         text = json.dumps(data, ensure_ascii=False, indent=2)
+     else:
+         raise ValueError(f"Unsupported file type: {ext}")
+     return text
+
+
+ def split_text_by_words(text, chunk_size=200):
+     """Split text into chunks of N words."""
+     words = text.split()
+     return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+
+ def process_with_llm(chunk, instructions):
+     """
+     Send a chunk to the LLM and return a structured JSON array.
+     Expected format: [{"id": ..., "title": ..., "text": ...}, ...]
+     """
+     context = f"""
+     Take the following text and split it into sections based on the most important category headings (ignore lower-level headings).
+     For each section, return a JSON object with - no extra words other than the JSON, and remove ```json fences:
+     - "id" (a UUID you generate),
+     - "title" (the most important heading),
+     - "text" (the remaining text under that heading).
+
+     Text:
+     {chunk}
+     """
+     instructions = instructions or "Extract sections"
+     system_prompt = "You are a helpful assistant that structures text into JSON sections."
+     # Create an instance of the LLM wrapper
+     llm = LLMClientWrapper(client, model="gpt-4o-mini", temperature=0, system_prompt=system_prompt)
+     response = llm.chat(context, instructions)
+
+     # Parse the JSON safely (never eval model output)
+     try:
+         structured_data = json.loads(response)
+     except (json.JSONDecodeError, TypeError):
+         structured_data = []
+
+     return structured_data
+
+
+ def process_large_text(text, instructions, chunk_size=200):
+     """Main function: split -> send to LLM -> collect results."""
+     chunks = split_text_by_words(text, chunk_size)
+     all_results = []
+
+     for chunk in chunks:
+         structured = process_with_llm(chunk, instructions)
+         # Ensure UUIDs exist
+         for obj in structured:
+             if "id" not in obj:
+                 obj["id"] = str(uuid.uuid4())
+         all_results.extend(structured)
+
+     return all_results
+
+
+ def prepare_chunked_text(file_path, file_name, instructions):
+     content = extract_content_agnostic(file_path, file_name)
+     results = process_large_text(content, instructions, chunk_size=200)
+     return results
+
+
+ # Takes a Django UploadedFile and extracts its filename and byte content
+ def extract_file_details(uploaded_file):
+     # 1. Get the filename
+     filename = uploaded_file.name
+
+     # 2. Get the file content as bytes
+     file_bytes = uploaded_file.read()  # reads the entire file into memory
+
+     # The results can be passed straight to extract_content_agnostic
+     return filename, file_bytes
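The word-window splitter above is pure Python, so its behaviour is easy to sanity-check in isolation; a small sketch:

```python
# Sketch: split_text_by_words produces fixed-size word windows plus a remainder.
text = " ".join(f"w{i}" for i in range(450))
chunks = split_text_by_words(text, chunk_size=200)
assert len(chunks) == 3                  # 200 + 200 + 50 words
assert len(chunks[0].split()) == 200
assert len(chunks[-1].split()) == 50
```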
@@ -0,0 +1,34 @@
+ class LLMClientWrapper:
+     def __init__(self, client, model="gpt-4o-mini", temperature=0, system_prompt=None):
+         """
+         client: OpenAI client instance
+         model: LLM model name
+         temperature: randomness/creativity
+         system_prompt: default system instructions for the LLM
+         """
+         self.client = client
+         self.model = model
+         self.temperature = temperature
+         # Use an empty system prompt if none is provided
+         self.system_prompt = system_prompt or ""
+
+     def chat(self, context, user_query):
+         """
+         context: content retrieved from RAG or other sources
+         user_query: the actual question from the user
+         """
+         full_query = f"{context}\nQuestion: {user_query}"
+
+         response = self.client.chat.completions.create(
+             model=self.model,
+             messages=[
+                 {"role": "system", "content": self.system_prompt},
+                 {"role": "user", "content": full_query}
+             ],
+             temperature=self.temperature
+         )
+
+         return response.choices[0].message.content
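The wrapper is self-contained, so it can also be used outside the chunking pipeline; a minimal sketch (the API-key sourcing is an assumption):

```python
# Sketch: standalone use of LLMClientWrapper.
import os
from openai import OpenAI
from prevectorchunks_core.utils.llm_wrapper import LLMClientWrapper

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
llm = LLMClientWrapper(client, system_prompt="Answer using only the provided context.")
print(llm.chat(context="Ohm's law: V = I * R.", user_query="What does V stand for?"))
```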
@@ -0,0 +1,16 @@
+ """
+ WSGI config for PreVectorChunks project.
+
+ It exposes the WSGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
+ """
+
+ import os
+
+ from django.core.wsgi import get_wsgi_application
+
+ os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'PreVectorChunks.settings')
+
+ application = get_wsgi_application()
@@ -0,0 +1,71 @@
+ Metadata-Version: 2.4
+ Name: prevectorchunks-core
+ Version: 0.1.1
+ Summary: A Python module that converts a document into chunks for insertion into a Pinecone vector database
+ Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+ Project-URL: Homepage, https://github.com/yourusername/mydep
+ Description-Content-Type: text/markdown
+ Requires-Dist: Django==5.1
+ Requires-Dist: packaging~=24.1
+ Requires-Dist: requests~=2.32.3
+ Requires-Dist: openai~=1.37.1
+ Requires-Dist: httpx~=0.27.0
+ Requires-Dist: python-dotenv~=1.0.1
+ Requires-Dist: django-cors-headers~=4.4.0
+ Requires-Dist: PyJWT~=2.7.0
+ Requires-Dist: fastapi~=0.112.2
+ Requires-Dist: datasets~=4.1.0
+ Requires-Dist: pinecone~=7.3.0
+ Requires-Dist: pytesseract~=0.3.13
+ Requires-Dist: python-docx~=1.2.0
+ Requires-Dist: PyPDF2~=3.0.1
+ Requires-Dist: pillow~=11.3.0
+
+ # 📚 PreVectorChunks
+
+ > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
+
+ ---
+
+ ## ✨ Who Needs This Module?
+ Any developer working with:
+ - **RAG pipelines**
+ - **Vector databases** (like Pinecone, Weaviate, etc.)
+ - **AI applications** requiring **similar-content retrieval**
+
+ ---
+
+ ## 🎯 What Does This Module Do?
+ This module helps you:
+ - **Chunk documents** into smaller fragments
+ - **Insert (upsert) fragments** into a vector database
+ - **Fetch & update** existing chunks in a vector database
+
+ ---
+
+ ## 📦 Installation
+ ```bash
+ pip install prevectorchunks-core
+ ```
+ How to import in a file:
+ ```python
+ from prevectorchunks_core.services import chunk_documents_crud_vdb
+ ```
+ Pinecone and OpenAI are configured through a `.env` file in your project root:
+ ```
+ PINECONE_API_KEY=YOUR_API_KEY
+ OPENAI_API_KEY=YOUR_API_KEY
+ ```
+ Four key functions you can call:
+ ```python
+ # Chunk any document
+ chunk_documents(instructions, file_name, file_path="content_playground/content.json")
+ # Chunk a document and upsert the chunks into the vector database (index_n is the index name)
+ chunk_and_upsert_to_vdb(index_n, instructions, file_name, file_path="content_playground/content.json")
+ # Load existing chunks from the vector database, grouped by document name (index_n is the index name)
+ fetch_vdb_chunks_grouped_by_document_name(index_n)
+ # Update existing chunks (index_n is the index name)
+ update_vdb_chunks_grouped_by_document_name(index_n, dataset)
+ ```
@@ -0,0 +1,22 @@
+ README.md
+ pyproject.toml
+ prevectorchunks_core/__init__.py
+ prevectorchunks_core/admin.py
+ prevectorchunks_core/apps.py
+ prevectorchunks_core/asgi.py
+ prevectorchunks_core/models.py
+ prevectorchunks_core/settings.py
+ prevectorchunks_core/test_loader.py
+ prevectorchunks_core/tests.py
+ prevectorchunks_core/wsgi.py
+ prevectorchunks_core.egg-info/PKG-INFO
+ prevectorchunks_core.egg-info/SOURCES.txt
+ prevectorchunks_core.egg-info/dependency_links.txt
+ prevectorchunks_core.egg-info/requires.txt
+ prevectorchunks_core.egg-info/top_level.txt
+ prevectorchunks_core/migrations/__init__.py
+ prevectorchunks_core/services/__init__.py
+ prevectorchunks_core/services/chunk_documents_crud_vdb.py
+ prevectorchunks_core/utils/__init__.py
+ prevectorchunks_core/utils/file_loader.py
+ prevectorchunks_core/utils/llm_wrapper.py
@@ -0,0 +1,15 @@
+ Django==5.1
+ packaging~=24.1
+ requests~=2.32.3
+ openai~=1.37.1
+ httpx~=0.27.0
+ python-dotenv~=1.0.1
+ django-cors-headers~=4.4.0
+ PyJWT~=2.7.0
+ fastapi~=0.112.2
+ datasets~=4.1.0
+ pinecone~=7.3.0
+ pytesseract~=0.3.13
+ python-docx~=1.2.0
+ PyPDF2~=3.0.1
+ pillow~=11.3.0
@@ -0,0 +1 @@
+ prevectorchunks_core
@@ -0,0 +1,37 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "prevectorchunks-core"
+ version = "0.1.1"
+ description = "A Python module that converts a document into chunks for insertion into a Pinecone vector database"
+ readme = "README.md"
+ license = { file = "LICENSE" }
+ authors = [
+     { name = "Zul Al-Kabir", email = "zul.developer.2023@gmail.com" }
+ ]
+
+ dependencies = [
+     "Django==5.1",
+     "packaging~=24.1",
+     "requests~=2.32.3",
+     "openai~=1.37.1",
+     "httpx~=0.27.0",
+     "python-dotenv~=1.0.1",
+     "django-cors-headers~=4.4.0",
+     "PyJWT~=2.7.0",
+     "fastapi~=0.112.2",
+     "datasets~=4.1.0",
+     "pinecone~=7.3.0",
+     "pytesseract~=0.3.13",
+     "python-docx~=1.2.0",
+     "PyPDF2~=3.0.1",
+     "pillow~=11.3.0"
+ ]
+
+ [tool.setuptools.packages.find]
+ include = ["prevectorchunks_core*"]
+
+ [project.urls]
+ Homepage = "https://github.com/yourusername/mydep"
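With this configuration, the sdist and wheel that make up a release like this one can be produced with the standard `build` front end:

```bash
pip install build
python -m build   # writes dist/prevectorchunks_core-0.1.1.tar.gz and a matching wheel
```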
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+