pembot 0.0.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic.
- pembot/.git/COMMIT_EDITMSG +1 -0
- pembot/.git/HEAD +1 -0
- pembot/.git/config +11 -0
- pembot/.git/description +1 -0
- pembot/.git/hooks/applypatch-msg.sample +15 -0
- pembot/.git/hooks/commit-msg.sample +24 -0
- pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/.git/hooks/post-update.sample +8 -0
- pembot/.git/hooks/pre-applypatch.sample +14 -0
- pembot/.git/hooks/pre-commit.sample +49 -0
- pembot/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/.git/hooks/pre-push.sample +53 -0
- pembot/.git/hooks/pre-rebase.sample +169 -0
- pembot/.git/hooks/pre-receive.sample +24 -0
- pembot/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/.git/hooks/push-to-checkout.sample +78 -0
- pembot/.git/hooks/sendemail-validate.sample +77 -0
- pembot/.git/hooks/update.sample +128 -0
- pembot/.git/index +0 -0
- pembot/.git/info/exclude +6 -0
- pembot/.git/logs/HEAD +6 -0
- pembot/.git/logs/refs/heads/main +6 -0
- pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/.git/logs/refs/remotes/origin/main +5 -0
- pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- pembot/.git/packed-refs +2 -0
- pembot/.git/refs/heads/main +1 -0
- pembot/.git/refs/remotes/origin/HEAD +1 -0
- pembot/.git/refs/remotes/origin/main +1 -0
- pembot/.gitignore +7 -0
- pembot/AnyToText/__init__.py +0 -0
- pembot/AnyToText/convertor.py +260 -0
- pembot/LICENSE +674 -0
- pembot/TextEmbedder/__init__.py +0 -0
- pembot/TextEmbedder/gemini_embedder.py +27 -0
- pembot/TextEmbedder/mongodb_embedder.py +258 -0
- pembot/TextEmbedder/mongodb_index_creator.py +133 -0
- pembot/TextEmbedder/vector_query.py +64 -0
- pembot/__init__.py +6 -0
- pembot/config/config.yaml +5 -0
- pembot/gartner.py +140 -0
- pembot/main.py +208 -0
- pembot/output_structure_local.py +63 -0
- pembot/pdf2markdown/.git/HEAD +1 -0
- pembot/pdf2markdown/.git/config +11 -0
- pembot/pdf2markdown/.git/description +1 -0
- pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
- pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
- pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
- pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
- pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
- pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
- pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
- pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
- pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
- pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
- pembot/pdf2markdown/.git/hooks/update.sample +128 -0
- pembot/pdf2markdown/.git/index +0 -0
- pembot/pdf2markdown/.git/info/exclude +6 -0
- pembot/pdf2markdown/.git/logs/HEAD +1 -0
- pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- pembot/pdf2markdown/.git/packed-refs +2 -0
- pembot/pdf2markdown/.git/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/LICENSE +21 -0
- pembot/pdf2markdown/README.md +107 -0
- pembot/pdf2markdown/__init__.py +0 -0
- pembot/pdf2markdown/config/config.yaml +2 -0
- pembot/pdf2markdown/extract.py +888 -0
- pembot/pdf2markdown/requirements.txt +8 -0
- pembot/pem.py +157 -0
- pembot/query.py +204 -0
- pembot/utils/__init__.py +0 -0
- pembot/utils/inference_client.py +132 -0
- pembot/utils/string_tools.py +45 -0
- pembot-0.0.3.dist-info/METADATA +8 -0
- pembot-0.0.3.dist-info/RECORD +129 -0
- pembot-0.0.3.dist-info/WHEEL +5 -0
- pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
pembot/pem.py
ADDED
@@ -0,0 +1,157 @@
from pathlib import Path

from huggingface_hub import InferenceClient
import ollama
from pymongo import MongoClient, ReturnDocument
from pembot.TextEmbedder.mongodb_embedder import process_document_and_embed
from pembot.TextEmbedder.mongodb_index_creator import create_vector_index
import os
from enum import Enum
from datetime import datetime

class Category(Enum):
    WEBDEV = "webdev"
    ANDROID = "android"
    PHYSICS = "physics"
    SPACE = "space"
    PHILOSOPHY = "philosophy"
    CULTURE = "culture"
    HISTORY = "history"
    MATHS = "maths"
    SOCIOLOGY = "sociology"
    CODING = "coding"

def get_title(input_text: str) -> str:
    # 1) split `input_text` on "\n" -> a list of lines
    # 2) take the [0]th element -> the first line
    # 3) strip that element

    s = input_text.split('\n')[0].strip()

    if s.startswith("#"):
        return s[1:].strip()
    return s.strip()


def embed_all_docs(database, llm_client, inference_client, chunk_size, embedding_model, embeddings_collection_name, index_name, docs_collection):
    for doc in docs_collection.find({}, {"_id": 1, "content": 1}):

        print("on doc: ", doc)

        docs_embed = process_document_and_embed(db_client=database, llm_client=llm_client,
                inference_client=inference_client, file_path=Path(), chunk_size=chunk_size,
                embedding_model=embedding_model, embeddings_collection_name=embeddings_collection_name,
                use_custom_id=str(doc["_id"]), use_custom_input=doc["content"])

        doc_embedding_length = len(docs_embed[-1]["embedding"])

        # ensure the vector index exists
        create_vector_index(database[embeddings_collection_name], index_name, num_dimensions=doc_embedding_length)

def update_from_local_fs_using_title(database, llm_client, inference_client, docs_dir, chunk_size, embedding_model, embeddings_collection_name, index_name, docs_collection):

    for docfile in docs_dir.iterdir():
        # for each blog in the blogs dir, upsert it if it's not already in the blogs collection,
        # take that created/updated doc's id,
        # and update the embeddings in the blog_embeddings collection with docId set to that id;
        # the docId of each embedding doc must be the id of the corresponding blog
        if docfile.is_file():
            # TODO: create/update doc and get doc._id, pass that as the optional string param "use_custom_id"

            input_text = ""
            with open(str(docfile), "r") as md_file:
                input_text = md_file.read()

            categories = [[i + 1, x, x.value] for i, x in enumerate(Category)]
            category = "culture"  # default
            while True:
                category_index = input("Enter category no. for {}...\n{}\n: ".format(
                    input_text[:300],
                    '\n'.join(map(lambda x: str(x[0]) + ". " + str(x[2]), categories))
                ))
                if category_index.isdigit():
                    category = categories[int(category_index) - 1][2]
                    break
                else:
                    print("Please enter a number.")

            set_on_insert = {
                "name": os.getenv("PROFILE_NAME"),
                "email": os.getenv("PROFILE_EMAIL"),
                "category": category,
                "dateAdded": datetime.now(),
                "views": 0,
                "likes": 0
            }

            title = get_title(input_text)

            updated = docs_collection.find_one_and_update(
                {'title': title},
                {
                    '$set': {'content': input_text},
                    '$setOnInsert': set_on_insert
                },
                upsert=True,
                return_document=ReturnDocument.AFTER
            )

            docs_embed = process_document_and_embed(db_client=database, llm_client=llm_client,
                    inference_client=inference_client, file_path=docfile, chunk_size=chunk_size,
                    embedding_model=embedding_model, embeddings_collection_name=embeddings_collection_name,
                    use_custom_id=str(updated["_id"]))
            doc_embedding_length = len(docs_embed[-1]["embedding"])

            # ensure the vector index exists
            create_vector_index(database[embeddings_collection_name], index_name, num_dimensions=doc_embedding_length)
            print("in the db now!")

if __name__ == "__main__":

    mongodb_uri = os.environ['MONGODB_PEM']
    mc = MongoClient(mongodb_uri)

    llm_client = ollama.Client()

    #### TO USE JINA INSTEAD OF THE HUGGINGFACE SDK, REPLACE THE InferenceClient IMPORT AT THE TOP
    # from pembot.utils.inference_client import InferenceClient
    # JINA_API_KEY = os.environ['JINA_API_KEY']
    # inference_client = InferenceClient(
    #     provider="Jina AI",
    #     api_key=JINA_API_KEY,
    # )

    inference_client = InferenceClient(
        provider="hf-inference",
        api_key=os.environ["HF_TOKEN"],
    )

    mc.admin.command('ping')
    print("ping test ok")
    database = mc["pem"]
    print("dbs and cols loaded")

    embeddings_collection_name: str = "blog_chunks"
    docs_collection_name: str = "blogs"
    docs_collection = database[docs_collection_name]

    # nerfed, but provided by HF serverless inference: BAAI/bge-small-en-v1.5
    # Worth mentioning:
    # jinaai/jina-embeddings-v3
    # BAAI/bge-base-en-v1.5
    # nomic-ai/nomic-embed-text-v1.5
    # embedding_model: str = 'BAAI/bge-base-en-v1.5'
    embedding_model: str = 'gemini-embedding-exp-03-07'

    index_name: str = "gemini_vectors"

    # the number depends on how many chunks will go into the LLM in the end
    total_input_chars_allowed = 250_000  # we get ~63k tokens => ~250k characters
    no_of_chunks_to_be_sent = 2  # len(required_fields)
    chunk_size = int(total_input_chars_allowed / no_of_chunks_to_be_sent)

    # curdir <- project dir <- dev <- (~) -> Documents -> notes -> Obsidian -> blogs
    docs_dir = Path.cwd().parent.parent.parent / "Documents" / "notes" / "Obsidian" / "blogs"

    embed_all_docs(database, llm_client, inference_client, chunk_size, embedding_model, embeddings_collection_name, index_name, docs_collection)
    # update_from_local_fs_using_title(database, llm_client, inference_client, docs_dir, chunk_size, embedding_model, embeddings_collection_name, index_name, docs_collection)
pembot/query.py
ADDED
@@ -0,0 +1,204 @@
from os import environ
from huggingface_hub.inference._generated.types.chat_completion import ChatCompletionOutputMessage
from huggingface_hub.inference._providers import PROVIDER_T
import ollama
import re

from pydantic_core.core_schema import ErrorType
from pembot.TextEmbedder.mongodb_embedder import search_within_document
import numpy as np
from huggingface_hub import InferenceClient
from google import genai
from google.genai import types
import time

from pembot.TextEmbedder.mongodb_index_creator import create_vector_index

def external_llm(rag_prompt, model_name, llm_provider_name: PROVIDER_T = "novita", inference_client=None) -> str:

    # Here one can change the provider of the inference LLM if the one
    # used for embedding doesn't have our LLM available, or is costly;
    # choose a different one in the function header, or from main()

    if not inference_client:
        inference_client = InferenceClient(
            # "nebius" "novita" "hyperbolic"
            provider=llm_provider_name,
            api_key=environ["HF_TOKEN"]
        )

    completion = inference_client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": rag_prompt}
        ]
    )
    response_message: ChatCompletionOutputMessage = completion.choices[0].message

    if response_message.content:
        return response_message.content
    else:
        return '{}'


def multi_embedding_average(llm_client, inference_client, descriptions, model="BAAI/bge-en-icl", embed_locally=False):

    description_embeddings = []
    for desc in descriptions:
        try:
            if 'gemini' in model:
                client = genai.Client(api_key=environ['GEMINI_API_KEY'])
                result = client.models.embed_content(
                    model=model,
                    contents=desc,
                    config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
                )
                if result is not None and result.embeddings:
                    description_embeddings.append(result.embeddings[0].values)
                else:
                    raise ValueError("Gemini did not return embeddings")
            elif embed_locally:
                response = llm_client.embeddings(model=model, prompt=desc)
                description_embeddings.append(response['embedding'])

            else:
                response = inference_client.feature_extraction(desc, model=model)
                description_embeddings.append(response)

        except Exception as e:
            print(f"Error generating embedding for description '{desc}': {e}")
            # Decide how to handle errors: skip, raise, or use a placeholder
            continue
        time.sleep(1)

    if not description_embeddings:
        print("No embeddings could be generated for the descriptions. Aborting search.")
        return []

    # Aggregate embeddings: a simple approach is to average them.
    # This creates a single query vector that represents the combined meaning.
    return np.mean(description_embeddings, axis=0).tolist()


def rag_query_llm(db_client, llm_client, inference_client, user_query: str, document_id: str, required_fields_descriptions: list[str], model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", ollama_base_url: str = "http://localhost:11434", no_of_fields=4, embedding_model="BAAI/bge-en-icl", llm_provider_name: PROVIDER_T = "novita", index_name: str = "test_search", embeddings_collection="doc_chunks"):
    """
    Performs a RAG (Retrieval Augmented Generation) query using an embedding
    model, MongoDB vector search for retrieval, and a local Ollama model
    (or a remote provider) for generation.

    Args:
        db_client: The vector DB client
        user_query (str): The user's query.
        required_fields_descriptions: The required fields which are to be queried from the context
        model_name (str): The name of the model to use (e.g., "llama2", "mistral").
        no_of_fields (int): number of vectors which are to be retrieved from the DB

    Returns:
        str: The generated response from the model.
    """

    models = llm_client.list()
    found = False
    embed_locally = False

    for model in models.models:
        # print(model.model)
        if model.model == model_name:
            found = True
        if model.model == embedding_model:
            embed_locally = True

    aggregate_query_embedding = multi_embedding_average(llm_client, inference_client, required_fields_descriptions, model=embedding_model, embed_locally=embed_locally)
    print("Aggregate query embedding generated. length: ", len(aggregate_query_embedding))

    create_vector_index(db_client[embeddings_collection], index_name, num_dimensions=len(aggregate_query_embedding))

    # check the order of args
    relevant_chunks = search_within_document(db_client, aggregate_query_embedding, document_id, limit=no_of_fields, index_name=index_name, embeddings_collection_name=embeddings_collection)
    relevant_chunks = list(map(lambda x: x['chunk_text'], relevant_chunks))

    if not relevant_chunks:
        context = "No relevant context available."
    else:
        # print(f"Found {len(relevant_chunks)} relevant chunks.")
        # Concatenate relevant chunks into a single context string
        context = "\n\n".join(relevant_chunks)

    # Construct the RAG prompt
    rag_prompt = f"""
    You are a helpful assistant. Use the following context to answer the question.
    If you don't know the answer, just say that you don't know; don't try to make up an answer.

    Context:
    {context}

    Question: {user_query}
    """

    # print("Step 3: Calling the model with the RAG prompt...")
    # print("final prompt: ")
    # print(rag_prompt)
    if 'gemini' in model_name:

        client = genai.Client(api_key=environ['GEMINI_API_KEY'])
        response = client.models.generate_content(
            model=model_name,
            contents=rag_prompt,
        )
        return response.text

    elif found:
        try:
            # You can use ollama.chat or ollama.generate depending on your model and preference;
            # ollama.chat is generally preferred for conversational models.
            response = llm_client.chat(
                model=model_name,
                messages=[{'role': 'user', 'content': rag_prompt}],
                options={"base_url": ollama_base_url}  # Ensure the base URL is set
            )
            return response['message']['content']
        except ollama.ResponseError as e:
            print(f"Error calling Ollama API: {e}")
            return f"Error: Could not get a response from Ollama. Please check if Ollama is running and the model '{model_name}' is pulled."
        except Exception as e:
            print(f"An unexpected error occurred while calling Ollama: {e}")
            return "An unexpected error occurred."
    elif 'qwen' in model_name or 'gemma' in model_name or 'Qwen' in model_name or 'deepseek' in model_name:
        return external_llm(rag_prompt, model_name=model_name, llm_provider_name=llm_provider_name)
    else:
        return '{}'


def remove_bs(text):
    """
    Removes everything between <think></think> tags and any text outside of the JSON curly brackets.

    Args:
        text (str): The input string.

    Returns:
        str: The string with text between <think></think> tags removed and only the
             content within the outermost JSON curly brackets.
             Returns an empty string if no valid JSON is found.
    """
    # 1. Remove <think></think> tags
    think_pattern = r'<think>.*?</think>'
    text_without_think = re.sub(think_pattern, '', text, flags=re.DOTALL)

    # 2. Extract JSON content
    # This regex looks for the first opening curly brace and the last closing curly brace.
    # It assumes the JSON structure is well-formed within the string.
    json_match = re.search(r'\{(.*)\}', text_without_think, re.DOTALL)

    if json_match:
        json_content_str = "{" + json_match.group(1) + "}"
        return json_content_str
    else:
        return ""


if __name__ == "__main__":
    print("hello world")
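For reference, a minimal sketch (not shipped with the package, assuming pembot is importable) of how remove_bs behaves on typical reasoning-model output: the <think> block is stripped first, then everything outside the outermost curly braces is discarded:

    from pembot.query import remove_bs

    raw = '<think>some chain of thought</think> Sure! {"title": "x", "views": 3} hope this helps'
    print(remove_bs(raw))             # -> '{"title": "x", "views": 3}'
    print(remove_bs("no json here"))  # -> '' (empty string when no braces are found)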
pembot/utils/__init__.py
ADDED
File without changes

pembot/utils/inference_client.py
ADDED
@@ -0,0 +1,132 @@
import os
import requests
import json
from enum import Enum

# Define an Enum for providers if you plan to support more in the future
class PROVIDER_T(Enum):
    JINA = "Jina AI"
    # OLLAMA = "Ollama"  # Example for future expansion

class InferenceClient:
    """
    A client for making inference calls to various AI model providers.
    Currently supports Jina AI for feature extraction (embeddings).
    """

    def __init__(self, provider: PROVIDER_T, api_key: str):
        """
        Initializes the InferenceClient.

        Args:
            provider: The AI model provider (e.g., PROVIDER_T.JINA).
            api_key: The API key for the specified provider.
        """
        self.provider = provider
        self.api_key = api_key
        self.base_url = self._get_base_url(provider)

    def _get_base_url(self, provider: PROVIDER_T) -> str:
        """
        Returns the base URL for the given provider's API.
        """
        if provider == PROVIDER_T.JINA:
            return "https://api.jina.ai/v1/embeddings"
        # Add more providers here as needed
        # elif provider == PROVIDER_T.OLLAMA:
        #     return "http://localhost:11434/api/embeddings"
        else:
            raise ValueError(f"Unsupported provider: {provider}")

    def feature_extraction(self, prompt_texts: list[str], model: str = "jina-embeddings-v3") -> list[list[float]]:
        """
        Calls the Jina AI API to get embeddings (feature extraction) for input texts.

        Args:
            prompt_texts: A list of strings for which to generate embeddings.
            model: The name of the Jina AI embedding model to use.
                Defaults to "jina-embeddings-v3".

        Returns:
            A list of embedding vectors (lists of floats).

        Raises:
            Exception: If the API call fails or returns an error.
        """
        if self.provider != PROVIDER_T.JINA:
            raise NotImplementedError(f"Feature extraction not implemented for provider: {self.provider}")

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        # Jina API specific payload
        data = {
            "model": model,
            "input": prompt_texts,
            "task": "qa"  # Recommended task for general-purpose embeddings; can be "text-matching", "retrieval", etc.
                          # "qa" is often good for general question-answering/retrieval contexts.
        }

        try:
            response = requests.post(self.base_url, headers=headers, data=json.dumps(data))
            response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

            response_json = response.json()
            if "data" in response_json and isinstance(response_json["data"], list):
                # Extract just the embedding vectors
                embeddings = [item["embedding"] for item in response_json["data"]]
                return embeddings
            else:
                raise Exception(f"Unexpected API response format: {response_json}")

        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err} - {response.text}")
            raise
        except requests.exceptions.ConnectionError as conn_err:
            print(f"Connection error occurred: {conn_err}")
            raise
        except requests.exceptions.Timeout as timeout_err:
            print(f"Timeout error occurred: {timeout_err}")
            raise
        except requests.exceptions.RequestException as req_err:
            print(f"An unexpected request error occurred: {req_err}")
            raise
        except Exception as e:
            print(f"An error occurred during Jina AI API call: {e}")
            raise

# --- Example Usage ---
if __name__ == "__main__":
    try:
        JINA_API_KEY = os.environ['JINA_API_KEY']
    except KeyError:
        print("Error: JINA_API_KEY environment variable not set.")
        print("Please set it (e.g., export JINA_API_KEY='your_api_key') and try again.")
        exit(1)

    # Initialize the InferenceClient for Jina AI
    jina_client = InferenceClient(provider=PROVIDER_T.JINA, api_key=JINA_API_KEY)

    # Example input texts
    input_texts = [
        "Organic skincare for sensitive skin with aloe vera and chamomile.",
        "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille.",
        "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla.",
        "新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています。"
    ]

    print("--- Calling Jina AI for Feature Extraction ---")
    try:
        embeddings = jina_client.feature_extraction(prompt_texts=input_texts)
        print(f"Successfully extracted {len(embeddings)} embeddings.")
        print(f"First embedding (first 5 values): {embeddings[0][:5]}...")
        print(f"Embedding dimension: {len(embeddings[0])}")

        # Example with a different model (if available and you want to specify)
        # embeddings_v2 = jina_client.feature_extraction(prompt_texts=input_texts, model="jina-embeddings-v2")
        # print(f"\nSuccessfully extracted {len(embeddings_v2)} embeddings with jina-embeddings-v2.")

    except Exception as e:
        print(f"Failed to extract features: {e}")
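One caveat when swapping this client in for huggingface_hub's InferenceClient (as the commented-out block in pem.py suggests): huggingface_hub's feature_extraction takes a single string, while this one takes a list of strings and returns one vector per input, so the single-string call site in query.py would need a shim. A hypothetical adapter sketch, assuming pembot is importable:

    from pembot.utils.inference_client import InferenceClient, PROVIDER_T

    class JinaSingleTextAdapter:
        """Hypothetical wrapper matching the single-string call sites in query.py."""
        def __init__(self, client: InferenceClient):
            self.client = client

        def feature_extraction(self, text: str, model: str = "jina-embeddings-v3"):
            # Wrap the single string in a list, then unwrap the single result vector
            return self.client.feature_extraction([text], model=model)[0]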
pembot/utils/string_tools.py
ADDED
@@ -0,0 +1,45 @@
import re
import random

def make_it_an_id(file_name):
    """
    Input: A file name mixed with spaces, periods, etc.
    Output: '_'-separated lowercase alphabetic id, with random filler if less than 5 chars
    """
    # 1. Convert to lowercase
    file_name = file_name.lower()

    # 2. Replace non-alphanumeric characters (except periods and spaces) with underscores;
    # keep periods for now to split by later, and spaces for the initial conversion
    cleaned_name = re.sub(r'[^a-z0-9\s\.]', '_', file_name)

    # 3. Replace spaces and periods with underscores
    cleaned_name = re.sub(r'[\s\.]+', '_', cleaned_name)

    # 4. Remove leading/trailing underscores and multiple consecutive underscores
    cleaned_name = re.sub(r'_{2,}', '_', cleaned_name).strip('_')

    # Keep only the alphabetic parts after the cleaning above; if the file_name
    # was something like "123.pdf", this drops the purely numeric pieces.
    alphabetic_parts = re.findall(r'[a-z]+', cleaned_name)

    # Join the alphabetic parts with underscores
    result_id = '_'.join(alphabetic_parts)

    # 5. Add random filler if less than 5 chars
    if len(result_id) < 5:
        # Generate random lowercase alphabetic characters
        filler_length = 5 - len(result_id)
        random_filler = ''.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(filler_length))

        # Append the filler with an underscore if result_id is not empty
        if result_id:
            result_id += '_' + random_filler
        else:  # If result_id is empty (e.g., from "123.txt" or "$.*"), just use the filler
            result_id = random_filler

    return result_id
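A short, hypothetical usage sketch for make_it_an_id (assuming pembot is importable); note that ids shorter than five characters get random filler, so that case is not deterministic:

    from pembot.utils.string_tools import make_it_an_id

    print(make_it_an_id("My Blog Post.md"))      # -> "my_blog_post_md"
    print(make_it_an_id("RAG & LLMs 2024.pdf"))  # -> "rag_llms_pdf" (digits are dropped)
    print(make_it_an_id("123.txt"))              # -> "txt_" plus two random letters, e.g. "txt_qk"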
pembot-0.0.3.dist-info/METADATA
ADDED
@@ -0,0 +1,8 @@
Metadata-Version: 2.4
Name: pembot
Version: 0.0.3
Summary: A Python package to convert PEM blog content to useful information by leveraging LLMs
Author-email: cyto <aryan_sidhwani@protonmail.com>
License-Expression: MIT
License-File: LICENSE
Project-URL: Home, https://github.com/silverstone-git/pem-rag-chatbot