pembot 0.0.7-py2.py3-none-any.whl → 0.0.8-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pembot might be problematic.
- pembot/TextEmbedder/mongodb_embedder.py +50 -19
- pembot/TextEmbedder/mongodb_index_creator.py +29 -24
- pembot/__init__.py +1 -1
- pembot/config/config.yaml +1 -1
- pembot/query.py +5 -4
- {pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/METADATA +1 -1
- {pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/RECORD +9 -9
- {pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/WHEEL +0 -0
- {pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/licenses/LICENSE +0 -0
pembot/TextEmbedder/mongodb_embedder.py CHANGED

@@ -29,6 +29,7 @@ def search_within_document(
         limit: int = 5,
         index_name: str = "test_search",
         embeddings_collection_name: str= "doc_chunks",
+        document_belongs_to_a_type = "",
 ):
     """
     Performs a vector similarity search within the chunks of a specific document
@@ -42,6 +43,7 @@ def search_within_document(
         index_name: The name of your MongoDB Atlas Vector Search index.
                     You MUST have a vector search index created on the 'embedding' field
                     of the 'embeddings_collection' collection for this to work efficiently.
+        document_belongs_to_a_type: When search spaces intersect for different docIds, such that docId is an array field,

     Returns:
         A list of dictionaries, where each dictionary represents a matching chunk
@@ -50,10 +52,23 @@ def search_within_document(
     embeddings_collection = db_client[embeddings_collection_name]

     print(f"Searching within document (docId: {document_name_id})...")
+    # print(f" filter (slug: {document_belongs_to_a_type})...")

     # MongoDB Atlas Vector Search aggregation pipeline
     # The 'path' should point to the field containing the embeddings.
     # The 'filter' stage is crucial for searching within a specific document.
+    #
+    project_dict= {
+        '_id': 0,
+        'docId': 1,
+        'chunk_number': 1,
+        'chunk_text': 1,
+        'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
+    }
+
+    if document_belongs_to_a_type:
+        project_dict['type']= 1
+
     pipeline = [
         {
             '$vectorSearch': {
@@ -66,25 +81,21 @@ def search_within_document(
                 'index': index_name,

                 #filter to search only within the specified document
-                'filter':
-                    { 'docId': document_name_id }
-
+                'filter':
+                    { "type": {"$in": [document_belongs_to_a_type ]} } if document_belongs_to_a_type else
+                    { 'docId': document_name_id }
             }
         },

         # to exclude the MongoDB internal _id
         {
-            '$project': {
-                '_id': 0,
-                'docId': 1,
-                'chunk_number': 1,
-                'chunk_text': 1,
-                'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
-            }
+            '$project': project_dict
         }
     ]

+    # print("searching now:")
     results = list(embeddings_collection.aggregate(pipeline))
+    # print("search results: ", results)

     if not results:
         print(f"No relevant chunks found for document '{document_name_id}' with the given query.")
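Note on this hunk: the rewritten pre-filter is an either/or swap, not a conjunction. When document_belongs_to_a_type is non-empty, the docId restriction is dropped entirely and the search runs across every chunk whose type array contains the given value. A minimal sketch of how the branch resolves (the values below are illustrative, not taken from the package):

# Illustrative values; not from the package.
document_name_id = "doc-123"
document_belongs_to_a_type = "invoice"

# Mirrors the conditional expression added in 0.0.8.
filter_stage = (
    {"type": {"$in": [document_belongs_to_a_type]}}
    if document_belongs_to_a_type
    else {"docId": document_name_id}
)

assert filter_stage == {"type": {"$in": ["invoice"]}}  # docId is not consulted

Both paths must be declared as filter fields in the Atlas index definition for $vectorSearch to accept them; the mongodb_index_creator.py change below takes care of that.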
@@ -100,15 +111,18 @@ def search_within_document(



-def process_document_and_embed(db_client,
+def process_document_and_embed(
+        db_client,
         llm_client,
         inference_client,
         file_path: Path,
         chunk_size: int,
-        embedding_model: str = '
+        embedding_model: str = 'BAAI/bge-en-icl',
         embeddings_collection_name= "doc_chunks",
         use_custom_id: str | None = None,
-        use_custom_input: str | None = None
+        use_custom_input: str | None = None,
+        document_belongs_to_a_type= "",
+        type_info= []
 ) -> list[dict]:
     """
     Processes an input document by chunking its text, generating embeddings using
@@ -228,13 +242,30 @@ def process_document_and_embed(db_client,
             'chunk_text': chunk,
             'embedding': embedding,
             'chunk_id_global': chunk_id_global,
-            'chunk_id_doc_specific': chunk_id_doc_specific
+            'chunk_id_doc_specific': chunk_id_doc_specific,
         }
-        embeddings_collection.update_one(
-            {'docId': document_name_id, 'chunk_number': i + 1},
-            {'$set': doc_set},
-            upsert=True
-        )
+
+
+        # TBD: this is NOT pushing array, this is creating a "$push" field with type: "" object
+
+        if len(type_info) > 0:
+            embeddings_collection.update_one(
+                {'docId': document_name_id, 'chunk_number': i + 1},
+                {
+                    '$set': doc_set,
+                    '$push': {
+                        "type": type_info
+                    }
+                },
+                upsert=True
+            )
+        else:
+
+            embeddings_collection.update_one(
+                {'docId': document_name_id, 'chunk_number': i + 1},
+                {'$set': doc_set},
+                upsert=True
+            )
         print(f"Successfully stored chunk {i+1} for '{file_path.name}' in MongoDB.")
         res.append({**doc_set, "docId": document_name_id, "chunk_number": i + 1})

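A note on the $push branch above, matching its own "TBD" comment: in MongoDB, $push with a list value appends the whole list as a single array element, so {'$push': {'type': type_info}} nests the array. Appending the entries individually needs the $each modifier. A small sketch (values illustrative):

# Illustrative input; not from the package.
type_info = ["invoice", "contract"]

# As released in 0.0.8: the whole list becomes ONE array element,
# e.g. {"type": [["invoice", "contract"]]} after the first upsert.
update_as_released = {"$push": {"type": type_info}}

# The usual idiom for appending each entry separately,
# yielding {"type": ["invoice", "contract"]}.
update_with_each = {"$push": {"type": {"$each": type_info}}}

Separately, the new type_info= [] default is Python's mutable-default-argument pitfall; type_info=None with a fallback inside the function is the safer spelling.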
pembot/TextEmbedder/mongodb_index_creator.py CHANGED

@@ -4,7 +4,7 @@ from pymongo.operations import SearchIndexModel
 import time
 import os

-def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768):
+def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768, document_belongs_to_a_type= ""):
     """
     Creates a MongoDB Atlas Vector Search index if it does not already exist.

@@ -13,14 +13,14 @@ def create_vector_index(collection: Collection, index_name: str, num_dimensions:
         index_name: The desired name for the vector search index.
         num_dimensions: The number of dimensions for the embedding vectors.
     """
-
+
     # 1. Check if the index already exists
     existing_indexes = list(collection.list_search_indexes())
-
+
     for index in existing_indexes:
         if index.get('name') == index_name:
             print(f"Search index '{index_name}' already exists. Skipping creation.")
-
+
             # Optional: You can also check if the existing index is "READY"
             if index.get('status') == 'READY':
                 print(f"Index '{index_name}' is already ready for querying.")
@@ -33,20 +33,27 @@ def create_vector_index(collection: Collection, index_name: str, num_dimensions:
     # 2. If the index does not exist, proceed to create it
     print(f"Search index '{index_name}' does not exist. Creating it now...")

+    fields_arr= [
+        {
+            "type": "vector",
+            "path": "embedding",
+            "similarity": "dotProduct", # Or "cosine", "euclidean"
+            "numDimensions": num_dimensions,
+            "quantization": "scalar" # Or "none"
+        },
+        {
+            "type": "filter",
+            "path": "docId"
+        }
+    ]
+
+    if document_belongs_to_a_type:
+        fields_arr.append({
+            "type": "filter",
+            "path": "type"
+        })
     search_index_model = SearchIndexModel(definition={
-        "fields": [
-            {
-                "type": "vector",
-                "path": "embedding",
-                "similarity": "dotProduct", # Or "cosine", "euclidean"
-                "numDimensions": num_dimensions,
-                "quantization": "scalar" # Or "none"
-            },
-            {
-                "type": "filter",
-                "path": "docId"
-            }
-        ]
+        "fields": fields_arr
     },
     name=index_name,
     type="vectorSearch"
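For context on fields_arr: Atlas Vector Search only accepts a $vectorSearch filter on paths that the index declares with "type": "filter", which is why the "type" path has to be registered here before search_within_document can filter on it. A sketch of the definition the new code builds when a type is requested (768 dimensions is illustrative):

# Built by create_vector_index(collection, "test_search",
#                              num_dimensions=768,
#                              document_belongs_to_a_type="invoice")
definition = {
    "fields": [
        {
            "type": "vector",
            "path": "embedding",
            "similarity": "dotProduct",
            "numDimensions": 768,
            "quantization": "scalar",
        },
        {"type": "filter", "path": "docId"},
        {"type": "filter", "path": "type"},  # appended only when the flag is truthy
    ]
}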
@@ -70,7 +77,7 @@ def _wait_for_index_ready(collection: Collection, index_name: str):
     Helper function to poll the index status until it's ready.
     """
     print("Polling to check if the index is ready. This may take some time (up to a few minutes for large indexes).")
-
+
     start_time = time.time()
     timeout = 300 # 5 minutes timeout, adjust as needed

@@ -89,7 +96,7 @@ def _wait_for_index_ready(collection: Collection, index_name: str):
             print(f"Index '{index_name}' status: {current_status}. Waiting...")
         except Exception as e:
             print(f"Error while polling index status: {e}. Retrying...")
-
+
         if time.time() - start_time > timeout:
             status= indices[0].get('status') if indices else 'N/A'
             print(f"Timeout: Index '{index_name}' did not become ready within {timeout} seconds. Current status: {status}")
@@ -99,9 +106,9 @@ def _wait_for_index_ready(collection: Collection, index_name: str):

 # --- Example Usage ---
 if __name__ == "__main__":
-
+
     # Replace with your database and collection names
-    DATABASE_NAME = "pembot"
+    DATABASE_NAME = "pembot"
     COLLECTION_NAME = "doc_chunks"
     VECTOR_INDEX_NAME = "test_search"

@@ -119,7 +126,7 @@ if __name__ == "__main__":

     # Call the function to create the index, with existence check
     create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
-
+
     # Test calling it again to see the "already exists" message
     create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)

@@ -129,5 +136,3 @@ if __name__ == "__main__":
     if 'mongo_client' in locals() and mongo_client:
         mongo_client.close()
         print("MongoDB connection closed.")
-
-
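The example block above still calls create_vector_index without the new keyword, so the "type" filter path is never exercised by the demo. A minimal sketch of a call that would include it; the connection URI is a placeholder, while the database, collection, and index names are the demo's own:

from pymongo import MongoClient

# Placeholder URI; replace with a real Atlas connection string.
mongo_client = MongoClient("mongodb+srv://user:pass@cluster.example.mongodb.net")
collection = mongo_client["pembot"]["doc_chunks"]

create_vector_index(
    collection,
    "test_search",
    num_dimensions=768,                    # must match the embedding model's output size
    document_belongs_to_a_type="invoice",  # hypothetical type slug; adds the "type" filter field
)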
pembot/__init__.py CHANGED

pembot/config/config.yaml CHANGED

pembot/query.py CHANGED
@@ -68,7 +68,8 @@ def multi_embedding_average(llm_client, inference_client, descriptions, model= "
         except Exception as e:
             print(f"Error generating embedding for description '{desc}': {e}")
             # Decide how to handle errors: skip, raise, or use a placeholder
-            continue
+            # continue
+            raise e
         time.sleep(1)

     if not description_embeddings:
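Behavioral note on this hunk: 0.0.7 skipped a description whose embedding failed and averaged whatever remained; 0.0.8 re-raises and aborts the whole batch. Inside an except block, a bare raise re-raises the active exception with its traceback intact, so it is the more idiomatic spelling of the same fail-fast behavior. A sketch of the pattern with a hypothetical embed_one callable:

def embed_all(descriptions, embed_one):
    # embed_one is a hypothetical callable that may raise.
    embeddings = []
    for desc in descriptions:
        try:
            embeddings.append(embed_one(desc))
        except Exception as e:
            print(f"Error generating embedding for description '{desc}': {e}")
            raise  # fail fast, as 0.0.8 now does
    return embeddings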
@@ -81,7 +82,7 @@ def multi_embedding_average(llm_client, inference_client, descriptions, model= "



-def rag_query_llm(db_client, llm_client, inference_client, user_query: str, document_id: str, required_fields_descriptions: list[str], model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", ollama_base_url: str = "http://localhost:11434", no_of_fields= 4, embedding_model= "BAAI/bge-en-icl", llm_provider_name: PROVIDER_T= "novita", index_name: str= "test_search", embeddings_collection= "doc_chunks"):
+def rag_query_llm(db_client, llm_client, inference_client, user_query: str, document_id: str, required_fields_descriptions: list[str], model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", ollama_base_url: str = "http://localhost:11434", no_of_fields= 4, embedding_model= "BAAI/bge-en-icl", llm_provider_name: PROVIDER_T= "novita", index_name: str= "test_search", embeddings_collection= "doc_chunks", document_belongs_to_a_type= ""):
     """
     Performs a RAG (Retrieval Augmented Generation) query using a Hugging Face
     embedding model, ChromaDB for retrieval, and a local Ollama model for generation.
@@ -119,10 +120,10 @@ def rag_query_llm(db_client, llm_client, inference_client, user_query: str, docu
     aggregate_query_embedding= multi_embedding_average(llm_client, inference_client, required_fields_descriptions, model= embedding_model, embed_locally= embed_locally)
     print("Aggregate query embedding generated. length: ", len(aggregate_query_embedding))

-    create_vector_index(db_client[embeddings_collection], index_name, num_dimensions= len(aggregate_query_embedding))
+    create_vector_index(db_client[embeddings_collection], index_name, num_dimensions= len(aggregate_query_embedding), document_belongs_to_a_type= document_belongs_to_a_type)

     # check the order of args
-    relevant_chunks= search_within_document(db_client, aggregate_query_embedding, document_id, limit= no_of_fields, index_name= index_name, embeddings_collection_name= embeddings_collection)
+    relevant_chunks= search_within_document(db_client, aggregate_query_embedding, document_id, limit= no_of_fields, index_name= index_name, embeddings_collection_name= embeddings_collection, document_belongs_to_a_type= document_belongs_to_a_type)
     relevant_chunks= list(map(lambda x: x['chunk_text'], relevant_chunks))

     if not relevant_chunks:
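Taken together, the new keyword threads from rag_query_llm down through index creation (create_vector_index) and retrieval (search_within_document). A minimal end-to-end sketch under assumed inputs; the URI, clients, query, and field descriptions are all placeholders, not values from the package:

from pymongo import MongoClient

# Placeholder URI. db_client must be a Database, since rag_query_llm
# indexes it with the collection name.
db_client = MongoClient("mongodb+srv://user:pass@cluster.example.mongodb.net")["pembot"]

llm_client = None        # placeholder: a real LLM client is required
inference_client = None  # placeholder: a real inference client is required

answer = rag_query_llm(
    db_client,
    llm_client,
    inference_client,
    user_query="What is the total invoice amount?",
    document_id="doc-123",
    required_fields_descriptions=["total amount", "currency"],
    document_belongs_to_a_type="invoice",  # searches across all docs of this type
)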
{pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
 pembot/.gitignore,sha256=_7FTsZokJ_pzEyyPjOsGw5x5Xx3gUBFaafs7UlPsv9E,98
 pembot/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-pembot/__init__.py,sha256=
+pembot/__init__.py,sha256=XAG3pbUQGlrUZZgAKAxq96MjvCGaEPkpGTSKIwzNvGY,211
 pembot/gartner.py,sha256=3ALknQ5mSXIimmwCa3JFDzB_EW2hHEcQO1T2odyBquk,5408
 pembot/main.py,sha256=lZLIV8XPonvNoY4LVS-5fct1y9URMXWoSGJUKMw3Yg8,9667
 pembot/output_structure_local.py,sha256=YfpHzfTNeLMSsB_CjAamha9D6Iz7E1IC-tW9xPCMWFc,3000
 pembot/pem.py,sha256=mv6iGcN1peSY7z2dtCQ_BKj31EFBNfczBhps_d-0XDo,6377
-pembot/query.py,sha256=
+pembot/query.py,sha256=d6K2PyDDGoIOqwn7A_KIBr83w0zjMAHjhmx1S9VlVgg,8642
 pembot/requirements.txt,sha256=6OV_n5JVco2lLA8Wq38tJX1bYgo_UU0R9RKgs4d2wfc,1360
 pembot/.git/COMMIT_EDITMSG,sha256=H9feTx6U3VWbFycy9cq077mD4oxuv2gz4G3EUOdQmV4,30
 pembot/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
@@ -108,10 +108,10 @@ pembot/AnyToText/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 pembot/AnyToText/convertor.py,sha256=gqvhwFssUsAeirfO4n0Ztwga1hn8zHbdG96sMTjYrpE,17188
 pembot/TextEmbedder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pembot/TextEmbedder/gemini_embedder.py,sha256=P679-2mmQESlYKML1vcrwx_-CSgWJgIQk7NL4F7BLQE,677
-pembot/TextEmbedder/mongodb_embedder.py,sha256=
-pembot/TextEmbedder/mongodb_index_creator.py,sha256=
+pembot/TextEmbedder/mongodb_embedder.py,sha256=RotNlerS3WKEUGRNeQM5MTkl5BtaWNHVaXO1gN5NicI,10682
+pembot/TextEmbedder/mongodb_index_creator.py,sha256=kopqdVYJii_wExVrXGZjMfqWZ2dD42b3PeNWo71weHI,5354
 pembot/TextEmbedder/vector_query.py,sha256=Kh1uhx9CatB-oQlQtnW-1I2Qz7MGHI20n2h_8peAChM,1986
-pembot/config/config.yaml,sha256=
+pembot/config/config.yaml,sha256=y-2BklPelldaXJ_hxFD9k-bFpDA6OAZkaoh5XlvASCE,156
 pembot/pdf2markdown/LICENSE,sha256=1JTJhQjUYDqJzFJhNtitm7mHyE71PRHgetIqRRWg6Pk,1068
 pembot/pdf2markdown/README.md,sha256=jitM1pwI69oa0N4mXv5-SY1ka9Sz3jsRNCDdpW-50kY,4545
 pembot/pdf2markdown/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -167,7 +167,7 @@ pembot/pdf2markdown/config/config.yaml,sha256=w75W2Eg4-tu8rRk_23PqxWDh0010kRKLmP
 pembot/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pembot/utils/inference_client.py,sha256=jeURmY2P5heVlH1dCV0XSgiX3U2qYGEmrnUv0KFpdww,5380
 pembot/utils/string_tools.py,sha256=gtRa5rBR0Q7GspTu2WtCnvhJQLFjPfWLvhmyiPkyStU,1883
-pembot-0.0.
-pembot-0.0.
-pembot-0.0.
-pembot-0.0.
+pembot-0.0.8.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+pembot-0.0.8.dist-info/WHEEL,sha256=Dyt6SBfaasWElUrURkknVFAZDHSTwxg3PaTza7RSbkY,100
+pembot-0.0.8.dist-info/METADATA,sha256=kfa20bL5qROy6a8bsALEzDRlmF-JnTgmR7Qc8rz6PNQ,313
+pembot-0.0.8.dist-info/RECORD,,

{pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/WHEEL
File without changes

{pembot-0.0.7.dist-info → pembot-0.0.8.dist-info}/licenses/LICENSE
File without changes