pembot-0.0.3-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pembot might be problematic.

Files changed (129)
  1. pembot/.git/COMMIT_EDITMSG +1 -0
  2. pembot/.git/HEAD +1 -0
  3. pembot/.git/config +11 -0
  4. pembot/.git/description +1 -0
  5. pembot/.git/hooks/applypatch-msg.sample +15 -0
  6. pembot/.git/hooks/commit-msg.sample +24 -0
  7. pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
  8. pembot/.git/hooks/post-update.sample +8 -0
  9. pembot/.git/hooks/pre-applypatch.sample +14 -0
  10. pembot/.git/hooks/pre-commit.sample +49 -0
  11. pembot/.git/hooks/pre-merge-commit.sample +13 -0
  12. pembot/.git/hooks/pre-push.sample +53 -0
  13. pembot/.git/hooks/pre-rebase.sample +169 -0
  14. pembot/.git/hooks/pre-receive.sample +24 -0
  15. pembot/.git/hooks/prepare-commit-msg.sample +42 -0
  16. pembot/.git/hooks/push-to-checkout.sample +78 -0
  17. pembot/.git/hooks/sendemail-validate.sample +77 -0
  18. pembot/.git/hooks/update.sample +128 -0
  19. pembot/.git/index +0 -0
  20. pembot/.git/info/exclude +6 -0
  21. pembot/.git/logs/HEAD +6 -0
  22. pembot/.git/logs/refs/heads/main +6 -0
  23. pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. pembot/.git/logs/refs/remotes/origin/main +5 -0
  25. pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
  26. pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
  27. pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
  28. pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
  29. pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
  30. pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
  31. pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
  32. pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
  33. pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
  34. pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
  35. pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
  36. pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
  37. pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
  38. pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
  39. pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
  40. pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
  41. pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
  42. pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
  43. pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
  44. pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
  45. pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
  46. pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
  47. pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
  48. pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
  49. pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
  50. pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
  51. pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
  52. pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
  53. pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
  54. pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
  55. pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
  56. pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
  57. pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
  58. pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
  59. pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
  60. pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
  61. pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
  62. pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
  63. pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
  64. pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
  65. pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
  66. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
  67. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
  68. pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
  69. pembot/.git/packed-refs +2 -0
  70. pembot/.git/refs/heads/main +1 -0
  71. pembot/.git/refs/remotes/origin/HEAD +1 -0
  72. pembot/.git/refs/remotes/origin/main +1 -0
  73. pembot/.gitignore +7 -0
  74. pembot/AnyToText/__init__.py +0 -0
  75. pembot/AnyToText/convertor.py +260 -0
  76. pembot/LICENSE +674 -0
  77. pembot/TextEmbedder/__init__.py +0 -0
  78. pembot/TextEmbedder/gemini_embedder.py +27 -0
  79. pembot/TextEmbedder/mongodb_embedder.py +258 -0
  80. pembot/TextEmbedder/mongodb_index_creator.py +133 -0
  81. pembot/TextEmbedder/vector_query.py +64 -0
  82. pembot/__init__.py +6 -0
  83. pembot/config/config.yaml +5 -0
  84. pembot/gartner.py +140 -0
  85. pembot/main.py +208 -0
  86. pembot/output_structure_local.py +63 -0
  87. pembot/pdf2markdown/.git/HEAD +1 -0
  88. pembot/pdf2markdown/.git/config +11 -0
  89. pembot/pdf2markdown/.git/description +1 -0
  90. pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
  91. pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
  92. pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
  93. pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
  94. pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
  95. pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
  96. pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
  97. pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
  98. pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
  99. pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
  100. pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
  101. pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
  102. pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
  103. pembot/pdf2markdown/.git/hooks/update.sample +128 -0
  104. pembot/pdf2markdown/.git/index +0 -0
  105. pembot/pdf2markdown/.git/info/exclude +6 -0
  106. pembot/pdf2markdown/.git/logs/HEAD +1 -0
  107. pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
  108. pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
  109. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
  110. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
  111. pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
  112. pembot/pdf2markdown/.git/packed-refs +2 -0
  113. pembot/pdf2markdown/.git/refs/heads/main +1 -0
  114. pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
  115. pembot/pdf2markdown/LICENSE +21 -0
  116. pembot/pdf2markdown/README.md +107 -0
  117. pembot/pdf2markdown/__init__.py +0 -0
  118. pembot/pdf2markdown/config/config.yaml +2 -0
  119. pembot/pdf2markdown/extract.py +888 -0
  120. pembot/pdf2markdown/requirements.txt +8 -0
  121. pembot/pem.py +157 -0
  122. pembot/query.py +204 -0
  123. pembot/utils/__init__.py +0 -0
  124. pembot/utils/inference_client.py +132 -0
  125. pembot/utils/string_tools.py +45 -0
  126. pembot-0.0.3.dist-info/METADATA +8 -0
  127. pembot-0.0.3.dist-info/RECORD +129 -0
  128. pembot-0.0.3.dist-info/WHEEL +5 -0
  129. pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
pembot/TextEmbedder/gemini_embedder.py ADDED
@@ -0,0 +1,27 @@
+ from google.genai import types
+ import requests
+ import json
+ import os
+ from google import genai
+
+
+ def get_embedding(client, text):
+
+     result = client.models.embed_content(
+         model="gemini-embedding-exp-03-07",
+         contents= text,
+         config= types.EmbedContentConfig(task_type= "RETRIEVAL_DOCUMENT"),
+     )
+
+     if result:
+         return result.embeddings[0].values
+     else:
+         print(f"Error: {result}")
+         #raise ValueError("Error communicating with the gemini embedding API")
+
+
+ if __name__ == "__main__":
+
+     api_key = os.environ['GEMINI_API_KEY']
+     genai_client= genai.Client(api_key= api_key)
+     print(get_embedding(genai_client, "send bobs"))
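For reference, a minimal usage sketch of the helper above (assumptions: GEMINI_API_KEY is set, the google-genai client is installed, and the import path mirrors the packaged file layout). Note that get_embedding returns None rather than raising when the API response is falsy, so callers need to guard the result:

import os
from google import genai
from pembot.TextEmbedder.gemini_embedder import get_embedding  # path per the file list above

client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
vector = get_embedding(client, "example query text")
if vector is not None:
    # the vector length is what the Atlas index's numDimensions must match
    print(f"embedding has {len(vector)} dimensions")
else:
    print("embedding request failed")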
pembot/TextEmbedder/mongodb_embedder.py ADDED
@@ -0,0 +1,258 @@
+ from pathlib import Path
+ import uuid
+ from google import genai
+ from google.genai import types
+ from pymongo import MongoClient
+ import os
+ import re
+ import time
+ import numpy as np
+
+ from pembot.AnyToText.convertor import chunk_text
+ from pembot.utils.string_tools import make_it_an_id
+
+
+
+ def clean_text(text):
+     # Use regex to find words with at least one alphanumeric character
+     cleaned_words = [word for word in text.split() if re.search(r'\w', word)]
+     # Join the cleaned words back into a single string
+     cleaned_text = ' '.join(cleaned_words)
+     return cleaned_text
+
+
+
+ def search_within_document(
+         db_client,
+         aggregate_query_embedding,
+         document_name_id: str,
+         limit: int = 5,
+         index_name: str = "test_search",
+         embeddings_collection_name: str= "doc_chunks",
+ ):
+     """
+     Performs a vector similarity search within the chunks of a specific document
+     in the 'embeddings_collection' MongoDB collection.
+
+     Args:
+         db_client: An initialized PyMongo Database instance.
+         aggregate_query_embedding: The np.mean of queries vectors of your search query.
+         document_name_id: This will be used to filter by the 'docId'.
+         limit: The maximum number of similar chunks to return.
+         index_name: The name of your MongoDB Atlas Vector Search index.
+             You MUST have a vector search index created on the 'embedding' field
+             of the 'embeddings_collection' collection for this to work efficiently.
+
+     Returns:
+         A list of dictionaries, where each dictionary represents a matching chunk
+         from the specified document, including its text, docId, and score.
+     """
+     embeddings_collection = db_client[embeddings_collection_name]
+
+     print(f"Searching within document (docId: {document_name_id})...")
+
+     # MongoDB Atlas Vector Search aggregation pipeline
+     # The 'path' should point to the field containing the embeddings.
+     # The 'filter' stage is crucial for searching within a specific document.
+     pipeline = [
+         {
+             '$vectorSearch': {
+                 'queryVector': aggregate_query_embedding,
+                 'path': 'embedding',
+
+                 #number of nearest neighbors to consider
+                 'numCandidates': 100,
+                 'limit': limit,
+                 'index': index_name,
+
+                 #filter to search only within the specified document
+                 'filter': {
+                     'docId': document_name_id
+                 }
+             }
+         },
+
+         # to exclude the MongoDB internal _id
+         {
+             '$project': {
+                 '_id': 0,
+                 'docId': 1,
+                 'chunk_number': 1,
+                 'chunk_text': 1,
+                 'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
+             }
+         }
+     ]
+
+     results = list(embeddings_collection.aggregate(pipeline))
+
+     if not results:
+         print(f"No relevant chunks found for document '{document_name_id}' with the given query.")
+     else:
+         print(f"Found {len(results)} relevant chunks in document '{document_name_id}':")
+         for i, res in enumerate(results):
+             print(f" Result {i+1} (Score: {res['score']:.4f}):")
+             print(f" Chunk Number: {res['chunk_number']}")
+             print(f" Text: '{res['chunk_text'][:100]}...'") # Print first 100 chars
+             print("-" * 30)
+
+     return results
+
+
+
+ def process_document_and_embed(db_client,
+                                llm_client,
+                                inference_client,
+                                file_path: Path,
+                                chunk_size: int,
+                                embedding_model: str = 'nomic-embed-text:v1.5',
+                                embeddings_collection_name= "doc_chunks",
+                                use_custom_id: str | None = None,
+                                use_custom_input: str | None = None
+                                ) -> list[dict]:
+     """
+     Processes an input document by chunking its text, generating embeddings using
+     Ollama's specified embedding model, and storing these embeddings and chunks
+     in a MongoDB collection.
+
+     Args:
+         db_client: An initialized PyMongo Database instance.
+         file_path: The original path of the document being processed.
+             This path will be used to create a sanitized ID for the
+             document.
+         chunk_size: The desired chunk size in words for text segmentation.
+         embedding_model: The name of the Ollama embedding model to use.
+     """
+
+     input_text= None
+     if use_custom_input is not None:
+         input_text= use_custom_input
+     else:
+         # Read the input text from the file
+         with open(str(file_path), "r") as md_file:
+             input_text = md_file.read()
+
+
+     document_name_id= None
+     if use_custom_id is not None:
+         document_name_id= use_custom_id
+     else:
+         # Create a valid ID for the document from the file name (without extension)
+         file_root = os.path.splitext(file_path.name)[0]
+         document_name_id = make_it_an_id(file_root)
+
+     # Reference the MongoDB collection where chunks will be stored
+     # This single collection will serve as the global 'embeddings_collection'
+     # and document-specific data can be queried using 'docId'.
+     embeddings_collection = db_client[embeddings_collection_name]
+
+     # Check if this document's embeddings already exist in MongoDB
+     # We check if any document with this docId exists in the 'embeddings_collection' collection.
+     an_existing_chunk= embeddings_collection.find_one({'docId': document_name_id})
+     if an_existing_chunk:
+         print(f"Document '{file_path.name}' (ID: {document_name_id}) already processed. Skipping.")
+         return [an_existing_chunk]
+
+     print(f"Processing document '{file_path.name}' (ID: {document_name_id})...")
+
+     embed_locally= False
+     try:
+         models = llm_client.list()
+
+         for model in models.models:
+             if model.model == embedding_model:
+                 embed_locally= True
+     except Exception as e:
+         print("local model list error: ", e)
+
+
+     # Chunk the input text into smaller segments
+     chunks = chunk_text(input_text, chunk_size)
+     print(f"Text chunked into {len(chunks)} segments.")
+
+     # Process each chunk: generate embedding and add to MongoDB
+     res = []
+     for i, chunk in enumerate(chunks):
+         try:
+             print(f"Processing chunk {i+1}/{len(chunks)} for document '{file_path.name}'...")
+             # Generate embedding using the specified Ollama model
+             print("embedding_model is: ", embedding_model)
+             print("if statement is", 'gemini' in embedding_model)
+
+             if 'gemini' in embedding_model:
+
+                 client = genai.Client(api_key= os.environ['GEMINI_API_KEY'])
+                 result = client.models.embed_content(
+                     model= embedding_model,
+                     contents= chunk,
+                     config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
+                 )
+                 if result is not None and result.embeddings:
+                     embedding= result.embeddings[0].values
+                 else:
+                     raise ValueError("Gemini not givingz embeddingzzz")
+
+                 # API safety
+                 time.sleep(1)
+
+             elif embed_locally:
+                 response = llm_client.embeddings(model=embedding_model, prompt=chunk)
+                 embedding= response['embedding']
+
+             else:
+                 embedding = inference_client.feature_extraction(chunk, model=embedding_model)
+
+                 # API rate limiting safety
+                 time.sleep(1)
+                 print("zzzzzzzzz")
+
+             if isinstance(embedding, np.ndarray):
+                 embedding = embedding.tolist()
+             elif not isinstance(embedding, list):
+                 raise TypeError("Embedding is not a list or numpy array. Cannot store in MongoDB.")
+
+
+             # Generate a random suffix for unique chunk IDs
+             random_suffix = uuid.uuid4().hex[:8]
+
+             # Create the unique ID for the chunk for the global 'embeddings_collection' collection
+             # This ID can be stored in the document if needed for external reference
+             chunk_id_global = f"{document_name_id}_chunk_{i + 1}_{random_suffix}"
+             # Create the unique ID for the chunk (local to the document's chunking)
+             chunk_id_doc_specific = f"chunk_{i}_{random_suffix}"
+
+             # Store the chunk data in MongoDB using update_one with upsert=True
+             # This will insert a new document if 'docId' and 'chunk_number' don't match,
+             # or update an existing one if they do.
+             doc_set = {
+                 'chunk_text': chunk,
+                 'embedding': embedding,
+                 'chunk_id_global': chunk_id_global,
+                 'chunk_id_doc_specific': chunk_id_doc_specific
+             }
+             embeddings_collection.update_one(
+                 {'docId': document_name_id, 'chunk_number': i + 1},
+                 {'$set': doc_set},
+                 upsert=True
+             )
+             print(f"Successfully stored chunk {i+1} for '{file_path.name}' in MongoDB.")
+             res.append({**doc_set, "docId": document_name_id, "chunk_number": i + 1})
+
+         except Exception as e:
+             print(f"Error processing chunk {i+1} for '{file_path.name}': {e}")
+             # Continue to the next chunk even if one fails
+             continue
+
+     print(f"Finished processing document '{file_path.name}'. All chunks embedded and stored in MongoDB.")
+     return res
+
+
+
+
+ if __name__ == "__main__":
+     client = MongoClient(os.environ['MONGODB_PEM'])
+     db = client["pem"]
+     collection = db["blogs"]
+     embeddings_collection = db["blog_embeddings"]
+
+     api_key = os.environ['GEMINI_API_KEY']
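A hedged sketch of how the two functions above could be wired together (assumptions: llm_client is an Ollama client, as the docstring implies; MONGODB_PEM points at an Atlas cluster; and the "test_search" vector index already exists on doc_chunks). Per the search_within_document docstring, the query vector is the np.mean of the individual query embeddings:

import os
from pathlib import Path

import numpy as np
from ollama import Client as OllamaClient  # assumed local embedding backend
from pymongo import MongoClient

from pembot.TextEmbedder.mongodb_embedder import (
    process_document_and_embed,
    search_within_document,
)

db = MongoClient(os.environ["MONGODB_PEM"])["pembot"]
llm_client = OllamaClient()

# Chunk, embed, and upsert one markdown file into doc_chunks under a known docId.
process_document_and_embed(
    db, llm_client, inference_client=None,
    file_path=Path("report.md"), chunk_size=170,
    use_custom_id="report",
)

# Average the per-query embeddings into one query vector, then search that document.
queries = ["pricing model", "licensing terms"]
vectors = [llm_client.embeddings(model="nomic-embed-text:v1.5", prompt=q)["embedding"] for q in queries]
aggregate = np.mean(np.array(vectors), axis=0).tolist()
hits = search_within_document(db, aggregate, document_name_id="report", limit=3)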
pembot/TextEmbedder/mongodb_index_creator.py ADDED
@@ -0,0 +1,133 @@
+ from pymongo import MongoClient
+ from pymongo.collection import Collection
+ from pymongo.operations import SearchIndexModel
+ import time
+ import os
+
+ def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768):
+     """
+     Creates a MongoDB Atlas Vector Search index if it does not already exist.
+
+     Args:
+         collection: The PyMongo Collection object on which to create the index.
+         index_name: The desired name for the vector search index.
+         num_dimensions: The number of dimensions for the embedding vectors.
+     """
+
+     # 1. Check if the index already exists
+     existing_indexes = list(collection.list_search_indexes())
+
+     for index in existing_indexes:
+         if index.get('name') == index_name:
+             print(f"Search index '{index_name}' already exists. Skipping creation.")
+
+             # Optional: You can also check if the existing index is "READY"
+             if index.get('status') == 'READY':
+                 print(f"Index '{index_name}' is already ready for querying.")
+             else:
+                 print(f"Index '{index_name}' is currently {index.get('status')}. Polling for readiness...")
+                 # If it exists but is not ready, proceed to the polling loop
+                 _wait_for_index_ready(collection, index_name)
+             return # Exit the function if the index already exists
+
+     # 2. If the index does not exist, proceed to create it
+     print(f"Search index '{index_name}' does not exist. Creating it now...")
+
+     search_index_model = SearchIndexModel(definition={
+         "fields": [
+             {
+                 "type": "vector",
+                 "path": "embedding",
+                 "similarity": "dotProduct", # Or "cosine", "euclidean"
+                 "numDimensions": num_dimensions,
+                 "quantization": "scalar" # Or "none"
+             },
+             {
+                 "type": "filter",
+                 "path": "docId"
+             }
+         ]
+     },
+         name=index_name,
+         type="vectorSearch"
+     )
+
+     try:
+         # Create the search index
+         result = collection.create_search_index(model=search_index_model)
+         print("New search index named " + result + " is building.")
+
+         # Wait for initial sync to complete
+         _wait_for_index_ready(collection, index_name)
+         print(result + " is ready for querying.")
+
+     except Exception as e:
+         print(f"Error creating or waiting for search index '{index_name}': {e}")
+         # Depending on the error, you might want to re-raise or handle it differently.
+
+ def _wait_for_index_ready(collection: Collection, index_name: str):
+     """
+     Helper function to poll the index status until it's ready.
+     """
+     print("Polling to check if the index is ready. This may take some time (up to a few minutes for large indexes).")
+
+     start_time = time.time()
+     timeout = 300 # 5 minutes timeout, adjust as needed
+
+     while True:
+         indices= None
+         try:
+             indices = list(collection.list_search_indexes(name=index_name))
+             if len(indices) == 1 and indices[0].get("status") == "READY":
+                 print(f"Index '{index_name}' is READY for querying.")
+                 break
+             elif len(indices) == 0:
+                 print(f"Warning: Index '{index_name}' not found during polling, might have failed creation or name mismatch.")
+                 break # Exit if index disappears
+             else:
+                 current_status = indices[0].get("status", "UNKNOWN")
+                 print(f"Index '{index_name}' status: {current_status}. Waiting...")
+         except Exception as e:
+             print(f"Error while polling index status: {e}. Retrying...")
+
+         if time.time() - start_time > timeout:
+             status= indices[0].get('status') if indices else 'N/A'
+             print(f"Timeout: Index '{index_name}' did not become ready within {timeout} seconds. Current status: {status}")
+             break # Exit on timeout
+
+         time.sleep(10) # Poll less frequently, every 10 seconds
+
+ # --- Example Usage ---
+ if __name__ == "__main__":
+
+     # Replace with your database and collection names
+     DATABASE_NAME = "pembot"
+     COLLECTION_NAME = "doc_chunks"
+     VECTOR_INDEX_NAME = "test_search"
+
+     # Connect to MongoDB
+     mongo_client= None
+
+     try:
+         mongo_client = MongoClient(os.environ["MONGODB_PEM"])
+         db = mongo_client[DATABASE_NAME]
+         collection = db[COLLECTION_NAME]
+         print("Connected to MongoDB successfully.")
+
+         # Define dimensions (e.g., for nomic-embed-text:v1.5)
+         EMBEDDING_DIMENSIONS = 768 # Check your model's output dimension
+
+         # Call the function to create the index, with existence check
+         create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
+
+         # Test calling it again to see the "already exists" message
+         create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
+
+     except Exception as e:
+         print(f"Failed to connect to MongoDB or process: {e}")
+     finally:
+         if 'mongo_client' in locals() and mongo_client:
+             mongo_client.close()
+             print("MongoDB connection closed.")
+
+
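Because $vectorSearch in search_within_document filters on docId and queries the embedding path, the index defined above has to exist before any queries run, and numDimensions has to match the embedding model actually used. A short sketch of the one-time setup (assuming a deployment with Atlas Vector Search available via MONGODB_PEM and the import path following the file layout above):

import os
from pymongo import MongoClient
from pembot.TextEmbedder.mongodb_index_creator import create_vector_index

collection = MongoClient(os.environ["MONGODB_PEM"])["pembot"]["doc_chunks"]

# 768 matches nomic-embed-text:v1.5, the default in process_document_and_embed;
# Gemini embedding models emit a different width, so adjust if you embed with those.
create_vector_index(collection, "test_search", num_dimensions=768)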
pembot/TextEmbedder/vector_query.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ from google import genai
+ from pymongo import MongoClient
+ from TextEmbedder import gemini_embedder, mongodb_embedder
+ from os import environ
+ import re
+
+
+ def vectory_query_run(genai_client, docs_collection, embeddings_collection, search_string, database, index_name, chunksize= 170):
+
+     query_embedding = gemini_embedder.get_embedding(genai_client, search_string)
+
+     # Define the aggregation pipeline
+     pipeline = [
+         {
+             "$vectorSearch": {
+                 "queryVector": query_embedding,
+                 "index": f"{index_name}",
+                 "path": "embedding",
+                 "numCandidates": 3,
+                 "limit": 3,
+                 "k": 3 # Number of results to return
+             }
+         }
+     ]
+
+     # Execute the aggregation pipeline
+     results = list(embeddings_collection.aggregate(pipeline))
+     for i, res in enumerate(results):
+         doc = docs_collection.find_one({"_id": res['docId']})
+
+         wordslist = mongodb_embedder.clean_text(doc['content']).split()
+
+         cn = int(res['chunk_number'])
+         search_hit = " ".join(wordslist[cn * chunksize: (cn+1) * chunksize])
+
+         print(f"RESULT #{i + 1}")
+         print(search_hit)
+         print("\n\n")
+
+
+
+ if __name__ == "__main__":
+     mc = MongoClient(environ['MONGODB_PEM'])
+
+     api_key = os.environ['GEMINI_API_KEY']
+     genai_client= genai.Client(api_key= api_key)
+
+     try:
+         mc.admin.command('ping') # This is a simple way to check the connection
+         database = mc["pem"]
+
+         while True:
+             query = input("enter search query [Enter 'exit' to exit]: ")
+             if query.lower() == 'exit':
+                 print('bye!')
+                 break
+             docs_collection= database["blogs"]
+             embeddings_collection= database["blog_embeddings"]
+             vectory_query_run(genai_client, docs_collection, embeddings_collection, query, database, index_name= 'vector_index_blog_chunks')
+
+     finally:
+         mc.close()
+
pembot/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ A Python Package to convert PEM blog content to useful information by leveraging LLMs
+ """
+ __version__ = '0.0.3'
+ from .main import save_to_json_file, make_query
+ __all__ = ["save_to_json_file", "make_query"]
pembot/config/config.yaml ADDED
@@ -0,0 +1,5 @@
+ OUTPUT_DIR: /home/cyto/dev/pembotdir
+ PAGE_DELIMITER: ___________________________ NEXT PAGE ___________________________
+ app:
+   name: pembot
+   version: 0.0.3
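The packaged config is plain YAML; pembot's own loading code is not part of this file, so the following is only a hypothetical illustration of reading these keys with PyYAML:

import yaml

with open("pembot/config/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["OUTPUT_DIR"])      # /home/cyto/dev/pembotdir
print(cfg["PAGE_DELIMITER"])  # presumably the separator inserted between converted pages
print(cfg["app"]["name"], cfg["app"]["version"])  # pembot 0.0.3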
pembot/gartner.py ADDED
@@ -0,0 +1,140 @@
1
+ from selenium.webdriver.common.by import By
2
+ from selenium.webdriver.support.ui import WebDriverWait
3
+ from selenium.webdriver.support import expected_conditions as EC
4
+ from bs4 import BeautifulSoup
5
+ import json
6
+ import time
7
+ import random
8
+ import undetected_chromedriver as uc
9
+ driver = uc.Chrome(headless=False,use_subprocess=False)
10
+
11
+ # Random delays between actions
12
+ def human_delay():
13
+ time.sleep(random.uniform(1, 3))
14
+
15
+ # Use this before each navigation
16
+ driver.get("https://www.gartner.com/en/insights")
17
+ human_delay()
18
+
19
+ # Wait for the dynamic content to load
20
+ wait = WebDriverWait(driver, 20)
21
+ try:
22
+ print("waiting")
23
+ wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'row.dynamic-content')))
24
+ # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.row.dynamic-content')))
25
+ print("waitdone")
26
+ except Exception as e:
27
+ print("Timed out waiting for page to load")
28
+ driver.quit()
29
+ exit()
30
+
31
+ # Parse the page with BeautifulSoup
32
+ soup = BeautifulSoup(driver.page_source, 'html.parser')
33
+ main_div = soup.find('div', class_='row dynamic-content')
34
+ category_blocks = main_div.find_all('div', class_='category-block') if main_div else []
35
+
36
+ print("category blocks: ", category_blocks)
37
+
38
+ result = {}
39
+
40
+ for block in category_blocks:
41
+ # Extract category name
42
+ a_tag = block.find('a')
43
+ if not a_tag:
44
+ continue
45
+ h4 = a_tag.find('h4', class_='categoryheadline')
46
+ if not h4:
47
+ continue
48
+ category_name = h4.get_text(strip=True)
49
+ result[category_name] = {}
50
+ print("result", result)
51
+
52
+ # Extract all article links in the category
53
+ items_content = block.find('div', class_='categoryItemsContent')
54
+ if not items_content:
55
+ continue
56
+ links = items_content.find_all('a', href=True)
57
+
58
+ for link in links:
59
+ href = link['href']
60
+ article_title = link.get_text(strip=True)
61
+
62
+ # Form absolute URL
63
+ full_url = f'https://www.gartner.com{href}' if href.startswith('/') else href
64
+
65
+ # Open article in a new tab
66
+ original_window = driver.current_window_handle
67
+ driver.switch_to.new_window('tab')
68
+ try:
69
+ human_delay()
70
+ driver.get(full_url)
71
+ human_delay()
72
+ # Wait for article content to load
73
+ article_wait = WebDriverWait(driver, 15)
74
+ article_wait.until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
75
+
76
+ # Parse article content
77
+ # Modify the article content extraction part of your script:
78
+
79
+ # Inside the article processing block after driver.get(full_url):
80
+ article_soup = BeautifulSoup(driver.page_source, 'html.parser')
81
+
82
+ # Extract article title
83
+ title = article_soup.find('h1').get_text(strip=True) if article_soup.find('h1') else "Untitled"
84
+
85
+ # Extract authors and date
86
+ byline = ""
87
+ authors = []
88
+ date = ""
89
+ byline_element = article_soup.find('article', class_='article-text').find('span', class_='rte')
90
+ if byline_element:
91
+ byline_text = byline_element.get_text(strip=True, separator='|').split('|')
92
+ for part in byline_text:
93
+ if 'By ' in part:
94
+ authors = [a.strip() for a in part.replace('By ', '').split(' and ')]
95
+ if any(word in part.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
96
+ 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
97
+ date = part.strip()
98
+
99
+ # Format title with metadata
100
+ formatted_title = f"{title}"
101
+ if authors or date:
102
+ formatted_title += " ("
103
+ if authors:
104
+ formatted_title += f"by:{','.join(authors)}"
105
+ if date:
106
+ if authors: formatted_title += ";"
107
+ formatted_title += f"on:{date}"
108
+ formatted_title += ")"
109
+
110
+ # Extract main content
111
+ content_parts = []
112
+ main_articles = article_soup.find_all('article', class_='article-text')
113
+ for article in main_articles:
114
+ # Skip the byline article
115
+ if 'By ' in article.get_text(): continue
116
+
117
+ # Get all text elements including headings
118
+ for element in article.find_all(['p', 'h2', 'h3', 'h4']):
119
+ text = element.get_text(strip=True, separator=' ')
120
+ if text:
121
+ content_parts.append(text)
122
+
123
+ full_content = '\n'.join([t for t in content_parts if t.strip()])
124
+
125
+ # Store in dictionary structure
126
+ result[category_name][formatted_title] = full_content
127
+ except Exception as e:
128
+ print(f"Error retrieving {full_url}: {str(e)}")
129
+ result[category_name].append('')
130
+ finally:
131
+ driver.close()
132
+ driver.switch_to.window(original_window)
133
+ time.sleep(1) # Polite delay
134
+
135
+ # Save the results to a JSON file
136
+ with open('gartner_articles.json', 'w', encoding='utf-8') as f:
137
+ print("dumping...")
138
+ json.dump(result, f, ensure_ascii=False, indent=4)
139
+
140
+ driver.quit()
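The script ends by dumping everything it scraped into gartner_articles.json as {category: {formatted_title: article_text}}. A hypothetical read-back of that file (not part of the package), just to show the shape of the output:

import json

with open("gartner_articles.json", encoding="utf-8") as f:
    articles = json.load(f)

for category, items in articles.items():
    for title, text in items.items():
        print(f"{category} | {title} | {len(text.split())} words")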