pembot-0.0.3-py2.py3-none-any.whl
This diff shows the content of a publicly available package version that has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic.
- pembot/.git/COMMIT_EDITMSG +1 -0
- pembot/.git/HEAD +1 -0
- pembot/.git/config +11 -0
- pembot/.git/description +1 -0
- pembot/.git/hooks/applypatch-msg.sample +15 -0
- pembot/.git/hooks/commit-msg.sample +24 -0
- pembot/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/.git/hooks/post-update.sample +8 -0
- pembot/.git/hooks/pre-applypatch.sample +14 -0
- pembot/.git/hooks/pre-commit.sample +49 -0
- pembot/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/.git/hooks/pre-push.sample +53 -0
- pembot/.git/hooks/pre-rebase.sample +169 -0
- pembot/.git/hooks/pre-receive.sample +24 -0
- pembot/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/.git/hooks/push-to-checkout.sample +78 -0
- pembot/.git/hooks/sendemail-validate.sample +77 -0
- pembot/.git/hooks/update.sample +128 -0
- pembot/.git/index +0 -0
- pembot/.git/info/exclude +6 -0
- pembot/.git/logs/HEAD +6 -0
- pembot/.git/logs/refs/heads/main +6 -0
- pembot/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/.git/logs/refs/remotes/origin/main +5 -0
- pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c +0 -0
- pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1 +0 -0
- pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63 +0 -0
- pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7 +0 -0
- pembot/.git/objects/28/db0ab48059acccd7d257aa02e52e9b6b83a4a5 +0 -0
- pembot/.git/objects/35/97e518a8658280be9f377f78edf1dfa1f23814 +0 -0
- pembot/.git/objects/3d/07d3b29ff53d95de3898fb786d61732f210515 +0 -0
- pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb +0 -0
- pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba +0 -0
- pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0 +0 -0
- pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904 +0 -0
- pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba +0 -0
- pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177 +0 -0
- pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9 +0 -0
- pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331 +0 -0
- pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5 +0 -0
- pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25 +0 -0
- pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7 +0 -0
- pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8 +0 -0
- pembot/.git/objects/93/8f29d9b4b1ae86e39dddf9e3d115a82ddfc9b6 +0 -0
- pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456 +0 -0
- pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7 +0 -0
- pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880 +0 -0
- pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d +0 -0
- pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb +0 -0
- pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3 +0 -0
- pembot/.git/objects/bf/32a7e6872e5dc4025ee3df3c921ec7ade0855f +0 -0
- pembot/.git/objects/c0/793458db6e1bee7f79f1a504fb8ff4963f8ed3 +0 -0
- pembot/.git/objects/c2/443060c07101948487cfa93cc39e082e9e0f5f +0 -0
- pembot/.git/objects/e5/3070f2b07f45d031444b09b1b38658f3caf29e +0 -0
- pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc +0 -0
- pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d +0 -0
- pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58 +0 -0
- pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238 +0 -0
- pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552 +0 -0
- pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8 +0 -0
- pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300 +0 -0
- pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88 +0 -0
- pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6 +0 -0
- pembot/.git/objects/fb/6c90c9ce5e0cdfbe074a3f060afc66f62eefde +0 -0
- pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2 +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack +0 -0
- pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev +0 -0
- pembot/.git/packed-refs +2 -0
- pembot/.git/refs/heads/main +1 -0
- pembot/.git/refs/remotes/origin/HEAD +1 -0
- pembot/.git/refs/remotes/origin/main +1 -0
- pembot/.gitignore +7 -0
- pembot/AnyToText/__init__.py +0 -0
- pembot/AnyToText/convertor.py +260 -0
- pembot/LICENSE +674 -0
- pembot/TextEmbedder/__init__.py +0 -0
- pembot/TextEmbedder/gemini_embedder.py +27 -0
- pembot/TextEmbedder/mongodb_embedder.py +258 -0
- pembot/TextEmbedder/mongodb_index_creator.py +133 -0
- pembot/TextEmbedder/vector_query.py +64 -0
- pembot/__init__.py +6 -0
- pembot/config/config.yaml +5 -0
- pembot/gartner.py +140 -0
- pembot/main.py +208 -0
- pembot/output_structure_local.py +63 -0
- pembot/pdf2markdown/.git/HEAD +1 -0
- pembot/pdf2markdown/.git/config +11 -0
- pembot/pdf2markdown/.git/description +1 -0
- pembot/pdf2markdown/.git/hooks/applypatch-msg.sample +15 -0
- pembot/pdf2markdown/.git/hooks/commit-msg.sample +24 -0
- pembot/pdf2markdown/.git/hooks/fsmonitor-watchman.sample +174 -0
- pembot/pdf2markdown/.git/hooks/post-update.sample +8 -0
- pembot/pdf2markdown/.git/hooks/pre-applypatch.sample +14 -0
- pembot/pdf2markdown/.git/hooks/pre-commit.sample +49 -0
- pembot/pdf2markdown/.git/hooks/pre-merge-commit.sample +13 -0
- pembot/pdf2markdown/.git/hooks/pre-push.sample +53 -0
- pembot/pdf2markdown/.git/hooks/pre-rebase.sample +169 -0
- pembot/pdf2markdown/.git/hooks/pre-receive.sample +24 -0
- pembot/pdf2markdown/.git/hooks/prepare-commit-msg.sample +42 -0
- pembot/pdf2markdown/.git/hooks/push-to-checkout.sample +78 -0
- pembot/pdf2markdown/.git/hooks/sendemail-validate.sample +77 -0
- pembot/pdf2markdown/.git/hooks/update.sample +128 -0
- pembot/pdf2markdown/.git/index +0 -0
- pembot/pdf2markdown/.git/info/exclude +6 -0
- pembot/pdf2markdown/.git/logs/HEAD +1 -0
- pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack +0 -0
- pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev +0 -0
- pembot/pdf2markdown/.git/packed-refs +2 -0
- pembot/pdf2markdown/.git/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/refs/remotes/origin/HEAD +1 -0
- pembot/pdf2markdown/LICENSE +21 -0
- pembot/pdf2markdown/README.md +107 -0
- pembot/pdf2markdown/__init__.py +0 -0
- pembot/pdf2markdown/config/config.yaml +2 -0
- pembot/pdf2markdown/extract.py +888 -0
- pembot/pdf2markdown/requirements.txt +8 -0
- pembot/pem.py +157 -0
- pembot/query.py +204 -0
- pembot/utils/__init__.py +0 -0
- pembot/utils/inference_client.py +132 -0
- pembot/utils/string_tools.py +45 -0
- pembot-0.0.3.dist-info/METADATA +8 -0
- pembot-0.0.3.dist-info/RECORD +129 -0
- pembot-0.0.3.dist-info/WHEEL +5 -0
- pembot-0.0.3.dist-info/licenses/LICENSE +674 -0
pembot/TextEmbedder/gemini_embedder.py
ADDED
@@ -0,0 +1,27 @@
+from google.genai import types
+import requests
+import json
+import os
+from google import genai
+
+
+def get_embedding(client, text):
+
+    result = client.models.embed_content(
+        model="gemini-embedding-exp-03-07",
+        contents= text,
+        config= types.EmbedContentConfig(task_type= "RETRIEVAL_DOCUMENT"),
+    )
+
+    if result:
+        return result.embeddings[0].values
+    else:
+        print(f"Error: {result}")
+        #raise ValueError("Error communicating with the gemini embedding API")
+
+
+if __name__ == "__main__":
+
+    api_key = os.environ['GEMINI_API_KEY']
+    genai_client= genai.Client(api_key= api_key)
+    print(get_embedding(genai_client, "send bobs"))
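For reference, a minimal sketch of calling the helper above from outside the package, assuming the wheel is installed (importable as pembot) and GEMINI_API_KEY is set; this is illustrative only and not part of the packaged code:

# Illustrative sketch only; not part of the pembot 0.0.3 wheel.
# Assumes the package is importable as `pembot` and GEMINI_API_KEY is set.
import os
from google import genai
from pembot.TextEmbedder import gemini_embedder

client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
vector = gemini_embedder.get_embedding(client, "example text to embed")
if vector is not None:
    # get_embedding returns the raw list of floats for the first embedding
    print(f"embedding dimensions: {len(vector)}")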
pembot/TextEmbedder/mongodb_embedder.py
ADDED
@@ -0,0 +1,258 @@
+from pathlib import Path
+import uuid
+from google import genai
+from google.genai import types
+from pymongo import MongoClient
+import os
+import re
+import time
+import numpy as np
+
+from pembot.AnyToText.convertor import chunk_text
+from pembot.utils.string_tools import make_it_an_id
+
+
+
+def clean_text(text):
+    # Use regex to find words with at least one alphanumeric character
+    cleaned_words = [word for word in text.split() if re.search(r'\w', word)]
+    # Join the cleaned words back into a single string
+    cleaned_text = ' '.join(cleaned_words)
+    return cleaned_text
+
+
+
+def search_within_document(
+        db_client,
+        aggregate_query_embedding,
+        document_name_id: str,
+        limit: int = 5,
+        index_name: str = "test_search",
+        embeddings_collection_name: str= "doc_chunks",
+):
+    """
+    Performs a vector similarity search within the chunks of a specific document
+    in the 'embeddings_collection' MongoDB collection.
+
+    Args:
+        db_client: An initialized PyMongo Database instance.
+        aggregate_query_embedding: The np.mean of queries vectors of your search query.
+        document_name_id: This will be used to filter by the 'docId'.
+        limit: The maximum number of similar chunks to return.
+        index_name: The name of your MongoDB Atlas Vector Search index.
+            You MUST have a vector search index created on the 'embedding' field
+            of the 'embeddings_collection' collection for this to work efficiently.
+
+    Returns:
+        A list of dictionaries, where each dictionary represents a matching chunk
+        from the specified document, including its text, docId, and score.
+    """
+    embeddings_collection = db_client[embeddings_collection_name]
+
+    print(f"Searching within document (docId: {document_name_id})...")
+
+    # MongoDB Atlas Vector Search aggregation pipeline
+    # The 'path' should point to the field containing the embeddings.
+    # The 'filter' stage is crucial for searching within a specific document.
+    pipeline = [
+        {
+            '$vectorSearch': {
+                'queryVector': aggregate_query_embedding,
+                'path': 'embedding',
+
+                #number of nearest neighbors to consider
+                'numCandidates': 100,
+                'limit': limit,
+                'index': index_name,
+
+                #filter to search only within the specified document
+                'filter': {
+                    'docId': document_name_id
+                }
+            }
+        },
+
+        # to exclude the MongoDB internal _id
+        {
+            '$project': {
+                '_id': 0,
+                'docId': 1,
+                'chunk_number': 1,
+                'chunk_text': 1,
+                'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
+            }
+        }
+    ]
+
+    results = list(embeddings_collection.aggregate(pipeline))
+
+    if not results:
+        print(f"No relevant chunks found for document '{document_name_id}' with the given query.")
+    else:
+        print(f"Found {len(results)} relevant chunks in document '{document_name_id}':")
+        for i, res in enumerate(results):
+            print(f" Result {i+1} (Score: {res['score']:.4f}):")
+            print(f" Chunk Number: {res['chunk_number']}")
+            print(f" Text: '{res['chunk_text'][:100]}...'") # Print first 100 chars
+            print("-" * 30)
+
+    return results
+
+
+
+def process_document_and_embed(db_client,
+        llm_client,
+        inference_client,
+        file_path: Path,
+        chunk_size: int,
+        embedding_model: str = 'nomic-embed-text:v1.5',
+        embeddings_collection_name= "doc_chunks",
+        use_custom_id: str | None = None,
+        use_custom_input: str | None = None
+) -> list[dict]:
+    """
+    Processes an input document by chunking its text, generating embeddings using
+    Ollama's specified embedding model, and storing these embeddings and chunks
+    in a MongoDB collection.
+
+    Args:
+        db_client: An initialized PyMongo Database instance.
+        file_path: The original path of the document being processed.
+            This path will be used to create a sanitized ID for the
+            document.
+        chunk_size: The desired chunk size in words for text segmentation.
+        embedding_model: The name of the Ollama embedding model to use.
+    """
+
+    input_text= None
+    if use_custom_input is not None:
+        input_text= use_custom_input
+    else:
+        # Read the input text from the file
+        with open(str(file_path), "r") as md_file:
+            input_text = md_file.read()
+
+
+    document_name_id= None
+    if use_custom_id is not None:
+        document_name_id= use_custom_id
+    else:
+        # Create a valid ID for the document from the file name (without extension)
+        file_root = os.path.splitext(file_path.name)[0]
+        document_name_id = make_it_an_id(file_root)
+
+    # Reference the MongoDB collection where chunks will be stored
+    # This single collection will serve as the global 'embeddings_collection'
+    # and document-specific data can be queried using 'docId'.
+    embeddings_collection = db_client[embeddings_collection_name]
+
+    # Check if this document's embeddings already exist in MongoDB
+    # We check if any document with this docId exists in the 'embeddings_collection' collection.
+    an_existing_chunk= embeddings_collection.find_one({'docId': document_name_id})
+    if an_existing_chunk:
+        print(f"Document '{file_path.name}' (ID: {document_name_id}) already processed. Skipping.")
+        return [an_existing_chunk]
+
+    print(f"Processing document '{file_path.name}' (ID: {document_name_id})...")
+
+    embed_locally= False
+    try:
+        models = llm_client.list()
+
+        for model in models.models:
+            if model.model == embedding_model:
+                embed_locally= True
+    except Exception as e:
+        print("local model list error: ", e)
+
+
+    # Chunk the input text into smaller segments
+    chunks = chunk_text(input_text, chunk_size)
+    print(f"Text chunked into {len(chunks)} segments.")
+
+    # Process each chunk: generate embedding and add to MongoDB
+    res = []
+    for i, chunk in enumerate(chunks):
+        try:
+            print(f"Processing chunk {i+1}/{len(chunks)} for document '{file_path.name}'...")
+            # Generate embedding using the specified Ollama model
+            print("embedding_model is: ", embedding_model)
+            print("if statement is", 'gemini' in embedding_model)
+
+            if 'gemini' in embedding_model:
+
+                client = genai.Client(api_key= os.environ['GEMINI_API_KEY'])
+                result = client.models.embed_content(
+                    model= embedding_model,
+                    contents= chunk,
+                    config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
+                )
+                if result is not None and result.embeddings:
+                    embedding= result.embeddings[0].values
+                else:
+                    raise ValueError("Gemini not givingz embeddingzzz")
+
+                # API safety
+                time.sleep(1)
+
+            elif embed_locally:
+                response = llm_client.embeddings(model=embedding_model, prompt=chunk)
+                embedding= response['embedding']
+
+            else:
+                embedding = inference_client.feature_extraction(chunk, model=embedding_model)
+
+                # API rate limiting safety
+                time.sleep(1)
+                print("zzzzzzzzz")
+
+            if isinstance(embedding, np.ndarray):
+                embedding = embedding.tolist()
+            elif not isinstance(embedding, list):
+                raise TypeError("Embedding is not a list or numpy array. Cannot store in MongoDB.")
+
+
+            # Generate a random suffix for unique chunk IDs
+            random_suffix = uuid.uuid4().hex[:8]
+
+            # Create the unique ID for the chunk for the global 'embeddings_collection' collection
+            # This ID can be stored in the document if needed for external reference
+            chunk_id_global = f"{document_name_id}_chunk_{i + 1}_{random_suffix}"
+            # Create the unique ID for the chunk (local to the document's chunking)
+            chunk_id_doc_specific = f"chunk_{i}_{random_suffix}"
+
+            # Store the chunk data in MongoDB using update_one with upsert=True
+            # This will insert a new document if 'docId' and 'chunk_number' don't match,
+            # or update an existing one if they do.
+            doc_set = {
+                'chunk_text': chunk,
+                'embedding': embedding,
+                'chunk_id_global': chunk_id_global,
+                'chunk_id_doc_specific': chunk_id_doc_specific
+            }
+            embeddings_collection.update_one(
+                {'docId': document_name_id, 'chunk_number': i + 1},
+                {'$set': doc_set},
+                upsert=True
+            )
+            print(f"Successfully stored chunk {i+1} for '{file_path.name}' in MongoDB.")
+            res.append({**doc_set, "docId": document_name_id, "chunk_number": i + 1})
+
+        except Exception as e:
+            print(f"Error processing chunk {i+1} for '{file_path.name}': {e}")
+            # Continue to the next chunk even if one fails
+            continue
+
+    print(f"Finished processing document '{file_path.name}'. All chunks embedded and stored in MongoDB.")
+    return res
+
+
+
+
+if __name__ == "__main__":
+    client = MongoClient(os.environ['MONGODB_PEM'])
+    db = client["pem"]
+    collection = db["blogs"]
+    embeddings_collection = db["blog_embeddings"]
+
+    api_key = os.environ['GEMINI_API_KEY']
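The __main__ block shown above only sets up clients and collections, so here is a rough sketch of how process_document_and_embed and search_within_document might be wired together. The database name, file path, and use of the Gemini branch (which sidesteps llm_client and inference_client) are assumptions for illustration, not part of the packaged code; it also assumes the Atlas vector index built by mongodb_index_creator.py already exists on the doc_chunks collection.

# Illustrative sketch only; not part of the pembot 0.0.3 wheel.
# Assumes MONGODB_PEM and GEMINI_API_KEY are set and a local report.md exists (hypothetical).
import os
from pathlib import Path
from pymongo import MongoClient
from google import genai
from pembot.TextEmbedder import gemini_embedder
from pembot.TextEmbedder.mongodb_embedder import (
    process_document_and_embed,
    search_within_document,
)

db = MongoClient(os.environ["MONGODB_PEM"])["pembot"]

# Chunk the file, embed each chunk via the 'gemini' branch, and upsert into 'doc_chunks'.
stored_chunks = process_document_and_embed(
    db_client=db,
    llm_client=None,          # no local Ollama client; the failed model lookup is caught and logged
    inference_client=None,    # unused when a 'gemini' model name is given
    file_path=Path("report.md"),
    chunk_size=170,
    embedding_model="gemini-embedding-exp-03-07",
)

# Embed a query and search within the same document (requires the 'test_search' index).
gclient = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
query_embedding = gemini_embedder.get_embedding(gclient, "what does the report conclude?")
hits = search_within_document(db, query_embedding, stored_chunks[0]["docId"], limit=3)
for hit in hits:
    print(hit["chunk_number"], hit["score"])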
pembot/TextEmbedder/mongodb_index_creator.py
ADDED
@@ -0,0 +1,133 @@
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from pymongo.operations import SearchIndexModel
+import time
+import os
+
+def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768):
+    """
+    Creates a MongoDB Atlas Vector Search index if it does not already exist.
+
+    Args:
+        collection: The PyMongo Collection object on which to create the index.
+        index_name: The desired name for the vector search index.
+        num_dimensions: The number of dimensions for the embedding vectors.
+    """
+
+    # 1. Check if the index already exists
+    existing_indexes = list(collection.list_search_indexes())
+
+    for index in existing_indexes:
+        if index.get('name') == index_name:
+            print(f"Search index '{index_name}' already exists. Skipping creation.")
+
+            # Optional: You can also check if the existing index is "READY"
+            if index.get('status') == 'READY':
+                print(f"Index '{index_name}' is already ready for querying.")
+            else:
+                print(f"Index '{index_name}' is currently {index.get('status')}. Polling for readiness...")
+                # If it exists but is not ready, proceed to the polling loop
+                _wait_for_index_ready(collection, index_name)
+            return # Exit the function if the index already exists
+
+    # 2. If the index does not exist, proceed to create it
+    print(f"Search index '{index_name}' does not exist. Creating it now...")
+
+    search_index_model = SearchIndexModel(definition={
+        "fields": [
+            {
+                "type": "vector",
+                "path": "embedding",
+                "similarity": "dotProduct", # Or "cosine", "euclidean"
+                "numDimensions": num_dimensions,
+                "quantization": "scalar" # Or "none"
+            },
+            {
+                "type": "filter",
+                "path": "docId"
+            }
+        ]
+    },
+    name=index_name,
+    type="vectorSearch"
+    )
+
+    try:
+        # Create the search index
+        result = collection.create_search_index(model=search_index_model)
+        print("New search index named " + result + " is building.")
+
+        # Wait for initial sync to complete
+        _wait_for_index_ready(collection, index_name)
+        print(result + " is ready for querying.")
+
+    except Exception as e:
+        print(f"Error creating or waiting for search index '{index_name}': {e}")
+        # Depending on the error, you might want to re-raise or handle it differently.
+
+def _wait_for_index_ready(collection: Collection, index_name: str):
+    """
+    Helper function to poll the index status until it's ready.
+    """
+    print("Polling to check if the index is ready. This may take some time (up to a few minutes for large indexes).")
+
+    start_time = time.time()
+    timeout = 300 # 5 minutes timeout, adjust as needed
+
+    while True:
+        indices= None
+        try:
+            indices = list(collection.list_search_indexes(name=index_name))
+            if len(indices) == 1 and indices[0].get("status") == "READY":
+                print(f"Index '{index_name}' is READY for querying.")
+                break
+            elif len(indices) == 0:
+                print(f"Warning: Index '{index_name}' not found during polling, might have failed creation or name mismatch.")
+                break # Exit if index disappears
+            else:
+                current_status = indices[0].get("status", "UNKNOWN")
+                print(f"Index '{index_name}' status: {current_status}. Waiting...")
+        except Exception as e:
+            print(f"Error while polling index status: {e}. Retrying...")
+
+        if time.time() - start_time > timeout:
+            status= indices[0].get('status') if indices else 'N/A'
+            print(f"Timeout: Index '{index_name}' did not become ready within {timeout} seconds. Current status: {status}")
+            break # Exit on timeout
+
+        time.sleep(10) # Poll less frequently, every 10 seconds
+
+# --- Example Usage ---
+if __name__ == "__main__":
+
+    # Replace with your database and collection names
+    DATABASE_NAME = "pembot"
+    COLLECTION_NAME = "doc_chunks"
+    VECTOR_INDEX_NAME = "test_search"
+
+    # Connect to MongoDB
+    mongo_client= None
+
+    try:
+        mongo_client = MongoClient(os.environ["MONGODB_PEM"])
+        db = mongo_client[DATABASE_NAME]
+        collection = db[COLLECTION_NAME]
+        print("Connected to MongoDB successfully.")
+
+        # Define dimensions (e.g., for nomic-embed-text:v1.5)
+        EMBEDDING_DIMENSIONS = 768 # Check your model's output dimension
+
+        # Call the function to create the index, with existence check
+        create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
+
+        # Test calling it again to see the "already exists" message
+        create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
+
+    except Exception as e:
+        print(f"Failed to connect to MongoDB or process: {e}")
+    finally:
+        if 'mongo_client' in locals() and mongo_client:
+            mongo_client.close()
+            print("MongoDB connection closed.")
+
+
pembot/TextEmbedder/vector_query.py
ADDED
@@ -0,0 +1,64 @@
+import os
+from google import genai
+from pymongo import MongoClient
+from TextEmbedder import gemini_embedder, mongodb_embedder
+from os import environ
+import re
+
+
+def vectory_query_run(genai_client, docs_collection, embeddings_collection, search_string, database, index_name, chunksize= 170):
+
+    query_embedding = gemini_embedder.get_embedding(genai_client, search_string)
+
+    # Define the aggregation pipeline
+    pipeline = [
+        {
+            "$vectorSearch": {
+                "queryVector": query_embedding,
+                "index": f"{index_name}",
+                "path": "embedding",
+                "numCandidates": 3,
+                "limit": 3,
+                "k": 3 # Number of results to return
+            }
+        }
+    ]
+
+    # Execute the aggregation pipeline
+    results = list(embeddings_collection.aggregate(pipeline))
+    for i, res in enumerate(results):
+        doc = docs_collection.find_one({"_id": res['docId']})
+
+        wordslist = mongodb_embedder.clean_text(doc['content']).split()
+
+        cn = int(res['chunk_number'])
+        search_hit = " ".join(wordslist[cn * chunksize: (cn+1) * chunksize])
+
+        print(f"RESULT #{i + 1}")
+        print(search_hit)
+        print("\n\n")
+
+
+
+if __name__ == "__main__":
+    mc = MongoClient(environ['MONGODB_PEM'])
+
+    api_key = os.environ['GEMINI_API_KEY']
+    genai_client= genai.Client(api_key= api_key)
+
+    try:
+        mc.admin.command('ping') # This is a simple way to check the connection
+        database = mc["pem"]
+
+        while True:
+            query = input("enter search query [Enter 'exit' to exit]: ")
+            if query.lower() == 'exit':
+                print('bye!')
+                break
+            docs_collection= database["blogs"]
+            embeddings_collection= database["blog_embeddings"]
+            vectory_query_run(genai_client, docs_collection, embeddings_collection, query, database, index_name= 'vector_index_blog_chunks')
+
+    finally:
+        mc.close()
+
pembot/__init__.py
ADDED
pembot/gartner.py
ADDED
@@ -0,0 +1,140 @@
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+import json
+import time
+import random
+import undetected_chromedriver as uc
+driver = uc.Chrome(headless=False,use_subprocess=False)
+
+# Random delays between actions
+def human_delay():
+    time.sleep(random.uniform(1, 3))
+
+# Use this before each navigation
+driver.get("https://www.gartner.com/en/insights")
+human_delay()
+
+# Wait for the dynamic content to load
+wait = WebDriverWait(driver, 20)
+try:
+    print("waiting")
+    wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'row.dynamic-content')))
+    # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.row.dynamic-content')))
+    print("waitdone")
+except Exception as e:
+    print("Timed out waiting for page to load")
+    driver.quit()
+    exit()
+
+# Parse the page with BeautifulSoup
+soup = BeautifulSoup(driver.page_source, 'html.parser')
+main_div = soup.find('div', class_='row dynamic-content')
+category_blocks = main_div.find_all('div', class_='category-block') if main_div else []
+
+print("category blocks: ", category_blocks)
+
+result = {}
+
+for block in category_blocks:
+    # Extract category name
+    a_tag = block.find('a')
+    if not a_tag:
+        continue
+    h4 = a_tag.find('h4', class_='categoryheadline')
+    if not h4:
+        continue
+    category_name = h4.get_text(strip=True)
+    result[category_name] = {}
+    print("result", result)
+
+    # Extract all article links in the category
+    items_content = block.find('div', class_='categoryItemsContent')
+    if not items_content:
+        continue
+    links = items_content.find_all('a', href=True)
+
+    for link in links:
+        href = link['href']
+        article_title = link.get_text(strip=True)
+
+        # Form absolute URL
+        full_url = f'https://www.gartner.com{href}' if href.startswith('/') else href
+
+        # Open article in a new tab
+        original_window = driver.current_window_handle
+        driver.switch_to.new_window('tab')
+        try:
+            human_delay()
+            driver.get(full_url)
+            human_delay()
+            # Wait for article content to load
+            article_wait = WebDriverWait(driver, 15)
+            article_wait.until(EC.presence_of_element_located((By.TAG_NAME, 'article')))
+
+            # Parse article content
+            # Modify the article content extraction part of your script:
+
+            # Inside the article processing block after driver.get(full_url):
+            article_soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+            # Extract article title
+            title = article_soup.find('h1').get_text(strip=True) if article_soup.find('h1') else "Untitled"
+
+            # Extract authors and date
+            byline = ""
+            authors = []
+            date = ""
+            byline_element = article_soup.find('article', class_='article-text').find('span', class_='rte')
+            if byline_element:
+                byline_text = byline_element.get_text(strip=True, separator='|').split('|')
+                for part in byline_text:
+                    if 'By ' in part:
+                        authors = [a.strip() for a in part.replace('By ', '').split(' and ')]
+                    if any(word in part.lower() for word in ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+                                                             'jul', 'aug', 'sep', 'oct', 'nov', 'dec']):
+                        date = part.strip()
+
+            # Format title with metadata
+            formatted_title = f"{title}"
+            if authors or date:
+                formatted_title += " ("
+                if authors:
+                    formatted_title += f"by:{','.join(authors)}"
+                if date:
+                    if authors: formatted_title += ";"
+                    formatted_title += f"on:{date}"
+                formatted_title += ")"
+
+            # Extract main content
+            content_parts = []
+            main_articles = article_soup.find_all('article', class_='article-text')
+            for article in main_articles:
+                # Skip the byline article
+                if 'By ' in article.get_text(): continue
+
+                # Get all text elements including headings
+                for element in article.find_all(['p', 'h2', 'h3', 'h4']):
+                    text = element.get_text(strip=True, separator=' ')
+                    if text:
+                        content_parts.append(text)
+
+            full_content = '\n'.join([t for t in content_parts if t.strip()])
+
+            # Store in dictionary structure
+            result[category_name][formatted_title] = full_content
+        except Exception as e:
+            print(f"Error retrieving {full_url}: {str(e)}")
+            result[category_name].append('')
+        finally:
+            driver.close()
+            driver.switch_to.window(original_window)
+            time.sleep(1) # Polite delay
+
+# Save the results to a JSON file
+with open('gartner_articles.json', 'w', encoding='utf-8') as f:
+    print("dumping...")
+    json.dump(result, f, ensure_ascii=False, indent=4)
+
+driver.quit()