pembot 0.0.6__py2.py3-none-any.whl → 0.0.8__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pembot might be problematic. Click here for more details.
- pembot/.git/COMMIT_EDITMSG +1 -1
- pembot/.git/index +0 -0
- pembot/.git/logs/HEAD +2 -0
- pembot/.git/logs/refs/heads/main +2 -0
- pembot/.git/logs/refs/remotes/origin/main +2 -0
- pembot/.git/objects/0c/ab66ffbaf50ef60dd41f3498595ebd2526b33c +0 -0
- pembot/.git/objects/41/ae8fa8f8baa2daee5ec0aa21ae17922ae051a0 +0 -0
- pembot/.git/objects/50/39b29fda67743a044993436df6a4a1db7b8888 +0 -0
- pembot/.git/objects/7a/7d28b0313a3d9d509823faaae31949af8610ef +0 -0
- pembot/.git/objects/7e/0907822f7d316ebe0be07e1f6918bef412c80b +0 -0
- pembot/.git/objects/af/80ddb5890f062e364ea8ade2d602df4e12de8c +0 -0
- pembot/.git/objects/b8/884c6145221ac66f84bf88919754c2cb05c12d +0 -0
- pembot/.git/objects/ee/a73c7f24094ed83b014f7cfce46e10f817bec8 +0 -0
- pembot/.git/objects/ef/0503a60244391590b16042019032e91d7cc30d +3 -0
- pembot/.git/objects/f6/b1d54483ce20fbcb252a8a93a5eff7bec88729 +0 -0
- pembot/.git/objects/f8/6fbd490878cb0d3c35cc4443672d1309171bf1 +0 -0
- pembot/.git/refs/heads/main +1 -1
- pembot/.git/refs/remotes/origin/main +1 -1
- pembot/AnyToText/convertor.py +5 -3
- pembot/TextEmbedder/mongodb_embedder.py +50 -19
- pembot/TextEmbedder/mongodb_index_creator.py +29 -24
- pembot/__init__.py +1 -1
- pembot/config/config.yaml +1 -1
- pembot/pdf2markdown/.git/COMMIT_EDITMSG +1 -1
- pembot/pdf2markdown/.git/index +0 -0
- pembot/pdf2markdown/.git/logs/HEAD +1 -0
- pembot/pdf2markdown/.git/logs/refs/heads/main +1 -0
- pembot/pdf2markdown/.git/logs/refs/remotes/myorigin/main +1 -0
- pembot/pdf2markdown/.git/objects/24/7b15a6b1e0e3d270c05af184f048736376cd4e +0 -0
- pembot/pdf2markdown/.git/objects/a7/4bcd5e67cb1066dd504b92b42390fe0b2c3d38 +0 -0
- pembot/pdf2markdown/.git/objects/f3/b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 +1 -0
- pembot/pdf2markdown/.git/refs/heads/main +1 -1
- pembot/pdf2markdown/.git/refs/remotes/myorigin/main +1 -1
- pembot/pdf2markdown/extract.py +26 -1
- pembot/query.py +5 -4
- {pembot-0.0.6.dist-info → pembot-0.0.8.dist-info}/METADATA +1 -1
- {pembot-0.0.6.dist-info → pembot-0.0.8.dist-info}/RECORD +39 -25
- {pembot-0.0.6.dist-info → pembot-0.0.8.dist-info}/WHEEL +0 -0
- {pembot-0.0.6.dist-info → pembot-0.0.8.dist-info}/licenses/LICENSE +0 -0
pembot/.git/COMMIT_EDITMSG
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
added model name to convertor
|
pembot/.git/index
CHANGED
|
Binary file
|
pembot/.git/logs/HEAD
CHANGED
|
@@ -6,3 +6,5 @@ e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f
|
|
|
6
6
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
|
|
7
7
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
|
|
8
8
|
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
|
|
9
|
+
9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872559 +0530 commit: added a model_name_parameter to change models quicky
|
|
10
|
+
ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896700 +0530 commit: added model name to convertor
|
pembot/.git/logs/refs/heads/main
CHANGED
|
@@ -6,3 +6,5 @@ e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f
|
|
|
6
6
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856653 +0530 commit: made arrangements for the cases when custom file bytes are to be processed to text output; handled a ollama running / crashing error
|
|
7
7
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937276 +0530 commit: fixed the output_dir bug; fixed the excel to json function; ran some tests on convertor; incremented the version on the package; removed dependency on schema / structure, and shifted required fields to a pickle file path in the cli args;
|
|
8
8
|
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947488 +0530 commit: handled local llm nonexistent error properly for choice of just passing None as llm_client;
|
|
9
|
+
9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872559 +0530 commit: added a model_name_parameter to change models quicky
|
|
10
|
+
ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896700 +0530 commit: added model name to convertor
|
|
@@ -5,3 +5,5 @@ e91172752e9a421ae463112d2b0506b37498c98d 0c8d9b2690545bf1906b05cd9f18b783b3eb74f
|
|
|
5
5
|
0c8d9b2690545bf1906b05cd9f18b783b3eb74f1 eb75e1c49f1e5b79dca17ccdbec8067756523238 cyto <silverstone965@gmail.com> 1750856672 +0530 update by push
|
|
6
6
|
eb75e1c49f1e5b79dca17ccdbec8067756523238 0bdb4169fc0f312b8698f1df17a258fff163aeaa cyto <silverstone965@gmail.com> 1750937389 +0530 update by push
|
|
7
7
|
0bdb4169fc0f312b8698f1df17a258fff163aeaa 9528bbccd167e3f4ad583a1ae9fac98a52620e27 cyto <silverstone965@gmail.com> 1750947502 +0530 update by push
|
|
8
|
+
9528bbccd167e3f4ad583a1ae9fac98a52620e27 ef0503a60244391590b16042019032e91d7cc30d cyto <silverstone965@gmail.com> 1751872581 +0530 update by push
|
|
9
|
+
ef0503a60244391590b16042019032e91d7cc30d af80ddb5890f062e364ea8ade2d602df4e12de8c cyto <silverstone965@gmail.com> 1751896713 +0530 update by push
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
pembot/.git/refs/heads/main
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
af80ddb5890f062e364ea8ade2d602df4e12de8c
|
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
af80ddb5890f062e364ea8ade2d602df4e12de8c
|
pembot/AnyToText/convertor.py
CHANGED
|
@@ -31,12 +31,14 @@ EXCEL_FILE_TYPES= [
|
|
|
31
31
|
class Convertor():
|
|
32
32
|
|
|
33
33
|
|
|
34
|
-
def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None):
|
|
34
|
+
def __init__(self, myfile: Path | None= None, output_dir: Path | None= None, file_bytes: bytes | None= None, suffix: str | None= None, file_type: str | None= None, model_name: str | None = None):
|
|
35
35
|
|
|
36
36
|
self.output= ""
|
|
37
37
|
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
if model_name is None:
|
|
39
|
+
# model_name= "gemini-2.5-flash"
|
|
40
|
+
model_name= "Nanonets-OCR-s"
|
|
41
|
+
|
|
40
42
|
# file_type can be pdf, excel, etc.
|
|
41
43
|
if output_dir is None and myfile is None and file_bytes is not None and suffix is not None:
|
|
42
44
|
with tempfile.TemporaryDirectory() as dp:
|
|
@@ -29,6 +29,7 @@ def search_within_document(
|
|
|
29
29
|
limit: int = 5,
|
|
30
30
|
index_name: str = "test_search",
|
|
31
31
|
embeddings_collection_name: str= "doc_chunks",
|
|
32
|
+
document_belongs_to_a_type = "",
|
|
32
33
|
):
|
|
33
34
|
"""
|
|
34
35
|
Performs a vector similarity search within the chunks of a specific document
|
|
@@ -42,6 +43,7 @@ def search_within_document(
|
|
|
42
43
|
index_name: The name of your MongoDB Atlas Vector Search index.
|
|
43
44
|
You MUST have a vector search index created on the 'embedding' field
|
|
44
45
|
of the 'embeddings_collection' collection for this to work efficiently.
|
|
46
|
+
document_belongs_to_a_type: When search spaces intersect for different docIds, such that docId is an array field,
|
|
45
47
|
|
|
46
48
|
Returns:
|
|
47
49
|
A list of dictionaries, where each dictionary represents a matching chunk
|
|
@@ -50,10 +52,23 @@ def search_within_document(
|
|
|
50
52
|
embeddings_collection = db_client[embeddings_collection_name]
|
|
51
53
|
|
|
52
54
|
print(f"Searching within document (docId: {document_name_id})...")
|
|
55
|
+
# print(f" filter (slug: {document_belongs_to_a_type})...")
|
|
53
56
|
|
|
54
57
|
# MongoDB Atlas Vector Search aggregation pipeline
|
|
55
58
|
# The 'path' should point to the field containing the embeddings.
|
|
56
59
|
# The 'filter' stage is crucial for searching within a specific document.
|
|
60
|
+
#
|
|
61
|
+
project_dict= {
|
|
62
|
+
'_id': 0,
|
|
63
|
+
'docId': 1,
|
|
64
|
+
'chunk_number': 1,
|
|
65
|
+
'chunk_text': 1,
|
|
66
|
+
'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if document_belongs_to_a_type:
|
|
70
|
+
project_dict['type']= 1
|
|
71
|
+
|
|
57
72
|
pipeline = [
|
|
58
73
|
{
|
|
59
74
|
'$vectorSearch': {
|
|
@@ -66,25 +81,21 @@ def search_within_document(
|
|
|
66
81
|
'index': index_name,
|
|
67
82
|
|
|
68
83
|
#filter to search only within the specified document
|
|
69
|
-
'filter':
|
|
70
|
-
|
|
71
|
-
|
|
84
|
+
'filter':
|
|
85
|
+
{ "type": {"$in": [document_belongs_to_a_type ]} } if document_belongs_to_a_type else
|
|
86
|
+
{ 'docId': document_name_id }
|
|
72
87
|
}
|
|
73
88
|
},
|
|
74
89
|
|
|
75
90
|
# to exclude the MongoDB internal _id
|
|
76
91
|
{
|
|
77
|
-
'$project':
|
|
78
|
-
'_id': 0,
|
|
79
|
-
'docId': 1,
|
|
80
|
-
'chunk_number': 1,
|
|
81
|
-
'chunk_text': 1,
|
|
82
|
-
'score': { '$meta': 'vectorSearchScore' } # Get the similarity score
|
|
83
|
-
}
|
|
92
|
+
'$project': project_dict
|
|
84
93
|
}
|
|
85
94
|
]
|
|
86
95
|
|
|
96
|
+
# print("sesraching now:")
|
|
87
97
|
results = list(embeddings_collection.aggregate(pipeline))
|
|
98
|
+
# print("search results: ", results)
|
|
88
99
|
|
|
89
100
|
if not results:
|
|
90
101
|
print(f"No relevant chunks found for document '{document_name_id}' with the given query.")
|
|
@@ -100,15 +111,18 @@ def search_within_document(
|
|
|
100
111
|
|
|
101
112
|
|
|
102
113
|
|
|
103
|
-
def process_document_and_embed(
|
|
114
|
+
def process_document_and_embed(
|
|
115
|
+
db_client,
|
|
104
116
|
llm_client,
|
|
105
117
|
inference_client,
|
|
106
118
|
file_path: Path,
|
|
107
119
|
chunk_size: int,
|
|
108
|
-
embedding_model: str = '
|
|
120
|
+
embedding_model: str = 'BAAI/bge-en-icl',
|
|
109
121
|
embeddings_collection_name= "doc_chunks",
|
|
110
122
|
use_custom_id: str | None = None,
|
|
111
|
-
use_custom_input: str | None = None
|
|
123
|
+
use_custom_input: str | None = None,
|
|
124
|
+
document_belongs_to_a_type= "",
|
|
125
|
+
type_info= []
|
|
112
126
|
) -> list[dict]:
|
|
113
127
|
"""
|
|
114
128
|
Processes an input document by chunking its text, generating embeddings using
|
|
@@ -228,13 +242,30 @@ def process_document_and_embed(db_client,
|
|
|
228
242
|
'chunk_text': chunk,
|
|
229
243
|
'embedding': embedding,
|
|
230
244
|
'chunk_id_global': chunk_id_global,
|
|
231
|
-
'chunk_id_doc_specific': chunk_id_doc_specific
|
|
245
|
+
'chunk_id_doc_specific': chunk_id_doc_specific,
|
|
232
246
|
}
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# TBD: this is NOT pushing array, this is creating a "$push" field with type: "" object
|
|
250
|
+
|
|
251
|
+
if len(type_info) > 0:
|
|
252
|
+
embeddings_collection.update_one(
|
|
253
|
+
{'docId': document_name_id, 'chunk_number': i + 1},
|
|
254
|
+
{
|
|
255
|
+
'$set': doc_set,
|
|
256
|
+
'$push': {
|
|
257
|
+
"type": type_info
|
|
258
|
+
}
|
|
259
|
+
},
|
|
260
|
+
upsert=True
|
|
261
|
+
)
|
|
262
|
+
else:
|
|
263
|
+
|
|
264
|
+
embeddings_collection.update_one(
|
|
265
|
+
{'docId': document_name_id, 'chunk_number': i + 1},
|
|
266
|
+
{'$set': doc_set},
|
|
267
|
+
upsert=True
|
|
268
|
+
)
|
|
238
269
|
print(f"Successfully stored chunk {i+1} for '{file_path.name}' in MongoDB.")
|
|
239
270
|
res.append({**doc_set, "docId": document_name_id, "chunk_number": i + 1})
|
|
240
271
|
|
|
@@ -4,7 +4,7 @@ from pymongo.operations import SearchIndexModel
|
|
|
4
4
|
import time
|
|
5
5
|
import os
|
|
6
6
|
|
|
7
|
-
def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768):
|
|
7
|
+
def create_vector_index(collection: Collection, index_name: str, num_dimensions: int = 768, document_belongs_to_a_type= ""):
|
|
8
8
|
"""
|
|
9
9
|
Creates a MongoDB Atlas Vector Search index if it does not already exist.
|
|
10
10
|
|
|
@@ -13,14 +13,14 @@ def create_vector_index(collection: Collection, index_name: str, num_dimensions:
|
|
|
13
13
|
index_name: The desired name for the vector search index.
|
|
14
14
|
num_dimensions: The number of dimensions for the embedding vectors.
|
|
15
15
|
"""
|
|
16
|
-
|
|
16
|
+
|
|
17
17
|
# 1. Check if the index already exists
|
|
18
18
|
existing_indexes = list(collection.list_search_indexes())
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
for index in existing_indexes:
|
|
21
21
|
if index.get('name') == index_name:
|
|
22
22
|
print(f"Search index '{index_name}' already exists. Skipping creation.")
|
|
23
|
-
|
|
23
|
+
|
|
24
24
|
# Optional: You can also check if the existing index is "READY"
|
|
25
25
|
if index.get('status') == 'READY':
|
|
26
26
|
print(f"Index '{index_name}' is already ready for querying.")
|
|
@@ -33,20 +33,27 @@ def create_vector_index(collection: Collection, index_name: str, num_dimensions:
|
|
|
33
33
|
# 2. If the index does not exist, proceed to create it
|
|
34
34
|
print(f"Search index '{index_name}' does not exist. Creating it now...")
|
|
35
35
|
|
|
36
|
+
fields_arr= [
|
|
37
|
+
{
|
|
38
|
+
"type": "vector",
|
|
39
|
+
"path": "embedding",
|
|
40
|
+
"similarity": "dotProduct", # Or "cosine", "euclidean"
|
|
41
|
+
"numDimensions": num_dimensions,
|
|
42
|
+
"quantization": "scalar" # Or "none"
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"type": "filter",
|
|
46
|
+
"path": "docId"
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
if document_belongs_to_a_type:
|
|
51
|
+
fields_arr.append({
|
|
52
|
+
"type": "filter",
|
|
53
|
+
"path": "type"
|
|
54
|
+
})
|
|
36
55
|
search_index_model = SearchIndexModel(definition={
|
|
37
|
-
"fields":
|
|
38
|
-
{
|
|
39
|
-
"type": "vector",
|
|
40
|
-
"path": "embedding",
|
|
41
|
-
"similarity": "dotProduct", # Or "cosine", "euclidean"
|
|
42
|
-
"numDimensions": num_dimensions,
|
|
43
|
-
"quantization": "scalar" # Or "none"
|
|
44
|
-
},
|
|
45
|
-
{
|
|
46
|
-
"type": "filter",
|
|
47
|
-
"path": "docId"
|
|
48
|
-
}
|
|
49
|
-
]
|
|
56
|
+
"fields": fields_arr
|
|
50
57
|
},
|
|
51
58
|
name=index_name,
|
|
52
59
|
type="vectorSearch"
|
|
@@ -70,7 +77,7 @@ def _wait_for_index_ready(collection: Collection, index_name: str):
|
|
|
70
77
|
Helper function to poll the index status until it's ready.
|
|
71
78
|
"""
|
|
72
79
|
print("Polling to check if the index is ready. This may take some time (up to a few minutes for large indexes).")
|
|
73
|
-
|
|
80
|
+
|
|
74
81
|
start_time = time.time()
|
|
75
82
|
timeout = 300 # 5 minutes timeout, adjust as needed
|
|
76
83
|
|
|
@@ -89,7 +96,7 @@ def _wait_for_index_ready(collection: Collection, index_name: str):
|
|
|
89
96
|
print(f"Index '{index_name}' status: {current_status}. Waiting...")
|
|
90
97
|
except Exception as e:
|
|
91
98
|
print(f"Error while polling index status: {e}. Retrying...")
|
|
92
|
-
|
|
99
|
+
|
|
93
100
|
if time.time() - start_time > timeout:
|
|
94
101
|
status= indices[0].get('status') if indices else 'N/A'
|
|
95
102
|
print(f"Timeout: Index '{index_name}' did not become ready within {timeout} seconds. Current status: {status}")
|
|
@@ -99,9 +106,9 @@ def _wait_for_index_ready(collection: Collection, index_name: str):
|
|
|
99
106
|
|
|
100
107
|
# --- Example Usage ---
|
|
101
108
|
if __name__ == "__main__":
|
|
102
|
-
|
|
109
|
+
|
|
103
110
|
# Replace with your database and collection names
|
|
104
|
-
DATABASE_NAME = "pembot"
|
|
111
|
+
DATABASE_NAME = "pembot"
|
|
105
112
|
COLLECTION_NAME = "doc_chunks"
|
|
106
113
|
VECTOR_INDEX_NAME = "test_search"
|
|
107
114
|
|
|
@@ -119,7 +126,7 @@ if __name__ == "__main__":
|
|
|
119
126
|
|
|
120
127
|
# Call the function to create the index, with existence check
|
|
121
128
|
create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
|
|
122
|
-
|
|
129
|
+
|
|
123
130
|
# Test calling it again to see the "already exists" message
|
|
124
131
|
create_vector_index(collection, VECTOR_INDEX_NAME, num_dimensions=EMBEDDING_DIMENSIONS)
|
|
125
132
|
|
|
@@ -129,5 +136,3 @@ if __name__ == "__main__":
|
|
|
129
136
|
if 'mongo_client' in locals() and mongo_client:
|
|
130
137
|
mongo_client.close()
|
|
131
138
|
print("MongoDB connection closed.")
|
|
132
|
-
|
|
133
|
-
|
pembot/__init__.py
CHANGED
pembot/config/config.yaml
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
handled the gpu errors non-gracefully so that it stops
|
pembot/pdf2markdown/.git/index
CHANGED
|
Binary file
|
|
@@ -2,3 +2,4 @@
|
|
|
2
2
|
ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
|
|
3
3
|
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
|
|
4
4
|
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
|
|
5
|
+
b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896628 +0530 commit: handled the gpu errors non-gracefully so that it stops
|
|
@@ -2,3 +2,4 @@
|
|
|
2
2
|
ffb759ee4605b232366a9ee58134532913c3f9e0 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750947962 +0530 commit: handled config loading errors gracefully; added gemini support, as an option; added huggingface nanonets transformers support (as an option); redesigned the extract markdown for captioning and image ocr (block image and full-page image);
|
|
3
3
|
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604763 +0530 commit: removed deps on torch and transformers; used gradio client for ocr through public spaces;
|
|
4
4
|
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751871887 +0530 commit: cyto/argument-list-bug-fix;authentication-used-in-gradio-client
|
|
5
|
+
b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896628 +0530 commit: handled the gpu errors non-gracefully so that it stops
|
|
@@ -1,3 +1,4 @@
|
|
|
1
1
|
0000000000000000000000000000000000000000 b8702320e56074e9680181d8b7897d6a0a552e2d cyto <silverstone965@gmail.com> 1750948073 +0530 update by push
|
|
2
2
|
b8702320e56074e9680181d8b7897d6a0a552e2d 14251b198e0bac39a3dc3b42f9e57b20c01465fb cyto <silverstone965@gmail.com> 1751604904 +0530 update by push
|
|
3
3
|
14251b198e0bac39a3dc3b42f9e57b20c01465fb b48d697aa9fd97151eb2a84a1af5d408b7630232 cyto <silverstone965@gmail.com> 1751872077 +0530 update by push
|
|
4
|
+
b48d697aa9fd97151eb2a84a1af5d408b7630232 f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32 cyto <silverstone965@gmail.com> 1751896663 +0530 update by push
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
x��Kj!3vw� �l媷?`k�v��>�!�C'��:'Hk�f't:�lȺ�6g�u2j�߈G�TV��ةN��gb�rp���F��ɚ���RI��<Z���
|
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32
|
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
f3b2d76c75bbd50e04fc4c2ad17fc94ca6daed32
|
pembot/pdf2markdown/extract.py
CHANGED
|
@@ -115,6 +115,10 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
115
115
|
except Exception as e:
|
|
116
116
|
self.logger.error(f"Error processing PDF: {e}")
|
|
117
117
|
self.logger.exception(traceback.format_exc())
|
|
118
|
+
|
|
119
|
+
error_message= str(e).lower()
|
|
120
|
+
if "GPU" in error_message and "quota" in error_message:
|
|
121
|
+
return "GPU quota error", []
|
|
118
122
|
return "", []
|
|
119
123
|
|
|
120
124
|
|
|
@@ -176,6 +180,12 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
176
180
|
print("ocr'd: ", result[:100] + "...")
|
|
177
181
|
except Exception as e:
|
|
178
182
|
print("Error during nanonet inference", e)
|
|
183
|
+
error_message = str(e)
|
|
184
|
+
if "You have exceeded your Pro GPU quota" in error_message:
|
|
185
|
+
# print("\n\n\nFALLING BACK TO TESS\n\n\n")
|
|
186
|
+
# return pytesseract.image_to_string(pil_image)
|
|
187
|
+
raise e
|
|
188
|
+
|
|
179
189
|
|
|
180
190
|
return result
|
|
181
191
|
else:
|
|
@@ -262,6 +272,9 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
262
272
|
except Exception as e:
|
|
263
273
|
self.logger.error(f" Error processing embedded image block for OCR: {e}")
|
|
264
274
|
current_page_markdown_blocks.append("\n\n\n\n")
|
|
275
|
+
error_message= str(e).lower()
|
|
276
|
+
if "GPU" in error_message and "quota" in error_message:
|
|
277
|
+
raise e
|
|
265
278
|
|
|
266
279
|
|
|
267
280
|
# Insert tables at their approximate positions (after blocks are processed for the page)
|
|
@@ -306,6 +319,9 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
306
319
|
self.logger.info(f" Full-page OCR yielded no text for page {page_num+1}.")
|
|
307
320
|
except Exception as e:
|
|
308
321
|
self.logger.error(f" Error during full-page OCR on page {page_num+1}: {e}")
|
|
322
|
+
error_message= str(e).lower()
|
|
323
|
+
if "GPU" in error_message and "quota" in error_message:
|
|
324
|
+
raise e
|
|
309
325
|
else:
|
|
310
326
|
self.logger.info(f" Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")
|
|
311
327
|
|
|
@@ -329,7 +345,12 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
329
345
|
except Exception as e:
|
|
330
346
|
self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
|
|
331
347
|
self.logger.exception(traceback.format_exc())
|
|
332
|
-
|
|
348
|
+
|
|
349
|
+
error_message= str(e).lower()
|
|
350
|
+
if "GPU" in error_message and "quota" in error_message:
|
|
351
|
+
return "GPU quota error", []
|
|
352
|
+
else:
|
|
353
|
+
return "", []
|
|
333
354
|
|
|
334
355
|
def extract_tables(self):
|
|
335
356
|
"""Extract tables from PDF using pdfplumber."""
|
|
@@ -412,6 +433,9 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
412
433
|
except Exception as e:
|
|
413
434
|
self.logger.error(f"Error captioning image: {e}")
|
|
414
435
|
self.logger.exception(traceback.format_exc())
|
|
436
|
+
error_message= str(e)
|
|
437
|
+
if "GPU" in error_message and "quota" in error_message:
|
|
438
|
+
raise e
|
|
415
439
|
return ""
|
|
416
440
|
|
|
417
441
|
def clean_text(self, text):
|
|
@@ -726,6 +750,7 @@ class MarkdownPDFExtractor(PDFExtractor):
|
|
|
726
750
|
self.logger.exception(traceback.format_exc())
|
|
727
751
|
return ""
|
|
728
752
|
|
|
753
|
+
|
|
729
754
|
def get_header_level(self, font_size):
|
|
730
755
|
"""Determine header level based on font size."""
|
|
731
756
|
if font_size > 24:
|
pembot/query.py
CHANGED
|
@@ -68,7 +68,8 @@ def multi_embedding_average(llm_client, inference_client, descriptions, model= "
|
|
|
68
68
|
except Exception as e:
|
|
69
69
|
print(f"Error generating embedding for description '{desc}': {e}")
|
|
70
70
|
# Decide how to handle errors: skip, raise, or use a placeholder
|
|
71
|
-
continue
|
|
71
|
+
# continue
|
|
72
|
+
raise e
|
|
72
73
|
time.sleep(1)
|
|
73
74
|
|
|
74
75
|
if not description_embeddings:
|
|
@@ -81,7 +82,7 @@ def multi_embedding_average(llm_client, inference_client, descriptions, model= "
|
|
|
81
82
|
|
|
82
83
|
|
|
83
84
|
|
|
84
|
-
def rag_query_llm(db_client, llm_client, inference_client, user_query: str, document_id: str, required_fields_descriptions: list[str], model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", ollama_base_url: str = "http://localhost:11434", no_of_fields= 4, embedding_model= "BAAI/bge-en-icl", llm_provider_name: PROVIDER_T= "novita", index_name: str= "test_search", embeddings_collection= "doc_chunks"):
|
|
85
|
+
def rag_query_llm(db_client, llm_client, inference_client, user_query: str, document_id: str, required_fields_descriptions: list[str], model_name: str = "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", ollama_base_url: str = "http://localhost:11434", no_of_fields= 4, embedding_model= "BAAI/bge-en-icl", llm_provider_name: PROVIDER_T= "novita", index_name: str= "test_search", embeddings_collection= "doc_chunks", document_belongs_to_a_type= ""):
|
|
85
86
|
"""
|
|
86
87
|
Performs a RAG (Retrieval Augmented Generation) query using a Hugging Face
|
|
87
88
|
embedding model, ChromaDB for retrieval, and a local Ollama model for generation.
|
|
@@ -119,10 +120,10 @@ def rag_query_llm(db_client, llm_client, inference_client, user_query: str, docu
|
|
|
119
120
|
aggregate_query_embedding= multi_embedding_average(llm_client, inference_client, required_fields_descriptions, model= embedding_model, embed_locally= embed_locally)
|
|
120
121
|
print("Aggregate query embedding generated. length: ", len(aggregate_query_embedding))
|
|
121
122
|
|
|
122
|
-
create_vector_index(db_client[embeddings_collection], index_name, num_dimensions= len(aggregate_query_embedding))
|
|
123
|
+
create_vector_index(db_client[embeddings_collection], index_name, num_dimensions= len(aggregate_query_embedding), document_belongs_to_a_type= document_belongs_to_a_type)
|
|
123
124
|
|
|
124
125
|
# check the order of args
|
|
125
|
-
relevant_chunks= search_within_document(db_client, aggregate_query_embedding, document_id, limit= no_of_fields, index_name= index_name, embeddings_collection_name= embeddings_collection)
|
|
126
|
+
relevant_chunks= search_within_document(db_client, aggregate_query_embedding, document_id, limit= no_of_fields, index_name= index_name, embeddings_collection_name= embeddings_collection, document_belongs_to_a_type= document_belongs_to_a_type)
|
|
126
127
|
relevant_chunks= list(map(lambda x: x['chunk_text'], relevant_chunks))
|
|
127
128
|
|
|
128
129
|
if not relevant_chunks:
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
pembot/.gitignore,sha256=_7FTsZokJ_pzEyyPjOsGw5x5Xx3gUBFaafs7UlPsv9E,98
|
|
2
2
|
pembot/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
3
|
-
pembot/__init__.py,sha256=
|
|
3
|
+
pembot/__init__.py,sha256=XAG3pbUQGlrUZZgAKAxq96MjvCGaEPkpGTSKIwzNvGY,211
|
|
4
4
|
pembot/gartner.py,sha256=3ALknQ5mSXIimmwCa3JFDzB_EW2hHEcQO1T2odyBquk,5408
|
|
5
5
|
pembot/main.py,sha256=lZLIV8XPonvNoY4LVS-5fct1y9URMXWoSGJUKMw3Yg8,9667
|
|
6
6
|
pembot/output_structure_local.py,sha256=YfpHzfTNeLMSsB_CjAamha9D6Iz7E1IC-tW9xPCMWFc,3000
|
|
7
7
|
pembot/pem.py,sha256=mv6iGcN1peSY7z2dtCQ_BKj31EFBNfczBhps_d-0XDo,6377
|
|
8
|
-
pembot/query.py,sha256=
|
|
8
|
+
pembot/query.py,sha256=d6K2PyDDGoIOqwn7A_KIBr83w0zjMAHjhmx1S9VlVgg,8642
|
|
9
9
|
pembot/requirements.txt,sha256=6OV_n5JVco2lLA8Wq38tJX1bYgo_UU0R9RKgs4d2wfc,1360
|
|
10
|
-
pembot/.git/COMMIT_EDITMSG,sha256=
|
|
10
|
+
pembot/.git/COMMIT_EDITMSG,sha256=H9feTx6U3VWbFycy9cq077mD4oxuv2gz4G3EUOdQmV4,30
|
|
11
11
|
pembot/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
|
|
12
12
|
pembot/.git/config,sha256=ZFl9d2GyxirgRXRsv8iULIieKxwGC9P6SAjB_AmTkmQ,271
|
|
13
13
|
pembot/.git/description,sha256=hatsFj1DoX6pz3eIMIvKFGbxsKjRzJLibpv2PaQGKu4,73
|
|
14
|
-
pembot/.git/index,sha256=
|
|
14
|
+
pembot/.git/index,sha256=EEe4lVsgYW5zuGbFVZ8a4t7AqjDlPTqU1JGDynBe2Sc,1814
|
|
15
15
|
pembot/.git/packed-refs,sha256=7DECsr7q7vJ6Gw6a2gS3dE4v-YzbxGiWYoSWM43DgsQ,112
|
|
16
16
|
pembot/.git/hooks/applypatch-msg.sample,sha256=AiNJeguLAzqlijpSG4YphpOGz3qw4vEBlj0yiqYhk_c,478
|
|
17
17
|
pembot/.git/hooks/commit-msg.sample,sha256=H3TV6SkpebVz69WXQdRsuT_zkazdCD00C5Q3B1PZJDc,896
|
|
@@ -28,13 +28,14 @@ pembot/.git/hooks/push-to-checkout.sample,sha256=pT0HQXmLKHxt16-mSu5HPzBeZdP0lGO
|
|
|
28
28
|
pembot/.git/hooks/sendemail-validate.sample,sha256=ROv8kj3FRmvACWAvDs8Ge5xlRZq_6IaN3Em3jmztepI,2308
|
|
29
29
|
pembot/.git/hooks/update.sample,sha256=jV8vqD4QPPCLV-qmdSHfkZT0XL28s32lKtWGCXoU0QY,3650
|
|
30
30
|
pembot/.git/info/exclude,sha256=ZnH-g7egfIky7okWTR8nk7IxgFjri5jcXAbuClo7DsE,240
|
|
31
|
-
pembot/.git/logs/HEAD,sha256=
|
|
32
|
-
pembot/.git/logs/refs/heads/main,sha256=
|
|
31
|
+
pembot/.git/logs/HEAD,sha256=zUB4DZpCGTMM1FnKY1jQ98WAMwY8twSD8qaFz3Q-K-4,2521
|
|
32
|
+
pembot/.git/logs/refs/heads/main,sha256=zUB4DZpCGTMM1FnKY1jQ98WAMwY8twSD8qaFz3Q-K-4,2521
|
|
33
33
|
pembot/.git/logs/refs/remotes/origin/HEAD,sha256=OrkNquczPPh6fEGtutFKva_-_JhAdwnvXpCCPC4N6jk,194
|
|
34
|
-
pembot/.git/logs/refs/remotes/origin/main,sha256=
|
|
34
|
+
pembot/.git/logs/refs/remotes/origin/main,sha256=t9RDc56CIhCL27FolwvJyBJ6LK8uGVFkzXTZmWqHukw,1314
|
|
35
35
|
pembot/.git/objects/0a/fb3a98cdc55b1434b44534ec2bf22c56cfa26c,sha256=Xxw20vI57zuhERWopDAZpQw6rAOhFtUr05lzpGyCTTE,120
|
|
36
36
|
pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa,sha256=hsOHhX0Yajg27Y7B9lo-WjDXzW1KNMg2CBr93G116EY,387
|
|
37
37
|
pembot/.git/objects/0c/8d9b2690545bf1906b05cd9f18b783b3eb74f1,sha256=GKt_CAJNOQXwGnoFLuiNpkd0s_hP_UDLKd59VRknYy0,330
|
|
38
|
+
pembot/.git/objects/0c/ab66ffbaf50ef60dd41f3498595ebd2526b33c,sha256=Uk1dStvEBica-t38qHsZZ_4mxvi6b6VA9PaKE4KSunQ,90
|
|
38
39
|
pembot/.git/objects/18/28e18ab80aa64d334b26428708140e280cbc63,sha256=PTF8WLVhzxBDTZhwU_PBHrkQBbijHbKvttSr0XVTOcU,3936
|
|
39
40
|
pembot/.git/objects/19/f61df7dbd562d04f561288677bbf2f18f5dff7,sha256=zg8IdUSnMYpJ6HsfY2LQbXQTMwlT1IPWRSEiY2uDwyE,392
|
|
40
41
|
pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71,sha256=XnMaYQUA8iT1fiOIvlBav331Ry7pNBOBqI3wB3Y1VM0,90
|
|
@@ -45,8 +46,10 @@ pembot/.git/objects/3e/23850624fcf5f111d6ea88ddd64adf924cf82f,sha256=ygVUpaLo7cx
|
|
|
45
46
|
pembot/.git/objects/3e/cf23eb95123287531d708a21d4ba88d92ccabb,sha256=Jlg3XIzIjk3N5ZKolXbz_betMybJ2t2TVuOARg2ruQU,4943
|
|
46
47
|
pembot/.git/objects/3f/78215d7e17da726fb352fd92b3c117db9b63ba,sha256=J8r5hqTEgAwlH5sDjr9tp1ipqpvs4BAVQY5rkiKqDCw,4080
|
|
47
48
|
pembot/.git/objects/3f/e072cf3cb6a9f30c3e9936e3ddf622e80270d0,sha256=Z-UoKi2MYe0qGTtBxAr5cnIOHKkhoEXMgalevFUz9lA,2992
|
|
49
|
+
pembot/.git/objects/41/ae8fa8f8baa2daee5ec0aa21ae17922ae051a0,sha256=TLuVmtSH9K33qB-WHMxKDUihHCrwdTtCKtjBs-rAnJ4,56
|
|
48
50
|
pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3,sha256=waMrzjG_o5D4JgHkjjqcDQCwuS17w60JRkVr25ZFlcI,117
|
|
49
51
|
pembot/.git/objects/4d/a03134f70896f72053fbdc0cd4f4c76d4ac1d8,sha256=GBhAvxM1omIt-PN6mNXYlIJMN5nx2AUE0ZOf68El5pc,117
|
|
52
|
+
pembot/.git/objects/50/39b29fda67743a044993436df6a4a1db7b8888,sha256=NYNmYtOq8IMmH32GaQSOBpTRTTm6jEJfY3vytVpzfKM,115
|
|
50
53
|
pembot/.git/objects/51/9e780574933d7627a083222bd10dd74f430904,sha256=3e3Iu2-waVySghbLYXmwhDPpfhV4PF82suvjcYkSVog,3604
|
|
51
54
|
pembot/.git/objects/61/46a371b9c1bd9f51af273f11f986cfd1bedeba,sha256=KZvfnjxuriY54uWZQOM-GLovAvHs1k8_KwhpjNA5lW4,128
|
|
52
55
|
pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d,sha256=sYkhBkrSPQ8klX2gPrXJUZVt2a0iaF7KC7NFGBuxgeY,4360
|
|
@@ -54,6 +57,8 @@ pembot/.git/objects/64/00040794955d17c9a1fe1aaaea59f2c4822177,sha256=-tFnLFQvYrt
|
|
|
54
57
|
pembot/.git/objects/6d/7a865a23b1cb4182f67907820104ced48b11c9,sha256=dJRTCmT9rLygONcQ7MPETl9AImF3Iy5tB_KUeCvKyKY,2651
|
|
55
58
|
pembot/.git/objects/72/f047cda92abcd1ddc857f6461de605f8668331,sha256=PFb9LUDMnUCnuJcXUa5W1ea__fdP17kNyWrnqvnOpjs,240
|
|
56
59
|
pembot/.git/objects/73/2e98f08bc806c331b06847fc8c743f545499e5,sha256=kbKUb6fwwhRO73B4EZmol55JBvckqE3GNZ9PqHRB2ag,3995
|
|
60
|
+
pembot/.git/objects/7a/7d28b0313a3d9d509823faaae31949af8610ef,sha256=X59k-p9VNLBpmJlL53qIz8mntLeCSpnjw-rq9u9z_6I,90
|
|
61
|
+
pembot/.git/objects/7e/0907822f7d316ebe0be07e1f6918bef412c80b,sha256=lFc55Bu-vEXF8In553gHxlEsB47Vg2qFXHiJqepWEqg,5167
|
|
57
62
|
pembot/.git/objects/86/cdaec229f1fbebf43042266b03878944669f25,sha256=eTvQhUeYXP8E181oTOcBydcgmImr62IizaH_Jbcbg8g,4077
|
|
58
63
|
pembot/.git/objects/87/d6df5217a4a374f8c1211a05f9bd657f72c9a7,sha256=OGq5-x1lFa94vTX7WYO6o4TGvCZwAvZ6LXm6N3dpiKM,3881
|
|
59
64
|
pembot/.git/objects/8b/5be2af9b16f290549193859c214cd9072212e8,sha256=DhGeGisCdFZ0TcRKp5angRpaseI87TQDt5FtGZInstk,117
|
|
@@ -63,8 +68,10 @@ pembot/.git/objects/9b/123713e30fc9e225f9ac8ff5b02f8f8cf86456,sha256=xIETiieOoil
|
|
|
63
68
|
pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126,sha256=v1UO-WINmigZNYD74kyIv310Kq5k4SNL-gQ2DYlw9xk,6258
|
|
64
69
|
pembot/.git/objects/ab/c6b15265171457b41e2cfdaf3b8c3994a59eb7,sha256=ivRCkHzUZHXB16wn2ojARknUrwBkoUsV_18QT3Jbs-k,205
|
|
65
70
|
pembot/.git/objects/ac/9c9018c62fa30dc142665c1b5a375f4e056880,sha256=P_8LPBV0v4D17Akj4f5Cr2dhgNFUsh4o7DLK78CfNPo,349
|
|
71
|
+
pembot/.git/objects/af/80ddb5890f062e364ea8ade2d602df4e12de8c,sha256=QELzH3NdMCFohFEcf5oAAu_e54VFr-LhTyPbXY7GjSk,169
|
|
66
72
|
pembot/.git/objects/b1/1173d9b68db117437ccb9551461152e1e8a77d,sha256=6cl8NMNQ9b5fBh97GPEQNssOVrh-EQLJfhqSBbNb_vU,205
|
|
67
73
|
pembot/.git/objects/b2/4e79ab07fe9e68781961a25ff9f1dbb1546fbb,sha256=zfd9KnP9YtBMwzci1BMWFHAQR4BWJ3XQsyr-rFqdw0Q,135
|
|
74
|
+
pembot/.git/objects/b8/884c6145221ac66f84bf88919754c2cb05c12d,sha256=6EJskrHAkqVAC5ExxIZDQT_2kZWhfLPPAPbX61tmwgw,170
|
|
68
75
|
pembot/.git/objects/b8/eea52176ffa4d88c5a9976bee26092421565d3,sha256=xCom1B6wyws8ZNTJoIL4JtVIXNv1yPCwsXfNsVCAGQA,4410
|
|
69
76
|
pembot/.git/objects/bd/8fd1cb166996e74a8631f3a6f764a53af75297,sha256=JOkICUEv6tdVp7mYDUKtXnsWq3IIZSmm8iUP7OqQwc4,56
|
|
70
77
|
pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e,sha256=MpiiCqAk6GQ5iGzeThU0rsabrgA5tCAgdIWudAM0IrA,420
|
|
@@ -79,8 +86,12 @@ pembot/.git/objects/e7/911a702079a6144997ea4e70f59abbe59ec2bc,sha256=r4zY-__F4gS
|
|
|
79
86
|
pembot/.git/objects/e9/1172752e9a421ae463112d2b0506b37498c98d,sha256=qWZpM65kQPSxlVHAtyzH5L-j3rL-b9Jw-A7YBm4NMlI,249
|
|
80
87
|
pembot/.git/objects/ea/0af89e61a882c5afc2a8c281b2d96f174bfe58,sha256=lXbMvL_xl8PhWWfL5WAnvxqE3usiGO3iY83yi3GZwXc,4438
|
|
81
88
|
pembot/.git/objects/eb/75e1c49f1e5b79dca17ccdbec8067756523238,sha256=ltEINFUpQP86CkE4nAT1Afegz3ytY3Nlx1P6ibTFEbo,305
|
|
89
|
+
pembot/.git/objects/ee/a73c7f24094ed83b014f7cfce46e10f817bec8,sha256=fFYq_ODekFhF9SwBL9GP_fGDsNavXVVOuI6kmnHlkiY,5140
|
|
90
|
+
pembot/.git/objects/ef/0503a60244391590b16042019032e91d7cc30d,sha256=mrF9jZHY2oJm8tkd8nQdMgUPbrZfENOFaR3mvbwi1dg,187
|
|
82
91
|
pembot/.git/objects/f1/655afa1c5636c8d58969e3194bb770aefbc552,sha256=Ugf-wTcOlwZXmxmbnjEc3iOK3dDRntTVONOJsrOjl3E,205
|
|
83
92
|
pembot/.git/objects/f4/e991088a63def67a30a2b8bbdb4d58514abab8,sha256=Y5WfCEpk121Cy9gaFfSY4ZkUz54qu45osRZdTy9kZ8c,393
|
|
93
|
+
pembot/.git/objects/f6/b1d54483ce20fbcb252a8a93a5eff7bec88729,sha256=MrRy-fBSXZcp-yJM3e-tH3wCdUS-VFX6rW_mKTa-0_Y,419
|
|
94
|
+
pembot/.git/objects/f8/6fbd490878cb0d3c35cc4443672d1309171bf1,sha256=hBVqthGLEEX2NmdD51kjiIeTd5CP5MU8it41zHlW3m0,419
|
|
84
95
|
pembot/.git/objects/f8/cbb5bfd1503e66cec2c593362c60a317b6d300,sha256=p_PmV9ng8Bhxj4AJjQRxpZDWcSHcksAhYU5r77adriY,4581
|
|
85
96
|
pembot/.git/objects/f9/98e1f01c2bf0a20159fc851327af05beb3ac88,sha256=uU1uWZIPyQBpUuEM4m9Ff-gnn7opvjOpIt3JoqdFwN0,205
|
|
86
97
|
pembot/.git/objects/fa/9c9a62ec1203a5868b033ded428c2382c4e1b6,sha256=is9gmIhAL-QXC9oLx3DHkatfI9wWFePJlo1yGPZrPaE,196
|
|
@@ -90,28 +101,28 @@ pembot/.git/objects/fc/e56f1e09d09a05b9babf796fb40bece176f3a2,sha256=g-IVuI_8YBn
|
|
|
90
101
|
pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.idx,sha256=CNzx_lz6v4PulPxRW2t9nz-ifvplpSFPhMA2M9WNUrA,3424
|
|
91
102
|
pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.pack,sha256=dk3Sqrd0L-tNVLRy3uJdTYJNkw8v59mE1hV8zrCFNzc,41355
|
|
92
103
|
pembot/.git/objects/pack/pack-d5469edc8c36e3bb1de5e0070e4d5b1eae935dd4.rev,sha256=7U3tpTWQ3dn5dwQo_KWMWxF31cKaDnCk2AzTO7Cx4Bg,388
|
|
93
|
-
pembot/.git/refs/heads/main,sha256=
|
|
104
|
+
pembot/.git/refs/heads/main,sha256=DymD5B54ONj5DkUMd4HYO5m52NWPWMez_QV6RdNCG0g,41
|
|
94
105
|
pembot/.git/refs/remotes/origin/HEAD,sha256=K7aiSqD8bEhBAPXVGim7rYQc0sdV9dk_qiBOXbtOsrQ,30
|
|
95
|
-
pembot/.git/refs/remotes/origin/main,sha256=
|
|
106
|
+
pembot/.git/refs/remotes/origin/main,sha256=DymD5B54ONj5DkUMd4HYO5m52NWPWMez_QV6RdNCG0g,41
|
|
96
107
|
pembot/AnyToText/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
97
|
-
pembot/AnyToText/convertor.py,sha256=
|
|
108
|
+
pembot/AnyToText/convertor.py,sha256=gqvhwFssUsAeirfO4n0Ztwga1hn8zHbdG96sMTjYrpE,17188
|
|
98
109
|
pembot/TextEmbedder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
99
110
|
pembot/TextEmbedder/gemini_embedder.py,sha256=P679-2mmQESlYKML1vcrwx_-CSgWJgIQk7NL4F7BLQE,677
|
|
100
|
-
pembot/TextEmbedder/mongodb_embedder.py,sha256=
|
|
101
|
-
pembot/TextEmbedder/mongodb_index_creator.py,sha256=
|
|
111
|
+
pembot/TextEmbedder/mongodb_embedder.py,sha256=RotNlerS3WKEUGRNeQM5MTkl5BtaWNHVaXO1gN5NicI,10682
|
|
112
|
+
pembot/TextEmbedder/mongodb_index_creator.py,sha256=kopqdVYJii_wExVrXGZjMfqWZ2dD42b3PeNWo71weHI,5354
|
|
102
113
|
pembot/TextEmbedder/vector_query.py,sha256=Kh1uhx9CatB-oQlQtnW-1I2Qz7MGHI20n2h_8peAChM,1986
|
|
103
|
-
pembot/config/config.yaml,sha256=
|
|
114
|
+
pembot/config/config.yaml,sha256=y-2BklPelldaXJ_hxFD9k-bFpDA6OAZkaoh5XlvASCE,156
|
|
104
115
|
pembot/pdf2markdown/LICENSE,sha256=1JTJhQjUYDqJzFJhNtitm7mHyE71PRHgetIqRRWg6Pk,1068
|
|
105
116
|
pembot/pdf2markdown/README.md,sha256=jitM1pwI69oa0N4mXv5-SY1ka9Sz3jsRNCDdpW-50kY,4545
|
|
106
117
|
pembot/pdf2markdown/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
|
-
pembot/pdf2markdown/extract.py,sha256=
|
|
118
|
+
pembot/pdf2markdown/extract.py,sha256=0fnZnUqNy7shef6dijqEAMgBSf0YAiEx_mbRSU_pmQg,34418
|
|
108
119
|
pembot/pdf2markdown/pyrightconfig.json,sha256=Vt_k4N2LtZhth0lQOQAOnRKDOQkYYVzmdtb-bP3gu7M,47
|
|
109
120
|
pembot/pdf2markdown/requirements.txt,sha256=0vZQzkSZKLNVUttd4euoDyYEy0nc2W3CIVxhepHW5Ho,76
|
|
110
|
-
pembot/pdf2markdown/.git/COMMIT_EDITMSG,sha256=
|
|
121
|
+
pembot/pdf2markdown/.git/COMMIT_EDITMSG,sha256=K6REOtE5mjRmxGSeQdpaFAr1luu1BmaZnzXkTjKINzY,55
|
|
111
122
|
pembot/pdf2markdown/.git/HEAD,sha256=KNJb-Cr0wOK3L1CVmyvrhZ4-YLljCl6MYD2tTdsrboA,21
|
|
112
123
|
pembot/pdf2markdown/.git/config,sha256=bxpN4Vp2IKsAw9QkRoCIXULseCngmK7OQMg_81HDmww,398
|
|
113
124
|
pembot/pdf2markdown/.git/description,sha256=hatsFj1DoX6pz3eIMIvKFGbxsKjRzJLibpv2PaQGKu4,73
|
|
114
|
-
pembot/pdf2markdown/.git/index,sha256=
|
|
125
|
+
pembot/pdf2markdown/.git/index,sha256=0VmTX9ESG5zK1K1FPqgzWMCzNvTfSUfnBeod1rhNHW8,656
|
|
115
126
|
pembot/pdf2markdown/.git/packed-refs,sha256=kJfKR7KBh8Ao4cGF_14wFxiFMP_lBLTKdXRAB2UMQ_o,112
|
|
116
127
|
pembot/pdf2markdown/.git/hooks/applypatch-msg.sample,sha256=AiNJeguLAzqlijpSG4YphpOGz3qw4vEBlj0yiqYhk_c,478
|
|
117
128
|
pembot/pdf2markdown/.git/hooks/commit-msg.sample,sha256=H3TV6SkpebVz69WXQdRsuT_zkazdCD00C5Q3B1PZJDc,896
|
|
@@ -128,11 +139,12 @@ pembot/pdf2markdown/.git/hooks/push-to-checkout.sample,sha256=pT0HQXmLKHxt16-mSu
|
|
|
128
139
|
pembot/pdf2markdown/.git/hooks/sendemail-validate.sample,sha256=ROv8kj3FRmvACWAvDs8Ge5xlRZq_6IaN3Em3jmztepI,2308
|
|
129
140
|
pembot/pdf2markdown/.git/hooks/update.sample,sha256=jV8vqD4QPPCLV-qmdSHfkZT0XL28s32lKtWGCXoU0QY,3650
|
|
130
141
|
pembot/pdf2markdown/.git/info/exclude,sha256=ZnH-g7egfIky7okWTR8nk7IxgFjri5jcXAbuClo7DsE,240
|
|
131
|
-
pembot/pdf2markdown/.git/logs/HEAD,sha256=
|
|
132
|
-
pembot/pdf2markdown/.git/logs/refs/heads/main,sha256=
|
|
133
|
-
pembot/pdf2markdown/.git/logs/refs/remotes/myorigin/main,sha256=
|
|
142
|
+
pembot/pdf2markdown/.git/logs/HEAD,sha256=Y5czyWfueqU9tPNqxXVaF_68HbwbQmSHXFJtAgsqAZc,1196
|
|
143
|
+
pembot/pdf2markdown/.git/logs/refs/heads/main,sha256=Y5czyWfueqU9tPNqxXVaF_68HbwbQmSHXFJtAgsqAZc,1196
|
|
144
|
+
pembot/pdf2markdown/.git/logs/refs/remotes/myorigin/main,sha256=IRBHjv2h2ZmZW1wiNFT6J0mY2KXK9juXpTz_QgW4R24,584
|
|
134
145
|
pembot/pdf2markdown/.git/logs/refs/remotes/origin/HEAD,sha256=jJscThcgJ-i1V19vA4RVs9acp0QIKsVSwY9zAmV3tjU,193
|
|
135
146
|
pembot/pdf2markdown/.git/objects/14/251b198e0bac39a3dc3b42f9e57b20c01465fb,sha256=Ssx4RupGzteVz0Irtgh95-Ccnacskv8ql8zLtqUgmOE,209
|
|
147
|
+
pembot/pdf2markdown/.git/objects/24/7b15a6b1e0e3d270c05af184f048736376cd4e,sha256=rD1H5Ywoiuw8gwDZyHJ1p4zxqfIh47ym6jJg0pL6KLI,10023
|
|
136
148
|
pembot/pdf2markdown/.git/objects/24/8f03b5f969a7fbd396b496f40b57f0ae81c148,sha256=ScB91DWSzfIrFLnghWglGqxxxmHxzODACQiXJEHDeWA,229
|
|
137
149
|
pembot/pdf2markdown/.git/objects/57/74dc9c3901d2ffb2cd7dafe2ad6612a7f9f42c,sha256=0Vkgzw7kU0cludbgJUyqCWLgK5Q3vfFnoKmeLq6c-uU,52
|
|
138
150
|
pembot/pdf2markdown/.git/objects/72/2dc14f82e78ce41717348b256e0c17834933b4,sha256=062pZN8JWfsC9z4MKIEgUcLIdnjzC6hwPjjsvHDhW-M,266
|
|
@@ -140,20 +152,22 @@ pembot/pdf2markdown/.git/objects/79/eb7b93ced70e399bd561093c45de7641414dbd,sha25
|
|
|
140
152
|
pembot/pdf2markdown/.git/objects/8d/9ce1fd9733a78c592b34af9c94b98960c601ed,sha256=eJMRf2BFDCxSgPuVPPLd6zZu3NmwMeYVYwyxW9QkW6M,9772
|
|
141
153
|
pembot/pdf2markdown/.git/objects/95/745843bb4377d6042180daeda818c0b16fd493,sha256=ddMj81nqLqqtVtrJ6TV7eOEjrzq38AbIjgWAPj0MaT8,12391
|
|
142
154
|
pembot/pdf2markdown/.git/objects/a5/c6dfb577782c259990dcf977e355298e923428,sha256=c6vkmaxLJ8-6V2DykAhGnGUFJc1EH_-TuDeijrrHRWg,265
|
|
155
|
+
pembot/pdf2markdown/.git/objects/a7/4bcd5e67cb1066dd504b92b42390fe0b2c3d38,sha256=Tg_Co5uUFebLteNs9dqzI-P4FGwktB-K1PZN4drr9Rk,266
|
|
143
156
|
pembot/pdf2markdown/.git/objects/b4/8d697aa9fd97151eb2a84a1af5d408b7630232,sha256=nSKTkx4mVrz7uaJkacuDJH7KO-vR1-OrvBV-e2HQvm0,194
|
|
144
157
|
pembot/pdf2markdown/.git/objects/b8/702320e56074e9680181d8b7897d6a0a552e2d,sha256=-XJJ4C0svu4LaZ9Zi3pAWVvy18w2CJ2lg16Zr2Hnu-U,372
|
|
145
158
|
pembot/pdf2markdown/.git/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391,sha256=FkxfqAZ_rPGkPwnOPQ416_U6f1cj7L8VqGZ8_FPCb2w,15
|
|
159
|
+
pembot/pdf2markdown/.git/objects/f3/b2d76c75bbd50e04fc4c2ad17fc94ca6daed32,sha256=CohsUCG2jiRVFKvC6ouCuy1pE0RS8C_dHMfOpSoCTM8,187
|
|
146
160
|
pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.idx,sha256=nZ0BJQYRC49OtqnyhZR_teR85PqslUG6j16UAKoX8m4,3452
|
|
147
161
|
pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.pack,sha256=_KzHMGgrVzHGn2ZiKyHlvqc-BwTEeq3PqDPPJ9DYI5E,32222
|
|
148
162
|
pembot/pdf2markdown/.git/objects/pack/pack-d3051affdd6c31306dc53489168fc870872085d1.rev,sha256=1jASJFjt2r2Sxd2G87oSTfrQnowK2ThvjVlWTIF-47E,392
|
|
149
|
-
pembot/pdf2markdown/.git/refs/heads/main,sha256=
|
|
150
|
-
pembot/pdf2markdown/.git/refs/remotes/myorigin/main,sha256=
|
|
163
|
+
pembot/pdf2markdown/.git/refs/heads/main,sha256=II7qd2fp3dz8A72owuHimxMIScJpmznueAkXM0sHxJU,41
|
|
164
|
+
pembot/pdf2markdown/.git/refs/remotes/myorigin/main,sha256=II7qd2fp3dz8A72owuHimxMIScJpmznueAkXM0sHxJU,41
|
|
151
165
|
pembot/pdf2markdown/.git/refs/remotes/origin/HEAD,sha256=K7aiSqD8bEhBAPXVGim7rYQc0sdV9dk_qiBOXbtOsrQ,30
|
|
152
166
|
pembot/pdf2markdown/config/config.yaml,sha256=w75W2Eg4-tu8rRk_23PqxWDh0010kRKLmPrh46f_Njc,66
|
|
153
167
|
pembot/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
154
168
|
pembot/utils/inference_client.py,sha256=jeURmY2P5heVlH1dCV0XSgiX3U2qYGEmrnUv0KFpdww,5380
|
|
155
169
|
pembot/utils/string_tools.py,sha256=gtRa5rBR0Q7GspTu2WtCnvhJQLFjPfWLvhmyiPkyStU,1883
|
|
156
|
-
pembot-0.0.
|
|
157
|
-
pembot-0.0.
|
|
158
|
-
pembot-0.0.
|
|
159
|
-
pembot-0.0.
|
|
170
|
+
pembot-0.0.8.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
171
|
+
pembot-0.0.8.dist-info/WHEEL,sha256=Dyt6SBfaasWElUrURkknVFAZDHSTwxg3PaTza7RSbkY,100
|
|
172
|
+
pembot-0.0.8.dist-info/METADATA,sha256=kfa20bL5qROy6a8bsALEzDRlmF-JnTgmR7Qc8rz6PNQ,313
|
|
173
|
+
pembot-0.0.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|