mb-rag 1.1.43__py3-none-any.whl → 1.1.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- mb_rag/chatbot/basic.py +1 -1
- mb_rag/rag/embeddings.py +19 -0
- mb_rag/utils/all_data_extract.py +65 -0
- mb_rag/version.py +1 -1
- {mb_rag-1.1.43.dist-info → mb_rag-1.1.45.dist-info}/METADATA +1 -1
- {mb_rag-1.1.43.dist-info → mb_rag-1.1.45.dist-info}/RECORD +8 -7
- {mb_rag-1.1.43.dist-info → mb_rag-1.1.45.dist-info}/WHEEL +0 -0
- {mb_rag-1.1.43.dist-info → mb_rag-1.1.45.dist-info}/top_level.txt +0 -0
mb_rag/chatbot/basic.py
CHANGED
|
@@ -119,7 +119,7 @@ class ModelFactory:
|
|
|
119
119
|
return ChatAnthropic(**kwargs)
|
|
120
120
|
|
|
121
121
|
@classmethod
|
|
122
|
-
def create_google(cls, model_name: str = "gemini-
|
|
122
|
+
def create_google(cls, model_name: str = "gemini-2.0-flash", **kwargs) -> Any:
|
|
123
123
|
"""
|
|
124
124
|
Create Google chatbot model
|
|
125
125
|
Args:
|
mb_rag/rag/embeddings.py
CHANGED
|
@@ -180,6 +180,23 @@ class ModelProvider:
|
|
|
180
180
|
kwargs["model"] = model_name
|
|
181
181
|
return GoogleGenerativeAIEmbeddings(**kwargs)
|
|
182
182
|
|
|
183
|
+
@staticmethod
def get_rag_qwen(model_name: str = "qwen", **kwargs):
    """
    Load a Qwen embedding model through LangChain's HuggingFaceEmbeddings.

    Args:
        model_name (str): Model identifier (default: "qwen"). A Hugging Face
            repo id, i.e. a value containing "/", such as
            "Qwen/Qwen3-Embedding-0.6B", is loaded as given. Any other value
            (the "qwen" alias, or the ``model_type`` default that
            ``load_embedding_model`` forwards to this method) selects the
            default checkpoint "Qwen/Qwen3-Embedding-0.6B".
        **kwargs: Additional arguments for HuggingFaceEmbeddings initialization

    Returns:
        HuggingFaceEmbeddings: Initialized embeddings model
    """
    # Imported lazily so the dependency is only required when Qwen is requested.
    from langchain.embeddings import HuggingFaceEmbeddings

    default_checkpoint = "Qwen/Qwen3-Embedding-0.6B"
    # load_embedding_model passes its model_type through unchanged, and that
    # defaults to an OpenAI model name. Only a value shaped like a repo id
    # ("org/name") is treated as an explicit checkpoint request; everything
    # else keeps the behaviour of always loading the default checkpoint.
    if isinstance(model_name, str) and "/" in model_name:
        checkpoint = model_name
    else:
        checkpoint = default_checkpoint

    return HuggingFaceEmbeddings(model_name=checkpoint, **kwargs)
|
|
199
|
+
|
|
183
200
|
def load_embedding_model(model_name: str = 'openai', model_type: str = "text-embedding-ada-002", **kwargs):
|
|
184
201
|
"""
|
|
185
202
|
Load a RAG model based on provider and type.
|
|
@@ -206,6 +223,8 @@ def load_embedding_model(model_name: str = 'openai', model_type: str = "text-emb
|
|
|
206
223
|
return ModelProvider.get_rag_google(model_type, **kwargs)
|
|
207
224
|
elif model_name == 'anthropic':
|
|
208
225
|
return ModelProvider.get_rag_anthropic(model_type, **kwargs)
|
|
226
|
+
elif model_name == 'qwen':
|
|
227
|
+
return ModelProvider.get_rag_qwen(model_type, **kwargs)
|
|
209
228
|
else:
|
|
210
229
|
raise ValueError(f"Invalid model name: {model_name}")
|
|
211
230
|
except ImportError as e:
|
|
mb_rag/utils/all_data_extract.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
## Docling data extract
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
from mb_rag.utils.extra import check_package
|
|
5
|
+
|
|
6
|
+
__all__ = ['DocumentExtractor']
|
|
7
|
+
|
|
8
|
+
class DocumentExtractor:
    """
    DocumentExtractor class for extracting data from documents using Docling
    and saving the result as a markdown, text or HTML file.
    """

    def __init__(self):
        """
        Initialize the DocumentExtractor class.
        Checking for Docling package.

        Raises:
            ImportError: If the docling package is not installed.
        """
        if not check_package("docling"):
            raise ImportError("Docling package not found. Please install it using: pip install docling")
        # NOTE(review): docling's public documentation describes
        # `docling.document_converter.DocumentConverter().convert(source)` as the
        # entry point; confirm that the installed docling really exports a
        # top-level `Docling` class with an `extract()` method.
        from docling import Docling
        self.Docling = Docling

    def _extract_data(self, file_path: str, **kwargs) -> List[str]:
        """
        Extract data from a document using Docling.

        Args:
            file_path (str): Path to the document
            **kwargs: Additional arguments for Docling

        Returns:
            Whatever ``extract()`` returns for the document.

        Raises:
            Exception: If extraction fails; the original error is chained as the cause.
        """
        try:
            docling = self.Docling(file_path, **kwargs)
            return docling.extract()
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error extracting data from {file_path}: {str(e)}") from e

    def get_data(self, file_path: str, save_path: str = None, data_store_type: str = "markdown", **kwargs) -> List[str]:
        """
        Get data from a document using Docling and write it to disk.

        Args:
            file_path (str): Path to the document
            save_path (str): Path to save the extracted data. Default is None.
                If None, data is saved in the current working directory as
                docling_{file_name}.{md|txt|html}, matching data_store_type.
            data_store_type (str): Saving document as markdown, txt or html.
                Default is markdown. Any other value prints a warning and
                falls back to txt.
            **kwargs: Additional arguments for Docling

        Returns:
            The extraction result. NOTE(review): annotated as List[str], but
            this method reads ``.document`` from it, so it is evidently a
            result object rather than a list of strings.
        """
        import os

        data = self._extract_data(file_path, **kwargs)

        # data_store_type -> (file extension, export method on data.document)
        exporters = {
            "markdown": ("md", "export_to_markdown"),
            "txt": ("txt", "export_to_text"),
            "html": ("html", "export_to_html"),
        }
        if data_store_type not in exporters:
            print("Invalid data store type. Defaulting to text (txt)")
            # Normalise the type itself, not just the extension, so the export
            # step below always has a matching exporter.
            data_store_type = "txt"
        data_type, export_method = exporters[data_store_type]

        if save_path is None:
            # basename() understands the platform's path separators; the name
            # is still cut at the first dot, as before.
            file_name = os.path.basename(file_path).split('.')[0]
            save_path = f"docling_{file_name}.{data_type}"
        print(f"Saving extracted data to {save_path}")

        data_with_type = getattr(data.document, export_method)()
        # Explicit UTF-8 so documents with non-ASCII text can be written
        # regardless of the platform's default encoding.
        with open(save_path, 'w', encoding="utf-8") as f:
            f.write(data_with_type)
        return data
|
mb_rag/version.py
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
mb_rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
mb_rag/version.py,sha256=
|
|
2
|
+
mb_rag/version.py,sha256=9g4JnrnLgsbs9ZJE0iG3ErX8u7puBHMVjLiS08_wP_0,207
|
|
3
3
|
mb_rag/chatbot/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
mb_rag/chatbot/basic.py,sha256=
|
|
4
|
+
mb_rag/chatbot/basic.py,sha256=8tXU_3Yiqv0J-2Bnpw8p9sQaOlZHzX-Xenjs9GmWqes,23825
|
|
5
5
|
mb_rag/chatbot/chains.py,sha256=vDbLX5R29sWN1pcFqJ5fyxJEgMCM81JAikunAEvMC9A,7223
|
|
6
6
|
mb_rag/chatbot/prompts.py,sha256=n1PyiLbU-5fkslRv6aVOzt0dDlwya_cEdQ7kRnRhMuY,1749
|
|
7
7
|
mb_rag/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
mb_rag/rag/embeddings.py,sha256=
|
|
8
|
+
mb_rag/rag/embeddings.py,sha256=CI1tJnIUyGsZhFaqCCZ5xmKKJqdAT1ZAMRReUXLLt2k,28274
|
|
9
9
|
mb_rag/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
mb_rag/utils/all_data_extract.py,sha256=TL6O4vNc7mPW-OGK-LhXQQIkSr0o3_7BqNAD-YpTQMU,2532
|
|
10
11
|
mb_rag/utils/bounding_box.py,sha256=G0hdDam8QmYtD9lfwMeDHGm-TTo6KZg-yK5ESFL9zaM,8366
|
|
11
12
|
mb_rag/utils/document_extract.py,sha256=vZiFB1RYm1BIEaNA0MveJ5Zp-KEi0ngKjW8xEdtPqXA,12558
|
|
12
13
|
mb_rag/utils/extra.py,sha256=spbFrGgdruNyYQ5PzgvpSIa6Nm0rn9bb4qc8W9g582o,2492
|
|
13
14
|
mb_rag/utils/pdf_extract.py,sha256=cVeMyhnAU4XZxjIZHKMYhrktTjUNOjhx2r_LZKReOZE,15598
|
|
14
|
-
mb_rag-1.1.
|
|
15
|
-
mb_rag-1.1.
|
|
16
|
-
mb_rag-1.1.
|
|
17
|
-
mb_rag-1.1.
|
|
15
|
+
mb_rag-1.1.45.dist-info/METADATA,sha256=o7mzyY2MJfPaopqUvup1i4ptZeHnMja0jphc4y7jylM,234
|
|
16
|
+
mb_rag-1.1.45.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
17
|
+
mb_rag-1.1.45.dist-info/top_level.txt,sha256=FIK1eAa5uYnurgXZquBG-s3PIy-HDTC5yJBW4lTH_pM,7
|
|
18
|
+
mb_rag-1.1.45.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|