ws-bom-robot-app 0.0.37__py3-none-any.whl → 0.0.103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/config.py +35 -7
- ws_bom_robot_app/cron_manager.py +15 -14
- ws_bom_robot_app/llm/agent_context.py +26 -0
- ws_bom_robot_app/llm/agent_description.py +123 -123
- ws_bom_robot_app/llm/agent_handler.py +176 -180
- ws_bom_robot_app/llm/agent_lcel.py +107 -54
- ws_bom_robot_app/llm/api.py +100 -7
- ws_bom_robot_app/llm/defaut_prompt.py +15 -15
- ws_bom_robot_app/llm/evaluator.py +319 -0
- ws_bom_robot_app/llm/feedbacks/__init__.py +0 -0
- ws_bom_robot_app/llm/feedbacks/feedback_manager.py +66 -0
- ws_bom_robot_app/llm/main.py +159 -110
- ws_bom_robot_app/llm/models/api.py +70 -5
- ws_bom_robot_app/llm/models/feedback.py +30 -0
- ws_bom_robot_app/llm/nebuly_handler.py +185 -0
- ws_bom_robot_app/llm/providers/llm_manager.py +244 -80
- ws_bom_robot_app/llm/tools/models/main.py +8 -0
- ws_bom_robot_app/llm/tools/tool_builder.py +68 -23
- ws_bom_robot_app/llm/tools/tool_manager.py +343 -133
- ws_bom_robot_app/llm/tools/utils.py +41 -25
- ws_bom_robot_app/llm/utils/agent.py +34 -0
- ws_bom_robot_app/llm/utils/chunker.py +6 -1
- ws_bom_robot_app/llm/utils/cleanup.py +81 -0
- ws_bom_robot_app/llm/utils/cms.py +123 -0
- ws_bom_robot_app/llm/utils/download.py +183 -79
- ws_bom_robot_app/llm/utils/print.py +29 -29
- ws_bom_robot_app/llm/vector_store/db/__init__.py +0 -0
- ws_bom_robot_app/llm/vector_store/db/base.py +193 -0
- ws_bom_robot_app/llm/vector_store/db/chroma.py +97 -0
- ws_bom_robot_app/llm/vector_store/db/faiss.py +91 -0
- ws_bom_robot_app/llm/vector_store/db/manager.py +15 -0
- ws_bom_robot_app/llm/vector_store/db/qdrant.py +73 -0
- ws_bom_robot_app/llm/vector_store/generator.py +137 -137
- ws_bom_robot_app/llm/vector_store/integration/api.py +216 -0
- ws_bom_robot_app/llm/vector_store/integration/azure.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/base.py +58 -15
- ws_bom_robot_app/llm/vector_store/integration/confluence.py +41 -11
- ws_bom_robot_app/llm/vector_store/integration/dropbox.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/gcs.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/github.py +22 -22
- ws_bom_robot_app/llm/vector_store/integration/googledrive.py +46 -17
- ws_bom_robot_app/llm/vector_store/integration/jira.py +112 -75
- ws_bom_robot_app/llm/vector_store/integration/manager.py +6 -2
- ws_bom_robot_app/llm/vector_store/integration/s3.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sftp.py +1 -1
- ws_bom_robot_app/llm/vector_store/integration/sharepoint.py +7 -14
- ws_bom_robot_app/llm/vector_store/integration/shopify.py +143 -0
- ws_bom_robot_app/llm/vector_store/integration/sitemap.py +9 -1
- ws_bom_robot_app/llm/vector_store/integration/slack.py +3 -2
- ws_bom_robot_app/llm/vector_store/integration/thron.py +236 -0
- ws_bom_robot_app/llm/vector_store/loader/base.py +52 -8
- ws_bom_robot_app/llm/vector_store/loader/docling.py +71 -33
- ws_bom_robot_app/llm/vector_store/loader/json_loader.py +25 -25
- ws_bom_robot_app/main.py +148 -146
- ws_bom_robot_app/subprocess_runner.py +106 -0
- ws_bom_robot_app/task_manager.py +207 -54
- ws_bom_robot_app/util.py +65 -20
- ws_bom_robot_app-0.0.103.dist-info/METADATA +364 -0
- ws_bom_robot_app-0.0.103.dist-info/RECORD +76 -0
- {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/WHEEL +1 -1
- ws_bom_robot_app/llm/settings.py +0 -4
- ws_bom_robot_app/llm/utils/agent_utils.py +0 -17
- ws_bom_robot_app/llm/utils/kb.py +0 -34
- ws_bom_robot_app-0.0.37.dist-info/METADATA +0 -277
- ws_bom_robot_app-0.0.37.dist-info/RECORD +0 -60
- {ws_bom_robot_app-0.0.37.dist-info → ws_bom_robot_app-0.0.103.dist-info}/top_level.txt +0 -0
ws_bom_robot_app/llm/vector_store/db/base.py
@@ -0,0 +1,193 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional, List, Dict
+import asyncio
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.language_models import BaseChatModel
+from langchain_core.vectorstores.base import VectorStoreRetriever, VectorStore
+from langchain.retrievers import SelfQueryRetriever
+from langchain.chains.query_constructor.schema import AttributeInfo
+import tiktoken
+
+class VectorDBStrategy(ABC):
+    """
+    A strategy interface for managing vector databases. It caches and retrieves vector
+    stores, providing mechanisms for creating them, retrieving them, and invoking
+    document searches.
+
+    Attributes:
+        _CACHE (dict[str, VectorStore]):
+            A dictionary that caches loaded VectorStore indexes (e.g. Faiss, Chroma, Qdrant) keyed by their storage IDs.
+
+    Methods:
+        create(embeddings, documents, storage_id, **kwargs):
+            Asynchronously create a vector store using the provided embeddings,
+            documents, and a unique storage identifier. Returns the created
+            store's ID or None if creation fails.
+        get_loader(embeddings, storage_id, **kwargs):
+            Retrieve a vector store loader based on the provided embeddings
+            and storage identifier. This loader can be used to perform
+            further operations like retrieving documents.
+        get_retriever(embeddings, storage_id, search_type, search_kwargs, **kwargs):
+            Retrieve a VectorStoreRetriever for searching documents. Supports
+            different search methods (e.g., similarity, mmr) and employs the
+            appropriate strategy based on the search_type argument.
+        supports_self_query():
+            Indicates whether this strategy supports self-querying functionality.
+            By default, returns True.
+        _get_self_query_retriever(llm, store, description, metadata):
+            Creates a SelfQueryRetriever using the specified language model,
+            vector store, document description, and metadata. Used internally
+            for self-querying when supported.
+        invoke(embeddings, storage_id, query, search_type, search_kwargs, **kwargs):
+            Asynchronously searches for documents based on a query. Depending
+            on arguments and available metadata, either uses a self-query
+            retriever or falls back to other retrieval methods (e.g., mixed
+            similarity and mmr).
+        _remove_duplicates(docs):
+            Removes duplicate documents by checking their page content,
+            returning a list with unique results.
+        _combine_search(retrievers, query):
+            Asynchronously invokes multiple retrievers in parallel, then merges
+            their results while removing duplicates.
+    """
+    MAX_TOKENS_PER_BATCH = 300_000 * 0.8
+    def __init__(self):
+        try:
+            self.encoding = tiktoken.get_encoding("cl100k_base")  # text-embedding-3-small, text-embedding-3-large: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
+        except Exception:
+            self.encoding = None
+
+    def _count_tokens(self, text: str) -> int:
+        """Count tokens in text using tiktoken or fallback estimation"""
+        if self.encoding:
+            try:
+                return len(self.encoding.encode(text))
+            except Exception:
+                pass
+        # fallback: rough estimation (1 token ≈ 4 characters)
+        return len(text) // 4
+
+    def _batch_documents_by_tokens(self, documents: list[Document]) -> list[list[Document]]:
+        """Split documents into batches based on token count"""
+        if not documents:
+            return []
+        batches = []
+        current_batch = []
+        current_token_count = 0
+
+        for doc in documents:
+            doc_tokens = self._count_tokens(doc.page_content)
+            # check if adding this document exceeds the limit
+            if current_token_count + doc_tokens > VectorDBStrategy.MAX_TOKENS_PER_BATCH:
+                # start new batch if current batch is not empty
+                if current_batch:
+                    batches.append(current_batch)
+                # reset current batch
+                current_batch = [doc]
+                current_token_count = doc_tokens  # reset to current doc's tokens
+            else:
+                # add to current batch
+                current_batch.append(doc)
+                current_token_count += doc_tokens
+
+        # add final batch if not empty
+        if current_batch:
+            batches.append(current_batch)
+
+        return batches
+
+    _CACHE: dict[str, VectorStore] = {}
+    def _clear_cache(self, key: str):
+        if key in self._CACHE:
+            del self._CACHE[key]
+
+    @abstractmethod
+    async def create(
+        self,
+        embeddings: Embeddings,
+        documents: List[Document],
+        storage_id: str,
+        **kwargs
+    ) -> Optional[str]:
+        pass
+
+    @abstractmethod
+    def get_loader(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        **kwargs
+    ) -> VectorStore:
+        pass
+
+    def get_retriever(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        search_type: str,
+        search_kwargs: Dict[str, Any],
+        **kwargs
+    ) -> VectorStoreRetriever:
+        return self.get_loader(embeddings, storage_id).as_retriever(
+            search_type=search_type,
+            search_kwargs=search_kwargs
+        )
+
+    def supports_self_query(self) -> bool:
+        return True
+
+    @staticmethod
+    def _get_self_query_retriever(llm: BaseChatModel, store: VectorStore, description: str, metadata: list[AttributeInfo]) -> SelfQueryRetriever:
+        return SelfQueryRetriever.from_llm(
+            llm=llm,
+            vectorstore=store,
+            document_contents=description,
+            metadata_field_info=metadata,
+            enable_limit=True,
+            verbose=True
+        )
+
+    async def invoke(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        query: str,
+        search_type: str,
+        search_kwargs: Dict[str, Any],
+        **kwargs
+    ) -> List[Document]:
+        if self.supports_self_query():
+            if "app_tool" in kwargs and "llm" in kwargs:
+                from ws_bom_robot_app.llm.tools.tool_manager import LlmAppTool
+                app_tool: LlmAppTool = kwargs["app_tool"]
+                _description, _metadata = app_tool.get_vector_filtering()
+                if _description and _metadata:
+                    llm: BaseChatModel = kwargs["llm"]
+                    retriever = VectorDBStrategy._get_self_query_retriever(llm, self.get_loader(embeddings, storage_id), _description, _metadata)
+                    return await retriever.ainvoke(query, config={"source": kwargs.get("source", "retriever")})
+        if search_type == "mixed":
+            similarity_retriever = self.get_retriever(embeddings, storage_id, "similarity", search_kwargs)
+            mmr_kwargs = {
+                "k": search_kwargs.get("k", 4),
+                "fetch_k": search_kwargs.get("fetch_k", 20),
+                "lambda_mult": search_kwargs.get("lambda_mult", 0.2),
+            }
+            mmr_retriever = self.get_retriever(embeddings, storage_id, "mmr", mmr_kwargs)
+            return await VectorDBStrategy._combine_search([similarity_retriever, mmr_retriever], query)
+        retriever = self.get_retriever(embeddings, storage_id, search_type, search_kwargs)
+        return await retriever.ainvoke(query, config={"source": kwargs.get("source", "retriever")})
+
+    @staticmethod
+    def _remove_empty_documents(docs: List[Document]) -> List[Document]:
+        return [doc for doc in docs if doc.page_content and doc.page_content.strip()]
+
+    @staticmethod
+    def _remove_duplicates(docs: List[Document]) -> List[Document]:
+        seen = set()
+        return [doc for doc in docs if not (doc.page_content in seen or seen.add(doc.page_content))]
+
+    @staticmethod
+    async def _combine_search(
+        retrievers: List[VectorStoreRetriever],
+        query: str
+    ) -> List[Document]:
+        tasks = [retriever.ainvoke(query, config={"source": "custom source"}) for retriever in retrievers]
+        return VectorDBStrategy._remove_duplicates([doc for res in await asyncio.gather(*tasks) for doc in res])
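
The batching helper above greedily packs documents, in order, until a batch would exceed MAX_TOKENS_PER_BATCH (300,000 × 0.8 = 240,000 tokens, presumably an 80% safety margin under OpenAI's 300k-token-per-request embedding limit). A single document larger than the cap still lands in a batch of its own, since the cap is only checked between documents. A minimal sketch of that behavior, using a trivial concrete subclass because VectorDBStrategy is abstract (the subclass and document contents are illustrative only):

    from langchain_core.documents import Document
    from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy

    class _Probe(VectorDBStrategy):
        # stubs so the ABC can be instantiated; never called here
        async def create(self, embeddings, documents, storage_id, **kwargs): ...
        def get_loader(self, embeddings, storage_id, **kwargs): ...

    probe = _Probe()
    big = Document(page_content="word " * 300_000)  # roughly 300k tokens, over the 240k cap
    small = Document(page_content="hello world")
    batches = probe._batch_documents_by_tokens([big, small])
    print([len(b) for b in batches])  # -> [1, 1]: the oversized doc sits alone, the small one starts a new batch
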
ws_bom_robot_app/llm/vector_store/db/chroma.py
@@ -0,0 +1,97 @@
+from langchain_chroma import Chroma as CHROMA
+from langchain_core.documents import Document
+from typing import Any, Optional
+import asyncio, gc, logging
+from langchain_core.embeddings import Embeddings
+from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
+from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
+
+
+class Chroma(VectorDBStrategy):
+    """
+    A strategy class for interacting with a Chroma-based vector store implementation.
+
+    This class provides methods to create a Chroma vector store from a list of documents
+    and retrieve an existing Chroma instance. The vector store can be used to perform
+    operations such as embedding documents, persisting them to a storage directory, and
+    later loading them for retrieval tasks.
+
+    Attributes:
+        _CACHE (dict[str, CHROMA]): A cache to store and reuse Chroma instances.
+
+    Methods:
+        create(embeddings, documents, storage_id, **kwargs):
+            Creates a new Chroma instance after chunking the provided documents
+            and embedding them. Persists the vector store in the given storage directory.
+            If any error occurs during creation, logs the error and returns None.
+            Args:
+                embeddings (Embeddings): The embeddings strategy used to embed documents.
+                documents (list[Document]): The list of documents to be chunked and embedded.
+                storage_id (str): The directory where the Chroma vector store should be persisted.
+                **kwargs: Additional keyword arguments.
+            Returns:
+                Optional[str]: The storage ID if creation is successful; otherwise, None.
+        get_loader(embeddings, storage_id, **kwargs):
+            Retrieves a Chroma instance from the cache if it exists;
+            otherwise, creates and caches a new instance using the given embeddings and storage ID.
+            Args:
+                embeddings (Embeddings): The embeddings strategy used to create or load the Chroma instance.
+                storage_id (str): The directory where the Chroma vector store is persisted.
+                **kwargs: Additional keyword arguments.
+            Returns:
+                CHROMA: The retrieved or newly created Chroma instance.
+    """
+    def __init__(self):
+        super().__init__()
+
+    async def create(
+        self,
+        embeddings: Embeddings,
+        documents: list[Document],
+        storage_id: str,
+        **kwargs
+    ) -> Optional[str]:
+        try:
+            documents = self._remove_empty_documents(documents)
+            chunked_docs = DocumentChunker.chunk(documents)
+            batches = self._batch_documents_by_tokens(chunked_docs)
+            logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
+            _instance: CHROMA = None
+            for i, batch in enumerate(batches):
+                batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
+                logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
+                # create instance from first batch
+                if _instance is None:
+                    _instance = await asyncio.to_thread(
+                        CHROMA.from_documents,
+                        documents=batch,
+                        embedding=embeddings,
+                        persist_directory=storage_id
+                    )
+                else:
+                    # merge to existing instance
+                    await _instance.aadd_documents(batch)
+                # add a small delay to avoid rate limiting
+                if i < len(batches) - 1:  # except last batch
+                    await asyncio.sleep(1)
+            if _instance:
+                self._clear_cache(storage_id)
+                logging.info(f"Successfully created {Chroma.__name__} index with {len(chunked_docs)} total documents")
+                return storage_id
+        except Exception as e:
+            logging.error(f"{Chroma.__name__} create error: {e}")
+            raise e
+        finally:
+            del documents, chunked_docs, _instance
+            gc.collect()
+
+    def get_loader(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        **kwargs
+    ) -> CHROMA:
+        if storage_id not in self._CACHE:
+            self._CACHE[storage_id] = CHROMA(
+                collection_name="default",
+                embedding_function=embeddings,
+                persist_directory=storage_id
+            )
+        return self._CACHE[storage_id]
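
Note that the Chroma path has no explicit save step: with a persist_directory set, langchain_chroma (backed by chromadb's persistent client) writes to disk as documents are added, so create only has to clear the cache entry so that the next get_loader call reopens the collection from disk.
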
ws_bom_robot_app/llm/vector_store/db/faiss.py
@@ -0,0 +1,91 @@
+from langchain_community.vectorstores.faiss import FAISS
+from langchain_core.documents import Document
+from typing import Any, Optional
+import asyncio, gc, logging
+from langchain_core.embeddings import Embeddings
+from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
+from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
+
+class Faiss(VectorDBStrategy):
+    """
+    Faiss is a vector database strategy that leverages a FAISS index to store and retrieve
+    vectorized documents. It provides methods for creating a new FAISS index and for
+    loading an existing index from a local directory, with an internal caching mechanism
+    to optimize repeated retrievals.
+
+    Methods:
+        create(...):
+            Asynchronously creates a FAISS index from the given documents, using the
+            provided embeddings, then saves it locally under the specified storage ID.
+            Returns the storage ID if successful, or None otherwise.
+        get_loader(...):
+            Retrieves a FAISS index associated with a given storage ID. If this index
+            was previously loaded and cached, it returns the cached instance; otherwise,
+            it loads the index from local storage and caches it for subsequent use.
+    """
+    def __init__(self):
+        super().__init__()
+
+    async def create(
+        self,
+        embeddings: Embeddings,
+        documents: list[Document],
+        storage_id: str,
+        **kwargs
+    ) -> Optional[str]:
+        try:
+            documents = self._remove_empty_documents(documents)
+            chunked_docs = DocumentChunker.chunk(documents)
+            batches = self._batch_documents_by_tokens(chunked_docs)
+            logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
+            _instance: FAISS = None
+            for i, batch in enumerate(batches):
+                batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
+                logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
+                # init
+                _batch_instance = await asyncio.to_thread(
+                    FAISS.from_documents,
+                    batch,
+                    embeddings
+                )
+                # create instance from first batch
+                if _instance is None:
+                    _instance = _batch_instance
+                else:
+                    # merge to existing instance
+                    await asyncio.to_thread(
+                        _instance.merge_from,
+                        _batch_instance
+                    )
+                    del _batch_instance
+                    gc.collect()
+                # add a small delay to avoid rate limiting
+                if i < len(batches) - 1:  # except last batch
+                    await asyncio.sleep(1)
+            if _instance:
+                await asyncio.to_thread(_instance.save_local, storage_id)
+                self._clear_cache(storage_id)
+                logging.info(f"Successfully created {Faiss.__name__} index with {len(chunked_docs)} total documents")
+                return storage_id
+        except Exception as e:
+            logging.error(f"{Faiss.__name__} create error: {e}")
+            raise e
+        finally:
+            del documents, chunked_docs, _instance
+            gc.collect()
+
+    def get_loader(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        **kwargs
+    ) -> FAISS:
+        if storage_id not in self._CACHE:
+            self._CACHE[storage_id] = FAISS.load_local(
+                folder_path=storage_id,
+                embeddings=embeddings,
+                allow_dangerous_deserialization=True
+            )
+        return self._CACHE[storage_id]
+
+    def supports_self_query(self) -> bool:
+        return False
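
FAISS indexes live fully in memory, so unlike Chroma the batches here are built as independent indexes on a worker thread and folded together with merge_from, followed by a single save_local at the end; asyncio.to_thread keeps these CPU-bound synchronous calls off the event loop. Faiss is also the only strategy that opts out of self-querying (supports_self_query returns False), so invoke on a Faiss store always takes the plain retriever path or the "mixed" similarity+mmr path.
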
ws_bom_robot_app/llm/vector_store/db/manager.py
@@ -0,0 +1,15 @@
+from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
+from ws_bom_robot_app.llm.vector_store.db.chroma import Chroma
+from ws_bom_robot_app.llm.vector_store.db.faiss import Faiss
+from ws_bom_robot_app.llm.vector_store.db.qdrant import Qdrant
+
+class VectorDbManager:
+    _list: dict[str, VectorDBStrategy] = {
+        "chroma": Chroma(),
+        "faiss": Faiss(),
+        "qdrant": Qdrant()
+    }
+
+    @classmethod
+    def get_strategy(cls, name: str) -> VectorDBStrategy:
+        return cls._list.get(name.lower(), Faiss())
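
A minimal usage sketch of the registry (the embeddings class and storage path are placeholder assumptions, not part of this package):

    from langchain_openai import OpenAIEmbeddings  # assumption: any langchain Embeddings implementation works
    from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager

    strategy = VectorDbManager.get_strategy("Chroma")  # case-insensitive; unknown names fall back to Faiss()
    store = strategy.get_loader(OpenAIEmbeddings(model="text-embedding-3-small"), "./storage/my_index")  # hypothetical path
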
ws_bom_robot_app/llm/vector_store/db/qdrant.py
@@ -0,0 +1,73 @@
+from langchain_qdrant import QdrantVectorStore as QDRANT, FastEmbedSparse, RetrievalMode
+from qdrant_client import QdrantClient
+from langchain_core.documents import Document
+from typing import Any, Optional
+import asyncio, gc, logging, os
+from langchain_core.embeddings import Embeddings
+from ws_bom_robot_app.llm.utils.chunker import DocumentChunker
+from ws_bom_robot_app.llm.vector_store.db.base import VectorDBStrategy
+
+
+class Qdrant(VectorDBStrategy):
+    async def create(
+        self,
+        embeddings: Embeddings,
+        documents: list[Document],
+        storage_id: str,
+        **kwargs
+    ) -> Optional[str]:
+        try:
+            documents = self._remove_empty_documents(documents)
+            chunked_docs = DocumentChunker.chunk(documents)
+            batches = self._batch_documents_by_tokens(chunked_docs)
+            logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
+            _instance: QDRANT = None
+            if not os.path.exists(storage_id):
+                os.makedirs(storage_id)
+
+            for i, batch in enumerate(batches):
+                batch_tokens = sum(self._count_tokens(doc.page_content) for doc in batch)
+                logging.info(f"processing batch {i+1}/{len(batches)} with {len(batch)} docs ({batch_tokens:,} tokens)")
+                # create instance from first batch
+                if _instance is None:
+                    _instance = await asyncio.to_thread(
+                        QDRANT.from_documents,
+                        documents=batch,
+                        embedding=embeddings,
+                        sparse_embedding=kwargs['sparse_embedding'] if 'sparse_embedding' in kwargs else FastEmbedSparse(),
+                        collection_name="default",
+                        path=storage_id,
+                        retrieval_mode=RetrievalMode.HYBRID
+                    )
+                else:
+                    # merge to existing instance
+                    await _instance.aadd_documents(batch)
+                # add a small delay to avoid rate limiting
+                if i < len(batches) - 1:  # except last batch
+                    await asyncio.sleep(1)
+            if _instance:
+                self._clear_cache(storage_id)
+                logging.info(f"Successfully created {Qdrant.__name__} index with {len(chunked_docs)} total documents")
+                return storage_id
+        except Exception as e:
+            logging.error(f"{Qdrant.__name__} create error: {e}")
+            raise e
+        finally:
+            del documents, chunked_docs, _instance
+            gc.collect()
+
+    def get_loader(
+        self,
+        embeddings: Embeddings,
+        storage_id: str,
+        **kwargs
+    ) -> QDRANT:
+        if storage_id not in self._CACHE:
+            self._CACHE[storage_id] = QDRANT(
+                client=QdrantClient(path=storage_id),
+                collection_name="default",
+                embedding=embeddings,
+                sparse_embedding=FastEmbedSparse(),
+                retrieval_mode=RetrievalMode.HYBRID,
+            )
+        return self._CACHE[storage_id]
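
Taken together, an end-to-end sketch of building and querying an index through one of these strategies might look like this (paths, documents, and the embeddings class are illustrative assumptions):

    import asyncio
    from langchain_core.documents import Document
    from langchain_openai import OpenAIEmbeddings  # assumption: any langchain Embeddings implementation works
    from ws_bom_robot_app.llm.vector_store.db.manager import VectorDbManager

    async def main():
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        strategy = VectorDbManager.get_strategy("faiss")
        docs = [
            Document(page_content="the warehouse ships on mondays"),
            Document(page_content="returns are accepted within 30 days"),
        ]
        # chunk, batch by tokens, embed, and persist under a local directory
        await strategy.create(embeddings, docs, "./storage/demo")  # hypothetical path
        # "mixed" fans out to similarity + mmr retrievers and de-duplicates the merged results
        hits = await strategy.invoke(embeddings, "./storage/demo", "shipping schedule",
                                     search_type="mixed", search_kwargs={"k": 4})
        print([d.page_content for d in hits])

    asyncio.run(main())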