ws-bom-robot-app 0.0.106__py3-none-any.whl → 0.0.107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ws_bom_robot_app/llm/models/api.py +3 -0
- ws_bom_robot_app/llm/utils/chunker.py +77 -7
- ws_bom_robot_app/llm/vector_store/db/base.py +3 -0
- ws_bom_robot_app/llm/vector_store/db/chroma.py +4 -1
- ws_bom_robot_app/llm/vector_store/db/faiss.py +4 -1
- ws_bom_robot_app/llm/vector_store/db/qdrant.py +7 -4
- ws_bom_robot_app/llm/vector_store/generator.py +1 -1
- {ws_bom_robot_app-0.0.106.dist-info → ws_bom_robot_app-0.0.107.dist-info}/METADATA +1 -1
- {ws_bom_robot_app-0.0.106.dist-info → ws_bom_robot_app-0.0.107.dist-info}/RECORD +11 -11
- {ws_bom_robot_app-0.0.106.dist-info → ws_bom_robot_app-0.0.107.dist-info}/WHEEL +1 -1
- {ws_bom_robot_app-0.0.106.dist-info → ws_bom_robot_app-0.0.107.dist-info}/top_level.txt +0 -0
|
@@ -236,6 +236,9 @@ class RulesRequest(VectorDbRequest):
|
|
|
236
236
|
rules: List[str]
|
|
237
237
|
|
|
238
238
|
class KbRequest(VectorDbRequest):
|
|
239
|
+
chucking_method: Optional[str] = Field("recursive", validation_alias=AliasChoices("chunkingMethod","chunking_method"))
|
|
240
|
+
# Accept both the camelCase and snake_case request keys, like the sibling
# chunking fields do. "chuckt_size" is the misspelled alias that originally
# shipped; it is kept last so existing clients that send it keep working.
chuck_size: Optional[int] = Field(3_000, validation_alias=AliasChoices("chunkSize","chunk_size","chuckt_size"))
|
|
241
|
+
chunk_overlap: Optional[int] = Field(300, validation_alias=AliasChoices("chunkOverlap","chunk_overlap"))
|
|
239
242
|
files: Optional[List[str]] = []
|
|
240
243
|
integrations: Optional[List[LlmKbIntegration]] = []
|
|
241
244
|
endpoints: Optional[List[LlmKbEndpoint]] = []
|
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
from langchain_core.documents import Document
|
|
2
2
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
3
|
+
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
|
3
4
|
|
|
4
5
|
class DocumentChunker:
|
|
5
6
|
@staticmethod
|
|
6
|
-
def chunk(documents: list[Document]) -> list[Document]:
|
|
7
|
-
|
|
7
|
+
def chunk(documents: list[Document], chucking_method: str = "recursive", chunk_size: int=3_000, chunk_overlap: int=300) -> list[Document]:
    """
    Split documents using the requested chunking strategy.

    Args:
        documents (list[Document]): The documents to be chunked.
        chucking_method (str, optional): Strategy name, either "recursive" or
            "markdownHeader". ``None`` is treated as "recursive".
        chunk_size (int, optional): Forwarded to the selected strategy.
            ``None`` falls back to 3,000.
        chunk_overlap (int, optional): Forwarded to the selected strategy.
            ``None`` falls back to 300.

    Returns:
        list[Document]: The chunks produced by the selected strategy.

    Raises:
        ValueError: If ``chucking_method`` is not a supported strategy name.
    """
    # The request model declares these fields as Optional, so an explicit null
    # can reach this point; normalise to the documented defaults.
    method = "recursive" if chucking_method is None else chucking_method
    size = 3_000 if chunk_size is None else chunk_size
    overlap = 300 if chunk_overlap is None else chunk_overlap

    if method == "recursive":
        return DocumentChunker.chunk_recursive(documents, chunk_size=size, chunk_overlap=overlap)
    if method == "markdownHeader":
        return DocumentChunker.chunk_markdown(documents, chunk_size=size, chunk_overlap=overlap)
    # Fail loudly instead of implicitly returning None, which callers would
    # only trip over later when they try to batch the result.
    raise ValueError(
        f"Unsupported chunking method: {chucking_method!r} (expected 'recursive' or 'markdownHeader')"
    )
|
|
8
12
|
|
|
9
13
|
@staticmethod
|
|
10
|
-
def chunk_recursive(documents: list[Document], chunk_size: int=3_000) -> list[Document]:
|
|
14
|
+
def chunk_recursive(documents: list[Document], chunk_size: int=3_000, chunk_overlap: int=300) -> list[Document]:
|
|
11
15
|
"""
|
|
12
16
|
Recursively split documents into smaller chunks while preserving metadata.
|
|
13
17
|
|
|
@@ -17,16 +21,16 @@ class DocumentChunker:
|
|
|
17
21
|
|
|
18
22
|
Args:
|
|
19
23
|
documents (list[Document]): A list of Document objects to be chunked.
|
|
20
|
-
chunk_size (int, optional): The maximum size of each chunk in characters.
|
|
24
|
+
chunk_size (int, optional): The maximum size of each chunk in characters.
|
|
21
25
|
Defaults to 3,000.
|
|
22
26
|
|
|
23
27
|
Returns:
|
|
24
|
-
list[Document]: A list of Document objects where each document's content is
|
|
25
|
-
at most chunk_size characters. Each chunk preserves the metadata from
|
|
28
|
+
list[Document]: A list of Document objects where each document's content is
|
|
29
|
+
at most chunk_size characters. Each chunk preserves the metadata from
|
|
26
30
|
its original document.
|
|
27
31
|
|
|
28
32
|
Notes:
|
|
29
|
-
- Chunk overlap is automatically set to 10% of the chunk_size to maintain
|
|
33
|
+
- Chunk overlap is automatically set to 10% of the chunk_size to maintain
|
|
30
34
|
context between chunks.
|
|
31
35
|
- Documents smaller than or equal to chunk_size are returned unchanged.
|
|
32
36
|
- Metadata from the original document is copied to all resulting chunks.
|
|
@@ -80,3 +84,69 @@ class DocumentChunker:
|
|
|
80
84
|
)
|
|
81
85
|
return chunked_documents
|
|
82
86
|
|
|
87
|
+
@staticmethod
|
|
88
|
+
def chunk_markdown(documents: list[Document], chunk_size: int=3_000, chunk_overlap: int=300) -> list[Document]:
    """
    Splits markdown documents based on headers and then into smaller chunks.

    Each document's content is first split on markdown headers. Any resulting
    section longer than chunk_size characters is split again with
    RecursiveCharacterTextSplitter; shorter sections are emitted as they are.

    Args:
        documents (list[Document]): A list of Document objects with markdown content.
        chunk_size (int, optional): Sections longer than this many characters are
            split further. Defaults to 3,000.
        chunk_overlap (int, optional): Overlap passed to the secondary splitter.
            Defaults to 300.

    Returns:
        list[Document]: A list of Document objects where each document represents a chunk.
            Metadata is the original document metadata merged with the header
            metadata of the section the chunk came from.

    Note:
        - Headers are split at levels: H1 (#), H2 (##), H3 (###) and H4 (####)
        - Header values are stored in the metadata under "h1".."h4"; on a key
          clash the header metadata overrides the original document metadata
        - The splitter is created with strip_headers=False
        - Every returned chunk gets its own metadata dict
    """
    # Define headers to split on
    headers_to_split_on = [
        ("#", "h1"),
        ("##", "h2"),
        ("###", "h3"),
        ("####", "h4"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False
    )

    # Secondary splitter for large sections
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked_documents = []
    for doc in documents:
        # First split by markdown headers
        md_chunks = markdown_splitter.split_text(doc.page_content)

        # Then split large sections if needed
        for md_chunk in md_chunks:
            # Merge metadata from original doc and header metadata
            merged_metadata = {**doc.metadata, **md_chunk.metadata}
            if len(md_chunk.page_content) <= chunk_size:
                pieces = [md_chunk.page_content]
            else:
                # Further split large sections
                pieces = text_splitter.split_text(md_chunk.page_content)
            # Copy the metadata per chunk so sibling chunks never share one dict.
            chunked_documents.extend(
                Document(page_content=piece, metadata=dict(merged_metadata))
                for piece in pieces
            )

    return chunked_documents
|
|
@@ -46,11 +46,14 @@ class Chroma(VectorDBStrategy):
|
|
|
46
46
|
embeddings: Embeddings,
|
|
47
47
|
documents: list[Document],
|
|
48
48
|
storage_id: str,
|
|
49
|
+
chucking_method: str,
|
|
50
|
+
chunk_size: int,
|
|
51
|
+
chunk_overlap: int,
|
|
49
52
|
**kwargs
|
|
50
53
|
) -> Optional[str]:
|
|
51
54
|
try:
|
|
52
55
|
documents = self._remove_empty_documents(documents)
|
|
53
|
-
chunked_docs = DocumentChunker.chunk(documents)
|
|
56
|
+
chunked_docs = DocumentChunker.chunk(documents, chucking_method, chunk_size, chunk_overlap)
|
|
54
57
|
batches = self._batch_documents_by_tokens(chunked_docs)
|
|
55
58
|
logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
|
|
56
59
|
_instance: CHROMA = None
|
|
@@ -30,11 +30,14 @@ class Faiss(VectorDBStrategy):
|
|
|
30
30
|
embeddings: Embeddings,
|
|
31
31
|
documents: list[Document],
|
|
32
32
|
storage_id: str,
|
|
33
|
+
chucking_method: str,
|
|
34
|
+
chunk_size: int,
|
|
35
|
+
chunk_overlap: int,
|
|
33
36
|
**kwargs
|
|
34
37
|
) -> Optional[str]:
|
|
35
38
|
try:
|
|
36
39
|
documents = self._remove_empty_documents(documents)
|
|
37
|
-
chunked_docs = DocumentChunker.chunk(documents)
|
|
40
|
+
chunked_docs = DocumentChunker.chunk(documents, chucking_method, chunk_size, chunk_overlap)
|
|
38
41
|
batches = self._batch_documents_by_tokens(chunked_docs)
|
|
39
42
|
logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
|
|
40
43
|
_instance: FAISS = None
|
|
@@ -14,14 +14,17 @@ class Qdrant(VectorDBStrategy):
|
|
|
14
14
|
embeddings: Embeddings,
|
|
15
15
|
documents: list[Document],
|
|
16
16
|
storage_id: str,
|
|
17
|
+
chucking_method: str,
|
|
18
|
+
chunk_size: int,
|
|
19
|
+
chunk_overlap: int,
|
|
17
20
|
**kwargs
|
|
18
21
|
) -> Optional[str]:
|
|
19
22
|
try:
|
|
20
23
|
documents = self._remove_empty_documents(documents)
|
|
21
|
-
chunked_docs = DocumentChunker.chunk(documents)
|
|
24
|
+
chunked_docs = DocumentChunker.chunk(documents, chucking_method, chunk_size, chunk_overlap)
|
|
22
25
|
batches = self._batch_documents_by_tokens(chunked_docs)
|
|
23
26
|
logging.info(f"documents: {len(documents)}, after chunking: {len(chunked_docs)}, processing batches: {len(batches)}")
|
|
24
|
-
_instance: QDRANT = None
|
|
27
|
+
_instance: QDRANT = None
|
|
25
28
|
if not os.path.exists(storage_id):
|
|
26
29
|
os.makedirs(storage_id)
|
|
27
30
|
|
|
@@ -45,10 +48,10 @@ class Qdrant(VectorDBStrategy):
|
|
|
45
48
|
# add a small delay to avoid rate limiting
|
|
46
49
|
if i < len(batches) - 1: # except last batch
|
|
47
50
|
await asyncio.sleep(1)
|
|
48
|
-
if _instance:
|
|
51
|
+
if _instance:
|
|
49
52
|
self._clear_cache(storage_id)
|
|
50
53
|
logging.info(f"Successfully created {Qdrant.__name__} index with {len(chunked_docs)} total documents")
|
|
51
|
-
return storage_id
|
|
54
|
+
return storage_id
|
|
52
55
|
except Exception as e:
|
|
53
56
|
logging.error(f"{Qdrant.__name__} create error: {e}")
|
|
54
57
|
raise e
|
|
@@ -112,7 +112,7 @@ async def kb(rq: KbRequest) -> VectorDbResponse:
|
|
|
112
112
|
db_file_path = await aiofiles.os.wrap(shutil.make_archive)(
|
|
113
113
|
os.path.join(_config.robot_data_folder, _config.robot_data_db_folder, _config.robot_data_db_folder_out, db_name),
|
|
114
114
|
"zip",
|
|
115
|
-
await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, return_folder_path=True)
|
|
115
|
+
await VectorDbManager.get_strategy(rq.vector_type).create(rq.embeddings(), documents, store_path, rq.chucking_method, rq.chuck_size, rq.chunk_overlap, return_folder_path=True)
|
|
116
116
|
)
|
|
117
117
|
return VectorDbResponse(file = os.path.basename(db_file_path), vector_type=rq.vector_type)
|
|
118
118
|
except Exception as e:
|
|
@@ -19,7 +19,7 @@ ws_bom_robot_app/llm/nebuly_handler.py,sha256=wFO2UG849kv5hmjM5EoOp0Jsloy-BtQjrR
|
|
|
19
19
|
ws_bom_robot_app/llm/feedbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
ws_bom_robot_app/llm/feedbacks/feedback_manager.py,sha256=vNcZLG9IKhurAk7hjBqyFgQTjnh3Cd4GnxeYsX7ZdiA,2922
|
|
21
21
|
ws_bom_robot_app/llm/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
|
-
ws_bom_robot_app/llm/models/api.py,sha256=
|
|
22
|
+
ws_bom_robot_app/llm/models/api.py,sha256=jvoU8z82L7xGkqo2a2m--0OxZrENoLVPuucA-rdr74o,12798
|
|
23
23
|
ws_bom_robot_app/llm/models/base.py,sha256=1TqxuTK3rjJEALn7lvgoen_1ba3R2brAgGx6EDTtDZo,152
|
|
24
24
|
ws_bom_robot_app/llm/models/feedback.py,sha256=pYNQGxNOBgeAAfdJLI95l7ePLBI5tVdsgnyjp5oMOQU,1722
|
|
25
25
|
ws_bom_robot_app/llm/models/kb.py,sha256=oVSw6_dmNxikAHrPqcfxDXz9M0ezLIYuxpgvzfs_Now,9514
|
|
@@ -33,7 +33,7 @@ ws_bom_robot_app/llm/tools/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeR
|
|
|
33
33
|
ws_bom_robot_app/llm/tools/models/main.py,sha256=1hICqHs-KS2heenkH7b2eH0N2GrPaaNGBrn64cl_A40,827
|
|
34
34
|
ws_bom_robot_app/llm/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
ws_bom_robot_app/llm/utils/agent.py,sha256=uFuSfYMfGIE2WCKGNSKL-T2SDFn-tUKvbAYbGTPIw6g,1445
|
|
36
|
-
ws_bom_robot_app/llm/utils/chunker.py,sha256
|
|
36
|
+
ws_bom_robot_app/llm/utils/chunker.py,sha256=-WfDG6xUU_oOUJmWhDlQbI1hsGCRkmnKyqkY_bEG8WA,7420
|
|
37
37
|
ws_bom_robot_app/llm/utils/cleanup.py,sha256=ARLZTX4mLbkLCEnMdIWYDYEAPOjzfy1laLGkYnxZe30,3063
|
|
38
38
|
ws_bom_robot_app/llm/utils/cms.py,sha256=gfIXvY3DxgbgDf0LCzyekWitaduxKGLHfV6gbRmh8zk,6960
|
|
39
39
|
ws_bom_robot_app/llm/utils/download.py,sha256=rvc88E63UGHnFVlJJeMb05Z2FcBYIITqKnIE3ldEu6I,7293
|
|
@@ -41,13 +41,13 @@ ws_bom_robot_app/llm/utils/print.py,sha256=HK3zhZOd4cEyXZ8QcudLtTIfqqtMOERce_yTo
|
|
|
41
41
|
ws_bom_robot_app/llm/utils/secrets.py,sha256=-HtqLIDVIJrpvGC5YhPAVyLsq8P4ChVM5g3GOfdwqVk,878
|
|
42
42
|
ws_bom_robot_app/llm/utils/webhooks.py,sha256=LAAZqyN6VhV13wu4X-X85TwdDgAV2rNvIwQFIIc0FJM,2114
|
|
43
43
|
ws_bom_robot_app/llm/vector_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
|
-
ws_bom_robot_app/llm/vector_store/generator.py,sha256=
|
|
44
|
+
ws_bom_robot_app/llm/vector_store/generator.py,sha256=nSj8aLARr4h1SJlkEI7X1hDef195fAPKEi2fFkl7_wM,6504
|
|
45
45
|
ws_bom_robot_app/llm/vector_store/db/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
|
-
ws_bom_robot_app/llm/vector_store/db/base.py,sha256=
|
|
47
|
-
ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=
|
|
48
|
-
ws_bom_robot_app/llm/vector_store/db/faiss.py,sha256=
|
|
46
|
+
ws_bom_robot_app/llm/vector_store/db/base.py,sha256=GhTkOq4ms_vUf_nuncyskUpI6kWPKDQi5dfLU5zduFY,8576
|
|
47
|
+
ws_bom_robot_app/llm/vector_store/db/chroma.py,sha256=9tnEKQLvBt5TPthULR08ktDkcpFjuIxuYV7REFp9kuY,4752
|
|
48
|
+
ws_bom_robot_app/llm/vector_store/db/faiss.py,sha256=lHpBZV1s_OZTiRlcVM-KJBf2wWWkzvYm_gt57BdbbUs,4055
|
|
49
49
|
ws_bom_robot_app/llm/vector_store/db/manager.py,sha256=5rqBvc0QKmHFUgVHqBAr1Y4FZRl-w-ylGMjgXZywrdA,533
|
|
50
|
-
ws_bom_robot_app/llm/vector_store/db/qdrant.py,sha256
|
|
50
|
+
ws_bom_robot_app/llm/vector_store/db/qdrant.py,sha256=v3YKLZ9_ysaNB64UVA1JCYg-W1BMGfo9CLCG4roXtJ4,3323
|
|
51
51
|
ws_bom_robot_app/llm/vector_store/integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
52
52
|
ws_bom_robot_app/llm/vector_store/integration/api.py,sha256=jivsqw3iMr4isnxi-jQYtFWPtBcTgIDe88hiUqXv5NE,8400
|
|
53
53
|
ws_bom_robot_app/llm/vector_store/integration/azure.py,sha256=OEa96Dlf1CX0tjrTjX4KP3D_HTn249ukc9sluPbdOyU,3389
|
|
@@ -70,7 +70,7 @@ ws_bom_robot_app/llm/vector_store/loader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
|
|
|
70
70
|
ws_bom_robot_app/llm/vector_store/loader/base.py,sha256=InpRwKPxp0tuM4drezBvxxAWHe3XTmu60MGvFsT7RPE,7176
|
|
71
71
|
ws_bom_robot_app/llm/vector_store/loader/docling.py,sha256=RFYSZkZAYtU8wJSd1rN2T0lVo-wK1-ddtr6bH2fBr6Q,5170
|
|
72
72
|
ws_bom_robot_app/llm/vector_store/loader/json_loader.py,sha256=LDppW0ZATo4_1hh-KlsAM3TLawBvwBxva_a7k5Oz1sc,858
|
|
73
|
-
ws_bom_robot_app-0.0.
|
|
74
|
-
ws_bom_robot_app-0.0.
|
|
75
|
-
ws_bom_robot_app-0.0.
|
|
76
|
-
ws_bom_robot_app-0.0.
|
|
73
|
+
ws_bom_robot_app-0.0.107.dist-info/METADATA,sha256=ctg-rgQKDvVvLr_L_wJxP1FwhNdl6NZRYdYB5JhV9Ys,11011
|
|
74
|
+
ws_bom_robot_app-0.0.107.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
75
|
+
ws_bom_robot_app-0.0.107.dist-info/top_level.txt,sha256=Yl0akyHVbynsBX_N7wx3H3ZTkcMLjYyLJs5zBMDAKcM,17
|
|
76
|
+
ws_bom_robot_app-0.0.107.dist-info/RECORD,,
|
|
File without changes
|