cwyodmodules 0.3.80__tar.gz → 0.3.83__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/PKG-INFO +1 -5
- cwyodmodules-0.3.83/cwyodmodules/batch/utilities/document_chunking/fixed_size_overlap.py +98 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/layout.py +49 -3
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/page.py +48 -3
- cwyodmodules-0.3.83/cwyodmodules/batch/utilities/document_loading/web.py +85 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_search_helper.py +4 -13
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/config_helper.py +5 -10
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/default.json +1 -3
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/env_helper.py +4 -6
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/llm_helper.py +21 -58
- cwyodmodules-0.3.83/cwyodmodules/batch/utilities/helpers/orchestrator_helper.py +21 -0
- cwyodmodules-0.3.83/cwyodmodules/batch/utilities/orchestrator/__init__.py +3 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/orchestrator/semantic_kernel_orchestrator.py +154 -22
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules.egg-info/PKG-INFO +1 -5
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules.egg-info/SOURCES.txt +0 -6
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules.egg-info/requires.txt +0 -4
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/pyproject.toml +54 -58
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/tests/test_batch.py +2 -2
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/document_chunking/fixed_size_overlap.py +0 -48
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/document_loading/web.py +0 -30
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/helpers/orchestrator_helper.py +0 -30
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/__init__.py +0 -18
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/lang_chain_agent.py +0 -174
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/open_ai_functions.py +0 -196
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/orchestration_strategy.py +0 -18
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/orchestrator_base.py +0 -170
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/prompt_flow.py +0 -195
- cwyodmodules-0.3.80/cwyodmodules/batch/utilities/orchestrator/strategies.py +0 -29
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/LICENSE +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/README.md +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/api/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/api/chat_history.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/auth_utils.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/cosmosdb.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/database_client_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/database_factory.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/postgresdbservice.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/chat_history/sample_user.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/common/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/common/answer.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/common/source_document.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/chunking_strategy.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/document_chunking_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/paragraph.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/strategies.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/document_loading_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/layout.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/read.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/strategies.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_loading/word_document.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_blob_storage_client.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_computer_vision_client.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_form_recognizer_helper.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_postgres_helper.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_postgres_helper_light_rag.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/agent_mode.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/assistant_strategy.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/conversation_flow.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/database_type.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/default_contract_assistant_prompt.txt +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/default_employee_assistant_prompt.txt +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/embedding_config.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/document_chunking_helper.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/document_loading_helper.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/embedders/embedder_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/embedders/embedder_factory.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/embedders/integrated_vectorization_embedder.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/embedders/postgres_embedder.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/embedders/push_embedder.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/lightrag_helper.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/integrated_vectorization/azure_search_datasource.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/integrated_vectorization/azure_search_index.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/integrated_vectorization/azure_search_indexer.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/integrated_vectorization/azure_search_skillset.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/loggers/conversation_logger.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/parser/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/parser/output_parser_tool.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/parser/parser_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/plugins/chat_plugin.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/plugins/outlook_calendar_plugin.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/plugins/post_answering_plugin.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/azure_search_handler.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/azure_search_handler_light_rag.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/integrated_vectorization_search_handler.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/lightrag_search_handler.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/postgres_search_handler.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/postgres_search_handler_light_rag.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/search.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/search/search_handler_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/answer_processing_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/answering_tool_base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/content_safety_checker.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/post_prompt_tool.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/question_answer_tool.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/tools/text_processing_tool.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/config.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/database/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/database/base.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/database/models.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/chunking.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/extraction.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/types.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/upsert.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/indexing/utils.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/llm/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/llm/llm.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/llm/prompt.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/main.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/query/__init__.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/query/generate.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/query/graph_search.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/query/types.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/graphrag/query/vector_search.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/mgmt_config.py +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules.egg-info/dependency_links.txt +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules.egg-info/top_level.txt +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/setup.cfg +0 -0
- {cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/tests/test_api.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cwyodmodules
|
3
|
-
Version: 0.3.80
|
3
|
+
Version: 0.3.83
|
4
4
|
Summary: Add your description here
|
5
5
|
Author-email: Patrik <patrikhartl@gmail.com>
|
6
6
|
Classifier: Operating System :: OS Independent
|
@@ -15,13 +15,9 @@ Requires-Dist: azure-mgmt-cognitiveservices<14.0.0,>=13.6.0
|
|
15
15
|
Requires-Dist: azure-identity<2.0.0,>=1.20.0
|
16
16
|
Requires-Dist: azure-cosmos<5.0.0,>=4.9.0
|
17
17
|
Requires-Dist: asyncpg<0.31.0,>=0.30.0
|
18
|
-
Requires-Dist: langchain<0.4.0,>=0.3.18
|
19
18
|
Requires-Dist: azure-storage-queue<13.0.0,>=12.12.0
|
20
19
|
Requires-Dist: chardet<6.0.0,>=5.2.0
|
21
20
|
Requires-Dist: azure-ai-formrecognizer<4.0.0,>=3.3.3
|
22
|
-
Requires-Dist: langchain-chroma<0.3.0,>=0.2.2
|
23
|
-
Requires-Dist: langchain-openai<0.4.0,>=0.3.5
|
24
|
-
Requires-Dist: langchain-community<0.4.0,>=0.3.17
|
25
21
|
Requires-Dist: azure-search<2.0.0,>=1.0.0b2
|
26
22
|
Requires-Dist: azure-functions<2.0.0,>=1.21.3
|
27
23
|
Requires-Dist: azure-ai-ml<2.0.0,>=1.25.0
|
@@ -0,0 +1,98 @@
|
|
1
|
+
from typing import List
|
2
|
+
from .document_chunking_base import DocumentChunkingBase
|
3
|
+
from .chunking_strategy import ChunkingSettings
|
4
|
+
from ..common.source_document import SourceDocument
|
5
|
+
from ...utilities.helpers.env_helper import EnvHelper
|
6
|
+
from mgmt_config import logger
|
7
|
+
env_helper: EnvHelper = EnvHelper()
|
8
|
+
log_execution = env_helper.LOG_EXECUTION
|
9
|
+
log_args = env_helper.LOG_ARGS
|
10
|
+
log_result = env_helper.LOG_RESULT
|
11
|
+
|
12
|
+
|
13
|
+
class SimpleTokenSplitter:
|
14
|
+
"""Simple token-based text splitter to replace LangChain's TokenTextSplitter."""
|
15
|
+
|
16
|
+
def __init__(self, chunk_size: int, chunk_overlap: int):
|
17
|
+
self.chunk_size = chunk_size
|
18
|
+
self.chunk_overlap = chunk_overlap
|
19
|
+
|
20
|
+
def split_text(self, text: str) -> List[str]:
|
21
|
+
"""Split text into chunks based on approximate token count."""
|
22
|
+
if not text:
|
23
|
+
return []
|
24
|
+
|
25
|
+
# Rough approximation: 1 token ≈ 4 characters
|
26
|
+
char_chunk_size = self.chunk_size * 4
|
27
|
+
char_overlap = self.chunk_overlap * 4
|
28
|
+
|
29
|
+
chunks = []
|
30
|
+
start = 0
|
31
|
+
|
32
|
+
while start < len(text):
|
33
|
+
# Calculate end position
|
34
|
+
end = start + char_chunk_size
|
35
|
+
|
36
|
+
# If this is not the last chunk, try to find a good break point
|
37
|
+
if end < len(text):
|
38
|
+
# Look for sentence endings, then paragraph breaks, then word boundaries
|
39
|
+
for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
|
40
|
+
break_pos = text.rfind(break_char, start, end)
|
41
|
+
if break_pos > start:
|
42
|
+
end = break_pos + len(break_char)
|
43
|
+
break
|
44
|
+
|
45
|
+
# Extract chunk
|
46
|
+
chunk = text[start:end].strip()
|
47
|
+
if chunk:
|
48
|
+
chunks.append(chunk)
|
49
|
+
|
50
|
+
# Move start position (with overlap)
|
51
|
+
start = max(start + 1, end - char_overlap)
|
52
|
+
|
53
|
+
# Prevent infinite loop
|
54
|
+
if start >= len(text):
|
55
|
+
break
|
56
|
+
|
57
|
+
return chunks
|
58
|
+
|
59
|
+
|
60
|
+
class FixedSizeOverlapDocumentChunking(DocumentChunkingBase):
|
61
|
+
def __init__(self) -> None:
|
62
|
+
pass
|
63
|
+
|
64
|
+
@logger.trace_function(log_execution=log_execution, log_args=False, log_result=False)
|
65
|
+
def chunk(
|
66
|
+
self, documents: List[SourceDocument], chunking: ChunkingSettings
|
67
|
+
) -> List[SourceDocument]:
|
68
|
+
full_document_content = "".join(
|
69
|
+
list(map(lambda document: document.content, documents))
|
70
|
+
)
|
71
|
+
try:
|
72
|
+
document_url = documents[0].source
|
73
|
+
except IndexError as e:
|
74
|
+
# If no documents are provided, set document_url to None
|
75
|
+
logger.error("No documents provided for chunking.")
|
76
|
+
logger.debug(e)
|
77
|
+
document_url = None
|
78
|
+
|
79
|
+
splitter = SimpleTokenSplitter(
|
80
|
+
chunk_size=chunking.chunk_size,
|
81
|
+
chunk_overlap=chunking.chunk_overlap
|
82
|
+
)
|
83
|
+
chunked_content_list = splitter.split_text(full_document_content)
|
84
|
+
|
85
|
+
# Create document for each chunk
|
86
|
+
documents = []
|
87
|
+
chunk_offset = 0
|
88
|
+
for idx, chunked_content in enumerate(chunked_content_list):
|
89
|
+
documents.append(
|
90
|
+
SourceDocument.from_metadata(
|
91
|
+
content=chunked_content,
|
92
|
+
document_url=document_url,
|
93
|
+
metadata={"offset": chunk_offset},
|
94
|
+
idx=idx,
|
95
|
+
)
|
96
|
+
)
|
97
|
+
chunk_offset += len(chunked_content)
|
98
|
+
return documents
|
{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/layout.py
RENAMED
@@ -1,6 +1,5 @@
|
|
1
1
|
from typing import List
|
2
2
|
from .document_chunking_base import DocumentChunkingBase
|
3
|
-
from langchain.text_splitter import MarkdownTextSplitter
|
4
3
|
from .chunking_strategy import ChunkingSettings
|
5
4
|
from ..common.source_document import SourceDocument
|
6
5
|
from ...utilities.helpers.env_helper import EnvHelper
|
@@ -11,6 +10,50 @@ log_execution = env_helper.LOG_EXECUTION
|
|
11
10
|
log_args = env_helper.LOG_ARGS
|
12
11
|
log_result = env_helper.LOG_RESULT
|
13
12
|
|
13
|
+
|
14
|
+
class SimpleTextSplitter:
|
15
|
+
"""Simple text splitter to replace LangChain's MarkdownTextSplitter."""
|
16
|
+
|
17
|
+
def __init__(self, chunk_size: int, chunk_overlap: int):
|
18
|
+
self.chunk_size = chunk_size
|
19
|
+
self.chunk_overlap = chunk_overlap
|
20
|
+
|
21
|
+
def split_text(self, text: str) -> List[str]:
|
22
|
+
"""Split text into chunks with overlap."""
|
23
|
+
if not text:
|
24
|
+
return []
|
25
|
+
|
26
|
+
chunks = []
|
27
|
+
start = 0
|
28
|
+
|
29
|
+
while start < len(text):
|
30
|
+
# Calculate end position
|
31
|
+
end = start + self.chunk_size
|
32
|
+
|
33
|
+
# If this is not the last chunk, try to find a good break point
|
34
|
+
if end < len(text):
|
35
|
+
# Look for sentence endings, then paragraph breaks, then word boundaries
|
36
|
+
for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
|
37
|
+
break_pos = text.rfind(break_char, start, end)
|
38
|
+
if break_pos > start:
|
39
|
+
end = break_pos + len(break_char)
|
40
|
+
break
|
41
|
+
|
42
|
+
# Extract chunk
|
43
|
+
chunk = text[start:end].strip()
|
44
|
+
if chunk:
|
45
|
+
chunks.append(chunk)
|
46
|
+
|
47
|
+
# Move start position (with overlap)
|
48
|
+
start = max(start + 1, end - self.chunk_overlap)
|
49
|
+
|
50
|
+
# Prevent infinite loop
|
51
|
+
if start >= len(text):
|
52
|
+
break
|
53
|
+
|
54
|
+
return chunks
|
55
|
+
|
56
|
+
|
14
57
|
class LayoutDocumentChunking(DocumentChunkingBase):
|
15
58
|
def __init__(self) -> None:
|
16
59
|
pass
|
@@ -29,10 +72,13 @@ class LayoutDocumentChunking(DocumentChunkingBase):
|
|
29
72
|
logger.error("No documents provided for chunking.")
|
30
73
|
logger.debug(e)
|
31
74
|
document_url = None
|
32
|
-
|
33
|
-
|
75
|
+
|
76
|
+
splitter = SimpleTextSplitter(
|
77
|
+
chunk_size=chunking.chunk_size,
|
78
|
+
chunk_overlap=chunking.chunk_overlap
|
34
79
|
)
|
35
80
|
chunked_content_list = splitter.split_text(full_document_content)
|
81
|
+
|
36
82
|
# Create document for each chunk
|
37
83
|
documents = []
|
38
84
|
chunk_offset = 0
|
{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/page.py
RENAMED
@@ -1,6 +1,5 @@
|
|
1
1
|
from typing import List
|
2
2
|
from .document_chunking_base import DocumentChunkingBase
|
3
|
-
from langchain.text_splitter import MarkdownTextSplitter
|
4
3
|
from .chunking_strategy import ChunkingSettings
|
5
4
|
from ..common.source_document import SourceDocument
|
6
5
|
from ...utilities.helpers.env_helper import EnvHelper
|
@@ -10,6 +9,50 @@ log_execution = env_helper.LOG_EXECUTION
|
|
10
9
|
log_args = env_helper.LOG_ARGS
|
11
10
|
log_result = env_helper.LOG_RESULT
|
12
11
|
|
12
|
+
|
13
|
+
class SimpleTextSplitter:
|
14
|
+
"""Simple text splitter to replace LangChain's MarkdownTextSplitter."""
|
15
|
+
|
16
|
+
def __init__(self, chunk_size: int, chunk_overlap: int):
|
17
|
+
self.chunk_size = chunk_size
|
18
|
+
self.chunk_overlap = chunk_overlap
|
19
|
+
|
20
|
+
def split_text(self, text: str) -> List[str]:
|
21
|
+
"""Split text into chunks with overlap."""
|
22
|
+
if not text:
|
23
|
+
return []
|
24
|
+
|
25
|
+
chunks = []
|
26
|
+
start = 0
|
27
|
+
|
28
|
+
while start < len(text):
|
29
|
+
# Calculate end position
|
30
|
+
end = start + self.chunk_size
|
31
|
+
|
32
|
+
# If this is not the last chunk, try to find a good break point
|
33
|
+
if end < len(text):
|
34
|
+
# Look for sentence endings, then paragraph breaks, then word boundaries
|
35
|
+
for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
|
36
|
+
break_pos = text.rfind(break_char, start, end)
|
37
|
+
if break_pos > start:
|
38
|
+
end = break_pos + len(break_char)
|
39
|
+
break
|
40
|
+
|
41
|
+
# Extract chunk
|
42
|
+
chunk = text[start:end].strip()
|
43
|
+
if chunk:
|
44
|
+
chunks.append(chunk)
|
45
|
+
|
46
|
+
# Move start position (with overlap)
|
47
|
+
start = max(start + 1, end - self.chunk_overlap)
|
48
|
+
|
49
|
+
# Prevent infinite loop
|
50
|
+
if start >= len(text):
|
51
|
+
break
|
52
|
+
|
53
|
+
return chunks
|
54
|
+
|
55
|
+
|
13
56
|
class PageDocumentChunking(DocumentChunkingBase):
|
14
57
|
def __init__(self) -> None:
|
15
58
|
pass
|
@@ -25,8 +68,10 @@ class PageDocumentChunking(DocumentChunkingBase):
|
|
25
68
|
logger.error("No documents provided for chunking.")
|
26
69
|
logger.debug(e)
|
27
70
|
document_url = None
|
28
|
-
|
29
|
-
|
71
|
+
|
72
|
+
splitter = SimpleTextSplitter(
|
73
|
+
chunk_size=chunking.chunk_size,
|
74
|
+
chunk_overlap=chunking.chunk_overlap
|
30
75
|
)
|
31
76
|
documents_chunked = []
|
32
77
|
for idx, document in enumerate(documents):
|
@@ -0,0 +1,85 @@
|
|
1
|
+
from typing import List
|
2
|
+
import re
|
3
|
+
import requests
|
4
|
+
from bs4 import BeautifulSoup
|
5
|
+
from .document_loading_base import DocumentLoadingBase
|
6
|
+
from ..common.source_document import SourceDocument
|
7
|
+
|
8
|
+
|
9
|
+
class SimpleWebDocument:
|
10
|
+
"""Simple document class to replace LangChain's Document."""
|
11
|
+
def __init__(self, page_content: str, metadata: dict):
|
12
|
+
self.page_content = page_content
|
13
|
+
self.metadata = metadata
|
14
|
+
|
15
|
+
|
16
|
+
class SimpleWebLoader:
|
17
|
+
"""Simple web loader to replace LangChain's WebBaseLoader."""
|
18
|
+
|
19
|
+
def __init__(self, url: str):
|
20
|
+
self.url = url
|
21
|
+
|
22
|
+
def load(self) -> List[SimpleWebDocument]:
|
23
|
+
"""Load web content from URL."""
|
24
|
+
try:
|
25
|
+
# Fetch the webpage
|
26
|
+
headers = {
|
27
|
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
28
|
+
}
|
29
|
+
response = requests.get(self.url, headers=headers, timeout=30)
|
30
|
+
response.raise_for_status()
|
31
|
+
|
32
|
+
# Parse HTML content
|
33
|
+
soup = BeautifulSoup(response.content, 'html.parser')
|
34
|
+
|
35
|
+
# Remove script and style elements
|
36
|
+
for script in soup(["script", "style"]):
|
37
|
+
script.decompose()
|
38
|
+
|
39
|
+
# Get text content
|
40
|
+
text = soup.get_text()
|
41
|
+
|
42
|
+
# Clean up text
|
43
|
+
lines = (line.strip() for line in text.splitlines())
|
44
|
+
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
45
|
+
text = ' '.join(chunk for chunk in chunks if chunk)
|
46
|
+
|
47
|
+
return [SimpleWebDocument(
|
48
|
+
page_content=text,
|
49
|
+
metadata={"source": self.url}
|
50
|
+
)]
|
51
|
+
|
52
|
+
except Exception as e:
|
53
|
+
# Return empty content if loading fails
|
54
|
+
return [SimpleWebDocument(
|
55
|
+
page_content="",
|
56
|
+
metadata={"source": self.url, "error": str(e)}
|
57
|
+
)]
|
58
|
+
|
59
|
+
|
60
|
+
class WebDocumentLoading(DocumentLoadingBase):
|
61
|
+
def __init__(self) -> None:
|
62
|
+
super().__init__()
|
63
|
+
|
64
|
+
def load(self, document_url: str) -> List[SourceDocument]:
|
65
|
+
loader = SimpleWebLoader(document_url)
|
66
|
+
documents = loader.load()
|
67
|
+
|
68
|
+
for document in documents:
|
69
|
+
document.page_content = re.sub("\n{3,}", "\n\n", document.page_content)
|
70
|
+
# Remove half non-ascii character from start/end of doc content
|
71
|
+
pattern = re.compile(
|
72
|
+
r"[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]"
|
73
|
+
)
|
74
|
+
document.page_content = re.sub(pattern, "", document.page_content)
|
75
|
+
if document.page_content == "":
|
76
|
+
documents.remove(document)
|
77
|
+
|
78
|
+
source_documents: List[SourceDocument] = [
|
79
|
+
SourceDocument(
|
80
|
+
content=document.page_content,
|
81
|
+
source=document.metadata["source"],
|
82
|
+
)
|
83
|
+
for document in documents
|
84
|
+
]
|
85
|
+
return source_documents
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from azure.identity import ChainedTokenCredential, DefaultAzureCredential
|
2
2
|
from typing import Union
|
3
|
-
|
3
|
+
|
4
4
|
from azure.core.credentials import AzureKeyCredential
|
5
5
|
from azure.search.documents import SearchClient
|
6
6
|
from azure.search.documents.indexes import SearchIndexClient
|
@@ -276,15 +276,6 @@ class AzureSearchHelper:
|
|
276
276
|
),
|
277
277
|
]
|
278
278
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
self.env_helper.AZURE_SEARCH_KEY
|
283
|
-
if self.env_helper.is_auth_type_keys()
|
284
|
-
else None
|
285
|
-
),
|
286
|
-
index_name=self.env_helper.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
|
287
|
-
embedding_function=self.llm_helper.get_embedding_model().embed_query,
|
288
|
-
fields=fields,
|
289
|
-
user_agent="langchain chatwithyourdata-sa",
|
290
|
-
)
|
279
|
+
# Return simple search client instead of LangChain AzureSearch
|
280
|
+
# This maintains compatibility while removing LangChain dependency
|
281
|
+
return self.search_client
|
@@ -7,8 +7,7 @@ from ..azure_blob_storage_client import AzureBlobStorageClient
|
|
7
7
|
from ...document_chunking.chunking_strategy import ChunkingStrategy, ChunkingSettings
|
8
8
|
from ...document_loading import LoadingSettings, LoadingStrategy
|
9
9
|
from .embedding_config import EmbeddingConfig
|
10
|
-
|
11
|
-
from ...orchestrator import OrchestrationSettings
|
10
|
+
|
12
11
|
from ..env_helper import EnvHelper
|
13
12
|
from .assistant_strategy import AssistantStrategy
|
14
13
|
from .conversation_flow import ConversationFlow
|
@@ -43,12 +42,8 @@ class Config:
|
|
43
42
|
for c in config["document_processors"]
|
44
43
|
]
|
45
44
|
self.env_helper = EnvHelper()
|
46
|
-
|
47
|
-
|
48
|
-
}
|
49
|
-
self.orchestrator = OrchestrationSettings(
|
50
|
-
config.get("orchestrator", self.default_orchestration_settings)
|
51
|
-
)
|
45
|
+
# Orchestrator is always semantic kernel now
|
46
|
+
# No configuration needed as there's only one option
|
52
47
|
self.integrated_vectorization_config = (
|
53
48
|
IntegratedVectorizationConfig(config["integrated_vectorization_config"])
|
54
49
|
if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
|
@@ -93,7 +88,7 @@ class Config:
|
|
93
88
|
|
94
89
|
@logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
|
95
90
|
def get_available_orchestration_strategies(self):
|
96
|
-
return [
|
91
|
+
return ["semantic_kernel"] # Only semantic kernel is supported now
|
97
92
|
|
98
93
|
@logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
|
99
94
|
def get_available_ai_assistant_types(self):
|
@@ -271,7 +266,7 @@ class ConfigHelper:
|
|
271
266
|
with open(config_file_path, encoding="utf-8") as f:
|
272
267
|
ConfigHelper._default_config = json.loads(
|
273
268
|
Template(f.read()).substitute(
|
274
|
-
ORCHESTRATION_STRATEGY=
|
269
|
+
ORCHESTRATION_STRATEGY="semantic_kernel",
|
275
270
|
LOG_USER_INTERACTIONS=(
|
276
271
|
False
|
277
272
|
if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value
|
{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/default.json
RENAMED
@@ -139,9 +139,7 @@
|
|
139
139
|
"log_user_interactions": "${LOG_USER_INTERACTIONS}",
|
140
140
|
"log_tokens": "${LOG_TOKENS}"
|
141
141
|
},
|
142
|
-
|
143
|
-
"strategy": "${ORCHESTRATION_STRATEGY}"
|
144
|
-
},
|
142
|
+
|
145
143
|
"enable_chat_history": true,
|
146
144
|
"database_type": "${DATABASE_TYPE}"
|
147
145
|
}
|
{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/env_helper.py
RENAMED
@@ -3,7 +3,7 @@ import os
|
|
3
3
|
|
4
4
|
import threading
|
5
5
|
# from dotenv import load_dotenv
|
6
|
-
|
6
|
+
|
7
7
|
from ..helpers.config.conversation_flow import ConversationFlow
|
8
8
|
from ..helpers.config.database_type import DatabaseType
|
9
9
|
|
@@ -130,10 +130,8 @@ class EnvHelper:
|
|
130
130
|
"USE_ADVANCED_IMAGE_PROCESSING", "False"
|
131
131
|
)
|
132
132
|
self.CONVERSATION_FLOW = os.getenv("CONVERSATION_FLOW", "custom")
|
133
|
-
# Orchestration Settings
|
134
|
-
self.ORCHESTRATION_STRATEGY = os.getenv(
|
135
|
-
"ORCHESTRATION_STRATEGY", "openai_function"
|
136
|
-
)
|
133
|
+
# Orchestration Settings - Always use semantic_kernel
|
134
|
+
self.ORCHESTRATION_STRATEGY = "semantic_kernel"
|
137
135
|
# PostgreSQL configuration
|
138
136
|
elif self.DATABASE_TYPE == DatabaseType.POSTGRESQL.value:
|
139
137
|
self.AZURE_POSTGRES_SEARCH_TOP_K = 5
|
@@ -154,7 +152,7 @@ class EnvHelper:
|
|
154
152
|
self.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = False
|
155
153
|
self.USE_ADVANCED_IMAGE_PROCESSING = False
|
156
154
|
self.CONVERSATION_FLOW = ConversationFlow.CUSTOM.value
|
157
|
-
self.ORCHESTRATION_STRATEGY =
|
155
|
+
self.ORCHESTRATION_STRATEGY = "semantic_kernel"
|
158
156
|
else:
|
159
157
|
raise ValueError(
|
160
158
|
"Unsupported DATABASE_TYPE. Please set DATABASE_TYPE to 'CosmosDB' or 'PostgreSQL'."
|
{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/llm_helper.py
RENAMED
@@ -1,7 +1,6 @@
|
|
1
1
|
from openai import AzureOpenAI
|
2
2
|
from typing import List, Union, cast
|
3
|
-
|
4
|
-
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
3
|
+
# Removed LangChain dependencies - using direct OpenAI SDK instead
|
5
4
|
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
|
6
5
|
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.azure_chat_prompt_execution_settings import (
|
7
6
|
AzureChatPromptExecutionSettings,
|
@@ -49,68 +48,32 @@ class LLMHelper:
|
|
49
48
|
|
50
49
|
@logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
|
51
50
|
def get_llm(self):
|
52
|
-
|
53
|
-
|
54
|
-
deployment_name=self.llm_model,
|
55
|
-
temperature=0,
|
56
|
-
max_tokens=self.llm_max_tokens,
|
57
|
-
openai_api_version=self.openai_client._api_version,
|
58
|
-
azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
|
59
|
-
api_key=self.env_helper.OPENAI_API_KEY,
|
60
|
-
)
|
61
|
-
else:
|
62
|
-
return AzureChatOpenAI(
|
63
|
-
deployment_name=self.llm_model,
|
64
|
-
temperature=0,
|
65
|
-
max_tokens=self.llm_max_tokens,
|
66
|
-
openai_api_version=self.openai_client._api_version,
|
67
|
-
azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
|
68
|
-
azure_ad_token_provider=self.token_provider,
|
69
|
-
)
|
51
|
+
# Return the OpenAI client directly instead of LangChain wrapper
|
52
|
+
return self.openai_client
|
70
53
|
|
71
|
-
# TODO: This needs to have a custom callback to stream back to the UI
|
72
54
|
@logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
|
73
55
|
def get_streaming_llm(self):
|
74
|
-
|
75
|
-
|
76
|
-
azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
|
77
|
-
api_key=self.env_helper.OPENAI_API_KEY,
|
78
|
-
streaming=True,
|
79
|
-
callbacks=[StreamingStdOutCallbackHandler],
|
80
|
-
deployment_name=self.llm_model,
|
81
|
-
temperature=0,
|
82
|
-
max_tokens=self.llm_max_tokens,
|
83
|
-
openai_api_version=self.openai_client._api_version,
|
84
|
-
)
|
85
|
-
else:
|
86
|
-
return AzureChatOpenAI(
|
87
|
-
azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
|
88
|
-
api_key=self.env_helper.OPENAI_API_KEY,
|
89
|
-
streaming=True,
|
90
|
-
callbacks=[StreamingStdOutCallbackHandler],
|
91
|
-
deployment_name=self.llm_model,
|
92
|
-
temperature=0,
|
93
|
-
max_tokens=self.llm_max_tokens,
|
94
|
-
openai_api_version=self.openai_client._api_version,
|
95
|
-
azure_ad_token_provider=self.token_provider,
|
96
|
-
)
|
56
|
+
# Return the OpenAI client directly - streaming is handled via stream=True parameter
|
57
|
+
return self.openai_client
|
97
58
|
|
98
59
|
@logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
|
99
60
|
def get_embedding_model(self):
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
)
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
61
|
+
# Return a simple embedding model wrapper that uses the OpenAI client directly
|
62
|
+
class EmbeddingModel:
|
63
|
+
def __init__(self, openai_client, embedding_model):
|
64
|
+
self.openai_client = openai_client
|
65
|
+
self.embedding_model = embedding_model
|
66
|
+
|
67
|
+
def embed_query(self, text: str) -> List[float]:
|
68
|
+
return (
|
69
|
+
self.openai_client.embeddings.create(
|
70
|
+
input=[text], model=self.embedding_model
|
71
|
+
)
|
72
|
+
.data[0]
|
73
|
+
.embedding
|
74
|
+
)
|
75
|
+
|
76
|
+
return EmbeddingModel(self.openai_client, self.embedding_model)
|
114
77
|
|
115
78
|
@logger.trace_function(log_execution=log_execution, log_args=False, log_result=False)
|
116
79
|
def generate_embeddings(self, input: Union[str, list[int]]) -> List[float]:
|
@@ -0,0 +1,21 @@
|
|
1
|
+
from typing import List
|
2
|
+
from ..orchestrator.semantic_kernel_orchestrator import SemanticKernelOrchestrator
|
3
|
+
|
4
|
+
__all__ = ["Orchestrator"]
|
5
|
+
|
6
|
+
|
7
|
+
class Orchestrator:
|
8
|
+
def __init__(self) -> None:
|
9
|
+
self.orchestrator = SemanticKernelOrchestrator()
|
10
|
+
|
11
|
+
async def handle_message(
|
12
|
+
self,
|
13
|
+
user_message: str,
|
14
|
+
chat_history: List[dict],
|
15
|
+
conversation_id: str,
|
16
|
+
user_info,
|
17
|
+
**kwargs: dict,
|
18
|
+
) -> dict:
|
19
|
+
return await self.orchestrator.handle_message(
|
20
|
+
user_message, chat_history, conversation_id, user_info, **kwargs
|
21
|
+
)
|