PyPI - cwyodmodules - Versions diffs - 0.3.80__tar.gz → 0.3.83__tar.gz - Mend

cwyodmodules 0.3.80__tar.gz → 0.3.83__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cwyodmodules
-Version: 0.3.80
+Version: 0.3.83
 Summary: Add your description here
 Author-email: Patrik <patrikhartl@gmail.com>
 Classifier: Operating System :: OS Independent
@@ -15,13 +15,9 @@ Requires-Dist: azure-mgmt-cognitiveservices<14.0.0,>=13.6.0
 Requires-Dist: azure-identity<2.0.0,>=1.20.0
 Requires-Dist: azure-cosmos<5.0.0,>=4.9.0
 Requires-Dist: asyncpg<0.31.0,>=0.30.0
-Requires-Dist: langchain<0.4.0,>=0.3.18
 Requires-Dist: azure-storage-queue<13.0.0,>=12.12.0
 Requires-Dist: chardet<6.0.0,>=5.2.0
 Requires-Dist: azure-ai-formrecognizer<4.0.0,>=3.3.3
-Requires-Dist: langchain-chroma<0.3.0,>=0.2.2
-Requires-Dist: langchain-openai<0.4.0,>=0.3.5
-Requires-Dist: langchain-community<0.4.0,>=0.3.17
 Requires-Dist: azure-search<2.0.0,>=1.0.0b2
 Requires-Dist: azure-functions<2.0.0,>=1.21.3
 Requires-Dist: azure-ai-ml<2.0.0,>=1.25.0

cwyodmodules-0.3.83/cwyodmodules/batch/utilities/document_chunking/fixed_size_overlap.py ADDED Viewed

@@ -0,0 +1,98 @@
+from typing import List
+from .document_chunking_base import DocumentChunkingBase
+from .chunking_strategy import ChunkingSettings
+from ..common.source_document import SourceDocument
+from ...utilities.helpers.env_helper import EnvHelper
+from mgmt_config import logger
+env_helper: EnvHelper = EnvHelper()
+log_execution = env_helper.LOG_EXECUTION
+log_args = env_helper.LOG_ARGS
+log_result = env_helper.LOG_RESULT
+class SimpleTokenSplitter:
+    """Simple token-based text splitter to replace LangChain's TokenTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks based on approximate token count."""
+        if not text:
+            return []
+        # Rough approximation: 1 token ≈ 4 characters
+        char_chunk_size = self.chunk_size * 4
+        char_overlap = self.chunk_overlap * 4
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + char_chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - char_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
+class FixedSizeOverlapDocumentChunking(DocumentChunkingBase):
+    def __init__(self) -> None:
+        pass
+    @logger.trace_function(log_execution=log_execution, log_args=False, log_result=False)
+    def chunk(
+        self, documents: List[SourceDocument], chunking: ChunkingSettings
+    ) -> List[SourceDocument]:
+        full_document_content = "".join(
+            list(map(lambda document: document.content, documents))
+        )
+        try:
+            document_url = documents[0].source
+        except IndexError as e:
+            # If no documents are provided, set document_url to None
+            logger.error("No documents provided for chunking.")
+            logger.debug(e)
+            document_url = None
+        splitter = SimpleTokenSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
+        )
+        chunked_content_list = splitter.split_text(full_document_content)
+        # Create document for each chunk
+        documents = []
+        chunk_offset = 0
+        for idx, chunked_content in enumerate(chunked_content_list):
+            documents.append(
+                SourceDocument.from_metadata(
+                    content=chunked_content,
+                    document_url=document_url,
+                    metadata={"offset": chunk_offset},
+                    idx=idx,
+                )
+            )
+            chunk_offset += len(chunked_content)
+        return documents

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/layout.py RENAMED Viewed

@@ -1,6 +1,5 @@
 from typing import List
 from .document_chunking_base import DocumentChunkingBase
-from langchain.text_splitter import MarkdownTextSplitter
 from .chunking_strategy import ChunkingSettings
 from ..common.source_document import SourceDocument
 from ...utilities.helpers.env_helper import EnvHelper
@@ -11,6 +10,50 @@ log_execution = env_helper.LOG_EXECUTION
 log_args = env_helper.LOG_ARGS
 log_result = env_helper.LOG_RESULT
+class SimpleTextSplitter:
+    """Simple text splitter to replace LangChain's MarkdownTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks with overlap."""
+        if not text:
+            return []
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + self.chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - self.chunk_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
 class LayoutDocumentChunking(DocumentChunkingBase):
     def __init__(self) -> None:
         pass
@@ -29,10 +72,13 @@ class LayoutDocumentChunking(DocumentChunkingBase):
             logger.error("No documents provided for chunking.")
             logger.debug(e)
             document_url = None
-        splitter = MarkdownTextSplitter.from_tiktoken_encoder(
-            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
+        splitter = SimpleTextSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
         )
         chunked_content_list = splitter.split_text(full_document_content)
         # Create document for each chunk
         documents = []
         chunk_offset = 0

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/document_chunking/page.py RENAMED Viewed

@@ -1,6 +1,5 @@
 from typing import List
 from .document_chunking_base import DocumentChunkingBase
-from langchain.text_splitter import MarkdownTextSplitter
 from .chunking_strategy import ChunkingSettings
 from ..common.source_document import SourceDocument
 from ...utilities.helpers.env_helper import EnvHelper
@@ -10,6 +9,50 @@ log_execution = env_helper.LOG_EXECUTION
 log_args = env_helper.LOG_ARGS
 log_result = env_helper.LOG_RESULT
+class SimpleTextSplitter:
+    """Simple text splitter to replace LangChain's MarkdownTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks with overlap."""
+        if not text:
+            return []
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + self.chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - self.chunk_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
 class PageDocumentChunking(DocumentChunkingBase):
     def __init__(self) -> None:
         pass
@@ -25,8 +68,10 @@ class PageDocumentChunking(DocumentChunkingBase):
             logger.error("No documents provided for chunking.")
             logger.debug(e)
             document_url = None
-        splitter = MarkdownTextSplitter.from_tiktoken_encoder(
-            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
+        splitter = SimpleTextSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
         )
         documents_chunked = []
         for idx, document in enumerate(documents):

cwyodmodules-0.3.83/cwyodmodules/batch/utilities/document_loading/web.py ADDED Viewed

@@ -0,0 +1,85 @@
+from typing import List
+import re
+import requests
+from bs4 import BeautifulSoup
+from .document_loading_base import DocumentLoadingBase
+from ..common.source_document import SourceDocument
+class SimpleWebDocument:
+    """Simple document class to replace LangChain's Document."""
+    def __init__(self, page_content: str, metadata: dict):
+        self.page_content = page_content
+        self.metadata = metadata
+class SimpleWebLoader:
+    """Simple web loader to replace LangChain's WebBaseLoader."""
+    def __init__(self, url: str):
+        self.url = url
+    def load(self) -> List[SimpleWebDocument]:
+        """Load web content from URL."""
+        try:
+            # Fetch the webpage
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+            response = requests.get(self.url, headers=headers, timeout=30)
+            response.raise_for_status()
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+            # Get text content
+            text = soup.get_text()
+            # Clean up text
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = ' '.join(chunk for chunk in chunks if chunk)
+            return [SimpleWebDocument(
+                page_content=text,
+                metadata={"source": self.url}
+            )]
+        except Exception as e:
+            # Return empty content if loading fails
+            return [SimpleWebDocument(
+                page_content="",
+                metadata={"source": self.url, "error": str(e)}
+            )]
+class WebDocumentLoading(DocumentLoadingBase):
+    def __init__(self) -> None:
+        super().__init__()
+    def load(self, document_url: str) -> List[SourceDocument]:
+        loader = SimpleWebLoader(document_url)
+        documents = loader.load()
+        for document in documents:
+            document.page_content = re.sub("\n{3,}", "\n\n", document.page_content)
+            # Remove half non-ascii character from start/end of doc content
+            pattern = re.compile(
+                r"[\x00-\x1f\x7f\u0080-\u00a0\u2000-\u3000\ufff0-\uffff]"
+            )
+            document.page_content = re.sub(pattern, "", document.page_content)
+            if document.page_content == "":
+                documents.remove(document)
+        source_documents: List[SourceDocument] = [
+            SourceDocument(
+                content=document.page_content,
+                source=document.metadata["source"],
+            )
+            for document in documents
+        ]
+        return source_documents

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/azure_search_helper.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from azure.identity import ChainedTokenCredential, DefaultAzureCredential
 from typing import Union
-from langchain_community.vectorstores import AzureSearch
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
@@ -276,15 +276,6 @@ class AzureSearchHelper:
             ),
         ]
-        return AzureSearch(
-            azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE,
-            azure_search_key=(
-                self.env_helper.AZURE_SEARCH_KEY
-                if self.env_helper.is_auth_type_keys()
-                else None
-            ),
-            index_name=self.env_helper.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
-            embedding_function=self.llm_helper.get_embedding_model().embed_query,
-            fields=fields,
-            user_agent="langchain chatwithyourdata-sa",
-        )
+        # Return simple search client instead of LangChain AzureSearch
+        # This maintains compatibility while removing LangChain dependency
+        return self.search_client

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/config_helper.py RENAMED Viewed

@@ -7,8 +7,7 @@ from ..azure_blob_storage_client import AzureBlobStorageClient
 from ...document_chunking.chunking_strategy import ChunkingStrategy, ChunkingSettings
 from ...document_loading import LoadingSettings, LoadingStrategy
 from .embedding_config import EmbeddingConfig
-from ...orchestrator.orchestration_strategy import OrchestrationStrategy
-from ...orchestrator import OrchestrationSettings
 from ..env_helper import EnvHelper
 from .assistant_strategy import AssistantStrategy
 from .conversation_flow import ConversationFlow
@@ -43,12 +42,8 @@ class Config:
             for c in config["document_processors"]
         ]
         self.env_helper = EnvHelper()
-        self.default_orchestration_settings = {
-            "strategy": self.env_helper.ORCHESTRATION_STRATEGY
-        }
-        self.orchestrator = OrchestrationSettings(
-            config.get("orchestrator", self.default_orchestration_settings)
-        )
+        # Orchestrator is always semantic kernel now
+        # No configuration needed as there's only one option
         self.integrated_vectorization_config = (
             IntegratedVectorizationConfig(config["integrated_vectorization_config"])
             if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
@@ -93,7 +88,7 @@ class Config:
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
     def get_available_orchestration_strategies(self):
-        return [c.value for c in OrchestrationStrategy]
+        return ["semantic_kernel"]  # Only semantic kernel is supported now
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
     def get_available_ai_assistant_types(self):
@@ -271,7 +266,7 @@ class ConfigHelper:
             with open(config_file_path, encoding="utf-8") as f:
                 ConfigHelper._default_config = json.loads(
                     Template(f.read()).substitute(
-                        ORCHESTRATION_STRATEGY=env_helper.ORCHESTRATION_STRATEGY,
+                        ORCHESTRATION_STRATEGY="semantic_kernel",
                         LOG_USER_INTERACTIONS=(
                             False
                             if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/config/default.json RENAMED Viewed

@@ -139,9 +139,7 @@
     "log_user_interactions": "${LOG_USER_INTERACTIONS}",
     "log_tokens": "${LOG_TOKENS}"
   },
-  "orchestrator": {
-    "strategy": "${ORCHESTRATION_STRATEGY}"
-  },
   "enable_chat_history": true,
   "database_type": "${DATABASE_TYPE}"
 }

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/env_helper.py RENAMED Viewed

@@ -3,7 +3,7 @@ import os
 import threading
 # from dotenv import load_dotenv
-from ..orchestrator.orchestration_strategy import OrchestrationStrategy
 from ..helpers.config.conversation_flow import ConversationFlow
 from ..helpers.config.database_type import DatabaseType
@@ -130,10 +130,8 @@ class EnvHelper:
                 "USE_ADVANCED_IMAGE_PROCESSING", "False"
             )
             self.CONVERSATION_FLOW = os.getenv("CONVERSATION_FLOW", "custom")
-            # Orchestration Settings
-            self.ORCHESTRATION_STRATEGY = os.getenv(
-                "ORCHESTRATION_STRATEGY", "openai_function"
-            )
+            # Orchestration Settings - Always use semantic_kernel
+            self.ORCHESTRATION_STRATEGY = "semantic_kernel"
         # PostgreSQL configuration
         elif self.DATABASE_TYPE == DatabaseType.POSTGRESQL.value:
             self.AZURE_POSTGRES_SEARCH_TOP_K = 5
@@ -154,7 +152,7 @@ class EnvHelper:
             self.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = False
             self.USE_ADVANCED_IMAGE_PROCESSING = False
             self.CONVERSATION_FLOW = ConversationFlow.CUSTOM.value
-            self.ORCHESTRATION_STRATEGY = OrchestrationStrategy.SEMANTIC_KERNEL.value
+            self.ORCHESTRATION_STRATEGY = "semantic_kernel"
         else:
             raise ValueError(
                 "Unsupported DATABASE_TYPE. Please set DATABASE_TYPE to 'CosmosDB' or 'PostgreSQL'."

{cwyodmodules-0.3.80 → cwyodmodules-0.3.83}/cwyodmodules/batch/utilities/helpers/llm_helper.py RENAMED Viewed

@@ -1,7 +1,6 @@
 from openai import AzureOpenAI
 from typing import List, Union, cast
-from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+# Removed LangChain dependencies - using direct OpenAI SDK instead
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
 from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.azure_chat_prompt_execution_settings import (
     AzureChatPromptExecutionSettings,
@@ -49,68 +48,32 @@ class LLMHelper:
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_llm(self):
-        if self.auth_type_keys:
-            return AzureChatOpenAI(
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-            )
-        else:
-            return AzureChatOpenAI(
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return the OpenAI client directly instead of LangChain wrapper
+        return self.openai_client
-    # TODO: This needs to have a custom callback to stream back to the UI
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_streaming_llm(self):
-        if self.auth_type_keys:
-            return AzureChatOpenAI(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                streaming=True,
-                callbacks=[StreamingStdOutCallbackHandler],
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-            )
-        else:
-            return AzureChatOpenAI(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                streaming=True,
-                callbacks=[StreamingStdOutCallbackHandler],
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return the OpenAI client directly - streaming is handled via stream=True parameter
+        return self.openai_client
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_embedding_model(self):
-        if self.auth_type_keys:
-            return AzureOpenAIEmbeddings(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                azure_deployment=self.embedding_model,
-                chunk_size=1,
-            )
-        else:
-            return AzureOpenAIEmbeddings(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                azure_deployment=self.embedding_model,
-                chunk_size=1,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return a simple embedding model wrapper that uses the OpenAI client directly
+        class EmbeddingModel:
+            def __init__(self, openai_client, embedding_model):
+                self.openai_client = openai_client
+                self.embedding_model = embedding_model
+            def embed_query(self, text: str) -> List[float]:
+                return (
+                    self.openai_client.embeddings.create(
+                        input=[text], model=self.embedding_model
+                    )
+                    .data[0]
+                    .embedding
+                )
+        return EmbeddingModel(self.openai_client, self.embedding_model)
     @logger.trace_function(log_execution=log_execution, log_args=False, log_result=False)
     def generate_embeddings(self, input: Union[str, list[int]]) -> List[float]:

cwyodmodules-0.3.83/cwyodmodules/batch/utilities/helpers/orchestrator_helper.py ADDED Viewed

@@ -0,0 +1,21 @@
+from typing import List
+from ..orchestrator.semantic_kernel_orchestrator import SemanticKernelOrchestrator
+__all__ = ["Orchestrator"]
+class Orchestrator:
+    def __init__(self) -> None:
+        self.orchestrator = SemanticKernelOrchestrator()
+    async def handle_message(
+        self,
+        user_message: str,
+        chat_history: List[dict],
+        conversation_id: str,
+        user_info,
+        **kwargs: dict,
+    ) -> dict:
+        return await self.orchestrator.handle_message(
+            user_message, chat_history, conversation_id, user_info, **kwargs
+        )

cwyodmodules-0.3.83/cwyodmodules/batch/utilities/orchestrator/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .semantic_kernel_orchestrator import SemanticKernelOrchestrator
+__all__ = ["SemanticKernelOrchestrator"]

cwyodmodules 0.3.80__tar.gz → 0.3.83__tar.gz

cwyodmodules 0.3.80__tar.gz → 0.3.83__tar.gz