PyPI - cwyodmodules - Versions diffs - 0.3.80__py3-none-any.whl → 0.3.83__py3-none-any.whl - Mend

cwyodmodules 0.3.80__py3-none-any.whl → 0.3.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

cwyodmodules/batch/utilities/document_chunking/fixed_size_overlap.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from typing import List
 from .document_chunking_base import DocumentChunkingBase
-from langchain.text_splitter import TokenTextSplitter
 from .chunking_strategy import ChunkingSettings
 from ..common.source_document import SourceDocument
 from ...utilities.helpers.env_helper import EnvHelper
@@ -10,6 +9,54 @@ log_execution = env_helper.LOG_EXECUTION
 log_args = env_helper.LOG_ARGS
 log_result = env_helper.LOG_RESULT
+class SimpleTokenSplitter:
+    """Simple token-based text splitter to replace LangChain's TokenTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks based on approximate token count."""
+        if not text:
+            return []
+        # Rough approximation: 1 token ≈ 4 characters
+        char_chunk_size = self.chunk_size * 4
+        char_overlap = self.chunk_overlap * 4
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + char_chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - char_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
 class FixedSizeOverlapDocumentChunking(DocumentChunkingBase):
     def __init__(self) -> None:
         pass
@@ -28,10 +75,13 @@ class FixedSizeOverlapDocumentChunking(DocumentChunkingBase):
             logger.error("No documents provided for chunking.")
             logger.debug(e)
             document_url = None
-        splitter = TokenTextSplitter.from_tiktoken_encoder(
-            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
+        splitter = SimpleTokenSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
         )
         chunked_content_list = splitter.split_text(full_document_content)
         # Create document for each chunk
         documents = []
         chunk_offset = 0

cwyodmodules/batch/utilities/document_chunking/layout.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from typing import List
 from .document_chunking_base import DocumentChunkingBase
-from langchain.text_splitter import MarkdownTextSplitter
 from .chunking_strategy import ChunkingSettings
 from ..common.source_document import SourceDocument
 from ...utilities.helpers.env_helper import EnvHelper
@@ -11,6 +10,50 @@ log_execution = env_helper.LOG_EXECUTION
 log_args = env_helper.LOG_ARGS
 log_result = env_helper.LOG_RESULT
+class SimpleTextSplitter:
+    """Simple text splitter to replace LangChain's MarkdownTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks with overlap."""
+        if not text:
+            return []
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + self.chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - self.chunk_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
 class LayoutDocumentChunking(DocumentChunkingBase):
     def __init__(self) -> None:
         pass
@@ -29,10 +72,13 @@ class LayoutDocumentChunking(DocumentChunkingBase):
             logger.error("No documents provided for chunking.")
             logger.debug(e)
             document_url = None
-        splitter = MarkdownTextSplitter.from_tiktoken_encoder(
-            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
+        splitter = SimpleTextSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
         )
         chunked_content_list = splitter.split_text(full_document_content)
         # Create document for each chunk
         documents = []
         chunk_offset = 0

cwyodmodules/batch/utilities/document_chunking/page.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from typing import List
 from .document_chunking_base import DocumentChunkingBase
-from langchain.text_splitter import MarkdownTextSplitter
 from .chunking_strategy import ChunkingSettings
 from ..common.source_document import SourceDocument
 from ...utilities.helpers.env_helper import EnvHelper
@@ -10,6 +9,50 @@ log_execution = env_helper.LOG_EXECUTION
 log_args = env_helper.LOG_ARGS
 log_result = env_helper.LOG_RESULT
+class SimpleTextSplitter:
+    """Simple text splitter to replace LangChain's MarkdownTextSplitter."""
+    def __init__(self, chunk_size: int, chunk_overlap: int):
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks with overlap."""
+        if not text:
+            return []
+        chunks = []
+        start = 0
+        while start < len(text):
+            # Calculate end position
+            end = start + self.chunk_size
+            # If this is not the last chunk, try to find a good break point
+            if end < len(text):
+                # Look for sentence endings, then paragraph breaks, then word boundaries
+                for break_char in ['. ', '.\n', '\n\n', '\n', ' ']:
+                    break_pos = text.rfind(break_char, start, end)
+                    if break_pos > start:
+                        end = break_pos + len(break_char)
+                        break
+            # Extract chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+            # Move start position (with overlap)
+            start = max(start + 1, end - self.chunk_overlap)
+            # Prevent infinite loop
+            if start >= len(text):
+                break
+        return chunks
 class PageDocumentChunking(DocumentChunkingBase):
     def __init__(self) -> None:
         pass
@@ -25,8 +68,10 @@ class PageDocumentChunking(DocumentChunkingBase):
             logger.error("No documents provided for chunking.")
             logger.debug(e)
             document_url = None
-        splitter = MarkdownTextSplitter.from_tiktoken_encoder(
-            chunk_size=chunking.chunk_size, chunk_overlap=chunking.chunk_overlap
+        splitter = SimpleTextSplitter(
+            chunk_size=chunking.chunk_size,
+            chunk_overlap=chunking.chunk_overlap
         )
         documents_chunked = []
         for idx, document in enumerate(documents):

cwyodmodules/batch/utilities/document_loading/web.py CHANGED Viewed

@@ -1,16 +1,70 @@
 from typing import List
 import re
-from langchain_community.document_loaders import WebBaseLoader
+import requests
+from bs4 import BeautifulSoup
 from .document_loading_base import DocumentLoadingBase
 from ..common.source_document import SourceDocument
+class SimpleWebDocument:
+    """Simple document class to replace LangChain's Document."""
+    def __init__(self, page_content: str, metadata: dict):
+        self.page_content = page_content
+        self.metadata = metadata
+class SimpleWebLoader:
+    """Simple web loader to replace LangChain's WebBaseLoader."""
+    def __init__(self, url: str):
+        self.url = url
+    def load(self) -> List[SimpleWebDocument]:
+        """Load web content from URL."""
+        try:
+            # Fetch the webpage
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+            response = requests.get(self.url, headers=headers, timeout=30)
+            response.raise_for_status()
+            # Parse HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+            # Get text content
+            text = soup.get_text()
+            # Clean up text
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            text = ' '.join(chunk for chunk in chunks if chunk)
+            return [SimpleWebDocument(
+                page_content=text,
+                metadata={"source": self.url}
+            )]
+        except Exception as e:
+            # Return empty content if loading fails
+            return [SimpleWebDocument(
+                page_content="",
+                metadata={"source": self.url, "error": str(e)}
+            )]
 class WebDocumentLoading(DocumentLoadingBase):
     def __init__(self) -> None:
         super().__init__()
     def load(self, document_url: str) -> List[SourceDocument]:
-        documents = WebBaseLoader(document_url).load()
+        loader = SimpleWebLoader(document_url)
+        documents = loader.load()
         for document in documents:
             document.page_content = re.sub("\n{3,}", "\n\n", document.page_content)
             # Remove half non-ascii character from start/end of doc content
@@ -20,6 +74,7 @@ class WebDocumentLoading(DocumentLoadingBase):
             document.page_content = re.sub(pattern, "", document.page_content)
             if document.page_content == "":
                 documents.remove(document)
         source_documents: List[SourceDocument] = [
             SourceDocument(
                 content=document.page_content,

cwyodmodules/batch/utilities/helpers/azure_search_helper.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from azure.identity import ChainedTokenCredential, DefaultAzureCredential
 from typing import Union
-from langchain_community.vectorstores import AzureSearch
 from azure.core.credentials import AzureKeyCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
@@ -276,15 +276,6 @@ class AzureSearchHelper:
             ),
         ]
-        return AzureSearch(
-            azure_search_endpoint=self.env_helper.AZURE_SEARCH_SERVICE,
-            azure_search_key=(
-                self.env_helper.AZURE_SEARCH_KEY
-                if self.env_helper.is_auth_type_keys()
-                else None
-            ),
-            index_name=self.env_helper.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX,
-            embedding_function=self.llm_helper.get_embedding_model().embed_query,
-            fields=fields,
-            user_agent="langchain chatwithyourdata-sa",
-        )
+        # Return simple search client instead of LangChain AzureSearch
+        # This maintains compatibility while removing LangChain dependency
+        return self.search_client

cwyodmodules/batch/utilities/helpers/config/config_helper.py CHANGED Viewed

@@ -7,8 +7,7 @@ from ..azure_blob_storage_client import AzureBlobStorageClient
 from ...document_chunking.chunking_strategy import ChunkingStrategy, ChunkingSettings
 from ...document_loading import LoadingSettings, LoadingStrategy
 from .embedding_config import EmbeddingConfig
-from ...orchestrator.orchestration_strategy import OrchestrationStrategy
-from ...orchestrator import OrchestrationSettings
 from ..env_helper import EnvHelper
 from .assistant_strategy import AssistantStrategy
 from .conversation_flow import ConversationFlow
@@ -43,12 +42,8 @@ class Config:
             for c in config["document_processors"]
         ]
         self.env_helper = EnvHelper()
-        self.default_orchestration_settings = {
-            "strategy": self.env_helper.ORCHESTRATION_STRATEGY
-        }
-        self.orchestrator = OrchestrationSettings(
-            config.get("orchestrator", self.default_orchestration_settings)
-        )
+        # Orchestrator is always semantic kernel now
+        # No configuration needed as there's only one option
         self.integrated_vectorization_config = (
             IntegratedVectorizationConfig(config["integrated_vectorization_config"])
             if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
@@ -93,7 +88,7 @@ class Config:
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
     def get_available_orchestration_strategies(self):
-        return [c.value for c in OrchestrationStrategy]
+        return ["semantic_kernel"]  # Only semantic kernel is supported now
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=log_result)
     def get_available_ai_assistant_types(self):
@@ -271,7 +266,7 @@ class ConfigHelper:
             with open(config_file_path, encoding="utf-8") as f:
                 ConfigHelper._default_config = json.loads(
                     Template(f.read()).substitute(
-                        ORCHESTRATION_STRATEGY=env_helper.ORCHESTRATION_STRATEGY,
+                        ORCHESTRATION_STRATEGY="semantic_kernel",
                         LOG_USER_INTERACTIONS=(
                             False
                             if env_helper.DATABASE_TYPE == DatabaseType.POSTGRESQL.value

cwyodmodules/batch/utilities/helpers/config/default.json CHANGED Viewed

@@ -139,9 +139,7 @@
     "log_user_interactions": "${LOG_USER_INTERACTIONS}",
     "log_tokens": "${LOG_TOKENS}"
   },
-  "orchestrator": {
-    "strategy": "${ORCHESTRATION_STRATEGY}"
-  },
   "enable_chat_history": true,
   "database_type": "${DATABASE_TYPE}"
 }

cwyodmodules/batch/utilities/helpers/env_helper.py CHANGED Viewed

@@ -3,7 +3,7 @@ import os
 import threading
 # from dotenv import load_dotenv
-from ..orchestrator.orchestration_strategy import OrchestrationStrategy
 from ..helpers.config.conversation_flow import ConversationFlow
 from ..helpers.config.database_type import DatabaseType
@@ -130,10 +130,8 @@ class EnvHelper:
                 "USE_ADVANCED_IMAGE_PROCESSING", "False"
             )
             self.CONVERSATION_FLOW = os.getenv("CONVERSATION_FLOW", "custom")
-            # Orchestration Settings
-            self.ORCHESTRATION_STRATEGY = os.getenv(
-                "ORCHESTRATION_STRATEGY", "openai_function"
-            )
+            # Orchestration Settings - Always use semantic_kernel
+            self.ORCHESTRATION_STRATEGY = "semantic_kernel"
         # PostgreSQL configuration
         elif self.DATABASE_TYPE == DatabaseType.POSTGRESQL.value:
             self.AZURE_POSTGRES_SEARCH_TOP_K = 5
@@ -154,7 +152,7 @@ class EnvHelper:
             self.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = False
             self.USE_ADVANCED_IMAGE_PROCESSING = False
             self.CONVERSATION_FLOW = ConversationFlow.CUSTOM.value
-            self.ORCHESTRATION_STRATEGY = OrchestrationStrategy.SEMANTIC_KERNEL.value
+            self.ORCHESTRATION_STRATEGY = "semantic_kernel"
         else:
             raise ValueError(
                 "Unsupported DATABASE_TYPE. Please set DATABASE_TYPE to 'CosmosDB' or 'PostgreSQL'."

cwyodmodules/batch/utilities/helpers/llm_helper.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from openai import AzureOpenAI
 from typing import List, Union, cast
-from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
-from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+# Removed LangChain dependencies - using direct OpenAI SDK instead
 from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
 from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.azure_chat_prompt_execution_settings import (
     AzureChatPromptExecutionSettings,
@@ -49,68 +48,32 @@ class LLMHelper:
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_llm(self):
-        if self.auth_type_keys:
-            return AzureChatOpenAI(
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-            )
-        else:
-            return AzureChatOpenAI(
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return the OpenAI client directly instead of LangChain wrapper
+        return self.openai_client
-    # TODO: This needs to have a custom callback to stream back to the UI
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_streaming_llm(self):
-        if self.auth_type_keys:
-            return AzureChatOpenAI(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                streaming=True,
-                callbacks=[StreamingStdOutCallbackHandler],
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-            )
-        else:
-            return AzureChatOpenAI(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                streaming=True,
-                callbacks=[StreamingStdOutCallbackHandler],
-                deployment_name=self.llm_model,
-                temperature=0,
-                max_tokens=self.llm_max_tokens,
-                openai_api_version=self.openai_client._api_version,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return the OpenAI client directly - streaming is handled via stream=True parameter
+        return self.openai_client
     @logger.trace_function(log_execution=log_execution, log_args=log_args, log_result=False)
     def get_embedding_model(self):
-        if self.auth_type_keys:
-            return AzureOpenAIEmbeddings(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                api_key=self.env_helper.OPENAI_API_KEY,
-                azure_deployment=self.embedding_model,
-                chunk_size=1,
-            )
-        else:
-            return AzureOpenAIEmbeddings(
-                azure_endpoint=self.env_helper.AZURE_OPENAI_ENDPOINT,
-                azure_deployment=self.embedding_model,
-                chunk_size=1,
-                azure_ad_token_provider=self.token_provider,
-            )
+        # Return a simple embedding model wrapper that uses the OpenAI client directly
+        class EmbeddingModel:
+            def __init__(self, openai_client, embedding_model):
+                self.openai_client = openai_client
+                self.embedding_model = embedding_model
+            def embed_query(self, text: str) -> List[float]:
+                return (
+                    self.openai_client.embeddings.create(
+                        input=[text], model=self.embedding_model
+                    )
+                    .data[0]
+                    .embedding
+                )
+        return EmbeddingModel(self.openai_client, self.embedding_model)
     @logger.trace_function(log_execution=log_execution, log_args=False, log_result=False)
     def generate_embeddings(self, input: Union[str, list[int]]) -> List[float]:

cwyodmodules/batch/utilities/helpers/orchestrator_helper.py CHANGED Viewed

@@ -1,15 +1,12 @@
 from typing import List
+from ..orchestrator.semantic_kernel_orchestrator import SemanticKernelOrchestrator
-from ..orchestrator.orchestration_strategy import OrchestrationStrategy
-from ..orchestrator import OrchestrationSettings
-from ..orchestrator.strategies import get_orchestrator
-__all__ = ["OrchestrationStrategy"]
+__all__ = ["Orchestrator"]
 class Orchestrator:
     def __init__(self) -> None:
-        pass
+        self.orchestrator = SemanticKernelOrchestrator()
     async def handle_message(
         self,
@@ -17,14 +14,8 @@ class Orchestrator:
         chat_history: List[dict],
         conversation_id: str,
         user_info,
-        orchestrator: OrchestrationSettings,
         **kwargs: dict,
     ) -> dict:
-        orchestrator = get_orchestrator(orchestrator.strategy.value)
-        if orchestrator is None:
-            raise Exception(
-                f"Unknown orchestration strategy: {orchestrator.strategy.value}"
-            )
-        return await orchestrator.handle_message(
-            user_message, chat_history, conversation_id, user_info
+        return await self.orchestrator.handle_message(
+            user_message, chat_history, conversation_id, user_info, **kwargs
         )

cwyodmodules/batch/utilities/orchestrator/__init__.py CHANGED Viewed

@@ -1,18 +1,3 @@
-import os
-from typing import List
-import os.path
-import pkgutil
-from .orchestration_strategy import OrchestrationStrategy
+from .semantic_kernel_orchestrator import SemanticKernelOrchestrator
-class OrchestrationSettings:
-    def __init__(self, orchestration: dict):
-        self.strategy = OrchestrationStrategy(orchestration["strategy"])
-# Get a list of all the classes defined in the module
-def get_all_classes() -> List[str]:
-    return [name for _, name, _ in pkgutil.iter_modules([os.path.dirname(__file__)])]
-__all__ = get_all_classes()
+__all__ = ["SemanticKernelOrchestrator"]

cwyodmodules 0.3.80__py3-none-any.whl → 0.3.83__py3-none-any.whl

cwyodmodules 0.3.80__py3-none-any.whl → 0.3.83__py3-none-any.whl