wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -2
- wizit_context_ingestor/application/context_chunk_service.py +149 -35
- wizit_context_ingestor/application/interfaces.py +1 -1
- wizit_context_ingestor/application/transcription_service.py +132 -49
- wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor/data/prompts.py +156 -2
- wizit_context_ingestor/data/storage.py +10 -0
- wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
- wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor/main.py +160 -105
- wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
- wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
- wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/infra/rag/redis_embeddings.py
CHANGED

@@ -2,6 +2,7 @@ from langchain_core.documents import Document
 from langchain_redis import RedisConfig, RedisVectorStore
 from typing import List
 import logging
+
 # from dotenv import load_dotenv
 from ...application.interfaces import EmbeddingsManager
 
@@ -9,10 +10,13 @@ from ...application.interfaces import EmbeddingsManager
 
 logger = logging.getLogger(__name__)
 
-class RedisEmbeddingsManager(EmbeddingsManager):
 
+class RedisEmbeddingsManager(EmbeddingsManager):
     __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
-
+
+    def __init__(
+        self, embeddings_model, redis_conn_string: str, metadata_tags: List[str] = []
+    ):
         """
         Initialize the RedisEmbeddingsManager.
         Args:
@@ -27,27 +31,23 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         """
        self.redis_conn_string = redis_conn_string
        self.embeddings_model = embeddings_model
-        self.metadata_tags_schema = []
-
+        self.metadata_tags_schema = [{"type": "text", "name": "context"}]
        for tag_key in metadata_tags:
-            self.metadata_tags_schema.append({
-                "type": "tag",
-                "name": tag_key
-            })
+            self.metadata_tags_schema.append({"type": "text", "name": tag_key})
 
        try:
-
-
-
-
-
-
-
-
-
+            self.redis_config = RedisConfig(
+                index_name="vector_store",
+                redis_url=self.redis_conn_string,
+                metadata_schema=self.metadata_tags_schema,
+            )
+            self.vector_store = RedisVectorStore(
+                self.embeddings_model, config=self.redis_config
+            )
+            logger.info("RedisEmbeddingsManager initialized")
        except Exception as e:
-
-
+            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
+            raise
 
    def configure_vector_store(
        self,
@@ -56,7 +56,7 @@ class RedisEmbeddingsManager(EmbeddingsManager):
        content_column: str = "document",
        id_column: str = "id",
        metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
+        pg_record_manager: str = "postgres/langchain_pg_collection",
    ):
        """Configure the vector store."""
        pass
@@ -73,13 +73,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
 
    def vector_store_initialized(func):
        """validate vector store initialization"""
+
        def wrapper(self, *args, **kwargs):
-
-
-
-
-            return wrapper
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            return func(self, *args, **kwargs)
 
+        return wrapper
 
    @vector_store_initialized
    def index_documents(self, docs: List[Document]):
@@ -129,6 +130,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
            logger.error(f"Error deleting documents by ID: {str(e)}")
            raise
 
+    @vector_store_initialized
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        # TODO investigate how to do this
+        pass
+
    def get_documents_keys_by_source_id(self, source_id: str):
        """Get documents keys by source ID."""
        pass
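The fleshed-out `vector_store_initialized` decorator above guards every indexing method behind a single check. A minimal, self-contained sketch of the same guard pattern; the `FakeManager` class and the `functools.wraps` call are illustrative additions, not part of the package:

```python
from functools import wraps


def vector_store_initialized(func):
    """Refuse to run the wrapped method until self.vector_store exists."""
    @wraps(func)  # keeps the wrapped method's name/docstring; the diff omits this
    def wrapper(self, *args, **kwargs):
        if self.vector_store is None:
            raise Exception("Vector store not initialized")
        return func(self, *args, **kwargs)
    return wrapper


class FakeManager:  # hypothetical stand-in for RedisEmbeddingsManager
    def __init__(self, vector_store=None):
        self.vector_store = vector_store

    @vector_store_initialized
    def index_documents(self, docs):
        return len(docs)


print(FakeManager(vector_store=object()).index_documents(["doc"]))  # 1
# FakeManager().index_documents(["doc"])  # raises: Vector store not initialized
```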
wizit_context_ingestor/infra/rag/semantic_chunks.py
CHANGED

@@ -1,3 +1,5 @@
+from posix import fork
+
 # check this documentation
 # https://python.langchain.com/docs/how_to/semantic-chunker/
 # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
@@ -16,7 +18,9 @@ class SemanticChunks(RagChunker):
    Class for semantically chunking documents into smaller pieces based on semantic similarity.
    Uses LangChain's SemanticChunker to create semantically coherent document chunks.
    """
+
    __slots__ = ("embeddings_model",)
+
    def __init__(self, embeddings_model: Any):
        """
        Initialize a document chunker with an embeddings model.
@@ -35,7 +39,7 @@ class SemanticChunks(RagChunker):
            add_start_index=True,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=95,
-            min_chunk_size=200
+            min_chunk_size=200,
        )
 
    def gen_chunks_for_document(self, document: Document) -> List[Document]:
@@ -53,6 +57,10 @@ class SemanticChunks(RagChunker):
        """
        try:
            chunks = self.text_splitter.split_documents([document])
+            source = document.metadata["source"]
+            for i, chunk in enumerate(chunks):
+                if document.metadata["source"]:
+                    chunk.id = f"{source}-{i}"
            logger.info(f"{len(chunks)} chunks generated successfully")
            return chunks
        except Exception as e:
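`gen_chunks_for_document` now assigns each chunk a deterministic ID of the form `<source>-<position>`, which lets a vector store overwrite rather than duplicate chunks when the same document is re-ingested. A small sketch of the scheme, assuming LangChain `Document` objects; the `assign_chunk_ids` helper name is mine:

```python
from langchain_core.documents import Document


def assign_chunk_ids(document: Document, chunks: list) -> list:
    """Give every chunk a stable '<source>-<index>' ID, mirroring the diff."""
    source = document.metadata["source"]  # raises KeyError if 'source' is absent
    for i, chunk in enumerate(chunks):
        if document.metadata["source"]:  # skipped when source is an empty string
            chunk.id = f"{source}-{i}"
    return chunks


doc = Document(page_content="full text", metadata={"source": "manual.pdf"})
chunks = [Document(page_content="part a"), Document(page_content="part b")]
print([c.id for c in assign_chunk_ids(doc, chunks)])  # ['manual.pdf-0', 'manual.pdf-1']
```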
wizit_context_ingestor/infra/vertex_model.py
CHANGED

@@ -15,14 +15,23 @@ class VertexModels(AiApplicationService):
    A wrapper class for Google Cloud Vertex AI models that handles credentials and
    provides methods to load embeddings and chat models.
    """
-
+
+    __slots__ = (
+        "project_id",
+        "location",
+        "json_service_account",
+        "scopes",
+        "llm_model_id",
+    )
+
    def __init__(
-
-
-
-
-
-
+        self,
+        project_id: str,
+        location: str,
+        json_service_account: Dict[str, Any],
+        scopes: Optional[List[str]] = None,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+    ):
        """
        Initialize the VertexModels class with Google Cloud credentials.
 
@@ -36,25 +45,24 @@ class VertexModels(AiApplicationService):
            print(location)
            self.scopes = scopes or ["https://www.googleapis.com/auth/cloud-platform"]
            self.credentials = service_account.Credentials.from_service_account_info(
-                json_service_account,
-                scopes=self.scopes
+                json_service_account, scopes=self.scopes
            )
            self.llm_model_id = llm_model_id
            self.project_id = project_id
            self.location = location
            vertexai_init(
-                project=project_id,
-                location=location,
-                credentials=self.credentials
+                project=project_id, location=location, credentials=self.credentials
+            )
+            logger.info(
+                f"VertexModels initialized with project {project_id} in {location}"
            )
-            logger.info(f"VertexModels initialized with project {project_id} in {location}")
        except Exception as e:
            logger.error(f"Failed to initialize VertexModels: {str(e)}")
            raise
 
    def load_embeddings_model(
-        self,
-
+        self, embeddings_model_id: str = "text-multilingual-embedding-002"
+    ) -> VertexAIEmbeddings:  # noqa: E125
        """
        Load and return a Vertex AI embeddings model.
        default embeddings length is 768 https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings
@@ -73,14 +81,18 @@ class VertexModels(AiApplicationService):
            logger.debug(f"Loaded embedding model: {embeddings_model_id}")
            return embeddings
        except Exception as e:
-            logger.error(
+            logger.error(
+                f"Failed to load embeddings model {embeddings_model_id}: {str(e)}"
+            )
            raise
 
-    def load_chat_model(
+    def load_chat_model(
+        self,
        temperature: float = 0.15,
        max_tokens: int = 8192,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> Union[ChatVertexAI, ChatAnthropicVertex]:
        """
        Load a Vertex AI chat model for text generation.
 
@@ -98,21 +110,35 @@ class VertexModels(AiApplicationService):
        """
        try:
            if "gemini" in self.llm_model_id:
-                return self.load_chat_model_gemini(
+                return self.load_chat_model_gemini(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
            elif "claude" in self.llm_model_id:
-                return self.load_chat_model_anthropic(
+                return self.load_chat_model_anthropic(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
            else:
                raise ValueError(f"Unsupported chat model: {self.llm_model_id}")
        except Exception as e:
            logger.error(f"Failed to retrieve chat model {self.llm_model_id}: {str(e)}")
            raise
 
-    def load_chat_model_gemini(
+    def load_chat_model_gemini(
+        self,
        chat_model_id: str = "publishers/google/models/gemini-2.5-flash",
        temperature: float = 0.15,
-        max_tokens: int =
+        max_tokens: int = 64000,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> ChatVertexAI:
        """
        Load a Vertex AI chat model for text generation.
 
@@ -137,7 +163,7 @@ class VertexModels(AiApplicationService):
            max_tokens=max_tokens,
            max_retries=1,
            stop=stop,
-            **chat_model_params
+            **chat_model_params,
        )
        logger.debug(f"Retrieved chat model: {chat_model_id}")
        return self.llm_model
@@ -145,12 +171,14 @@ class VertexModels(AiApplicationService):
        logger.error(f"Failed to retrieve chat model {chat_model_id}: {str(e)}")
        raise
 
-    def load_chat_model_anthropic(
+    def load_chat_model_anthropic(
+        self,
        chat_model_id: str = "claude-3-5-haiku@20241022",
        temperature: float = 0.7,
-        max_tokens: int =
+        max_tokens: int = 64000,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> ChatAnthropicVertex:
        """
        Load a Vertex AI chat model for text generation.
        """
@@ -163,7 +191,7 @@ class VertexModels(AiApplicationService):
            max_tokens=max_tokens,
            max_retries=1,
            stop=stop,
-            **chat_model_params
+            **chat_model_params,
        )
        logger.debug(f"Retrieved chat model: {chat_model_id}")
        return self.llm_model
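`load_chat_model` picks the provider by substring-matching `llm_model_id` ("gemini" routes to `ChatVertexAI`, "claude" to `ChatAnthropicVertex`), and the provider-specific loaders now default to `max_tokens=64000`. The routing logic in isolation, with tuples standing in for the real model clients:

```python
from typing import List, Optional


def load_chat_model(llm_model_id: str, temperature: float = 0.15,
                    max_tokens: int = 8192, stop: Optional[List[str]] = None):
    """Dispatch on the model ID, mirroring the diff; tuples replace real clients."""
    if "gemini" in llm_model_id:
        return ("ChatVertexAI", llm_model_id, temperature, max_tokens, stop)
    elif "claude" in llm_model_id:
        return ("ChatAnthropicVertex", llm_model_id, temperature, max_tokens, stop)
    else:
        raise ValueError(f"Unsupported chat model: {llm_model_id}")


print(load_chat_model("claude-sonnet-4@20250514")[0])                   # ChatAnthropicVertex
print(load_chat_model("publishers/google/models/gemini-2.5-flash")[0])  # ChatVertexAI
```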
wizit_context_ingestor/main.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from typing import Dict, Any, Literal
 from .infra.vertex_model import VertexModels
 from .application.transcription_service import TranscriptionService
 from .application.context_chunk_service import ContextChunksInDocumentService
@@ -6,16 +7,76 @@ from .infra.persistence.s3_storage import S3StorageService
 from .infra.persistence.local_storage import LocalStorageService
 from .infra.rag.semantic_chunks import SemanticChunks
 from .infra.rag.redis_embeddings import RedisEmbeddingsManager
+from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
 from .infra.secrets.aws_secrets_manager import AwsSecretsManager
+from .data.storage import storage_services, StorageServices
+from .data.kdb import kdb_services, KdbServices
+from .utils.file_utils import has_invalid_file_name_format
 
-class DeelabTranscribeManager:
 
-
+class KdbManager:
+    def __init__(
+        self, embeddings_model, kdb_service: kdb_services, kdb_params: Dict[Any, Any]
+    ):
+        self.kdb_service = kdb_service
+        self.kdb_params = kdb_params
+        self.embeddings_model = embeddings_model
+
+    def retrieve_kdb_service(self):
+        if self.kdb_service == KdbServices.REDIS.value:
+            return RedisEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        elif self.kdb_service == KdbServices.CHROMA.value:
+            return ChromaEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        else:
+            raise ValueError(f"Unsupported kdb provider: {self.kdb_service}")
+
+
+class PersistenceManager:
+    def __init__(
+        self,
+        storage_service: storage_services,
+        source_storage_route,
+        target_storage_route,
+    ):
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+
+    def retrieve_storage_service(self):
+        if self.storage_service == StorageServices.S3.value:
+            return S3StorageService(
+                origin_bucket_name=self.source_storage_route,
+                target_bucket_name=self.target_storage_route,
+            )
+        elif self.storage_service == StorageServices.LOCAL.value:
+            return LocalStorageService(
+                source_storage_route=self.source_storage_route,
+                target_storage_route=self.target_storage_route,
+            )
+        else:
+            raise ValueError(f"Unsupported storage service: {self.storage_service}")
+
+
+class TranscriptionManager:
+    def __init__(
+        self,
        gcp_project_id: str,
        gcp_project_location: str,
        gcp_secret_name: str,
-
-
+        storage_service: storage_services,
+        source_storage_route: str,
+        target_storage_route: str,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+        target_language: str = "es",
+        transcription_additional_instructions: str = "",
+        transcription_accuracy_threshold: int = 90,
+        max_transcription_retries: int = 2,
    ):
        self.gcp_project_id = gcp_project_id
        self.gcp_project_location = gcp_project_location
@@ -23,6 +84,14 @@ class DeelabTranscribeManager:
        self.gcp_secret_name = gcp_secret_name
        self.llm_model_id = llm_model_id
        self.target_language = target_language
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.transcription_additional_instructions = (
+            transcription_additional_instructions
+        )
+        self.transcription_accuracy_threshold = transcription_accuracy_threshold
+        self.max_transcription_retries = max_transcription_retries
        self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
        self.vertex_model = self._get_vertex_model()
 
@@ -36,50 +105,78 @@ class DeelabTranscribeManager:
            self.gcp_project_id,
            self.gcp_project_location,
            self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
        )
        return vertex_model
 
-    def
-
-
-
-
-
+    def transcribe_document(self, file_key: str):
+        """Transcribe a document from source storage to target storage.
+        This method serves as a generic interface for transcribing documents from
+        various storage sources to target destinations. The specific implementation
+        depends on the storage route types provided.
+
+        Args:
+            file_key (str): The unique identifier or path of the file to be transcribed.
+        Returns:
+            The result of the transcription process, typically the path or identifier
+            of the transcribed document.
+
+        Raises:
+            Exception: If an error occurs during the transcription process.
+        """
        try:
-
-
-
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service,
+                self.source_storage_route,
+                self.target_storage_route,
            )
+            persistence_service = persistence_layer.retrieve_storage_service()
 
            transcribe_document_service = TranscriptionService(
                ai_application_service=self.vertex_model,
-                persistence_service=
-                target_language=self.target_language
+                persistence_service=persistence_service,
+                target_language=self.target_language,
+                transcription_additional_instructions=self.transcription_additional_instructions,
+                transcription_accuracy_threshold=self.transcription_accuracy_threshold,
+                max_transcription_retries=self.max_transcription_retries,
+            )
+            parsed_pages, parsed_document = (
+                transcribe_document_service.process_document(file_key)
+            )
+            source_storage_file_tags = {}
+            if persistence_service.supports_tagging:
+                # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
+                source_storage_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, self.source_storage_route
+                )
+            transcribe_document_service.save_parsed_document(
+                f"{file_key}.md", parsed_document, source_storage_file_tags
            )
-            parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
-            origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)
-            transcribe_document_service.save_parsed_document(f"{file_key}.md", parsed_document, origin_bucket_file_tags)
            # create md document from parsed_pages
            print("parsed_pages", len(parsed_pages))
            # print("parsed_document", parsed_document)
            return f"{file_key}.md"
        except Exception as e:
-            print(f"Error
+            print(f"Error processing document: {e}")
            raise e
 
 
-class
-
+class ChunksManager:
    def __init__(
-
-
-
-
-
-
-
-
+        self,
+        gcp_project_id: str,
+        gcp_project_location: str,
+        gcp_secret_name: str,
+        storage_service: storage_services,
+        kdb_service: Literal["redis", "chroma"],
+        kdb_params: Dict[Any, Any],
+        llm_model_id: str = "claude-3-5-haiku@20241022",
+        embeddings_model_id: str = "text-multilingual-embedding-002",
+        target_language: str = "es",
    ):
        self.gcp_project_id = gcp_project_id
        self.gcp_project_location = gcp_project_location
@@ -88,9 +185,14 @@ class DeelabRedisChunksManager:
        self.llm_model_id = llm_model_id
        self.target_language = target_language
        self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
-        self.
+        self.storage_service = storage_service
+        self.kdb_params = kdb_params
+        self.kdb_service = kdb_service
+        # self.redis_connection_string = redis_connection_string
        self.vertex_model = self._get_vertex_model()
-        self.embeddings_model = self.vertex_model.load_embeddings_model(
+        self.embeddings_model = self.vertex_model.load_embeddings_model(
+            embeddings_model_id
+        )
 
    def _get_gcp_sa_dict(self, gcp_secret_name: str):
        vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
@@ -102,92 +204,45 @@ class DeelabRedisChunksManager:
            self.gcp_project_id,
            self.gcp_project_location,
            self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
        )
        return vertex_model
 
-    def
-        self,
-        file_key: str
+    def gen_context_chunks(
+        self, file_key: str, source_storage_route: str, target_storage_route: str
    ):
        try:
-
-
-
-
-
-
-            }
-            )
-            local_persistence_service = LocalStorageService()
-            context_chunks_in_document_service = ContextChunksInDocumentService(
-                ai_application_service=self.vertex_model,
-                persistence_service=local_persistence_service,
-                rag_chunker=rag_chunker,
-                embeddings_manager=redis_embeddings_manager,
-                target_language=self.target_language
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service, source_storage_route, target_storage_route
            )
-
-
-
-
-
-
-
-            # TODO
-    def context_chunks_in_document_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        try:
-            s3_persistence_service = S3StorageService(
-                origin_bucket_name=s3_origin_bucket_name,
-                target_bucket_name=s3_target_bucket_name
-            )
-            target_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_target_bucket_name)
-
+            persistence_service = persistence_layer.retrieve_storage_service()
+            target_bucket_file_tags = []
+            if persistence_service.supports_tagging:
+                target_bucket_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, target_storage_route
+                )
            rag_chunker = SemanticChunks(self.embeddings_model)
-
-
-                redis_conn_string=self.redis_connection_string,
-                metadata_tags=target_bucket_file_tags
+            kdb_manager = KdbManager(
+                self.embeddings_model, self.kdb_service, self.kdb_params
            )
+            kdb_service = kdb_manager.retrieve_kdb_service()
            context_chunks_in_document_service = ContextChunksInDocumentService(
                ai_application_service=self.vertex_model,
-                persistence_service=
+                persistence_service=persistence_service,
                rag_chunker=rag_chunker,
-                embeddings_manager=
-                target_language=self.target_language
+                embeddings_manager=kdb_service,
+                target_language=self.target_language,
+            )
+            context_chunks = (
+                context_chunks_in_document_service.get_context_chunks_in_document(
+                    file_key, target_bucket_file_tags
+                )
            )
-            context_chunks = context_chunks_in_document_service.get_context_chunks_in_document(file_key, target_bucket_file_tags)
            return context_chunks
        except Exception as e:
            print(f"Error getting context chunks in document: {e}")
            raise e
-
-
-    def delete_document_context_chunks_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        pass
-        # rag_chunker = SemanticChunks(self.embeddings_model)
-        # pg_embeddings_manager = PgEmbeddingsManager(
-        #     embeddings_model=self.embeddings_model,
-        #     pg_connection=self.vector_store_connection
-        # )
-        # s3_persistence_service = S3StorageService(
-        #     origin_bucket_name=s3_origin_bucket_name,
-        #     target_bucket_name=s3_target_bucket_name
-        # )
-        # context_chunks_in_document_service = ContextChunksInDocumentService(
-        #     ai_application_service=self.vertex_model,
-        #     persistence_service=s3_persistence_service,
-        #     rag_chunker=rag_chunker,
-        #     embeddings_manager=pg_embeddings_manager
-        # )
-        # context_chunks_in_document_service.delete_document_context_chunks(file_key)
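The refactor folds the AWS-only entry points into generic managers: callers now pass a `storage_service` / `kdb_service` selector plus a parameter dict, and `PersistenceManager` / `KdbManager` resolve the concrete backend. A self-contained sketch of that selector pattern; the enum values mirror the `Literal["redis", "chroma"]` hint in the diff, and the stand-in return tuples replace the real embeddings managers:

```python
from enum import Enum
from typing import Any, Dict


class KdbServices(Enum):
    # Assumed members, consistent with Literal["redis", "chroma"] in the diff.
    REDIS = "redis"
    CHROMA = "chroma"


class KdbManager:
    def __init__(self, embeddings_model, kdb_service: str, kdb_params: Dict[Any, Any]):
        self.embeddings_model = embeddings_model
        self.kdb_service = kdb_service
        self.kdb_params = kdb_params

    def retrieve_kdb_service(self):
        if self.kdb_service == KdbServices.REDIS.value:
            return ("RedisEmbeddingsManager", self.kdb_params)   # stand-in
        elif self.kdb_service == KdbServices.CHROMA.value:
            return ("ChromaEmbeddingsManager", self.kdb_params)  # stand-in
        else:
            raise ValueError(f"Unsupported kdb provider: {self.kdb_service}")


print(KdbManager(None, "chroma", {"collection_name": "docs"}).retrieve_kdb_service())
```

The same selector pattern backs `PersistenceManager`, which resolves `StorageServices.S3` to `S3StorageService` and `StorageServices.LOCAL` to `LocalStorageService`, so `transcribe_document` and `gen_context_chunks` no longer hard-code S3.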
wizit_context_ingestor/utils/file_utils.py
ADDED

@@ -0,0 +1,13 @@
+import re
+
+
+def has_invalid_file_name_format(file_name):
+    """Check if file name has special characters or spaces instead of underscores"""
+    # Check for spaces
+    if " " in file_name:
+        return True
+
+    # Check for special characters (anything that's not alphanumeric, underscore, dash, or dot)
+    if re.search(r"[^a-zA-Z0-9_.-]", file_name):
+        return True
+    return False
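Both `transcribe_document` and `gen_context_chunks` call this validator before touching storage. Expected behavior, per the regex above (note that `/` is also rejected, so callers should pass bare file names rather than paths):

```python
from wizit_context_ingestor.utils.file_utils import has_invalid_file_name_format

print(has_invalid_file_name_format("informe_q1.pdf"))    # False: letters, digits, _, ., - are allowed
print(has_invalid_file_name_format("my report.pdf"))     # True: contains a space
print(has_invalid_file_name_format("informe(1).pdf"))    # True: parentheses are special characters
print(has_invalid_file_name_format("docs/informe.pdf"))  # True: '/' is not in the allow-list
```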