PyPI - wizit-context-ingestor - Versions diffs - 0.2.5b1__tar.gz → 0.2.5b3__tar.gz - Mend

wizit-context-ingestor 0.2.5b1.tar.gz → 0.2.5b3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.3
 Name: wizit-context-ingestor
-Version: 0.2.5b1
+Version: 0.2.5b3
 Summary: Contextual Rag with Cloud Solutions
 Requires-Dist: anthropic[vertex]>=0.66.0
 Requires-Dist: boto3>=1.40.23
 Requires-Dist: langchain-aws>=0.2.31
+Requires-Dist: langchain-chroma>=0.2.6
 Requires-Dist: langchain-experimental>=0.3.4
 Requires-Dist: langchain-google-vertexai>=2.0.28
 Requires-Dist: langchain-redis>=0.2.3

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "wizit_context_ingestor"
-version = "0.2.5-beta-1"
+version = "0.2.5-beta-3"
 description = "Contextual Rag with Cloud Solutions"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -8,6 +8,7 @@ dependencies = [
     "anthropic[vertex]>=0.66.0",
     "boto3>=1.40.23",
     "langchain-aws>=0.2.31",
+    "langchain-chroma>=0.2.6",
     "langchain-experimental>=0.3.4",
     "langchain-google-vertexai>=2.0.28",
     "langchain-redis>=0.2.3",

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/interfaces.py RENAMED Viewed

@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
         pass
     @abstractmethod
-    def index_documents(self, documents: List[Document]):
+    def index_documents(self, documents: list[Document]):
         """Index documents."""
         pass

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/transcription_service.py RENAMED Viewed

@@ -19,11 +19,13 @@ class TranscriptionService:
         self,
         ai_application_service: AiApplicationService,
         persistence_service: PersistenceService,
-        target_language: str = 'es'
+        target_language: str = 'es',
+        transcription_additional_instructions: str = ''
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
+        self.transcription_additional_instructions = transcription_additional_instructions
         self.chat_model = self.ai_application_service.load_chat_model()
     def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
@@ -50,6 +52,7 @@ class TranscriptionService:
                         }]
                     ),
                 ]).partial(
+                    transcription_additional_instructions=self.transcription_additional_instructions,
                     format_instructions=transcription_output_parser.get_format_instructions()
                 )
                 model_with_structured_output = self.chat_model.with_structured_output(Transcription)

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/prompts.py RENAMED Viewed

@@ -22,6 +22,7 @@ TRANSCRIPTION RULES:
    - Include: footnotes, page numbers, bullet points, lists, captions
    - Preserve: bold, italic, underlined, and other text formatting using markdown
    - Mark unclear text as [unclear] or [illegible] with best guess in brackets
+    - Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
 2. LANGUAGE REQUIREMENTS:
    - All transcribed content MUST be in the document's primary language
@@ -70,9 +71,15 @@ CRITICAL REMINDERS:
 - Maintain professional transcription standards
 - Complete transcription is mandatory
+<additional_instructions>
+    {transcription_additional_instructions}
+</additional_instructions>
 Generate the optimized transcription following these specifications:
 {format_instructions}
 """
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/services.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import base64
 import logging
 import io
-import fitz
+import pymupdf
 from PIL import Image
 from typing import List
 from ..domain.models import ParsedDocPage, ParsedDoc
@@ -17,25 +17,25 @@ class ParseDocModelService():
     def __init__(self, file_path: str):
         """
         Initialize a PDF document parser.
         Args:
             file_path: Path to the PDF file to parse
         """
         self.file_path = file_path
-        self.pdf_document = fitz.open(file_path)
+        self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count
     def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
         """
         Convert a PDF page to a base64-encoded PNG image.
         Args:
             page_number: One-indexed page number to convert
         Returns:
             Base64 encoded string of the page image
         Raises:
             Exception: If there's an error during conversion
         """
@@ -49,7 +49,7 @@ class ParseDocModelService():
             b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
             logger.info(f"Page {page_number} encoded successfully")
             return ParsedDocPage(
-                page_number=page_number,
+                page_number=page_number,
                 page_base64=b64_encoded_image
             )
         except Exception as e:
@@ -59,15 +59,15 @@ class ParseDocModelService():
     def parse_document_to_base64(self) -> List[ParsedDocPage]:
         """
         Convert all pages in the PDF document to base64-encoded images.
         Returns:
             List of base64 encoded strings for each page
         Raises:
             Exception: If there's an error during conversion
         """
         # BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
-        # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
+        # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
         # RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
         # OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
         # COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
@@ -95,4 +95,4 @@ class ParseDocModelService():
             document_text=md_content
         )
-    # def
+    # def

wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py ADDED Viewed

@@ -0,0 +1,132 @@
+from typing_extensions import Sequence
+from test.test_typing import CoolEmployee
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from typing import List
+import logging
+from uuid import uuid4
+from ...application.interfaces import EmbeddingsManager
+# load_dotenv()
+logger = logging.getLogger(__name__)
+class ChromaEmbeddingsManager(EmbeddingsManager):
+    """
+    EmbeddingsManager implementation backed by a Chroma vector store.
+    Documents are embedded with the supplied embeddings model and stored in
+    the configured collection through langchain_chroma.Chroma.
+    """
+    # Every attribute assigned in __init__ is declared here so construction
+    # keeps working even if a base class removes the per-instance __dict__.
+    __slots__ = (
+        "embeddings_model",
+        "chroma_host",
+        "collection_name",
+        "metadata_tags",
+        "metadata_tags_schema",
+        "chroma",
+    )
+    def __init__(
+        self,
+        embeddings_model,
+        chroma_host,
+        collection_name: str,
+        metadata_tags: dict
+    ):
+        """
+        Initialize the ChromaEmbeddingsManager.
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                              (typically a LangChain embeddings model instance)
+            chroma_host: The Chroma server host, forwarded to Chroma as `host`
+            collection_name: The Chroma collection name
+            metadata_tags: Mapping whose keys name the metadata tags; one
+                           {"type": "tag", "name": key} entry is recorded per key
+        Raises:
+            Exception: Re-raised unchanged if the Chroma vector store cannot be created
+        """
+        self.collection_name = collection_name
+        self.embeddings_model = embeddings_model
+        self.chroma_host = chroma_host
+        self.metadata_tags = metadata_tags
+        # Only the keys are used. The schema is kept on the instance; it is
+        # not passed to Chroma by this constructor.
+        self.metadata_tags_schema = [
+            {"type": "tag", "name": tag_key} for tag_key in metadata_tags
+        ]
+        try:
+            self.chroma = Chroma(
+                collection_name=self.collection_name,
+                embedding_function=self.embeddings_model,
+                host=self.chroma_host,
+            )
+            logger.info("ChromaEmbeddingsManager initialized")
+        except Exception as e:
+            logger.error(f"Failed to initialize ChromaEmbeddingsManager: {e}")
+            raise
+    def configure_vector_store(
+        self,
+        table_name: str = "",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = ""
+    ):
+        """
+        Configure the vector store.
+        Currently a no-op for Chroma: every argument is ignored and None is returned.
+        """
+        pass
+    def init_vector_store(
+        self,
+        table_name: str = "",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+    ):
+        """
+        Initialize the vector store.
+        Currently a no-op for Chroma: the store is created in __init__, every
+        argument is ignored and None is returned.
+        """
+        pass
+    def index_documents(self, documents: list[Document]):
+        """
+        Add documents to the Chroma vector store.
+        The documents are handed to Chroma.add_documents, which embeds them with
+        the configured embeddings model and stores them in the collection.
+        Args:
+            documents: A list of LangChain Document objects
+                       (langchain_core.documents.Document) to add to the vector store
+        Returns:
+            None
+        Raises:
+            Exception: Re-raised unchanged if adding documents to the vector store fails
+        """
+        try:
+            logger.info(f"Indexing {len(documents)} documents in vector store")
+            self.chroma.add_documents(documents)
+        except Exception as e:
+            logger.error(f"Error indexing documents: {e}")
+            raise
+    def get_documents_by_id(self, ids: list[str]):
+        """
+        Get documents by ID from the vector store.
+        Args:
+            ids: IDs of the documents to fetch
+        Returns:
+            Whatever Chroma.get_by_ids returns for the given IDs
+        Raises:
+            Exception: Re-raised unchanged if the lookup fails
+        """
+        try:
+            return self.chroma.get_by_ids(ids)
+        except Exception as e:
+            logger.error(f"Error getting documents by ID: {e}")
+            raise
+    def delete_documents_by_id(self, ids: list[str]):
+        """
+        Delete documents by ID from the vector store.
+        Args:
+            ids: IDs of the documents to delete
+        Returns:
+            None
+        Raises:
+            Exception: Re-raised unchanged if the deletion fails
+        """
+        try:
+            self.chroma.delete(ids)
+        except Exception as e:
+            logger.error(f"Error deleting documents by ID: {e}")
+            raise
+    def get_documents_keys_by_source_id(self, source_id: str):
+        """
+        Get documents keys by source ID.
+        Not implemented for Chroma yet: the argument is ignored and None is returned.
+        """
+        pass
+    def delete_documents_by_source_id(self, source_id: str):
+        """
+        Delete documents by source ID.
+        Not implemented for Chroma yet: the argument is ignored and nothing is deleted.
+        """
+        pass

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/main.py RENAMED Viewed

@@ -14,8 +14,9 @@ class DeelabTranscribeManager:
         gcp_project_id: str,
         gcp_project_location: str,
         gcp_secret_name: str,
-        llm_model_id: str = "claude-3-5-sonnet-v2@20241022",
+        llm_model_id: str = "claude-sonnet-4@20250514",
         target_language: str = 'es',
+        transcription_additional_instructions: str = ''
     ):
         self.gcp_project_id = gcp_project_id
         self.gcp_project_location = gcp_project_location
@@ -23,6 +24,7 @@ class DeelabTranscribeManager:
         self.gcp_secret_name = gcp_secret_name
         self.llm_model_id = llm_model_id
         self.target_language = target_language
+        self.transcription_additional_instructions = transcription_additional_instructions
         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
         self.vertex_model = self._get_vertex_model()
@@ -55,7 +57,8 @@ class DeelabTranscribeManager:
             transcribe_document_service = TranscriptionService(
                 ai_application_service=self.vertex_model,
                 persistence_service=s3_persistence_service,
-                target_language=self.target_language
+                target_language=self.target_language,
+                transcription_additional_instructions=self.transcription_additional_instructions
             )
             parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
             origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)

{wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/parse_doc.py RENAMED Viewed

@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
 import base64
 import logging
 import io
-import fitz
+import pymupdf
 from PIL import Image
 from typing import List, Any
 from dotenv import load_dotenv
@@ -23,13 +23,13 @@ class ParseDoc:
     def __init__(self, file_path: str, system_prompt, chat_model: Any):
         """
         Initialize a PDF document parser.
         Args:
             file_path: Path to the PDF file to parse
             chat_model: Language model for processing document content
         """
         self.file_path = file_path
-        self.pdf_document = fitz.open(file_path)
+        self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count
         self.system_prompt = system_prompt
         self.chat_model = chat_model
@@ -37,13 +37,13 @@ class ParseDoc:
     def pdf_page_to_base64(self, page_number: int) -> str:
         """
         Convert a PDF page to a base64-encoded PNG image.
         Args:
             page_number: One-indexed page number to convert
         Returns:
             Base64 encoded string of the page image
         Raises:
             Exception: If there's an error during conversion
         """
@@ -69,10 +69,10 @@ class ParseDoc:
     def parse_document_to_base64(self) -> List[str]:
         """
         Convert all pages in the PDF document to base64-encoded images.
         Returns:
             List of base64 encoded strings for each page
         Raises:
             Exception: If there's an error during conversion
         """
@@ -90,14 +90,14 @@ class ParseDoc:
     def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
         """
         Process a base64-encoded image with a language model using the provided prompt.
         Args:
             base_64_image: Base64 encoded image string
             prompt: Text prompt to send with the image
         Returns:
             Language model response
         Raises:
             Exception: If there's an error during processing
         """