PyPI - wizit-context-ingestor - Versions diffs - 0.2.5b2__tar.gz → 0.2.5b3__tar.gz - Mend

wizit-context-ingestor 0.2.5b2.tar.gz → 0.2.5b3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/PKG-INFO RENAMED Viewed

@@ -1,10 +1,11 @@
 Metadata-Version: 2.3
 Name: wizit-context-ingestor
-Version: 0.2.5b2
+Version: 0.2.5b3
 Summary: Contextual Rag with Cloud Solutions
 Requires-Dist: anthropic[vertex]>=0.66.0
 Requires-Dist: boto3>=1.40.23
 Requires-Dist: langchain-aws>=0.2.31
+Requires-Dist: langchain-chroma>=0.2.6
 Requires-Dist: langchain-experimental>=0.3.4
 Requires-Dist: langchain-google-vertexai>=2.0.28
 Requires-Dist: langchain-redis>=0.2.3

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "wizit_context_ingestor"
-version = "0.2.5-beta-2"
+version = "0.2.5-beta-3"
 description = "Contextual Rag with Cloud Solutions"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -8,6 +8,7 @@ dependencies = [
     "anthropic[vertex]>=0.66.0",
     "boto3>=1.40.23",
     "langchain-aws>=0.2.31",
+    "langchain-chroma>=0.2.6",
     "langchain-experimental>=0.3.4",
     "langchain-google-vertexai>=2.0.28",
     "langchain-redis>=0.2.3",

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/interfaces.py RENAMED Viewed

@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
         pass
     @abstractmethod
-    def index_documents(self, documents: List[Document]):
+    def index_documents(self, documents: list[Document]):
         """Index documents."""
         pass

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/transcription_service.py RENAMED Viewed

@@ -19,11 +19,13 @@ class TranscriptionService:
         self,
         ai_application_service: AiApplicationService,
         persistence_service: PersistenceService,
-        target_language: str = 'es'
+        target_language: str = 'es',
+        transcription_additional_instructions: str = ''
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
+        self.transcription_additional_instructions = transcription_additional_instructions
         self.chat_model = self.ai_application_service.load_chat_model()
     def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
@@ -50,6 +52,7 @@ class TranscriptionService:
                         }]
                     ),
                 ]).partial(
+                    transcription_additional_instructions=self.transcription_additional_instructions,
                     format_instructions=transcription_output_parser.get_format_instructions()
                 )
                 model_with_structured_output = self.chat_model.with_structured_output(Transcription)

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/prompts.py RENAMED Viewed

@@ -22,6 +22,7 @@ TRANSCRIPTION RULES:
    - Include: footnotes, page numbers, bullet points, lists, captions
    - Preserve: bold, italic, underlined, and other text formatting using markdown
    - Mark unclear text as [unclear] or [illegible] with best guess in brackets
+    - Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
 2. LANGUAGE REQUIREMENTS:
    - All transcribed content MUST be in the document's primary language
@@ -70,9 +71,15 @@ CRITICAL REMINDERS:
 - Maintain professional transcription standards
 - Complete transcription is mandatory
+<additional_instructions>
+    {transcription_additional_instructions}
+</additional_instructions>
 Generate the optimized transcription following these specifications:
 {format_instructions}
 """
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """

wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py ADDED Viewed

@@ -0,0 +1,132 @@
+from typing_extensions import Sequence
+from test.test_typing import CoolEmployee
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from typing import List
+import logging
+from uuid import uuid4
+from ...application.interfaces import EmbeddingsManager
+# load_dotenv()
+logger = logging.getLogger(__name__)
+class ChromaEmbeddingsManager(EmbeddingsManager):
+    """EmbeddingsManager implementation that stores documents in a Chroma collection."""
+    # NOTE(review): __slots__ lists "metadata_tags", but __init__ assigns
+    # "metadata_tags_schema" and "chroma" instead -- confirm the intended slot names.
+    __slots__ = ("embeddings_model", "chroma_host", "collection_name", "metadata_tags")
+    def __init__(
+        self,
+        embeddings_model,
+        chroma_host,
+        collection_name: str,
+        metadata_tags: dict
+    ):
+        """
+        Initialize the ChromaEmbeddingsManager.
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                              (typically a LangChain embeddings model instance)
+            chroma_host: The Chroma host, passed as ``host`` to the Chroma vector store
+            collection_name: The Chroma collection name
+            metadata_tags: Mapping whose keys are recorded as tag entries in
+                           ``metadata_tags_schema`` (the values are not read here)
+        Raises:
+            Exception: If there's an error initializing the ChromaEmbeddingsManager
+                       (the error is logged and re-raised)
+        """
+        self.collection_name = collection_name
+        self.embeddings_model = embeddings_model
+        self.chroma_host = chroma_host
+        # One {"type": "tag", "name": <key>} entry per metadata tag key.
+        # NOTE(review): this schema is built but not passed to Chroma below -- confirm it is needed.
+        self.metadata_tags_schema = []
+        for tag_key in metadata_tags:
+          self.metadata_tags_schema.append({
+              "type": "tag",
+              "name": tag_key
+          })
+        try:
+            self.chroma = Chroma(
+                collection_name=self.collection_name,
+                embedding_function=self.embeddings_model,
+                host=self.chroma_host,
+            )
+            logger.info("ChromaEmbeddingsManager initialized")
+        except Exception as e:
+          logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+          raise
+    def configure_vector_store(
+        self,
+        table_name: str = "",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = ""
+    ):
+        """Configure the vector store (no-op here: all arguments are ignored)."""
+        pass
+    def init_vector_store(
+        self,
+        table_name: str = "",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+    ):
+        """Initialize the vector store (no-op here: all arguments are ignored)."""
+        pass
+    def index_documents(self, documents: list[Document]):
+        """
+        Add documents to the vector store.
+        This method takes a list of Document objects, logs how many there are,
+        and hands them to the Chroma vector store created in ``__init__``
+        (which was configured with the embeddings model) via ``add_documents``.
+        Args:
+          documents: A list of LangChain Document objects to add to the vector store
+                (``langchain_core.documents.Document``)
+        Returns:
+          None
+        Raises:
+          Exception: If there's an error adding documents to the vector store
+                     (the error is logged and re-raised)
+        """
+        try:
+            logger.info(f"Indexing {len(documents)} documents in vector store")
+            self.chroma.add_documents(documents)
+        except Exception as e:
+            logger.error(f"Error indexing documents: {str(e)}")
+            raise
+    def get_documents_by_id(self, ids: list[str]):
+        """
+        Get documents by ID from the vector store.
+        Returns whatever ``Chroma.get_by_ids`` returns for ``ids``; any error
+        is logged and re-raised.
+        """
+        try:
+            return self.chroma.get_by_ids(ids)
+        except Exception as e:
+            logger.error(f"Error getting documents by ID: {str(e)}")
+            raise
+    def delete_documents_by_id(self, ids: list[str]):
+        """
+        Delete documents by ID from the vector store.
+        Any error raised by ``Chroma.delete`` is logged and re-raised.
+        """
+        try:
+            self.chroma.delete(ids)
+        except Exception as e:
+            logger.error(f"Error deleting documents by ID: {str(e)}")
+            raise
+    def get_documents_keys_by_source_id(self, source_id: str):
+        """Get documents keys by source ID (not implemented: always returns None)."""
+        pass
+    def delete_documents_by_source_id(self, source_id: str):
+        """Delete documents by source ID (not implemented: does nothing)."""
+        pass

{wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/main.py RENAMED Viewed

@@ -14,8 +14,9 @@ class DeelabTranscribeManager:
         gcp_project_id: str,
         gcp_project_location: str,
         gcp_secret_name: str,
-        llm_model_id: str = "claude-3-5-sonnet-v2@20241022",
+        llm_model_id: str = "claude-sonnet-4@20250514",
         target_language: str = 'es',
+        transcription_additional_instructions: str = ''
     ):
         self.gcp_project_id = gcp_project_id
         self.gcp_project_location = gcp_project_location
@@ -23,6 +24,7 @@ class DeelabTranscribeManager:
         self.gcp_secret_name = gcp_secret_name
         self.llm_model_id = llm_model_id
         self.target_language = target_language
+        self.transcription_additional_instructions = transcription_additional_instructions
         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
         self.vertex_model = self._get_vertex_model()
@@ -55,7 +57,8 @@ class DeelabTranscribeManager:
             transcribe_document_service = TranscriptionService(
                 ai_application_service=self.vertex_model,
                 persistence_service=s3_persistence_service,
-                target_language=self.target_language
+                target_language=self.target_language,
+                transcription_additional_instructions=self.transcription_additional_instructions
             )
             parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
             origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)