wizit-context-ingestor 0.3.0b2__tar.gz → 0.3.0b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of wizit-context-ingestor might be problematic.
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/PKG-INFO +1 -1
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/pyproject.toml +1 -1
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/context_chunk_service.py +12 -10
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/transcription_service.py +40 -14
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/domain/services.py +6 -11
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +3 -4
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/main.py +13 -12
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/README.md +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/.DS_Store +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/interfaces.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/data/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/data/kdb.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/data/prompts.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/data/storage.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/domain/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/domain/models.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/persistence/local_storage.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/vertex_model.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/services/.DS_Store +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/services/__init__.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/services/chunks.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/utils/file_utils.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/context_nodes.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/context_state.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/context_tools.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/context_workflow.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/transcription_nodes.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/transcription_schemas.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/transcription_state.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/transcription_tools.py +0 -0
- {wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/workflows/transcription_workflow.py +0 -0
{wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/context_chunk_service.py
@@ -1,3 +1,4 @@
+import asyncio
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.documents import Document
@@ -44,7 +45,7 @@ class ContextChunksInDocumentService:
         self.context_additional_instructions = ""
         self.metadata_source = "source"

-    def _retrieve_context_chunk_in_document_with_workflow(
+    async def _retrieve_context_chunk_in_document_with_workflow(
         self,
         workflow,
         markdown_content: str,
@@ -53,7 +54,7 @@ class ContextChunksInDocumentService:
     ) -> Document:
         """Retrieve context chunks in document."""
         try:
-            result = workflow.invoke(
+            result = await workflow.ainvoke(
                 {
                     "messages": [
                         HumanMessage(
@@ -74,9 +75,7 @@ class ContextChunksInDocumentService:
                     }
                 },
             )
-
-            # f"Context:{result['context']}, Content:{chunk.page_content}"
-            # )
+            chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
             chunk.metadata["context"] = result["context"]
             if chunk_metadata:
                 for key, value in chunk_metadata.items():
@@ -154,7 +153,7 @@ class ContextChunksInDocumentService:
         # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
         # raise

-    def retrieve_context_chunks_in_document_with_workflow(
+    async def retrieve_context_chunks_in_document_with_workflow(
         self,
         markdown_content: str,
         chunks: List[Document],
@@ -167,7 +166,7 @@ class ContextChunksInDocumentService:
             )
             compiled_context_workflow = context_workflow.gen_workflow()
             compiled_context_workflow = compiled_context_workflow.compile()
-
+            context_chunks_workflow_invocations = list(
                 map(
                     lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
                         compiled_context_workflow,
@@ -178,12 +177,13 @@ class ContextChunksInDocumentService:
                     chunks,
                 )
             )
+            context_chunks = await asyncio.gather(*context_chunks_workflow_invocations)
             return context_chunks
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
             raise

-    def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
+    async def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
         """
         Get the context chunks in a document.
         """
@@ -199,8 +199,10 @@ class ContextChunksInDocumentService:
         logger.info(f"Document loaded:{file_key}")
         chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
         logger.info(f"Chunks generated:{len(chunks)}")
-        context_chunks =
-
+        context_chunks = (
+            await self.retrieve_context_chunks_in_document_with_workflow(
+                markdown_content, chunks, file_tags
+            )
         )
         logger.info(f"Context chunks generated:{len(context_chunks)}")
         # upsert validation
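The change above converts per-chunk contextualization to async: each chunk gets its own workflow invocation via ainvoke, and the coroutines are collected with asyncio.gather. Below is a minimal sketch of that fan-out pattern, assuming a compiled LangGraph-style workflow object exposing ainvoke; the helper names (annotate_chunk, annotate_all) and the input payload keys are illustrative, not taken from the package.

import asyncio
from typing import List

from langchain_core.documents import Document


async def annotate_chunk(workflow, markdown_content: str, chunk: Document) -> Document:
    # One workflow invocation per chunk; ainvoke yields control while the model call is in flight.
    result = await workflow.ainvoke(
        {"document": markdown_content, "chunk": chunk.page_content}  # illustrative payload
    )
    chunk.metadata["context"] = result["context"]
    return chunk


async def annotate_all(workflow, markdown_content: str, chunks: List[Document]) -> List[Document]:
    # Fan out one coroutine per chunk and run them concurrently.
    # gather preserves argument order, so results line up with the input chunk list.
    invocations = [annotate_chunk(workflow, markdown_content, chunk) for chunk in chunks]
    return await asyncio.gather(*invocations)

Because gather preserves argument order, the service can return the gathered list directly as context_chunks without re-sorting.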
{wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/application/transcription_service.py
@@ -1,3 +1,4 @@
+import asyncio
 from typing import Tuple, List, Dict, Optional
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
@@ -23,15 +24,15 @@ class TranscriptionService:
         persistence_service: PersistenceService,
         target_language: str = "es",
         transcription_additional_instructions: str = "",
-        transcription_accuracy_threshold:
+        transcription_accuracy_threshold: float = 0.90,
         max_transcription_retries: int = 2,
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
         if (
-            transcription_accuracy_threshold < 0
-            or transcription_accuracy_threshold > 95
+            transcription_accuracy_threshold < 0.0
+            or transcription_accuracy_threshold > 0.95
         ):
             raise ValueError(
                 "transcription_accuracy_threshold must be between 0 and 95"
@@ -46,6 +47,15 @@ class TranscriptionService:
             transcription_additional_instructions
         )
         self.chat_model = self.ai_application_service.load_chat_model()
+        self.transcription_workflow = TranscriptionWorkflow(
+            self.chat_model, self.transcription_additional_instructions
+        )
+        self.compiled_transcription_workflow = (
+            self.transcription_workflow.gen_workflow()
+        )
+        self.compiled_transcription_workflow = (
+            self.compiled_transcription_workflow.compile()
+        )

     # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
     #     """Transcribe an image to text.
@@ -101,19 +111,16 @@ class TranscriptionService:
     #         logger.error(f"Failed to parse document page: {str(e)}")
     #         raise

-    def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
+    async def parse_doc_page_with_workflow(
+        self, document: ParsedDocPage
+    ) -> ParsedDocPage:
         """Transcribe an image to text using an agent.
         Args:
             document: The document with the image to transcribe
         Returns:
             Processed text
         """
-        transcription_workflow = TranscriptionWorkflow(
-            self.chat_model, self.transcription_additional_instructions
-        )
-        compiled_transcription_workflow = transcription_workflow.gen_workflow()
-        compiled_transcription_workflow = compiled_transcription_workflow.compile()
-        result = compiled_transcription_workflow.invoke(
+        result = await self.compiled_transcription_workflow.ainvoke(
             {
                 "messages": [
                     HumanMessage(
@@ -146,20 +153,39 @@ class TranscriptionService:
         if result["transcription"]:
             document.page_text = result["transcription"]
         else:
-            raise ValueError("No transcription found")
+            raise ValueError(f"No transcription found: {result} ")
         return document

-    def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+    # def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+    #     """
+    #     Process a document by parsing it and returning the parsed content.
+    #     """
+    #     raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
+    #     parse_doc_model_service = ParseDocModelService(raw_file_path)
+    #     document_pages = parse_doc_model_service.parse_document_to_base64()
+    #     parsed_pages = []
+    #     for page in document_pages:
+    #         page = self.parse_doc_page_with_workflow(page)
+    #         parsed_pages.append(page)
+    #     logger.info(f"Parsed {len(parsed_pages)} pages")
+    #     parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
+    #     return parsed_pages, parsed_document
+
+    async def process_document(
+        self, file_key: str
+    ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
         """
         Process a document by parsing it and returning the parsed content.
         """
         raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
         parse_doc_model_service = ParseDocModelService(raw_file_path)
         document_pages = parse_doc_model_service.parse_document_to_base64()
+        parse_pages_workflow_tasks = []
         parsed_pages = []
         for page in document_pages:
-            page = self.parse_doc_page_with_workflow(page)
-            parsed_pages.append(page)
+            parse_pages_workflow_tasks.append(self.parse_doc_page_with_workflow(page))
+        # here
+        parsed_pages = await asyncio.gather(*parse_pages_workflow_tasks)
         logger.info(f"Parsed {len(parsed_pages)} pages")
         parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
         return parsed_pages, parsed_document
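Two things change here: the transcription workflow is now built and compiled once in __init__ instead of per page, and process_document fans out one coroutine per page with asyncio.gather. Below is a rough sketch of the same shape, assuming a workflow factory whose gen_workflow().compile() returns an object with ainvoke; PageTranscriber and its payload key are illustrative stand-ins, not the package's API.

import asyncio
from typing import List


class PageTranscriber:
    """Sketch: compile the transcription workflow once, reuse it for every page."""

    def __init__(self, workflow_factory):
        # 0.3.0b2 rebuilt and recompiled the workflow inside every parse_doc_page_with_workflow
        # call; compiling once here amortises that cost over the whole document.
        self.compiled_workflow = workflow_factory.gen_workflow().compile()

    async def transcribe_page(self, page_base64: str) -> str:
        result = await self.compiled_workflow.ainvoke({"page_base64": page_base64})  # illustrative payload
        if not result.get("transcription"):
            raise ValueError(f"No transcription found: {result}")
        return result["transcription"]

    async def transcribe_document(self, pages_base64: List[str]) -> List[str]:
        # All pages are transcribed concurrently; gather returns them in page order.
        return await asyncio.gather(*(self.transcribe_page(p) for p in pages_base64))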
{wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/domain/services.py
@@ -8,8 +8,9 @@ from ..domain.models import ParsedDocPage, ParsedDoc

 logger = logging.getLogger(__name__)

+
 # CHECK THIS THING IMPROVE THE WAY CODE IS STRUCTURED
-class ParseDocModelService():
+class ParseDocModelService:
     """
     Class for parsing PDF documents, converting pages to base64 images
     """
@@ -25,7 +26,6 @@ class ParseDocModelService():
         self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count

-
     def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
         """
         Convert a PDF page to a base64-encoded PNG image.
@@ -48,10 +48,7 @@ class ParseDocModelService():
             img.save(buffer, format="PNG")
             b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
             logger.info(f"Page {page_number} encoded successfully")
-            return ParsedDocPage(
-                page_number=page_number,
-                page_base64=b64_encoded_image
-            )
+            return ParsedDocPage(page_number=page_number, page_base64=b64_encoded_image)
         except Exception as e:
             logger.error(f"Failed to parse b64 image: {str(e)}")
             raise
@@ -87,12 +84,10 @@ class ParseDocModelService():
         Create a markdown content from a list of parsed pages.
         """
         md_content = ""
-        for page in parsed_pages:
+        sorted_pages = sorted(parsed_pages, key=lambda page: page.page_number)
+        for page in sorted_pages:
             md_content += f"## Page {page.page_number}\n\n"
             md_content += f"{page.page_text}\n\n"
-        return ParsedDoc(
-            pages=parsed_pages,
-            document_text=md_content
-        )
+        return ParsedDoc(pages=parsed_pages, document_text=md_content)

     # def
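create_md_content now sorts pages by page_number before assembling the markdown, which keeps the output in document order even if the parsed pages arrive out of order (for example from concurrent transcription). Here is a small self-contained sketch of that assembly step; the Page dataclass merely stands in for the package's ParsedDocPage model.

from dataclasses import dataclass
from typing import List


@dataclass
class Page:
    page_number: int
    page_text: str


def build_markdown(pages: List[Page]) -> str:
    # Sort defensively by page_number so the markdown reads in document order,
    # regardless of the order in which the pages were produced.
    md_content = ""
    for page in sorted(pages, key=lambda p: p.page_number):
        md_content += f"## Page {page.page_number}\n\n"
        md_content += f"{page.page_text}\n\n"
    return md_content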
{wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py
@@ -3,11 +3,10 @@ import logging

 logger = logging.getLogger(__name__)

-class AwsSecretsManager:
-
-    def __init__(self):
-        self.client = boto3_client('secretsmanager')

+class AwsSecretsManager:
+    def __init__(self, aws_region="us-east-1"):
+        self.client = boto3_client("secretsmanager", region_name=aws_region)

     def get_secret(self, secret_name):
         """
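The constructor now pins the Secrets Manager client to an explicit region (defaulting to us-east-1) instead of relying on the ambient AWS configuration. A minimal sketch of that pattern follows; SecretsReader is an illustrative stand-in, and the get_secret body shown here uses the standard boto3 get_secret_value call rather than reproducing the package's implementation, which the diff does not show.

import boto3


class SecretsReader:
    def __init__(self, aws_region: str = "us-east-1"):
        # Passing region_name explicitly avoids depending on AWS_DEFAULT_REGION
        # being set in the runtime environment (e.g. a Lambda or bare container).
        self.client = boto3.client("secretsmanager", region_name=aws_region)

    def get_secret(self, secret_name: str) -> str:
        response = self.client.get_secret_value(SecretId=secret_name)
        return response["SecretString"]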
{wizit_context_ingestor-0.3.0b2 → wizit_context_ingestor-0.3.0b4}/src/wizit_context_ingestor/main.py RENAMED
@@ -78,7 +78,7 @@ class TranscriptionManager:
         llm_model_id: str = "claude-sonnet-4@20250514",
         target_language: str = "es",
         transcription_additional_instructions: str = "",
-        transcription_accuracy_threshold:
+        transcription_accuracy_threshold: float = 0.90,
         max_transcription_retries: int = 2,
     ):
         self.gcp_project_id = gcp_project_id
@@ -116,18 +116,18 @@ class TranscriptionManager:
         return vertex_model

     def tracing(func):
-        def gen_tracing_context(self, *args, **kwargs):
+        async def gen_tracing_context(self, *args, **kwargs):
             with tracing_context(
                 enabled=True,
                 project_name=self.langsmith_project_name,
                 client=self.langsmith_client,
             ):
-                return func(self, *args, **kwargs)
+                return await func(self, *args, **kwargs)

         return gen_tracing_context

     @tracing
-    def transcribe_document(self, file_key: str):
+    async def transcribe_document(self, file_key: str):
         """Transcribe a document from source storage to target storage.
         This method serves as a generic interface for transcribing documents from
         various storage sources to target destinations. The specific implementation
@@ -162,9 +162,10 @@ class TranscriptionManager:
             transcription_accuracy_threshold=self.transcription_accuracy_threshold,
             max_transcription_retries=self.max_transcription_retries,
         )
-
-
-
+        (
+            parsed_pages,
+            parsed_document,
+        ) = await transcribe_document_service.process_document(file_key)
         source_storage_file_tags = {}
         if persistence_service.supports_tagging:
             # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
@@ -231,18 +232,18 @@ class ChunksManager:
         return vertex_model

     def tracing(func):
-        def
+        async def gen_tracing_context(self, *args, **kwargs):
             with tracing_context(
                 enabled=True,
                 project_name=self.langsmith_project_name,
                 client=self.langsmith_client,
             ):
-                return func(self, *args, **kwargs)
+                return await func(self, *args, **kwargs)

-        return
+        return gen_tracing_context

     @tracing
-    def gen_context_chunks(
+    async def gen_context_chunks(
         self, file_key: str, source_storage_route: str, target_storage_route: str
     ):
         try:
@@ -272,7 +273,7 @@ class ChunksManager:
                 target_language=self.target_language,
             )
             context_chunks = (
-                context_chunks_in_document_service.get_context_chunks_in_document(
+                await context_chunks_in_document_service.get_context_chunks_in_document(
                     file_key, target_bucket_file_tags
                 )
             )
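The tracing decorator in main.py had to become async-aware: once transcribe_document and gen_context_chunks are coroutines, a plain wrapper would return an un-awaited coroutine object and the tracing context would close before any traced work ran. Below is a minimal sketch of that pattern with a dummy context manager standing in for LangSmith's tracing_context and client, which are omitted here.

import asyncio
import functools
from contextlib import contextmanager


@contextmanager
def tracing_context(enabled: bool, project_name: str):
    # Dummy stand-in for the real tracing context manager used in main.py.
    print(f"tracing enabled={enabled} project={project_name}")
    yield


def tracing(func):
    # The wrapper itself must be async: a synchronous wrapper would return the
    # coroutine without awaiting it, so the context manager would exit before
    # the decorated method actually executed.
    @functools.wraps(func)
    async def gen_tracing_context(self, *args, **kwargs):
        with tracing_context(enabled=True, project_name=self.project_name):
            return await func(self, *args, **kwargs)
    return gen_tracing_context


class Manager:
    project_name = "demo"

    @tracing
    async def transcribe_document(self, file_key: str) -> str:
        return f"transcribed {file_key}"


# usage: print(asyncio.run(Manager().transcribe_document("doc.pdf")))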