PyPI - wizit-context-ingestor - Version diff - 0.2.4b0 (tar.gz) → 0.2.5b2 (tar.gz) - Mend

wizit-context-ingestor 0.2.4b0 (tar.gz) → 0.2.5b2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wizit-context-ingestor might be problematic. Click here for more details.

Files changed (31) hide show

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: wizit-context-ingestor
-Version: 0.2.4b0
+Version: 0.2.5b2
 Summary: Contextual Rag with Cloud Solutions
 Requires-Dist: anthropic[vertex]>=0.66.0
 Requires-Dist: boto3>=1.40.23

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "wizit_context_ingestor"
-version = "0.2.4-beta"
+version = "0.2.5-beta-2"
 description = "Contextual Rag with Cloud Solutions"
 readme = "README.md"
 requires-python = ">=3.12"

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/transcription_service.py RENAMED Viewed

@@ -56,6 +56,12 @@ class TranscriptionService:
                 # Create the chain
                 chain = prompt | model_with_structured_output
                 # Process the image
+                chain = chain.with_retry(
+                    stop_after_attempt=3,
+                    exponential_jitter_params={
+                        "initial": 60
+                    }
+                )
                 result = chain.invoke({})
                 if result.transcription:
                     document.page_text = result.transcription

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/services.py RENAMED Viewed

@@ -1,7 +1,7 @@
 import base64
 import logging
 import io
-import fitz
+import pymupdf
 from PIL import Image
 from typing import List
 from ..domain.models import ParsedDocPage, ParsedDoc
@@ -17,25 +17,25 @@ class ParseDocModelService():
     def __init__(self, file_path: str):
         """
         Initialize a PDF document parser.
         Args:
             file_path: Path to the PDF file to parse
         """
         self.file_path = file_path
-        self.pdf_document = fitz.open(file_path)
+        self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count
     def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
         """
         Convert a PDF page to a base64-encoded PNG image.
         Args:
             page_number: One-indexed page number to convert
         Returns:
             Base64 encoded string of the page image
         Raises:
             Exception: If there's an error during conversion
         """
@@ -49,7 +49,7 @@ class ParseDocModelService():
             b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
             logger.info(f"Page {page_number} encoded successfully")
             return ParsedDocPage(
-                page_number=page_number,
+                page_number=page_number,
                 page_base64=b64_encoded_image
             )
         except Exception as e:
@@ -59,15 +59,15 @@ class ParseDocModelService():
     def parse_document_to_base64(self) -> List[ParsedDocPage]:
         """
         Convert all pages in the PDF document to base64-encoded images.
         Returns:
             List of base64 encoded strings for each page
         Raises:
             Exception: If there's an error during conversion
         """
         # BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
-        # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
+        # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
         # RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
         # OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
         # COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
@@ -95,4 +95,4 @@ class ParseDocModelService():
             document_text=md_content
         )
-    # def
+    # def

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/aws_model.py RENAMED Viewed

@@ -37,7 +37,6 @@ class AWSModels(AiApplicationService):
         temperature: float = 0.7,
         max_tokens: int = 8000,
         region_name: str = "us-east-1") -> ChatBedrockConverse:
         """
         Load an AWS AI chat model for text generation.

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/local_storage.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from ...application.interfaces import PersistenceService
 from ...domain.models import ParsedDoc
+from typing import Optional
 import logging
 import os
 logger = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ class LocalStorageService(PersistenceService):
             raise
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc):
+    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
         """Save a parsed document."""
         with open(f"{self.tmp_folder}/{file_key}", "w", encoding="utf-8") as f:
             f.write(parsed_document.document_text)

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/s3_storage.py RENAMED Viewed

@@ -4,6 +4,7 @@ from boto3 import client as boto3_client
 import logging
 import os
 from botocore.exceptions import ClientError
+from typing import Optional
 logger = logging.getLogger(__name__)
@@ -77,8 +78,7 @@ class S3StorageService(PersistenceService):
             raise
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: dict = {}):
+    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
         """Save a parsed document to S3.
         Args:

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/main.py RENAMED Viewed

@@ -113,8 +113,11 @@ class DeelabRedisChunksManager:
         try:
             rag_chunker = SemanticChunks(self.embeddings_model)
             redis_embeddings_manager = RedisEmbeddingsManager(
-                embeddings_model=self.embeddings_model,
-                redis_connection_string=self.redis_connection_string
+                self.embeddings_model,
+                self.redis_connection_string,
+                {
+                    "file_key": file_key
+                }
             )
             local_persistence_service = LocalStorageService()
             context_chunks_in_document_service = ContextChunksInDocumentService(

{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/parse_doc.py RENAMED Viewed

@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
 import base64
 import logging
 import io
-import fitz
+import pymupdf
 from PIL import Image
 from typing import List, Any
 from dotenv import load_dotenv
@@ -23,13 +23,13 @@ class ParseDoc:
     def __init__(self, file_path: str, system_prompt, chat_model: Any):
         """
         Initialize a PDF document parser.
         Args:
             file_path: Path to the PDF file to parse
             chat_model: Language model for processing document content
         """
         self.file_path = file_path
-        self.pdf_document = fitz.open(file_path)
+        self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count
         self.system_prompt = system_prompt
         self.chat_model = chat_model
@@ -37,13 +37,13 @@ class ParseDoc:
     def pdf_page_to_base64(self, page_number: int) -> str:
         """
         Convert a PDF page to a base64-encoded PNG image.
         Args:
             page_number: One-indexed page number to convert
         Returns:
             Base64 encoded string of the page image
         Raises:
             Exception: If there's an error during conversion
         """
@@ -69,10 +69,10 @@ class ParseDoc:
     def parse_document_to_base64(self) -> List[str]:
         """
         Convert all pages in the PDF document to base64-encoded images.
         Returns:
             List of base64 encoded strings for each page
         Raises:
             Exception: If there's an error during conversion
         """
@@ -90,14 +90,14 @@ class ParseDoc:
     def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
         """
         Process a base64-encoded image with a language model using the provided prompt.
         Args:
             base_64_image: Base64 encoded image string
             prompt: Text prompt to send with the image
         Returns:
             Language model response
         Raises:
             Exception: If there's an error during processing
         """