wizit-context-ingestor 0.2.5b3__py3-none-any.whl → 0.3.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/transcription_service.py +132 -52
  4. wizit_context_ingestor/data/kdb.py +10 -0
  5. wizit_context_ingestor/data/prompts.py +150 -3
  6. wizit_context_ingestor/data/storage.py +10 -0
  7. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  8. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  9. wizit_context_ingestor/infra/rag/chroma_embeddings.py +30 -31
  10. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  11. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  12. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  13. wizit_context_ingestor/infra/vertex_model.py +56 -28
  14. wizit_context_ingestor/main.py +192 -106
  15. wizit_context_ingestor/utils/file_utils.py +13 -0
  16. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  17. wizit_context_ingestor/workflows/context_state.py +10 -0
  18. wizit_context_ingestor/workflows/context_tools.py +58 -0
  19. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  20. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  21. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  22. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  23. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  24. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  25. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/METADATA +9 -1
  26. wizit_context_ingestor-0.3.0b2.dist-info/RECORD +44 -0
  27. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/WHEEL +1 -1
  28. wizit_context_ingestor-0.2.5b3.dist-info/RECORD +0 -32
wizit_context_ingestor/__init__.py
@@ -1,3 +1,3 @@
-from .main import DeelabTranscribeManager, DeelabRedisChunksManager
+from .main import ChunksManager, TranscriptionManager
 
-__all__ = ["DeelabTranscribeManager", "DeelabRedisChunksManager"]
+__all__ = ["ChunksManager", "TranscriptionManager"]
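
For consumers, this rename is the breaking change at the package root; a minimal migration sketch (both name sets come straight from the hunk above):

    # Before (0.2.5b3):
    # from wizit_context_ingestor import DeelabTranscribeManager, DeelabRedisChunksManager
    # After (0.3.0b2):
    from wizit_context_ingestor import ChunksManager, TranscriptionManager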
wizit_context_ingestor/application/context_chunk_service.py
@@ -2,8 +2,15 @@ from langchain_core.output_parsers.pydantic import PydanticOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.documents import Document
 from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
+from langchain_core.messages.human import HumanMessage
+from ..workflows.context_workflow import ContextWorkflow
 from typing import Dict, Any, Optional, List
-from .interfaces import AiApplicationService, PersistenceService, RagChunker, EmbeddingsManager
+from .interfaces import (
+    AiApplicationService,
+    PersistenceService,
+    RagChunker,
+    EmbeddingsManager,
+)
 import logging
 
 
@@ -21,7 +28,7 @@ class ContextChunksInDocumentService:
         persistence_service: PersistenceService,
         rag_chunker: RagChunker,
         embeddings_manager: EmbeddingsManager,
-        target_language: str = 'es'
+        target_language: str = "es",
     ):
         """
         Initialize the ChunkerService.
@@ -33,48 +40,144 @@
         self.target_language = target_language
         self.embeddings_manager.init_vector_store()
         self.chat_model = self.ai_application_service.load_chat_model()
+        # TODO
+        self.context_additional_instructions = ""
+        self.metadata_source = "source"
 
-    def _retrieve_context_chunk_in_document(self, markdown_content: str, chunk: Document, chunk_metadata: Optional[Dict[str, Any]] = None) -> Document:
+    def _retrieve_context_chunk_in_document_with_workflow(
+        self,
+        workflow,
+        markdown_content: str,
+        chunk: Document,
+        chunk_metadata: Optional[Dict[str, Any]] = None,
+    ) -> Document:
         """Retrieve context chunks in document."""
         try:
-            chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
-            # Create the prompt template with image
-            prompt = ChatPromptTemplate.from_messages([
-                ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
-                (
-                    "human", [{
-                        "type": "text",
-                        "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language"
-                    }]
-                ),
-            ]).partial(
-                document_content=markdown_content,
-                format_instructions=chunk_output_parser.get_format_instructions()
+            result = workflow.invoke(
+                {
+                    "messages": [
+                        HumanMessage(
+                            content=[
+                                {
+                                    "type": "text",
+                                    "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
+                                },
+                            ]
+                        )
+                    ],
+                    "document_content": markdown_content,
+                },
+                {
+                    "configurable": {
+                        "transcription_accuracy_threshold": 0.95,
+                        "max_transcription_retries": 2,
+                    }
+                },
            )
-            model_with_structured_output = self.chat_model.with_structured_output(ContextChunk)
-            # Create the chain
-            chain = prompt | model_with_structured_output
-            # Process the image
-            results = chain.invoke({})
-            chunk.page_content = f"Context:{results.context}, Content:{chunk.page_content}"
-            chunk.metadata["context"] = results.context
+            # chunk.page_content = (
+            #     f"Context:{result['context']}, Content:{chunk.page_content}"
+            # )
+            chunk.metadata["context"] = result["context"]
             if chunk_metadata:
                 for key, value in chunk_metadata.items():
                     chunk.metadata[key] = value
             return chunk
-
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
             raise
 
+    # def _retrieve_context_chunk_in_document(
+    #     self,
+    #     markdown_content: str,
+    #     chunk: Document,
+    #     chunk_metadata: Optional[Dict[str, Any]] = None,
+    # ) -> Document:
+    #     """Retrieve context chunks in document."""
+    #     try:
+    #         chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
+    #         # Create the prompt template with image
+    #         prompt = ChatPromptTemplate.from_messages(
+    #             [
+    #                 ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
+    #                 (
+    #                     "human",
+    #                     [
+    #                         {
+    #                             "type": "text",
+    #                             "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
+    #                         }
+    #                     ],
+    #                 ),
+    #             ]
+    #         ).partial(
+    #             document_content=markdown_content,
+    #             format_instructions=chunk_output_parser.get_format_instructions(),
+    #         )
+    #         model_with_structured_output = self.chat_model.with_structured_output(
+    #             ContextChunk
+    #         )
+    #         # Create the chain
+    #         chain = prompt | model_with_structured_output
+    #         # Process the image
+    #         results = chain.invoke({})
+    #         # chunk.page_content = (
+    #         #     f"Context:{results.context}, Content:{chunk.page_content}"
+    #         # )
+    #         chunk.metadata["context"] = results.context
+    #         if chunk_metadata:
+    #             for key, value in chunk_metadata.items():
+    #                 chunk.metadata[key] = value
+    #         return chunk
+
+    #     except Exception as e:
+    #         logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
+    #         raise
 
-    def retrieve_context_chunks_in_document(self, markdown_content: str, chunks: List[Document], chunks_metadata: Optional[Dict[str, Any]] = None) -> List[Document]:
+    # def retrieve_context_chunks_in_document(
+    #     self,
+    #     markdown_content: str,
+    #     chunks: List[Document],
+    #     chunks_metadata: Optional[Dict[str, Any]] = None,
+    # ) -> List[Document]:
+    #     """Retrieve context chunks in document."""
+    #     try:
+    #         context_chunks = list(
+    #             map(
+    #                 lambda chunk: self._retrieve_context_chunk_in_document(
+    #                     markdown_content, chunk, chunks_metadata
+    #                 ),
+    #                 chunks,
+    #             )
+    #         )
+    #         return context_chunks
+    #     except Exception as e:
+    #         logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
+    #         raise
+
+    def retrieve_context_chunks_in_document_with_workflow(
+        self,
+        markdown_content: str,
+        chunks: List[Document],
+        chunks_metadata: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
         """Retrieve context chunks in document."""
         try:
-            context_chunks = list(map(
-                lambda chunk: self._retrieve_context_chunk_in_document(markdown_content, chunk, chunks_metadata),
-                chunks
-            ))
+            context_workflow = ContextWorkflow(
+                self.chat_model, self.context_additional_instructions
+            )
+            compiled_context_workflow = context_workflow.gen_workflow()
+            compiled_context_workflow = compiled_context_workflow.compile()
+            context_chunks = list(
+                map(
+                    lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
+                        compiled_context_workflow,
+                        markdown_content,
+                        chunk,
+                        chunks_metadata,
+                    ),
+                    chunks,
+                )
+            )
             return context_chunks
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
@@ -85,18 +188,27 @@
         Get the context chunks in a document.
         """
         try:
-            markdown_content = self.persistence_service.load_markdown_file_content(file_key)
+            markdown_content = self.persistence_service.load_markdown_file_content(
+                file_key
+            )
             langchain_rag_document = Document(
+                id=file_key,
                 page_content=markdown_content,
-                metadata={
-                    "source": file_key
-                }
+                metadata={self.metadata_source: file_key},
             )
             logger.info(f"Document loaded:{file_key}")
             chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
             logger.info(f"Chunks generated:{len(chunks)}")
-            context_chunks = self.retrieve_context_chunks_in_document(markdown_content, chunks, file_tags)
+            context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
+                markdown_content, chunks, file_tags
+            )
             logger.info(f"Context chunks generated:{len(context_chunks)}")
+            # upsert validation
+            try:
+                print(f"deleting chunks: {file_key}")
+                self.delete_document_context_chunks(file_key)
+            except Exception as e:
+                logger.error(f"could not delete by source: {e}")
             self.embeddings_manager.index_documents(context_chunks)
             return context_chunks
         except Exception as e:
@@ -108,7 +220,9 @@
         Delete the context chunks in a document.
         """
         try:
-            self.embeddings_manager.delete_documents_by_source_id(file_key)
+            self.embeddings_manager.delete_documents_by_metadata_key(
+                self.metadata_source, file_key
+            )
         except Exception as e:
             logger.error(f"Error delete_document_context_chunks: {str(e)}")
             raise e
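
Taken together, these hunks move chunk contextualization onto a compiled workflow and turn re-ingestion into an upsert: existing chunks for a source are deleted by metadata key before the fresh ones are indexed. A minimal driving sketch, assuming the four injected dependencies (ai_service, storage, chunker, embeddings) are constructed elsewhere; only the method and parameter names visible in the hunks above are used:

    # Hedged sketch; dependency wiring is not shown in this diff.
    service = ContextChunksInDocumentService(
        ai_application_service=ai_service,   # assumed instance
        persistence_service=storage,         # assumed instance
        rag_chunker=chunker,                 # assumed instance
        embeddings_manager=embeddings,       # assumed instance
        target_language="es",
    )
    # New in 0.3.0: each chunk is contextualized through the compiled ContextWorkflow.
    context_chunks = service.retrieve_context_chunks_in_document_with_workflow(
        markdown_content,                    # assumed variable: full markdown text
        chunks,                              # assumed variable: List[Document] from the chunker
        chunks_metadata={"team": "support"}, # optional tags merged into chunk metadata
    )
    # Deletion is now keyed on the "source" metadata field, not a source id.
    service.delete_document_context_chunks("docs/manual.md")  # hypothetical file_key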
wizit_context_ingestor/application/transcription_service.py
@@ -1,79 +1,153 @@
 from typing import Tuple, List, Dict, Optional
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
+from langchain_core.messages import HumanMessage
 from logging import getLogger
 from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
 from ..domain.models import ParsedDoc, ParsedDocPage
 from ..domain.services import ParseDocModelService
 from .interfaces import AiApplicationService, PersistenceService
+from ..workflows.transcription_workflow import TranscriptionWorkflow
 
 logger = getLogger(__name__)
 
 
 class TranscriptionService:
     """
-    Service for transcribing documents.
+    Service for transcribing documents.
     """
 
     def __init__(
         self,
        ai_application_service: AiApplicationService,
         persistence_service: PersistenceService,
-        target_language: str = 'es',
-        transcription_additional_instructions: str = ''
+        target_language: str = "es",
+        transcription_additional_instructions: str = "",
+        transcription_accuracy_threshold: int = 90,
+        max_transcription_retries: int = 2,
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
-        self.transcription_additional_instructions = transcription_additional_instructions
+        if (
+            transcription_accuracy_threshold < 0
+            or transcription_accuracy_threshold > 95
+        ):
+            raise ValueError(
+                "transcription_accuracy_threshold must be between 0 and 95"
+            )
+        if max_transcription_retries < 1 or max_transcription_retries > 3:
+            raise ValueError(
+                "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
+            )
+        self.transcription_accuracy_threshold = transcription_accuracy_threshold
+        self.max_transcription_retries = max_transcription_retries
+        self.transcription_additional_instructions = (
+            transcription_additional_instructions
+        )
         self.chat_model = self.ai_application_service.load_chat_model()
 
-    def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
-        """Transcribe an image to text.
-        Args:
-            document: The document with the image to transcribe
-        Returns:
-            Processed text
-        """
-        try:
-            # Create the prompt template with image
-            transcription_output_parser = PydanticOutputParser(pydantic_object=Transcription)
-            prompt = ChatPromptTemplate.from_messages([
-                ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
-                ("human", [{
-                    "type": "image",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{document.page_base64}"
+    # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
+    #     """Transcribe an image to text.
+    #     Args:
+    #         document: The document with the image to transcribe
+    #     Returns:
+    #         Processed text
+    #     """
+    #     try:
+    #         # Create the prompt template with image
+    #         transcription_output_parser = PydanticOutputParser(
+    #             pydantic_object=Transcription
+    #         )
+    #         prompt = ChatPromptTemplate.from_messages(
+    #             [
+    #                 ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
+    #                 (
+    #                     "human",
+    #                     [
+    #                         {
+    #                             "type": "image",
+    #                             "image_url": {
+    #                                 "url": f"data:image/png;base64,{document.page_base64}"
+    #                             },
+    #                         },
+    #                         {
+    #                             "type": "text",
+    #                             "text": "Transcribe the document, ensure all content transcribed accurately",
+    #                         },
+    #                     ],
+    #                 ),
+    #             ]
+    #         ).partial(
+    #             transcription_additional_instructions=self.transcription_additional_instructions,
+    #             format_instructions=transcription_output_parser.get_format_instructions(),
+    #         )
+    #         model_with_structured_output = self.chat_model.with_structured_output(
+    #             Transcription
+    #         )
+    #         # Create the chain
+    #         chain = prompt | model_with_structured_output
+    #         # Process the image
+    #         chain = chain.with_retry(
+    #             stop_after_attempt=3, exponential_jitter_params={"initial": 60}
+    #         )
+    #         result = chain.invoke({})
+    #         if result.transcription:
+    #             document.page_text = result.transcription
+    #         else:
+    #             raise ValueError("No transcription found")
+    #         return document
+    #     except Exception as e:
+    #         logger.error(f"Failed to parse document page: {str(e)}")
+    #         raise
+
+    def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
+        """Transcribe an image to text using an agent.
+        Args:
+            document: The document with the image to transcribe
+        Returns:
+            Processed text
+        """
+        transcription_workflow = TranscriptionWorkflow(
+            self.chat_model, self.transcription_additional_instructions
+        )
+        compiled_transcription_workflow = transcription_workflow.gen_workflow()
+        compiled_transcription_workflow = compiled_transcription_workflow.compile()
+        result = compiled_transcription_workflow.invoke(
+            {
+                "messages": [
+                    HumanMessage(
+                        content=[
+                            {
+                                "type": "text",
+                                "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
+                            },
+                        ]
+                    ),
+                    HumanMessage(
+                        content=[
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/png;base64,{document.page_base64}"
+                                },
                            }
-                },
-                {
-                    "type": "text",
-                    "text": f"Transcribe the document, ensure all content transcribed is using '{self.target_language}' language"
-                }]
+                        ]
                 ),
-            ]).partial(
-                transcription_additional_instructions=self.transcription_additional_instructions,
-                format_instructions=transcription_output_parser.get_format_instructions()
-            )
-            model_with_structured_output = self.chat_model.with_structured_output(Transcription)
-            # Create the chain
-            chain = prompt | model_with_structured_output
-            # Process the image
-            chain = chain.with_retry(
-                stop_after_attempt=3,
-                exponential_jitter_params={
-                    "initial": 60
-                }
-            )
-            result = chain.invoke({})
-            if result.transcription:
-                document.page_text = result.transcription
-            else:
-                raise ValueError("No transcription found")
-            return document
-        except Exception as e:
-            logger.error(f"Failed to parse document page: {str(e)}")
-            raise
+                ]
+            },
+            {
+                "configurable": {
+                    "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
+                    "max_transcription_retries": self.max_transcription_retries,
+                }
+            },
+        )
+        if result["transcription"]:
+            document.page_text = result["transcription"]
+        else:
+            raise ValueError("No transcription found")
+        return document
 
     def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
         """
@@ -84,15 +158,21 @@
         document_pages = parse_doc_model_service.parse_document_to_base64()
         parsed_pages = []
         for page in document_pages:
-            page = self.parse_doc_page(page)
+            page = self.parse_doc_page_with_workflow(page)
             parsed_pages.append(page)
         logger.info(f"Parsed {len(parsed_pages)} pages")
         parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
         return parsed_pages, parsed_document
 
-
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[Dict[str, str]] = {}):
+    def save_parsed_document(
+        self,
+        file_key: str,
+        parsed_document: ParsedDoc,
+        file_tags: Optional[Dict[str, str]] = {},
+    ):
         """
         Save the parsed document to a file.
         """
-        self.persistence_service.save_parsed_document(file_key, parsed_document, file_tags)
+        self.persistence_service.save_parsed_document(
+            file_key, parsed_document, file_tags
+        )
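
The new constructor parameters are validated eagerly, so misconfiguration fails at construction time rather than mid-transcription. A sketch of the contract, assuming the dependency instances are built elsewhere (their setup is not shown in this diff):

    service = TranscriptionService(
        ai_application_service=ai_service,    # assumed instance
        persistence_service=storage,          # assumed instance
        target_language="es",
        transcription_accuracy_threshold=90,  # ValueError if outside 0..95
        max_transcription_retries=2,          # ValueError if outside 1..3
    )
    # Pages now route through parse_doc_page_with_workflow; the thresholds are
    # forwarded to the compiled workflow via its "configurable" dict.
    pages, parsed_document = service.process_document("docs/scan.pdf")  # hypothetical key
    service.save_parsed_document("docs/scan.md", parsed_document)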
wizit_context_ingestor/data/kdb.py (new file)
@@ -0,0 +1,10 @@
+from enum import Enum
+from typing import Literal
+
+
+class KdbServices(Enum):
+    REDIS = "redis"
+    CHROMA = "chroma"
+
+
+kdb_services = Literal[KdbServices.REDIS.value, KdbServices.CHROMA.value]
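
Note that `kdb_services` builds a `Literal` from enum values, which resolves to `Literal["redis", "chroma"]` at runtime (static type checkers generally reject non-literal expressions inside `Literal[...]`, so the alias is mainly useful at runtime, where `typing.get_args` recovers the allowed names). A hypothetical validation helper, not part of the package; `data/storage.py` later in this diff follows the same pattern:

    from typing import get_args

    def ensure_kdb_service(name: str) -> str:
        # get_args(kdb_services) yields ("redis", "chroma")
        if name not in get_args(kdb_services):
            raise ValueError(f"unsupported kdb service: {name!r}")
        return name

    ensure_kdb_service("redis")  # ok
    ensure_kdb_service("pg")     # raises ValueError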
wizit_context_ingestor/data/prompts.py
@@ -1,5 +1,93 @@
 from pydantic import BaseModel, Field
 
+AGENT_TRANSCRIPTION_SYSTEM_PROMPT = """
+You are an expert document transcription assistant.
+Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
+OBJECTIVE: Create a complete, accurate transcription that preserves the original document's content, structure and formatting.
+TRANSCRIPTION RULES:
+<hard_rules>
+1. document's languages must be detected to ensure correct transcription
+2. Systematically examine each content element (text, images, tables, formatting)
+3. Convert all content to markdown while preserving structure and meaning
+5. Ensure completeness and accuracy of the transcription
+6. TEXT TRANSCRIPTION:
+- Transcribe all visible text exactly as it appears
+- Include: paragraphs, headings, subheadings, headers, footers
+- Include: footnotes, page numbers, bullet points, lists, captions
+- Preserve: bold, italic, underlined, and other text formatting using markdown
+7. LANGUAGE REQUIREMENTS:
+- Transcribed content MUST preserve document's language
+- Translate any secondary language content to maintain consistency
+8. COMPLETENESS:
+- Transcribe the entire document, partial transcriptions are not allowed
+- Never summarize, modify, or generate additional content
+- Maintain original meaning and context
+9. FORMATTING STANDARDS:
+- Use proper markdown syntax for structure
+- Avoid blank lines in transcription
+- Exclude logos, watermarks, and decorative icons
+- Omit special characters that interfere with markdown
+10. IMAGE HANDLING:
+<image_transcription_rules>
+- Extract and transcribe any text within images
+- For data-rich images: create markdown tables when applicable
+- For other images: provide descriptive content summaries
+- Classify each visual element as: Chart, Diagram, Natural Image, Screenshot, or Other
+- Format: <figure_type>Classification</figure_type>
+- Wrap content in <figure></figure> tags with title/caption if available
+</image_transcription_rules>
+11. TABLE PROCESSING:
+<tables_transcription_rules>
+- Convert all tables to proper markdown table format
+- Preserve cell alignment and structure as closely as possible
+- Maintain data relationships and hierarchy
+- Include table headers and formatting
+</tables_transcription_rules>
+12. QUALITY ASSURANCE:
+- Ensure no content is omitted or added
+- Check markdown formatting is correct
+- Confirm structural integrity is maintained
+</hard_rules>
+
+CRITICAL REMINDERS:
+<critical_reminders>
+- Accuracy over speed, every character matters
+- Preserve original document intent and meaning
+- Maintain professional transcription standards
+- Complete transcription is mandatory
+</critical_reminders>
+When provided, use the following transcription notes from previous transcriptions intents to improve the current transcription:
+<transcription_notes>
+{transcription_notes}
+</transcription_notes>
+When provided, use the following additional transcription instructions to improve results:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+# Generate the optimized transcription following these specifications:
+# {format_instructions}
+
+
+IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT = """
+You are an expert document transcription grader.
+Your task is to evaluate the following transcription quality.
+<rules>
+- Provide an accurate evaluation of the transcription ensuring quality, completeness and accuracy.
+- Transcription has markdown formatting, the markdown format must reflect the original document's structure and formatting.
+- Compare the transcription with the original document (provided as image)
+</rules>
+<transcription>
+{transcription}
+</transcription>
+
+When provided, evaluate whether the following additional transcription instructions provided by the user have been followed:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+
+
 IMAGE_TRANSCRIPTION_SYSTEM_PROMPT = """
 You are an expert document transcription assistant. Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
 
@@ -78,8 +166,6 @@ CRITICAL REMINDERS:
 
 Generate the optimized transcription following these specifications:
 {format_instructions}
-
-
 """
 
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -139,10 +225,71 @@ Generate the optimized context following these specifications:
 {format_instructions}
 """
 
+WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
+You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
+OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+WORKFLOW:
+<task_analysis>
+1. LANGUAGE DETECTION: Identify the primary language used in the document content
+2. SEMANTIC ANALYSIS: Understand the chunk's meaning, relationships, and significance within the broader document
+3. CONTEXT GENERATION: Create comprehensive context metadata that enhances retrieval effectiveness
+4. SEARCH OPTIMIZATION: Ensure context includes terms and concepts that users might search for
+5. QUALITY VALIDATION: Verify context completeness and retrieval utility
+</task_analysis>
+CONTEXT GENERATION REQUIREMENTS:
+<context_elements>
+Your generated context must synthesize ALL of these elements into a coherent description:
+- chunk_relation_with_document: How this chunk connects to and fits within the overall document structure and narrative
+- chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
+- chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
+- chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
+- chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
+- chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
+- chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
+- chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
+</context_elements>
+CRITICAL RULES:
+<critical_rules>
+- Context MUST be written in the SAME language as the source document content
+- Be comprehensive yet concise - aim for maximum information density
+- Prioritize search retrieval optimization and semantic understanding
+- Include synonyms and alternative phrasings users might search for
+- Focus on conceptual relationships and knowledge connections
+- Do NOT reproduce or quote the original chunk content verbatim
+- Ensure context is self-contained and understandable without the original chunk
+- Use natural language that flows well while incorporating all required elements
+</critical_rules>
+
+SEARCH OPTIMIZATION GUIDELINES:
+<search_optimization>
+- Include both explicit terms from the content and implicit concepts
+- Consider various ways users might phrase queries related to this content
+- Incorporate hierarchical information (section → subsection → detail level)
+- Add contextual bridges that connect this chunk to related topics
+- Use varied vocabulary to capture different search approaches
+</search_optimization>
+
+<document_content>
+{document_content}
+</document_content>
+
+
+When provided, follow these additional context extraction instructions:
+<additional_instructions>
+{context_additional_instructions}
+</additional_instructions>
+
+"""
+
+
 class ContextChunk(BaseModel):
-    context: str = Field(description="Context description that helps with search retrieval")
+    context: str = Field(
+        description="Context description that helps with search retrieval"
+    )
+
 
 class Transcription(BaseModel):
     """Document Transcription."""
+
     transcription: str = Field(description="Full transcription")
     language: str = Field(description="Main language")
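
The new prompts keep `{placeholder}` fields (`{document_content}`, `{context_additional_instructions}`, `{transcription_notes}`, `{transcription}`). Presumably the new workflow nodes bind them the same way the commented-out service code did, via `ChatPromptTemplate.partial`; a hedged sketch, since the actual binding lives in the new `workflows/` modules whose bodies are not shown in this diff:

    from langchain_core.prompts import ChatPromptTemplate

    prompt = ChatPromptTemplate.from_messages(
        [("system", WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT)]
    ).partial(
        document_content=markdown_content,   # assumed variable: full document text
        context_additional_instructions="",  # optional user-supplied instructions
    )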
wizit_context_ingestor/data/storage.py (new file)
@@ -0,0 +1,10 @@
+from enum import Enum
+from typing import Literal
+
+
+class StorageServices(Enum):
+    S3 = "s3"
+    LOCAL = "local"
+
+
+storage_services = Literal[StorageServices.S3.value, StorageServices.LOCAL.value]