wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -2
- wizit_context_ingestor/application/context_chunk_service.py +149 -35
- wizit_context_ingestor/application/interfaces.py +1 -1
- wizit_context_ingestor/application/transcription_service.py +132 -49
- wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor/data/prompts.py +156 -2
- wizit_context_ingestor/data/storage.py +10 -0
- wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
- wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor/main.py +160 -105
- wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
- wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
- wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/data/prompts.py

@@ -1,5 +1,93 @@
 from pydantic import BaseModel, Field
 
+AGENT_TRANSCRIPTION_SYSTEM_PROMPT = """
+You are an expert document transcription assistant.
+Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
+OBJECTIVE: Create a complete, accurate transcription that preserves the original document's content, structure and formatting.
+TRANSCRIPTION RULES:
+<hard_rules>
+1. Document's languages must be detected to ensure correct transcription
+2. Systematically examine each content element (text, images, tables, formatting)
+3. Convert all content to markdown while preserving structure and meaning
+5. Ensure completeness and accuracy of the transcription
+6. TEXT TRANSCRIPTION:
+- Transcribe all visible text exactly as it appears
+- Include: paragraphs, headings, subheadings, headers, footers
+- Include: footnotes, page numbers, bullet points, lists, captions
+- Preserve: bold, italic, underlined, and other text formatting using markdown
+7. LANGUAGE REQUIREMENTS:
+- Transcribed content MUST preserve document's language
+- Translate any secondary language content to maintain consistency
+8. COMPLETENESS:
+- Transcribe the entire document, partial transcriptions are not allowed
+- Never summarize, modify, or generate additional content
+- Maintain original meaning and context
+9. FORMATTING STANDARDS:
+- Use proper markdown syntax for structure
+- Avoid blank lines in transcription
+- Exclude logos, watermarks, and decorative icons
+- Omit special characters that interfere with markdown
+10. IMAGE HANDLING:
+<image_transcription_rules>
+- Extract and transcribe any text within images
+- For data-rich images: create markdown tables when applicable
+- For other images: provide descriptive content summaries
+- Classify each visual element as: Chart, Diagram, Natural Image, Screenshot, or Other
+- Format: <figure_type>Classification</figure_type>
+- Wrap content in <figure></figure> tags with title/caption if available
+</image_transcription_rules>
+11. TABLE PROCESSING:
+<tables_transcription_rules>
+- Convert all tables to proper markdown table format
+- Preserve cell alignment and structure as closely as possible
+- Maintain data relationships and hierarchy
+- Include table headers and formatting
+</tables_transcription_rules>
+12. QUALITY ASSURANCE:
+- Ensure no content is omitted or added
+- Check markdown formatting is correct
+- Confirm structural integrity is maintained
+</hard_rules>
+
+CRITICAL REMINDERS:
+<critical_reminders>
+- Accuracy over speed, every character matters
+- Preserve original document intent and meaning
+- Maintain professional transcription standards
+- Complete transcription is mandatory
+</critical_reminders>
+When provided, use the following transcription notes from previous transcription attempts to improve the current transcription:
+<transcription_notes>
+{transcription_notes}
+</transcription_notes>
+When provided, use the following additional transcription instructions to improve results:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+# Generate the optimized transcription following these specifications:
+# {format_instructions}
+
+
+IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT = """
+You are an expert document transcription grader.
+Your task is to evaluate the quality of the following transcription.
+<rules>
+- Provide an accurate evaluation of the transcription ensuring quality, completeness and accuracy.
+- The transcription uses markdown formatting; the markdown must reflect the original document's structure and formatting.
+- Compare the transcription with the original document (provided as an image)
+</rules>
+<transcription>
+{transcription}
+</transcription>
+
+When provided, evaluate whether the following additional transcription instructions provided by the user have been followed:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+
+
 IMAGE_TRANSCRIPTION_SYSTEM_PROMPT = """
 You are an expert document transcription assistant. Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
 
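The new agent prompt leaves `{transcription_notes}` and `{transcription_additional_instructions}` as template variables. A minimal sketch of how they might be rendered with LangChain follows; the multimodal human turn and the variable wiring are assumptions for illustration, not code from this package:

```python
# Minimal sketch, not package code: render the new agent prompt with
# LangChain. The human turn carrying the page image is an assumption.
from langchain_core.prompts import ChatPromptTemplate

from wizit_context_ingestor.data.prompts import AGENT_TRANSCRIPTION_SYSTEM_PROMPT

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", AGENT_TRANSCRIPTION_SYSTEM_PROMPT),
        # Templated multimodal turn; {page_b64} is filled at render time.
        (
            "human",
            [{"type": "image_url", "image_url": {"url": "data:image/png;base64,{page_b64}"}}],
        ),
    ]
)

messages = prompt.format_messages(
    transcription_notes="Page header was misread on the previous attempt.",
    transcription_additional_instructions="Keep all tables in markdown.",
    page_b64="...",  # base64-encoded page image, elided
)
```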
@@ -22,6 +110,7 @@ TRANSCRIPTION RULES:
 - Include: footnotes, page numbers, bullet points, lists, captions
 - Preserve: bold, italic, underlined, and other text formatting using markdown
 - Mark unclear text as [unclear] or [illegible] with best guess in brackets
+- Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
 
 2. LANGUAGE REQUIREMENTS:
 - All transcribed content MUST be in the document's primary language
@@ -70,9 +159,13 @@ CRITICAL REMINDERS:
 - Maintain professional transcription standards
 - Complete transcription is mandatory
 
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+
+
 Generate the optimized transcription following these specifications:
 {format_instructions}
-
 """
 
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -132,10 +225,71 @@ Generate the optimized context following these specifications:
 {format_instructions}
 """
 
+WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
+You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
+OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+WORKFLOW:
+<task_analysis>
+1. LANGUAGE DETECTION: Identify the primary language used in the document content
+2. SEMANTIC ANALYSIS: Understand the chunk's meaning, relationships, and significance within the broader document
+3. CONTEXT GENERATION: Create comprehensive context metadata that enhances retrieval effectiveness
+4. SEARCH OPTIMIZATION: Ensure context includes terms and concepts that users might search for
+5. QUALITY VALIDATION: Verify context completeness and retrieval utility
+</task_analysis>
+CONTEXT GENERATION REQUIREMENTS:
+<context_elements>
+Your generated context must synthesize ALL of these elements into a coherent description:
+- chunk_relation_with_document: How this chunk connects to and fits within the overall document structure and narrative
+- chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
+- chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
+- chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
+- chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
+- chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
+- chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
+- chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
+</context_elements>
+CRITICAL RULES:
+<critical_rules>
+- Context MUST be written in the SAME language as the source document content
+- Be comprehensive yet concise - aim for maximum information density
+- Prioritize search retrieval optimization and semantic understanding
+- Include synonyms and alternative phrasings users might search for
+- Focus on conceptual relationships and knowledge connections
+- Do NOT reproduce or quote the original chunk content verbatim
+- Ensure context is self-contained and understandable without the original chunk
+- Use natural language that flows well while incorporating all required elements
+</critical_rules>
+
+SEARCH OPTIMIZATION GUIDELINES:
+<search_optimization>
+- Include both explicit terms from the content and implicit concepts
+- Consider various ways users might phrase queries related to this content
+- Incorporate hierarchical information (section → subsection → detail level)
+- Add contextual bridges that connect this chunk to related topics
+- Use varied vocabulary to capture different search approaches
+</search_optimization>
+
+<document_content>
+{document_content}
+</document_content>
+
+
+When provided, follow these additional context extraction instructions:
+<additional_instructions>
+{context_additional_instructions}
+</additional_instructions>
+
+"""
+
+
 class ContextChunk(BaseModel):
-    context: str = Field(
+    context: str = Field(
+        description="Context description that helps with search retrieval"
+    )
+
 
 class Transcription(BaseModel):
     """Document Transcription."""
+
     transcription: str = Field(description="Full transcription")
     language: str = Field(description="Main language")
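Since ContextChunk is a plain Pydantic model, the new workflow prompt pairs naturally with structured output. A sketch under the assumption that the service binds it via LangChain's with_structured_output; the human turn, the Vertex model choice (the package ships infra/vertex_model.py), and the invocation values are illustrative, with the real call sites living in context_chunk_service.py and the new workflow modules summarized above:

```python
# Sketch only: pairing the new workflow prompt with the ContextChunk
# schema. The human turn and model binding are assumptions.
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_vertexai import ChatVertexAI  # illustrative backend

from wizit_context_ingestor.data.prompts import (
    WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT,
    ContextChunk,
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
        ("human", "<chunk>{chunk_content}</chunk>"),  # hypothetical human turn
    ]
)
llm = ChatVertexAI(model_name="gemini-1.5-pro")  # illustrative model choice
chain = prompt | llm.with_structured_output(ContextChunk)

chunk_context = chain.invoke(
    {
        "document_content": "# Quarterly report\n...",
        "context_additional_instructions": "Keep keywords in Spanish.",
        "chunk_content": "Revenue grew 12% quarter over quarter.",
    }
)
print(chunk_context.context)
```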
wizit_context_ingestor/infra/persistence/local_storage.py

@@ -3,22 +3,27 @@ from ...domain.models import ParsedDoc
 from typing import Optional
 import logging
 import os
+
 logger = logging.getLogger(__name__)
 
+
 class LocalStorageService(PersistenceService):
     """Persistence service for local storage."""
 
-    def __init__(self):
-        self.
+    def __init__(self, source_storage_route: str, target_storage_route: str):
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from local storage."""
         file_content = None
-        with open(
+        with open(
+            f"{self.source_storage_route}/{file_key}", "r", encoding="utf-8"
+        ) as file:
             file_content = file.read()
         return file_content
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from local storage.
 
@@ -32,16 +37,21 @@ class LocalStorageService(PersistenceService):
         ClientError: If there's an error retrieving the object from local storage
         """
         try:
-            tmp_file_path = f"{self.
+            tmp_file_path = f"{self.source_storage_route}/{file_key}"
             if not os.path.exists(tmp_file_path):
                 raise FileNotFoundError(f"File {file_key} not found in local storage")
             return tmp_file_path
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from local storage: {str(e)}"
+            )
             raise
 
-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document."""
-        with open(
+        with open(
+            f"{self.target_storage_route}/{file_key}", "w", encoding="utf-8"
+        ) as f:
             f.write(parsed_document.document_text)
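A quick sketch of how the reworked two-route constructor might be exercised. The paths are hypothetical, and the ParsedDoc constructor call is an assumption based on the document_text attribute used above. Note that the new signature defaults file_tags to a mutable {}, so passing an explicit dict is the safer call pattern:

```python
# Hypothetical usage of the reworked LocalStorageService; paths are
# illustrative only. ParsedDoc is the domain model the module imports.
from wizit_context_ingestor.infra.persistence.local_storage import LocalStorageService
from wizit_context_ingestor.domain.models import ParsedDoc

storage = LocalStorageService(
    source_storage_route="/data/incoming",   # where raw files are read from
    target_storage_route="/data/processed",  # where parsed markdown is written
)

markdown = storage.load_markdown_file_content("report.md")
storage.save_parsed_document(
    "report.md",
    ParsedDoc(document_text=markdown),  # assumes ParsedDoc takes document_text
    file_tags={"ingested": "true"},     # appears unused by the local backend
)
```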
wizit_context_ingestor/infra/persistence/s3_storage.py

@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
 
 class S3StorageService(PersistenceService):
     """Persistence service for S3 storage."""
-
-
-
+
+    __slots__ = ("origin_bucket_name", "target_bucket_name", "region_name")
+
+    def __init__(
+        self,
+        origin_bucket_name: str,
+        target_bucket_name: str,
+        region_name: str = "us-east-1",
+    ):
+        self.s3 = boto3_client("s3", region_name=region_name)
         self.origin_bucket_name = origin_bucket_name
         self.target_bucket_name = target_bucket_name
-
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from S3 storage.
@@ -36,9 +43,9 @@ class S3StorageService(PersistenceService):
             response = self.s3.get_object(Bucket=self.target_bucket_name, Key=file_key)
             tmp_file_key = f"/tmp/{file_key}"
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
-            with open(tmp_file_key,
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
+            with open(tmp_file_key, "r", encoding="utf-8") as f:
                 file_content = f.read()
             return file_content
         except ClientError as e:
@@ -48,7 +55,6 @@ class S3StorageService(PersistenceService):
             logger.error(f"Unexpected error loading file {file_key} from S3: {str(e)}")
             raise
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from S3 storage.
 
@@ -67,18 +73,21 @@ class S3StorageService(PersistenceService):
             tmp_file_key = f"/tmp/{file_key}"
             # Create parent directories if they don't exist
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
             return tmp_file_key
         except ClientError as e:
             logger.error(f"Error retrieving file {file_key} from S3: {str(e)}")
             raise
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from S3: {str(e)}"
+            )
             raise
 
-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document to S3.
 
         Args:
@@ -91,21 +100,21 @@ class S3StorageService(PersistenceService):
         """
         try:
             # Convert document content to bytes
-            content_bytes = parsed_document.document_text.encode(
+            content_bytes = parsed_document.document_text.encode("utf-8")
             # Upload the file to S3
             if not file_tags:
                 self.s3.put_object(
-                    Bucket=self.target_bucket_name,
-                    Key=file_key,
-                    Body=content_bytes
+                    Bucket=self.target_bucket_name, Key=file_key, Body=content_bytes
                 )
             else:
-                tagging_string = "&".join(
+                tagging_string = "&".join(
+                    [f"{key}={value}" for key, value in file_tags.items()]
+                )
                 self.s3.put_object(
                     Bucket=self.target_bucket_name,
                     Key=file_key,
                     Body=content_bytes,
-                    Tagging=tagging_string
+                    Tagging=tagging_string,
                 )
 
             logger.info(f"Successfully saved document to S3 as {file_key}")
@@ -122,8 +131,5 @@ class S3StorageService(PersistenceService):
         Args:
             file_key: The key (path) to retrieve tags
         """
-        response = self.s3.get_object_tagging(
-            Bucket=bucket_name,
-            Key=file_key
-        )
+        response = self.s3.get_object_tagging(Bucket=bucket_name, Key=file_key)
         return {item["Key"]: item["Value"] for item in response["TagSet"]}
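The Tagging parameter on put_object expects URL-query-style key=value pairs joined by &, which is exactly what the new tagging_string builds (keep in mind S3 wants URL-encoded values, so raw & or = inside a tag value would need quoting). A standalone sketch of the round trip the service now performs; the bucket and key names are hypothetical:

```python
# Standalone sketch of the tag round trip; bucket/key names are
# hypothetical. boto3 is already a dependency of the module under diff.
import boto3

s3 = boto3.client("s3", region_name="us-east-1")
file_tags = {"source": "ingestor", "version": "0.3.0b1"}

# S3 expects tags as a URL-query-style string: "k1=v1&k2=v2".
tagging_string = "&".join(f"{key}={value}" for key, value in file_tags.items())

s3.put_object(
    Bucket="example-target-bucket",
    Key="docs/report.md",
    Body=b"# parsed markdown",
    Tagging=tagging_string,
)

tags = s3.get_object_tagging(Bucket="example-target-bucket", Key="docs/report.md")
as_dict = {item["Key"]: item["Value"] for item in tags["TagSet"]}
```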
wizit_context_ingestor/infra/rag/chroma_embeddings.py

@@ -0,0 +1,135 @@
+from typing_extensions import Sequence
+from test.test_typing import CoolEmployee
+from langchain_core.documents import Document
+from langchain_chroma import Chroma
+from typing import List
+import logging
+from uuid import uuid4
+from ...application.interfaces import EmbeddingsManager
+
+# load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+
+class ChromaEmbeddingsManager(EmbeddingsManager):
+    __slots__ = ("embeddings_model", "collection_name")
+
+    def __init__(
+        self,
+        embeddings_model,
+        chroma_host=None,
+        **chroma_conn_kwargs,
+    ):
+        """
+        Initialize the ChromaEmbeddingsManager.
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            chroma_host: The Chroma host URL
+
+        Raises:
+            Exception: If there's an error initializing the RedisEmbeddingsManager
+        """
+        self.embeddings_model = embeddings_model
+        self.chroma_host = chroma_host
+        try:
+            if chroma_host:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model,
+                    host=chroma_host,
+                    **chroma_conn_kwargs,
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+            else:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model, **chroma_conn_kwargs
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+        except Exception as e:
+            logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+            raise
+
+    def configure_vector_store(
+        self,
+        table_name: str = "",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+    ):
+        """Configure the vector store."""
+        pass
+
+    def init_vector_store(
+        self,
+        table_name: str = "",
+        content_column: str = "document",
+        id_column: str = "id",
+    ):
+        """Initialize the vector store."""
+        pass
+
+    def index_documents(self, documents: list[Document]):
+        """
+        Add documents to the vector store with their embeddings.
+
+        This method takes a list of Document objects, generates embeddings for them
+        using the embeddings model, and stores both the documents and their
+        embeddings in the PostgreSQL database.
+
+        Args:
+            docs: A list of LangChain Document objects to add to the vector store
+                Each Document should have page_content and metadata attributes
+                from langchain_core.documents import Document
+        Returns:
+            None
+
+        Raises:
+            Exception: If there's an error adding documents to the vector store
+        """
+        try:
+            logger.info(f"Indexing {len(documents)} documents in vector store")
+            self.chroma.add_documents(documents)
+        except Exception as e:
+            logger.error(f"Error indexing documents: {str(e)}")
+            raise
+
+    def get_documents_by_id(self, ids: list[str]):
+        """
+        Get document by ID from the vector store.
+        """
+        try:
+            return self.chroma.get_by_ids(ids)
+        except Exception as e:
+            logger.error(f"Error getting documents by ID: {str(e)}")
+            raise
+
+    def delete_documents_by_id(self, ids: list[str]):
+        """
+        Delete documents by ID from the vector store.
+        """
+        try:
+            self.chroma.delete(ids)
+        except Exception as e:
+            logger.error(f"Error deleting documents by ID: {str(e)}")
+            raise
+
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        try:
+            self.chroma.delete(where={metadata_key: metadata_value})
+        except Exception as error:
+            logger.error(
+                f"Error deleting documents by filter: {str(filter)}, error: {error} "
+            )
+            raise
+
+    def get_documents_keys_by_source_id(self, source_id: str):
+        """Get documents keys by source ID."""
+        pass
+
+    def delete_documents_by_source_id(self, source_id: str):
+        """Delete documents by source ID."""
+        pass
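One caveat before using the new Chroma backend: the module's `from test.test_typing import CoolEmployee` import looks like stray editor autocompletion and will fail on Python builds that ship without the CPython test suite. Assuming the module imports cleanly, a hedged usage sketch follows; the embeddings model, collection name, and directory are illustrative, and the extra kwargs are assumed to pass through to langchain_chroma.Chroma:

```python
# Hedged usage sketch for the new Chroma-backed manager. The embeddings
# model is illustrative; **chroma_conn_kwargs is forwarded to
# langchain_chroma.Chroma, so its constructor arguments
# (collection_name, persist_directory, ...) should pass through.
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

from wizit_context_ingestor.infra.rag.chroma_embeddings import ChromaEmbeddingsManager

manager = ChromaEmbeddingsManager(
    embeddings_model=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    ),
    collection_name="ingested_docs",       # forwarded to Chroma
    persist_directory="/tmp/chroma-demo",  # local mode, no chroma_host
)

manager.index_documents(
    [Document(page_content="Sample chunk", metadata={"source_id": "doc-1"}, id="doc-1#0")]
)
print(manager.get_documents_by_id(["doc-1#0"]))
manager.delete_documents_by_metadata_key("source_id", "doc-1")
```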
wizit_context_ingestor/infra/rag/pg_embeddings.py

@@ -6,6 +6,7 @@ from langchain_postgres import PGVectorStore, PGEngine
 from sqlalchemy import create_engine
 from dotenv import load_dotenv
 from wizit_context_ingestor.application.interfaces import EmbeddingsManager
+
 load_dotenv()
 
 logger = logging.getLogger(__name__)
@@ -38,19 +39,21 @@ class PgEmbeddingsManager(EmbeddingsManager):
     ... )
     >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
     """
+
     __slots__ = ("embeddings_model", "pg_connection")
+
     def __init__(self, embeddings_model, pg_connection: str):
         """
-
+        Initialize the PgEmbeddingsManager.
 
-
-
-
-
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            pg_connection: The PostgreSQL connection string
+                (format: postgresql://user:password@host:port/database)
 
-
-
+        Raises:
+            Exception: If there's an error initializing the vector store
         """
         self.pg_connection = pg_connection
         self.embeddings_model = embeddings_model
@@ -58,65 +61,65 @@ class PgEmbeddingsManager(EmbeddingsManager):
         self.vector_store = None
         self.record_manager = None
         try:
-
-
+            self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
+            logger.info("PgEmbeddingsManager initialized")
         except Exception as e:
             logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
             raise
 
     def configure_vector_store(
-
-
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-        self.record_manager.create_schema()
+        self.pg_engine.init_vectorstore_table(
+            table_name=table_name,
+            vector_size=vector_size,
+            content_column=content_column,
+            id_column=id_column,
+            metadata_json_column=metadata_json_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
+        # TODO move this from here
+        self.record_manager.create_schema()
 
     def init_vector_store(
-
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-        )
+        self.vector_store = PGVectorStore.create_sync(
+            embedding_service=self.embeddings_model,
+            engine=self.pg_engine,
+            table_name=table_name,
+            content_column=content_column,
+            metadata_json_column=metadata_json_column,
+            id_column=id_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
 
     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-
-
-
-
-
-
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            if self.record_manager is None:
+                raise Exception("Record manager not initialized")
+            return func(self, *args, **kwargs)
+
         return wrapper
 
     @vector_store_initialized
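Finally, the vector_store_initialized guard fleshed out above is an ordinary function decorator defined in the class body: it refuses to run the wrapped method until both the vector store and the record manager exist. A standalone reproduction of the pattern, with a stand-in class instead of the real manager:

```python
# Standalone reproduction of the guard-decorator pattern used above;
# DemoManager is a stand-in class, not part of the package.
def vector_store_initialized(func):
    """Validate vector store initialization before running func."""

    def wrapper(self, *args, **kwargs):
        if self.vector_store is None:
            raise Exception("Vector store not initialized")
        if self.record_manager is None:
            raise Exception("Record manager not initialized")
        return func(self, *args, **kwargs)

    return wrapper


class DemoManager:
    def __init__(self):
        self.vector_store = None
        self.record_manager = None

    @vector_store_initialized
    def index(self, docs):
        return f"indexed {len(docs)} docs"


m = DemoManager()
try:
    m.index(["a"])
except Exception as exc:
    print(exc)  # -> Vector store not initialized
```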