wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/interfaces.py +1 -1
  4. wizit_context_ingestor/application/transcription_service.py +132 -49
  5. wizit_context_ingestor/data/kdb.py +10 -0
  6. wizit_context_ingestor/data/prompts.py +156 -2
  7. wizit_context_ingestor/data/storage.py +10 -0
  8. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  9. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  10. wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
  11. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  12. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  13. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  14. wizit_context_ingestor/infra/vertex_model.py +56 -28
  15. wizit_context_ingestor/main.py +160 -105
  16. wizit_context_ingestor/utils/file_utils.py +13 -0
  17. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  18. wizit_context_ingestor/workflows/context_state.py +10 -0
  19. wizit_context_ingestor/workflows/context_tools.py +58 -0
  20. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  21. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  22. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  23. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  24. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  25. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  26. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
  27. wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
  28. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
  29. wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/data/prompts.py
@@ -1,5 +1,93 @@
  from pydantic import BaseModel, Field

+ AGENT_TRANSCRIPTION_SYSTEM_PROMPT = """
+ You are an expert document transcription assistant.
+ Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
+ OBJECTIVE: Create a complete, accurate transcription that preserves the original document's content, structure and formatting.
+ TRANSCRIPTION RULES:
+ <hard_rules>
+ 1. document's languages must be detected to ensure correct transcription
+ 2. Systematically examine each content element (text, images, tables, formatting)
+ 3. Convert all content to markdown while preserving structure and meaning
+ 5. Ensure completeness and accuracy of the transcription
+ 6. TEXT TRANSCRIPTION:
+ - Transcribe all visible text exactly as it appears
+ - Include: paragraphs, headings, subheadings, headers, footers
+ - Include: footnotes, page numbers, bullet points, lists, captions
+ - Preserve: bold, italic, underlined, and other text formatting using markdown
+ 7. LANGUAGE REQUIREMENTS:
+ - Transcribed content MUST preserve document's language
+ - Translate any secondary language content to maintain consistency
+ 8. COMPLETENESS:
+ - Transcribe the entire document, partial transcriptions are not allowed
+ - Never summarize, modify, or generate additional content
+ - Maintain original meaning and context
+ 9. FORMATTING STANDARDS:
+ - Use proper markdown syntax for structure
+ - Avoid blank lines in transcription
+ - Exclude logos, watermarks, and decorative icons
+ - Omit special characters that interfere with markdown
+ 10. IMAGE HANDLING:
+ <image_transcription_rules>
+ - Extract and transcribe any text within images
+ - For data-rich images: create markdown tables when applicable
+ - For other images: provide descriptive content summaries
+ - Classify each visual element as: Chart, Diagram, Natural Image, Screenshot, or Other
+ - Format: <figure_type>Classification</figure_type>
+ - Wrap content in <figure></figure> tags with title/caption if available
+ </image_transcription_rules>
+ 11. TABLE PROCESSING:
+ <tables_transcription_rules>
+ - Convert all tables to proper markdown table format
+ - Preserve cell alignment and structure as closely as possible
+ - Maintain data relationships and hierarchy
+ - Include table headers and formatting
+ </tables_transcription_rules>
+ 12. QUALITY ASSURANCE:
+ - Ensure no content is omitted or added
+ - Check markdown formatting is correct
+ - Confirm structural integrity is maintained
+ </hard_rules>
+
+ CRITICAL REMINDERS:
+ <critical_reminders>
+ - Accuracy over speed, every character matters
+ - Preserve original document intent and meaning
+ - Maintain professional transcription standards
+ - Complete transcription is mandatory
+ </critical_reminders>
+ When provided, use the following transcription notes from previous transcriptions intents to improve the current transcription:
+ <transcription_notes>
+ {transcription_notes}
+ </transcription_notes>
+ When provided, use the following additional transcription instructions to improve results:
+ <additional_instructions>
+ {transcription_additional_instructions}
+ </additional_instructions>
+ """
+ # Generate the optimized transcription following these specifications:
+ # {format_instructions}
+
+
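For orientation (the wiring is not shown in this diff), the two placeholders in the new agent prompt are ordinary str.format fields, so rendering the system message is a one-liner. A minimal sketch; the note and instruction values below are invented:

    # Minimal sketch, assuming plain str.format templating.
    # Placeholder values are illustrative, not from the package.
    system_message = AGENT_TRANSCRIPTION_SYSTEM_PROMPT.format(
        transcription_notes="Previous attempt dropped the footer on page 2.",
        transcription_additional_instructions="Keep table captions with their tables.",
    )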
+ IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT = """
+ You are an expert document transcription grader.
+ Your task is to evaluate the following transcription quality.
+ <rules>
+ - Provide an accurate evaluation of the transcription ensuring quality, completeness and accuracy.
+ - Transcription has markdown formatting, the markdown format must reflect the original document's structure and formatting.
+ - Compare the transcription with the original document (provided as image)
+ </rules>
+ <transcription>
+ {transcription}
+ </transcription>
+
+ When provided, evaluate whether the following additional transcription instructions provided by the user have been followed:
+ <additional_instructions>
+ {transcription_additional_instructions}
+ </additional_instructions>
+ """
+
+
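The grader prompt embeds the candidate transcription directly in the system message. A sketch of filling it (the candidate text is a placeholder; the image attachment step is not shown here):

    # Sketch only: grade a prior transcription against the source image.
    candidate = "# Invoice\n\n| Item | Qty |\n| --- | --- |\n| Pens | 2 |"
    grader_message = IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT.format(
        transcription=candidate,
        transcription_additional_instructions="",
    )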
  IMAGE_TRANSCRIPTION_SYSTEM_PROMPT = """
  You are an expert document transcription assistant. Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.

@@ -22,6 +110,7 @@ TRANSCRIPTION RULES:
  - Include: footnotes, page numbers, bullet points, lists, captions
  - Preserve: bold, italic, underlined, and other text formatting using markdown
  - Mark unclear text as [unclear] or [illegible] with best guess in brackets
+ - Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags

  2. LANGUAGE REQUIREMENTS:
  - All transcribed content MUST be in the document's primary language
@@ -70,9 +159,13 @@ CRITICAL REMINDERS:
  - Maintain professional transcription standards
  - Complete transcription is mandatory

+ <additional_instructions>
+ {transcription_additional_instructions}
+ </additional_instructions>
+
+
  Generate the optimized transcription following these specifications:
  {format_instructions}
-
  """

  CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -132,10 +225,71 @@ Generate the optimized context following these specifications:
  {format_instructions}
  """

+ WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
+ You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
+ OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+ WORKFLOW:
+ <task_analysis>
+ 1. LANGUAGE DETECTION: Identify the primary language used in the document content
+ 2. SEMANTIC ANALYSIS: Understand the chunk's meaning, relationships, and significance within the broader document
+ 3. CONTEXT GENERATION: Create comprehensive context metadata that enhances retrieval effectiveness
+ 4. SEARCH OPTIMIZATION: Ensure context includes terms and concepts that users might search for
+ 5. QUALITY VALIDATION: Verify context completeness and retrieval utility
+ </task_analysis>
+ CONTEXT GENERATION REQUIREMENTS:
+ <context_elements>
+ Your generated context must synthesize ALL of these elements into a coherent description:
+ - chunk_relation_with_document: How this chunk connects to and fits within the overall document structure and narrative
+ - chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
+ - chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
+ - chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
+ - chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
+ - chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
+ - chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
+ - chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
+ </context_elements>
+ CRITICAL RULES:
+ <critical_rules>
+ - Context MUST be written in the SAME language as the source document content
+ - Be comprehensive yet concise - aim for maximum information density
+ - Prioritize search retrieval optimization and semantic understanding
+ - Include synonyms and alternative phrasings users might search for
+ - Focus on conceptual relationships and knowledge connections
+ - Do NOT reproduce or quote the original chunk content verbatim
+ - Ensure context is self-contained and understandable without the original chunk
+ - Use natural language that flows well while incorporating all required elements
+ </critical_rules>
+
+ SEARCH OPTIMIZATION GUIDELINES:
+ <search_optimization>
+ - Include both explicit terms from the content and implicit concepts
+ - Consider various ways users might phrase queries related to this content
+ - Incorporate hierarchical information (section → subsection → detail level)
+ - Add contextual bridges that connect this chunk to related topics
+ - Use varied vocabulary to capture different search approaches
+ </search_optimization>
+
+ <document_content>
+ {document_content}
+ </document_content>
+
+
+ When provided, follow these additional context extraction instructions:
+ <additional_instructions>
+ {context_additional_instructions}
+ </additional_instructions>
+
+ """
+
+
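Unlike the older CONTEXT_CHUNKS_IN_DOCUMENT prompt, this workflow variant carries no {format_instructions} slot; only the document body and optional instructions are injected. A sketch with invented inputs:

    # Sketch: the whole markdown document rides along with every chunk request.
    document_md = "# Onboarding\n\n## Accounts\nRequest access via the portal..."
    system_message = WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT.format(
        document_content=document_md,
        context_additional_instructions="Prefer English keywords.",
    )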
  class ContextChunk(BaseModel):
- context: str = Field(description="Context description that helps with search retrieval")
+ context: str = Field(
+ description="Context description that helps with search retrieval"
+ )
+

  class Transcription(BaseModel):
  """Document Transcription."""
+
  transcription: str = Field(description="Full transcription")
  language: str = Field(description="Main language")
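The {format_instructions} placeholders in the prompts above pair naturally with these Pydantic models via LangChain's PydanticOutputParser; the diff itself does not show that wiring, so treat this as a sketch:

    from langchain_core.output_parsers import PydanticOutputParser

    # Sketch: derive format_instructions from the Transcription schema.
    parser = PydanticOutputParser(pydantic_object=Transcription)
    system_message = IMAGE_TRANSCRIPTION_SYSTEM_PROMPT.format(
        transcription_additional_instructions="",
        format_instructions=parser.get_format_instructions(),
    )
    # Later: parser.parse(model_output_text) returns a Transcription instance.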
@@ -0,0 +1,10 @@
+ from enum import Enum
+ from typing import Literal
+
+
+ class StorageServices(Enum):
+ S3 = "s3"
+ LOCAL = "local"
+
+
+ storage_services = Literal[StorageServices.S3.value, StorageServices.LOCAL.value]
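At runtime this alias resolves to Literal["s3", "local"], though static checkers such as mypy generally reject non-literal members inside Literal, so they may flag it even though it works when evaluated. A usage sketch with a hypothetical factory (not part of the package):

    def make_storage(service: storage_services):
        # Hypothetical dispatcher keyed on the enum values above.
        if service == StorageServices.S3.value:
            return "build an S3StorageService here"
        return "build a LocalStorageService here"

    make_storage("local")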
wizit_context_ingestor/infra/persistence/local_storage.py
@@ -3,22 +3,27 @@ from ...domain.models import ParsedDoc
  from typing import Optional
  import logging
  import os
+
  logger = logging.getLogger(__name__)

+
  class LocalStorageService(PersistenceService):
  """Persistence service for local storage."""

- def __init__(self):
- self.tmp_folder = "tmp"
+ def __init__(self, source_storage_route: str, target_storage_route: str):
+ self.source_storage_route = source_storage_route
+ self.target_storage_route = target_storage_route
+ self.supports_tagging = hasattr(self, "retrieve_file_tags")

  def load_markdown_file_content(self, file_key: str) -> str:
  """Load markdown file content from local storage."""
  file_content = None
- with open(f"{self.tmp_folder}/{file_key}", "r", encoding="utf-8") as file:
+ with open(
+ f"{self.source_storage_route}/{file_key}", "r", encoding="utf-8"
+ ) as file:
  file_content = file.read()
  return file_content

-
  def retrieve_raw_file(self, file_key: str) -> str:
  """Retrieve file path in tmp folder from local storage.

@@ -32,16 +37,21 @@ class LocalStorageService(PersistenceService):
  ClientError: If there's an error retrieving the object from local storage
  """
  try:
- tmp_file_path = f"{self.tmp_folder}/{file_key}"
+ tmp_file_path = f"{self.source_storage_route}/{file_key}"
  if not os.path.exists(tmp_file_path):
  raise FileNotFoundError(f"File {file_key} not found in local storage")
  return tmp_file_path
  except Exception as e:
- logger.error(f"Unexpected error retrieving file {file_key} from local storage: {str(e)}")
+ logger.error(
+ f"Unexpected error retrieving file {file_key} from local storage: {str(e)}"
+ )
  raise

-
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
+ def save_parsed_document(
+ self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+ ):
  """Save a parsed document."""
- with open(f"{self.tmp_folder}/{file_key}", "w", encoding="utf-8") as f:
+ with open(
+ f"{self.target_storage_route}/{file_key}", "w", encoding="utf-8"
+ ) as f:
  f.write(parsed_document.document_text)
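Note the breaking change here: the zero-argument constructor from 0.2.5b2 (which hard-coded a tmp folder) is gone, so existing callers must now pass explicit source and target directories. A usage sketch with assumed paths:

    # Directory names are illustrative only.
    storage = LocalStorageService(
        source_storage_route="input_docs",
        target_storage_route="parsed_docs",
    )
    markdown = storage.load_markdown_file_content("report.md")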
wizit_context_ingestor/infra/persistence/s3_storage.py
@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)

  class S3StorageService(PersistenceService):
  """Persistence service for S3 storage."""
- __slots__ = ('origin_bucket_name', 'target_bucket_name', 'region_name')
- def __init__(self, origin_bucket_name: str, target_bucket_name: str, region_name: str = 'us-east-1'):
- self.s3 = boto3_client('s3', region_name=region_name)
+
+ __slots__ = ("origin_bucket_name", "target_bucket_name", "region_name")
+
+ def __init__(
+ self,
+ origin_bucket_name: str,
+ target_bucket_name: str,
+ region_name: str = "us-east-1",
+ ):
+ self.s3 = boto3_client("s3", region_name=region_name)
  self.origin_bucket_name = origin_bucket_name
  self.target_bucket_name = target_bucket_name
-
+ self.supports_tagging = hasattr(self, "retrieve_file_tags")

  def load_markdown_file_content(self, file_key: str) -> str:
  """Load markdown file content from S3 storage.
@@ -36,9 +43,9 @@ class S3StorageService(PersistenceService):
  response = self.s3.get_object(Bucket=self.target_bucket_name, Key=file_key)
  tmp_file_key = f"/tmp/{file_key}"
  os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
- with open(tmp_file_key, 'wb') as f:
- f.write(response['Body'].read())
- with open(tmp_file_key, 'r', encoding='utf-8') as f:
+ with open(tmp_file_key, "wb") as f:
+ f.write(response["Body"].read())
+ with open(tmp_file_key, "r", encoding="utf-8") as f:
  file_content = f.read()
  return file_content
  except ClientError as e:
@@ -48,7 +55,6 @@ class S3StorageService(PersistenceService):
  logger.error(f"Unexpected error loading file {file_key} from S3: {str(e)}")
  raise

-
  def retrieve_raw_file(self, file_key: str) -> str:
  """Retrieve file path in tmp folder from S3 storage.

@@ -67,18 +73,21 @@ class S3StorageService(PersistenceService):
  tmp_file_key = f"/tmp/{file_key}"
  # Create parent directories if they don't exist
  os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
- with open(tmp_file_key, 'wb') as f:
- f.write(response['Body'].read())
+ with open(tmp_file_key, "wb") as f:
+ f.write(response["Body"].read())
  return tmp_file_key
  except ClientError as e:
  logger.error(f"Error retrieving file {file_key} from S3: {str(e)}")
  raise
  except Exception as e:
- logger.error(f"Unexpected error retrieving file {file_key} from S3: {str(e)}")
+ logger.error(
+ f"Unexpected error retrieving file {file_key} from S3: {str(e)}"
+ )
  raise

-
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
+ def save_parsed_document(
+ self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+ ):
  """Save a parsed document to S3.

  Args:
@@ -91,21 +100,21 @@ class S3StorageService(PersistenceService):
  """
  try:
  # Convert document content to bytes
- content_bytes = parsed_document.document_text.encode('utf-8')
+ content_bytes = parsed_document.document_text.encode("utf-8")
  # Upload the file to S3
  if not file_tags:
  self.s3.put_object(
- Bucket=self.target_bucket_name,
- Key=file_key,
- Body=content_bytes
+ Bucket=self.target_bucket_name, Key=file_key, Body=content_bytes
  )
  else:
- tagging_string = "&".join([f"{key}={value}" for key, value in file_tags.items()])
+ tagging_string = "&".join(
+ [f"{key}={value}" for key, value in file_tags.items()]
+ )
  self.s3.put_object(
  Bucket=self.target_bucket_name,
  Key=file_key,
  Body=content_bytes,
- Tagging=tagging_string
+ Tagging=tagging_string,
  )

  logger.info(f"Successfully saved document to S3 as {file_key}")
@@ -122,8 +131,5 @@ class S3StorageService(PersistenceService):
  Args:
  file_key: The key (path) to retrieve tags
  """
- response = self.s3.get_object_tagging(
- Bucket=bucket_name,
- Key=file_key
- )
+ response = self.s3.get_object_tagging(Bucket=bucket_name, Key=file_key)
  return {item["Key"]: item["Value"] for item in response["TagSet"]}
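One caveat on the tagging code above: S3's Tagging parameter expects URL-encoded query-string pairs, so the plain "&".join works only for tag values without spaces or reserved characters. A defensive sketch (not part of the package):

    from urllib.parse import urlencode

    # urlencode escapes spaces and reserved characters that the
    # f"{key}={value}" join would pass through unescaped.
    file_tags = {"doc type": "invoice", "lang": "es"}
    tagging_string = urlencode(file_tags)  # "doc+type=invoice&lang=es"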
wizit_context_ingestor/infra/rag/chroma_embeddings.py
@@ -0,0 +1,135 @@
+ from typing_extensions import Sequence
+ from test.test_typing import CoolEmployee
+ from langchain_core.documents import Document
+ from langchain_chroma import Chroma
+ from typing import List
+ import logging
+ from uuid import uuid4
+ from ...application.interfaces import EmbeddingsManager
+
+ # load_dotenv()
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChromaEmbeddingsManager(EmbeddingsManager):
+ __slots__ = ("embeddings_model", "collection_name")
+
+ def __init__(
+ self,
+ embeddings_model,
+ chroma_host=None,
+ **chroma_conn_kwargs,
+ ):
+ """
+ Initialize the ChromaEmbeddingsManager.
+ Args:
+ embeddings_model: The embeddings model to use for generating vector embeddings
+ (typically a LangChain embeddings model instance)
+ chroma_host: The Chroma host URL
+
+ Raises:
+ Exception: If there's an error initializing the RedisEmbeddingsManager
+ """
+ self.embeddings_model = embeddings_model
+ self.chroma_host = chroma_host
+ try:
+ if chroma_host:
+ self.chroma = Chroma(
+ embedding_function=self.embeddings_model,
+ host=chroma_host,
+ **chroma_conn_kwargs,
+ )
+ logger.info("ChromaEmbeddingsManager initialized")
+ else:
+ self.chroma = Chroma(
+ embedding_function=self.embeddings_model, **chroma_conn_kwargs
+ )
+ logger.info("ChromaEmbeddingsManager initialized")
+ except Exception as e:
+ logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+ raise
+
+ def configure_vector_store(
+ self,
+ table_name: str = "",
+ vector_size: int = 768,
+ content_column: str = "document",
+ id_column: str = "id",
+ ):
+ """Configure the vector store."""
+ pass
+
+ def init_vector_store(
+ self,
+ table_name: str = "",
+ content_column: str = "document",
+ id_column: str = "id",
+ ):
+ """Initialize the vector store."""
+ pass
+
+ def index_documents(self, documents: list[Document]):
+ """
+ Add documents to the vector store with their embeddings.
+
+ This method takes a list of Document objects, generates embeddings for them
+ using the embeddings model, and stores both the documents and their
+ embeddings in the PostgreSQL database.
+
+ Args:
+ docs: A list of LangChain Document objects to add to the vector store
+ Each Document should have page_content and metadata attributes
+ from langchain_core.documents import Document
+ Returns:
+ None
+
+ Raises:
+ Exception: If there's an error adding documents to the vector store
+ """
+ try:
+ logger.info(f"Indexing {len(documents)} documents in vector store")
+ self.chroma.add_documents(documents)
+ except Exception as e:
+ logger.error(f"Error indexing documents: {str(e)}")
+ raise
+
+ def get_documents_by_id(self, ids: list[str]):
+ """
+ Get document by ID from the vector store.
+ """
+ try:
+ return self.chroma.get_by_ids(ids)
+ except Exception as e:
+ logger.error(f"Error getting documents by ID: {str(e)}")
+ raise
+
+ def delete_documents_by_id(self, ids: list[str]):
+ """
+ Delete documents by ID from the vector store.
+ """
+ try:
+ self.chroma.delete(ids)
+ except Exception as e:
+ logger.error(f"Error deleting documents by ID: {str(e)}")
+ raise
+
+ def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+ """
+ Delete documents by filter from the vector store.
+ """
+ try:
+ self.chroma.delete(where={metadata_key: metadata_value})
+ except Exception as error:
+ logger.error(
+ f"Error deleting documents by filter: {str(filter)}, error: {error} "
+ )
+ raise
+
+ def get_documents_keys_by_source_id(self, source_id: str):
+ """Get documents keys by source ID."""
+ pass
+
+ def delete_documents_by_source_id(self, source_id: str):
+ """Delete documents by source ID."""
+ pass
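A usage sketch for the new manager. The embeddings model choice and the collection_name/persist_directory kwargs (forwarded through **chroma_conn_kwargs to langchain_chroma's Chroma) are assumptions, not shown in the diff:

    from langchain_core.documents import Document
    from langchain_google_vertexai import VertexAIEmbeddings  # assumed; any LangChain embeddings model should work

    # Illustrative embedded-client setup (no chroma_host).
    embeddings = VertexAIEmbeddings(model_name="text-embedding-004")
    manager = ChromaEmbeddingsManager(
        embeddings_model=embeddings,
        collection_name="context_chunks",
        persist_directory="./chroma_db",
    )
    manager.index_documents([Document(page_content="hello", metadata={"source": "demo"})])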
wizit_context_ingestor/infra/rag/pg_embeddings.py
@@ -6,6 +6,7 @@ from langchain_postgres import PGVectorStore, PGEngine
  from sqlalchemy import create_engine
  from dotenv import load_dotenv
  from wizit_context_ingestor.application.interfaces import EmbeddingsManager
+
  load_dotenv()

  logger = logging.getLogger(__name__)
@@ -38,19 +39,21 @@ class PgEmbeddingsManager(EmbeddingsManager):
  ... )
  >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
  """
+
  __slots__ = ("embeddings_model", "pg_connection")
+
  def __init__(self, embeddings_model, pg_connection: str):
  """
- Initialize the PgEmbeddingsManager.
+ Initialize the PgEmbeddingsManager.

- Args:
- embeddings_model: The embeddings model to use for generating vector embeddings
- (typically a LangChain embeddings model instance)
- pg_connection: The PostgreSQL connection string
- (format: postgresql://user:password@host:port/database)
+ Args:
+ embeddings_model: The embeddings model to use for generating vector embeddings
+ (typically a LangChain embeddings model instance)
+ pg_connection: The PostgreSQL connection string
+ (format: postgresql://user:password@host:port/database)

- Raises:
- Exception: If there's an error initializing the vector store
+ Raises:
+ Exception: If there's an error initializing the vector store
  """
  self.pg_connection = pg_connection
  self.embeddings_model = embeddings_model
@@ -58,65 +61,65 @@ class PgEmbeddingsManager(EmbeddingsManager):
  self.vector_store = None
  self.record_manager = None
  try:
- self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
- logger.info("PgEmbeddingsManager initialized")
+ self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
+ logger.info("PgEmbeddingsManager initialized")
  except Exception as e:
  logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
  raise

  def configure_vector_store(
- self,
- table_name: str = "langchain_pg_embedding",
- vector_size: int = 768,
- content_column: str = "document",
- id_column: str = "id",
- metadata_json_column: str = "cmetadata",
- pg_record_manager: str = "postgres/langchain_pg_collection"
+ self,
+ table_name: str = "langchain_pg_embedding",
+ vector_size: int = 768,
+ content_column: str = "document",
+ id_column: str = "id",
+ metadata_json_column: str = "cmetadata",
+ pg_record_manager: str = "postgres/langchain_pg_collection",
  ):
- self.pg_engine.init_vectorstore_table(
- table_name=table_name,
- vector_size=vector_size,
- content_column=content_column,
- id_column=id_column,
- metadata_json_column=metadata_json_column,
- )
- self.record_manager = SQLRecordManager(
- pg_record_manager,
- engine=create_engine(url=self.pg_connection)
- )
- # TODO move this from here
- self.record_manager.create_schema()
+ self.pg_engine.init_vectorstore_table(
+ table_name=table_name,
+ vector_size=vector_size,
+ content_column=content_column,
+ id_column=id_column,
+ metadata_json_column=metadata_json_column,
+ )
+ self.record_manager = SQLRecordManager(
+ pg_record_manager, engine=create_engine(url=self.pg_connection)
+ )
+ # TODO move this from here
+ self.record_manager.create_schema()

  def init_vector_store(
- self,
- table_name: str = "langchain_pg_embedding",
- content_column: str = "document",
- metadata_json_column: str = "cmetadata",
- id_column: str = "id",
- pg_record_manager: str = "postgres/langchain_pg_collection"
+ self,
+ table_name: str = "langchain_pg_embedding",
+ content_column: str = "document",
+ metadata_json_column: str = "cmetadata",
+ id_column: str = "id",
+ pg_record_manager: str = "postgres/langchain_pg_collection",
  ):
- self.vector_store = PGVectorStore.create_sync(
- embedding_service=self.embeddings_model,
- engine=self.pg_engine,
- table_name=table_name,
- content_column=content_column,
- metadata_json_column=metadata_json_column,
- id_column=id_column,
- )
- self.record_manager = SQLRecordManager(
- pg_record_manager,
- engine=create_engine(url=self.pg_connection)
- )
+ self.vector_store = PGVectorStore.create_sync(
+ embedding_service=self.embeddings_model,
+ engine=self.pg_engine,
+ table_name=table_name,
+ content_column=content_column,
+ metadata_json_column=metadata_json_column,
+ id_column=id_column,
+ )
+ self.record_manager = SQLRecordManager(
+ pg_record_manager, engine=create_engine(url=self.pg_connection)
+ )

  def vector_store_initialized(func):
  """validate vector store initialization"""
+
  def wrapper(self, *args, **kwargs):
- # Common validation logic
- if self.vector_store is None:
- raise Exception("Vector store not initialized")
- if self.record_manager is None:
- raise Exception("Record manager not initialized")
- return func(self, *args, **kwargs)
+ # Common validation logic
+ if self.vector_store is None:
+ raise Exception("Vector store not initialized")
+ if self.record_manager is None:
+ raise Exception("Record manager not initialized")
+ return func(self, *args, **kwargs)
+
  return wrapper

  @vector_store_initialized
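A lifecycle sketch for the reformatted Postgres manager, following the connection-string format given in its own docstring; all values are placeholders:

    # configure_vector_store provisions the table and record-manager schema
    # once; init_vector_store then opens it for indexing. Methods guarded by
    # vector_store_initialized raise until both steps have run.
    manager = PgEmbeddingsManager(
        embeddings_model=embeddings,  # any LangChain embeddings model
        pg_connection="postgresql://user:password@localhost:5432/rag",
    )
    manager.configure_vector_store(vector_size=768)
    manager.init_vector_store()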