wizit-context-ingestor 0.3.0b8__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
- from .main import ChunksManager, TranscriptionManager
1
+ from .main import TranscriptionManager
2
+ from .main_chunks import ChunksManager
2
3
 
3
4
  __all__ = ["ChunksManager", "TranscriptionManager"]
@@ -1,19 +1,20 @@
1
1
  import asyncio
2
+ import logging
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from langchain_core.documents import Document
6
+ from langchain_core.messages.human import HumanMessage
2
7
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
3
8
  from langchain_core.prompts import ChatPromptTemplate
4
- from langchain_core.documents import Document
9
+
5
10
  from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
6
- from langchain_core.messages.human import HumanMessage
7
11
  from ..workflows.context_workflow import ContextWorkflow
8
- from typing import Dict, Any, Optional, List
9
12
  from .interfaces import (
10
13
  AiApplicationService,
14
+ EmbeddingsManager,
11
15
  PersistenceService,
12
16
  RagChunker,
13
- EmbeddingsManager,
14
17
  )
15
- import logging
16
-
17
18
 
18
19
  logger = logging.getLogger(__name__)
19
20
 
@@ -39,7 +40,7 @@ class ContextChunksInDocumentService:
39
40
  self.rag_chunker = rag_chunker
40
41
  self.embeddings_manager = embeddings_manager
41
42
  self.target_language = target_language
42
- self.embeddings_manager.init_vector_store()
43
+ # self.embeddings_manager.init_vector_store()
43
44
  self.chat_model = self.ai_application_service.load_chat_model()
44
45
  # TODO
45
46
  self.context_additional_instructions = ""
@@ -85,74 +86,6 @@ class ContextChunksInDocumentService:
85
86
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
86
87
  raise
87
88
 
88
- # def _retrieve_context_chunk_in_document(
89
- # self,
90
- # markdown_content: str,
91
- # chunk: Document,
92
- # chunk_metadata: Optional[Dict[str, Any]] = None,
93
- # ) -> Document:
94
- # """Retrieve context chunks in document."""
95
- # try:
96
- # chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
97
- # # Create the prompt template with image
98
- # prompt = ChatPromptTemplate.from_messages(
99
- # [
100
- # ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
101
- # (
102
- # "human",
103
- # [
104
- # {
105
- # "type": "text",
106
- # "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
107
- # }
108
- # ],
109
- # ),
110
- # ]
111
- # ).partial(
112
- # document_content=markdown_content,
113
- # format_instructions=chunk_output_parser.get_format_instructions(),
114
- # )
115
- # model_with_structured_output = self.chat_model.with_structured_output(
116
- # ContextChunk
117
- # )
118
- # # Create the chain
119
- # chain = prompt | model_with_structured_output
120
- # # Process the image
121
- # results = chain.invoke({})
122
- # # chunk.page_content = (
123
- # # f"Context:{results.context}, Content:{chunk.page_content}"
124
- # # )
125
- # chunk.metadata["context"] = results.context
126
- # if chunk_metadata:
127
- # for key, value in chunk_metadata.items():
128
- # chunk.metadata[key] = value
129
- # return chunk
130
-
131
- # except Exception as e:
132
- # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
133
- # raise
134
-
135
- # def retrieve_context_chunks_in_document(
136
- # self,
137
- # markdown_content: str,
138
- # chunks: List[Document],
139
- # chunks_metadata: Optional[Dict[str, Any]] = None,
140
- # ) -> List[Document]:
141
- # """Retrieve context chunks in document."""
142
- # try:
143
- # context_chunks = list(
144
- # map(
145
- # lambda chunk: self._retrieve_context_chunk_in_document(
146
- # markdown_content, chunk, chunks_metadata
147
- # ),
148
- # chunks,
149
- # )
150
- # )
151
- # return context_chunks
152
- # except Exception as e:
153
- # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
154
- # raise
155
-
156
89
  async def retrieve_context_chunks_in_document_with_workflow(
157
90
  self,
158
91
  markdown_content: str,
@@ -205,26 +138,7 @@ class ContextChunksInDocumentService:
205
138
  )
206
139
  )
207
140
  logger.info(f"Context chunks generated:{len(context_chunks)}")
208
- # upsert validation
209
- try:
210
- print(f"deleting chunks: {file_key}")
211
- self.delete_document_context_chunks(file_key)
212
- except Exception as e:
213
- logger.error(f"could not delete by source: {e}")
214
- self.embeddings_manager.index_documents(context_chunks)
215
141
  return context_chunks
216
142
  except Exception as e:
217
- logger.error("Error get_context_chunks_in_document")
218
- raise e
219
-
220
- def delete_document_context_chunks(self, file_key: str):
221
- """
222
- Delete the context chunks in a document.
223
- """
224
- try:
225
- self.embeddings_manager.delete_documents_by_metadata_key(
226
- self.metadata_source, file_key
227
- )
228
- except Exception as e:
229
- logger.error(f"Error delete_document_context_chunks: {str(e)}")
143
+ logger.error(f"Error: {str(e)}")
230
144
  raise e
@@ -1,13 +1,19 @@
1
1
  """
2
2
  Application interfaces defining application layer contracts.
3
3
  """
4
+
4
5
  from abc import ABC, abstractmethod
5
- from ..domain.models import ParsedDocPage, ParsedDoc
6
- from typing import List, Union, Optional
7
- from langchain_core.documents import Document
6
+ from typing import List, Optional, Union
7
+
8
+ from langchain.indexes import IndexingResult, SQLRecordManager
8
9
  from langchain_aws import ChatBedrockConverse
10
+ from langchain_core.documents import Document
9
11
  from langchain_google_vertexai import ChatVertexAI
10
12
  from langchain_google_vertexai.model_garden import ChatAnthropicVertex
13
+ from langchain_postgres import PGVectorStore
14
+
15
+ from ..domain.models import ParsedDoc, ParsedDocPage
16
+
11
17
 
12
18
  class TranscriptionService(ABC):
13
19
  """Interface for transcription services."""
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
17
23
  """Parse a document page."""
18
24
  pass
19
25
 
26
+
20
27
  class AiApplicationService(ABC):
21
28
  """Interface for AI application services."""
22
29
 
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
26
33
  # pass
27
34
 
28
35
  @abstractmethod
29
- def load_chat_model(self, **kwargs) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
36
+ def load_chat_model(
37
+ self, **kwargs
38
+ ) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
30
39
  """Load a chat model."""
31
40
  pass
32
41
 
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
40
49
  """Interface for persistence services."""
41
50
 
42
51
  @abstractmethod
43
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
52
+ def save_parsed_document(
53
+ self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
54
+ ):
44
55
  """Save a parsed document."""
45
56
  pass
46
57
 
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
70
81
  @abstractmethod
71
82
  def configure_vector_store(
72
83
  self,
73
- table_name: str = "langchain_pg_embedding",
74
- vector_size: int = 768,
75
- content_column: str = "document",
76
- id_column: str = "id",
77
- metadata_json_column: str = "cmetadata",
78
- pg_record_manager: str = "postgres/langchain_pg_collection"
79
84
  ):
80
85
  """Configure the vector store."""
81
86
  pass
82
87
 
88
+ # @abstractmethod
89
+ # async def init_vector_store(
90
+ # self,
91
+ # table_name: str = "tenant_embeddings",
92
+ # content_column: str = "document",
93
+ # metadata_json_column: str = "cmetadata",
94
+ # id_column: str = "id",
95
+ # ):
96
+ # """Initialize the vector store."""
97
+ # pass
98
+
83
99
  @abstractmethod
84
- def init_vector_store(
100
+ def retrieve_vector_store(
85
101
  self,
86
- table_name: str = "langchain_pg_embedding",
87
- content_column: str = "document",
88
- metadata_json_column: str = "cmetadata",
89
- id_column: str = "id",
90
- ):
91
- """Initialize the vector store."""
102
+ ) -> tuple[PGVectorStore, SQLRecordManager]:
103
+ """Retrieve the vector store."""
92
104
  pass
93
105
 
94
106
  @abstractmethod
95
- def index_documents(self, documents: list[Document]):
107
+ def index_documents(
108
+ self,
109
+ docs: list[Document],
110
+ ) -> IndexingResult:
96
111
  """Index documents."""
97
112
  pass
98
113
 
99
114
  @abstractmethod
100
- def get_documents_keys_by_source_id(self, source_id: str):
101
- """Get documents keys by source ID."""
115
+ def search_records(
116
+ self,
117
+ query: str,
118
+ ) -> list[Document]:
119
+ """Search documents."""
102
120
  pass
103
121
 
104
122
  @abstractmethod
105
- def delete_documents_by_source_id(self, source_id: str):
106
- """Delete documents by source ID."""
123
+ def create_index(
124
+ self,
125
+ ):
107
126
  pass
127
+
128
+ # @abstractmethod
129
+ # def get_documents_keys_by_source_id(self, source_id: str):
130
+ # """Get documents keys by source ID."""
131
+ # pass
132
+
133
+ # @abstractmethod
134
+ # def delete_documents_by_source_id(self, source_id: str):
135
+ # """Delete documents by source ID."""
136
+ # pass
@@ -0,0 +1,59 @@
1
+ import logging
2
+
3
+ from langchain.indexes import SQLRecordManager
4
+ from langchain_core.documents import Document
5
+ from langchain_postgres import PGVectorStore
6
+
7
+ from .interfaces import (
8
+ EmbeddingsManager,
9
+ RagChunker,
10
+ )
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class KdbService:
16
+ """
17
+ Service for chunking documents.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ embeddings_manager: EmbeddingsManager,
23
+ ):
24
+ """
25
+ Initialize the ChunkerService.
26
+ """
27
+ self.embeddings_manager = embeddings_manager
28
+ self._vector_store = None
29
+ self._records_manager = None
30
+
31
+ def configure_kdb(self):
32
+ try:
33
+ self.embeddings_manager.configure_vector_store()
34
+ except Exception as e:
35
+ raise Exception(f"Error configuring KDB: {e}")
36
+
37
+ def create_vector_store_hsnw_index(self):
38
+ try:
39
+ self.embeddings_manager.create_index()
40
+ except Exception as e:
41
+ logger.error(f"Error creating vector store index: {e}")
42
+ raise Exception(f"Error creating vector store index: {e}")
43
+
44
+ def search(self, query: str) -> list[Document]:
45
+ try:
46
+ records = []
47
+ records = self.embeddings_manager.search_records(query)
48
+ print(records)
49
+ return records
50
+ except Exception as e:
51
+ logger.error(f"Error indexing documents: {e}")
52
+ raise Exception(f"Error indexing documents: {e}")
53
+
54
+ def index_documents_in_vector_store(self, documents: list[Document]) -> None:
55
+ try:
56
+ self.embeddings_manager.index_documents(documents)
57
+ except Exception as e:
58
+ logger.error(f"Error indexing documents: {e}")
59
+ raise Exception(f"Error indexing documents: {e}")
@@ -2,9 +2,12 @@ from enum import Enum
2
2
  from typing import Literal
3
3
 
4
4
 
5
- class KdbServices(Enum):
5
+ class KdbServices(str, Enum):
6
6
  REDIS = "redis"
7
7
  CHROMA = "chroma"
8
+ PG = "pg"
8
9
 
9
10
 
10
- kdb_services = Literal[KdbServices.REDIS.value, KdbServices.CHROMA.value]
11
+ kdb_services = Literal[
12
+ KdbServices.REDIS.value, KdbServices.CHROMA.value, KdbServices.PG.value
13
+ ]
@@ -2,7 +2,7 @@ from enum import Enum
2
2
  from typing import Literal
3
3
 
4
4
 
5
- class StorageServices(Enum):
5
+ class StorageServices(str, Enum):
6
6
  S3 = "s3"
7
7
  LOCAL = "local"
8
8
 
@@ -46,26 +46,29 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
46
46
  logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
47
47
  raise
48
48
 
49
- def configure_vector_store(
49
+ async def configure_vector_store(
50
50
  self,
51
51
  table_name: str = "",
52
52
  vector_size: int = 768,
53
53
  content_column: str = "document",
54
54
  id_column: str = "id",
55
+ metadata_json_column: str = "cmetadata",
56
+ pg_record_manager: str = "postgres/langchain_pg_collection",
55
57
  ):
56
58
  """Configure the vector store."""
57
59
  pass
58
60
 
59
- def init_vector_store(
61
+ async def init_vector_store(
60
62
  self,
61
63
  table_name: str = "",
62
64
  content_column: str = "document",
65
+ metadata_json_column: str = "cmetadata",
63
66
  id_column: str = "id",
64
67
  ):
65
68
  """Initialize the vector store."""
66
69
  pass
67
70
 
68
- def index_documents(self, documents: list[Document]):
71
+ async def index_documents(self, documents: list[Document]):
69
72
  """
70
73
  Add documents to the vector store with their embeddings.
71
74
 
@@ -85,7 +88,7 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
85
88
  """
86
89
  try:
87
90
  logger.info(f"Indexing {len(documents)} documents in vector store")
88
- self.chroma.add_documents(documents)
91
+ await self.chroma.aadd_documents(documents)
89
92
  except Exception as e:
90
93
  logger.error(f"Error indexing documents: {str(e)}")
91
94
  raise
@@ -110,12 +113,14 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
110
113
  logger.error(f"Error deleting documents by ID: {str(e)}")
111
114
  raise
112
115
 
113
- def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
116
+ async def delete_documents_by_metadata_key(
117
+ self, metadata_key: str, metadata_value: str
118
+ ):
114
119
  """
115
120
  Delete documents by filter from the vector store.
116
121
  """
117
122
  try:
118
- self.chroma.delete(where={metadata_key: metadata_value})
123
+ await self.chroma.adelete(where={metadata_key: metadata_value})
119
124
  except Exception as error:
120
125
  logger.error(
121
126
  f"Error deleting documents by filter: {str(filter)}, error: {error} "
@@ -1,13 +1,13 @@
1
- from langchain_core.documents import Document
2
- from langchain.indexes import index, SQLRecordManager
3
- from typing import List
4
1
  import logging
5
- from langchain_postgres import PGVectorStore, PGEngine
6
- from sqlalchemy import create_engine
7
- from dotenv import load_dotenv
8
- from wizit_context_ingestor.application.interfaces import EmbeddingsManager
9
2
 
10
- load_dotenv()
3
+ from langchain.indexes import IndexingResult, SQLRecordManager, aindex, index
4
+ from langchain_core.documents import Document
5
+ from langchain_postgres import PGEngine, PGVectorStore
6
+ from langchain_postgres.v2.indexes import HNSWIndex
7
+ from sqlalchemy.ext.asyncio import create_async_engine
8
+ from typing_extensions import Literal
9
+
10
+ from wizit_context_ingestor.application.interfaces import EmbeddingsManager
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
@@ -42,7 +42,17 @@ class PgEmbeddingsManager(EmbeddingsManager):
42
42
 
43
43
  __slots__ = ("embeddings_model", "pg_connection")
44
44
 
45
- def __init__(self, embeddings_model, pg_connection: str):
45
+ def __init__(
46
+ self,
47
+ embeddings_model,
48
+ pg_connection: str,
49
+ embeddings_vectors_table_name: str = "langchain_pg_embedding",
50
+ vector_size: int = 768,
51
+ content_column: str = "document",
52
+ id_column: str = "id",
53
+ metadata_json_column: str = "cmetadata",
54
+ records_manager_table_name: str = "langchain_record_manager",
55
+ ):
46
56
  """
47
57
  Initialize the PgEmbeddingsManager.
48
58
 
@@ -57,155 +67,135 @@ class PgEmbeddingsManager(EmbeddingsManager):
57
67
  """
58
68
  self.pg_connection = pg_connection
59
69
  self.embeddings_model = embeddings_model
60
- self.pg_engine = None
61
70
  self.vector_store = None
62
71
  self.record_manager = None
63
- try:
64
- self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
65
- logger.info("PgEmbeddingsManager initialized")
66
- except Exception as e:
67
- logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
68
- raise
72
+ self.pg_engine = PGEngine.from_connection_string(pg_connection)
73
+ self.embeddings_vectors_table_name = embeddings_vectors_table_name
74
+ self.vector_size = vector_size
75
+ self.content_column = content_column
76
+ self.id_column = id_column
77
+ self.metadata_json_column = metadata_json_column
78
+ self.records_manager_table_name = records_manager_table_name
79
+ # self.async_engine = create_async_engine(pg_connection)
80
+ # self.pg_engine = PGEngine.from_engine(
81
+ # self.async_engine
82
+ # )
83
+ logger.info("PgEmbeddingsManager initialized")
69
84
 
70
85
  def configure_vector_store(
71
86
  self,
72
- table_name: str = "langchain_pg_embedding",
73
- vector_size: int = 768,
74
- content_column: str = "document",
75
- id_column: str = "id",
76
- metadata_json_column: str = "cmetadata",
77
- pg_record_manager: str = "postgres/langchain_pg_collection",
78
87
  ):
79
- self.pg_engine.init_vectorstore_table(
80
- table_name=table_name,
81
- vector_size=vector_size,
82
- content_column=content_column,
83
- id_column=id_column,
84
- metadata_json_column=metadata_json_column,
85
- )
86
- self.record_manager = SQLRecordManager(
87
- pg_record_manager, engine=create_engine(url=self.pg_connection)
88
- )
89
- # TODO move this from here
90
- self.record_manager.create_schema()
91
-
92
- def init_vector_store(
88
+ try:
89
+ self.pg_engine.init_vectorstore_table(
90
+ table_name=self.embeddings_vectors_table_name,
91
+ vector_size=self.vector_size,
92
+ content_column=self.content_column,
93
+ id_column=self.id_column,
94
+ metadata_json_column=self.metadata_json_column,
95
+ )
96
+ record_manager = SQLRecordManager(
97
+ self.records_manager_table_name,
98
+ db_url=self.pg_connection,
99
+ async_mode=False,
100
+ )
101
+ record_manager.create_schema()
102
+ except Exception as e:
103
+ logger.error(f"Error configure_vector_store: {e}")
104
+ raise
105
+
106
+ def retrieve_vector_store(
93
107
  self,
94
- table_name: str = "langchain_pg_embedding",
95
- content_column: str = "document",
96
- metadata_json_column: str = "cmetadata",
97
- id_column: str = "id",
98
- pg_record_manager: str = "postgres/langchain_pg_collection",
99
- ):
100
- self.vector_store = PGVectorStore.create_sync(
101
- embedding_service=self.embeddings_model,
102
- engine=self.pg_engine,
103
- table_name=table_name,
104
- content_column=content_column,
105
- metadata_json_column=metadata_json_column,
106
- id_column=id_column,
107
- )
108
- self.record_manager = SQLRecordManager(
109
- pg_record_manager, engine=create_engine(url=self.pg_connection)
110
- )
111
-
112
- def vector_store_initialized(func):
108
+ ) -> tuple[PGVectorStore, SQLRecordManager]:
109
+ try:
110
+ self.vector_store = PGVectorStore.create_sync(
111
+ embedding_service=self.embeddings_model,
112
+ engine=self.pg_engine,
113
+ table_name=self.embeddings_vectors_table_name,
114
+ content_column=self.content_column,
115
+ metadata_json_column=self.metadata_json_column,
116
+ id_column=self.id_column,
117
+ )
118
+ self.record_manager = SQLRecordManager(
119
+ self.records_manager_table_name, db_url=self.pg_connection
120
+ )
121
+ return (self.vector_store, self.record_manager)
122
+ except Exception as e:
123
+ logger.error(f"Error retrieve vector store: ", e)
124
+ raise e
125
+
126
+ def check_vector_store_init(func):
113
127
  """validate vector store initialization"""
114
128
 
115
129
  def wrapper(self, *args, **kwargs):
116
- # Common validation logic
117
- if self.vector_store is None:
118
- raise Exception("Vector store not initialized")
119
- if self.record_manager is None:
120
- raise Exception("Record manager not initialized")
130
+ if self.vector_store is None or self.record_manager is None:
131
+ self.retrieve_vector_store()
121
132
  return func(self, *args, **kwargs)
122
133
 
123
134
  return wrapper
124
135
 
125
- @vector_store_initialized
126
- def index_documents(self, docs: List[Document]):
136
+ @check_vector_store_init
137
+ def create_index(self):
138
+ try:
139
+ if self.vector_size < 2000:
140
+ index = HNSWIndex()
141
+ self.vector_store.apply_vector_index(index)
142
+ else:
143
+ raise NotImplementedError(
144
+ "Indexing for vector size > 2000 is not supported"
145
+ )
146
+ except Exception as e:
147
+ logger.info(f"Error creating index: {e}")
148
+ raise e
149
+
150
+ @check_vector_store_init
151
+ def index_documents(
152
+ self,
153
+ docs: list[Document],
154
+ cleanup: Literal["incremental", "full", "scoped_full"] | None = "incremental",
155
+ source_id_key: str = "source",
156
+ ) -> IndexingResult:
127
157
  """
128
- Add documents to the vector store with their embeddings.
158
+ Index documents in the vector store with their embeddings.
129
159
 
130
- This method takes a list of Document objects, generates embeddings for them
131
- using the embeddings model, and stores both the documents and their
132
- embeddings in the PostgreSQL database.
160
+ This method takes a list of Document objects and indexes them using LangChain's
161
+ aindex function with incremental cleanup. The documents are processed through
162
+ the embeddings model and stored in the PostgreSQL database with pgvector.
133
163
 
134
164
  Args:
135
- docs: A list of LangChain Document objects to add to the vector store
136
- Each Document should have page_content and metadata attributes
137
- from langchain_core.documents import Document
165
+ vector_store: The PGVectorStore instance to use for storage
166
+ record_manager: The SQLRecordManager instance for tracking indexed documents
167
+ docs: A list of LangChain Document objects to index in the vector store.
168
+ Each Document should have page_content and metadata attributes.
169
+
138
170
  Returns:
139
- None
171
+ IndexingResult: Result object containing information about the indexing operation
140
172
 
141
173
  Raises:
142
- Exception: If there's an error adding documents to the vector store
174
+ Exception: If there's an error during the document indexing process
143
175
  """
144
176
  try:
145
177
  logger.info(f"Indexing {len(docs)} documents in vector store")
178
+ # await self.vector_store.aadd_documents(docs)
146
179
  return index(
147
180
  docs,
148
181
  self.record_manager,
149
182
  self.vector_store,
150
- cleanup="incremental",
151
- source_id_key="source",
183
+ cleanup=cleanup,
184
+ source_id_key=source_id_key,
152
185
  )
153
186
  except Exception as e:
154
187
  logger.error(f"Error indexing documents: {str(e)}")
155
- raise
188
+ raise e
156
189
 
157
- @vector_store_initialized
158
- def get_documents_keys_by_source_id(self, source_id: str):
159
- """
160
- Get document keys by source ID from the vector store.
161
- """
162
- try:
163
- return self.record_manager.list_keys(group_ids=[source_id])
164
- except Exception as e:
165
- logger.error(f"Error getting documents keys by source ID: {str(e)}")
166
- raise
167
-
168
- @vector_store_initialized
169
- def delete_documents_by_source_id(self, source_id: str):
170
- """
171
- Delete documents by source ID from the vector store.
172
- """
190
+ @check_vector_store_init
191
+ def search_records(
192
+ self,
193
+ query: str,
194
+ ) -> list[Document]:
173
195
  try:
174
- objects_keys = self.get_documents_keys_by_source_id(source_id)
175
- self.record_manager.delete_keys(objects_keys)
176
- self.vector_store.delete(ids=objects_keys)
196
+ logger.info(f"Searching for '{query}' in vector store")
197
+ reply = self.vector_store.search(query=query, search_type="similarity", k=1)
198
+ return reply
177
199
  except Exception as e:
178
- logger.error(f"Error deleting documents by source ID: {str(e)}")
179
- raise
180
-
181
- # def get_retriever(self, search_type: str = "mmr", k: int = 20):
182
- # """
183
- # Get a retriever interface to the vector store for semantic search.
184
-
185
- # This method returns a LangChain retriever object that can be used in retrieval
186
- # pipelines, retrieval-augmented generation, and other LangChain chains.
187
-
188
- # Args:
189
- # search_type: The search algorithm to use. Options include:
190
- # - "similarity" (standard cosine similarity)
191
- # - "mmr" (Maximum Marginal Relevance, balances relevance with diversity)
192
- # - "similarity_score_threshold" (filters by minimum similarity)
193
- # k: The number of documents to retrieve (default: 20)
194
-
195
- # Returns:
196
- # Retriever: A LangChain Retriever object that can be used in chains and pipelines
197
-
198
- # Raises:
199
- # Exception: If there's an error creating the retriever
200
-
201
- # Example:
202
- # >>> retriever = pg_manager.get_retriever(search_type="mmr", k=5)
203
- # >>> docs = retriever.get_relevant_documents("quantum computing")
204
- # """
205
- # try:
206
- # return self.vector_store.as_retriever(
207
- # search_type=search_type, search_kwargs={"k": k}
208
- # )
209
- # except Exception as e:
210
- # logger.info(f"failed to get vector store as retriever {str(e)}")
211
- # raise
200
+ logger.error(f"Error indexing documents: {str(e)}")
201
+ raise e
@@ -3,6 +3,7 @@
3
3
  # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
4
4
  # https://python.langchain.com/docs/how_to/embed_text/
5
5
  import logging
6
+ import uuid
6
7
  from typing import List, Any
7
8
  from langchain_core.documents import Document
8
9
  from langchain_experimental.text_splitter import SemanticChunker
@@ -58,7 +59,7 @@ class SemanticChunks(RagChunker):
58
59
  source = document.metadata["source"]
59
60
  for i, chunk in enumerate(chunks):
60
61
  if document.metadata["source"]:
61
- chunk.id = f"{source}-{i}"
62
+ chunk.id = f"{uuid.uuid4()}"
62
63
  logger.info(f"{len(chunks)} chunks generated successfully")
63
64
  return chunks
64
65
  except Exception as e:
@@ -11,7 +11,7 @@ from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
11
11
  from .infra.secrets.aws_secrets_manager import AwsSecretsManager
12
12
  from .data.storage import storage_services, StorageServices
13
13
  from .data.kdb import kdb_services, KdbServices
14
- from .utils.file_utils import has_invalid_file_name_format
14
+ from .utils.file_utils import validate_file_name_format
15
15
  from langsmith import Client, tracing_context
16
16
 
17
17
 
@@ -143,7 +143,7 @@ class TranscriptionManager:
143
143
  Exception: If an error occurs during the transcription process.
144
144
  """
145
145
  try:
146
- if has_invalid_file_name_format(file_key):
146
+ if not validate_file_name_format(file_key):
147
147
  raise ValueError(
148
148
  "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
149
149
  )
@@ -0,0 +1,173 @@
1
+ import json
2
+ from logging import getLogger
3
+ from typing import Any, Dict, Literal
4
+
5
+ from langchain_core.documents import Document
6
+ from langsmith import Client, tracing_context
7
+
8
+ from .application.context_chunk_service import ContextChunksInDocumentService
9
+ from .application.kdb_service import KdbService
10
+ from .data.storage import StorageServices
11
+ from .infra.persistence.local_storage import LocalStorageService
12
+ from .infra.persistence.s3_storage import S3StorageService
13
+ from .infra.rag.pg_embeddings import PgEmbeddingsManager
14
+ from .infra.rag.semantic_chunks import SemanticChunks
15
+ from .infra.secrets.aws_secrets_manager import AwsSecretsManager
16
+ from .infra.vertex_model import VertexModels
17
+ from .utils.file_utils import validate_file_name_format
18
+
19
+ logger = getLogger(__name__)
20
+
21
+
22
+ class KdbManager:
23
+ def __init__(
24
+ self, embeddings_model, kdb_service: Literal["pg"], kdb_params: Dict[Any, Any]
25
+ ):
26
+ self.kdb_service = kdb_service
27
+ self.kdb_params = kdb_params
28
+ self.embeddings_model = embeddings_model
29
+
30
+ def retrieve_kdb_service(self):
31
+ return PgEmbeddingsManager(self.embeddings_model, **self.kdb_params)
32
+
33
+
34
+ class PersistenceManager:
35
+ def __init__(
36
+ self,
37
+ storage_service: Literal["s3", "local"],
38
+ source_storage_route,
39
+ target_storage_route,
40
+ ):
41
+ self.storage_service = storage_service
42
+ self.source_storage_route = source_storage_route
43
+ self.target_storage_route = target_storage_route
44
+
45
+ def retrieve_storage_service(self):
46
+ if self.storage_service == StorageServices.S3.value:
47
+ return S3StorageService(
48
+ origin_bucket_name=self.source_storage_route,
49
+ target_bucket_name=self.target_storage_route,
50
+ )
51
+ elif self.storage_service == StorageServices.LOCAL.value:
52
+ return LocalStorageService(
53
+ source_storage_route=self.source_storage_route,
54
+ target_storage_route=self.target_storage_route,
55
+ )
56
+ else:
57
+ raise ValueError(f"Unsupported storage service: {self.storage_service}")
58
+
59
+
60
+ class ChunksManager:
61
+ def __init__(
62
+ self,
63
+ gcp_project_id: str,
64
+ gcp_project_location: str,
65
+ gcp_secret_name: str,
66
+ langsmith_api_key: str,
67
+ langsmith_project_name: str,
68
+ storage_service: Literal["s3", "local"],
69
+ kdb_service: Literal["pg"],
70
+ kdb_params: Dict[Any, Any],
71
+ llm_model_id: str = "claude-3-5-haiku@20241022",
72
+ embeddings_model_id: str = "text-multilingual-embedding-002",
73
+ target_language: str = "es",
74
+ ):
75
+ self.gcp_project_id = gcp_project_id
76
+ self.gcp_project_location = gcp_project_location
77
+ self.aws_secrets_manager = AwsSecretsManager()
78
+ self.gcp_secret_name = gcp_secret_name
79
+ self.llm_model_id = llm_model_id
80
+ self.target_language = target_language
81
+ self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
82
+ self.storage_service = storage_service
83
+ self.kdb_params = kdb_params
84
+ self.kdb_service = kdb_service
85
+ self.vertex_model = self._get_vertex_model()
86
+ self.embeddings_model = self.vertex_model.load_embeddings_model(
87
+ embeddings_model_id
88
+ )
89
+ self.langsmith_api_key = langsmith_api_key
90
+ self.langsmith_project_name = langsmith_project_name
91
+ self.langsmith_client = Client(api_key=self.langsmith_api_key)
92
+ self.kdb_manager = KdbManager(self.embeddings_model, "pg", self.kdb_params)
93
+ self.pg_embeddings_manager = self.kdb_manager.retrieve_kdb_service()
94
+ self.rag_chunker = SemanticChunks(self.embeddings_model)
95
+ self.kdb_service = KdbService(
96
+ self.pg_embeddings_manager,
97
+ )
98
+
99
+ def _get_gcp_sa_dict(self, gcp_secret_name: str):
100
+ vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
101
+ vertex_gcp_sa_dict = json.loads(vertex_gcp_sa)
102
+ return vertex_gcp_sa_dict
103
+
104
+ def _get_vertex_model(self):
105
+ vertex_model = VertexModels(
106
+ self.gcp_project_id,
107
+ self.gcp_project_location,
108
+ self.gcp_sa_dict,
109
+ llm_model_id=self.llm_model_id,
110
+ )
111
+ return vertex_model
112
+
113
+ def provision_vector_store(self):
114
+ try:
115
+ self.kdb_service.configure_kdb()
116
+ self.kdb_service.create_vector_store_hsnw_index()
117
+ except Exception as e:
118
+ logger.error(f"Error configuring vector store: {e}")
119
+
120
+ def index_documents_in_vector_store(self, docs: list[Document]):
121
+ try:
122
+ self.kdb_service.index_documents_in_vector_store(docs)
123
+ except Exception as e:
124
+ logger.error(f"Error indexing documents in vector store: {e}")
125
+
126
+ def search_records(self, query):
127
+ return self.kdb_service.search(query)
128
+
129
+ def tracing(func):
130
+ async def gen_tracing_context(self, *args, **kwargs):
131
+ with tracing_context(
132
+ enabled=True,
133
+ project_name=self.langsmith_project_name,
134
+ client=self.langsmith_client,
135
+ ):
136
+ return await func(self, *args, **kwargs)
137
+
138
+ return gen_tracing_context
139
+
140
+ @tracing
141
+ async def gen_context_chunks(
142
+ self, file_key: str, source_storage_route: str, target_storage_route: str
143
+ ):
144
+ try:
145
+ validate_file_name_format(file_key)
146
+ persistence_layer = PersistenceManager(
147
+ self.storage_service, source_storage_route, target_storage_route
148
+ )
149
+ persistence_service = persistence_layer.retrieve_storage_service()
150
+ target_bucket_file_tags = {}
151
+ if persistence_service.supports_tagging:
152
+ target_bucket_file_tags = persistence_service.retrieve_file_tags(
153
+ file_key, target_storage_route
154
+ )
155
+ rag_chunker = SemanticChunks(self.embeddings_model)
156
+ kdb_manager = KdbManager(self.embeddings_model, "pg", self.kdb_params)
157
+ kdb_service = kdb_manager.retrieve_kdb_service()
158
+ context_chunks_in_document_service = ContextChunksInDocumentService(
159
+ ai_application_service=self.vertex_model,
160
+ persistence_service=persistence_service,
161
+ rag_chunker=rag_chunker,
162
+ embeddings_manager=kdb_service,
163
+ target_language=self.target_language,
164
+ )
165
+ context_chunks = (
166
+ await context_chunks_in_document_service.get_context_chunks_in_document(
167
+ file_key, target_bucket_file_tags
168
+ )
169
+ )
170
+ return context_chunks
171
+ except Exception as e:
172
+ print(f"Error getting context chunks in document: {e}")
173
+ raise e
@@ -1,13 +1,12 @@
1
1
  import re
2
2
 
3
3
 
4
- def has_invalid_file_name_format(file_name):
4
+ def validate_file_name_format(file_name):
5
5
  """Check if file name has special characters or spaces instead of underscores"""
6
- # Check for spaces
7
- if " " in file_name:
8
- return True
9
-
10
6
  # Check for special characters (anything that's not alphanumeric, underscore, dash, dot, slash, or backslash)
11
- if re.search(r"[^a-zA-Z0-9_.-/\\]", file_name):
7
+ if re.search(r"[^a-zA-Z0-9_.\-/\\]", file_name) is None:
12
8
  return True
13
- return False
9
+ else:
10
+ raise ValueError(
11
+ "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
12
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.3.0b8
3
+ Version: 0.4.0
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
@@ -8,10 +8,13 @@ Requires-Dist: langchain-aws>=0.2.31
8
8
  Requires-Dist: langchain-chroma>=0.2.6
9
9
  Requires-Dist: langchain-experimental>=0.3.4
10
10
  Requires-Dist: langchain-google-vertexai>=2.0.28
11
+ Requires-Dist: langchain-postgres>=0.0.16
11
12
  Requires-Dist: langchain-redis>=0.2.3
12
13
  Requires-Dist: langgraph>=0.6.8
13
14
  Requires-Dist: pillow>=11.3.0
15
+ Requires-Dist: psycopg2-binary>=2.9.11
14
16
  Requires-Dist: pymupdf>=1.26.4
17
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.43
15
18
  Requires-Python: >=3.12
16
19
  Description-Content-Type: text/markdown
17
20
 
@@ -1,13 +1,14 @@
1
1
  wizit_context_ingestor/.DS_Store,sha256=c7hZ0C8v2hxprMlCgmvxXDl92phew3iSATJzE1yYTBs,6148
2
- wizit_context_ingestor/__init__.py,sha256=TSTm5qSpNNCz9ilKYkXRUxupvmWG2AHfv7RBWFw8T4c,107
2
+ wizit_context_ingestor/__init__.py,sha256=aFOBTwoH2ZH4KvCzzdfQ0PH1yBV3aLdXXye-VtuXLRo,131
3
3
  wizit_context_ingestor/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- wizit_context_ingestor/application/context_chunk_service.py,sha256=LYRKBsY30IT2LczkgkYdPx7W3yycRy-0m7t3KKgq6Nw,9046
5
- wizit_context_ingestor/application/interfaces.py,sha256=W0qonE3t-S-zwAoKtDYc4oyW_GOILKVmrdy8LnC8MVI,3193
4
+ wizit_context_ingestor/application/context_chunk_service.py,sha256=-efSbpLJS5968Qh6Ho_c6et4-g_L0gPgR-Z9URO_cS0,5504
5
+ wizit_context_ingestor/application/interfaces.py,sha256=oG-a8JbGw-YT8Xtt0SDJ72FZwDNogyBzP93k_tBIC6k,3528
6
+ wizit_context_ingestor/application/kdb_service.py,sha256=CYQ-2aLvUZHCgCwmuMeZsANhLbm54gtfpiKsg1a81qI,1757
6
7
  wizit_context_ingestor/application/transcription_service.py,sha256=FlUcMGyAotAO8MmT5UMlPMbgIWVQLg7YO6rJx9ANn7A,8567
7
8
  wizit_context_ingestor/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- wizit_context_ingestor/data/kdb.py,sha256=GCkXQmnk2JCXV_VJ-h0k55AOIX8qohzBJN2v-9D1dlU,194
9
+ wizit_context_ingestor/data/kdb.py,sha256=lJ4F0ltVhtqC80QqHsm49a2FnSEI_fep61_Vfd65x34,241
9
10
  wizit_context_ingestor/data/prompts.py,sha256=bzgLdjINtXGQVTy4ZZktdcNItbtDQpM7maAQ2UBGdnY,14187
10
- wizit_context_ingestor/data/storage.py,sha256=aanXY1AV696cShHtDDhlJDhKPouZ1dq2lo_57yhTd20,198
11
+ wizit_context_ingestor/data/storage.py,sha256=e29IH47MfN0qAuzrcZ-VlHw1sk7dT5QqYk6wO273nCk,203
11
12
  wizit_context_ingestor/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
13
  wizit_context_ingestor/domain/models.py,sha256=DV83PArMyh-VoUqnVF_ohcgStsk549ixdYw98B8o2GI,381
13
14
  wizit_context_ingestor/domain/services.py,sha256=dg8UvYSjYsOMphrciZyGvuRriM8Qf08SstvO979XrFc,3344
@@ -16,20 +17,21 @@ wizit_context_ingestor/infra/aws_model.py,sha256=glIaewSdv6PDBXoCe6QgCUIzLCjtM7K
16
17
  wizit_context_ingestor/infra/persistence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
18
  wizit_context_ingestor/infra/persistence/local_storage.py,sha256=GtPUvtn8XlgcqwjWmSm2998sgyYlwkF22HoB40ri7c0,2029
18
19
  wizit_context_ingestor/infra/persistence/s3_storage.py,sha256=bzlQteLPPGS_Gbh39RkxyoK8G-CEOQewMNPuzPule9k,4906
19
- wizit_context_ingestor/infra/rag/chroma_embeddings.py,sha256=fV6Ays8Vu4rzwp7kJiFx5HwepGeepk95Kzh_68Qjtkc,4298
20
- wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=D7onh27SvqYahYAsLy6DeyklxGyBFYH2DwV42fVCalQ,8157
20
+ wizit_context_ingestor/infra/rag/chroma_embeddings.py,sha256=8VSXw38TX8nfGyJ5SXtbyqDVt3XI6gtEoie0g_v5Ax8,4517
21
+ wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=f4fq75uot4JPaz8j7fmKxGWGhaLs5WcCtfLWBLvMA_M,7754
21
22
  wizit_context_ingestor/infra/rag/redis_embeddings.py,sha256=pCP_I1RLeIUTYMSHkZT6AjIOyHA9A47wyffrZBjiG0s,5107
22
- wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=miPG7gGYVIyQzTQMtQwDPknI9JlZroUP0m9E7ahfGfY,2449
23
+ wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=Peylky5rsWERtu5psmet4SL_VYpnK5hsEIyE0Ua1wGE,2463
23
24
  wizit_context_ingestor/infra/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
25
  wizit_context_ingestor/infra/secrets/aws_secrets_manager.py,sha256=vukil5sO9tQPTM74wUbyQqR8Z-z0ElyjeF2ns7rbVbQ,1249
25
26
  wizit_context_ingestor/infra/vertex_model.py,sha256=6L2C4qH7PSVjdOSzIEZlFtUwu1pgQVXtQBIU5isn644,7582
26
- wizit_context_ingestor/main.py,sha256=vP9311d9TcXtoxmOE1-4jAzKjrNU5ZaxiJiG2d9IV1w,11687
27
+ wizit_context_ingestor/main.py,sha256=2gJ7B39aS2U742cJQWjVGHbQ-NslYFdGQXzljGd83WA,11685
28
+ wizit_context_ingestor/main_chunks.py,sha256=07zT76VB_RzCXV3toJgTQBPCmiVud860vVzVkMLwDuA,6784
27
29
  wizit_context_ingestor/services/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
28
30
  wizit_context_ingestor/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
31
  wizit_context_ingestor/services/chunks.py,sha256=tQQsdsOscZWzqVY5WxVxr3ii62FOJ3nMARaJJz6CvjQ,2011
30
32
  wizit_context_ingestor/services/parse_doc.py,sha256=3CyZoGbiUfxbs0SXUWXjQevtusSzTBgvUVeNNSdxJLE,4491
31
33
  wizit_context_ingestor/services/pg_embeddings_manager.py,sha256=n1HOmu_Z_Z71H-rVAyJS3FdPKbBckm5W8_XethY8nuM,4998
32
- wizit_context_ingestor/utils/file_utils.py,sha256=Wuua14LivrfL8oBP-j3ZtSqc7uq4rrQzT-T-p7pxEpM,414
34
+ wizit_context_ingestor/utils/file_utils.py,sha256=BmNN71exw75IKWvdDvyvo4QlGpl5W-nTCgimDnmD4Eo,501
33
35
  wizit_context_ingestor/workflows/context_nodes.py,sha256=3qlFcxPUmehx04mQHpmouneKq--To8rwSDHCRFyWICo,3168
34
36
  wizit_context_ingestor/workflows/context_state.py,sha256=4MTIUjK-F2pWvIldovWZhMAqqCOpViKbvitJzETkSkY,324
35
37
  wizit_context_ingestor/workflows/context_tools.py,sha256=E9VTL3AC0MwSIuc1e-juZK7XCxnZfFv0-KpHfR2CNH4,2764
@@ -39,6 +41,6 @@ wizit_context_ingestor/workflows/transcription_schemas.py,sha256=CQCl7LXD5voxhJO
39
41
  wizit_context_ingestor/workflows/transcription_state.py,sha256=2Z_t2aZFEH_nAjdEO6RFBEmi_fwvr9cV0aLS1eIxiCQ,590
40
42
  wizit_context_ingestor/workflows/transcription_tools.py,sha256=FtIfWFITn8_Rr5SEobCeR55aJGZoHRMgF2UxRT5vJ-E,1373
41
43
  wizit_context_ingestor/workflows/transcription_workflow.py,sha256=77cLsYGdv01Py2GaKYpACuifPeSxH7tkVodvLv97sdg,1621
42
- wizit_context_ingestor-0.3.0b8.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
43
- wizit_context_ingestor-0.3.0b8.dist-info/METADATA,sha256=7pmXei8lCU0BAPl2m7T9k7nlZFAEOAANgndG3OPbIHY,3768
44
- wizit_context_ingestor-0.3.0b8.dist-info/RECORD,,
44
+ wizit_context_ingestor-0.4.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
45
+ wizit_context_ingestor-0.4.0.dist-info/METADATA,sha256=hZxwWZSUb5hQXEoWZ5ulaI0fmPZYSyDmgOWHxizp6oQ,3890
46
+ wizit_context_ingestor-0.4.0.dist-info/RECORD,,