wizit-context-ingestor 0.3.0b2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
- from .main import ChunksManager, TranscriptionManager
+ from .main import TranscriptionManager
+ from .main_chunks import ChunksManager
 
  __all__ = ["ChunksManager", "TranscriptionManager"]
@@ -1,18 +1,20 @@
+ import asyncio
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ from langchain_core.documents import Document
+ from langchain_core.messages.human import HumanMessage
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
  from langchain_core.prompts import ChatPromptTemplate
- from langchain_core.documents import Document
+
  from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
- from langchain_core.messages.human import HumanMessage
  from ..workflows.context_workflow import ContextWorkflow
- from typing import Dict, Any, Optional, List
  from .interfaces import (
  AiApplicationService,
+ EmbeddingsManager,
  PersistenceService,
  RagChunker,
- EmbeddingsManager,
  )
- import logging
-
 
  logger = logging.getLogger(__name__)
 
@@ -38,13 +40,13 @@ class ContextChunksInDocumentService:
  self.rag_chunker = rag_chunker
  self.embeddings_manager = embeddings_manager
  self.target_language = target_language
- self.embeddings_manager.init_vector_store()
+ # self.embeddings_manager.init_vector_store()
  self.chat_model = self.ai_application_service.load_chat_model()
  # TODO
  self.context_additional_instructions = ""
  self.metadata_source = "source"
 
- def _retrieve_context_chunk_in_document_with_workflow(
+ async def _retrieve_context_chunk_in_document_with_workflow(
  self,
  workflow,
  markdown_content: str,
@@ -53,7 +55,7 @@ class ContextChunksInDocumentService:
  ) -> Document:
  """Retrieve context chunks in document."""
  try:
- result = workflow.invoke(
+ result = await workflow.ainvoke(
  {
  "messages": [
  HumanMessage(
@@ -74,9 +76,7 @@ class ContextChunksInDocumentService:
  }
  },
  )
- # chunk.page_content = (
- # f"Context:{result['context']}, Content:{chunk.page_content}"
- # )
+ chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
  chunk.metadata["context"] = result["context"]
  if chunk_metadata:
  for key, value in chunk_metadata.items():
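The new page_content layout above wraps each chunk's generated context and its original text in XML-style tags. A minimal, dependency-free sketch of how that composition reads (the variable names here are illustrative, not the package's):

```python
# Illustrative only: how the <context>/<content> wrapping composes.
context = "Short summary situating this chunk within the source document."
content = "The chunk's original text."

page_content = f"<context>\n{context}\n</context>\n <content>\n{content}\n</content>"
print(page_content)
```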
@@ -86,75 +86,7 @@ class ContextChunksInDocumentService:
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
  raise
 
- # def _retrieve_context_chunk_in_document(
- # self,
- # markdown_content: str,
- # chunk: Document,
- # chunk_metadata: Optional[Dict[str, Any]] = None,
- # ) -> Document:
- # """Retrieve context chunks in document."""
- # try:
- # chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
- # # Create the prompt template with image
- # prompt = ChatPromptTemplate.from_messages(
- # [
- # ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
- # (
- # "human",
- # [
- # {
- # "type": "text",
- # "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
- # }
- # ],
- # ),
- # ]
- # ).partial(
- # document_content=markdown_content,
- # format_instructions=chunk_output_parser.get_format_instructions(),
- # )
- # model_with_structured_output = self.chat_model.with_structured_output(
- # ContextChunk
- # )
- # # Create the chain
- # chain = prompt | model_with_structured_output
- # # Process the image
- # results = chain.invoke({})
- # # chunk.page_content = (
- # # f"Context:{results.context}, Content:{chunk.page_content}"
- # # )
- # chunk.metadata["context"] = results.context
- # if chunk_metadata:
- # for key, value in chunk_metadata.items():
- # chunk.metadata[key] = value
- # return chunk
-
- # except Exception as e:
- # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
- # raise
-
- # def retrieve_context_chunks_in_document(
- # self,
- # markdown_content: str,
- # chunks: List[Document],
- # chunks_metadata: Optional[Dict[str, Any]] = None,
- # ) -> List[Document]:
- # """Retrieve context chunks in document."""
- # try:
- # context_chunks = list(
- # map(
- # lambda chunk: self._retrieve_context_chunk_in_document(
- # markdown_content, chunk, chunks_metadata
- # ),
- # chunks,
- # )
- # )
- # return context_chunks
- # except Exception as e:
- # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
- # raise
-
- def retrieve_context_chunks_in_document_with_workflow(
+ async def retrieve_context_chunks_in_document_with_workflow(
  self,
  markdown_content: str,
  chunks: List[Document],
@@ -167,7 +99,7 @@ class ContextChunksInDocumentService:
  )
  compiled_context_workflow = context_workflow.gen_workflow()
  compiled_context_workflow = compiled_context_workflow.compile()
- context_chunks = list(
+ context_chunks_workflow_invocations = list(
  map(
  lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
  compiled_context_workflow,
@@ -178,12 +110,13 @@ class ContextChunksInDocumentService:
  chunks,
  )
  )
+ context_chunks = await asyncio.gather(*context_chunks_workflow_invocations)
  return context_chunks
  except Exception as e:
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
  raise
 
- def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
+ async def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
  """
  Get the context chunks in a document.
  """
@@ -199,30 +132,13 @@ class ContextChunksInDocumentService:
  logger.info(f"Document loaded:{file_key}")
  chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
  logger.info(f"Chunks generated:{len(chunks)}")
- context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
- markdown_content, chunks, file_tags
+ context_chunks = (
+ await self.retrieve_context_chunks_in_document_with_workflow(
+ markdown_content, chunks, file_tags
+ )
  )
  logger.info(f"Context chunks generated:{len(context_chunks)}")
- # upsert validation
- try:
- print(f"deleting chunks: {file_key}")
- self.delete_document_context_chunks(file_key)
- except Exception as e:
- logger.error(f"could not delete by source: {e}")
- self.embeddings_manager.index_documents(context_chunks)
  return context_chunks
  except Exception as e:
- logger.error("Error get_context_chunks_in_document")
- raise e
-
- def delete_document_context_chunks(self, file_key: str):
- """
- Delete the context chunks in a document.
- """
- try:
- self.embeddings_manager.delete_documents_by_metadata_key(
- self.metadata_source, file_key
- )
- except Exception as e:
- logger.error(f"Error delete_document_context_chunks: {str(e)}")
+ logger.error(f"Error: {str(e)}")
  raise e
@@ -1,13 +1,19 @@
  """
  Application interfaces defining application layer contracts.
  """
+
  from abc import ABC, abstractmethod
- from ..domain.models import ParsedDocPage, ParsedDoc
- from typing import List, Union, Optional
- from langchain_core.documents import Document
+ from typing import List, Optional, Union
+
+ from langchain.indexes import IndexingResult, SQLRecordManager
  from langchain_aws import ChatBedrockConverse
+ from langchain_core.documents import Document
  from langchain_google_vertexai import ChatVertexAI
  from langchain_google_vertexai.model_garden import ChatAnthropicVertex
+ from langchain_postgres import PGVectorStore
+
+ from ..domain.models import ParsedDoc, ParsedDocPage
+
 
  class TranscriptionService(ABC):
  """Interface for transcription services."""
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
  """Parse a document page."""
  pass
 
+
  class AiApplicationService(ABC):
  """Interface for AI application services."""
 
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
  # pass
 
  @abstractmethod
- def load_chat_model(self, **kwargs) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
+ def load_chat_model(
+ self, **kwargs
+ ) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
  """Load a chat model."""
  pass
 
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
  """Interface for persistence services."""
 
  @abstractmethod
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
+ def save_parsed_document(
+ self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+ ):
  """Save a parsed document."""
  pass
 
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
  @abstractmethod
  def configure_vector_store(
  self,
- table_name: str = "langchain_pg_embedding",
- vector_size: int = 768,
- content_column: str = "document",
- id_column: str = "id",
- metadata_json_column: str = "cmetadata",
- pg_record_manager: str = "postgres/langchain_pg_collection"
  ):
  """Configure the vector store."""
  pass
 
+ # @abstractmethod
+ # async def init_vector_store(
+ # self,
+ # table_name: str = "tenant_embeddings",
+ # content_column: str = "document",
+ # metadata_json_column: str = "cmetadata",
+ # id_column: str = "id",
+ # ):
+ # """Initialize the vector store."""
+ # pass
+
  @abstractmethod
- def init_vector_store(
+ def retrieve_vector_store(
  self,
- table_name: str = "langchain_pg_embedding",
- content_column: str = "document",
- metadata_json_column: str = "cmetadata",
- id_column: str = "id",
- ):
- """Initialize the vector store."""
+ ) -> tuple[PGVectorStore, SQLRecordManager]:
+ """Retrieve the vector store."""
  pass
 
  @abstractmethod
- def index_documents(self, documents: list[Document]):
+ def index_documents(
+ self,
+ docs: list[Document],
+ ) -> IndexingResult:
  """Index documents."""
  pass
 
  @abstractmethod
- def get_documents_keys_by_source_id(self, source_id: str):
- """Get documents keys by source ID."""
+ def search_records(
+ self,
+ query: str,
+ ) -> list[Document]:
+ """Search documents."""
  pass
 
  @abstractmethod
- def delete_documents_by_source_id(self, source_id: str):
- """Delete documents by source ID."""
+ def create_index(
+ self,
+ ):
  pass
+
+ # @abstractmethod
+ # def get_documents_keys_by_source_id(self, source_id: str):
+ # """Get documents keys by source ID."""
+ # pass
+
+ # @abstractmethod
+ # def delete_documents_by_source_id(self, source_id: str):
+ # """Delete documents by source ID."""
+ # pass
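The reworked EmbeddingsManager contract above types index_documents with langchain's IndexingResult and pairs the vector store with a SQLRecordManager. As a hedged sketch of how record-manager-backed indexing is commonly wired with langchain's index() helper (the connection string, embedding, and in-memory vector store below are placeholders, not this package's configuration):

```python
from langchain.indexes import SQLRecordManager, index
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.vectorstores import InMemoryVectorStore

# The record manager tracks which document hashes have already been indexed.
record_manager = SQLRecordManager("demo/docs", db_url="sqlite:///record_manager.db")
record_manager.create_schema()

# Any vector store works here; a fake embedding keeps the sketch self-contained.
vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=32))

docs = [Document(page_content="hello world", metadata={"source": "demo.md"})]
result = index(docs, record_manager, vector_store, cleanup="incremental", source_id_key="source")
print(result)  # IndexingResult-style counts: num_added / num_updated / num_skipped / num_deleted
```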
@@ -0,0 +1,59 @@
+ import logging
+
+ from langchain.indexes import SQLRecordManager
+ from langchain_core.documents import Document
+ from langchain_postgres import PGVectorStore
+
+ from .interfaces import (
+ EmbeddingsManager,
+ RagChunker,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class KdbService:
+ """
+ Service for chunking documents.
+ """
+
+ def __init__(
+ self,
+ embeddings_manager: EmbeddingsManager,
+ ):
+ """
+ Initialize the ChunkerService.
+ """
+ self.embeddings_manager = embeddings_manager
+ self._vector_store = None
+ self._records_manager = None
+
+ def configure_kdb(self):
+ try:
+ self.embeddings_manager.configure_vector_store()
+ except Exception as e:
+ raise Exception(f"Error configuring KDB: {e}")
+
+ def create_vector_store_hsnw_index(self):
+ try:
+ self.embeddings_manager.create_index()
+ except Exception as e:
+ logger.error(f"Error creating vector store index: {e}")
+ raise Exception(f"Error creating vector store index: {e}")
+
+ def search(self, query: str) -> list[Document]:
+ try:
+ records = []
+ records = self.embeddings_manager.search_records(query)
+ print(records)
+ return records
+ except Exception as e:
+ logger.error(f"Error indexing documents: {e}")
+ raise Exception(f"Error indexing documents: {e}")
+
+ def index_documents_in_vector_store(self, documents: list[Document]) -> None:
+ try:
+ self.embeddings_manager.index_documents(documents)
+ except Exception as e:
+ logger.error(f"Error indexing documents: {e}")
+ raise Exception(f"Error indexing documents: {e}")
@@ -1,3 +1,4 @@
+ import asyncio
  from typing import Tuple, List, Dict, Optional
  from langchain_core.prompts import ChatPromptTemplate
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
@@ -23,15 +24,15 @@ class TranscriptionService:
  persistence_service: PersistenceService,
  target_language: str = "es",
  transcription_additional_instructions: str = "",
- transcription_accuracy_threshold: int = 90,
+ transcription_accuracy_threshold: float = 0.90,
  max_transcription_retries: int = 2,
  ):
  self.ai_application_service = ai_application_service
  self.persistence_service = persistence_service
  self.target_language = target_language
  if (
- transcription_accuracy_threshold < 0
- or transcription_accuracy_threshold > 95
+ transcription_accuracy_threshold < 0.0
+ or transcription_accuracy_threshold > 0.95
  ):
  raise ValueError(
  "transcription_accuracy_threshold must be between 0 and 95"
@@ -46,6 +47,15 @@ class TranscriptionService:
  transcription_additional_instructions
  )
  self.chat_model = self.ai_application_service.load_chat_model()
+ self.transcription_workflow = TranscriptionWorkflow(
+ self.chat_model, self.transcription_additional_instructions
+ )
+ self.compiled_transcription_workflow = (
+ self.transcription_workflow.gen_workflow()
+ )
+ self.compiled_transcription_workflow = (
+ self.compiled_transcription_workflow.compile()
+ )
 
  # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
  # """Transcribe an image to text.
@@ -101,19 +111,19 @@ class TranscriptionService:
  # logger.error(f"Failed to parse document page: {str(e)}")
  # raise
 
- def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
+ async def parse_doc_page_with_workflow(
+ self, document: ParsedDocPage, retries: int = 0
+ ) -> ParsedDocPage:
  """Transcribe an image to text using an agent.
  Args:
  document: The document with the image to transcribe
  Returns:
  Processed text
  """
- transcription_workflow = TranscriptionWorkflow(
- self.chat_model, self.transcription_additional_instructions
- )
- compiled_transcription_workflow = transcription_workflow.gen_workflow()
- compiled_transcription_workflow = compiled_transcription_workflow.compile()
- result = compiled_transcription_workflow.invoke(
+ if retries > 1:
+ logger.info("Max retries exceeded")
+ return document
+ result = await self.compiled_transcription_workflow.ainvoke(
  {
  "messages": [
  HumanMessage(
@@ -143,23 +153,44 @@ class TranscriptionService:
  }
  },
  )
- if result["transcription"]:
+ if "transcription" in result:
  document.page_text = result["transcription"]
  else:
- raise ValueError("No transcription found")
+ return await self.parse_doc_page_with_workflow(
+ document, retries=retries + 1
+ )
  return document
 
- def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+ # def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+ # """
+ # Process a document by parsing it and returning the parsed content.
+ # """
+ # raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
+ # parse_doc_model_service = ParseDocModelService(raw_file_path)
+ # document_pages = parse_doc_model_service.parse_document_to_base64()
+ # parsed_pages = []
+ # for page in document_pages:
+ # page = self.parse_doc_page_with_workflow(page)
+ # parsed_pages.append(page)
+ # logger.info(f"Parsed {len(parsed_pages)} pages")
+ # parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
+ # return parsed_pages, parsed_document
+
+ async def process_document(
+ self, file_key: str
+ ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
  """
  Process a document by parsing it and returning the parsed content.
  """
  raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
  parse_doc_model_service = ParseDocModelService(raw_file_path)
  document_pages = parse_doc_model_service.parse_document_to_base64()
+ parse_pages_workflow_tasks = []
  parsed_pages = []
  for page in document_pages:
- page = self.parse_doc_page_with_workflow(page)
- parsed_pages.append(page)
+ parse_pages_workflow_tasks.append(self.parse_doc_page_with_workflow(page))
+ # here
+ parsed_pages = await asyncio.gather(*parse_pages_workflow_tasks)
  logger.info(f"Parsed {len(parsed_pages)} pages")
  parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
  return parsed_pages, parsed_document
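parse_doc_page_with_workflow above now retries itself once when the workflow result lacks a transcription and then returns the page unchanged, while process_document fans the per-page coroutines out through asyncio.gather. A self-contained sketch of that bounded-retry-plus-gather shape (all names below are stand-ins, not the package's API):

```python
import asyncio


async def fake_transcribe(page: dict) -> dict:
    # Stand-in for the compiled transcription workflow's ainvoke(); always succeeds here.
    await asyncio.sleep(0.01)
    return {"transcription": f"text of page {page['page_number']}"}


async def parse_page(page: dict, retries: int = 0) -> dict:
    if retries > 1:
        return page  # max retries exceeded: hand the page back untouched
    result = await fake_transcribe(page)
    if "transcription" in result:
        page["page_text"] = result["transcription"]
        return page
    return await parse_page(page, retries=retries + 1)


async def process_pages(pages: list[dict]) -> list[dict]:
    # One coroutine per page, executed concurrently; gather preserves input order.
    return await asyncio.gather(*(parse_page(p) for p in pages))


print(asyncio.run(process_pages([{"page_number": n} for n in range(1, 4)])))
```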
@@ -2,9 +2,12 @@ from enum import Enum
  from typing import Literal
 
 
- class KdbServices(Enum):
+ class KdbServices(str, Enum):
  REDIS = "redis"
  CHROMA = "chroma"
+ PG = "pg"
 
 
- kdb_services = Literal[KdbServices.REDIS.value, KdbServices.CHROMA.value]
+ kdb_services = Literal[
+ KdbServices.REDIS.value, KdbServices.CHROMA.value, KdbServices.PG.value
+ ]
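Mixing str into the Enum (here and in StorageServices further below) lets members compare equal to their raw string values, which keeps them interchangeable with the Literal[...] of .value strings. A quick standalone illustration:

```python
from enum import Enum


class KdbServices(str, Enum):
    REDIS = "redis"
    CHROMA = "chroma"
    PG = "pg"


assert KdbServices.PG == "pg"               # str mixin: members equal their values
assert KdbServices("pg") is KdbServices.PG  # lookup by value still works
assert "pg" in {service.value for service in KdbServices}
```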
@@ -227,7 +227,7 @@ Generate the optimized context following these specifications:
 
  WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
  You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
- OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+ OBJECTIVE: Generate concise, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
  WORKFLOW:
  <task_analysis>
  1. LANGUAGE DETECTION: Identify the primary language used in the document content
@@ -243,10 +243,7 @@ Your generated context must synthesize ALL of these elements into a coherent des
  - chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
  - chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
  - chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
- - chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
- - chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
  - chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
- - chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
  </context_elements>
  CRITICAL RULES:
  <critical_rules>
@@ -258,6 +255,7 @@ CRITICAL RULES:
  - Do NOT reproduce or quote the original chunk content verbatim
  - Ensure context is self-contained and understandable without the original chunk
  - Use natural language that flows well while incorporating all required elements
+ - Do not generate extensive contexts, two sentences or less is required, ensure concise and succinct context.
  </critical_rules>
 
  SEARCH OPTIMIZATION GUIDELINES:
@@ -2,7 +2,7 @@ from enum import Enum
  from typing import Literal
 
 
- class StorageServices(Enum):
+ class StorageServices(str, Enum):
  S3 = "s3"
  LOCAL = "local"
 
@@ -8,8 +8,9 @@ from ..domain.models import ParsedDocPage, ParsedDoc
 
  logger = logging.getLogger(__name__)
 
+
  # CHECK THIS THING IMPROVE THE WAY CODE IS STRUCTURED
- class ParseDocModelService():
+ class ParseDocModelService:
  """
  Class for parsing PDF documents, converting pages to base64 images
  """
@@ -25,7 +26,6 @@ class ParseDocModelService():
  self.pdf_document = pymupdf.open(file_path)
  self.page_count = self.pdf_document.page_count
 
-
  def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
  """
  Convert a PDF page to a base64-encoded PNG image.
@@ -48,10 +48,7 @@ class ParseDocModelService():
  img.save(buffer, format="PNG")
  b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
  logger.info(f"Page {page_number} encoded successfully")
- return ParsedDocPage(
- page_number=page_number,
- page_base64=b64_encoded_image
- )
+ return ParsedDocPage(page_number=page_number, page_base64=b64_encoded_image)
  except Exception as e:
  logger.error(f"Failed to parse b64 image: {str(e)}")
  raise
@@ -87,12 +84,10 @@ class ParseDocModelService():
  Create a markdown content from a list of parsed pages.
  """
  md_content = ""
- for page in parsed_pages:
+ sorted_pages = sorted(parsed_pages, key=lambda page: page.page_number)
+ for page in sorted_pages:
  md_content += f"## Page {page.page_number}\n\n"
  md_content += f"{page.page_text}\n\n"
- return ParsedDoc(
- pages=parsed_pages,
- document_text=md_content
- )
+ return ParsedDoc(pages=parsed_pages, document_text=md_content)
 
  # def
@@ -46,26 +46,29 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
  logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
  raise
 
- def configure_vector_store(
+ async def configure_vector_store(
  self,
  table_name: str = "",
  vector_size: int = 768,
  content_column: str = "document",
  id_column: str = "id",
+ metadata_json_column: str = "cmetadata",
+ pg_record_manager: str = "postgres/langchain_pg_collection",
  ):
  """Configure the vector store."""
  pass
 
- def init_vector_store(
+ async def init_vector_store(
  self,
  table_name: str = "",
  content_column: str = "document",
+ metadata_json_column: str = "cmetadata",
  id_column: str = "id",
  ):
  """Initialize the vector store."""
  pass
 
- def index_documents(self, documents: list[Document]):
+ async def index_documents(self, documents: list[Document]):
  """
  Add documents to the vector store with their embeddings.
 
@@ -85,7 +88,7 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
  """
  try:
  logger.info(f"Indexing {len(documents)} documents in vector store")
- self.chroma.add_documents(documents)
+ await self.chroma.aadd_documents(documents)
  except Exception as e:
  logger.error(f"Error indexing documents: {str(e)}")
  raise
@@ -110,12 +113,14 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
  logger.error(f"Error deleting documents by ID: {str(e)}")
  raise
 
- def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+ async def delete_documents_by_metadata_key(
+ self, metadata_key: str, metadata_value: str
+ ):
  """
  Delete documents by filter from the vector store.
  """
  try:
- self.chroma.delete(where={metadata_key: metadata_value})
+ await self.chroma.adelete(where={metadata_key: metadata_value})
  except Exception as error:
  logger.error(
  f"Error deleting documents by filter: {str(filter)}, error: {error} "