wizit-context-ingestor 0.3.0b2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -1
- wizit_context_ingestor/application/context_chunk_service.py +21 -105
- wizit_context_ingestor/application/interfaces.py +52 -23
- wizit_context_ingestor/application/kdb_service.py +59 -0
- wizit_context_ingestor/application/transcription_service.py +46 -15
- wizit_context_ingestor/data/kdb.py +5 -2
- wizit_context_ingestor/data/prompts.py +2 -4
- wizit_context_ingestor/data/storage.py +1 -1
- wizit_context_ingestor/domain/services.py +6 -11
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +11 -6
- wizit_context_ingestor/infra/rag/pg_embeddings.py +117 -127
- wizit_context_ingestor/infra/rag/semantic_chunks.py +2 -3
- wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +3 -4
- wizit_context_ingestor/main.py +15 -14
- wizit_context_ingestor/main_chunks.py +173 -0
- wizit_context_ingestor/utils/file_utils.py +7 -8
- {wizit_context_ingestor-0.3.0b2.dist-info → wizit_context_ingestor-0.4.0.dist-info}/METADATA +4 -1
- {wizit_context_ingestor-0.3.0b2.dist-info → wizit_context_ingestor-0.4.0.dist-info}/RECORD +19 -17
- {wizit_context_ingestor-0.3.0b2.dist-info → wizit_context_ingestor-0.4.0.dist-info}/WHEEL +0 -0

wizit_context_ingestor/application/context_chunk_service.py
@@ -1,18 +1,20 @@
+import asyncio
+import logging
+from typing import Any, Dict, List, Optional
+
+from langchain_core.documents import Document
+from langchain_core.messages.human import HumanMessage
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
 from langchain_core.prompts import ChatPromptTemplate
-
+
 from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
-from langchain_core.messages.human import HumanMessage
 from ..workflows.context_workflow import ContextWorkflow
-from typing import Dict, Any, Optional, List
 from .interfaces import (
     AiApplicationService,
+    EmbeddingsManager,
     PersistenceService,
     RagChunker,
-    EmbeddingsManager,
 )
-import logging
-
 
 logger = logging.getLogger(__name__)
 
@@ -38,13 +40,13 @@ class ContextChunksInDocumentService:
         self.rag_chunker = rag_chunker
         self.embeddings_manager = embeddings_manager
         self.target_language = target_language
-        self.embeddings_manager.init_vector_store()
+        # self.embeddings_manager.init_vector_store()
         self.chat_model = self.ai_application_service.load_chat_model()
         # TODO
         self.context_additional_instructions = ""
         self.metadata_source = "source"
 
-    def _retrieve_context_chunk_in_document_with_workflow(
+    async def _retrieve_context_chunk_in_document_with_workflow(
         self,
         workflow,
         markdown_content: str,
@@ -53,7 +55,7 @@ class ContextChunksInDocumentService:
     ) -> Document:
         """Retrieve context chunks in document."""
         try:
-            result = workflow.
+            result = await workflow.ainvoke(
                 {
                     "messages": [
                         HumanMessage(
@@ -74,9 +76,7 @@ class ContextChunksInDocumentService:
                     }
                 },
             )
-
-            # f"Context:{result['context']}, Content:{chunk.page_content}"
-            # )
+            chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
             chunk.metadata["context"] = result["context"]
             if chunk_metadata:
                 for key, value in chunk_metadata.items():
@@ -86,75 +86,7 @@ class ContextChunksInDocumentService:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
             raise
 
-
-    # self,
-    # markdown_content: str,
-    # chunk: Document,
-    # chunk_metadata: Optional[Dict[str, Any]] = None,
-    # ) -> Document:
-    # """Retrieve context chunks in document."""
-    # try:
-    # chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
-    # # Create the prompt template with image
-    # prompt = ChatPromptTemplate.from_messages(
-    # [
-    # ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
-    # (
-    # "human",
-    # [
-    # {
-    # "type": "text",
-    # "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
-    # }
-    # ],
-    # ),
-    # ]
-    # ).partial(
-    # document_content=markdown_content,
-    # format_instructions=chunk_output_parser.get_format_instructions(),
-    # )
-    # model_with_structured_output = self.chat_model.with_structured_output(
-    # ContextChunk
-    # )
-    # # Create the chain
-    # chain = prompt | model_with_structured_output
-    # # Process the image
-    # results = chain.invoke({})
-    # # chunk.page_content = (
-    # # f"Context:{results.context}, Content:{chunk.page_content}"
-    # # )
-    # chunk.metadata["context"] = results.context
-    # if chunk_metadata:
-    # for key, value in chunk_metadata.items():
-    # chunk.metadata[key] = value
-    # return chunk
-
-    # except Exception as e:
-    # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
-    # raise
-
-    # def retrieve_context_chunks_in_document(
-    # self,
-    # markdown_content: str,
-    # chunks: List[Document],
-    # chunks_metadata: Optional[Dict[str, Any]] = None,
-    # ) -> List[Document]:
-    # """Retrieve context chunks in document."""
-    # try:
-    # context_chunks = list(
-    # map(
-    # lambda chunk: self._retrieve_context_chunk_in_document(
-    # markdown_content, chunk, chunks_metadata
-    # ),
-    # chunks,
-    # )
-    # )
-    # return context_chunks
-    # except Exception as e:
-    # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
-    # raise
-
-    def retrieve_context_chunks_in_document_with_workflow(
+    async def retrieve_context_chunks_in_document_with_workflow(
         self,
         markdown_content: str,
         chunks: List[Document],
@@ -167,7 +99,7 @@ class ContextChunksInDocumentService:
             )
             compiled_context_workflow = context_workflow.gen_workflow()
             compiled_context_workflow = compiled_context_workflow.compile()
-
+            context_chunks_workflow_invocations = list(
                 map(
                     lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
                         compiled_context_workflow,
@@ -178,12 +110,13 @@ class ContextChunksInDocumentService:
                     chunks,
                 )
             )
+            context_chunks = await asyncio.gather(*context_chunks_workflow_invocations)
             return context_chunks
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
             raise
 
-    def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
+    async def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
         """
         Get the context chunks in a document.
         """
@@ -199,30 +132,13 @@ class ContextChunksInDocumentService:
             logger.info(f"Document loaded:{file_key}")
             chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
             logger.info(f"Chunks generated:{len(chunks)}")
-            context_chunks =
-
+            context_chunks = (
+                await self.retrieve_context_chunks_in_document_with_workflow(
+                    markdown_content, chunks, file_tags
+                )
             )
             logger.info(f"Context chunks generated:{len(context_chunks)}")
-            # upsert validation
-            try:
-                print(f"deleting chunks: {file_key}")
-                self.delete_document_context_chunks(file_key)
-            except Exception as e:
-                logger.error(f"could not delete by source: {e}")
-            self.embeddings_manager.index_documents(context_chunks)
             return context_chunks
         except Exception as e:
-            logger.error("Error
-            raise e
-
-    def delete_document_context_chunks(self, file_key: str):
-        """
-        Delete the context chunks in a document.
-        """
-        try:
-            self.embeddings_manager.delete_documents_by_metadata_key(
-                self.metadata_source, file_key
-            )
-        except Exception as e:
-            logger.error(f"Error delete_document_context_chunks: {str(e)}")
+            logger.error(f"Error: {str(e)}")
             raise e
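
The service now fans chunk-level context generation out with asyncio: each chunk gets its own workflow coroutine and the results are collected with `asyncio.gather`. A minimal sketch of that pattern with illustrative names only (not the package's actual classes), assuming the workflow object exposes an async `ainvoke`:

```python
import asyncio


async def enrich_chunk(workflow, chunk: str) -> str:
    # Stand-in for the per-chunk workflow call; `ainvoke` is an assumption here.
    result = await workflow.ainvoke({"content": chunk})
    return f"<context>\n{result['context']}\n</context>\n<content>\n{chunk}\n</content>"


async def enrich_all(workflow, chunks: list[str]) -> list[str]:
    # Build one coroutine per chunk, then run them concurrently.
    tasks = [enrich_chunk(workflow, chunk) for chunk in chunks]
    return await asyncio.gather(*tasks)


# results = asyncio.run(enrich_all(compiled_workflow, ["chunk one", "chunk two"]))
```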

wizit_context_ingestor/application/interfaces.py
@@ -1,13 +1,19 @@
 """
 Application interfaces defining application layer contracts.
 """
+
 from abc import ABC, abstractmethod
-from
-
-from
+from typing import List, Optional, Union
+
+from langchain.indexes import IndexingResult, SQLRecordManager
 from langchain_aws import ChatBedrockConverse
+from langchain_core.documents import Document
 from langchain_google_vertexai import ChatVertexAI
 from langchain_google_vertexai.model_garden import ChatAnthropicVertex
+from langchain_postgres import PGVectorStore
+
+from ..domain.models import ParsedDoc, ParsedDocPage
+
 
 class TranscriptionService(ABC):
     """Interface for transcription services."""
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
         """Parse a document page."""
         pass
 
+
 class AiApplicationService(ABC):
     """Interface for AI application services."""
 
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
     # pass
 
     @abstractmethod
-    def load_chat_model(
+    def load_chat_model(
+        self, **kwargs
+    ) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
         """Load a chat model."""
         pass
 
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
     """Interface for persistence services."""
 
     @abstractmethod
-    def save_parsed_document(
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document."""
         pass
 
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
     @abstractmethod
     def configure_vector_store(
         self,
-        table_name: str = "langchain_pg_embedding",
-        vector_size: int = 768,
-        content_column: str = "document",
-        id_column: str = "id",
-        metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
     ):
         """Configure the vector store."""
         pass
 
+    # @abstractmethod
+    # async def init_vector_store(
+    # self,
+    # table_name: str = "tenant_embeddings",
+    # content_column: str = "document",
+    # metadata_json_column: str = "cmetadata",
+    # id_column: str = "id",
+    # ):
+    # """Initialize the vector store."""
+    # pass
+
     @abstractmethod
-    def
+    def retrieve_vector_store(
         self,
-
-
-        metadata_json_column: str = "cmetadata",
-        id_column: str = "id",
-    ):
-        """Initialize the vector store."""
+    ) -> tuple[PGVectorStore, SQLRecordManager]:
+        """Retrieve the vector store."""
         pass
 
     @abstractmethod
-    def index_documents(
+    def index_documents(
+        self,
+        docs: list[Document],
+    ) -> IndexingResult:
         """Index documents."""
         pass
 
     @abstractmethod
-    def
-
+    def search_records(
+        self,
+        query: str,
+    ) -> list[Document]:
+        """Search documents."""
         pass
 
     @abstractmethod
-    def
-
+    def create_index(
+        self,
+    ):
         pass
+
+    # @abstractmethod
+    # def get_documents_keys_by_source_id(self, source_id: str):
+    # """Get documents keys by source ID."""
+    # pass
+
+    # @abstractmethod
+    # def delete_documents_by_source_id(self, source_id: str):
+    # """Delete documents by source ID."""
+    # pass
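
The reworked EmbeddingsManager contract asks implementations to hand back the vector store together with its record manager, report indexing results, and support search and index creation. A rough skeleton of a conforming subclass; the import path is taken from the package's file layout, only the methods touched by this diff are shown (the ABC may declare others), and the bodies are placeholders:

```python
from langchain.indexes import IndexingResult, SQLRecordManager
from langchain_core.documents import Document
from langchain_postgres import PGVectorStore

# Import path inferred from wizit_context_ingestor/application/interfaces.py above.
from wizit_context_ingestor.application.interfaces import EmbeddingsManager


class MyPgEmbeddingsManager(EmbeddingsManager):
    def configure_vector_store(self):
        ...  # create the embeddings table / collection as needed

    def retrieve_vector_store(self) -> tuple[PGVectorStore, SQLRecordManager]:
        ...  # return the store plus the record manager used for indexing

    def index_documents(self, docs: list[Document]) -> IndexingResult:
        ...  # e.g. delegate to langchain.indexes.index(...)

    def search_records(self, query: str) -> list[Document]:
        ...  # similarity search against the store

    def create_index(self):
        ...  # e.g. build an HNSW index on the embedding column
```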

wizit_context_ingestor/application/kdb_service.py
@@ -0,0 +1,59 @@
+import logging
+
+from langchain.indexes import SQLRecordManager
+from langchain_core.documents import Document
+from langchain_postgres import PGVectorStore
+
+from .interfaces import (
+    EmbeddingsManager,
+    RagChunker,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class KdbService:
+    """
+    Service for chunking documents.
+    """
+
+    def __init__(
+        self,
+        embeddings_manager: EmbeddingsManager,
+    ):
+        """
+        Initialize the ChunkerService.
+        """
+        self.embeddings_manager = embeddings_manager
+        self._vector_store = None
+        self._records_manager = None
+
+    def configure_kdb(self):
+        try:
+            self.embeddings_manager.configure_vector_store()
+        except Exception as e:
+            raise Exception(f"Error configuring KDB: {e}")
+
+    def create_vector_store_hsnw_index(self):
+        try:
+            self.embeddings_manager.create_index()
+        except Exception as e:
+            logger.error(f"Error creating vector store index: {e}")
+            raise Exception(f"Error creating vector store index: {e}")
+
+    def search(self, query: str) -> list[Document]:
+        try:
+            records = []
+            records = self.embeddings_manager.search_records(query)
+            print(records)
+            return records
+        except Exception as e:
+            logger.error(f"Error indexing documents: {e}")
+            raise Exception(f"Error indexing documents: {e}")
+
+    def index_documents_in_vector_store(self, documents: list[Document]) -> None:
+        try:
+            self.embeddings_manager.index_documents(documents)
+        except Exception as e:
+            logger.error(f"Error indexing documents: {e}")
+            raise Exception(f"Error indexing documents: {e}")
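
A short usage sketch for the new KdbService; the embeddings manager argument is a placeholder for any EmbeddingsManager implementation, and the import path follows the package's file layout:

```python
from langchain_core.documents import Document

from wizit_context_ingestor.application.kdb_service import KdbService


def build_and_query(embeddings_manager) -> list[Document]:
    """embeddings_manager: any EmbeddingsManager implementation (placeholder)."""
    kdb = KdbService(embeddings_manager=embeddings_manager)
    kdb.configure_kdb()                   # delegates to configure_vector_store()
    kdb.create_vector_store_hsnw_index()  # delegates to create_index()
    kdb.index_documents_in_vector_store(
        [Document(page_content="hello world", metadata={"source": "demo"})]
    )
    return kdb.search("hello")
```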

wizit_context_ingestor/application/transcription_service.py
@@ -1,3 +1,4 @@
+import asyncio
 from typing import Tuple, List, Dict, Optional
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
@@ -23,15 +24,15 @@ class TranscriptionService:
         persistence_service: PersistenceService,
         target_language: str = "es",
         transcription_additional_instructions: str = "",
-        transcription_accuracy_threshold:
+        transcription_accuracy_threshold: float = 0.90,
         max_transcription_retries: int = 2,
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
         if (
-            transcription_accuracy_threshold < 0
-            or transcription_accuracy_threshold > 95
+            transcription_accuracy_threshold < 0.0
+            or transcription_accuracy_threshold > 0.95
         ):
             raise ValueError(
                 "transcription_accuracy_threshold must be between 0 and 95"
@@ -46,6 +47,15 @@ class TranscriptionService:
             transcription_additional_instructions
         )
         self.chat_model = self.ai_application_service.load_chat_model()
+        self.transcription_workflow = TranscriptionWorkflow(
+            self.chat_model, self.transcription_additional_instructions
+        )
+        self.compiled_transcription_workflow = (
+            self.transcription_workflow.gen_workflow()
+        )
+        self.compiled_transcription_workflow = (
+            self.compiled_transcription_workflow.compile()
+        )
 
     # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
     # """Transcribe an image to text.
@@ -101,19 +111,19 @@ class TranscriptionService:
     # logger.error(f"Failed to parse document page: {str(e)}")
     # raise
 
-    def parse_doc_page_with_workflow(
+    async def parse_doc_page_with_workflow(
+        self, document: ParsedDocPage, retries: int = 0
+    ) -> ParsedDocPage:
         """Transcribe an image to text using an agent.
         Args:
             document: The document with the image to transcribe
         Returns:
             Processed text
         """
-
-
-
-
-        compiled_transcription_workflow = compiled_transcription_workflow.compile()
-        result = compiled_transcription_workflow.invoke(
+        if retries > 1:
+            logger.info("Max retries exceeded")
+            return document
+        result = await self.compiled_transcription_workflow.ainvoke(
             {
                 "messages": [
                     HumanMessage(
@@ -143,23 +153,44 @@ class TranscriptionService:
                 }
             },
         )
-        if
+        if "transcription" in result:
             document.page_text = result["transcription"]
         else:
-
+            return await self.parse_doc_page_with_workflow(
+                document, retries=retries + 1
+            )
         return document
 
-    def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+    # def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
+    # """
+    # Process a document by parsing it and returning the parsed content.
+    # """
+    # raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
+    # parse_doc_model_service = ParseDocModelService(raw_file_path)
+    # document_pages = parse_doc_model_service.parse_document_to_base64()
+    # parsed_pages = []
+    # for page in document_pages:
+    # page = self.parse_doc_page_with_workflow(page)
+    # parsed_pages.append(page)
+    # logger.info(f"Parsed {len(parsed_pages)} pages")
+    # parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
+    # return parsed_pages, parsed_document
+
+    async def process_document(
+        self, file_key: str
+    ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
         """
         Process a document by parsing it and returning the parsed content.
         """
         raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
         parse_doc_model_service = ParseDocModelService(raw_file_path)
         document_pages = parse_doc_model_service.parse_document_to_base64()
+        parse_pages_workflow_tasks = []
         parsed_pages = []
         for page in document_pages:
-
-
+            parse_pages_workflow_tasks.append(self.parse_doc_page_with_workflow(page))
+        # here
+        parsed_pages = await asyncio.gather(*parse_pages_workflow_tasks)
         logger.info(f"Parsed {len(parsed_pages)} pages")
         parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
         return parsed_pages, parsed_document
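
Transcription now retries the workflow a bounded number of times when the result lacks a "transcription" key. A generic sketch of that guard, assuming a hypothetical workflow object with an async `ainvoke` method:

```python
async def run_with_retry(workflow, payload: dict, retries: int = 0, max_retries: int = 1) -> dict:
    # Mirrors the bounded-retry guard above: give up after max_retries attempts.
    if retries > max_retries:
        return {}  # caller keeps the original document unchanged
    result = await workflow.ainvoke(payload)
    if "transcription" in result:
        return result
    return await run_with_retry(workflow, payload, retries=retries + 1, max_retries=max_retries)
```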

wizit_context_ingestor/data/kdb.py
@@ -2,9 +2,12 @@ from enum import Enum
 from typing import Literal
 
 
-class KdbServices(Enum):
+class KdbServices(str, Enum):
     REDIS = "redis"
     CHROMA = "chroma"
+    PG = "pg"
 
 
-kdb_services = Literal[
+kdb_services = Literal[
+    KdbServices.REDIS.value, KdbServices.CHROMA.value, KdbServices.PG.value
+]
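
Mixing `str` into the enum makes each member a real string, so it compares equal to the raw literal and can be passed wherever the `Literal` alias expects a plain string. A standalone illustration mirroring the enum above:

```python
from enum import Enum


class KdbServices(str, Enum):
    REDIS = "redis"
    CHROMA = "chroma"
    PG = "pg"


assert KdbServices.PG == "pg"           # true only because of the str mixin
assert isinstance(KdbServices.PG, str)  # members can be used where plain strings are expected
```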

wizit_context_ingestor/data/prompts.py
@@ -227,7 +227,7 @@ Generate the optimized context following these specifications:
 
 WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
 You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
-OBJECTIVE: Generate
+OBJECTIVE: Generate concise, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
 WORKFLOW:
 <task_analysis>
 1. LANGUAGE DETECTION: Identify the primary language used in the document content
@@ -243,10 +243,7 @@ Your generated context must synthesize ALL of these elements into a coherent des
 - chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
 - chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
 - chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
-- chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
-- chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
 - chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
-- chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
 </context_elements>
 CRITICAL RULES:
 <critical_rules>
@@ -258,6 +255,7 @@ CRITICAL RULES:
 - Do NOT reproduce or quote the original chunk content verbatim
 - Ensure context is self-contained and understandable without the original chunk
 - Use natural language that flows well while incorporating all required elements
+- Do not generate extensive contexts, two sentences or less is required, ensure concise and succinct context.
 </critical_rules>
 
 SEARCH OPTIMIZATION GUIDELINES:

wizit_context_ingestor/domain/services.py
@@ -8,8 +8,9 @@ from ..domain.models import ParsedDocPage, ParsedDoc
 
 logger = logging.getLogger(__name__)
 
+
 # CHECK THIS THING IMPROVE THE WAY CODE IS STRUCTURED
-class ParseDocModelService():
+class ParseDocModelService:
     """
     Class for parsing PDF documents, converting pages to base64 images
     """
@@ -25,7 +26,6 @@ class ParseDocModelService():
         self.pdf_document = pymupdf.open(file_path)
         self.page_count = self.pdf_document.page_count
 
-
     def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
         """
         Convert a PDF page to a base64-encoded PNG image.
@@ -48,10 +48,7 @@ class ParseDocModelService():
             img.save(buffer, format="PNG")
             b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
             logger.info(f"Page {page_number} encoded successfully")
-            return ParsedDocPage(
-                page_number=page_number,
-                page_base64=b64_encoded_image
-            )
+            return ParsedDocPage(page_number=page_number, page_base64=b64_encoded_image)
         except Exception as e:
             logger.error(f"Failed to parse b64 image: {str(e)}")
             raise
@@ -87,12 +84,10 @@ class ParseDocModelService():
         Create a markdown content from a list of parsed pages.
         """
         md_content = ""
-
+        sorted_pages = sorted(parsed_pages, key=lambda page: page.page_number)
+        for page in sorted_pages:
             md_content += f"## Page {page.page_number}\n\n"
             md_content += f"{page.page_text}\n\n"
-        return ParsedDoc(
-            pages=parsed_pages,
-            document_text=md_content
-        )
+        return ParsedDoc(pages=parsed_pages, document_text=md_content)
 
     # def
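
For orientation, a compact sketch of the page-to-base64 idea the class implements, rendering directly with PyMuPDF; the package's own code goes through PIL and its exact parameters may differ:

```python
import base64

import pymupdf  # PyMuPDF


def page_to_base64_png(pdf_path: str, page_number: int) -> str:
    # Render one page to PNG bytes and base64-encode them.
    doc = pymupdf.open(pdf_path)
    pixmap = doc[page_number].get_pixmap()
    return base64.b64encode(pixmap.tobytes("png")).decode("utf-8")
```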

wizit_context_ingestor/infra/rag/chroma_embeddings.py
@@ -46,26 +46,29 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
             raise
 
-    def configure_vector_store(
+    async def configure_vector_store(
         self,
         table_name: str = "",
         vector_size: int = 768,
         content_column: str = "document",
         id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
         """Configure the vector store."""
         pass
 
-    def init_vector_store(
+    async def init_vector_store(
         self,
         table_name: str = "",
         content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
         id_column: str = "id",
     ):
         """Initialize the vector store."""
         pass
 
-    def index_documents(self, documents: list[Document]):
+    async def index_documents(self, documents: list[Document]):
         """
         Add documents to the vector store with their embeddings.
 
@@ -85,7 +88,7 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
         """
         try:
             logger.info(f"Indexing {len(documents)} documents in vector store")
-            self.chroma.
+            await self.chroma.aadd_documents(documents)
         except Exception as e:
             logger.error(f"Error indexing documents: {str(e)}")
             raise
@@ -110,12 +113,14 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise
 
-    def delete_documents_by_metadata_key(
+    async def delete_documents_by_metadata_key(
+        self, metadata_key: str, metadata_value: str
+    ):
         """
         Delete documents by filter from the vector store.
         """
         try:
-            self.chroma.
+            await self.chroma.adelete(where={metadata_key: metadata_value})
         except Exception as error:
             logger.error(
                 f"Error deleting documents by filter: {str(filter)}, error: {error} "