wizit-context-ingestor 0.2.5b3__tar.gz → 0.3.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wizit-context-ingestor might be problematic. Click here for more details.
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/PKG-INFO +9 -1
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/README.md +7 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/pyproject.toml +2 -1
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/__init__.py +3 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/application/context_chunk_service.py +228 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/application/transcription_service.py +178 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/data/prompts.py +295 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/data/storage.py +10 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +30 -31
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/main.py +282 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/__init__.py +0 -3
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/context_chunk_service.py +0 -114
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/transcription_service.py +0 -98
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/data/prompts.py +0 -148
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/main.py +0 -196
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/application/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/application/interfaces.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/data/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/domain/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/domain/models.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/domain/services.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/services/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/services/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/services/chunks.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.3.0b2}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: wizit-context-ingestor
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0b2
|
|
4
4
|
Summary: Contextual Rag with Cloud Solutions
|
|
5
5
|
Requires-Dist: anthropic[vertex]>=0.66.0
|
|
6
6
|
Requires-Dist: boto3>=1.40.23
|
|
@@ -9,6 +9,7 @@ Requires-Dist: langchain-chroma>=0.2.6
|
|
|
9
9
|
Requires-Dist: langchain-experimental>=0.3.4
|
|
10
10
|
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
11
11
|
Requires-Dist: langchain-redis>=0.2.3
|
|
12
|
+
Requires-Dist: langgraph>=0.6.8
|
|
12
13
|
Requires-Dist: pillow>=11.3.0
|
|
13
14
|
Requires-Dist: pymupdf>=1.26.4
|
|
14
15
|
Requires-Python: >=3.12
|
|
@@ -138,6 +139,13 @@ Finally
|
|
|
138
139
|
poetry publish -r tbbcmegaingestor
|
|
139
140
|
```
|
|
140
141
|
|
|
142
|
+
# USAGE
|
|
143
|
+
|
|
144
|
+
## For transcriptions
|
|
145
|
+
|
|
146
|
+
----- TODO ---
|
|
147
|
+
You can provide number of retries and a transcription quality threshold
|
|
148
|
+
|
|
141
149
|
## License
|
|
142
150
|
|
|
143
151
|
This project is licensed under the Apache License - see the LICENSE file for details.
|
|
@@ -122,6 +122,13 @@ Finally
|
|
|
122
122
|
poetry publish -r tbbcmegaingestor
|
|
123
123
|
```
|
|
124
124
|
|
|
125
|
+
# USAGE
|
|
126
|
+
|
|
127
|
+
## For transcriptions
|
|
128
|
+
|
|
129
|
+
----- TODO ---
|
|
130
|
+
You can provide number of retries and a transcription quality threshold
|
|
131
|
+
|
|
125
132
|
## License
|
|
126
133
|
|
|
127
134
|
This project is licensed under the Apache License - see the LICENSE file for details.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "wizit_context_ingestor"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.3.0-beta-2"
|
|
4
4
|
description = "Contextual Rag with Cloud Solutions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -12,6 +12,7 @@ dependencies = [
|
|
|
12
12
|
"langchain-experimental>=0.3.4",
|
|
13
13
|
"langchain-google-vertexai>=2.0.28",
|
|
14
14
|
"langchain-redis>=0.2.3",
|
|
15
|
+
"langgraph>=0.6.8",
|
|
15
16
|
"pillow>=11.3.0",
|
|
16
17
|
"pymupdf>=1.26.4",
|
|
17
18
|
]
|
wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/application/context_chunk_service.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
|
2
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
3
|
+
from langchain_core.documents import Document
|
|
4
|
+
from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
|
|
5
|
+
from langchain_core.messages.human import HumanMessage
|
|
6
|
+
from ..workflows.context_workflow import ContextWorkflow
|
|
7
|
+
from typing import Dict, Any, Optional, List
|
|
8
|
+
from .interfaces import (
|
|
9
|
+
AiApplicationService,
|
|
10
|
+
PersistenceService,
|
|
11
|
+
RagChunker,
|
|
12
|
+
EmbeddingsManager,
|
|
13
|
+
)
|
|
14
|
+
import logging
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ContextChunksInDocumentService:
    """Generate contextualized RAG chunks for a markdown document.

    Loads a document through the persistence service, splits it with the
    RAG chunker, enriches every chunk with an LLM-generated context via the
    LangGraph context workflow, and indexes the result in the vector store.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        rag_chunker: RagChunker,
        embeddings_manager: EmbeddingsManager,
        target_language: str = "es",
    ):
        """Initialize the service, its vector store, and the chat model.

        Args:
            ai_application_service: Factory providing the chat model.
            persistence_service: Storage backend for document content.
            rag_chunker: Splitter producing ``Document`` chunks.
            embeddings_manager: Vector-store manager used for indexing.
            target_language: Language expected for generated context.
        """
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.rag_chunker = rag_chunker
        self.embeddings_manager = embeddings_manager
        self.target_language = target_language
        self.embeddings_manager.init_vector_store()
        self.chat_model = self.ai_application_service.load_chat_model()
        # TODO: expose additional context instructions to callers.
        self.context_additional_instructions = ""
        # Metadata key linking each chunk back to its source file.
        self.metadata_source = "source"

    def _retrieve_context_chunk_in_document_with_workflow(
        self,
        workflow,
        markdown_content: str,
        chunk: Document,
        chunk_metadata: Optional[Dict[str, Any]] = None,
    ) -> Document:
        """Enrich a single chunk with workflow-generated context.

        Args:
            workflow: A compiled LangGraph context workflow.
            markdown_content: Full text of the document the chunk belongs to.
            chunk: The chunk to contextualize (mutated in place).
            chunk_metadata: Extra metadata merged into the chunk, if any.

        Returns:
            The same ``chunk`` with ``metadata["context"]`` populated.

        Raises:
            Exception: Propagates any workflow failure after logging it.
        """
        try:
            result = workflow.invoke(
                {
                    "messages": [
                        HumanMessage(
                            content=[
                                {
                                    "type": "text",
                                    "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
                                },
                            ]
                        )
                    ],
                    "document_content": markdown_content,
                },
                # TODO: make these workflow thresholds configurable instead
                # of hard-coding them here.
                {
                    "configurable": {
                        "transcription_accuracy_threshold": 0.95,
                        "max_transcription_retries": 2,
                    }
                },
            )
            chunk.metadata["context"] = result["context"]
            if chunk_metadata:
                for key, value in chunk_metadata.items():
                    chunk.metadata[key] = value
            return chunk
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    def retrieve_context_chunks_in_document_with_workflow(
        self,
        markdown_content: str,
        chunks: List[Document],
        chunks_metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Contextualize every chunk using one compiled context workflow.

        Args:
            markdown_content: Full text of the source document.
            chunks: Chunks produced by the RAG chunker.
            chunks_metadata: Extra metadata applied to every chunk.

        Returns:
            The chunks, each enriched with a generated context.

        Raises:
            Exception: Propagates any workflow failure after logging it.
        """
        try:
            context_workflow = ContextWorkflow(
                self.chat_model, self.context_additional_instructions
            )
            # Compile once and reuse the workflow for every chunk.
            compiled_context_workflow = context_workflow.gen_workflow().compile()
            return [
                self._retrieve_context_chunk_in_document_with_workflow(
                    compiled_context_workflow,
                    markdown_content,
                    chunk,
                    chunks_metadata,
                )
                for chunk in chunks
            ]
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    def get_context_chunks_in_document(
        self, file_key: str, file_tags: Optional[dict] = None
    ):
        """Chunk, contextualize, and index a stored markdown document.

        Existing chunks for the same source are deleted first (best effort)
        so re-ingestion behaves as an upsert.

        Args:
            file_key: Storage key of the markdown file.
            file_tags: Optional metadata tags propagated to every chunk.

        Returns:
            The list of indexed context chunks.

        Raises:
            Exception: Re-raised after logging if any stage fails.
        """
        try:
            markdown_content = self.persistence_service.load_markdown_file_content(
                file_key
            )
            langchain_rag_document = Document(
                id=file_key,
                page_content=markdown_content,
                metadata={self.metadata_source: file_key},
            )
            logger.info(f"Document loaded:{file_key}")
            chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
            logger.info(f"Chunks generated:{len(chunks)}")
            context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
                markdown_content, chunks, file_tags
            )
            logger.info(f"Context chunks generated:{len(context_chunks)}")
            # Upsert semantics: drop any previously indexed chunks for this
            # source, but never let a failed delete abort the (re)index.
            try:
                logger.info(f"deleting chunks: {file_key}")
                self.delete_document_context_chunks(file_key)
            except Exception as e:
                logger.error(f"could not delete by source: {e}")
            self.embeddings_manager.index_documents(context_chunks)
            return context_chunks
        except Exception as e:
            logger.error(f"Error get_context_chunks_in_document: {e}")
            raise e

    def delete_document_context_chunks(self, file_key: str):
        """Delete all indexed chunks whose source metadata matches ``file_key``.

        Raises:
            Exception: Re-raised after logging if the vector store fails.
        """
        try:
            self.embeddings_manager.delete_documents_by_metadata_key(
                self.metadata_source, file_key
            )
        except Exception as e:
            logger.error(f"Error delete_document_context_chunks: {str(e)}")
            raise e
wizit_context_ingestor-0.3.0b2/src/wizit_context_ingestor/application/transcription_service.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from typing import Tuple, List, Dict, Optional
|
|
2
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
3
|
+
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
|
4
|
+
from langchain_core.messages import HumanMessage
|
|
5
|
+
from logging import getLogger
|
|
6
|
+
from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
|
|
7
|
+
from ..domain.models import ParsedDoc, ParsedDocPage
|
|
8
|
+
from ..domain.services import ParseDocModelService
|
|
9
|
+
from .interfaces import AiApplicationService, PersistenceService
|
|
10
|
+
from ..workflows.transcription_workflow import TranscriptionWorkflow
|
|
11
|
+
|
|
12
|
+
logger = getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TranscriptionService:
    """Transcribe document pages to markdown using a LangGraph workflow.

    Retrieves the raw file, renders each page to a base64 image, runs the
    transcription workflow on every page, and assembles/saves the result.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        target_language: str = "es",
        transcription_additional_instructions: str = "",
        transcription_accuracy_threshold: int = 90,
        max_transcription_retries: int = 2,
    ):
        """Initialize the service and validate the workflow guard rails.

        Args:
            ai_application_service: Factory providing the chat model.
            persistence_service: Storage backend for raw and parsed files.
            target_language: Expected language of the transcription.
            transcription_additional_instructions: Extra prompt instructions.
            transcription_accuracy_threshold: Minimum acceptable accuracy,
                0–95 inclusive.
            max_transcription_retries: Retry budget per page, 1–3 inclusive.

        Raises:
            ValueError: If the threshold or retry budget is out of range.
        """
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.target_language = target_language
        # Guard rails: an unreachable threshold or an unbounded retry
        # budget would burn tokens without ever converging.
        if not 0 <= transcription_accuracy_threshold <= 95:
            raise ValueError(
                "transcription_accuracy_threshold must be between 0 and 95"
            )
        if not 1 <= max_transcription_retries <= 3:
            raise ValueError(
                "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
            )
        self.transcription_accuracy_threshold = transcription_accuracy_threshold
        self.max_transcription_retries = max_transcription_retries
        self.transcription_additional_instructions = (
            transcription_additional_instructions
        )
        self.chat_model = self.ai_application_service.load_chat_model()

    def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
        """Transcribe one page image to text via the transcription workflow.

        Args:
            document: The page holding the base64 image to transcribe.

        Returns:
            The same page with ``page_text`` populated.

        Raises:
            ValueError: If the workflow produced no transcription.
        """
        transcription_workflow = TranscriptionWorkflow(
            self.chat_model, self.transcription_additional_instructions
        )
        compiled_transcription_workflow = (
            transcription_workflow.gen_workflow().compile()
        )
        result = compiled_transcription_workflow.invoke(
            {
                "messages": [
                    HumanMessage(
                        content=[
                            {
                                "type": "text",
                                "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
                            },
                        ]
                    ),
                    HumanMessage(
                        content=[
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{document.page_base64}"
                                },
                            }
                        ]
                    ),
                ]
            },
            {
                "configurable": {
                    "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
                    "max_transcription_retries": self.max_transcription_retries,
                }
            },
        )
        if result["transcription"]:
            document.page_text = result["transcription"]
        else:
            raise ValueError("No transcription found")
        return document

    def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
        """Parse a stored document and transcribe every page.

        Args:
            file_key: Storage key of the raw source file.

        Returns:
            A tuple of (transcribed pages, assembled markdown document).
        """
        raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
        parse_doc_model_service = ParseDocModelService(raw_file_path)
        document_pages = parse_doc_model_service.parse_document_to_base64()
        parsed_pages = [
            self.parse_doc_page_with_workflow(page) for page in document_pages
        ]
        logger.info(f"Parsed {len(parsed_pages)} pages")
        parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
        return parsed_pages, parsed_document

    def save_parsed_document(
        self,
        file_key: str,
        parsed_document: ParsedDoc,
        file_tags: Optional[Dict[str, str]] = None,
    ):
        """Persist the parsed document under ``file_key``.

        Args:
            file_key: Storage key to save the parsed document under.
            parsed_document: The assembled markdown document.
            file_tags: Optional tags stored alongside the document.
        """
        # Default-to-empty-dict is resolved here rather than in the
        # signature to avoid a shared mutable default argument.
        self.persistence_service.save_parsed_document(
            file_key, parsed_document, file_tags if file_tags is not None else {}
        )