wizit-context-ingestor 0.2.5b2__tar.gz → 0.3.0b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wizit-context-ingestor might be problematic. Click here for more details.

Files changed (49) hide show
  1. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/PKG-INFO +10 -1
  2. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/README.md +7 -0
  3. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/pyproject.toml +3 -1
  4. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/__init__.py +3 -0
  5. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/application/context_chunk_service.py +228 -0
  6. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/application/interfaces.py +1 -1
  7. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/application/transcription_service.py +178 -0
  8. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/data/kdb.py +10 -0
  9. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/data/prompts.py +295 -0
  10. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/data/storage.py +10 -0
  11. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  12. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  13. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
  14. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  15. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  16. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  17. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/vertex_model.py +56 -28
  18. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/main.py +248 -0
  19. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/utils/file_utils.py +13 -0
  20. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/context_nodes.py +73 -0
  21. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/context_state.py +10 -0
  22. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/context_tools.py +58 -0
  23. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/context_workflow.py +42 -0
  24. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  25. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  26. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/transcription_state.py +17 -0
  27. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  28. wizit_context_ingestor-0.3.0b1/src/wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  29. wizit_context_ingestor-0.2.5b2/src/wizit_context_ingestor/__init__.py +0 -3
  30. wizit_context_ingestor-0.2.5b2/src/wizit_context_ingestor/application/context_chunk_service.py +0 -114
  31. wizit_context_ingestor-0.2.5b2/src/wizit_context_ingestor/application/transcription_service.py +0 -95
  32. wizit_context_ingestor-0.2.5b2/src/wizit_context_ingestor/data/prompts.py +0 -141
  33. wizit_context_ingestor-0.2.5b2/src/wizit_context_ingestor/main.py +0 -193
  34. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/.DS_Store +0 -0
  35. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/application/__init__.py +0 -0
  36. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/data/__init__.py +0 -0
  37. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/domain/__init__.py +0 -0
  38. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/domain/models.py +0 -0
  39. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/domain/services.py +0 -0
  40. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/__init__.py +0 -0
  41. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
  42. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
  43. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
  44. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
  45. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/services/.DS_Store +0 -0
  46. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/services/__init__.py +0 -0
  47. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/services/chunks.py +0 -0
  48. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
  49. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.3.0b1}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
@@ -1,13 +1,15 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.5b2
3
+ Version: 0.3.0b1
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
7
7
  Requires-Dist: langchain-aws>=0.2.31
8
+ Requires-Dist: langchain-chroma>=0.2.6
8
9
  Requires-Dist: langchain-experimental>=0.3.4
9
10
  Requires-Dist: langchain-google-vertexai>=2.0.28
10
11
  Requires-Dist: langchain-redis>=0.2.3
12
+ Requires-Dist: langgraph>=0.6.8
11
13
  Requires-Dist: pillow>=11.3.0
12
14
  Requires-Dist: pymupdf>=1.26.4
13
15
  Requires-Python: >=3.12
@@ -137,6 +139,13 @@ Finally
137
139
  poetry publish -r tbbcmegaingestor
138
140
  ```
139
141
 
142
+ # USAGE
143
+
144
+ ## For transcriptions
145
+
146
+ ----- TODO ---
147
+ You can provide the number of retries and a transcription quality threshold.
148
+
140
149
  ## License
141
150
 
142
151
  This project is licensed under the Apache License - see the LICENSE file for details.
@@ -122,6 +122,13 @@ Finally
122
122
  poetry publish -r tbbcmegaingestor
123
123
  ```
124
124
 
125
+ # USAGE
126
+
127
+ ## For transcriptions
128
+
129
+ ----- TODO ---
130
+ You can provide the number of retries and a transcription quality threshold.
131
+
125
132
  ## License
126
133
 
127
134
  This project is licensed under the Apache License - see the LICENSE file for details.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wizit_context_ingestor"
3
- version = "0.2.5-beta-2"
3
+ version = "0.3.0-beta-1"
4
4
  description = "Contextual Rag with Cloud Solutions"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -8,9 +8,11 @@ dependencies = [
8
8
  "anthropic[vertex]>=0.66.0",
9
9
  "boto3>=1.40.23",
10
10
  "langchain-aws>=0.2.31",
11
+ "langchain-chroma>=0.2.6",
11
12
  "langchain-experimental>=0.3.4",
12
13
  "langchain-google-vertexai>=2.0.28",
13
14
  "langchain-redis>=0.2.3",
15
+ "langgraph>=0.6.8",
14
16
  "pillow>=11.3.0",
15
17
  "pymupdf>=1.26.4",
16
18
  ]
@@ -0,0 +1,3 @@
1
+ from .main import ChunksManager, TranscriptionManager
2
+
3
+ __all__ = ["ChunksManager", "TranscriptionManager"]
@@ -0,0 +1,228 @@
1
+ from langchain_core.output_parsers.pydantic import PydanticOutputParser
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.documents import Document
4
+ from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
5
+ from langchain_core.messages.human import HumanMessage
6
+ from ..workflows.context_workflow import ContextWorkflow
7
+ from typing import Dict, Any, Optional, List
8
+ from .interfaces import (
9
+ AiApplicationService,
10
+ PersistenceService,
11
+ RagChunker,
12
+ EmbeddingsManager,
13
+ )
14
+ import logging
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class ContextChunksInDocumentService:
    """
    Service that chunks a markdown document, attaches a generated context to
    every chunk and indexes the resulting chunks in the embeddings store.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        rag_chunker: RagChunker,
        embeddings_manager: EmbeddingsManager,
        target_language: str = "es",
    ):
        """
        Initialize the service.

        Besides storing its collaborators, construction initializes the vector
        store of ``embeddings_manager`` and loads the chat model from
        ``ai_application_service``.

        Args:
            ai_application_service: Provider used to load the chat model.
            persistence_service: Storage used to load markdown file content.
            rag_chunker: Splitter that turns a document into chunks.
            embeddings_manager: Store used to index and delete chunks.
            target_language: Stored on the instance; not read by the methods
                of this class.
        """
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.rag_chunker = rag_chunker
        self.embeddings_manager = embeddings_manager
        self.target_language = target_language
        self.embeddings_manager.init_vector_store()
        self.chat_model = self.ai_application_service.load_chat_model()
        # TODO: expose the additional context instructions to callers.
        self.context_additional_instructions = ""
        # Metadata key under which the originating file key is stored.
        self.metadata_source = "source"

    def _retrieve_context_chunk_in_document_with_workflow(
        self,
        workflow,
        markdown_content: str,
        chunk: Document,
        chunk_metadata: Optional[Dict[str, Any]] = None,
    ) -> Document:
        """
        Generate the context of a single chunk with a compiled workflow.

        Args:
            workflow: Compiled context workflow; only ``invoke`` is used.
            markdown_content: Full document text given to the workflow.
            chunk: Chunk to enrich. It is modified in place.
            chunk_metadata: Optional extra entries merged into the chunk
                metadata.

        Returns:
            The same ``chunk`` object, with ``metadata["context"]`` set.

        Raises:
            Exception: Any error raised by the workflow is logged and
                re-raised unchanged.
        """
        try:
            result = workflow.invoke(
                {
                    "messages": [
                        HumanMessage(
                            content=[
                                {
                                    "type": "text",
                                    "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
                                },
                            ]
                        )
                    ],
                    "document_content": markdown_content,
                },
                {
                    # NOTE(review): these keys are named after the
                    # transcription workflow; confirm the context workflow
                    # actually reads them.
                    "configurable": {
                        "transcription_accuracy_threshold": 0.95,
                        "max_transcription_retries": 2,
                    }
                },
            )
            # The context is kept in metadata; page_content stays untouched.
            chunk.metadata["context"] = result["context"]
            if chunk_metadata:
                chunk.metadata.update(chunk_metadata)
            return chunk
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    def retrieve_context_chunks_in_document_with_workflow(
        self,
        markdown_content: str,
        chunks: List[Document],
        chunks_metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """
        Generate the context of every chunk of a document.

        The context workflow is built and compiled once and then reused for
        all chunks.

        Args:
            markdown_content: Full document text given to the workflow.
            chunks: Chunks to enrich. They are modified in place.
            chunks_metadata: Optional extra entries merged into the metadata
                of every chunk.

        Returns:
            The enriched chunks, in the order of ``chunks``.

        Raises:
            Exception: Any error is logged and re-raised unchanged.
        """
        try:
            context_workflow = ContextWorkflow(
                self.chat_model, self.context_additional_instructions
            )
            compiled_context_workflow = context_workflow.gen_workflow().compile()
            return [
                self._retrieve_context_chunk_in_document_with_workflow(
                    compiled_context_workflow,
                    markdown_content,
                    chunk,
                    chunks_metadata,
                )
                for chunk in chunks
            ]
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    def get_context_chunks_in_document(
        self, file_key: str, file_tags: Optional[dict] = None
    ):
        """
        Load a markdown file, chunk it, contextualize the chunks and index them.

        Args:
            file_key: Key of the markdown file in the persistence service. It
                is also used as document id and as the ``source`` metadata.
            file_tags: Optional extra entries merged into the metadata of
                every chunk.

        Returns:
            The list of contextualized chunks that were indexed.

        Raises:
            Exception: Any error other than a failed delete of the previous
                chunks is logged and re-raised unchanged.
        """
        try:
            markdown_content = self.persistence_service.load_markdown_file_content(
                file_key
            )
            langchain_rag_document = Document(
                id=file_key,
                page_content=markdown_content,
                metadata={self.metadata_source: file_key},
            )
            logger.info(f"Document loaded:{file_key}")
            chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
            logger.info(f"Chunks generated:{len(chunks)}")
            context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
                markdown_content, chunks, file_tags
            )
            logger.info(f"Context chunks generated:{len(context_chunks)}")
            # Upsert: drop chunks previously indexed for this source first.
            # A failed delete is best-effort and must not block indexing.
            try:
                logger.info(f"deleting chunks: {file_key}")
                self.delete_document_context_chunks(file_key)
            except Exception as e:
                logger.error(f"could not delete by source: {e}")
            self.embeddings_manager.index_documents(context_chunks)
            return context_chunks
        except Exception:
            logger.error("Error get_context_chunks_in_document")
            raise

    def delete_document_context_chunks(self, file_key: str):
        """
        Delete every indexed chunk whose ``source`` metadata equals ``file_key``.

        Raises:
            Exception: Any error from the embeddings manager is logged and
                re-raised unchanged.
        """
        try:
            self.embeddings_manager.delete_documents_by_metadata_key(
                self.metadata_source, file_key
            )
        except Exception as e:
            logger.error(f"Error delete_document_context_chunks: {str(e)}")
            raise
@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
92
92
  pass
93
93
 
94
94
  @abstractmethod
95
- def index_documents(self, documents: List[Document]):
95
+ def index_documents(self, documents: list[Document]):
96
96
  """Index documents."""
97
97
  pass
98
98
 
@@ -0,0 +1,178 @@
1
+ from typing import Tuple, List, Dict, Optional
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.output_parsers.pydantic import PydanticOutputParser
4
+ from langchain_core.messages import HumanMessage
5
+ from logging import getLogger
6
+ from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
7
+ from ..domain.models import ParsedDoc, ParsedDocPage
8
+ from ..domain.services import ParseDocModelService
9
+ from .interfaces import AiApplicationService, PersistenceService
10
+ from ..workflows.transcription_workflow import TranscriptionWorkflow
11
+
12
+ logger = getLogger(__name__)
13
+
14
+
15
class TranscriptionService:
    """
    Service that transcribes the pages of a document with a transcription
    workflow and saves the parsed result.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        target_language: str = "es",
        transcription_additional_instructions: str = "",
        transcription_accuracy_threshold: int = 90,
        max_transcription_retries: int = 2,
    ):
        """
        Initialize the service and load the chat model.

        Args:
            ai_application_service: Provider used to load the chat model.
            persistence_service: Storage used to read raw files and to save
                parsed documents.
            target_language: Stored on the instance; not read by the methods
                of this class.
            transcription_additional_instructions: Extra instructions handed
                to the transcription workflow.
            transcription_accuracy_threshold: Value between 0 and 95 passed
                to the workflow configuration.
            max_transcription_retries: Value between 1 and 3 passed to the
                workflow configuration.

        Raises:
            ValueError: If the threshold or the retry count is out of range.
        """
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.target_language = target_language
        if (
            transcription_accuracy_threshold < 0
            or transcription_accuracy_threshold > 95
        ):
            raise ValueError(
                "transcription_accuracy_threshold must be between 0 and 95"
            )
        if max_transcription_retries < 1 or max_transcription_retries > 3:
            raise ValueError(
                "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
            )
        self.transcription_accuracy_threshold = transcription_accuracy_threshold
        self.max_transcription_retries = max_transcription_retries
        self.transcription_additional_instructions = (
            transcription_additional_instructions
        )
        self.chat_model = self.ai_application_service.load_chat_model()

    def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
        """
        Transcribe the image of one page with the transcription workflow.

        Args:
            document: Page whose ``page_base64`` image is transcribed. It is
                modified in place.

        Returns:
            The same page object, with ``page_text`` set to the transcription.

        Raises:
            ValueError: If the workflow result holds an empty transcription.
        """
        # NOTE(review): the workflow is rebuilt and compiled for every page;
        # confirm whether a compiled workflow can be reused across pages.
        transcription_workflow = TranscriptionWorkflow(
            self.chat_model, self.transcription_additional_instructions
        )
        compiled_transcription_workflow = (
            transcription_workflow.gen_workflow().compile()
        )
        result = compiled_transcription_workflow.invoke(
            {
                "messages": [
                    HumanMessage(
                        content=[
                            {
                                "type": "text",
                                "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
                            },
                        ]
                    ),
                    HumanMessage(
                        content=[
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{document.page_base64}"
                                },
                            }
                        ]
                    ),
                ]
            },
            {
                "configurable": {
                    "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
                    "max_transcription_retries": self.max_transcription_retries,
                }
            },
        )
        if not result["transcription"]:
            raise ValueError("No transcription found")
        document.page_text = result["transcription"]
        return document

    def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
        """
        Transcribe every page of a raw file and build the parsed document.

        Args:
            file_key: Key of the raw file in the persistence service.

        Returns:
            A tuple ``(parsed_pages, parsed_document)``: the transcribed pages
            and the document built from them by ``create_md_content``.
        """
        raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
        parse_doc_model_service = ParseDocModelService(raw_file_path)
        document_pages = parse_doc_model_service.parse_document_to_base64()
        parsed_pages = [
            self.parse_doc_page_with_workflow(page) for page in document_pages
        ]
        logger.info(f"Parsed {len(parsed_pages)} pages")
        parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
        return parsed_pages, parsed_document

    def save_parsed_document(
        self,
        file_key: str,
        parsed_document: ParsedDoc,
        file_tags: Optional[Dict[str, str]] = None,
    ):
        """
        Save the parsed document through the persistence service.

        Args:
            file_key: Key under which the document is saved.
            parsed_document: Parsed document to save.
            file_tags: Optional tags forwarded to the persistence service.
                ``None`` is forwarded as a fresh empty dict.
        """
        # A fresh dict per call avoids sharing one mutable default object
        # between invocations.
        tags = {} if file_tags is None else file_tags
        self.persistence_service.save_parsed_document(
            file_key, parsed_document, tags
        )
@@ -0,0 +1,10 @@
1
+ from enum import Enum
2
+ from typing import Literal
3
+
4
+
5
class KdbServices(Enum):
    """Identifiers of the supported knowledge-database backends."""

    REDIS = "redis"
    CHROMA = "chroma"


# Type alias for annotations that accept a backend name as a plain string.
# Literal[...] must be parameterized with literal values to be understood by
# static type checkers, so the strings are spelled out here; keep them in sync
# with the values of KdbServices.
kdb_services = Literal["redis", "chroma"]