wizit-context-ingestor 0.2.5b3__py3-none-any.whl → 0.3.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/transcription_service.py +132 -52
  4. wizit_context_ingestor/data/kdb.py +10 -0
  5. wizit_context_ingestor/data/prompts.py +150 -3
  6. wizit_context_ingestor/data/storage.py +10 -0
  7. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  8. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  9. wizit_context_ingestor/infra/rag/chroma_embeddings.py +30 -31
  10. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  11. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  12. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  13. wizit_context_ingestor/infra/vertex_model.py +56 -28
  14. wizit_context_ingestor/main.py +192 -106
  15. wizit_context_ingestor/utils/file_utils.py +13 -0
  16. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  17. wizit_context_ingestor/workflows/context_state.py +10 -0
  18. wizit_context_ingestor/workflows/context_tools.py +58 -0
  19. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  20. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  21. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  22. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  23. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  24. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  25. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/METADATA +9 -1
  26. wizit_context_ingestor-0.3.0b2.dist-info/RECORD +44 -0
  27. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/WHEEL +1 -1
  28. wizit_context_ingestor-0.2.5b3.dist-info/RECORD +0 -32
wizit_context_ingestor/__init__.py
@@ -1,3 +1,3 @@
-from .main import DeelabTranscribeManager, DeelabRedisChunksManager
+from .main import ChunksManager, TranscriptionManager
 
-__all__ = ["DeelabTranscribeManager", "DeelabRedisChunksManager"]
+__all__ = ["ChunksManager", "TranscriptionManager"]
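
For consumers, this rename is the breaking change at the package root; a minimal migration sketch (both name sets come straight from the hunk above):

    # Before (0.2.5b3):
    # from wizit_context_ingestor import DeelabTranscribeManager, DeelabRedisChunksManager
    # After (0.3.0b2):
    from wizit_context_ingestor import ChunksManager, TranscriptionManager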
wizit_context_ingestor/application/context_chunk_service.py
@@ -2,8 +2,15 @@ from langchain_core.output_parsers.pydantic import PydanticOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.documents import Document
 from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
+from langchain_core.messages.human import HumanMessage
+from ..workflows.context_workflow import ContextWorkflow
 from typing import Dict, Any, Optional, List
-from .interfaces import AiApplicationService, PersistenceService, RagChunker, EmbeddingsManager
+from .interfaces import (
+    AiApplicationService,
+    PersistenceService,
+    RagChunker,
+    EmbeddingsManager,
+)
 import logging
 
 
@@ -21,7 +28,7 @@ class ContextChunksInDocumentService:
         persistence_service: PersistenceService,
         rag_chunker: RagChunker,
         embeddings_manager: EmbeddingsManager,
-        target_language: str = 'es'
+        target_language: str = "es",
     ):
         """
         Initialize the ChunkerService.
@@ -33,48 +40,144 @@
         self.target_language = target_language
         self.embeddings_manager.init_vector_store()
         self.chat_model = self.ai_application_service.load_chat_model()
+        # TODO
+        self.context_additional_instructions = ""
+        self.metadata_source = "source"
 
-    def _retrieve_context_chunk_in_document(self, markdown_content: str, chunk: Document, chunk_metadata: Optional[Dict[str, Any]] = None) -> Document:
+    def _retrieve_context_chunk_in_document_with_workflow(
+        self,
+        workflow,
+        markdown_content: str,
+        chunk: Document,
+        chunk_metadata: Optional[Dict[str, Any]] = None,
+    ) -> Document:
         """Retrieve context chunks in document."""
         try:
-            chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
-            # Create the prompt template with image
-            prompt = ChatPromptTemplate.from_messages([
-                ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
-                (
-                    "human", [{
-                        "type": "text",
-                        "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language"
-                    }]
-                ),
-            ]).partial(
-                document_content=markdown_content,
-                format_instructions=chunk_output_parser.get_format_instructions()
+            result = workflow.invoke(
+                {
+                    "messages": [
+                        HumanMessage(
+                            content=[
+                                {
+                                    "type": "text",
+                                    "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
+                                },
+                            ]
+                        )
+                    ],
+                    "document_content": markdown_content,
+                },
+                {
+                    "configurable": {
+                        "transcription_accuracy_threshold": 0.95,
+                        "max_transcription_retries": 2,
+                    }
+                },
            )
-            model_with_structured_output = self.chat_model.with_structured_output(ContextChunk)
-            # Create the chain
-            chain = prompt | model_with_structured_output
-            # Process the image
-            results = chain.invoke({})
-            chunk.page_content = f"Context:{results.context}, Content:{chunk.page_content}"
-            chunk.metadata["context"] = results.context
+            # chunk.page_content = (
+            #     f"Context:{result['context']}, Content:{chunk.page_content}"
+            # )
+            chunk.metadata["context"] = result["context"]
             if chunk_metadata:
                 for key, value in chunk_metadata.items():
                     chunk.metadata[key] = value
             return chunk
-
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
             raise
 
+    # def _retrieve_context_chunk_in_document(
+    #     self,
+    #     markdown_content: str,
+    #     chunk: Document,
+    #     chunk_metadata: Optional[Dict[str, Any]] = None,
+    # ) -> Document:
+    #     """Retrieve context chunks in document."""
+    #     try:
+    #         chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
+    #         # Create the prompt template with image
+    #         prompt = ChatPromptTemplate.from_messages(
+    #             [
+    #                 ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
+    #                 (
+    #                     "human",
+    #                     [
+    #                         {
+    #                             "type": "text",
+    #                             "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
+    #                         }
+    #                     ],
+    #                 ),
+    #             ]
+    #         ).partial(
+    #             document_content=markdown_content,
+    #             format_instructions=chunk_output_parser.get_format_instructions(),
+    #         )
+    #         model_with_structured_output = self.chat_model.with_structured_output(
+    #             ContextChunk
+    #         )
+    #         # Create the chain
+    #         chain = prompt | model_with_structured_output
+    #         # Process the image
+    #         results = chain.invoke({})
+    #         # chunk.page_content = (
+    #         #     f"Context:{results.context}, Content:{chunk.page_content}"
+    #         # )
+    #         chunk.metadata["context"] = results.context
+    #         if chunk_metadata:
+    #             for key, value in chunk_metadata.items():
+    #                 chunk.metadata[key] = value
+    #         return chunk
+
+    #     except Exception as e:
+    #         logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
+    #         raise
 
-    def retrieve_context_chunks_in_document(self, markdown_content: str, chunks: List[Document], chunks_metadata: Optional[Dict[str, Any]] = None) -> List[Document]:
+    # def retrieve_context_chunks_in_document(
+    #     self,
+    #     markdown_content: str,
+    #     chunks: List[Document],
+    #     chunks_metadata: Optional[Dict[str, Any]] = None,
+    # ) -> List[Document]:
+    #     """Retrieve context chunks in document."""
+    #     try:
+    #         context_chunks = list(
+    #             map(
+    #                 lambda chunk: self._retrieve_context_chunk_in_document(
+    #                     markdown_content, chunk, chunks_metadata
+    #                 ),
+    #                 chunks,
+    #             )
+    #         )
+    #         return context_chunks
+    #     except Exception as e:
+    #         logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
+    #         raise
+
+    def retrieve_context_chunks_in_document_with_workflow(
+        self,
+        markdown_content: str,
+        chunks: List[Document],
+        chunks_metadata: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
         """Retrieve context chunks in document."""
         try:
-            context_chunks = list(map(
-                lambda chunk: self._retrieve_context_chunk_in_document(markdown_content, chunk, chunks_metadata),
-                chunks
-            ))
+            context_workflow = ContextWorkflow(
+                self.chat_model, self.context_additional_instructions
+            )
+            compiled_context_workflow = context_workflow.gen_workflow()
+            compiled_context_workflow = compiled_context_workflow.compile()
+            context_chunks = list(
+                map(
+                    lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
+                        compiled_context_workflow,
+                        markdown_content,
+                        chunk,
+                        chunks_metadata,
+                    ),
+                    chunks,
+                )
+            )
             return context_chunks
         except Exception as e:
             logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
@@ -85,18 +188,27 @@
         Get the context chunks in a document.
         """
         try:
-            markdown_content = self.persistence_service.load_markdown_file_content(file_key)
+            markdown_content = self.persistence_service.load_markdown_file_content(
+                file_key
+            )
             langchain_rag_document = Document(
+                id=file_key,
                 page_content=markdown_content,
-                metadata={
-                    "source": file_key
-                }
+                metadata={self.metadata_source: file_key},
             )
             logger.info(f"Document loaded:{file_key}")
             chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
             logger.info(f"Chunks generated:{len(chunks)}")
-            context_chunks = self.retrieve_context_chunks_in_document(markdown_content, chunks, file_tags)
+            context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
+                markdown_content, chunks, file_tags
+            )
             logger.info(f"Context chunks generated:{len(context_chunks)}")
+            # upsert validation
+            try:
+                print(f"deleting chunks: {file_key}")
+                self.delete_document_context_chunks(file_key)
+            except Exception as e:
+                logger.error(f"could not delete by source: {e}")
             self.embeddings_manager.index_documents(context_chunks)
             return context_chunks
         except Exception as e:
@@ -108,7 +220,9 @@
         Delete the context chunks in a document.
         """
         try:
-            self.embeddings_manager.delete_documents_by_source_id(file_key)
+            self.embeddings_manager.delete_documents_by_metadata_key(
+                self.metadata_source, file_key
+            )
         except Exception as e:
             logger.error(f"Error delete_document_context_chunks: {str(e)}")
             raise e
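
Taken together, these hunks move chunk contextualization onto a compiled workflow and turn re-ingestion into an upsert: existing chunks for a source are deleted by metadata key before the fresh ones are indexed. A minimal driving sketch, assuming the four injected dependencies (ai_service, storage, chunker, embeddings) are constructed elsewhere; only the method and parameter names visible in the hunks above are used:

    # Hedged sketch; dependency wiring is not shown in this diff.
    service = ContextChunksInDocumentService(
        ai_application_service=ai_service,   # assumed instance
        persistence_service=storage,         # assumed instance
        rag_chunker=chunker,                 # assumed instance
        embeddings_manager=embeddings,       # assumed instance
        target_language="es",
    )
    # New in 0.3.0: each chunk is contextualized through the compiled ContextWorkflow.
    context_chunks = service.retrieve_context_chunks_in_document_with_workflow(
        markdown_content,                    # assumed variable: full markdown text
        chunks,                              # assumed variable: List[Document] from the chunker
        chunks_metadata={"team": "support"}, # optional tags merged into chunk metadata
    )
    # Deletion is now keyed on the "source" metadata field, not a source id.
    service.delete_document_context_chunks("docs/manual.md")  # hypothetical file_key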
wizit_context_ingestor/application/transcription_service.py
@@ -1,79 +1,153 @@
 from typing import Tuple, List, Dict, Optional
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers.pydantic import PydanticOutputParser
+from langchain_core.messages import HumanMessage
 from logging import getLogger
 from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
 from ..domain.models import ParsedDoc, ParsedDocPage
 from ..domain.services import ParseDocModelService
 from .interfaces import AiApplicationService, PersistenceService
+from ..workflows.transcription_workflow import TranscriptionWorkflow
 
 logger = getLogger(__name__)
 
 
 class TranscriptionService:
     """
-    Service for transcribing documents.
+    Service for transcribing documents.
     """
 
     def __init__(
         self,
        ai_application_service: AiApplicationService,
         persistence_service: PersistenceService,
-        target_language: str = 'es',
-        transcription_additional_instructions: str = ''
+        target_language: str = "es",
+        transcription_additional_instructions: str = "",
+        transcription_accuracy_threshold: int = 90,
+        max_transcription_retries: int = 2,
     ):
         self.ai_application_service = ai_application_service
         self.persistence_service = persistence_service
         self.target_language = target_language
-        self.transcription_additional_instructions = transcription_additional_instructions
+        if (
+            transcription_accuracy_threshold < 0
+            or transcription_accuracy_threshold > 95
+        ):
+            raise ValueError(
+                "transcription_accuracy_threshold must be between 0 and 95"
+            )
+        if max_transcription_retries < 1 or max_transcription_retries > 3:
+            raise ValueError(
+                "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
+            )
+        self.transcription_accuracy_threshold = transcription_accuracy_threshold
+        self.max_transcription_retries = max_transcription_retries
+        self.transcription_additional_instructions = (
+            transcription_additional_instructions
+        )
         self.chat_model = self.ai_application_service.load_chat_model()
 
-    def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
-        """Transcribe an image to text.
-        Args:
-            document: The document with the image to transcribe
-        Returns:
-            Processed text
-        """
-        try:
-            # Create the prompt template with image
-            transcription_output_parser = PydanticOutputParser(pydantic_object=Transcription)
-            prompt = ChatPromptTemplate.from_messages([
-                ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
-                ("human", [{
-                    "type": "image",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{document.page_base64}"
+    # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
+    #     """Transcribe an image to text.
+    #     Args:
+    #         document: The document with the image to transcribe
+    #     Returns:
+    #         Processed text
+    #     """
+    #     try:
+    #         # Create the prompt template with image
+    #         transcription_output_parser = PydanticOutputParser(
+    #             pydantic_object=Transcription
+    #         )
+    #         prompt = ChatPromptTemplate.from_messages(
+    #             [
+    #                 ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
+    #                 (
+    #                     "human",
+    #                     [
+    #                         {
+    #                             "type": "image",
+    #                             "image_url": {
+    #                                 "url": f"data:image/png;base64,{document.page_base64}"
+    #                             },
+    #                         },
+    #                         {
+    #                             "type": "text",
+    #                             "text": "Transcribe the document, ensure all content transcribed accurately",
+    #                         },
+    #                     ],
+    #                 ),
+    #             ]
+    #         ).partial(
+    #             transcription_additional_instructions=self.transcription_additional_instructions,
+    #             format_instructions=transcription_output_parser.get_format_instructions(),
+    #         )
+    #         model_with_structured_output = self.chat_model.with_structured_output(
+    #             Transcription
+    #         )
+    #         # Create the chain
+    #         chain = prompt | model_with_structured_output
+    #         # Process the image
+    #         chain = chain.with_retry(
+    #             stop_after_attempt=3, exponential_jitter_params={"initial": 60}
+    #         )
+    #         result = chain.invoke({})
+    #         if result.transcription:
+    #             document.page_text = result.transcription
+    #         else:
+    #             raise ValueError("No transcription found")
+    #         return document
+    #     except Exception as e:
+    #         logger.error(f"Failed to parse document page: {str(e)}")
+    #         raise
+
+    def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
+        """Transcribe an image to text using an agent.
+        Args:
+            document: The document with the image to transcribe
+        Returns:
+            Processed text
+        """
+        transcription_workflow = TranscriptionWorkflow(
+            self.chat_model, self.transcription_additional_instructions
+        )
+        compiled_transcription_workflow = transcription_workflow.gen_workflow()
+        compiled_transcription_workflow = compiled_transcription_workflow.compile()
+        result = compiled_transcription_workflow.invoke(
+            {
+                "messages": [
+                    HumanMessage(
+                        content=[
+                            {
+                                "type": "text",
+                                "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
+                            },
+                        ]
+                    ),
+                    HumanMessage(
+                        content=[
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/png;base64,{document.page_base64}"
+                                },
                            }
-                },
-                {
-                    "type": "text",
-                    "text": f"Transcribe the document, ensure all content transcribed is using '{self.target_language}' language"
-                }]
+                        ]
                 ),
-            ]).partial(
-                transcription_additional_instructions=self.transcription_additional_instructions,
-                format_instructions=transcription_output_parser.get_format_instructions()
-            )
-            model_with_structured_output = self.chat_model.with_structured_output(Transcription)
-            # Create the chain
-            chain = prompt | model_with_structured_output
-            # Process the image
-            chain = chain.with_retry(
-                stop_after_attempt=3,
-                exponential_jitter_params={
-                    "initial": 60
-                }
-            )
-            result = chain.invoke({})
-            if result.transcription:
-                document.page_text = result.transcription
-            else:
-                raise ValueError("No transcription found")
-            return document
-        except Exception as e:
-            logger.error(f"Failed to parse document page: {str(e)}")
-            raise
+                ]
+            },
+            {
+                "configurable": {
+                    "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
+                    "max_transcription_retries": self.max_transcription_retries,
+                }
+            },
+        )
+        if result["transcription"]:
+            document.page_text = result["transcription"]
+        else:
+            raise ValueError("No transcription found")
+        return document
 
     def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
         """
@@ -84,15 +158,21 @@
         document_pages = parse_doc_model_service.parse_document_to_base64()
         parsed_pages = []
         for page in document_pages:
-            page = self.parse_doc_page(page)
+            page = self.parse_doc_page_with_workflow(page)
             parsed_pages.append(page)
         logger.info(f"Parsed {len(parsed_pages)} pages")
         parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
         return parsed_pages, parsed_document
 
-
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[Dict[str, str]] = {}):
+    def save_parsed_document(
+        self,
+        file_key: str,
+        parsed_document: ParsedDoc,
+        file_tags: Optional[Dict[str, str]] = {},
+    ):
         """
         Save the parsed document to a file.
         """
-        self.persistence_service.save_parsed_document(file_key, parsed_document, file_tags)
+        self.persistence_service.save_parsed_document(
+            file_key, parsed_document, file_tags
+        )
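
The new constructor parameters are validated eagerly, so misconfiguration fails at construction time rather than mid-transcription. A sketch of the contract, assuming the dependency instances are built elsewhere (their setup is not shown in this diff):

    service = TranscriptionService(
        ai_application_service=ai_service,    # assumed instance
        persistence_service=storage,          # assumed instance
        target_language="es",
        transcription_accuracy_threshold=90,  # ValueError if outside 0..95
        max_transcription_retries=2,          # ValueError if outside 1..3
    )
    # Pages now route through parse_doc_page_with_workflow; the thresholds are
    # forwarded to the compiled workflow via its "configurable" dict.
    pages, parsed_document = service.process_document("docs/scan.pdf")  # hypothetical key
    service.save_parsed_document("docs/scan.md", parsed_document)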
wizit_context_ingestor/data/kdb.py (new file)
@@ -0,0 +1,10 @@
+from enum import Enum
+from typing import Literal
+
+
+class KdbServices(Enum):
+    REDIS = "redis"
+    CHROMA = "chroma"
+
+
+kdb_services = Literal[KdbServices.REDIS.value, KdbServices.CHROMA.value]
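
Note that `kdb_services` builds a `Literal` from enum values, which resolves to `Literal["redis", "chroma"]` at runtime (static type checkers generally reject non-literal expressions inside `Literal[...]`, so the alias is mainly useful at runtime, where `typing.get_args` recovers the allowed names). A hypothetical validation helper, not part of the package; `data/storage.py` later in this diff follows the same pattern:

    from typing import get_args

    def ensure_kdb_service(name: str) -> str:
        # get_args(kdb_services) yields ("redis", "chroma")
        if name not in get_args(kdb_services):
            raise ValueError(f"unsupported kdb service: {name!r}")
        return name

    ensure_kdb_service("redis")  # ok
    ensure_kdb_service("pg")     # raises ValueError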
wizit_context_ingestor/data/prompts.py
@@ -1,5 +1,93 @@
 from pydantic import BaseModel, Field
 
+AGENT_TRANSCRIPTION_SYSTEM_PROMPT = """
+You are an expert document transcription assistant.
+Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
+OBJECTIVE: Create a complete, accurate transcription that preserves the original document's content, structure and formatting.
+TRANSCRIPTION RULES:
+<hard_rules>
+1. document's languages must be detected to ensure correct transcription
+2. Systematically examine each content element (text, images, tables, formatting)
+3. Convert all content to markdown while preserving structure and meaning
+5. Ensure completeness and accuracy of the transcription
+6. TEXT TRANSCRIPTION:
+- Transcribe all visible text exactly as it appears
+- Include: paragraphs, headings, subheadings, headers, footers
+- Include: footnotes, page numbers, bullet points, lists, captions
+- Preserve: bold, italic, underlined, and other text formatting using markdown
+7. LANGUAGE REQUIREMENTS:
+- Transcribed content MUST preserve document's language
+- Translate any secondary language content to maintain consistency
+8. COMPLETENESS:
+- Transcribe the entire document, partial transcriptions are not allowed
+- Never summarize, modify, or generate additional content
+- Maintain original meaning and context
+9. FORMATTING STANDARDS:
+- Use proper markdown syntax for structure
+- Avoid blank lines in transcription
+- Exclude logos, watermarks, and decorative icons
+- Omit special characters that interfere with markdown
+10. IMAGE HANDLING:
+<image_transcription_rules>
+- Extract and transcribe any text within images
+- For data-rich images: create markdown tables when applicable
+- For other images: provide descriptive content summaries
+- Classify each visual element as: Chart, Diagram, Natural Image, Screenshot, or Other
+- Format: <figure_type>Classification</figure_type>
+- Wrap content in <figure></figure> tags with title/caption if available
+</image_transcription_rules>
+11. TABLE PROCESSING:
+<tables_transcription_rules>
+- Convert all tables to proper markdown table format
+- Preserve cell alignment and structure as closely as possible
+- Maintain data relationships and hierarchy
+- Include table headers and formatting
+</tables_transcription_rules>
+12. QUALITY ASSURANCE:
+- Ensure no content is omitted or added
+- Check markdown formatting is correct
+- Confirm structural integrity is maintained
+</hard_rules>
+
+CRITICAL REMINDERS:
+<critical_reminders>
+- Accuracy over speed, every character matters
+- Preserve original document intent and meaning
+- Maintain professional transcription standards
+- Complete transcription is mandatory
+</critical_reminders>
+When provided, use the following transcription notes from previous transcriptions intents to improve the current transcription:
+<transcription_notes>
+{transcription_notes}
+</transcription_notes>
+When provided, use the following additional transcription instructions to improve results:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+# Generate the optimized transcription following these specifications:
+# {format_instructions}
+
+
+IMAGE_TRANSCRIPTION_CHECK_SYSTEM_PROMPT = """
+You are an expert document transcription grader.
+Your task is to evaluate the following transcription quality.
+<rules>
+- Provide an accurate evaluation of the transcription ensuring quality, completeness and accuracy.
+- Transcription has markdown formatting, the markdown format must reflect the original document's structure and formatting.
+- Compare the transcription with the original document (provided as image)
+</rules>
+<transcription>
+{transcription}
+</transcription>
+
+When provided, evaluate whether the following additional transcription instructions provided by the user have been followed:
+<additional_instructions>
+{transcription_additional_instructions}
+</additional_instructions>
+"""
+
+
 IMAGE_TRANSCRIPTION_SYSTEM_PROMPT = """
 You are an expert document transcription assistant. Your task is to transcribe the exact text from the provided document with extreme accuracy while organizing the output using markdown formatting.
 
@@ -78,8 +166,6 @@ CRITICAL REMINDERS:
 
 Generate the optimized transcription following these specifications:
 {format_instructions}
-
-
 """
 
 CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -139,10 +225,71 @@ Generate the optimized context following these specifications:
 {format_instructions}
 """
 
+WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
+You are an expert RAG (Retrieval-Augmented Generation) context generator that creates optimized contextual chunks from markdown document content for enhanced search and retrieval performance.
+OBJECTIVE: Generate rich, searchable context descriptions that maximize retrieval accuracy and relevance in RAG systems.
+WORKFLOW:
+<task_analysis>
+1. LANGUAGE DETECTION: Identify the primary language used in the document content
+2. SEMANTIC ANALYSIS: Understand the chunk's meaning, relationships, and significance within the broader document
+3. CONTEXT GENERATION: Create comprehensive context metadata that enhances retrieval effectiveness
+4. SEARCH OPTIMIZATION: Ensure context includes terms and concepts that users might search for
+5. QUALITY VALIDATION: Verify context completeness and retrieval utility
+</task_analysis>
+CONTEXT GENERATION REQUIREMENTS:
+<context_elements>
+Your generated context must synthesize ALL of these elements into a coherent description:
+- chunk_relation_with_document: How this chunk connects to and fits within the overall document structure and narrative
+- chunk_keywords: Primary and secondary keywords, technical terms, and searchable phrases that would help users find this content
+- chunk_description: Clear explanation of what the chunk contains, including data types, concepts, and information presented
+- chunk_function: The chunk's specific purpose and role (e.g., definition, explanation, example, instruction, procedure, list, summary, analysis, conclusion)
+- chunk_structure: Format and organizational pattern (paragraph, bulleted list, numbered steps, table, code block, heading, etc.)
+- chunk_main_idea: The central concept, message, or takeaway that the chunk communicates
+- chunk_domain: Subject area or field of knowledge (e.g., technical documentation, legal text, medical information, business process)
+- chunk_audience: Intended reader level and background (e.g., beginner, expert, general audience, specific role)
+</context_elements>
+CRITICAL RULES:
+<critical_rules>
+- Context MUST be written in the SAME language as the source document content
+- Be comprehensive yet concise - aim for maximum information density
+- Prioritize search retrieval optimization and semantic understanding
+- Include synonyms and alternative phrasings users might search for
+- Focus on conceptual relationships and knowledge connections
+- Do NOT reproduce or quote the original chunk content verbatim
+- Ensure context is self-contained and understandable without the original chunk
+- Use natural language that flows well while incorporating all required elements
+</critical_rules>
+
+SEARCH OPTIMIZATION GUIDELINES:
+<search_optimization>
+- Include both explicit terms from the content and implicit concepts
+- Consider various ways users might phrase queries related to this content
+- Incorporate hierarchical information (section → subsection → detail level)
+- Add contextual bridges that connect this chunk to related topics
+- Use varied vocabulary to capture different search approaches
+</search_optimization>
+
+<document_content>
+{document_content}
+</document_content>
+
+
+When provided, follow these additional context extraction instructions:
+<additional_instructions>
+{context_additional_instructions}
+</additional_instructions>
+
+"""
+
+
 class ContextChunk(BaseModel):
-    context: str = Field(description="Context description that helps with search retrieval")
+    context: str = Field(
+        description="Context description that helps with search retrieval"
+    )
+
 
 class Transcription(BaseModel):
     """Document Transcription."""
+
     transcription: str = Field(description="Full transcription")
     language: str = Field(description="Main language")
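
The new prompts keep `{placeholder}` fields (`{document_content}`, `{context_additional_instructions}`, `{transcription_notes}`, `{transcription}`). Presumably the new workflow nodes bind them the same way the commented-out service code did, via `ChatPromptTemplate.partial`; a hedged sketch, since the actual binding lives in the new `workflows/` modules whose bodies are not shown in this diff:

    from langchain_core.prompts import ChatPromptTemplate

    prompt = ChatPromptTemplate.from_messages(
        [("system", WORKFLOW_CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT)]
    ).partial(
        document_content=markdown_content,   # assumed variable: full document text
        context_additional_instructions="",  # optional user-supplied instructions
    )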
wizit_context_ingestor/data/storage.py (new file)
@@ -0,0 +1,10 @@
+from enum import Enum
+from typing import Literal
+
+
+class StorageServices(Enum):
+    S3 = "s3"
+    LOCAL = "local"
+
+
+storage_services = Literal[StorageServices.S3.value, StorageServices.LOCAL.value]