wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/interfaces.py +1 -1
  4. wizit_context_ingestor/application/transcription_service.py +132 -49
  5. wizit_context_ingestor/data/kdb.py +10 -0
  6. wizit_context_ingestor/data/prompts.py +156 -2
  7. wizit_context_ingestor/data/storage.py +10 -0
  8. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  9. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  10. wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
  11. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  12. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  13. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  14. wizit_context_ingestor/infra/vertex_model.py +56 -28
  15. wizit_context_ingestor/main.py +160 -105
  16. wizit_context_ingestor/utils/file_utils.py +13 -0
  17. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  18. wizit_context_ingestor/workflows/context_state.py +10 -0
  19. wizit_context_ingestor/workflows/context_tools.py +58 -0
  20. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  21. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  22. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  23. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  24. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  25. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  26. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
  27. wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
  28. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
  29. wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
@@ -1,3 +1,3 @@
1
- from .main import DeelabTranscribeManager, DeelabRedisChunksManager
1
+ from .main import ChunksManager, TranscriptionManager
2
2
 
3
- __all__ = ["DeelabTranscribeManager", "DeelabRedisChunksManager"]
3
+ __all__ = ["ChunksManager", "TranscriptionManager"]
@@ -2,8 +2,15 @@ from langchain_core.output_parsers.pydantic import PydanticOutputParser
2
2
  from langchain_core.prompts import ChatPromptTemplate
3
3
  from langchain_core.documents import Document
4
4
  from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
5
+ from langchain_core.messages.human import HumanMessage
6
+ from ..workflows.context_workflow import ContextWorkflow
5
7
  from typing import Dict, Any, Optional, List
6
- from .interfaces import AiApplicationService, PersistenceService, RagChunker, EmbeddingsManager
8
+ from .interfaces import (
9
+ AiApplicationService,
10
+ PersistenceService,
11
+ RagChunker,
12
+ EmbeddingsManager,
13
+ )
7
14
  import logging
8
15
 
9
16
 
@@ -21,7 +28,7 @@ class ContextChunksInDocumentService:
21
28
  persistence_service: PersistenceService,
22
29
  rag_chunker: RagChunker,
23
30
  embeddings_manager: EmbeddingsManager,
24
- target_language: str = 'es'
31
+ target_language: str = "es",
25
32
  ):
26
33
  """
27
34
  Initialize the ChunkerService.
@@ -33,48 +40,144 @@ class ContextChunksInDocumentService:
33
40
  self.target_language = target_language
34
41
  self.embeddings_manager.init_vector_store()
35
42
  self.chat_model = self.ai_application_service.load_chat_model()
43
+ # TODO
44
+ self.context_additional_instructions = ""
45
+ self.metadata_source = "source"
36
46
 
37
- def _retrieve_context_chunk_in_document(self, markdown_content: str, chunk: Document, chunk_metadata: Optional[Dict[str, Any]] = None) -> Document:
47
+ def _retrieve_context_chunk_in_document_with_workflow(
48
+ self,
49
+ workflow,
50
+ markdown_content: str,
51
+ chunk: Document,
52
+ chunk_metadata: Optional[Dict[str, Any]] = None,
53
+ ) -> Document:
38
54
  """Retrieve context chunks in document."""
39
55
  try:
40
- chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
41
- # Create the prompt template with image
42
- prompt = ChatPromptTemplate.from_messages([
43
- ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
44
- (
45
- "human", [{
46
- "type": "text",
47
- "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language"
48
- }]
49
- ),
50
- ]).partial(
51
- document_content=markdown_content,
52
- format_instructions=chunk_output_parser.get_format_instructions()
56
+ result = workflow.invoke(
57
+ {
58
+ "messages": [
59
+ HumanMessage(
60
+ content=[
61
+ {
62
+ "type": "text",
63
+ "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
64
+ },
65
+ ]
66
+ )
67
+ ],
68
+ "document_content": markdown_content,
69
+ },
70
+ {
71
+ "configurable": {
72
+ "transcription_accuracy_threshold": 0.95,
73
+ "max_transcription_retries": 2,
74
+ }
75
+ },
53
76
  )
54
- model_with_structured_output = self.chat_model.with_structured_output(ContextChunk)
55
- # Create the chain
56
- chain = prompt | model_with_structured_output
57
- # Process the image
58
- results = chain.invoke({})
59
- chunk.page_content = f"Context:{results.context}, Content:{chunk.page_content}"
60
- chunk.metadata["context"] = results.context
77
+ # chunk.page_content = (
78
+ # f"Context:{result['context']}, Content:{chunk.page_content}"
79
+ # )
80
+ chunk.metadata["context"] = result["context"]
61
81
  if chunk_metadata:
62
82
  for key, value in chunk_metadata.items():
63
83
  chunk.metadata[key] = value
64
84
  return chunk
65
-
66
85
  except Exception as e:
67
86
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
68
87
  raise
69
88
 
89
+ # def _retrieve_context_chunk_in_document(
90
+ # self,
91
+ # markdown_content: str,
92
+ # chunk: Document,
93
+ # chunk_metadata: Optional[Dict[str, Any]] = None,
94
+ # ) -> Document:
95
+ # """Retrieve context chunks in document."""
96
+ # try:
97
+ # chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
98
+ # # Create the prompt template with image
99
+ # prompt = ChatPromptTemplate.from_messages(
100
+ # [
101
+ # ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
102
+ # (
103
+ # "human",
104
+ # [
105
+ # {
106
+ # "type": "text",
107
+ # "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
108
+ # }
109
+ # ],
110
+ # ),
111
+ # ]
112
+ # ).partial(
113
+ # document_content=markdown_content,
114
+ # format_instructions=chunk_output_parser.get_format_instructions(),
115
+ # )
116
+ # model_with_structured_output = self.chat_model.with_structured_output(
117
+ # ContextChunk
118
+ # )
119
+ # # Create the chain
120
+ # chain = prompt | model_with_structured_output
121
+ # # Process the image
122
+ # results = chain.invoke({})
123
+ # # chunk.page_content = (
124
+ # # f"Context:{results.context}, Content:{chunk.page_content}"
125
+ # # )
126
+ # chunk.metadata["context"] = results.context
127
+ # if chunk_metadata:
128
+ # for key, value in chunk_metadata.items():
129
+ # chunk.metadata[key] = value
130
+ # return chunk
131
+
132
+ # except Exception as e:
133
+ # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
134
+ # raise
70
135
 
71
- def retrieve_context_chunks_in_document(self, markdown_content: str, chunks: List[Document], chunks_metadata: Optional[Dict[str, Any]] = None) -> List[Document]:
136
+ # def retrieve_context_chunks_in_document(
137
+ # self,
138
+ # markdown_content: str,
139
+ # chunks: List[Document],
140
+ # chunks_metadata: Optional[Dict[str, Any]] = None,
141
+ # ) -> List[Document]:
142
+ # """Retrieve context chunks in document."""
143
+ # try:
144
+ # context_chunks = list(
145
+ # map(
146
+ # lambda chunk: self._retrieve_context_chunk_in_document(
147
+ # markdown_content, chunk, chunks_metadata
148
+ # ),
149
+ # chunks,
150
+ # )
151
+ # )
152
+ # return context_chunks
153
+ # except Exception as e:
154
+ # logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
155
+ # raise
156
+
157
+ def retrieve_context_chunks_in_document_with_workflow(
158
+ self,
159
+ markdown_content: str,
160
+ chunks: List[Document],
161
+ chunks_metadata: Optional[Dict[str, Any]] = None,
162
+ ) -> List[Document]:
72
163
  """Retrieve context chunks in document."""
73
164
  try:
74
- context_chunks = list(map(
75
- lambda chunk: self._retrieve_context_chunk_in_document(markdown_content, chunk, chunks_metadata),
76
- chunks
77
- ))
165
+ context_workflow = ContextWorkflow(
166
+ self.chat_model, self.context_additional_instructions
167
+ )
168
+ compiled_context_workflow = context_workflow.gen_workflow()
169
+ compiled_context_workflow = compiled_context_workflow.compile()
170
+ context_chunks = list(
171
+ map(
172
+ lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
173
+ compiled_context_workflow,
174
+ markdown_content,
175
+ chunk,
176
+ chunks_metadata,
177
+ ),
178
+ chunks,
179
+ )
180
+ )
78
181
  return context_chunks
79
182
  except Exception as e:
80
183
  logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
@@ -85,18 +188,27 @@ class ContextChunksInDocumentService:
85
188
  Get the context chunks in a document.
86
189
  """
87
190
  try:
88
- markdown_content = self.persistence_service.load_markdown_file_content(file_key)
191
+ markdown_content = self.persistence_service.load_markdown_file_content(
192
+ file_key
193
+ )
89
194
  langchain_rag_document = Document(
195
+ id=file_key,
90
196
  page_content=markdown_content,
91
- metadata={
92
- "source": file_key
93
- }
197
+ metadata={self.metadata_source: file_key},
94
198
  )
95
199
  logger.info(f"Document loaded:{file_key}")
96
200
  chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
97
201
  logger.info(f"Chunks generated:{len(chunks)}")
98
- context_chunks = self.retrieve_context_chunks_in_document(markdown_content, chunks, file_tags)
202
+ context_chunks = self.retrieve_context_chunks_in_document_with_workflow(
203
+ markdown_content, chunks, file_tags
204
+ )
99
205
  logger.info(f"Context chunks generated:{len(context_chunks)}")
206
+ # upsert validation
207
+ try:
208
+ print(f"deleting chunks: {file_key}")
209
+ self.delete_document_context_chunks(file_key)
210
+ except Exception as e:
211
+ logger.error(f"could not delete by source: {e}")
100
212
  self.embeddings_manager.index_documents(context_chunks)
101
213
  return context_chunks
102
214
  except Exception as e:
@@ -108,7 +220,9 @@ class ContextChunksInDocumentService:
108
220
  Delete the context chunks in a document.
109
221
  """
110
222
  try:
111
- self.embeddings_manager.delete_documents_by_source_id(file_key)
223
+ self.embeddings_manager.delete_documents_by_metadata_key(
224
+ self.metadata_source, file_key
225
+ )
112
226
  except Exception as e:
113
227
  logger.error(f"Error delete_document_context_chunks: {str(e)}")
114
228
  raise e
@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
92
92
  pass
93
93
 
94
94
  @abstractmethod
95
- def index_documents(self, documents: List[Document]):
95
+ def index_documents(self, documents: list[Document]):
96
96
  """Index documents."""
97
97
  pass
98
98
 
@@ -1,76 +1,153 @@
1
1
  from typing import Tuple, List, Dict, Optional
2
2
  from langchain_core.prompts import ChatPromptTemplate
3
3
  from langchain_core.output_parsers.pydantic import PydanticOutputParser
4
+ from langchain_core.messages import HumanMessage
4
5
  from logging import getLogger
5
6
  from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
6
7
  from ..domain.models import ParsedDoc, ParsedDocPage
7
8
  from ..domain.services import ParseDocModelService
8
9
  from .interfaces import AiApplicationService, PersistenceService
10
+ from ..workflows.transcription_workflow import TranscriptionWorkflow
9
11
 
10
12
  logger = getLogger(__name__)
11
13
 
12
14
 
13
15
  class TranscriptionService:
14
16
  """
15
- Service for transcribing documents.
17
+ Service for transcribing documents.
16
18
  """
17
19
 
18
20
  def __init__(
19
21
  self,
20
22
  ai_application_service: AiApplicationService,
21
23
  persistence_service: PersistenceService,
22
- target_language: str = 'es'
24
+ target_language: str = "es",
25
+ transcription_additional_instructions: str = "",
26
+ transcription_accuracy_threshold: int = 90,
27
+ max_transcription_retries: int = 2,
23
28
  ):
24
29
  self.ai_application_service = ai_application_service
25
30
  self.persistence_service = persistence_service
26
31
  self.target_language = target_language
32
+ if (
33
+ transcription_accuracy_threshold < 0
34
+ or transcription_accuracy_threshold > 95
35
+ ):
36
+ raise ValueError(
37
+ "transcription_accuracy_threshold must be between 0 and 95"
38
+ )
39
+ if max_transcription_retries < 1 or max_transcription_retries > 3:
40
+ raise ValueError(
41
+ "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
42
+ )
43
+ self.transcription_accuracy_threshold = transcription_accuracy_threshold
44
+ self.max_transcription_retries = max_transcription_retries
45
+ self.transcription_additional_instructions = (
46
+ transcription_additional_instructions
47
+ )
27
48
  self.chat_model = self.ai_application_service.load_chat_model()
28
49
 
29
- def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
30
- """Transcribe an image to text.
31
- Args:
32
- document: The document with the image to transcribe
33
- Returns:
34
- Processed text
35
- """
36
- try:
37
- # Create the prompt template with image
38
- transcription_output_parser = PydanticOutputParser(pydantic_object=Transcription)
39
- prompt = ChatPromptTemplate.from_messages([
40
- ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
41
- ("human", [{
42
- "type": "image",
43
- "image_url": {
44
- "url": f"data:image/png;base64,{document.page_base64}"
50
+ # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
51
+ # """Transcribe an image to text.
52
+ # Args:
53
+ # document: The document with the image to transcribe
54
+ # Returns:
55
+ # Processed text
56
+ # """
57
+ # try:
58
+ # # Create the prompt template with image
59
+ # transcription_output_parser = PydanticOutputParser(
60
+ # pydantic_object=Transcription
61
+ # )
62
+ # prompt = ChatPromptTemplate.from_messages(
63
+ # [
64
+ # ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
65
+ # (
66
+ # "human",
67
+ # [
68
+ # {
69
+ # "type": "image",
70
+ # "image_url": {
71
+ # "url": f"data:image/png;base64,{document.page_base64}"
72
+ # },
73
+ # },
74
+ # {
75
+ # "type": "text",
76
+ # "text": "Transcribe the document, ensure all content transcribed accurately",
77
+ # },
78
+ # ],
79
+ # ),
80
+ # ]
81
+ # ).partial(
82
+ # transcription_additional_instructions=self.transcription_additional_instructions,
83
+ # format_instructions=transcription_output_parser.get_format_instructions(),
84
+ # )
85
+ # model_with_structured_output = self.chat_model.with_structured_output(
86
+ # Transcription
87
+ # )
88
+ # # Create the chain
89
+ # chain = prompt | model_with_structured_output
90
+ # # Process the image
91
+ # chain = chain.with_retry(
92
+ # stop_after_attempt=3, exponential_jitter_params={"initial": 60}
93
+ # )
94
+ # result = chain.invoke({})
95
+ # if result.transcription:
96
+ # document.page_text = result.transcription
97
+ # else:
98
+ # raise ValueError("No transcription found")
99
+ # return document
100
+ # except Exception as e:
101
+ # logger.error(f"Failed to parse document page: {str(e)}")
102
+ # raise
103
+
104
+ def parse_doc_page_with_workflow(self, document: ParsedDocPage) -> ParsedDocPage:
105
+ """Transcribe an image to text using an agent.
106
+ Args:
107
+ document: The document with the image to transcribe
108
+ Returns:
109
+ Processed text
110
+ """
111
+ transcription_workflow = TranscriptionWorkflow(
112
+ self.chat_model, self.transcription_additional_instructions
113
+ )
114
+ compiled_transcription_workflow = transcription_workflow.gen_workflow()
115
+ compiled_transcription_workflow = compiled_transcription_workflow.compile()
116
+ result = compiled_transcription_workflow.invoke(
117
+ {
118
+ "messages": [
119
+ HumanMessage(
120
+ content=[
121
+ {
122
+ "type": "text",
123
+ "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
124
+ },
125
+ ]
126
+ ),
127
+ HumanMessage(
128
+ content=[
129
+ {
130
+ "type": "image_url",
131
+ "image_url": {
132
+ "url": f"data:image/png;base64,{document.page_base64}"
133
+ },
45
134
  }
46
- },
47
- {
48
- "type": "text",
49
- "text": f"Transcribe the document, ensure all content transcribed is using '{self.target_language}' language"
50
- }]
135
+ ]
51
136
  ),
52
- ]).partial(
53
- format_instructions=transcription_output_parser.get_format_instructions()
54
- )
55
- model_with_structured_output = self.chat_model.with_structured_output(Transcription)
56
- # Create the chain
57
- chain = prompt | model_with_structured_output
58
- # Process the image
59
- chain = chain.with_retry(
60
- stop_after_attempt=3,
61
- exponential_jitter_params={
62
- "initial": 60
63
- }
64
- )
65
- result = chain.invoke({})
66
- if result.transcription:
67
- document.page_text = result.transcription
68
- else:
69
- raise ValueError("No transcription found")
70
- return document
71
- except Exception as e:
72
- logger.error(f"Failed to parse document page: {str(e)}")
73
- raise
137
+ ]
138
+ },
139
+ {
140
+ "configurable": {
141
+ "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
142
+ "max_transcription_retries": self.max_transcription_retries,
143
+ }
144
+ },
145
+ )
146
+ if result["transcription"]:
147
+ document.page_text = result["transcription"]
148
+ else:
149
+ raise ValueError("No transcription found")
150
+ return document
74
151
 
75
152
  def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
76
153
  """
@@ -81,15 +158,21 @@ class TranscriptionService:
81
158
  document_pages = parse_doc_model_service.parse_document_to_base64()
82
159
  parsed_pages = []
83
160
  for page in document_pages:
84
- page = self.parse_doc_page(page)
161
+ page = self.parse_doc_page_with_workflow(page)
85
162
  parsed_pages.append(page)
86
163
  logger.info(f"Parsed {len(parsed_pages)} pages")
87
164
  parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
88
165
  return parsed_pages, parsed_document
89
166
 
90
-
91
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[Dict[str, str]] = {}):
167
+ def save_parsed_document(
168
+ self,
169
+ file_key: str,
170
+ parsed_document: ParsedDoc,
171
+ file_tags: Optional[Dict[str, str]] = {},
172
+ ):
92
173
  """
93
174
  Save the parsed document to a file.
94
175
  """
95
- self.persistence_service.save_parsed_document(file_key, parsed_document, file_tags)
176
+ self.persistence_service.save_parsed_document(
177
+ file_key, parsed_document, file_tags
178
+ )
@@ -0,0 +1,10 @@
1
+ from enum import Enum
2
+ from typing import Literal
3
+
4
+
5
+ class KdbServices(Enum):
6
+ REDIS = "redis"
7
+ CHROMA = "chroma"
8
+
9
+
10
+ kdb_services = Literal[KdbServices.REDIS.value, KdbServices.CHROMA.value]