wizit-context-ingestor 0.2.5b3__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/PKG-INFO +12 -1
  2. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/README.md +7 -0
  3. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/pyproject.toml +5 -1
  4. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/__init__.py +4 -0
  5. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/context_chunk_service.py +145 -0
  6. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/application/interfaces.py +52 -23
  7. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/kdb_service.py +59 -0
  8. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/transcription_service.py +209 -0
  9. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/kdb.py +13 -0
  10. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/prompts.py +293 -0
  11. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/storage.py +10 -0
  12. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/services.py +6 -11
  13. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  14. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  15. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +37 -33
  16. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +201 -0
  17. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  18. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +8 -1
  19. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +3 -4
  20. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/vertex_model.py +56 -28
  21. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/main.py +283 -0
  22. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/main_chunks.py +173 -0
  23. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/utils/file_utils.py +12 -0
  24. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_nodes.py +73 -0
  25. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_state.py +10 -0
  26. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_tools.py +58 -0
  27. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_workflow.py +42 -0
  28. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  29. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  30. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_state.py +17 -0
  31. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  32. wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  33. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/__init__.py +0 -3
  34. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/context_chunk_service.py +0 -114
  35. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/transcription_service.py +0 -98
  36. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/data/prompts.py +0 -148
  37. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -208
  38. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/main.py +0 -196
  39. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/.DS_Store +0 -0
  40. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/application/__init__.py +0 -0
  41. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/data/__init__.py +0 -0
  42. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/__init__.py +0 -0
  43. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/models.py +0 -0
  44. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/__init__.py +0 -0
  45. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
  46. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
  47. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
  48. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/.DS_Store +0 -0
  49. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/__init__.py +0 -0
  50. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/chunks.py +0 -0
  51. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
  52. {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.5b3
3
+ Version: 0.4.1
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
@@ -8,9 +8,13 @@ Requires-Dist: langchain-aws>=0.2.31
8
8
  Requires-Dist: langchain-chroma>=0.2.6
9
9
  Requires-Dist: langchain-experimental>=0.3.4
10
10
  Requires-Dist: langchain-google-vertexai>=2.0.28
11
+ Requires-Dist: langchain-postgres>=0.0.16
11
12
  Requires-Dist: langchain-redis>=0.2.3
13
+ Requires-Dist: langgraph>=0.6.8
12
14
  Requires-Dist: pillow>=11.3.0
15
+ Requires-Dist: psycopg2-binary>=2.9.11
13
16
  Requires-Dist: pymupdf>=1.26.4
17
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.43
14
18
  Requires-Python: >=3.12
15
19
  Description-Content-Type: text/markdown
16
20
 
@@ -138,6 +142,13 @@ Finally
138
142
  poetry publish -r tbbcmegaingestor
139
143
  ```
140
144
 
145
+ # USAGE
146
+
147
+ ## For transcriptions
148
+
149
+ ----- TODO ---
150
+ You can provide number of retries and a transcription quality threshold
151
+
141
152
  ## License
142
153
 
143
154
  This project is licensed under the Apache License - see the LICENSE file for details.
@@ -122,6 +122,13 @@ Finally
122
122
  poetry publish -r tbbcmegaingestor
123
123
  ```
124
124
 
125
+ # USAGE
126
+
127
+ ## For transcriptions
128
+
129
+ ----- TODO ---
130
+ You can provide number of retries and a transcription quality threshold
131
+
125
132
  ## License
126
133
 
127
134
  This project is licensed under the Apache License - see the LICENSE file for details.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wizit_context_ingestor"
3
- version = "0.2.5-beta-3"
3
+ version = "0.4.1"
4
4
  description = "Contextual Rag with Cloud Solutions"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -11,9 +11,13 @@ dependencies = [
11
11
  "langchain-chroma>=0.2.6",
12
12
  "langchain-experimental>=0.3.4",
13
13
  "langchain-google-vertexai>=2.0.28",
14
+ "langchain-postgres>=0.0.16",
14
15
  "langchain-redis>=0.2.3",
16
+ "langgraph>=0.6.8",
15
17
  "pillow>=11.3.0",
18
+ "psycopg2-binary>=2.9.11",
16
19
  "pymupdf>=1.26.4",
20
+ "sqlalchemy[asyncio]>=2.0.43",
17
21
  ]
18
22
 
19
23
  [dependency-groups]
@@ -0,0 +1,4 @@
1
+ from .main import TranscriptionManager
2
+ from .main_chunks import ChunksManager
3
+
4
+ __all__ = ["ChunksManager", "TranscriptionManager"]
@@ -0,0 +1,145 @@
1
+ import asyncio
2
+ import logging
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from langchain_core.documents import Document
6
+ from langchain_core.messages.human import HumanMessage
7
+ from langchain_core.output_parsers.pydantic import PydanticOutputParser
8
+ from langchain_core.prompts import ChatPromptTemplate
9
+
10
+ from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
11
+ from ..workflows.context_workflow import ContextWorkflow
12
+ from .interfaces import (
13
+ AiApplicationService,
14
+ EmbeddingsManager,
15
+ PersistenceService,
16
+ RagChunker,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class ContextChunksInDocumentService:
23
+ """
24
+ Service for chunking documents.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ ai_application_service: AiApplicationService,
30
+ persistence_service: PersistenceService,
31
+ rag_chunker: RagChunker,
32
+ embeddings_manager: EmbeddingsManager,
33
+ target_language: str = "es",
34
+ ):
35
+ """
36
+ Initialize the ChunkerService.
37
+ """
38
+ self.ai_application_service = ai_application_service
39
+ self.persistence_service = persistence_service
40
+ self.rag_chunker = rag_chunker
41
+ self.embeddings_manager = embeddings_manager
42
+ self.target_language = target_language
43
+ # self.embeddings_manager.init_vector_store()
44
+ self.chat_model = self.ai_application_service.load_chat_model()
45
+ # TODO
46
+ self.context_additional_instructions = ""
47
+ self.metadata_source = "source"
48
+
49
+ async def _retrieve_context_chunk_in_document_with_workflow(
50
+ self,
51
+ workflow,
52
+ markdown_content: str,
53
+ chunk: Document,
54
+ chunk_metadata: Optional[Dict[str, Any]] = None,
55
+ ) -> Document:
56
+ """Retrieve context chunks in document."""
57
+ try:
58
+ result = await workflow.ainvoke(
59
+ {
60
+ "messages": [
61
+ HumanMessage(
62
+ content=[
63
+ {
64
+ "type": "text",
65
+ "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
66
+ },
67
+ ]
68
+ )
69
+ ],
70
+ "document_content": markdown_content,
71
+ },
72
+ {
73
+ "configurable": {
74
+ "transcription_accuracy_threshold": 0.95,
75
+ "max_transcription_retries": 2,
76
+ }
77
+ },
78
+ )
79
+ chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
80
+ # INFO: prevent context in metadata because it's already included in the chunk content, also generates issues when text is long
81
+ # chunk.metadata["context"] = result["context"]
82
+ if chunk_metadata:
83
+ for key, value in chunk_metadata.items():
84
+ chunk.metadata[key] = value
85
+ return chunk
86
+ except Exception as e:
87
+ logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
88
+ raise
89
+
90
+ async def retrieve_context_chunks_in_document_with_workflow(
91
+ self,
92
+ markdown_content: str,
93
+ chunks: List[Document],
94
+ chunks_metadata: Optional[Dict[str, Any]] = None,
95
+ ) -> List[Document]:
96
+ """Retrieve context chunks in document."""
97
+ try:
98
+ context_workflow = ContextWorkflow(
99
+ self.chat_model, self.context_additional_instructions
100
+ )
101
+ compiled_context_workflow = context_workflow.gen_workflow()
102
+ compiled_context_workflow = compiled_context_workflow.compile()
103
+ context_chunks_workflow_invocations = list(
104
+ map(
105
+ lambda chunk: self._retrieve_context_chunk_in_document_with_workflow(
106
+ compiled_context_workflow,
107
+ markdown_content,
108
+ chunk,
109
+ chunks_metadata,
110
+ ),
111
+ chunks,
112
+ )
113
+ )
114
+ context_chunks = await asyncio.gather(*context_chunks_workflow_invocations)
115
+ return context_chunks
116
+ except Exception as e:
117
+ logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
118
+ raise
119
+
120
+ async def get_context_chunks_in_document(self, file_key: str, file_tags: dict = {}):
121
+ """
122
+ Get the context chunks in a document.
123
+ """
124
+ try:
125
+ markdown_content = self.persistence_service.load_markdown_file_content(
126
+ file_key
127
+ )
128
+ langchain_rag_document = Document(
129
+ id=file_key,
130
+ page_content=markdown_content,
131
+ metadata={self.metadata_source: file_key},
132
+ )
133
+ logger.info(f"Document loaded:{file_key}")
134
+ chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
135
+ logger.info(f"Chunks generated:{len(chunks)}")
136
+ context_chunks = (
137
+ await self.retrieve_context_chunks_in_document_with_workflow(
138
+ markdown_content, chunks, file_tags
139
+ )
140
+ )
141
+ logger.info(f"Context chunks generated:{len(context_chunks)}")
142
+ return context_chunks
143
+ except Exception as e:
144
+ logger.error(f"Error: {str(e)}")
145
+ raise e
@@ -1,13 +1,19 @@
1
1
  """
2
2
  Application interfaces defining application layer contracts.
3
3
  """
4
+
4
5
  from abc import ABC, abstractmethod
5
- from ..domain.models import ParsedDocPage, ParsedDoc
6
- from typing import List, Union, Optional
7
- from langchain_core.documents import Document
6
+ from typing import List, Optional, Union
7
+
8
+ from langchain.indexes import IndexingResult, SQLRecordManager
8
9
  from langchain_aws import ChatBedrockConverse
10
+ from langchain_core.documents import Document
9
11
  from langchain_google_vertexai import ChatVertexAI
10
12
  from langchain_google_vertexai.model_garden import ChatAnthropicVertex
13
+ from langchain_postgres import PGVectorStore
14
+
15
+ from ..domain.models import ParsedDoc, ParsedDocPage
16
+
11
17
 
12
18
  class TranscriptionService(ABC):
13
19
  """Interface for transcription services."""
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
17
23
  """Parse a document page."""
18
24
  pass
19
25
 
26
+
20
27
  class AiApplicationService(ABC):
21
28
  """Interface for AI application services."""
22
29
 
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
26
33
  # pass
27
34
 
28
35
  @abstractmethod
29
- def load_chat_model(self, **kwargs) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
36
+ def load_chat_model(
37
+ self, **kwargs
38
+ ) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
30
39
  """Load a chat model."""
31
40
  pass
32
41
 
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
40
49
  """Interface for persistence services."""
41
50
 
42
51
  @abstractmethod
43
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
52
+ def save_parsed_document(
53
+ self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
54
+ ):
44
55
  """Save a parsed document."""
45
56
  pass
46
57
 
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
70
81
  @abstractmethod
71
82
  def configure_vector_store(
72
83
  self,
73
- table_name: str = "langchain_pg_embedding",
74
- vector_size: int = 768,
75
- content_column: str = "document",
76
- id_column: str = "id",
77
- metadata_json_column: str = "cmetadata",
78
- pg_record_manager: str = "postgres/langchain_pg_collection"
79
84
  ):
80
85
  """Configure the vector store."""
81
86
  pass
82
87
 
88
+ # @abstractmethod
89
+ # async def init_vector_store(
90
+ # self,
91
+ # table_name: str = "tenant_embeddings",
92
+ # content_column: str = "document",
93
+ # metadata_json_column: str = "cmetadata",
94
+ # id_column: str = "id",
95
+ # ):
96
+ # """Initialize the vector store."""
97
+ # pass
98
+
83
99
  @abstractmethod
84
- def init_vector_store(
100
+ def retrieve_vector_store(
85
101
  self,
86
- table_name: str = "langchain_pg_embedding",
87
- content_column: str = "document",
88
- metadata_json_column: str = "cmetadata",
89
- id_column: str = "id",
90
- ):
91
- """Initialize the vector store."""
102
+ ) -> tuple[PGVectorStore, SQLRecordManager]:
103
+ """Retrieve the vector store."""
92
104
  pass
93
105
 
94
106
  @abstractmethod
95
- def index_documents(self, documents: list[Document]):
107
+ def index_documents(
108
+ self,
109
+ docs: list[Document],
110
+ ) -> IndexingResult:
96
111
  """Index documents."""
97
112
  pass
98
113
 
99
114
  @abstractmethod
100
- def get_documents_keys_by_source_id(self, source_id: str):
101
- """Get documents keys by source ID."""
115
+ def search_records(
116
+ self,
117
+ query: str,
118
+ ) -> list[Document]:
119
+ """Search documents."""
102
120
  pass
103
121
 
104
122
  @abstractmethod
105
- def delete_documents_by_source_id(self, source_id: str):
106
- """Delete documents by source ID."""
123
+ def create_index(
124
+ self,
125
+ ):
107
126
  pass
127
+
128
+ # @abstractmethod
129
+ # def get_documents_keys_by_source_id(self, source_id: str):
130
+ # """Get documents keys by source ID."""
131
+ # pass
132
+
133
+ # @abstractmethod
134
+ # def delete_documents_by_source_id(self, source_id: str):
135
+ # """Delete documents by source ID."""
136
+ # pass
@@ -0,0 +1,59 @@
1
+ import logging
2
+
3
+ from langchain.indexes import SQLRecordManager
4
+ from langchain_core.documents import Document
5
+ from langchain_postgres import PGVectorStore
6
+
7
+ from .interfaces import (
8
+ EmbeddingsManager,
9
+ RagChunker,
10
+ )
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class KdbService:
16
+ """
17
+ Service for chunking documents.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ embeddings_manager: EmbeddingsManager,
23
+ ):
24
+ """
25
+ Initialize the ChunkerService.
26
+ """
27
+ self.embeddings_manager = embeddings_manager
28
+ self._vector_store = None
29
+ self._records_manager = None
30
+
31
+ def configure_kdb(self):
32
+ try:
33
+ self.embeddings_manager.configure_vector_store()
34
+ except Exception as e:
35
+ raise Exception(f"Error configuring KDB: {e}")
36
+
37
+ def create_vector_store_hsnw_index(self):
38
+ try:
39
+ self.embeddings_manager.create_index()
40
+ except Exception as e:
41
+ logger.error(f"Error creating vector store index: {e}")
42
+ raise Exception(f"Error creating vector store index: {e}")
43
+
44
+ def search(self, query: str) -> list[Document]:
45
+ try:
46
+ records = []
47
+ records = self.embeddings_manager.search_records(query)
48
+ print(records)
49
+ return records
50
+ except Exception as e:
51
+ logger.error(f"Error indexing documents: {e}")
52
+ raise Exception(f"Error indexing documents: {e}")
53
+
54
+ def index_documents_in_vector_store(self, documents: list[Document]) -> None:
55
+ try:
56
+ self.embeddings_manager.index_documents(documents)
57
+ except Exception as e:
58
+ logger.error(f"Error indexing documents: {e}")
59
+ raise Exception(f"Error indexing documents: {e}")
@@ -0,0 +1,209 @@
1
+ import asyncio
2
+ from typing import Tuple, List, Dict, Optional
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers.pydantic import PydanticOutputParser
5
+ from langchain_core.messages import HumanMessage
6
+ from logging import getLogger
7
+ from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
8
+ from ..domain.models import ParsedDoc, ParsedDocPage
9
+ from ..domain.services import ParseDocModelService
10
+ from .interfaces import AiApplicationService, PersistenceService
11
+ from ..workflows.transcription_workflow import TranscriptionWorkflow
12
+
13
+ logger = getLogger(__name__)
14
+
15
+
16
+ class TranscriptionService:
17
+ """
18
+ Service for transcribing documents.
19
+ """
20
+
21
+ def __init__(
22
+ self,
23
+ ai_application_service: AiApplicationService,
24
+ persistence_service: PersistenceService,
25
+ target_language: str = "es",
26
+ transcription_additional_instructions: str = "",
27
+ transcription_accuracy_threshold: float = 0.90,
28
+ max_transcription_retries: int = 2,
29
+ ):
30
+ self.ai_application_service = ai_application_service
31
+ self.persistence_service = persistence_service
32
+ self.target_language = target_language
33
+ if (
34
+ transcription_accuracy_threshold < 0.0
35
+ or transcription_accuracy_threshold > 0.95
36
+ ):
37
+ raise ValueError(
38
+ "transcription_accuracy_threshold must be between 0 and 95"
39
+ )
40
+ if max_transcription_retries < 1 or max_transcription_retries > 3:
41
+ raise ValueError(
42
+ "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
43
+ )
44
+ self.transcription_accuracy_threshold = transcription_accuracy_threshold
45
+ self.max_transcription_retries = max_transcription_retries
46
+ self.transcription_additional_instructions = (
47
+ transcription_additional_instructions
48
+ )
49
+ self.chat_model = self.ai_application_service.load_chat_model()
50
+ self.transcription_workflow = TranscriptionWorkflow(
51
+ self.chat_model, self.transcription_additional_instructions
52
+ )
53
+ self.compiled_transcription_workflow = (
54
+ self.transcription_workflow.gen_workflow()
55
+ )
56
+ self.compiled_transcription_workflow = (
57
+ self.compiled_transcription_workflow.compile()
58
+ )
59
+
60
+ # def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
61
+ # """Transcribe an image to text.
62
+ # Args:
63
+ # document: The document with the image to transcribe
64
+ # Returns:
65
+ # Processed text
66
+ # """
67
+ # try:
68
+ # # Create the prompt template with image
69
+ # transcription_output_parser = PydanticOutputParser(
70
+ # pydantic_object=Transcription
71
+ # )
72
+ # prompt = ChatPromptTemplate.from_messages(
73
+ # [
74
+ # ("system", IMAGE_TRANSCRIPTION_SYSTEM_PROMPT),
75
+ # (
76
+ # "human",
77
+ # [
78
+ # {
79
+ # "type": "image",
80
+ # "image_url": {
81
+ # "url": f"data:image/png;base64,{document.page_base64}"
82
+ # },
83
+ # },
84
+ # {
85
+ # "type": "text",
86
+ # "text": "Transcribe the document, ensure all content transcribed accurately",
87
+ # },
88
+ # ],
89
+ # ),
90
+ # ]
91
+ # ).partial(
92
+ # transcription_additional_instructions=self.transcription_additional_instructions,
93
+ # format_instructions=transcription_output_parser.get_format_instructions(),
94
+ # )
95
+ # model_with_structured_output = self.chat_model.with_structured_output(
96
+ # Transcription
97
+ # )
98
+ # # Create the chain
99
+ # chain = prompt | model_with_structured_output
100
+ # # Process the image
101
+ # chain = chain.with_retry(
102
+ # stop_after_attempt=3, exponential_jitter_params={"initial": 60}
103
+ # )
104
+ # result = chain.invoke({})
105
+ # if result.transcription:
106
+ # document.page_text = result.transcription
107
+ # else:
108
+ # raise ValueError("No transcription found")
109
+ # return document
110
+ # except Exception as e:
111
+ # logger.error(f"Failed to parse document page: {str(e)}")
112
+ # raise
113
+
114
+ async def parse_doc_page_with_workflow(
115
+ self, document: ParsedDocPage, retries: int = 0
116
+ ) -> ParsedDocPage:
117
+ """Transcribe an image to text using an agent.
118
+ Args:
119
+ document: The document with the image to transcribe
120
+ Returns:
121
+ Processed text
122
+ """
123
+ if retries > 1:
124
+ logger.info("Max retries exceeded")
125
+ return document
126
+ result = await self.compiled_transcription_workflow.ainvoke(
127
+ {
128
+ "messages": [
129
+ HumanMessage(
130
+ content=[
131
+ {
132
+ "type": "text",
133
+ "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
134
+ },
135
+ ]
136
+ ),
137
+ HumanMessage(
138
+ content=[
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {
142
+ "url": f"data:image/png;base64,{document.page_base64}"
143
+ },
144
+ }
145
+ ]
146
+ ),
147
+ ]
148
+ },
149
+ {
150
+ "configurable": {
151
+ "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
152
+ "max_transcription_retries": self.max_transcription_retries,
153
+ }
154
+ },
155
+ )
156
+ if "transcription" in result:
157
+ document.page_text = result["transcription"]
158
+ else:
159
+ return await self.parse_doc_page_with_workflow(
160
+ document, retries=retries + 1
161
+ )
162
+ return document
163
+
164
+ # def process_document(self, file_key: str) -> Tuple[List[ParsedDocPage], ParsedDoc]:
165
+ # """
166
+ # Process a document by parsing it and returning the parsed content.
167
+ # """
168
+ # raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
169
+ # parse_doc_model_service = ParseDocModelService(raw_file_path)
170
+ # document_pages = parse_doc_model_service.parse_document_to_base64()
171
+ # parsed_pages = []
172
+ # for page in document_pages:
173
+ # page = self.parse_doc_page_with_workflow(page)
174
+ # parsed_pages.append(page)
175
+ # logger.info(f"Parsed {len(parsed_pages)} pages")
176
+ # parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
177
+ # return parsed_pages, parsed_document
178
+
179
+ async def process_document(
180
+ self, file_key: str
181
+ ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
182
+ """
183
+ Process a document by parsing it and returning the parsed content.
184
+ """
185
+ raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
186
+ parse_doc_model_service = ParseDocModelService(raw_file_path)
187
+ document_pages = parse_doc_model_service.parse_document_to_base64()
188
+ parse_pages_workflow_tasks = []
189
+ parsed_pages = []
190
+ for page in document_pages:
191
+ parse_pages_workflow_tasks.append(self.parse_doc_page_with_workflow(page))
192
+ # here
193
+ parsed_pages = await asyncio.gather(*parse_pages_workflow_tasks)
194
+ logger.info(f"Parsed {len(parsed_pages)} pages")
195
+ parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
196
+ return parsed_pages, parsed_document
197
+
198
+ def save_parsed_document(
199
+ self,
200
+ file_key: str,
201
+ parsed_document: ParsedDoc,
202
+ file_tags: Optional[Dict[str, str]] = {},
203
+ ):
204
+ """
205
+ Save the parsed document to a file.
206
+ """
207
+ self.persistence_service.save_parsed_document(
208
+ file_key, parsed_document, file_tags
209
+ )
@@ -0,0 +1,13 @@
1
+ from enum import Enum
2
+ from typing import Literal
3
+
4
+
5
+ class KdbServices(str, Enum):
6
+ REDIS = "redis"
7
+ CHROMA = "chroma"
8
+ PG = "pg"
9
+
10
+
11
+ kdb_services = Literal[
12
+ KdbServices.REDIS.value, KdbServices.CHROMA.value, KdbServices.PG.value
13
+ ]