wizit-context-ingestor 0.2.5b3__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/PKG-INFO +12 -1
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/README.md +7 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/pyproject.toml +5 -1
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/__init__.py +4 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/context_chunk_service.py +145 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/application/interfaces.py +52 -23
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/kdb_service.py +59 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/application/transcription_service.py +209 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/kdb.py +13 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/prompts.py +293 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/data/storage.py +10 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/services.py +6 -11
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +37 -33
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +201 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +8 -1
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +3 -4
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/main.py +283 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/main_chunks.py +173 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/utils/file_utils.py +12 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor-0.4.1/src/wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/__init__.py +0 -3
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/context_chunk_service.py +0 -114
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/application/transcription_service.py +0 -98
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/data/prompts.py +0 -148
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -208
- wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/main.py +0 -196
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/application/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/data/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/domain/models.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/__init__.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/chunks.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
- {wizit_context_ingestor-0.2.5b3 → wizit_context_ingestor-0.4.1}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: wizit-context-ingestor
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Contextual Rag with Cloud Solutions
|
|
5
5
|
Requires-Dist: anthropic[vertex]>=0.66.0
|
|
6
6
|
Requires-Dist: boto3>=1.40.23
|
|
@@ -8,9 +8,13 @@ Requires-Dist: langchain-aws>=0.2.31
|
|
|
8
8
|
Requires-Dist: langchain-chroma>=0.2.6
|
|
9
9
|
Requires-Dist: langchain-experimental>=0.3.4
|
|
10
10
|
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
11
|
+
Requires-Dist: langchain-postgres>=0.0.16
|
|
11
12
|
Requires-Dist: langchain-redis>=0.2.3
|
|
13
|
+
Requires-Dist: langgraph>=0.6.8
|
|
12
14
|
Requires-Dist: pillow>=11.3.0
|
|
15
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
13
16
|
Requires-Dist: pymupdf>=1.26.4
|
|
17
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.43
|
|
14
18
|
Requires-Python: >=3.12
|
|
15
19
|
Description-Content-Type: text/markdown
|
|
16
20
|
|
|
@@ -138,6 +142,13 @@ Finally
|
|
|
138
142
|
poetry publish -r tbbcmegaingestor
|
|
139
143
|
```
|
|
140
144
|
|
|
145
|
+
# USAGE
|
|
146
|
+
|
|
147
|
+
## For transcriptions
|
|
148
|
+
|
|
149
|
+
----- TODO ---
|
|
150
|
+
You can provide the number of retries and a transcription quality threshold.
|
|
151
|
+
|
|
141
152
|
## License
|
|
142
153
|
|
|
143
154
|
This project is licensed under the Apache License - see the LICENSE file for details.
|
|
@@ -122,6 +122,13 @@ Finally
|
|
|
122
122
|
poetry publish -r tbbcmegaingestor
|
|
123
123
|
```
|
|
124
124
|
|
|
125
|
+
# USAGE
|
|
126
|
+
|
|
127
|
+
## For transcriptions
|
|
128
|
+
|
|
129
|
+
----- TODO ---
|
|
130
|
+
You can provide the number of retries and a transcription quality threshold.
|
|
131
|
+
|
|
125
132
|
## License
|
|
126
133
|
|
|
127
134
|
This project is licensed under the Apache License - see the LICENSE file for details.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "wizit_context_ingestor"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.4.1"
|
|
4
4
|
description = "Contextual Rag with Cloud Solutions"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -11,9 +11,13 @@ dependencies = [
|
|
|
11
11
|
"langchain-chroma>=0.2.6",
|
|
12
12
|
"langchain-experimental>=0.3.4",
|
|
13
13
|
"langchain-google-vertexai>=2.0.28",
|
|
14
|
+
"langchain-postgres>=0.0.16",
|
|
14
15
|
"langchain-redis>=0.2.3",
|
|
16
|
+
"langgraph>=0.6.8",
|
|
15
17
|
"pillow>=11.3.0",
|
|
18
|
+
"psycopg2-binary>=2.9.11",
|
|
16
19
|
"pymupdf>=1.26.4",
|
|
20
|
+
"sqlalchemy[asyncio]>=2.0.43",
|
|
17
21
|
]
|
|
18
22
|
|
|
19
23
|
[dependency-groups]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from langchain_core.messages.human import HumanMessage
|
|
7
|
+
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
|
8
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
9
|
+
|
|
10
|
+
from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
|
|
11
|
+
from ..workflows.context_workflow import ContextWorkflow
|
|
12
|
+
from .interfaces import (
|
|
13
|
+
AiApplicationService,
|
|
14
|
+
EmbeddingsManager,
|
|
15
|
+
PersistenceService,
|
|
16
|
+
RagChunker,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ContextChunksInDocumentService:
    """
    Build context-enriched chunks for a markdown document.

    The service loads a markdown file through the persistence service, splits
    it with the RAG chunker and, for every chunk, invokes a ``ContextWorkflow``
    to obtain a context string that is prepended to the chunk content.
    """

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        rag_chunker: RagChunker,
        embeddings_manager: EmbeddingsManager,
        target_language: str = "es",
    ):
        """
        Store the collaborators and load the chat model.

        Args:
            ai_application_service: Provider used to load the chat model.
            persistence_service: Source of the markdown file content.
            rag_chunker: Splitter that turns a document into chunks.
            embeddings_manager: Kept on the instance; not used by the methods
                of this class.
            target_language: Kept on the instance; not used by the methods of
                this class.
        """
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.rag_chunker = rag_chunker
        self.embeddings_manager = embeddings_manager
        self.target_language = target_language
        self.chat_model = self.ai_application_service.load_chat_model()
        # TODO: not configurable yet, always an empty string.
        self.context_additional_instructions = ""
        # Metadata key under which the originating file key is stored.
        self.metadata_source = "source"

    async def _retrieve_context_chunk_in_document_with_workflow(
        self,
        workflow,
        markdown_content: str,
        chunk: Document,
        chunk_metadata: Optional[Dict[str, Any]] = None,
    ) -> Document:
        """
        Enrich a single chunk with the context produced by ``workflow``.

        The chunk is modified in place: its ``page_content`` is wrapped in
        ``<context>``/``<content>`` tags and ``chunk_metadata`` (if any) is
        merged into its metadata.

        Args:
            workflow: Compiled workflow exposing ``ainvoke``.
            markdown_content: Full document text, passed as ``document_content``.
            chunk: Chunk to enrich.
            chunk_metadata: Extra metadata entries copied onto the chunk.

        Returns:
            The same ``chunk`` object, enriched.

        Raises:
            Exception: Any error raised by the workflow (or a missing
                ``context`` key in its result) is logged and re-raised.
        """
        try:
            workflow_input = {
                "messages": [
                    HumanMessage(
                        content=[
                            {
                                "type": "text",
                                "text": f"Retrieve a complete context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated with the same document's language.",
                            },
                        ]
                    )
                ],
                "document_content": markdown_content,
            }
            # NOTE(review): these keys are named after transcription settings;
            # confirm the context workflow actually reads them.
            workflow_config = {
                "configurable": {
                    "transcription_accuracy_threshold": 0.95,
                    "max_transcription_retries": 2,
                }
            }
            result = await workflow.ainvoke(workflow_input, workflow_config)
            chunk.page_content = f"<context>\n{result['context']}\n</context>\n <content>\n{chunk.page_content}\n</content>"
            # INFO: the context is deliberately not duplicated into the
            # metadata; it is already part of the chunk content and long
            # texts cause issues there.
            if chunk_metadata:
                chunk.metadata.update(chunk_metadata)
            return chunk
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def retrieve_context_chunks_in_document_with_workflow(
        self,
        markdown_content: str,
        chunks: List[Document],
        chunks_metadata: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """
        Enrich every chunk of a document with its context.

        A single ``ContextWorkflow`` is compiled and shared by all chunks; the
        per-chunk invocations are awaited together with ``asyncio.gather``.

        Args:
            markdown_content: Full document text.
            chunks: Chunks to enrich (modified in place).
            chunks_metadata: Metadata entries copied onto every chunk.

        Returns:
            The enriched chunks, in the same order as ``chunks``.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            context_workflow = ContextWorkflow(
                self.chat_model, self.context_additional_instructions
            )
            compiled_context_workflow = context_workflow.gen_workflow().compile()
            pending_invocations = [
                self._retrieve_context_chunk_in_document_with_workflow(
                    compiled_context_workflow,
                    markdown_content,
                    chunk,
                    chunks_metadata,
                )
                for chunk in chunks
            ]
            context_chunks = await asyncio.gather(*pending_invocations)
            return context_chunks
        except Exception as e:
            logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
            raise

    async def get_context_chunks_in_document(
        self, file_key: str, file_tags: Optional[dict] = None
    ):
        """
        Load a markdown file, chunk it and return context-enriched chunks.

        Args:
            file_key: Key of the markdown file in the persistence service; it
                is also used as the document id and as the ``source`` metadata.
            file_tags: Optional metadata copied onto every chunk. Defaults to
                ``None`` (no extra metadata) instead of a shared mutable dict.

        Returns:
            The enriched chunks.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            markdown_content = self.persistence_service.load_markdown_file_content(
                file_key
            )
            langchain_rag_document = Document(
                id=file_key,
                page_content=markdown_content,
                metadata={self.metadata_source: file_key},
            )
            logger.info(f"Document loaded:{file_key}")
            chunks = self.rag_chunker.gen_chunks_for_document(langchain_rag_document)
            logger.info(f"Chunks generated:{len(chunks)}")
            context_chunks = (
                await self.retrieve_context_chunks_in_document_with_workflow(
                    markdown_content, chunks, file_tags
                )
            )
            logger.info(f"Context chunks generated:{len(context_chunks)}")
            return context_chunks
        except Exception as e:
            logger.error(f"Error: {str(e)}")
            raise
|
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Application interfaces defining application layer contracts.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
from abc import ABC, abstractmethod
|
|
5
|
-
from
|
|
6
|
-
|
|
7
|
-
from
|
|
6
|
+
from typing import List, Optional, Union
|
|
7
|
+
|
|
8
|
+
from langchain.indexes import IndexingResult, SQLRecordManager
|
|
8
9
|
from langchain_aws import ChatBedrockConverse
|
|
10
|
+
from langchain_core.documents import Document
|
|
9
11
|
from langchain_google_vertexai import ChatVertexAI
|
|
10
12
|
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
|
|
13
|
+
from langchain_postgres import PGVectorStore
|
|
14
|
+
|
|
15
|
+
from ..domain.models import ParsedDoc, ParsedDocPage
|
|
16
|
+
|
|
11
17
|
|
|
12
18
|
class TranscriptionService(ABC):
|
|
13
19
|
"""Interface for transcription services."""
|
|
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
|
|
|
17
23
|
"""Parse a document page."""
|
|
18
24
|
pass
|
|
19
25
|
|
|
26
|
+
|
|
20
27
|
class AiApplicationService(ABC):
|
|
21
28
|
"""Interface for AI application services."""
|
|
22
29
|
|
|
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
|
|
|
26
33
|
# pass
|
|
27
34
|
|
|
28
35
|
@abstractmethod
|
|
29
|
-
def load_chat_model(
|
|
36
|
+
def load_chat_model(
|
|
37
|
+
self, **kwargs
|
|
38
|
+
) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
|
|
30
39
|
"""Load a chat model."""
|
|
31
40
|
pass
|
|
32
41
|
|
|
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
|
|
|
40
49
|
"""Interface for persistence services."""
|
|
41
50
|
|
|
42
51
|
@abstractmethod
|
|
43
|
-
def save_parsed_document(
|
|
52
|
+
def save_parsed_document(
|
|
53
|
+
self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
|
|
54
|
+
):
|
|
44
55
|
"""Save a parsed document."""
|
|
45
56
|
pass
|
|
46
57
|
|
|
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
|
|
|
70
81
|
@abstractmethod
|
|
71
82
|
def configure_vector_store(
|
|
72
83
|
self,
|
|
73
|
-
table_name: str = "langchain_pg_embedding",
|
|
74
|
-
vector_size: int = 768,
|
|
75
|
-
content_column: str = "document",
|
|
76
|
-
id_column: str = "id",
|
|
77
|
-
metadata_json_column: str = "cmetadata",
|
|
78
|
-
pg_record_manager: str = "postgres/langchain_pg_collection"
|
|
79
84
|
):
|
|
80
85
|
"""Configure the vector store."""
|
|
81
86
|
pass
|
|
82
87
|
|
|
88
|
+
# @abstractmethod
|
|
89
|
+
# async def init_vector_store(
|
|
90
|
+
# self,
|
|
91
|
+
# table_name: str = "tenant_embeddings",
|
|
92
|
+
# content_column: str = "document",
|
|
93
|
+
# metadata_json_column: str = "cmetadata",
|
|
94
|
+
# id_column: str = "id",
|
|
95
|
+
# ):
|
|
96
|
+
# """Initialize the vector store."""
|
|
97
|
+
# pass
|
|
98
|
+
|
|
83
99
|
@abstractmethod
|
|
84
|
-
def
|
|
100
|
+
def retrieve_vector_store(
|
|
85
101
|
self,
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
metadata_json_column: str = "cmetadata",
|
|
89
|
-
id_column: str = "id",
|
|
90
|
-
):
|
|
91
|
-
"""Initialize the vector store."""
|
|
102
|
+
) -> tuple[PGVectorStore, SQLRecordManager]:
|
|
103
|
+
"""Retrieve the vector store."""
|
|
92
104
|
pass
|
|
93
105
|
|
|
94
106
|
@abstractmethod
|
|
95
|
-
def index_documents(
|
|
107
|
+
def index_documents(
|
|
108
|
+
self,
|
|
109
|
+
docs: list[Document],
|
|
110
|
+
) -> IndexingResult:
|
|
96
111
|
"""Index documents."""
|
|
97
112
|
pass
|
|
98
113
|
|
|
99
114
|
@abstractmethod
|
|
100
|
-
def
|
|
101
|
-
|
|
115
|
+
def search_records(
|
|
116
|
+
self,
|
|
117
|
+
query: str,
|
|
118
|
+
) -> list[Document]:
|
|
119
|
+
"""Search documents."""
|
|
102
120
|
pass
|
|
103
121
|
|
|
104
122
|
@abstractmethod
|
|
105
|
-
def
|
|
106
|
-
|
|
123
|
+
def create_index(
|
|
124
|
+
self,
|
|
125
|
+
):
|
|
107
126
|
pass
|
|
127
|
+
|
|
128
|
+
# @abstractmethod
|
|
129
|
+
# def get_documents_keys_by_source_id(self, source_id: str):
|
|
130
|
+
# """Get documents keys by source ID."""
|
|
131
|
+
# pass
|
|
132
|
+
|
|
133
|
+
# @abstractmethod
|
|
134
|
+
# def delete_documents_by_source_id(self, source_id: str):
|
|
135
|
+
# """Delete documents by source ID."""
|
|
136
|
+
# pass
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from langchain.indexes import SQLRecordManager
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
from langchain_postgres import PGVectorStore
|
|
6
|
+
|
|
7
|
+
from .interfaces import (
|
|
8
|
+
EmbeddingsManager,
|
|
9
|
+
RagChunker,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class KdbService:
    """
    Thin facade over an ``EmbeddingsManager``.

    Every method delegates to the manager and converts any failure into a
    generic ``Exception`` whose message describes the failed operation; the
    original error is preserved as ``__cause__``.
    """

    def __init__(
        self,
        embeddings_manager: EmbeddingsManager,
    ):
        """
        Initialize the KdbService.

        Args:
            embeddings_manager: Backend that performs the actual vector store
                operations.
        """
        self.embeddings_manager = embeddings_manager
        # Not used by the methods of this class; kept as attributes.
        self._vector_store = None
        self._records_manager = None

    def configure_kdb(self):
        """
        Configure the underlying vector store.

        Raises:
            Exception: If the embeddings manager fails to configure the store.
        """
        try:
            self.embeddings_manager.configure_vector_store()
        except Exception as e:
            logger.error(f"Error configuring KDB: {e}")
            raise Exception(f"Error configuring KDB: {e}") from e

    def create_vector_store_hsnw_index(self):
        """
        Create the vector store index through the embeddings manager.

        Raises:
            Exception: If the index cannot be created.
        """
        try:
            self.embeddings_manager.create_index()
        except Exception as e:
            logger.error(f"Error creating vector store index: {e}")
            raise Exception(f"Error creating vector store index: {e}") from e

    def search(self, query: str) -> list[Document]:
        """
        Search the vector store.

        Args:
            query: Text to search for.

        Returns:
            The records returned by the embeddings manager.

        Raises:
            Exception: If the search fails.
        """
        try:
            return self.embeddings_manager.search_records(query)
        except Exception as e:
            logger.error(f"Error searching records: {e}")
            raise Exception(f"Error searching records: {e}") from e

    def index_documents_in_vector_store(self, documents: list[Document]) -> None:
        """
        Index documents in the vector store.

        Args:
            documents: Documents handed to the embeddings manager.

        Raises:
            Exception: If indexing fails.
        """
        try:
            self.embeddings_manager.index_documents(documents)
        except Exception as e:
            logger.error(f"Error indexing documents: {e}")
            raise Exception(f"Error indexing documents: {e}") from e
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import Tuple, List, Dict, Optional
|
|
3
|
+
from langchain_core.prompts import ChatPromptTemplate
|
|
4
|
+
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
|
5
|
+
from langchain_core.messages import HumanMessage
|
|
6
|
+
from logging import getLogger
|
|
7
|
+
from ..data.prompts import IMAGE_TRANSCRIPTION_SYSTEM_PROMPT, Transcription
|
|
8
|
+
from ..domain.models import ParsedDoc, ParsedDocPage
|
|
9
|
+
from ..domain.services import ParseDocModelService
|
|
10
|
+
from .interfaces import AiApplicationService, PersistenceService
|
|
11
|
+
from ..workflows.transcription_workflow import TranscriptionWorkflow
|
|
12
|
+
|
|
13
|
+
logger = getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TranscriptionService:
    """
    Service for transcribing documents.

    Pages are rendered to base64 images by ``ParseDocModelService`` and sent to
    a compiled ``TranscriptionWorkflow``; the resulting text is stored on each
    page and assembled into a parsed document.
    """

    # Highest value of ``retries`` for which the workflow is still invoked by
    # ``parse_doc_page_with_workflow`` (i.e. at most two invocations per page).
    # NOTE(review): this bound is independent of ``max_transcription_retries``,
    # which is only forwarded to the workflow; confirm that is intended.
    _MAX_WORKFLOW_REINVOCATIONS = 1

    def __init__(
        self,
        ai_application_service: AiApplicationService,
        persistence_service: PersistenceService,
        target_language: str = "es",
        transcription_additional_instructions: str = "",
        transcription_accuracy_threshold: float = 0.90,
        max_transcription_retries: int = 2,
    ):
        """
        Validate the settings, load the chat model and compile the workflow.

        Args:
            ai_application_service: Provider used to load the chat model.
            persistence_service: Storage used to fetch raw files and save
                parsed documents.
            target_language: Kept on the instance; not used by the methods of
                this class.
            transcription_additional_instructions: Passed to the
                ``TranscriptionWorkflow`` constructor.
            transcription_accuracy_threshold: Forwarded to the workflow; must
                be within ``[0.0, 0.95]``.
            max_transcription_retries: Forwarded to the workflow; must be
                within ``[1, 3]``.

        Raises:
            ValueError: If the threshold or the retry count is out of range.
        """
        # Validate before touching the model so bad settings fail fast.
        if (
            transcription_accuracy_threshold < 0.0
            or transcription_accuracy_threshold > 0.95
        ):
            raise ValueError(
                "transcription_accuracy_threshold must be between 0 and 0.95"
            )
        if max_transcription_retries < 1 or max_transcription_retries > 3:
            raise ValueError(
                "max_transcription_retries must be between 1 and 3 to prevent token exhaustion"
            )
        self.ai_application_service = ai_application_service
        self.persistence_service = persistence_service
        self.target_language = target_language
        self.transcription_accuracy_threshold = transcription_accuracy_threshold
        self.max_transcription_retries = max_transcription_retries
        self.transcription_additional_instructions = (
            transcription_additional_instructions
        )
        self.chat_model = self.ai_application_service.load_chat_model()
        self.transcription_workflow = TranscriptionWorkflow(
            self.chat_model, self.transcription_additional_instructions
        )
        self.compiled_transcription_workflow = (
            self.transcription_workflow.gen_workflow().compile()
        )

    async def parse_doc_page_with_workflow(
        self, document: ParsedDocPage, retries: int = 0
    ) -> ParsedDocPage:
        """Transcribe a page image to text using the transcription workflow.

        The workflow is re-invoked while its result lacks a ``transcription``
        key, until the retry bound is exceeded.

        Args:
            document: The page whose ``page_base64`` image is transcribed.
            retries: Number of invocations already spent on this page.

        Returns:
            The same page object; ``page_text`` is set when a transcription was
            produced and left untouched otherwise.
        """
        attempt = retries
        while attempt <= self._MAX_WORKFLOW_REINVOCATIONS:
            result = await self.compiled_transcription_workflow.ainvoke(
                {
                    "messages": [
                        HumanMessage(
                            content=[
                                {
                                    "type": "text",
                                    "text": "Transcribe the document, ensure all content transcribed accurately. transcription must be in the same language of source document.",
                                },
                            ]
                        ),
                        HumanMessage(
                            content=[
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{document.page_base64}"
                                    },
                                }
                            ]
                        ),
                    ]
                },
                {
                    "configurable": {
                        "transcription_accuracy_threshold": self.transcription_accuracy_threshold,
                        "max_transcription_retries": self.max_transcription_retries,
                    }
                },
            )
            if "transcription" in result:
                document.page_text = result["transcription"]
                return document
            attempt += 1
        logger.info("Max retries exceeded")
        return document

    async def process_document(
        self, file_key: str
    ) -> Tuple[List[ParsedDocPage], ParsedDoc]:
        """
        Parse a stored raw file and transcribe all of its pages.

        All page transcriptions are awaited together with ``asyncio.gather``,
        so the returned pages keep the order produced by the parser.

        Args:
            file_key: Key of the raw file in the persistence service.

        Returns:
            A tuple ``(parsed_pages, parsed_document)`` where
            ``parsed_document`` is built from the transcribed pages.
        """
        raw_file_path = self.persistence_service.retrieve_raw_file(file_key)
        parse_doc_model_service = ParseDocModelService(raw_file_path)
        document_pages = parse_doc_model_service.parse_document_to_base64()
        page_tasks = [
            self.parse_doc_page_with_workflow(page) for page in document_pages
        ]
        parsed_pages = await asyncio.gather(*page_tasks)
        logger.info(f"Parsed {len(parsed_pages)} pages")
        parsed_document = parse_doc_model_service.create_md_content(parsed_pages)
        return parsed_pages, parsed_document

    def save_parsed_document(
        self,
        file_key: str,
        parsed_document: ParsedDoc,
        file_tags: Optional[Dict[str, str]] = None,
    ):
        """
        Save the parsed document through the persistence service.

        Args:
            file_key: Key under which the document is saved.
            parsed_document: Document to persist.
            file_tags: Optional tags stored with the document. When omitted, a
                fresh empty dict is passed on, so no dict object is shared
                between calls.
        """
        tags = {} if file_tags is None else file_tags
        self.persistence_service.save_parsed_document(file_key, parsed_document, tags)
|