wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -2
- wizit_context_ingestor/application/context_chunk_service.py +149 -35
- wizit_context_ingestor/application/interfaces.py +1 -1
- wizit_context_ingestor/application/transcription_service.py +132 -49
- wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor/data/prompts.py +156 -2
- wizit_context_ingestor/data/storage.py +10 -0
- wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
- wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor/main.py +160 -105
- wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
- wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
- wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/infra/rag/redis_embeddings.py
CHANGED

@@ -2,6 +2,7 @@ from langchain_core.documents import Document
 from langchain_redis import RedisConfig, RedisVectorStore
 from typing import List
 import logging
+
 # from dotenv import load_dotenv
 from ...application.interfaces import EmbeddingsManager
 
@@ -9,10 +10,13 @@ from ...application.interfaces import EmbeddingsManager
 
 logger = logging.getLogger(__name__)
 
-class RedisEmbeddingsManager(EmbeddingsManager):
 
+class RedisEmbeddingsManager(EmbeddingsManager):
     __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
-
+
+    def __init__(
+        self, embeddings_model, redis_conn_string: str, metadata_tags: List[str] = []
+    ):
         """
         Initialize the RedisEmbeddingsManager.
         Args:
@@ -27,27 +31,23 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         """
        self.redis_conn_string = redis_conn_string
        self.embeddings_model = embeddings_model
-        self.metadata_tags_schema = []
-
+        self.metadata_tags_schema = [{"type": "text", "name": "context"}]
        for tag_key in metadata_tags:
-            self.metadata_tags_schema.append({
-                "type": "tag",
-                "name": tag_key
-            })
+            self.metadata_tags_schema.append({"type": "text", "name": tag_key})
 
        try:
-
-
-
-
-
-
-
-
-
+            self.redis_config = RedisConfig(
+                index_name="vector_store",
+                redis_url=self.redis_conn_string,
+                metadata_schema=self.metadata_tags_schema,
+            )
+            self.vector_store = RedisVectorStore(
+                self.embeddings_model, config=self.redis_config
+            )
+            logger.info("RedisEmbeddingsManager initialized")
        except Exception as e:
-
-
+            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
+            raise
 
    def configure_vector_store(
        self,
@@ -56,7 +56,7 @@ class RedisEmbeddingsManager(EmbeddingsManager):
        content_column: str = "document",
        id_column: str = "id",
        metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
+        pg_record_manager: str = "postgres/langchain_pg_collection",
    ):
        """Configure the vector store."""
        pass
@@ -73,13 +73,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
 
    def vector_store_initialized(func):
        """validate vector store initialization"""
+
        def wrapper(self, *args, **kwargs):
-
-
-
-
-            return wrapper
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            return func(self, *args, **kwargs)
 
+        return wrapper
 
    @vector_store_initialized
    def index_documents(self, docs: List[Document]):
@@ -129,6 +130,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
            logger.error(f"Error deleting documents by ID: {str(e)}")
            raise
 
+    @vector_store_initialized
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        # TODO investigate how to do this
+        pass
+
    def get_documents_keys_by_source_id(self, source_id: str):
        """Get documents keys by source ID."""
        pass
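The fleshed-out `vector_store_initialized` decorator above guards every indexing method behind a single check. A minimal, self-contained sketch of the same guard pattern; the `FakeManager` class and the `functools.wraps` call are illustrative additions, not part of the package:

```python
from functools import wraps


def vector_store_initialized(func):
    """Refuse to run the wrapped method until self.vector_store exists."""
    @wraps(func)  # keeps the wrapped method's name/docstring; the diff omits this
    def wrapper(self, *args, **kwargs):
        if self.vector_store is None:
            raise Exception("Vector store not initialized")
        return func(self, *args, **kwargs)
    return wrapper


class FakeManager:  # hypothetical stand-in for RedisEmbeddingsManager
    def __init__(self, vector_store=None):
        self.vector_store = vector_store

    @vector_store_initialized
    def index_documents(self, docs):
        return len(docs)


print(FakeManager(vector_store=object()).index_documents(["doc"]))  # 1
# FakeManager().index_documents(["doc"])  # raises: Vector store not initialized
```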
wizit_context_ingestor/infra/rag/semantic_chunks.py
CHANGED

@@ -1,3 +1,5 @@
+from posix import fork
+
 # check this documentation
 # https://python.langchain.com/docs/how_to/semantic-chunker/
 # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
@@ -16,7 +18,9 @@ class SemanticChunks(RagChunker):
    Class for semantically chunking documents into smaller pieces based on semantic similarity.
    Uses LangChain's SemanticChunker to create semantically coherent document chunks.
    """
+
    __slots__ = ("embeddings_model",)
+
    def __init__(self, embeddings_model: Any):
        """
        Initialize a document chunker with an embeddings model.
@@ -35,7 +39,7 @@ class SemanticChunks(RagChunker):
            add_start_index=True,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=95,
-            min_chunk_size=200
+            min_chunk_size=200,
        )
 
    def gen_chunks_for_document(self, document: Document) -> List[Document]:
@@ -53,6 +57,10 @@ class SemanticChunks(RagChunker):
        """
        try:
            chunks = self.text_splitter.split_documents([document])
+            source = document.metadata["source"]
+            for i, chunk in enumerate(chunks):
+                if document.metadata["source"]:
+                    chunk.id = f"{source}-{i}"
            logger.info(f"{len(chunks)} chunks generated successfully")
            return chunks
        except Exception as e:
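`gen_chunks_for_document` now assigns each chunk a deterministic ID of the form `<source>-<position>`, which lets a vector store overwrite rather than duplicate chunks when the same document is re-ingested. A small sketch of the scheme, assuming LangChain `Document` objects; the `assign_chunk_ids` helper name is mine:

```python
from langchain_core.documents import Document


def assign_chunk_ids(document: Document, chunks: list) -> list:
    """Give every chunk a stable '<source>-<index>' ID, mirroring the diff."""
    source = document.metadata["source"]  # raises KeyError if 'source' is absent
    for i, chunk in enumerate(chunks):
        if document.metadata["source"]:  # skipped when source is an empty string
            chunk.id = f"{source}-{i}"
    return chunks


doc = Document(page_content="full text", metadata={"source": "manual.pdf"})
chunks = [Document(page_content="part a"), Document(page_content="part b")]
print([c.id for c in assign_chunk_ids(doc, chunks)])  # ['manual.pdf-0', 'manual.pdf-1']
```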
wizit_context_ingestor/infra/vertex_model.py
CHANGED

@@ -15,14 +15,23 @@ class VertexModels(AiApplicationService):
    A wrapper class for Google Cloud Vertex AI models that handles credentials and
    provides methods to load embeddings and chat models.
    """
-
+
+    __slots__ = (
+        "project_id",
+        "location",
+        "json_service_account",
+        "scopes",
+        "llm_model_id",
+    )
+
    def __init__(
-
-
-
-
-
-
+        self,
+        project_id: str,
+        location: str,
+        json_service_account: Dict[str, Any],
+        scopes: Optional[List[str]] = None,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+    ):
        """
        Initialize the VertexModels class with Google Cloud credentials.
 
@@ -36,25 +45,24 @@ class VertexModels(AiApplicationService):
            print(location)
            self.scopes = scopes or ["https://www.googleapis.com/auth/cloud-platform"]
            self.credentials = service_account.Credentials.from_service_account_info(
-                json_service_account,
-                scopes=self.scopes
+                json_service_account, scopes=self.scopes
            )
            self.llm_model_id = llm_model_id
            self.project_id = project_id
            self.location = location
            vertexai_init(
-                project=project_id,
-                location=location,
-                credentials=self.credentials
+                project=project_id, location=location, credentials=self.credentials
+            )
+            logger.info(
+                f"VertexModels initialized with project {project_id} in {location}"
            )
-            logger.info(f"VertexModels initialized with project {project_id} in {location}")
        except Exception as e:
            logger.error(f"Failed to initialize VertexModels: {str(e)}")
            raise
 
    def load_embeddings_model(
-        self,
-
+        self, embeddings_model_id: str = "text-multilingual-embedding-002"
+    ) -> VertexAIEmbeddings:  # noqa: E125
        """
        Load and return a Vertex AI embeddings model.
        default embeddings length is 768 https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings
@@ -73,14 +81,18 @@ class VertexModels(AiApplicationService):
            logger.debug(f"Loaded embedding model: {embeddings_model_id}")
            return embeddings
        except Exception as e:
-            logger.error(
+            logger.error(
+                f"Failed to load embeddings model {embeddings_model_id}: {str(e)}"
+            )
            raise
 
-    def load_chat_model(
+    def load_chat_model(
+        self,
        temperature: float = 0.15,
        max_tokens: int = 8192,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> Union[ChatVertexAI, ChatAnthropicVertex]:
        """
        Load a Vertex AI chat model for text generation.
 
@@ -98,21 +110,35 @@ class VertexModels(AiApplicationService):
        """
        try:
            if "gemini" in self.llm_model_id:
-                return self.load_chat_model_gemini(
+                return self.load_chat_model_gemini(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
            elif "claude" in self.llm_model_id:
-                return self.load_chat_model_anthropic(
+                return self.load_chat_model_anthropic(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
            else:
                raise ValueError(f"Unsupported chat model: {self.llm_model_id}")
        except Exception as e:
            logger.error(f"Failed to retrieve chat model {self.llm_model_id}: {str(e)}")
            raise
 
-    def load_chat_model_gemini(
+    def load_chat_model_gemini(
+        self,
        chat_model_id: str = "publishers/google/models/gemini-2.5-flash",
        temperature: float = 0.15,
-        max_tokens: int =
+        max_tokens: int = 64000,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> ChatVertexAI:
        """
        Load a Vertex AI chat model for text generation.
 
@@ -137,7 +163,7 @@ class VertexModels(AiApplicationService):
            max_tokens=max_tokens,
            max_retries=1,
            stop=stop,
-            **chat_model_params
+            **chat_model_params,
        )
        logger.debug(f"Retrieved chat model: {chat_model_id}")
        return self.llm_model
@@ -145,12 +171,14 @@ class VertexModels(AiApplicationService):
        logger.error(f"Failed to retrieve chat model {chat_model_id}: {str(e)}")
        raise
 
-    def load_chat_model_anthropic(
+    def load_chat_model_anthropic(
+        self,
        chat_model_id: str = "claude-3-5-haiku@20241022",
        temperature: float = 0.7,
-        max_tokens: int =
+        max_tokens: int = 64000,
        stop: Optional[List[str]] = None,
-        **chat_model_params
+        **chat_model_params,
+    ) -> ChatAnthropicVertex:
        """
        Load a Vertex AI chat model for text generation.
        """
@@ -163,7 +191,7 @@ class VertexModels(AiApplicationService):
            max_tokens=max_tokens,
            max_retries=1,
            stop=stop,
-            **chat_model_params
+            **chat_model_params,
        )
        logger.debug(f"Retrieved chat model: {chat_model_id}")
        return self.llm_model
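`load_chat_model` picks the provider by substring-matching `llm_model_id` ("gemini" routes to `ChatVertexAI`, "claude" to `ChatAnthropicVertex`), and the provider-specific loaders now default to `max_tokens=64000`. The routing logic in isolation, with tuples standing in for the real model clients:

```python
from typing import List, Optional


def load_chat_model(llm_model_id: str, temperature: float = 0.15,
                    max_tokens: int = 8192, stop: Optional[List[str]] = None):
    """Dispatch on the model ID, mirroring the diff; tuples replace real clients."""
    if "gemini" in llm_model_id:
        return ("ChatVertexAI", llm_model_id, temperature, max_tokens, stop)
    elif "claude" in llm_model_id:
        return ("ChatAnthropicVertex", llm_model_id, temperature, max_tokens, stop)
    else:
        raise ValueError(f"Unsupported chat model: {llm_model_id}")


print(load_chat_model("claude-sonnet-4@20250514")[0])                   # ChatAnthropicVertex
print(load_chat_model("publishers/google/models/gemini-2.5-flash")[0])  # ChatVertexAI
```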
wizit_context_ingestor/main.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from typing import Dict, Any, Literal
 from .infra.vertex_model import VertexModels
 from .application.transcription_service import TranscriptionService
 from .application.context_chunk_service import ContextChunksInDocumentService
@@ -6,16 +7,76 @@ from .infra.persistence.s3_storage import S3StorageService
 from .infra.persistence.local_storage import LocalStorageService
 from .infra.rag.semantic_chunks import SemanticChunks
 from .infra.rag.redis_embeddings import RedisEmbeddingsManager
+from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
 from .infra.secrets.aws_secrets_manager import AwsSecretsManager
+from .data.storage import storage_services, StorageServices
+from .data.kdb import kdb_services, KdbServices
+from .utils.file_utils import has_invalid_file_name_format
 
-class DeelabTranscribeManager:
 
-
+class KdbManager:
+    def __init__(
+        self, embeddings_model, kdb_service: kdb_services, kdb_params: Dict[Any, Any]
+    ):
+        self.kdb_service = kdb_service
+        self.kdb_params = kdb_params
+        self.embeddings_model = embeddings_model
+
+    def retrieve_kdb_service(self):
+        if self.kdb_service == KdbServices.REDIS.value:
+            return RedisEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        elif self.kdb_service == KdbServices.CHROMA.value:
+            return ChromaEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        else:
+            raise ValueError(f"Unsupported kdb provider: {self.kdb_service}")
+
+
+class PersistenceManager:
+    def __init__(
+        self,
+        storage_service: storage_services,
+        source_storage_route,
+        target_storage_route,
+    ):
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+
+    def retrieve_storage_service(self):
+        if self.storage_service == StorageServices.S3.value:
+            return S3StorageService(
+                origin_bucket_name=self.source_storage_route,
+                target_bucket_name=self.target_storage_route,
+            )
+        elif self.storage_service == StorageServices.LOCAL.value:
+            return LocalStorageService(
+                source_storage_route=self.source_storage_route,
+                target_storage_route=self.target_storage_route,
+            )
+        else:
+            raise ValueError(f"Unsupported storage service: {self.storage_service}")
+
+
+class TranscriptionManager:
+    def __init__(
+        self,
        gcp_project_id: str,
        gcp_project_location: str,
        gcp_secret_name: str,
-
-
+        storage_service: storage_services,
+        source_storage_route: str,
+        target_storage_route: str,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+        target_language: str = "es",
+        transcription_additional_instructions: str = "",
+        transcription_accuracy_threshold: int = 90,
+        max_transcription_retries: int = 2,
    ):
        self.gcp_project_id = gcp_project_id
        self.gcp_project_location = gcp_project_location
@@ -23,6 +84,14 @@ class DeelabTranscribeManager:
        self.gcp_secret_name = gcp_secret_name
        self.llm_model_id = llm_model_id
        self.target_language = target_language
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.transcription_additional_instructions = (
+            transcription_additional_instructions
+        )
+        self.transcription_accuracy_threshold = transcription_accuracy_threshold
+        self.max_transcription_retries = max_transcription_retries
        self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
        self.vertex_model = self._get_vertex_model()
 
@@ -36,50 +105,78 @@ class DeelabTranscribeManager:
            self.gcp_project_id,
            self.gcp_project_location,
            self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
        )
        return vertex_model
 
-    def
-
-
-
-
-
+    def transcribe_document(self, file_key: str):
+        """Transcribe a document from source storage to target storage.
+        This method serves as a generic interface for transcribing documents from
+        various storage sources to target destinations. The specific implementation
+        depends on the storage route types provided.
+
+        Args:
+            file_key (str): The unique identifier or path of the file to be transcribed.
+        Returns:
+            The result of the transcription process, typically the path or identifier
+            of the transcribed document.
+
+        Raises:
+            Exception: If an error occurs during the transcription process.
+        """
        try:
-
-
-
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service,
+                self.source_storage_route,
+                self.target_storage_route,
            )
+            persistence_service = persistence_layer.retrieve_storage_service()
 
            transcribe_document_service = TranscriptionService(
                ai_application_service=self.vertex_model,
-                persistence_service=
-                target_language=self.target_language
+                persistence_service=persistence_service,
+                target_language=self.target_language,
+                transcription_additional_instructions=self.transcription_additional_instructions,
+                transcription_accuracy_threshold=self.transcription_accuracy_threshold,
+                max_transcription_retries=self.max_transcription_retries,
+            )
+            parsed_pages, parsed_document = (
+                transcribe_document_service.process_document(file_key)
+            )
+            source_storage_file_tags = {}
+            if persistence_service.supports_tagging:
+                # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
+                source_storage_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, self.source_storage_route
+                )
+            transcribe_document_service.save_parsed_document(
+                f"{file_key}.md", parsed_document, source_storage_file_tags
            )
-            parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
-            origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)
-            transcribe_document_service.save_parsed_document(f"{file_key}.md", parsed_document, origin_bucket_file_tags)
            # create md document from parsed_pages
            print("parsed_pages", len(parsed_pages))
            # print("parsed_document", parsed_document)
            return f"{file_key}.md"
        except Exception as e:
-            print(f"Error
+            print(f"Error processing document: {e}")
            raise e
 
 
-class
-
+class ChunksManager:
    def __init__(
-
-
-
-
-
-
-
-
+        self,
+        gcp_project_id: str,
+        gcp_project_location: str,
+        gcp_secret_name: str,
+        storage_service: storage_services,
+        kdb_service: Literal["redis", "chroma"],
+        kdb_params: Dict[Any, Any],
+        llm_model_id: str = "claude-3-5-haiku@20241022",
+        embeddings_model_id: str = "text-multilingual-embedding-002",
+        target_language: str = "es",
    ):
        self.gcp_project_id = gcp_project_id
        self.gcp_project_location = gcp_project_location
@@ -88,9 +185,14 @@ class DeelabRedisChunksManager:
        self.llm_model_id = llm_model_id
        self.target_language = target_language
        self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
-        self.
+        self.storage_service = storage_service
+        self.kdb_params = kdb_params
+        self.kdb_service = kdb_service
+        # self.redis_connection_string = redis_connection_string
        self.vertex_model = self._get_vertex_model()
-        self.embeddings_model = self.vertex_model.load_embeddings_model(
+        self.embeddings_model = self.vertex_model.load_embeddings_model(
+            embeddings_model_id
+        )
 
    def _get_gcp_sa_dict(self, gcp_secret_name: str):
        vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
@@ -102,92 +204,45 @@ class DeelabRedisChunksManager:
            self.gcp_project_id,
            self.gcp_project_location,
            self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
        )
        return vertex_model
 
-    def
-        self,
-        file_key: str
+    def gen_context_chunks(
+        self, file_key: str, source_storage_route: str, target_storage_route: str
    ):
        try:
-
-
-
-
-
-
-            }
-            )
-            local_persistence_service = LocalStorageService()
-            context_chunks_in_document_service = ContextChunksInDocumentService(
-                ai_application_service=self.vertex_model,
-                persistence_service=local_persistence_service,
-                rag_chunker=rag_chunker,
-                embeddings_manager=redis_embeddings_manager,
-                target_language=self.target_language
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service, source_storage_route, target_storage_route
            )
-
-
-
-
-
-
-
-            # TODO
-    def context_chunks_in_document_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        try:
-            s3_persistence_service = S3StorageService(
-                origin_bucket_name=s3_origin_bucket_name,
-                target_bucket_name=s3_target_bucket_name
-            )
-            target_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_target_bucket_name)
-
+            persistence_service = persistence_layer.retrieve_storage_service()
+            target_bucket_file_tags = []
+            if persistence_service.supports_tagging:
+                target_bucket_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, target_storage_route
+                )
            rag_chunker = SemanticChunks(self.embeddings_model)
-
-
-                redis_conn_string=self.redis_connection_string,
-                metadata_tags=target_bucket_file_tags
+            kdb_manager = KdbManager(
+                self.embeddings_model, self.kdb_service, self.kdb_params
            )
+            kdb_service = kdb_manager.retrieve_kdb_service()
            context_chunks_in_document_service = ContextChunksInDocumentService(
                ai_application_service=self.vertex_model,
-                persistence_service=
+                persistence_service=persistence_service,
                rag_chunker=rag_chunker,
-                embeddings_manager=
-                target_language=self.target_language
+                embeddings_manager=kdb_service,
+                target_language=self.target_language,
+            )
+            context_chunks = (
+                context_chunks_in_document_service.get_context_chunks_in_document(
+                    file_key, target_bucket_file_tags
+                )
            )
-            context_chunks = context_chunks_in_document_service.get_context_chunks_in_document(file_key, target_bucket_file_tags)
            return context_chunks
        except Exception as e:
            print(f"Error getting context chunks in document: {e}")
            raise e
-
-
-    def delete_document_context_chunks_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        pass
-        # rag_chunker = SemanticChunks(self.embeddings_model)
-        # pg_embeddings_manager = PgEmbeddingsManager(
-        #     embeddings_model=self.embeddings_model,
-        #     pg_connection=self.vector_store_connection
-        # )
-        # s3_persistence_service = S3StorageService(
-        #     origin_bucket_name=s3_origin_bucket_name,
-        #     target_bucket_name=s3_target_bucket_name
-        # )
-        # context_chunks_in_document_service = ContextChunksInDocumentService(
-        #     ai_application_service=self.vertex_model,
-        #     persistence_service=s3_persistence_service,
-        #     rag_chunker=rag_chunker,
-        #     embeddings_manager=pg_embeddings_manager
-        # )
-        # context_chunks_in_document_service.delete_document_context_chunks(file_key)
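The refactor folds the AWS-only entry points into generic managers: callers now pass a `storage_service` / `kdb_service` selector plus a parameter dict, and `PersistenceManager` / `KdbManager` resolve the concrete backend. A self-contained sketch of that selector pattern; the enum values mirror the `Literal["redis", "chroma"]` hint in the diff, and the stand-in return tuples replace the real embeddings managers:

```python
from enum import Enum
from typing import Any, Dict


class KdbServices(Enum):
    # Assumed members, consistent with Literal["redis", "chroma"] in the diff.
    REDIS = "redis"
    CHROMA = "chroma"


class KdbManager:
    def __init__(self, embeddings_model, kdb_service: str, kdb_params: Dict[Any, Any]):
        self.embeddings_model = embeddings_model
        self.kdb_service = kdb_service
        self.kdb_params = kdb_params

    def retrieve_kdb_service(self):
        if self.kdb_service == KdbServices.REDIS.value:
            return ("RedisEmbeddingsManager", self.kdb_params)   # stand-in
        elif self.kdb_service == KdbServices.CHROMA.value:
            return ("ChromaEmbeddingsManager", self.kdb_params)  # stand-in
        else:
            raise ValueError(f"Unsupported kdb provider: {self.kdb_service}")


print(KdbManager(None, "chroma", {"collection_name": "docs"}).retrieve_kdb_service())
```

The same selector pattern backs `PersistenceManager`, which resolves `StorageServices.S3` to `S3StorageService` and `StorageServices.LOCAL` to `LocalStorageService`, so `transcribe_document` and `gen_context_chunks` no longer hard-code S3.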
wizit_context_ingestor/utils/file_utils.py
ADDED

@@ -0,0 +1,13 @@
+import re
+
+
+def has_invalid_file_name_format(file_name):
+    """Check if file name has special characters or spaces instead of underscores"""
+    # Check for spaces
+    if " " in file_name:
+        return True
+
+    # Check for special characters (anything that's not alphanumeric, underscore, dash, or dot)
+    if re.search(r"[^a-zA-Z0-9_.-]", file_name):
+        return True
+    return False
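Both `transcribe_document` and `gen_context_chunks` call this validator before touching storage. Expected behavior, per the regex above (note that `/` is also rejected, so callers should pass bare file names rather than paths):

```python
from wizit_context_ingestor.utils.file_utils import has_invalid_file_name_format

print(has_invalid_file_name_format("informe_q1.pdf"))    # False: letters, digits, _, ., - are allowed
print(has_invalid_file_name_format("my report.pdf"))     # True: contains a space
print(has_invalid_file_name_format("informe(1).pdf"))    # True: parentheses are special characters
print(has_invalid_file_name_format("docs/informe.pdf"))  # True: '/' is not in the allow-list
```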