wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/interfaces.py +1 -1
  4. wizit_context_ingestor/application/transcription_service.py +132 -49
  5. wizit_context_ingestor/data/kdb.py +10 -0
  6. wizit_context_ingestor/data/prompts.py +156 -2
  7. wizit_context_ingestor/data/storage.py +10 -0
  8. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  9. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  10. wizit_context_ingestor/infra/rag/chroma_embeddings.py +135 -0
  11. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  12. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  13. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  14. wizit_context_ingestor/infra/vertex_model.py +56 -28
  15. wizit_context_ingestor/main.py +160 -105
  16. wizit_context_ingestor/utils/file_utils.py +13 -0
  17. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  18. wizit_context_ingestor/workflows/context_state.py +10 -0
  19. wizit_context_ingestor/workflows/context_tools.py +58 -0
  20. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  21. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  22. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  23. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  24. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  25. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  26. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/METADATA +10 -1
  27. wizit_context_ingestor-0.3.0b1.dist-info/RECORD +44 -0
  28. {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.3.0b1.dist-info}/WHEEL +1 -1
  29. wizit_context_ingestor-0.2.5b2.dist-info/RECORD +0 -31
wizit_context_ingestor/infra/rag/redis_embeddings.py

```diff
@@ -2,6 +2,7 @@ from langchain_core.documents import Document
 from langchain_redis import RedisConfig, RedisVectorStore
 from typing import List
 import logging
+
 # from dotenv import load_dotenv
 from ...application.interfaces import EmbeddingsManager
 
@@ -9,10 +10,13 @@ from ...application.interfaces import EmbeddingsManager
 
 logger = logging.getLogger(__name__)
 
-class RedisEmbeddingsManager(EmbeddingsManager):
 
+class RedisEmbeddingsManager(EmbeddingsManager):
     __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
-    def __init__(self, embeddings_model, redis_conn_string: str, metadata_tags: dict):
+
+    def __init__(
+        self, embeddings_model, redis_conn_string: str, metadata_tags: List[str] = []
+    ):
         """
         Initialize the RedisEmbeddingsManager.
         Args:
@@ -27,27 +31,23 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         """
         self.redis_conn_string = redis_conn_string
         self.embeddings_model = embeddings_model
-        self.metadata_tags_schema = []
-
+        self.metadata_tags_schema = [{"type": "text", "name": "context"}]
         for tag_key in metadata_tags:
-            self.metadata_tags_schema.append({
-                "type": "tag",
-                "name": tag_key
-            })
+            self.metadata_tags_schema.append({"type": "text", "name": tag_key})
 
         try:
-            self.redis_config = RedisConfig(
-                index_name="vector_store",
-                redis_url=self.redis_conn_string,
-                metadata_schema=[
-                    {"type": "text", "name": "context"}
-                ]+self.metadata_tags_schema,
-            )
-            self.vector_store = RedisVectorStore(self.embeddings_model, config=self.redis_config)
-            logger.info("RedisEmbeddingsManager initialized")
+            self.redis_config = RedisConfig(
+                index_name="vector_store",
+                redis_url=self.redis_conn_string,
+                metadata_schema=self.metadata_tags_schema,
+            )
+            self.vector_store = RedisVectorStore(
+                self.embeddings_model, config=self.redis_config
+            )
+            logger.info("RedisEmbeddingsManager initialized")
         except Exception as e:
-            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
-            raise
+            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
+            raise
 
     def configure_vector_store(
         self,
```
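Note on the hunk above: tag fields are now indexed as Redis `text` fields rather than `tag` fields, and the built-in `context` field is folded into the same `metadata_tags_schema` list instead of being concatenated at `RedisConfig` time. For a hypothetical `metadata_tags=["file_key"]`, the constructor now builds:

```python
# Illustration only, for a hypothetical metadata_tags=["file_key"]:
metadata_tags_schema = [
    {"type": "text", "name": "context"},   # always present
    {"type": "text", "name": "file_key"},  # one entry per tag; "text", not "tag"
]
```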
```diff
@@ -56,7 +56,7 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         content_column: str = "document",
         id_column: str = "id",
         metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
         """Configure the vector store."""
         pass
@@ -73,13 +73,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
 
     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-            # Common validation logic
-            if self.vector_store is None:
-                raise Exception("Vector store not initialized")
-            return func(self, *args, **kwargs)
-        return wrapper
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            return func(self, *args, **kwargs)
 
+        return wrapper
 
     @vector_store_initialized
     def index_documents(self, docs: List[Document]):
```
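The `vector_store_initialized` decorator above (its body is only re-indented in this release) guards vector-store operations with a fail-fast check. A common refinement, not applied in the package, is `functools.wraps`, which keeps the wrapped method's name and docstring intact; a minimal sketch:

```python
import functools


def vector_store_initialized(func):
    """Validate vector store initialization before running the method."""

    @functools.wraps(func)  # not in the package; preserves __name__ and __doc__
    def wrapper(self, *args, **kwargs):
        if self.vector_store is None:
            raise Exception("Vector store not initialized")
        return func(self, *args, **kwargs)

    return wrapper
```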
```diff
@@ -129,6 +130,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise
 
+    @vector_store_initialized
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        # TODO investigate how to do this
+        pass
+
     def get_documents_keys_by_source_id(self, source_id: str):
         """Get documents keys by source ID."""
         pass
```
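Taken together, these hunks change `RedisEmbeddingsManager`'s public constructor: `metadata_tags` is now a list of tag names (default `[]`) instead of a dict. A minimal usage sketch, assuming `my_embeddings` is any LangChain-compatible embeddings object and a local Redis instance:

```python
from langchain_core.documents import Document

from wizit_context_ingestor.infra.rag.redis_embeddings import RedisEmbeddingsManager

manager = RedisEmbeddingsManager(
    embeddings_model=my_embeddings,              # assumption: e.g. VertexAIEmbeddings
    redis_conn_string="redis://localhost:6379",  # placeholder URL
    metadata_tags=["file_key", "status"],        # each indexed as a "text" field
)
docs = [Document(page_content="hello", metadata={"file_key": "demo.md"})]
manager.index_documents(docs)  # raises if the vector store failed to initialize
```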
wizit_context_ingestor/infra/rag/semantic_chunks.py

```diff
@@ -1,3 +1,5 @@
+from posix import fork
+
 # check this documentation
 # https://python.langchain.com/docs/how_to/semantic-chunker/
 # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
@@ -16,7 +18,9 @@ class SemanticChunks(RagChunker):
     Class for semantically chunking documents into smaller pieces based on semantic similarity.
     Uses LangChain's SemanticChunker to create semantically coherent document chunks.
     """
+
     __slots__ = ("embeddings_model",)
+
     def __init__(self, embeddings_model: Any):
         """
         Initialize a document chunker with an embeddings model.
@@ -35,7 +39,7 @@ class SemanticChunks(RagChunker):
             add_start_index=True,
             breakpoint_threshold_type="percentile",
             breakpoint_threshold_amount=95,
-            min_chunk_size=200
+            min_chunk_size=200,
         )
 
     def gen_chunks_for_document(self, document: Document) -> List[Document]:
@@ -53,6 +57,10 @@ class SemanticChunks(RagChunker):
         """
         try:
             chunks = self.text_splitter.split_documents([document])
+            source = document.metadata["source"]
+            for i, chunk in enumerate(chunks):
+                if document.metadata["source"]:
+                    chunk.id = f"{source}-{i}"
             logger.info(f"{len(chunks)} chunks generated successfully")
             return chunks
         except Exception as e:
```
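Two things stand out in this file. First, `from posix import fork` is never used in the hunks shown and makes the module POSIX-only; it looks like an accidental auto-import. Second, the new loop assigns each chunk a deterministic ID derived from the document's `source` metadata and the chunk index, so re-chunking the same file yields the same IDs instead of fresh random ones. A sketch of the resulting IDs:

```python
from langchain_core.documents import Document

# Hypothetical input; SemanticChunks is constructed with an embeddings model,
# as elsewhere in this package.
doc = Document(
    page_content="...transcribed markdown...",
    metadata={"source": "manuals/printer_setup.md"},
)
chunks = SemanticChunks(my_embeddings).gen_chunks_for_document(doc)
# chunks[0].id == "manuals/printer_setup.md-0"
# chunks[1].id == "manuals/printer_setup.md-1", and so on
```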
wizit_context_ingestor/infra/vertex_model.py

```diff
@@ -15,14 +15,23 @@ class VertexModels(AiApplicationService):
     A wrapper class for Google Cloud Vertex AI models that handles credentials and
     provides methods to load embeddings and chat models.
     """
-    __slots__ = ('project_id', 'location', 'json_service_account', 'scopes', 'llm_model_id')
+
+    __slots__ = (
+        "project_id",
+        "location",
+        "json_service_account",
+        "scopes",
+        "llm_model_id",
+    )
+
     def __init__(
-        self,
-        project_id: str,
-        location: str,
-        json_service_account: Dict[str, Any],
-        scopes: Optional[List[str]] = None,
-        llm_model_id: str = "claude-3-5-haiku@20241022"):
+        self,
+        project_id: str,
+        location: str,
+        json_service_account: Dict[str, Any],
+        scopes: Optional[List[str]] = None,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+    ):
         """
         Initialize the VertexModels class with Google Cloud credentials.
 
@@ -36,25 +45,24 @@ class VertexModels(AiApplicationService):
             print(location)
             self.scopes = scopes or ["https://www.googleapis.com/auth/cloud-platform"]
             self.credentials = service_account.Credentials.from_service_account_info(
-                json_service_account,
-                scopes=self.scopes
+                json_service_account, scopes=self.scopes
             )
             self.llm_model_id = llm_model_id
             self.project_id = project_id
             self.location = location
             vertexai_init(
-                project=project_id,
-                location=location,
-                credentials=self.credentials
+                project=project_id, location=location, credentials=self.credentials
+            )
+            logger.info(
+                f"VertexModels initialized with project {project_id} in {location}"
             )
-            logger.info(f"VertexModels initialized with project {project_id} in {location}")
         except Exception as e:
             logger.error(f"Failed to initialize VertexModels: {str(e)}")
             raise
 
     def load_embeddings_model(
-        self,
-        embeddings_model_id: str = "text-multilingual-embedding-002") -> VertexAIEmbeddings:  # noqa: E125
+        self, embeddings_model_id: str = "text-multilingual-embedding-002"
+    ) -> VertexAIEmbeddings:  # noqa: E125
         """
         Load and return a Vertex AI embeddings model.
         default embeddings length is 768 https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings
@@ -73,14 +81,18 @@ class VertexModels(AiApplicationService):
             logger.debug(f"Loaded embedding model: {embeddings_model_id}")
             return embeddings
         except Exception as e:
-            logger.error(f"Failed to load embeddings model {embeddings_model_id}: {str(e)}")
+            logger.error(
+                f"Failed to load embeddings model {embeddings_model_id}: {str(e)}"
+            )
             raise
 
-    def load_chat_model(self,
+    def load_chat_model(
+        self,
         temperature: float = 0.15,
         max_tokens: int = 8192,
         stop: Optional[List[str]] = None,
-        **chat_model_params) -> Union[ChatVertexAI, ChatAnthropicVertex]:
+        **chat_model_params,
+    ) -> Union[ChatVertexAI, ChatAnthropicVertex]:
         """
         Load a Vertex AI chat model for text generation.
 
@@ -98,21 +110,35 @@ class VertexModels(AiApplicationService):
         """
         try:
             if "gemini" in self.llm_model_id:
-                return self.load_chat_model_gemini(self.llm_model_id, temperature, max_tokens, stop, **chat_model_params)
+                return self.load_chat_model_gemini(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
             elif "claude" in self.llm_model_id:
-                return self.load_chat_model_anthropic(self.llm_model_id, temperature, max_tokens, stop, **chat_model_params)
+                return self.load_chat_model_anthropic(
+                    self.llm_model_id,
+                    temperature,
+                    max_tokens,
+                    stop,
+                    **chat_model_params,
+                )
             else:
                 raise ValueError(f"Unsupported chat model: {self.llm_model_id}")
         except Exception as e:
             logger.error(f"Failed to retrieve chat model {self.llm_model_id}: {str(e)}")
             raise
 
-    def load_chat_model_gemini(self,
+    def load_chat_model_gemini(
+        self,
         chat_model_id: str = "publishers/google/models/gemini-2.5-flash",
         temperature: float = 0.15,
-        max_tokens: int = 8192,
+        max_tokens: int = 64000,
         stop: Optional[List[str]] = None,
-        **chat_model_params) -> ChatVertexAI:
+        **chat_model_params,
+    ) -> ChatVertexAI:
         """
         Load a Vertex AI chat model for text generation.
 
@@ -137,7 +163,7 @@ class VertexModels(AiApplicationService):
                 max_tokens=max_tokens,
                 max_retries=1,
                 stop=stop,
-                **chat_model_params
+                **chat_model_params,
             )
             logger.debug(f"Retrieved chat model: {chat_model_id}")
             return self.llm_model
@@ -145,12 +171,14 @@ class VertexModels(AiApplicationService):
             logger.error(f"Failed to retrieve chat model {chat_model_id}: {str(e)}")
             raise
 
-    def load_chat_model_anthropic(self,
+    def load_chat_model_anthropic(
+        self,
         chat_model_id: str = "claude-3-5-haiku@20241022",
         temperature: float = 0.7,
-        max_tokens: int = 8000,
+        max_tokens: int = 64000,
         stop: Optional[List[str]] = None,
-        **chat_model_params) -> ChatAnthropicVertex:
+        **chat_model_params,
+    ) -> ChatAnthropicVertex:
         """
         Load a Vertex AI chat model for text generation.
         """
@@ -163,7 +191,7 @@ class VertexModels(AiApplicationService):
                 max_tokens=max_tokens,
                 max_retries=1,
                 stop=stop,
-                **chat_model_params
+                **chat_model_params,
            )
             logger.debug(f"Retrieved chat model: {chat_model_id}")
             return self.llm_model
```
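In `vertex_model.py` the default chat model moves from `claude-3-5-haiku@20241022` to `claude-sonnet-4@20250514`, both loaders raise their default `max_tokens` to 64000, and routing stays substring-based: `"gemini"` in the model ID selects `ChatVertexAI`, `"claude"` selects `ChatAnthropicVertex`, anything else raises `ValueError`. A usage sketch with placeholder project values:

```python
# All identifiers below are placeholders, not values shipped with the package.
models = VertexModels(
    project_id="my-gcp-project",
    location="us-east5",                        # a region serving the chosen model
    json_service_account=service_account_info,  # dict loaded from a secret store
)
llm = models.load_chat_model(temperature=0.15)  # "claude" in the id -> Anthropic
embeddings = models.load_embeddings_model()     # text-multilingual-embedding-002
```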
wizit_context_ingestor/main.py

```diff
@@ -1,4 +1,5 @@
 import json
+from typing import Dict, Any, Literal
 from .infra.vertex_model import VertexModels
 from .application.transcription_service import TranscriptionService
 from .application.context_chunk_service import ContextChunksInDocumentService
@@ -6,16 +7,76 @@ from .infra.persistence.s3_storage import S3StorageService
 from .infra.persistence.local_storage import LocalStorageService
 from .infra.rag.semantic_chunks import SemanticChunks
 from .infra.rag.redis_embeddings import RedisEmbeddingsManager
+from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
 from .infra.secrets.aws_secrets_manager import AwsSecretsManager
+from .data.storage import storage_services, StorageServices
+from .data.kdb import kdb_services, KdbServices
+from .utils.file_utils import has_invalid_file_name_format
 
-class DeelabTranscribeManager:
 
-    def __init__(self,
+class KdbManager:
+    def __init__(
+        self, embeddings_model, kdb_service: kdb_services, kdb_params: Dict[Any, Any]
+    ):
+        self.kdb_service = kdb_service
+        self.kdb_params = kdb_params
+        self.embeddings_model = embeddings_model
+
+    def retrieve_kdb_service(self):
+        if self.kdb_service == KdbServices.REDIS.value:
+            return RedisEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        elif self.kdb_service == KdbServices.CHROMA.value:
+            return ChromaEmbeddingsManager(
+                self.embeddings_model,
+                **self.kdb_params,
+            )
+        else:
+            raise ValueError(f"Unsupported kdb provider: {self.kdb_service}")
+
+
+class PersistenceManager:
+    def __init__(
+        self,
+        storage_service: storage_services,
+        source_storage_route,
+        target_storage_route,
+    ):
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+
+    def retrieve_storage_service(self):
+        if self.storage_service == StorageServices.S3.value:
+            return S3StorageService(
+                origin_bucket_name=self.source_storage_route,
+                target_bucket_name=self.target_storage_route,
+            )
+        elif self.storage_service == StorageServices.LOCAL.value:
+            return LocalStorageService(
+                source_storage_route=self.source_storage_route,
+                target_storage_route=self.target_storage_route,
+            )
+        else:
+            raise ValueError(f"Unsupported storage service: {self.storage_service}")
+
+
+class TranscriptionManager:
+    def __init__(
+        self,
         gcp_project_id: str,
         gcp_project_location: str,
         gcp_secret_name: str,
-        llm_model_id: str = "claude-3-5-sonnet-v2@20241022",
-        target_language: str = 'es',
+        storage_service: storage_services,
+        source_storage_route: str,
+        target_storage_route: str,
+        llm_model_id: str = "claude-sonnet-4@20250514",
+        target_language: str = "es",
+        transcription_additional_instructions: str = "",
+        transcription_accuracy_threshold: int = 90,
+        max_transcription_retries: int = 2,
     ):
         self.gcp_project_id = gcp_project_id
         self.gcp_project_location = gcp_project_location
```
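The two factories introduced above, `KdbManager` and `PersistenceManager`, dispatch on plain string values compared against enum members. A sketch, assuming the enum values are the lowercase provider names (`"redis"`, `"chroma"`, `"s3"`, `"local"`), which is consistent with the `Literal["redis", "chroma"]` hint used further down; the main.py diff continues below:

```python
# Sketch only; kdb_params must match the chosen manager's constructor, e.g.
# RedisEmbeddingsManager(embeddings_model, redis_conn_string, metadata_tags).
kdb = KdbManager(
    my_embeddings, "redis", {"redis_conn_string": "redis://localhost:6379"}
).retrieve_kdb_service()

storage = PersistenceManager("local", "./input", "./output").retrieve_storage_service()
```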
```diff
@@ -23,6 +84,14 @@ class DeelabTranscribeManager:
         self.gcp_secret_name = gcp_secret_name
         self.llm_model_id = llm_model_id
         self.target_language = target_language
+        self.storage_service = storage_service
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.transcription_additional_instructions = (
+            transcription_additional_instructions
+        )
+        self.transcription_accuracy_threshold = transcription_accuracy_threshold
+        self.max_transcription_retries = max_transcription_retries
         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
         self.vertex_model = self._get_vertex_model()
 
@@ -36,50 +105,78 @@ class DeelabTranscribeManager:
             self.gcp_project_id,
             self.gcp_project_location,
             self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
         )
         return vertex_model
 
-    def aws_cloud_transcribe_document(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
+    def transcribe_document(self, file_key: str):
+        """Transcribe a document from source storage to target storage.
+        This method serves as a generic interface for transcribing documents from
+        various storage sources to target destinations. The specific implementation
+        depends on the storage route types provided.
+
+        Args:
+            file_key (str): The unique identifier or path of the file to be transcribed.
+        Returns:
+            The result of the transcription process, typically the path or identifier
+            of the transcribed document.
+
+        Raises:
+            Exception: If an error occurs during the transcription process.
+        """
         try:
-            s3_persistence_service = S3StorageService(
-                origin_bucket_name=s3_origin_bucket_name,
-                target_bucket_name=s3_target_bucket_name
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service,
+                self.source_storage_route,
+                self.target_storage_route,
             )
+            persistence_service = persistence_layer.retrieve_storage_service()
 
             transcribe_document_service = TranscriptionService(
                 ai_application_service=self.vertex_model,
-                persistence_service=s3_persistence_service,
-                target_language=self.target_language
+                persistence_service=persistence_service,
+                target_language=self.target_language,
+                transcription_additional_instructions=self.transcription_additional_instructions,
+                transcription_accuracy_threshold=self.transcription_accuracy_threshold,
+                max_transcription_retries=self.max_transcription_retries,
+            )
+            parsed_pages, parsed_document = (
+                transcribe_document_service.process_document(file_key)
+            )
+            source_storage_file_tags = {}
+            if persistence_service.supports_tagging:
+                # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
+                source_storage_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, self.source_storage_route
+                )
+            transcribe_document_service.save_parsed_document(
+                f"{file_key}.md", parsed_document, source_storage_file_tags
             )
-            parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
-            origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)
-            transcribe_document_service.save_parsed_document(f"{file_key}.md", parsed_document, origin_bucket_file_tags)
             # create md document from parsed_pages
             print("parsed_pages", len(parsed_pages))
             # print("parsed_document", parsed_document)
             return f"{file_key}.md"
         except Exception as e:
-            print(f"Error transcribing document: {e}")
+            print(f"Error processing document: {e}")
             raise e
 
 
-class DeelabRedisChunksManager:
-
+class ChunksManager:
     def __init__(
-        self,
-        gcp_project_id: str,
-        gcp_project_location: str,
-        gcp_secret_name: str,
-        redis_connection_string: str,
-        llm_model_id: str = "claude-3-5-haiku@20241022",
-        embeddings_model_id: str = "text-multilingual-embedding-002",
-        target_language: str = "es"
+        self,
+        gcp_project_id: str,
+        gcp_project_location: str,
+        gcp_secret_name: str,
+        storage_service: storage_services,
+        kdb_service: Literal["redis", "chroma"],
+        kdb_params: Dict[Any, Any],
+        llm_model_id: str = "claude-3-5-haiku@20241022",
+        embeddings_model_id: str = "text-multilingual-embedding-002",
+        target_language: str = "es",
     ):
         self.gcp_project_id = gcp_project_id
         self.gcp_project_location = gcp_project_location
@@ -88,9 +185,14 @@ class DeelabRedisChunksManager:
         self.llm_model_id = llm_model_id
         self.target_language = target_language
         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
-        self.redis_connection_string = redis_connection_string
+        self.storage_service = storage_service
+        self.kdb_params = kdb_params
+        self.kdb_service = kdb_service
+        # self.redis_connection_string = redis_connection_string
         self.vertex_model = self._get_vertex_model()
-        self.embeddings_model = self.vertex_model.load_embeddings_model(embeddings_model_id)
+        self.embeddings_model = self.vertex_model.load_embeddings_model(
+            embeddings_model_id
+        )
 
     def _get_gcp_sa_dict(self, gcp_secret_name: str):
         vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
@@ -102,92 +204,45 @@ class DeelabRedisChunksManager:
             self.gcp_project_id,
             self.gcp_project_location,
             self.gcp_sa_dict,
-            llm_model_id=self.llm_model_id
+            llm_model_id=self.llm_model_id,
         )
         return vertex_model
 
-    def context_chunks_in_document(
-        self,
-        file_key: str
+    def gen_context_chunks(
+        self, file_key: str, source_storage_route: str, target_storage_route: str
     ):
         try:
-            rag_chunker = SemanticChunks(self.embeddings_model)
-            redis_embeddings_manager = RedisEmbeddingsManager(
-                self.embeddings_model,
-                self.redis_connection_string,
-                {
-                    "file_key": file_key
-                }
-            )
-            local_persistence_service = LocalStorageService()
-            context_chunks_in_document_service = ContextChunksInDocumentService(
-                ai_application_service=self.vertex_model,
-                persistence_service=local_persistence_service,
-                rag_chunker=rag_chunker,
-                embeddings_manager=redis_embeddings_manager,
-                target_language=self.target_language
+            if has_invalid_file_name_format(file_key):
+                raise ValueError(
+                    "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+                )
+            persistence_layer = PersistenceManager(
+                self.storage_service, source_storage_route, target_storage_route
             )
-            context_chunks = context_chunks_in_document_service.get_context_chunks_in_document(file_key)
-            print("context_chunks", context_chunks)
-            return context_chunks
-        except Exception as e:
-            print(f"Error getting context chunks in document: {e}")
-            raise e
-
-    # TODO
-    def context_chunks_in_document_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        try:
-            s3_persistence_service = S3StorageService(
-                origin_bucket_name=s3_origin_bucket_name,
-                target_bucket_name=s3_target_bucket_name
-            )
-            target_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_target_bucket_name)
-
+            persistence_service = persistence_layer.retrieve_storage_service()
+            target_bucket_file_tags = []
+            if persistence_service.supports_tagging:
+                target_bucket_file_tags = persistence_service.retrieve_file_tags(
+                    file_key, target_storage_route
+                )
             rag_chunker = SemanticChunks(self.embeddings_model)
-            redis_embeddings_manager = RedisEmbeddingsManager(
-                embeddings_model=self.embeddings_model,
-                redis_conn_string=self.redis_connection_string,
-                metadata_tags=target_bucket_file_tags
+            kdb_manager = KdbManager(
+                self.embeddings_model, self.kdb_service, self.kdb_params
             )
+            kdb_service = kdb_manager.retrieve_kdb_service()
             context_chunks_in_document_service = ContextChunksInDocumentService(
                 ai_application_service=self.vertex_model,
-                persistence_service=s3_persistence_service,
+                persistence_service=persistence_service,
                 rag_chunker=rag_chunker,
-                embeddings_manager=redis_embeddings_manager,
-                target_language=self.target_language
+                embeddings_manager=kdb_service,
+                target_language=self.target_language,
+            )
+            context_chunks = (
+                context_chunks_in_document_service.get_context_chunks_in_document(
+                    file_key, target_bucket_file_tags
+                )
             )
-            context_chunks = context_chunks_in_document_service.get_context_chunks_in_document(file_key, target_bucket_file_tags)
             return context_chunks
         except Exception as e:
             print(f"Error getting context chunks in document: {e}")
             raise e
-
-
-    def delete_document_context_chunks_from_aws_cloud(
-        self,
-        file_key: str,
-        s3_origin_bucket_name: str,
-        s3_target_bucket_name: str
-    ):
-        pass
-        # rag_chunker = SemanticChunks(self.embeddings_model)
-        # pg_embeddings_manager = PgEmbeddingsManager(
-        #     embeddings_model=self.embeddings_model,
-        #     pg_connection=self.vector_store_connection
-        # )
-        # s3_persistence_service = S3StorageService(
-        #     origin_bucket_name=s3_origin_bucket_name,
-        #     target_bucket_name=s3_target_bucket_name
-        # )
-        # context_chunks_in_document_service = ContextChunksInDocumentService(
-        #     ai_application_service=self.vertex_model,
-        #     persistence_service=s3_persistence_service,
-        #     rag_chunker=rag_chunker,
-        #     embeddings_manager=pg_embeddings_manager
-        # )
-        # context_chunks_in_document_service.delete_document_context_chunks(file_key)
```
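These renames replace `DeelabTranscribeManager` and `DeelabRedisChunksManager` with `TranscriptionManager` and `ChunksManager`, and both entry points now validate file keys and resolve storage and vector-store backends through the factories. An end-to-end sketch of the 0.3.0b1 API, in which every project ID, bucket, and connection string is a placeholder:

```python
from wizit_context_ingestor.main import ChunksManager, TranscriptionManager

transcriber = TranscriptionManager(
    gcp_project_id="my-gcp-project",
    gcp_project_location="us-east5",
    gcp_secret_name="vertex-sa-secret",
    storage_service="s3",                    # assumed enum value; "local" also exists
    source_storage_route="my-source-bucket",
    target_storage_route="my-target-bucket",
)
md_key = transcriber.transcribe_document("report_q3.pdf")  # -> "report_q3.pdf.md"

chunker = ChunksManager(
    gcp_project_id="my-gcp-project",
    gcp_project_location="us-east5",
    gcp_secret_name="vertex-sa-secret",
    storage_service="s3",
    kdb_service="redis",                     # or "chroma"
    kdb_params={"redis_conn_string": "redis://localhost:6379"},
)
chunks = chunker.gen_context_chunks(md_key, "my-source-bucket", "my-target-bucket")
```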
wizit_context_ingestor/utils/file_utils.py (new file)

```diff
@@ -0,0 +1,13 @@
+import re
+
+
+def has_invalid_file_name_format(file_name):
+    """Check if file name has special characters or spaces instead of underscores"""
+    # Check for spaces
+    if " " in file_name:
+        return True
+
+    # Check for special characters (anything that's not alphanumeric, underscore, dash, or dot)
+    if re.search(r"[^a-zA-Z0-9_.-]", file_name):
+        return True
+    return False
```
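The validator rejects any name containing characters outside `[a-zA-Z0-9_.-]`, which also excludes accented letters, worth noting for a package whose default target language is Spanish. A quick behavior check of the function as written:

```python
from wizit_context_ingestor.utils.file_utils import has_invalid_file_name_format

assert has_invalid_file_name_format("quarterly report.pdf")        # space
assert has_invalid_file_name_format("informe_año.pdf")             # accented char
assert not has_invalid_file_name_format("quarterly_report-v2.pdf") # valid
```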