wizit-context-ingestor 0.3.0b2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
- from langchain_core.documents import Document
- from langchain.indexes import index, SQLRecordManager
- from typing import List
  import logging
- from langchain_postgres import PGVectorStore, PGEngine
- from sqlalchemy import create_engine
- from dotenv import load_dotenv
- from wizit_context_ingestor.application.interfaces import EmbeddingsManager

- load_dotenv()
+ from langchain.indexes import IndexingResult, SQLRecordManager, aindex, index
+ from langchain_core.documents import Document
+ from langchain_postgres import PGEngine, PGVectorStore
+ from langchain_postgres.v2.indexes import HNSWIndex
+ from sqlalchemy.ext.asyncio import create_async_engine
+ from typing_extensions import Literal
+
+ from wizit_context_ingestor.application.interfaces import EmbeddingsManager

  logger = logging.getLogger(__name__)
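The new import surface drops `python-dotenv` and the synchronous `sqlalchemy.create_engine` in favor of `langchain-postgres`' `PGEngine`, and pulls in both `index` and `aindex` from `langchain.indexes` (only the synchronous `index` is exercised in the hunks below). A minimal sketch of the two entry points, assuming `docs`, `record_manager`, and `vector_store` are already constructed the way `PgEmbeddingsManager` does:

```python
import asyncio

from langchain.indexes import aindex, index

# Synchronous indexing: content-hash bookkeeping lives in record_manager,
# vectors land in vector_store. This is the call index_documents makes below.
result = index(docs, record_manager, vector_store,
               cleanup="incremental", source_id_key="source")

# The async twin, imported but not yet wired up in this release.
result = asyncio.run(aindex(docs, record_manager, vector_store,
                            cleanup="incremental", source_id_key="source"))
```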
 
@@ -42,7 +42,17 @@ class PgEmbeddingsManager(EmbeddingsManager):

      __slots__ = ("embeddings_model", "pg_connection")

-     def __init__(self, embeddings_model, pg_connection: str):
+     def __init__(
+         self,
+         embeddings_model,
+         pg_connection: str,
+         embeddings_vectors_table_name: str = "langchain_pg_embedding",
+         vector_size: int = 768,
+         content_column: str = "document",
+         id_column: str = "id",
+         metadata_json_column: str = "cmetadata",
+         records_manager_table_name: str = "langchain_record_manager",
+     ):
          """
          Initialize the PgEmbeddingsManager.
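Table layout, vector size, and the record-manager table name now arrive through the constructor instead of per-method arguments. A construction sketch against the new signature; the embeddings object and connection string are placeholders:

```python
# `embeddings` stands in for any LangChain embeddings implementation; the DSN
# is illustrative and its driver prefix depends on how PGEngine is configured.
manager = PgEmbeddingsManager(
    embeddings_model=embeddings,
    pg_connection="postgresql+asyncpg://user:pass@localhost:5432/ragdb",
    embeddings_vectors_table_name="langchain_pg_embedding",
    vector_size=768,  # must match the embedding model's output dimension
    records_manager_table_name="langchain_record_manager",
)
```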
 
@@ -57,155 +67,135 @@ class PgEmbeddingsManager(EmbeddingsManager):
          """
          self.pg_connection = pg_connection
          self.embeddings_model = embeddings_model
-         self.pg_engine = None
          self.vector_store = None
          self.record_manager = None
-         try:
-             self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
-             logger.info("PgEmbeddingsManager initialized")
-         except Exception as e:
-             logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
-             raise
+         self.pg_engine = PGEngine.from_connection_string(pg_connection)
+         self.embeddings_vectors_table_name = embeddings_vectors_table_name
+         self.vector_size = vector_size
+         self.content_column = content_column
+         self.id_column = id_column
+         self.metadata_json_column = metadata_json_column
+         self.records_manager_table_name = records_manager_table_name
+         # self.async_engine = create_async_engine(pg_connection)
+         # self.pg_engine = PGEngine.from_engine(
+         #     self.async_engine
+         # )
+         logger.info("PgEmbeddingsManager initialized")

      def configure_vector_store(
          self,
-         table_name: str = "langchain_pg_embedding",
-         vector_size: int = 768,
-         content_column: str = "document",
-         id_column: str = "id",
-         metadata_json_column: str = "cmetadata",
-         pg_record_manager: str = "postgres/langchain_pg_collection",
      ):
-         self.pg_engine.init_vectorstore_table(
-             table_name=table_name,
-             vector_size=vector_size,
-             content_column=content_column,
-             id_column=id_column,
-             metadata_json_column=metadata_json_column,
-         )
-         self.record_manager = SQLRecordManager(
-             pg_record_manager, engine=create_engine(url=self.pg_connection)
-         )
-         # TODO move this from here
-         self.record_manager.create_schema()
-
-     def init_vector_store(
+         try:
+             self.pg_engine.init_vectorstore_table(
+                 table_name=self.embeddings_vectors_table_name,
+                 vector_size=self.vector_size,
+                 content_column=self.content_column,
+                 id_column=self.id_column,
+                 metadata_json_column=self.metadata_json_column,
+             )
+             record_manager = SQLRecordManager(
+                 self.records_manager_table_name,
+                 db_url=self.pg_connection,
+                 async_mode=False,
+             )
+             record_manager.create_schema()
+         except Exception as e:
+             logger.error(f"Error configure_vector_store: {e}")
+             raise
+
+     def retrieve_vector_store(
          self,
-         table_name: str = "langchain_pg_embedding",
-         content_column: str = "document",
-         metadata_json_column: str = "cmetadata",
-         id_column: str = "id",
-         pg_record_manager: str = "postgres/langchain_pg_collection",
-     ):
-         self.vector_store = PGVectorStore.create_sync(
-             embedding_service=self.embeddings_model,
-             engine=self.pg_engine,
-             table_name=table_name,
-             content_column=content_column,
-             metadata_json_column=metadata_json_column,
-             id_column=id_column,
-         )
-         self.record_manager = SQLRecordManager(
-             pg_record_manager, engine=create_engine(url=self.pg_connection)
-         )
-
-     def vector_store_initialized(func):
+     ) -> tuple[PGVectorStore, SQLRecordManager]:
+         try:
+             self.vector_store = PGVectorStore.create_sync(
+                 embedding_service=self.embeddings_model,
+                 engine=self.pg_engine,
+                 table_name=self.embeddings_vectors_table_name,
+                 content_column=self.content_column,
+                 metadata_json_column=self.metadata_json_column,
+                 id_column=self.id_column,
+             )
+             self.record_manager = SQLRecordManager(
+                 self.records_manager_table_name, db_url=self.pg_connection
+             )
+             return (self.vector_store, self.record_manager)
+         except Exception as e:
+             logger.error(f"Error retrieve vector store: ", e)
+             raise e
+
+     def check_vector_store_init(func):
          """validate vector store initialization"""

          def wrapper(self, *args, **kwargs):
-             # Common validation logic
-             if self.vector_store is None:
-                 raise Exception("Vector store not initialized")
-             if self.record_manager is None:
-                 raise Exception("Record manager not initialized")
+             if self.vector_store is None or self.record_manager is None:
+                 self.retrieve_vector_store()
              return func(self, *args, **kwargs)

          return wrapper

-     @vector_store_initialized
-     def index_documents(self, docs: List[Document]):
+     @check_vector_store_init
+     def create_index(self):
+         try:
+             if self.vector_size < 2000:
+                 index = HNSWIndex()
+                 self.vector_store.apply_vector_index(index)
+             else:
+                 raise NotImplementedError(
+                     "Indexing for vector size > 2000 is not supported"
+                 )
+         except Exception as e:
+             logger.info(f"Error creating index: {e}")
+             raise e
+
+     @check_vector_store_init
+     def index_documents(
+         self,
+         docs: list[Document],
+         cleanup: Literal["incremental", "full", "scoped_full"] | None = "incremental",
+         source_id_key: str = "source",
+     ) -> IndexingResult:
          """
-         Add documents to the vector store with their embeddings.
+         Index documents in the vector store with their embeddings.

-         This method takes a list of Document objects, generates embeddings for them
-         using the embeddings model, and stores both the documents and their
-         embeddings in the PostgreSQL database.
+         This method takes a list of Document objects and indexes them using LangChain's
+         aindex function with incremental cleanup. The documents are processed through
+         the embeddings model and stored in the PostgreSQL database with pgvector.

          Args:
-             docs: A list of LangChain Document objects to add to the vector store
-                 Each Document should have page_content and metadata attributes
-                 from langchain_core.documents import Document
+             vector_store: The PGVectorStore instance to use for storage
+             record_manager: The SQLRecordManager instance for tracking indexed documents
+             docs: A list of LangChain Document objects to index in the vector store.
+                 Each Document should have page_content and metadata attributes.
+
          Returns:
-             None
+             IndexingResult: Result object containing information about the indexing operation

          Raises:
-             Exception: If there's an error adding documents to the vector store
+             Exception: If there's an error during the document indexing process
          """
          try:
              logger.info(f"Indexing {len(docs)} documents in vector store")
+             # await self.vector_store.aadd_documents(docs)
              return index(
                  docs,
                  self.record_manager,
                  self.vector_store,
-                 cleanup="incremental",
-                 source_id_key="source",
+                 cleanup=cleanup,
+                 source_id_key=source_id_key,
              )
          except Exception as e:
              logger.error(f"Error indexing documents: {str(e)}")
-             raise
+             raise e

-     @vector_store_initialized
-     def get_documents_keys_by_source_id(self, source_id: str):
-         """
-         Get document keys by source ID from the vector store.
-         """
-         try:
-             return self.record_manager.list_keys(group_ids=[source_id])
-         except Exception as e:
-             logger.error(f"Error getting documents keys by source ID: {str(e)}")
-             raise
-
-     @vector_store_initialized
-     def delete_documents_by_source_id(self, source_id: str):
-         """
-         Delete documents by source ID from the vector store.
-         """
+     @check_vector_store_init
+     def search_records(
+         self,
+         query: str,
+     ) -> list[Document]:
          try:
-             objects_keys = self.get_documents_keys_by_source_id(source_id)
-             self.record_manager.delete_keys(objects_keys)
-             self.vector_store.delete(ids=objects_keys)
+             logger.info(f"Searching for '{query}' in vector store")
+             reply = self.vector_store.search(query=query, search_type="similarity", k=1)
+             return reply
          except Exception as e:
-             logger.error(f"Error deleting documents by source ID: {str(e)}")
-             raise
-
-     # def get_retriever(self, search_type: str = "mmr", k: int = 20):
-     #     """
-     #     Get a retriever interface to the vector store for semantic search.
-
-     #     This method returns a LangChain retriever object that can be used in retrieval
-     #     pipelines, retrieval-augmented generation, and other LangChain chains.
-
-     #     Args:
-     #         search_type: The search algorithm to use. Options include:
-     #             - "similarity" (standard cosine similarity)
-     #             - "mmr" (Maximum Marginal Relevance, balances relevance with diversity)
-     #             - "similarity_score_threshold" (filters by minimum similarity)
-     #         k: The number of documents to retrieve (default: 20)
-
-     #     Returns:
-     #         Retriever: A LangChain Retriever object that can be used in chains and pipelines
-
-     #     Raises:
-     #         Exception: If there's an error creating the retriever
-
-     #     Example:
-     #         >>> retriever = pg_manager.get_retriever(search_type="mmr", k=5)
-     #         >>> docs = retriever.get_relevant_documents("quantum computing")
-     #     """
-     #     try:
-     #         return self.vector_store.as_retriever(
-     #             search_type=search_type, search_kwargs={"k": k}
-     #         )
-     #     except Exception as e:
-     #         logger.info(f"failed to get vector store as retriever {str(e)}")
-     #         raise
+             logger.error(f"Error indexing documents: {str(e)}")
+             raise e
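The class now separates provisioning (`configure_vector_store`), lazy handle retrieval (`retrieve_vector_store`, invoked automatically by the `check_vector_store_init` decorator), HNSW index creation, parameterized indexing, and a similarity-search path. Note that `create_index` raises for any `vector_size` of 2000 or more even though its message says "> 2000", and `search_records` hard-codes `k=1`. A usage sketch, assuming `manager` is the instance built above:

```python
from langchain_core.documents import Document

manager.configure_vector_store()  # create the vector table and record-manager schema (run once)
manager.create_index()            # apply an HNSW index; only vector_size < 2000 is supported

docs = [Document(page_content="hello pgvector", metadata={"source": "demo.md"})]
result = manager.index_documents(docs, cleanup="incremental", source_id_key="source")
print(result)  # counts such as num_added / num_updated / num_skipped / num_deleted

hits = manager.search_records("hello")  # similarity search, single nearest chunk
```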
@@ -1,10 +1,9 @@
- from posix import fork
-
  # check this documentation
  # https://python.langchain.com/docs/how_to/semantic-chunker/
  # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
  # https://python.langchain.com/docs/how_to/embed_text/
  import logging
+ import uuid
  from typing import List, Any
  from langchain_core.documents import Document
  from langchain_experimental.text_splitter import SemanticChunker
@@ -60,7 +59,7 @@ class SemanticChunks(RagChunker):
              source = document.metadata["source"]
              for i, chunk in enumerate(chunks):
                  if document.metadata["source"]:
-                     chunk.id = f"{source}-{i}"
+                     chunk.id = f"{uuid.uuid4()}"
              logger.info(f"{len(chunks)} chunks generated successfully")
              return chunks
          except Exception as e:
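Chunk IDs move from the deterministic `f"{source}-{i}"` scheme to random UUID4s, so re-chunking the same source no longer reproduces the same IDs; de-duplication is presumably left to the record manager's content hashes during indexing. The new scheme in isolation:

```python
import uuid

from langchain_core.documents import Document

chunk = Document(page_content="...", metadata={"source": "doc.pdf"})
chunk.id = f"{uuid.uuid4()}"  # fresh on every run, e.g. '2f6d1c3e-...-b41a'
```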
@@ -3,11 +3,10 @@ import logging

  logger = logging.getLogger(__name__)

- class AwsSecretsManager:
-
-     def __init__(self):
-         self.client = boto3_client('secretsmanager')

+ class AwsSecretsManager:
+     def __init__(self, aws_region="us-east-1"):
+         self.client = boto3_client("secretsmanager", region_name=aws_region)

      def get_secret(self, secret_name):
          """
@@ -11,7 +11,7 @@ from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
  from .infra.secrets.aws_secrets_manager import AwsSecretsManager
  from .data.storage import storage_services, StorageServices
  from .data.kdb import kdb_services, KdbServices
- from .utils.file_utils import has_invalid_file_name_format
+ from .utils.file_utils import validate_file_name_format
  from langsmith import Client, tracing_context


@@ -78,7 +78,7 @@ class TranscriptionManager:
          llm_model_id: str = "claude-sonnet-4@20250514",
          target_language: str = "es",
          transcription_additional_instructions: str = "",
-         transcription_accuracy_threshold: int = 90,
+         transcription_accuracy_threshold: float = 0.90,
          max_transcription_retries: int = 2,
      ):
          self.gcp_project_id = gcp_project_id
@@ -116,18 +116,18 @@ class TranscriptionManager:
          return vertex_model

      def tracing(func):
-         def gen_tracing_context(self, *args, **kwargs):
+         async def gen_tracing_context(self, *args, **kwargs):
              with tracing_context(
                  enabled=True,
                  project_name=self.langsmith_project_name,
                  client=self.langsmith_client,
              ):
-                 return func(self, *args, **kwargs)
+                 return await func(self, *args, **kwargs)

          return gen_tracing_context

      @tracing
-     def transcribe_document(self, file_key: str):
+     async def transcribe_document(self, file_key: str):
          """Transcribe a document from source storage to target storage.
          This method serves as a generic interface for transcribing documents from
          various storage sources to target destinations. The specific implementation
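`tracing` becomes an async-aware decorator: the wrapper is now a coroutine, so the LangSmith `tracing_context` stays open across the `await` once `transcribe_document` itself is async. The same pattern, reduced to a runnable sketch (project name and client omitted):

```python
import asyncio

from langsmith import tracing_context

def tracing(func):
    async def gen_tracing_context(self, *args, **kwargs):
        with tracing_context(enabled=True):  # the real decorator also passes project_name/client
            return await func(self, *args, **kwargs)
    return gen_tracing_context

class Demo:
    @tracing
    async def work(self):
        return "traced"

print(asyncio.run(Demo().work()))  # -> traced
```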
@@ -143,7 +143,7 @@ class TranscriptionManager:
              Exception: If an error occurs during the transcription process.
          """
          try:
-             if has_invalid_file_name_format(file_key):
+             if not validate_file_name_format(file_key):
                  raise ValueError(
                      "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
                  )
@@ -162,9 +162,10 @@ class TranscriptionManager:
                  transcription_accuracy_threshold=self.transcription_accuracy_threshold,
                  max_transcription_retries=self.max_transcription_retries,
              )
-             parsed_pages, parsed_document = (
-                 transcribe_document_service.process_document(file_key)
-             )
+             (
+                 parsed_pages,
+                 parsed_document,
+             ) = await transcribe_document_service.process_document(file_key)
              source_storage_file_tags = {}
              if persistence_service.supports_tagging:
                  # source_storage_file_tags.tag_file(file_key, {"status": "transcribed"})
@@ -231,18 +232,18 @@ class ChunksManager:
          return vertex_model

      def tracing(func):
-         def gen_tacing_context(self, *args, **kwargs):
+         async def gen_tracing_context(self, *args, **kwargs):
              with tracing_context(
                  enabled=True,
                  project_name=self.langsmith_project_name,
                  client=self.langsmith_client,
              ):
-                 return func(self, *args, **kwargs)
+                 return await func(self, *args, **kwargs)

-         return gen_tacing_context
+         return gen_tracing_context

      @tracing
-     def gen_context_chunks(
+     async def gen_context_chunks(
          self, file_key: str, source_storage_route: str, target_storage_route: str
      ):
          try:
@@ -272,7 +273,7 @@ class ChunksManager:
                  target_language=self.target_language,
              )
              context_chunks = (
-                 context_chunks_in_document_service.get_context_chunks_in_document(
+                 await context_chunks_in_document_service.get_context_chunks_in_document(
                      file_key, target_bucket_file_tags
                  )
              )
@@ -0,0 +1,173 @@
+ import json
+ from logging import getLogger
+ from typing import Any, Dict, Literal
+
+ from langchain_core.documents import Document
+ from langsmith import Client, tracing_context
+
+ from .application.context_chunk_service import ContextChunksInDocumentService
+ from .application.kdb_service import KdbService
+ from .data.storage import StorageServices
+ from .infra.persistence.local_storage import LocalStorageService
+ from .infra.persistence.s3_storage import S3StorageService
+ from .infra.rag.pg_embeddings import PgEmbeddingsManager
+ from .infra.rag.semantic_chunks import SemanticChunks
+ from .infra.secrets.aws_secrets_manager import AwsSecretsManager
+ from .infra.vertex_model import VertexModels
+ from .utils.file_utils import validate_file_name_format
+
+ logger = getLogger(__name__)
+
+
+ class KdbManager:
+     def __init__(
+         self, embeddings_model, kdb_service: Literal["pg"], kdb_params: Dict[Any, Any]
+     ):
+         self.kdb_service = kdb_service
+         self.kdb_params = kdb_params
+         self.embeddings_model = embeddings_model
+
+     def retrieve_kdb_service(self):
+         return PgEmbeddingsManager(self.embeddings_model, **self.kdb_params)
+
+
+ class PersistenceManager:
+     def __init__(
+         self,
+         storage_service: Literal["s3", "local"],
+         source_storage_route,
+         target_storage_route,
+     ):
+         self.storage_service = storage_service
+         self.source_storage_route = source_storage_route
+         self.target_storage_route = target_storage_route
+
+     def retrieve_storage_service(self):
+         if self.storage_service == StorageServices.S3.value:
+             return S3StorageService(
+                 origin_bucket_name=self.source_storage_route,
+                 target_bucket_name=self.target_storage_route,
+             )
+         elif self.storage_service == StorageServices.LOCAL.value:
+             return LocalStorageService(
+                 source_storage_route=self.source_storage_route,
+                 target_storage_route=self.target_storage_route,
+             )
+         else:
+             raise ValueError(f"Unsupported storage service: {self.storage_service}")
+
+
+ class ChunksManager:
+     def __init__(
+         self,
+         gcp_project_id: str,
+         gcp_project_location: str,
+         gcp_secret_name: str,
+         langsmith_api_key: str,
+         langsmith_project_name: str,
+         storage_service: Literal["s3", "local"],
+         kdb_service: Literal["pg"],
+         kdb_params: Dict[Any, Any],
+         llm_model_id: str = "claude-3-5-haiku@20241022",
+         embeddings_model_id: str = "text-multilingual-embedding-002",
+         target_language: str = "es",
+     ):
+         self.gcp_project_id = gcp_project_id
+         self.gcp_project_location = gcp_project_location
+         self.aws_secrets_manager = AwsSecretsManager()
+         self.gcp_secret_name = gcp_secret_name
+         self.llm_model_id = llm_model_id
+         self.target_language = target_language
+         self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
+         self.storage_service = storage_service
+         self.kdb_params = kdb_params
+         self.kdb_service = kdb_service
+         self.vertex_model = self._get_vertex_model()
+         self.embeddings_model = self.vertex_model.load_embeddings_model(
+             embeddings_model_id
+         )
+         self.langsmith_api_key = langsmith_api_key
+         self.langsmith_project_name = langsmith_project_name
+         self.langsmith_client = Client(api_key=self.langsmith_api_key)
+         self.kdb_manager = KdbManager(self.embeddings_model, "pg", self.kdb_params)
+         self.pg_embeddings_manager = self.kdb_manager.retrieve_kdb_service()
+         self.rag_chunker = SemanticChunks(self.embeddings_model)
+         self.kdb_service = KdbService(
+             self.pg_embeddings_manager,
+         )
+
+     def _get_gcp_sa_dict(self, gcp_secret_name: str):
+         vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
+         vertex_gcp_sa_dict = json.loads(vertex_gcp_sa)
+         return vertex_gcp_sa_dict
+
+     def _get_vertex_model(self):
+         vertex_model = VertexModels(
+             self.gcp_project_id,
+             self.gcp_project_location,
+             self.gcp_sa_dict,
+             llm_model_id=self.llm_model_id,
+         )
+         return vertex_model
+
+     def provision_vector_store(self):
+         try:
+             self.kdb_service.configure_kdb()
+             self.kdb_service.create_vector_store_hsnw_index()
+         except Exception as e:
+             logger.error(f"Error configuring vector store: {e}")
+
+     def index_documents_in_vector_store(self, docs: list[Document]):
+         try:
+             self.kdb_service.index_documents_in_vector_store(docs)
+         except Exception as e:
+             logger.error(f"Error indexing documents in vector store: {e}")
+
+     def search_records(self, query):
+         return self.kdb_service.search(query)
+
+     def tracing(func):
+         async def gen_tracing_context(self, *args, **kwargs):
+             with tracing_context(
+                 enabled=True,
+                 project_name=self.langsmith_project_name,
+                 client=self.langsmith_client,
+             ):
+                 return await func(self, *args, **kwargs)
+
+         return gen_tracing_context
+
+     @tracing
+     async def gen_context_chunks(
+         self, file_key: str, source_storage_route: str, target_storage_route: str
+     ):
+         try:
+             validate_file_name_format(file_key)
+             persistence_layer = PersistenceManager(
+                 self.storage_service, source_storage_route, target_storage_route
+             )
+             persistence_service = persistence_layer.retrieve_storage_service()
+             target_bucket_file_tags = {}
+             if persistence_service.supports_tagging:
+                 target_bucket_file_tags = persistence_service.retrieve_file_tags(
+                     file_key, target_storage_route
+                 )
+             rag_chunker = SemanticChunks(self.embeddings_model)
+             kdb_manager = KdbManager(self.embeddings_model, "pg", self.kdb_params)
+             kdb_service = kdb_manager.retrieve_kdb_service()
+             context_chunks_in_document_service = ContextChunksInDocumentService(
+                 ai_application_service=self.vertex_model,
+                 persistence_service=persistence_service,
+                 rag_chunker=rag_chunker,
+                 embeddings_manager=kdb_service,
+                 target_language=self.target_language,
+             )
+             context_chunks = (
+                 await context_chunks_in_document_service.get_context_chunks_in_document(
+                     file_key, target_bucket_file_tags
+                 )
+             )
+             return context_chunks
+         except Exception as e:
+             print(f"Error getting context chunks in document: {e}")
+             raise e
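A hypothetical end-to-end wiring of the new module (bucket names, secret names, keys, and the DSN are all placeholders): provision the vector store once, then generate context chunks for a file.

```python
import asyncio

manager = ChunksManager(
    gcp_project_id="my-project",
    gcp_project_location="us-central1",
    gcp_secret_name="vertex-sa-secret",
    langsmith_api_key="lsv2_...",
    langsmith_project_name="ingestor",
    storage_service="s3",
    kdb_service="pg",
    kdb_params={"pg_connection": "postgresql+asyncpg://user:pass@host:5432/ragdb"},
)
manager.provision_vector_store()  # configure_kdb + HNSW index; failures are only logged
chunks = asyncio.run(
    manager.gen_context_chunks("report_q3.pdf", "source-bucket", "target-bucket")
)
```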
@@ -1,13 +1,12 @@
  import re


- def has_invalid_file_name_format(file_name):
+ def validate_file_name_format(file_name):
      """Check if file name has special characters or spaces instead of underscores"""
-     # Check for spaces
-     if " " in file_name:
+     # Check for special characters (anything that's not alphanumeric, underscore, dash, dot, slash, or backslash)
+     if re.search(r"[^a-zA-Z0-9_.\-/\\]", file_name) is None:
          return True
-
-     # Check for special characters (anything that's not alphanumeric, underscore, dash, or dot)
-     if re.search(r"[^a-zA-Z0-9_.-]", file_name):
-         return True
-     return False
+     else:
+         raise ValueError(
+             "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
+         )
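The rewritten validator inverts the old contract: it returns `True` for names made of `[a-zA-Z0-9_.\-/\\]` and raises `ValueError` itself for anything else (spaces included), instead of returning a boolean flag. As a side effect, the `if not validate_file_name_format(file_key)` guard in `TranscriptionManager` above can never reach its own `raise`; the helper's exception is what callers actually see. Behavior sketch:

```python
validate_file_name_format("reports/q3_summary.pdf")  # -> True (slashes are now allowed)
try:
    validate_file_name_format("q3 summary.pdf")      # space -> ValueError
except ValueError as e:
    print(e)
```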
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: wizit-context-ingestor
- Version: 0.3.0b2
+ Version: 0.4.0
  Summary: Contextual Rag with Cloud Solutions
  Requires-Dist: anthropic[vertex]>=0.66.0
  Requires-Dist: boto3>=1.40.23
@@ -8,10 +8,13 @@ Requires-Dist: langchain-aws>=0.2.31
  Requires-Dist: langchain-chroma>=0.2.6
  Requires-Dist: langchain-experimental>=0.3.4
  Requires-Dist: langchain-google-vertexai>=2.0.28
+ Requires-Dist: langchain-postgres>=0.0.16
  Requires-Dist: langchain-redis>=0.2.3
  Requires-Dist: langgraph>=0.6.8
  Requires-Dist: pillow>=11.3.0
+ Requires-Dist: psycopg2-binary>=2.9.11
  Requires-Dist: pymupdf>=1.26.4
+ Requires-Dist: sqlalchemy[asyncio]>=2.0.43
  Requires-Python: >=3.12
  Description-Content-Type: text/markdown