wizit-context-ingestor 0.2.5b3__py3-none-any.whl → 0.3.0b2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (28)
  1. wizit_context_ingestor/__init__.py +2 -2
  2. wizit_context_ingestor/application/context_chunk_service.py +149 -35
  3. wizit_context_ingestor/application/transcription_service.py +132 -52
  4. wizit_context_ingestor/data/kdb.py +10 -0
  5. wizit_context_ingestor/data/prompts.py +150 -3
  6. wizit_context_ingestor/data/storage.py +10 -0
  7. wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
  8. wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
  9. wizit_context_ingestor/infra/rag/chroma_embeddings.py +30 -31
  10. wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
  11. wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
  12. wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
  13. wizit_context_ingestor/infra/vertex_model.py +56 -28
  14. wizit_context_ingestor/main.py +192 -106
  15. wizit_context_ingestor/utils/file_utils.py +13 -0
  16. wizit_context_ingestor/workflows/context_nodes.py +73 -0
  17. wizit_context_ingestor/workflows/context_state.py +10 -0
  18. wizit_context_ingestor/workflows/context_tools.py +58 -0
  19. wizit_context_ingestor/workflows/context_workflow.py +42 -0
  20. wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
  21. wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
  22. wizit_context_ingestor/workflows/transcription_state.py +17 -0
  23. wizit_context_ingestor/workflows/transcription_tools.py +54 -0
  24. wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
  25. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/METADATA +9 -1
  26. wizit_context_ingestor-0.3.0b2.dist-info/RECORD +44 -0
  27. {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/WHEEL +1 -1
  28. wizit_context_ingestor-0.2.5b3.dist-info/RECORD +0 -32
wizit_context_ingestor/infra/persistence/local_storage.py

@@ -3,22 +3,27 @@ from ...domain.models import ParsedDoc
 from typing import Optional
 import logging
 import os
+
 logger = logging.getLogger(__name__)
 
+
 class LocalStorageService(PersistenceService):
     """Persistence service for local storage."""
 
-    def __init__(self):
-        self.tmp_folder = "tmp"
+    def __init__(self, source_storage_route: str, target_storage_route: str):
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from local storage."""
         file_content = None
-        with open(f"{self.tmp_folder}/{file_key}", "r", encoding="utf-8") as file:
+        with open(
+            f"{self.source_storage_route}/{file_key}", "r", encoding="utf-8"
+        ) as file:
             file_content = file.read()
         return file_content
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from local storage.
 
@@ -32,16 +37,21 @@ class LocalStorageService(PersistenceService):
             ClientError: If there's an error retrieving the object from local storage
         """
         try:
-            tmp_file_path = f"{self.tmp_folder}/{file_key}"
+            tmp_file_path = f"{self.source_storage_route}/{file_key}"
             if not os.path.exists(tmp_file_path):
                 raise FileNotFoundError(f"File {file_key} not found in local storage")
             return tmp_file_path
         except Exception as e:
-            logger.error(f"Unexpected error retrieving file {file_key} from local storage: {str(e)}")
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from local storage: {str(e)}"
+            )
             raise
 
-
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document."""
-        with open(f"{self.tmp_folder}/{file_key}", "w", encoding="utf-8") as f:
+        with open(
+            f"{self.target_storage_route}/{file_key}", "w", encoding="utf-8"
+        ) as f:
            f.write(parsed_document.document_text)
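
With 0.3.0b2 the local backend no longer hard-codes a "tmp" folder: reads resolve against source_storage_route and writes against target_storage_route. A minimal usage sketch follows; the directory names are hypothetical, and the ParsedDoc constructor is an assumption (only its document_text attribute appears in this diff):

from wizit_context_ingestor.infra.persistence.local_storage import LocalStorageService
from wizit_context_ingestor.domain.models import ParsedDoc

storage = LocalStorageService(
    source_storage_route="input",    # read side (was the hard-coded "tmp")
    target_storage_route="output",   # write side
)
raw_path = storage.retrieve_raw_file("doc.md")         # -> "input/doc.md"
parsed = ParsedDoc(document_text="# parsed markdown")  # assumed constructor
storage.save_parsed_document("doc.md", parsed)         # writes "output/doc.md"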
wizit_context_ingestor/infra/persistence/s3_storage.py

@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
 
 class S3StorageService(PersistenceService):
     """Persistence service for S3 storage."""
-    __slots__ = ('origin_bucket_name', 'target_bucket_name', 'region_name')
-    def __init__(self, origin_bucket_name: str, target_bucket_name: str, region_name: str = 'us-east-1'):
-        self.s3 = boto3_client('s3', region_name=region_name)
+
+    __slots__ = ("origin_bucket_name", "target_bucket_name", "region_name")
+
+    def __init__(
+        self,
+        origin_bucket_name: str,
+        target_bucket_name: str,
+        region_name: str = "us-east-1",
+    ):
+        self.s3 = boto3_client("s3", region_name=region_name)
         self.origin_bucket_name = origin_bucket_name
         self.target_bucket_name = target_bucket_name
-
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")
 
     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from S3 storage.

@@ -36,9 +43,9 @@ class S3StorageService(PersistenceService):
             response = self.s3.get_object(Bucket=self.target_bucket_name, Key=file_key)
             tmp_file_key = f"/tmp/{file_key}"
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key, 'wb') as f:
-                f.write(response['Body'].read())
-            with open(tmp_file_key, 'r', encoding='utf-8') as f:
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
+            with open(tmp_file_key, "r", encoding="utf-8") as f:
                 file_content = f.read()
             return file_content
         except ClientError as e:

@@ -48,7 +55,6 @@ class S3StorageService(PersistenceService):
             logger.error(f"Unexpected error loading file {file_key} from S3: {str(e)}")
             raise
 
-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from S3 storage.
 
@@ -67,18 +73,21 @@ class S3StorageService(PersistenceService):
             tmp_file_key = f"/tmp/{file_key}"
             # Create parent directories if they don't exist
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key, 'wb') as f:
-                f.write(response['Body'].read())
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
             return tmp_file_key
         except ClientError as e:
             logger.error(f"Error retrieving file {file_key} from S3: {str(e)}")
             raise
         except Exception as e:
-            logger.error(f"Unexpected error retrieving file {file_key} from S3: {str(e)}")
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from S3: {str(e)}"
+            )
             raise
 
-
-    def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document to S3.
 
         Args:

@@ -91,21 +100,21 @@ class S3StorageService(PersistenceService):
         """
         try:
             # Convert document content to bytes
-            content_bytes = parsed_document.document_text.encode('utf-8')
+            content_bytes = parsed_document.document_text.encode("utf-8")
             # Upload the file to S3
             if not file_tags:
                 self.s3.put_object(
-                    Bucket=self.target_bucket_name,
-                    Key=file_key,
-                    Body=content_bytes
+                    Bucket=self.target_bucket_name, Key=file_key, Body=content_bytes
                 )
             else:
-                tagging_string = "&".join([f"{key}={value}" for key, value in file_tags.items()])
+                tagging_string = "&".join(
+                    [f"{key}={value}" for key, value in file_tags.items()]
+                )
                 self.s3.put_object(
                     Bucket=self.target_bucket_name,
                     Key=file_key,
                     Body=content_bytes,
-                    Tagging=tagging_string
+                    Tagging=tagging_string,
                 )
 
             logger.info(f"Successfully saved document to S3 as {file_key}")

@@ -122,8 +131,5 @@ class S3StorageService(PersistenceService):
         Args:
             file_key: The key (path) to retrieve tags
         """
-        response = self.s3.get_object_tagging(
-            Bucket=bucket_name,
-            Key=file_key
-        )
+        response = self.s3.get_object_tagging(Bucket=bucket_name, Key=file_key)
         return {item["Key"]: item["Value"] for item in response["TagSet"]}
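
save_parsed_document now serializes file_tags into S3's Tagging string ("key1=value1&key2=value2"). A sketch of the tagged path under the same ParsedDoc assumption as above; bucket names are placeholders. Note the join does not URL-encode keys or values, so tags containing "&" or "=" would need encoding upstream:

from wizit_context_ingestor.infra.persistence.s3_storage import S3StorageService

s3_storage = S3StorageService(
    origin_bucket_name="raw-docs",
    target_bucket_name="parsed-docs",
    region_name="us-east-1",
)
# put_object receives Tagging="source=crm&lang=en"
s3_storage.save_parsed_document(
    "doc.md", parsed, file_tags={"source": "crm", "lang": "en"}
)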
wizit_context_ingestor/infra/rag/chroma_embeddings.py

@@ -1,25 +1,21 @@
-from typing_extensions import Sequence
-from test.test_typing import CoolEmployee
 from langchain_core.documents import Document
 from langchain_chroma import Chroma
-from typing import List
 import logging
-from uuid import uuid4
 from ...application.interfaces import EmbeddingsManager
 
 # load_dotenv()
 
 logger = logging.getLogger(__name__)
 
+
 class ChromaEmbeddingsManager(EmbeddingsManager):
+    __slots__ = ("embeddings_model", "collection_name")
 
-    __slots__ = ("embeddings_model", "chroma_host", "collection_name", "metadata_tags")
     def __init__(
         self,
         embeddings_model,
-        chroma_host,
-        collection_name: str,
-        metadata_tags: dict
+        chroma_host=None,
+        **chroma_conn_kwargs,
     ):
         """
         Initialize the ChromaEmbeddingsManager.

@@ -27,33 +23,28 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             embeddings_model: The embeddings model to use for generating vector embeddings
                 (typically a LangChain embeddings model instance)
             chroma_host: The Chroma host URL
-            collection_name: The Chroma collection name
-            metadata_tags: Tags to add as metadata to Chroma vector store
 
         Raises:
             Exception: If there's an error initializing the RedisEmbeddingsManager
         """
-        self.collection_name = collection_name
         self.embeddings_model = embeddings_model
         self.chroma_host = chroma_host
-        self.metadata_tags_schema = []
-
-        for tag_key in metadata_tags:
-          self.metadata_tags_schema.append({
-              "type": "tag",
-              "name": tag_key
-          })
-
         try:
-          self.chroma = Chroma(
-              collection_name=self.collection_name,
-              embedding_function=self.embeddings_model,
-              host=self.chroma_host,
-          )
-          logger.info("ChromaEmbeddingsManager initialized")
+            if chroma_host:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model,
+                    host=chroma_host,
+                    **chroma_conn_kwargs,
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+            else:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model, **chroma_conn_kwargs
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
         except Exception as e:
-          logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
-          raise
+            logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+            raise
 
     def configure_vector_store(
         self,

@@ -61,8 +52,6 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
         vector_size: int = 768,
         content_column: str = "document",
         id_column: str = "id",
-        metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = ""
     ):
         """Configure the vector store."""
         pass

@@ -71,13 +60,11 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
         self,
         table_name: str = "",
         content_column: str = "document",
-        metadata_json_column: str = "cmetadata",
         id_column: str = "id",
     ):
         """Initialize the vector store."""
         pass
 
-
     def index_documents(self, documents: list[Document]):
         """
         Add documents to the vector store with their embeddings.

@@ -123,6 +110,18 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise
 
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        try:
+            self.chroma.delete(where={metadata_key: metadata_value})
+        except Exception as error:
+            logger.error(
+                f"Error deleting documents by filter: {str(filter)}, error: {error} "
+            )
+            raise
+
     def get_documents_keys_by_source_id(self, source_id: str):
         """Get documents keys by source ID."""
         pass
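
The Chroma manager drops its explicit collection_name and metadata_tags parameters; everything beyond chroma_host is now forwarded to Chroma via **chroma_conn_kwargs. A sketch under that assumption (embeddings_model stands in for any LangChain embeddings instance):

from wizit_context_ingestor.infra.rag.chroma_embeddings import ChromaEmbeddingsManager

manager = ChromaEmbeddingsManager(
    embeddings_model,
    chroma_host="localhost",     # omit to build the client without a host
    collection_name="contexts",  # passed through **chroma_conn_kwargs
)
# New in 0.3.0b2: drop every vector whose metadata matches a key/value pair
manager.delete_documents_by_metadata_key("source", "doc.md")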
wizit_context_ingestor/infra/rag/pg_embeddings.py

@@ -6,6 +6,7 @@ from langchain_postgres import PGVectorStore, PGEngine
 from sqlalchemy import create_engine
 from dotenv import load_dotenv
 from wizit_context_ingestor.application.interfaces import EmbeddingsManager
+
 load_dotenv()
 
 logger = logging.getLogger(__name__)

@@ -38,19 +39,21 @@ class PgEmbeddingsManager(EmbeddingsManager):
         ... )
         >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
     """
+
     __slots__ = ("embeddings_model", "pg_connection")
+
     def __init__(self, embeddings_model, pg_connection: str):
         """
-      Initialize the PgEmbeddingsManager.
+        Initialize the PgEmbeddingsManager.
 
-      Args:
-          embeddings_model: The embeddings model to use for generating vector embeddings
-              (typically a LangChain embeddings model instance)
-          pg_connection: The PostgreSQL connection string
-              (format: postgresql://user:password@host:port/database)
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            pg_connection: The PostgreSQL connection string
+                (format: postgresql://user:password@host:port/database)
 
-      Raises:
-          Exception: If there's an error initializing the vector store
+        Raises:
+            Exception: If there's an error initializing the vector store
         """
         self.pg_connection = pg_connection
         self.embeddings_model = embeddings_model

@@ -58,65 +61,65 @@ class PgEmbeddingsManager(EmbeddingsManager):
         self.vector_store = None
         self.record_manager = None
         try:
-          self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
-          logger.info("PgEmbeddingsManager initialized")
+            self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
+            logger.info("PgEmbeddingsManager initialized")
         except Exception as e:
             logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
             raise
 
     def configure_vector_store(
-      self,
-      table_name: str = "langchain_pg_embedding",
-      vector_size: int = 768,
-      content_column: str = "document",
-      id_column: str = "id",
-      metadata_json_column: str = "cmetadata",
-      pg_record_manager: str = "postgres/langchain_pg_collection"
+        self,
+        table_name: str = "langchain_pg_embedding",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-      self.pg_engine.init_vectorstore_table(
-          table_name=table_name,
-          vector_size=vector_size,
-          content_column=content_column,
-          id_column=id_column,
-          metadata_json_column=metadata_json_column,
-      )
-      self.record_manager = SQLRecordManager(
-          pg_record_manager,
-          engine=create_engine(url=self.pg_connection)
-      )
-      # TODO move this from here
-      self.record_manager.create_schema()
+        self.pg_engine.init_vectorstore_table(
+            table_name=table_name,
+            vector_size=vector_size,
+            content_column=content_column,
+            id_column=id_column,
+            metadata_json_column=metadata_json_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
+        # TODO move this from here
+        self.record_manager.create_schema()
 
     def init_vector_store(
-      self,
-      table_name: str = "langchain_pg_embedding",
-      content_column: str = "document",
-      metadata_json_column: str = "cmetadata",
-      id_column: str = "id",
-      pg_record_manager: str = "postgres/langchain_pg_collection"
+        self,
+        table_name: str = "langchain_pg_embedding",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
    ):
-      self.vector_store = PGVectorStore.create_sync(
-          embedding_service=self.embeddings_model,
-          engine=self.pg_engine,
-          table_name=table_name,
-          content_column=content_column,
-          metadata_json_column=metadata_json_column,
-          id_column=id_column,
-      )
-      self.record_manager = SQLRecordManager(
-          pg_record_manager,
-          engine=create_engine(url=self.pg_connection)
-      )
+        self.vector_store = PGVectorStore.create_sync(
+            embedding_service=self.embeddings_model,
+            engine=self.pg_engine,
+            table_name=table_name,
+            content_column=content_column,
+            metadata_json_column=metadata_json_column,
+            id_column=id_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
 
     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-          # Common validation logic
-          if self.vector_store is None:
-              raise Exception("Vector store not initialized")
-          if self.record_manager is None:
-              raise Exception("Record manager not initialized")
-          return func(self, *args, **kwargs)
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            if self.record_manager is None:
+                raise Exception("Record manager not initialized")
+            return func(self, *args, **kwargs)
+
         return wrapper
 
     @vector_store_initialized
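
The Postgres manager keeps its two-step setup, now black-formatted: configure_vector_store() provisions the embedding table and the record-manager schema once, and init_vector_store() binds the PGVectorStore that the @vector_store_initialized methods require. A sketch with a placeholder DSN; index_documents is assumed from the EmbeddingsManager interface, since it is not shown in this hunk:

manager = PgEmbeddingsManager(
    embeddings_model,
    "postgresql://user:password@localhost:5432/vectors",  # placeholder
)
manager.configure_vector_store()    # one-time: table + record-manager schema
manager.init_vector_store()         # per-process: opens the PGVectorStore
manager.index_documents(documents)  # guarded by @vector_store_initialized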
wizit_context_ingestor/infra/rag/redis_embeddings.py

@@ -2,6 +2,7 @@ from langchain_core.documents import Document
 from langchain_redis import RedisConfig, RedisVectorStore
 from typing import List
 import logging
+
 # from dotenv import load_dotenv
 from ...application.interfaces import EmbeddingsManager
 

@@ -9,10 +10,13 @@ from ...application.interfaces import EmbeddingsManager
 
 logger = logging.getLogger(__name__)
 
-class RedisEmbeddingsManager(EmbeddingsManager):
 
+class RedisEmbeddingsManager(EmbeddingsManager):
     __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
-    def __init__(self, embeddings_model, redis_conn_string: str, metadata_tags: dict):
+
+    def __init__(
+        self, embeddings_model, redis_conn_string: str, metadata_tags: List[str] = []
+    ):
         """
         Initialize the RedisEmbeddingsManager.
         Args:

@@ -27,27 +31,23 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         """
         self.redis_conn_string = redis_conn_string
         self.embeddings_model = embeddings_model
-        self.metadata_tags_schema = []
-
+        self.metadata_tags_schema = [{"type": "text", "name": "context"}]
         for tag_key in metadata_tags:
-          self.metadata_tags_schema.append({
-              "type": "tag",
-              "name": tag_key
-          })
+            self.metadata_tags_schema.append({"type": "text", "name": tag_key})
 
         try:
-          self.redis_config = RedisConfig(
-              index_name="vector_store",
-              redis_url=self.redis_conn_string,
-              metadata_schema=[
-                  {"type": "text", "name": "context"}
-              ]+self.metadata_tags_schema,
-          )
-          self.vector_store = RedisVectorStore(self.embeddings_model, config=self.redis_config)
-          logger.info("RedisEmbeddingsManager initialized")
+            self.redis_config = RedisConfig(
+                index_name="vector_store",
+                redis_url=self.redis_conn_string,
+                metadata_schema=self.metadata_tags_schema,
+            )
+            self.vector_store = RedisVectorStore(
+                self.embeddings_model, config=self.redis_config
+            )
+            logger.info("RedisEmbeddingsManager initialized")
         except Exception as e:
-          logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
-          raise
+            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
+            raise
 
     def configure_vector_store(
         self,

@@ -56,7 +56,7 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         content_column: str = "document",
         id_column: str = "id",
         metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
         """Configure the vector store."""
         pass

@@ -73,13 +73,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
 
     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-          # Common validation logic
-          if self.vector_store is None:
-              raise Exception("Vector store not initialized")
-          return func(self, *args, **kwargs)
-        return wrapper
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            return func(self, *args, **kwargs)
 
+        return wrapper
 
     @vector_store_initialized
     def index_documents(self, docs: List[Document]):

@@ -129,6 +130,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise
 
+    @vector_store_initialized
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        # TODO investigate how to do this
+        pass
+
     def get_documents_keys_by_source_id(self, source_id: str):
         """Get documents keys by source ID."""
         pass
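
metadata_tags changes from a dict to a plain list of field names, and each entry is now indexed as a Redis text field (previously tag) alongside the built-in "context" field. A sketch with a placeholder connection string:

manager = RedisEmbeddingsManager(
    embeddings_model,
    "redis://localhost:6379",
    metadata_tags=["source", "lang"],  # indexed as text fields next to "context"
)
manager.index_documents(docs)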
wizit_context_ingestor/infra/rag/semantic_chunks.py

@@ -1,3 +1,5 @@
+from posix import fork
+
 # check this documentation
 # https://python.langchain.com/docs/how_to/semantic-chunker/
 # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

@@ -16,7 +18,9 @@ class SemanticChunks(RagChunker):
     Class for semantically chunking documents into smaller pieces based on semantic similarity.
     Uses LangChain's SemanticChunker to create semantically coherent document chunks.
     """
+
     __slots__ = ("embeddings_model",)
+
     def __init__(self, embeddings_model: Any):
         """
         Initialize a document chunker with an embeddings model.

@@ -35,7 +39,7 @@ class SemanticChunks(RagChunker):
             add_start_index=True,
             breakpoint_threshold_type="percentile",
             breakpoint_threshold_amount=95,
-            min_chunk_size=200
+            min_chunk_size=200,
         )
 
     def gen_chunks_for_document(self, document: Document) -> List[Document]:

@@ -53,6 +57,10 @@ class SemanticChunks(RagChunker):
         """
         try:
             chunks = self.text_splitter.split_documents([document])
+            source = document.metadata["source"]
+            for i, chunk in enumerate(chunks):
+                if document.metadata["source"]:
+                    chunk.id = f"{source}-{i}"
             logger.info(f"{len(chunks)} chunks generated successfully")
             return chunks
         except Exception as e:
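
gen_chunks_for_document now assigns deterministic chunk IDs of the form "{source}-{index}" from the document's source metadata; a document without a "source" key would raise KeyError in the added lines. A sketch (embeddings_model is again any LangChain embeddings instance):

from langchain_core.documents import Document

chunker = SemanticChunks(embeddings_model)
doc = Document(page_content="Long text to split ...", metadata={"source": "doc.md"})
chunks = chunker.gen_chunks_for_document(doc)
# chunks[0].id == "doc.md-0", chunks[1].id == "doc.md-1", ...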