wizit-context-ingestor 0.2.5b3__py3-none-any.whl → 0.3.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -2
- wizit_context_ingestor/application/context_chunk_service.py +149 -35
- wizit_context_ingestor/application/transcription_service.py +132 -52
- wizit_context_ingestor/data/kdb.py +10 -0
- wizit_context_ingestor/data/prompts.py +150 -3
- wizit_context_ingestor/data/storage.py +10 -0
- wizit_context_ingestor/infra/persistence/local_storage.py +19 -9
- wizit_context_ingestor/infra/persistence/s3_storage.py +29 -23
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +30 -31
- wizit_context_ingestor/infra/rag/pg_embeddings.py +57 -54
- wizit_context_ingestor/infra/rag/redis_embeddings.py +34 -25
- wizit_context_ingestor/infra/rag/semantic_chunks.py +9 -1
- wizit_context_ingestor/infra/vertex_model.py +56 -28
- wizit_context_ingestor/main.py +192 -106
- wizit_context_ingestor/utils/file_utils.py +13 -0
- wizit_context_ingestor/workflows/context_nodes.py +73 -0
- wizit_context_ingestor/workflows/context_state.py +10 -0
- wizit_context_ingestor/workflows/context_tools.py +58 -0
- wizit_context_ingestor/workflows/context_workflow.py +42 -0
- wizit_context_ingestor/workflows/transcription_nodes.py +136 -0
- wizit_context_ingestor/workflows/transcription_schemas.py +25 -0
- wizit_context_ingestor/workflows/transcription_state.py +17 -0
- wizit_context_ingestor/workflows/transcription_tools.py +54 -0
- wizit_context_ingestor/workflows/transcription_workflow.py +42 -0
- {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/METADATA +9 -1
- wizit_context_ingestor-0.3.0b2.dist-info/RECORD +44 -0
- {wizit_context_ingestor-0.2.5b3.dist-info → wizit_context_ingestor-0.3.0b2.dist-info}/WHEEL +1 -1
- wizit_context_ingestor-0.2.5b3.dist-info/RECORD +0 -32
wizit_context_ingestor/infra/persistence/local_storage.py

@@ -3,22 +3,27 @@ from ...domain.models import ParsedDoc
 from typing import Optional
 import logging
 import os
+
 logger = logging.getLogger(__name__)

+
 class LocalStorageService(PersistenceService):
     """Persistence service for local storage."""

-    def __init__(self):
-        self.
+    def __init__(self, source_storage_route: str, target_storage_route: str):
+        self.source_storage_route = source_storage_route
+        self.target_storage_route = target_storage_route
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")

     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from local storage."""
         file_content = None
-        with open(
+        with open(
+            f"{self.source_storage_route}/{file_key}", "r", encoding="utf-8"
+        ) as file:
             file_content = file.read()
         return file_content

-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from local storage.


@@ -32,16 +37,21 @@ class LocalStorageService(PersistenceService):
             ClientError: If there's an error retrieving the object from local storage
         """
         try:
-            tmp_file_path = f"{self.
+            tmp_file_path = f"{self.source_storage_route}/{file_key}"
             if not os.path.exists(tmp_file_path):
                 raise FileNotFoundError(f"File {file_key} not found in local storage")
             return tmp_file_path
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from local storage: {str(e)}"
+            )
             raise

-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document."""
-        with open(
+        with open(
+            f"{self.target_storage_route}/{file_key}", "w", encoding="utf-8"
+        ) as f:
             f.write(parsed_document.document_text)
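As these hunks read, LocalStorageService trades its no-argument constructor for explicit source and target routes, and save_parsed_document now accepts the file_tags argument from the shared PersistenceService interface (accepted but unused for local storage). A minimal usage sketch; the directory paths and file names are placeholders, and parsed_doc stands in for a ParsedDoc carrying the document_text attribute seen in the diff:

    from wizit_context_ingestor.infra.persistence.local_storage import LocalStorageService

    storage = LocalStorageService(
        source_storage_route="/data/in",   # hypothetical input directory
        target_storage_route="/data/out",  # hypothetical output directory
    )
    text = storage.load_markdown_file_content("notes.md")  # reads /data/in/notes.md
    storage.save_parsed_document("notes.md", parsed_doc)   # writes /data/out/notes.md

Note that supports_tagging is computed as hasattr(self, "retrieve_file_tags"), so it evaluates to False here and to True for S3StorageService, which appears to define that method (the get_object_tagging wrapper at the end of its diff below).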
wizit_context_ingestor/infra/persistence/s3_storage.py

@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)

 class S3StorageService(PersistenceService):
     """Persistence service for S3 storage."""
-
-
-
+
+    __slots__ = ("origin_bucket_name", "target_bucket_name", "region_name")
+
+    def __init__(
+        self,
+        origin_bucket_name: str,
+        target_bucket_name: str,
+        region_name: str = "us-east-1",
+    ):
+        self.s3 = boto3_client("s3", region_name=region_name)
         self.origin_bucket_name = origin_bucket_name
         self.target_bucket_name = target_bucket_name
-
+        self.supports_tagging = hasattr(self, "retrieve_file_tags")

     def load_markdown_file_content(self, file_key: str) -> str:
         """Load markdown file content from S3 storage.

@@ -36,9 +43,9 @@ class S3StorageService(PersistenceService):
             response = self.s3.get_object(Bucket=self.target_bucket_name, Key=file_key)
             tmp_file_key = f"/tmp/{file_key}"
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
-            with open(tmp_file_key,
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
+            with open(tmp_file_key, "r", encoding="utf-8") as f:
                 file_content = f.read()
             return file_content
         except ClientError as e:

@@ -48,7 +55,6 @@ class S3StorageService(PersistenceService):
             logger.error(f"Unexpected error loading file {file_key} from S3: {str(e)}")
             raise

-
     def retrieve_raw_file(self, file_key: str) -> str:
         """Retrieve file path in tmp folder from S3 storage.


@@ -67,18 +73,21 @@ class S3StorageService(PersistenceService):
             tmp_file_key = f"/tmp/{file_key}"
             # Create parent directories if they don't exist
             os.makedirs(os.path.dirname(tmp_file_key), exist_ok=True)
-            with open(tmp_file_key,
-                f.write(response[
+            with open(tmp_file_key, "wb") as f:
+                f.write(response["Body"].read())
             return tmp_file_key
         except ClientError as e:
             logger.error(f"Error retrieving file {file_key} from S3: {str(e)}")
             raise
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Unexpected error retrieving file {file_key} from S3: {str(e)}"
+            )
             raise

-
-
+    def save_parsed_document(
+        self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
+    ):
         """Save a parsed document to S3.

         Args:

@@ -91,21 +100,21 @@ class S3StorageService(PersistenceService):
         """
         try:
             # Convert document content to bytes
-            content_bytes = parsed_document.document_text.encode(
+            content_bytes = parsed_document.document_text.encode("utf-8")
             # Upload the file to S3
             if not file_tags:
                 self.s3.put_object(
-                    Bucket=self.target_bucket_name,
-                    Key=file_key,
-                    Body=content_bytes
+                    Bucket=self.target_bucket_name, Key=file_key, Body=content_bytes
                 )
             else:
-                tagging_string = "&".join(
+                tagging_string = "&".join(
+                    [f"{key}={value}" for key, value in file_tags.items()]
+                )
                 self.s3.put_object(
                     Bucket=self.target_bucket_name,
                     Key=file_key,
                     Body=content_bytes,
-                    Tagging=tagging_string
+                    Tagging=tagging_string,
                 )

             logger.info(f"Successfully saved document to S3 as {file_key}")

@@ -122,8 +131,5 @@ class S3StorageService(PersistenceService):
         Args:
             file_key: The key (path) to retrieve tags
         """
-        response = self.s3.get_object_tagging(
-            Bucket=bucket_name,
-            Key=file_key
-        )
+        response = self.s3.get_object_tagging(Bucket=bucket_name, Key=file_key)
         return {item["Key"]: item["Value"] for item in response["TagSet"]}
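The S3 service now builds its boto3 client in __init__ with a configurable region (defaulting to us-east-1) and passes object tags through put_object's Tagging parameter. A usage sketch; the bucket names, key, and tag values are placeholders, and parsed_doc again stands in for a ParsedDoc instance:

    from wizit_context_ingestor.infra.persistence.s3_storage import S3StorageService

    storage = S3StorageService(
        origin_bucket_name="raw-docs",     # hypothetical source bucket
        target_bucket_name="parsed-docs",  # hypothetical destination bucket
    )
    storage.save_parsed_document(
        "reports/2024/summary.md",
        parsed_doc,
        file_tags={"team": "ingestion", "stage": "parsed"},
    )

One caveat worth knowing: S3 expects the Tagging value to be URL-encoded, so tag keys or values containing characters such as "&", "=", or spaces would need quoting that the plain f"{key}={value}" join here does not apply.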
wizit_context_ingestor/infra/rag/chroma_embeddings.py

@@ -1,25 +1,21 @@
-from typing_extensions import Sequence
-from test.test_typing import CoolEmployee
 from langchain_core.documents import Document
 from langchain_chroma import Chroma
-from typing import List
 import logging
-from uuid import uuid4
 from ...application.interfaces import EmbeddingsManager

 # load_dotenv()

 logger = logging.getLogger(__name__)

+
 class ChromaEmbeddingsManager(EmbeddingsManager):
+    __slots__ = ("embeddings_model", "collection_name")

-    __slots__ = ("embeddings_model", "chroma_host", "collection_name", "metadata_tags")
     def __init__(
         self,
         embeddings_model,
-        chroma_host,
-
-        metadata_tags: dict
+        chroma_host=None,
+        **chroma_conn_kwargs,
     ):
         """
         Initialize the ChromaEmbeddingsManager.

@@ -27,33 +23,28 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             embeddings_model: The embeddings model to use for generating vector embeddings
                 (typically a LangChain embeddings model instance)
             chroma_host: The Chroma host URL
-            collection_name: The Chroma collection name
-            metadata_tags: Tags to add as metadata to Chroma vector store

         Raises:
             Exception: If there's an error initializing the RedisEmbeddingsManager
         """
-        self.collection_name = collection_name
         self.embeddings_model = embeddings_model
         self.chroma_host = chroma_host
-        self.metadata_tags_schema = []
-
-        for tag_key in metadata_tags:
-            self.metadata_tags_schema.append({
-                "type": "tag",
-                "name": tag_key
-            })
-
         try:
-
-
-
-
-
-
+            if chroma_host:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model,
+                    host=chroma_host,
+                    **chroma_conn_kwargs,
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
+            else:
+                self.chroma = Chroma(
+                    embedding_function=self.embeddings_model, **chroma_conn_kwargs
+                )
+                logger.info("ChromaEmbeddingsManager initialized")
         except Exception as e:
-
-
+            logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
+            raise

     def configure_vector_store(
         self,

@@ -61,8 +52,6 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
         vector_size: int = 768,
         content_column: str = "document",
         id_column: str = "id",
-        metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = ""
     ):
         """Configure the vector store."""
         pass

@@ -71,13 +60,11 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
         self,
         table_name: str = "",
         content_column: str = "document",
-        metadata_json_column: str = "cmetadata",
         id_column: str = "id",
     ):
         """Initialize the vector store."""
         pass

-
     def index_documents(self, documents: list[Document]):
         """
         Add documents to the vector store with their embeddings.

@@ -123,6 +110,18 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise

+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        try:
+            self.chroma.delete(where={metadata_key: metadata_value})
+        except Exception as error:
+            logger.error(
+                f"Error deleting documents by filter: {str(filter)}, error: {error} "
+            )
+            raise
+
     def get_documents_keys_by_source_id(self, source_id: str):
         """Get documents keys by source ID."""
         pass
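The rewritten constructor drops the stray imports (typing_extensions.Sequence, test.test_typing.CoolEmployee, uuid4) and the homemade metadata-tag schema, and instead forwards arbitrary keyword arguments straight to langchain_chroma's Chroma. A sketch of how that would be called; collection_name and persist_directory are standard Chroma kwargs riding through **chroma_conn_kwargs, and all values are placeholders:

    remote = ChromaEmbeddingsManager(
        embeddings_model=embeddings,
        chroma_host="chroma.internal",  # hypothetical Chroma server
        collection_name="contexts",
    )
    local = ChromaEmbeddingsManager(
        embeddings_model=embeddings,
        collection_name="contexts",
        persist_directory="./chroma_db",  # hypothetical local store
    )
    local.delete_documents_by_metadata_key("source", "docs/old-guide.md")

Two details in this file are worth flagging: the error message in delete_documents_by_metadata_key interpolates str(filter), which is the Python builtin rather than the method's arguments, and the __init__ docstring still says "RedisEmbeddingsManager" where it means the Chroma manager.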
wizit_context_ingestor/infra/rag/pg_embeddings.py

@@ -6,6 +6,7 @@ from langchain_postgres import PGVectorStore, PGEngine
 from sqlalchemy import create_engine
 from dotenv import load_dotenv
 from wizit_context_ingestor.application.interfaces import EmbeddingsManager
+
 load_dotenv()

 logger = logging.getLogger(__name__)

@@ -38,19 +39,21 @@ class PgEmbeddingsManager(EmbeddingsManager):
     ... )
     >>> documents = [Document(page_content="Sample text", metadata={"source": "example"})]
     """
+
     __slots__ = ("embeddings_model", "pg_connection")
+
     def __init__(self, embeddings_model, pg_connection: str):
         """
-
+        Initialize the PgEmbeddingsManager.

-
-
-
-
-
+        Args:
+            embeddings_model: The embeddings model to use for generating vector embeddings
+                (typically a LangChain embeddings model instance)
+            pg_connection: The PostgreSQL connection string
+                (format: postgresql://user:password@host:port/database)

-
-
+        Raises:
+            Exception: If there's an error initializing the vector store
         """
         self.pg_connection = pg_connection
         self.embeddings_model = embeddings_model

@@ -58,65 +61,65 @@ class PgEmbeddingsManager(EmbeddingsManager):
         self.vector_store = None
         self.record_manager = None
         try:
-
-
+            self.pg_engine = PGEngine.from_connection_string(url=pg_connection)
+            logger.info("PgEmbeddingsManager initialized")
         except Exception as e:
             logger.error(f"Failed to initialize PgEmbeddingsManager: {str(e)}")
             raise

     def configure_vector_store(
-
-
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        vector_size: int = 768,
+        content_column: str = "document",
+        id_column: str = "id",
+        metadata_json_column: str = "cmetadata",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-        self.record_manager.create_schema()
+        self.pg_engine.init_vectorstore_table(
+            table_name=table_name,
+            vector_size=vector_size,
+            content_column=content_column,
+            id_column=id_column,
+            metadata_json_column=metadata_json_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )
+        # TODO move this from here
+        self.record_manager.create_schema()

     def init_vector_store(
-
-
-
-
-
+        self,
+        table_name: str = "langchain_pg_embedding",
+        content_column: str = "document",
+        metadata_json_column: str = "cmetadata",
+        id_column: str = "id",
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
-
-
-
-
-
-
-
-
-
-
-
-        )
+        self.vector_store = PGVectorStore.create_sync(
+            embedding_service=self.embeddings_model,
+            engine=self.pg_engine,
+            table_name=table_name,
+            content_column=content_column,
+            metadata_json_column=metadata_json_column,
+            id_column=id_column,
+        )
+        self.record_manager = SQLRecordManager(
+            pg_record_manager, engine=create_engine(url=self.pg_connection)
+        )

     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-
-
-
-
-
-
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            if self.record_manager is None:
+                raise Exception("Record manager not initialized")
+            return func(self, *args, **kwargs)
+
         return wrapper

     @vector_store_initialized
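The Postgres manager now splits setup into a one-time schema step and a per-process binding step: configure_vector_store creates the embedding table through PGEngine.init_vectorstore_table and the SQLRecordManager schema, while init_vector_store attaches a PGVectorStore via create_sync plus a fresh SQLRecordManager. A sketch of the intended call order; the connection string is a placeholder, SQLRecordManager is assumed to be imported elsewhere in the module (it is not visible in these hunks), and the final call assumes an indexing method guarded by the @vector_store_initialized decorator shown above:

    manager = PgEmbeddingsManager(
        embeddings_model=embeddings,
        pg_connection="postgresql://user:password@localhost:5432/rag",  # placeholder
    )
    manager.configure_vector_store()  # one-time: creates table + record-manager schema
    manager.init_vector_store()       # per-process: binds PGVectorStore + SQLRecordManager
    manager.index_documents(documents)  # assumed method, decorated with the guard above

The decorator raises unless both the vector store and the record manager have been initialized, which is why init_vector_store must run before any indexing call.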
wizit_context_ingestor/infra/rag/redis_embeddings.py

@@ -2,6 +2,7 @@ from langchain_core.documents import Document
 from langchain_redis import RedisConfig, RedisVectorStore
 from typing import List
 import logging
+
 # from dotenv import load_dotenv
 from ...application.interfaces import EmbeddingsManager


@@ -9,10 +10,13 @@ from ...application.interfaces import EmbeddingsManager

 logger = logging.getLogger(__name__)

-class RedisEmbeddingsManager(EmbeddingsManager):

+class RedisEmbeddingsManager(EmbeddingsManager):
     __slots__ = ("embeddings_model", "redis_conn_string", "metadata_tags")
-
+
+    def __init__(
+        self, embeddings_model, redis_conn_string: str, metadata_tags: List[str] = []
+    ):
         """
         Initialize the RedisEmbeddingsManager.
         Args:

@@ -27,27 +31,23 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         """
         self.redis_conn_string = redis_conn_string
         self.embeddings_model = embeddings_model
-        self.metadata_tags_schema = []
-
+        self.metadata_tags_schema = [{"type": "text", "name": "context"}]
         for tag_key in metadata_tags:
-
-                "type": "tag",
-                "name": tag_key
-            })
+            self.metadata_tags_schema.append({"type": "text", "name": tag_key})

         try:
-
-
-
-
-
-
-
-
-
+            self.redis_config = RedisConfig(
+                index_name="vector_store",
+                redis_url=self.redis_conn_string,
+                metadata_schema=self.metadata_tags_schema,
+            )
+            self.vector_store = RedisVectorStore(
+                self.embeddings_model, config=self.redis_config
+            )
+            logger.info("RedisEmbeddingsManager initialized")
         except Exception as e:
-
-
+            logger.error(f"Failed to initialize RedisEmbeddingsManager: {str(e)}")
+            raise

     def configure_vector_store(
         self,

@@ -56,7 +56,7 @@ class RedisEmbeddingsManager(EmbeddingsManager):
         content_column: str = "document",
         id_column: str = "id",
         metadata_json_column: str = "cmetadata",
-        pg_record_manager: str = "postgres/langchain_pg_collection"
+        pg_record_manager: str = "postgres/langchain_pg_collection",
     ):
         """Configure the vector store."""
         pass

@@ -73,13 +73,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):

     def vector_store_initialized(func):
         """validate vector store initialization"""
+
         def wrapper(self, *args, **kwargs):
-
-
-
-
-        return wrapper
+            # Common validation logic
+            if self.vector_store is None:
+                raise Exception("Vector store not initialized")
+            return func(self, *args, **kwargs)

+        return wrapper

     @vector_store_initialized
     def index_documents(self, docs: List[Document]):

@@ -129,6 +130,14 @@ class RedisEmbeddingsManager(EmbeddingsManager):
             logger.error(f"Error deleting documents by ID: {str(e)}")
             raise

+    @vector_store_initialized
+    def delete_documents_by_metadata_key(self, metadata_key: str, metadata_value: str):
+        """
+        Delete documents by filter from the vector store.
+        """
+        # TODO investigate how to do this
+        pass
+
     def get_documents_keys_by_source_id(self, source_id: str):
         """Get documents keys by source ID."""
         pass
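With this change the Redis manager builds its vector store eagerly in __init__: every index gets a "context" text field in its metadata schema, plus one text field per entry in metadata_tags (the old version declared these as "tag" fields). A usage sketch; the connection string and tag names are placeholders:

    manager = RedisEmbeddingsManager(
        embeddings,
        "redis://localhost:6379",              # placeholder connection string
        metadata_tags=["source", "trace_id"],  # hypothetical extra metadata fields
    )
    manager.index_documents(docs)

Unlike the Chroma implementation, delete_documents_by_metadata_key is still a stub here (note the TODO in the hunk), so metadata-based deletion currently does nothing for Redis-backed stores.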
wizit_context_ingestor/infra/rag/semantic_chunks.py

@@ -1,3 +1,5 @@
+from posix import fork
+
 # check this documentation
 # https://python.langchain.com/docs/how_to/semantic-chunker/
 # https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

@@ -16,7 +18,9 @@ class SemanticChunks(RagChunker):
     Class for semantically chunking documents into smaller pieces based on semantic similarity.
     Uses LangChain's SemanticChunker to create semantically coherent document chunks.
     """
+
     __slots__ = ("embeddings_model",)
+
     def __init__(self, embeddings_model: Any):
         """
         Initialize a document chunker with an embeddings model.

@@ -35,7 +39,7 @@ class SemanticChunks(RagChunker):
             add_start_index=True,
             breakpoint_threshold_type="percentile",
             breakpoint_threshold_amount=95,
-            min_chunk_size=200
+            min_chunk_size=200,
         )

     def gen_chunks_for_document(self, document: Document) -> List[Document]:

@@ -53,6 +57,10 @@ class SemanticChunks(RagChunker):
         """
         try:
             chunks = self.text_splitter.split_documents([document])
+            source = document.metadata["source"]
+            for i, chunk in enumerate(chunks):
+                if document.metadata["source"]:
+                    chunk.id = f"{source}-{i}"
             logger.info(f"{len(chunks)} chunks generated successfully")
             return chunks
         except Exception as e: