wizit-context-ingestor 0.3.0b8__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wizit_context_ingestor/__init__.py +2 -1
- wizit_context_ingestor/application/context_chunk_service.py +9 -95
- wizit_context_ingestor/application/interfaces.py +52 -23
- wizit_context_ingestor/application/kdb_service.py +59 -0
- wizit_context_ingestor/data/kdb.py +5 -2
- wizit_context_ingestor/data/storage.py +1 -1
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +11 -6
- wizit_context_ingestor/infra/rag/pg_embeddings.py +117 -127
- wizit_context_ingestor/infra/rag/semantic_chunks.py +2 -1
- wizit_context_ingestor/main.py +2 -2
- wizit_context_ingestor/main_chunks.py +173 -0
- wizit_context_ingestor/utils/file_utils.py +6 -7
- {wizit_context_ingestor-0.3.0b8.dist-info → wizit_context_ingestor-0.4.0.dist-info}/METADATA +4 -1
- {wizit_context_ingestor-0.3.0b8.dist-info → wizit_context_ingestor-0.4.0.dist-info}/RECORD +15 -13
- {wizit_context_ingestor-0.3.0b8.dist-info → wizit_context_ingestor-0.4.0.dist-info}/WHEEL +0 -0
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from langchain_core.messages.human import HumanMessage
|
|
2
7
|
from langchain_core.output_parsers.pydantic import PydanticOutputParser
|
|
3
8
|
from langchain_core.prompts import ChatPromptTemplate
|
|
4
|
-
|
|
9
|
+
|
|
5
10
|
from ..data.prompts import CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT, ContextChunk
|
|
6
|
-
from langchain_core.messages.human import HumanMessage
|
|
7
11
|
from ..workflows.context_workflow import ContextWorkflow
|
|
8
|
-
from typing import Dict, Any, Optional, List
|
|
9
12
|
from .interfaces import (
|
|
10
13
|
AiApplicationService,
|
|
14
|
+
EmbeddingsManager,
|
|
11
15
|
PersistenceService,
|
|
12
16
|
RagChunker,
|
|
13
|
-
EmbeddingsManager,
|
|
14
17
|
)
|
|
15
|
-
import logging
|
|
16
|
-
|
|
17
18
|
|
|
18
19
|
logger = logging.getLogger(__name__)
|
|
19
20
|
|
|
@@ -39,7 +40,7 @@ class ContextChunksInDocumentService:
|
|
|
39
40
|
self.rag_chunker = rag_chunker
|
|
40
41
|
self.embeddings_manager = embeddings_manager
|
|
41
42
|
self.target_language = target_language
|
|
42
|
-
self.embeddings_manager.init_vector_store()
|
|
43
|
+
# self.embeddings_manager.init_vector_store()
|
|
43
44
|
self.chat_model = self.ai_application_service.load_chat_model()
|
|
44
45
|
# TODO
|
|
45
46
|
self.context_additional_instructions = ""
|
|
@@ -85,74 +86,6 @@ class ContextChunksInDocumentService:
|
|
|
85
86
|
logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
|
|
86
87
|
raise
|
|
87
88
|
|
|
88
|
-
# def _retrieve_context_chunk_in_document(
|
|
89
|
-
# self,
|
|
90
|
-
# markdown_content: str,
|
|
91
|
-
# chunk: Document,
|
|
92
|
-
# chunk_metadata: Optional[Dict[str, Any]] = None,
|
|
93
|
-
# ) -> Document:
|
|
94
|
-
# """Retrieve context chunks in document."""
|
|
95
|
-
# try:
|
|
96
|
-
# chunk_output_parser = PydanticOutputParser(pydantic_object=ContextChunk)
|
|
97
|
-
# # Create the prompt template with image
|
|
98
|
-
# prompt = ChatPromptTemplate.from_messages(
|
|
99
|
-
# [
|
|
100
|
-
# ("system", CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT),
|
|
101
|
-
# (
|
|
102
|
-
# "human",
|
|
103
|
-
# [
|
|
104
|
-
# {
|
|
105
|
-
# "type": "text",
|
|
106
|
-
# "text": f"Generate context for the following chunk: <chunk>{chunk.page_content}</chunk>, ensure all content chunks are generated in '{self.target_language}' language",
|
|
107
|
-
# }
|
|
108
|
-
# ],
|
|
109
|
-
# ),
|
|
110
|
-
# ]
|
|
111
|
-
# ).partial(
|
|
112
|
-
# document_content=markdown_content,
|
|
113
|
-
# format_instructions=chunk_output_parser.get_format_instructions(),
|
|
114
|
-
# )
|
|
115
|
-
# model_with_structured_output = self.chat_model.with_structured_output(
|
|
116
|
-
# ContextChunk
|
|
117
|
-
# )
|
|
118
|
-
# # Create the chain
|
|
119
|
-
# chain = prompt | model_with_structured_output
|
|
120
|
-
# # Process the image
|
|
121
|
-
# results = chain.invoke({})
|
|
122
|
-
# # chunk.page_content = (
|
|
123
|
-
# # f"Context:{results.context}, Content:{chunk.page_content}"
|
|
124
|
-
# # )
|
|
125
|
-
# chunk.metadata["context"] = results.context
|
|
126
|
-
# if chunk_metadata:
|
|
127
|
-
# for key, value in chunk_metadata.items():
|
|
128
|
-
# chunk.metadata[key] = value
|
|
129
|
-
# return chunk
|
|
130
|
-
|
|
131
|
-
# except Exception as e:
|
|
132
|
-
# logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
|
|
133
|
-
# raise
|
|
134
|
-
|
|
135
|
-
# def retrieve_context_chunks_in_document(
|
|
136
|
-
# self,
|
|
137
|
-
# markdown_content: str,
|
|
138
|
-
# chunks: List[Document],
|
|
139
|
-
# chunks_metadata: Optional[Dict[str, Any]] = None,
|
|
140
|
-
# ) -> List[Document]:
|
|
141
|
-
# """Retrieve context chunks in document."""
|
|
142
|
-
# try:
|
|
143
|
-
# context_chunks = list(
|
|
144
|
-
# map(
|
|
145
|
-
# lambda chunk: self._retrieve_context_chunk_in_document(
|
|
146
|
-
# markdown_content, chunk, chunks_metadata
|
|
147
|
-
# ),
|
|
148
|
-
# chunks,
|
|
149
|
-
# )
|
|
150
|
-
# )
|
|
151
|
-
# return context_chunks
|
|
152
|
-
# except Exception as e:
|
|
153
|
-
# logger.error(f"Failed to retrieve context chunks in document: {str(e)}")
|
|
154
|
-
# raise
|
|
155
|
-
|
|
156
89
|
async def retrieve_context_chunks_in_document_with_workflow(
|
|
157
90
|
self,
|
|
158
91
|
markdown_content: str,
|
|
@@ -205,26 +138,7 @@ class ContextChunksInDocumentService:
|
|
|
205
138
|
)
|
|
206
139
|
)
|
|
207
140
|
logger.info(f"Context chunks generated:{len(context_chunks)}")
|
|
208
|
-
# upsert validation
|
|
209
|
-
try:
|
|
210
|
-
print(f"deleting chunks: {file_key}")
|
|
211
|
-
self.delete_document_context_chunks(file_key)
|
|
212
|
-
except Exception as e:
|
|
213
|
-
logger.error(f"could not delete by source: {e}")
|
|
214
|
-
self.embeddings_manager.index_documents(context_chunks)
|
|
215
141
|
return context_chunks
|
|
216
142
|
except Exception as e:
|
|
217
|
-
logger.error("Error
|
|
218
|
-
raise e
|
|
219
|
-
|
|
220
|
-
def delete_document_context_chunks(self, file_key: str):
|
|
221
|
-
"""
|
|
222
|
-
Delete the context chunks in a document.
|
|
223
|
-
"""
|
|
224
|
-
try:
|
|
225
|
-
self.embeddings_manager.delete_documents_by_metadata_key(
|
|
226
|
-
self.metadata_source, file_key
|
|
227
|
-
)
|
|
228
|
-
except Exception as e:
|
|
229
|
-
logger.error(f"Error delete_document_context_chunks: {str(e)}")
|
|
143
|
+
logger.error(f"Error: {str(e)}")
|
|
230
144
|
raise e
|
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Application interfaces defining application layer contracts.
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
from abc import ABC, abstractmethod
|
|
5
|
-
from
|
|
6
|
-
|
|
7
|
-
from
|
|
6
|
+
from typing import List, Optional, Union
|
|
7
|
+
|
|
8
|
+
from langchain.indexes import IndexingResult, SQLRecordManager
|
|
8
9
|
from langchain_aws import ChatBedrockConverse
|
|
10
|
+
from langchain_core.documents import Document
|
|
9
11
|
from langchain_google_vertexai import ChatVertexAI
|
|
10
12
|
from langchain_google_vertexai.model_garden import ChatAnthropicVertex
|
|
13
|
+
from langchain_postgres import PGVectorStore
|
|
14
|
+
|
|
15
|
+
from ..domain.models import ParsedDoc, ParsedDocPage
|
|
16
|
+
|
|
11
17
|
|
|
12
18
|
class TranscriptionService(ABC):
|
|
13
19
|
"""Interface for transcription services."""
|
|
@@ -17,6 +23,7 @@ class TranscriptionService(ABC):
|
|
|
17
23
|
"""Parse a document page."""
|
|
18
24
|
pass
|
|
19
25
|
|
|
26
|
+
|
|
20
27
|
class AiApplicationService(ABC):
|
|
21
28
|
"""Interface for AI application services."""
|
|
22
29
|
|
|
@@ -26,7 +33,9 @@ class AiApplicationService(ABC):
|
|
|
26
33
|
# pass
|
|
27
34
|
|
|
28
35
|
@abstractmethod
|
|
29
|
-
def load_chat_model(
|
|
36
|
+
def load_chat_model(
|
|
37
|
+
self, **kwargs
|
|
38
|
+
) -> Union[ChatVertexAI, ChatAnthropicVertex, ChatBedrockConverse]:
|
|
30
39
|
"""Load a chat model."""
|
|
31
40
|
pass
|
|
32
41
|
|
|
@@ -40,7 +49,9 @@ class PersistenceService(ABC):
|
|
|
40
49
|
"""Interface for persistence services."""
|
|
41
50
|
|
|
42
51
|
@abstractmethod
|
|
43
|
-
def save_parsed_document(
|
|
52
|
+
def save_parsed_document(
|
|
53
|
+
self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}
|
|
54
|
+
):
|
|
44
55
|
"""Save a parsed document."""
|
|
45
56
|
pass
|
|
46
57
|
|
|
@@ -70,38 +81,56 @@ class EmbeddingsManager(ABC):
|
|
|
70
81
|
@abstractmethod
|
|
71
82
|
def configure_vector_store(
|
|
72
83
|
self,
|
|
73
|
-
table_name: str = "langchain_pg_embedding",
|
|
74
|
-
vector_size: int = 768,
|
|
75
|
-
content_column: str = "document",
|
|
76
|
-
id_column: str = "id",
|
|
77
|
-
metadata_json_column: str = "cmetadata",
|
|
78
|
-
pg_record_manager: str = "postgres/langchain_pg_collection"
|
|
79
84
|
):
|
|
80
85
|
"""Configure the vector store."""
|
|
81
86
|
pass
|
|
82
87
|
|
|
88
|
+
# @abstractmethod
|
|
89
|
+
# async def init_vector_store(
|
|
90
|
+
# self,
|
|
91
|
+
# table_name: str = "tenant_embeddings",
|
|
92
|
+
# content_column: str = "document",
|
|
93
|
+
# metadata_json_column: str = "cmetadata",
|
|
94
|
+
# id_column: str = "id",
|
|
95
|
+
# ):
|
|
96
|
+
# """Initialize the vector store."""
|
|
97
|
+
# pass
|
|
98
|
+
|
|
83
99
|
@abstractmethod
|
|
84
|
-
def
|
|
100
|
+
def retrieve_vector_store(
|
|
85
101
|
self,
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
metadata_json_column: str = "cmetadata",
|
|
89
|
-
id_column: str = "id",
|
|
90
|
-
):
|
|
91
|
-
"""Initialize the vector store."""
|
|
102
|
+
) -> tuple[PGVectorStore, SQLRecordManager]:
|
|
103
|
+
"""Retrieve the vector store."""
|
|
92
104
|
pass
|
|
93
105
|
|
|
94
106
|
@abstractmethod
|
|
95
|
-
def index_documents(
|
|
107
|
+
def index_documents(
|
|
108
|
+
self,
|
|
109
|
+
docs: list[Document],
|
|
110
|
+
) -> IndexingResult:
|
|
96
111
|
"""Index documents."""
|
|
97
112
|
pass
|
|
98
113
|
|
|
99
114
|
@abstractmethod
|
|
100
|
-
def
|
|
101
|
-
|
|
115
|
+
def search_records(
|
|
116
|
+
self,
|
|
117
|
+
query: str,
|
|
118
|
+
) -> list[Document]:
|
|
119
|
+
"""Search documents."""
|
|
102
120
|
pass
|
|
103
121
|
|
|
104
122
|
@abstractmethod
|
|
105
|
-
def
|
|
106
|
-
|
|
123
|
+
def create_index(
|
|
124
|
+
self,
|
|
125
|
+
):
|
|
107
126
|
pass
|
|
127
|
+
|
|
128
|
+
# @abstractmethod
|
|
129
|
+
# def get_documents_keys_by_source_id(self, source_id: str):
|
|
130
|
+
# """Get documents keys by source ID."""
|
|
131
|
+
# pass
|
|
132
|
+
|
|
133
|
+
# @abstractmethod
|
|
134
|
+
# def delete_documents_by_source_id(self, source_id: str):
|
|
135
|
+
# """Delete documents by source ID."""
|
|
136
|
+
# pass
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from langchain.indexes import SQLRecordManager
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
from langchain_postgres import PGVectorStore
|
|
6
|
+
|
|
7
|
+
from .interfaces import (
|
|
8
|
+
EmbeddingsManager,
|
|
9
|
+
RagChunker,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class KdbService:
|
|
16
|
+
"""
|
|
17
|
+
Service for chunking documents.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
embeddings_manager: EmbeddingsManager,
|
|
23
|
+
):
|
|
24
|
+
"""
|
|
25
|
+
Initialize the ChunkerService.
|
|
26
|
+
"""
|
|
27
|
+
self.embeddings_manager = embeddings_manager
|
|
28
|
+
self._vector_store = None
|
|
29
|
+
self._records_manager = None
|
|
30
|
+
|
|
31
|
+
def configure_kdb(self):
|
|
32
|
+
try:
|
|
33
|
+
self.embeddings_manager.configure_vector_store()
|
|
34
|
+
except Exception as e:
|
|
35
|
+
raise Exception(f"Error configuring KDB: {e}")
|
|
36
|
+
|
|
37
|
+
def create_vector_store_hsnw_index(self):
|
|
38
|
+
try:
|
|
39
|
+
self.embeddings_manager.create_index()
|
|
40
|
+
except Exception as e:
|
|
41
|
+
logger.error(f"Error creating vector store index: {e}")
|
|
42
|
+
raise Exception(f"Error creating vector store index: {e}")
|
|
43
|
+
|
|
44
|
+
def search(self, query: str) -> list[Document]:
|
|
45
|
+
try:
|
|
46
|
+
records = []
|
|
47
|
+
records = self.embeddings_manager.search_records(query)
|
|
48
|
+
print(records)
|
|
49
|
+
return records
|
|
50
|
+
except Exception as e:
|
|
51
|
+
logger.error(f"Error indexing documents: {e}")
|
|
52
|
+
raise Exception(f"Error indexing documents: {e}")
|
|
53
|
+
|
|
54
|
+
def index_documents_in_vector_store(self, documents: list[Document]) -> None:
|
|
55
|
+
try:
|
|
56
|
+
self.embeddings_manager.index_documents(documents)
|
|
57
|
+
except Exception as e:
|
|
58
|
+
logger.error(f"Error indexing documents: {e}")
|
|
59
|
+
raise Exception(f"Error indexing documents: {e}")
|
|
@@ -2,9 +2,12 @@ from enum import Enum
|
|
|
2
2
|
from typing import Literal
|
|
3
3
|
|
|
4
4
|
|
|
5
|
-
class KdbServices(Enum):
|
|
5
|
+
class KdbServices(str, Enum):
|
|
6
6
|
REDIS = "redis"
|
|
7
7
|
CHROMA = "chroma"
|
|
8
|
+
PG = "pg"
|
|
8
9
|
|
|
9
10
|
|
|
10
|
-
kdb_services = Literal[
|
|
11
|
+
kdb_services = Literal[
|
|
12
|
+
KdbServices.REDIS.value, KdbServices.CHROMA.value, KdbServices.PG.value
|
|
13
|
+
]
|
|
@@ -46,26 +46,29 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
|
|
|
46
46
|
logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
|
|
47
47
|
raise
|
|
48
48
|
|
|
49
|
-
def configure_vector_store(
|
|
49
|
+
async def configure_vector_store(
|
|
50
50
|
self,
|
|
51
51
|
table_name: str = "",
|
|
52
52
|
vector_size: int = 768,
|
|
53
53
|
content_column: str = "document",
|
|
54
54
|
id_column: str = "id",
|
|
55
|
+
metadata_json_column: str = "cmetadata",
|
|
56
|
+
pg_record_manager: str = "postgres/langchain_pg_collection",
|
|
55
57
|
):
|
|
56
58
|
"""Configure the vector store."""
|
|
57
59
|
pass
|
|
58
60
|
|
|
59
|
-
def init_vector_store(
|
|
61
|
+
async def init_vector_store(
|
|
60
62
|
self,
|
|
61
63
|
table_name: str = "",
|
|
62
64
|
content_column: str = "document",
|
|
65
|
+
metadata_json_column: str = "cmetadata",
|
|
63
66
|
id_column: str = "id",
|
|
64
67
|
):
|
|
65
68
|
"""Initialize the vector store."""
|
|
66
69
|
pass
|
|
67
70
|
|
|
68
|
-
def index_documents(self, documents: list[Document]):
|
|
71
|
+
async def index_documents(self, documents: list[Document]):
|
|
69
72
|
"""
|
|
70
73
|
Add documents to the vector store with their embeddings.
|
|
71
74
|
|
|
@@ -85,7 +88,7 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
|
|
|
85
88
|
"""
|
|
86
89
|
try:
|
|
87
90
|
logger.info(f"Indexing {len(documents)} documents in vector store")
|
|
88
|
-
self.chroma.
|
|
91
|
+
await self.chroma.aadd_documents(documents)
|
|
89
92
|
except Exception as e:
|
|
90
93
|
logger.error(f"Error indexing documents: {str(e)}")
|
|
91
94
|
raise
|
|
@@ -110,12 +113,14 @@ class ChromaEmbeddingsManager(EmbeddingsManager):
|
|
|
110
113
|
logger.error(f"Error deleting documents by ID: {str(e)}")
|
|
111
114
|
raise
|
|
112
115
|
|
|
113
|
-
def delete_documents_by_metadata_key(
|
|
116
|
+
async def delete_documents_by_metadata_key(
|
|
117
|
+
self, metadata_key: str, metadata_value: str
|
|
118
|
+
):
|
|
114
119
|
"""
|
|
115
120
|
Delete documents by filter from the vector store.
|
|
116
121
|
"""
|
|
117
122
|
try:
|
|
118
|
-
self.chroma.
|
|
123
|
+
await self.chroma.adelete(where={metadata_key: metadata_value})
|
|
119
124
|
except Exception as error:
|
|
120
125
|
logger.error(
|
|
121
126
|
f"Error deleting documents by filter: {str(filter)}, error: {error} "
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
from langchain_core.documents import Document
|
|
2
|
-
from langchain.indexes import index, SQLRecordManager
|
|
3
|
-
from typing import List
|
|
4
1
|
import logging
|
|
5
|
-
from langchain_postgres import PGVectorStore, PGEngine
|
|
6
|
-
from sqlalchemy import create_engine
|
|
7
|
-
from dotenv import load_dotenv
|
|
8
|
-
from wizit_context_ingestor.application.interfaces import EmbeddingsManager
|
|
9
2
|
|
|
10
|
-
|
|
3
|
+
from langchain.indexes import IndexingResult, SQLRecordManager, aindex, index
|
|
4
|
+
from langchain_core.documents import Document
|
|
5
|
+
from langchain_postgres import PGEngine, PGVectorStore
|
|
6
|
+
from langchain_postgres.v2.indexes import HNSWIndex
|
|
7
|
+
from sqlalchemy.ext.asyncio import create_async_engine
|
|
8
|
+
from typing_extensions import Literal
|
|
9
|
+
|
|
10
|
+
from wizit_context_ingestor.application.interfaces import EmbeddingsManager
|
|
11
11
|
|
|
12
12
|
logger = logging.getLogger(__name__)
|
|
13
13
|
|
|
@@ -42,7 +42,17 @@ class PgEmbeddingsManager(EmbeddingsManager):
|
|
|
42
42
|
|
|
43
43
|
__slots__ = ("embeddings_model", "pg_connection")
|
|
44
44
|
|
|
45
|
-
def __init__(
|
|
45
|
+
def __init__(
|
|
46
|
+
self,
|
|
47
|
+
embeddings_model,
|
|
48
|
+
pg_connection: str,
|
|
49
|
+
embeddings_vectors_table_name: str = "langchain_pg_embedding",
|
|
50
|
+
vector_size: int = 768,
|
|
51
|
+
content_column: str = "document",
|
|
52
|
+
id_column: str = "id",
|
|
53
|
+
metadata_json_column: str = "cmetadata",
|
|
54
|
+
records_manager_table_name: str = "langchain_record_manager",
|
|
55
|
+
):
|
|
46
56
|
"""
|
|
47
57
|
Initialize the PgEmbeddingsManager.
|
|
48
58
|
|
|
@@ -57,155 +67,135 @@ class PgEmbeddingsManager(EmbeddingsManager):
|
|
|
57
67
|
"""
|
|
58
68
|
self.pg_connection = pg_connection
|
|
59
69
|
self.embeddings_model = embeddings_model
|
|
60
|
-
self.pg_engine = None
|
|
61
70
|
self.vector_store = None
|
|
62
71
|
self.record_manager = None
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
72
|
+
self.pg_engine = PGEngine.from_connection_string(pg_connection)
|
|
73
|
+
self.embeddings_vectors_table_name = embeddings_vectors_table_name
|
|
74
|
+
self.vector_size = vector_size
|
|
75
|
+
self.content_column = content_column
|
|
76
|
+
self.id_column = id_column
|
|
77
|
+
self.metadata_json_column = metadata_json_column
|
|
78
|
+
self.records_manager_table_name = records_manager_table_name
|
|
79
|
+
# self.async_engine = create_async_engine(pg_connection)
|
|
80
|
+
# self.pg_engine = PGEngine.from_engine(
|
|
81
|
+
# self.async_engine
|
|
82
|
+
# )
|
|
83
|
+
logger.info("PgEmbeddingsManager initialized")
|
|
69
84
|
|
|
70
85
|
def configure_vector_store(
|
|
71
86
|
self,
|
|
72
|
-
table_name: str = "langchain_pg_embedding",
|
|
73
|
-
vector_size: int = 768,
|
|
74
|
-
content_column: str = "document",
|
|
75
|
-
id_column: str = "id",
|
|
76
|
-
metadata_json_column: str = "cmetadata",
|
|
77
|
-
pg_record_manager: str = "postgres/langchain_pg_collection",
|
|
78
87
|
):
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
88
|
+
try:
|
|
89
|
+
self.pg_engine.init_vectorstore_table(
|
|
90
|
+
table_name=self.embeddings_vectors_table_name,
|
|
91
|
+
vector_size=self.vector_size,
|
|
92
|
+
content_column=self.content_column,
|
|
93
|
+
id_column=self.id_column,
|
|
94
|
+
metadata_json_column=self.metadata_json_column,
|
|
95
|
+
)
|
|
96
|
+
record_manager = SQLRecordManager(
|
|
97
|
+
self.records_manager_table_name,
|
|
98
|
+
db_url=self.pg_connection,
|
|
99
|
+
async_mode=False,
|
|
100
|
+
)
|
|
101
|
+
record_manager.create_schema()
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error(f"Error configure_vector_store: {e}")
|
|
104
|
+
raise
|
|
105
|
+
|
|
106
|
+
def retrieve_vector_store(
|
|
93
107
|
self,
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def
|
|
108
|
+
) -> tuple[PGVectorStore, SQLRecordManager]:
|
|
109
|
+
try:
|
|
110
|
+
self.vector_store = PGVectorStore.create_sync(
|
|
111
|
+
embedding_service=self.embeddings_model,
|
|
112
|
+
engine=self.pg_engine,
|
|
113
|
+
table_name=self.embeddings_vectors_table_name,
|
|
114
|
+
content_column=self.content_column,
|
|
115
|
+
metadata_json_column=self.metadata_json_column,
|
|
116
|
+
id_column=self.id_column,
|
|
117
|
+
)
|
|
118
|
+
self.record_manager = SQLRecordManager(
|
|
119
|
+
self.records_manager_table_name, db_url=self.pg_connection
|
|
120
|
+
)
|
|
121
|
+
return (self.vector_store, self.record_manager)
|
|
122
|
+
except Exception as e:
|
|
123
|
+
logger.error(f"Error retrieve vector store: ", e)
|
|
124
|
+
raise e
|
|
125
|
+
|
|
126
|
+
def check_vector_store_init(func):
|
|
113
127
|
"""validate vector store initialization"""
|
|
114
128
|
|
|
115
129
|
def wrapper(self, *args, **kwargs):
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
raise Exception("Vector store not initialized")
|
|
119
|
-
if self.record_manager is None:
|
|
120
|
-
raise Exception("Record manager not initialized")
|
|
130
|
+
if self.vector_store is None or self.record_manager is None:
|
|
131
|
+
self.retrieve_vector_store()
|
|
121
132
|
return func(self, *args, **kwargs)
|
|
122
133
|
|
|
123
134
|
return wrapper
|
|
124
135
|
|
|
125
|
-
@
|
|
126
|
-
def
|
|
136
|
+
@check_vector_store_init
|
|
137
|
+
def create_index(self):
|
|
138
|
+
try:
|
|
139
|
+
if self.vector_size < 2000:
|
|
140
|
+
index = HNSWIndex()
|
|
141
|
+
self.vector_store.apply_vector_index(index)
|
|
142
|
+
else:
|
|
143
|
+
raise NotImplementedError(
|
|
144
|
+
"Indexing for vector size > 2000 is not supported"
|
|
145
|
+
)
|
|
146
|
+
except Exception as e:
|
|
147
|
+
logger.info(f"Error creating index: {e}")
|
|
148
|
+
raise e
|
|
149
|
+
|
|
150
|
+
@check_vector_store_init
|
|
151
|
+
def index_documents(
|
|
152
|
+
self,
|
|
153
|
+
docs: list[Document],
|
|
154
|
+
cleanup: Literal["incremental", "full", "scoped_full"] | None = "incremental",
|
|
155
|
+
source_id_key: str = "source",
|
|
156
|
+
) -> IndexingResult:
|
|
127
157
|
"""
|
|
128
|
-
|
|
158
|
+
Index documents in the vector store with their embeddings.
|
|
129
159
|
|
|
130
|
-
This method takes a list of Document objects
|
|
131
|
-
|
|
132
|
-
embeddings in the PostgreSQL database.
|
|
160
|
+
This method takes a list of Document objects and indexes them using LangChain's
|
|
161
|
+
aindex function with incremental cleanup. The documents are processed through
|
|
162
|
+
the embeddings model and stored in the PostgreSQL database with pgvector.
|
|
133
163
|
|
|
134
164
|
Args:
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
165
|
+
vector_store: The PGVectorStore instance to use for storage
|
|
166
|
+
record_manager: The SQLRecordManager instance for tracking indexed documents
|
|
167
|
+
docs: A list of LangChain Document objects to index in the vector store.
|
|
168
|
+
Each Document should have page_content and metadata attributes.
|
|
169
|
+
|
|
138
170
|
Returns:
|
|
139
|
-
|
|
171
|
+
IndexingResult: Result object containing information about the indexing operation
|
|
140
172
|
|
|
141
173
|
Raises:
|
|
142
|
-
|
|
174
|
+
Exception: If there's an error during the document indexing process
|
|
143
175
|
"""
|
|
144
176
|
try:
|
|
145
177
|
logger.info(f"Indexing {len(docs)} documents in vector store")
|
|
178
|
+
# await self.vector_store.aadd_documents(docs)
|
|
146
179
|
return index(
|
|
147
180
|
docs,
|
|
148
181
|
self.record_manager,
|
|
149
182
|
self.vector_store,
|
|
150
|
-
cleanup=
|
|
151
|
-
source_id_key=
|
|
183
|
+
cleanup=cleanup,
|
|
184
|
+
source_id_key=source_id_key,
|
|
152
185
|
)
|
|
153
186
|
except Exception as e:
|
|
154
187
|
logger.error(f"Error indexing documents: {str(e)}")
|
|
155
|
-
raise
|
|
188
|
+
raise e
|
|
156
189
|
|
|
157
|
-
@
|
|
158
|
-
def
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
try:
|
|
163
|
-
return self.record_manager.list_keys(group_ids=[source_id])
|
|
164
|
-
except Exception as e:
|
|
165
|
-
logger.error(f"Error getting documents keys by source ID: {str(e)}")
|
|
166
|
-
raise
|
|
167
|
-
|
|
168
|
-
@vector_store_initialized
|
|
169
|
-
def delete_documents_by_source_id(self, source_id: str):
|
|
170
|
-
"""
|
|
171
|
-
Delete documents by source ID from the vector store.
|
|
172
|
-
"""
|
|
190
|
+
@check_vector_store_init
|
|
191
|
+
def search_records(
|
|
192
|
+
self,
|
|
193
|
+
query: str,
|
|
194
|
+
) -> list[Document]:
|
|
173
195
|
try:
|
|
174
|
-
|
|
175
|
-
self.
|
|
176
|
-
|
|
196
|
+
logger.info(f"Searching for '{query}' in vector store")
|
|
197
|
+
reply = self.vector_store.search(query=query, search_type="similarity", k=1)
|
|
198
|
+
return reply
|
|
177
199
|
except Exception as e:
|
|
178
|
-
logger.error(f"Error
|
|
179
|
-
raise
|
|
180
|
-
|
|
181
|
-
# def get_retriever(self, search_type: str = "mmr", k: int = 20):
|
|
182
|
-
# """
|
|
183
|
-
# Get a retriever interface to the vector store for semantic search.
|
|
184
|
-
|
|
185
|
-
# This method returns a LangChain retriever object that can be used in retrieval
|
|
186
|
-
# pipelines, retrieval-augmented generation, and other LangChain chains.
|
|
187
|
-
|
|
188
|
-
# Args:
|
|
189
|
-
# search_type: The search algorithm to use. Options include:
|
|
190
|
-
# - "similarity" (standard cosine similarity)
|
|
191
|
-
# - "mmr" (Maximum Marginal Relevance, balances relevance with diversity)
|
|
192
|
-
# - "similarity_score_threshold" (filters by minimum similarity)
|
|
193
|
-
# k: The number of documents to retrieve (default: 20)
|
|
194
|
-
|
|
195
|
-
# Returns:
|
|
196
|
-
# Retriever: A LangChain Retriever object that can be used in chains and pipelines
|
|
197
|
-
|
|
198
|
-
# Raises:
|
|
199
|
-
# Exception: If there's an error creating the retriever
|
|
200
|
-
|
|
201
|
-
# Example:
|
|
202
|
-
# >>> retriever = pg_manager.get_retriever(search_type="mmr", k=5)
|
|
203
|
-
# >>> docs = retriever.get_relevant_documents("quantum computing")
|
|
204
|
-
# """
|
|
205
|
-
# try:
|
|
206
|
-
# return self.vector_store.as_retriever(
|
|
207
|
-
# search_type=search_type, search_kwargs={"k": k}
|
|
208
|
-
# )
|
|
209
|
-
# except Exception as e:
|
|
210
|
-
# logger.info(f"failed to get vector store as retriever {str(e)}")
|
|
211
|
-
# raise
|
|
200
|
+
logger.error(f"Error indexing documents: {str(e)}")
|
|
201
|
+
raise e
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
|
|
4
4
|
# https://python.langchain.com/docs/how_to/embed_text/
|
|
5
5
|
import logging
|
|
6
|
+
import uuid
|
|
6
7
|
from typing import List, Any
|
|
7
8
|
from langchain_core.documents import Document
|
|
8
9
|
from langchain_experimental.text_splitter import SemanticChunker
|
|
@@ -58,7 +59,7 @@ class SemanticChunks(RagChunker):
|
|
|
58
59
|
source = document.metadata["source"]
|
|
59
60
|
for i, chunk in enumerate(chunks):
|
|
60
61
|
if document.metadata["source"]:
|
|
61
|
-
chunk.id = f"{
|
|
62
|
+
chunk.id = f"{uuid.uuid4()}"
|
|
62
63
|
logger.info(f"{len(chunks)} chunks generated successfully")
|
|
63
64
|
return chunks
|
|
64
65
|
except Exception as e:
|
wizit_context_ingestor/main.py
CHANGED
|
@@ -11,7 +11,7 @@ from .infra.rag.chroma_embeddings import ChromaEmbeddingsManager
|
|
|
11
11
|
from .infra.secrets.aws_secrets_manager import AwsSecretsManager
|
|
12
12
|
from .data.storage import storage_services, StorageServices
|
|
13
13
|
from .data.kdb import kdb_services, KdbServices
|
|
14
|
-
from .utils.file_utils import
|
|
14
|
+
from .utils.file_utils import validate_file_name_format
|
|
15
15
|
from langsmith import Client, tracing_context
|
|
16
16
|
|
|
17
17
|
|
|
@@ -143,7 +143,7 @@ class TranscriptionManager:
|
|
|
143
143
|
Exception: If an error occurs during the transcription process.
|
|
144
144
|
"""
|
|
145
145
|
try:
|
|
146
|
-
if
|
|
146
|
+
if not validate_file_name_format(file_key):
|
|
147
147
|
raise ValueError(
|
|
148
148
|
"Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
|
|
149
149
|
)
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from logging import getLogger
|
|
3
|
+
from typing import Any, Dict, Literal
|
|
4
|
+
|
|
5
|
+
from langchain_core.documents import Document
|
|
6
|
+
from langsmith import Client, tracing_context
|
|
7
|
+
|
|
8
|
+
from .application.context_chunk_service import ContextChunksInDocumentService
|
|
9
|
+
from .application.kdb_service import KdbService
|
|
10
|
+
from .data.storage import StorageServices
|
|
11
|
+
from .infra.persistence.local_storage import LocalStorageService
|
|
12
|
+
from .infra.persistence.s3_storage import S3StorageService
|
|
13
|
+
from .infra.rag.pg_embeddings import PgEmbeddingsManager
|
|
14
|
+
from .infra.rag.semantic_chunks import SemanticChunks
|
|
15
|
+
from .infra.secrets.aws_secrets_manager import AwsSecretsManager
|
|
16
|
+
from .infra.vertex_model import VertexModels
|
|
17
|
+
from .utils.file_utils import validate_file_name_format
|
|
18
|
+
|
|
19
|
+
logger = getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class KdbManager:
|
|
23
|
+
def __init__(
|
|
24
|
+
self, embeddings_model, kdb_service: Literal["pg"], kdb_params: Dict[Any, Any]
|
|
25
|
+
):
|
|
26
|
+
self.kdb_service = kdb_service
|
|
27
|
+
self.kdb_params = kdb_params
|
|
28
|
+
self.embeddings_model = embeddings_model
|
|
29
|
+
|
|
30
|
+
def retrieve_kdb_service(self):
|
|
31
|
+
return PgEmbeddingsManager(self.embeddings_model, **self.kdb_params)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PersistenceManager:
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
storage_service: Literal["s3", "local"],
|
|
38
|
+
source_storage_route,
|
|
39
|
+
target_storage_route,
|
|
40
|
+
):
|
|
41
|
+
self.storage_service = storage_service
|
|
42
|
+
self.source_storage_route = source_storage_route
|
|
43
|
+
self.target_storage_route = target_storage_route
|
|
44
|
+
|
|
45
|
+
def retrieve_storage_service(self):
|
|
46
|
+
if self.storage_service == StorageServices.S3.value:
|
|
47
|
+
return S3StorageService(
|
|
48
|
+
origin_bucket_name=self.source_storage_route,
|
|
49
|
+
target_bucket_name=self.target_storage_route,
|
|
50
|
+
)
|
|
51
|
+
elif self.storage_service == StorageServices.LOCAL.value:
|
|
52
|
+
return LocalStorageService(
|
|
53
|
+
source_storage_route=self.source_storage_route,
|
|
54
|
+
target_storage_route=self.target_storage_route,
|
|
55
|
+
)
|
|
56
|
+
else:
|
|
57
|
+
raise ValueError(f"Unsupported storage service: {self.storage_service}")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ChunksManager:
    """Entry point for generating context chunks and managing the vector store.

    On construction the manager fetches the GCP service-account secret through
    ``AwsSecretsManager``, builds the Vertex model wrapper and its embeddings
    model, creates a LangSmith client, and wires a ``KdbService`` on top of the
    knowledge-database backend returned by ``KdbManager``.
    """

    def __init__(
        self,
        gcp_project_id: str,
        gcp_project_location: str,
        gcp_secret_name: str,
        langsmith_api_key: str,
        langsmith_project_name: str,
        storage_service: Literal["s3", "local"],
        kdb_service: Literal["pg"],
        kdb_params: Dict[Any, Any],
        llm_model_id: str = "claude-3-5-haiku@20241022",
        embeddings_model_id: str = "text-multilingual-embedding-002",
        target_language: str = "es",
    ):
        """Build every collaborator needed to chunk and index documents.

        Args:
            gcp_project_id: GCP project passed to ``VertexModels``.
            gcp_project_location: GCP location passed to ``VertexModels``.
            gcp_secret_name: Name of the secret holding the GCP service
                account JSON; it is read through ``AwsSecretsManager``.
            langsmith_api_key: API key used to create the LangSmith client.
            langsmith_project_name: Project name used by the tracing context.
            storage_service: Storage backend name handed to
                ``PersistenceManager``.
            kdb_service: Knowledge-database backend name handed to
                ``KdbManager``.
            kdb_params: Backend parameters handed to ``KdbManager``.
            llm_model_id: LLM identifier passed to ``VertexModels``.
            embeddings_model_id: Identifier of the embeddings model to load.
            target_language: Language passed to the chunking service.
        """
        self.gcp_project_id = gcp_project_id
        self.gcp_project_location = gcp_project_location
        self.aws_secrets_manager = AwsSecretsManager()
        self.gcp_secret_name = gcp_secret_name
        self.llm_model_id = llm_model_id
        self.target_language = target_language
        self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
        self.storage_service = storage_service
        self.kdb_params = kdb_params
        # Keep the backend *name* under its own attribute: ``self.kdb_service``
        # is reserved for the KdbService instance created below, which used to
        # silently overwrite the name and leave the argument unused.
        self.kdb_service_name = kdb_service
        self.vertex_model = self._get_vertex_model()
        self.embeddings_model = self.vertex_model.load_embeddings_model(
            embeddings_model_id
        )
        self.langsmith_api_key = langsmith_api_key
        self.langsmith_project_name = langsmith_project_name
        self.langsmith_client = Client(api_key=self.langsmith_api_key)
        self.kdb_manager = KdbManager(
            self.embeddings_model, self.kdb_service_name, self.kdb_params
        )
        self.pg_embeddings_manager = self.kdb_manager.retrieve_kdb_service()
        self.rag_chunker = SemanticChunks(self.embeddings_model)
        self.kdb_service = KdbService(
            self.pg_embeddings_manager,
        )

    def _get_gcp_sa_dict(self, gcp_secret_name: str):
        """Fetch the named secret and return it parsed from JSON."""
        vertex_gcp_sa = self.aws_secrets_manager.get_secret(gcp_secret_name)
        return json.loads(vertex_gcp_sa)

    def _get_vertex_model(self):
        """Create the ``VertexModels`` wrapper from the stored GCP settings."""
        return VertexModels(
            self.gcp_project_id,
            self.gcp_project_location,
            self.gcp_sa_dict,
            llm_model_id=self.llm_model_id,
        )

    def provision_vector_store(self):
        """Configure the knowledge database and create its vector index.

        Failures are logged and not re-raised, so callers are never
        interrupted by a provisioning error.
        """
        try:
            self.kdb_service.configure_kdb()
            self.kdb_service.create_vector_store_hsnw_index()
        except Exception as e:
            # logger.exception keeps the same message and adds the traceback.
            logger.exception(f"Error configuring vector store: {e}")

    def index_documents_in_vector_store(self, docs: list[Document]):
        """Index ``docs`` through the KDB service; failures are only logged."""
        try:
            self.kdb_service.index_documents_in_vector_store(docs)
        except Exception as e:
            logger.exception(f"Error indexing documents in vector store: {e}")

    def search_records(self, query):
        """Return whatever the KDB service's ``search`` returns for ``query``."""
        return self.kdb_service.search(query)

    def tracing(func):
        """Decorate a coroutine method so it runs inside a LangSmith tracing context.

        This is used as a plain decorator while the class body is being
        evaluated; the wrapped method must be ``async`` and receives ``self``.
        """
        # Local import keeps the block self-contained.
        from functools import wraps

        @wraps(func)  # preserve the wrapped method's name and docstring
        async def gen_tracing_context(self, *args, **kwargs):
            with tracing_context(
                enabled=True,
                project_name=self.langsmith_project_name,
                client=self.langsmith_client,
            ):
                return await func(self, *args, **kwargs)

        return gen_tracing_context

    @tracing
    async def gen_context_chunks(
        self, file_key: str, source_storage_route: str, target_storage_route: str
    ):
        """Generate the context chunks for one stored document.

        Args:
            file_key: Name/key of the file; validated with
                ``validate_file_name_format`` before any work is done.
            source_storage_route: Source route handed to ``PersistenceManager``.
            target_storage_route: Target route handed to ``PersistenceManager``
                and used to look up file tags when tagging is supported.

        Returns:
            The value returned by
            ``ContextChunksInDocumentService.get_context_chunks_in_document``.

        Raises:
            Exception: Any error raised while validating or processing is
                logged and then re-raised unchanged.
        """
        try:
            validate_file_name_format(file_key)
            persistence_layer = PersistenceManager(
                self.storage_service, source_storage_route, target_storage_route
            )
            persistence_service = persistence_layer.retrieve_storage_service()
            # Tags are only fetched for backends that declare tagging support.
            target_bucket_file_tags = {}
            if persistence_service.supports_tagging:
                target_bucket_file_tags = persistence_service.retrieve_file_tags(
                    file_key, target_storage_route
                )
            rag_chunker = SemanticChunks(self.embeddings_model)
            kdb_manager = KdbManager(
                self.embeddings_model, self.kdb_service_name, self.kdb_params
            )
            kdb_service = kdb_manager.retrieve_kdb_service()
            context_chunks_in_document_service = ContextChunksInDocumentService(
                ai_application_service=self.vertex_model,
                persistence_service=persistence_service,
                rag_chunker=rag_chunker,
                embeddings_manager=kdb_service,
                target_language=self.target_language,
            )
            context_chunks = (
                await context_chunks_in_document_service.get_context_chunks_in_document(
                    file_key, target_bucket_file_tags
                )
            )
            return context_chunks
        except Exception as e:
            # Report through the module logger like the other methods do.
            logger.exception(f"Error getting context chunks in document: {e}")
            # Bare raise re-raises the active exception as-is.
            raise
|
|
@@ -1,13 +1,12 @@
|
|
|
1
1
|
import re
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
def
|
|
4
|
+
# A valid file name is made only of ASCII letters, digits, underscore, dot,
# dash, slash or backslash, and must contain at least one character.
_VALID_FILE_NAME_PATTERN = re.compile(r"[a-zA-Z0-9_.\-/\\]+")


def validate_file_name_format(file_name):
    """Validate that a file name contains only allowed characters.

    Allowed characters are ASCII letters, digits, underscore, dot, dash,
    slash and backslash. Spaces and any other character are rejected, and so
    is an empty name.

    Args:
        file_name: File name or storage key to validate.

    Returns:
        True when the name is valid.

    Raises:
        ValueError: If the name is empty or contains a disallowed character.
    """
    # fullmatch with "+" rejects the empty string, which the previous
    # "no forbidden character found" search accepted as valid.
    if _VALID_FILE_NAME_PATTERN.fullmatch(file_name) is None:
        raise ValueError(
            "Invalid file name format, do not provide special characters or spaces (instead use underscores or hyphens)"
        )
    return True
|
{wizit_context_ingestor-0.3.0b8.dist-info → wizit_context_ingestor-0.4.0.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: wizit-context-ingestor
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Contextual Rag with Cloud Solutions
|
|
5
5
|
Requires-Dist: anthropic[vertex]>=0.66.0
|
|
6
6
|
Requires-Dist: boto3>=1.40.23
|
|
@@ -8,10 +8,13 @@ Requires-Dist: langchain-aws>=0.2.31
|
|
|
8
8
|
Requires-Dist: langchain-chroma>=0.2.6
|
|
9
9
|
Requires-Dist: langchain-experimental>=0.3.4
|
|
10
10
|
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
11
|
+
Requires-Dist: langchain-postgres>=0.0.16
|
|
11
12
|
Requires-Dist: langchain-redis>=0.2.3
|
|
12
13
|
Requires-Dist: langgraph>=0.6.8
|
|
13
14
|
Requires-Dist: pillow>=11.3.0
|
|
15
|
+
Requires-Dist: psycopg2-binary>=2.9.11
|
|
14
16
|
Requires-Dist: pymupdf>=1.26.4
|
|
17
|
+
Requires-Dist: sqlalchemy[asyncio]>=2.0.43
|
|
15
18
|
Requires-Python: >=3.12
|
|
16
19
|
Description-Content-Type: text/markdown
|
|
17
20
|
|
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
wizit_context_ingestor/.DS_Store,sha256=c7hZ0C8v2hxprMlCgmvxXDl92phew3iSATJzE1yYTBs,6148
|
|
2
|
-
wizit_context_ingestor/__init__.py,sha256=
|
|
2
|
+
wizit_context_ingestor/__init__.py,sha256=aFOBTwoH2ZH4KvCzzdfQ0PH1yBV3aLdXXye-VtuXLRo,131
|
|
3
3
|
wizit_context_ingestor/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
wizit_context_ingestor/application/context_chunk_service.py,sha256
|
|
5
|
-
wizit_context_ingestor/application/interfaces.py,sha256=
|
|
4
|
+
wizit_context_ingestor/application/context_chunk_service.py,sha256=-efSbpLJS5968Qh6Ho_c6et4-g_L0gPgR-Z9URO_cS0,5504
|
|
5
|
+
wizit_context_ingestor/application/interfaces.py,sha256=oG-a8JbGw-YT8Xtt0SDJ72FZwDNogyBzP93k_tBIC6k,3528
|
|
6
|
+
wizit_context_ingestor/application/kdb_service.py,sha256=CYQ-2aLvUZHCgCwmuMeZsANhLbm54gtfpiKsg1a81qI,1757
|
|
6
7
|
wizit_context_ingestor/application/transcription_service.py,sha256=FlUcMGyAotAO8MmT5UMlPMbgIWVQLg7YO6rJx9ANn7A,8567
|
|
7
8
|
wizit_context_ingestor/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
wizit_context_ingestor/data/kdb.py,sha256=
|
|
9
|
+
wizit_context_ingestor/data/kdb.py,sha256=lJ4F0ltVhtqC80QqHsm49a2FnSEI_fep61_Vfd65x34,241
|
|
9
10
|
wizit_context_ingestor/data/prompts.py,sha256=bzgLdjINtXGQVTy4ZZktdcNItbtDQpM7maAQ2UBGdnY,14187
|
|
10
|
-
wizit_context_ingestor/data/storage.py,sha256=
|
|
11
|
+
wizit_context_ingestor/data/storage.py,sha256=e29IH47MfN0qAuzrcZ-VlHw1sk7dT5QqYk6wO273nCk,203
|
|
11
12
|
wizit_context_ingestor/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
13
|
wizit_context_ingestor/domain/models.py,sha256=DV83PArMyh-VoUqnVF_ohcgStsk549ixdYw98B8o2GI,381
|
|
13
14
|
wizit_context_ingestor/domain/services.py,sha256=dg8UvYSjYsOMphrciZyGvuRriM8Qf08SstvO979XrFc,3344
|
|
@@ -16,20 +17,21 @@ wizit_context_ingestor/infra/aws_model.py,sha256=glIaewSdv6PDBXoCe6QgCUIzLCjtM7K
|
|
|
16
17
|
wizit_context_ingestor/infra/persistence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
18
|
wizit_context_ingestor/infra/persistence/local_storage.py,sha256=GtPUvtn8XlgcqwjWmSm2998sgyYlwkF22HoB40ri7c0,2029
|
|
18
19
|
wizit_context_ingestor/infra/persistence/s3_storage.py,sha256=bzlQteLPPGS_Gbh39RkxyoK8G-CEOQewMNPuzPule9k,4906
|
|
19
|
-
wizit_context_ingestor/infra/rag/chroma_embeddings.py,sha256=
|
|
20
|
-
wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=
|
|
20
|
+
wizit_context_ingestor/infra/rag/chroma_embeddings.py,sha256=8VSXw38TX8nfGyJ5SXtbyqDVt3XI6gtEoie0g_v5Ax8,4517
|
|
21
|
+
wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=f4fq75uot4JPaz8j7fmKxGWGhaLs5WcCtfLWBLvMA_M,7754
|
|
21
22
|
wizit_context_ingestor/infra/rag/redis_embeddings.py,sha256=pCP_I1RLeIUTYMSHkZT6AjIOyHA9A47wyffrZBjiG0s,5107
|
|
22
|
-
wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=
|
|
23
|
+
wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=Peylky5rsWERtu5psmet4SL_VYpnK5hsEIyE0Ua1wGE,2463
|
|
23
24
|
wizit_context_ingestor/infra/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
25
|
wizit_context_ingestor/infra/secrets/aws_secrets_manager.py,sha256=vukil5sO9tQPTM74wUbyQqR8Z-z0ElyjeF2ns7rbVbQ,1249
|
|
25
26
|
wizit_context_ingestor/infra/vertex_model.py,sha256=6L2C4qH7PSVjdOSzIEZlFtUwu1pgQVXtQBIU5isn644,7582
|
|
26
|
-
wizit_context_ingestor/main.py,sha256=
|
|
27
|
+
wizit_context_ingestor/main.py,sha256=2gJ7B39aS2U742cJQWjVGHbQ-NslYFdGQXzljGd83WA,11685
|
|
28
|
+
wizit_context_ingestor/main_chunks.py,sha256=07zT76VB_RzCXV3toJgTQBPCmiVud860vVzVkMLwDuA,6784
|
|
27
29
|
wizit_context_ingestor/services/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
|
|
28
30
|
wizit_context_ingestor/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
31
|
wizit_context_ingestor/services/chunks.py,sha256=tQQsdsOscZWzqVY5WxVxr3ii62FOJ3nMARaJJz6CvjQ,2011
|
|
30
32
|
wizit_context_ingestor/services/parse_doc.py,sha256=3CyZoGbiUfxbs0SXUWXjQevtusSzTBgvUVeNNSdxJLE,4491
|
|
31
33
|
wizit_context_ingestor/services/pg_embeddings_manager.py,sha256=n1HOmu_Z_Z71H-rVAyJS3FdPKbBckm5W8_XethY8nuM,4998
|
|
32
|
-
wizit_context_ingestor/utils/file_utils.py,sha256=
|
|
34
|
+
wizit_context_ingestor/utils/file_utils.py,sha256=BmNN71exw75IKWvdDvyvo4QlGpl5W-nTCgimDnmD4Eo,501
|
|
33
35
|
wizit_context_ingestor/workflows/context_nodes.py,sha256=3qlFcxPUmehx04mQHpmouneKq--To8rwSDHCRFyWICo,3168
|
|
34
36
|
wizit_context_ingestor/workflows/context_state.py,sha256=4MTIUjK-F2pWvIldovWZhMAqqCOpViKbvitJzETkSkY,324
|
|
35
37
|
wizit_context_ingestor/workflows/context_tools.py,sha256=E9VTL3AC0MwSIuc1e-juZK7XCxnZfFv0-KpHfR2CNH4,2764
|
|
@@ -39,6 +41,6 @@ wizit_context_ingestor/workflows/transcription_schemas.py,sha256=CQCl7LXD5voxhJO
|
|
|
39
41
|
wizit_context_ingestor/workflows/transcription_state.py,sha256=2Z_t2aZFEH_nAjdEO6RFBEmi_fwvr9cV0aLS1eIxiCQ,590
|
|
40
42
|
wizit_context_ingestor/workflows/transcription_tools.py,sha256=FtIfWFITn8_Rr5SEobCeR55aJGZoHRMgF2UxRT5vJ-E,1373
|
|
41
43
|
wizit_context_ingestor/workflows/transcription_workflow.py,sha256=77cLsYGdv01Py2GaKYpACuifPeSxH7tkVodvLv97sdg,1621
|
|
42
|
-
wizit_context_ingestor-0.
|
|
43
|
-
wizit_context_ingestor-0.
|
|
44
|
-
wizit_context_ingestor-0.
|
|
44
|
+
wizit_context_ingestor-0.4.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
45
|
+
wizit_context_ingestor-0.4.0.dist-info/METADATA,sha256=hZxwWZSUb5hQXEoWZ5ulaI0fmPZYSyDmgOWHxizp6oQ,3890
|
|
46
|
+
wizit_context_ingestor-0.4.0.dist-info/RECORD,,
|
|
File without changes
|