wizit-context-ingestor 0.2.5b2__py3-none-any.whl → 0.2.5b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wizit-context-ingestor might be problematic. Click here for more details.
- wizit_context_ingestor/application/interfaces.py +1 -1
- wizit_context_ingestor/application/transcription_service.py +4 -1
- wizit_context_ingestor/data/prompts.py +7 -0
- wizit_context_ingestor/infra/rag/chroma_embeddings.py +132 -0
- wizit_context_ingestor/main.py +5 -2
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.2.5b3.dist-info}/METADATA +2 -1
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.2.5b3.dist-info}/RECORD +8 -7
- {wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.2.5b3.dist-info}/WHEEL +1 -1
|
@@ -19,11 +19,13 @@ class TranscriptionService:
|
|
|
19
19
|
self,
|
|
20
20
|
ai_application_service: AiApplicationService,
|
|
21
21
|
persistence_service: PersistenceService,
|
|
22
|
-
target_language: str = 'es'
|
|
22
|
+
target_language: str = 'es',
|
|
23
|
+
transcription_additional_instructions: str = ''
|
|
23
24
|
):
|
|
24
25
|
self.ai_application_service = ai_application_service
|
|
25
26
|
self.persistence_service = persistence_service
|
|
26
27
|
self.target_language = target_language
|
|
28
|
+
self.transcription_additional_instructions = transcription_additional_instructions
|
|
27
29
|
self.chat_model = self.ai_application_service.load_chat_model()
|
|
28
30
|
|
|
29
31
|
def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
|
|
@@ -50,6 +52,7 @@ class TranscriptionService:
|
|
|
50
52
|
}]
|
|
51
53
|
),
|
|
52
54
|
]).partial(
|
|
55
|
+
transcription_additional_instructions=self.transcription_additional_instructions,
|
|
53
56
|
format_instructions=transcription_output_parser.get_format_instructions()
|
|
54
57
|
)
|
|
55
58
|
model_with_structured_output = self.chat_model.with_structured_output(Transcription)
|
|
@@ -22,6 +22,7 @@ TRANSCRIPTION RULES:
|
|
|
22
22
|
- Include: footnotes, page numbers, bullet points, lists, captions
|
|
23
23
|
- Preserve: bold, italic, underlined, and other text formatting using markdown
|
|
24
24
|
- Mark unclear text as [unclear] or [illegible] with best guess in brackets
|
|
25
|
+
- Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
|
|
25
26
|
|
|
26
27
|
2. LANGUAGE REQUIREMENTS:
|
|
27
28
|
- All transcribed content MUST be in the document's primary language
|
|
@@ -70,9 +71,15 @@ CRITICAL REMINDERS:
|
|
|
70
71
|
- Maintain professional transcription standards
|
|
71
72
|
- Complete transcription is mandatory
|
|
72
73
|
|
|
74
|
+
<additional_instructions>
|
|
75
|
+
{transcription_additional_instructions}
|
|
76
|
+
</additional_instructions>
|
|
77
|
+
|
|
78
|
+
|
|
73
79
|
Generate the optimized transcription following these specifications:
|
|
74
80
|
{format_instructions}
|
|
75
81
|
|
|
82
|
+
|
|
76
83
|
"""
|
|
77
84
|
|
|
78
85
|
CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from typing_extensions import Sequence
|
|
2
|
+
from test.test_typing import CoolEmployee
|
|
3
|
+
from langchain_core.documents import Document
|
|
4
|
+
from langchain_chroma import Chroma
|
|
5
|
+
from typing import List
|
|
6
|
+
import logging
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
from ...application.interfaces import EmbeddingsManager
|
|
9
|
+
|
|
10
|
+
# load_dotenv()
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
class ChromaEmbeddingsManager(EmbeddingsManager):
|
|
15
|
+
|
|
16
|
+
__slots__ = ("embeddings_model", "chroma_host", "collection_name", "metadata_tags")
|
|
17
|
+
def __init__(
|
|
18
|
+
self,
|
|
19
|
+
embeddings_model,
|
|
20
|
+
chroma_host,
|
|
21
|
+
collection_name: str,
|
|
22
|
+
metadata_tags: dict
|
|
23
|
+
):
|
|
24
|
+
"""
|
|
25
|
+
Initialize the ChromaEmbeddingsManager.
|
|
26
|
+
Args:
|
|
27
|
+
embeddings_model: The embeddings model to use for generating vector embeddings
|
|
28
|
+
(typically a LangChain embeddings model instance)
|
|
29
|
+
chroma_host: The Chroma host URL
|
|
30
|
+
collection_name: The Chroma collection name
|
|
31
|
+
metadata_tags: Tags to add as metadata to Chroma vector store
|
|
32
|
+
|
|
33
|
+
Raises:
|
|
34
|
+
Exception: If there's an error initializing the ChromaEmbeddingsManager
|
|
35
|
+
"""
|
|
36
|
+
self.collection_name = collection_name
|
|
37
|
+
self.embeddings_model = embeddings_model
|
|
38
|
+
self.chroma_host = chroma_host
|
|
39
|
+
self.metadata_tags_schema = []
|
|
40
|
+
|
|
41
|
+
for tag_key in metadata_tags:
|
|
42
|
+
self.metadata_tags_schema.append({
|
|
43
|
+
"type": "tag",
|
|
44
|
+
"name": tag_key
|
|
45
|
+
})
|
|
46
|
+
|
|
47
|
+
try:
|
|
48
|
+
self.chroma = Chroma(
|
|
49
|
+
collection_name=self.collection_name,
|
|
50
|
+
embedding_function=self.embeddings_model,
|
|
51
|
+
host=self.chroma_host,
|
|
52
|
+
)
|
|
53
|
+
logger.info("ChromaEmbeddingsManager initialized")
|
|
54
|
+
except Exception as e:
|
|
55
|
+
logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
|
|
56
|
+
raise
|
|
57
|
+
|
|
58
|
+
def configure_vector_store(
|
|
59
|
+
self,
|
|
60
|
+
table_name: str = "",
|
|
61
|
+
vector_size: int = 768,
|
|
62
|
+
content_column: str = "document",
|
|
63
|
+
id_column: str = "id",
|
|
64
|
+
metadata_json_column: str = "cmetadata",
|
|
65
|
+
pg_record_manager: str = ""
|
|
66
|
+
):
|
|
67
|
+
"""Configure the vector store."""
|
|
68
|
+
pass
|
|
69
|
+
|
|
70
|
+
def init_vector_store(
|
|
71
|
+
self,
|
|
72
|
+
table_name: str = "",
|
|
73
|
+
content_column: str = "document",
|
|
74
|
+
metadata_json_column: str = "cmetadata",
|
|
75
|
+
id_column: str = "id",
|
|
76
|
+
):
|
|
77
|
+
"""Initialize the vector store."""
|
|
78
|
+
pass
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def index_documents(self, documents: list[Document]):
|
|
82
|
+
"""
|
|
83
|
+
Add documents to the vector store with their embeddings.
|
|
84
|
+
|
|
85
|
+
This method takes a list of Document objects, generates embeddings for them
|
|
86
|
+
using the embeddings model, and stores both the documents and their
|
|
87
|
+
embeddings in the Chroma collection.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
documents: A list of LangChain Document objects to add to the vector store
|
|
91
|
+
Each Document should have page_content and metadata attributes
|
|
92
|
+
from langchain_core.documents import Document
|
|
93
|
+
Returns:
|
|
94
|
+
None
|
|
95
|
+
|
|
96
|
+
Raises:
|
|
97
|
+
Exception: If there's an error adding documents to the vector store
|
|
98
|
+
"""
|
|
99
|
+
try:
|
|
100
|
+
logger.info(f"Indexing {len(documents)} documents in vector store")
|
|
101
|
+
self.chroma.add_documents(documents)
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error(f"Error indexing documents: {str(e)}")
|
|
104
|
+
raise
|
|
105
|
+
|
|
106
|
+
def get_documents_by_id(self, ids: list[str]):
|
|
107
|
+
"""
|
|
108
|
+
Get document by ID from the vector store.
|
|
109
|
+
"""
|
|
110
|
+
try:
|
|
111
|
+
return self.chroma.get_by_ids(ids)
|
|
112
|
+
except Exception as e:
|
|
113
|
+
logger.error(f"Error getting documents by ID: {str(e)}")
|
|
114
|
+
raise
|
|
115
|
+
|
|
116
|
+
def delete_documents_by_id(self, ids: list[str]):
|
|
117
|
+
"""
|
|
118
|
+
Delete documents by ID from the vector store.
|
|
119
|
+
"""
|
|
120
|
+
try:
|
|
121
|
+
self.chroma.delete(ids)
|
|
122
|
+
except Exception as e:
|
|
123
|
+
logger.error(f"Error deleting documents by ID: {str(e)}")
|
|
124
|
+
raise
|
|
125
|
+
|
|
126
|
+
def get_documents_keys_by_source_id(self, source_id: str):
|
|
127
|
+
"""Get documents keys by source ID."""
|
|
128
|
+
pass
|
|
129
|
+
|
|
130
|
+
def delete_documents_by_source_id(self, source_id: str):
|
|
131
|
+
"""Delete documents by source ID."""
|
|
132
|
+
pass
|
wizit_context_ingestor/main.py
CHANGED
|
@@ -14,8 +14,9 @@ class DeelabTranscribeManager:
|
|
|
14
14
|
gcp_project_id: str,
|
|
15
15
|
gcp_project_location: str,
|
|
16
16
|
gcp_secret_name: str,
|
|
17
|
-
llm_model_id: str = "claude-
|
|
17
|
+
llm_model_id: str = "claude-sonnet-4@20250514",
|
|
18
18
|
target_language: str = 'es',
|
|
19
|
+
transcription_additional_instructions: str = ''
|
|
19
20
|
):
|
|
20
21
|
self.gcp_project_id = gcp_project_id
|
|
21
22
|
self.gcp_project_location = gcp_project_location
|
|
@@ -23,6 +24,7 @@ class DeelabTranscribeManager:
|
|
|
23
24
|
self.gcp_secret_name = gcp_secret_name
|
|
24
25
|
self.llm_model_id = llm_model_id
|
|
25
26
|
self.target_language = target_language
|
|
27
|
+
self.transcription_additional_instructions = transcription_additional_instructions
|
|
26
28
|
self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
|
|
27
29
|
self.vertex_model = self._get_vertex_model()
|
|
28
30
|
|
|
@@ -55,7 +57,8 @@ class DeelabTranscribeManager:
|
|
|
55
57
|
transcribe_document_service = TranscriptionService(
|
|
56
58
|
ai_application_service=self.vertex_model,
|
|
57
59
|
persistence_service=s3_persistence_service,
|
|
58
|
-
target_language=self.target_language
|
|
60
|
+
target_language=self.target_language,
|
|
61
|
+
transcription_additional_instructions=self.transcription_additional_instructions
|
|
59
62
|
)
|
|
60
63
|
parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
|
|
61
64
|
origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)
|
{wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.2.5b3.dist-info}/METADATA
RENAMED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: wizit-context-ingestor
|
|
3
|
-
Version: 0.2.5b2
|
|
3
|
+
Version: 0.2.5b3
|
|
4
4
|
Summary: Contextual Rag with Cloud Solutions
|
|
5
5
|
Requires-Dist: anthropic[vertex]>=0.66.0
|
|
6
6
|
Requires-Dist: boto3>=1.40.23
|
|
7
7
|
Requires-Dist: langchain-aws>=0.2.31
|
|
8
|
+
Requires-Dist: langchain-chroma>=0.2.6
|
|
8
9
|
Requires-Dist: langchain-experimental>=0.3.4
|
|
9
10
|
Requires-Dist: langchain-google-vertexai>=2.0.28
|
|
10
11
|
Requires-Dist: langchain-redis>=0.2.3
|
{wizit_context_ingestor-0.2.5b2.dist-info → wizit_context_ingestor-0.2.5b3.dist-info}/RECORD
RENAMED
|
@@ -2,10 +2,10 @@ wizit_context_ingestor/.DS_Store,sha256=c7hZ0C8v2hxprMlCgmvxXDl92phew3iSATJzE1yY
|
|
|
2
2
|
wizit_context_ingestor/__init__.py,sha256=GQdqSrpsSS7mdbfIn-Osse4EI54PvqlDYeBZwCuuNNA,134
|
|
3
3
|
wizit_context_ingestor/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
wizit_context_ingestor/application/context_chunk_service.py,sha256=0nnn6vbxnLovoriu0f7EIqiAJA713Pd8L95QNK6fjnM,4916
|
|
5
|
-
wizit_context_ingestor/application/interfaces.py,sha256=
|
|
6
|
-
wizit_context_ingestor/application/transcription_service.py,sha256=
|
|
5
|
+
wizit_context_ingestor/application/interfaces.py,sha256=W0qonE3t-S-zwAoKtDYc4oyW_GOILKVmrdy8LnC8MVI,3193
|
|
6
|
+
wizit_context_ingestor/application/transcription_service.py,sha256=nYJ3pNdVumTeV0pjFrmLNrsj8ZdIfQczxdL7jpKuQmA,4323
|
|
7
7
|
wizit_context_ingestor/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
wizit_context_ingestor/data/prompts.py,sha256=
|
|
8
|
+
wizit_context_ingestor/data/prompts.py,sha256=VG8SCMrp5CvhlKk08D-kvARggNtt-xhND6_PL2Xfk30,6906
|
|
9
9
|
wizit_context_ingestor/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
wizit_context_ingestor/domain/models.py,sha256=DV83PArMyh-VoUqnVF_ohcgStsk549ixdYw98B8o2GI,381
|
|
11
11
|
wizit_context_ingestor/domain/services.py,sha256=0i9WwZ0ufBgnzNJ5dt8Iop9VLTeK_AqjcaH8p3Av26I,3347
|
|
@@ -14,18 +14,19 @@ wizit_context_ingestor/infra/aws_model.py,sha256=glIaewSdv6PDBXoCe6QgCUIzLCjtM7K
|
|
|
14
14
|
wizit_context_ingestor/infra/persistence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
wizit_context_ingestor/infra/persistence/local_storage.py,sha256=sDFat-FMN123FUWZp_ztwoVjl0HrzChCDZmicFemy5o,1707
|
|
16
16
|
wizit_context_ingestor/infra/persistence/s3_storage.py,sha256=HYO3gWNE64ECSYYrxrIi9-2jWv1vwwGEE5QX-ZqpOCs,4791
|
|
17
|
+
wizit_context_ingestor/infra/rag/chroma_embeddings.py,sha256=MZls9JoessXm48dqY-an3zRDehO_j3FkWBDF9ls2RAU,4297
|
|
17
18
|
wizit_context_ingestor/infra/rag/pg_embeddings.py,sha256=5m4R4GmwMU3C7AR3Je3nCdgO-2jyIaCG4QN9phGD68Q,8072
|
|
18
19
|
wizit_context_ingestor/infra/rag/redis_embeddings.py,sha256=wlgSBedq_kcrZ3SF4vGVTWM0B350kkd8C894i4mMUA8,4828
|
|
19
20
|
wizit_context_ingestor/infra/rag/semantic_chunks.py,sha256=tM6bSacBvu-VWb3VkxgQNrfskz3zFxOOAU23D2kZWD8,2255
|
|
20
21
|
wizit_context_ingestor/infra/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
22
|
wizit_context_ingestor/infra/secrets/aws_secrets_manager.py,sha256=1k_R_uzLabptiZ1GXAoqAgYpk8EykXIb-pUDdidUDJQ,1202
|
|
22
23
|
wizit_context_ingestor/infra/vertex_model.py,sha256=Izpz2ZQ4Koh4PSrHAj_0iUv4Rx354SlUqqw-LrLXCOE,7256
|
|
23
|
-
wizit_context_ingestor/main.py,sha256=
|
|
24
|
+
wizit_context_ingestor/main.py,sha256=dX0sQcbnpyFI0uUiU3g-qn5069xk2KILviK7NqrFOIk,8206
|
|
24
25
|
wizit_context_ingestor/services/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
|
|
25
26
|
wizit_context_ingestor/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
27
|
wizit_context_ingestor/services/chunks.py,sha256=tQQsdsOscZWzqVY5WxVxr3ii62FOJ3nMARaJJz6CvjQ,2011
|
|
27
28
|
wizit_context_ingestor/services/parse_doc.py,sha256=3CyZoGbiUfxbs0SXUWXjQevtusSzTBgvUVeNNSdxJLE,4491
|
|
28
29
|
wizit_context_ingestor/services/pg_embeddings_manager.py,sha256=n1HOmu_Z_Z71H-rVAyJS3FdPKbBckm5W8_XethY8nuM,4998
|
|
29
|
-
wizit_context_ingestor-0.2.
|
|
30
|
-
wizit_context_ingestor-0.2.
|
|
31
|
-
wizit_context_ingestor-0.2.
|
|
30
|
+
wizit_context_ingestor-0.2.5b3.dist-info/WHEEL,sha256=-neZj6nU9KAMg2CnCY6T3w8J53nx1kFGw_9HfoSzM60,79
|
|
31
|
+
wizit_context_ingestor-0.2.5b3.dist-info/METADATA,sha256=sexOso1mw8Gw3AEd5yD-F020VGjep0S-XLbNjJCB6LU,3616
|
|
32
|
+
wizit_context_ingestor-0.2.5b3.dist-info/RECORD,,
|