wizit-context-ingestor 0.2.5b2__tar.gz → 0.2.5b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/PKG-INFO +2 -1
  2. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/pyproject.toml +2 -1
  3. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/interfaces.py +1 -1
  4. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/transcription_service.py +4 -1
  5. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/prompts.py +7 -0
  6. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +132 -0
  7. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/main.py +5 -2
  8. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/README.md +0 -0
  9. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/.DS_Store +0 -0
  10. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/__init__.py +0 -0
  11. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/__init__.py +0 -0
  12. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/context_chunk_service.py +0 -0
  13. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/__init__.py +0 -0
  14. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/__init__.py +0 -0
  15. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/models.py +0 -0
  16. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/services.py +0 -0
  17. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/__init__.py +0 -0
  18. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
  19. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
  20. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/local_storage.py +0 -0
  21. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +0 -0
  22. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -0
  23. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +0 -0
  24. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +0 -0
  25. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
  26. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
  27. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/vertex_model.py +0 -0
  28. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/.DS_Store +0 -0
  29. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/__init__.py +0 -0
  30. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/chunks.py +0 -0
  31. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/parse_doc.py +0 -0
  32. {wizit_context_ingestor-0.2.5b2 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.5b2
3
+ Version: 0.2.5b3
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
7
7
  Requires-Dist: langchain-aws>=0.2.31
8
+ Requires-Dist: langchain-chroma>=0.2.6
8
9
  Requires-Dist: langchain-experimental>=0.3.4
9
10
  Requires-Dist: langchain-google-vertexai>=2.0.28
10
11
  Requires-Dist: langchain-redis>=0.2.3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wizit_context_ingestor"
3
- version = "0.2.5-beta-2"
3
+ version = "0.2.5-beta-3"
4
4
  description = "Contextual Rag with Cloud Solutions"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -8,6 +8,7 @@ dependencies = [
8
8
  "anthropic[vertex]>=0.66.0",
9
9
  "boto3>=1.40.23",
10
10
  "langchain-aws>=0.2.31",
11
+ "langchain-chroma>=0.2.6",
11
12
  "langchain-experimental>=0.3.4",
12
13
  "langchain-google-vertexai>=2.0.28",
13
14
  "langchain-redis>=0.2.3",
@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
92
92
  pass
93
93
 
94
94
  @abstractmethod
95
- def index_documents(self, documents: List[Document]):
95
+ def index_documents(self, documents: list[Document]):
96
96
  """Index documents."""
97
97
  pass
98
98
 
@@ -19,11 +19,13 @@ class TranscriptionService:
19
19
  self,
20
20
  ai_application_service: AiApplicationService,
21
21
  persistence_service: PersistenceService,
22
- target_language: str = 'es'
22
+ target_language: str = 'es',
23
+ transcription_additional_instructions: str = ''
23
24
  ):
24
25
  self.ai_application_service = ai_application_service
25
26
  self.persistence_service = persistence_service
26
27
  self.target_language = target_language
28
+ self.transcription_additional_instructions = transcription_additional_instructions
27
29
  self.chat_model = self.ai_application_service.load_chat_model()
28
30
 
29
31
  def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
@@ -50,6 +52,7 @@ class TranscriptionService:
50
52
  }]
51
53
  ),
52
54
  ]).partial(
55
+ transcription_additional_instructions=self.transcription_additional_instructions,
53
56
  format_instructions=transcription_output_parser.get_format_instructions()
54
57
  )
55
58
  model_with_structured_output = self.chat_model.with_structured_output(Transcription)
@@ -22,6 +22,7 @@ TRANSCRIPTION RULES:
22
22
  - Include: footnotes, page numbers, bullet points, lists, captions
23
23
  - Preserve: bold, italic, underlined, and other text formatting using markdown
24
24
  - Mark unclear text as [unclear] or [illegible] with best guess in brackets
25
+ - Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
25
26
 
26
27
  2. LANGUAGE REQUIREMENTS:
27
28
  - All transcribed content MUST be in the document's primary language
@@ -70,9 +71,15 @@ CRITICAL REMINDERS:
70
71
  - Maintain professional transcription standards
71
72
  - Complete transcription is mandatory
72
73
 
74
+ <additional_instructions>
75
+ {transcription_additional_instructions}
76
+ </additional_instructions>
77
+
78
+
73
79
  Generate the optimized transcription following these specifications:
74
80
  {format_instructions}
75
81
 
82
+
76
83
  """
77
84
 
78
85
  CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -0,0 +1,132 @@
1
+ from typing_extensions import Sequence
2
+ from test.test_typing import CoolEmployee
3
+ from langchain_core.documents import Document
4
+ from langchain_chroma import Chroma
5
+ from typing import List
6
+ import logging
7
+ from uuid import uuid4
8
+ from ...application.interfaces import EmbeddingsManager
9
+
10
+ # load_dotenv()
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ class ChromaEmbeddingsManager(EmbeddingsManager):
15
+
16
+ __slots__ = ("embeddings_model", "chroma_host", "collection_name", "metadata_tags")
17
+ def __init__(
18
+ self,
19
+ embeddings_model,
20
+ chroma_host,
21
+ collection_name: str,
22
+ metadata_tags: dict
23
+ ):
24
+ """
25
+ Initialize the ChromaEmbeddingsManager.
26
+ Args:
27
+ embeddings_model: The embeddings model to use for generating vector embeddings
28
+ (typically a LangChain embeddings model instance)
29
+ chroma_host: The Chroma host URL
30
+ collection_name: The Chroma collection name
31
+ metadata_tags: Tags to add as metadata to Chroma vector store
32
+
33
+ Raises:
34
+ Exception: If there's an error initializing the ChromaEmbeddingsManager
35
+ """
36
+ self.collection_name = collection_name
37
+ self.embeddings_model = embeddings_model
38
+ self.chroma_host = chroma_host
39
+ self.metadata_tags_schema = []
40
+
41
+ for tag_key in metadata_tags:
42
+ self.metadata_tags_schema.append({
43
+ "type": "tag",
44
+ "name": tag_key
45
+ })
46
+
47
+ try:
48
+ self.chroma = Chroma(
49
+ collection_name=self.collection_name,
50
+ embedding_function=self.embeddings_model,
51
+ host=self.chroma_host,
52
+ )
53
+ logger.info("ChromaEmbeddingsManager initialized")
54
+ except Exception as e:
55
+ logger.error(f"Failed to initialize ChromaEmbeddingsManager: {str(e)}")
56
+ raise
57
+
58
+ def configure_vector_store(
59
+ self,
60
+ table_name: str = "",
61
+ vector_size: int = 768,
62
+ content_column: str = "document",
63
+ id_column: str = "id",
64
+ metadata_json_column: str = "cmetadata",
65
+ pg_record_manager: str = ""
66
+ ):
67
+ """Configure the vector store."""
68
+ pass
69
+
70
+ def init_vector_store(
71
+ self,
72
+ table_name: str = "",
73
+ content_column: str = "document",
74
+ metadata_json_column: str = "cmetadata",
75
+ id_column: str = "id",
76
+ ):
77
+ """Initialize the vector store."""
78
+ pass
79
+
80
+
81
+ def index_documents(self, documents: list[Document]):
82
+ """
83
+ Add documents to the vector store with their embeddings.
84
+
85
+ This method takes a list of Document objects, generates embeddings for them
86
+ using the embeddings model, and stores both the documents and their
87
+ embeddings in the Chroma collection.
88
+
89
+ Args:
90
+ documents: A list of LangChain Document objects to add to the vector store
91
+ Each Document should have page_content and metadata attributes
92
+ from langchain_core.documents import Document
93
+ Returns:
94
+ None
95
+
96
+ Raises:
97
+ Exception: If there's an error adding documents to the vector store
98
+ """
99
+ try:
100
+ logger.info(f"Indexing {len(documents)} documents in vector store")
101
+ self.chroma.add_documents(documents)
102
+ except Exception as e:
103
+ logger.error(f"Error indexing documents: {str(e)}")
104
+ raise
105
+
106
+ def get_documents_by_id(self, ids: list[str]):
107
+ """
108
+ Get documents by their IDs from the vector store.
109
+ """
110
+ try:
111
+ return self.chroma.get_by_ids(ids)
112
+ except Exception as e:
113
+ logger.error(f"Error getting documents by ID: {str(e)}")
114
+ raise
115
+
116
+ def delete_documents_by_id(self, ids: list[str]):
117
+ """
118
+ Delete documents by ID from the vector store.
119
+ """
120
+ try:
121
+ self.chroma.delete(ids)
122
+ except Exception as e:
123
+ logger.error(f"Error deleting documents by ID: {str(e)}")
124
+ raise
125
+
126
+ def get_documents_keys_by_source_id(self, source_id: str):
127
+ """Get documents keys by source ID."""
128
+ pass
129
+
130
+ def delete_documents_by_source_id(self, source_id: str):
131
+ """Delete documents by source ID."""
132
+ pass
@@ -14,8 +14,9 @@ class DeelabTranscribeManager:
14
14
  gcp_project_id: str,
15
15
  gcp_project_location: str,
16
16
  gcp_secret_name: str,
17
- llm_model_id: str = "claude-3-5-sonnet-v2@20241022",
17
+ llm_model_id: str = "claude-sonnet-4@20250514",
18
18
  target_language: str = 'es',
19
+ transcription_additional_instructions: str = ''
19
20
  ):
20
21
  self.gcp_project_id = gcp_project_id
21
22
  self.gcp_project_location = gcp_project_location
@@ -23,6 +24,7 @@ class DeelabTranscribeManager:
23
24
  self.gcp_secret_name = gcp_secret_name
24
25
  self.llm_model_id = llm_model_id
25
26
  self.target_language = target_language
27
+ self.transcription_additional_instructions = transcription_additional_instructions
26
28
  self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
27
29
  self.vertex_model = self._get_vertex_model()
28
30
 
@@ -55,7 +57,8 @@ class DeelabTranscribeManager:
55
57
  transcribe_document_service = TranscriptionService(
56
58
  ai_application_service=self.vertex_model,
57
59
  persistence_service=s3_persistence_service,
58
- target_language=self.target_language
60
+ target_language=self.target_language,
61
+ transcription_additional_instructions=self.transcription_additional_instructions
59
62
  )
60
63
  parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
61
64
  origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)