wizit-context-ingestor 0.2.5b1__tar.gz → 0.2.5b3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/PKG-INFO +2 -1
  2. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/pyproject.toml +2 -1
  3. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/interfaces.py +1 -1
  4. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/transcription_service.py +4 -1
  5. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/prompts.py +7 -0
  6. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/services.py +12 -12
  7. wizit_context_ingestor-0.2.5b3/src/wizit_context_ingestor/infra/rag/chroma_embeddings.py +132 -0
  8. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/main.py +5 -2
  9. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/parse_doc.py +11 -11
  10. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/README.md +0 -0
  11. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/.DS_Store +0 -0
  12. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/__init__.py +0 -0
  13. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/__init__.py +0 -0
  14. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/application/context_chunk_service.py +0 -0
  15. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/data/__init__.py +0 -0
  16. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/__init__.py +0 -0
  17. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/domain/models.py +0 -0
  18. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/__init__.py +0 -0
  19. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/aws_model.py +0 -0
  20. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
  21. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/local_storage.py +0 -0
  22. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +0 -0
  23. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -0
  24. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +0 -0
  25. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +0 -0
  26. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
  27. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
  28. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/infra/vertex_model.py +0 -0
  29. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/.DS_Store +0 -0
  30. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/__init__.py +0 -0
  31. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/chunks.py +0 -0
  32. {wizit_context_ingestor-0.2.5b1 → wizit_context_ingestor-0.2.5b3}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.5b1
3
+ Version: 0.2.5b3
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
7
7
  Requires-Dist: langchain-aws>=0.2.31
8
+ Requires-Dist: langchain-chroma>=0.2.6
8
9
  Requires-Dist: langchain-experimental>=0.3.4
9
10
  Requires-Dist: langchain-google-vertexai>=2.0.28
10
11
  Requires-Dist: langchain-redis>=0.2.3
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wizit_context_ingestor"
3
- version = "0.2.5-beta-1"
3
+ version = "0.2.5-beta-3"
4
4
  description = "Contextual Rag with Cloud Solutions"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -8,6 +8,7 @@ dependencies = [
8
8
  "anthropic[vertex]>=0.66.0",
9
9
  "boto3>=1.40.23",
10
10
  "langchain-aws>=0.2.31",
11
+ "langchain-chroma>=0.2.6",
11
12
  "langchain-experimental>=0.3.4",
12
13
  "langchain-google-vertexai>=2.0.28",
13
14
  "langchain-redis>=0.2.3",
@@ -92,7 +92,7 @@ class EmbeddingsManager(ABC):
92
92
  pass
93
93
 
94
94
  @abstractmethod
95
- def index_documents(self, documents: List[Document]):
95
+ def index_documents(self, documents: list[Document]):
96
96
  """Index documents."""
97
97
  pass
98
98
 
@@ -19,11 +19,13 @@ class TranscriptionService:
19
19
  self,
20
20
  ai_application_service: AiApplicationService,
21
21
  persistence_service: PersistenceService,
22
- target_language: str = 'es'
22
+ target_language: str = 'es',
23
+ transcription_additional_instructions: str = ''
23
24
  ):
24
25
  self.ai_application_service = ai_application_service
25
26
  self.persistence_service = persistence_service
26
27
  self.target_language = target_language
28
+ self.transcription_additional_instructions = transcription_additional_instructions
27
29
  self.chat_model = self.ai_application_service.load_chat_model()
28
30
 
29
31
  def parse_doc_page(self, document: ParsedDocPage) -> ParsedDocPage:
@@ -50,6 +52,7 @@ class TranscriptionService:
50
52
  }]
51
53
  ),
52
54
  ]).partial(
55
+ transcription_additional_instructions=self.transcription_additional_instructions,
53
56
  format_instructions=transcription_output_parser.get_format_instructions()
54
57
  )
55
58
  model_with_structured_output = self.chat_model.with_structured_output(Transcription)
@@ -22,6 +22,7 @@ TRANSCRIPTION RULES:
22
22
  - Include: footnotes, page numbers, bullet points, lists, captions
23
23
  - Preserve: bold, italic, underlined, and other text formatting using markdown
24
24
  - Mark unclear text as [unclear] or [illegible] with best guess in brackets
25
+ - Enclose all underlined content in <UnderlinedContent></UnderlinedContent> tags
25
26
 
26
27
  2. LANGUAGE REQUIREMENTS:
27
28
  - All transcribed content MUST be in the document's primary language
@@ -70,9 +71,15 @@ CRITICAL REMINDERS:
70
71
  - Maintain professional transcription standards
71
72
  - Complete transcription is mandatory
72
73
 
74
+ <additional_instructions>
75
+ {transcription_additional_instructions}
76
+ </additional_instructions>
77
+
78
+
73
79
  Generate the optimized transcription following these specifications:
74
80
  {format_instructions}
75
81
 
82
+
76
83
  """
77
84
 
78
85
  CONTEXT_CHUNKS_IN_DOCUMENT_SYSTEM_PROMPT = """
@@ -1,7 +1,7 @@
1
1
  import base64
2
2
  import logging
3
3
  import io
4
- import fitz
4
+ import pymupdf
5
5
  from PIL import Image
6
6
  from typing import List
7
7
  from ..domain.models import ParsedDocPage, ParsedDoc
@@ -17,25 +17,25 @@ class ParseDocModelService():
17
17
  def __init__(self, file_path: str):
18
18
  """
19
19
  Initialize a PDF document parser.
20
-
20
+
21
21
  Args:
22
22
  file_path: Path to the PDF file to parse
23
23
  """
24
24
  self.file_path = file_path
25
- self.pdf_document = fitz.open(file_path)
25
+ self.pdf_document = pymupdf.open(file_path)
26
26
  self.page_count = self.pdf_document.page_count
27
27
 
28
-
28
+
29
29
  def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
30
30
  """
31
31
  Convert a PDF page to a base64-encoded PNG image.
32
-
32
+
33
33
  Args:
34
34
  page_number: One-indexed page number to convert
35
-
35
+
36
36
  Returns:
37
37
  Base64 encoded string of the page image
38
-
38
+
39
39
  Raises:
40
40
  Exception: If there's an error during conversion
41
41
  """
@@ -49,7 +49,7 @@ class ParseDocModelService():
49
49
  b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
50
50
  logger.info(f"Page {page_number} encoded successfully")
51
51
  return ParsedDocPage(
52
- page_number=page_number,
52
+ page_number=page_number,
53
53
  page_base64=b64_encoded_image
54
54
  )
55
55
  except Exception as e:
@@ -59,15 +59,15 @@ class ParseDocModelService():
59
59
  def parse_document_to_base64(self) -> List[ParsedDocPage]:
60
60
  """
61
61
  Convert all pages in the PDF document to base64-encoded images.
62
-
62
+
63
63
  Returns:
64
64
  List of base64 encoded strings for each page
65
-
65
+
66
66
  Raises:
67
67
  Exception: If there's an error during conversion
68
68
  """
69
69
  # BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
70
- # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
70
+ # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
71
71
  # RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
72
72
  # OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
73
73
  # COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
@@ -95,4 +95,4 @@ class ParseDocModelService():
95
95
  document_text=md_content
96
96
  )
97
97
 
98
- # def
98
+ # def
@@ -0,0 +1,132 @@
1
+ from typing_extensions import Sequence
2
+ from test.test_typing import CoolEmployee
3
+ from langchain_core.documents import Document
4
+ from langchain_chroma import Chroma
5
+ from typing import List
6
+ import logging
7
+ from uuid import uuid4
8
+ from ...application.interfaces import EmbeddingsManager
9
+
10
+ # load_dotenv()
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class ChromaEmbeddingsManager(EmbeddingsManager):
    """Embeddings manager backed by a Chroma collection.

    Wraps a ``langchain_chroma.Chroma`` vector store and exposes the
    indexing, lookup and deletion operations used by the ingestor.
    """

    # NOTE(review): this module also imports ``CoolEmployee`` from
    # ``test.test_typing``. Nothing in this class uses it, and CPython's
    # ``test`` package is not installed on every distribution, so that import
    # can make the whole module fail to load. It should be removed.

    # Every attribute assigned in ``__init__`` is declared here. The previous
    # tuple omitted ``metadata_tags_schema`` and ``chroma``, which only works
    # while a base class still provides an instance ``__dict__``.
    __slots__ = (
        "embeddings_model",
        "chroma_host",
        "collection_name",
        "metadata_tags",
        "metadata_tags_schema",
        "chroma",
    )

    def __init__(
        self,
        embeddings_model,
        chroma_host,
        collection_name: str,
        metadata_tags: dict,
    ):
        """
        Initialize the ChromaEmbeddingsManager.

        Args:
            embeddings_model: The embeddings model used to generate vector
                embeddings (typically a LangChain embeddings model instance)
            chroma_host: Host of the Chroma server, forwarded to ``Chroma``
                as ``host``
            collection_name: The Chroma collection name
            metadata_tags: Mapping whose keys name the metadata tags. A falsy
                value (``None`` or empty) is treated as "no tags".

        Raises:
            Exception: If there's an error initializing the Chroma vector
                store; the original exception is logged and re-raised.
        """
        self.collection_name = collection_name
        self.embeddings_model = embeddings_model
        self.chroma_host = chroma_host

        # Tolerate a missing tag mapping instead of failing on iteration.
        tags = metadata_tags or {}
        self.metadata_tags = tags
        # One "tag" schema entry per key. The schema is only stored on the
        # instance; it is not passed to Chroma anywhere in this class.
        self.metadata_tags_schema = [
            {"type": "tag", "name": tag_key} for tag_key in tags
        ]

        try:
            self.chroma = Chroma(
                collection_name=self.collection_name,
                embedding_function=self.embeddings_model,
                host=self.chroma_host,
            )
        except Exception as e:
            logger.error("Failed to initialize ChromaEmbeddingsManager: %s", e)
            raise
        else:
            logger.info("ChromaEmbeddingsManager initialized")

    def configure_vector_store(
        self,
        table_name: str = "",
        vector_size: int = 768,
        content_column: str = "document",
        id_column: str = "id",
        metadata_json_column: str = "cmetadata",
        pg_record_manager: str = "",
    ):
        """
        Configure the vector store (no-op for Chroma).

        All parameters are accepted and ignored.
        NOTE(review): the signature presumably mirrors the other embeddings
        managers so callers can treat backends uniformly - confirm against
        the EmbeddingsManager interface.
        """
        pass

    def init_vector_store(
        self,
        table_name: str = "",
        content_column: str = "document",
        metadata_json_column: str = "cmetadata",
        id_column: str = "id",
    ):
        """
        Initialize the vector store (no-op for Chroma).

        All parameters are accepted and ignored; the Chroma client is already
        created in ``__init__``.
        """
        pass

    def index_documents(self, documents: list[Document]):
        """
        Add documents to the Chroma collection.

        The documents are handed to ``Chroma.add_documents``, which is
        constructed with this manager's embeddings model.

        Args:
            documents: A list of LangChain Document objects to add to the
                vector store

        Returns:
            None

        Raises:
            Exception: If there's an error adding documents to the vector
                store; the original exception is logged and re-raised.
        """
        try:
            logger.info("Indexing %s documents in vector store", len(documents))
            self.chroma.add_documents(documents)
        except Exception as e:
            logger.error("Error indexing documents: %s", e)
            raise

    def get_documents_by_id(self, ids: list[str]):
        """
        Get documents by ID from the vector store.

        Args:
            ids: IDs of the documents to fetch

        Returns:
            Whatever ``Chroma.get_by_ids`` returns for the given IDs

        Raises:
            Exception: If the lookup fails; the original exception is logged
                and re-raised.
        """
        try:
            return self.chroma.get_by_ids(ids)
        except Exception as e:
            logger.error("Error getting documents by ID: %s", e)
            raise

    def delete_documents_by_id(self, ids: list[str]):
        """
        Delete documents by ID from the vector store.

        Args:
            ids: IDs of the documents to delete

        Raises:
            Exception: If the deletion fails; the original exception is
                logged and re-raised.
        """
        try:
            self.chroma.delete(ids=ids)
        except Exception as e:
            logger.error("Error deleting documents by ID: %s", e)
            raise

    def get_documents_keys_by_source_id(self, source_id: str):
        """
        Get documents keys by source ID.

        Not implemented for Chroma: always returns None.
        NOTE(review): callers that expect a collection of keys must handle
        None - confirm how the ingestion flow uses this result.
        """
        pass

    def delete_documents_by_source_id(self, source_id: str):
        """
        Delete documents by source ID.

        Not implemented for Chroma: nothing is deleted and None is returned.
        NOTE(review): re-ingesting a source will therefore not remove its
        previously indexed chunks - confirm whether that is acceptable.
        """
        pass
@@ -14,8 +14,9 @@ class DeelabTranscribeManager:
14
14
  gcp_project_id: str,
15
15
  gcp_project_location: str,
16
16
  gcp_secret_name: str,
17
- llm_model_id: str = "claude-3-5-sonnet-v2@20241022",
17
+ llm_model_id: str = "claude-sonnet-4@20250514",
18
18
  target_language: str = 'es',
19
+ transcription_additional_instructions: str = ''
19
20
  ):
20
21
  self.gcp_project_id = gcp_project_id
21
22
  self.gcp_project_location = gcp_project_location
@@ -23,6 +24,7 @@ class DeelabTranscribeManager:
23
24
  self.gcp_secret_name = gcp_secret_name
24
25
  self.llm_model_id = llm_model_id
25
26
  self.target_language = target_language
27
+ self.transcription_additional_instructions = transcription_additional_instructions
26
28
  self.gcp_sa_dict = self._get_gcp_sa_dict(gcp_secret_name)
27
29
  self.vertex_model = self._get_vertex_model()
28
30
 
@@ -55,7 +57,8 @@ class DeelabTranscribeManager:
55
57
  transcribe_document_service = TranscriptionService(
56
58
  ai_application_service=self.vertex_model,
57
59
  persistence_service=s3_persistence_service,
58
- target_language=self.target_language
60
+ target_language=self.target_language,
61
+ transcription_additional_instructions=self.transcription_additional_instructions
59
62
  )
60
63
  parsed_pages, parsed_document = transcribe_document_service.process_document(file_key)
61
64
  origin_bucket_file_tags = s3_persistence_service.retrieve_file_tags(file_key, s3_origin_bucket_name)
@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
4
4
  import base64
5
5
  import logging
6
6
  import io
7
- import fitz
7
+ import pymupdf
8
8
  from PIL import Image
9
9
  from typing import List, Any
10
10
  from dotenv import load_dotenv
@@ -23,13 +23,13 @@ class ParseDoc:
23
23
  def __init__(self, file_path: str, system_prompt, chat_model: Any):
24
24
  """
25
25
  Initialize a PDF document parser.
26
-
26
+
27
27
  Args:
28
28
  file_path: Path to the PDF file to parse
29
29
  chat_model: Language model for processing document content
30
30
  """
31
31
  self.file_path = file_path
32
- self.pdf_document = fitz.open(file_path)
32
+ self.pdf_document = pymupdf.open(file_path)
33
33
  self.page_count = self.pdf_document.page_count
34
34
  self.system_prompt = system_prompt
35
35
  self.chat_model = chat_model
@@ -37,13 +37,13 @@ class ParseDoc:
37
37
  def pdf_page_to_base64(self, page_number: int) -> str:
38
38
  """
39
39
  Convert a PDF page to a base64-encoded PNG image.
40
-
40
+
41
41
  Args:
42
42
  page_number: One-indexed page number to convert
43
-
43
+
44
44
  Returns:
45
45
  Base64 encoded string of the page image
46
-
46
+
47
47
  Raises:
48
48
  Exception: If there's an error during conversion
49
49
  """
@@ -69,10 +69,10 @@ class ParseDoc:
69
69
  def parse_document_to_base64(self) -> List[str]:
70
70
  """
71
71
  Convert all pages in the PDF document to base64-encoded images.
72
-
72
+
73
73
  Returns:
74
74
  List of base64 encoded strings for each page
75
-
75
+
76
76
  Raises:
77
77
  Exception: If there's an error during conversion
78
78
  """
@@ -90,14 +90,14 @@ class ParseDoc:
90
90
  def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
91
91
  """
92
92
  Process a base64-encoded image with a language model using the provided prompt.
93
-
93
+
94
94
  Args:
95
95
  base_64_image: Base64 encoded image string
96
96
  prompt: Text prompt to send with the image
97
-
97
+
98
98
  Returns:
99
99
  Language model response
100
-
100
+
101
101
  Raises:
102
102
  Exception: If there's an error during processing
103
103
  """