wizit-context-ingestor 0.2.4b0__tar.gz → 0.2.5b2__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wizit-context-ingestor might be problematic. Click here for more details.

Files changed (31)
  1. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/PKG-INFO +1 -1
  2. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/pyproject.toml +1 -1
  3. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/transcription_service.py +6 -0
  4. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/services.py +12 -12
  5. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/aws_model.py +0 -1
  6. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/local_storage.py +2 -1
  7. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +2 -2
  8. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/main.py +5 -2
  9. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/parse_doc.py +11 -11
  10. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/README.md +0 -0
  11. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/.DS_Store +0 -0
  12. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/__init__.py +0 -0
  13. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/__init__.py +0 -0
  14. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/context_chunk_service.py +0 -0
  15. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/interfaces.py +0 -0
  16. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/data/__init__.py +0 -0
  17. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/data/prompts.py +0 -0
  18. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/__init__.py +0 -0
  19. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/models.py +0 -0
  20. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/__init__.py +0 -0
  21. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
  22. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -0
  23. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +0 -0
  24. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +0 -0
  25. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
  26. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
  27. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/vertex_model.py +0 -0
  28. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/.DS_Store +0 -0
  29. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/__init__.py +0 -0
  30. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/chunks.py +0 -0
  31. {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.4b0
3
+ Version: 0.2.5b2
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "wizit_context_ingestor"
3
- version = "0.2.4-beta"
3
+ version = "0.2.5-beta-2"
4
4
  description = "Contextual Rag with Cloud Solutions"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -56,6 +56,12 @@ class TranscriptionService:
56
56
  # Create the chain
57
57
  chain = prompt | model_with_structured_output
58
58
  # Process the image
59
+ chain = chain.with_retry(
60
+ stop_after_attempt=3,
61
+ exponential_jitter_params={
62
+ "initial": 60
63
+ }
64
+ )
59
65
  result = chain.invoke({})
60
66
  if result.transcription:
61
67
  document.page_text = result.transcription
@@ -1,7 +1,7 @@
1
1
  import base64
2
2
  import logging
3
3
  import io
4
- import fitz
4
+ import pymupdf
5
5
  from PIL import Image
6
6
  from typing import List
7
7
  from ..domain.models import ParsedDocPage, ParsedDoc
@@ -17,25 +17,25 @@ class ParseDocModelService():
17
17
  def __init__(self, file_path: str):
18
18
  """
19
19
  Initialize a PDF document parser.
20
-
20
+
21
21
  Args:
22
22
  file_path: Path to the PDF file to parse
23
23
  """
24
24
  self.file_path = file_path
25
- self.pdf_document = fitz.open(file_path)
25
+ self.pdf_document = pymupdf.open(file_path)
26
26
  self.page_count = self.pdf_document.page_count
27
27
 
28
-
28
+
29
29
  def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
30
30
  """
31
31
  Convert a PDF page to a base64-encoded PNG image.
32
-
32
+
33
33
  Args:
34
34
  page_number: One-indexed page number to convert
35
-
35
+
36
36
  Returns:
37
37
  Base64 encoded string of the page image
38
-
38
+
39
39
  Raises:
40
40
  Exception: If there's an error during conversion
41
41
  """
@@ -49,7 +49,7 @@ class ParseDocModelService():
49
49
  b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
50
50
  logger.info(f"Page {page_number} encoded successfully")
51
51
  return ParsedDocPage(
52
- page_number=page_number,
52
+ page_number=page_number,
53
53
  page_base64=b64_encoded_image
54
54
  )
55
55
  except Exception as e:
@@ -59,15 +59,15 @@ class ParseDocModelService():
59
59
  def parse_document_to_base64(self) -> List[ParsedDocPage]:
60
60
  """
61
61
  Convert all pages in the PDF document to base64-encoded images.
62
-
62
+
63
63
  Returns:
64
64
  List of base64 encoded strings for each page
65
-
65
+
66
66
  Raises:
67
67
  Exception: If there's an error during conversion
68
68
  """
69
69
  # BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
70
- # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
70
+ # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
71
71
  # RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
72
72
  # OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
73
73
  # COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
@@ -95,4 +95,4 @@ class ParseDocModelService():
95
95
  document_text=md_content
96
96
  )
97
97
 
98
- # def
98
+ # def
@@ -37,7 +37,6 @@ class AWSModels(AiApplicationService):
37
37
  temperature: float = 0.7,
38
38
  max_tokens: int = 8000,
39
39
  region_name: str = "us-east-1") -> ChatBedrockConverse:
40
-
41
40
  """
42
41
  Load an AWS AI chat model for text generation.
43
42
 
@@ -1,5 +1,6 @@
1
1
  from ...application.interfaces import PersistenceService
2
2
  from ...domain.models import ParsedDoc
3
+ from typing import Optional
3
4
  import logging
4
5
  import os
5
6
  logger = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ class LocalStorageService(PersistenceService):
40
41
  raise
41
42
 
42
43
 
43
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc):
44
+ def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
44
45
  """Save a parsed document."""
45
46
  with open(f"{self.tmp_folder}/{file_key}", "w", encoding="utf-8") as f:
46
47
  f.write(parsed_document.document_text)
@@ -4,6 +4,7 @@ from boto3 import client as boto3_client
4
4
  import logging
5
5
  import os
6
6
  from botocore.exceptions import ClientError
7
+ from typing import Optional
7
8
 
8
9
  logger = logging.getLogger(__name__)
9
10
 
@@ -77,8 +78,7 @@ class S3StorageService(PersistenceService):
77
78
  raise
78
79
 
79
80
 
80
-
81
- def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: dict = {}):
81
+ def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
82
82
  """Save a parsed document to S3.
83
83
 
84
84
  Args:
@@ -113,8 +113,11 @@ class DeelabRedisChunksManager:
113
113
  try:
114
114
  rag_chunker = SemanticChunks(self.embeddings_model)
115
115
  redis_embeddings_manager = RedisEmbeddingsManager(
116
- embeddings_model=self.embeddings_model,
117
- redis_connection_string=self.redis_connection_string
116
+ self.embeddings_model,
117
+ self.redis_connection_string,
118
+ {
119
+ "file_key": file_key
120
+ }
118
121
  )
119
122
  local_persistence_service = LocalStorageService()
120
123
  context_chunks_in_document_service = ContextChunksInDocumentService(
@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
4
4
  import base64
5
5
  import logging
6
6
  import io
7
- import fitz
7
+ import pymupdf
8
8
  from PIL import Image
9
9
  from typing import List, Any
10
10
  from dotenv import load_dotenv
@@ -23,13 +23,13 @@ class ParseDoc:
23
23
  def __init__(self, file_path: str, system_prompt, chat_model: Any):
24
24
  """
25
25
  Initialize a PDF document parser.
26
-
26
+
27
27
  Args:
28
28
  file_path: Path to the PDF file to parse
29
29
  chat_model: Language model for processing document content
30
30
  """
31
31
  self.file_path = file_path
32
- self.pdf_document = fitz.open(file_path)
32
+ self.pdf_document = pymupdf.open(file_path)
33
33
  self.page_count = self.pdf_document.page_count
34
34
  self.system_prompt = system_prompt
35
35
  self.chat_model = chat_model
@@ -37,13 +37,13 @@ class ParseDoc:
37
37
  def pdf_page_to_base64(self, page_number: int) -> str:
38
38
  """
39
39
  Convert a PDF page to a base64-encoded PNG image.
40
-
40
+
41
41
  Args:
42
42
  page_number: One-indexed page number to convert
43
-
43
+
44
44
  Returns:
45
45
  Base64 encoded string of the page image
46
-
46
+
47
47
  Raises:
48
48
  Exception: If there's an error during conversion
49
49
  """
@@ -69,10 +69,10 @@ class ParseDoc:
69
69
  def parse_document_to_base64(self) -> List[str]:
70
70
  """
71
71
  Convert all pages in the PDF document to base64-encoded images.
72
-
72
+
73
73
  Returns:
74
74
  List of base64 encoded strings for each page
75
-
75
+
76
76
  Raises:
77
77
  Exception: If there's an error during conversion
78
78
  """
@@ -90,14 +90,14 @@ class ParseDoc:
90
90
  def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
91
91
  """
92
92
  Process a base64-encoded image with a language model using the provided prompt.
93
-
93
+
94
94
  Args:
95
95
  base_64_image: Base64 encoded image string
96
96
  prompt: Text prompt to send with the image
97
-
97
+
98
98
  Returns:
99
99
  Language model response
100
-
100
+
101
101
  Raises:
102
102
  Exception: If there's an error during processing
103
103
  """