wizit-context-ingestor 0.2.4b0__tar.gz → 0.2.5b2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wizit-context-ingestor might be problematic. Click here for more details.
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/PKG-INFO +1 -1
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/pyproject.toml +1 -1
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/transcription_service.py +6 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/services.py +12 -12
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/aws_model.py +0 -1
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/local_storage.py +2 -1
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/s3_storage.py +2 -2
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/main.py +5 -2
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/parse_doc.py +11 -11
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/README.md +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/context_chunk_service.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/application/interfaces.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/data/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/data/prompts.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/domain/models.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/persistence/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/pg_embeddings.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/redis_embeddings.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/rag/semantic_chunks.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/secrets/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/secrets/aws_secrets_manager.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/infra/vertex_model.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/.DS_Store +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/__init__.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/chunks.py +0 -0
- {wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/services/pg_embeddings_manager.py +0 -0
|
@@ -56,6 +56,12 @@ class TranscriptionService:
|
|
|
56
56
|
# Create the chain
|
|
57
57
|
chain = prompt | model_with_structured_output
|
|
58
58
|
# Process the image
|
|
59
|
+
chain = chain.with_retry(
|
|
60
|
+
stop_after_attempt=3,
|
|
61
|
+
exponential_jitter_params={
|
|
62
|
+
"initial": 60
|
|
63
|
+
}
|
|
64
|
+
)
|
|
59
65
|
result = chain.invoke({})
|
|
60
66
|
if result.transcription:
|
|
61
67
|
document.page_text = result.transcription
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import base64
|
|
2
2
|
import logging
|
|
3
3
|
import io
|
|
4
|
-
import
|
|
4
|
+
import pymupdf
|
|
5
5
|
from PIL import Image
|
|
6
6
|
from typing import List
|
|
7
7
|
from ..domain.models import ParsedDocPage, ParsedDoc
|
|
@@ -17,25 +17,25 @@ class ParseDocModelService():
|
|
|
17
17
|
def __init__(self, file_path: str):
|
|
18
18
|
"""
|
|
19
19
|
Initialize a PDF document parser.
|
|
20
|
-
|
|
20
|
+
|
|
21
21
|
Args:
|
|
22
22
|
file_path: Path to the PDF file to parse
|
|
23
23
|
"""
|
|
24
24
|
self.file_path = file_path
|
|
25
|
-
self.pdf_document =
|
|
25
|
+
self.pdf_document = pymupdf.open(file_path)
|
|
26
26
|
self.page_count = self.pdf_document.page_count
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
|
|
29
29
|
def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
|
|
30
30
|
"""
|
|
31
31
|
Convert a PDF page to a base64-encoded PNG image.
|
|
32
|
-
|
|
32
|
+
|
|
33
33
|
Args:
|
|
34
34
|
page_number: One-indexed page number to convert
|
|
35
|
-
|
|
35
|
+
|
|
36
36
|
Returns:
|
|
37
37
|
Base64 encoded string of the page image
|
|
38
|
-
|
|
38
|
+
|
|
39
39
|
Raises:
|
|
40
40
|
Exception: If there's an error during conversion
|
|
41
41
|
"""
|
|
@@ -49,7 +49,7 @@ class ParseDocModelService():
|
|
|
49
49
|
b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
|
50
50
|
logger.info(f"Page {page_number} encoded successfully")
|
|
51
51
|
return ParsedDocPage(
|
|
52
|
-
page_number=page_number,
|
|
52
|
+
page_number=page_number,
|
|
53
53
|
page_base64=b64_encoded_image
|
|
54
54
|
)
|
|
55
55
|
except Exception as e:
|
|
@@ -59,15 +59,15 @@ class ParseDocModelService():
|
|
|
59
59
|
def parse_document_to_base64(self) -> List[ParsedDocPage]:
|
|
60
60
|
"""
|
|
61
61
|
Convert all pages in the PDF document to base64-encoded images.
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
Returns:
|
|
64
64
|
List of base64 encoded strings for each page
|
|
65
|
-
|
|
65
|
+
|
|
66
66
|
Raises:
|
|
67
67
|
Exception: If there's an error during conversion
|
|
68
68
|
"""
|
|
69
69
|
# BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
|
|
70
|
-
# GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
|
|
70
|
+
# GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
|
|
71
71
|
# RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
|
|
72
72
|
# OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
|
|
73
73
|
# COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
|
|
@@ -95,4 +95,4 @@ class ParseDocModelService():
|
|
|
95
95
|
document_text=md_content
|
|
96
96
|
)
|
|
97
97
|
|
|
98
|
-
# def
|
|
98
|
+
# def
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from ...application.interfaces import PersistenceService
|
|
2
2
|
from ...domain.models import ParsedDoc
|
|
3
|
+
from typing import Optional
|
|
3
4
|
import logging
|
|
4
5
|
import os
|
|
5
6
|
logger = logging.getLogger(__name__)
|
|
@@ -40,7 +41,7 @@ class LocalStorageService(PersistenceService):
|
|
|
40
41
|
raise
|
|
41
42
|
|
|
42
43
|
|
|
43
|
-
def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc):
|
|
44
|
+
def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
|
|
44
45
|
"""Save a parsed document."""
|
|
45
46
|
with open(f"{self.tmp_folder}/{file_key}", "w", encoding="utf-8") as f:
|
|
46
47
|
f.write(parsed_document.document_text)
|
|
@@ -4,6 +4,7 @@ from boto3 import client as boto3_client
|
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
6
|
from botocore.exceptions import ClientError
|
|
7
|
+
from typing import Optional
|
|
7
8
|
|
|
8
9
|
logger = logging.getLogger(__name__)
|
|
9
10
|
|
|
@@ -77,8 +78,7 @@ class S3StorageService(PersistenceService):
|
|
|
77
78
|
raise
|
|
78
79
|
|
|
79
80
|
|
|
80
|
-
|
|
81
|
-
def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: dict = {}):
|
|
81
|
+
def save_parsed_document(self, file_key: str, parsed_document: ParsedDoc, file_tags: Optional[dict] = {}):
|
|
82
82
|
"""Save a parsed document to S3.
|
|
83
83
|
|
|
84
84
|
Args:
|
{wizit_context_ingestor-0.2.4b0 → wizit_context_ingestor-0.2.5b2}/src/wizit_context_ingestor/main.py
RENAMED
|
@@ -113,8 +113,11 @@ class DeelabRedisChunksManager:
|
|
|
113
113
|
try:
|
|
114
114
|
rag_chunker = SemanticChunks(self.embeddings_model)
|
|
115
115
|
redis_embeddings_manager = RedisEmbeddingsManager(
|
|
116
|
-
|
|
117
|
-
|
|
116
|
+
self.embeddings_model,
|
|
117
|
+
self.redis_connection_string,
|
|
118
|
+
{
|
|
119
|
+
"file_key": file_key
|
|
120
|
+
}
|
|
118
121
|
)
|
|
119
122
|
local_persistence_service = LocalStorageService()
|
|
120
123
|
context_chunks_in_document_service = ContextChunksInDocumentService(
|
|
@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
|
|
|
4
4
|
import base64
|
|
5
5
|
import logging
|
|
6
6
|
import io
|
|
7
|
-
import
|
|
7
|
+
import pymupdf
|
|
8
8
|
from PIL import Image
|
|
9
9
|
from typing import List, Any
|
|
10
10
|
from dotenv import load_dotenv
|
|
@@ -23,13 +23,13 @@ class ParseDoc:
|
|
|
23
23
|
def __init__(self, file_path: str, system_prompt, chat_model: Any):
|
|
24
24
|
"""
|
|
25
25
|
Initialize a PDF document parser.
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
Args:
|
|
28
28
|
file_path: Path to the PDF file to parse
|
|
29
29
|
chat_model: Language model for processing document content
|
|
30
30
|
"""
|
|
31
31
|
self.file_path = file_path
|
|
32
|
-
self.pdf_document =
|
|
32
|
+
self.pdf_document = pymupdf.open(file_path)
|
|
33
33
|
self.page_count = self.pdf_document.page_count
|
|
34
34
|
self.system_prompt = system_prompt
|
|
35
35
|
self.chat_model = chat_model
|
|
@@ -37,13 +37,13 @@ class ParseDoc:
|
|
|
37
37
|
def pdf_page_to_base64(self, page_number: int) -> str:
|
|
38
38
|
"""
|
|
39
39
|
Convert a PDF page to a base64-encoded PNG image.
|
|
40
|
-
|
|
40
|
+
|
|
41
41
|
Args:
|
|
42
42
|
page_number: One-indexed page number to convert
|
|
43
|
-
|
|
43
|
+
|
|
44
44
|
Returns:
|
|
45
45
|
Base64 encoded string of the page image
|
|
46
|
-
|
|
46
|
+
|
|
47
47
|
Raises:
|
|
48
48
|
Exception: If there's an error during conversion
|
|
49
49
|
"""
|
|
@@ -69,10 +69,10 @@ class ParseDoc:
|
|
|
69
69
|
def parse_document_to_base64(self) -> List[str]:
|
|
70
70
|
"""
|
|
71
71
|
Convert all pages in the PDF document to base64-encoded images.
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
Returns:
|
|
74
74
|
List of base64 encoded strings for each page
|
|
75
|
-
|
|
75
|
+
|
|
76
76
|
Raises:
|
|
77
77
|
Exception: If there's an error during conversion
|
|
78
78
|
"""
|
|
@@ -90,14 +90,14 @@ class ParseDoc:
|
|
|
90
90
|
def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
|
|
91
91
|
"""
|
|
92
92
|
Process a base64-encoded image with a language model using the provided prompt.
|
|
93
|
-
|
|
93
|
+
|
|
94
94
|
Args:
|
|
95
95
|
base_64_image: Base64 encoded image string
|
|
96
96
|
prompt: Text prompt to send with the image
|
|
97
|
-
|
|
97
|
+
|
|
98
98
|
Returns:
|
|
99
99
|
Language model response
|
|
100
|
-
|
|
100
|
+
|
|
101
101
|
Raises:
|
|
102
102
|
Exception: If there's an error during processing
|
|
103
103
|
"""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|