wizit-context-ingestor 0.2.5b1__py3-none-any.whl → 0.2.5b2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wizit-context-ingestor might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
1
  import base64
2
2
  import logging
3
3
  import io
4
- import fitz
4
+ import pymupdf
5
5
  from PIL import Image
6
6
  from typing import List
7
7
  from ..domain.models import ParsedDocPage, ParsedDoc
@@ -17,25 +17,25 @@ class ParseDocModelService():
17
17
  def __init__(self, file_path: str):
18
18
  """
19
19
  Initialize a PDF document parser.
20
-
20
+
21
21
  Args:
22
22
  file_path: Path to the PDF file to parse
23
23
  """
24
24
  self.file_path = file_path
25
- self.pdf_document = fitz.open(file_path)
25
+ self.pdf_document = pymupdf.open(file_path)
26
26
  self.page_count = self.pdf_document.page_count
27
27
 
28
-
28
+
29
29
  def pdf_page_to_base64(self, page_number: int) -> ParsedDocPage:
30
30
  """
31
31
  Convert a PDF page to a base64-encoded PNG image.
32
-
32
+
33
33
  Args:
34
34
  page_number: One-indexed page number to convert
35
-
35
+
36
36
  Returns:
37
37
  Base64 encoded string of the page image
38
-
38
+
39
39
  Raises:
40
40
  Exception: If there's an error during conversion
41
41
  """
@@ -49,7 +49,7 @@ class ParseDocModelService():
49
49
  b64_encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
50
50
  logger.info(f"Page {page_number} encoded successfully")
51
51
  return ParsedDocPage(
52
- page_number=page_number,
52
+ page_number=page_number,
53
53
  page_base64=b64_encoded_image
54
54
  )
55
55
  except Exception as e:
@@ -59,15 +59,15 @@ class ParseDocModelService():
59
59
  def parse_document_to_base64(self) -> List[ParsedDocPage]:
60
60
  """
61
61
  Convert all pages in the PDF document to base64-encoded images.
62
-
62
+
63
63
  Returns:
64
64
  List of base64 encoded strings for each page
65
-
65
+
66
66
  Raises:
67
67
  Exception: If there's an error during conversion
68
68
  """
69
69
  # BASE DE DATOS SINTETICOS DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO, FINE TUNING PARA EL LLM
70
- # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
70
+ # GEMMA 2 --> DATASET DE PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
71
71
  # RAG --> FINETUNING AUTOMATICO / CONSULTAR EL MODELO
72
72
  # OPENAI --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
73
73
  # COLAB --> PREGUNTAS Y RESPUESTAS SOBRE EL DOCUMENTO
@@ -95,4 +95,4 @@ class ParseDocModelService():
95
95
  document_text=md_content
96
96
  )
97
97
 
98
- # def
98
+ # def
@@ -4,7 +4,7 @@ from langchain_core.output_parsers import StrOutputParser
4
4
  import base64
5
5
  import logging
6
6
  import io
7
- import fitz
7
+ import pymupdf
8
8
  from PIL import Image
9
9
  from typing import List, Any
10
10
  from dotenv import load_dotenv
@@ -23,13 +23,13 @@ class ParseDoc:
23
23
  def __init__(self, file_path: str, system_prompt, chat_model: Any):
24
24
  """
25
25
  Initialize a PDF document parser.
26
-
26
+
27
27
  Args:
28
28
  file_path: Path to the PDF file to parse
29
29
  chat_model: Language model for processing document content
30
30
  """
31
31
  self.file_path = file_path
32
- self.pdf_document = fitz.open(file_path)
32
+ self.pdf_document = pymupdf.open(file_path)
33
33
  self.page_count = self.pdf_document.page_count
34
34
  self.system_prompt = system_prompt
35
35
  self.chat_model = chat_model
@@ -37,13 +37,13 @@ class ParseDoc:
37
37
  def pdf_page_to_base64(self, page_number: int) -> str:
38
38
  """
39
39
  Convert a PDF page to a base64-encoded PNG image.
40
-
40
+
41
41
  Args:
42
42
  page_number: One-indexed page number to convert
43
-
43
+
44
44
  Returns:
45
45
  Base64 encoded string of the page image
46
-
46
+
47
47
  Raises:
48
48
  Exception: If there's an error during conversion
49
49
  """
@@ -69,10 +69,10 @@ class ParseDoc:
69
69
  def parse_document_to_base64(self) -> List[str]:
70
70
  """
71
71
  Convert all pages in the PDF document to base64-encoded images.
72
-
72
+
73
73
  Returns:
74
74
  List of base64 encoded strings for each page
75
-
75
+
76
76
  Raises:
77
77
  Exception: If there's an error during conversion
78
78
  """
@@ -90,14 +90,14 @@ class ParseDoc:
90
90
  def parse_with_llm(self, base_64_image: str, prompt: str) -> AIMessage:
91
91
  """
92
92
  Process a base64-encoded image with a language model using the provided prompt.
93
-
93
+
94
94
  Args:
95
95
  base_64_image: Base64 encoded image string
96
96
  prompt: Text prompt to send with the image
97
-
97
+
98
98
  Returns:
99
99
  Language model response
100
-
100
+
101
101
  Raises:
102
102
  Exception: If there's an error during processing
103
103
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: wizit-context-ingestor
3
- Version: 0.2.5b1
3
+ Version: 0.2.5b2
4
4
  Summary: Contextual Rag with Cloud Solutions
5
5
  Requires-Dist: anthropic[vertex]>=0.66.0
6
6
  Requires-Dist: boto3>=1.40.23
@@ -8,7 +8,7 @@ wizit_context_ingestor/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
8
8
  wizit_context_ingestor/data/prompts.py,sha256=wK4HjgbxMOMo0qKeqYs4ujJ0jZ1oDJ6Jw8kujByDdlY,6721
9
9
  wizit_context_ingestor/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  wizit_context_ingestor/domain/models.py,sha256=DV83PArMyh-VoUqnVF_ohcgStsk549ixdYw98B8o2GI,381
11
- wizit_context_ingestor/domain/services.py,sha256=R14ImkiDzv-X1o1qYl9J__FBclnt9Uj85xbo0wgOqCg,3407
11
+ wizit_context_ingestor/domain/services.py,sha256=0i9WwZ0ufBgnzNJ5dt8Iop9VLTeK_AqjcaH8p3Av26I,3347
12
12
  wizit_context_ingestor/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  wizit_context_ingestor/infra/aws_model.py,sha256=glIaewSdv6PDBXoCe6QgCUIzLCjtM7KlayEERXRNFwo,2539
14
14
  wizit_context_ingestor/infra/persistence/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,8 +24,8 @@ wizit_context_ingestor/main.py,sha256=x1bieggNs3uzLkNHXZk4fBz0fYlOyff1TGvjV3Af2y
24
24
  wizit_context_ingestor/services/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
25
25
  wizit_context_ingestor/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  wizit_context_ingestor/services/chunks.py,sha256=tQQsdsOscZWzqVY5WxVxr3ii62FOJ3nMARaJJz6CvjQ,2011
27
- wizit_context_ingestor/services/parse_doc.py,sha256=5EIQnGWSyXwepazcO4lj-MKBaBWdGzIQIwKDE1soZTc,4577
27
+ wizit_context_ingestor/services/parse_doc.py,sha256=3CyZoGbiUfxbs0SXUWXjQevtusSzTBgvUVeNNSdxJLE,4491
28
28
  wizit_context_ingestor/services/pg_embeddings_manager.py,sha256=n1HOmu_Z_Z71H-rVAyJS3FdPKbBckm5W8_XethY8nuM,4998
29
- wizit_context_ingestor-0.2.5b1.dist-info/WHEEL,sha256=F3mArEuDT3LDFEqo9fCiUx6ISLN64aIhcGSiIwtu4r8,79
30
- wizit_context_ingestor-0.2.5b1.dist-info/METADATA,sha256=yMnfleIUWwt32cVfndgq8OrMC7TkJZw0gbJzN05qwfY,3577
31
- wizit_context_ingestor-0.2.5b1.dist-info/RECORD,,
29
+ wizit_context_ingestor-0.2.5b2.dist-info/WHEEL,sha256=F3mArEuDT3LDFEqo9fCiUx6ISLN64aIhcGSiIwtu4r8,79
30
+ wizit_context_ingestor-0.2.5b2.dist-info/METADATA,sha256=AaF5SD1HqVjrW0Px_xjhioJryJXPUh9Q9SO4QJtRbwo,3577
31
+ wizit_context_ingestor-0.2.5b2.dist-info/RECORD,,