ai-parrot 0.3.8-cp311-cp311-manylinux_2_28_x86_64.whl → 0.3.10-cp311-cp311-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic.

ai_parrot-0.3.10.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ai-parrot
-Version: 0.3.8
+Version: 0.3.10
 Summary: Live Chatbots based on Langchain chatbots and Agents Integrated into Navigator Framework or used into aiohttp applications.
 Home-page: https://github.com/phenobarbital/ai-parrot
 Author: Jesus Lara
@@ -78,13 +78,13 @@ Requires-Dist: O365==2.0.35
 Requires-Dist: stackapi==0.3.1
 Requires-Dist: torchvision==0.19.1
 Requires-Dist: tf-keras==2.17.0
+Requires-Dist: simsimd==4.3.1
+Requires-Dist: opencv-python==4.10.0.84
 Provides-Extra: analytics
 Requires-Dist: annoy==1.17.3; extra == "analytics"
 Requires-Dist: gradio-tools==0.0.9; extra == "analytics"
 Requires-Dist: gradio-client==0.2.9; extra == "analytics"
 Requires-Dist: streamlit==1.37.1; extra == "analytics"
-Requires-Dist: simsimd==4.3.1; extra == "analytics"
-Requires-Dist: opencv-python==4.10.0.84; extra == "analytics"
 Provides-Extra: anthropic
 Requires-Dist: langchain-anthropic==0.1.11; extra == "anthropic"
 Requires-Dist: anthropic==0.25.2; extra == "anthropic"
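Reviewer note: `simsimd` and `opencv-python` move out of the `analytics` extra and into the unconditional requirements, so a bare `pip install ai-parrot` now pulls both in (the new `easyocr` dependency below, by contrast, stays gated behind the `loaders` extra). A quick way to verify the split locally, a sketch assuming 0.3.10 is installed:

```python
from importlib.metadata import requires

# Declared requirements; extra-gated entries carry a marker
# such as '; extra == "analytics"'.
reqs = requires("ai-parrot")
core = [r for r in reqs if "extra ==" not in r]
print([r for r in core if r.startswith(("simsimd", "opencv-python"))])
# Expected on 0.3.10: ['simsimd==4.3.1', 'opencv-python==4.10.0.84']
```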
@@ -131,6 +131,7 @@ Requires-Dist: ftfy==6.2.3; extra == "loaders"
 Requires-Dist: librosa==0.10.1; extra == "loaders"
 Requires-Dist: XlsxWriter==3.2.0; extra == "loaders"
 Requires-Dist: timm==1.0.9; extra == "loaders"
+Requires-Dist: easyocr==1.7.1; extra == "loaders"
 Provides-Extra: milvus
 Requires-Dist: langchain-milvus>=0.1.4; extra == "milvus"
 Requires-Dist: milvus==2.3.5; extra == "milvus"
ai_parrot-0.3.10.dist-info/RECORD CHANGED
@@ -4,7 +4,7 @@ parrot/exceptions.cpython-311-x86_64-linux-gnu.so,sha256=VNyBh3uLxGQgB0l1bkWjQDq
 parrot/manager.py,sha256=NhzXoWxSgtoWHpmYP8cV2Ujq_SlvCbQYQBaohAeL2TM,5935
 parrot/models.py,sha256=RsVQCqhSXBKRPcu-BCga9Y1wyvENFXDCuq3_ObIKvAo,13452
 parrot/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-parrot/version.py,sha256=GrRYsZgC8VWhmiAeLkx1nVkvl0dOVtxfv04pU-T01pg,373
+parrot/version.py,sha256=HoNVx3mljjW-CBZ6CYPJUMxrG6X1SETBEn5g7YQTv4g,374
 parrot/chatbots/__init__.py,sha256=ypskCnME0xUv6psBEGCEyXCrD0J0ULHSllpVmSxqb4A,200
 parrot/chatbots/abstract.py,sha256=CmDn3k4r9uKImOZRN4L9zxLbCdC-1MPUAorDlfZT-kA,26421
 parrot/chatbots/asktroc.py,sha256=gyWzyvpAnmXwXd-3DEKoIJtAxt6NnP5mUZdZbkFky8s,604
@@ -56,7 +56,7 @@ parrot/loaders/excel.py,sha256=Y1agxm-jG4AgsA2wlPP3p8uBH40wYW1KM2ycTTLKUm4,12441
 parrot/loaders/github.py,sha256=CscyUIqoHTytqCbRUUTcV3QSxI8XoDntq5aTU0vdhzQ,2593
 parrot/loaders/image.py,sha256=A9KCXXoGuhDoyeJaascY7Q1ZK12Kf1ggE1drzJjS3AU,3946
 parrot/loaders/json.py,sha256=6B43k591OpvoJLbsJa8CxJue_lAt713SCdldn8bFW3c,1481
-parrot/loaders/pdf.py,sha256=nyeT4emrewxeO2dUQxW3QOcdk1vg1JYtPKNAV8tThm0,17512
+parrot/loaders/pdf.py,sha256=YnWXFVJjT76cGcRclAKHmFPeMt7SXAuqywIt0UMI9P4,20722
 parrot/loaders/pdfchapters.py,sha256=YhA8Cdx3qXBR0vuTVnQ12XgH1DXT_rp1Tawzh4V2U3o,5637
 parrot/loaders/pdffn.py,sha256=gA-vJEWUiIUwbMxP8Nmvlzlcb39DVV69vGKtSzavUoI,4004
 parrot/loaders/pdfimages.py,sha256=4Q_HKiAee_hALBsG2qF7PpMgKP1AivHXhmcsCkUa9eE,7899
@@ -103,8 +103,8 @@ resources/users/handlers.py,sha256=BGzqBvPY_OaIF_nONWX4b_B5OyyBrdGuSihIsdlFwjk,2
 resources/users/models.py,sha256=glk7Emv7QCi6i32xRFDrGc8UwK23_LPg0XUOJoHnwRU,6799
 settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 settings/settings.py,sha256=9ueEvyLNurUX-AaIeRPV8GKX1c4YjDLbksUAeqEq6Ck,1854
-ai_parrot-0.3.8.dist-info/LICENSE,sha256=vRKOoa7onTsLNvSzJtGtMaNhWWh8B3YAT733Tlu6M4o,1070
-ai_parrot-0.3.8.dist-info/METADATA,sha256=81l4aL6ASc4ERr1DsOSl7RhjnLhbuOGnPoJfqy9IjMg,9721
-ai_parrot-0.3.8.dist-info/WHEEL,sha256=UQ-0qXN3LQUffjrV43_e_ZXj2pgORBqTmXipnkj0E8I,113
-ai_parrot-0.3.8.dist-info/top_level.txt,sha256=qHoO4BhYDfeTkyKnciZSQtn5FSLN3Q-P5xCTkyvbuxg,26
-ai_parrot-0.3.8.dist-info/RECORD,,
+ai_parrot-0.3.10.dist-info/LICENSE,sha256=vRKOoa7onTsLNvSzJtGtMaNhWWh8B3YAT733Tlu6M4o,1070
+ai_parrot-0.3.10.dist-info/METADATA,sha256=Kq5FpMgf-M5vRQ9gNDNyqWCOL_3SrMB6hTsnxWW9Gbg,9728
+ai_parrot-0.3.10.dist-info/WHEEL,sha256=UQ-0qXN3LQUffjrV43_e_ZXj2pgORBqTmXipnkj0E8I,113
+ai_parrot-0.3.10.dist-info/top_level.txt,sha256=qHoO4BhYDfeTkyKnciZSQtn5FSLN3Q-P5xCTkyvbuxg,26
+ai_parrot-0.3.10.dist-info/RECORD,,
parrot/loaders/pdf.py CHANGED
@@ -6,6 +6,7 @@ import re
 import ftfy
 import fitz
 import pytesseract
+from pytesseract import Output
 from paddleocr import PaddleOCR
 import torch
 import cv2
@@ -15,16 +16,38 @@ from transformers import (
     # VisionEncoderDecoderConfig,
     # ViTImageProcessor,
     # AutoTokenizer,
+    LayoutLMv3FeatureExtractor,
+    LayoutLMv3TokenizerFast,
     LayoutLMv3ForTokenClassification,
     LayoutLMv3Processor
 )
 from pdf4llm import to_markdown
 from PIL import Image
 from langchain.docstore.document import Document
-from navconfig import config
+from navconfig.logging import logging
 from .basepdf import BasePDF
 
 
+logging.getLogger(name='ppocr').setLevel(logging.INFO)
+
+# Function to rescale bounding boxes
+def rescale_bounding_boxes(bboxes, image_width, image_height, target_size=1000):
+    """Rescale bounding boxes to fit within the target size for LayoutLMv3."""
+    rescaled_bboxes = []
+    for bbox in bboxes:
+        x1, y1 = bbox[0]
+        x2, y2 = bbox[2]
+        # Rescale based on the image dimensions
+        rescaled_bbox = [
+            int(x1 / image_width * target_size),
+            int(y1 / image_height * target_size),
+            int(x2 / image_width * target_size),
+            int(y2 / image_height * target_size)
+        ]
+        rescaled_bboxes.append(rescaled_bbox)
+    return rescaled_bboxes
+
+
 class PDFLoader(BasePDF):
     """
     Loader for PDF files.
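The new module-level `rescale_bounding_boxes` maps pixel coordinates onto the 0-1000 grid LayoutLMv3 expects, reading the top-left corner from `bbox[0]` and the bottom-right from `bbox[2]` of each 4-point OCR polygon. A minimal check of the arithmetic, assuming the package and its OCR dependencies are installed (input values invented for illustration):

```python
from parrot.loaders.pdf import rescale_bounding_boxes

# One PaddleOCR-style quad: [top-left, top-right, bottom-right, bottom-left]
quad = [[100, 50], [300, 50], [300, 80], [100, 80]]

# On a 1000x800 px page image, the quad lands on the 0-1000 grid as:
print(rescale_bounding_boxes([quad], image_width=1000, image_height=800))
# [[100, 62, 300, 100]]
```

Note the helper assumes an axis-aligned quad; for rotated text, the `create_bounding_box` method added further down (min/max over all four points) is the safer envelope.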
@@ -50,13 +73,22 @@ class PDFLoader(BasePDF):
         self.page_as_images = kwargs.get('page_as_images', False)
         if self.page_as_images is True:
             # Load the processor and model from Hugging Face
+            # self.feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=False)
+            # self.image_tokenizer = LayoutLMv3TokenizerFast.from_pretrained(
+            #     "microsoft/layoutlmv3-base"
+            # )
+            # self.image_processor = LayoutLMv3Processor(
+            #     self.feature_extractor,
+            #     self.image_tokenizer
+            # )
             self.image_processor = LayoutLMv3Processor.from_pretrained(
                 "microsoft/layoutlmv3-base",
-                apply_ocr=True
+                apply_ocr=False
             )
+            # LayoutLMv3ForSequenceClassification.from_pretrained
             self.image_model = LayoutLMv3ForTokenClassification.from_pretrained(
-                # "microsoft/layoutlmv3-base-finetuned-funsd"
-                "HYPJUDY/layoutlmv3-base-finetuned-funsd"
+                "microsoft/layoutlmv3-base"
+                # "HYPJUDY/layoutlmv3-base-finetuned-funsd"
             )
             # Set device to GPU if available
             self.image_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
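Two related changes here: the processor now runs with `apply_ocr=False`, and the token-classification model drops the FUNSD-finetuned checkpoint for the plain base one. With `apply_ocr=False` the processor no longer invokes Tesseract itself; per the transformers API, the caller must hand it words plus 0-1000-normalized boxes. A sketch of the resulting call shape (file name and OCR tokens are placeholders):

```python
from PIL import Image
from transformers import LayoutLMv3Processor

processor = LayoutLMv3Processor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False  # caller supplies OCR results explicitly
)

image = Image.open("page.png").convert("RGB")
words = ["Invoice", "Total:"]                      # hypothetical OCR tokens
boxes = [[80, 40, 220, 70], [80, 400, 180, 430]]   # already on the 0-1000 grid
encoding = processor(image, words, boxes=boxes, return_tensors="pt")
```

Worth flagging: loading `LayoutLMv3ForTokenClassification` from the bare `microsoft/layoutlmv3-base` checkpoint leaves the classification head randomly initialized (transformers warns about this), which is consistent with the rewritten `extract_page_text` below no longer consulting the model's predictions at all.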
@@ -73,63 +105,63 @@ class PDFLoader(BasePDF):
         if table_settings:
             self.table_settings.update(table_settings)
 
-    def explain_image(self, image_path):
-        """Function to explain the image."""
-        # with open(image_path, "rb") as image_file:
-        #     image_content = image_file.read()
-
-        # Open the image
-        image = cv2.imread(image_path)
-        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
-        question = "Extract Questions about Happily Greet"
-        prompt = task_prompt.replace("{user_input}", question)
-
-        decoder_input_ids = self.image_processor.tokenizer(
-            prompt,
-            add_special_tokens=False,
-            return_tensors="pt",
-        ).input_ids
-
-        pixel_values = self.image_processor(
-            image,
-            return_tensors="pt"
-        ).pixel_values
-
-        # Send inputs to the appropriate device
-        pixel_values = pixel_values.to(self.image_device)
-        decoder_input_ids = decoder_input_ids.to(self.image_device)
-
-        outputs = self.image_model.generate(
-            pixel_values,
-            decoder_input_ids=decoder_input_ids,
-            max_length=self.image_model.decoder.config.max_position_embeddings,
-            pad_token_id=self.image_processor.tokenizer.pad_token_id,
-            eos_token_id=self.image_processor.tokenizer.eos_token_id,
-            bad_words_ids=[[self.image_processor.tokenizer.unk_token_id]],
-            # use_cache=True
-            return_dict_in_generate=True,
-        )
-
-        sequence = self.image_processor.batch_decode(outputs.sequences)[0]
-
-
-        sequence = sequence.replace(
-            self.image_processor.tokenizer.eos_token, ""
-        ).replace(
-            self.image_processor.tokenizer.pad_token, ""
-        )
-        # remove first task start token
-        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
-        # Print the extracted sequence
-        print("Extracted Text:", sequence)
-
-        print(self.image_processor.token2json(sequence))
-
-        # Format the output as Markdown (optional step)
-        markdown_text = self.format_as_markdown(sequence)
-        print("Markdown Format:\n", markdown_text)
-
-        return None
+    # def explain_image(self, image_path):
+    #     """Function to explain the image."""
+    #     # with open(image_path, "rb") as image_file:
+    #     #     image_content = image_file.read()
+
+    #     # Open the image
+    #     image = cv2.imread(image_path)
+    #     task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+    #     question = "Extract Questions about Happily Greet"
+    #     prompt = task_prompt.replace("{user_input}", question)
+
+    #     decoder_input_ids = self.image_processor.tokenizer(
+    #         prompt,
+    #         add_special_tokens=False,
+    #         return_tensors="pt",
+    #     ).input_ids
+
+    #     pixel_values = self.image_processor(
+    #         image,
+    #         return_tensors="pt"
+    #     ).pixel_values
+
+    #     # Send inputs to the appropriate device
+    #     pixel_values = pixel_values.to(self.image_device)
+    #     decoder_input_ids = decoder_input_ids.to(self.image_device)
+
+    #     outputs = self.image_model.generate(
+    #         pixel_values,
+    #         decoder_input_ids=decoder_input_ids,
+    #         max_length=self.image_model.decoder.config.max_position_embeddings,
+    #         pad_token_id=self.image_processor.tokenizer.pad_token_id,
+    #         eos_token_id=self.image_processor.tokenizer.eos_token_id,
+    #         bad_words_ids=[[self.image_processor.tokenizer.unk_token_id]],
+    #         # use_cache=True
+    #         return_dict_in_generate=True,
+    #     )
+
+    #     sequence = self.image_processor.batch_decode(outputs.sequences)[0]
+
+
+    #     sequence = sequence.replace(
+    #         self.image_processor.tokenizer.eos_token, ""
+    #     ).replace(
+    #         self.image_processor.tokenizer.pad_token, ""
+    #     )
+    #     # remove first task start token
+    #     sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
+    #     # Print the extracted sequence
+    #     print("Extracted Text:", sequence)
+
+    #     print(self.image_processor.token2json(sequence))
+
+    #     # Format the output as Markdown (optional step)
+    #     markdown_text = self.format_as_markdown(sequence)
+    #     print("Markdown Format:\n", markdown_text)
+
+    #     return None
 
     def convert_to_markdown(self, text):
         """
@@ -141,7 +173,7 @@ class PDFLoader(BasePDF):
         # Detect headings and bold them
         markdown_text = re.sub(r"(^.*Scorecard.*$)", r"## \1", markdown_text)
         # Convert lines with ":" to a list item (rough approach)
-        markdown_text = re.sub(r"(\w+):", r"- **\1**:", markdown_text)
+        # markdown_text = re.sub(r"(\w+):", r"- **\1**:", markdown_text)
         # Return the markdown formatted text
         return markdown_text
 
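Commenting out the colon-to-list rewrite looks deliberate: `(\w+):` matches any word followed by a colon, including clock times and inline labels, so the old rule mangled ordinary prose. A quick illustration (sample text invented):

```python
import re

text = "Open hours: 9:00 to 17:00"
print(re.sub(r"(\w+):", r"- **\1**:", text))
# Open - **hours**: - **9**:00 to - **17**:00
```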
@@ -164,40 +196,77 @@ class PDFLoader(BasePDF):
 
         return cleaned_text.strip()
 
-    def extract_page_text(self, image_path) -> str:
-        # Open the image
-        image = Image.open(image_path).convert("RGB")
-
-        # Processor handles the OCR internally, no need for words or boxes
-        encoding = self.image_processor(image, return_tensors="pt", truncation=True)
-        encoding = {k: v.to(self.image_device) for k, v in encoding.items()}
+    def create_bounding_box(self, bbox_data):
+        xs = []
+        ys = []
+        for x, y in bbox_data:
+            xs.append(x)
+            ys.append(y)
 
-        # Forward pass
-        outputs = self.image_model(**encoding)
-        logits = outputs.logits
+        left = int(min(xs))
+        top = int(min(ys))
+        right = int(max(xs))
+        bottom = int(max(ys))
 
-        # Get predictions
-        predictions = logits.argmax(-1).squeeze().tolist()
-        labels = [self.image_model.config.id2label[pred] for pred in predictions]
+        return [left, top, right, bottom]
 
-        # Get the words and boxes from the processor's OCR step
-        words = self.image_processor.tokenizer.convert_ids_to_tokens(
-            encoding['input_ids'].squeeze().tolist()
-        )
-        boxes = encoding['bbox'].squeeze().tolist()
-
-        # Combine words and labels, preserving line breaks based on vertical box position
-        extracted_text = ""
-        last_box = None
-        for word, label, box in zip(words, labels, boxes):
-            if label != 'O':
-                # Check if the current word is on a new line based on the vertical position of the box
-                if last_box and abs(box[1] - last_box[1]) > 10:  # A threshold for line breaks
-                    extracted_text += "\n"  # Add a line break
-
-                extracted_text += f"{word} "
-                last_box = box
-        cleaned_text = self.clean_tokenized_text(extracted_text)
+    def extract_page_text(self, image_path) -> str:
+        # Open the image
+        image = Image.open(image_path).convert("RGB")
+        image_width, image_height = image.size
+
+        # Initialize PaddleOCR with English language
+        ocr = PaddleOCR(use_angle_cls=True, lang='en')
+        ocr_result = ocr.ocr(str(image_path), cls=True)
+
+        # Collect the text and bounding boxes
+        text_with_boxes = []
+        for line in ocr_result[0]:
+            text = line[1][0]  # Extract the text
+            bbox = line[0]  # Extract the bounding box
+            text_with_boxes.append((text, bbox))
+
+        # Step 2: Sort text based on y-coordinate (top-down order)
+        def average_y(bbox):
+            return sum([point[1] for point in bbox]) / len(bbox)
+
+        text_with_boxes.sort(key=lambda x: average_y(x[1]))
+
+        # Insert line breaks based on y-coordinate differences
+        words_with_newlines = []
+        last_y = None
+        threshold = 20  # You can adjust this value based on the document's layout
+
+        for _, (word, bbox) in enumerate(text_with_boxes):
+            current_y = average_y(bbox)
+            if last_y is not None and current_y - last_y > threshold:
+                words_with_newlines.append("\n")  # Insert a line break
+            words_with_newlines.append(word)
+            last_y = current_y
+
+        # # Step 3: Extract words and bounding boxes after sorting
+        # words = [item[0] for item in text_with_boxes]
+        # bounding_boxes = [item[1] for item in text_with_boxes]
+
+        # # Step 4: Rescale bounding boxes to the 0-1000 range for LayoutLMv3
+        # boxes = rescale_bounding_boxes(
+        #     bounding_boxes,
+        #     image_width,
+        #     image_height
+        # )
+
+        # # Print extracted text and bounding boxes
+        # # for word, bbox in zip(words, boxes):
+        # #     print(f"Word: {word}, Bounding Box: {bbox}")
+
+        # # Processor handles the OCR internally, no need for words or boxes
+        # encoded_inputs = self.image_processor(image, words, boxes=boxes, return_tensors="pt")
+        # outputs = self.image_model(**encoded_inputs)
+
+        # Step 7: Join the sorted words into a paragraph
+        paragraph = " ".join(words_with_newlines)
+
+        cleaned_text = self.clean_tokenized_text(paragraph)
         markdown_text = self.convert_to_markdown(cleaned_text)
         return markdown_text
 
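The rewritten `extract_page_text` replaces the LayoutLMv3 forward pass with a pure layout heuristic: run PaddleOCR, sort detected lines by their mean y-coordinate, and emit a newline whenever the vertical gap between consecutive lines exceeds `threshold` (20 px at the rendered resolution). The heuristic in isolation, on invented boxes:

```python
# (text, 4-point bbox) pairs in the shape PaddleOCR returns
detections = [
    ("Total", [[10, 200], [60, 200], [60, 220], [10, 220]]),
    ("Report Title", [[10, 20], [150, 20], [150, 45], [10, 45]]),
    ("Q3 2024", [[10, 40], [90, 40], [90, 55], [10, 55]]),
]

def average_y(bbox):
    return sum(point[1] for point in bbox) / len(bbox)

detections.sort(key=lambda item: average_y(item[1]))  # top-down order

parts, last_y, threshold = [], None, 20
for text, bbox in detections:
    y = average_y(bbox)
    if last_y is not None and y - last_y > threshold:
        parts.append("\n")  # large vertical gap -> new block
    parts.append(text)
    last_y = y

print(" ".join(parts))  # Report Title Q3 2024 \n Total
```

Two caveats the diff inherits: sorting purely by y interleaves multi-column layouts, and `for _, (word, bbox) in enumerate(...)` discards the index it just created, so a plain `for word, bbox in text_with_boxes:` would read more cleanly.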
@@ -217,6 +286,10 @@ class PDFLoader(BasePDF):
         docs = []
         try:
             md_text = to_markdown(pdf)  # get markdown for all pages
+            try:
+                summary_document = self.get_summary_from_text(md_text)
+            except Exception:
+                summary_document = ''
             _meta = {
                 "url": f'{path}',
                 "source": f"{path.name}",
@@ -226,11 +299,11 @@ class PDFLoader(BasePDF):
                 "answer": '',
                 "source_type": self._source_type,
                 "data": {},
-                "summary": '',
+                "summary": '-',
                 "document_meta": {
-                    "title": pdf.metadata.get("title", ""),
-                    "creationDate": pdf.metadata.get("creationDate", ""),
-                    "author": pdf.metadata.get("author", ""),
+                    "title": pdf.metadata.get("title", ""),  # pylint: disable=E1101
+                    "creationDate": pdf.metadata.get("creationDate", ""),  # pylint: disable=E1101
+                    "author": pdf.metadata.get("author", ""),  # pylint: disable=E1101
                 }
             }
             docs.append(
@@ -239,6 +312,14 @@ class PDFLoader(BasePDF):
                     metadata=_meta
                 )
             )
+            if summary_document:
+                summary_document = f"**Summary**\n{path.name}\n" + summary_document
+                docs.append(
+                    Document(
+                        page_content=summary_document,
+                        metadata=_meta
+                    )
+                )
         except Exception:
             pass
         for page_number in range(pdf.page_count):
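New in 0.3.10: when summarization succeeds, the loader emits a second Document carrying just the summary, prefixed with a `**Summary**` header and sharing the full-text entry's metadata; the same pattern repeats per page further down. Reduced to its essentials (metadata abbreviated, summary text invented):

```python
from langchain.docstore.document import Document

docs = []
_meta = {"source": "report.pdf", "source_type": "pdf"}  # abbreviated
md_text = "# Report\n..."               # full markdown of the PDF
summary_document = "Key findings ..."   # '' when summarization failed

docs.append(Document(page_content=md_text, metadata=_meta))
if summary_document:
    docs.append(Document(
        page_content="**Summary**\nreport.pdf\n" + summary_document,
        metadata=_meta,
    ))
```

Sharing one metadata dict between both entries is presumably intentional, since a retrieval hit on the summary still points back at the source file.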
@@ -250,9 +331,9 @@ class PDFLoader(BasePDF):
             try:
                 summary = self.get_summary_from_text(text)
             except Exception:
-                summary = ''
+                summary = '-'
             metadata = {
-                "url": '',
+                "url": f"{path}:#{page_num}",
                 "source": f"{path.name} Page.#{page_num}",
                 "filename": path.name,
                 "index": f"{page_num}",
@@ -261,11 +342,10 @@ class PDFLoader(BasePDF):
                 "answer": '',
                 "source_type": self._source_type,
                 "data": {},
-                "summary": summary,
+                "summary": '',
                 "document_meta": {
-                    "title": pdf.metadata.get("title", ""),
-                    "creationDate": pdf.metadata.get("creationDate", ""),
-                    "author": pdf.metadata.get("author", ""),
+                    "title": pdf.metadata.get("title", ""),  # pylint: disable=E1101
+                    "author": pdf.metadata.get("author", ""),  # pylint: disable=E1101
                 }
             }
             docs.append(
@@ -274,6 +354,15 @@ class PDFLoader(BasePDF):
                     metadata=metadata
                 )
             )
+            # And Summary Document:
+            if summary:
+                sm = f"**Summary**\n{path.name} Page.#{page_num}\n" + summary
+                docs.append(
+                    Document(
+                        page_content=sm,
+                        metadata=metadata
+                    )
+                )
             # Extract images and use OCR to get text from each image
             # second: images
             file_name = path.stem.replace(' ', '_').replace('.', '').lower()
@@ -338,7 +427,7 @@ class PDFLoader(BasePDF):
             df = df.dropna(axis=1, how='all')
             df = df.dropna(how='all', axis=0)  # Drop empty rows
             table_meta = {
-                "url": '',
+                "url": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
                 "source": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
                 "filename": path.name,
                 "index": f"{path.name}:{page_num}",
@@ -346,7 +435,7 @@ class PDFLoader(BasePDF):
                 "answer": '',
                 "type": 'table',
                 "data": {},
-                "summary": '',
+                "summary": '-',
                 "document_meta": {
                     "table_index": tab_idx,
                     "table_shape": df.shape,
@@ -366,9 +455,10 @@ class PDFLoader(BasePDF):
             if self.page_as_images is True:
                 # Convert the page to a Pixmap (which is an image)
                 mat = fitz.Matrix(2, 2)
-                pix = page.get_pixmap(dpi=300, matrix=mat)  # Increase DPI for better resolution
+                pix = page.get_pixmap(dpi=600, matrix=mat)  # Increase DPI for better resolution
                 img_name = f'{file_name}_page_{page_num}.png'
                 img_path = self._imgdir.joinpath(img_name)
+                print('IMAGE > ', img_path)
                 if img_path.exists():
                     img_path.unlink(missing_ok=True)
                 self.logger.notice(
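A caveat on the resolution bump: `get_pixmap` is passed both `matrix=fitz.Matrix(2, 2)` and `dpi=600`. As we read PyMuPDF's documentation, a non-None `dpi` causes the matrix argument to be ignored, so the effective change is 300→600 DPI and the matrix is dead weight (as it already was at 300). A leaner equivalent, file names hypothetical:

```python
import fitz  # PyMuPDF

doc = fitz.open("report.pdf")
page = doc[0]
pix = page.get_pixmap(dpi=600)  # dpi alone sets the render scale
pix.save("report_page_0.png")
```

The added `print('IMAGE > ', img_path)` reads like leftover debugging, especially since the next hunk comments out exactly that kind of noise.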
@@ -380,7 +470,7 @@ class PDFLoader(BasePDF):
                 # TODO passing the image to a AI visual to get explanation
                 # Get the extracted text from the image
                 text = self.extract_page_text(img_path)
-                print('TEXT EXTRACTED >> ', text)
+                # print('TEXT EXTRACTED >> ', text)
                 url = f'/static/images/{img_name}'
                 image_meta = {
                     "url": url,
@@ -391,7 +481,7 @@ class PDFLoader(BasePDF):
                     "answer": '',
                     "type": 'page',
                     "data": {},
-                    "summary": '',
+                    "summary": '-',
                     "document_meta": {
                         "image_name": img_name,
                         "page_number": f"{page_number}"
@@ -406,25 +496,16 @@ class PDFLoader(BasePDF):
             else:
                 return []
 
-    def get_ocr(self, img_path) -> list:
-        # Initialize PaddleOCR with table recognition
-        self.ocr_model = PaddleOCR(
+    def get_paddleocr(self, img_path) -> list:
+        # Initialize PaddleOCR
+        ocr_model = PaddleOCR(
             lang='en',
             det_model_dir=None,
             rec_model_dir=None,
             rec_char_dict_path=None,
-            table=True,
-            # use_angle_cls=True,
+            # table=True,
+            use_angle_cls=True,
             # use_gpu=True
         )
-        result = self.ocr_model.ocr(img_path, cls=True)
-
-        # extract tables:
-        # The result contains the table structure and content
-        tables = []
-        for line in result:
-            if 'html' in line[1]:
-                html_table = line[1]['html']
-                tables.append(html_table)
-
-        print('TABLES > ', tables)
+        result = ocr_model.ocr(img_path, cls=True)
+        return result
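`get_ocr` becomes `get_paddleocr`: the model is no longer stored on `self`, `use_angle_cls` replaces the `table=True` flag (table structure extraction belongs to PaddleOCR's separate PP-Structure pipeline, which would explain why the HTML-scraping loop never fired and is now gone), and the raw OCR result is returned to the caller. Under PaddleOCR 2.x the result is one list per input image, each holding `[bbox, (text, confidence)]` pairs; a consumption sketch with a hypothetical image path:

```python
from paddleocr import PaddleOCR

ocr_model = PaddleOCR(lang='en', use_angle_cls=True)
result = ocr_model.ocr("page.png", cls=True)

for bbox, (text, confidence) in result[0]:  # detections for the first image
    print(f"{confidence:.2f}  {text}  top-left={bbox[0]}")
```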
parrot/version.py CHANGED
@@ -3,7 +3,7 @@
 __title__ = "ai-parrot"
 __description__ = "Live Chatbots based on Langchain chatbots and Agents \
     Integrated into Navigator Framework or used into aiohttp applications."
-__version__ = "0.3.8"
+__version__ = "0.3.10"
 __author__ = "Jesus Lara"
 __author_email__ = "jesuslarag@gmail.com"
 __license__ = "MIT"