ai-parrot 0.3.4__cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ai-parrot might be problematic.

Files changed (109)
  1. ai_parrot-0.3.4.dist-info/LICENSE +21 -0
  2. ai_parrot-0.3.4.dist-info/METADATA +319 -0
  3. ai_parrot-0.3.4.dist-info/RECORD +109 -0
  4. ai_parrot-0.3.4.dist-info/WHEEL +6 -0
  5. ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
  6. parrot/__init__.py +21 -0
  7. parrot/chatbots/__init__.py +7 -0
  8. parrot/chatbots/abstract.py +728 -0
  9. parrot/chatbots/asktroc.py +16 -0
  10. parrot/chatbots/base.py +366 -0
  11. parrot/chatbots/basic.py +9 -0
  12. parrot/chatbots/bose.py +17 -0
  13. parrot/chatbots/cody.py +17 -0
  14. parrot/chatbots/copilot.py +83 -0
  15. parrot/chatbots/dataframe.py +103 -0
  16. parrot/chatbots/hragents.py +15 -0
  17. parrot/chatbots/odoo.py +17 -0
  18. parrot/chatbots/retrievals/__init__.py +578 -0
  19. parrot/chatbots/retrievals/constitutional.py +19 -0
  20. parrot/conf.py +110 -0
  21. parrot/crew/__init__.py +3 -0
  22. parrot/crew/tools/__init__.py +22 -0
  23. parrot/crew/tools/bing.py +13 -0
  24. parrot/crew/tools/config.py +43 -0
  25. parrot/crew/tools/duckgo.py +62 -0
  26. parrot/crew/tools/file.py +24 -0
  27. parrot/crew/tools/google.py +168 -0
  28. parrot/crew/tools/gtrends.py +16 -0
  29. parrot/crew/tools/md2pdf.py +25 -0
  30. parrot/crew/tools/rag.py +42 -0
  31. parrot/crew/tools/search.py +32 -0
  32. parrot/crew/tools/url.py +21 -0
  33. parrot/exceptions.cpython-39-x86_64-linux-gnu.so +0 -0
  34. parrot/handlers/__init__.py +4 -0
  35. parrot/handlers/bots.py +196 -0
  36. parrot/handlers/chat.py +162 -0
  37. parrot/interfaces/__init__.py +6 -0
  38. parrot/interfaces/database.py +29 -0
  39. parrot/llms/__init__.py +137 -0
  40. parrot/llms/abstract.py +47 -0
  41. parrot/llms/anthropic.py +42 -0
  42. parrot/llms/google.py +42 -0
  43. parrot/llms/groq.py +45 -0
  44. parrot/llms/hf.py +45 -0
  45. parrot/llms/openai.py +59 -0
  46. parrot/llms/pipes.py +114 -0
  47. parrot/llms/vertex.py +78 -0
  48. parrot/loaders/__init__.py +20 -0
  49. parrot/loaders/abstract.py +456 -0
  50. parrot/loaders/audio.py +106 -0
  51. parrot/loaders/basepdf.py +102 -0
  52. parrot/loaders/basevideo.py +280 -0
  53. parrot/loaders/csv.py +42 -0
  54. parrot/loaders/dir.py +37 -0
  55. parrot/loaders/excel.py +349 -0
  56. parrot/loaders/github.py +65 -0
  57. parrot/loaders/handlers/__init__.py +5 -0
  58. parrot/loaders/handlers/data.py +213 -0
  59. parrot/loaders/image.py +119 -0
  60. parrot/loaders/json.py +52 -0
  61. parrot/loaders/pdf.py +437 -0
  62. parrot/loaders/pdfchapters.py +142 -0
  63. parrot/loaders/pdffn.py +112 -0
  64. parrot/loaders/pdfimages.py +207 -0
  65. parrot/loaders/pdfmark.py +88 -0
  66. parrot/loaders/pdftables.py +145 -0
  67. parrot/loaders/ppt.py +30 -0
  68. parrot/loaders/qa.py +81 -0
  69. parrot/loaders/repo.py +103 -0
  70. parrot/loaders/rtd.py +65 -0
  71. parrot/loaders/txt.py +92 -0
  72. parrot/loaders/utils/__init__.py +1 -0
  73. parrot/loaders/utils/models.py +25 -0
  74. parrot/loaders/video.py +96 -0
  75. parrot/loaders/videolocal.py +120 -0
  76. parrot/loaders/vimeo.py +106 -0
  77. parrot/loaders/web.py +216 -0
  78. parrot/loaders/web_base.py +112 -0
  79. parrot/loaders/word.py +125 -0
  80. parrot/loaders/youtube.py +192 -0
  81. parrot/manager.py +166 -0
  82. parrot/models.py +372 -0
  83. parrot/py.typed +0 -0
  84. parrot/stores/__init__.py +48 -0
  85. parrot/stores/abstract.py +171 -0
  86. parrot/stores/milvus.py +632 -0
  87. parrot/stores/qdrant.py +153 -0
  88. parrot/tools/__init__.py +12 -0
  89. parrot/tools/abstract.py +53 -0
  90. parrot/tools/asknews.py +32 -0
  91. parrot/tools/bing.py +13 -0
  92. parrot/tools/duck.py +62 -0
  93. parrot/tools/google.py +170 -0
  94. parrot/tools/stack.py +26 -0
  95. parrot/tools/weather.py +70 -0
  96. parrot/tools/wikipedia.py +59 -0
  97. parrot/tools/zipcode.py +179 -0
  98. parrot/utils/__init__.py +2 -0
  99. parrot/utils/parsers/__init__.py +5 -0
  100. parrot/utils/parsers/toml.cpython-39-x86_64-linux-gnu.so +0 -0
  101. parrot/utils/toml.py +11 -0
  102. parrot/utils/types.cpython-39-x86_64-linux-gnu.so +0 -0
  103. parrot/utils/uv.py +11 -0
  104. parrot/version.py +10 -0
  105. resources/users/__init__.py +5 -0
  106. resources/users/handlers.py +13 -0
  107. resources/users/models.py +205 -0
  108. settings/__init__.py +0 -0
  109. settings/settings.py +51 -0
parrot/loaders/image.py ADDED
@@ -0,0 +1,119 @@
+ from typing import Any
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ import numpy as np
+ from PIL import Image
+ from langchain.docstore.document import Document
+ from transformers import CLIPModel
+ import torch
+ from torchvision import transforms
+ from .abstract import AbstractLoader
+ from ..stores.abstract import AbstractStore
+
+
+ class ImageLoader(AbstractLoader):
+     """
+     Image Loader.
+     """
+     _extension = ['.jpg', '.jpeg', '.png']
+     chunk_size = 768
+
+     def __init__(
+         self,
+         path: PurePath,
+         store: AbstractStore,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'image',
+         **kwargs
+     ):
+         super().__init__(tokenizer, text_splitter, source_type, **kwargs)
+         self.path = path
+         if isinstance(path, str):
+             self.path = Path(path).resolve()
+         # Model:
+         self._model = CLIPModel.from_pretrained(
+             # "openai/clip-vit-base-patch32"
+             "openai/clip-vit-large-patch14-336"
+         )
+         # Define image preprocessing
+         self._preprocess = transforms.Compose(
+             [
+                 transforms.Resize((336, 336)),  # Adjust the size to match the model's expected input
+                 transforms.CenterCrop(336),  # Optionally add a center crop if needed
+                 transforms.ToTensor(),
+                 transforms.Normalize(
+                     (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
+                 )  # CLIP's original normalization
+             ]
+         )
+         # required Milvus Store:
+         self.store = store
+
+     def transform_image(self, img_data):
+         image = self._preprocess(img_data)
+         image = image.unsqueeze(0)
+         with torch.no_grad():
+             features = self._model.get_image_features(pixel_values=image)
+         embedding = features.squeeze().cpu().numpy()
+         return embedding.astype(np.float32)
+
+     def _insert_image(self, data):
+         return self.store.insert(data)
+
+     def _load_image(self, path) -> list:
+         """
+         Load an Image file.
+         Args:
+             path (Path): The path to the Image file.
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading Image file: {path}")
+             img = Image.open(path).convert('RGB')
+             embedding = self.transform_image(img).tolist()
+             data = {
+                 "url": '',
+                 "source": f"{path.name}",
+                 "filename": path,
+                 "question": '',
+                 "answer": '',
+                 "source_type": self._source_type,
+                 "type": "image",
+                 "text": '',
+                 "vector": embedding,
+                 "document_meta": {
+                     "image": path.name,
+                     "extension": path.suffix
+                 }
+             }
+             self._insert_image([embedding])
+         return []
+
+     def load(self) -> list:
+         """
+         Load data from an Image file.
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if not self.path.exists():
+             raise FileNotFoundError(f"Image file/directory not found: {self.path}")
+         if self.path.is_dir():
+             # iterate over the files in the directory
+             for ext in self._extension:
+                 for item in self.path.glob(f'*{ext}'):
+                     self._load_image(item)
+         elif self.path.is_file():
+             self._load_image(self.path)
+         else:
+             raise ValueError(
+                 f"Image Loader: Invalid path: {self.path}"
+             )
+         # Load Image loads the image directly to database.
+         return True
+
+     def parse(self, source):
+         raise NotImplementedError(
+             "Parser method is not implemented for ImageLoader."
+         )
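
A minimal usage sketch for the loader above (the directory path and store setup are hypothetical; MilvusStore is assumed from parrot/stores/milvus.py in this wheel, and its real constructor arguments may differ):

    from parrot.stores.milvus import MilvusStore  # assumed import path
    from parrot.loaders.image import ImageLoader

    store = MilvusStore(collection_name="images")  # hypothetical configuration
    loader = ImageLoader(path="/data/photos", store=store)
    loader.load()  # embeds each .jpg/.jpeg/.png with CLIP, inserts vectors into the store

Note that despite the -> list annotations, _load_image always returns an empty list and load returns True: insertion happens as a side effect, and the data dict built in _load_image is never inserted, since only [embedding] is passed to _insert_image.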
parrot/loaders/json.py ADDED
@@ -0,0 +1,52 @@
+ from collections.abc import Callable
+ from pathlib import PurePath
+ from langchain_community.document_loaders import JSONLoader as JSLoader
+ from .abstract import AbstractLoader
+
+
+ class JSONLoader(AbstractLoader):
+     """
+     Loader for JSON files.
+     """
+     _extension = ['.json']
+     extract_metadata: Callable = None
+
+     def extract_metadata(self, record: dict, metadata: dict) -> dict:
+         meta = {
+             "source_type": self._source_type,
+             "priority": self._priority,
+         }
+         return meta
+
+     def load(self, path: PurePath) -> list:
+         """
+         Load data from a JSON file.
+
+         Args:
+             path (PurePath): The path to the JSON file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading JSON file: {path}")
+             # Create metadata for each chunk
+             meta = {
+                 "filename": str(path),
+             }
+             args = {
+                 "metadata_func": self.extract_metadata,
+             }
+             loader = JSLoader(
+                 file_path=path,
+                 jq_schema=".",
+                 text_content=False,
+                 **args
+             )
+             documents = loader.load()
+             for doc in documents:
+                 doc.metadata.update(meta)
+             # Split the documents into chunks
+             return self.split_documents(documents)
+         else:
+             return []
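
A usage sketch for JSONLoader (hypothetical file path; assumes the AbstractLoader defaults allow constructing the loader without an explicit tokenizer or text splitter):

    from pathlib import Path
    from parrot.loaders.json import JSONLoader

    loader = JSONLoader()
    # jq_schema="." hands the whole JSON document to the underlying loader;
    # text_content=False permits non-string values in the records.
    chunks = loader.load(Path("/data/records.json"))
    for doc in chunks:
        print(doc.metadata["filename"], len(doc.page_content))

Worth noting for review: the class attribute extract_metadata: Callable = None is immediately shadowed by the method of the same name, and that method ignores its record and metadata arguments, so every document gets only source_type and priority from it.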
parrot/loaders/pdf.py ADDED
@@ -0,0 +1,437 @@
+ from collections.abc import Callable
+ from pathlib import Path, PurePath
+ from typing import Any
+ from io import BytesIO
+ import re
+ import ftfy
+ import fitz
+ import pytesseract
+ from paddleocr import PaddleOCR
+ import torch
+ import cv2
+ from transformers import (
+     # DonutProcessor,
+     # VisionEncoderDecoderModel,
+     # VisionEncoderDecoderConfig,
+     # ViTImageProcessor,
+     # AutoTokenizer,
+     LayoutLMv3ForTokenClassification,
+     LayoutLMv3Processor
+ )
+ from pdf4llm import to_markdown
+ from PIL import Image
+ from langchain.docstore.document import Document
+ from navconfig import config
+ from .basepdf import BasePDF
+
+
+ class PDFLoader(BasePDF):
+     """
+     Loader for PDF files.
+     """
+     def __init__(
+         self,
+         path: PurePath,
+         tokenizer: Callable[..., Any] = None,
+         text_splitter: Callable[..., Any] = None,
+         source_type: str = 'pdf',
+         language: str = "eng",
+         **kwargs
+     ):
+         super().__init__(
+             path=path,
+             tokenizer=tokenizer,
+             text_splitter=text_splitter,
+             source_type=source_type,
+             language=language,
+             **kwargs
+         )
+         self.parse_images = kwargs.get('parse_images', False)
+         self.page_as_images = kwargs.get('page_as_images', False)
+         if self.page_as_images is True:
+             # # Load the processor and model from Hugging Face
+             # self.image_processor = DonutProcessor.from_pretrained(
+             #     "naver-clova-ix/donut-base-finetuned-docvqa"
+             # )
+             # self.image_model = VisionEncoderDecoderModel.from_pretrained(
+             #     "naver-clova-ix/donut-base-finetuned-docvqa",
+
+             # )
+             # Load the processor and model from Hugging Face
+             self.image_processor = LayoutLMv3Processor.from_pretrained(
+                 "microsoft/layoutlmv3-base",
+                 apply_ocr=True
+             )
+             self.image_model = LayoutLMv3ForTokenClassification.from_pretrained(
+                 # "microsoft/layoutlmv3-base-finetuned-funsd"
+                 "HYPJUDY/layoutlmv3-base-finetuned-funsd"
+             )
+             # Set device to GPU if available
+             self.image_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+             self.image_model.to(self.image_device)
+
+         # Table Settings:
+         self.table_settings = {
+             # "vertical_strategy": "text",
+             # "horizontal_strategy": "text",
+             "intersection_x_tolerance": 3,
+             "intersection_y_tolerance": 3
+         }
+         table_settings = kwargs.get('table_setttings', {})
+         if table_settings:
+             self.table_settings.update(table_settings)
+
+     def explain_image(self, image_path):
+         """Function to explain the image."""
+         # with open(image_path, "rb") as image_file:
+         #     image_content = image_file.read()
+
+         # Open the image
+         image = cv2.imread(image_path)
+         task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
+         question = "Extract Questions about Happily Greet"
+         prompt = task_prompt.replace("{user_input}", question)
+
+         decoder_input_ids = self.image_processor.tokenizer(
+             prompt,
+             add_special_tokens=False,
+             return_tensors="pt",
+         ).input_ids
+
+         pixel_values = self.image_processor(
+             image,
+             return_tensors="pt"
+         ).pixel_values
+
+         # Send inputs to the appropriate device
+         pixel_values = pixel_values.to(self.image_device)
+         decoder_input_ids = decoder_input_ids.to(self.image_device)
+
+         outputs = self.image_model.generate(
+             pixel_values,
+             decoder_input_ids=decoder_input_ids,
+             max_length=self.image_model.decoder.config.max_position_embeddings,
+             pad_token_id=self.image_processor.tokenizer.pad_token_id,
+             eos_token_id=self.image_processor.tokenizer.eos_token_id,
+             bad_words_ids=[[self.image_processor.tokenizer.unk_token_id]],
+             # use_cache=True
+             return_dict_in_generate=True,
+         )
+
+         sequence = self.image_processor.batch_decode(outputs.sequences)[0]
+
+         sequence = sequence.replace(
+             self.image_processor.tokenizer.eos_token, ""
+         ).replace(
+             self.image_processor.tokenizer.pad_token, ""
+         )
+         # remove first task start token
+         sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
+         # Print the extracted sequence
+         print("Extracted Text:", sequence)
+
+         print(self.image_processor.token2json(sequence))
+
+         # Format the output as Markdown (optional step)
+         markdown_text = self.format_as_markdown(sequence)
+         print("Markdown Format:\n", markdown_text)
+
+         return None
+
+     def convert_to_markdown(self, text):
+         """
+         Convert the cleaned text into a markdown format.
+         You can enhance this function to detect tables, headings, etc.
+         """
+         # For example, we can identify sections or headers and format them in Markdown
+         markdown_text = text
+         # Detect headings and bold them
+         markdown_text = re.sub(r"(^.*Scorecard.*$)", r"## \1", markdown_text)
+         # Convert lines with ":" to a list item (rough approach)
+         markdown_text = re.sub(r"(\w+):", r"- **\1**:", markdown_text)
+         # Return the markdown formatted text
+         return markdown_text
+
+     def clean_tokenized_text(self, tokenized_text):
+         """
+         Clean the tokenized text by fixing encoding issues and formatting, preserving line breaks.
+         """
+         # Fix encoding issues using ftfy
+         cleaned_text = ftfy.fix_text(tokenized_text)
+
+         # Remove <s> and </s> tags (special tokens)
+         cleaned_text = cleaned_text.replace("<s>", "").replace("</s>", "")
+
+         # Replace special characters like 'Ġ' and fix multiple spaces, preserving new lines
+         cleaned_text = cleaned_text.replace("Ġ", " ")
+
+         # Avoid collapsing line breaks, but still normalize multiple spaces
+         # Replace multiple spaces with a single space, but preserve line breaks
+         cleaned_text = re.sub(r" +", " ", cleaned_text)
+
+         return cleaned_text.strip()
+
+     def extract_page_text(self, image_path) -> str:
+         # Open the image
+         image = Image.open(image_path).convert("RGB")
+
+         # Processor handles the OCR internally, no need for words or boxes
+         encoding = self.image_processor(image, return_tensors="pt", truncation=True)
+         encoding = {k: v.to(self.image_device) for k, v in encoding.items()}
+
+         # Forward pass
+         outputs = self.image_model(**encoding)
+         logits = outputs.logits
+
+         # Get predictions
+         predictions = logits.argmax(-1).squeeze().tolist()
+         labels = [self.image_model.config.id2label[pred] for pred in predictions]
+
+         # Get the words and boxes from the processor's OCR step
+         words = self.image_processor.tokenizer.convert_ids_to_tokens(
+             encoding['input_ids'].squeeze().tolist()
+         )
+         boxes = encoding['bbox'].squeeze().tolist()
+
+         # Combine words and labels, preserving line breaks based on vertical box position
+         extracted_text = ""
+         last_box = None
+         for word, label, box in zip(words, labels, boxes):
+             if label != 'O':
+                 # Check if the current word is on a new line based on the vertical position of the box
+                 if last_box and abs(box[1] - last_box[1]) > 10:  # A threshold for line breaks
+                     extracted_text += "\n"  # Add a line break
+
+                 extracted_text += f"{word} "
+                 last_box = box
+         cleaned_text = self.clean_tokenized_text(extracted_text)
+         markdown_text = self.convert_to_markdown(cleaned_text)
+         return markdown_text
+
+     def _load_pdf(self, path: Path) -> list:
+         """
+         Load a PDF file using the Fitz library.
+
+         Args:
+             path (Path): The path to the PDF file.
+
+         Returns:
+             list: A list of Langchain Documents.
+         """
+         if self._check_path(path):
+             self.logger.info(f"Loading PDF file: {path}")
+             pdf = fitz.open(str(path))  # Open the PDF file
+             docs = []
+             try:
+                 md_text = to_markdown(pdf)  # get markdown for all pages
+                 _meta = {
+                     "url": f'{path}',
+                     "source": f"{path.name}",
+                     "filename": path.name,
+                     "type": 'pdf',
+                     "question": '',
+                     "answer": '',
+                     "source_type": self._source_type,
+                     "data": {},
+                     "summary": '',
+                     "document_meta": {
+                         "title": pdf.metadata.get("title", ""),
+                         "creationDate": pdf.metadata.get("creationDate", ""),
+                         "author": pdf.metadata.get("author", ""),
+                     }
+                 }
+                 docs.append(
+                     Document(
+                         page_content=md_text,
+                         metadata=_meta
+                     )
+                 )
+             except Exception:
+                 pass
+             for page_number in range(pdf.page_count):
+                 page = pdf[page_number]
+                 text = page.get_text()
+                 # first: text
+                 if text:
+                     page_num = page_number + 1
+                     try:
+                         summary = self.get_summary_from_text(text)
+                     except Exception:
+                         summary = ''
+                     metadata = {
+                         "url": '',
+                         "source": f"{path.name} Page.#{page_num}",
+                         "filename": path.name,
+                         "index": f"{page_num}",
+                         "type": 'pdf',
+                         "question": '',
+                         "answer": '',
+                         "source_type": self._source_type,
+                         "data": {},
+                         "summary": summary,
+                         "document_meta": {
+                             "title": pdf.metadata.get("title", ""),
+                             "creationDate": pdf.metadata.get("creationDate", ""),
+                             "author": pdf.metadata.get("author", ""),
+                         }
+                     }
+                     docs.append(
+                         Document(
+                             page_content=text,
+                             metadata=metadata
+                         )
+                     )
+                 # Extract images and use OCR to get text from each image
+                 # second: images
+                 file_name = path.stem.replace(' ', '_').replace('.', '').lower()
+                 if self.parse_images is True:
+                     # extract any images in page:
+                     image_list = page.get_images(full=True)
+                     for img_index, img in enumerate(image_list):
+                         xref = img[0]
+                         base_image = pdf.extract_image(xref)
+                         image = Image.open(BytesIO(base_image["image"]))
+                         url = ''
+                         if self.save_images is True:
+                             img_name = f'image_{file_name}_{page_num}_{img_index}.png'
+                             img_path = self._imgdir.joinpath(img_name)
+                             self.logger.notice(
+                                 f"Saving Image Page on {img_path}"
+                             )
+                             try:
+                                 image.save(
+                                     img_path,
+                                     format="png",
+                                     optimize=True
+                                 )
+                                 url = f'/static/images/{img_name}'
+                             except OSError:
+                                 pass
+                         # Use Tesseract to extract text from image
+                         image_text = pytesseract.image_to_string(
+                             image,
+                             lang=self._lang
+                         )
+                         # TODO: add the summary (explanation)
+                         # Create a document for each image
+                         image_meta = {
+                             "url": url,
+                             "source": f"{path.name} Page.#{page_num}",
+                             "filename": path.name,
+                             "index": f"{path.name}:{page_num}",
+                             "question": '',
+                             "answer": '',
+                             "type": 'image',
+                             "data": {},
+                             "summary": '',
+                             "document_meta": {
+                                 "image_index": img_index,
+                                 "image_name": img_name,
+                                 "description": f"Extracted from {page_number}."
+                             },
+                             "source_type": self._source_type
+                         }
+                         docs.append(
+                             Document(page_content=image_text, metadata=image_meta)
+                         )
+                 # third: tables
+                 # Look for tables on this page and display the table count
+                 try:
+                     tabs = page.find_tables()
+                     for tab_idx, tab in enumerate(tabs):
+                         # iterating over all tables in page:
+                         df = tab.to_pandas()  # convert to pandas DataFrame
+                         # converting to markdown, but after pre-processing pandas
+                         df = df.dropna(axis=1, how='all')
+                         df = df.dropna(how='all', axis=0)  # Drop empty rows
+                         table_meta = {
+                             "url": '',
+                             "source": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
+                             "filename": path.name,
+                             "index": f"{path.name}:{page_num}",
+                             "question": '',
+                             "answer": '',
+                             "type": 'table',
+                             "data": {},
+                             "summary": '',
+                             "document_meta": {
+                                 "table_index": tab_idx,
+                                 "table_shape": df.shape,
+                                 "table_columns": df.columns.tolist(),
+                                 "description": f"Extracted from {page_number}."
+                             },
+                             "source_type": self._source_type
+                         }
+                         txt = df.to_markdown()
+                         if txt:
+                             docs.append(
+                                 Document(page_content=txt, metadata=table_meta)
+                             )
+                 except Exception as exc:
+                     print(exc)
+                 # fourth: page as image
+                 if self.page_as_images is True:
+                     # Convert the page to a Pixmap (which is an image)
+                     mat = fitz.Matrix(2, 2)
+                     pix = page.get_pixmap(dpi=300, matrix=mat)  # Increase DPI for better resolution
+                     img_name = f'{file_name}_page_{page_num}.png'
+                     img_path = self._imgdir.joinpath(img_name)
+                     if img_path.exists():
+                         img_path.unlink(missing_ok=True)
+                     self.logger.notice(
+                         f"Saving Page {page_number} as Image on {img_path}"
+                     )
+                     pix.save(
+                         img_path
+                     )
+                     # TODO passing the image to an AI visual to get explanation
+                     # Get the extracted text from the image
+                     text = self.extract_page_text(img_path)
+                     url = f'/static/images/{img_name}'
+                     image_meta = {
+                         "url": url,
+                         "source": f"{path.name} Page.#{page_num}",
+                         "filename": path.name,
+                         "index": f"{path.name}:{page_num}",
+                         "question": '',
+                         "answer": '',
+                         "type": 'page',
+                         "data": {},
+                         "summary": '',
+                         "document_meta": {
+                             "image_name": img_name,
+                             "page_number": f"{page_number}"
+                         },
+                         "source_type": self._source_type
+                     }
+                     docs.append(
+                         Document(page_content=text, metadata=image_meta)
+                     )
+             pdf.close()
+             return docs
+         else:
+             return []
+
+     def get_ocr(self, img_path) -> list:
+         # Initialize PaddleOCR with table recognition
+         self.ocr_model = PaddleOCR(
+             lang='en',
+             det_model_dir=None,
+             rec_model_dir=None,
+             rec_char_dict_path=None,
+             table=True,
+             # use_angle_cls=True,
+             # use_gpu=True
+         )
+         result = self.ocr_model.ocr(img_path, cls=True)
+
+         # extract tables:
+         # The result contains the table structure and content
+         tables = []
+         for line in result:
+             if 'html' in line[1]:
+                 html_table = line[1]['html']
+                 tables.append(html_table)
+
+         print('TABLES > ', tables)
+ print('TABLES > ', tables)