ai-parrot 0.3.4__cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ai-parrot might be problematic.
- ai_parrot-0.3.4.dist-info/LICENSE +21 -0
- ai_parrot-0.3.4.dist-info/METADATA +319 -0
- ai_parrot-0.3.4.dist-info/RECORD +109 -0
- ai_parrot-0.3.4.dist-info/WHEEL +6 -0
- ai_parrot-0.3.4.dist-info/top_level.txt +3 -0
- parrot/__init__.py +21 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +728 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +366 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +83 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/odoo.py +17 -0
- parrot/chatbots/retrievals/__init__.py +578 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +110 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +162 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +137 -0
- parrot/llms/abstract.py +47 -0
- parrot/llms/anthropic.py +42 -0
- parrot/llms/google.py +42 -0
- parrot/llms/groq.py +45 -0
- parrot/llms/hf.py +45 -0
- parrot/llms/openai.py +59 -0
- parrot/llms/pipes.py +114 -0
- parrot/llms/vertex.py +78 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/audio.py +106 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +437 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +120 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +166 -0
- parrot/models.py +372 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +48 -0
- parrot/stores/abstract.py +171 -0
- parrot/stores/milvus.py +632 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +12 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-39-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/image.py
ADDED
@@ -0,0 +1,119 @@
from typing import Any
from collections.abc import Callable
from pathlib import Path, PurePath
import numpy as np
from PIL import Image
from langchain.docstore.document import Document
from transformers import CLIPModel
import torch
from torchvision import transforms
from .abstract import AbstractLoader
from ..stores.abstract import AbstractStore


class ImageLoader(AbstractLoader):
    """
    Image Loader.
    """
    _extension = ['.jpg', '.jpeg', '.png']
    chunk_size = 768

    def __init__(
        self,
        path: PurePath,
        store: AbstractStore,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'image',
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()
        # Model:
        self._model = CLIPModel.from_pretrained(
            # "openai/clip-vit-base-patch32"
            "openai/clip-vit-large-patch14-336"
        )
        # Define image preprocessing
        self._preprocess = transforms.Compose(
            [
                transforms.Resize((336, 336)),  # Adjust the size to match the model's expected input
                transforms.CenterCrop(336),  # Optionally add a center crop if needed
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
                )  # CLIP's original normalization
            ]
        )
        # required Milvus Store:
        self.store = store

    def transform_image(self, img_data):
        image = self._preprocess(img_data)
        image = image.unsqueeze(0)
        with torch.no_grad():
            features = self._model.get_image_features(pixel_values=image)
        embedding = features.squeeze().cpu().numpy()
        return embedding.astype(np.float32)

    def _insert_image(self, data):
        return self.store.insert(data)

    def _load_image(self, path) -> list:
        """
        Load an Image file.
        Args:
            path (Path): The path to the Image file.
        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading Image file: {path}")
            img = Image.open(path).convert('RGB')
            embedding = self.transform_image(img).tolist()
            data = {
                "url": '',
                "source": f"{path.name}",
                "filename": path,
                "question": '',
                "answer": '',
                "source_type": self._source_type,
                "type": "image",
                "text": '',
                "vector": embedding,
                "document_meta": {
                    "image": path.name,
                    "extension": path.suffix
                }
            }
            self._insert_image([embedding])
        return []

    def load(self) -> list:
        """
        Load data from a Image file.
        Returns:
            list: A list of Langchain Documents.
        """
        if not self.path.exists():
            raise FileNotFoundError(f"Image file/directory not found: {self.path}")
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    self._load_image(item)
        elif self.path.is_file():
            self._load_image(self.path)
        else:
            raise ValueError(
                f"Image Loader: Invalid path: {self.path}"
            )
        # Load Image loads the image directly to database.
        return True

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for ImageLoader."
        )
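ImageLoader does not produce text chunks; the CLIP embedding returned by transform_image is what gets written to the vector store. A standalone sketch of that embedding step (assumes transformers, torch, torchvision and Pillow are installed; "sample.jpg" is a placeholder file):

```python
import torch
from PIL import Image
from torchvision import transforms
from transformers import CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336")
preprocess = transforms.Compose([
    transforms.Resize((336, 336)),
    transforms.CenterCrop(336),
    transforms.ToTensor(),
    transforms.Normalize(
        (0.48145466, 0.4578275, 0.40821073),
        (0.26862954, 0.26130258, 0.27577711),
    ),
])

image = Image.open("sample.jpg").convert("RGB")  # placeholder input file
pixel_values = preprocess(image).unsqueeze(0)    # add a batch dimension
with torch.no_grad():
    features = model.get_image_features(pixel_values=pixel_values)
embedding = features.squeeze().cpu().numpy()
print(embedding.shape)  # (768,) for this checkpoint, matching chunk_size = 768
```

Note that _load_image builds a full data record but passes only the raw embedding to _insert_image, so the metadata dictionary is currently discarded.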
parrot/loaders/json.py
ADDED
@@ -0,0 +1,52 @@
from collections.abc import Callable
from pathlib import PurePath
from langchain_community.document_loaders import JSONLoader as JSLoader
from .abstract import AbstractLoader


class JSONLoader(AbstractLoader):
    """
    Loader for JSON files.
    """
    _extension = ['.json']
    extract_metadata: Callable = None

    def extract_metadata(self, record: dict, metadata: dict) -> dict:
        meta = {
            "source_type": self._source_type,
            "priority": self._priority,
        }
        return meta

    def load(self, path: PurePath) -> list:
        """
        Load data from a JSON file.

        Args:
            source (str): The path to the JSON file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading JSON file: {path}")
            # Create metadata for each chunk
            meta = {
                "filename": str(path),
            }
            args = {
                "metadata_func": self.extract_metadata,
            }
            loader = JSLoader(
                file_path=path,
                jq_schema=".",
                text_content=False,
                **args
            )
            documents = loader.load()
            for doc in documents:
                doc.metadata.update(meta)
            # Split the documents into chunks
            return self.split_documents(documents)
        else:
            return []
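The class defers to langchain_community's JSONLoader, selecting content with a jq expression and attaching metadata through a callback. A minimal sketch of that underlying call (assumes the jq package is installed; "data.json" is a placeholder file):

```python
from langchain_community.document_loaders import JSONLoader

def add_metadata(record: dict, metadata: dict) -> dict:
    # Counterpart of extract_metadata above: attach extra keys to each document.
    metadata["source_type"] = "json"
    return metadata

loader = JSONLoader(
    file_path="data.json",      # placeholder path
    jq_schema=".",              # select the whole document
    text_content=False,         # keep non-string content as serialized JSON
    metadata_func=add_metadata,
)
docs = loader.load()
print(len(docs), docs[0].metadata)
```

Also note that the class attribute extract_metadata: Callable = None is immediately shadowed by the method of the same name defined below it.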
parrot/loaders/pdf.py
ADDED
@@ -0,0 +1,437 @@
from collections.abc import Callable
from pathlib import Path, PurePath
from typing import Any
from io import BytesIO
import re
import ftfy
import fitz
import pytesseract
from paddleocr import PaddleOCR
import torch
import cv2
from transformers import (
    # DonutProcessor,
    # VisionEncoderDecoderModel,
    # VisionEncoderDecoderConfig,
    # ViTImageProcessor,
    # AutoTokenizer,
    LayoutLMv3ForTokenClassification,
    LayoutLMv3Processor
)
from pdf4llm import to_markdown
from PIL import Image
from langchain.docstore.document import Document
from navconfig import config
from .basepdf import BasePDF


class PDFLoader(BasePDF):
    """
    Loader for PDF files.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self.parse_images = kwargs.get('parse_images', False)
        self.page_as_images = kwargs.get('page_as_images', False)
        if self.page_as_images is True:
            # # Load the processor and model from Hugging Face
            # self.image_processor = DonutProcessor.from_pretrained(
            #     "naver-clova-ix/donut-base-finetuned-docvqa"
            # )
            # self.image_model = VisionEncoderDecoderModel.from_pretrained(
            #     "naver-clova-ix/donut-base-finetuned-docvqa",
            # )
            # Load the processor and model from Hugging Face
            self.image_processor = LayoutLMv3Processor.from_pretrained(
                "microsoft/layoutlmv3-base",
                apply_ocr=True
            )
            self.image_model = LayoutLMv3ForTokenClassification.from_pretrained(
                # "microsoft/layoutlmv3-base-finetuned-funsd"
                "HYPJUDY/layoutlmv3-base-finetuned-funsd"
            )
            # Set device to GPU if available
            self.image_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.image_model.to(self.image_device)

        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 3,
            "intersection_y_tolerance": 3
        }
        table_settings = kwargs.get('table_setttings', {})
        if table_settings:
            self.table_settings.update(table_settings)
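The two constructor flags decide how much extra work _load_pdf does per page: parse_images OCRs embedded images with Tesseract, and page_as_images renders each page and runs it through the LayoutLMv3 model loaded above. A hypothetical instantiation sketch; the remaining constructor contract and the public load() entry point live in BasePDF/AbstractLoader, which are not part of this file, so treat the call below as an assumption rather than documented usage:

```python
from pathlib import Path
from parrot.loaders.pdf import PDFLoader

# Hypothetical usage; "report.pdf" is a placeholder file.
loader = PDFLoader(
    path=Path("report.pdf"),
    source_type="pdf",
    language="eng",
    parse_images=True,      # OCR images embedded in each page
    page_as_images=False,   # skip the LayoutLMv3 page-rendering pass
    table_setttings={"intersection_x_tolerance": 5},  # note: triple-t key, as read by __init__
)
docs = loader.load()  # assumed entry point defined in BasePDF (not shown in this diff)
```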
    def explain_image(self, image_path):
        """Function to explain the image."""
        # with open(image_path, "rb") as image_file:
        #     image_content = image_file.read()

        # Open the image
        image = cv2.imread(image_path)
        task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
        question = "Extract Questions about Happily Greet"
        prompt = task_prompt.replace("{user_input}", question)

        decoder_input_ids = self.image_processor.tokenizer(
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
        ).input_ids

        pixel_values = self.image_processor(
            image,
            return_tensors="pt"
        ).pixel_values

        # Send inputs to the appropriate device
        pixel_values = pixel_values.to(self.image_device)
        decoder_input_ids = decoder_input_ids.to(self.image_device)

        outputs = self.image_model.generate(
            pixel_values,
            decoder_input_ids=decoder_input_ids,
            max_length=self.image_model.decoder.config.max_position_embeddings,
            pad_token_id=self.image_processor.tokenizer.pad_token_id,
            eos_token_id=self.image_processor.tokenizer.eos_token_id,
            bad_words_ids=[[self.image_processor.tokenizer.unk_token_id]],
            # use_cache=True
            return_dict_in_generate=True,
        )

        sequence = self.image_processor.batch_decode(outputs.sequences)[0]

        sequence = sequence.replace(
            self.image_processor.tokenizer.eos_token, ""
        ).replace(
            self.image_processor.tokenizer.pad_token, ""
        )
        # remove first task start token
        sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
        # Print the extracted sequence
        print("Extracted Text:", sequence)

        print(self.image_processor.token2json(sequence))

        # Format the output as Markdown (optional step)
        markdown_text = self.format_as_markdown(sequence)
        print("Markdown Format:\n", markdown_text)

        return None

    def convert_to_markdown(self, text):
        """
        Convert the cleaned text into a markdown format.
        You can enhance this function to detect tables, headings, etc.
        """
        # For example, we can identify sections or headers and format them in Markdown
        markdown_text = text
        # Detect headings and bold them
        markdown_text = re.sub(r"(^.*Scorecard.*$)", r"## \1", markdown_text)
        # Convert lines with ":" to a list item (rough approach)
        markdown_text = re.sub(r"(\w+):", r"- **\1**:", markdown_text)
        # Return the markdown formatted text
        return markdown_text

    def clean_tokenized_text(self, tokenized_text):
        """
        Clean the tokenized text by fixing encoding issues and formatting, preserving line breaks.
        """
        # Fix encoding issues using ftfy
        cleaned_text = ftfy.fix_text(tokenized_text)

        # Remove <s> and </s> tags (special tokens)
        cleaned_text = cleaned_text.replace("<s>", "").replace("</s>", "")

        # Replace special characters like 'Ġ' and fix multiple spaces, preserving new lines
        cleaned_text = cleaned_text.replace("Ġ", " ")

        # Avoid collapsing line breaks, but still normalize multiple spaces
        # Replace multiple spaces with a single space, but preserve line breaks
        cleaned_text = re.sub(r" +", " ", cleaned_text)

        return cleaned_text.strip()

    def extract_page_text(self, image_path) -> str:
        # Open the image
        image = Image.open(image_path).convert("RGB")

        # Processor handles the OCR internally, no need for words or boxes
        encoding = self.image_processor(image, return_tensors="pt", truncation=True)
        encoding = {k: v.to(self.image_device) for k, v in encoding.items()}

        # Forward pass
        outputs = self.image_model(**encoding)
        logits = outputs.logits

        # Get predictions
        predictions = logits.argmax(-1).squeeze().tolist()
        labels = [self.image_model.config.id2label[pred] for pred in predictions]

        # Get the words and boxes from the processor's OCR step
        words = self.image_processor.tokenizer.convert_ids_to_tokens(
            encoding['input_ids'].squeeze().tolist()
        )
        boxes = encoding['bbox'].squeeze().tolist()

        # Combine words and labels, preserving line breaks based on vertical box position
        extracted_text = ""
        last_box = None
        for word, label, box in zip(words, labels, boxes):
            if label != 'O':
                # Check if the current word is on a new line based on the vertical position of the box
                if last_box and abs(box[1] - last_box[1]) > 10:  # A threshold for line breaks
                    extracted_text += "\n"  # Add a line break

                extracted_text += f"{word} "
                last_box = box
        cleaned_text = self.clean_tokenized_text(extracted_text)
        markdown_text = self.convert_to_markdown(cleaned_text)
        return markdown_text
    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            try:
                md_text = to_markdown(pdf)  # get markdown for all pages
                _meta = {
                    "url": f'{path}',
                    "source": f"{path.name}",
                    "filename": path.name,
                    "type": 'pdf',
                    "question": '',
                    "answer": '',
                    "source_type": self._source_type,
                    "data": {},
                    "summary": '',
                    "document_meta": {
                        "title": pdf.metadata.get("title", ""),
                        "creationDate": pdf.metadata.get("creationDate", ""),
                        "author": pdf.metadata.get("author", ""),
                    }
                }
                docs.append(
                    Document(
                        page_content=md_text,
                        metadata=_meta
                    )
                )
            except Exception:
                pass
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                text = page.get_text()
                # first: text
                if text:
                    page_num = page_number + 1
                    try:
                        summary = self.get_summary_from_text(text)
                    except Exception:
                        summary = ''
                    metadata = {
                        "url": '',
                        "source": f"{path.name} Page.#{page_num}",
                        "filename": path.name,
                        "index": f"{page_num}",
                        "type": 'pdf',
                        "question": '',
                        "answer": '',
                        "source_type": self._source_type,
                        "data": {},
                        "summary": summary,
                        "document_meta": {
                            "title": pdf.metadata.get("title", ""),
                            "creationDate": pdf.metadata.get("creationDate", ""),
                            "author": pdf.metadata.get("author", ""),
                        }
                    }
                    docs.append(
                        Document(
                            page_content=text,
                            metadata=metadata
                        )
                    )
                # Extract images and use OCR to get text from each image
                # second: images
                file_name = path.stem.replace(' ', '_').replace('.', '').lower()
                if self.parse_images is True:
                    # extract any images in page:
                    image_list = page.get_images(full=True)
                    for img_index, img in enumerate(image_list):
                        xref = img[0]
                        base_image = pdf.extract_image(xref)
                        image = Image.open(BytesIO(base_image["image"]))
                        url = ''
                        if self.save_images is True:
                            img_name = f'image_{file_name}_{page_num}_{img_index}.png'
                            img_path = self._imgdir.joinpath(img_name)
                            self.logger.notice(
                                f"Saving Image Page on {img_path}"
                            )
                            try:
                                image.save(
                                    img_path,
                                    format="png",
                                    optimize=True
                                )
                                url = f'/static/images/{img_name}'
                            except OSError:
                                pass
                        # Use Tesseract to extract text from image
                        image_text = pytesseract.image_to_string(
                            image,
                            lang=self._lang
                        )
                        # TODO: add the summary (explanation)
                        # Create a document for each image
                        image_meta = {
                            "url": url,
                            "source": f"{path.name} Page.#{page_num}",
                            "filename": path.name,
                            "index": f"{path.name}:{page_num}",
                            "question": '',
                            "answer": '',
                            "type": 'image',
                            "data": {},
                            "summary": '',
                            "document_meta": {
                                "image_index": img_index,
                                "image_name": img_name,
                                "description": f"Extracted from {page_number}."
                            },
                            "source_type": self._source_type
                        }
                        docs.append(
                            Document(page_content=image_text, metadata=image_meta)
                        )
                # third: tables
                # Look for tables on this page and display the table count
                try:
                    tabs = page.find_tables()
                    for tab_idx, tab in enumerate(tabs):
                        # iterating over all tables in page:
                        df = tab.to_pandas()  # convert to pandas DataFrame
                        # converting to markdown, but after pre-processing pandas
                        df = df.dropna(axis=1, how='all')
                        df = df.dropna(how='all', axis=0)  # Drop empty rows
                        table_meta = {
                            "url": '',
                            "source": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
                            "filename": path.name,
                            "index": f"{path.name}:{page_num}",
                            "question": '',
                            "answer": '',
                            "type": 'table',
                            "data": {},
                            "summary": '',
                            "document_meta": {
                                "table_index": tab_idx,
                                "table_shape": df.shape,
                                "table_columns": df.columns.tolist(),
                                "description": f"Extracted from {page_number}."
                            },
                            "source_type": self._source_type
                        }
                        txt = df.to_markdown()
                        if txt:
                            docs.append(
                                Document(page_content=txt, metadata=table_meta)
                            )
                except Exception as exc:
                    print(exc)
                # fourth: page as image
                if self.page_as_images is True:
                    # Convert the page to a Pixmap (which is an image)
                    mat = fitz.Matrix(2, 2)
                    pix = page.get_pixmap(dpi=300, matrix=mat)  # Increase DPI for better resolution
                    img_name = f'{file_name}_page_{page_num}.png'
                    img_path = self._imgdir.joinpath(img_name)
                    if img_path.exists():
                        img_path.unlink(missing_ok=True)
                    self.logger.notice(
                        f"Saving Page {page_number} as Image on {img_path}"
                    )
                    pix.save(
                        img_path
                    )
                    # TODO passing the image to a AI visual to get explanation
                    # Get the extracted text from the image
                    text = self.extract_page_text(img_path)
                    url = f'/static/images/{img_name}'
                    image_meta = {
                        "url": url,
                        "source": f"{path.name} Page.#{page_num}",
                        "filename": path.name,
                        "index": f"{path.name}:{page_num}",
                        "question": '',
                        "answer": '',
                        "type": 'page',
                        "data": {},
                        "summary": '',
                        "document_meta": {
                            "image_name": img_name,
                            "page_number": f"{page_number}"
                        },
                        "source_type": self._source_type
                    }
                    docs.append(
                        Document(page_content=text, metadata=image_meta)
                    )
            pdf.close()
            return docs
        else:
            return []
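_load_pdf layers four extraction passes per document: whole-file markdown via pdf4llm, per-page plain text, OCR over embedded images, and tables converted to markdown through pandas. The core PyMuPDF/pdf4llm calls it builds on, reduced to a standalone sketch (assumes PyMuPDF and pdf4llm are installed; "report.pdf" is a placeholder):

```python
import fitz  # PyMuPDF
from pdf4llm import to_markdown

pdf = fitz.open("report.pdf")                # placeholder input file
md_text = to_markdown(pdf)                   # markdown for the whole document
for page_number in range(pdf.page_count):
    page = pdf[page_number]
    text = page.get_text()                   # plain text for this page
    images = page.get_images(full=True)      # xrefs of embedded images
    tables = page.find_tables().tables       # list of detected tables (PyMuPDF >= 1.23)
    print(page_number + 1, len(text), len(images), len(tables))
pdf.close()
```

One caveat worth noting: page_num is only assigned inside the "if text:" branch above, so on a page with no extractable text the image and page-rendering passes reuse the previous page's number (or raise an UnboundLocalError if the very first page has no text).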
    def get_ocr(self, img_path) -> list:
        # Initialize PaddleOCR with table recognition
        self.ocr_model = PaddleOCR(
            lang='en',
            det_model_dir=None,
            rec_model_dir=None,
            rec_char_dict_path=None,
            table=True,
            # use_angle_cls=True,
            # use_gpu=True
        )
        result = self.ocr_model.ocr(img_path, cls=True)

        # extract tables:
        # The result contains the table structure and content
        tables = []
        for line in result:
            if 'html' in line[1]:
                html_table = line[1]['html']
                tables.append(html_table)

        print('TABLES > ', tables)
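get_ocr looks for an 'html' key in each OCR line, which is how table structure is exposed by PaddleOCR's PP-Structure pipeline; the plain PaddleOCR text pipeline instead yields detection boxes with (text, confidence) pairs, so whether this branch ever fires depends on the installed paddleocr version. A minimal sketch of the plain OCR result layout (assumes paddleocr and paddlepaddle are installed; the exact result nesting varies across releases, so treat it as an assumption; "page.png" is a placeholder):

```python
from paddleocr import PaddleOCR

ocr = PaddleOCR(lang="en")               # downloads detection/recognition models on first use
result = ocr.ocr("page.png", cls=True)   # placeholder image path
for line in result[0]:                   # one result list per input image
    box, (text, confidence) = line       # quadrilateral box plus recognized text
    print(text, round(confidence, 3))
```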