ai-parrot 0.1.0__cp311-cp311-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of ai-parrot might be problematic.
- ai_parrot-0.1.0.dist-info/LICENSE +21 -0
- ai_parrot-0.1.0.dist-info/METADATA +299 -0
- ai_parrot-0.1.0.dist-info/RECORD +108 -0
- ai_parrot-0.1.0.dist-info/WHEEL +5 -0
- ai_parrot-0.1.0.dist-info/top_level.txt +3 -0
- parrot/__init__.py +18 -0
- parrot/chatbots/__init__.py +7 -0
- parrot/chatbots/abstract.py +965 -0
- parrot/chatbots/asktroc.py +16 -0
- parrot/chatbots/base.py +257 -0
- parrot/chatbots/basic.py +9 -0
- parrot/chatbots/bose.py +17 -0
- parrot/chatbots/cody.py +17 -0
- parrot/chatbots/copilot.py +100 -0
- parrot/chatbots/dataframe.py +103 -0
- parrot/chatbots/hragents.py +15 -0
- parrot/chatbots/oddie.py +17 -0
- parrot/chatbots/retrievals/__init__.py +515 -0
- parrot/chatbots/retrievals/constitutional.py +19 -0
- parrot/conf.py +108 -0
- parrot/crew/__init__.py +3 -0
- parrot/crew/tools/__init__.py +22 -0
- parrot/crew/tools/bing.py +13 -0
- parrot/crew/tools/config.py +43 -0
- parrot/crew/tools/duckgo.py +62 -0
- parrot/crew/tools/file.py +24 -0
- parrot/crew/tools/google.py +168 -0
- parrot/crew/tools/gtrends.py +16 -0
- parrot/crew/tools/md2pdf.py +25 -0
- parrot/crew/tools/rag.py +42 -0
- parrot/crew/tools/search.py +32 -0
- parrot/crew/tools/url.py +21 -0
- parrot/exceptions.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/handlers/__init__.py +4 -0
- parrot/handlers/bots.py +196 -0
- parrot/handlers/chat.py +169 -0
- parrot/interfaces/__init__.py +6 -0
- parrot/interfaces/database.py +29 -0
- parrot/llms/__init__.py +0 -0
- parrot/llms/abstract.py +41 -0
- parrot/llms/anthropic.py +36 -0
- parrot/llms/google.py +37 -0
- parrot/llms/groq.py +33 -0
- parrot/llms/hf.py +39 -0
- parrot/llms/openai.py +49 -0
- parrot/llms/pipes.py +103 -0
- parrot/llms/vertex.py +68 -0
- parrot/loaders/__init__.py +20 -0
- parrot/loaders/abstract.py +456 -0
- parrot/loaders/basepdf.py +102 -0
- parrot/loaders/basevideo.py +280 -0
- parrot/loaders/csv.py +42 -0
- parrot/loaders/dir.py +37 -0
- parrot/loaders/excel.py +349 -0
- parrot/loaders/github.py +65 -0
- parrot/loaders/handlers/__init__.py +5 -0
- parrot/loaders/handlers/data.py +213 -0
- parrot/loaders/image.py +119 -0
- parrot/loaders/json.py +52 -0
- parrot/loaders/pdf.py +187 -0
- parrot/loaders/pdfchapters.py +142 -0
- parrot/loaders/pdffn.py +112 -0
- parrot/loaders/pdfimages.py +207 -0
- parrot/loaders/pdfmark.py +88 -0
- parrot/loaders/pdftables.py +145 -0
- parrot/loaders/ppt.py +30 -0
- parrot/loaders/qa.py +81 -0
- parrot/loaders/repo.py +103 -0
- parrot/loaders/rtd.py +65 -0
- parrot/loaders/txt.py +92 -0
- parrot/loaders/utils/__init__.py +1 -0
- parrot/loaders/utils/models.py +25 -0
- parrot/loaders/video.py +96 -0
- parrot/loaders/videolocal.py +107 -0
- parrot/loaders/vimeo.py +106 -0
- parrot/loaders/web.py +216 -0
- parrot/loaders/web_base.py +112 -0
- parrot/loaders/word.py +125 -0
- parrot/loaders/youtube.py +192 -0
- parrot/manager.py +152 -0
- parrot/models.py +347 -0
- parrot/py.typed +0 -0
- parrot/stores/__init__.py +0 -0
- parrot/stores/abstract.py +170 -0
- parrot/stores/milvus.py +540 -0
- parrot/stores/qdrant.py +153 -0
- parrot/tools/__init__.py +16 -0
- parrot/tools/abstract.py +53 -0
- parrot/tools/asknews.py +32 -0
- parrot/tools/bing.py +13 -0
- parrot/tools/duck.py +62 -0
- parrot/tools/google.py +170 -0
- parrot/tools/stack.py +26 -0
- parrot/tools/weather.py +70 -0
- parrot/tools/wikipedia.py +59 -0
- parrot/tools/zipcode.py +179 -0
- parrot/utils/__init__.py +2 -0
- parrot/utils/parsers/__init__.py +5 -0
- parrot/utils/parsers/toml.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/toml.py +11 -0
- parrot/utils/types.cpython-311-x86_64-linux-gnu.so +0 -0
- parrot/utils/uv.py +11 -0
- parrot/version.py +10 -0
- resources/users/__init__.py +5 -0
- resources/users/handlers.py +13 -0
- resources/users/models.py +205 -0
- settings/__init__.py +0 -0
- settings/settings.py +51 -0
parrot/loaders/image.py
ADDED
@@ -0,0 +1,119 @@
from typing import Any
from collections.abc import Callable
from pathlib import Path, PurePath
import numpy as np
from PIL import Image
from langchain.docstore.document import Document
from transformers import CLIPModel
import torch
from torchvision import transforms
from .abstract import AbstractLoader
from ..stores.abstract import AbstractStore


class ImageLoader(AbstractLoader):
    """
    Image Loader.
    """
    _extension = ['.jpg', '.jpeg', '.png']
    chunk_size = 768

    def __init__(
        self,
        path: PurePath,
        store: AbstractStore,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'image',
        **kwargs
    ):
        super().__init__(tokenizer, text_splitter, source_type, **kwargs)
        self.path = path
        if isinstance(path, str):
            self.path = Path(path).resolve()
        # Model:
        self._model = CLIPModel.from_pretrained(
            # "openai/clip-vit-base-patch32"
            "openai/clip-vit-large-patch14-336"
        )
        # Define image preprocessing
        self._preprocess = transforms.Compose(
            [
                transforms.Resize((336, 336)),  # Adjust the size to match the model's expected input
                transforms.CenterCrop(336),  # Optionally add a center crop if needed
                transforms.ToTensor(),
                transforms.Normalize(
                    (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
                )  # CLIP's original normalization
            ]
        )
        # required Milvus Store:
        self.store = store

    def transform_image(self, img_data):
        image = self._preprocess(img_data)
        image = image.unsqueeze(0)
        with torch.no_grad():
            features = self._model.get_image_features(pixel_values=image)
        embedding = features.squeeze().cpu().numpy()
        return embedding.astype(np.float32)

    def _insert_image(self, data):
        return self.store.insert(data)

    def _load_image(self, path) -> list:
        """
        Load an Image file.
        Args:
            path (Path): The path to the Image file.
        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading Image file: {path}")
            img = Image.open(path).convert('RGB')
            embedding = self.transform_image(img).tolist()
            data = {
                "url": '',
                "source": f"{path.name}",
                "filename": str(path),
                "question": '',
                "answer": '',
                "source_type": self._source_type,
                "type": "image",
                "text": '',
                "vector": embedding,
                "document_meta": {
                    "image": path.name,
                    "extension": path.suffix
                }
            }
            # insert the full record (metadata plus vector) into the store
            self._insert_image([data])
        return []

    def load(self) -> list:
        """
        Load data from an Image file.
        Returns:
            list: A list of Langchain Documents.
        """
        if not self.path.exists():
            raise FileNotFoundError(f"Image file/directory not found: {self.path}")
        if self.path.is_dir():
            # iterate over the files in the directory
            for ext in self._extension:
                for item in self.path.glob(f'*{ext}'):
                    self._load_image(item)
        elif self.path.is_file():
            self._load_image(self.path)
        else:
            raise ValueError(
                f"Image Loader: Invalid path: {self.path}"
            )
        # Load Image loads the image directly to database.
        return True

    def parse(self, source):
        raise NotImplementedError(
            "Parser method is not implemented for ImageLoader."
        )
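ImageLoader embeds each image with CLIP and writes the record straight into the vector store, returning no Documents. A minimal usage sketch follows (not part of the released diff; the no-argument MilvusStore construction is an assumption, since the store's constructor is not shown here):

# Hypothetical usage sketch; MilvusStore's constructor arguments are assumed.
from parrot.stores.milvus import MilvusStore
from parrot.loaders.image import ImageLoader

store = MilvusStore()  # assumption: defaults are enough to connect
loader = ImageLoader(path='photos/', store=store)
loader.load()  # embeds every .jpg/.jpeg/.png under photos/ and inserts it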
parrot/loaders/json.py
ADDED
@@ -0,0 +1,52 @@
from collections.abc import Callable
from pathlib import PurePath
from langchain_community.document_loaders import JSONLoader as JSLoader
from .abstract import AbstractLoader


class JSONLoader(AbstractLoader):
    """
    Loader for JSON files.
    """
    _extension = ['.json']

    def extract_metadata(self, record: dict, metadata: dict) -> dict:
        meta = {
            "source_type": self._source_type,
            "priority": self._priority,
        }
        return meta

    def load(self, path: PurePath) -> list:
        """
        Load data from a JSON file.

        Args:
            path (PurePath): The path to the JSON file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading JSON file: {path}")
            # Create metadata for each chunk
            meta = {
                "filename": str(path),
            }
            args = {
                "metadata_func": self.extract_metadata,
            }
            loader = JSLoader(
                file_path=path,
                jq_schema=".",
                text_content=False,
                **args
            )
            documents = loader.load()
            for doc in documents:
                doc.metadata.update(meta)
            # Split the documents into chunks
            return self.split_documents(documents)
        else:
            return []
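Because jq_schema is hard-coded to ".", the whole JSON document becomes a single record before splitting. A usage sketch under that reading (hedged: AbstractLoader lives in abstract.py and its constructor defaults are assumed to suffice):

# Hypothetical usage sketch; AbstractLoader's defaults are an assumption.
from pathlib import Path
from parrot.loaders.json import JSONLoader

loader = JSONLoader()
chunks = loader.load(Path('data/faq.json'))
for doc in chunks:
    print(doc.metadata['filename'], len(doc.page_content))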
parrot/loaders/pdf.py
ADDED
@@ -0,0 +1,187 @@
from collections.abc import Callable
from pathlib import Path, PurePath
from typing import Any
from io import BytesIO
import fitz
import pytesseract
from PIL import Image
from langchain.docstore.document import Document
from .basepdf import BasePDF


class PDFLoader(BasePDF):
    """
    Loader for PDF files.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        self.parse_images = kwargs.get('parse_images', False)
        # Table Settings:
        self.table_settings = {
            # "vertical_strategy": "text",
            # "horizontal_strategy": "text",
            "intersection_x_tolerance": 3,
            "intersection_y_tolerance": 3
        }
        table_settings = kwargs.get('table_settings', {})
        if table_settings:
            self.table_settings.update(table_settings)

    def _load_pdf(self, path: Path) -> list:
        """
        Load a PDF file using the Fitz (PyMuPDF) library.

        Args:
            path (Path): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents.
        """
        if self._check_path(path):
            self.logger.info(f"Loading PDF file: {path}")
            pdf = fitz.open(str(path))  # Open the PDF file
            docs = []
            for page_number in range(pdf.page_count):
                page = pdf[page_number]
                page_num = page_number + 1
                text = page.get_text()
                # first: text
                if text:
                    try:
                        summary = self.get_summary_from_text(text)
                    except Exception:
                        summary = ''
                    metadata = {
                        "url": '',
                        "source": f"{path.name} Page.#{page_num}",
                        "filename": path.name,
                        "index": f"{page_num}",
                        "type": 'pdf',
                        "question": '',
                        "answer": '',
                        "source_type": self._source_type,
                        "data": {},
                        "summary": summary,
                        "document_meta": {
                            "title": pdf.metadata.get("title", ""),
                            # "subject": pdf.metadata.get("subject", ""),
                            # "keywords": pdf.metadata.get("keywords", ""),
                            "creationDate": pdf.metadata.get("creationDate", ""),
                            # "modDate": pdf.metadata.get("modDate", ""),
                            # "producer": pdf.metadata.get("producer", ""),
                            # "creator": pdf.metadata.get("creator", ""),
                            "author": pdf.metadata.get("author", ""),
                        }
                    }
                    docs.append(
                        Document(
                            page_content=text,
                            metadata=metadata
                        )
                    )
                # Extract images and use OCR to get text from each image
                # second: images
                if self.parse_images is True:
                    image_list = page.get_images(full=True)
                    file_name = path.stem.replace(' ', '_').replace('.', '').lower()
                    for img_index, img in enumerate(image_list):
                        xref = img[0]
                        base_image = pdf.extract_image(xref)
                        image = Image.open(BytesIO(base_image["image"]))
                        url = ''
                        img_name = f'image_{file_name}_{page_num}_{img_index}.png'
                        if self.save_images is True:
                            img_path = self._imgdir.joinpath(img_name)
                            self.logger.notice(
                                f"Saving Image Page on {img_path}"
                            )
                            try:
                                image.save(
                                    img_path,
                                    format="png",
                                    optimize=True
                                )
                                url = f'/static/images/{img_name}'
                            except OSError:
                                pass
                        # Use Tesseract to extract text from image
                        image_text = pytesseract.image_to_string(
                            image,
                            lang=self._lang
                        )
                        # TODO: add the summary (explanation)
                        # Create a document for each image
                        image_meta = {
                            "url": url,
                            "source": f"{path.name} Page.#{page_num}",
                            "filename": path.name,
                            "index": f"{path.name}:{page_num}",
                            "question": '',
                            "answer": '',
                            "type": 'image',
                            "data": {},
                            "summary": '',
                            "document_meta": {
                                "image_index": img_index,
                                "image_name": img_name,
                                "description": f"Extracted from page {page_num}."
                            },
                            "source_type": self._source_type
                        }
                        docs.append(
                            Document(page_content=image_text, metadata=image_meta)
                        )
                # third: tables
                # Look for tables on this page and process each one
                try:
                    tabs = page.find_tables()
                    for tab_idx, tab in enumerate(tabs):
                        # iterating over all tables in page:
                        df = tab.to_pandas()  # convert to pandas DataFrame
                        # converting to markdown, but after pre-processing pandas
                        df = df.dropna(axis=1, how='all')  # Drop empty columns
                        df = df.dropna(how='all', axis=0)  # Drop empty rows
                        table_meta = {
                            "url": '',
                            "source": f"{path.name} Page.#{page_num} Table.#{tab_idx}",
                            "filename": path.name,
                            "index": f"{path.name}:{page_num}",
                            "question": '',
                            "answer": '',
                            "type": 'table',
                            "data": {},
                            "summary": '',
                            "document_meta": {
                                "table_index": tab_idx,
                                "table_shape": df.shape,
                                "table_columns": df.columns.tolist(),
                                "description": f"Extracted from page {page_num}."
                            },
                            "source_type": self._source_type
                        }
                        txt = df.to_markdown()
                        if txt:
                            docs.append(
                                Document(page_content=txt, metadata=table_meta)
                            )
                except Exception as exc:
                    self.logger.error(f"Error extracting tables: {exc}")
            pdf.close()
            return docs
        else:
            return []
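Each page can therefore yield up to three kinds of Documents: page text, OCR'd images, and Markdown tables, distinguished by metadata['type']. A usage sketch (hedged: BasePDF lives in basepdf.py and is not shown here, so BasePDF.load() dispatching to _load_pdf is an assumption):

# Hypothetical usage sketch; BasePDF.load() behaviour is an assumption.
from pathlib import Path
from parrot.loaders.pdf import PDFLoader

loader = PDFLoader(
    path=Path('report.pdf'),
    parse_images=True,  # also OCR embedded images with Tesseract
    table_settings={"intersection_x_tolerance": 5},
)
docs = loader.load()  # assumed to walk the path and call _load_pdf
kinds = {d.metadata['type'] for d in docs}  # subset of {'pdf', 'image', 'table'}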
parrot/loaders/pdfchapters.py
ADDED
@@ -0,0 +1,142 @@
from collections.abc import Callable
from typing import Any
from pathlib import PurePath
import fitz  # PyMuPDF
from langchain.docstore.document import Document
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)
from .basepdf import BasePDF


class PDFChapterLoader(BasePDF):
    """
    Loader preserving the chapter structure of PDF files.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Callable[..., Any] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        # Which font is used for titles (chapter separation)
        self.title_font: str = kwargs.get('title_font', 'Calibri-Bold')
        if not text_splitter:
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )

    def eval_title(self, title_font: str) -> bool:
        """
        Check if the font is a title font.

        Args:
            title_font (str): The font to check.

        Returns:
            bool: True if the font is a title font.
        """
        return 'Bold' in title_font or title_font == self.title_font

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Open a PDF file and split it into chapters.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents, one per detected chapter.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        chapters = []
        current_chapter_text = ''
        current_chapter_title = ''
        current_chapter_page = None
        chapter_titles = set()  # Keep track of unique chapter titles
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]
            page_number = page_num + 1
            metadata = {
                "url": '',
                "index": f"{path.name} #{page_number}",
                "source": f"{path.name} #{page_number}",
                "filename": path.name,
                "source_type": self._source_type,
                "type": "pdf",
                "question": "",
                "answer": "",
                "summary": '',
                "document_meta": {
                    "page_number": page_num,
                    # **pdf.metadata
                }
            }
            for b in blocks:
                if b['type'] == 0:  # Text block
                    block_text = ''
                    for line in b["lines"]:
                        for span in line["spans"]:
                            block_text += span['text']  # Accumulate text within the block

                    # Check if the block text is a title by examining the font
                    if any(self.eval_title(span['font']) for line in b["lines"] for span in line["spans"]):
                        title = block_text.strip()
                        if title not in chapter_titles:
                            # Save the current chapter if it's not empty and start a new one
                            if current_chapter_text.strip() and current_chapter_text.strip() != current_chapter_title.strip():
                                chapters.append({
                                    'chapter': current_chapter_title,
                                    'content': current_chapter_text.strip(),
                                    'page': current_chapter_page,
                                    'meta': metadata
                                })
                            current_chapter_title = f"**{title}**: "
                            current_chapter_page = page_num + 1
                            current_chapter_text = current_chapter_title
                            chapter_titles.add(title)
                        else:
                            # Continue appending to the existing chapter
                            current_chapter_text += block_text
                    else:
                        # Continue appending text to the current chapter
                        current_chapter_text += block_text

                    # Add a newline after processing each block, if not a chapter title
                    if not block_text.strip().startswith(current_chapter_title):
                        current_chapter_text += "\n"

        # Save the last chapter if it exists and it's not just the title
        if current_chapter_text.strip() and current_chapter_text.strip() != current_chapter_title.strip():
            chapters.append({
                'chapter': current_chapter_title,
                'content': current_chapter_text.strip(),
                'page': current_chapter_page,
                'meta': metadata
            })
        documents = []
        for chapter in chapters:
            documents.append(Document(
                page_content=chapter['content'],
                metadata=chapter['meta']
            ))
        return documents
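The chapter heuristic treats any bold span, or a span whose font name equals title_font, as a chapter heading, so the right title_font depends on the document. A sketch of how that knob might be used (hedged: as above, BasePDF.load() dispatching to _load_pdf is an assumption):

# Hypothetical usage sketch; the font name is document-specific.
from pathlib import Path
from parrot.loaders.pdfchapters import PDFChapterLoader

loader = PDFChapterLoader(
    path=Path('manual.pdf'),
    title_font='Helvetica-Bold',  # font that marks chapter headings in this PDF
)
docs = loader.load()  # assumed to yield one Document per detected chapter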
parrot/loaders/pdffn.py
ADDED
@@ -0,0 +1,112 @@
from collections.abc import Callable
from typing import Any, Union
from pathlib import PurePath
import fitz  # PyMuPDF
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)
from .basepdf import BasePDF


class PDFFnLoader(BasePDF):
    """
    Loading a PDF, including table (function) processing.
    """
    def __init__(
        self,
        path: PurePath,
        tokenizer: Callable[..., Any] = None,
        text_splitter: Union[None, Callable[..., Any]] = None,
        source_type: str = 'pdf',
        language: str = "eng",
        **kwargs
    ):
        table_settings = kwargs.pop('table_settings', {})
        super().__init__(
            path=path,
            tokenizer=tokenizer,
            text_splitter=text_splitter,
            source_type=source_type,
            language=language,
            **kwargs
        )
        if not text_splitter:
            self.text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                self.tokenizer,
                chunk_size=2000,
                chunk_overlap=100,
                add_start_index=True,  # If `True`, includes chunk's start index in metadata
                strip_whitespace=True,  # strips whitespace from the start and end
                separators=["\n\n", "\n", "\r\n", "\r", "\f", "\v", "\x0b", "\x0c"],
            )
        # Define settings for Fitz Table Processing
        self.table_settings = {
            "vertical_strategy": "lines",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 5,
            "intersection_y_tolerance": 5,
            "edge_min_length": 10,
        }
        self.table_settings = {**self.table_settings, **table_settings}

    def set_metadata(self, path, page, page_number, **kwargs) -> dict:
        n = page_number + 1
        return {
            "url": '',
            "index": f"{path.name} #{page_number}",
            "source": f"{path.name} #{page_number}",
            "filename": path.name,
            "source_type": self._source_type,
            "type": "pdf",
            "question": "",
            "answer": "",
            "summary": '',
            "document_meta": {
                "page_number": n,
                **kwargs
            }
        }

    def processing_table(self, table, table_idx, page, **kwargs) -> list:
        df = table.to_pandas()  # convert to pandas DataFrame
        df = df.dropna(axis=1, how='all')  # Drop empty columns
        df = df.dropna(how='all', axis=0)  # Drop empty rows
        # Return the cleaned cell text as a list of rows
        table_data = df.values.tolist()
        return table_data

    def _load_pdf(self, path: PurePath, **kwargs):
        """
        Open a PDF file and extract its tables.

        Args:
            path (PurePath): The path to the PDF file.

        Returns:
            list: A list of Langchain Documents. Document assembly is not
            implemented yet, so an empty list is returned for now.
        """
        pdf = fitz.open(path)
        self.logger.info(f"Loading PDF file: {path}")
        for page_num in range(len(pdf)):
            # Will extract first the tables and second the blocks of text
            page = pdf.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]  # text blocks, not used yet
            metadata = self.set_metadata(path, page, page_num)
            tables = page.find_tables(**self.table_settings)
            for tab_idx, table in enumerate(tables):
                table_data = self.processing_table(table, tab_idx, page)
        return []
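Since the user-supplied table_settings dict is merged last, it overrides the defaults key by key. A short sketch of that merge (hypothetical values):

# Hypothetical usage sketch; only the overridden key differs from the defaults.
from pathlib import Path
from parrot.loaders.pdffn import PDFFnLoader

loader = PDFFnLoader(
    path=Path('tables.pdf'),
    table_settings={"vertical_strategy": "text"},  # overrides this key only
)
# loader.table_settings now holds the five defaults with
# "vertical_strategy" replaced by "text".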